Little HTML-Linkparser

Share your advanced PureBasic knowledge/code with the community.
User avatar
Hroudtwolf
Addict
Addict
Posts: 803
Joined: Sat Feb 12, 2005 3:35 am
Location: Germany(Hessen)
Contact:

Little HTML-Linkparser

Post by Hroudtwolf »

Code updated For 5.20+

Hello friends (enemies, too :D ),

I needed an HTML parser that retrieves the links from an HTML file.
This code is the result of my efforts to solve this problem.

Have fun with it.

Code: Select all

; HTML-Link Parser
; 2006 Hroudtwolf
; PureBasic-Lounge.de

; Forward declarations, so the procedures below can be called regardless of
; the order in which they are defined in this file.

; Setup, loading and parsing.
Declare InitHTMLParser ()
Declare LoadHTML_Offline (FileName.s)
Declare ParseHTML_Links()
; Internal helper: extracts the quoted value that follows PartName in Code.
Declare.s HTMLParser_GetPart (Code.s,PartName.s)
; Accessors for the parsed links; LinkNO is 1-based.
Declare.s GetParsedHTML_Link_URL (LinkNO.l)
Declare.s GetParsedHTML_Link_Target (LinkNO.l)
Declare.s GetParsedHTML_Link_Description (LinkNO.l)

Procedure InitHTMLParser ()
  ; One-time setup of the parser state.
  ; Declares the LinkStruct record (Link, Target, Description), creates the
  ; global list HyperLinkList() that holds one record per parsed link, and
  ; raises the shared "initialised" flag. Calls made after the first one
  ; return immediately and leave the existing list untouched.
  Shared Is_InitHTMLParser_Initated.l

  ; Guard: nothing to do when the setup already ran.
  If Is_InitHTMLParser_Initated.l
    ProcedureReturn
  EndIf

  Structure LinkStruct
    Link.s
    Target.s
    Description.s
  EndStructure

  Global NewList HyperLinkList.LinkStruct()
  Is_InitHTMLParser_Initated.l = #True
EndProcedure 


Procedure LoadHTML_Offline (FileName.s)  
  ; Loads the complete file FileName into a memory buffer for ParseHTML_Links().
  ;
  ; FileName - path of the HTML file; leading/trailing spaces are ignored.
  ; Returns  - #True when the file content is in the buffer, #False when the
  ;            parser is not initialised, the name is empty, the file cannot
  ;            be opened, the file is empty, or allocation/reading fails.
  ;
  ; The result is published through the shared variables *HTMLParser_Buffer
  ; and HTMLParser_BufferLenght. The stored length is the file size minus one
  ; because the parser always inspects a byte together with its successor.
  Shared Is_InitHTMLParser_Initated.l,*HTMLParser_Buffer,HTMLParser_BufferLenght.l
  ; #PB_Any handles must be kept in an integer (.i); a .l can truncate them
  ; on 64-bit builds.
  Protected FileID.i, FileSize.q, Loaded.l

  FileName.s=Trim(FileName.s)
  If FileName.s="" Or Is_InitHTMLParser_Initated.l=0 : ProcedureReturn #False : EndIf 

  FileID=ReadFile (#PB_Any,FileName.s)
  If FileID=0 : ProcedureReturn #False : EndIf 

  ; Release the buffer of a previous load so repeated calls do not leak.
  If *HTMLParser_Buffer
    FreeMemory (*HTMLParser_Buffer)
    *HTMLParser_Buffer=0
  EndIf 
  HTMLParser_BufferLenght.l=0

  FileSize=Lof(FileID)
  If FileSize>0 ; a zero-byte allocation is not valid
    *HTMLParser_Buffer=AllocateMemory (FileSize)
    If *HTMLParser_Buffer
      If ReadData (FileID,*HTMLParser_Buffer,FileSize)=FileSize
        HTMLParser_BufferLenght.l=FileSize-1
        Loaded=#True
      Else
        ; Incomplete read: do not hand a half-filled buffer to the parser.
        FreeMemory (*HTMLParser_Buffer)
        *HTMLParser_Buffer=0
      EndIf 
    EndIf 
  EndIf 

  CloseFile (FileID)
  ProcedureReturn Loaded
EndProcedure 



Procedure ParseHTML_Links()
  ; Scans the buffer filled by LoadHTML_Offline() for anchor elements and
  ; appends one entry (Link, Target, Description) to HyperLinkList() for
  ; every anchor that carries a non-empty href value.
  ;
  ; Returns the number of links added by this call, or #False when the parser
  ; is not initialised or no buffer content is available.
  ;
  ; An anchor starts at "<a"/"<A" and ends at the first ">" that directly
  ; follows an "a"/"A" (normally the ">" of "</a>"). Tab, LF and CR inside the
  ; anchor are dropped. The buffer is released once the scan is finished, so
  ; the file has to be loaded again before another parse.
  ;
  ; NOTE(review): bytes are converted one by one with Chr(), so multi-byte
  ; encodings such as UTF-8 are presumably not decoded correctly - confirm.
  Protected x.i,CurrentByte.a,PrevByte.a,NextByte.a,TagOpened.l,CountLinks.l
  Protected tagstring.s,Taged.s,HTML_URL.s,HTML_Target.s,HTML_Description.s
  Shared Is_InitHTMLParser_Initated.l,*HTMLParser_Buffer,HTMLParser_BufferLenght.l
  If HTMLParser_BufferLenght.l<1 Or Is_InitHTMLParser_Initated.l=0 : ProcedureReturn #False : EndIf 

  ; Every step touches offset x and x+1; the stored length is already one
  ; less than the buffer size, so x+1 stays inside the buffer.
  For x=0 To HTMLParser_BufferLenght.l-1
    If TagOpened
      ; Inside an anchor the scanner runs one byte ahead: the byte at x+1 is
      ; the one being collected, the byte at x is its predecessor.
      PrevByte=PeekA(*HTMLParser_Buffer+x)
      CurrentByte=PeekA(*HTMLParser_Buffer+x+1)
      If CurrentByte='>' And (PrevByte='a' Or PrevByte='A')
        ; End of the anchor: tagstring now looks like
        ;   < href="..." target="...">description</a
        TagOpened=#False
        Taged=Mid(tagstring,2) ; drop the leading "<" (safe in ASCII and Unicode builds)
        HTML_URL=HTMLParser_GetPart (Taged,"href")
        HTML_Target=HTMLParser_GetPart (Taged,"target")
        HTML_Description=StringField(StringField (Taged,2,">"),1,"<")
        If HTML_URL
          AddElement (HyperLinkList())
          HyperLinkList()\Link=HTML_URL
          HyperLinkList()\Target=HTML_Target
          HyperLinkList()\Description=HTML_Description
          CountLinks+1
        EndIf 
        tagstring=""
      ElseIf CurrentByte<>9 And CurrentByte<>10 And CurrentByte<>13
        tagstring+Chr(CurrentByte)
      EndIf 
    Else
      ; Outside an anchor only the start pattern "<a"/"<A" is of interest.
      ; A stray ">a" in the document must not be treated as an anchor end.
      CurrentByte=PeekA(*HTMLParser_Buffer+x)
      NextByte=PeekA(*HTMLParser_Buffer+x+1)
      If CurrentByte='<' And (NextByte='a' Or NextByte='A')
        TagOpened=#True
        tagstring="<"
      EndIf 
    EndIf 
  Next x

  ; Release the buffer before returning and clear the shared state so that
  ; no later call can read or free the released memory again.
  FreeMemory (*HTMLParser_Buffer)
  *HTMLParser_Buffer=0
  HTMLParser_BufferLenght.l=0
  ProcedureReturn CountLinks
EndProcedure 



Procedure.s HTMLParser_GetPart (Code.s,PartName.s); Just internal needed
   ; Returns the double-quoted value that follows the first occurrence of
   ; PartName in Code, e.g. the URL for PartName "href".
   ;
   ; Code     - text of an anchor tag.
   ; PartName - attribute name in lower case; Code is lower-cased for the
   ;            search, so the match ignores the case used in Code.
   ; Returns  - the text between the first pair of double quotes after
   ;            PartName, or "" when PartName does not occur.
   ;
   ; The value is returned unmodified: "=" characters inside it (query
   ; strings such as "page.php?id=5") are kept.
   Protected tmppos.l,tmp.s
   tmppos.l=FindString (LCase(Code.s),PartName.s,1)
   If tmppos.l
      ; Everything after the attribute name: ="value" ...
      tmp.s=Mid(Code.s,tmppos.l+Len(PartName.s))
      ; Field 1 is the text before the opening quote, field 2 is the value.
      ProcedureReturn StringField (tmp.s,2,Chr(34))
   EndIf 
   ProcedureReturn ""
EndProcedure 


Procedure.s GetParsedHTML_Link_URL (LinkNO.l)
  ; Returns the Link field (href value) of the parsed link number LinkNO.
  ;
  ; LinkNO  - 1-based position in HyperLinkList().
  ; Returns - the stored URL, or "" when the parser is not initialised or
  ;           LinkNO is outside 1..ListSize(HyperLinkList()).
  ; Side effect: the selected entry becomes the list's current element.
  Shared Is_InitHTMLParser_Initated.l
  ; Checked on its own first: the list only exists after initialisation.
  If Is_InitHTMLParser_Initated.l=0 : ProcedureReturn "" : EndIf 
  ; Reject every index without a matching element, not only 0.
  If LinkNO.l<1 Or LinkNO.l>ListSize(HyperLinkList()) : ProcedureReturn "" : EndIf 
  SelectElement (HyperLinkList(),LinkNO.l-1)
  ProcedureReturn HyperLinkList()\Link
EndProcedure 

Procedure.s GetParsedHTML_Link_Target (LinkNO.l)
  ; Returns the Target field (target value) of the parsed link number LinkNO.
  ;
  ; LinkNO  - 1-based position in HyperLinkList().
  ; Returns - the stored target, or "" when the parser is not initialised or
  ;           LinkNO is outside 1..ListSize(HyperLinkList()).
  ; Side effect: the selected entry becomes the list's current element.
  Shared Is_InitHTMLParser_Initated.l
  ; Checked on its own first: the list only exists after initialisation.
  If Is_InitHTMLParser_Initated.l=0 : ProcedureReturn "" : EndIf 
  ; Reject every index without a matching element, not only 0.
  If LinkNO.l<1 Or LinkNO.l>ListSize(HyperLinkList()) : ProcedureReturn "" : EndIf 
  SelectElement (HyperLinkList(),LinkNO.l-1)
  ProcedureReturn HyperLinkList()\Target
EndProcedure 

Procedure.s GetParsedHTML_Link_Description (LinkNO.l)
  ; Returns the Description field (text between the anchor's opening and
  ; closing tag) of the parsed link number LinkNO.
  ;
  ; LinkNO  - 1-based position in HyperLinkList().
  ; Returns - the stored description, or "" when the parser is not
  ;           initialised or LinkNO is outside 1..ListSize(HyperLinkList()).
  ; Side effect: the selected entry becomes the list's current element.
  Shared Is_InitHTMLParser_Initated.l
  ; Checked on its own first: the list only exists after initialisation.
  If Is_InitHTMLParser_Initated.l=0 : ProcedureReturn "" : EndIf 
  ; Reject every index without a matching element, not only 0.
  If LinkNO.l<1 Or LinkNO.l>ListSize(HyperLinkList()) : ProcedureReturn "" : EndIf 
  SelectElement (HyperLinkList(),LinkNO.l-1)
  ProcedureReturn HyperLinkList()\Description
EndProcedure 



;_--------------------------------------------------------------------------------
; Usage example: set up the parser, load a file, parse it and print every
; link that was found to the debug output.
InitHTMLParser ()                     ; prepare the link list and parser state
LoadHTML_Offline ("linkparser.html")  ; read the HTML file into memory
FoundLinks.l = ParseHTML_Links()      ; number of links found in the file

; Link numbers passed to the accessors are 1-based.
LinkIndex = 1
While LinkIndex <= FoundLinks
  Debug "URL: " + GetParsedHTML_Link_URL (LinkIndex)
  Debug "Target: " + GetParsedHTML_Link_Target (LinkIndex)
  Debug "Description: " + GetParsedHTML_Link_Description (LinkIndex)
  Debug "------------------------------------------------------------------"
  LinkIndex + 1
Wend