Little HTML-Linkparser
Posted: Mon Feb 13, 2006 11:33 pm
Code updated For 5.20+
Hello friends (enemies, too),
I needed an HTML parser that retrieves links from an HTML file.
This code is the result of my efforts to solve this problem.
Have fun with it.
Hello friends (enemies, too),

I needed an HTML parser that retrieves links from an HTML file.
This code is the result of my efforts to solve this problem.
Have fun with it.
Code: Select all
; HTML-Link Parser
; 2006 Hroudtwolf
; PureBasic-Lounge.de
; Forward declarations so the procedures below can be called regardless of
; the order in which they are defined.
; One-time setup: defines LinkStruct and creates the global HyperLinkList().
Declare InitHTMLParser ()
; Reads the named file into the parser's shared memory buffer.
Declare LoadHTML_Offline (FileName.s)
; Scans the loaded buffer for anchors and returns the number of links found.
Declare ParseHTML_Links()
; Internal helper: returns the quoted value following PartName inside Code.
Declare.s HTMLParser_GetPart (Code.s,PartName.s)
; Accessors for the parsed links; LinkNO is 1-based (0 yields an empty string).
Declare.s GetParsedHTML_Link_URL (LinkNO.l)
Declare.s GetParsedHTML_Link_Target (LinkNO.l)
Declare.s GetParsedHTML_Link_Description (LinkNO.l)
Procedure InitHTMLParser ()
  ; One-time setup of the parser: declares the LinkStruct record, creates the
  ; global HyperLinkList() that receives the parsed links and raises the
  ; shared "initialised" flag. Further calls are no-ops.
  Shared Is_InitHTMLParser_Initated.l

  ; Already set up - leave the existing list untouched.
  If Is_InitHTMLParser_Initated.l<>0
    ProcedureReturn
  EndIf

  Structure LinkStruct
    Link.s          ; value of the href attribute
    Target.s        ; value of the target attribute
    Description.s   ; text between the opening and closing anchor tag
  EndStructure

  Global NewList HyperLinkList.LinkStruct()
  Is_InitHTMLParser_Initated.l=#True
EndProcedure
Procedure LoadHTML_Offline (FileName.s)
  ; Loads the whole file FileName into the parser's shared memory buffer.
  ;
  ; Returns #True when the file was read completely, #False when the parser is
  ; not initialised, the name is empty, the file cannot be opened, the file is
  ; empty, or the buffer cannot be allocated / filled.
  ; On failure a previously loaded buffer is left untouched; on success the
  ; previous buffer (if any) is released so repeated loads do not leak memory.
  ;
  ; HTMLParser_BufferLenght is deliberately stored as "file size - 1":
  ; ParseHTML_Links() always looks one byte ahead of its loop index.
  Shared Is_InitHTMLParser_Initated.l,*HTMLParser_Buffer,HTMLParser_BufferLenght.l
  ; FileID is an integer (.i): #PB_Any handles are pointer sized on x64.
  Protected FileID.i,FileSize.q,*NewBuffer,Loaded.l

  FileName.s=Trim(FileName.s)
  If FileName.s="" Or Is_InitHTMLParser_Initated.l=0: ProcedureReturn #False : EndIf

  FileID=ReadFile (#PB_Any,FileName.s)
  If FileID=0: ProcedureReturn #False : EndIf

  FileSize=Lof(FileID)
  If FileSize>0 ; AllocateMemory() cannot provide a zero sized block
    *NewBuffer=AllocateMemory (FileSize)
    If *NewBuffer
      If ReadData (FileID,*NewBuffer,FileSize)=FileSize
        ; Swap in the new buffer only after the read fully succeeded.
        If *HTMLParser_Buffer
          FreeMemory (*HTMLParser_Buffer)
        EndIf
        *HTMLParser_Buffer=*NewBuffer
        HTMLParser_BufferLenght.l=FileSize-1
        Loaded=#True
      Else
        FreeMemory (*NewBuffer) ; short read: discard the partial data
      EndIf
    EndIf
  EndIf

  CloseFile (FileID)
  ProcedureReturn Loaded
EndProcedure
Procedure ParseHTML_Links()
  ; Scans the buffer filled by LoadHTML_Offline() for <a ...>...</a> elements
  ; and appends one HyperLinkList() element (Link, Target, Description) for
  ; every anchor that carries an href value.
  ;
  ; Returns the number of links added by this call, or #False when the parser
  ; is not initialised or no buffer is loaded.
  ; The buffer is released before returning, so the file has to be loaded
  ; again before it can be parsed a second time.
  ;
  ; Bytes are copied one to one into characters; multi-byte encodings such as
  ; UTF-8 are not decoded.
  Protected CurrentByte.a,TagName.a,NextByte.a,TagOpened.l,CountLinks.l,x.l
  Protected tagstring.s,Taged.s,HTML_URL.s,HTML_Target.s,HTML_Description.s
  Shared Is_InitHTMLParser_Initated.l,*HTMLParser_Buffer,HTMLParser_BufferLenght.l

  If HTMLParser_BufferLenght.l<1 Or Is_InitHTMLParser_Initated.l=0 Or *HTMLParser_Buffer=0
    ProcedureReturn #False
  EndIf

  ; The buffer holds HTMLParser_BufferLenght+1 bytes, so index x+1 is always
  ; readable inside the loop and x+2 is readable while x+2<=HTMLParser_BufferLenght.
  For x=0 To HTMLParser_BufferLenght.l-1
    If TagOpened=#False
      ; Outside an anchor: look for "<a" / "<A" at x.
      CurrentByte=PeekA(*HTMLParser_Buffer+x)
      TagName=PeekA(*HTMLParser_Buffer+x+1)
      If CurrentByte=60 And (TagName=65 Or TagName=97) And x+2<=HTMLParser_BufferLenght.l
        ; The tag name must end right after the "a", otherwise tags such as
        ; <abbr> or <article> would be mistaken for anchors.
        NextByte=PeekA(*HTMLParser_Buffer+x+2)
        If NextByte=32 Or NextByte=9 Or NextByte=10 Or NextByte=13 Or NextByte=62
          TagOpened=#True
          tagstring=""
        EndIf
      EndIf
    Else
      ; Inside an anchor the window is shifted by one byte: CurrentByte is the
      ; byte to collect, TagName the byte before it. "a>" ends the element.
      CurrentByte=PeekA(*HTMLParser_Buffer+x+1)
      TagName=PeekA(*HTMLParser_Buffer+x)
      If CurrentByte=62 And (TagName=65 Or TagName=97)
        TagOpened=#False
        ; Drop the leading "<" that was collected when the anchor was opened.
        Taged=Mid(tagstring,2)
        HTML_URL=HTMLParser_GetPart (Taged,"href")
        HTML_Target=HTMLParser_GetPart (Taged,"target")
        HTML_Description=StringField(StringField (Taged,2,">"),1,"<")
        If HTML_URL
          AddElement (HyperLinkList())
          HyperLinkList()\Link=HTML_URL
          HyperLinkList()\Target=HTML_Target
          HyperLinkList()\Description=HTML_Description
          CountLinks+1
        EndIf
        tagstring=""
      EndIf
    EndIf

    ; Collect the anchor's bytes, skipping tabs and line breaks.
    If TagOpened=#True And CurrentByte<>9 And CurrentByte<>10 And CurrentByte<>13
      tagstring+Chr(CurrentByte)
    EndIf
  Next x

  ; Release the buffer before returning and reset the shared state so that
  ; neither this procedure nor the loader touches the freed block again.
  FreeMemory (*HTMLParser_Buffer)
  *HTMLParser_Buffer=0
  HTMLParser_BufferLenght.l=0

  ProcedureReturn CountLinks
EndProcedure
Procedure.s HTMLParser_GetPart (Code.s,PartName.s); Just internal needed
  ; Returns the value of the attribute PartName found in the tag text Code,
  ; or "" when there is no such attribute.
  ;
  ; The name is matched case-insensitively and only counts when it is followed
  ; by "=" (blanks/tabs around the "=" are allowed). The value may be enclosed
  ; in double or single quotes, or be unquoted (then it ends at the next
  ; blank, tab or ">"). The value itself is returned unmodified, so "=" signs
  ; inside a URL's query string are preserved.
  Protected LowerCode.s,Needle.s,Quote.s,Char.s
  Protected CodeLen.l,NamePos.l,ValueStart.l,ValueEnd.l

  Needle=LCase(PartName.s)
  If Needle="": ProcedureReturn "" : EndIf
  LowerCode=LCase(Code.s)
  CodeLen=Len(Code.s)

  NamePos=FindString (LowerCode,Needle,1)
  While NamePos
    ValueStart=NamePos+Len(Needle)

    ; Skip blanks/tabs between the attribute name and "=".
    Char=Mid(Code.s,ValueStart,1)
    While Char=" " Or Char=Chr(9)
      ValueStart+1
      Char=Mid(Code.s,ValueStart,1)
    Wend

    If Char="="
      ; Skip blanks/tabs between "=" and the value.
      ValueStart+1
      Char=Mid(Code.s,ValueStart,1)
      While Char=" " Or Char=Chr(9)
        ValueStart+1
        Char=Mid(Code.s,ValueStart,1)
      Wend

      If Char=Chr(34) Or Char="'"
        ; Quoted value: runs up to the matching quote.
        Quote=Char
        ValueEnd=FindString (Code.s,Quote,ValueStart+1)
        If ValueEnd
          ProcedureReturn Mid(Code.s,ValueStart+1,ValueEnd-ValueStart-1)
        EndIf
        ; Closing quote missing: take the remainder of the text.
        ProcedureReturn Mid(Code.s,ValueStart+1)
      EndIf

      ; Unquoted value: runs up to the next blank, tab, ">" or end of text.
      ValueEnd=ValueStart
      While ValueEnd<=CodeLen
        Char=Mid(Code.s,ValueEnd,1)
        If Char=" " Or Char=Chr(9) Or Char=">"
          Break
        EndIf
        ValueEnd+1
      Wend
      ProcedureReturn Mid(Code.s,ValueStart,ValueEnd-ValueStart)
    EndIf

    ; This occurrence is not an attribute assignment - try the next one.
    NamePos=FindString (LowerCode,Needle,NamePos+1)
  Wend

  ProcedureReturn ""
EndProcedure
Procedure.s GetParsedHTML_Link_URL (LinkNO.l)
  ; Returns the href value of the parsed link number LinkNO (1-based).
  ; Returns "" when the parser is not initialised or LinkNO is outside
  ; 1..ListSize(HyperLinkList()).
  Shared Is_InitHTMLParser_Initated.l
  ; The list only exists after InitHTMLParser(), so test the flag first.
  If Is_InitHTMLParser_Initated.l=0: ProcedureReturn "" : EndIf
  ; Reject indexes SelectElement() could not select; otherwise the list would
  ; be dereferenced without a valid current element.
  If LinkNO.l<1 Or LinkNO.l>ListSize(HyperLinkList()): ProcedureReturn "" : EndIf
  SelectElement (HyperLinkList(),LinkNO.l-1)
  ProcedureReturn HyperLinkList()\Link
EndProcedure
Procedure.s GetParsedHTML_Link_Target (LinkNO.l)
  ; Returns the target value of the parsed link number LinkNO (1-based).
  ; Returns "" when the parser is not initialised or LinkNO is outside
  ; 1..ListSize(HyperLinkList()).
  Shared Is_InitHTMLParser_Initated.l
  ; The list only exists after InitHTMLParser(), so test the flag first.
  If Is_InitHTMLParser_Initated.l=0: ProcedureReturn "" : EndIf
  ; Reject indexes SelectElement() could not select; otherwise the list would
  ; be dereferenced without a valid current element.
  If LinkNO.l<1 Or LinkNO.l>ListSize(HyperLinkList()): ProcedureReturn "" : EndIf
  SelectElement (HyperLinkList(),LinkNO.l-1)
  ProcedureReturn HyperLinkList()\Target
EndProcedure
Procedure.s GetParsedHTML_Link_Description (LinkNO.l)
  ; Returns the description (link text) of the parsed link number LinkNO
  ; (1-based). Returns "" when the parser is not initialised or LinkNO is
  ; outside 1..ListSize(HyperLinkList()).
  Shared Is_InitHTMLParser_Initated.l
  ; The list only exists after InitHTMLParser(), so test the flag first.
  If Is_InitHTMLParser_Initated.l=0: ProcedureReturn "" : EndIf
  ; Reject indexes SelectElement() could not select; otherwise the list would
  ; be dereferenced without a valid current element.
  If LinkNO.l<1 Or LinkNO.l>ListSize(HyperLinkList()): ProcedureReturn "" : EndIf
  SelectElement (HyperLinkList(),LinkNO.l-1)
  ProcedureReturn HyperLinkList()\Description
EndProcedure
;_--------------------------------------------------------------------------------
; Demo: parse "linkparser.html" and print every link to the debug output.
InitHTMLParser ()                        ; set up the link list and the init flag
LoadHTML_Offline ("linkparser.html")     ; read the HTML file into the parser buffer
Links.l=ParseHTML_Links()                ; extract the links; returns how many were found

; Link numbers passed to the accessors are 1-based.
x=1
While x<=Links.l
  Debug "URL: "+GetParsedHTML_Link_URL (x)
  Debug "Target: "+GetParsedHTML_Link_Target (x)
  Debug "Description: "+GetParsedHTML_Link_Description (x)
  Debug "------------------------------------------------------------------"
  x+1
Wend