if you have ever tried to parse an HTML website, you may ran into trouble like me. My first idea was to download the webpage and use the XML library to read through the website, but i have to learn that it is not that easy. HTML is not XML and the XML reader stops at the first error. So i wrote my own parser. First i downloaded the webpage content to a string and performed all parsing by using string functions, but that was terribly slow. Now i download the content to memory and use pointers to read through the content. This is fast enough, especially for larger sites.
For instance, to parse a webpage and grab all images, you can do like this:
Code: Select all
If HtmlReceive("http://www.purebasic.com")
While HtmlNext("img")
Debug HtmlAttribute("src")
Wend
EndIf
Code: Select all
If HtmlReceive("http://www.purebasic.com")
While HtmlNext()
Select HtmlTag()
Case "title"
Debug HtmlText()
Case "a"
href.s = HtmlAttribute("href")
If Left(href, 6) = "mailto"
Debug href
EndIf
EndSelect
Wend
EndIf
Code: Select all
EnableExplicit
Global *_HtmlBufferStart, *_HtmlBufferEnd, *_HtmlPointer
Global *_HtmlTagStart, *_HtmlTagEnd, *_HtmlTextStart, *_HtmlTextEnd
Procedure.s HtmlDecode(Text.s)
;http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
Protected xl, xr, encode.s, code.s, ascii
;read possible conversions into dictionary
Static NewMap a.s()
If Not MapSize(a())
a("nbsp") = Chr(32)
a("quot") = Chr(34)
a("amp") = Chr(38)
a("lt") = Chr(60)
a("gt") = Chr(62)
a("iexcl") = Chr(161)
a("cent") = Chr(162)
a("pound") = Chr(163)
a("curren") = Chr(164)
a("yen") = Chr(165)
a("brvbar") = Chr(166)
a("sect") = Chr(167)
a("uml") = Chr(168)
a("copy") = Chr(169)
a("ordf") = Chr(170)
a("laquo") = Chr(171)
a("not") = Chr(172)
a("shy") = Chr(173)
a("reg") = Chr(174)
a("macr") = Chr(175)
a("deg") = Chr(176)
a("plusmn") = Chr(177)
a("sup2") = Chr(178)
a("sup3") = Chr(179)
a("acute") = Chr(180)
a("micro") = Chr(181)
a("para") = Chr(182)
a("middot") = Chr(183)
a("cedil") = Chr(184)
a("sup1") = Chr(185)
a("ordm") = Chr(186)
a("raquo") = Chr(187)
a("frac14") = Chr(188)
a("frac12") = Chr(189)
a("frac34") = Chr(190)
a("iquest") = Chr(191)
a("Agrave") = Chr(192)
a("Aacute") = Chr(193)
a("Acirc") = Chr(194)
a("Atilde") = Chr(195)
a("Auml") = Chr(196)
a("Aring") = Chr(197)
a("AElig") = Chr(198)
a("Ccedil") = Chr(199)
a("Egrave") = Chr(200)
a("Eacute") = Chr(201)
a("Ecirc") = Chr(202)
a("Euml") = Chr(203)
a("lgrave") = Chr(204)
a("lacute") = Chr(205)
a("lcirc") = Chr(206)
a("luml") = Chr(207)
a("ETH") = Chr(208)
a("Ntilde") = Chr(209)
a("Ograve") = Chr(210)
a("Oacute") = Chr(211)
a("Ocirc") = Chr(212)
a("Otilde") = Chr(213)
a("Ouml") = Chr(214)
a("times") = Chr(215)
a("Oslash") = Chr(216)
a("Ugrave") = Chr(217)
a("Uacute") = Chr(218)
a("Ucirc") = Chr(219)
a("Uuml") = Chr(220)
a("Yacute") = Chr(221)
a("THORN") = Chr(222)
a("szlig") = Chr(223)
a("agrave") = Chr(224)
a("aacute") = Chr(225)
a("acirc") = Chr(226)
a("atilde") = Chr(227)
a("auml") = Chr(228)
a("aring") = Chr(229)
a("aelig") = Chr(230)
a("ccedil") = Chr(231)
a("egrave") = Chr(232)
a("eacute") = Chr(233)
a("ecirc") = Chr(234)
a("euml") = Chr(235)
a("igrave") = Chr(236)
a("iacute") = Chr(237)
a("icirc") = Chr(238)
a("iuml") = Chr(239)
a("eth") = Chr(240)
a("ntilde") = Chr(241)
a("ograve") = Chr(242)
a("oacute") = Chr(243)
a("ocirc") = Chr(244)
a("otilde") = Chr(245)
a("ouml") = Chr(246)
a("divide") = Chr(247)
a("oslash") = Chr(248)
a("ugrave") = Chr(249)
a("uacute") = Chr(250)
a("ucirc") = Chr(251)
a("uuml") = Chr(252)
a("yacute") = Chr(253)
a("thorn") = Chr(254)
a("yuml") = Chr(255)
;include unicode characters in
a("OElig") = Chr(338)
a("oelig") = Chr(339)
a("Scaron") = Chr(352)
a("scaron") = Chr(353)
a("Yuml") = Chr(376)
a("fnof") = Chr(402)
a("circ") = Chr(710)
a("tilde") = Chr(732)
a("Alpha") = Chr(913)
a("Beta") = Chr(914)
a("Gamma") = Chr(915)
a("Delta") = Chr(916)
a("Epsilon") = Chr(917)
a("Zeta") = Chr(918)
a("Eta") = Chr(919)
a("Theta") = Chr(920)
a("Iota") = Chr(921)
a("Kappa") = Chr(922)
a("Lambda") = Chr(923)
a("Mu") = Chr(924)
a("Nu") = Chr(925)
a("Xi") = Chr(926)
a("Omicron") = Chr(927)
a("Pi") = Chr(928)
a("Rho") = Chr(929)
a("Sigma") = Chr(931)
a("Tau") = Chr(932)
a("Upsilon") = Chr(933)
a("Phi") = Chr(934)
a("Chi") = Chr(935)
a("Psi") = Chr(936)
a("Omega") = Chr(937)
a("alpha") = Chr(945)
a("beta") = Chr(946)
a("gamma") = Chr(947)
a("delta") = Chr(948)
a("epsilon") = Chr(949)
a("zeta") = Chr(950)
a("eta") = Chr(951)
a("theta") = Chr(952)
a("iota") = Chr(953)
a("kappa") = Chr(954)
a("lambda") = Chr(955)
a("mu") = Chr(956)
a("nu") = Chr(957)
a("xi") = Chr(958)
a("omicron") = Chr(959)
a("pi") = Chr(960)
a("rho") = Chr(961)
a("sigmaf") = Chr(962)
a("sigma") = Chr(963)
a("tau") = Chr(964)
a("upsilon") = Chr(965)
a("phi") = Chr(966)
a("chi") = Chr(967)
a("psi") = Chr(968)
a("omega") = Chr(969)
a("thetasym") = Chr(977)
a("upsih") = Chr(978)
a("piv") = Chr(982)
a("ensp") = Chr(8194)
a("emsp") = Chr(8195)
a("thinsp") = Chr(8201)
a("zwnj") = Chr(8204)
a("zwj") = Chr(8205)
a("lrm") = Chr(8206)
a("rlm") = Chr(8207)
a("ndash") = Chr(8211)
a("mdash") = Chr(8212)
a("lsquo") = Chr(8216)
a("rsquo") = Chr(8217)
a("sbquo") = Chr(8218)
a("ldquo") = Chr(8220)
a("rdquo") = Chr(8221)
a("bdquo") = Chr(8222)
a("dagger") = Chr(8224)
a("Dagger") = Chr(8225)
a("bull") = Chr(8226)
a("hellip") = Chr(8230)
a("permil") = Chr(8240)
a("prime") = Chr(8242)
a("Prime") = Chr(8243)
a("lsaquo") = Chr(8249)
a("rsaquo") = Chr(8250)
a("oline") = Chr(8254)
a("frasl") = Chr(8260)
a("euro") = Chr(8364) ;new euro sign
a("image") = Chr(8465)
a("weierp") = Chr(8472)
a("real") = Chr(8476)
a("trade") = Chr(8482)
a("alefsym") = Chr(8501)
a("larr") = Chr(8592)
a("uarr") = Chr(8593)
a("rarr") = Chr(8594)
a("darr") = Chr(8595)
a("harr") = Chr(8596)
a("crarr") = Chr(8629)
a("lArr") = Chr(8656)
a("uArr") = Chr(8657)
a("rArr") = Chr(8658)
a("dArr") = Chr(8659)
a("hArr") = Chr(8660)
a("forall") = Chr(8704)
a("part") = Chr(8706)
a("exist") = Chr(8707)
a("empty") = Chr(8709)
a("nabla") = Chr(8711)
a("isin") = Chr(8712)
a("notin") = Chr(8713)
a("ni") = Chr(8715)
a("prod") = Chr(8719)
a("sum") = Chr(8721)
a("minus") = Chr(8722)
a("lowast") = Chr(8727)
a("radic") = Chr(8730)
a("prop") = Chr(8733)
a("infin") = Chr(8734)
a("ang") = Chr(8736)
a("And") = Chr(8743)
a("Or") = Chr(8744)
a("cap") = Chr(8745)
a("cup") = Chr(8746)
a("int") = Chr(8747)
a("there4") = Chr(8756)
a("sim") = Chr(8764)
a("cong") = Chr(8773)
a("asymp") = Chr(8776)
a("ne") = Chr(8800)
a("equiv") = Chr(8801)
a("le") = Chr(8804)
a("ge") = Chr(8805)
a("sub") = Chr(8834)
a("sup") = Chr(8835)
a("nsub") = Chr(8836)
a("sube") = Chr(8838)
a("supe") = Chr(8839)
a("oplus") = Chr(8853)
a("otimes") = Chr(8855)
a("perp") = Chr(8869)
a("sdot") = Chr(8901)
a("vellip") = Chr(8942)
a("lceil") = Chr(8968)
a("rceil") = Chr(8969)
a("lfloor") = Chr(8970)
a("rfloor") = Chr(8971)
a("lang") = Chr(9001)
a("rang") = Chr(9002)
a("loz") = Chr(9674)
a("spades") = Chr(9824)
a("clubs") = Chr(9827)
a("hearts") = Chr(9829)
a("diams") = Chr(9830)
EndIf
;decode HTML text
Repeat
;lookup code prefix
xl = FindString(Text, "&", xl + 1)
If Not xl
Break
EndIf
;lookup code suffix
xr = FindString(Text, ";", xl)
If Not xr
Break
EndIf
;get identifier between
encode = Mid(Text, xl + 1, xr - xl - 1)
;lookup ascii code
If FindMapElement(a(), encode)
Text = ReplaceString(Text, "&" + encode + ";", a())
xl - 1 ;this position must be checked again
;or get from numeric char ref
ElseIf Left(encode, 1) = "#"
;http://en.wikipedia.org/wiki/HTML_decimal_character_rendering#HTML_character_references
;take care on hex
ascii = Val("$" + Right(encode, 4))
If ascii
Text = ReplaceString(Text, "&" + encode + ";", Chr(ascii))
EndIf
EndIf
ForEver
ProcedureReturn Text
EndProcedure
Procedure.s HtmlTag()
;return tag name
Protected *p
If *_HtmlTagStart And *_HtmlTagEnd > *_HtmlTagStart
For *p = *_HtmlTagStart To *_HtmlTagEnd
Select PeekB(*p)
Case ' ', '>'
Break
EndSelect
Next
;convert to lower case for easier comparison
ProcedureReturn LCase(PeekS(*_HtmlTagStart, *p - *_HtmlTagStart, #PB_UTF8))
EndIf
EndProcedure
Procedure.s HtmlText()
;return text behind tag
Protected Text.s
If *_HtmlTextStart And *_HtmlTextEnd > *_HtmlTextStart
Text = PeekS(*_HtmlTextStart, *_HtmlTextEnd - *_HtmlTextStart, #PB_UTF8 | #PB_ByteLength)
;remove XML formating
Text = ReplaceString(Text, #TAB$, "")
ProcedureReturn HtmlDecode(Text)
EndIf
EndProcedure
Procedure.s HtmlAttribute(Name.s)
;return an argument
Protected namelen, *i, quote, j, *valuestart
;lookup argument name
namelen = Len(Name)
For *i = *_HtmlTagStart + 1 To *_HtmlTagEnd - namelen - 4
;compare full name (set bit 5 to compare case insensitive)
For j = 0 To namelen - 1
If PeekB(*i + j) | %100000 <> PeekB(@Name + j + j) | %100000
Goto NextChar
EndIf
Next
;check character before (space needed)
If PeekB(*i - 1) = ' '
*i + namelen
;skip spaces
While PeekB(*i) = ' ' And *i < *_HtmlTagEnd
*i + 1
Wend
;=
If PeekB(*i) = '='
*i + 1
;skip spaces
While PeekB(*i) = ' ' And *i < *_HtmlTagEnd
*i + 1
Wend
;check quote
quote = PeekB(*i)
If quote = '"' Or quote = 39 ;'
*i + 1
*valuestart = *i
;look for closing quote
While PeekB(*i) <> quote And *i < *_HtmlTagEnd
*i + 1
Wend
;take value
ProcedureReturn PeekS(*valuestart, *i - *valuestart, #PB_UTF8 | #PB_ByteLength)
EndIf
EndIf
EndIf
NextChar:
Next
EndProcedure
Procedure HtmlNext(Tag.s=#Empty$)
;lookup next Tag with text
While *_HtmlPointer < *_HtmlBufferEnd
;tag start
If PeekB(*_HtmlPointer) = '<'
*_HtmlTagStart = *_HtmlPointer + 1
;tag end
ElseIf PeekB(*_HtmlPointer) = '>'
*_HtmlTagEnd = *_HtmlPointer
;get text
*_HtmlPointer + 1
*_HtmlTextStart = *_HtmlPointer
While *_HtmlPointer < *_HtmlBufferEnd
If PeekB(*_HtmlPointer) = '<'
*_HtmlTextEnd = *_HtmlPointer
If Tag = #Empty$ Or Tag = HtmlTag()
ProcedureReturn #True
Else
Goto Unwanted
EndIf
EndIf
*_HtmlPointer + 1
Wend
EndIf
*_HtmlPointer + 1
Unwanted:
Wend
FreeMemory(*_HtmlBufferStart)
EndProcedure
Procedure HtmlLoad(Filename.s)
Protected f = ReadFile(#PB_Any, Filename)
If f
*_HtmlBufferStart = AllocateMemory(Lof(f), #PB_Memory_NoClear)
ReadData(f, *_HtmlBufferStart, Lof(f))
*_HtmlBufferEnd = *_HtmlBufferStart + Lof(f)
*_HtmlPointer = *_HtmlBufferStart
CloseFile(f)
ProcedureReturn #True
EndIf
EndProcedure
Procedure HtmlReceive(Url.s)
If InitNetwork()
*_HtmlBufferStart = ReceiveHTTPMemory(Url)
If *_HtmlBufferStart
*_HtmlBufferEnd = *_HtmlBufferStart + MemorySize(*_HtmlBufferStart)
*_HtmlPointer = *_HtmlBufferStart
ProcedureReturn #True
EndIf
EndIf
EndProcedure
DisableExplicit
CompilerIf #PB_Compiler_IsMainFile
If HtmlReceive("http://www.purebasic.com")
While HtmlNext()
Select HtmlTag()
Case "title"
Debug HtmlText()
Case "a"
href.s = HtmlAttribute("href")
If Left(href, 6) = "mailto"
Debug href
EndIf
EndSelect
Wend
EndIf
CompilerEndIf
Regards
Uwe