Page 1 of 1

Parsing an HTML website

Posted: Tue Jul 26, 2016 4:37 pm
by uwekel
Hi,

if you have ever tried to parse an HTML website, you may ran into trouble like me. My first idea was to download the webpage and use the XML library to read through the website, but i have to learn that it is not that easy. HTML is not XML and the XML reader stops at the first error. So i wrote my own parser. First i downloaded the webpage content to a string and performed all parsing by using string functions, but that was terribly slow. Now i download the content to memory and use pointers to read through the content. This is fast enough, especially for larger sites.

For instance, to parse a webpage and grab all images, you can do like this:

Code: Select all

If HtmlReceive("http://www.purebasic.com")
  While HtmlNext("img")
    Debug HtmlAttribute("src")
  Wend
EndIf
But you can also loop through all HTML tags in this way:

Code: Select all

  If HtmlReceive("http://www.purebasic.com")
    While HtmlNext()
      Select HtmlTag()
      Case "title"
        Debug HtmlText()
      Case "a"
        href.s = HtmlAttribute("href")
        If Left(href, 6) = "mailto"
          Debug href
        EndIf
      EndSelect
    Wend
  EndIf
And here is the complete code:

Code: Select all

EnableExplicit

Global *_HtmlBufferStart, *_HtmlBufferEnd, *_HtmlPointer
Global *_HtmlTagStart, *_HtmlTagEnd, *_HtmlTextStart, *_HtmlTextEnd

Procedure.s HtmlDecode(Text.s)
  ;http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
  Protected xl, xr, encode.s, code.s, ascii
  ;read possible conversions into dictionary
  Static NewMap a.s()
  If Not MapSize(a())
    a("nbsp") = Chr(32)
    a("quot") = Chr(34)
    a("amp") = Chr(38)
    a("lt") = Chr(60)
    a("gt") = Chr(62)
    a("iexcl") = Chr(161)
    a("cent") = Chr(162)
    a("pound") = Chr(163)
    a("curren") = Chr(164)
    a("yen") = Chr(165)
    a("brvbar") = Chr(166)
    a("sect") = Chr(167)
    a("uml") = Chr(168)
    a("copy") = Chr(169)
    a("ordf") = Chr(170)
    a("laquo") = Chr(171)
    a("not") = Chr(172)
    a("shy") = Chr(173)
    a("reg") = Chr(174)
    a("macr") = Chr(175)
    a("deg") = Chr(176)
    a("plusmn") = Chr(177)
    a("sup2") = Chr(178)
    a("sup3") = Chr(179)
    a("acute") = Chr(180)
    a("micro") = Chr(181)
    a("para") = Chr(182)
    a("middot") = Chr(183)
    a("cedil") = Chr(184)
    a("sup1") = Chr(185)
    a("ordm") = Chr(186)
    a("raquo") = Chr(187)
    a("frac14") = Chr(188)
    a("frac12") = Chr(189)
    a("frac34") = Chr(190)
    a("iquest") = Chr(191)
    a("Agrave") = Chr(192)
    a("Aacute") = Chr(193)
    a("Acirc") = Chr(194)
    a("Atilde") = Chr(195)
    a("Auml") = Chr(196)
    a("Aring") = Chr(197)
    a("AElig") = Chr(198)
    a("Ccedil") = Chr(199)
    a("Egrave") = Chr(200)
    a("Eacute") = Chr(201)
    a("Ecirc") = Chr(202)
    a("Euml") = Chr(203)
    a("lgrave") = Chr(204)
    a("lacute") = Chr(205)
    a("lcirc") = Chr(206)
    a("luml") = Chr(207)
    a("ETH") = Chr(208)
    a("Ntilde") = Chr(209)
    a("Ograve") = Chr(210)
    a("Oacute") = Chr(211)
    a("Ocirc") = Chr(212)
    a("Otilde") = Chr(213)
    a("Ouml") = Chr(214)
    a("times") = Chr(215)
    a("Oslash") = Chr(216)
    a("Ugrave") = Chr(217)
    a("Uacute") = Chr(218)
    a("Ucirc") = Chr(219)
    a("Uuml") = Chr(220)
    a("Yacute") = Chr(221)
    a("THORN") = Chr(222)
    a("szlig") = Chr(223)
    a("agrave") = Chr(224)
    a("aacute") = Chr(225)
    a("acirc") = Chr(226)
    a("atilde") = Chr(227)
    a("auml") = Chr(228)
    a("aring") = Chr(229)
    a("aelig") = Chr(230)
    a("ccedil") = Chr(231)
    a("egrave") = Chr(232)
    a("eacute") = Chr(233)
    a("ecirc") = Chr(234)
    a("euml") = Chr(235)
    a("igrave") = Chr(236)
    a("iacute") = Chr(237)
    a("icirc") = Chr(238)
    a("iuml") = Chr(239)
    a("eth") = Chr(240)
    a("ntilde") = Chr(241)
    a("ograve") = Chr(242)
    a("oacute") = Chr(243)
    a("ocirc") = Chr(244)
    a("otilde") = Chr(245)
    a("ouml") = Chr(246)
    a("divide") = Chr(247)
    a("oslash") = Chr(248)
    a("ugrave") = Chr(249)
    a("uacute") = Chr(250)
    a("ucirc") = Chr(251)
    a("uuml") = Chr(252)
    a("yacute") = Chr(253)
    a("thorn") = Chr(254)
    a("yuml") = Chr(255)
    ;include unicode characters in
    a("OElig") = Chr(338)
    a("oelig") = Chr(339)
    a("Scaron") = Chr(352)
    a("scaron") = Chr(353)
    a("Yuml") = Chr(376)
    a("fnof") = Chr(402)
    a("circ") = Chr(710)
    a("tilde") = Chr(732)
    a("Alpha") = Chr(913)
    a("Beta") = Chr(914)
    a("Gamma") = Chr(915)
    a("Delta") = Chr(916)
    a("Epsilon") = Chr(917)
    a("Zeta") = Chr(918)
    a("Eta") = Chr(919)
    a("Theta") = Chr(920)
    a("Iota") = Chr(921)
    a("Kappa") = Chr(922)
    a("Lambda") = Chr(923)
    a("Mu") = Chr(924)
    a("Nu") = Chr(925)
    a("Xi") = Chr(926)
    a("Omicron") = Chr(927)
    a("Pi") = Chr(928)
    a("Rho") = Chr(929)
    a("Sigma") = Chr(931)
    a("Tau") = Chr(932)
    a("Upsilon") = Chr(933)
    a("Phi") = Chr(934)
    a("Chi") = Chr(935)
    a("Psi") = Chr(936)
    a("Omega") = Chr(937)
    a("alpha") = Chr(945)
    a("beta") = Chr(946)
    a("gamma") = Chr(947)
    a("delta") = Chr(948)
    a("epsilon") = Chr(949)
    a("zeta") = Chr(950)
    a("eta") = Chr(951)
    a("theta") = Chr(952)
    a("iota") = Chr(953)
    a("kappa") = Chr(954)
    a("lambda") = Chr(955)
    a("mu") = Chr(956)
    a("nu") = Chr(957)
    a("xi") = Chr(958)
    a("omicron") = Chr(959)
    a("pi") = Chr(960)
    a("rho") = Chr(961)
    a("sigmaf") = Chr(962)
    a("sigma") = Chr(963)
    a("tau") = Chr(964)
    a("upsilon") = Chr(965)
    a("phi") = Chr(966)
    a("chi") = Chr(967)
    a("psi") = Chr(968)
    a("omega") = Chr(969)
    a("thetasym") = Chr(977)
    a("upsih") = Chr(978)
    a("piv") = Chr(982)
    a("ensp") = Chr(8194)
    a("emsp") = Chr(8195)
    a("thinsp") = Chr(8201)
    a("zwnj") = Chr(8204)
    a("zwj") = Chr(8205)
    a("lrm") = Chr(8206)
    a("rlm") = Chr(8207)
    a("ndash") = Chr(8211)
    a("mdash") = Chr(8212)
    a("lsquo") = Chr(8216)
    a("rsquo") = Chr(8217)
    a("sbquo") = Chr(8218)
    a("ldquo") = Chr(8220)
    a("rdquo") = Chr(8221)
    a("bdquo") = Chr(8222)
    a("dagger") = Chr(8224)
    a("Dagger") = Chr(8225)
    a("bull") = Chr(8226)
    a("hellip") = Chr(8230)
    a("permil") = Chr(8240)
    a("prime") = Chr(8242)
    a("Prime") = Chr(8243)
    a("lsaquo") = Chr(8249)
    a("rsaquo") = Chr(8250)
    a("oline") = Chr(8254)
    a("frasl") = Chr(8260)
    a("euro") = Chr(8364) ;new euro sign
    a("image") = Chr(8465)
    a("weierp") = Chr(8472)
    a("real") = Chr(8476)
    a("trade") = Chr(8482)
    a("alefsym") = Chr(8501)
    a("larr") = Chr(8592)
    a("uarr") = Chr(8593)
    a("rarr") = Chr(8594)
    a("darr") = Chr(8595)
    a("harr") = Chr(8596)
    a("crarr") = Chr(8629)
    a("lArr") = Chr(8656)
    a("uArr") = Chr(8657)
    a("rArr") = Chr(8658)
    a("dArr") = Chr(8659)
    a("hArr") = Chr(8660)
    a("forall") = Chr(8704)
    a("part") = Chr(8706)
    a("exist") = Chr(8707)
    a("empty") = Chr(8709)
    a("nabla") = Chr(8711)
    a("isin") = Chr(8712)
    a("notin") = Chr(8713)
    a("ni") = Chr(8715)
    a("prod") = Chr(8719)
    a("sum") = Chr(8721)
    a("minus") = Chr(8722)
    a("lowast") = Chr(8727)
    a("radic") = Chr(8730)
    a("prop") = Chr(8733)
    a("infin") = Chr(8734)
    a("ang") = Chr(8736)
    a("And") = Chr(8743)
    a("Or") = Chr(8744)
    a("cap") = Chr(8745)
    a("cup") = Chr(8746)
    a("int") = Chr(8747)
    a("there4") = Chr(8756)
    a("sim") = Chr(8764)
    a("cong") = Chr(8773)
    a("asymp") = Chr(8776)
    a("ne") = Chr(8800)
    a("equiv") = Chr(8801)
    a("le") = Chr(8804)
    a("ge") = Chr(8805)
    a("sub") = Chr(8834)
    a("sup") = Chr(8835)
    a("nsub") = Chr(8836)
    a("sube") = Chr(8838)
    a("supe") = Chr(8839)
    a("oplus") = Chr(8853)
    a("otimes") = Chr(8855)
    a("perp") = Chr(8869)
    a("sdot") = Chr(8901)
    a("vellip") = Chr(8942)
    a("lceil") = Chr(8968)
    a("rceil") = Chr(8969)
    a("lfloor") = Chr(8970)
    a("rfloor") = Chr(8971)
    a("lang") = Chr(9001)
    a("rang") = Chr(9002)
    a("loz") = Chr(9674)
    a("spades") = Chr(9824)
    a("clubs") = Chr(9827)
    a("hearts") = Chr(9829)
    a("diams") = Chr(9830)
  EndIf
  ;decode HTML text
  Repeat
    ;lookup code prefix
    xl = FindString(Text, "&", xl + 1)
    If Not xl
      Break
    EndIf
    ;lookup code suffix
    xr = FindString(Text, ";", xl)
    If Not xr
      Break
    EndIf
    ;get identifier between
    encode = Mid(Text, xl + 1, xr - xl - 1)
    ;lookup ascii code
    If FindMapElement(a(), encode)
      Text = ReplaceString(Text, "&" + encode + ";", a())
      xl - 1 ;this position must be checked again
      ;or get from numeric char ref
    ElseIf Left(encode, 1) = "#"
      ;http://en.wikipedia.org/wiki/HTML_decimal_character_rendering#HTML_character_references
      ;take care on hex
      ascii = Val("$" + Right(encode, 4))
      If ascii
        Text = ReplaceString(Text, "&" + encode + ";", Chr(ascii))
      EndIf
    EndIf
  ForEver
  ProcedureReturn Text
EndProcedure
Procedure.s HtmlTag()
  ;return tag name
  Protected *p
  If *_HtmlTagStart And *_HtmlTagEnd > *_HtmlTagStart
    For *p = *_HtmlTagStart To *_HtmlTagEnd
      Select PeekB(*p)
      Case ' ', '>'
        Break
      EndSelect
    Next
    ;convert to lower case for easier comparison
    ProcedureReturn LCase(PeekS(*_HtmlTagStart, *p - *_HtmlTagStart, #PB_UTF8))
  EndIf
EndProcedure
Procedure.s HtmlText()
  ;return text behind tag
  Protected Text.s
  If *_HtmlTextStart And *_HtmlTextEnd > *_HtmlTextStart
    Text = PeekS(*_HtmlTextStart, *_HtmlTextEnd - *_HtmlTextStart, #PB_UTF8 | #PB_ByteLength)
    ;remove XML formating
    Text = ReplaceString(Text, #TAB$, "")
    ProcedureReturn HtmlDecode(Text)
  EndIf
EndProcedure
Procedure.s HtmlAttribute(Name.s)
  ;return an argument
  Protected namelen, *i, quote, j, *valuestart
  
  ;lookup argument name
  namelen = Len(Name)
  For *i = *_HtmlTagStart + 1 To *_HtmlTagEnd - namelen - 4
    
    ;compare full name (set bit 5 to compare case insensitive)
    For j = 0 To namelen - 1
      If PeekB(*i + j) | %100000 <> PeekB(@Name + j + j) | %100000
        Goto NextChar
      EndIf
    Next
    
    ;check character before (space needed)
    If PeekB(*i - 1) = ' '
      *i + namelen
      ;skip spaces
      While PeekB(*i) = ' ' And *i < *_HtmlTagEnd
        *i + 1
      Wend
      ;=
      If PeekB(*i) = '='
        *i + 1
        ;skip spaces
        While PeekB(*i) = ' ' And *i < *_HtmlTagEnd
          *i + 1
        Wend
        ;check quote
        quote = PeekB(*i)
        If quote = '"' Or quote = 39 ;'
          *i + 1
          *valuestart = *i
          ;look for closing quote
          While PeekB(*i) <> quote And *i < *_HtmlTagEnd
            *i + 1
          Wend
          ;take value
          ProcedureReturn PeekS(*valuestart, *i - *valuestart, #PB_UTF8 | #PB_ByteLength)
        EndIf
      EndIf
    EndIf
    
    NextChar:
  Next
  
EndProcedure
Procedure HtmlNext(Tag.s=#Empty$)
  ;lookup next Tag with text
  While *_HtmlPointer < *_HtmlBufferEnd
    ;tag start
    If PeekB(*_HtmlPointer) = '<'
      *_HtmlTagStart = *_HtmlPointer + 1
      ;tag end
    ElseIf PeekB(*_HtmlPointer) = '>'
      *_HtmlTagEnd = *_HtmlPointer
      ;get text
      *_HtmlPointer + 1
      *_HtmlTextStart = *_HtmlPointer
      While *_HtmlPointer < *_HtmlBufferEnd
        If PeekB(*_HtmlPointer) = '<'
          *_HtmlTextEnd = *_HtmlPointer
          If Tag = #Empty$ Or Tag = HtmlTag()
            ProcedureReturn #True
          Else
            Goto Unwanted
          EndIf
        EndIf
        *_HtmlPointer + 1
      Wend
    EndIf
    *_HtmlPointer + 1
    Unwanted:
  Wend
  FreeMemory(*_HtmlBufferStart)
EndProcedure
Procedure HtmlLoad(Filename.s)
  Protected f = ReadFile(#PB_Any, Filename)
  If f
    *_HtmlBufferStart = AllocateMemory(Lof(f), #PB_Memory_NoClear)
    ReadData(f, *_HtmlBufferStart, Lof(f))
    *_HtmlBufferEnd = *_HtmlBufferStart + Lof(f)
    *_HtmlPointer = *_HtmlBufferStart
    CloseFile(f)
    ProcedureReturn #True
  EndIf
EndProcedure
Procedure HtmlReceive(Url.s)
  If InitNetwork()
    *_HtmlBufferStart = ReceiveHTTPMemory(Url)
    If *_HtmlBufferStart
      *_HtmlBufferEnd = *_HtmlBufferStart + MemorySize(*_HtmlBufferStart)
      *_HtmlPointer = *_HtmlBufferStart
      ProcedureReturn #True
    EndIf
  EndIf
EndProcedure

DisableExplicit

CompilerIf #PB_Compiler_IsMainFile
  
  If HtmlReceive("http://www.purebasic.com")
    While HtmlNext()
      Select HtmlTag()
      Case "title"
        Debug HtmlText()
      Case "a"
        href.s = HtmlAttribute("href")
        If Left(href, 6) = "mailto"
          Debug href
        EndIf
      EndSelect
    Wend
  EndIf
  
CompilerEndIf
I don't like modules so every private stuff starts with an underscore and shouldn't be touched. Please let me know, if you see an error or can improve this or that.

Regards
Uwe

Re: Parsing an HTML website

Posted: Tue Mar 27, 2018 7:29 pm
by Pypsikan
Here memory leak.
sry for necrobump.

Re: Parsing an HTML website

Posted: Sat Mar 31, 2018 8:00 pm
by Kwai chang caine
Not for me in W10 v5.61 x86
It's never too late for thanks for sharing 8)

Re: Parsing an HTML website

Posted: Mon Sep 27, 2021 10:27 am
by Rinzwind
Surprised no HTML parser is included with PB! Something that should be there IMHO, modern times etc... Anyway, thanks for sharing 5 years ago ;)

Re: Parsing an HTML website

Posted: Tue Sep 28, 2021 4:43 am
by Rinzwind
A small addition to load from string:

Code: Select all

Procedure LoadHtmlText(Html.s)
  Protected size

  If FindString(Html, "<")
    size = StringByteLength(Html, #PB_UTF8)
    *_HtmlBufferStart = AllocateMemory(size, #PB_Memory_NoClear)
    PokeS(*_HtmlBufferStart, Html, size, #PB_UTF8 | #PB_String_NoZero)
    *_HtmlBufferEnd = *_HtmlBufferStart + size
    *_HtmlPointer = *_HtmlBufferStart      
    ProcedureReturn #True
  EndIf
EndProcedure

Re: Parsing an HTML website

Posted: Tue Sep 28, 2021 8:51 am
by Mesa
Something is wrong with sites written in iso-8859-1, some characters don't appear like "รจ", ...

M.

Re: Parsing an HTML website

Posted: Tue Sep 28, 2021 11:24 am
by Rinzwind
It assumes utf-8

Bugfix HtmlDecode:

Code: Select all

    If FindMapElement(a(), encode)
      Text = ReplaceString(Text, "&" + encode + ";", a())
      xl - 1 ;this position must be checked again
             ;or get from numeric char ref
    ElseIf Left(encode, 2) = "#x"
      ascii = Val("$" + Right(encode, Len(encode) -2))
      If ascii
        Text = ReplaceString(Text, "&" + encode + ";", Chr(ascii))
      EndIf      
    ElseIf Left(encode, 1) = "#"
      ;http://en.wikipedia.org/wiki/HTML_decimal_character_rendering#HTML_character_references
      ;take care on hex
      ascii = Val("$" + Right(encode, Len(encode) -1))
      If ascii
        Text = ReplaceString(Text, "&" + encode + ";", Chr(ascii))
      EndIf
(see wiki link)

Re: Parsing an HTML website

Posted: Fri Oct 08, 2021 8:55 pm
by Kwai chang caine
Thanks Rinzwind for this adding 8)
Uwekel made a real good and useful works 8)

Re: Parsing an HTML website

Posted: Fri Aug 26, 2022 6:58 pm
by Seymour Clufley
This is very useful, Uwekel. Thank you.

Re: Parsing an HTML website

Posted: Fri Aug 26, 2022 7:33 pm
by infratec
But be careful ...

the code is from ASCII times :wink:

Re: Parsing an HTML website

Posted: Sat Aug 27, 2022 1:31 am
by BarryG
.

Re: Parsing an HTML website

Posted: Mon Aug 29, 2022 7:31 am
by dige

Code: Select all

FindMapElement(LCase(a()), encode)
What is Lcase() for around the map parameter?