Page 2 of 3

Re: I need to do a fast string search.

Posted: Mon Aug 11, 2014 5:57 am
by Danilo
Small start for strings:

Code: Select all

EnableExplicit

Global NewMap allowedTags()
Macro AddTag(string):allowedTags(string) = 1:EndMacro

AddTag( "h1") : AddTag("h2") : AddTag( "h3")
AddTag( "b" ) : AddTag("i" ) : AddTag( "u" ) : AddTag("p")
AddTag("div") : ;AddTag("br")

#RemoveOnlyAllowedTags = 0

Macro EatWhiteSpace(charPointer)
        While charPointer\c And ( charPointer\c = ' ' Or charPointer\c = #TAB )   ; remove SPACE and TAB
                charPointer + SizeOf(Character)
        Wend
EndMacro

Macro EatEverythingIncludingChar(charPointer, charToSearch)
        While charPointer\c And charPointer\c <> charToSearch                     ; remove any chars until
                charPointer + SizeOf(Character)                                   ; charToSearch was found
        Wend
        If charPointer\c = charToSearch : charPointer + SizeOf(Character) : EndIf ; remove charToSearch, too
EndMacro

Procedure.s GetTagFromPointer(*pInput.Character,*foundSlash.Integer)              ; extracts an identifier/name/word
    Protected result.s, slash                                                     ;
    If *pInput                                                                    ;
        If *foundSlash : *foundSlash\i = 0 : EndIf                                ; reset: *foundSlash\i = 0
        EatWhiteSpace(*pInput)                                                    ; remove whitespace
        If *pInput\c = '/'                                                        ;
            slash = #True                                                         ; check for optional '/' in front
            *pInput + SizeOf(Character)                                           ; of the identifier
        EndIf
        If (*pInput\c >= 'a' And *pInput\c <= 'z') Or (*pInput\c >= 'A' And *pInput\c <= 'Z') ; identifiers start with a-zA-Z,
            While *pInput\c And ((*pInput\c >= 'a' And *pInput\c <= 'z') Or                   ; followed by a-zA-Z0-9
                  (*pInput\c >= 'A' And *pInput\c <= 'Z') Or                                  ;
                  (*pInput\c >= '0' And *pInput\c <= '9'))                                    ;
                result + Chr( *pInput\c )                                                     ; extract the identifier (tag name)
                *pInput + SizeOf(Character)
            Wend
            result = LCase(result)                                                            ; we work internally with lowercase
                                                                                              ; tag names
            CompilerIf #RemoveOnlyAllowedTags <> 0                                ; CompilerIf to check if extracted
                If allowedTags(result)=0                                          ; tag name is within the list of
                    ProcedureReturn ""                                            ; allowed tags
                EndIf
            CompilerEndIf
            If slash And *foundSlash                                              ; write slash state to output variable
                *foundSlash\i = #True
            EndIf
            ProcedureReturn result                                                ; return the result
        EndIf
    EndIf
EndProcedure

Procedure.s ExtractTag(inputText.s,tagToExtract.s,*foundAtPos.Integer,startPos=0)
    Protected result.s, recording, tag.s, foundSlash, startPosWritten
    Protected NewMap tagCount()
    Protected *pInput.Character = @inputText
    If *foundAtPos : *foundAtPos\i = -1 : EndIf
    tagToExtract = LCase(tagToExtract)
    If *pInput
        *pInput + startPos*SizeOf(Character)                                      ; begin search at 'inputText' + 'startPos' chars
        Repeat
            If *pInput\c = '<'                                                    ; found tag start char '<'
                tag = GetTagFromPointer(*pInput+SizeOf(Character),@foundSlash)    ; get the identifier/name of the tag
                If tag
                    If foundSlash                                                 ; ending tag was found
                        If FindMapElement(tagCount(),tag)
                            If tagCount(tag) > 0
                                tagCount(tag) - 1
                            EndIf
                            If tag = tagToExtract And tagCount(tag) = 0
                                ProcedureReturn result
                            EndIf
                        EndIf
                    Else                                                          ; start tag was found
                        tagCount(tag) + 1
                        If tag = tagToExtract
                            If startPosWritten=0 And *foundAtPos
                                *foundAtPos\i = (*pInput - @inputText)/SizeOf(Character)
                                startPosWritten = 1
                            EndIf
                            recording = #True                                     ; we found our start tag, start recording
                        EndIf
                    EndIf
                    EatEverythingIncludingChar(*pInput,'>')                       ; ignore all other stuff after the tag name
                Else                                                              ; no allowed or valid tag name was found:
                    If recording                                                  ;
                        result + Chr(*pInput\c)                                   ; add chars to the result
                    EndIf
                    *pInput + SizeOf(Character)
                EndIf
            Else
                If recording                                                      ; we are after the start tag, so
                    result + Chr(*pInput\c)                                       ; add all chars outside tags to the result
                EndIf
                *pInput + SizeOf(Character)
            EndIf
        Until *pInput\c = 0
    EndIf
    ProcedureReturn result.s
EndProcedure

Define text.s, pos

text = ExtractTag("<div>This is <div>the content</div> of the div</div>","div",@pos)
If text = ""
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf

Debug "---------------"

text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag</div></p>","div",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf

Debug "---------------"

text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag.</div><br />"+
                  "<div id='1'>This is <b><u>the</u></b> <div id=2>second content</div> of <i>the</i> tag</div></p>","p",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf

Debug "---------------"

text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag.</div><br />"+
                  "<div id='1'>This is <b><u>the</u></b> <div id=2>second content</div> of <i>the</i> tag</div></p>","br",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf

Debug "---------------"

;
; find all <div>
;
Define startpos = 0

Repeat
    text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag.</div><br />"+
                      "<div id='1'>This is <b><u>the</u></b> <div id=2>second content</div> of <i>the</i> tag</div></p>","div",@pos,startpos)
    If pos <> -1
        Debug "Extracted text (pos:"+pos+") "+text
        startpos = pos+1
    EndIf
Until pos = -1
EDIT: changed it, see last example (get all <div> elements)
EDIT2: added some comments

Re: I need to do a fast string search.

Posted: Mon Aug 11, 2014 7:42 am
by wilbert
My attempt :D
Hopefully fast enough ...

Parser and strip tags procedure

Code: Select all

; Simple HTML parser v1.4

#CharSize = SizeOf(Character)
#CharSizeShift = #CharSize - 1

Structure TagValue
  q1.q
  q2.q
EndStructure

Structure Tag
  Name.s      ; tag name
  NameValue.q ; lowercase ascii representation of first 8 characters
  Start.l     ; inner text start pos 
  Length.l    ; length of inner text
EndStructure

Global Dim CharacterType.a($ffff)

Procedure.s StripHTMLTags(Source.s)
  Protected *src.Character = @Source
  Protected *dst.Character = @Source
  Protected Ignore.i
  If *src\c
    Repeat
      Select *src\c
        Case 60; '<'
          Ignore = #True
        Case 62; '>'
          Ignore = #False
        Default
          If Ignore = #False
            *dst\c = *src\c
            *dst + #CharSize
          EndIf
      EndSelect
      *src + #CharSize
    Until *src\c = 0
    *dst\c = 0
  EndIf
  ProcedureReturn PeekS(@Source)  
EndProcedure

Procedure.q TagNameValue(TagName.s)
  Protected Value.q
  PokeS(@Value, LCase(TagName), 8, #PB_Ascii)
  ProcedureReturn Value
EndProcedure

Procedure InitHTMLParser()
  Protected c.c
  CharacterType( 9) = 1       ; Whitespace
  CharacterType(10) = 1       ; Whitespace
  CharacterType(13) = 1       ; Whitespace
  CharacterType(32) = 1       ; Whitespace
  CharacterType(33) = 2       ; '!'
  CharacterType(47) = 3       ; '/'
  CharacterType(60) = 4       ; '<'
  CharacterType(62) = 5       ; '>'
  For c = 48 To 57
    CharacterType(c) = 6      ; 0-9
  Next
  For c = 65 To 90
    CharacterType(c) = 7      ; A-Z and a-z
    CharacterType(c | 32) = 7
  Next
EndProcedure

InitHTMLParser()

Procedure ParseHTML(Source.s, List TagList.Tag())
  Protected *Tag.Tag, *TagName.Ascii, TagValue.TagValue
  Protected.i ProcessingTagName, ContinueTagName, IsClosingTag, InnerEndPos, Pos
  Protected *c.Character = @Source
  ClearList(TagList())
  If *c\c
    Repeat
      If ProcessingTagName
        Select CharacterType(*c\c)
          Case 1              ; Whitespace
            ContinueTagName = #False
          Case 2              ; '!'
            If ContinueTagName
              ProcessingTagName = #False
              If TagValue\q1 = 0 And Mid(Source, InnerEndPos, 4) = "<!--"
                ; Skip comment
                Pos = FindString (Source, "-->", InnerEndPos + 4)
                If Pos
                  *c = @Source + (Pos + 1) << #CharSizeShift
                Else
                  Break  
                EndIf
              EndIf
            EndIf
          Case 3              ; '/'
            If TagValue\q1 = 0
              IsClosingTag = #True
            EndIf
          Case 5              ; '>'
            ProcessingTagName = #False
            *TagName\a = 0
            ; *** Process tag code ***
            If IsClosingTag
              *Tag = LastElement(TagList())
              While *Tag
                If *Tag\Length = 0 And *Tag\NameValue = TagValue\q1
                  *Tag\Length = InnerEndPos - *Tag\Start
                  If *Tag\Length = 0
                    *Tag\Length = -1
                  EndIf
                  Break
                EndIf
                *Tag = PreviousElement(TagList())
              Wend
              LastElement(TagList())
            Else
              *Tag = AddElement(TagList())
              *Tag\Name = PeekS(@TagValue, 16, #PB_Ascii)
              *Tag\NameValue = TagValue\q1
              *Tag\Start = (*c - @Source) >> #CharSizeShift + 2
              If TagValue\q1 = $747069726373
                ; Handle script tag
                Pos = FindString(Source, "</script", InnerEndPos + 8, #PB_String_NoCase)
                If Pos
                  *c = @Source + (Pos - 2) << #CharSizeShift
                Else
                  Break  
                EndIf
              EndIf
            EndIf
            ; *** End of process tag code ***
          Case 6              ; 0-9
            If TagValue\q1 = 0
              ProcessingTagName = #False
            ElseIf ContinueTagName And *TagName - @TagValue < 15
              *TagName\a = *c\c
              *TagName + 1
            EndIf
          Case 7              ; A-Z a-z
            If ContinueTagName And *TagName - @TagValue < 15
              *TagName\a = *c\c | $20
              *TagName + 1
            EndIf
          Default
            If ContinueTagName
              ProcessingTagName = #False
            EndIf
        EndSelect
      ElseIf *c\c = 60
        InnerEndPos = (*c - @Source) >> #CharSizeShift + 1
        ProcessingTagName = #True
        ContinueTagName = #True
        IsClosingTag = #False
        *TagName = @TagValue
        TagValue\q1 = 0
      EndIf
      *c + #CharSize
    Until *c\c = 0
    ; remove empty tags
    ResetList(TagList())
    While NextElement(TagList())
      If TagList()\Length < 1
        DeleteElement(TagList())
      EndIf
    Wend
  EndIf
EndProcedure
Example 1

Code: Select all

Source.s = "<div>This is<Div align='left'> a</div> test</div>"
NewList TagList.Tag()

ParseHTML(Source, TagList())

ResetList(TagList())
While NextElement(TagList())
  *Tag.Tag = TagList()
  Debug *Tag\Name
  Debug "Start pos : " + *Tag\Start + " length : " + *Tag\Length
  Debug Mid(Source, *Tag\Start, *Tag\Length)
  Debug StripHTMLTags(Mid(Source, *Tag\Start, *Tag\Length))
  Debug "---"
Wend
Example 2

Code: Select all

H1TagValue.q = TagNameValue("h1") 

Source.s = "<h1>This<br> is<div align='left'> a test</div> to parse html</h1>"
NewList TagList.Tag()

ParseHTML(Source, TagList())

ResetList(TagList())
While NextElement(TagList())
  *Tag.Tag = TagList()
  If *Tag\NameValue = H1TagValue; Show only H1 tags
    Debug *Tag\Name
    Debug "Start pos : " + *Tag\Start + " length : " + *Tag\Length
    Debug Mid(Source, *Tag\Start, *Tag\Length)
    Debug StripHTMLTags(Mid(Source, *Tag\Start, *Tag\Length))
    Debug "---"
  EndIf
Wend
Edit : My first code contained a bug so I updated it.

Re: I need to do a fast string search.

Posted: Mon Aug 11, 2014 12:43 pm
by ricardo
Danilo wrote:Small start for strings:

Code: Select all

EnableExplicit

Global NewMap allowedTags()
Macro AddTag(string):allowedTags(string) = 1:EndMacro

AddTag( "h1") : AddTag("h2") : AddTag( "h3")
AddTag( "b" ) : AddTag("i" ) : AddTag( "u" ) : AddTag("p")
AddTag("div") : ;AddTag("br")

#RemoveOnlyAllowedTags = 0

Macro EatWhiteSpace(charPointer)
        While charPointer\c And ( charPointer\c = ' ' Or charPointer\c = #TAB )   ; remove SPACE and TAB
                charPointer + SizeOf(Character)
        Wend
EndMacro

Macro EatEverythingIncludingChar(charPointer, charToSearch)
        While charPointer\c And charPointer\c <> charToSearch                     ; remove any chars until
                charPointer + SizeOf(Character)                                   ; charToSearch was found
        Wend
        If charPointer\c = charToSearch : charPointer + SizeOf(Character) : EndIf ; remove charToSearch, too
EndMacro

Procedure.s GetTagFromPointer(*pInput.Character,*foundSlash.Integer)              ; extracts an identifier/name/word
    Protected result.s, slash                                                     ;
    If *pInput                                                                    ;
        If *foundSlash : *foundSlash\i = 0 : EndIf                                ; reset: *foundSlash\i = 0
        EatWhiteSpace(*pInput)                                                    ; remove whitespace
        If *pInput\c = '/'                                                        ;
            slash = #True                                                         ; check for optional '/' in front
            *pInput + SizeOf(Character)                                           ; of the identifier
        EndIf
        If (*pInput\c >= 'a' And *pInput\c <= 'z') Or (*pInput\c >= 'A' And *pInput\c <= 'Z') ; identifiers start with a-zA-Z,
            While *pInput\c And ((*pInput\c >= 'a' And *pInput\c <= 'z') Or                   ; followed by a-zA-Z0-9
                  (*pInput\c >= 'A' And *pInput\c <= 'Z') Or                                  ;
                  (*pInput\c >= '0' And *pInput\c <= '9'))                                    ;
                result + Chr( *pInput\c )                                                     ; extract the identifier (tag name)
                *pInput + SizeOf(Character)
            Wend
            result = LCase(result)                                                            ; we work internally with lowercase
                                                                                              ; tag names
            CompilerIf #RemoveOnlyAllowedTags <> 0                                ; CompilerIf to check if extracted
                If allowedTags(result)=0                                          ; tag name is within the list of
                    ProcedureReturn ""                                            ; allowed tags
                EndIf
            CompilerEndIf
            If slash And *foundSlash                                              ; write slash state to output variable
                *foundSlash\i = #True
            EndIf
            ProcedureReturn result                                                ; return the result
        EndIf
    EndIf
EndProcedure

Procedure.s ExtractTag(inputText.s,tagToExtract.s,*foundAtPos.Integer,startPos=0)
    Protected result.s, recording, tag.s, foundSlash, startPosWritten
    Protected NewMap tagCount()
    Protected *pInput.Character = @inputText
    If *foundAtPos : *foundAtPos\i = -1 : EndIf
    tagToExtract = LCase(tagToExtract)
    If *pInput
        *pInput + startPos*SizeOf(Character)                                      ; begin search at 'inputText' + 'startPos' chars
        Repeat
            If *pInput\c = '<'                                                    ; found tag start char '<'
                tag = GetTagFromPointer(*pInput+SizeOf(Character),@foundSlash)    ; get the identifier/name of the tag
                If tag
                    If foundSlash                                                 ; ending tag was found
                        If FindMapElement(tagCount(),tag)
                            If tagCount(tag) > 0
                                tagCount(tag) - 1
                            EndIf
                            If tag = tagToExtract And tagCount(tag) = 0
                                ProcedureReturn result
                            EndIf
                        EndIf
                    Else                                                          ; start tag was found
                        tagCount(tag) + 1
                        If tag = tagToExtract
                            If startPosWritten=0 And *foundAtPos
                                *foundAtPos\i = (*pInput - @inputText)/SizeOf(Character)
                                startPosWritten = 1
                            EndIf
                            recording = #True                                     ; we found our start tag, start recording
                        EndIf
                    EndIf
                    EatEverythingIncludingChar(*pInput,'>')                       ; ignore all other stuff after the tag name
                Else                                                              ; no allowed or valid tag name was found:
                    If recording                                                  ;
                        result + Chr(*pInput\c)                                   ; add chars to the result
                    EndIf
                    *pInput + SizeOf(Character)
                EndIf
            Else
                If recording                                                      ; we are after the start tag, so
                    result + Chr(*pInput\c)                                       ; add all chars outside tags to the result
                EndIf
                *pInput + SizeOf(Character)
            EndIf
        Until *pInput\c = 0
    EndIf
    ProcedureReturn result.s
EndProcedure

Define text.s, pos

text = ExtractTag("<div>This is <div>the content</div> of the div</div>","div",@pos)
If text = ""
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf

Debug "---------------"

text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag</div></p>","div",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf

Debug "---------------"

text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag.</div><br />"+
                  "<div id='1'>This is <b><u>the</u></b> <div id=2>second content</div> of <i>the</i> tag</div></p>","p",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf

Debug "---------------"

text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag.</div><br />"+
                  "<div id='1'>This is <b><u>the</u></b> <div id=2>second content</div> of <i>the</i> tag</div></p>","br",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf

Debug "---------------"

;
; find all <div>
;
Define startpos = 0

Repeat
    text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag.</div><br />"+
                      "<div id='1'>This is <b><u>the</u></b> <div id=2>second content</div> of <i>the</i> tag</div></p>","div",@pos,startpos)
    If pos <> -1
        Debug "Extracted text (pos:"+pos+") "+text
        startpos = pos+1
    EndIf
Until pos = -1
EDIT: changed it, see last example (get all <div> elements)
EDIT2: added some comments

Excellent!! Thanks Danilo, i will test it right now.
This can be runned in threads (each thread for a different content), right?

Best Regards

Re: I need to do a fast string search.

Posted: Mon Aug 11, 2014 12:49 pm
by ricardo
wilbert wrote:My attempt :D
Hopefully fast enough ...

Thanks!! I will run it now.
This can run in threads (one thread for each different content), right?

Re: I need to do a fast string search.

Posted: Mon Aug 11, 2014 1:02 pm
by wilbert
ricardo wrote:This can run in threads (one thread for each different content), right?
It should be safe to let each thread process it's own html document provided you give every thread it's own LinkedList to fill with tag information.
The parsing itself of a 2KB html file to get all tag information probably takes a millisecond or so. After that you can process the tags that were found as you wish.

Re: I need to do a fast string search.

Posted: Tue Aug 30, 2016 10:15 pm
by Oliver13
wilbert wrote:My attempt :D
Hopefully fast enough ...
Works great, thank you.
Now I also would like to process the tag attributes, so I need the whole contents of the tag itself

Code: Select all

Source.s = "<div id='1'>This is<Div align='left' id='2'> a</div> test</div>"
Do you have an idea how to extract this additional tag info ?
*Tag\Contents -> id='1'
*Tag\Contents -> align='left' id='2'

TIA

Re: I need to do a fast string search.

Posted: Wed Aug 31, 2016 5:47 am
by wilbert
You can modify the code a bit to extract the tag with all attributes as a single string but you will have to process that further to get the information you need.

Example of extracting tag with attributes

Code: Select all

#CharSize = SizeOf(Character)
#CharSizeShift = #CharSize - 1

Structure TagValue
  q1.q
  q2.q
EndStructure

Structure Tag
  Name.s      ; tag name
  NameValue.q ; lowercase ascii representation of first 8 characters
  Start.l     ; inner text start pos 
  Length.l    ; length of inner text
EndStructure

Global Dim CharacterType.a($ffff)

Procedure.s StripHTMLTags(Source.s)
  Protected *src.Character = @Source
  Protected *dst.Character = @Source
  Protected Ignore.i
  If *src\c
    Repeat
      Select *src\c
        Case 60; '<'
          Ignore = #True
        Case 62; '>'
          Ignore = #False
        Default
          If Ignore = #False
            *dst\c = *src\c
            *dst + #CharSize
          EndIf
      EndSelect
      *src + #CharSize
    Until *src\c = 0
    *dst\c = 0
  EndIf
  ProcedureReturn PeekS(@Source)  
EndProcedure

Procedure.q TagNameValue(TagName.s)
  Protected Value.q
  PokeS(@Value, LCase(TagName), 8, #PB_Ascii)
  ProcedureReturn Value
EndProcedure

Procedure InitHTMLParser()
  Protected c.c
  CharacterType( 9) = 1       ; Whitespace
  CharacterType(10) = 1       ; Whitespace
  CharacterType(13) = 1       ; Whitespace
  CharacterType(32) = 1       ; Whitespace
  CharacterType(33) = 2       ; '!'
  CharacterType(47) = 3       ; '/'
  CharacterType(60) = 4       ; '<'
  CharacterType(62) = 5       ; '>'
  For c = 48 To 57
    CharacterType(c) = 6      ; 0-9
  Next
  For c = 65 To 90
    CharacterType(c) = 7      ; A-Z and a-z
    CharacterType(c | 32) = 7
  Next
EndProcedure

InitHTMLParser()

Procedure ParseHTML(Source.s, List TagList.Tag())
  Protected *Tag.Tag, *TagName.Ascii, TagValue.TagValue
  Protected.i ProcessingTagName, ContinueTagName, IsClosingTag, InnerEndPos, Pos
  Protected *c.Character = @Source
  ClearList(TagList())
  If *c\c
    Repeat
      If ProcessingTagName
        Select CharacterType(*c\c)
          Case 1              ; Whitespace
            ContinueTagName = #False
          Case 2              ; '!'
            If ContinueTagName
              ProcessingTagName = #False
              If TagValue\q1 = 0 And Mid(Source, InnerEndPos, 4) = "<!--"
                ; Skip comment
                Pos = FindString (Source, "-->", InnerEndPos + 4)
                If Pos
                  *c = @Source + (Pos + 1) << #CharSizeShift
                Else
                  Break  
                EndIf
              EndIf
            EndIf
          Case 3              ; '/'
            If TagValue\q1 = 0
              IsClosingTag = #True
            EndIf
          Case 5              ; '>'
            ProcessingTagName = #False
            *TagName\a = 0
            ; *** Process tag code ***
            If IsClosingTag
              *Tag = LastElement(TagList())
              While *Tag
                If *Tag\Length = 0 And *Tag\NameValue = TagValue\q1
                  *Tag\Length = InnerEndPos - *Tag\Start
                  If *Tag\Length = 0
                    *Tag\Length = -1
                  EndIf
                  Break
                EndIf
                *Tag = PreviousElement(TagList())
              Wend
              LastElement(TagList())
            Else
              *Tag = AddElement(TagList())
              *Tag\NameValue = TagValue\q1
              *Tag\Start = (*c - @Source) >> #CharSizeShift + 2
              *Tag\Name = Mid(Source, InnerEndPos, *Tag\Start - InnerEndPos)
              If TagValue\q1 = $747069726373
                ; Handle script tag
                Pos = FindString(Source, "</script", InnerEndPos + 8, #PB_String_NoCase)
                If Pos
                  *c = @Source + (Pos - 2) << #CharSizeShift
                Else
                  Break  
                EndIf
              EndIf
            EndIf
            ; *** End of process tag code ***
          Case 6              ; 0-9
            If TagValue\q1 = 0
              ProcessingTagName = #False
            ElseIf ContinueTagName And *TagName - @TagValue < 15
              *TagName\a = *c\c
              *TagName + 1
            EndIf
          Case 7              ; A-Z a-z
            If ContinueTagName And *TagName - @TagValue < 15
              *TagName\a = *c\c | $20
              *TagName + 1
            EndIf
          Default
            If ContinueTagName
              ProcessingTagName = #False
            EndIf
        EndSelect
      ElseIf *c\c = 60
        InnerEndPos = (*c - @Source) >> #CharSizeShift + 1
        ProcessingTagName = #True
        ContinueTagName = #True
        IsClosingTag = #False
        *TagName = @TagValue
        TagValue\q1 = 0
      EndIf
      *c + #CharSize
    Until *c\c = 0
    ; remove empty tags
    ResetList(TagList())
    While NextElement(TagList())
      If TagList()\Length < 1
        DeleteElement(TagList())
      EndIf
    Wend
  EndIf
EndProcedure

Re: I need to do a fast string search.

Posted: Thu Sep 01, 2016 6:49 pm
by Oliver13
wilbert wrote:You can modify the code a bit to extract the tag with all attributes as a single string but you will have to process that further to get the information you need.
Great, 1000+tx !