; Code: Select all
; Require every variable to be declared before it is used.
EnableExplicit

; Whitelist of tag names. A value of 1 marks the tag as allowed.
; GetTagFromPointer() consults this map only when
; #RemoveOnlyAllowedTags is non-zero.
Global NewMap allowedTags()

; Registers one tag name in the whitelist.
Macro AddTag(string)
    allowedTags(string) = 1
EndMacro

; Headings
AddTag("h1")
AddTag("h2")
AddTag("h3")

; Inline formatting and paragraphs
AddTag("b")
AddTag("i")
AddTag("u")
AddTag("p")

; Containers
AddTag("div")
;AddTag("br")

; Compile-time switch: 0 = every syntactically valid tag name is accepted,
; non-zero = only names registered in allowedTags() are accepted.
#RemoveOnlyAllowedTags = 0
; Advances charPointer past any run of SPACE and TAB characters.
; The string terminator (0) is neither of the two, so the scan
; always stops at the end of the string at the latest.
Macro EatWhiteSpace(charPointer)
        While charPointer\c = ' ' Or charPointer\c = #TAB
                charPointer + SizeOf(Character)
        Wend
EndMacro
; Advances charPointer up to and including the next occurrence of
; charToSearch. If the string ends first, charPointer is left on the
; terminating 0 (unless charToSearch itself is 0, in which case the
; terminator is skipped as well).
Macro EatEverythingIncludingChar(charPointer, charToSearch)
        While charPointer\c <> 0 And charPointer\c <> charToSearch
                charPointer + SizeOf(Character)               ; skip everything in front of charToSearch
        Wend
        If charPointer\c = charToSearch
                charPointer + SizeOf(Character)               ; step over charToSearch itself
        EndIf
EndMacro
; Extracts a tag name (identifier) starting at *pInput.
;
; *pInput     - pointer to the first character after the '<' of a tag.
; *foundSlash - optional (may be 0) output variable; it is set to #True when
;               the name is preceded by '/' (closing tag) and a valid name
;               was returned, otherwise it is set to 0.
;
; Returns the lower-cased tag name, or "" when *pInput is 0, when no valid
; name follows, or (only if #RemoveOnlyAllowedTags is non-zero) when the
; name is not registered with value <> 0 in allowedTags().
;
; A valid name starts with a-z/A-Z and continues with a-z/A-Z/0-9. Leading
; SPACE/TAB characters are skipped. A '/' located after the name (as in
; "<br />") is NOT detected; such a tag is reported like an opening tag.
Procedure.s GetTagFromPointer(*pInput.Character,*foundSlash.Integer)
    Protected result.s, slash
    Protected *nameStart.Character

    ; Always reset the output variable so the caller never sees a stale value,
    ; even when *pInput is 0.
    If *foundSlash : *foundSlash\i = 0 : EndIf

    If *pInput
        EatWhiteSpace(*pInput)                                ; skip leading whitespace

        If *pInput\c = '/'                                    ; optional '/' in front of the name
            slash = #True
            *pInput + SizeOf(Character)
        EndIf

        ; identifiers start with a-zA-Z ...
        If (*pInput\c >= 'a' And *pInput\c <= 'z') Or (*pInput\c >= 'A' And *pInput\c <= 'Z')
            *nameStart = *pInput

            ; ... followed by a-zA-Z0-9. The terminating 0 matches none of the
            ; ranges, so the scan cannot run past the end of the string.
            While (*pInput\c >= 'a' And *pInput\c <= 'z') Or
                  (*pInput\c >= 'A' And *pInput\c <= 'Z') Or
                  (*pInput\c >= '0' And *pInput\c <= '9')
                *pInput + SizeOf(Character)
            Wend

            ; Copy the whole name in one go (instead of appending char by char)
            ; and normalise it: tag names are handled in lowercase internally.
            result = LCase(PeekS(*nameStart, (*pInput - *nameStart) / SizeOf(Character)))

            CompilerIf #RemoveOnlyAllowedTags <> 0
                ; Look the name up with FindMapElement() so that checking an
                ; unknown tag does not depend on (or trigger) implicit creation
                ; of a new element in the global whitelist.
                If FindMapElement(allowedTags(), result) = 0
                    ProcedureReturn ""                        ; not registered at all
                ElseIf allowedTags() = 0
                    ProcedureReturn ""                        ; registered, but not marked as allowed
                EndIf
            CompilerEndIf

            If slash And *foundSlash                          ; report the closing-tag state
                *foundSlash\i = #True
            EndIf

            ProcedureReturn result
        EndIf
    EndIf

    ProcedureReturn ""
EndProcedure
; Extracts the text content of the first <tagToExtract> element found in
; inputText at or after character offset startPos.
;
; inputText    - the markup to scan.
; tagToExtract - tag name to look for (compared case-insensitively).
; *foundAtPos  - optional (may be 0) output variable; receives the 0-based
;                character offset of the '<' of the first matching opening
;                tag, or -1 when no matching opening tag was found.
; startPos     - 0-based character offset at which the scan starts. Negative
;                values are treated as 0; values at or beyond the end of
;                inputText yield "" with *foundAtPos = -1.
;
; Returns the characters between the matching opening tag and its closing
; tag with all recognised tags inside removed. Nested elements of the same
; name are counted, so the result ends at the closing tag that balances the
; first opening tag. If the end of the input is reached before that closing
; tag, everything recorded so far is returned.
;
; Limitation: a '/' after the tag name (e.g. "<br />") is not recognised by
; GetTagFromPointer(), so such a tag counts as an opening tag that is never
; closed.
Procedure.s ExtractTag(inputText.s,tagToExtract.s,*foundAtPos.Integer,startPos=0)
    Protected result.s, recording, tag.s, foundSlash, startPosWritten
    Protected NewMap tagCount()                                                   ; open-tag counter per tag name
    Protected *pInput.Character = @inputText
    Protected inputLength = Len(inputText)

    If *foundAtPos : *foundAtPos\i = -1 : EndIf
    tagToExtract = LCase(tagToExtract)

    ; Keep the scan pointer inside the string buffer.
    If startPos < 0 : startPos = 0 : EndIf
    If *pInput = 0 Or startPos >= inputLength
        ProcedureReturn ""
    EndIf

    *pInput + startPos*SizeOf(Character)                                          ; begin search at 'inputText' + 'startPos' chars

    ; Test the terminator BEFORE processing a character, so the pointer can
    ; never be advanced past the end of the string.
    While *pInput\c
        If *pInput\c = '<'                                                        ; found tag start char '<'
            tag = GetTagFromPointer(*pInput+SizeOf(Character),@foundSlash)        ; get the identifier/name of the tag
            If tag
                If foundSlash                                                     ; closing tag was found
                    If FindMapElement(tagCount(),tag)                             ; ignore closing tags that were never opened
                        If tagCount(tag) > 0
                            tagCount(tag) - 1
                        EndIf
                        If tag = tagToExtract And tagCount(tag) = 0
                            ProcedureReturn result                                ; outermost target element is complete
                        EndIf
                    EndIf
                Else                                                              ; opening tag was found
                    tagCount(tag) + 1
                    If tag = tagToExtract
                        If startPosWritten=0 And *foundAtPos
                            *foundAtPos\i = (*pInput - @inputText)/SizeOf(Character)
                            startPosWritten = 1                                   ; report only the first match
                        EndIf
                        recording = #True                                         ; we found our start tag, start recording
                    EndIf
                EndIf
                EatEverythingIncludingChar(*pInput,'>')                           ; ignore all other stuff after the tag name
            Else                                                                  ; no allowed or valid tag name was found:
                If recording                                                      ; treat the '<' as ordinary text
                    result + Chr(*pInput\c)
                EndIf
                *pInput + SizeOf(Character)
            EndIf
        Else
            If recording                                                          ; we are after the start tag, so
                result + Chr(*pInput\c)                                           ; add all chars outside tags to the result
            EndIf
            *pInput + SizeOf(Character)
        EndIf
    Wend

    ProcedureReturn result
EndProcedure
; ---------------------------------------------------------------------------
; Demo: runs ExtractTag() on a few sample inputs and prints the results to
; the debug output. 'pos' receives the offset of the matching opening tag,
; or -1 when the tag was not found.
; ---------------------------------------------------------------------------
Define text.s, pos

; Sample document shared by the last three demos (built once instead of
; being re-assembled for every call).
Define html.s = "<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag.</div><br />"+
                "<div id='1'>This is <b><u>the</u></b> <div id=2>second content</div> of <i>the</i> tag</div></p>"

; nested <div> inside <div>
text = ExtractTag("<div>This is <div>the content</div> of the div</div>","div",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf
Debug "---------------"

; <div> that is not the outermost element
text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag</div></p>","div",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf
Debug "---------------"

; outermost <p>
text = ExtractTag(html,"p",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf
Debug "---------------"

; <br /> has no closing tag, so everything up to the end of the input is returned
text = ExtractTag(html,"br",@pos)
If pos = -1
    Debug "nothing found."
Else
    Debug "Extracted text (pos:"+pos+") "+text
EndIf
Debug "---------------"

;
; find all <div>
;
Define startpos = 0
Repeat
    text = ExtractTag(html,"div",@pos,startpos)
    If pos <> -1
        Debug "Extracted text (pos:"+pos+") "+text
        startpos = pos+1                                      ; continue right behind the '<' just found
    EndIf
Until pos = -1
; EDIT2: added some comments


