Code: Select all
EnableExplicit

; Whitelist of tag names. A key that is present (value 1) marks an allowed tag.
Global NewMap allowedTags()

; AddTag(name): registers one tag name in the whitelist.
Macro AddTag(tagName)
  allowedTags(tagName) = 1
EndMacro

; headings
AddTag("h1")
AddTag("h2")
AddTag("h3")

; inline formatting
AddTag("b")
AddTag("i")
AddTag("u")

; block elements
AddTag("p")
AddTag("div")
;AddTag("br")   ; commented out, so "br" is not registered

; Compile-time switch: when non-zero, GetTagFromPointer() only reports tag
; names that are present in allowedTags(); with 0 every well-formed tag name
; is reported.
#RemoveOnlyAllowedTags = 0
; EatWhiteSpace(charPointer)
;   Advances charPointer (a *ptr.Character variable) past any run of SPACE
;   and TAB characters. The string terminator (0) is neither of the two, so
;   the loop always stops at the end of the string at the latest.
Macro EatWhiteSpace(charPointer)
  While charPointer\c = ' ' Or charPointer\c = #TAB
    charPointer + SizeOf(Character)   ; step over one blank
  Wend
EndMacro
; EatEverythingIncludingChar(charPointer, charToSearch)
;   Advances charPointer (a *ptr.Character variable) up to and past the first
;   occurrence of charToSearch. If the string ends before charToSearch is
;   seen, charPointer is left on the terminating 0.
Macro EatEverythingIncludingChar(charPointer, charToSearch)
  Repeat
    If charPointer\c = charToSearch
      charPointer + SizeOf(Character)   ; consume charToSearch itself, then stop
      Break
    EndIf
    If charPointer\c = 0                ; end of string reached without a match
      Break
    EndIf
    charPointer + SizeOf(Character)     ; skip any other character
  ForEver
EndMacro
Procedure.s GetTagFromPointer(*pInput.Character,*foundSlash.Integer)
  ; Extracts a tag name (identifier) starting at *pInput.
  ;
  ; *pInput     : pointer to the character right after '<'. Leading SPACE/TAB
  ;               characters are skipped, then an optional '/' is accepted.
  ; *foundSlash : optional (may be 0). When given, it is reset to 0 and set
  ;               to #True only if a '/' preceded a successfully returned name.
  ;
  ; Returns the lower-cased name (first char a-z/A-Z, following chars
  ; a-z/A-Z/0-9), or "" when *pInput is 0, when no valid name starts here, or
  ; (only if #RemoveOnlyAllowedTags <> 0) when the name is not whitelisted in
  ; allowedTags().
  Protected result.s, slash, nameLength
  Protected *nameStart.Character

  If *pInput = 0
    ProcedureReturn ""
  EndIf

  If *foundSlash : *foundSlash\i = 0 : EndIf

  EatWhiteSpace(*pInput)

  If *pInput\c = '/'                    ; optional '/' marks a closing tag
    slash = #True
    *pInput + SizeOf(Character)
  EndIf

  ; A name has to start with a letter.
  If (*pInput\c < 'a' Or *pInput\c > 'z') And (*pInput\c < 'A' Or *pInput\c > 'Z')
    ProcedureReturn ""
  EndIf

  ; Scan over the whole name first and copy it in one go, instead of growing
  ; the result string one character at a time.
  *nameStart = *pInput
  While (*pInput\c >= 'a' And *pInput\c <= 'z') Or (*pInput\c >= 'A' And *pInput\c <= 'Z') Or (*pInput\c >= '0' And *pInput\c <= '9')
    *pInput + SizeOf(Character)
  Wend
  nameLength = (*pInput - *nameStart) / SizeOf(Character)
  result = LCase(PeekS(*nameStart, nameLength))   ; tag names are handled in lowercase

  CompilerIf #RemoveOnlyAllowedTags <> 0
    ; Use FindMapElement() for the lookup: an indexed read such as
    ; allowedTags(result) would add the unknown name to the global whitelist
    ; map as a side effect.
    If FindMapElement(allowedTags(), result) = 0
      ProcedureReturn ""
    ElseIf allowedTags() = 0            ; key present but not flagged as allowed
      ProcedureReturn ""
    EndIf
  CompilerEndIf

  If slash And *foundSlash              ; report the closing-tag state to the caller
    *foundSlash\i = #True
  EndIf

  ProcedureReturn result
EndProcedure
Procedure.s ExtractTag(inputText.s,tagToExtract.s,*foundAtPos.Integer,startPos=0)
  ; Returns the text content of the first <tagToExtract> element found in
  ; inputText at or after startPos. All tags inside the element are dropped
  ; from the returned text; nested elements of the same name are tracked so
  ; the result ends at the matching closing tag.
  ;
  ; inputText    : text to scan
  ; tagToExtract : tag name to look for (compared in lowercase)
  ; *foundAtPos  : optional (may be 0). Receives the 0-based character index
  ;                of the '<' of the first matching start tag, or -1 when no
  ;                start tag was found.
  ; startPos     : 0-based character index where scanning begins. Negative
  ;                values are treated as 0; values at or beyond the end of
  ;                inputText yield "" with *foundAtPos = -1.
  ;
  ; If the matching closing tag is missing (for example a tag written as
  ; <br />), everything collected up to the end of the input is returned.
  Protected result.s, recording, tag.s, foundSlash, startPosWritten
  Protected NewMap tagCount()
  Protected *pInput.Character

  If *foundAtPos : *foundAtPos\i = -1 : EndIf

  ; Keep the scan pointer inside the string: an out-of-range startPos (or an
  ; empty input) would otherwise make the loop read memory outside inputText.
  If startPos < 0 : startPos = 0 : EndIf
  If startPos >= Len(inputText)
    ProcedureReturn ""
  EndIf

  tagToExtract = LCase(tagToExtract)
  *pInput = @inputText + startPos*SizeOf(Character)

  ; Test the terminator BEFORE each step so the pointer never moves past it.
  While *pInput\c
    If *pInput\c = '<'
      tag = GetTagFromPointer(*pInput+SizeOf(Character),@foundSlash)
    Else
      tag = ""
    EndIf

    If tag <> ""
      If foundSlash                                   ; closing tag
        If FindMapElement(tagCount(),tag)             ; ignore closing tags that were never opened
          If tagCount(tag) > 0
            tagCount(tag) - 1
          EndIf
          If tag = tagToExtract And tagCount(tag) = 0
            ProcedureReturn result                    ; matching closing tag reached
          EndIf
        EndIf
      Else                                            ; start tag
        tagCount(tag) + 1
        If tag = tagToExtract
          If startPosWritten = 0 And *foundAtPos
            *foundAtPos\i = (*pInput - @inputText)/SizeOf(Character)
            startPosWritten = 1
          EndIf
          recording = #True                           ; from here on, text is collected
        EndIf
      EndIf
      EatEverythingIncludingChar(*pInput,'>')         ; skip the rest of the tag (attributes etc.)
    Else
      ; Plain text, or a '<' that does not start a recognised tag name.
      If recording
        result + Chr(*pInput\c)
      EndIf
      *pInput + SizeOf(Character)
    EndIf
  Wend

  ProcedureReturn result
EndProcedure
Define text.s, pos

; Sample input with two top-level <div> blocks inside a <p>, used by the
; last three demos below.
Define twoBlocks.s = "<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag.</div><br />"+
                     "<div id='1'>This is <b><u>the</u></b> <div id=2>second content</div> of <i>the</i> tag</div></p>"

; Demo 1: nested <div> inside <div>; "found" is decided by the returned text.
text = ExtractTag("<div>This is <div>the content</div> of the div</div>","div",@pos)
If text <> ""
  Debug "Extracted text (pos:"+Str(pos)+") "+text
Else
  Debug "nothing found."
EndIf
Debug "---------------"

; Demo 2: <div> with attributes and inline tags; "found" is decided by pos.
text = ExtractTag("<p><div id='1'>This is <b><u>the</u></b> <div id=2>content</div> of <i>the</i> tag</div></p>","div",@pos)
If pos <> -1
  Debug "Extracted text (pos:"+Str(pos)+") "+text
Else
  Debug "nothing found."
EndIf
Debug "---------------"

; Demo 3: extract the enclosing <p>.
text = ExtractTag(twoBlocks,"p",@pos)
If pos <> -1
  Debug "Extracted text (pos:"+Str(pos)+") "+text
Else
  Debug "nothing found."
EndIf
Debug "---------------"

; Demo 4: extract <br>.
text = ExtractTag(twoBlocks,"br",@pos)
If pos <> -1
  Debug "Extracted text (pos:"+Str(pos)+") "+text
Else
  Debug "nothing found."
EndIf
Debug "---------------"

;
; find all <div>: restart the search one character after each hit
;
Define startpos = 0
Repeat
  text = ExtractTag(twoBlocks,"div",@pos,startpos)
  If pos = -1
    Break
  EndIf
  Debug "Extracted text (pos:"+Str(pos)+") "+text
  startpos = pos+1
ForEver
EDIT2: added some comments