- RegexMatches retrieves:
- matched string position (PB position)
- sub groups by index
- sub groups by name
- sub groups are supported in the "Replacement" parameter
- RegexInfo & RegexMatchesInfo retrieves some debugging info
- UTF8 supported
Code: Select all
;:=============================================================================
;:- MoreRegex.pbi
;:- Author : Eddy
;:- Date : October 6, 2013
;:- Compiler : PureBasic 5.20 LTS
;:- Target OS : Mac, Linux, Windows
;:- Source --------------------------------------------------------------------
;:- http://www.purebasic.fr/english/viewtopic.php?f=40&t=56955
;:- Credits --------------------------------------------------------------------
;:- original post: http://www.purebasic.fr/english/viewtopic.php?f=13&t=55086
;:- PCRE doc: http://pcre.sourceforge.net/pcre.txt
;:- PCRE doc: http://www.pcre.org/pcre.txt
;:- pcre_fullinfo: http://www.unix.com/man-page/Linux/3/pcre_fullinfo/
;:=============================================================================
; Public interface of the MoreRegex module: one structure describing a single
; match plus the procedures that fill, use and describe arrays of such matches.
DeclareModule MoreRegex
; One matched string together with its captured sub groups.
Structure REGEX_MATCH
StartPosition.i ; position index of matched string (PB first index is 1)
Array Groups.s(0) ; sub groups by index; index 0 holds the whole matched string
Map NamedGroups.s() ; sub groups by name (only groups that have a name)
EndStructure
; Fills GroupNames() with the name of every sub group, indexed by group number
; (unnamed groups get an empty string), and returns the number of sub groups.
Declare.i RegexGroupNames(Regex, Array GroupNames.s(1))
; Fills Matches() with every matched string and its associated sub groups,
; searching from StartPosition, and returns the number of matches found.
Declare.i RegexMatches(Regex, Subject.s, Array Matches.REGEX_MATCH(1), StartPosition = 1)
; Replaces every matched string according to the Regex pattern. Sub groups are
; supported in the "Replacement" parameter as $<index> and ${<name>}.
; NOTE(review): maximumReference is not read by the implementation visible in this file.
Declare.s RegexReplaces(Regex, Subject.s, Replacement.s, StartPosition = 1, maximumReference.l = 10)
; Debug helpers: human-readable description of a regex and of a match array.
Declare.s RegexInfo(Regex)
Declare.s RegexMatchesInfo(Array Matches.REGEX_MATCH(1))
EndDeclareModule
Module MoreRegex
EnableExplicit
; PCRE entry points used by this module, imported with the C calling convention.
; The empty library name means no extra library file is named here; the pb_pcre_*
; symbols are presumably the PCRE copy shipped with PureBasic's regular
; expression library - confirm for your compiler version.
; Parameters typed "p-utf8" are PureBasic strings handed to PCRE as UTF-8.
ImportC ""
; runs the compiled pattern against subject; offsets are written to *ovector
pb_pcre_exec(*pcre, *extra, subject.p-utf8, length, startoffset, options, *ovector, ovecsize)
; copies captured group number "stringnumber" into a new buffer stored at *stringptr
pb_pcre_get_substring(subject.p-utf8, *ovector, stringcount, stringnumber, *stringptr)
; same lookup by group name (declared but not called in the visible code)
pb_pcre_get_named_substring(*pcre, subject.p-utf8, *ovector, stringcount, stringname.s, *stringptr)
; queries one property ("what") of a compiled pattern and stores it at *where
pb_pcre_fullinfo(*pcre, *extra, what, *where)
; releases a buffer obtained from pb_pcre_get_substring
pb_pcre_free_substring(*stringptr)
EndImport
; Fallback definitions of the pcre_fullinfo request codes, used only when
; #PCRE_INFO_NAMEENTRYSIZE has not already been defined elsewhere.
CompilerIf Not Defined(PCRE_INFO_NAMEENTRYSIZE, #PB_Constant)
#PCRE_INFO_CAPTURECOUNT=2
#PCRE_INFO_NAMEENTRYSIZE=7
#PCRE_INFO_NAMECOUNT=8
#PCRE_INFO_NAMETABLE=9
CompilerEndIf
; Retrieves the names of all capturing sub groups of a compiled regex.
;
; Regex        : value returned by CreateRegularExpression(#PB_Any, ...);
;                0 is accepted and reported as "no sub groups".
; GroupNames() : resized to (number of sub groups) and filled so that
;                GroupNames(n) is the name of sub group n, or "" when that
;                group is unnamed. Index 0 (the whole match) is always "".
; Returns      : the number of capturing sub groups in the pattern.
Procedure.i RegexGroupNames(Regex, Array GroupNames.s(1))
  Protected *pcre, *nameTable, *entry
  ; pcre_fullinfo stores these three values as C "int", hence the .l type
  Protected nameCount.l, nameEntrySize.l, groupCount.l
  Protected i, groupNumber

  ; An invalid regex yields an empty table instead of reading from address 0
  If Regex = 0
    ReDim GroupNames(0)
    GroupNames(0) = ""
    ProcedureReturn 0
  EndIf

  ; The regex object starts with the pcre pointer; it has to be read with the
  ; native pointer width (PeekL would truncate it on 64-bit builds)
  *pcre = PeekI(Regex)
  pb_pcre_fullinfo(*pcre, 0, #PCRE_INFO_CAPTURECOUNT, @groupCount)
  pb_pcre_fullinfo(*pcre, 0, #PCRE_INFO_NAMECOUNT, @nameCount)
  pb_pcre_fullinfo(*pcre, 0, #PCRE_INFO_NAMEENTRYSIZE, @nameEntrySize)
  pb_pcre_fullinfo(*pcre, 0, #PCRE_INFO_NAMETABLE, @*nameTable)

  If groupCount < 0
    groupCount = 0
  EndIf

  ; Start from a table of empty names (ReDim keeps previous content)
  ReDim GroupNames(groupCount)
  For i = 0 To groupCount
    GroupNames(i) = ""
  Next

  If *nameTable
    For i = 0 To nameCount - 1
      *entry = *nameTable + i * nameEntrySize
      ; Each entry starts with the group number as an unsigned 16-bit value,
      ; most significant byte first, followed by the zero-terminated name
      groupNumber = (PeekA(*entry) << 8) | PeekA(*entry + 1)
      If groupNumber <= groupCount
        GroupNames(groupNumber) = PeekS(*entry + 2, -1, #PB_UTF8)
      EndIf
    Next
  EndIf

  ; returns count of sub groups
  ProcedureReturn groupCount
EndProcedure
; Finds every match of a compiled regex in Subject.
;
; Regex         : value returned by CreateRegularExpression(#PB_Any, ...).
; Subject       : text to search.
; Matches()     : receives one REGEX_MATCH per match. When nothing matches the
;                 array is left with a single blank element (StartPosition = 0).
; StartPosition : 1-based character position at which the search starts.
; Returns       : the number of matches found (0 for an invalid regex or
;                 an invalid start position).
;
; PCRE works on the UTF-8 form of Subject and reports byte offsets, whereas
; PureBasic string positions are counted in characters; this procedure
; converts between the two so that StartPosition can be used directly with
; Mid(), ReplaceString() and friends.
Procedure.i RegexMatches(Regex, Subject.s, Array Matches.REGEX_MATCH(1), StartPosition = 1)
  Protected matchCount = -1
  Protected byteLength = StringByteLength(Subject, #PB_UTF8)
  Protected offset, byteCursor, charCursor, lead, ovecSize
  Protected groupCount.l, groupNumber, groupString, groupLength.l
  Protected *pcre, *utf8
  Protected Dim ovec.l(0)
  Protected Dim groupNames.s(0)

  ; Reset the output. ReDim keeps old content, so element 0 is wiped by hand
  ; to avoid leaking groups of a previous call into this result.
  ReDim Matches(0)
  With Matches(0)
    \StartPosition = 0
    ReDim \Groups(0)
    \Groups(0) = ""
    ClearMap(\NamedGroups())
  EndWith

  If Regex = 0 Or StartPosition < 1
    ProcedureReturn 0
  EndIf

  ; Private UTF-8 copy of the subject, used only to translate byte offsets
  *utf8 = AllocateMemory(byteLength + 1)
  If *utf8 = 0
    ProcedureReturn 0
  EndIf
  PokeS(*utf8, Subject, -1, #PB_UTF8)

  ; Pointer-sized read of the pcre handle (PeekL truncates on 64-bit builds)
  *pcre = PeekI(Regex)

  ; pcre_exec needs three ints per group, the whole match included; a fixed
  ; size vector makes patterns with many groups silently return no match
  ovecSize = (RegexGroupNames(Regex, groupNames()) + 1) * 3
  ReDim ovec(ovecSize - 1)

  ; Character based start position -> byte offset in the UTF-8 subject
  If StartPosition > 1
    offset = StringByteLength(Left(Subject, StartPosition - 1), #PB_UTF8)
  EndIf

  While offset < byteLength
    groupCount = pb_pcre_exec(*pcre, 0, Subject, byteLength, offset, 0, ovec(), ovecSize)
    If groupCount <= 0 : Break : EndIf

    ; Translate the byte offset of the match into a character count by
    ; counting the characters between the previous match and this one
    If ovec(0) < byteCursor
      byteCursor = 0
      charCursor = 0
    EndIf
    While byteCursor < ovec(0)
      lead = PeekA(*utf8 + byteCursor)
      If (lead & $C0) <> $80
        charCursor + 1
        ; a 4-byte sequence occupies two UTF-16 units in a PB string
        If lead >= $F0 : charCursor + 1 : EndIf
      EndIf
      byteCursor + 1
    Wend

    ;register new matched string and its position
    matchCount + 1
    ReDim Matches(matchCount)
    With Matches(matchCount)
      \StartPosition = charCursor + 1
      ReDim \Groups(groupCount - 1)
      ;register sub groups of new matched string
      For groupNumber = 0 To groupCount - 1
        \Groups(groupNumber) = ""
        ; reset the pointer so that a failed lookup never frees a stale buffer
        groupString = 0
        groupLength = pb_pcre_get_substring(Subject, ovec(), groupCount, groupNumber, @groupString)
        If groupString
          If groupLength >= 0
            ;register sub groups by index
            \Groups(groupNumber) = PeekS(groupString, groupLength, #PB_UTF8)
            ;register sub groups by name (if its name is defined)
            If groupNumber <= ArraySize(groupNames())
              If groupNames(groupNumber) <> ""
                \NamedGroups(groupNames(groupNumber)) = \Groups(groupNumber)
              EndIf
            EndIf
          EndIf
          pb_pcre_free_substring(groupString)
        EndIf
      Next
    EndWith

    ;find next offset
    If ovec(1) > ovec(0) And ovec(1) > offset
      offset = ovec(1)
    Else
      ; Empty match: continue one whole character further, otherwise the same
      ; empty match would be reported again. Continuation bytes are skipped
      ; so that the next search never starts inside a UTF-8 sequence.
      If ovec(1) > offset
        offset = ovec(1)
      EndIf
      offset + 1
      While offset < byteLength And (PeekA(*utf8 + offset) & $C0) = $80
        offset + 1
      Wend
    EndIf
  Wend

  FreeMemory(*utf8)

  ;returns count of found matches
  ProcedureReturn matchCount + 1
EndProcedure
; Private helper: expands the sub group references of a replacement template
; for one match. "$<n>" is replaced by sub group n and "${name}" by the named
; sub group; references to groups that do not exist in the match are copied
; unchanged. The template is scanned once from left to right, so text that
; comes from a sub group is never interpreted as a reference itself, and
; "$10" means group 10 whenever that group exists.
Procedure.s ExpandReplacement(Replacement.s, *Match.REGEX_MATCH)
  Protected expanded.s, token.s
  Protected cursor = 1, total = Len(Replacement)
  Protected dollar, closing, digits, code, index, handled
  Protected lastGroup = ArraySize(*Match\Groups())

  While cursor <= total
    dollar = FindString(Replacement, "$", cursor)
    If dollar = 0
      ; no more references: copy the rest of the template
      expanded + Mid(Replacement, cursor)
      Break
    EndIf
    If dollar > cursor
      expanded + Mid(Replacement, cursor, dollar - cursor)
    EndIf
    cursor = dollar + 1
    handled = #False

    If Mid(Replacement, cursor, 1) = "{"
      ; named reference ${name}
      closing = FindString(Replacement, "}", cursor + 1)
      If closing > cursor + 1
        token = Mid(Replacement, cursor + 1, closing - cursor - 1)
        If FindMapElement(*Match\NamedGroups(), token)
          expanded + *Match\NamedGroups()
          cursor = closing + 1
          handled = #True
        EndIf
      EndIf
    Else
      ; numbered reference: take the longest run of digits that names an
      ; existing sub group (5 digits are enough for any group number)
      digits = 0
      While digits < 5
        code = Asc(Mid(Replacement, cursor + digits, 1))
        If code < '0' Or code > '9' : Break : EndIf
        digits + 1
      Wend
      While digits > 0
        token = Mid(Replacement, cursor, digits)
        index = Val(token)
        ; Str(index) = token rejects forms with leading zeros such as "$01"
        If index <= lastGroup And Str(index) = token
          expanded + *Match\Groups(index)
          cursor + digits
          handled = #True
          Break
        EndIf
        digits - 1
      Wend
    EndIf

    If handled = #False
      ; not a valid reference: keep the dollar sign as ordinary text
      expanded + "$"
    EndIf
  Wend

  ProcedureReturn expanded
EndProcedure

; Replaces every match of Regex in Subject by Replacement.
;
; Regex            : value returned by CreateRegularExpression(#PB_Any, ...).
; Subject          : text to process (returned unchanged when nothing matches).
; Replacement      : template; "$<n>" and "${name}" insert sub groups.
; StartPosition    : 1-based position at which the search starts.
; maximumReference : kept for interface compatibility (0 <= value <= 99);
;                    it is not used, every existing sub group can be referenced.
; Returns          : the resulting string.
Procedure.s RegexReplaces(Regex, Subject.s, Replacement.s, StartPosition = 1, maximumReference.l = 10) ; 0 <= maximum_reference <= 99
  Protected Dim Matches.REGEX_MATCH(0)
  Protected i, matchCount, NewString.s, Result.s = Subject

  ; Use the returned count: the array always holds at least one (possibly
  ; blank) element, so its size cannot tell "no match" from "one match"
  matchCount = RegexMatches(Regex, Subject, Matches(), StartPosition)

  ; Work from the last match to the first so that the positions of the
  ; matches still to be processed stay valid while Result changes length
  For i = matchCount - 1 To 0 Step -1
    With Matches(i)
      NewString = ExpandReplacement(Replacement, @Matches(i))
      If Len(\Groups(0)) > 0
        Result = ReplaceString(Result, \Groups(0), NewString, 0, \StartPosition, 1)
      ElseIf \StartPosition > 0
        ; an empty match has no text to search for: insert at its position
        Result = InsertString(Result, NewString, \StartPosition)
      EndIf
    EndWith
  Next

  ProcedureReturn Result
EndProcedure
; Debug helper: builds a text report giving the number of sub groups of a
; regex followed by one line per sub group with its name (blank when the
; group is unnamed). Lines are separated by CRLF.
Procedure.s RegexInfo(Regex)
  Protected Dim names.s(0)
  Protected report.s, lastGroup, number = 1

  RegexGroupNames(Regex, names())
  lastGroup = ArraySize(names())

  report = "This regex contains " + Str(lastGroup) + " sub group(s)" + #CRLF$
  ; index 0 stands for the whole match, so the listing starts at group 1
  While number <= lastGroup
    report + "Name of " + Str(number) + ". group: " + names(number) + #CRLF$
    number + 1
  Wend

  ProcedureReturn report
EndProcedure
; Debug helper: builds a text report of a match array as filled by
; RegexMatches: the number of matches, then for each match its position,
; its sub groups by index and its sub groups by name. Lines end with CRLF.
Procedure.s RegexMatchesInfo(Array Matches.REGEX_MATCH(1))
  Protected groupNumber, info$, i
  Protected matchCount = ArraySize(Matches()) + 1

  ; RegexMatches leaves a single blank element when nothing matched. Real
  ; matches always have StartPosition >= 1, so a lone element at position 0
  ; is reported as "no match" rather than as one empty match.
  If matchCount = 1 And Matches(0)\StartPosition = 0
    matchCount = 0
  EndIf

  info$ = "This subject contains " + matchCount + " matched string(s) described below:" + #CRLF$
  info$ + "--------------------------------" + #CRLF$
  For i = 0 To matchCount - 1
    With Matches(i)
      For groupNumber = 0 To ArraySize(\Groups())
        If groupNumber = 0
          ; group 0 is the whole matched string
          info$ + "Full solution at position " + \StartPosition + ": " + \Groups(groupNumber) + #CRLF$
        Else
          info$ + "Content of " + Str(groupNumber) + ". group: " + \Groups(groupNumber) + #CRLF$
        EndIf
      Next
      ForEach \NamedGroups()
        info$ + "Content of group <" + MapKey(\NamedGroups()) + ">: " + \NamedGroups() + #CRLF$
      Next
      info$ + "--------------------------------" + #CRLF$
    EndWith
  Next
  ProcedureReturn info$
EndProcedure
EndModule
; Demo, compiled only when this file is the main source file: lists the sub
; groups of a pattern, dumps all matches found in a sample subject and shows
; the three kinds of replacement (plain text, numbered and named sub groups).
CompilerIf #PB_Compiler_IsMainFile
  DisableExplicit
  UseModule MoreRegex
  ; ********************
  ; EXAMPLE
  ; ********************
  Define subject.s = " <ID>123</ID> <u>title of message</u>"
  Define pattern.s = "(?<begin><(\w+)>)(?<content>.+?)(?<end></\2>)"
  ;Define pattern.s = "(?<begin><(?<tag>\w+)>)(?<content>.+?)(?<end></\k<tag>>)"
  Define regex, matchCount
  Dim groupNames.s(0)
  Dim matches.REGEX_MATCH(0)

  regex = CreateRegularExpression(#PB_Any, pattern)
  If regex
    RegexGroupNames(regex, groupNames())
    Debug RegexInfo(regex)

    matchCount = RegexMatches(regex, subject, matches())
    If matchCount
      Debug RegexMatchesInfo(matches())
    EndIf

    ; only read the elements that really exist
    If matchCount >= 2
      Debug matches(1)\Groups(0)
      Debug matches(1)\Groups(1)
    EndIf
    If matchCount >= 1
      Debug matches(0)\NamedGroups("content")
      Debug matches(0)\NamedGroups("begin")
    EndIf

    Debug subject
    Debug RegexReplaces(regex, subject, "<REPLACED>") ;replaced by new strings
    Debug RegexReplaces(regex, subject, "$1 $4") ;replaced by sub groups
    Debug RegexReplaces(regex, subject, "${begin} ${end}") ;replaced by named sub groups

    FreeRegularExpression(regex)
  Else
    ; the pattern did not compile: show the reason instead of crashing later
    Debug RegularExpressionError()
  EndIf
CompilerEndIf