FastStringSplit improve speed ...

Share your advanced PureBasic knowledge/code with the community.
User avatar
CELTIC88
Enthusiast
Enthusiast
Posts: 154
Joined: Thu Sep 17, 2015 3:39 pm

FastStringSplit improve speed ...

Post by CELTIC88 »

hello everyone :mrgreen:

Code: Select all

EnableExplicit
ImportC ""
  wcsstr.i(*str1, *str2)
  _wcslwr.i(*cs)
  wcslen.l(*cs)
EndImport

DisableDebugger
Procedure FastStringSplit(sString.s, sDelimiters.s, Array StringArray.i(1), Casesense = 1)  
  
  Protected String = @sString
  Protected Delimiters = @sDelimiters
  
  Protected StringSize = wcslen(String) * 2
  Protected DelimitersSize = wcslen(Delimiters) * 2
  
  Protected aString = AllocateMemory(StringSize + 2):CopyMemory(String,aString,StringSize)
  If Not aString:ProcedureReturn 0:EndIf
  
  If Casesense = 0
    _wcslwr(String)
    _wcslwr(Delimiters)
  EndIf
  
  Protected String2 = String
  Protected ElementStart ,ElementSize
  
  Protected aItem
  Repeat   
    
    String2 = wcsstr(String2,Delimiters)
    If String2 = 0
      ElementSize = StringSize - ElementStart
    Else
      ElementSize = String2 - (String + ElementStart)
    EndIf
    
    If ArraySize(StringArray()) < aItem : ReDim StringArray(aItem +99):EndIf
    
    StringArray(aItem) = aString + ElementStart
    aItem + 1
    
    PokeW(aString + ElementStart + ElementSize, 0)
    
    ElementStart + ElementSize + DelimitersSize
    
    String2 + DelimitersSize
    
  Until String2 = DelimitersSize
  
  If ElementSize = StringSize
    FreeMemory(aString)
    ProcedureReturn 0
  EndIf
  
  ReDim StringArray(aItem-1)
  ProcedureReturn aString
EndProcedure

Procedure.i SplitStringStringField(String.s, Delimiter.s, Array Output.s(1))
  Protected n = 1 + CountString(String, Delimiter),i
  Dim Output.s(n - 1)
  For i = 0 To n-1
    Output(i) = Trim(StringField(String, 1+i, Delimiter))
  Next i
EndProcedure

Procedure ExplodeStringIntoArray(Separator$, String$, Array Output$(1))
  Protected ArrayIndex, StartPos, SeparatorPos
  
  ArrayIndex = -1
  StartPos = 1
  
  Repeat
    ArrayIndex + 1
    
    If ArraySize(Output$()) < ArrayIndex
      ReDim Output$(ArrayIndex + 99) ; Wenn Output-Array zu klein ist, es um 100 Elemente vergrößern
    EndIf
    
    SeparatorPos = FindString(String$, Separator$, StartPos)
    If SeparatorPos = 0
      Output$(ArrayIndex) = Mid(String$, StartPos)
      Break
    EndIf
    
    Output$(ArrayIndex) = Mid(String$, StartPos, SeparatorPos - StartPos)
    StartPos = SeparatorPos + 1
  ForEver
  
  ReDim Output$(ArrayIndex) ; Array auf die wirklich notwendige Größe setzen
  
  ProcedureReturn ArrayIndex
EndProcedure


EnableDebugger
DisableExplicit
selectfile.s = OpenFileRequester("Select big texte...","","txt|*.*;*.txt",0)

Dim StringArray.i(1000)

Procedure.s FileRead_String(FilePath.s,StringType = #PB_UTF8)
  Protected hfile = ReadFile(#PB_Any, FilePath,#PB_File_SharedRead|#PB_File_SharedWrite)
  If hfile
    Protected SizeFile = Lof(hfile)
    Protected *pFile = AllocateMemory(SizeFile+4)
    ReadData(hfile,*pFile,SizeFile)
    CloseFile(hfile)
  EndIf
  If *pFile:Protected Strfile.s= PeekS(*pFile,-1,StringType):FreeMemory(*pFile):EndIf
  ProcedureReturn Strfile
EndProcedure

FileString.s = FileRead_String(selectfile)

time = ElapsedMilliseconds()
parray = FastStringSplit(FileString,#CRLF$, StringArray())
If parray
  Debug "FastStringSplit : ElapsedMilliseconds = " + 
        Str(ElapsedMilliseconds() - time ) + " / ArraySize = " + 
        Str(ArraySize(StringArray()))
  ;   For r = 0 To ArraySize(StringArray())
  ;     Debug PeekS(StringArray(r))
  ;   Next
  FreeMemory(parray)
Else
  
EndIf

Dim Word.s(0)

time = ElapsedMilliseconds()
SplitStringStringField(FileString, #CRLF$, Word())
Debug "SplitStringStringField : ElapsedMilliseconds = " + 
      Str(ElapsedMilliseconds() - time ) + " / ArraySize = " + 
      Str(ArraySize(Word()))

time = ElapsedMilliseconds()
ExplodeStringIntoArray(#CRLF$, FileString, Word())
Debug "ExplodeStringIntoArray : ElapsedMilliseconds = " + 
      Str(ElapsedMilliseconds() - time ) + " / ArraySize = " + 
      Str(ArraySize(Word()))

; tested With "1.5 mb" text file
; FastStringSplit : ElapsedMilliseconds = 14 / ArraySize = 18710
; SplitStringStringField : ElapsedMilliseconds = 60798 / ArraySize = 18710
; ExplodeStringIntoArray : ElapsedMilliseconds = 28625 / ArraySize = 18710
Last edited by CELTIC88 on Fri Dec 22, 2017 2:41 pm, edited 1 time in total.
interested in Cybersecurity..
User avatar
nco2k
Addict
Addict
Posts: 1344
Joined: Mon Sep 15, 2003 5:55 am

Re: FastStringSplit improve speed ...

Post by nco2k »

way too complicated. you are already using MSVCRT, so why not use strtok?

Code: Select all

ImportC ""
  CompilerIf #PB_Compiler_Unicode
    CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
      StrTok_(*String, *Delimiter) As "wcstok"
    CompilerElse
      StrTok_(*String, *Delimiter) As "_wcstok"
    CompilerEndIf
  CompilerElse
    CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
      StrTok_(*String, *Delimiter) As "strtok"
    CompilerElse
      StrTok_(*String, *Delimiter) As "_strtok"
    CompilerEndIf
  CompilerEndIf
EndImport

Procedure SplitString(String$, Delimiter$)
  Protected *Token = StrTok_(@String$, @Delimiter$)
  While *Token
    Debug PeekS(*Token)
    *Token = StrTok_(0, @Delimiter$)
  Wend
EndProcedure

SplitString("AxBxC", "x")
i would usually use a pointer as the string parameter, but in this case its better not to, because strtok modifies the provided string. it replaces every occurance of Delimiter$ with a null-terminator.

also never do benchmarking with the debugger enabled. your results are nowhere near accurate.

c ya,
nco2k
If OSVersion() = #PB_OS_Windows_ME : End : EndIf
User avatar
Kukulkan
Addict
Addict
Posts: 1352
Joined: Mon Jun 06, 2005 2:35 pm
Location: germany
Contact:

Re: FastStringSplit improve speed ...

Post by Kukulkan »

Hi,

the version of nco2k looks interesting, but it fails on Linux with PB 5.46b2 x64 with unicode flag. If I disable unicode, it works.

I think the usage of wcstok is wrong as it needs three parameters to work (http://man7.org/linux/man-pages/man3/wcstok.3.html). At least on Linux...

Sadly I have not time to investigate further :(
User avatar
CELTIC88
Enthusiast
Enthusiast
Posts: 154
Joined: Thu Sep 17, 2015 3:39 pm

Re: FastStringSplit improve speed ...

Post by CELTIC88 »

update : first post..

@nco2k yes a good suggestion... but your function not support "case sense"

@Kukulkan i have same problem !
interested in Cybersecurity..
User avatar
Kwai chang caine
Always Here
Always Here
Posts: 5349
Joined: Sun Nov 05, 2006 11:42 pm
Location: Lyon - France

Re: FastStringSplit improve speed ...

Post by Kwai chang caine »

Works fine on W10 X64 v5.61 x86
FastStringSplit : ElapsedMilliseconds = 3 / ArraySize = 3253
SplitStringStringField : ElapsedMilliseconds = 5372 / ArraySize = 3253
ExplodeStringIntoArray : ElapsedMilliseconds = 1968 / ArraySize = 3253
Thanks for sharing 8)
ImageThe happiness is a road...
Not a destination
User avatar
skywalk
Addict
Addict
Posts: 3994
Joined: Wed Dec 23, 2009 10:14 pm
Location: Boston, MA

Re: FastStringSplit improve speed ...

Post by skywalk »

Yeah, nothing is free. The speed comes at the cost of durability.
Define.s s$ = ",,,, ,,A,,,B,,C,,"
Define.s d$ = ","
strtok(), wcstok(), etc. skip empty delimiters and only see "A,B,C" in the above example. :evil:
The nice thing about standards is there are so many to choose from. ~ Andrew Tanenbaum
User avatar
nco2k
Addict
Addict
Posts: 1344
Joined: Mon Sep 15, 2003 5:55 am

Re: FastStringSplit improve speed ...

Post by nco2k »

@Kukulkan
the XPG4 version keeps internally track of the progress. the ISO C version requires an additional *Context parameter, that stores the progress. so you should try that.

@skywalk
which is the expected behavior. its not about speed however. the way strtok works, they could have easily implement it the way you described. but usually you dont spam a string with delimiters, and when you explode the string, you want to get rid of such garbage. its simply just a design choice.

c ya,
nco2k
If OSVersion() = #PB_OS_Windows_ME : End : EndIf
User avatar
skywalk
Addict
Addict
Posts: 3994
Joined: Wed Dec 23, 2009 10:14 pm
Location: Boston, MA

Re: FastStringSplit improve speed ...

Post by skywalk »

It's also not thread safe.
Every database in the world must account for nulls. strtok() "expected" behavior is a fail and never used in any app I've written.
The nice thing about standards is there are so many to choose from. ~ Andrew Tanenbaum
User avatar
nco2k
Addict
Addict
Posts: 1344
Joined: Mon Sep 15, 2003 5:55 am

Re: FastStringSplit improve speed ...

Post by nco2k »

thats not true. even though microsofts MSVCRT version uses a static variable, its stored in a thread local storage. all C-Runtime functions are threadsafe.

here is an alternative version, that should work the way you want. it should also be a bit faster than CELTIC88's version. i didnt bother adding case insensitivity though.

personally i think it would be more comfortable if the array contained the actual strings, instead of pointers. but that would be slower of course.

Code: Select all

EnableExplicit

ImportC ""
  CompilerIf #PB_Compiler_Unicode
    CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
      __StrDup_(*String) As "_wcsdup"
      __StrLen_(*String) As "wcslen"
      __StrStr_(*String1, *String2) As "wcsstr"
      __StrFree_(*String) As "free"
    CompilerElse
      __StrDup_(*String) As "__wcsdup"
      __StrLen_(*String) As "_wcslen"
      __StrStr_(*String1, *String2) As "_wcsstr"
      __StrFree_(*String) As "_free"
    CompilerEndIf
  CompilerElse
    CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
      __StrDup_(*String) As "_strdup"
      __StrLen_(*String) As "strlen"
      __StrStr_(*String1, *String2) As "strstr"
      __StrFree_(*String) As "free"
    CompilerElse
      __StrDup_(*String) As "__strdup"
      __StrLen_(*String) As "_strlen"
      __StrStr_(*String1, *String2) As "_strstr"
      __StrFree_(*String) As "_free"
    CompilerEndIf
  CompilerEndIf
EndImport

Procedure SplitString(*String.Character, *Delimiter, Array PointerArray(1), ArrayGrowth=100)
  Protected *Result, *StringAddress1.Character, *StringAddress2.Character, DelimiterLength, ArrayIndex, ArrayBound = ArraySize(PointerArray())
  If *String\c
    *Result = __StrDup_(*String)
    If *Result
      *StringAddress1 = *Result
      DelimiterLength = __StrLen_(*Delimiter) * SizeOf(Character)
      If DelimiterLength
        Repeat
          *StringAddress2 = __StrStr_(*StringAddress1, *Delimiter)
          If *StringAddress2
            *StringAddress2\c = 0
            PointerArray(ArrayIndex) = *StringAddress1
            ArrayIndex + 1
            If ArrayIndex > ArrayBound
              ArrayBound + ArrayGrowth
              ReDim PointerArray(ArrayBound)
            EndIf
            *StringAddress1 = *StringAddress2 + DelimiterLength
          Else
            Break
          EndIf
        ForEver
      EndIf
      If *StringAddress1\c
        PointerArray(ArrayIndex) = *StringAddress1
        ArrayIndex + 1
      EndIf
      ReDim PointerArray(ArrayIndex - 1)
    EndIf
  EndIf
  ProcedureReturn *Result
EndProcedure

Define *Buffer1, String1$ = "AxBxC", Delimiter1$ = "x", Dim PointerArray1(99), ArrayIndex1, ArrayBound1

*Buffer1 = SplitString(@String1$, @Delimiter1$, PointerArray1())
If *Buffer1
  Debug String1$
  ArrayBound1 = ArraySize(PointerArray1())
  For ArrayIndex1 = 0 To ArrayBound1
    Debug "["+Str(ArrayIndex1)+"] "+PeekS(PointerArray1(ArrayIndex1))
  Next
  __StrFree_(*Buffer1)
  Debug ""
EndIf

Define *Buffer2, String2$ = ",,,, ,,A,,,B,,C,,", Delimiter2$ = ",", Dim PointerArray2(99), ArrayIndex2, ArrayBound2

*Buffer2 = SplitString(@String2$, @Delimiter2$, PointerArray2())
If *Buffer2
  Debug String2$
  ArrayBound2 = ArraySize(PointerArray2())
  For ArrayIndex2 = 0 To ArrayBound2
    Debug "["+Str(ArrayIndex2)+"] "+PeekS(PointerArray2(ArrayIndex2))
  Next
  __StrFree_(*Buffer2)
  Debug ""
EndIf
c ya,
nco2k
If OSVersion() = #PB_OS_Windows_ME : End : EndIf
User avatar
skywalk
Addict
Addict
Posts: 3994
Joined: Wed Dec 23, 2009 10:14 pm
Location: Boston, MA

Re: FastStringSplit improve speed ...

Post by skywalk »

Thanks nco2k.
I am confusing thread use with parallel use(calling another strtok() within a loop to parse a substring).
MSDN states this is bad. So, I just avoid it altogether.
I agree there is more confusion with pointers to the string contents and my subsequent operations on the split data is for further substring parsing and/or displaying the individual results using the indices of the array.
I'll post my Split() in a few...
The nice thing about standards is there are so many to choose from. ~ Andrew Tanenbaum
User avatar
CELTIC88
Enthusiast
Enthusiast
Posts: 154
Joined: Thu Sep 17, 2015 3:39 pm

Re: FastStringSplit improve speed ...

Post by CELTIC88 »

hi :)
@KCC :P pas de quoi

@nco2k thank you for your contribution

i update my code to FastStringBetween : Find strings between two string delimiters 8)

Code: Select all

EnableExplicit
ImportC ""
  wcsstr.i(*str1, *str2)
  _wcslwr.i(*cs)
  wcslen.l(*cs)
EndImport

DisableDebugger
Structure StringArray_St
  *pString
  *PBString.string
EndStructure

Procedure FastStringBetween(sString.s, ;The string to search.
                            sStarts.s, ;The beginning of the string to find.
                            sEnds.s,   ;The end of the string to find.
                            Array StringArray.StringArray_St(1), ;The Array contains found strings.
                            iMod=1,                              ;Search mode when sStarts = sEnds :
                                                                 ; (1) the sEnds string at the End of a match starts the Next possible match (Default)
                                                                 ; (2) a further instance of the sStarts starts the Next match
                                                                 ; you can combine with (8) to add start string and end to array.
                            Casesense = 1) ; False = case-insensitive. True (default setting) = case-sensitive
  Protected String = @sString
  Protected Starts = @sStarts
  Protected Ends = @sEnds
  
  Protected StringSize = wcslen(String) * 2
  Protected StartsSize = wcslen(Starts) * 2
  Protected EndsSize = wcslen(Ends) * 2
  
  Protected aString = AllocateMemory(StringSize + 2):CopyMemory(String,aString,StringSize)
  If Not aString:ProcedureReturn 0:EndIf
  Protected *aString.Word = aString
  
  If Casesense = 0
    _wcslwr(String)
    _wcslwr(Starts)
    _wcslwr(Ends)
  EndIf
  
  Protected String2 = String
  
  Protected Delimiters, DelimitersSize, ExitLooP, Save, aItem
  
  If iMod & 8 
    StringArray(aItem)\pString = aString
    StringArray(aItem)\PBString = @StringArray(aItem)\pString
    aItem + 1
  EndIf
  
  Repeat   
    If Delimiters = Starts
      Delimiters=Ends
      DelimitersSize=EndsSize
    Else
      Delimiters=Starts
      DelimitersSize=StartsSize
    EndIf
    
    String2 = wcsstr(String2,Delimiters)
    If String2 = 0
      String = 0
      Save =0
      If iMod & 8:Save =1:iMod = 0:EndIf
      ExitLooP = 1
    Else
      If iMod & 1:Save =0:If Delimiters = Ends :Save =1:EndIf:EndIf
    EndIf
    
    If Save
      If ArraySize(StringArray()) < aItem : ReDim StringArray(aItem +99):EndIf
      StringArray(aItem)\pString = *aString+ DelimitersSize
      StringArray(aItem)\PBString = @StringArray(aItem)\pString
      aItem + 1
    EndIf
    Save=1
    
    *aString + (String2 - String)
    *aString\w = 0
    String = String2
    String2 + DelimitersSize
    
  Until ExitLooP
  
  If *aString = aString
    FreeMemory(aString)
    ProcedureReturn 0
  EndIf
  
  ReDim StringArray(aItem-1)
  ProcedureReturn aString
EndProcedure


EnableDebugger
DisableExplicit

Dim StringArray.StringArray_St(1000)
parray = FastStringBetween("Oky !|i |am|BACk",
                           "|", "|", StringArray(),2|8)

If parray
  For r = 0 To ArraySize(StringArray())
    Debug StringArray(r)\PBString\s
  Next
  FreeMemory(parray)
Else
  Debug "---------------No strings found!"
EndIf

Last edited by CELTIC88 on Sun Dec 24, 2017 7:23 pm, edited 2 times in total.
interested in Cybersecurity..
User avatar
skywalk
Addict
Addict
Posts: 3994
Joined: Wed Dec 23, 2009 10:14 pm
Location: Boston, MA

Re: FastStringSplit improve speed ...

Post by skywalk »

Updated my Split() code with nco2k's suggestions in later posts.
I ran speed tests normalized to nco2k's SplitString() renamed as SplitC(). Added Wilbert's Split() code from another post and SplitPB() using PB's functions for comparison.
My Split() has options for; case sensitivity, trimming, UCase/LCase, Ascii/Unicode. (Not engaged in comparison.)
The results are appended to the code. Careful when setting #Tries too high if using SplitPB(). That function is extremely slow and is only shown for comparison.

Code: Select all

CompilerIf #PB_Compiler_Debugger
  MessageRequester("FYI", "Disable debugger for speed tests.")
CompilerEndIf
EnableExplicit
#CMA = ',' : #CMA$ = ","
Macro Min(x, y)
  (Bool((x) <= (y)) * (x) + Bool((y) < (x)) * (y))
EndMacro
ImportC ""
  CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
    __StrLen_(*String) As "wcslen"
    __StrStr_(*String1, *String2) As "wcsstr"
  CompilerElse
    __StrLen_(*String) As "_wcslen"
    __StrStr_(*String1, *String2) As "_wcsstr"
  CompilerEndIf
EndImport
Procedure.i SplitC(*String.Character, Array A$(1), *Delimiter, ArrayGrowth=1000)
  ;nco2k, http://www.purebasic.fr/english/viewtopic.php?p=516082#p516082
  ;skywalk; modified to catch trailing null's.
  ; Ex. "1,2,3,," Splits with ',' -> [1][2][3][][].
  Protected.i *StringEnd, DelimiterLength, ArrayIndex
  Protected.i Resize = 511
  Protected.i nStrings = ArraySize(A$())
  If *String    ; Avoid 'Pointer is null' if pointing to an array element = Empty$
    If *String\c
      DelimiterLength = _wcslen_(*Delimiter) * SizeOf(Character)
      If DelimiterLength
        Repeat
          *StringEnd = _wcsstr_(*String, *Delimiter)
          If *StringEnd
            A$(ArrayIndex) = PeekS(*String, (*StringEnd - *String) / SizeOf(Character))
            ArrayIndex + 1
            If ArrayIndex > nStrings
              nStrings + Resize
              ReDim A$(nStrings)
            EndIf
            *String = *StringEnd + DelimiterLength
          Else
            Break
          EndIf
        ForEver
      EndIf
      ReDim A$(ArrayIndex)
      A$(ArrayIndex) = PeekS(*String)   ; Catch final trailing null's if any.
      ArrayIndex + 1
    EndIf
  EndIf
  ProcedureReturn ArrayIndex
EndProcedure
Procedure.i SplitPB(s$, Array A$(1), Delm$=#CMA$, Trimsp.i=0, SetCase.i=#PB_String_Equal, UseCase.i=#PB_String_CaseSensitive)
  ;FYI; Do NOT use. Very slow for large strings. Shown for comparison only.
  ; Return: Count of elements or nStrings found.
  ;         If none, then the entire string is assigned to A$(0)
  ; s$ = normal null terminated string
  Protected.i i, nStrings
  Protected.i DelmLen = Len(Delm$)
  If s$ = #Empty$
    Dim A$(0)
    A$(0) = #Empty$
    nStrings = 0
  ElseIf Delm$ = #Empty$
    Dim A$(0)
    A$(0) = s$
    nStrings = 1
  Else    ; OK to Split
    nStrings = CountString(s$, Delm$)
    Dim A$(nStrings)
    While i <= nStrings
      A$(i) = StringField(s$, i+1, Delm$)
      i + 1
    Wend
    If trimsp
      For i = 0 To nStrings
        A$(i) = Trim(A$(i))
      Next i
      If trimsp = #TAB
        For i = 0 To nStrings
          A$(i) = Trim(A$(i), #TAB$)
        Next i
        ; Repeat Trim Spaces to catch any created by dropping of TAB's.
        For i = 0 To nStrings
          A$(i) = Trim(A$(i))
        Next i
      EndIf
    EndIf
    If SetCase = #PB_String_LowerCase
      For i = 0 To nStrings
        A$(i) = LCase(A$(i))
      Next i
    ElseIf SetCase = #PB_String_UpperCase
      For i = 0 To nStrings
        A$(i) = UCase(A$(i))
      Next i
    EndIf
    nStrings + 1      ; Catch #Empty$ due to dangling Delm$, "1,2,"
  EndIf
  ProcedureReturn nStrings
EndProcedure
Procedure.i SplitWilbert(s$, Array StringArray.s(1), Delm$)
  ;Wilbert, http://www.purebasic.fr/english/viewtopic.php?p=486360#p486360
  Protected S.String, *S.Integer = @S
  Protected.i asize, i, p, slen
  asize = CountString(s$, Delm$)
  slen = Len(Delm$)
  ReDim StringArray(asize)
  *S\i = @s$
  While i < asize
    p = FindString(S\s, Delm$)
    StringArray(i) = PeekS(*S\i, p - 1)
    *S\i + (p + slen - 1) << #PB_Compiler_Unicode
    i + 1
  Wend
  StringArray(i) = S\s
  *S\i = 0
  ProcedureReturn asize + 1
EndProcedure
Procedure.i Split(*s.Character, Array A$(1), Delm$=#CMA$, Trimsp.i=0, SetCase.i=#PB_String_Equal, UseCase.i=#PB_String_CaseSensitive, Enc.i=#PB_Unicode)
  ;skywalk
  ; RETURN: Count of elements or nStrings found. A$(nStrings-1)
  ;         If none, then the entire string is assigned to A$(0)
  ; IN:     *s = pointer to normal null terminated string
  ;         While searching *s, Split() does NOT inserts [Nulls] for each delimiter found.
  Protected.i i, nStrings  ;x86; use Quad for very large files. Change For..Next's to While..Wend's.
  Protected.i lenChar, iLen, DelmLen, Resize
  Protected.i *s_prev = *s
  DelmLen = MemoryStringLength(@Delm$)  ;, Enc)   ; Len in char's of Delm$, (Unicode native format).
  If DelmLen = 0
    Dim A$(0)
    A$(0) = PeekS(*s, -1, Enc)
    nStrings = 1
  ElseIf *s = 0           ; Avoid 'Pointer is null' if pointing to an array element = Empty$
    Dim A$(0)
    A$(0) = #Empty$
    nStrings = 0
  ElseIf *s\c = 0
    Dim A$(0)
    A$(0) = #Empty$
    nStrings = 0
  Else                    ; OK to Split
    If Enc = #PB_Unicode  ; 2 bytes/char
      lenChar = 2
    Else                  ; #PB_Ascii = 1 bytes/char, #PB_UTF8 = variable bytes/char
      lenChar = 1
    EndIf
    Resize = 511;Min(99, (sLen / DelmLen))
    Dim A$(Resize)
    nStrings = Resize
    If (UseCase = #PB_String_CaseSensitive) And (DelmLen = 1)
      ; Use faster single character Split routine.
      Protected.c Delmc = Asc(Delm$)
      While *s\c          ; > 0 means valid Character, = 0 means String terminated.
        If *s\c <> Delmc
          *s + lenChar
          iLen + 1
        Else
          ;*s\c = 0 ; Old method terminates string with Chr(0) here. Using iLen instead of altering source string.
          A$(i) = PeekS(*s_prev, iLen, Enc)
          i + 1
          *s_prev = *s + lenChar  ; remember last pointer
          *s = *s_prev
          iLen = 0
          If i > nStrings
            nStrings + Resize
            ReDim A$(nStrings)
          EndIf
        EndIf
      Wend
    Else              ; Use slower multi-char split routine.
      ; Convert Delm$ to appropriate Encoding before comparisons.
      Protected.i *d
      If Enc = #PB_Unicode
        *d = @Delm$
      ElseIf Enc = #PB_Ascii
        *d = Ascii(Delm$)
      Else
        *d = UTF8(Delm$)
      EndIf
      While *s\c      ; > 0 means valid Character, = 0 means String terminated.
        If CompareMemoryString(*s, *d, UseCase, DelmLen, Enc)  ; <> 0 means different memory
          *s + lenChar
          iLen + 1
        Else          ; = 0  means identical memory
                      ;*s\c = 0   ; Old method terminates string with Chr(0) here. Using iLen instead of altering source string.
          A$(i) = PeekS(*s_prev, iLen, Enc)
          i + 1
          *s_prev = *s + DelmLen * lenChar  ; Store previous pointer
          *s = *s_prev
          iLen = 0
          If i > nStrings
            nStrings + Resize
            ReDim A$(nStrings)
          EndIf
        EndIf
      Wend
      If Enc <> #PB_Unicode
        FreeMemory(*d)
      EndIf
    EndIf
    If *s <> *s_prev  ; Reached last valid element
      A$(i) = PeekS(*s_prev, iLen, Enc) ; Catch #Empty$ due to dangling Delm$, "1,2,"
    EndIf
    nStrings = i
    ReDim A$(nStrings)
    If trimsp
      For i = 0 To nStrings
        A$(i) = Trim(A$(i))
      Next i
      If trimsp = #TAB
        For i = 0 To nStrings
          A$(i) = Trim(A$(i), #TAB$)
        Next i
        ; Repeat Trim Spaces to catch any created by dropping of TAB's.
        For i = 0 To nStrings
          A$(i) = Trim(A$(i))
        Next i
      EndIf
    EndIf
    ; Consider UCase(s$)/LCase(s$) prior to running Split() operation to save loop time.
    ; But, this limits the casing of the Delm$.
    If SetCase = #PB_String_LowerCase
      For i = 0 To nStrings
        A$(i) = LCase(A$(i))
      Next i
    ElseIf SetCase = #PB_String_UpperCase
      For i = 0 To nStrings
        A$(i) = UCase(A$(i))
      Next i
    EndIf
    nStrings + 1
  EndIf
  ProcedureReturn nStrings
EndProcedure
;-{ TEST
Dim a$(0)
Define.i i, nPts
Define.s d$ = ","
Define.s s$, r$ = ",,,, ,,A,,,B,,C,,ZZ,"
For i = 1 To 10000
  s$ + r$; + #CRLF$
Next i
If 0
  nPts = Split(@s$, a$(), d$)
  Debug "Split(@'" + s$ + "', '" + d$ + "') = " + Str(nPts)
  For i = 0 To nPts - 1
    Debug Str(i) + ",{" + a$(i) + "}"
  Next i
EndIf
If 0
  nPts = SplitWilbert(s$, a$(), d$)
  Debug "SplitWil('" + s$ + "', '" + d$ + "') = " + Str(nPts)
  For i = 0 To nPts - 1
    Debug Str(i) + ",{" + a$(i) + "}"
  Next i
EndIf
If 0
  nPts = SplitC(@s$, a$(), @d$)
  Debug "SplitC(@'" + s$ + "', @'" + d$ + "') = " + Str(nPts)
  For i = 0 To nPts - 1
    Debug Str(i) + ",{" + a$(i) + "}"
  Next i
EndIf
;-} TEST
;-{ TEST SPEED
CompilerIf #PB_Compiler_Debugger = 0
  Macro ML_pcChange(y1, y2)
    ; Compute % change from y1 to y2.
    100.0 * (y2 - (y1)) / (y1 + 1e-16)
  EndMacro
  SetPriorityClass_(GetCurrentProcess_(), #REALTIME_PRIORITY_CLASS)
  #Tries = 1  ;-! SET #TRIES
  Define.i u,time,t1,t2,t3,t4
  Define.i tw = 32
  Define.s r$
  Define.s code1$ = "SplitC"
  Define.s code2$ = "SplitWil"  ;http://www.purebasic.fr/english/viewtopic.php?p=486360#p486360
  Define.s code3$ = "Split"
  Define.s code4$ = "SplitPB"
  
  time = ElapsedMilliseconds()
  For u = 1 To #Tries
    ;-> INSERT CODE 1 HERE...
    Dim a$(0)
    nPts = SplitC(@s$, a$(), @d$)
    Debug "SplitC(@'" + s$ + "', @'" + d$ + "') = " + Str(nPts)
    For i = 0 To nPts - 1
      Debug Str(i) + ",{" + a$(i) + "}"
    Next i
  Next u
  t1 = ElapsedMilliseconds()-time
  
  time = ElapsedMilliseconds()
  For u = 1 To #Tries
    ;-> INSERT CODE 2 HERE...
    Dim a$(0)
    nPts = SplitWilbert(s$, a$(), d$)
    Debug "SplitWil('" + s$ + "','" + d$ + "') = " + Str(nPts)
    For i = 0 To nPts - 1
      Debug Str(i) + ",{" + a$(i) + "}"
    Next i
  Next u
  t2 = ElapsedMilliseconds()-time
  
  time = ElapsedMilliseconds()
  For u = 1 To #Tries
    ;-> INSERT CODE 3 HERE...
    Dim a$(0)
    nPts = Split(@s$, a$(), d$)
    Debug "Split(@'" + s$ + "', '" + d$ + "') = " + Str(nPts)
    For i = 0 To nPts - 1
      Debug Str(i) + ",{" + a$(i) + "}"
    Next i
  Next u
  t3 = ElapsedMilliseconds()-time
  
  time = ElapsedMilliseconds()
  For u = 1 To #Tries
    ;-> INSERT CODE 4 HERE...
    nPts = SplitPB(s$, a$(), d$)
    Debug "SplitPB('" + s$ + "','" + d$ + "') = " + Str(nPts)
    For i = 0 To nPts - 1
      Debug Str(i) + ",{" + a$(i) + "}"
    Next i
  Next u
  t4 = ElapsedMilliseconds()-time
  
  r$ = LSet("; Count(n),",tw) + Str(#Tries) + #CRLF$
  r$ + LSet("; "+code1$+"(ms),",tw) + Str(t1) + #CRLF$
  r$ + LSet("; "+code2$+"(ms),",tw) + Str(t2) + #CRLF$
  r$ + LSet("; "+code3$+"(ms),",tw) + Str(t3) + #CRLF$
  r$ + LSet("; "+code4$+"(ms),",tw) + Str(t4) + #CRLF$
  r$ + LSet("; "+code1$+" : "+code2$+"(%),",tw) + StrD(ML_pcChange(t1,t2),2) + #CRLF$
  r$ + LSet("; "+code1$+" : "+code3$+"(%),",tw) + StrD(ML_pcChange(t1,t3),2) + #CRLF$
  r$ + LSet("; "+code1$+" : "+code4$+"(%),",tw) + StrD(ML_pcChange(t1,t4),2) + #CRLF$
  If MessageRequester("Speed Test - Copy To Clipboard?",r$,#PB_MessageRequester_YesNo) = #PB_MessageRequester_Yes
    SetClipboardText(r$)
  EndIf
  SetPriorityClass_(GetCurrentProcess_(), #NORMAL_PRIORITY_CLASS)
CompilerEndIf
Debug "--END--"
Debug s$
;-} TEST SPEED
;/////////////////////////////////////////////
; d$ = ",,"
; s$ = 10 x r$ = ",,,, ,,A,,,B,,C,,ZZ,"
; Count(n),                     100000
; SplitC(ms),                   1373
; SplitWil(ms),                 1380
; Split(ms),                    1050
; SplitPB(ms),                  3116
; SplitC : SplitWil(%),         0.51
; SplitC : Split(%),            -23.53
; SplitC : SplitPB(%),          126.95
;/////////////////////////////////////////////
; d$ = ",,"
; s$ = 10000 x r$ = ",,,, ,,A,,,B,,C,,ZZ,"
; Count(n),                     1
; SplitC(ms),                   8
; SplitWil(ms),                 2816
; Split(ms),                    17
; SplitPB(ms),                  21749
; SplitC : SplitWil(%),         35100.00
; SplitC : Split(%),            112.50
; SplitC : SplitPB(%),          271762.50
;/////////////////////////////////////////////
; d$ = ","
; s$ = 10000 x r$ = ",,,, ,,A,,,B,,C,,ZZ,"
; Count(n),                     1
; SplitC(ms),                   22
; SplitWil(ms),                 6441
; Split(ms),                    35
; SplitPB(ms),                  13602
; SplitC : SplitWil(%),         29177.27
; SplitC : Split(%),            59.09
; SplitC : SplitPB(%),          61727.27
;/////////////////////////////////////////////
EDIT: Edits made per nco2k suggestions and more optimizations.
Last edited by skywalk on Wed Apr 25, 2018 3:51 am, edited 8 times in total.
The nice thing about standards is there are so many to choose from. ~ Andrew Tanenbaum
User avatar
nco2k
Addict
Addict
Posts: 1344
Joined: Mon Sep 15, 2003 5:55 am

Re: FastStringSplit improve speed ...

Post by nco2k »

>> MSDN states this is bad. So, I just avoid it altogether.
yes, you wouldnt want to do that. you should finish one string, before you go to the next one.

>> SplitC() does not catch a trailing null?
yes, thats by design. why would you want to do that anyway? i mean you can change it if you really want to, i just dont see the point.

>> My Split() uses a CountString() at beginning.
not a good idea. browsing through the string twice, will give you terrible performance. this might be ok for such a small string, but now try it with a large string. the bigger the string is, the more your procedure will fall behind. and dont use a string parameter. use a pointer instead: http://www.purebasic.fr/english/viewtop ... 98#p513698

if you want to adjust my procedure to fill the array with strings, then you dont need strdup. you know where each string-part begins and ends, and can tell PeekS() where and how much to read.

Code: Select all

Procedure SplitString(*String.Character, *Delimiter, Array StringArray$(1), ArrayGrowth=100)
  Protected *StringEnd, DelimiterLength, ArrayIndex, ArrayBound = ArraySize(StringArray$())
  If *String\c
    DelimiterLength = __StrLen_(*Delimiter) * SizeOf(Character)
    If DelimiterLength
      Repeat
        *StringEnd = __StrStr_(*String, *Delimiter)
        If *StringEnd
          StringArray$(ArrayIndex) = PeekS(*String, (*StringEnd - *String) / SizeOf(Character))
          ArrayIndex + 1
          If ArrayIndex > ArrayBound
            ArrayBound + ArrayGrowth
            ReDim StringArray$(ArrayBound)
          EndIf
          *String = *StringEnd + DelimiterLength
        Else
          Break
        EndIf
      ForEver
    EndIf
    If *String\c
      StringArray$(ArrayIndex) = PeekS(*String)
      ArrayIndex + 1
    EndIf
    ReDim StringArray$(ArrayIndex - 1)
  EndIf
  ProcedureReturn ArrayIndex
EndProcedure
c ya,
nco2k
If OSVersion() = #PB_OS_Windows_ME : End : EndIf
User avatar
mk-soft
Always Here
Always Here
Posts: 5387
Joined: Fri May 12, 2006 6:51 pm
Location: Germany

Re: FastStringSplit improve speed ...

Post by mk-soft »

It only gets faster if you program directly in assembler.

My code is still in the middle of the field. Supports additional double-quotes, however

Code: Select all

Procedure SplitStringArray(String.s, Separator.s, Array Result.s(1), DQuote = #False)
  Protected *String.character, *Separator.character
  Protected *Start, *End, exit, lock, do, dq, len , count, size
  
  size = 8
  Dim Result(size)
  *String = @String
  *Separator = @Separator
  *Start = *String
  *End = *String
  Repeat
    If *String\c = 0
      exit = #True
      do = #True
      If Not dq
        *End = *String
      EndIf
    Else
      If DQuote And *String\c = '"'
        If Not lock
          lock = #True
          dq = #True
          *Start = *String + SizeOf(character)
        Else
          lock = #False
          *End = *String
        EndIf
      EndIf
      If *String\c = *Separator\c And Not lock
        do = #True
        If Not dq
          *End = *String
        EndIf
      EndIf
    EndIf
    If do
      If size < count
        size + 8
        ReDim Result(size)
      EndIf
      len = (*End - *Start) / SizeOf(character)
      If Len > 0
        Result(count) = PeekS(*Start, len) 
      EndIf
      *Start = *String + SizeOf(character)
      count + 1
      do = #False
      dq = #False
    EndIf
    *String + SizeOf(character)
  Until exit
  ReDim Result(count - 1)
  ProcedureReturn count
EndProcedure
Link: SplitStringArray
My Projects ThreadToGUI / OOP-BaseClass / EventDesigner V3
PB v3.30 / v5.75 - OS Mac Mini OSX 10.xx - VM Window Pro / Linux Ubuntu
Downloads on my Webspace / OneDrive
User avatar
CELTIC88
Enthusiast
Enthusiast
Posts: 154
Joined: Thu Sep 17, 2015 3:39 pm

Re: FastStringSplit improve speed ...

Post by CELTIC88 »

update my code with more options :)
interested in Cybersecurity..
Post Reply