High speed split string

Share your advanced PureBasic knowledge/code with the community.
callroot
User
User
Posts: 64
Joined: Sat Mar 05, 2016 10:46 pm

High speed split string

Post by callroot »

Code: Select all


; Reads a whole file through the Windows API and returns its bytes in a string.
;   PATH   - address of a null-terminated path string, e.g. @"C:\DEMO.TXT"
;   Return - the file content, or "" when the file cannot be opened, sized or read
; NOTE(review): the bytes are copied into the string buffer without any decoding,
; so the result is only usable as text when the file encoding matches the
; executable's string format - confirm for Unicode builds.
Procedure.s GETFILETXT(PATH.i)
  Protected BytesRead.l, FileSize.l, Content.s
  ; #OPEN_EXISTING: reading must not create a missing file (as #OPEN_ALWAYS did).
  ; The handle is kept in a pointer-sized integer.
  Protected hFile.i = CreateFile_(PATH, #GENERIC_READ, 0, 0, #OPEN_EXISTING, #FILE_ATTRIBUTE_NORMAL, 0)
  ; CreateFile_ signals failure with #INVALID_HANDLE_VALUE, not with 0.
  If hFile <> #INVALID_HANDLE_VALUE
    FileSize = GetFileSize_(hFile, #Null)
    If FileSize > 0                           ; also rejects the -1 error value
      Content = Space(FileSize)
      If ReadFile_(hFile, @Content, FileSize, @BytesRead, 0) = 0
        Content = ""                          ; read failed: do not return the padding
      EndIf
    EndIf
    CloseHandle_(hFile)
  EndIf
  ProcedureReturn Content
EndProcedure

;*************************************************
; SplitInMemory - in-place string splitter (x86 inline assembly)
; Parameter 1 = address of the main string
; Parameter 2 = address of the delimiter string
; Parameter 3 = address of a pointer variable that receives the table of
;               field pointers (the caller must FreeMemory() it after use)
; Returns the number of delimiters found. Table entry 0 is the main string,
; entries 1..count point just behind each delimiter.
; Both strings are scanned byte by byte up to a zero byte, and every matched
; delimiter inside the main string is overwritten with zero bytes.
; NOTE(review): the raw [esp + n] offsets and "RET $C" assume a 32-bit build in
; which the compiler emits no prologue of its own - confirm before reuse.
; BY  startbin@126.com
;*************************************************
Procedure.l SplitInMemory(Main.i, delim.i, *pAddr.String)
  ;!INT 3
  !MOV ebx, [esp + $C]                        ; keep parameter 3 (address of the caller's pointer variable)
  *pAddr = AllocateMemory(4000000)            ; pointer table: 4,000,000 bytes = 1,000,000 4-byte entries (not bounds-checked below)
  !CMP eax, 0                                 ; expects the AllocateMemory result to still be in eax
  !jnz Begin
  !RET $C                                     ; allocation failed: return 0 without writing the caller's variable
!Begin : 
  !MOV [ebx], eax                             ; store the table address where parameter 3 points
  PokeL(*pAddr, Main.i)                       ; entry 0 = start of the main string
!Start1 : 
  !MOV eax, [esp + $4]                        ; eax = scan position in the main string
  !PUSH esi
  !XOR esi, esi                               ; esi = number of delimiters found
  !TEST eax, eax
  !je short Exit1                             ; null main string
  !PUSH ebp
  !MOV ebp, [esp + $10]                       ; ebp = delimiter string (parameter 2, shifted by the two pushes)
  !TEST ebp, ebp
  !je short Exit2                             ; null delimiter
  !MOV dh, [ebp]                              ; dh = first delimiter byte
  !TEST dh, dh
  !je short Exit2                             ; empty delimiter
  !MOV cl, Byte [eax]
  !TEST cl, cl
  !je short Exit2                             ; empty main string
  !PUSH ebx
  !PUSH edi
  
!Six1 : 
  !INC eax                                    ; eax now points behind the byte held in cl
  !CMP cl, dh
  !jnz short First1                           ; not the first delimiter byte: next byte
  !MOV dl, Byte [eax]
  !LEA ecx, [ebp + 1]                         ; ecx = second delimiter byte
  !MOV edi, eax                               ; resume position if the match fails
  !TEST dl, dl
  !je short Second1
  
!Four1 : 
  !MOV bl, byte [ecx]
  !test bl, bl
  !je short Three1                            ; end of delimiter reached: full match
  !CMP dl, bl                                 ; fixed: was "CMP dl, dl", which can never differ
  !jnz short Second1
  !MOV dl, Byte [eax + 1]
  !INC eax
  !INC ecx
  !TEST dl, dl
  !jnz short Four1
  
!Second1 : 
  !CMP Byte [ecx], 0
  !jnz short Five1                            ; delimiter not exhausted: no match
  
!Three1 : 
  !INC esi                                    ; the delimiter matched completely, count it
  !PUSH eax
  !PUSH ecx
  !SUB ecx, ebp                               ; ecx = delimiter length
  !SUB eax, ecx                               ; eax = start of the matched delimiter
!AG : 
  !MOV Byte [eax], 0                          ; overwrite the delimiter with zero bytes
  !ADD eax, 1
  !SUB ecx, 1
  !jnz AG
  !MOV ecx, [esp + $24]                       ; parameter 3 slot (6 pushes + $C); expected to hold the table address by now
  !LEA ecx, [ecx + esi*4]
  !MOV [ecx], eax                             ; entry esi = first byte behind the delimiter
  !POP ecx
  !POP eax
  ; the original note here says [esp+$1C] = parameter 3, i.e. before the two pushes above
  !jmp short First1
  
!Five1 : 
  !MOV eax, edi                               ; failed match: continue behind the first delimiter byte
  
!First1 : 
  !MOV cl, [eax]
  !TEST cl, cl
  !jnz short Six1                             ; loop until the terminating zero byte
  !POP edi
  !POP ebx
  !POP ebp
  !MOV eax, esi                               ; return the delimiter count
  !POP esi
  !RET $C
  
!Exit2 : 
  !POP ebp
  !MOV eax, esi
  !POP esi
  !RET $C
  
!Exit1 : 
  !MOV eax, esi
  !POP esi
  !RET $C
EndProcedure

; Demo: load C:\DEMO.TXT, split it in place on CRLF, time the split and
; show the last field.
Define *pAddr.String, *ss.String
c.s = GETFILETXT(@"C:\DEMO.TXT")
p.l = Len(c)
StartTime = timeGetTime_()

c1.s = #CRLF$                                   ; delimiter
pl.l = SplitInMemory(@c, @c1, @*pAddr)
ElapsedTime = timeGetTime_() - StartTime
If *pAddr
  MessageRequester("", Hex(*pAddr))
  *ss = *pAddr                                  ; remember the table base for FreeMemory
  ; Read every field once: entries 0..pl, one 4-byte pointer each.
  i.l = 0
  While i <= pl
    c1 = *pAddr\s
    *pAddr + 4
    i + 1
  Wend
  *pAddr - 4                                    ; step back onto the last entry
  MessageRequester("", *pAddr\s)
  FreeMemory(*ss)                               ; the table must be released after use
EndIf
MessageRequester(Str(ElapsedTime), Str(pl))




infratec
Always Here
Always Here
Posts: 6817
Joined: Sun Sep 07, 2008 12:45 pm
Location: Germany

Re: High speed split string

Post by infratec »

Hi,

I did not test your code, but why use the complicated API when native PB needs less code and is clearer:

Code: Select all

; Returns the complete text of Filename$, or an empty string when the file
; cannot be opened.
Procedure.s GetFileText(Filename$)
  
  Protected Content$
  Protected Handle.i = ReadFile(#PB_Any, Filename$)
  
  If Handle = 0
    ProcedureReturn Content$
  EndIf
  
  ; #PB_File_IgnoreEOL reads past line ends, so one call fetches everything.
  Content$ = ReadString(Handle, #PB_File_IgnoreEOL)
  CloseFile(Handle)
  
  ProcedureReturn Content$
  
EndProcedure
Bernd
uweb
User
User
Posts: 98
Joined: Wed Mar 15, 2006 9:40 am
Location: Germany

Re: High speed split string

Post by uweb »

Maybe GETFILETXT() is not the best example, but SplitInMemory() looks very good to me.

Thank you for the tip callroot.
And thank you for the code startbin@126.com!

Code: Select all

;*************************************************
; SplitInMemory - in-place string splitter (x86 inline assembly)
; Parameter 1 = address of the main string
; Parameter 2 = address of the delimiter string
; Parameter 3 = address of a pointer variable that receives the table of
;               field pointers (the caller must FreeMemory() it after use)
; Returns the number of delimiters found. Table entry 0 is the main string,
; entries 1..count point just behind each delimiter.
; Both strings are scanned byte by byte up to a zero byte, and every matched
; delimiter inside the main string is overwritten with zero bytes.
; NOTE(review): the raw [esp + n] offsets and "RET $C" assume a 32-bit build in
; which the compiler emits no prologue of its own - confirm before reuse.
; BY  startbin@126.com
;*************************************************
Procedure.l SplitInMemory(Main.i, delim.i, *pAddr.String)
  ;!INT 3
  !MOV ebx, [esp + $C]                        ; keep parameter 3 (address of the caller's pointer variable)
  *pAddr = AllocateMemory(4000000)            ; pointer table: 4,000,000 bytes = 1,000,000 4-byte entries (not bounds-checked below)
  !CMP eax, 0                                 ; expects the AllocateMemory result to still be in eax
  !jnz Begin
  !RET $C                                     ; allocation failed: return 0 without writing the caller's variable
!Begin :
  !MOV [ebx], eax                             ; store the table address where parameter 3 points
  PokeL(*pAddr, Main.i)                       ; entry 0 = start of the main string
!Start1 :
  !MOV eax, [esp + $4]                        ; eax = scan position in the main string
  !PUSH esi
  !XOR esi, esi                               ; esi = number of delimiters found
  !TEST eax, eax
  !je short Exit1                             ; null main string
  !PUSH ebp
  !MOV ebp, [esp + $10]                       ; ebp = delimiter string (parameter 2, shifted by the two pushes)
  !TEST ebp, ebp
  !je short Exit2                             ; null delimiter
  !MOV dh, [ebp]                              ; dh = first delimiter byte
  !TEST dh, dh
  !je short Exit2                             ; empty delimiter
  !MOV cl, Byte [eax]
  !TEST cl, cl
  !je short Exit2                             ; empty main string
  !PUSH ebx
  !PUSH edi
 
!Six1 :
  !INC eax                                    ; eax now points behind the byte held in cl
  !CMP cl, dh
  !jnz short First1                           ; not the first delimiter byte: next byte
  !MOV dl, Byte [eax]
  !LEA ecx, [ebp + 1]                         ; ecx = second delimiter byte
  !MOV edi, eax                               ; resume position if the match fails
  !TEST dl, dl
  !je short Second1
 
!Four1 :
  !MOV bl, byte [ecx]
  !test bl, bl
  !je short Three1                            ; end of delimiter reached: full match
  !CMP dl, bl                                 ; fixed: was "CMP dl, dl", which can never differ
  !jnz short Second1
  !MOV dl, Byte [eax + 1]
  !INC eax
  !INC ecx
  !TEST dl, dl
  !jnz short Four1
 
!Second1 :
  !CMP Byte [ecx], 0
  !jnz short Five1                            ; delimiter not exhausted: no match
 
!Three1 :
  !INC esi                                    ; the delimiter matched completely, count it
  !PUSH eax
  !PUSH ecx
  !SUB ecx, ebp                               ; ecx = delimiter length
  !SUB eax, ecx                               ; eax = start of the matched delimiter
!AG :
  !MOV Byte [eax], 0                          ; overwrite the delimiter with zero bytes
  !ADD eax, 1
  !SUB ecx, 1
  !jnz AG
  !MOV ecx, [esp + $24]                       ; parameter 3 slot (6 pushes + $C); expected to hold the table address by now
  !LEA ecx, [ecx + esi*4]
  !MOV [ecx], eax                             ; entry esi = first byte behind the delimiter
  !POP ecx
  !POP eax
  ; the original note here says [esp+$1C] = parameter 3, i.e. before the two pushes above
  !jmp short First1
 
!Five1 :
  !MOV eax, edi                               ; failed match: continue behind the first delimiter byte
 
!First1 :
  !MOV cl, [eax]
  !TEST cl, cl
  !jnz short Six1                             ; loop until the terminating zero byte
  !POP edi
  !POP ebx
  !POP ebp
  !MOV eax, esi                               ; return the delimiter count
  !POP esi
  !RET $C
 
!Exit2 :
  !POP ebp
  !MOV eax, esi
  !POP esi
  !RET $C
 
!Exit1 :
  !MOV eax, esi
  !POP esi
  !RET $C
EndProcedure


;- Read.s
; Benchmark: read the 61 command names from the DataSection 10000 times.
DataSection
  Commands:
  Data.s "ABOR", "ACCT", "ADAT", "ALLO", "APPE", "AUTH", "CCC", "CDUP", "CONF", "CWD"
  Data.s "DELE", "ENC", "EPRT", "EPSV", "FEAT", "HELP", "HOST", "LANG", "LIST", "LPRT"
  Data.s "LPSV", "MDTM", "MIC", "MKD", "MLSD", "MLST", "MODE", "NLST", "NOOP", "OPTS"
  Data.s "PASS", "PASV", "PBSZ", "PORT", "PROT", "PWD", "QUIT", "REIN", "REST", "RETR"
  Data.s "RMD", "RNFR", "RNTO", "SITE", "SIZE", "SMNT", "STAT", "STOR", "STOU", "STRU"
  Data.s "SYST", "TYPE", "USER", "XCUP", "XMKD", "XPWD", "XRCP", "XRMD", "XRSQ", "XSEM"
  Data.s "XSEN"
EndDataSection
Maximal = 61
StartTime = timeGetTime_()
For t = 1 To 10000
  Restore Commands                  ; rewind to the first entry for every round
  i = 1
  While i <= Maximal
    Read.s test.s
    ;Debug test
    i + 1
  Wend
Next
Debug timeGetTime_() - StartTime


;- StringField
; Benchmark: extract every field with CountString/StringField, 10000 rounds.
c.s = "ABOR;ACCT;ADAT;ALLO;APPE;AUTH;CCC;CDUP;CONF;CWD;DELE;ENC;EPRT;EPSV;FEAT;HELP;HOST;LANG;LIST;LPRT;LPSV;MDTM;MIC;MKD;MLSD;MLST;MODE;NLST;NOOP;OPTS;PASS;PASV;PBSZ;PORT;PROT;PWD;QUIT;REIN;REST;RETR;RMD;RNFR;RNTO;SITE;SIZE;SMNT;STAT;STOR;STOU;STRU;SYST;TYPE;USER;XCUP;XMKD;XPWD;XRCP;XRMD;XRSQ;XSEM;XSEN"
c1.s = ";"                          ; delimiter
StartTime = timeGetTime_()
For t = 1 To 10000
  j = CountString(c, c1)            ; delimiters; there is one more field than that
  i = 1
  While i <= j + 1
    test.s = StringField(c, i, c1)
    ;Debug test
    i + 1
  Wend
Next
Debug timeGetTime_() - StartTime


;- Split multiple
; Benchmark: split the same text 10000 times with SplitInMemory.
; SplitInMemory overwrites every delimiter of its input with zero bytes and
; allocates a new 4,000,000-byte pointer table on each call. Every round
; therefore works on a fresh copy of the text and frees its table; without
; that, only the first round splits anything and the tables pile up.
source.s = "ABOR;ACCT;ADAT;ALLO;APPE;AUTH;CCC;CDUP;CONF;CWD;DELE;ENC;EPRT;EPSV;FEAT;HELP;HOST;LANG;LIST;LPRT;LPSV;MDTM;MIC;MKD;MLSD;MLST;MODE;NLST;NOOP;OPTS;PASS;PASV;PBSZ;PORT;PROT;PWD;QUIT;REIN;REST;RETR;RMD;RNFR;RNTO;SITE;SIZE;SMNT;STAT;STOR;STOU;STRU;SYST;TYPE;USER;XCUP;XMKD;XPWD;XRCP;XRMD;XRSQ;XSEM;XSEN"
c1.s = ";" ;Segmentation symbol
Define *pAddr.String, *base
StartTime = timeGetTime_()
For t = 1 To 10000
  c.s = source                      ; fresh copy: the previous round destroyed the delimiters
  *pAddr = 0                        ; SplitInMemory does not write the variable when its allocation fails
  pl.l = SplitInMemory(@c, @c1, @*pAddr)
  If *pAddr
    *base = *pAddr                  ; table base, needed for FreeMemory
    test.s = *pAddr\s
    ;Debug test
    For i = 1 To pl
      *pAddr = *pAddr + 4           ; table entries are 4-byte pointers (x86)
      test.s = *pAddr\s
      ;Debug test
    Next
    FreeMemory(*base)               ; release this round's table
  EndIf
Next
Debug timeGetTime_()-StartTime


;- Split once, use multiple
; Benchmark: split a single time, then read all fields 10000 times.
c.s = "ABOR;ACCT;ADAT;ALLO;APPE;AUTH;CCC;CDUP;CONF;CWD;DELE;ENC;EPRT;EPSV;FEAT;HELP;HOST;LANG;LIST;LPRT;LPSV;MDTM;MIC;MKD;MLSD;MLST;MODE;NLST;NOOP;OPTS;PASS;PASV;PBSZ;PORT;PROT;PWD;QUIT;REIN;REST;RETR;RMD;RNFR;RNTO;SITE;SIZE;SMNT;STAT;STOR;STOU;STRU;SYST;TYPE;USER;XCUP;XMKD;XPWD;XRCP;XRMD;XRSQ;XSEM;XSEN"
c1.s = ";"                          ; delimiter
StartTime = timeGetTime_()
pl.l = SplitInMemory(@c, @c1, @*pAddr)
If *pAddr
  *ss = *pAddr                      ; table base, kept for the re-reads and FreeMemory
  For t = 1 To 10000
    ; entries 0..pl, one 4-byte pointer each
    For i = 0 To pl
      *pAddr = *ss + i * 4
      test.s = *pAddr\s
      ;Debug test
    Next
  Next
  FreeMemory(*ss)
EndIf
Debug timeGetTime_() - StartTime
Please pardon my English, my native tongue is German.
Bo Marchais
User
User
Posts: 61
Joined: Sun Apr 03, 2016 12:03 am

Re: High speed split string

Post by Bo Marchais »

Can someone reformat this with an example and some easier to understand notes on use?

I use this kind of function all the time. I'm using a slow substitute in Pb, but I really long for
a function like this with simple calling syntax:

new a$()
a$() = split(string$, token$)

It appears that this version (IMPORTANTLY) releases intermediate memory when done.
I have applications that use split functions perhaps tens of thousands of times on large
strings in just moments and the performance boost would be great!

Yes, I see my own laziness in making the request. :)
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3870
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: High speed split string

Post by wilbert »

This is not an assembler approach but you might want to see if it's fast enough for you.

Code: Select all

; Splits String at every occurrence of Separator.
;   String      - text to split
;   StringArray - receives the fields; resized so its last index equals the
;                 number of separators found
;   Separator   - delimiter text, a single space by default
Procedure Split(String.s, Array StringArray.s(1), Separator.s = " ")
  
  ; View is a String structure whose string pointer is moved through
  ; *ViewPtr, so FindString and PeekS work on the remaining text directly.
  Protected View.String, *ViewPtr.Integer = @View
  Protected.i sepCount, index, hit, sepLen
  
  sepLen = Len(Separator)
  sepCount = CountString(String, Separator)
  ReDim StringArray(sepCount)
  
  *ViewPtr\i = @String
  For index = 0 To sepCount - 1
    hit = FindString(View\s, Separator)
    StringArray(index) = PeekS(*ViewPtr\i, hit - 1)
    ; skip the field and the separator, counted in bytes
    *ViewPtr\i + (hit + sepLen - 1) * SizeOf(Character)
  Next
  StringArray(index) = View\s       ; the text behind the last separator
  
  ; Clear the borrowed pointer again - presumably so that View does not
  ; release memory that belongs to String.
  *ViewPtr\i = 0
  
EndProcedure

; Concatenates all elements of StringArray, placing Separator between
; neighbouring elements, and returns the result.
Procedure.s Join(Array StringArray.s(1), Separator.s = " ")
  
  Protected.i last, index, sepLen, total, *writePos
  last = ArraySize(StringArray())
  sepLen = Len(Separator)
  
  ; total characters = every element plus one separator per gap
  total = last * sepLen
  For index = 0 To last
    total + Len(StringArray(index))
  Next
  
  ; Character buffer with room for the text and its terminating zero
  Protected Dim buffer.c(total)
  *writePos = @buffer()
  CopyMemoryString(StringArray(0), @*writePos)
  index = 1
  While index <= last
    CopyMemoryString(Separator)
    CopyMemoryString(StringArray(index))
    index + 1
  Wend
  
  ProcedureReturn PeekS(@buffer())
  
EndProcedure



; test: split a sentence at spaces, list the words, then join them with "*"

S.s = "This is a test string to see if split and join are working."

Dim MyStrings.s(0)
Split(S, MyStrings())

last = ArraySize(MyStrings())
i = 0
While i <= last
  Debug MyStrings(i)
  i + 1
Wend

JS.s = Join(MyStrings(), "*")

Debug JS
Windows (x64)
Raspberry Pi OS (Arm64)
Bo Marchais
User
User
Posts: 61
Joined: Sun Apr 03, 2016 12:03 am

Re: High speed split string

Post by Bo Marchais »

Wilbert, thank you!

I haven't profiled this to see the speed, but it's the best of all the samples I saw with the clearest example.

Is it possible to assign the array as the output using an array copy or something, so the array is returned
from the ProcedureReturn ? From a beginning programmer's point of view, there are some advantages in using
"result = verb(input)" syntax, and it is more clear.
Is there any hope to do something like this:

dim array()
array() = split(string$)

This syntax is attractive to people coming from outside PB. But I think maybe Purebasic won't allow it,
because it performs the function of copy array.

I find myself wondering if the only way to pull this off is to create a map on the fly and then return the split data
as keys, instead of returning the array. I'd have to put the sequence #s in the value fields, and then...
No, It's an ugly idea.


I am sure I will adapt to the purebasic syntax.
Thank you again... are you the same Wilbert from PB days? If so, I always liked your posts...
Very helpful.
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3870
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: High speed split string

Post by wilbert »

Bo Marchais wrote:Is it possible to assign the array as the output
Unfortunately not.
If it would be possible I would have used that approach since I agree it's clearer.
Bo Marchais wrote:Thank you again... are you the same Wilbert from PB days?
If this time PB means PowerBasic, then the answer is no; didn't know there's a Wilbert there also. :shock:
You can find posts online from me about RapidQ (long time ago before I found PureBasic) and KoolMoves (Flash / ActionScript related) but I haven't used PowerBasic.
Windows (x64)
Raspberry Pi OS (Arm64)
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3870
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: High speed split string

Post by wilbert »

Sometimes a List is easier to use compared to an array (removing or inserting items).
Here's split and join for List also.

Code: Select all

; Splits String at every occurrence of Separator.
;   String      - text to split
;   StringArray - receives the fields; resized so its last index equals the
;                 number of separators found
;   Separator   - delimiter text, a single space by default
Procedure Split(String.s, Array StringArray.s(1), Separator.s = " ")
  
  ; View is a String structure whose string pointer is moved through
  ; *ViewPtr, so FindString and PeekS work on the remaining text directly.
  Protected View.String, *ViewPtr.Integer = @View
  Protected.i sepCount, index, hit, sepLen
  
  sepLen = Len(Separator)
  sepCount = CountString(String, Separator)
  ReDim StringArray(sepCount)
  
  *ViewPtr\i = @String
  For index = 0 To sepCount - 1
    hit = FindString(View\s, Separator)
    StringArray(index) = PeekS(*ViewPtr\i, hit - 1)
    ; skip the field and the separator, counted in bytes
    *ViewPtr\i + (hit + sepLen - 1) * SizeOf(Character)
  Next
  StringArray(index) = View\s       ; the text behind the last separator
  
  ; Clear the borrowed pointer again - presumably so that View does not
  ; release memory that belongs to String.
  *ViewPtr\i = 0
  
EndProcedure

; Concatenates all elements of StringArray, placing Separator between
; neighbouring elements, and returns the result.
Procedure.s Join(Array StringArray.s(1), Separator.s = " ")
  
  Protected.i last, index, sepLen, total, *writePos
  last = ArraySize(StringArray())
  sepLen = Len(Separator)
  
  ; total characters = every element plus one separator per gap
  total = last * sepLen
  For index = 0 To last
    total + Len(StringArray(index))
  Next
  
  ; Character buffer with room for the text and its terminating zero
  Protected Dim buffer.c(total)
  *writePos = @buffer()
  CopyMemoryString(StringArray(0), @*writePos)
  index = 1
  While index <= last
    CopyMemoryString(Separator)
    CopyMemoryString(StringArray(index))
    index + 1
  Wend
  
  ProcedureReturn PeekS(@buffer())
  
EndProcedure

; Splits String at every occurrence of Separator into StringList().
; The list is cleared first and always receives at least one element.
Procedure SplitL(String.s, List StringList.s(), Separator.s = " ")
  
  ; View is a String structure whose string pointer is moved through
  ; *ViewPtr, so FindString and PeekS work on the remaining text directly.
  Protected View.String, *ViewPtr.Integer = @View
  Protected.i hit, sepLen
  sepLen = Len(Separator)
  ClearList(StringList())
  
  *ViewPtr\i = @String
  Repeat
    hit = FindString(View\s, Separator)
    AddElement(StringList())
    If hit
      StringList() = PeekS(*ViewPtr\i, hit - 1)
      ; skip the field and the separator, counted in bytes
      *ViewPtr\i + (hit + sepLen - 1) * SizeOf(Character)
    Else
      StringList() = View\s         ; no further separator: take the rest
    EndIf
  Until hit = 0
  
  ; Clear the borrowed pointer again - presumably so that View does not
  ; release memory that belongs to String.
  *ViewPtr\i = 0
  
EndProcedure

; Concatenates all elements of StringList, placing Separator between
; neighbouring elements, and returns the result ("" for an empty list).
; The list's current element is moved while the elements are walked.
Procedure.s JoinL(List StringList.s(), Separator.s = " ")
  
  Protected.i slen, tlen, *buffer
  
  ; An empty list would make tlen negative (0 - slen) and with it the
  ; buffer size below, so it is answered right away.
  If ListSize(StringList()) = 0
    ProcedureReturn ""
  EndIf
  
  slen = Len(Separator)
  ForEach StringList()
    tlen + Len(StringList()) + slen
  Next
  tlen - slen                       ; no separator behind the last element
  
  ; Character buffer with room for the text and its terminating zero
  Protected Dim buffer.c(tlen)
  *buffer = @buffer()
  If FirstElement(StringList())
    CopyMemoryString(StringList(), @*buffer)
    While NextElement(StringList())
      CopyMemoryString(Separator)
      CopyMemoryString(StringList())
    Wend
  EndIf

  ProcedureReturn PeekS(@buffer())
  
EndProcedure



; test: split a sentence into a list, join it, drop one word and join again

NewList MyStrings.s()
S.s = "This is a test string to see if split and join are working."
SplitL(S, MyStrings())

ForEach MyStrings()
  Debug MyStrings()
Next

JS.s = JoinL(MyStrings(), "*")
Debug JS

; remove the fifth element (positions count from 0)
If SelectElement(MyStrings(), 4) : EndIf
DeleteElement(MyStrings())

JS = JoinL(MyStrings(), "*")
Debug JS
Windows (x64)
Raspberry Pi OS (Arm64)
Joris
Addict
Addict
Posts: 885
Joined: Fri Oct 16, 2009 10:12 am
Location: BE

Re: High speed split string

Post by Joris »

Bo Marchais wrote:Is it possible to assign the array as the output using an array copy or something, so the array is returned
from the ProcedureReturn ?
If I do understand you correctly...
This can be done with ExtractRegularExpression, but I don't know whether the speed is optimal then...

Code: Select all

; The pattern matches every three-character word made of a lowercase letter,
; the character 'b' and an uppercase letter, e.g. abC.
; All matches are extracted into Result$() and printed.
If CreateRegularExpression(0, "[a-z]b[A-Z]") = 0
  Debug RegularExpressionError()
Else
  Dim Result$(0)
  NbFound = ExtractRegularExpression(0, "abC ABc zbA abc", Result$())
  k = 0
  While k < NbFound
    Debug Result$(k)
    k + 1
  Wend
EndIf
This example comes from the PB-help, so you'll have to change some things.

__________________________________________________
Quote tags>Code tags
10.04.2016
RSBasic
Yeah I know, but keep in mind ... Leonardo da Vinci was also an autodidact.
User avatar
minimy
Enthusiast
Enthusiast
Posts: 344
Joined: Mon Jul 08, 2013 8:43 pm

Re: High speed split string

Post by minimy »

Very good! javascripters and 80´s assemblers are lucky! :mrgreen:
Thanks callroot, infratec and wilbert for sharing code and examples! :D
Really good!
If translation=Error: reply="Sorry, Im Spanish": Endif
Fred
Administrator
Administrator
Posts: 16621
Joined: Fri May 17, 2002 4:39 pm
Location: France
Contact:

Re: High speed split string

Post by Fred »

On a side note, you should not write code like this:

Code: Select all

*S\i + (p + slen - 1) << #PB_Compiler_Unicode
A constant value can change in a future version of PB, so even if it works now (because it's 1), it's not guaranteed to work in the future and could cause hard-to-find bugs.
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3870
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: High speed split string

Post by wilbert »

Fred wrote:A constant value can change in a future version of PB, so even if it works now (because it's 1), it's not granted to work in the future and could cause hard to find bugs
I assumed in this case it would be okay because it is mentioned in the help file the constant is either 0 or 1.
Well, in the future we don't have to check for ascii or unicode since there will be only unicode. :)
Windows (x64)
Raspberry Pi OS (Arm64)
Fred
Administrator
Administrator
Posts: 16621
Joined: Fri May 17, 2002 4:39 pm
Location: France
Contact:

Re: High speed split string

Post by Fred »

Actually, you are right, for this case it is OK, my bad. Should be the lack of sleep :)
linkerstorm
User
User
Posts: 47
Joined: Sun Feb 18, 2007 11:57 am

Re: High speed split string

Post by linkerstorm »

Hi everyone.

My 2 cents: I need a fast string split procedure as I regularly process huge strings professionally.

I tried the above solutions and the others on different posts, all are ok but for my specific need (performance), I wrote a little tricked procedure (in a module) with mixed ASM to achieve my goal.

Though it is reasonably fast, obviously it has drawbacks to achieve the performance.

Pros:
  - Reasonably fast.
  - No string memory allocation (apart from the separator string).
Cons:
  - The original string is modified, as I work directly in it.
  - x86 only, but the adaptation to x64 seems trivial.
  - Unicode only, but the adaptation to ASCII seems trivial (maybe using the Character structure?).
  - Assumptions on pointer size are hard-coded (some constants would be welcome here).
  - I don't have any clue how PB properly frees the original string (modified) and the array (also modified).
Here is the code, happy tests and waiting for your feedback on the technique used :D

Code: Select all

EnableExplicit

; Public interface of the FastSplit module: a single in-place split procedure.
DeclareModule FastSplit
    
    Declare.i Fast_Split_To_Array(*pio_to_split, pi_sep.s, Array result.s(1))
    
EndDeclareModule

Module FastSplit
    
    ; Splits the string at *pio_to_split in place: each separator is overwritten
    ; with a zero character and the slots of "result" are pointed directly into
    ; the original buffer instead of receiving copies.
    ; *pio_to_split :   Unicode string pointer assumed => 2 bytes per character; the string is modified
    ; pi_sep        :   Unicode separator assumed, can have several chars (typically #CRLF$)
    ; result        :   will be updated
    ; RETURN        :   result array elements count (0 when the input string is empty)
    ; The assembly uses 32-bit registers and 4-byte array slots (x86 only).
    Procedure.i Fast_Split_To_Array(*pio_to_split, pi_sep.s, Array result.s(1))
        
        ; Non printable Unicode char, stands in for a multi-character separator
        #REPLACE_CHAR = $FFFF
        
        ; Macro used here to avoid a costly asm CALL.
        ; Stores the address held in "edx" into slot [current_element] of the array.
        Macro Macro_Asm_Set_Array_String_Pointer
            ! mov eax, 4                            ; 4 bytes size assumed for a 32 bits (x86) pointer
            ! mov [p.p_pedx_backup], edx            ; "edx" register backup (can't use "push/pop edx" because PB local vars are "esp" indexed)
            ! mul dword [p.v_current_element]       ; Multiply by current element index and put the result in "eax" / Beware that "mul" uses the "edx" register, hence the previous backup
            ! mov edx, [p.p_pedx_backup]            ; Get "edx" register value back
            ! add eax, [p.p_parray_string_pointers] ; Add the array base to get the address of the current slot
            ! mov [eax], edx                        ; Put the start address of the current field into that slot
        EndMacro
        
        ; Exit if nothing to split
        If Len(PeekS(*pio_to_split)) = 0
            ProcedureReturn 0
        EndIf
        
        ; One element array if no separator
        If Len(pi_sep) = 0
            ReDim result(0)
            result(0) = PeekS(*pio_to_split)
            ProcedureReturn 1
        EndIf
        
        ; Local copy of the separator (why ? just to show that "pi_sep" should be readonly, no performance hit here)
        Define.s local_sep = pi_sep
        
        ; More than one character in separator => one time performance hit to replace all separator chars by one (#REPLACE_CHAR)
        ; but "ReplaceString" is very well optimized (thanks Fred :) so performance hit is limited
        If Len(local_sep) > 1
            PokeS(*pio_to_split, ReplaceString(PeekS(*pio_to_split), local_sep, Chr(#REPLACE_CHAR)))
            local_sep = Chr(#REPLACE_CHAR)
        EndIf
        
        ; Get the size of zero based result array ("CountString" is very optimized too :)
        Define.i sep_count = CountString(PeekS(*pio_to_split), local_sep)
        
        ; Prepare the result array
        ReDim result(sep_count)
        
        ; No separators found ? Return the original string in an one-element result array
        If sep_count = 0
            result(0) = PeekS(*pio_to_split)
            ProcedureReturn 1
        EndIf
        
        ; Definitions
        Define.i sep = Asc(local_sep) ; Separator character code; the scan below compares it as a 16-bit value (Unicode assumed)
        Define.i current_element = 0  ; Index of the array element currently being filled
        
        ; Trick #1 : get the array REAL address
        
        ; In fact, PB here returns the array first element address (but in the MemoryViewer, "@result()" show the real array address)...
        Define *parray_addr = @result()
        Define *parray_string_pointers = PeekI(@*parray_addr) ; ...reads the value of *parray_addr back; used by the macro as the base of the string pointer slots
        Define *pedx_backup                                   ; Our "edx" register backup (see explanations above in the macro)
        
        ! mov ecx, [p.p_pio_to_split]                   ; ecx = scan position, starts at the string start address...
        ! mov edx, [p.p_pio_to_split]                   ; ...edx = start of the current field
        
        Scan:                                           ; Scan loop
        ! mov ax, [ecx]                                 ; Get current char (Unicode => 2 bytes => ax is ok)
        ! cmp ax, 0                                     ; String end ?
        ! jz fastsplit.ll_fast_split_to_array_endscan   ; Exit to EndScan:
        
        ! cmp ax, [p.v_sep]                             ; Separator found ?
        ! jz fastsplit.ll_fast_split_to_array_sepfound  ; Process to SepFound:
        
        NextChar:                                       ; Point to next char
        ! add ecx, 2                                    ; Unicode => 2 bytes increment
        ! jmp fastsplit.ll_fast_split_to_array_scan     ; Loop to Scan:
        
        ; Trick #2 : replace array original pointer by pointer in original string => no string memory allocation
        
        SepFound:                                       ; Separator found
        ! mov word [ecx], 0                             ; Replace separator by 2 zero-bytes (=> string end)
        Macro_Asm_Set_Array_String_Pointer              ; Our killer macro :)
        ! mov edx, ecx                                  ; Point to...
        ! add edx, 2                                    ; ...next char, the start of the next field
        ! inc dword [p.v_current_element]               ; Increment array current element
        ! jmp fastsplit.ll_fast_split_to_array_nextchar ; Read next char to NextChar:
        
        EndScan:
        Macro_Asm_Set_Array_String_Pointer              ; Our killer Macro again :) for the last value
        
        ProcedureReturn sep_count + 1                   ; Array elements count (PB ArraySize() return the array upper bound, not necessarily the elements count)
        
    EndProcedure
    
EndModule

If #PB_Compiler_IsMainFile
    
    ; Performance counter readings taken before and after the split
    Define.q qpc_s, qpc_e
    
    Define.i elements_count, i
    Define s$ = "a;string;to;slice" ; Try a 50 Mb string, from a big file for example :)
    Define sep$ = ";"               ; Could be a multiple chars separator (eg. #CRLF$) but performance hit in this case
    Dim a.s(0)
    
    Debug "Fast_Split_To_Array"
    
    QueryPerformanceCounter_(@qpc_s)
    elements_count = FastSplit::Fast_Split_To_Array(@s$, sep$, a())
    QueryPerformanceCounter_(@qpc_e)
    Debug Str(qpc_e - qpc_s)
    
    ; Be careful here with big sized string :)
    i = 0
    While i < elements_count        ; same range as 0 To ArraySize(a())
        Debug a(i)
        i + 1
    Wend
    
EndIf
User avatar
Andre
PureBasic Team
PureBasic Team
Posts: 2056
Joined: Fri Apr 25, 2003 6:14 pm
Location: Germany (Saxony, Deutscheinsiedel)
Contact:

Re: High speed split string

Post by Andre »

I didn't do any speed tests, but noted wilbert's Split & Join codes for later use. Thank you :D
Bye,
...André
(PureBasicTeam::Docs & Support - PureArea.net | Order:: PureBasic | PureVisionXP)
Post Reply