It is currently Thu Jun 21, 2018 7:20 pm

All times are UTC + 1 hour




Post new topic Reply to topic  [ 16 posts ]  Go to page 1, 2  Next
Author Message
 Post subject: High speed split string
PostPosted: Sun Mar 13, 2016 6:15 am 
Offline
User
User

Joined: Sat Mar 05, 2016 10:46 pm
Posts: 64
Code:

Procedure.s GETFILETXT(PATH.i)
  ; Reads the whole file whose name is stored at address PATH and returns the
  ; bytes read inside a PureBasic string.
  ;
  ; PATH    : address of a null-terminated file name (the demo passes @"C:\DEMO.TXT").
  ; Returns : a string whose buffer holds the bytes read, or an empty string
  ;           when the file cannot be opened, is empty or cannot be read.
  ;
  ; NOTE(review): the bytes are copied into the string buffer without any
  ; conversion, so the result is only meaningful when the file encoding matches
  ; the string format of the executable - confirm before using a Unicode build.
  Protected hFile.i, fileSize.l, bytesRead.l, text.s

  ; #OPEN_EXISTING instead of #OPEN_ALWAYS: a read helper must not create an
  ; empty file when the requested one does not exist.
  hFile = CreateFile_(PATH, #GENERIC_READ, 0, 0, #OPEN_EXISTING, #FILE_ATTRIBUTE_NORMAL, 0)

  ; CreateFile signals failure with #INVALID_HANDLE_VALUE, not with 0.
  If hFile <> #INVALID_HANDLE_VALUE
    fileSize = GetFileSize_(hFile, #Null)
    If fileSize > 0                        ; also rejects the -1 error result
      text = Space(fileSize)               ; reserve the destination buffer
      If ReadFile_(hFile, @text, fileSize, @bytesRead, 0) = 0
        text = ""                          ; read failed: do not return the padding
      EndIf
    EndIf
    CloseHandle_(hFile)
  EndIf

  ProcedureReturn text
EndProcedure

;*************************************************
; SplitInMemory - in-place string splitting function
; Parameter 1 = address of the Main string
; Parameter 2 = address of the delim string
; Parameter 3 = receives the pointer to the table of string pointers
;               (the caller must FreeMemory it after use)
; Returns     = number of delimiters found (the esi counter)
;
; The routine compares single bytes, uses 32-bit registers and leaves with
; "RET $C", so it is written for x86 with one byte per character.
; Every delimiter found in Main is overwritten with zero bytes. Table entry 0
; holds Main, entry N holds the address just behind the N-th delimiter.
; BY  startbin@126.com
;*************************************************
Procedure.l SplitInMemory(Main.i, delim.i, *pAddr.String)
  ;!INT 3
  ; NOTE(review): all [esp + ...] offsets below depend on the stack frame the
  ; PureBasic compiler generates for this procedure - confirm after any change.
  !MOV ebx, [esp + $C]
  *pAddr = AllocateMemory(4000000)            ; allocate the table for the split pointers (room for 1,000,000 4-byte entries)
  !CMP eax, 0
  !jnz Begin
  !RET $C
!Begin :
  !MOV [ebx], eax                             ; store the table address at the address held in ebx (loaded from [esp + $C] above)
  PokeL(*pAddr, Main.i)                       ; table entry 0 = start of Main
!Start1 :
  !MOV eax, [esp + $4]
  !PUSH esi
  !XOR esi, esi                               ; esi = match counter
  !TEST eax, eax
  !je short Exit1
  !PUSH ebp
  !MOV ebp, [esp + $10]
  !TEST ebp, ebp
  !je short Exit2
  !MOV dh, [ebp]                              ; dh = first byte read through ebp
  !TEST dh, dh
  !je short Exit2
  !MOV cl, Byte [eax]
  !TEST cl, cl
  !je short Exit2
  !PUSH ebx
  !PUSH edi

!Six1 :
  !INC eax
  !CMP cl, dh                                 ; does the current byte start a delimiter?
  !jnz short First1
  !MOV dl, Byte [eax]
  !LEA ecx, [ebp + 1]
  !MOV edi, eax                               ; remember where to resume after a partial match
  !TEST dl, dl
  !je short Second1

!Four1 :
  !MOV bl, byte [ecx]
  !test bl, bl
  !je short Three1
  !CMP dl, bl                                 ; FIX: was "CMP dl, dl" (always equal), so bytes after the first were never checked
  !jnz short Second1
  !MOV dl, Byte [eax + 1]
  !INC eax
  !INC ecx
  !TEST dl, dl
  !jnz short Four1

!Second1 :
  !CMP Byte [ecx], 0
  !jnz short Five1

!Three1 :
  !INC esi                                    ; the complete delimiter has been matched here, esi counter + 1
  !PUSH eax
  !PUSH ecx
  !SUB ecx, ebp                               ; ecx = number of delimiter bytes matched
  !SUB eax, ecx                               ; eax = first byte of the matched delimiter
!AG :
  !MOV Byte [eax], 0                          ; overwrite the delimiter with zero bytes
  !ADD eax, 1
  !SUB ecx, 1
  !jnz AG
  !MOV ecx, [esp + $24]
  !LEA ecx, [ecx + esi*4]
  !MOV [ecx], eax                             ; table entry esi = address behind the delimiter
  !POP ecx
  !POP eax
  ; original author's note: [esp+$1C] = parameter 3
  ; NOTE(review): the code above reads [esp + $24], not [esp + $1C] - confirm which is intended.
  !jmp short First1

!Five1 :
  !MOV eax, edi

!First1 :
  !MOV cl, [eax]
  !TEST cl, cl
  !jnz short Six1
  !POP edi
  !POP ebx
  !POP ebp
  !MOV eax, esi
  !POP esi
  !RET $C

!Exit2 :
  !POP ebp
  !MOV eax, esi
  !POP esi
  !RET $C

!Exit1 :
  !MOV eax, esi
  !POP esi
  !RET $C
EndProcedure

; Demo: load C:\DEMO.TXT, split it in place at every CRLF, walk the pointer
; table once and show the last piece together with the elapsed time.
c.s = GETFILETXT(@"C:\DEMO.TXT")
p.l = Len(c)
StartTime = timeGetTime_()

Define *pAddr.String, *ss.String
c1.s = #CRLF$                                  ; delimiter
pl.l = SplitInMemory(@c, @c1, @*pAddr)
ElapsedTime = timeGetTime_() - StartTime

If *pAddr
  MessageRequester("", Hex(*pAddr))
  ;MessageRequester("", *pAddr\s)
  *ss = *pAddr                                 ; keep the table start for FreeMemory
  i.l = 0
  While i <= pl                                ; entries 0 .. pl of the table
    ;MessageRequester("", *pAddr\s)
    c1 = *pAddr\s
    *pAddr + 4                                 ; next 4-byte table entry
    i + 1
  Wend
  *pAddr - 4                                   ; step back onto the last entry
  MessageRequester("", *pAddr\s)
  FreeMemory(*ss)                              ; the table must be released after use
EndIf
MessageRequester(Str(ElapsedTime), Str(pl))






Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sun Mar 13, 2016 1:08 pm 
Offline
Addict
Addict

Joined: Sun Sep 07, 2008 12:45 pm
Posts: 3863
Location: Germany
Hi,

I did not test your code, but why use the complicated API when native PB needs less code and is clearer:

Code:
Procedure.s GetFileText(Filename$)
  ; Returns the complete content of the file Filename$ as one string, or an
  ; empty string when the file cannot be opened for reading.

  Protected Handle.i, Content$

  Handle = ReadFile(#PB_Any, Filename$)
  If Handle = 0
    ProcedureReturn Content$              ; still empty: nothing was read
  EndIf

  ; #PB_File_IgnoreEOL: read on past the line ends up to the end of the file.
  Content$ = ReadString(Handle, #PB_File_IgnoreEOL)
  CloseFile(Handle)

  ProcedureReturn Content$

EndProcedure


Bernd


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Wed Mar 16, 2016 6:27 pm 
Offline
User
User
User avatar

Joined: Wed Mar 15, 2006 9:40 am
Posts: 96
Location: Germany
Maybe GETFILETXT() is not the best example, but SplitInMemory() looks very good to me.

Thank you for the tip callroot.
And thank you for the code startbin@126.com!

Code:
;*************************************************
; SplitInMemory - in-place string splitting function
; Parameter 1 = address of the Main string
; Parameter 2 = address of the delim string
; Parameter 3 = receives the pointer to the table of string pointers
;               (the caller must FreeMemory it after use)
; Returns     = number of delimiters found (the esi counter)
;
; The routine compares single bytes, uses 32-bit registers and leaves with
; "RET $C", so it is written for x86 with one byte per character.
; Every delimiter found in Main is overwritten with zero bytes. Table entry 0
; holds Main, entry N holds the address just behind the N-th delimiter.
; BY  startbin@126.com
;*************************************************
Procedure.l SplitInMemory(Main.i, delim.i, *pAddr.String)
  ;!INT 3
  ; NOTE(review): all [esp + ...] offsets below depend on the stack frame the
  ; PureBasic compiler generates for this procedure - confirm after any change.
  !MOV ebx, [esp + $C]
  *pAddr = AllocateMemory(4000000)            ; allocate the table for the split pointers (room for 1,000,000 4-byte entries)
  !CMP eax, 0
  !jnz Begin
  !RET $C
!Begin :
  !MOV [ebx], eax                             ; store the table address at the address held in ebx (loaded from [esp + $C] above)
  PokeL(*pAddr, Main.i)                       ; table entry 0 = start of Main
!Start1 :
  !MOV eax, [esp + $4]
  !PUSH esi
  !XOR esi, esi                               ; esi = match counter
  !TEST eax, eax
  !je short Exit1
  !PUSH ebp
  !MOV ebp, [esp + $10]
  !TEST ebp, ebp
  !je short Exit2
  !MOV dh, [ebp]                              ; dh = first byte read through ebp
  !TEST dh, dh
  !je short Exit2
  !MOV cl, Byte [eax]
  !TEST cl, cl
  !je short Exit2
  !PUSH ebx
  !PUSH edi

!Six1 :
  !INC eax
  !CMP cl, dh                                 ; does the current byte start a delimiter?
  !jnz short First1
  !MOV dl, Byte [eax]
  !LEA ecx, [ebp + 1]
  !MOV edi, eax                               ; remember where to resume after a partial match
  !TEST dl, dl
  !je short Second1

!Four1 :
  !MOV bl, byte [ecx]
  !test bl, bl
  !je short Three1
  !CMP dl, bl                                 ; FIX: was "CMP dl, dl" (always equal), so bytes after the first were never checked
  !jnz short Second1
  !MOV dl, Byte [eax + 1]
  !INC eax
  !INC ecx
  !TEST dl, dl
  !jnz short Four1

!Second1 :
  !CMP Byte [ecx], 0
  !jnz short Five1

!Three1 :
  !INC esi                                    ; the complete delimiter has been matched here, esi counter + 1
  !PUSH eax
  !PUSH ecx
  !SUB ecx, ebp                               ; ecx = number of delimiter bytes matched
  !SUB eax, ecx                               ; eax = first byte of the matched delimiter
!AG :
  !MOV Byte [eax], 0                          ; overwrite the delimiter with zero bytes
  !ADD eax, 1
  !SUB ecx, 1
  !jnz AG
  !MOV ecx, [esp + $24]
  !LEA ecx, [ecx + esi*4]
  !MOV [ecx], eax                             ; table entry esi = address behind the delimiter
  !POP ecx
  !POP eax
  ; original author's note: [esp+$1C] = parameter 3
  ; NOTE(review): the code above reads [esp + $24], not [esp + $1C] - confirm which is intended.
  !jmp short First1

!Five1 :
  !MOV eax, edi

!First1 :
  !MOV cl, [eax]
  !TEST cl, cl
  !jnz short Six1
  !POP edi
  !POP ebx
  !POP ebp
  !MOV eax, esi
  !POP esi
  !RET $C

!Exit2 :
  !POP ebp
  !MOV eax, esi
  !POP esi
  !RET $C

!Exit1 :
  !MOV eax, esi
  !POP esi
  !RET $C
EndProcedure


;- Read.s
; Baseline: fetch the 61 command names from the data section with Read.s,
; repeat that 10000 times and print the elapsed milliseconds.
DataSection
  Commands:
  Data.s "ABOR", "ACCT", "ADAT", "ALLO", "APPE", "AUTH", "CCC", "CDUP", "CONF", "CWD"
  Data.s "DELE", "ENC", "EPRT", "EPSV", "FEAT", "HELP", "HOST", "LANG", "LIST", "LPRT"
  Data.s "LPSV", "MDTM", "MIC", "MKD", "MLSD", "MLST", "MODE", "NLST", "NOOP", "OPTS"
  Data.s "PASS", "PASV", "PBSZ", "PORT", "PROT", "PWD", "QUIT", "REIN", "REST", "RETR"
  Data.s "RMD", "RNFR", "RNTO", "SITE", "SIZE", "SMNT", "STAT", "STOR", "STOU", "STRU"
  Data.s "SYST", "TYPE", "USER", "XCUP", "XMKD", "XPWD", "XRCP", "XRMD", "XRSQ", "XSEM"
  Data.s "XSEN"
EndDataSection

Maximal = 61                              ; number of Data.s entries above
StartTime = timeGetTime_()
t = 1
While t <= 10000
  Restore Commands                        ; rewind the data pointer for every pass
  For i = 1 To Maximal
    Read.s test.s
    ;Debug test
  Next
  t + 1
Wend
Debug timeGetTime_() - StartTime


;- StringField
; Reference timing with the built-in functions: count the separators and
; fetch every field with StringField(), 10000 times.
c.s = "ABOR;ACCT;ADAT;ALLO;APPE;AUTH;CCC;CDUP;CONF;CWD;DELE;ENC;EPRT;EPSV;FEAT;HELP;HOST;LANG;LIST;LPRT;LPSV;MDTM;MIC;MKD;MLSD;MLST;MODE;NLST;NOOP;OPTS;PASS;PASV;PBSZ;PORT;PROT;PWD;QUIT;REIN;REST;RETR;RMD;RNFR;RNTO;SITE;SIZE;SMNT;STAT;STOR;STOU;STRU;SYST;TYPE;USER;XCUP;XMKD;XPWD;XRCP;XRMD;XRSQ;XSEM;XSEN"
c1.s = ";"                                ; separator

StartTime = timeGetTime_()
For t = 1 To 10000
  j = CountString(c, c1)                  ; j separators -> j + 1 fields
  i = 1
  While i <= j + 1
    test.s = StringField(c, i, c1)
    ;Debug test
    i + 1
  Wend
Next
Debug timeGetTime_() - StartTime


;- Split multiple
; Times 10000 complete SplitInMemory() runs, each followed by one walk over
; the resulting pointer table.
;
; Fixes compared with the posted version:
;  * SplitInMemory() overwrites the separators in the string it is given, so
;    every pass now works on a fresh copy; otherwise only the first pass
;    would find any separator.
;  * The pointer table allocated by every call is released again; it was
;    leaked on each of the 10000 passes.
;  * *pAddr is cleared before each call so a failed call cannot leave the
;    stale pointer of the previous pass behind.
;  * The stray trailing comma after the Define was removed.
c.s = "ABOR;ACCT;ADAT;ALLO;APPE;AUTH;CCC;CDUP;CONF;CWD;DELE;ENC;EPRT;EPSV;FEAT;HELP;HOST;LANG;LIST;LPRT;LPSV;MDTM;MIC;MKD;MLSD;MLST;MODE;NLST;NOOP;OPTS;PASS;PASV;PBSZ;PORT;PROT;PWD;QUIT;REIN;REST;RETR;RMD;RNFR;RNTO;SITE;SIZE;SMNT;STAT;STOR;STOU;STRU;SYST;TYPE;USER;XCUP;XMKD;XPWD;XRCP;XRMD;XRSQ;XSEM;XSEN"
c1.s = ";" ;Segmentation symbol
Define *pAddr.String
Define *tableStart                        ; start of the table, needed for FreeMemory
Define work.s                             ; per-pass copy that may be modified
StartTime = timeGetTime_()
For t = 1 To 10000
  work = c
  *pAddr = 0
  pl.l = SplitInMemory(@work, @c1, @*pAddr)
  If *pAddr
    *tableStart = *pAddr
    test.s = *pAddr\s
    ;Debug test
    For i = 1 To pl
      *pAddr = *pAddr + 4                 ; next 4-byte table entry
      test.s = *pAddr\s
      ;Debug test
    Next
    FreeMemory(*tableStart)
  EndIf
Next
Debug timeGetTime_()-StartTime


;- Split once, use multiple
; Splits the command string a single time, then walks the resulting pointer
; table 10000 times before releasing it.
c.s = "ABOR;ACCT;ADAT;ALLO;APPE;AUTH;CCC;CDUP;CONF;CWD;DELE;ENC;EPRT;EPSV;FEAT;HELP;HOST;LANG;LIST;LPRT;LPSV;MDTM;MIC;MKD;MLSD;MLST;MODE;NLST;NOOP;OPTS;PASS;PASV;PBSZ;PORT;PROT;PWD;QUIT;REIN;REST;RETR;RMD;RNFR;RNTO;SITE;SIZE;SMNT;STAT;STOR;STOU;STRU;SYST;TYPE;USER;XCUP;XMKD;XPWD;XRCP;XRMD;XRSQ;XSEM;XSEN"
c1.s = ";"                                ; separator

StartTime = timeGetTime_()
pl.l = SplitInMemory(@c, @c1, @*pAddr)
If *pAddr
  *ss = *pAddr                            ; table start, kept for rewinding and FreeMemory
  For t = 1 To 10000
    *pAddr = *ss
    test.s = *pAddr\s                     ; entry 0
    ;Debug test
    i = 1
    While i <= pl                         ; entries 1 .. pl
      *pAddr + 4
      test.s = *pAddr\s
      ;Debug test
      i + 1
    Wend
  Next
  FreeMemory(*ss)
EndIf
Debug timeGetTime_() - StartTime

_________________
Please pardon my English, my native tongue is German.


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sat Apr 09, 2016 4:39 pm 
Offline
User
User

Joined: Sun Apr 03, 2016 12:03 am
Posts: 61
Can someone reformat this with an example and some easier to understand notes on use?

I use this kind of function all the time. I'm using a slow substitute in Pb, but I really long for
a function like this with simple calling syntax:

new a$()
a$() = split(string$, token$)

It appears that this version (IMPORTANTLY) releases intermediate memory when done.
I have applications that use split functions perhaps tens of thousands of times on large
strings in just moments and the performance boost would be great!

Yes, I see my own laziness in making the request. :)


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sat Apr 09, 2016 6:19 pm 
Offline
PureBasic Expert
PureBasic Expert

Joined: Sun Aug 08, 2004 5:21 am
Posts: 3142
Location: Netherlands
This is not an assembler approach but you might want to see if it's fast enough for you.
Code:
Procedure Split(String.s, Array StringArray.s(1), Separator.s = " ")
  ; Splits String at every occurrence of Separator and stores the pieces in
  ; StringArray(), which is resized to (number of separators) as upper bound.
  ;
  ; Cursor is a String structure whose pointer field is moved through String
  ; by way of *CursorPtr, so FindString() always searches the unread rest
  ; without copying it first.

  Protected Cursor.String, *CursorPtr.Integer = @Cursor
  Protected.i lastIndex, index, hit, sepLen

  sepLen = Len(Separator)
  lastIndex = CountString(String, Separator)
  ReDim StringArray(lastIndex)

  *CursorPtr\i = @String
  For index = 0 To lastIndex - 1
    hit = FindString(Cursor\s, Separator)
    StringArray(index) = PeekS(*CursorPtr\i, hit - 1)
    ; step over the piece and the separator, in bytes
    *CursorPtr\i + (hit + sepLen - 1) << #PB_Compiler_Unicode
  Next
  StringArray(lastIndex) = Cursor\s       ; whatever follows the last separator
  *CursorPtr\i = 0                        ; detach Cursor from String again

EndProcedure

Procedure.s Join(Array StringArray.s(1), Separator.s = " ")
  ; Concatenates all elements of StringArray(), putting Separator between
  ; neighbouring elements, and returns the result.

  Protected.i last, idx, total, *writePos

  last = ArraySize(StringArray())

  ; total length = all elements plus one separator per gap
  total = Len(Separator) * last
  For idx = 0 To last
    total + Len(StringArray(idx))
  Next

  Protected Dim joined.c(total)
  *writePos = @joined()
  ; the first call hands over the write position, later calls append to it
  CopyMemoryString(StringArray(0), @*writePos)
  idx = 1
  While idx <= last
    CopyMemoryString(Separator)
    CopyMemoryString(StringArray(idx))
    idx + 1
  Wend

  ProcedureReturn PeekS(@joined())

EndProcedure



; test
; Split a sample sentence at spaces, print every word, then join the words
; again with "*" in between.

S.s = "This is a test string to see if split and join are working."

Dim MyStrings.s(0)
Split(S, MyStrings())

i = 0
While i <= ArraySize(MyStrings())
  Debug MyStrings(i)
  i + 1
Wend

JS.s = Join(MyStrings(), "*")

Debug JS

_________________
MacOS 10.13 High Sierra, PB 5.60 x64


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sun Apr 10, 2016 2:14 am 
Offline
User
User

Joined: Sun Apr 03, 2016 12:03 am
Posts: 61
Wilbert, thank you!

I haven't profiled this to see the speed, but it's the best of all the samples I saw with the clearest example.

Is it possible to assign the array as the output using an array copy or something, so the array is returned
from the ProcedureReturn ? From a beginning programmer's point of view, there are some advantages in using
"result = verb(input)" syntax, and it is more clear.
Is there any hope to do something like this:

dim array()
array() = split(string$)

This syntax is attractive to people coming from outside PB. But I think maybe Purebasic won't allow it,
because it performs the function of copy array.

I find myself wondering if the only way to pull this off is to create a map on the fly and then return the split data
as keys, instead of returning the array. I'd have to put the sequence #s in the value fields, and then...
No, It's an ugly idea.


I am sure I will adapt to the purebasic syntax.
Thank you again... are you the same Wilbert from PB days? If so, I always liked your posts...
Very helpful.


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sun Apr 10, 2016 5:09 am 
Offline
PureBasic Expert
PureBasic Expert

Joined: Sun Aug 08, 2004 5:21 am
Posts: 3142
Location: Netherlands
Bo Marchais wrote:
Is it possible to assign the array as the output

Unfortunately not.
If it would be possible I would have used that approach since I agree it's clearer.

Bo Marchais wrote:
Thank you again... are you the same Wilbert from PB days?

If this time PB means PowerBasic, then the answer is no; didn't know there's a Wilbert there also. :shock:
You can find posts online from me about RapidQ (long time ago before I found PureBasic) and KoolMoves (Flash / ActionScript related) but I haven't used PowerBasic.

_________________
MacOS 10.13 High Sierra, PB 5.60 x64


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sun Apr 10, 2016 6:26 am 
Offline
PureBasic Expert
PureBasic Expert

Joined: Sun Aug 08, 2004 5:21 am
Posts: 3142
Location: Netherlands
Sometimes a List is easier to use compared to an array (removing or inserting items).
Here's split and join for List also.
Code:
Procedure Split(String.s, Array StringArray.s(1), Separator.s = " ")
  ; Splits String at every occurrence of Separator and stores the pieces in
  ; StringArray(), which is resized to (number of separators) as upper bound.
  ;
  ; Cursor is a String structure whose pointer field is moved through String
  ; by way of *CursorPtr, so FindString() always searches the unread rest
  ; without copying it first.

  Protected Cursor.String, *CursorPtr.Integer = @Cursor
  Protected.i lastIndex, index, hit, sepLen

  sepLen = Len(Separator)
  lastIndex = CountString(String, Separator)
  ReDim StringArray(lastIndex)

  *CursorPtr\i = @String
  For index = 0 To lastIndex - 1
    hit = FindString(Cursor\s, Separator)
    StringArray(index) = PeekS(*CursorPtr\i, hit - 1)
    ; step over the piece and the separator, in bytes
    *CursorPtr\i + (hit + sepLen - 1) << #PB_Compiler_Unicode
  Next
  StringArray(lastIndex) = Cursor\s       ; whatever follows the last separator
  *CursorPtr\i = 0                        ; detach Cursor from String again

EndProcedure

Procedure.s Join(Array StringArray.s(1), Separator.s = " ")
  ; Concatenates all elements of StringArray(), putting Separator between
  ; neighbouring elements, and returns the result.

  Protected.i last, idx, total, *writePos

  last = ArraySize(StringArray())

  ; total length = all elements plus one separator per gap
  total = Len(Separator) * last
  For idx = 0 To last
    total + Len(StringArray(idx))
  Next

  Protected Dim joined.c(total)
  *writePos = @joined()
  ; the first call hands over the write position, later calls append to it
  CopyMemoryString(StringArray(0), @*writePos)
  idx = 1
  While idx <= last
    CopyMemoryString(Separator)
    CopyMemoryString(StringArray(idx))
    idx + 1
  Wend

  ProcedureReturn PeekS(@joined())

EndProcedure

Procedure SplitL(String.s, List StringList.s(), Separator.s = " ")
  ; Clears StringList() and fills it with the pieces of String found between
  ; occurrences of Separator. The list always receives at least one element.

  Protected Cursor.String, *CursorPtr.Integer = @Cursor
  Protected.i hit, sepLen

  ClearList(StringList())
  sepLen = Len(Separator)

  *CursorPtr\i = @String                  ; Cursor\s now reads from the start of String
  Repeat
    AddElement(StringList())
    hit = FindString(Cursor\s, Separator)
    ; hit = 0 gives a length of -1, i.e. the rest of the string
    StringList() = PeekS(*CursorPtr\i, hit - 1)
    If hit = 0
      Break
    EndIf
    ; step over the piece and the separator, in bytes
    *CursorPtr\i + (hit + sepLen - 1) << #PB_Compiler_Unicode
  ForEver
  *CursorPtr\i = 0                        ; detach Cursor from String again

EndProcedure

Procedure.s JoinL(List StringList.s(), Separator.s = " ")
  ; Concatenates all elements of StringList(), putting Separator between
  ; neighbouring elements, and returns the result.
  ; An empty list gives an empty string.
  ; The current element of the list is changed by the iteration.

  Protected.i slen, tlen, *buffer

  ; Without this guard an empty list made tlen negative (0 - slen) and the
  ; buffer below was dimensioned with a negative size.
  If ListSize(StringList()) = 0
    ProcedureReturn ""
  EndIf

  slen = Len(Separator)
  ForEach StringList()
    tlen + Len(StringList()) + slen
  Next
  tlen - slen                             ; no separator behind the last element

  Protected Dim buffer.c(tlen)
  *buffer = @buffer()
  If FirstElement(StringList())
    ; the first call hands over the write position, later calls append to it
    CopyMemoryString(StringList(), @*buffer)
    While NextElement(StringList())
      CopyMemoryString(Separator)
      CopyMemoryString(StringList())
    Wend
  EndIf

  ProcedureReturn PeekS(@buffer())

EndProcedure



; test
; Split a sample sentence into a list, print the words, join them with "*",
; then delete one word and join again.

S.s = "This is a test string to see if split and join are working."

NewList MyStrings.s()
SplitL(S, MyStrings())

ResetList(MyStrings())
While NextElement(MyStrings())
  Debug MyStrings()
Wend

JS.s = JoinL(MyStrings(), "*")
Debug JS

; remove fifth element (first element is position 0)
SelectElement(MyStrings(), 4)
DeleteElement(MyStrings())

JS = JoinL(MyStrings(), "*")
Debug JS

_________________
MacOS 10.13 High Sierra, PB 5.60 x64


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sun Apr 10, 2016 8:41 am 
Offline
Enthusiast
Enthusiast

Joined: Fri Oct 16, 2009 10:12 am
Posts: 534
Location: BE
Bo Marchais wrote:
Is it possible to assign the array as the output using an array copy or something, so the array is returned
from the ProcedureReturn ?
If I do understand you correctly...
This can be done with ExtractRegularExpression, but I don't know if the speed is optimal then...
Code:
; The expression matches every three-character word made of a lower case
; letter, the character 'b' and an upper case letter, for example: abC
; All matches are extracted into Result$() and printed.
If CreateRegularExpression(0, "[a-z]b[A-Z]") = 0
  Debug RegularExpressionError()
Else
  Dim Result$(0)
  NbFound = ExtractRegularExpression(0, "abC ABc zbA abc", Result$())
  k = 0
  While k < NbFound
    Debug Result$(k)
    k + 1
  Wend
EndIf
This example comes from the PB-help, so you'll have to change some things.

__________________________________________________
Quote tags>Code tags
10.04.2016
RSBasic

_________________
Yeah I know, but keep in mind ... Leonardo da Vinci was also an autodidact.


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sat Apr 16, 2016 10:06 pm 
Offline
Enthusiast
Enthusiast
User avatar

Joined: Mon Jul 08, 2013 8:43 pm
Posts: 228
Very good! javascripters and 80´s assemblers are lucky! :mrgreen:
Thanks callroot, Infratec and wilbert for sharing code and examples! :D
Really good!

_________________
If translation=Error: reply="Sorry, Im Spanish": Endif


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sun Apr 17, 2016 6:47 am 
Offline
Administrator
Administrator

Joined: Fri May 17, 2002 4:39 pm
Posts: 13304
Location: France
On a side note, you should not write code like this:

Code:
*S\i + (p + slen - 1) << #PB_Compiler_Unicode


A constant value can change in a future version of PB, so even if it works now (because it's 1), it's not guaranteed to work in the future and could cause hard-to-find bugs.


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sun Apr 17, 2016 7:57 am 
Offline
PureBasic Expert
PureBasic Expert

Joined: Sun Aug 08, 2004 5:21 am
Posts: 3142
Location: Netherlands
Fred wrote:
A constant value can change in a future version of PB, so even if it works now (because it's 1), it's not guaranteed to work in the future and could cause hard-to-find bugs.

I assumed in this case it would be okay because it is mentioned in the help file the constant is either 0 or 1.
Well, in the future we don't have to check for ascii or unicode since there will be only unicode. :)

_________________
MacOS 10.13 High Sierra, PB 5.60 x64


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sun Apr 17, 2016 9:55 am 
Offline
Administrator
Administrator

Joined: Fri May 17, 2002 4:39 pm
Posts: 13304
Location: France
Actually, you are right, for this case it is OK, my bad. Should be the lack of sleep :)


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Mon Dec 11, 2017 11:38 am 
Offline
User
User

Joined: Sun Feb 18, 2007 11:57 am
Posts: 45
Hi everyone.

My 2 cents: I need a fast string split procedure as I regularly process huge strings professionally.

I tried the above solutions and the others on different posts, all are ok but for my specific need (performance), I wrote a little tricked procedure (in a module) with mixed ASM to achieve my goal.

Though it is reasonably fast, obviously it has drawbacks to achieve the performance.

[*] Pros
    - Reasonably fast.
    - No string memory allocation (apart the separator string).
[*]Cons.
    - Original string touched as I work directly in it.
    - x86 only but the adaptation to x64 seems trivial.
    - Unicode only but the adaptation to ASCII seems trivial (maybe using Character structure ?).
    - Assumptions on pointer size hard coded (some constants should be welcomed here).
    - I haven't any clue on how PB properly frees the original string (touched) and the array (touched also).


Here is the code, happy tests and waiting for your feedback on the technique used :D
Code:
EnableExplicit

DeclareModule FastSplit

    Declare.i Fast_Split_To_Array(*pio_to_split, pi_sep.s, Array result.s(1))

EndDeclareModule

Module FastSplit

    ; Splits the string at *pio_to_split in place and lets the elements of
    ; result() point into that same string.
    ;
    ; *pio_to_split :   Unicode string pointer assumed => 2 bytes per character.
    ;                   The string is modified: separators are overwritten.
    ; pi_sep        :   Unicode separator assumed, can have several chars (typically #CRLF$)
    ; result        :   will be updated
    ; RETURN        :   result array elements count (0 when there is nothing to split)
    ;
    ; x86 only: the inline assembly uses 32-bit registers and 4-byte pointers.
    ;
    ; NOTE(review): the string pointers inside result() are replaced by
    ; addresses inside the caller's string. The original author states he does
    ; not know how PureBasic frees such an array - confirm before the array is
    ; resized, reassigned or freed.
    Procedure.i Fast_Split_To_Array(*pio_to_split, pi_sep.s, Array result.s(1))

        ; Non printable Unicode char
        #REPLACE_CHAR = $FFFF

        ; Macro used here to avoid a costly asm CALL
        Macro Macro_Asm_Set_Array_String_Pointer
            ! mov eax, 4                            ; 4 bytes size assumed for a 32 bits (x86) pointer
            ! mov [p.p_pedx_backup], edx            ; "edx" register backup (can't use "push/pop edx" because PB local vars are "esp" indexed)
            ! mul dword [p.v_current_element]       ; Multiply by current element index and put the result in "eax"/ Beware that "mul" use "edx" register, so the previous backup
            ! mov edx, [p.p_pedx_backup]            ; Get "edx" register value back
            ! add eax, [p.p_parray_string_pointers] ; Add result to get the start address in the original string
            ! mov [eax], edx                        ; Put computed address in the current array index
        EndMacro

        ; Exit if nothing to split.
        ; FIX: a null pointer used to be handed to PeekS(), and the emptiness
        ; test copied the complete string just to measure its length. Looking
        ; at the first character is sufficient.
        If *pio_to_split = 0
            ProcedureReturn 0
        EndIf
        If PeekC(*pio_to_split) = 0
            ProcedureReturn 0
        EndIf

        ; One element array if no separator
        If Len(pi_sep) = 0
            ReDim result(0)
            result(0) = PeekS(*pio_to_split)
            ProcedureReturn 1
        EndIf

        ; Local definition of separator (why ? just to show that "pi_sep" should be readonly, no performance hit here)
        Define.s local_sep = pi_sep

        ; More than one character in separator => one time performance hit to replace all separator chars by one (#REPLACE_CHAR)
        ; but "ReplaceString" is very well optimized (thanks Fred :) so performance hit is limited
        If Len(local_sep) > 1
            PokeS(*pio_to_split, ReplaceString(PeekS(*pio_to_split), local_sep, Chr(#REPLACE_CHAR)))
            local_sep = Chr(#REPLACE_CHAR)
        EndIf

        ; Get the size of zero based result array ("CountString" is very optimized too :)
        Define.i sep_count = CountString(PeekS(*pio_to_split), local_sep)

        ; Prepare the result array
        ReDim result(sep_count)

        ; No separators found ? Return the original string in an one-element result array
        If sep_count = 0
            result(0) = PeekS(*pio_to_split)
            ProcedureReturn 1
        EndIf

        ; Definitions
        Define.i sep = Asc(local_sep) ; Two chars in Unicode, which is assumed here
        Define.i current_element = 0  ; Array current element processing

        ; Trick #1 : get the array REAL address

        ; In fact, PB here returns the array first element address (but in the MemoryViewer, "@result()" show the real array address)...
        Define *parray_addr = @result()
        Define *parray_string_pointers = PeekI(@*parray_addr) ; ...have to go up to find the real array address, which contains the vTable of strings in array
        Define *pedx_backup                                   ; Our "edx" register backup (see explanations above in the macro)

        ! mov ecx, [p.p_pio_to_split]                   ; Load string start address here...
        ! mov edx, [p.p_pio_to_split]                   ; ...and here

        Scan:                                           ; Scan loop
        ! mov ax, [ecx]                                 ; Get current char (Unicode => 2 bytes => ax is ok)
        ! cmp ax, 0                                     ; String end ?
        ! jz fastsplit.ll_fast_split_to_array_endscan   ; Exit to EndScan:

        ! cmp ax, [p.v_sep]                             ; Separator found ?
        ! jz fastsplit.ll_fast_split_to_array_sepfound  ; Process to SepFound:

        NextChar:                                       ; Point to next char
        ! add ecx, 2                                    ; Unicode => 2 bytes increment
        ! jmp fastsplit.ll_fast_split_to_array_scan     ; Loop to Scan:

        ; Trick #2 : replace array original pointer by pointer in original string => no string memory allocation

        SepFound:                                       ; Separator found
        ! mov word [ecx], 0                             ; Replace separator by 2 zero-bytes (=> string end)
        Macro_Asm_Set_Array_String_Pointer              ; Our killer macro :)
        ! mov edx, ecx                                  ; Point to...
        ! add edx, 2                                    ; ...next char
        ! inc dword [p.v_current_element]               ; Increment array current element
        ! jmp fastsplit.ll_fast_split_to_array_nextchar ; Read next char to NextChar:

        EndScan:
        Macro_Asm_Set_Array_String_Pointer              ; Our killer Macro again :) for the last value

        ProcedureReturn sep_count + 1                   ; Array elements count (PB ArraySize() return the array upper bound, not necessarily the elements count)

    EndProcedure

EndModule

; Demo, compiled only when this file is the main source file.
; CompilerIf replaces the runtime If, so the Define statements below are not
; compiled into a program that merely includes this file.
CompilerIf #PB_Compiler_IsMainFile

    ; For querying performance
    Define.q qpc_s, qpc_e
    Define *pqpc_s = @qpc_s
    Define *pqpc_e = @qpc_e

    Define.i elements_count, i
    Define s$ = "a;string;to;slice" ; Try a 50 Mb string, from a big file for example :)
    Define sep$ = ";"               ; Could be a multiple chars separator (eg. #CRLF$) but performance hit in this case
    Dim a.s(0)

    Debug "Fast_Split_To_Array"

    QueryPerformanceCounter_(*pqpc_s)
    ; sep$ is now really passed on; the call used a literal ";" before, so
    ; changing sep$ had no effect.
    elements_count = FastSplit::Fast_Split_To_Array(@s$, sep$, a())
    QueryPerformanceCounter_(*pqpc_e)
    Debug Str(qpc_e - qpc_s)

    ; Be careful here with big sized string :)
    For i = 0 To elements_count - 1 ; Or ArraySize (a())
        Debug a(i)
    Next

CompilerEndIf


Top
 Profile  
Reply with quote  
 Post subject: Re: High speed split string
PostPosted: Sat Dec 23, 2017 12:40 am 
Offline
PureBasic Team
PureBasic Team
User avatar

Joined: Fri Apr 25, 2003 6:14 pm
Posts: 1559
Location: Germany (Saxony, Deutscheinsiedel)
I didn't do any speed tests, but noted wilbert's Split & Join codes for later use. Thank you :D

_________________
Bye,
...André
(PureBasicTeam::Docs & Support - PureArea.net | Order:: PureBasic | PureVisionXP)


Top
 Profile  
Reply with quote  
Display posts from previous:  Sort by  
Post new topic Reply to topic  [ 16 posts ]  Go to page 1, 2  Next

All times are UTC + 1 hour


Who is online

Users browsing this forum: No registered users and 5 guests


You cannot post new topics in this forum
You cannot reply to topics in this forum
You cannot edit your posts in this forum
You cannot delete your posts in this forum

Search for:
Jump to:  

 


Powered by phpBB © 2008 phpBB Group
subSilver+ theme by Canver Software, sponsor Sanal Modifiye