Re: Why FindString() is so terribly slow?
Posted: Mon Nov 26, 2018 2:15 pm
Or maybe s.find('∑') is even evaluated at compile-time, maybe even the whole loop.
http://www.purebasic.com
https://www.purebasic.fr/english/
I don't think that is the reason. 'find' is not an built-in function the compiler knows about and can evaluate it at compile-time. But that is only a vague assumption.#NULL wrote:Or maybe s.find('∑') is even evaluated at compile-time, maybe even the whole loop.
Code: Select all
Procedure find(s.s, c.s)
EndProcedure
Procedure find(s.s, c.s)
EndProcedure
s.s = Space(1000000) + "!"
t = ElapsedMilliseconds()
For i = 1 To 1000
x = find(s, "!")
Next
Debug ElapsedMilliseconds()-t ; 3278
t = ElapsedMilliseconds()
For i = 1 To 1000
x = FindString(s, "!")
Next
Debug ElapsedMilliseconds()-t ; 2469
Yeah, that sounds very convincing. If the strings is always or at least sometimes in an other place in memory it has to be cached again everytime. And because our example uses such long strings it will not fit completely in the cache anyhow unless you've got a server CPU or something like this.#NULL wrote:As has been said, that's the copying. You don't even need a function. just assign (copy) the string to another string variable in the loop. That copying could also affect caching because every function call will work an a new/different string memory instead of working always on the same when passed by referrence.
Even if it appears that the string is passed ByVal, this is not necessarily the case. For example, the ReplaceString() function obviously passes all strings ByVal, but this can't be with the #PB_String_InPlace mode.#NULL wrote:As has been said, that's the copying.
Code: Select all
Import ""
wcsstr(*s, *c)
EndImport
Procedure FindW(*s, *c)
Protected *pcs = wcsstr(*s, *c)
If *pcs
ProcedureReturn (*pcs - *s) >> 1 + 1
EndIf
EndProcedure
s.s = Space(1000000) + "!"
c.s = "!"
t = ElapsedMilliseconds()
For i = 1 To 1000
x1 = FindW(@s, @c)
;x1 = FindW(@s, @"!") ; Bug OSX ?
Next
t1 = ElapsedMilliseconds()-t ; 3278
t = ElapsedMilliseconds()
For i = 1 To 1000
x2 = FindString(s, c)
Next
t2 = ElapsedMilliseconds()-t ; 2469
MessageRequester("Result:", "X1 = " + x1 + " / T1 = " + t1 + #LF$ + "X2 = " + x2 + " / T2 = " + t2)
Code: Select all
ImportC ""
CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
_wcslen_(*String) As "wcslen"
_wcsstr_(*String1, *String2) As "wcsstr"
_wcschr_(*String1, *String2) As "wcschr" ;FAILS
_StrStrIW_(*String1, *String2) As "StrStrIW"
CompilerElse
_wcslen_(*String) As "_wcslen"
_wcsstr_(*String1, *String2) As "_wcsstr"
_wcschr_(*String1, *String2) As "_wcschr" ;FAILS
_StrStrIA_(*String1, *String2) As "_StrStrIA"
CompilerEndIf
EndImport
Procedure.i FindStringC(*s, *c)
; Find Unicode substring$ within search$, case sensitive.
Protected.i *pcs = _wcsstr_(*s, *c)
If *pcs
;ProcedureReturn (*pcs - *s) ;<-- 0-based Byte position
ProcedureReturn (*pcs - *s) >> 1 + 1 ;<-- 1-based Char position
EndIf
EndProcedure
Procedure.i FindStringCnc(*s, *c)
; Find Unicode substring$ within search$, case insensitive.
Protected.i *pcs = _StrStrIW_(*s, *c)
If *pcs
;ProcedureReturn (*pcs - *s) ;<-- 0-based Byte position
ProcedureReturn (*pcs - *s) >> 1 + 1 ;<-- 1-based Char position
EndIf
EndProcedure
Procedure.i FindStringChr(*s, *c)
Protected.i *pcs = _wcschr_(*s, *c)
If *pcs
ProcedureReturn (*pcs - *s) >> 1 + 1 ;<-- 1-based Char position
EndIf
EndProcedure
EnableASM
CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
Macro rax : eax : EndMacro
Macro rbx : ebx : EndMacro
Macro rcx : ecx : EndMacro
Macro rdx : edx : EndMacro
Macro rsi : esi : EndMacro
Macro rdi : edi : EndMacro
Macro rbp : ebp : EndMacro
Macro rsp : esp : EndMacro
CompilerEndIf
Macro M_movdqa(arg1, arg2)
!movdqa arg1, arg2
EndMacro
Procedure FindStringSSE2(*HayStack, HayStackSize.i, *Needle, NeedleSize.i, Pos.i=0, Count.i=0)
; backup some registers
mov [rsp - 8], rbx
mov [rsp - 16], rsi
mov [rsp - 24], rdi
; init count
mov rax, [p.v_Count]
sub rax, 1
sbb rax, rax
mov [rsp - 40], rax
; perform some checks
mov rcx, [p.v_Pos]
sub [p.v_HayStackSize], rcx
!jbe .l_sse2_search8 ; exit when HayStackSize <= Pos
add [p.p_HayStack], rcx
mov rax, [p.v_HayStackSize]
mov rbx, [p.v_NeedleSize]
sub rax, rbx
!jc .l_sse2_search8 ; exit if NeedleSize > HaystackSize
add rax, [p.p_HayStack]
mov [rsp - 32], rax ; rsp - 32 = *SearchEnd
cmp rbx, 1
!jl .l_sse2_search8 ; exit if NeedleSize < 1
; load first two needle bytes
!pcmpeqb xmm4, xmm4
mov rdi, [p.p_Needle]
movzx eax, byte [rdi]
!je .l_sse2_search0
mov ah, [rdi + 1]
!pslldq xmm4, 15
!.l_sse2_search0:
!movd xmm2, eax
!punpcklbw xmm2, xmm2
!punpcklwd xmm2, xmm2
!pshufd xmm3, xmm2, 01010101b ; xmm3 = 16 times second needle byte
!pshufd xmm2, xmm2, 0 ; xmm2 = 16 times first needle byte
; start search
mov rsi, [p.p_HayStack]
mov rcx, rsi
shr rsi, 4 ; align Haystack to 16 bytes
shl rsi, 4
sub rcx, rsi
M_movdqa(xmm0, [rsi]) ; handle first 16 bytes
!movdqa xmm1, xmm0
!pcmpeqb xmm0, xmm2 ; compare against first needle byte
!pmovmskb eax, xmm0
!shr eax, cl ; shift off unwanted bytes
!shl eax, cl
!test eax, eax
!jnz .l_sse2_search2
; main search loop
!.l_sse2_search1:
add rsi, 16 ; next 16 bytes
cmp rsi, [rsp - 32]
!ja .l_sse2_search8
M_movdqa(xmm0, [rsi])
!movdqa xmm1, xmm0
!pcmpeqb xmm0, xmm2 ; compare against first needle byte
!pmovmskb eax, xmm0
!test eax, eax
!jz .l_sse2_search1 ; no match ? => search1
!.l_sse2_search2:
!pcmpeqb xmm1, xmm3 ; compare against second needle byte
!psrldq xmm1, 1
!por xmm1, xmm4
!pmovmskb ecx, xmm1
!and eax, ecx ; combine both searches
!jz .l_sse2_search1 ; no match ? => search1
; compare rest of bytes
!.l_sse2_search3:
!bsf ecx, eax ; get index of first match
!jz .l_sse2_search1
!btr eax, ecx
lea rdx, [rsi + rcx] ; create a pointer to it
cmp rdx, [rsp - 32]
mov rcx, [p.v_NeedleSize]
!ja .l_sse2_search8
sub rcx, 2
!jb .l_sse2_search5 ; NeedleSize < 2 ? => search5 (already done)
!.l_sse2_search4:
movzx ebx, word [rdx + rcx] ; compare rest of needle right-to-left
cmp bx, [rdi + rcx] ; two bytes at a time
!jne .l_sse2_search3
sub rcx, 2
!jae .l_sse2_search4
!.l_sse2_search5:
mov rbx, [rsp - 40]
cmp rbx, -1
!je .l_sse2_search6
add rbx, 1 ; increase count
mov [rsp - 40], rbx
!jmp .l_sse2_search3
!.l_sse2_search6:
mov rax, rdx ; return result
sub rax, [p.p_HayStack]
add rax, [p.v_Pos]
!.l_sse2_search7:
mov rbx, [rsp - 8]
mov rsi, [rsp - 16]
mov rdi, [rsp - 24]
ProcedureReturn
; not found / return count
!.l_sse2_search8:
mov rax, [rsp - 40]
!jmp .l_sse2_search7
EndProcedure
Procedure.i FindStringASMU(*sIn, *sToFind)
!mov rcx,[p.p_sIn]
!mov rdx,[p.p_sToFind]
!xor rax,rax
!mov r10,rcx
!cmp word[rdx],0h
!je LocStr_Return
!LocStr_Loop:
!cmp word[rcx],0h
!je LocStr_Return
!mov bx,word[rcx]
!cmp bx,word[rdx]
!jne LocStr_Next
!lea r8,[rcx+2h]
!lea r9,[rdx+2h]
!LocStr_Match:
!cmp word[r9],0h
!je LocStr_Result
!cmp word[r8],0h
!je LocStr_Next
!mov bx,word[r8]
!cmp bx,word[r9]
!jne LocStr_Next
!lea r8,[r8+2h]
!lea r9,[r9+2h]
!jmp LocStr_Match
!LocStr_Result:
!sub rcx,r10
!lea rax,[rcx+2h]
!shr rax,1h
!jmp LocStr_Return
!LocStr_Next:
!lea rcx,[rcx+2h]
!jmp LocStr_Loop
!LocStr_Return:
ProcedureReturn
EndProcedure
Procedure.i SF_ToMem(Unicode$, Enc.i=#PB_Ascii)
; Useful as 1 function with Enc parameter vs multiple functions; Ascii(), Unicode().
Protected *b = AllocateMemory(StringByteLength(Unicode$) + SizeOf(character))
PokeS(*b, Unicode$, -1, Enc) ;|#PB_String_NoZero) ; Not dropping trailing Zero.
ProcedureReturn *b
EndProcedure
Procedure.i FindStringMem(*String, LenStr.i, Srch$, UseCase.i=#PB_String_CaseSensitive, Enc.i=#PB_Unicode)
; REV: 140905, skywalk
; Return byte location(0-based) of Srch$ in *String.
; Useful to search an Ascii string in Unicode app(default as of v5.4).
Protected.i lenSrch = Len(Srch$)
Protected.i lenChar
Protected.i *Srch = SF_ToMem(srch$, Enc)
If Enc = #PB_Unicode ; 2 bytes/char
lenChar = 2
Else ; #PB_Ascii = 1 bytes/char, #PB_UTF8 = variable bytes/char
lenChar = 1
EndIf
If *Srch
Protected.i *pos = *String
lenStr * lenchar + *String
While *pos <= lenStr
If CompareMemoryString(*pos, *Srch, UseCase, lenSrch, Enc) = #PB_String_Equal
ProcedureReturn *pos - *String
EndIf
*pos + lenChar
Wend
FreeMemory(*Srch)
Else
ProcedureReturn 0
EndIf
EndProcedure
If 1 ;-! TRY FindStringMem()
Define.s haystack$, needle$
Define.i lenhaystack, posmem, *haystack
;haystack$ = "ABCDEF"+"|"+"GHIJKLMNO" + "|" + "123" ; '|' is placeholder for #Null.
haystack$ = Space(1000000) + "AbC!#"
lenhaystack = Len(haystack$) ; Remember len of this string. Adding #Null's will "shorten" it by 1st Null appearance.
;*haystack = SF_ToMem(haystack$, #PB_Unicode) ; Convert string to mem. Note this = Unicode.
needle$ = "Abc!#" ; Do not mix Unicode and Ascii strings! Here we search for a Unicode string.
;needle$ = "x" ; Do not mix Unicode and Ascii strings! Here we search for a Unicode string.
Debug ";-- Unicode needle$ --> " + needle$
Debug ";-- FindStringC() --"
posmem = FindStringC(@haystack$, @needle$)
Debug Str(posmem) + " --> " + Mid(haystack$, posmem, Len(needle$))
;Debug Str(posmem) + " --> " + PeekS(@haystack$ + posmem, Len(needle$))
Debug ";-- FindStringCnc() --"
posmem = FindStringCnc(@haystack$, @needle$)
Debug Str(posmem) + " --> " + Mid(haystack$, posmem, Len(needle$))
Debug ";-- StrStrIW() --"
posmem = _StrStrIW_(@haystack$, @needle$)
;Debug Str(posmem) + " --> " + Mid(haystack$, posmem, Len(needle$))
Debug Str(posmem) + " --> " + PeekS(posmem, Len(needle$))
Debug ";-- FinsStringASMU() --"
posmem = FindStringASMU(@haystack$, @needle$)
Debug Str(posmem) + " --> " + Mid(haystack$, posmem, Len(needle$))
;Debug Str(posmem) + " --> " + PeekS(@haystack$ + posmem, Len(needle$))
;BUG;posmem = FindStringChr(@haystack$, @needle$)
;BUG;Debug Str(posmem) + " --> " + PeekS(@haystack$ + posmem, Len(needle$))
Debug ";-- FindStringMem() --"
posmem = FindStringMem(@haystack$, lenhaystack, needle$, #PB_String_NoCase, #PB_Unicode)
Debug Str(posmem) + " --> " + PeekS(@haystack$ + posmem, Len(needle$))
EndIf
;-{ TEST SPEED
; There is a small bias where 1st procedure is always slower.
; Options:
; Ignore/Sacrifice results of Code1$ but repeat at end.
; Or comment out each procedure to run only 1 at a time.
CompilerIf #PB_Compiler_Debugger = 0
Macro ML_pcChange(y1, y2)
; Compute % change going from y1 to y2.
100.0 * Sign(y2-(y1)) * Abs((y2 - (y1)) / (y1 + 1e-16))
EndMacro
SetPriorityClass_(GetCurrentProcess_(), #REALTIME_PRIORITY_CLASS)
#Tries = 2e2 ;-! SET #TRIES
Define.i u,time,t1,t2,t3,t4,t5,t6,t7,t8
Define.i tw = 55
Define.s r$
Define.s code1$ = "findstring_+case"
Define.s code2$ = "findstringmem_+case"
Define.s code3$ = "findstring_-case"
Define.s code4$ = "findstringmem_-case"
Define.s code5$ = "findstringC_+case"
Define.s code6$ = "findstringASMU_+case"
Define.s code7$ = "findstringSSE2_+case"
Define.s code8$ = "findstringC_-case"
Define.i COMMMONVARIABLES_HERE
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> INSERT CODE 1 HERE...
posmem = FindString(haystack$, needle$, 1, #PB_String_CaseSensitive)
Next u
t1 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> INSERT CODE 2 HERE...
posmem = FindStringMem(@haystack$, lenhaystack, needle$, #PB_String_CaseSensitive, #PB_Unicode)
Next u
t2 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 3 HERE...
posmem = FindString(haystack$, needle$, 1, #PB_String_NoCase)
Next u
t3 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 4 HERE...
posmem = FindStringMem(@haystack$, lenhaystack, needle$, #PB_String_NoCase, #PB_Unicode)
Next u
t4 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 5 HERE...
posmem = FindStringC(@haystack$, @needle$)
Next u
t5 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 6 HERE...
posmem = FindStringASMU(@haystack$, @needle$)
Next u
t6 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 7 HERE...
HayStackSize = Len(HayStack$)
NeedleSize = Len(Needle$)
Posmem = FindStringSSE2(@haystack$, HayStackSize, @Needle$, NeedleSize)
Next u
t7 = ElapsedMilliseconds()-time
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 8 HERE...
posmem = FindStringCnc(@haystack$, @needle$)
Next u
t8 = ElapsedMilliseconds()-time
r$ = LSet("; Count(n),",tw) + Str(#Tries) + #CRLF$
r$ + LSet("; "+code1$+"(ms),",tw) + Str(t1) + #CRLF$
r$ + LSet("; "+code2$+"(ms),",tw) + Str(t2) + #CRLF$
r$ + LSet("; "+code3$+"(ms),",tw) + Str(t3) + #CRLF$
r$ + LSet("; "+code4$+"(ms),",tw) + Str(t4) + #CRLF$
r$ + LSet("; "+code5$+"(ms),",tw) + Str(t5) + #CRLF$
r$ + LSet("; "+code6$+"(ms),",tw) + Str(t6) + #CRLF$
r$ + LSet("; "+code7$+"(ms),",tw) + Str(t7) + #CRLF$
r$ + LSet("; "+code8$+"(ms),",tw) + Str(t8) + #CRLF$
r$ + LSet("; "+code1$+" : "+code2$+"(%),",tw) + StrD(ML_pcChange(t1,t2),0) + #CRLF$
r$ + LSet("; "+code3$+" : "+code4$+"(%),",tw) + StrD(ML_pcChange(t3,t4),0) + #CRLF$
r$ + LSet("; "+code1$+" : "+code3$+"(%),",tw) + StrD(ML_pcChange(t1,t3),0) + #CRLF$
r$ + LSet("; "+code2$+" : "+code4$+"(%),",tw) + StrD(ML_pcChange(t2,t4),0) + #CRLF$
r$ + LSet("; "+code1$+" : "+code5$+"(%),",tw) + StrD(ML_pcChange(t1,t5),0) + #CRLF$
r$ + LSet("; "+code1$+" : "+code6$+"(%),",tw) + StrD(ML_pcChange(t1,t6),0) + #CRLF$
r$ + LSet("; "+code1$+" : "+code7$+"(%),",tw) + StrD(ML_pcChange(t1,t7),0) + #CRLF$
r$ + LSet("; "+code1$+" : "+code8$+"(%),",tw) + StrD(ML_pcChange(t1,t8),0) + #CRLF$
If MessageRequester("Speed Test - Copy To Clipboard?",r$,#PB_MessageRequester_YesNo) = #PB_MessageRequester_Yes
SetClipboardText(r$)
EndIf
SetPriorityClass_(GetCurrentProcess_(), #NORMAL_PRIORITY_CLASS)
CompilerEndIf
;-} TEST SPEED
; Count(n), 200
; findstring_+case(ms), 227
; findstringmem_+case(ms), 1660
; findstring_-case(ms), 3886
; findstringmem_-case(ms), 2073
; findstringC_+case(ms), 233
; findstringASMU_+case(ms), 229
; findstringSSE2_+case(ms), 125
; findstringC_-case(ms), 12539
; findstring_+case : findstringmem_+case(%), 631
; findstring_-case : findstringmem_-case(%), -47
; findstring_+case : findstring_-case(%), 1612
; findstringmem_+case : findstringmem_-case(%), 25
; findstring_+case : findstringC_+case(%), 3
; findstring_+case : findstringASMU_+case(%), 1
; findstring_+case : findstringSSE2_+case(%), -45
; findstring_+case : findstringC_-case(%), 5424
Code: Select all
ProcedureReturn (*pcs - *s) >> 1 + 1
Linux: 267/1827 (without debugger)mk-soft wrote: OSX: 327/2337
Win: 1662/2078
Linux: Bug ?
Linux: Position ist Zero (not find)uwekel wrote:Linux: 267/1827 (without debugger)mk-soft wrote: OSX: 327/2337
Win: 1662/2078
Linux: Bug ?
Code: Select all
Structure SkipTable
a.a[256]
EndStructure
Procedure IdleSundayFind(*haystack,len,*needle,nlen)
;A modified Sunday search for larger patterns
;
Protected SkipTable.SkipTable
!xor rcx,rcx
!for1:
!mov rax,[p.v_nlen]
!dec rax
!cmp rax, rcx
!jl next1
!mov rax, [p.p_needle]
!movzx rdx, byte [rax+rcx]
!lea rax,[p.v_SkipTable]
!mov [rax+rdx],byte cl
!inc rcx
!jmp for1
!next1:
!movzx ecx,byte [p.v_nlen]
!and ecx,7
!mov r8,rcx
!mov r9, -1
!neg ecx
!and ecx, 7
!shl ecx, 3
!shr r9, cl
!cmp r8,0
!jnz notzero
!xor r9,r9
!notzero:
!mov rdx,[p.v_nlen]
!cmp rdx,8
!jle else_modn
!mov rdx,[p.v_nlen]
!sub rdx,r8;
!mov r15,rdx
!mov rax,[p.p_needle]
!mov rax,[rax+r15]
!mov rdx,r9
!and rax, rdx
!mov r10,rax
!jmp end_modn
!else_modn:
!mov rax,[p.p_needle]
!mov rax,[rax]
!mov rdx,r9
!and rax, rdx
!mov r10,rax
!xor r15,r15
!end_modn:
!mov rax,[p.v_nlen]
!sub rax,1
!sub [p.v_len],rax
!xor rcx,rcx
!whilelen1:
!cmp rcx, [p.v_len]
!jge wend1
!xor r13,r13
!xor r14,r14
!while_n3:
!cmp r13,r15
!jge wend_n3
!mov rax,[p.p_haystack]
!mov rax,[rax+rcx]
!mov rdx,[p.p_needle]
!mov rdx,[rdx+r13]
!xor rax,rdx
!add r14,rax
!cmp r14,0
!jz end_sum
!mov rax,r15
!sub rax,r13
!add rcx,rax
!jmp skip
!end_sum:
!add rcx,8
!add r13,8
!jmp while_n3
!wend_n3:
!mov rax,[p.p_haystack]
!mov rax,[rax+rcx]
!and rax, r9
!xor rax, r10
!add r14,rax
!cmp rax,0
!jnz skip
!sub rcx,r15
!mov rax,rcx
ProcedureReturn
!skip:
!mov rax,[p.v_nlen]
!sub rax,r15
!add rcx, rax
!mov rax, [p.p_haystack]
!movzx rdx,byte [rax+rcx]
!lea rax,[p.v_SkipTable]
!movzx rax,byte [rax+rdx]
!sub rcx,rax
!jmp whilelen1
!wend1:
!mov rax,-1;
!found:
ProcedureReturn
EndProcedure
Import ""
wcsstr(*s, *c)
EndImport
Procedure FindW(*s, *c)
Protected *pcs = wcsstr(*s, *c)
If *pcs
ProcedureReturn (*pcs - *s) >> 1 + 1
EndIf
EndProcedure
Procedure.i SF_ToMem(Unicode$, Enc.i=#PB_Ascii)
; Useful as 1 function with Enc parameter vs multiple functions; Ascii(), Unicode().
Protected *b = AllocateMemory(StringByteLength(Unicode$) + SizeOf(character))
PokeS(*b, Unicode$, -1, Enc) ;|#PB_String_NoZero) ; Not dropping trailing Zero.
ProcedureReturn *b
EndProcedure
Procedure.i FindStringMem(*String, LenStr.i, Srch$, UseCase.i=#PB_String_CaseSensitive, Enc.i=#PB_Unicode)
; REV: 140905, skywalk
; Return byte location(0-based) of Srch$ in *String.
; Useful to search an Ascii string in Unicode app(default as of v5.4).
Protected.i lenSrch = Len(Srch$)
Protected.i lenChar
Protected.i *Srch = SF_ToMem(srch$, Enc)
If Enc = #PB_Unicode ; 2 bytes/char
lenChar = 2
Else ; #PB_Ascii = 1 bytes/char, #PB_UTF8 = variable bytes/char
lenChar = 1
EndIf
If *Srch
Protected.i *pos = *String
lenStr * lenchar + *String
While *pos <= lenStr
If CompareMemoryString(*pos, *Srch, UseCase, lenSrch, Enc) = #PB_String_Equal
ProcedureReturn *pos - *String
EndIf
*pos + lenChar
Wend
FreeMemory(*Srch)
Else
ProcedureReturn 0
EndIf
EndProcedure
Procedure SwarmiSearchSSE2(*haystack,len,*needle,lenN,pos=0)
;Swarmi search SSE2 bit parallel search algorithm for for 1 to 16 byte needles
;performs ~ 1:1 With booyer moore And other variants For a 16 byte needle With a 0-255 alphabet range
;efficiency increases with parallelism. 2:1 for 8 byte needle, 4:1 for 4 byte needle, 8:1 for 2 byte needle and 16:1 1 byte needle
Protected offset
!align 32
DataSection
!m16:
!dq 0xffffffffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffff
!dq 0xffffffffffffffff
!dq 0xffffff
!dq 0xffffffffffffffff
!dq 0xffff
!dq 0xffffffffffffffff
!dq 0xff
!dq 0xffffffffffffffff
!dq 0x0
!dq 0xffffffffffffff
!dq 0x0
!dq 0xffffffffffff
!dq 0x0
!dq 0xffffffffff
!dq 0x0
!dq 0xffffffff
!dq 0x0
!dq 0xffffff
!dq 0x0
!dq 0xfff
!dq 0x0
!dq 0xff
!dq 0x0
EndDataSection
!mov rcx,16
!sub rcx,[p.v_lenN]
!imul rcx,16
!lea r8, [m16]
!movdqu xmm8, [r8+rcx] ;load trailing byte mask
!mov r9, [p.p_needle] ;build First char mask eg "a0a0a0a0a0a0a0"
!movzx eax,byte [r9]
!movd xmm7, eax
!punpcklbw xmm7, xmm7
!punpcklwd xmm7, xmm7
!pshufd xmm7, xmm7, 0 ;FirstCharMask
!mov r8,[p.p_haystack] ;align haysack
!mov rax,r8
!and rax ,15
!mov [p.v_offset],rax ;store offset
!sub r8,rax
!movdqu xmm5,[r9] ;store needle
!mov r14,[p.v_len]
!mov r13,[p.v_pos]
!sub r13,16
!Swarmi_While_Len:
!cmp r13, r14
!jge Swarmi_Wend
!add r13,16
!movdqa xmm0,[r8+r13] ;Get Haystack and check against mask chars"
!pcmpeqb xmm0,xmm7 ;FirstCharMask
!pmovmskb eax,xmm0
!cmp eax,0
!je Swarmi_While_Len
!mov r11,r13 ;current position
!Swarmi_Redo:
!bsf rdx,rax ;if there was a match rax holds 1 at the byte positions where a match is found in mask
!btr rax,rdx
!add r11,rdx ;align the haystack step forward x bytes
!movdqu xmm0,[r8+r11] ;haystack
!pcmpeqb xmm0,xmm5 ;check equal haystack vs needle
!pandn xmm0,xmm8
!pmovmskb edx,xmm0
!cmp edx,0
!je Swarmi_Found
!Swarmi_Test_Redo:
!cmp eax,0 ;test if any remaining positions to process
!jne Swarmi_Redo ;
!jmp Swarmi_While_Len
!Swarmi_Found:
!cmp r13,r14
!jge Swarmi_Wend
!mov rax, r11
!sub rax,[p.v_offset]
ProcedureReturn
!Swarmi_Wend:
!mov rax, -1
ProcedureReturn
EndProcedure
s.s = Space(1000000) + "!!!!!!!!"
c.s = "!!!!!!!!"
ls= StringByteLength(s)
lc = StringByteLength(c)
t = ElapsedMilliseconds()
For i = 1 To 1000
x1 = SwarmiSearchSSE2(@s,ls,@c,lc) / 2 + 1
Next
t1 = ElapsedMilliseconds()-t ;
t = ElapsedMilliseconds()
For i = 1 To 1000
x2 = IdleSundayFind(@s,ls,@c,lc) / 2 + 1
Next
t2 = ElapsedMilliseconds()-t ; 2469
t = ElapsedMilliseconds()
For i = 1 To 1000
x3 = FindW(@s, @c)
;x1 = FindW(@s, @"!") ; Bug OSX ?
Next
t3 = ElapsedMilliseconds()-t ; 3278
t = ElapsedMilliseconds()
For i = 1 To 1000
x4 = FindString(s, c)
Next
t4 = ElapsedMilliseconds()-t ; 2469
t = ElapsedMilliseconds()
For i = 1 To 1000
x5 = FindStringMem(@s,ls,c) / 2 + 1
Next
t5 = ElapsedMilliseconds()-t ;
MessageRequester("Result:", "X1 = " + x1 + " / T1 = " + t1 + #LF$ + "X2 = " + x2 + " / T2 = " + t2 + #LF$ + "X3 = " + x3 + " / T3 = " + t3 + #LF$ + "X4 = " + x4 + " / T4 = " + t4 + #LF$ + "X5 = " + x5 + " / T5 = " + t5)