Re: Why FindString() is so terribly slow?
Posted: Fri Nov 30, 2018 5:22 am
@gurj
This must be repeated 1000 times!
This must be repeated 1000 times!
http://www.purebasic.com
https://www.purebasic.fr/english/
+1 Because it works for 32/64 bit as well as for Ascii and Unicode!RASHAD wrote:Great topic
Very big thanks to all contributors
And thanks idle for pointing to wilbert staff
Up to now the winner is wilbertCode: Select all
:
Code: Select all
HayStackSize = Len(HayStackstring$)
*HayStack = AllocateMemory(HayStackSize)
PokeS(*HayStack,HayStackstring$,-1,#PB_Ascii)
No, the code ignores Unicode simply by converting everything to Ascii (do this also outside the For-Loop).Michael Vogel wrote:Because it works ... as well as for Ascii and Unicode!
The code u posted does not work -> Pos = 0!RASHAD wrote:Great topic
Very big thanks to all contributors
And thanks idle for pointing to wilbert staff
Up to now the winner is wilbert
Code: Select all
;SwarmiSearchSSE2 ->
;~ 9 ms naked (all sting lengths are known)
;~ 12 - 90 ms (if 1 string length is known)
;~ 92 ms (both string lengths are unknown)
;FindString_SSE42_64Bit -> ~ 85 ms string length is not needed
;LocStrA -> ~ 128 ms string length is not needed
Code: Select all
EnableExplicit
Global TestString.s
Global Signature.s
Global Index.i
Global T1.q, R1.i, *A1
Global T2.q, R2.i, *A2
Global L1.i, L2.i
Procedure.q FindString_SSE42_64Bit(pStr1, pStr2, StartPos)
!mov rcx,[p.v_pStr1]
!mov rdx,[p.v_pStr2]
!mov rax,[p.v_StartPos]
!mov rax,r8 ;need RAX for return-value ProcedureReturn, R8=StartPos
!cmp rax,1
!jge @f
!mov rax,1 ;or End or Error
!@@:
!mov r8,rcx ;need RCX for return-value PCMPISTRI, RCX=pStr1
!sub r8,1
!add r8,rax ;rax=StartPos
!movdqu xmm0,[rdx] ;first max.16 Bytes of Find$, RDX=pStr2
!xor rcx,rcx
!Search:
!add rax,rcx
!add r8,rcx
!pcmpistri xmm0,dqword[r8],00001100b ;Least Significant Index, Positive Polarity (IntRes2=IntRes1), Equal Ordered, Unsigned Bytes (=No Unicode)
!jz Base$End ;Zero-Flag is set if found end of Base$
!js Find$End ;Sign-Flag is set if found end of Find$
!jrcxz @f ;jump if RCX is zero
!jmp Search
!@@:
!movdqa xmm1,xmm0 ;save xmm0
!mov r9,r8
!mov r10,rdx
!@@:
!add r10,16
!add r9,16 ;next 16 Bytes of Base$
!movdqu xmm1,[r10] ;next 16 Bytes of Find$
!pcmpistri xmm1,dqword[r9],00001100b ;compare the strings
!jz Base$End
!js Find$End
!jrcxz @b
!jmp Search
!Base$End:
!cmp rcx,16
!jne Found
!xor rax,rax
!jmp NoFound
!Find$End:
!or rcx,rcx ;RCX Zero? Conventional
!jnz Search
!Found:
!add rax,rcx
!NoFound:
ProcedureReturn ;rax
EndProcedure
Procedure SwarmiSearchSSE2(*haystack,len,*needle,lenN,pos=0);Result is wrong by 1 Character!
;Swarmi search SSE2 bit parallel search algorithm for for 1 to 16 byte needles
;performs ~ 1:1 With booyer moore And other variants For a 16 byte needle With a 0-255 alphabet range
;efficiency increases with parallelism. 2:1 for 8 byte needle, 4:1 for 4 byte needle, 8:1 for 2 byte needle and 16:1 1 byte needle
Protected offset
!align 32
DataSection
!m16:
!dq 0xffffffffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffffff
!dq 0xffffffffffffffff
!dq 0xffffffff
!dq 0xffffffffffffffff
!dq 0xffffff
!dq 0xffffffffffffffff
!dq 0xffff
!dq 0xffffffffffffffff
!dq 0xff
!dq 0xffffffffffffffff
!dq 0x0
!dq 0xffffffffffffff
!dq 0x0
!dq 0xffffffffffff
!dq 0x0
!dq 0xffffffffff
!dq 0x0
!dq 0xffffffff
!dq 0x0
!dq 0xffffff
!dq 0x0
!dq 0xfff
!dq 0x0
!dq 0xff
!dq 0x0
EndDataSection
!mov rcx,16
!sub rcx,[p.v_lenN]
!imul rcx,16
!lea r8, [m16]
!movdqu xmm8, [r8+rcx] ;load trailing byte mask
!mov r9, [p.p_needle] ;build First char mask eg "a0a0a0a0a0a0a0"
!movzx eax,byte [r9]
!movd xmm7, eax
!punpcklbw xmm7, xmm7
!punpcklwd xmm7, xmm7
!pshufd xmm7, xmm7, 0 ;FirstCharMask
!mov r8,[p.p_haystack] ;align haysack
!mov rax,r8
!and rax ,15
!mov [p.v_offset],rax ;store offset
!sub r8,rax
!movdqu xmm5,[r9] ;store needle
!mov r14,[p.v_len]
!mov r13,[p.v_pos]
!sub r13,16
!Swarmi_While_Len:
!cmp r13, r14
!jge Swarmi_Wend
!add r13,16
!movdqa xmm0,[r8+r13] ;Get Haystack and check against mask chars"
!pcmpeqb xmm0,xmm7 ;FirstCharMask
!pmovmskb eax,xmm0
!cmp eax,0
!je Swarmi_While_Len
!mov r11,r13 ;current position
!Swarmi_Redo:
!bsf rdx,rax ;if there was a match rax holds 1 at the byte positions where a match is found in mask
!btr rax,rdx
!add r11,rdx ;align the haystack step forward x bytes
!movdqu xmm0,[r8+r11] ;haystack
!pcmpeqb xmm0,xmm5 ;check equal haystack vs needle
!pandn xmm0,xmm8
!pmovmskb edx,xmm0
!cmp edx,0
!je Swarmi_Found
!Swarmi_Test_Redo:
!cmp eax,0 ;test if any remaining positions to process
!jne Swarmi_Redo ;
!jmp Swarmi_While_Len
!Swarmi_Found:
!cmp r13,r14
!jge Swarmi_Wend
!mov rax, r11
!sub rax,[p.v_offset]
ProcedureReturn
!Swarmi_Wend:
!mov rax, -1
ProcedureReturn
EndProcedure
Procedure.i LocStrA(*Input,*Signature);Ascii
!mov rcx,[p.p_Input]
!mov rdx,[p.p_Signature]
!xor rax,rax
!mov r10,rcx
!cmp byte[rdx],0h
!je LocStr_Return
!LocStr_Loop:
!cmp byte[rcx],0h
!je LocStr_Return
!mov bl,byte[rcx]
!cmp bl,byte[rdx]
!jne LocStr_Next
!lea r8,[rcx+1h]
!lea r9,[rdx+1h]
!LocStr_Match:
!cmp byte[r9],0h
!je LocStr_Result
!cmp byte[r8],0h
!je LocStr_Next
!mov bl,byte[r8]
!cmp bl,byte[r9]
!jne LocStr_Next
!inc r8
!inc r9
!jmp LocStr_Match
!LocStr_Result:
!sub rcx,r10
!lea rax,[rcx+1h]
!jmp LocStr_Return
!LocStr_Next:
!inc rcx
!jmp LocStr_Loop
!LocStr_Return:
ProcedureReturn
EndProcedure
#Tries = 2e2
TestString = Space(1000000) + "8!#"
Signature = "8!"
*A1 = Ascii(TestString)
*A2 = Ascii(Signature)
L1 = Len(TestString)
L2 = Len(Signature)
T1 = ElapsedMilliseconds()
For Index = 1 To #Tries;10000000
;L1 = Len(TestString);<- needs to be here unless u only work with known strings!
;L2 = Len(Signature)
R1 = SwarmiSearchSSE2(*A1,L1,*A2,L2)
Next
T1 = ElapsedMilliseconds() - T1
T2 = ElapsedMilliseconds()
For Index = 1 To #Tries;10000000
R2 = FindString_SSE42_64Bit(*A1,*A2,0);LocStrA(*A1,*A2);LocStrA(*A1,*A2)
Next
T2 = ElapsedMilliseconds() - T2
MessageRequester("Result",
"T1: " + Str(T1) + #CR$ + "R1: " + Str(R1) + #CR$ + #CR$ +
"T2: " + Str(T2) + #CR$ + "R2: " + Str(R2))
End
No SwarmiSearchSSE2() was my creation but Wilberts SSE2_Find is a little better and it also supports 32 bit.Mijikai wrote:The code u posted does not work -> Pos = 0!RASHAD wrote:Great topic
Very big thanks to all contributors
And thanks idle for pointing to wilbert staff
Up to now the winner is wilbert
I assume Wilberts original code is SwarmiSearchSSE2() which does work.
Its just amazing what Wilbert and Helle can do (but both are ascii only!).
My mistake then all credits go to you, thanksidle wrote: No SwarmiSearchSSE2() was my creation but Wilberts SSE2_Find is a little better and it also supports 32 bit.
Code: Select all
EnableExplicit
Global TestString$
Global Signature$
Global i
Global T1.q, R1, *A1
Global T2.q, R2, *A2
Global T3.q, R3, *A3
Global L1, L2
...
...
...
...
#Tries = 1000
TestString$ = Space(1000000) + "8!#"
Signature$ = "8!"
T1 = ElapsedMilliseconds()
For i = 1 To #Tries
L1 = Len(TestString$)
L2 = Len(Signature$)
*A1 = Ascii(TestString$)
*A2 = Ascii(Signature$)
R1 = SwarmiSearchSSE2(*A1,L1,*A2,L2)
FreeMemory (*A1)
FreeMemory (*A2)
Next
T1 = ElapsedMilliseconds() - T1
T2 = ElapsedMilliseconds()
For i = 1 To #Tries
*A1 = Ascii(TestString$)
*A2 = Ascii(Signature$)
R2 = FindString_SSE42_64Bit(*A1,*A2,0)
FreeMemory (*A1)
FreeMemory (*A2)
Next
T2 = ElapsedMilliseconds() - T2
T3 = ElapsedMilliseconds()
For i = 1 To #Tries
R3 = FindString(TestString$, Signature$)
Next
T3 = ElapsedMilliseconds() - T3
MessageRequester("Result",
"T1: " + T1 + #CR$ + "R1: " + R1 + #CR$ + #CR$ +
"T2: " + T2 + #CR$ + "R2: " + R2 + #CR$ + #CR$ +
"T3: " + T3 + #CR$ + "R3: " + R3)
Code: Select all
.
.
.
t1 = ElapsedMilliseconds()
HayStackstring$ = Space(1000000) + "!"
HayStackSize = Len(HayStackstring$)
*HayStack = AllocateMemory(HayStackSize)
PokeS(*HayStack,HayStackstring$,-1,#PB_Ascii)
Needlestring$ = "!"
NeedleSize = Len(Needlestring$)
*Needle = AllocateMemory(NeedleSize)
PokeS(*Needle,Needlestring$,-1,#PB_Ascii)
Pos = SSE2_Find(*HayStack, HayStackSize, *Needle, NeedleSize)
t2 = ElapsedMilliseconds() - t1
S.s = "SSE2 Find : " + Str(t2) + "ms" + #CRLF$ + "At Position : "+ Str(Pos+NeedleSize)
MessageRequester("Test results", S)
HayStackstring$ = ""
Needlestring$ = ""
t1 = ElapsedMilliseconds()
Stackstring$ = Space(1000000) + "!"
Nstring$ = "!"
pos = FindString(Stackstring$,Nstring$,1)
t2 = ElapsedMilliseconds() - t1
S.s = "SSE2 Find : " + Str(t2) + "ms" + #CRLF$ + "At Position : "+ Str(Pos)
MessageRequester("Test results", S)
The preparations u refering to are done for both functions before they enter the loop so i dont see you point.Josh wrote:@Mijikai
The test is irrelevant if you make preparings outside the for loop.
In addition, there is no real Unicode search in your procedures.
Thanksdavido wrote:@Mijikai,
wilbert's FindData Module
The 1000 loops here are to get better comparisons in the times. In practice, you will hardly try to find the same searchstring in a string 1000 times in a row.Mijikai wrote:The preparations u refering to are done for both functions before they enter the loop so i dont see you point.
I thought you are interested in compring the speeds betweend different search routines.Josh wrote:The 1000 loops here are probably only there to get better comparisons in the times. In practice, you will hardly try to find the same searchstring in a string 1000 times in a row.Mijikai wrote:The preparations u refering to are done for both functions before they enter the loop so i dont see you point.
So, all your preparations have to be done in the loop. Then you have comparable results and you will notice that there are only marginale time differences between your code and the native Pb FindString().
Yes, i want nothing else then comparing the speeds between different search routines. In my code here, I even included the native Pb FindString ().Mijikai wrote:I thought you are interested in compring the speeds betweend different search routines.
You don't understand, the Asc() functions must be in the loop!!! Your code does not bring realistic conditions. The times your code spits out are completely irrelevant for realistic comparisons with the native Pb FindString().Mijikai wrote:Including the Ascii() function makes no sense unless u like to make the result less realistic. You can always measure the timing of Ascii() but then also dont include other calls.
For some versions, Pb works only in Unicode mode and FindString () logically too. Please tell me, why this will result in bogus results.Mijikai wrote:Ur example makes further no sense since FindString() will operate in unicode mode.
What obviously will result in bogus results.