Gleich mal ausprorbiert ob das was bringt!
Und tatsächlich:
StringLength mit SSE bei 255 Zeichen String und 1.000.000 Calls
auf x64 AMD Ryzen
ASM_SSE = 14ms
PB Len() = 63 ms
ACHTUNG: Das nur zu Demo-Zwecken verwenden! Es ist nicht klar ob das immer fehlerfrei funktionier!
Code: Alles auswählen
; SSE optimated String DEMO
Procedure.i StrLenSSE_ASCII(*String)
;
; IMM8[1:0] = 00b
; Src data is unsigned bytes(16 packed unsigned bytes)
; IMM8[3:2] = 10b
; We are using Equal Each aggregation
; IMM8[5:4] = 00b
; Positive Polarity, IntRes2 = IntRes1
; IMM8[6] = 0b
; ECX contains the least significant set bit in IntRes2
;
CompilerIf #PB_Compiler_64Bit
!MOV RDX, [p.p_String]
!PXOR XMM0, XMM0
!MOV RAX, -16
!loop_strLenAscii:
!ADD RAX, 16
!PCMPISTRI XMM0, [RDX+RAX], 0001000b
!JNZ loop_strLenAscii
; ECX will contain the offset from edx+eax where the first null
; terminating character was found.
!ADD RAX, RCX
CompilerElse
!MOV EDX, [p.p_String]
!PXOR XMM0, XMM0
!MOV EAX, -16
!loop_strLenAscii:
!ADD EAX, 16
!PCMPISTRI XMM0, [EDX+EAX], 0001000b
!JNZ loop_strLenAscii
; ECX will contain the offset from edx+eax where the first null
; terminating character was found.
!ADD EAX, ECX
CompilerEndIf
ProcedureReturn
EndProcedure
Procedure.i StrLenSSE(*String)
;
; IMM8[1:0] = 00b
; Src data is unsigned bytes(16 packed unsigned bytes)
; IMM8[3:2] = 10b
; We are using Equal Each aggregation
; IMM8[5:4] = 00b
; Positive Polarity, IntRes2 = IntRes1
; IMM8[6] = 0b
; ECX contains the least significant set bit in IntRes2
;
CompilerIf #PB_Compiler_64Bit
!MOV RDX, [p.p_String]
!PXOR XMM0, XMM0
!VMOVDQU XMM1, [l_packed]
!MOV RAX, -16
!loop_strlen:
!ADD RAX, 16
!VMOVDQU XMM2, [RDX+RAX]
!PXOR XMM2, XMM1
!PCMPISTRI XMM0, XMM2, 0001000b
!JNZ loop_strlen
; RCX will contain the offset from edx+eax where the first null
; terminating character was found.
!ADD RAX, RCX
!SHR RAX, 1
CompilerElse
!MOV EDX, [p.p_String]
!PXOR XMM0, XMM0
!VMOVDQU XMM1, [l_packed]
!MOV EAX, -16
!loop_strlen:
!ADD EAX, 16
!VMOVDQU XMM2, [EDX+EAX]
!PXOR XMM2, XMM1
!PCMPISTRI XMM0, XMM2, 0001000b
!JNZ loop_strlen
; RCX will contain the offset from edx+eax where the first null
; terminating character was found.
!ADD EAX, ECX
!SHR EAX, 1
CompilerEndIf
ProcedureReturn
EndProcedure
Procedure.i SSE_StringCompare(*String1, *String2)
; https://en.wikibooks.org/wiki/X86_Assembly/SSE
; EQUAL_ANY = 0000b
; RANGES = 0100b
; EQUAL_EACH = 1000b
; EQUAL_ORDERED = 1100b
; NEGATIVE_POLARITY = 010000b
; BYTE_MASK = 1000000b
!MOV EAX, [p.p_String1]
!MOV EDX, [p.p_String2]
; Subtract s2(EDX) from s1(EAX). This admititedly looks odd, but we
; can now use EDX to index into s1 and s2. As we adjust EDX to move
; forward into s2, we can then add EDX to EAX and this will give us
; the comparable offset into s1 i.e. if we take EDX + 16 then:
;
; EDX = EDX + 16 = EDX + 16
; EAX+EDX = EAX -EDX + EDX + 16 = EAX + 16
;
; therefore EDX points to s2 + 16 and EAX + EDX points to s1 + 16.
; We only need one index, convoluted but effective.
!SUB EAX, EDX
!SUB EDX, 16 ; Avoid extra jump in main loop
!VMOVDQU XMM2, [l_packed]
!strcmpLoop:
!ADD EDX, 16
!MOVDQU XMM0, [EDX]
!PXOR XMM0, XMM2
!MOVDQU XMM1, [EDX+EAX]
!PXOR XMM1, XMM2
; IMM8[1:0] = 00b
; Src data is unsigned bytes(16 packed unsigned bytes)
; IMM8[3:2] = 10b
; We are using Equal Each aggregation
; IMM8[5:4] = 01b
; Negative Polarity, IntRes2 = -1 XOR IntRes1
; IMM8[6] = 0b
; ECX contains the least significant set bit in IntRes2
!PCMPISTRI XMM0, XMM1, 0011000b ; EQUAL_EACH + NEGATIVE_POLARITY
; Loop while ZF=0 and CF=0:
; 1) We find a null in s1(EDX+EAX) ZF=1
; 2) We find a char that does not match CF=1
!JA strcmpLoop
; Jump if CF=1, we found a mismatched char
!JC strcmpDiff
; We terminated loop due to a null character i.e. CF=0 and ZF=1
; The Strings are equal
!XOR EAX, EAX
!jmp exitStrcmp
!strcmpDiff:
!ADD EAX, EDX ; Set offset into s1 to match s2
; ecx is offset from current poition where two strings do not match,
; so copy the respective non-matching byte into eax and edx and fill
; in remaining bits w/ zero.
!MOVZX EAX, byte[EAX+ECX]
!MOVZX EDX, byte[EDX+ECX]
; If S1>S2 : Return (+)
; If S1<S2 : Return (-)
!SUB EAX, EDX
!exitStrcmp:
ProcedureReturn
EndProcedure
DataSection
packed:
Data.a 0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255
; This is a Mask we need for 2Byte Wide-Strings because most time second Byte is 0
; We have to XOR Blend the 2nd 0-Byte of each Character with a non 0 value.
; Otherwise SSE PCMPISTRI command will detect this 0 as end of the String.
EndDataSection
EnableExplicit
Define sTest.s, sTest2.s, sASC.s
Define sDbg.s
Define I
Dim bChar.b(255) ; ASCII CHAR Array
For I=0 To 99 ; Fill Char Array with 100 Ascii Chars
bChar(i) = 33+I
Next
sTest= Space(255) ; Fill TestString with 255 Spaces
sDbg= "PB: Len() = " + Len(sTest)
Debug sDbg
sDbg = "SSE Len = " + Str(StrLenSSE(@sTest))
Debug sDbg
sDbg = "ASCII Len() = " + Str(StrLenSSE_ASCII(@bChar(0)))
Debug sDbg
; TIMIN TEST
#cst_Loops = 1000000
Define ret, T1, T2
; SSE Assembler Version
T1 = ElapsedMilliseconds()
For I = 1 To #cst_Loops
ret = StrLenSSE(@sTest)
Next
T1 = ElapsedMilliseconds() - T1
; Standard PB StringLenth
T2 = ElapsedMilliseconds()
For I = 1 To #cst_Loops
ret = Len(sTest)
Next
T2 = ElapsedMilliseconds() - T2
MessageRequester("Laufzeit StringLength " + #cst_Loops + " Calls", "ASM SSE = " + T1 + #CRLF$ + "PB LEN() = " + T2)
Thread verschoben
Allgemein>Code, Tipps und Tricks
08.08.2023
RSBasic