Seite 1 von 1

String Geschwindkeit mit SSE verbessern

Verfasst: 03.03.2023 19:17
von SMaag
hab mal wieder etwas rumgespielt und bin auf die SSE StringCompare Funktion gestoßen.
Gleich mal ausprorbiert ob das was bringt!

Und tatsächlich:
StringLength mit SSE bei 255 Zeichen String und 1.000.000 Calls

auf x64 AMD Ryzen
ASM_SSE = 14ms
PB Len() = 63 ms

ACHTUNG: Das nur zu Demo-Zwecken verwenden! Es ist nicht klar ob das immer fehlerfrei funktionier!

Code: Alles auswählen

; SSE optimated String DEMO

Procedure.i StrLenSSE_ASCII(*String)
   ;
	; IMM8[1:0]	= 00b
  ;	Src data is unsigned bytes(16 packed unsigned bytes)
  
	; IMM8[3:2]	= 10b
  ; 	We are using Equal Each aggregation
  
	; IMM8[5:4]	= 00b
  ;	Positive Polarity, IntRes2	= IntRes1
  
	; IMM8[6]	= 0b
	;	ECX contains the least significant set bit in IntRes2
	;
  
  CompilerIf #PB_Compiler_64Bit
    
    !MOV RDX, [p.p_String] 
    !PXOR XMM0, XMM0
    !MOV RAX, -16
    
    !loop_strLenAscii:  
      !ADD RAX, 16    
      !PCMPISTRI XMM0, [RDX+RAX], 0001000b
    !JNZ loop_strLenAscii
    ; ECX will contain the offset from edx+eax where the first null
  	; terminating character was found.
    !ADD RAX, RCX
    
  CompilerElse
    
    !MOV EDX, [p.p_String] 
    !PXOR XMM0, XMM0
    !MOV EAX, -16
    
    !loop_strLenAscii:  
      !ADD EAX, 16    
      !PCMPISTRI XMM0, [EDX+EAX], 0001000b
    !JNZ loop_strLenAscii
    ; ECX will contain the offset from edx+eax where the first null
  	; terminating character was found.
    !ADD EAX, ECX
  CompilerEndIf 
  
  ProcedureReturn
EndProcedure

Procedure.i StrLenSSE(*String)
  ;
	; IMM8[1:0]	= 00b
	;	Src data is unsigned bytes(16 packed unsigned bytes)
	; IMM8[3:2]	= 10b
	; 	We are using Equal Each aggregation
	; IMM8[5:4]	= 00b
	;	Positive Polarity, IntRes2	= IntRes1
	; IMM8[6]	= 0b
	;	ECX contains the least significant set bit in IntRes2
	;
  CompilerIf #PB_Compiler_64Bit 
    
    !MOV RDX, [p.p_String] 
    !PXOR XMM0, XMM0
    !VMOVDQU XMM1, [l_packed] 
    !MOV RAX, -16
    
    !loop_strlen:  
      !ADD RAX, 16
      !VMOVDQU XMM2, [RDX+RAX]
      !PXOR XMM2, XMM1
      !PCMPISTRI XMM0, XMM2, 0001000b  
    !JNZ loop_strlen
    
    ; RCX will contain the offset from edx+eax where the first null
  	; terminating character was found.
    !ADD RAX, RCX
    !SHR RAX, 1  
    
  CompilerElse
    
    !MOV EDX, [p.p_String] 
    !PXOR XMM0, XMM0
    !VMOVDQU XMM1, [l_packed] 
    !MOV EAX, -16
    
    !loop_strlen:  
      !ADD EAX, 16
      !VMOVDQU XMM2, [EDX+EAX]
      !PXOR XMM2, XMM1
      !PCMPISTRI XMM0, XMM2, 0001000b  
    !JNZ loop_strlen
    
    ; RCX will contain the offset from edx+eax where the first null
  	; terminating character was found.
    !ADD EAX, ECX
    !SHR EAX, 1  
   
  CompilerEndIf
  
  ProcedureReturn

EndProcedure

Procedure.i SSE_StringCompare(*String1, *String2)
;  https://en.wikibooks.org/wiki/X86_Assembly/SSE
  
;   EQUAL_ANY	        =   0000b
;   RANGES		        =   0100b
;   EQUAL_EACH	      =   1000b
;   EQUAL_ORDERED	    =   1100b
;   NEGATIVE_POLARITY = 010000b
;   BYTE_MASK	       = 1000000b

  !MOV EAX, [p.p_String1]
  !MOV EDX, [p.p_String2]
  ; Subtract s2(EDX) from s1(EAX). This admititedly looks odd, but we
	; can now use EDX to index into s1 and s2. As we adjust EDX to move
	; forward into s2, we can then add EDX to EAX and this will give us
	; the comparable offset into s1 i.e. if we take EDX + 16 then:
	;
	;	EDX     = EDX + 16		        = EDX + 16
	;	EAX+EDX	= EAX -EDX + EDX + 16	= EAX + 16
	;
	; therefore EDX points to s2 + 16 and EAX + EDX points to s1 + 16.
	; We only need one index, convoluted but effective.

	!SUB EAX, EDX
	!SUB EDX, 16		; Avoid extra jump in main loop 
  
  !VMOVDQU XMM2, [l_packed] 
  
  !strcmpLoop:
   	!ADD	EDX, 16
   	!MOVDQU XMM0, [EDX]
   	!PXOR XMM0, XMM2
   	
   	!MOVDQU XMM1, [EDX+EAX]
   	!PXOR XMM1, XMM2

  	; IMM8[1:0]	= 00b
  	;	Src data is unsigned bytes(16 packed unsigned bytes)
  	; IMM8[3:2]	= 10b
  	; 	We are using Equal Each aggregation
  	; IMM8[5:4]	= 01b
  	;	Negative Polarity, IntRes2	= -1 XOR IntRes1
  	; IMM8[6]	= 0b
  	;	ECX contains the least significant set bit in IntRes2  	
   	
  	!PCMPISTRI	XMM0, XMM1, 0011000b ; EQUAL_EACH + NEGATIVE_POLARITY  	
  	; Loop while ZF=0 and CF=0:
  	;	1) We find a null in s1(EDX+EAX) ZF=1
  	;	2) We find a char that does not match CF=1  	
  !JA	strcmpLoop
  	
	; Jump if CF=1, we found a mismatched char
	!JC	strcmpDiff

	; We terminated loop due to a null character i.e. CF=0 and ZF=1
  ; The Strings are equal
	!XOR EAX, EAX
	!jmp exitStrcmp	

	!strcmpDiff:
    !ADD EAX, EDX	  ; Set offset into s1 to match s2
  	; ecx is offset from current poition where two strings do not match,
  	; so copy the respective non-matching byte into eax and edx and fill
  	; in remaining bits w/ zero.
  	!MOVZX	EAX, byte[EAX+ECX]
  	!MOVZX	EDX, byte[EDX+ECX]	
    ; If S1>S2 : Return (+) 
    ; If S1<S2 : Return (-)
  	!SUB	EAX, EDX
  	
  !exitStrcmp:
  ProcedureReturn
  
EndProcedure

DataSection
  packed: 
  Data.a 0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255
   ; This is a Mask we need for 2Byte Wide-Strings because most time second Byte is 0
  ; We have to XOR Blend the 2nd 0-Byte of each Character with a non 0 value. 
  ; Otherwise SSE PCMPISTRI command  will detect this 0 as end of the String.

EndDataSection


EnableExplicit

Define sTest.s, sTest2.s, sASC.s
Define sDbg.s
Define I

Dim bChar.b(255)  ; ASCII CHAR Array

For I=0 To 99     ; Fill Char Array with 100 Ascii Chars
  bChar(i) = 33+I
Next  
  
sTest= Space(255)   ; Fill TestString with 255 Spaces

sDbg= "PB: Len() = "  + Len(sTest)
Debug sDbg
sDbg = "SSE Len = " + Str(StrLenSSE(@sTest))
Debug sDbg
sDbg = "ASCII Len() = " + Str(StrLenSSE_ASCII(@bChar(0)))
Debug sDbg

; TIMIN TEST
#cst_Loops = 1000000

Define ret, T1, T2

; SSE Assembler Version
T1 = ElapsedMilliseconds()
For I = 1 To #cst_Loops
  ret = StrLenSSE(@sTest) 
Next
T1 = ElapsedMilliseconds() - T1

; Standard PB StringLenth
T2 = ElapsedMilliseconds()
For I = 1 To #cst_Loops
  ret = Len(sTest) 
Next
T2 = ElapsedMilliseconds() - T2

MessageRequester("Laufzeit StringLength " + #cst_Loops + " Calls", "ASM SSE = " + T1 + #CRLF$ + "PB LEN() = " + T2)
__________________________________________________
Thread verschoben
Allgemein>Code, Tipps und Tricks
08.08.2023
RSBasic

Re: String Geschwindkeit mit SSE verbessern

Verfasst: 08.08.2023 11:33
von RSBasic
Danke für den Performance-Code. Funktioniert mit meiner Intel-CPU auch.
Aber leider hast du deinen Thread im falschen Forum erstellt. Ich habe den Thread verschoben.