For a first try I implemented String_Length and String_Compare
But !PCMPISTRI is a 16Byte Command, so it's possible to read outside the assigned memory of the string if the string is Alinged only 8 Bytes (or 4 Bytes at x32).
In my opinion this is not a problem as long as the 'OverRead Memory' is assigned to PureBasic. But can I be sure?
Questions:
1. How StringMemory is aligned?
2. Is it possible to get the correct size of assigned StringMemory
Updated 2023/08/23 Proof of concept code
Updated 2024/01/03 16 Byte Problem
Now I understood the Problem. So it is not possible to use
PCMPISTRI safely if memroy is not aligned to 16Byte.
----------------------------------------------------------------------
The Problem of 16 Byte operations on lower aligend memory
----------------------------------------------------------------------
If we process 16 Bytes on lower aligned memory we may run into an overflow at the
end of meory pages when the end of String is located in the last bytes
of the memory page and the following page is not allocated to our process.
Yes this will happen very seldom but it can happen. So it is a source of
crashes may happen in years in the future. Because it can happen, it will happen!
It is only a question of time!
A memory page in x64 Systems is 4096 Bytes
We look on a 8 Byte aligned String at the end of memory page to show the problem
a String followed by a NullChar and a further NullChar then the page ends
EndOfString at Byte 4092..93 and a void 00
| ..... 'I am a String at the end of a memroy page' 0000|
if we process 16 Bytes at 8 Byte align starting at Byte 4088 we read until Byte 4103
we read 8 Bytes into the next page. Now it will crash if the next page is not
allocated to our process! We can use 16 Byte PCMPISTRI operation
only if we are not at the end of a memory page or we have a 16 Byte align memory.
Code: Select all
 ; Proof of concept code for SSE String functions in PureBasic
DeclareModule StrSSE
  
  EnableExplicit
   
  ; ----------------------------------------------------------------------
  ;- STRUCTURES and CONSTANTS
  ;- ----------------------------------------------------------------------
  
  Declare.i SSE_LenAscii(*String)
  Declare.i SSE_LenStr(*String)
  Declare.i SSE_StringCompare(*String1, *String2)
EndDeclareModule
Module StrSSE
  
  EnableExplicit
  ;- ----------------------------------------------------------------------
  ;- Module Private
  ;- ----------------------------------------------------------------------
  Procedure.i SSE_LenAscii(*String)
    ; ============================================================================
    ; NAME: SSE_LenAscii
    ; DESC: Length in number of characters of Ascii Strings
    ; DESC: Use SSE PCmpIStrI operation. This is aprox. 3 times faster than PB Len()
    ; VAR(*String): Pointer to String 1
    ; RET.i: Number of Characters
    ; ============================================================================
     ;
  	; IMM8[1:0]	= 00b
    ;	Src data is unsigned bytes(16 packed unsigned bytes)
    
  	; IMM8[3:2]	= 10b
    ; 	We are using Equal Each aggregation
    
  	; IMM8[5:4]	= 00b
    ;	Positive Polarity, IntRes2	= IntRes1
    
  	; IMM8[6]	= 0b
  	;	ECX contains the least significant set bit in IntRes2
  	;
    ; XMM0 XMM1 XMM2 XMM3 XMM4
    ; XMM1 = [String1] : XMM2=[String2] : XMM3=WideCharMask
  
    CompilerIf #PB_Compiler_64Bit
      
      !MOV RDX, [p.p_String] 
      !PXOR XMM0, XMM0
      !MOV RAX, -16
      
      !loop_strLenAscii:  
        !ADD RAX, 16    
        !PCMPISTRI XMM0, [RDX+RAX], 0001010b ; EQUAL_EACH, Unis
      !JNZ loop_strLenAscii
      ; ECX will contain the offset from edx+eax where the first null
    	; terminating character was found.
      !ADD RAX, RCX    
      
    CompilerElse   
      !MOV EDX, [p.p_String] 
      !PXOR XMM0, XMM0
      !MOV EAX, -16
      
      !loop_strLenAscii:  
        !ADD EAX, 16    
        !PCMPISTRI XMM0, [EDX+EAX], 0001000b ; EQUAL_EACH
      !JNZ loop_strLenAscii
      ; ECX will contain the offset from edx+eax where the first null
    	; terminating character was found.
      !ADD EAX, ECX
    CompilerEndIf 
    
    ProcedureReturn
  EndProcedure
  
  Procedure.i SSE_LenStr(*String)
    ; ============================================================================
    ; NAME: SSE_LenStr
    ; DESC: Length in number of characters of 2-Byte Char Strings
    ; DESC: Use SSE PCmpIStrI operation. This is aprox. 3 times faster than PB Len()
    ; VAR(*String): Pointer to String
    ; RET.i: Number of Characters
    ; ============================================================================
    ;
  	; IMM8[1:0]	= 00b
  	;	Src data is unsigned bytes(16 packed unsigned bytes)
  	; IMM8[3:2]	= 10b
  	; 	We are using Equal Each aggregation
  	; IMM8[5:4]	= 00b
  	;	Positive Polarity, IntRes2	= IntRes1
  	; IMM8[6]	= 0b
  	;	ECX contains the least significant set bit in IntRes2
    
    ; XMM0 XMM1 XMM2 XMM3 XMM4
    ; XMM1 = [String1] : XMM2=[String2] : XMM3=WideCharMask
  
    CompilerIf #PB_Compiler_64Bit 
      
      !MOV RDX, [p.p_String] 
      !PXOR XMM0, XMM0
      !MOV RAX, -16
      
      !loop_strlen:  
        !ADD RAX, 16
        !MOVDQU XMM1, DQWORD[RDX+RAX]
        !PCMPISTRI XMM0, XMM1, 0001001b  ; EQUAL_EACH WORD
      !JNZ loop_strlen
      
      ; RCX will contain the offset from RDX+RAX where the first null
    	; terminating character was found.
      !SHR RAX, 1
      !ADD RAX, RCX
      ProcedureReturn
    CompilerElse
      
      !MOV EDX, [p.p_String] 
      !PXOR XMM0, XMM0
      !MOV EAX, -16
      
      !loop_strlen:  
        !ADD EAX, 16
        !MOVDQU XMM1, DQWORD[EDX+EAX]
        !PCMPISTRI XMM0, XMM1, 0001001b  ; EQUAL_EACH WORD
      !JNZ loop_strlen
      
      ; RCX will contain the offset from edx+eax where the first null
      ; terminating character was found.
      !SHR EAX, 1
      !ADD EAX, ECX
      ProcedureReturn
      
    CompilerEndIf
  
  EndProcedure
  
  Procedure.i SSE_StringCompare(*String1, *String2)
    ; ============================================================================
    ; NAME: SSE_StringCompare
    ; DESC: Compares 2 Strings with SSE operation (PCmpIStrI)
    ; VAR(*String1): Pointer to String 1
    ; VAR(*String2): Pointer to String 2
    ; RET.i: 0 = (S1=S2), >0 = (S1>S2), <0 = (S1<S2)
    ; ============================================================================
        
    ; XMM0 XMM1 XMM2 XMM3 XMM4
    ; XMM1 = [String1] : XMM2=[String2]
    
    CompilerIf #PB_Backend_Asm
      CompilerIf #PB_Compiler_64Bit 
  
        !MOV RAX, [p.p_String1]
        !MOV RDX, [p.p_String2]
        ; Subtract s2(RDX) from s1(RAX). This admititedly looks odd, but we
      	; can now use RDX to index into s1 and s2. As we adjust RDX to move
      	; forward into s2, we can then add RDX to RAX and this will give us
      	; the comparable offset into s1 i.e. if we take RDX + 16 then:
      	;
      	;	RDX     = RDX + 16		        = RDX + 16
      	;	RAX+RDX	= RAX -RDX + RDX + 16	= RAX + 16
      	;
      	; therefore RDX points to s2 + 16 and RAX + RDX points to s1 + 16.
      	; We only need one index, convoluted but effective.
      
      	!SUB RAX, RDX
      	!SUB RDX, 16		; Avoid extra jump in main loop 
           
        !strcmpLoop:
         	!ADD	RDX, 16
         	!MOVDQU XMM2, DQWORD[RDX]
         	
         	!MOVDQU XMM1, DQWORD[RDX+RAX]
      
        	; IMM8[1:0]	= 00b
        	  ;	00b: Src data is unsigned bytes(16 packed unsigned bytes)
         	  ;	01b: Src data is unsigned words( 8 packed unsigned words)
        	
         	; IMM8[3:2]	= 10b
        	; 	We are using Equal Each aggregation
        	; IMM8[5:4]	= 01b
        	;	Negative Polarity, IntRes2	= -1 XOR IntRes1
        	; IMM8[6]	= 0b
        	;	RCX contains the least significant set bit in IntRes2  	
         	
        	!PCMPISTRI XMM2, XMM1, 0011001b ; EQUAL_EACH + NEGATIVE_POLARITY + UNSIGNED_WORDS  	
        	; Loop while ZF=0 and CF=0:
        	;	1) We find a null in s1(RDX+RAX) ZF=1
        	;	2) We find a char that does not match CF=1  	
        !JA	strcmpLoop ; IF CF=0 And ZF=0     	
      	; Jump if CF=1, we found a mismatched char
      	!JC	strcmpDiff ; IF CF=1
      
      	; We terminated loop due to a null character i.e. CF=0 and ZF=1
        ; The Strings are equal
      	!XOR RAX, RAX
      	!jmp exitStrcmp	
      
      	!strcmpDiff:
          !ADD RAX, RDX	  ; Set offset into s1 to match s2
          
          ; RCX is offset from current poition in chars where two strings do not match,
        	; so copy the respective non-matching char into RAX and RDX and fill
        	; in remaining bits w/ zero. Because of 2ByteChar we have convert Word to Byte
          ; to get the correct AdressOffset by RCX*2
          
          !ADD RCX, RCX  ; Number of Chars to Adress Offset
          
          !MOVZX	RAX, WORD[RAX+RCX]
         	!MOVZX	RDX, WORD[RDX+RCX]
        	; If S1=S2 : Return (0) ; #PB_String_Equal
          ; If S1>S2 : Return (+) ; #PB_String_Greater
          ; If S1<S2 : Return (-) ; #PB_String_Lower
         	!SUB	RAX, RDX 
         	;!MOV RAX, RCX ; for test only, return Adress Offset
        !exitStrcmp:
        ProcedureReturn
        
      CompilerElse
        
        !MOV EAX, [p.p_String1]
        !MOV EDX, [p.p_String2]
      	!SUB EAX, EDX
      	!SUB EDX, 16		; Avoid extra jump in main loop 
           
        !strcmpLoop:
         	!ADD	EDX, 16
         	!MOVDQU XMM2, DQWORD[EDX]
         	
         	!MOVDQU XMM1, DQWORD[EDX+EAX]
       	  !PCMPISTRI XMM2, XMM1, 0011001b ; EQUAL_EACH + NEGATIVE_POLARITY + UNSIGNED_WORDS  	
        	; Loop while ZF=0 and CF=0:
        	;	1) We find a null in s1(EDX+EAX) ZF=1
        	;	2) We find a char that does not match CF=1  	
        !JA	strcmpLoop ; IF CF=0 And ZF=0   	
      	; Jump if CF=1, we found a mismatched char
      	!JC	strcmpDiff ; IF CF=1
      
      	; We terminated loop due to a null character i.e. CF=0 and ZF=1
        ; The Strings are equal
      	!XOR EAX, EAX
      	!jmp exitStrcmp	
      
      	!strcmpDiff:
          !ADD EAX, EDX	  ; Set offset into s1 to match s2             
          !ADD ECX, ECX  ; Number of Chars to Adress Offset       
          !MOVZX	EAX, WORD[EAX+ECX]
         	!MOVZX	EDX, WORD[EDX+ECX]
        	; If S1=S2 : Return (0) ; #PB_String_Equal
          ; If S1>S2 : Return (+) ; #PB_String_Greater
          ; If S1<S2 : Return (-) ; #PB_String_Lower
         	!SUB	EAX, EDX 
         	;!MOV EAX, ECX ; for test only, return Adress Offset
        !exitStrcmp:
        ProcedureReturn
        
      CompilerEndIf
      
    CompilerElse
      
  CompilerEndIf
    
  EndProcedure
   
EndModule
CompilerIf #PB_Compiler_IsMainFile    
  ;- ----------------------------------------------------------------------
  ;- TEST-CODE
  ;- ----------------------------------------------------------------------
  
  EnableExplicit
  UseModule StrSSE
  
  Define sTest.s, sTest2.s, sASC.s
  Define sDbg.s
  Define I
  
  Dim bChar.b(255)  ; ASCII CHAR Array
  
  For I=0 To 99     ; Fill Char Array with 100 Ascii Chars
    bChar(i) = 33+I
  Next  
    
  sTest= Space(255)   ; Fill TestString with 255 Spaces
  
  sDbg= "PB: Len() = "  + Len(sTest)
  Debug sDbg
  sDbg = "SSE Len = " + Str(SSE_LenStr(@sTest))
  Debug sDbg
  sDbg = "ASCII Len() = " + Str(SSE_LenAscii(@bChar(0)))
  Debug sDbg
  
  Define S1.s, S2.s
  Define ret
  S1 = "Ich bin ein langer String, in welchem man nach 1234 suchen kann 5677"
  S2 = "Ich bin ein langer String, in welchem man nach 1234 suchen kann 5679"
  ; S1 = "01234567"
  ; S2 = "01234568"
  
  ret = SSE_StringCompare(@S1, @S2)
  Debug "SSE-StringCompare = " + ret
  
  ; ----------------------------------------------------------------------
  ; TIMING TEST
  ; ----------------------------------------------------------------------
  
  #cst_Loops = 10000000
  
  Define T1, T2, txtStrLen.s, txtStrCompare.s
  
  Debug "Stringlength"
  Debug Str(@S1 % 32) + " : " + Hex(@S1)
  Debug Str(@S2 % 16) + " : " + Hex(@S2)
  
  ; ----------------    StringLength ----------------------
  ; SSE Assembler Version
  T1 = ElapsedMilliseconds()
  For I = 1 To #cst_Loops
    ret = SSE_LenStr(@S1) 
  Next
  T1 = ElapsedMilliseconds() - T1
  
  ; Standard PB StringLenth
  T2 = ElapsedMilliseconds()
  For I = 1 To #cst_Loops
    ;ret = Len(S1)
    ret = MemoryStringLength(@S1)
  Next
  T2 = ElapsedMilliseconds() - T2
  
  txtStrLen = "StringLength  " + #cst_Loops + " Calls : ASM SSE = " + T1 + " / " + "PB Version = " + T2
  
  
  ; ----------------    StringCompare ----------------------
  
  ; SSE Assembler Version
  T1 = ElapsedMilliseconds()
  For I = 1 To #cst_Loops
    ret = SSE_StringCompare(@S1, @S2)
  Next
  T1 = ElapsedMilliseconds() - T1
  
  ; Standard PB StringLenth
  T2 = ElapsedMilliseconds()
  For I = 1 To #cst_Loops
    ret = CompareMemoryString(@S1, @S2)
  Next
  T2 = ElapsedMilliseconds() - T2
  
  txtStrCompare = "StringCompare " + #cst_Loops + " Calls : ASM SSE = " + T1 + " / " + "PB Version = " + T2
  
  MessageRequester("Timing results", txtStrLen + #CRLF$ + txtStrCompare)
  
CompilerEndIf

