Unicode module (SSE2 optimized)

Share your advanced PureBasic knowledge/code with the community.
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3942
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Unicode module (SSE2 optimized)

Post by wilbert »

In another thread I started with some unicode procedures with SSE2 optimization.
I added a couple more and wrapped them inside a module.
Case insensitive functionality only ignores case for A-Z and a-z .
Since it's quite a bit of ASM code, testing is appreciated to detect any bugs :|

The UCountString method is not very useful for very short strings that occur a lot like a single space character.
In a case like that it is slow because it's not one continues operation but a new call to a find procedure every time a match occurs.
I added a simple UCountChar procedure to quickly count how often a single character occurs.

Here's the UCS2 module code

Code: Select all

DeclareModule UCS2
  
  ; v 1.01  24/08/2014
  
  ; Case insensitive functionality only ignores case for A-Z and a-z
  
  ; Procedure.i UCountChar(*UCS2String, CharCode.u)
  ; Procedure.i UCountString(*UCS2String, *UCS2StringToFind, Mode = #PB_String_CaseSensitive)
  ; Procedure.i UFind(*UCS2String, *UCS2StringToFind)
  ; Procedure.i UFindI(*UCS2String, *UCS2StringToFind)
  ; Procedure.i UFindString(*UCS2String, *UCS2StringToFind, StartPosition = -1, Mode = #PB_String_CaseSensitive)
  ; Procedure.i ULen(*UCS2String, MaxLen.i = -1)
  ; Procedure.s UMid(*UCS2String, StartPos.i, Length.i = -1)
  
  Declare.i UCountChar(*UCS2String, CharCode.u)
  Declare.i UCountString(*UCS2String, *UCS2StringToFind, Mode = #PB_String_CaseSensitive)
  Declare.i UFind(*UCS2String, *UCS2StringToFind)
  Declare.i UFindI(*UCS2String, *UCS2StringToFind)
  Declare.i UFindString(*UCS2String, *UCS2StringToFind, StartPosition = -1, Mode = #PB_String_CaseSensitive)
  Declare.i ULen(*UCS2String, MaxLen.i = -1)
  Declare.s UMid(*UCS2String, StartPos.i, Length.i = -1)
  
EndDeclareModule

Module UCS2
  DisableDebugger                 ; Required because of the use of (r/e)bp register !
  EnableASM
  
  CompilerIf #PB_Compiler_Processor = #PB_Processor_x86
    Macro rax : eax : EndMacro 
    Macro rbx : ebx : EndMacro
    Macro rcx : ecx : EndMacro
    Macro rdx : edx : EndMacro
    Macro rsi : esi : EndMacro
    Macro rdi : edi : EndMacro
    Macro rbp : ebp : EndMacro
    Macro rsp : esp : EndMacro
  CompilerEndIf
  
  Macro MOV_XR(xmm_reg, reg)
    !movdqa xmm_reg, [reg]
  EndMacro
  
  Macro PUSH_XMM(xmm_reg)
    sub rsp, 16
    !movdqu [rsp], xmm_reg
  EndMacro
  
  Macro POP_XMM(xmm_reg)
    !movdqu xmm_reg, [rsp]
    add rsp, 16
  EndMacro
  
  Macro ULen_(u)
    CompilerIf u
      !movdqa xmm3, xmm0
      !pslldq xmm0, 1
      !inc cl
    CompilerEndIf
    !pcmpeqw xmm0, xmm1
    !pmovmskb eax, xmm0
    !shr eax, cl
    !shl eax, cl
    lea rdx, [rdx + 16]
    !test eax, eax
    !jnz ucs2.l_len_e
    !ucs2.l_len_loop#u#i:
    MOV_XR(xmm0, rdx)
    CompilerIf u
      !movdqa xmm2, xmm0
      !pslldq xmm0, 1
      !psrldq xmm3, 15
      !por xmm0, xmm3
      !movdqa xmm3, xmm2
    CompilerEndIf
    !pcmpeqw xmm0, xmm1
    !pmovmskb eax, xmm0
    cmp rdx, rbx
    lea rdx, [rdx + 16]
    !ja ucs2.l_len_e
    !test eax, eax
    !jz ucs2.l_len_loop#u#i
    !jmp ucs2.l_len_e
  EndMacro
    
  Procedure.i ULen(*UCS2String, MaxLen.i = -1)
    
    mov rdx, *UCS2String
    test rdx, rdx
    !jz ucs2.l_len_ae    
    
    mov rax, MaxLen
    push rbx
    lea rbx, [rdx + rax * 2]
    sar rax, 63
    Or rbx, rax
    mov rcx, rdx
    And rdx, -16
    MOV_XR(xmm0, rdx)
    !pxor xmm1, xmm1
    sub rcx, rdx
    !test ecx, 1
    !jnz ucs2.l_len_unaligned
    ULen_(0)
    
    !ucs2.l_len_unaligned:
    ULen_(1)
    
    ; exit procedure
    !ucs2.l_len_e:
    bsf ecx, eax
    lea rax, [rcx + rdx - 16]
    cmp rax, rbx
    cmova rax, rbx
    pop rbx  
    sub rax, *UCS2String
    shr rax, 1
    ProcedureReturn
    
    !ucs2.l_len_ae:
    ProcedureReturn 0
  EndProcedure
  
  Macro UFind_(u, i)
    MOV_XR(xmm2, rdx)             ; get 8 characters
    CompilerIf u
      !movdqa xmm5, xmm2
      !pslldq xmm2, 1
      !inc cl
    CompilerEndIf
    !pxor xmm4, xmm4
    !pcmpeqw xmm4, xmm2           ; compare with 00000000
    CompilerIf i
      !pcmpeqw xmm6, xmm6
      !psrlw xmm6, 15
      !psllw xmm6, 5
      !por xmm0, xmm6
      !por xmm1, xmm6
      !por xmm2, xmm6
    CompilerEndIf
    !movdqa xmm3, xmm0
    !pcmpeqw xmm3, xmm2           ; compare with AAAAAAAA
    !psllw xmm3, 8
    !por xmm3, xmm4               ; combine results
    !pmovmskb eax, xmm3           ; move mask
    !shr eax, cl
    !shl eax, cl
    !jmp ucs2.l_find_loop0#u#i
    
    !ucs2.l_find_loop#u#i:
    add rdx, 16
    MOV_XR(xmm2, rdx)             ; get next 8 characters
    CompilerIf u
      !movdqa xmm4, xmm2
      !pslldq xmm2, 1
      !psrldq xmm5, 15
      !por xmm2, xmm5
      !movdqa xmm5, xmm4
    CompilerEndIf
    !pxor xmm4, xmm4
    !pcmpeqw xmm4, xmm2           ; compare with 00000000
    CompilerIf i
      !por xmm2, xmm6
    CompilerEndIf
    !movdqa xmm3, xmm0
    !pcmpeqw xmm3, xmm2           ; compare with AAAAAAAA
    !psllw xmm3, 8
    !por xmm3, xmm4               ; combine results
    !pmovmskb eax, xmm3           ; move mask
    !ucs2.l_find_loop0#u#i:
    !test eax, eax                ; check if any A or 0
    !jz ucs2.l_find_loop#u#i      ; loop if not
    !pcmpeqw xmm2, xmm1           ; compare with BBBBBBBB
    !pmovmskb ebx, xmm2           ; move mask
    !lea eax, [eax * 4]
    !or ebx, ebp
    !and eax, ebx
    !jz ucs2.l_find_loop#u#i      ; loop if nothing found
    !ucs2.l_bitscan#u#i:
    !bsf ebx, eax                 ; scan for first bit set
    !jz ucs2.l_find_loop#u#i
    !test ebx, 1                  ; bit set on an even place ?
    !jz ucs2.l_find_nf#i          ; if so, exit with not found
    !btr eax, ebx                 ; clear bit
    
    ; full check
    CompilerIf u
      lea rbx, [rdx + rbx - 2]    ; rbx = source
    CompilerElse
      lea rbx, [rdx + rbx - 1]    ; rbx = source
    CompilerEndIf
    CompilerIf i
      mov rsi, -2                 ; rsi = index
    CompilerElse
      XOr rsi, rsi                ; rsi = index
    CompilerEndIf
    !ucs2.l_fullcheck#u#i:
    lea rsi, [rsi + 2]      
    mov cx, [rdi + rsi]
    !test cx, cx
    !jz ucs2.l_find_f#i            ; *** exit found ***
    XOr cx, [rbx + rsi - 2]
    !jz ucs2.l_fullcheck#u#i
    CompilerIf i
      !test cx, 0xffdf
      !jnz ucs2.l_bitscan#u#i
      Or cx, [rbx + rsi - 2]
      !sub cx, 0x61
      !cmp cx, 26
      !jb ucs2.l_fullcheck#u#i
    CompilerEndIf   
    !jmp ucs2.l_bitscan#u#i
  EndMacro
  
  Procedure.i UFind(*UCS2String, *UCS2StringToFind)
    
    mov rdx, *UCS2String
    mov rax, *UCS2StringToFind
    test rdx, rdx
    !jz ucs2.l_find_ae
    test rax, rax
    !jz ucs2.l_find_ae
    mov cx, [rax]
    !test cx, cx
    !jz ucs2.l_find_ae
    
    push rbp
    push rbx
    push rdi
    push rsi
    !rol ecx, 16
    mov rdi, rax
    mov cx, [rdi + 2]
    !movd xmm1, ecx
    !test cx, cx
    !mov eax, 0x3ffff
    !punpcklwd xmm1, xmm1
    !mov ebp, 0x35555
    !pshufd xmm0, xmm1, 01010101b ; xmm0 = AAAAAAAA
    cmovz ebp, eax
    !pshufd xmm1, xmm1, 00000000b ; xmm1 = BBBBBBBB
    mov rcx, rdx
    And rdx, -16
    sub rcx, rdx
    !test ecx, 1
    !jnz ucs2.l_find_unaligned
    UFind_(0, 0)
    
    !ucs2.l_find_unaligned:
    UFind_(1, 0)
    
    !ucs2.l_find_f0:              ; found
    mov rax, rbx
    pop rsi
    pop rdi
    pop rbx
    pop rbp
    sub rax, *UCS2String
    shr rax, 1
    ProcedureReturn
    
    !ucs2.l_find_nf0:             ; exit point when not found
    pop rsi
    pop rdi
    pop rbx
    pop rbp
    !ucs2.l_find_ae:              ; exit point for argument error
    ProcedureReturn 0
  EndProcedure
  
  Procedure.i UFindI(*UCS2String, *UCS2StringToFind)
    
    mov rdx, *UCS2String
    mov rax, *UCS2StringToFind
    test rdx, rdx
    !jz ucs2.l_find_iae
    test rax, rax
    !jz ucs2.l_find_iae
    mov cx, [rax]
    !test cx, cx
    !jz ucs2.l_find_iae
    
    push rbp
    push rbx
    push rdi
    push rsi
    CompilerIf #PB_Compiler_OS = #PB_OS_Windows And #PB_Compiler_Processor = #PB_Processor_x64
      PUSH_XMM(xmm6)              ; on Windows x64, XMM6 is considered non-volatile
    CompilerEndIf
    !rol ecx, 16
    mov rdi, rax
    mov cx, [rdi + 2]
    !movd xmm1, ecx
    !test cx, cx
    !mov eax, 0x3ffff
    !punpcklwd xmm1, xmm1
    !mov ebp, 0x35555
    !pshufd xmm0, xmm1, 01010101b ; xmm0 = AAAAAAAA
    cmovz ebp, eax
    !pshufd xmm1, xmm1, 00000000b ; xmm1 = BBBBBBBB
    mov rcx, rdx
    And rdx, -16
    sub rcx, rdx
    !test ecx, 1
    !jnz ucs2.l_find_iunaligned
    UFind_(0, 1)
    
    !ucs2.l_find_iunaligned:
    UFind_(1, 1)
    
    !ucs2.l_find_f1:              ; found
    mov rax, rbx
    CompilerIf #PB_Compiler_OS = #PB_OS_Windows And #PB_Compiler_Processor = #PB_Processor_x64
      POP_XMM(xmm6)
    CompilerEndIf
    pop rsi
    pop rdi
    pop rbx
    pop rbp
    sub rax, *UCS2String
    shr rax, 1
    ProcedureReturn
    
    !ucs2.l_find_nf1:             ; exit point when not found
    CompilerIf #PB_Compiler_OS = #PB_OS_Windows And #PB_Compiler_Processor = #PB_Processor_x64
      POP_XMM(xmm6)
    CompilerEndIf
    pop rsi
    pop rdi
    pop rbx
    pop rbp
    !ucs2.l_find_iae:             ; exit point for argument error
    ProcedureReturn 0
  EndProcedure
  
  Procedure.i UCountChar(*UCS2String, CharCode.u)
    mov rdx, *UCS2String
    test rdx, rdx
    !jz ucs2.l_cc_ae
    mov cx, CharCode
    push rbx
    XOr rax, rax
    !jmp ucs2.l_cc_loop_entry
    !ucs2.l_cc_loop:
    sub bx, cx
    sub bx, 1
    adc rax, 0
    add rdx, 2
    !ucs2.l_cc_loop_entry:
    mov bx, [rdx]
    test bx, bx
    !jnz ucs2.l_cc_loop
    pop rbx
    ProcedureReturn
    !ucs2.l_cc_ae:
    ProcedureReturn 0   
  EndProcedure
    
  EnableDebugger
  
  Procedure.s UMid(*UCS2String, StartPos.i, Length.i = -1)
    If StartPos > 0
      ProcedureReturn PeekS(*UCS2String + ULen(*UCS2String, StartPos - 1) << 1, Length)
    Else
      ProcedureReturn PeekS(*UCS2String + ULen(*UCS2String, 0) << 1, Length)
    EndIf
  EndProcedure
  
  Procedure.i UFindString(*UCS2String, *UCS2StringToFind, StartPosition = -1, Mode = #PB_String_CaseSensitive)
    Protected.i Position
    If StartPosition > 0
      StartPosition - 1
      *UCS2String + ULen(*UCS2String, StartPosition) << 1
    Else
      StartPosition = 0  
    EndIf
    If Mode = #PB_String_NoCase
      Position = UFindI(*UCS2String, *UCS2StringToFind)
    Else
      Position = UFind(*UCS2String, *UCS2StringToFind)
    EndIf
    If Position
      Position + StartPosition
    EndIf
    ProcedureReturn Position
  EndProcedure
  
  Procedure.i UCountString(*UCS2String, *UCS2StringToFind, Mode = #PB_String_CaseSensitive)
    Protected.i Count, Offset, FSize = ULen(*UCS2StringToFind)
    If Mode = #PB_String_NoCase
      Offset = UFindI(*UCS2String, *UCS2StringToFind)
      While Offset
        *UCS2String + (Offset + FSize - 1) << 1
        Offset = UFindI(*UCS2String, *UCS2StringToFind)
        Count + 1
      Wend
    Else
      Offset = UFind(*UCS2String, *UCS2StringToFind)
      While Offset
        *UCS2String + (Offset + FSize - 1) << 1
        Offset = UFind(*UCS2String, *UCS2StringToFind)
        Count + 1
      Wend
    EndIf
    ProcedureReturn Count
  EndProcedure
  
EndModule
Last edited by wilbert on Sun Aug 24, 2014 6:38 pm, edited 5 times in total.
Windows (x64)
Raspberry Pi OS (Arm64)
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3942
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: Unicode module (SSE2 optimized)

Post by wilbert »

And here a little test

Code: Select all

UseModule UCS2

S.s = "Unicode find string, SSE2 vs PB" + #CRLF$ + #CRLF$

TestString.s = "This is a simple string for testing find procedures ~ the results will show, if a tested function is faster than the internal FindString() function"

t1 = ElapsedMilliseconds()
For i = 1 To 1000000
  c1 = UFind(@TestString, @"fast")
Next
t2 = ElapsedMilliseconds()
For i = 1 To 1000000
  c2 = FindString(TestString, "fast")
Next
t3 = ElapsedMilliseconds()

S + "Case sensitive string : " + Str(t2 - t1) + "(SSE2) vs " + Str(t3 - t2) + "(PB)" + #CRLF$

t1 = ElapsedMilliseconds()
For i = 1 To 1000000
  c1 = UFindI(@TestString, @"Fast")
Next
t2 = ElapsedMilliseconds()
For i = 1 To 1000000
  c2 = FindString(TestString, "Fast", 1, #PB_String_NoCase)
Next
t3 = ElapsedMilliseconds()

S + "Case insensitive string : " + Str(t2 - t1) + "(SSE2) vs " + Str(t3 - t2) + "(PB)" + #CRLF$

S = Str(OSVersion()) + #CRLF$ + CPUName() + #CRLF$ + #CRLF$ + S
SetClipboardText("[quote]" + S + "[/quote]")

MessageRequester("Test results", S)
Windows (x64)
Raspberry Pi OS (Arm64)
davido
Addict
Addict
Posts: 1890
Joined: Fri Nov 09, 2012 11:04 pm
Location: Uttoxeter, UK

Re: Unicode module (SSE2 optimized)

Post by davido »

@wilbert,
Excellent! Making a Module is a nice touch.
Thank you for sharing.
DE AA EB
User avatar
electrochrisso
Addict
Addict
Posts: 989
Joined: Mon May 14, 2007 2:13 am
Location: Darling River

Re: Unicode module (SSE2 optimized)

Post by electrochrisso »

Your code is very optimized Wilbert, especially fast on my netbook. :D

Debugger On
80
Intel(R) Atom(TM) CPU N550 @ 1.50GHz

Unicode find string, SSE2 vs PB

Case sensitive string : 641(SSE2) vs 1224(PB)
Case insensitive string : 447(SSE2) vs 44554(PB)
Debugger Off
80
Intel(R) Atom(TM) CPU N550 @ 1.50GHz

Unicode find string, SSE2 vs PB

Case sensitive string : 134(SSE2) vs 978(PB)
Case insensitive string : 157(SSE2) vs 45986(PB)
PureBasic! Purely the best 8)
User avatar
flaith
Enthusiast
Enthusiast
Posts: 704
Joined: Mon Apr 25, 2005 9:28 pm
Location: $300:20 58 FC 60 - Rennes
Contact:

Re: Unicode module (SSE2 optimized)

Post by flaith »

Sony Vaio Laptop:
Windows 8.1 x64-Purebasic 5.30 x86
Intel(R) Core(TM) i3-3227U CPU @ 1.90GHz

Debug off
Unicode find string, SSE2 vs PB

Case sensitive string : 45(SSE2) vs 475(PB)
Case insensitive string : 64(SSE2) vs 7991(PB)
“Fear is a reaction. Courage is a decision.” - WC
User avatar
netmaestro
PureBasic Bullfrog
PureBasic Bullfrog
Posts: 8451
Joined: Wed Jul 06, 2005 5:42 am
Location: Fort Nelson, BC, Canada

Re: Unicode module (SSE2 optimized)

Post by netmaestro »

Image
BERESHEIT
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3942
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: Unicode module (SSE2 optimized)

Post by wilbert »

Thanks for testing :-)

It shows a difference between Windows and OSX.

The case insensitive procedure I created is only case insensitive for A-Z and a-z. This also seems to be the case for the PureBasic FindString procedure on OSX.
On Windows the PB procedure seems to use a character map. As a result of this, foreign language characters are also correctly compared on Windows while not on OSX but that also makes the procedure very slow on Windows when the #PB_String_NoCase flag is used with FindString.
So the case insensitive test might be not an entirely fair comparison but if you only need case insensitivity for the true Ascii range to find for example both PureBasic and Purebasic you get an amazing speed improvement.

I wonder how PB on Linux behaves.

As you can see from the OSX results, the difference between case sensitive or not is much smaller compared to the Windows results
Intel(R) Core(TM)2 Duo CPU E7600 @ 3.06GHz

Unicode find string, SSE2 vs PB

Case sensitive string : 40(SSE2) vs 336(PB)
Case insensitive string : 54(SSE2) vs 987(PB)
Windows (x64)
Raspberry Pi OS (Arm64)
User avatar
NicTheQuick
Addict
Addict
Posts: 1504
Joined: Sun Jun 22, 2003 7:43 pm
Location: Germany, Saarbrücken
Contact:

Re: Unicode module (SSE2 optimized)

Post by NicTheQuick »

Ubuntu Gnome 14.04 x64 wrote:2000
Intel(R) Core(TM) i7-3820QM CPU @ 2.70GHz

Unicode find string, SSE2 vs PB

Case sensitive string : 25(SSE2) vs 298(PB)
Case insensitive string : 29(SSE2) vs 496(PB)
Looks great on Linux! :D
The english grammar is freeware, you can use it freely - But it's not Open Source, i.e. you can not change it or publish it in altered way.
User avatar
Lord
Addict
Addict
Posts: 900
Joined: Tue May 26, 2009 2:11 pm

Re: Unicode module (SSE2 optimized)

Post by Lord »

Windows 7 x64: wrote:80
Intel(R) Core(TM) i5 CPU 760 @ 2.80GHz

Unicode find string, SSE2 vs PB

Case sensitive string : 18(SSE2) vs 244(PB)
Case insensitive string : 21(SSE2) vs 7265(PB)
Image
User avatar
mk-soft
Always Here
Always Here
Posts: 6204
Joined: Fri May 12, 2006 6:51 pm
Location: Germany

Re: Unicode module (SSE2 optimized)

Post by mk-soft »

Mac-Mini
60000
Intel(R) Core(TM) i5-3210M CPU @ 2.50GHz

Unicode find string, SSE2 vs PB

Case sensitive string : 34(SSE2) vs 340(PB)
Case insensitive string : 36(SSE2) vs 915(PB)
My Projects ThreadToGUI / OOP-BaseClass / EventDesigner V3
PB v3.30 / v5.75 - OS Mac Mini OSX 10.xx - VM Window Pro / Linux Ubuntu
Downloads on my Webspace / OneDrive
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3942
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: Unicode module (SSE2 optimized)

Post by wilbert »

NicTheQuick wrote:Looks great on Linux! :D
Thanks :)
Good to hear it is working on Linux as well.

Looking at all the test results posted, it looks like my Core2Duo processor is already ancient among PB coders :shock:
Windows (x64)
Raspberry Pi OS (Arm64)
Post Reply