Count char in string, ultra fast SSE4.2

Share your advanced PureBasic knowledge/code with the community.
Thorium
Addict
Addict
Posts: 1305
Joined: Sat Aug 15, 2009 6:59 pm

Count char in string, ultra fast SSE4.2

Post by Thorium »

Someone on the german forum asked about a fast procedure to count the population of a specific character in a string or memory buffer. So here is the fastest solution i can imagine. It uses SSE4.2 (SSE2 instructions and POPCNT instruction). So it's only working on relativly new CPU's. The CPU have to support SSE2 and POPCNT. On Intel POPCNT is part of SSE4.2. AMD implemented POPCNT without the SSE4 standart. I think there is a code in the forum to check if your CPU supports it.

Well you can do it even faster by splitting the string and calling the procedure with multiple threads on multicore CPU's, of course.

For comparison a simple assembler code which does the job:

Code: Select all

Procedure.i CountCharAsm(*Buffer, BufferLength.i, Char.a)

  CompilerSelect #PB_Compiler_Processor

    CompilerCase #PB_Processor_x86
    
      !push edi
      !cld
      !xor edx,edx
      !mov ecx,[p.v_BufferLength+4]
      !mov edi,[p.p_Buffer+4]
      !mov al,[p.v_Char+4]
     
      !align 4
      !CountCharAsmLoop:
        !repne scasb
        !inc edx
        !test ecx,ecx
      !jne CountCharAsmLoop
     
      !mov bl,[edi]
      !cmp bl,al
      !je CountCharAsmEnd
        !dec edx 
      !CountCharAsmEnd:
     
      !mov eax, edx
      !pop edi

    CompilerCase #PB_Processor_x64
   
      !push rdi
      !cld
      !xor rdx,rdx
      !mov rcx,[p.v_BufferLength+8]
      !mov rdi,[p.p_Buffer+8]
      !mov al,[p.v_Char+8]
     
      !align 8
      !CountCharAsmLoop:
        !repne scasb
        !inc rdx
        !test rcx,rcx
      !jne CountCharAsmLoop
     
      !mov bl,[rdi]
      !cmp bl,al
      !je CountCharAsmEnd
        !dec rdx 
      !CountCharAsmEnd:
     
      !mov rax, rdx
      !pop rdi
      
  CompilerEndSelect

  ProcedureReturn

EndProcedure
And here the SSE4.2 code which does the same 15 times faster! 1500% speed up! POPCNT is awesome. :mrgreen:

The code is working with x86 and x64 and you don't have to align data. The code does align the data to 16 byte on it's own and can process data which is not a multiply of 16 without a problem.

In order to compile the code you have to update FASM. Download FASM from http://www.flatassembler.net and copy the fasm.exe to the compilers folder in your purebasic folder. You need to do that because the FASM version of purebasic does not know SSE4 and will exit with a assembler error.

Code: Select all

Procedure.i CountCharSSE42(*Buffer, BufferLength.i, Char.a)

  CompilerSelect #PB_Compiler_Processor

    CompilerCase #PB_Processor_x86
      
      !push ebx
      !push esi
      !push edi
      !push ebp

      !mov esi,[p.v_BufferLength+16]
      !mov edi,[p.p_Buffer+16]
      !mov al,[p.v_Char+16]
      !xor ebp,ebp
      
      ;process some bytes to get 16 byte alignment

      !mov ecx,edi
      !and ecx,15
      
      !test ecx,ecx
      !je CountCharSSE42_AlignEnd
      
      !sub esi,ecx
      
      !xor edx,edx
      
      !align 4
      !CountCharSSE42_AlignLoop:
      
        !mov bl,[edi]
        !cmp al,bl
        !sete dl
        !add ebp,edx
        !inc edi
        !dec ecx

      !jne CountCharSSE42_AlignLoop
      
      !CountCharSSE42_AlignEnd:
      
      ;cut length to a multiply of 16 and process it

      !mov cl,al
      !shl eax,8
      !mov al,cl
      !shl eax,8
      !mov al,cl
      !shl eax,8
      !mov al,cl
      !push eax
      !push eax
      !push eax
      !push eax
      !movdqu xmm0,[esp]
      !add esp,16

      !mov ecx,esi
      !shr ecx,4
      
      !test ecx,ecx
      !je CountCharSSE42_MainEnd
      
      !and esi,15

      !align 4
      !CountCharSSE42_Loop:
      
        !movdqa xmm1,[edi]
        !pcmpeqb xmm1,xmm0
        !pmovmskb eax,xmm1
        
        !popcnt eax,eax
        !add ebp,eax
        
        !add edi,16
        !dec ecx
        
      !jne CountCharSSE42_Loop
      
      !CountCharSSE42_MainEnd:
      
      ;process the rest of the string

      !test esi,esi
      !je CountCharSSE42_RestEnd

      !mov al,[p.v_Char+16]
      !mov ecx,esi
      !xor edx,edx
      
      !align 4
      !CountCharSSE42_RestLoop:
      
        !mov bl,[edi]
        !cmp al,bl
        !sete dl
        !add ebp,edx
        !inc edi
        !dec ecx

      !jne CountCharSSE42_RestLoop
      
      !CountCharSSE42_RestEnd:

      !mov eax,ebp
 
      !pop ebp
      !pop edi
      !pop esi
      !pop ebx

    CompilerCase #PB_Processor_x64

      !push rbx
      !push rsi
      !push rdi
      !push rbp

      !mov rsi,[p.v_BufferLength+32]
      !mov rdi,[p.p_Buffer+32]
      !mov al,[p.v_Char+32]
      !xor rbp,rbp
      
      ;process some bytes to get 16 byte alignment

      !mov rcx,rdi
      !and rcx,15
      
      !test rcx,rcx
      !je CountCharSSE42_AlignEnd
      
      !sub rsi,rcx
      
      !xor rdx,rdx
      
      !align 8
      !CountCharSSE42_AlignLoop:
      
        !mov bl,[rdi]
        !cmp al,bl
        !sete dl
        !add rbp,rdx
        !inc rdi
        !dec rcx

      !jne CountCharSSE42_AlignLoop
      
      !CountCharSSE42_AlignEnd:
      
      ;cut length to a multiply of 16 and process it

      !mov cl,al
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !push rax
      !push rax
      !movdqu xmm0,[rsp]
      !add rsp,16

      !mov rcx,rsi
      !shr rcx,4
      
      !test rcx,rcx
      !je CountCharSSE42_MainEnd
      
      !and rsi,15
      !xor rax,rax

      !align 8
      !CountCharSSE42_Loop:
      
        !movdqa xmm1,[rdi]
        !pcmpeqb xmm1,xmm0
        !pmovmskb eax,xmm1
        
        !popcnt eax,eax
        !add rbp,rax
        
        !add rdi,16
        !dec rcx
        
      !jne CountCharSSE42_Loop
      
      !CountCharSSE42_MainEnd:
      
      ;process the rest of the string

      !test rsi,rsi
      !je CountCharSSE42_RestEnd

      !mov al,[p.v_Char+32]
      !mov rcx,rsi
      !xor rdx,rdx
      
      !align 8
      !CountCharSSE42_RestLoop:
      
        !mov bl,[rdi]
        !cmp al,bl
        !sete dl
        !add rbp,rdx
        !inc rdi
        !dec rcx

      !jne CountCharSSE42_RestLoop
      
      !CountCharSSE42_RestEnd:

      !mov rax,rbp
 
      !pop rbp
      !pop rdi
      !pop rsi
      !pop rbx

  CompilerEndSelect

  ProcedureReturn

EndProcedure
Trond
Always Here
Always Here
Posts: 7446
Joined: Mon Sep 22, 2003 6:45 pm
Location: Norway

Re: Count char in string, ultra fast SSE4.2

Post by Trond »

Actually, you can get a big speedup (almost 6 times as fast on longer strings) even with only 486 instructions:

Code: Select all

Procedure CountChars(*B.Ascii, L, Char.a)
  !mov  ecx, [p.v_L]
  !mov  dl, [p.v_Char]
  !mov  eax, [p.p_B]
  
  !push edi
  !push ebp
  !mov  ebp, eax
  
  !xor  edi, edi
  !lp:
    !xor eax, eax
    !cmp byte [ecx+ebp-1], dl
    !sete al
    !add edi, eax
    !add ecx, -1
    !jnz lp
  !mov eax, edi
  
  !pop ebp
  !pop edi
  ProcedureReturn 
EndProcedure
Thorium
Addict
Addict
Posts: 1305
Joined: Sat Aug 15, 2009 6:59 pm

Re: Count char in string, ultra fast SSE4.2

Post by Thorium »

Interessting, i see i stil have to learn a lot about assembler optimization. But whats your CPU? On my CPU your procedure is just 5% faster than my simple Asm procedure so SSE4.2 is still 15 times faster on my CPU. I have a Core i7 920.
Trond
Always Here
Always Here
Posts: 7446
Joined: Mon Sep 22, 2003 6:45 pm
Location: Norway

Re: Count char in string, ultra fast SSE4.2

Post by Trond »

My CPU is a Core 2 Duo 2.1 Ghz. This is the test code:

Code: Select all

B.s
B = "abcdefabcdefaaaaaffffbdddffff"

B = ReplaceString(Space(1000), " ", B) ; thousand copies of B
L = Len(B)

#Tries = 10000

time = ElapsedMilliseconds()
For U = 0 To #Tries
  CountCharAsm(@B, L, 'a') ; 2300-2400 ms
Next
MessageRequester("", Str(ElapsedMilliseconds()-time))

time = ElapsedMilliseconds()
For U = 0 To #Tries
  CountChars(@B, L, 'a') ; 390-440 ms
Next
MessageRequester("", Str(ElapsedMilliseconds()-time))
The difference becomes slightly bigger if CountChars() runs first for some reason.
Thorium
Addict
Addict
Posts: 1305
Joined: Sat Aug 15, 2009 6:59 pm

Re: Count char in string, ultra fast SSE4.2

Post by Thorium »

I see, i test with a large text file. Will post my testing code later, i am currently at work. Try a textfile bigger 1MB.
Trond
Always Here
Always Here
Posts: 7446
Joined: Mon Sep 22, 2003 6:45 pm
Location: Norway

Re: Count char in string, ultra fast SSE4.2

Post by Trond »

Thorium wrote:I see, i test with a large text file. Will post my testing code later, i am currently at work. Try a textfile bigger 1MB.
Mine is 3.5 times as fast on a 1 mb textfile from lipsum.com.
Thorium
Addict
Addict
Posts: 1305
Joined: Sat Aug 15, 2009 6:59 pm

Re: Count char in string, ultra fast SSE4.2

Post by Thorium »

I think i know whats the difference, my code gets slower the more matches are in the string. Or your gets faster the more matches are there. I think this has something to do with branch prediction. But your code should be allways faster. I see you calculate the pointer within the accessing instruction. I will change my SSE4.2 code that it will do the same. Thats a nice trick.
Trond
Always Here
Always Here
Posts: 7446
Joined: Mon Sep 22, 2003 6:45 pm
Location: Norway

Re: Count char in string, ultra fast SSE4.2

Post by Trond »

I think i know whats the difference, my code gets slower the more matches are in the string. Or your gets faster the more matches are there. I think this has something to do with branch prediction.
Mine is theoretically not branching, so it must be yours which is slowing down.

I don't know if I gain much from calculating the address in the instruction, I only did it that way, because then I could keep a separate counter variable and check that against 0 instead of increasing the actual pointer and checking that agains a limit. Theoretically I would lose some by doing the calculation instead of just incrementing the pointer by 1 every iteration, however, I can then check the result of updating the counter against 0 by testing zf. That's where the real optimization is. I did away with the extra cmp ptr, bound.
User avatar
Kuron
Addict
Addict
Posts: 1626
Joined: Sat Oct 17, 2009 10:51 pm
Location: Pacific Northwest

Re: Count char in string, ultra fast SSE4.2

Post by Kuron »

Trond & Thorium: Awesome stuff!
Last edited by Kuron on Wed Mar 24, 2010 9:33 pm, edited 1 time in total.
Best wishes to the PB community. Thank you for the memories. ♥️
Thorium
Addict
Addict
Posts: 1305
Joined: Sat Aug 15, 2009 6:59 pm

Re: Count char in string, ultra fast SSE4.2

Post by Thorium »

The funny thing about assembler code is, it performs different on different CPUs. Tronds Code is actualy slower than mine on a old Athlon XP. Turned out rep scasb is more optimized on AMD than on Intel. ^^

Cant wait do get home to test tronds code against my SSE4.2 code. It should be still 15 times faster.
Thorium
Addict
Addict
Posts: 1305
Joined: Sat Aug 15, 2009 6:59 pm

Re: Count char in string, ultra fast SSE4.2

Post by Thorium »

Trond, do you have a nice asm code for counting the set bits in a byte and word? So we can make a MMX and SSE2 versions of the procedure, for older CPUs. But it could be that the bitcounting slows it down to much. Would be a interessting experiment.
Helle
Enthusiast
Enthusiast
Posts: 178
Joined: Wed Apr 12, 2006 7:59 pm
Location: Germany
Contact:

Re: Count char in string, ultra fast SSE4.2

Post by Helle »

A SSE2-version (32-bit, for rel. long files) with test:

Code: Select all

Procedure CountChars(*B.Ascii, L, Char.a)
  !mov  ecx, [p.v_L]
  !mov  dl, [p.v_Char]
  !mov  eax, [p.p_B]
 
  !push edi
  !push ebp
  !mov  ebp, eax
 
  !xor  edi, edi
  !lp:
    !xor eax, eax
    !cmp byte [ecx+ebp-1], dl
    !sete al
    !add edi, eax
    !add ecx, -1
    !jnz lp
  !mov eax, edi
 
  !pop ebp
  !pop edi
  ProcedureReturn
EndProcedure

Procedure CountCharAsmSSE2_x32(*B.Ascii, L, Char.a)
  !mov  ecx,[p.v_L]
  !mov  dl,[p.v_Char]
  !push esi
  !mov  esi,[p.p_B+4]  
  !push edi 
  !push ebp
  !push ebx

  !sub esp,400h              ;space for table
  !mov ebp,esp
  !mov al,dl
  !shl edx,8
  !mov dl,al
  !shl edx,8
  !mov dl,al  
  !shl edx,8
  !mov dl,al 
  !push edx
  !push edx
  !push edx
  !push edx  
  !movdqu xmm1,[esp]         ;16x Char from Stack
  !add esp,16

  !push ecx 
  
  ;generate a 8-bit-lookup-table for population-count   better way?  super-speed if pre-defined as a file!
  !mov ecx,256
  !xor edx,edx
  !@@:
  !mov eax,edx
  !mov ebx,eax
  !shr ebx,1
  !and ebx,55555555h
  !sub eax,ebx
  !mov ebx,eax
  !and ebx,33333333h
  !shl eax,2
  !and eax,33333333h
  !add eax,ebx
  !mov ebx,eax
  !shl ebx,4
  !add eax,ebx
  !and eax,0F0F0F0Fh
  !mov ebx,1010101h
  !push edx
  !mul ebx
  !pop edx
  !shr eax,24
  !inc edx

  !mov [ebp],eax
  !add ebp,4
  !dec ecx
  !jnz @b

  !pop ecx 
   
  ;prolog 
  !mov ebp,esp 
  !mov edx,ecx               ;ecx=Length
  !shr ecx,4                 ;div.16
  !mov ebx,ecx
  !shl ebx,4                 ;mul.16
  !sub edx,ebx               ;Modulo without Division

  !xor eax,eax

  !or ecx,ecx
  !jz Rest
  ;the well-known procedure
  !@@:
  !movdqu xmm0,[esi]         ;or pre-calculating for use of movdqa
  !pcmpeqb xmm0,xmm1
  !pmovmskb ebx,xmm0

  !mov edi,ebx
  !shr edi,8
  !and ebx,0FFh
  !shl ebx,2

  !add eax,[ebp+ebx]

  !mov ebx,edi
  !shl ebx,2
  !add eax,[ebp+ebx]
  
  !add esi,16
  !dec ecx
  !jnz @b

  !Rest:
  !or edx,edx
  !jz Ende

  !movd ecx,xmm1             ;Char
  !Rest1:
  !cmp byte[esi],cl
  !jne @f
  !inc eax
  !@@:
  !inc esi
  !dec edx
  !jnz Rest1
  !Ende:

  !add esp,400h
  !pop ebx
  !pop ebp
  !pop edi 
  !pop esi
 
  ProcedureReturn
EndProcedure

B.s
B = "abcdefabcdefaaaaaffffbdddffff"

B = ReplaceString(Space(1000), " ", B) ;thousand copies of B
L = Len(B)

#Tries = 10000

time = ElapsedMilliseconds()
For U = 0 To #Tries
 NumHelle = CountCharAsmSSE2_x32(@B, L, 'a')
Next
TimeHelle = ElapsedMilliseconds()-time
Helle$ = "NumHelle :  "+Str(NumHelle)+"       Time : "+Str(TimeHelle)

time = ElapsedMilliseconds()
For U = 0 To #Tries
  NumTrond = CountChars(@B, L, 'a')
Next
TimeTrond = ElapsedMilliseconds()-time
Trond$ = "NumTrond : "+Str(NumTrond)+"       Time : "+Str(TimeTrond)

MessageRequester("Results", Helle$ + #CRLF$ + Trond$)
Gruss
Helle
Thorium
Addict
Addict
Posts: 1305
Joined: Sat Aug 15, 2009 6:59 pm

Re: Count char in string, ultra fast SSE4.2

Post by Thorium »

Wsome will test it. By the way, you can do a faster modulo without sub by using and. and eax,15 will give you modulo 16.
Thorium
Addict
Addict
Posts: 1305
Joined: Sat Aug 15, 2009 6:59 pm

Re: Count char in string, ultra fast SSE4.2

Post by Thorium »

Ok, have done the speed test on my Core i7 920 @2,66GHz with helles test and 100,000 tries

Code: Select all

SSE4.2 (POPCNT) by Thorium:     234ms
SSE2 by Helle:                  562ms
traditional assembly by Trond: 3291ms
So counting bits by a lookup table is actualy very fast.
Trond
Always Here
Always Here
Posts: 7446
Joined: Mon Sep 22, 2003 6:45 pm
Location: Norway

Re: Count char in string, ultra fast SSE4.2

Post by Trond »

I don't know how to count bits efficiently with only old instructions (probably a reason they made POPCNT).

I made a native PB code for comparision (to make mine seem a bit faster...):

Code: Select all

Procedure CountCharAsmSSE2_x32(*B.Ascii, L, Char.a)
  While *B\a
    If *B\a = Char
      Count + 1
    EndIf
    *B + 1
  Wend
  ProcedureReturn
EndProcedure
Since it's 5 times as slow as mine, we can only imagine how many times faster the SSE4 version is! Unfortunately I don't think I have SSE4.

Btw: can you explain how popcnt helps you check if the characters are equal??
Post Reply