Of course, you can make it even faster by splitting the string and calling the procedure from multiple threads on multicore CPUs.
For comparison, here is a simple assembler version that does the job:
Code: Select all
; Counts the bytes in *Buffer that are equal to Char.
;   *Buffer      - address of the first byte to examine
;   BufferLength - number of bytes to examine; zero or negative yields 0
;   Char         - byte value to look for
; The count is left in eax/rax and handed back by the bare ProcedureReturn.
; Only eax/ecx/edx (rax/rcx/rdx) are used as scratch; edi/rdi is saved and restored.
Procedure.i CountCharAsm(*Buffer, BufferLength.i, Char.a)
  CompilerSelect #PB_Compiler_Processor
    CompilerCase #PB_Processor_x86
      ; one push below => parameters are 4 bytes further away from esp
      !push edi
      !cld
      ; edx = number of matches found so far
      !xor edx,edx
      !mov ecx,[p.v_BufferLength+4]
      !mov edi,[p.p_Buffer+4]
      !mov al,[p.v_Char+4]
      ; repne scasb with ecx = 0 leaves the flags untouched, so an empty
      ; buffer has to be rejected before the loop can trust ZF
      !test ecx,ecx
      !jle CountCharAsmEnd
      !align 4
      !CountCharAsmLoop:
      ; scan until a byte equals al (ZF = 1) or ecx reaches zero
      !repne scasb
      ; ZF = 0: the remaining bytes were consumed without a match
      !jne CountCharAsmEnd
      !inc edx
      ; the match may have been the very last byte; only rescan if bytes remain
      !test ecx,ecx
      !jnz CountCharAsmLoop
      !CountCharAsmEnd:
      !mov eax,edx
      !pop edi
    CompilerCase #PB_Processor_x64
      ; one push below => parameters are 8 bytes further away from rsp
      !push rdi
      !cld
      ; rdx = number of matches found so far
      !xor edx,edx
      !mov rcx,[p.v_BufferLength+8]
      !mov rdi,[p.p_Buffer+8]
      !mov al,[p.v_Char+8]
      ; repne scasb with rcx = 0 leaves the flags untouched, so an empty
      ; buffer has to be rejected before the loop can trust ZF
      !test rcx,rcx
      !jle CountCharAsmEnd
      !align 8
      !CountCharAsmLoop:
      ; scan until a byte equals al (ZF = 1) or rcx reaches zero
      !repne scasb
      ; ZF = 0: the remaining bytes were consumed without a match
      !jne CountCharAsmEnd
      !inc rdx
      ; the match may have been the very last byte; only rescan if bytes remain
      !test rcx,rcx
      !jnz CountCharAsmLoop
      !CountCharAsmEnd:
      !mov rax,rdx
      !pop rdi
  CompilerEndSelect
  ProcedureReturn
EndProcedure

The code works on both x86 and x64, and you don't have to align the data yourself. It handles alignment on its own: leading bytes are processed one at a time until the pointer sits on a 16-byte boundary, and data whose length is not a multiple of 16 is processed without a problem.
In order to compile the code you have to update FASM. Download FASM from http://www.flatassembler.net and copy fasm.exe into the Compilers folder inside your PureBasic folder. This is necessary because the FASM version shipped with PureBasic does not know the SSE4 instructions and will exit with an assembler error.
Code: Select all
; Counts the bytes in *Buffer that are equal to Char, 16 bytes at a time.
;   *Buffer      - address of the first byte to examine (any alignment)
;   BufferLength - number of bytes to examine; zero or negative yields 0
;   Char         - byte value to look for
; The count is left in eax/rax and handed back by the bare ProcedureReturn.
; Needs a CPU with POPCNT; xmm0 and xmm1 are overwritten.
; Three phases: byte-wise head up to the next 16-byte boundary, aligned
; 16-byte main loop (pcmpeqb + pmovmskb + popcnt), byte-wise tail.
Procedure.i CountCharSSE42(*Buffer, BufferLength.i, Char.a)
  CompilerSelect #PB_Compiler_Processor
    CompilerCase #PB_Processor_x86
      ; edi = read pointer, esi = bytes left, ebp = match count
      ; four pushes below => parameters are 16 bytes further away from esp
      !push ebx
      !push esi
      !push edi
      !push ebp
      !mov esi,[p.v_BufferLength+16]
      !mov edi,[p.p_Buffer+16]
      !mov al,[p.v_Char+16]
      !xor ebp,ebp
      ; nothing to count for an empty (or negative) length
      !test esi,esi
      !jle CountCharSSE42_RestEnd
      ; head: bytes up to the next 16-byte boundary = (-address) and 15
      !mov ecx,edi
      !neg ecx
      !and ecx,15
      !jz CountCharSSE42_AlignEnd
      ; never walk past the end of a buffer shorter than the head
      !cmp ecx,esi
      !cmova ecx,esi
      !sub esi,ecx
      !xor edx,edx
      !align 4
      !CountCharSSE42_AlignLoop:
      !mov bl,[edi]
      !cmp al,bl
      !sete dl
      !add ebp,edx
      !inc edi
      !dec ecx
      !jne CountCharSSE42_AlignLoop
      !CountCharSSE42_AlignEnd:
      ; replicate Char into all four bytes of eax, then into all 16 bytes of xmm0
      !mov cl,al
      !shl eax,8
      !mov al,cl
      !shl eax,8
      !mov al,cl
      !shl eax,8
      !mov al,cl
      !push eax
      !push eax
      !push eax
      !push eax
      !movdqu xmm0,[esp]
      !add esp,16
      ; main: ecx = number of whole 16-byte blocks, esi = leftover bytes
      !mov ecx,esi
      !shr ecx,4
      !test ecx,ecx
      !je CountCharSSE42_MainEnd
      !and esi,15
      !align 4
      !CountCharSSE42_Loop:
      ; edi is 16-byte aligned here, so the aligned load is safe
      !movdqa xmm1,[edi]
      !pcmpeqb xmm1,xmm0
      ; one mask bit per matching byte; popcnt turns the mask into a count
      !pmovmskb eax,xmm1
      !popcnt eax,eax
      !add ebp,eax
      !add edi,16
      !dec ecx
      !jne CountCharSSE42_Loop
      !CountCharSSE42_MainEnd:
      ; tail: the remaining (fewer than 16) bytes, one at a time
      !test esi,esi
      !je CountCharSSE42_RestEnd
      ; al was overwritten by the main loop, so fetch Char again
      !mov al,[p.v_Char+16]
      !mov ecx,esi
      !xor edx,edx
      !align 4
      !CountCharSSE42_RestLoop:
      !mov bl,[edi]
      !cmp al,bl
      !sete dl
      !add ebp,edx
      !inc edi
      !dec ecx
      !jne CountCharSSE42_RestLoop
      !CountCharSSE42_RestEnd:
      !mov eax,ebp
      !pop ebp
      !pop edi
      !pop esi
      !pop ebx
    CompilerCase #PB_Processor_x64
      ; rdi = read pointer, rsi = bytes left, rbp = match count
      ; four pushes below => parameters are 32 bytes further away from rsp
      !push rbx
      !push rsi
      !push rdi
      !push rbp
      !mov rsi,[p.v_BufferLength+32]
      !mov rdi,[p.p_Buffer+32]
      !mov al,[p.v_Char+32]
      !xor rbp,rbp
      ; nothing to count for an empty (or negative) length
      !test rsi,rsi
      !jle CountCharSSE42_RestEnd
      ; head: bytes up to the next 16-byte boundary = (-address) and 15
      !mov rcx,rdi
      !neg rcx
      !and rcx,15
      !jz CountCharSSE42_AlignEnd
      ; never walk past the end of a buffer shorter than the head
      !cmp rcx,rsi
      !cmova rcx,rsi
      !sub rsi,rcx
      !xor rdx,rdx
      !align 8
      !CountCharSSE42_AlignLoop:
      !mov bl,[rdi]
      !cmp al,bl
      !sete dl
      !add rbp,rdx
      !inc rdi
      !dec rcx
      !jne CountCharSSE42_AlignLoop
      !CountCharSSE42_AlignEnd:
      ; replicate Char into all eight bytes of rax, then into all 16 bytes of xmm0
      !mov cl,al
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !shl rax,8
      !mov al,cl
      !push rax
      !push rax
      !movdqu xmm0,[rsp]
      !add rsp,16
      ; main: rcx = number of whole 16-byte blocks, rsi = leftover bytes
      !mov rcx,rsi
      !shr rcx,4
      !test rcx,rcx
      !je CountCharSSE42_MainEnd
      !and rsi,15
      !xor rax,rax
      !align 8
      !CountCharSSE42_Loop:
      ; rdi is 16-byte aligned here, so the aligned load is safe
      !movdqa xmm1,[rdi]
      !pcmpeqb xmm1,xmm0
      ; one mask bit per matching byte; popcnt turns the mask into a count
      !pmovmskb eax,xmm1
      !popcnt eax,eax
      !add rbp,rax
      !add rdi,16
      !dec rcx
      !jne CountCharSSE42_Loop
      !CountCharSSE42_MainEnd:
      ; tail: the remaining (fewer than 16) bytes, one at a time
      !test rsi,rsi
      !je CountCharSSE42_RestEnd
      ; al was overwritten by the main loop, so fetch Char again
      !mov al,[p.v_Char+32]
      !mov rcx,rsi
      !xor rdx,rdx
      !align 8
      !CountCharSSE42_RestLoop:
      !mov bl,[rdi]
      !cmp al,bl
      !sete dl
      !add rbp,rdx
      !inc rdi
      !dec rcx
      !jne CountCharSSE42_RestLoop
      !CountCharSSE42_RestEnd:
      !mov rax,rbp
      !pop rbp
      !pop rdi
      !pop rsi
      !pop rbx
  CompilerEndSelect
  ProcedureReturn
EndProcedure