The SIMD registers mm, xmm and ymm can be used if they are present. They are part of the instruction set extensions. You can query CPUID to find out which extensions the CPU provides and then select the best available one. If you don't do that, you are missing the true power of the CPU, which is SIMD and its wide registers of up to 256 bits.
The example below filters image data for compression and uses the best register set available. The code will run on a 386 but will use newer registers if they are available.
It has implementations in plain 80386 assembly, MMX, SSE2 and PureBasic (for non-x86 CPUs); the assembly versions are implemented for both 32-bit and 64-bit.
The only downside: it is a lot of code, across all the implementations, for just one small procedure.
Code: Select all
; CPU feature flags. Tsi_UnFilterUp compares them against #True to choose
; between its SSE2, MMX and plain implementations.
; NOTE(review): they are not assigned anywhere in this part of the file -
; presumably they are filled once at start-up from Tsi_IsMmxSupported() and
; Tsi_IsSse2Supported(); confirm against the initialisation code.
Global Tsi_MmxSupported.i
Global Tsi_Sse2Supported.i
; Reports whether the CPU implements the CPUID instruction.
; Returns 1 (#True) when CPUID can be used, 0 (#False) otherwise.
Procedure.i Tsi_IsCpuidSupported()
  CompilerSelect #PB_Compiler_Processor
    CompilerCase #PB_Processor_x86
      ; CPUID is available when software is able to toggle the ID flag
      ; (bit 21, mask $00200000) of EFLAGS.
      !pushfd
      !pop eax
      ; edx keeps the unmodified EFLAGS for the comparison and the restore
      !mov edx,eax
      !xor eax,$00200000
      !push eax
      !popfd
      ; read back what the CPU actually accepted
      !pushfd
      !pop eax
      ; put the caller's EFLAGS back so the ID bit is not left toggled
      !push edx
      !popfd
      ; isolate the ID bit of the difference and move it to bit 0
      !xor eax,edx
      !and eax,$00200000
      !shr eax,21
      ; the result is already in eax; a bare ProcedureReturn is relied upon
      ; to leave eax untouched
      ProcedureReturn
    CompilerCase #PB_Processor_x64
      ; pushfd/popfd cannot be encoded in 64-bit mode, and every x86-64 CPU
      ; implements CPUID, so no probing is needed
      ProcedureReturn #True
    CompilerDefault
      ; not an x86 CPU: there is no CPUID instruction
      ProcedureReturn #False
  CompilerEndSelect
EndProcedure
; Reports whether the CPU supports MMX by testing bit 23 (mask $00800000)
; of EDX after CPUID function 1.
; Returns 1 when the bit is set, 0 otherwise. CPUID is executed
; unconditionally, so the caller has to make sure it is available.
Procedure.i Tsi_IsMmxSupported()
  CompilerSelect #PB_Compiler_Processor
    CompilerCase #PB_Processor_x86
      ; cpuid overwrites ebx, so it is saved around the instruction
      !push ebx
      !mov eax,1
      !cpuid
      !pop ebx
      ; clear the result first (xor changes the flags), then test the bit
      !xor eax,eax
      !test edx,$00800000
      !setne al
      ProcedureReturn
    CompilerCase #PB_Processor_x64
      ; cpuid overwrites rbx, so it is saved around the instruction
      !push rbx
      !mov eax,1
      !cpuid
      !pop rbx
      ; writing eax also clears the upper half of rax
      !xor eax,eax
      !test edx,$00800000
      !setne al
      ProcedureReturn
  CompilerEndSelect
EndProcedure
; Reports whether the CPU supports SSE2 by testing bit 26 (mask $04000000)
; of EDX after CPUID function 1.
; Returns 1 when the bit is set, 0 otherwise. CPUID is executed
; unconditionally, so the caller has to make sure it is available.
Procedure.i Tsi_IsSse2Supported()
  CompilerSelect #PB_Compiler_Processor
    CompilerCase #PB_Processor_x86
      ; cpuid overwrites ebx, so it is saved around the instruction
      !push ebx
      !mov eax,1
      !cpuid
      !pop ebx
      ; move feature bit 26 down to bit 0 and drop everything else
      !mov eax,edx
      !shr eax,26
      !and eax,1
      ProcedureReturn
    CompilerCase #PB_Processor_x64
      ; cpuid overwrites rbx, so it is saved around the instruction
      !push rbx
      !mov eax,1
      !cpuid
      !pop rbx
      ; 32-bit writes clear the upper half of rax, so rax ends up 0 or 1
      !mov eax,edx
      !shr eax,26
      !and eax,1
      ProcedureReturn
  CompilerEndSelect
EndProcedure
; Adds, in place and modulo 256, to every byte from the second row onward the
; byte located one row (Width * PixelSize bytes) before it. Bytes are handled
; in ascending address order, so the byte that is added has already been
; processed itself.
;
; *ImageData : start of the buffer; Height rows of Width * PixelSize bytes
; Width      : pixels per row
; Height     : number of rows; with fewer than two rows nothing is done
; PixelSize  : bytes per pixel
;
; The global flags Tsi_Sse2Supported / Tsi_MmxSupported select the SIMD
; implementations. A SIMD loop reads a whole block of the prior row before it
; writes the current row, so it is only used when one row is at least as long
; as one block (128 bytes for SSE2, 64 bytes for MMX); shorter rows take the
; next simpler implementation.
Procedure Tsi_UnFilterUp(*ImageData, Width.i, Height.i, PixelSize.i)
  ; Without a second row there is nothing to add. This also keeps the byte
  ; count (Height - 1) * Width * PixelSize above zero, which the dec/jne
  ; loops below depend on.
  If Height < 2 Or Width < 1 Or PixelSize < 1
    ProcedureReturn
  EndIf
  CompilerSelect #PB_Compiler_Processor
    CompilerCase #PB_Processor_x86
      If Tsi_Sse2Supported = #True And Width * PixelSize >= 128
        ;save registers
        !push esi
        !push edi
        !push ebx
        ;calculate the pointers: esi = prior row, edi = current row
        ;(the +12 on the parameters compensates for the three pushes above)
        !mov edi,[p.p_ImageData+12]
        !mov esi,edi
        !mov eax,[p.v_Width+12]
        !mul dword[p.v_PixelSize+12]
        !add edi,eax
        ;calculate the counters: ecx = 128 byte blocks, ebx = leftover bytes
        !mov eax,[p.v_Height+12]
        !dec eax
        !mul dword[p.v_Width+12]
        !mul dword[p.v_PixelSize+12]
        !mov ecx,eax
        !shr ecx,7
        !and eax,127
        !mov ebx,eax
        ;process the leftover bytes first to cut the length to a multiple of 128
        !test ebx,ebx
        !je Tsi_UnFilterUp_Sse2CutLengthEnd
        !align 4
        !Tsi_UnFilterUp_Sse2CutLengthStart:
        !mov al,[edi]
        !add al,[esi]
        !mov [edi],al
        !inc esi
        !inc edi
        !dec ebx
        !jne Tsi_UnFilterUp_Sse2CutLengthStart
        !align 4
        !Tsi_UnFilterUp_Sse2CutLengthEnd:
        ;process the rest of the data in blocks of 128 bytes
        !test ecx,ecx
        !je Tsi_UnFilterUp_Sse2LoopEnd
        !align 4
        !Tsi_UnFilterUp_Sse2LoopStart:
        ;Both rows are loaded with movdqu and added register to register:
        ;paddb with an xmm destination and a memory operand needs a 16 byte
        ;aligned address, which the row pointers do not guarantee.
        !movdqu xmm0,[esi]
        !movdqu xmm1,[esi+16]
        !movdqu xmm2,[edi]
        !movdqu xmm3,[edi+16]
        !paddb xmm2,xmm0
        !paddb xmm3,xmm1
        !movdqu [edi],xmm2
        !movdqu [edi+16],xmm3
        !movdqu xmm0,[esi+32]
        !movdqu xmm1,[esi+48]
        !movdqu xmm2,[edi+32]
        !movdqu xmm3,[edi+48]
        !paddb xmm2,xmm0
        !paddb xmm3,xmm1
        !movdqu [edi+32],xmm2
        !movdqu [edi+48],xmm3
        !movdqu xmm0,[esi+64]
        !movdqu xmm1,[esi+80]
        !movdqu xmm2,[edi+64]
        !movdqu xmm3,[edi+80]
        !paddb xmm2,xmm0
        !paddb xmm3,xmm1
        !movdqu [edi+64],xmm2
        !movdqu [edi+80],xmm3
        !movdqu xmm0,[esi+96]
        !movdqu xmm1,[esi+112]
        !movdqu xmm2,[edi+96]
        !movdqu xmm3,[edi+112]
        !paddb xmm2,xmm0
        !paddb xmm3,xmm1
        !movdqu [edi+96],xmm2
        !movdqu [edi+112],xmm3
        !add esi,128
        !add edi,128
        !dec ecx
        !jne Tsi_UnFilterUp_Sse2LoopStart
        !align 4
        !Tsi_UnFilterUp_Sse2LoopEnd:
        ;restore the registers (no emms: the xmm registers are not shared
        ;with the x87/MMX state)
        !pop ebx
        !pop edi
        !pop esi
      ElseIf Tsi_MmxSupported = #True And Width * PixelSize >= 64
        ;save registers
        !push esi
        !push edi
        !push ebx
        ;calculate the pointers: esi = prior row, edi = current row
        ;(the +12 on the parameters compensates for the three pushes above)
        !mov edi,[p.p_ImageData+12]
        !mov esi,edi
        !mov eax,[p.v_Width+12]
        !mul dword[p.v_PixelSize+12]
        !add edi,eax
        ;calculate the counters: ecx = 64 byte blocks, ebx = leftover bytes
        !mov eax,[p.v_Height+12]
        !dec eax
        !mul dword[p.v_Width+12]
        !mul dword[p.v_PixelSize+12]
        !mov ecx,eax
        !shr ecx,6
        !and eax,63
        !mov ebx,eax
        ;process the leftover bytes first to cut the length to a multiple of 64
        !test ebx,ebx
        !je Tsi_UnFilterUp_MmxCutLengthEnd
        !align 4
        !Tsi_UnFilterUp_MmxCutLengthStart:
        !mov al,[edi]
        !add al,[esi]
        !mov [edi],al
        !inc esi
        !inc edi
        !dec ebx
        !jne Tsi_UnFilterUp_MmxCutLengthStart
        !align 4
        !Tsi_UnFilterUp_MmxCutLengthEnd:
        ;process the rest of the data in blocks of 64 bytes
        !test ecx,ecx
        !je Tsi_UnFilterUp_MmxLoopEnd
        !align 4
        !Tsi_UnFilterUp_MmxLoopStart:
        !movq mm0,[esi]
        !movq mm1,[esi+8]
        !movq mm2,[esi+16]
        !movq mm3,[esi+24]
        !movq mm4,[esi+32]
        !movq mm5,[esi+40]
        !movq mm6,[esi+48]
        !movq mm7,[esi+56]
        !paddb mm0,[edi]
        !paddb mm1,[edi+8]
        !paddb mm2,[edi+16]
        !paddb mm3,[edi+24]
        !paddb mm4,[edi+32]
        !paddb mm5,[edi+40]
        !paddb mm6,[edi+48]
        !paddb mm7,[edi+56]
        !movq [edi],mm0
        !movq [edi+8],mm1
        !movq [edi+16],mm2
        !movq [edi+24],mm3
        !movq [edi+32],mm4
        !movq [edi+40],mm5
        !movq [edi+48],mm6
        !movq [edi+56],mm7
        !add esi,64
        !add edi,64
        !dec ecx
        !jne Tsi_UnFilterUp_MmxLoopStart
        !align 4
        !Tsi_UnFilterUp_MmxLoopEnd:
        ;restore the registers
        !pop ebx
        !pop edi
        !pop esi
        ;end MMX state so that x87 code can run afterwards
        !emms
      Else
        ;plain byte loop (the +8 on the parameters compensates for the two pushes)
        !push esi
        !push edi
        ;ecx = number of bytes to process, at least 1 because of the guard above
        !mov eax,[p.v_Height+8]
        !dec eax
        !mul dword[p.v_Width+8]
        !mul dword[p.v_PixelSize+8]
        !mov ecx,eax
        ;esi = prior row, edi = current row
        !mov edi,[p.p_ImageData+8]
        !mov esi,edi
        !mov eax,[p.v_Width+8]
        !mul dword[p.v_PixelSize+8]
        !add edi,eax
        !align 4
        !Tsi_UnFilterUp_LoopStart:
        !mov al,[edi]
        !add al,[esi]
        !mov [edi],al
        !inc esi
        !inc edi
        !dec ecx
        !jne Tsi_UnFilterUp_LoopStart
        !pop edi
        !pop esi
      EndIf
    CompilerCase #PB_Processor_x64
      If Tsi_Sse2Supported = #True And Width * PixelSize >= 128
        ;save registers
        !push rsi
        !push rdi
        ;calculate the pointers: rsi = prior row, rdi = current row
        ;(the +16 on the parameters compensates for the two pushes above)
        !mov rdi,[p.p_ImageData+16]
        !mov rsi,rdi
        !mov rax,[p.v_Width+16]
        !mul qword[p.v_PixelSize+16]
        !add rdi,rax
        ;calculate the counters: rcx = 128 byte blocks, r10 = leftover bytes
        !mov rax,[p.v_Height+16]
        !dec rax
        !mul qword[p.v_Width+16]
        !mul qword[p.v_PixelSize+16]
        !mov rcx,rax
        !shr rcx,7
        !and rax,127
        !mov r10,rax
        ;process the leftover bytes first to cut the length to a multiple of 128
        !test r10,r10
        !je Tsi_UnFilterUp_Sse2CutLengthEnd
        !align 8
        !Tsi_UnFilterUp_Sse2CutLengthStart:
        !mov al,[rdi]
        !add al,[rsi]
        !mov [rdi],al
        !inc rsi
        !inc rdi
        !dec r10
        !jne Tsi_UnFilterUp_Sse2CutLengthStart
        !align 8
        !Tsi_UnFilterUp_Sse2CutLengthEnd:
        ;process the rest of the data in blocks of 128 bytes
        !test rcx,rcx
        !je Tsi_UnFilterUp_Sse2LoopEnd
        !align 8
        !Tsi_UnFilterUp_Sse2LoopStart:
        ;Both rows are loaded with movdqu and added register to register:
        ;paddb with an xmm destination and a memory operand needs a 16 byte
        ;aligned address, which the row pointers do not guarantee.
        ;Only xmm0-xmm3 are used; xmm6 and above are callee-saved in the
        ;Microsoft x64 calling convention and would have to be preserved.
        !movdqu xmm0,[rsi]
        !movdqu xmm1,[rsi+16]
        !movdqu xmm2,[rdi]
        !movdqu xmm3,[rdi+16]
        !paddb xmm2,xmm0
        !paddb xmm3,xmm1
        !movdqu [rdi],xmm2
        !movdqu [rdi+16],xmm3
        !movdqu xmm0,[rsi+32]
        !movdqu xmm1,[rsi+48]
        !movdqu xmm2,[rdi+32]
        !movdqu xmm3,[rdi+48]
        !paddb xmm2,xmm0
        !paddb xmm3,xmm1
        !movdqu [rdi+32],xmm2
        !movdqu [rdi+48],xmm3
        !movdqu xmm0,[rsi+64]
        !movdqu xmm1,[rsi+80]
        !movdqu xmm2,[rdi+64]
        !movdqu xmm3,[rdi+80]
        !paddb xmm2,xmm0
        !paddb xmm3,xmm1
        !movdqu [rdi+64],xmm2
        !movdqu [rdi+80],xmm3
        !movdqu xmm0,[rsi+96]
        !movdqu xmm1,[rsi+112]
        !movdqu xmm2,[rdi+96]
        !movdqu xmm3,[rdi+112]
        !paddb xmm2,xmm0
        !paddb xmm3,xmm1
        !movdqu [rdi+96],xmm2
        !movdqu [rdi+112],xmm3
        !add rsi,128
        !add rdi,128
        !dec rcx
        !jne Tsi_UnFilterUp_Sse2LoopStart
        !align 8
        !Tsi_UnFilterUp_Sse2LoopEnd:
        ;restore the registers (no emms: the xmm registers are not shared
        ;with the x87/MMX state)
        !pop rdi
        !pop rsi
      ElseIf Tsi_MmxSupported = #True And Width * PixelSize >= 64
        ;save registers
        !push rsi
        !push rdi
        ;calculate the pointers: rsi = prior row, rdi = current row
        ;(the +16 on the parameters compensates for the two pushes above)
        !mov rdi,[p.p_ImageData+16]
        !mov rsi,rdi
        !mov rax,[p.v_Width+16]
        !mul qword[p.v_PixelSize+16]
        !add rdi,rax
        ;calculate the counters: rcx = 64 byte blocks, r10 = leftover bytes
        !mov rax,[p.v_Height+16]
        !dec rax
        !mul qword[p.v_Width+16]
        !mul qword[p.v_PixelSize+16]
        !mov rcx,rax
        !shr rcx,6
        !and rax,63
        !mov r10,rax
        ;process the leftover bytes first to cut the length to a multiple of 64
        !test r10,r10
        !je Tsi_UnFilterUp_MmxCutLengthEnd
        !align 8
        !Tsi_UnFilterUp_MmxCutLengthStart:
        !mov al,[rdi]
        !add al,[rsi]
        !mov [rdi],al
        !inc rsi
        !inc rdi
        !dec r10
        !jne Tsi_UnFilterUp_MmxCutLengthStart
        !align 8
        !Tsi_UnFilterUp_MmxCutLengthEnd:
        ;process the rest of the data in blocks of 64 bytes
        !test rcx,rcx
        !je Tsi_UnFilterUp_MmxLoopEnd
        !align 8
        !Tsi_UnFilterUp_MmxLoopStart:
        !movq mm0,[rsi]
        !movq mm1,[rsi+8]
        !movq mm2,[rsi+16]
        !movq mm3,[rsi+24]
        !movq mm4,[rsi+32]
        !movq mm5,[rsi+40]
        !movq mm6,[rsi+48]
        !movq mm7,[rsi+56]
        !paddb mm0,[rdi]
        !paddb mm1,[rdi+8]
        !paddb mm2,[rdi+16]
        !paddb mm3,[rdi+24]
        !paddb mm4,[rdi+32]
        !paddb mm5,[rdi+40]
        !paddb mm6,[rdi+48]
        !paddb mm7,[rdi+56]
        !movq [rdi],mm0
        !movq [rdi+8],mm1
        !movq [rdi+16],mm2
        !movq [rdi+24],mm3
        !movq [rdi+32],mm4
        !movq [rdi+40],mm5
        !movq [rdi+48],mm6
        !movq [rdi+56],mm7
        !add rsi,64
        !add rdi,64
        !dec rcx
        !jne Tsi_UnFilterUp_MmxLoopStart
        !align 8
        !Tsi_UnFilterUp_MmxLoopEnd:
        ;restore the registers
        !pop rdi
        !pop rsi
        ;end MMX state so that x87 code can run afterwards
        !emms
      Else
        ;plain byte loop (the +16 on the parameters compensates for the two pushes)
        !push rsi
        !push rdi
        ;rcx = number of bytes to process, at least 1 because of the guard above
        !mov rax,[p.v_Height+16]
        !dec rax
        !mul qword[p.v_Width+16]
        !mul qword[p.v_PixelSize+16]
        !mov rcx,rax
        ;rsi = prior row, rdi = current row
        !mov rdi,[p.p_ImageData+16]
        !mov rsi,rdi
        !mov rax,[p.v_Width+16]
        !mul qword[p.v_PixelSize+16]
        !add rdi,rax
        !align 8
        !Tsi_UnFilterUp_LoopStart:
        !mov al,[rdi]
        !add al,[rsi]
        !mov [rdi],al
        !inc rsi
        !inc rdi
        !dec rcx
        !jne Tsi_UnFilterUp_LoopStart
        !pop rdi
        !pop rsi
      EndIf
    CompilerDefault
      ; Portable version for every other processor.
      Protected.i X, ByteSize
      Protected *ActualChannel.Tsi_Pixel_Channel
      Protected *PriorChannel.Tsi_Pixel_Channel
      *PriorChannel = *ImageData
      *ActualChannel = *ImageData + Width * PixelSize
      ; every row except the first one is processed
      ByteSize = Width * (Height - 1) * PixelSize
      For X = 1 To ByteSize
        *ActualChannel\Channel = *ActualChannel\Channel + *PriorChannel\Channel
        *ActualChannel + 1
        *PriorChannel + 1
      Next
  CompilerEndSelect
EndProcedure