Compared to my previous procedure, this one is a little faster and shorter, but you will never get GPU speeds this way.
This code is the same for x86 and x64.
Code: Select all
Procedure FilterCallback(x, y, SourceColor, TargetColor)
  ; Scales each of the four 8-bit channels of TargetColor by byte 3 of
  ; SourceColor (presumably the alpha channel - confirm against the caller)
  ; and leaves the repacked 32-bit pixel in eax.
  ; x and y are not read by this procedure.
  ; Scratch registers: xmm0, xmm1, eax.

  ; Target pixel: interleaving the register with itself turns each byte b
  ; into the 16-bit word b * 257.
  !movd xmm1, [p.v_TargetColor]
  !punpcklbw xmm1, xmm1

  ; Source pixel: widen it the same way, then copy word 3 (the widened
  ; byte 3) into all four low words so every channel sees the same factor.
  !movd xmm0, [p.v_SourceColor]
  !punpcklbw xmm0, xmm0
  !pshuflw xmm0, xmm0, 0xff

  ; Unsigned multiply keeping the high 16 bits of each product, then drop
  ; another 8 bits to get back to byte range and pack the words to bytes.
  !pmulhuw xmm1, xmm0
  !psrlw xmm1, 8
  !packuswb xmm1, xmm1

  ; ProcedureReturn carries no expression: the result is meant to be the
  ; value placed in eax here.
  !movd eax, xmm1
  ProcedureReturn
EndProcedure
Combined with the approach from Thorium, it could look something like this for x64.
Code: Select all
Procedure AlphaMultiply(*SrcBuffer, *DstBuffer, Height.i, Width.i, SrcPitch.i, DstPitch.i)
  ; x64 / SSE2 version.
  ; For every 32-bit pixel, multiplies each byte of the destination pixel by
  ; byte 3 of the matching source pixel (presumably its alpha - confirm the
  ; pixel layout with the caller) and stores the result in the destination.
  ;
  ;   *SrcBuffer, *DstBuffer : address of the first pixel of the first row
  ;   Height, Width          : number of rows / number of pixels per row
  ;   SrcPitch, DstPitch     : byte offset from one row start to the next
  ;
  ; Scratch registers: rax, rcx, rdx, xmm0-xmm5.
  ; The local copy of Height is counted down to zero in place.

  ; Both loops decrement first and test afterwards, so a zero or negative
  ; count would wrap around and run far beyond the buffers. Do nothing then.
  If Width > 0 And Height > 0
    ; Row start pointers and pitches are kept in xmm2-xmm5.
    !movq xmm2, [p.p_SrcBuffer]
    !movq xmm3, [p.p_DstBuffer]
    !movq xmm4, [p.v_SrcPitch]
    !movq xmm5, [p.v_DstPitch]

    ; ---- per row: rax = source pixel, rdx = destination pixel, rcx = pixels left
    !alpha_multiply_loop0:
    !movq rax, xmm2
    !movq rdx, xmm3
    !mov rcx, qword [p.v_Width]

    ; ---- per pixel
    !alpha_multiply_loop1:
    !movd xmm1, [rax]
    !movd xmm0, [rdx]
    ; Widen each byte b to the word b * 257.
    !punpcklbw xmm1, xmm1
    !punpcklbw xmm0, xmm0
    ; Copy source word 3 into all four low words.
    !pshuflw xmm1, xmm1, 0xff
    ; High 16 bits of each product, shifted back to byte range and repacked.
    !pmulhuw xmm0, xmm1
    !psrlw xmm0, 8
    !packuswb xmm0, xmm0
    !movd [rdx], xmm0
    ; Advance one 4-byte pixel in both buffers.
    !add rax, 4
    !add rdx, 4
    !dec rcx
    !jnz alpha_multiply_loop1

    ; Step both row start pointers by their pitch and count the row.
    !paddq xmm2, xmm4
    !paddq xmm3, xmm5
    !dec qword [p.v_Height]
    !jnz alpha_multiply_loop0
  EndIf
EndProcedure
For x86, here is a version that uses SSE (on the MMX registers) instead of SSE2, so it also runs on older hardware.
Code: Select all
Procedure AlphaMultiply(*SrcBuffer, *DstBuffer, Height.i, Width.i, SrcPitch.i, DstPitch.i)
  ; x86 version using the MMX registers (pshufw and pmulhuw on mm registers
  ; need SSE, but not SSE2).
  ; For every 32-bit pixel, multiplies each byte of the destination pixel by
  ; byte 3 of the matching source pixel (presumably its alpha - confirm the
  ; pixel layout with the caller) and stores the result in the destination.
  ;
  ;   *SrcBuffer, *DstBuffer : address of the first pixel of the first row
  ;   Height, Width          : number of rows / number of pixels per row
  ;   SrcPitch, DstPitch     : byte offset from one row start to the next
  ;
  ; Scratch registers: eax, ecx, edx, mm0-mm5.
  ; The local copy of Height is counted down to zero in place.

  ; Both loops decrement first and test afterwards, so a zero or negative
  ; count would wrap around and run far beyond the buffers. Do nothing then.
  If Width > 0 And Height > 0
    ; Row start pointers and pitches are kept in mm2-mm5.
    !movd mm2, [p.p_SrcBuffer]
    !movd mm3, [p.p_DstBuffer]
    !movd mm4, [p.v_SrcPitch]
    !movd mm5, [p.v_DstPitch]

    ; ---- per row: eax = source pixel, edx = destination pixel, ecx = pixels left
    !alpha_multiply_loop0:
    !movd eax, mm2
    !movd edx, mm3
    !mov ecx, dword [p.v_Width]

    ; ---- per pixel
    !alpha_multiply_loop1:
    !movd mm1, [eax]
    !movd mm0, [edx]
    ; Widen each byte b to the word b * 257.
    !punpcklbw mm1, mm1
    !punpcklbw mm0, mm0
    ; Copy source word 3 into all four words.
    !pshufw mm1, mm1, 0xff
    ; High 16 bits of each product, shifted back to byte range and repacked.
    !pmulhuw mm0, mm1
    !psrlw mm0, 8
    !packuswb mm0, mm0
    !movd [edx], mm0
    ; Advance one 4-byte pixel in both buffers.
    !add eax, 4
    !add edx, 4
    !dec ecx
    !jnz alpha_multiply_loop1

    ; Step both row start pointers by their pitch and count the row.
    !paddd mm2, mm4
    !paddd mm3, mm5
    !dec dword [p.v_Height]
    !jnz alpha_multiply_loop0

    ; Leave MMX state so that x87 floating-point code can run afterwards.
    !emms
  EndIf
EndProcedure