Parts of the code are (c) Advanced Micro Devices, Inc. (NYSE: AMD).
Parts of the code are Copyright (C) 2009 Jan Boon (Kaetemi); see here: http://blog.kaetemi.be/post/2009/10/25/SSE2-memcpy
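In short: copies smaller than 512 bytes go through a plain byte loop; larger copies run in 128-byte SSE2 blocks, prefetched 4 KB at a time and written with non-temporal stores.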
Code:
Procedure memcopy_sse2(*src, *dst, nBytes.L)
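  ; Copies nBytes from *src to *dst; the buffers must not overlap.
  ; Note: ebx, esi, edi and xmm4..xmm7 are modified. PureBasic's inline-asm rules
  ; generally expect non-volatile registers to be preserved, so consider saving
  ; and restoring them here if you run into instability.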
!local l_slow
!local l_fast
!local l_first
!local l_more
!local l_aligned4k
!local l_aligned4kinp
!local l_aligned4kout
!local l_alignedlast
!local l_alignedlastinp
!local l_alignedlastout
!local l_unaligned4k
!local l_unaligned4kinp
!local l_unaligned4kout
!local l_unalignedlast
!local l_unalignedlastinp
!local l_unalignedlastout
!local l_last
!local l_end
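  ; Setup: load the arguments, compute the end of the destination in ecx,
  ; and prefetch the first 128 bytes of source. Blocks under 512 bytes
  ; take the byte loop below.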
! MOV ecx, [p.v_nBytes]
! MOV edi, [p.p_dst]
! MOV esi, [p.p_src]
! ADD ecx, edi
! prefetchnta [esi+0]
! prefetchnta [esi+32]
! prefetchnta [esi+64]
! prefetchnta [esi+96]
! CMP dword [p.v_nBytes], 512
! JGE l_fast
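  ; Slow path: copy byte by byte until edi reaches the end of the destination.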
!l_slow: MOV bl, [esi]
! MOV [edi], bl
! INC edi
! INC esi
! CMP ecx, edi
! JNZ l_slow
! JMP l_end
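  ; Fast path: round the destination end (ecx) down to a 128-byte boundary and
  ; compute the matching source end in ebx. Bytes cut off by the rounding are
  ; handled by l_first (head) and l_last (tail).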
!l_fast: AND ecx, $FFFFFF80
! MOV ebx, esi
! SUB ebx, edi
! ADD ebx, ecx
! MOV eax, edi
! AND edi, $FFFFFF80
! CMP eax, edi
! JNE l_first
! JMP l_more
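  ; Destination is not 128-byte aligned: copy the first 128 bytes with unaligned
  ; moves, then step esi/edi forward so edi sits on the next 128-byte boundary
  ; (the overlapping bytes simply get copied twice).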
!l_first: movdqu xmm0, [esi+0]
! movdqu xmm1, [esi+16]
! movdqu xmm2, [esi+32]
! movdqu xmm3, [esi+48]
! movdqu xmm4, [esi+64]
! movdqu xmm5, [esi+80]
! movdqu xmm6, [esi+96]
! movdqu xmm7, [esi+112]
! movdqu [eax+0], xmm0
! movdqu [eax+16], xmm1
! movdqu [eax+32], xmm2
! movdqu [eax+48], xmm3
! movdqu [eax+64], xmm4
! movdqu [eax+80], xmm5
! movdqu [eax+96], xmm6
! movdqu [eax+112], xmm7
! ADD edi, 128
! SUB eax, edi
! SUB esi, eax
! CMP ecx, edi
! JNZ l_more
! JMP l_last
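  ; Pick the inner loop: the aligned path (movdqa loads) is taken only when the
  ; source sits on a 128-byte boundary; otherwise the movdqu variant runs.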
!l_more: MOV eax, esi
! AND eax, $FFFFFF80
! CMP eax, esi
! JNE l_unaligned4k
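  ; Aligned path: as long as a full 4 KB of source remains (esi + 4096 <= ebx),
  ; prefetch the chunk and stream it out; otherwise copy the remainder.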
!l_aligned4k: MOV eax, esi
! ADD eax, 4096
! CMP eax, ebx
! JLE l_aligned4kinp
! CMP ecx, edi
! JNE l_alignedlast
! JMP l_last
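  ; Prefetch the whole 4 KB chunk (up to eax), then rewind esi to its start.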
!l_aligned4kinp: prefetchnta [esi+0]
! prefetchnta [esi+32]
! prefetchnta [esi+64]
! prefetchnta [esi+96]
! ADD esi, 128
! CMP eax, esi
! JNE l_aligned4kinp
! SUB esi, 4096
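  ; Stream the chunk in 128-byte blocks: aligned loads (movdqa) and
  ; non-temporal stores (movntdq) that bypass the cache.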
!l_aligned4kout: movdqa xmm0, [esi+0]
! movdqa xmm1, [esi+16]
! movdqa xmm2, [esi+32]
! movdqa xmm3, [esi+48]
! movdqa xmm4, [esi+64]
! movdqa xmm5, [esi+80]
! movdqa xmm6, [esi+96]
! movdqa xmm7, [esi+112]
! movntdq [edi+0], xmm0
! movntdq [edi+16], xmm1
! movntdq [edi+32], xmm2
! movntdq [edi+48], xmm3
! movntdq [edi+64], xmm4
! movntdq [edi+80], xmm5
! movntdq [edi+96], xmm6
! movntdq [edi+112], xmm7
! ADD esi, 128
! ADD edi, 128
! CMP eax, esi
! JNE l_aligned4kout
! JMP l_aligned4k
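  ; Less than 4 KB left: prefetch up to the source end (ebx), restore esi,
  ; then stream the remaining 128-byte blocks until edi reaches ecx.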
!l_alignedlast: MOV eax, esi
!l_alignedlastinp: prefetchnta [esi+0]
! prefetchnta [esi+32]
! prefetchnta [esi+64]
! prefetchnta [esi+96]
! ADD esi, 128
! CMP ebx, esi
! JNE l_alignedlastinp
! MOV esi, eax
!l_alignedlastout: movdqa xmm0, [esi+0]
! movdqa xmm1, [esi+16]
! movdqa xmm2, [esi+32]
! movdqa xmm3, [esi+48]
! movdqa xmm4, [esi+64]
! movdqa xmm5, [esi+80]
! movdqa xmm6, [esi+96]
! movdqa xmm7, [esi+112]
! movntdq [edi+0], xmm0
! movntdq [edi+16], xmm1
! movntdq [edi+32], xmm2
! movntdq [edi+48], xmm3
! movntdq [edi+64], xmm4
! movntdq [edi+80], xmm5
! movntdq [edi+96], xmm6
! movntdq [edi+112], xmm7
! ADD esi, 128
! ADD edi, 128
! CMP ecx, edi
! JNE l_alignedlastout
! JMP l_last
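  ; Unaligned source: same 4 KB chunk strategy, but with unaligned movdqu loads.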
!l_unaligned4k: MOV eax, esi
! ADD eax, 4096
! CMP eax, ebx
! JLE l_unaligned4kinp
! CMP ecx, edi
! JNE l_unalignedlast
! JMP l_last
!l_unaligned4kinp: prefetchnta [esi+0]
! prefetchnta [esi+32]
! prefetchnta [esi+64]
! prefetchnta [esi+96]
! ADD esi, 128
! CMP eax, esi
! JNE l_unaligned4kinp
! SUB esi, 4096
!l_unaligned4kout: movdqu xmm0, [esi+0]
! movdqu xmm1, [esi+16]
! movdqu xmm2, [esi+32]
! movdqu xmm3, [esi+48]
! movdqu xmm4, [esi+64]
! movdqu xmm5, [esi+80]
! movdqu xmm6, [esi+96]
! movdqu xmm7, [esi+112]
! movntdq [edi+0], xmm0
! movntdq [edi+16], xmm1
! movntdq [edi+32], xmm2
! movntdq [edi+48], xmm3
! movntdq [edi+64], xmm4
! movntdq [edi+80], xmm5
! movntdq [edi+96], xmm6
! movntdq [edi+112], xmm7
! ADD esi, 128
! ADD edi, 128
! CMP eax, esi
! JNE l_unaligned4kout
! JMP l_unaligned4k
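  ; Unaligned remainder (< 4 KB): prefetch, then copy the remaining
  ; 128-byte blocks with movdqu loads.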
!l_unalignedlast: MOV eax, esi
!l_unalignedlastinp: prefetchnta [esi+0]
! prefetchnta [esi+32]
! prefetchnta [esi+64]
! prefetchnta [esi+96]
! ADD esi, 128
! CMP ebx, esi
! JNE l_unalignedlastinp
! MOV esi, eax
!l_unalignedlastout: movdqu xmm0, [esi+0]
! movdqu xmm1, [esi+16]
! movdqu xmm2, [esi+32]
! movdqu xmm3, [esi+48]
! movdqu xmm4, [esi+64]
! movdqu xmm5, [esi+80]
! movdqu xmm6, [esi+96]
! movdqu xmm7, [esi+112]
! movntdq [edi+0], xmm0
! movntdq [edi+16], xmm1
! movntdq [edi+32], xmm2
! movntdq [edi+48], xmm3
! movntdq [edi+64], xmm4
! movntdq [edi+80], xmm5
! movntdq [edi+96], xmm6
! movntdq [edi+112], xmm7
! ADD esi, 128
! ADD edi, 128
! CMP ecx, edi
! JNE l_unalignedlastout
! JMP l_last
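  ; Tail: re-copy the last 128 bytes with unaligned moves, covering whatever the
  ; 128-byte rounding cut off. This is safe because the fast path only runs for
  ; nBytes >= 512; the slow path jumps straight to l_end.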
!l_last: MOV ecx, [p.v_nBytes]
! MOV edi, [p.p_dst]
! MOV esi, [p.p_src]
! ADD edi, ecx
! ADD esi, ecx
! SUB edi, 128
! SUB esi, 128
! movdqu xmm0, [esi+0]
! movdqu xmm1, [esi+16]
! movdqu xmm2, [esi+32]
! movdqu xmm3, [esi+48]
! movdqu xmm4, [esi+64]
! movdqu xmm5, [esi+80]
! movdqu xmm6, [esi+96]
! movdqu xmm7, [esi+112]
! movdqu [edi+0], xmm0
! movdqu [edi+16], xmm1
! movdqu [edi+32], xmm2
! movdqu [edi+48], xmm3
! movdqu [edi+64], xmm4
! movdqu [edi+80], xmm5
! movdqu [edi+96], xmm6
! movdqu [edi+112], xmm7
!l_end:
EndProcedure
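The test below fills a 64 KB buffer with random data, then copies every size from 1 byte to $10000 and verifies each result with CompareMemory: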
Code:
Global *mem1 = AllocateMemory($10000)
RandomData(*mem1, $10000)
Global *mem2 = AllocateMemory($1)
For testsize = 1 To $10000
  *mem2 = ReAllocateMemory(*mem2, testsize)
  memcopy_sse2(*mem1, *mem2, testsize)
  ; CopyMemory(*mem1, *mem2, testsize)
  If CompareMemory(*mem1, *mem2, testsize) = 0
    Debug "Error with the testsize = " + Hex(testsize)
  EndIf
Next
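For a rough speed comparison against the built-in CopyMemory, something like this works (a minimal sketch; the 1 MB buffer size and the iteration count are arbitrary choices, not from the original post):

Code:
Global *a = AllocateMemory($100000)
Global *b = AllocateMemory($100000)
RandomData(*a, $100000)

; Time 1000 copies with the SSE2 routine.
t = ElapsedMilliseconds()
For i = 1 To 1000
  memcopy_sse2(*a, *b, $100000)
Next
Debug "memcopy_sse2: " + Str(ElapsedMilliseconds() - t) + " ms"

; Time 1000 copies with the native CopyMemory for comparison.
t = ElapsedMilliseconds()
For i = 1 To 1000
  CopyMemory(*a, *b, $100000)
Next
Debug "CopyMemory:   " + Str(ElapsedMilliseconds() - t) + " ms"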