Hi,
This code is not mine; it comes from AMD itself. It copies memory as fast as an AMD CPU can. It works on Intel CPUs too, of course, and will probably be faster than the normal CopyMemory() even there.
This procedure runs 12 times faster than the native CopyMemory() on an AMD Athlon XP 3200+. It would be great to hear test results from other AMD CPUs or, even better, from Pentiums.
Enjoy it.
EDIT: this is the working code, although I haven't been able to use all those 'ALIGN'. Fasm complains: "section is not aligned enough for this operation".
Code: Select all
; Copyright (c) 2001 Advanced Micro Devices, Inc.
;
;LIMITATION OF LIABILITY: THE MATERIALS ARE PROVIDED *AS IS* WITHOUT ANY
;EXPRESS OR IMPLIED WARRANTY OF ANY KIND INCLUDING WARRANTIES OF MERCHANTABILITY,
;NONINFRINGEMENT OF THIRD-PARTY INTELLECTUAL PROPERTY, OR FITNESS FOR ANY
;PARTICULAR PURPOSE. IN NO EVENT SHALL AMD OR ITS SUPPLIERS BE LIABLE FOR ANY
;DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR LOSS OF PROFITS,
;BUSINESS INTERRUPTION, LOSS OF INFORMATION) ARISING OUT OF THE USE OF OR
;INABILITY TO USE THE MATERIALS, EVEN IF AMD HAS BEEN ADVISED OF THE POSSIBILITY
;OF SUCH DAMAGES. BECAUSE SOME JURISDICTIONS PROHIBIT THE EXCLUSION OR LIMITATION
;OF LIABILITY FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES, THE ABOVE LIMITATION MAY
;NOT APPLY TO YOU.
;
;AMD does not assume any responsibility for any errors which may appear in the
;Materials nor any responsibility to support or update the Materials. AMD retains
;the right to make changes to its test specifications at any time, without notice.
;
;NO SUPPORT OBLIGATION: AMD is not obligated to furnish, support, or make any
;further information, software, technical information, know-how, or show-how
;available to you.
;
;So that all may benefit from your experience, please report any problems
;or suggestions about this software to 3dsdk.support@amd.com
;
;AMD Developer Technologies, M/S 585
;Advanced Micro Devices, Inc.
;5900 E. Ben White Blvd.
;Austin, TX 78741
;3dsdk.support@amd.com
; Very optimized memcpy() routine for all AMD Athlon and Duron family.
; This code uses any of FOUR different basic copy methods, depending
; on the transfer size.
; NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
; "Streaming Store"), and also uses the software prefetchnta instructions,
; be sure you're running on Athlon/Duron or other recent CPU before calling!
Procedure CopyMemoryAMD(*src, *dst, size)
  ; Copies 'size' bytes from *src to *dst using MMX / streaming stores.
  ; 32-bit x86 inline assembly only (uses eax..edi and MMX registers mm0-mm7).
  ;
  ; Parameters:
  ;   *src - address of the first source byte
  ;   *dst - address of the first destination byte
  ;   size - number of bytes to copy (0 is allowed: nothing is copied)
  ;
  ; Strategy, chosen from the byte count:
  ;   size < 64 bytes           -> unrolled movsd + rep movsb only
  ;   fewer than 1024 blocks    -> memcpy_ic_1: movq load / movq store (in cache)
  ;   fewer than 3152 blocks    -> memcpy_uc_1: movq load / movntq store
  ;   otherwise                 -> memcpy_bp_*: touch 8 KB of source, then
  ;                                movq load / movntq store, 8 KB per round
  ;   (a "block" is 64 bytes; 1024 blocks = 64 KB, 3152 blocks = 197 KB)
  ;
  ; Register roles inside the ASM section:
  ;   esi = running source pointer      edi = running destination pointer
  ;   ebx = bytes left after the alignment step (low 6 bits drive the tail copy)
  ;   ecx = scratch / 64-byte block counter
  ;   eax = inner loop counter of the block prefetch path
  ;   edx = throw-away target of the cache-touching loads
  ;
  ; ebx, esi and edi are saved on entry and restored on exit because only
  ; eax, ecx and edx may be freely modified by inline assembly in PureBasic.
  #CACHEBLOCK = $80                             ; 64-byte blocks per prefetch round (128 * 64 = 8 KB)
  #CACHEBLOCKPREFETCH = #CACHEBLOCK/2           ; touch-loop iterations (2 cache lines each)
  #CACHEBLOCKTOP = #CACHEBLOCK*64               ; byte size of one prefetch round
  #UNCACHED_COPY = 197*1024                     ; byte threshold for the block prefetch path
  #UNCACHED_COPYPREFETCH = #UNCACHED_COPY/64    ; same threshold in 64-byte blocks
  #TINY_BLOCK_COPY = 64                         ; below this, skip the MMX paths entirely
  #IN_CACHE_COPY = 64*1024                      ; byte threshold for the uncached path
  #IN_CACHE_COPYBIG = #IN_CACHE_COPY/64         ; same threshold in 64-byte blocks
  EnableASM
    ; Read every parameter into a volatile register BEFORE pushing anything:
    ; procedure parameters are addressed relative to esp, so they must not be
    ; referenced again once the stack pointer has moved.
    MOV eax, *src
    MOV edx, *dst
    MOV ecx, size
    PUSH esi ; preserve the non-volatile registers used below
    PUSH edi
    PUSH ebx
    MOV esi, eax ; source array
    MOV edi, edx ; destination array
    MOV ebx, ecx ; keep a copy of count
    CLD ; string instructions must walk upwards
    CMP ecx, #TINY_BLOCK_COPY
    JB l_copymemoryamd_memcpy_ic_3 ; tiny? skip mmx copy
    CMP ecx, 32*1024 ; don't align between 32k-64k because
    JBE l_copymemoryamd_memcpy_do_align ; it appears to be slower
    CMP ecx, 64*1024
    JBE l_copymemoryamd_memcpy_align_done
  memcpy_do_align:
    MOV ecx, 8 ; a trick that's faster than rep movsb...
    SUB ecx, edi ; align destination to qword
    And ecx, 7 ; 111b ; get the low bits
    SUB ebx, ecx ; update copy count
    NEG ecx ; set up to jump into the array
    ADD ecx, l_copymemoryamd_memcpy_align_done ; each movsb is 1 byte, so label-N runs N of them
    JMP ecx ; jump to array of movsb's
    !ALIGN 4
    !movsb
    !movsb
    !movsb
    !movsb
    !movsb
    !movsb
    !movsb
    !movsb
  memcpy_align_done: ; destination is qword aligned if the alignment step above ran
    MOV ecx, ebx ; number of bytes left to copy
    SHR ecx, 6 ; get 64-byte block count
    JZ l_copymemoryamd_memcpy_ic_2 ; finish the last few bytes
    CMP ecx, #IN_CACHE_COPYBIG ; too big 4 cache? use uncached copy
    JAE l_copymemoryamd_memcpy_uc_test
    ;!ALIGN 16
  memcpy_ic_1: ; 64-byte block copies, in-cache copy
    !prefetchnta [esi+(200*64/34+192)] ; start reading ahead
    !movq mm0, [esi+0] ; read 64 bits
    !movq mm1, [esi+8]
    !movq [edi+0], mm0 ; write 64 bits
    !movq [edi+8], mm1 ; note: the normal !movq writes the
    !movq mm2, [esi+16] ; data to cache; a cache line will be
    !movq mm3, [esi+24] ; allocated as needed, to store the data
    !movq [edi+16], mm2
    !movq [edi+24], mm3
    !movq mm0, [esi+32]
    !movq mm1, [esi+40]
    !movq [edi+32], mm0
    !movq [edi+40], mm1
    !movq mm2, [esi+48]
    !movq mm3, [esi+56]
    !movq [edi+48], mm2
    !movq [edi+56], mm3
    ADD esi, 64 ; update source pointer
    ADD edi, 64 ; update destination pointer
    DEC ecx ; count down
    JNZ l_copymemoryamd_memcpy_ic_1 ; last 64-byte block?
  memcpy_ic_2:
    MOV ecx, ebx ; has valid low 6 bits of the byte count
  memcpy_ic_3:
    SHR ecx, 2 ; dword count
    And ecx, 15 ; %1111 ; only look at the "remainder" bits
    NEG ecx ; set up to jump into the array
    ADD ecx, l_copymemoryamd_memcpy_last_few ; each movsd is 1 byte, so label-N runs N of them
    JMP ecx ; jump to array of movsd's
  memcpy_uc_test:
    CMP ecx, #UNCACHED_COPYPREFETCH ; big enough? use block prefetch copy
    JAE l_copymemoryamd_memcpy_bp_1
  memcpy_64_test:
    Or ecx, ecx ; tail end of block prefetch will jump here
    JZ l_copymemoryamd_memcpy_ic_2 ; no more 64-byte blocks left
  memcpy_uc_1: ; 64-byte blocks, uncached copy
    !prefetchnta [esi+(200*64/34+192)] ; start reading ahead
    !movq mm0, [esi+0] ; read 64 bits
    ADD edi, 64 ; update destination pointer
    !movq mm1, [esi+8]
    ADD esi, 64 ; update source pointer (offsets below are relative to the NEXT block)
    !movq mm2, [esi-48]
    !movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
    !movq mm0, [esi-40] ; note: !movntq also prevents the CPU
    !movntq [edi-56], mm1 ; from READING the destination address
    !movq mm1, [esi-32] ; into the cache, only to be over-written
    !movntq [edi-48], mm2 ; so that also helps performance
    !movq mm2, [esi-24]
    !movntq [edi-40], mm0
    !movq mm0, [esi-16]
    !movntq [edi-32], mm1
    !movq mm1, [esi-8]
    !movntq [edi-24], mm2
    !movntq [edi-16], mm0
    DEC ecx
    !movntq [edi-8], mm1 ; movntq leaves the flags from DEC untouched
    JNZ l_copymemoryamd_memcpy_uc_1 ; last 64-byte block?
    JMP l_copymemoryamd_memcpy_ic_2 ; almost done
  memcpy_bp_1: ; large blocks, block prefetch copy
    CMP ecx, #CACHEBLOCK ; big enough to run another prefetch loop?
    JB l_copymemoryamd_memcpy_64_test ; no, back to regular uncached copy (ecx is an unsigned count)
    MOV eax, #CACHEBLOCKPREFETCH ; block prefetch loop, unrolled 2X
    ADD esi, #CACHEBLOCKTOP ; move to the top of the block
    ;!ALIGN 16
  memcpy_bp_2:
    MOV edx, [esi-64] ; grab one address per cache line
    MOV edx, [esi-128] ; grab one address per cache line
    SUB esi, 128 ; go reverse order
    DEC eax ; count down the cache lines
    JNZ l_copymemoryamd_memcpy_bp_2 ; keep grabbing more lines into cache
    MOV eax, #CACHEBLOCK ; now that it's in cache, do the copy (esi is back at the block start)
    ;!ALIGN 16
  memcpy_bp_3:
    !movq mm0, [esi] ; read 64 bits
    !movq mm1, [esi+ 8]
    !movq mm2, [esi+16]
    !movq mm3, [esi+24]
    !movq mm4, [esi+32]
    !movq mm5, [esi+40]
    !movq mm6, [esi+48]
    !movq mm7, [esi+56]
    ADD esi, 64 ; update source pointer
    !movntq [edi], mm0 ; write 64 bits, bypassing cache
    !movntq [edi+ 8], mm1 ; note: !movntq also prevents the CPU
    !movntq [edi+16], mm2 ; from READING the destination address
    !movntq [edi+24], mm3 ; into the cache, only to be over-written,
    !movntq [edi+32], mm4 ; so that also helps performance
    !movntq [edi+40], mm5
    !movntq [edi+48], mm6
    !movntq [edi+56], mm7
    ADD edi, 64 ; update dest pointer
    DEC eax ; count down
    JNZ l_copymemoryamd_memcpy_bp_3 ; keep copying
    SUB ecx, #CACHEBLOCK ; update the 64-byte block count
    JMP l_copymemoryamd_memcpy_bp_1 ; keep processing chunks
    ;The smallest copy uses the X86 "!movsd" instruction, in an optimized
    ;form which is an "unrolled loop". Then it handles the last few bytes.
    !ALIGN 4
    !movsd
    !movsd ; perform last 1-15 dword copies
    !movsd
    !movsd
    !movsd
    !movsd
    !movsd
    !movsd
    !movsd
    !movsd ; perform last 1-7 dword copies
    !movsd
    !movsd
    !movsd
    !movsd
    !movsd
    !movsd
  memcpy_last_few: ; dword aligned from before !movsd's
    MOV ecx, ebx ; has valid low 2 bits of the byte count
    And ecx, 3 ; %11 ; the last few cows must come home
    JZ l_copymemoryamd_memcpy_final ; no more, let's leave
    REP movsb ; the last 1, 2, or 3 bytes
  memcpy_final:
    !emms ; clean up the state
    !sfence ; flush the write buffer
    POP ebx ; restore the registers saved on entry (reverse push order)
    POP edi
    POP esi
  DisableASM
EndProcedure
; --- Benchmark 1: two 64 MB buffers, 10 copies with each routine ---
; Outputs (read by the report at the end of the file):
;   manyk_AMD.s   - elapsed milliseconds for CopyMemoryAMD, as text
;   manyk_PB.s    - elapsed milliseconds for the built-in CopyMemory, as text
;   manyk_times.f - ratio manyk_PB / manyk_AMD
manyk = 1 << 26 ; 64 MB, computed exactly with integer arithmetic
source = AllocateMemory(manyk)
destination = AllocateMemory(manyk)
If source And destination
  ; Fill the whole source buffer with random longs so a wrong copy is detectable.
  For a=0 To manyk-1 Step 4
    PokeL(source+a, Random($fffffff))
  Next
  time = ElapsedMilliseconds()
  For a=1 To 10
    CopyMemoryAMD(source, destination, manyk)
  Next
  manyk_AMD.s = Str((ElapsedMilliseconds()-time))
  ; Verify the AMD copy before the built-in CopyMemory overwrites the destination.
  For a=0 To manyk-1 Step 4
    If PeekL(source+a)<>PeekL(destination+a)
      MessageRequester("Wrong data", "CopyMemoryAMD 64 MB at offset "+Str(a))
      Break
    EndIf
  Next
  time = ElapsedMilliseconds()
  For a=1 To 10
    CopyMemory(source, destination, manyk)
  Next
  manyk_PB.s=Str((ElapsedMilliseconds()-time))
  FreeMemory(source)
  FreeMemory(destination)
  source = 0
  destination = 0
  manyk_times.f = Val(manyk_PB)/Val(manyk_AMD)
Else
  MessageRequester("Error", "Could not allocate two "+Str(manyk/1024)+" KB blocks.")
  ; Only one of the two allocations may have failed: release the other one
  ; so it does not stay allocated while the remaining benchmarks run.
  If source
    FreeMemory(source)
  EndIf
  If destination
    FreeMemory(destination)
  EndIf
  source = 0
  destination = 0
EndIf
; --- Benchmark 2: two 1 MB buffers, 10000 copies with each routine ---
; Outputs (read by the report at the end of the file):
;   onek_AMD.s   - elapsed milliseconds for CopyMemoryAMD, as text
;   onek_PB.s    - elapsed milliseconds for the built-in CopyMemory, as text
;   onek_times.f - ratio onek_PB / onek_AMD
ameg = 1 << 20 ; 1 MB, computed exactly with integer arithmetic
source = AllocateMemory(ameg)
destination = AllocateMemory(ameg)
If source And destination
  ; Fill the whole source buffer with random longs so a wrong copy is detectable.
  For a=0 To ameg-1 Step 4
    PokeL(source+a, Random($fffffff))
  Next
  time = ElapsedMilliseconds()
  For a=1 To 10000
    CopyMemoryAMD(source, destination, ameg)
  Next
  onek_AMD.s = Str((ElapsedMilliseconds()-time))
  ; Verify the AMD copy before the built-in CopyMemory overwrites the destination.
  For a=0 To ameg-1 Step 4
    If PeekL(source+a)<>PeekL(destination+a)
      MessageRequester("Wrong data", "CopyMemoryAMD 1 MB at offset "+Str(a))
      Break
    EndIf
  Next
  time = ElapsedMilliseconds()
  For a=1 To 10000
    CopyMemory(source, destination, ameg)
  Next
  onek_PB.s = Str((ElapsedMilliseconds()-time))
  FreeMemory(source)
  FreeMemory(destination)
  source = 0
  destination = 0
  onek_times.f = Val(onek_PB)/Val(onek_AMD)
Else
  MessageRequester("Error", "Could not allocate two "+Str(ameg/1024)+" KB blocks.")
  ; Only one of the two allocations may have failed: release the other one
  ; so it does not stay allocated while the remaining benchmark runs.
  If source
    FreeMemory(source)
  EndIf
  If destination
    FreeMemory(destination)
  EndIf
  source = 0
  destination = 0
EndIf
; --- Benchmark 3: two 100 KB buffers, 10000 copies with each routine ---
; Outputs (read by the report at the end of the file):
;   hundredk_AMD.s   - elapsed milliseconds for CopyMemoryAMD, as text
;   hundredk_PB.s    - elapsed milliseconds for the built-in CopyMemory, as text
;   hundredk_times.f - ratio hundredk_PB / hundredk_AMD
hk = 102400
source = AllocateMemory(hk)
destination = AllocateMemory(hk)
If source And destination
  ; Fill the whole source buffer with random longs. Writing a single byte per
  ; 4-byte step would leave three quarters of the buffer untouched, so the
  ; PeekL verification below could not detect a wrong copy in those bytes.
  For a=0 To hk-1 Step 4
    PokeL(source+a, Random($fffffff))
  Next
  time = ElapsedMilliseconds()
  For a=1 To 10000
    CopyMemoryAMD(source, destination, hk)
  Next
  hundredk_AMD.s = Str((ElapsedMilliseconds()-time))
  ; Verify the AMD copy before the built-in CopyMemory overwrites the destination.
  For a=0 To hk-1 Step 4
    If PeekL(source+a)<>PeekL(destination+a)
      MessageRequester("Wrong data", "CopyMemoryAMD 100 K at offset "+Str(a))
      Break
    EndIf
  Next
  time = ElapsedMilliseconds()
  For a=1 To 10000
    CopyMemory(source, destination, hk)
  Next
  hundredk_PB.s = Str((ElapsedMilliseconds()-time))
  FreeMemory(source)
  FreeMemory(destination)
  source = 0
  destination = 0
  hundredk_times.f = Val(hundredk_PB)/Val(hundredk_AMD)
Else
  MessageRequester("Error", "Could not allocate two "+Str(hk/1024)+" KB blocks.")
  ; Only one of the two allocations may have failed: release the other one.
  If source
    FreeMemory(source)
  EndIf
  If destination
    FreeMemory(destination)
  EndIf
  source = 0
  destination = 0
EndIf
; Build one report section per benchmark, then join the sections with a blank
; line between them and show everything in a single message box.
eol.s = #LFCR$

section64.s = "--- 64 MB tranfer test ---" + eol
section64.s + "AMD Function : " + manyk_AMD + eol
section64.s + "Pure Function : " + manyk_PB + eol
section64.s + "AMD Function is " + StrF(manyk_times) + " times faster." + eol

section1.s = "--- 1 MB tranfer test ---" + eol
section1.s + "AMD Function : " + onek_AMD + eol
section1.s + "Pure Function : " + onek_PB + eol
section1.s + "AMD Function is " + StrF(onek_times) + " times faster." + eol

section100.s = "--- 100kb tranfer test ---" + eol
section100.s + "AMD Function : " + hundredk_AMD + eol
section100.s + "Pure Function : " + hundredk_PB + eol
section100.s + "AMD Function is " + StrF(hundredk_times) + " times faster." + eol

results.s = section64 + eol + section1 + eol + section100
MessageRequester("Test Results", results.s)