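; CopyMemoryAMD: a fast memory copy based on AMD's published "optimized
; memcpy" routine: an in-cache MMX copy, a streaming (movntq) copy, and a
; block-prefetch copy, selected by size. 32-bit x86 only.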
Procedure CopyMemoryAMD(*src, *dst, size)
#CACHEBLOCK = $80
#CACHEBLOCKPREFETCH = #CACHEBLOCK/2
#CACHEBLOCKTOP = #CACHEBLOCK*64
#UNCACHED_COPY = 197*1024
#UNCACHED_COPYPREFETCH = #UNCACHED_COPY/64
#TINY_BLOCK_COPY = 64
#IN_CACHE_COPY = 64*1024
#IN_CACHE_COPYBIG = #IN_CACHE_COPY/64
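; Note: "!" lines go straight to the FASM backend, which cannot see
; PureBasic constants, so the thresholds above reappear below as plain
; literals (64, 1024, 3152, 128, 8192); the constants only document them.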
!MOV eax, [p.p_src] ; fetch the parameters via PB's documented p.p_/p.v_ access
!MOV edx, [p.p_dst]
!MOV ecx, [p.v_size]
!PUSH esi ; PureBasic requires inline ASM to preserve esi/edi/ebx
!PUSH edi
!PUSH ebx
!MOV esi, eax ; source pointer
!MOV edi, edx ; destination pointer
!MOV ebx, ecx ; keep a copy of the byte count for the tail copies
!CLD ; string instructions copy forward
!CMP ecx, 64 ; tiny block? skip the MMX copy entirely
!JB l_memcpy_ic_3
!CMP ecx, 32*1024 ; don't align between 32K and 64K because
!JBE l_memcpy_do_align ; it appears to be slower
!CMP ecx, 64*1024
!JBE l_memcpy_align_done
!l_memcpy_do_align: ; align the destination to the next 8-byte boundary
!MOV ecx,8
!SUB ecx,edi
!And ecx,7
!SUB ebx, ecx ; update copy count
!NEG ecx ; set up to jump into the array
!ADD ecx, l_memcpy_align_done
!JMP ecx ; jump to array of movsb's
!ALIGN 4
!movsb ; at most 7 of these execute; the JMP above lands partway in
!movsb
!movsb
!movsb
!movsb
!movsb
!movsb
!movsb
!l_memcpy_align_done: ; destination is dword aligned
!MOV ecx, ebx ; number of bytes left to copy
!SHR ecx, 6 ; get 64-byte block count
!JZ l_memcpy_ic_2 ; finish the last few bytes
!CMP ecx, 1024 ; too big for cache? use uncached copy
!JAE l_memcpy_uc_test
;!ALIGN 16
!l_memcpy_ic_1: ; 64-byte block copies, in-cache copy
!prefetchnta [esi+(200*64/34+192)] ; start reading ahead (~568 bytes)
!movq mm0, [esi+0] ; read 64 bits
!movq mm1, [esi+8]
!movq [edi+0], mm0 ; write 64 bits
!movq [edi+8], mm1 ; note: the normal !movq writes the
!movq mm2, [esi+16] ; data to cache; a cache line will be
!movq mm3, [esi+24] ; allocated as needed, to store the data
!movq [edi+16], mm2
!movq [edi+24], mm3
!movq mm0, [esi+32]
!movq mm1, [esi+40]
!movq [edi+32], mm0
!movq [edi+40], mm1
!movq mm2, [esi+48]
!movq mm3, [esi+56]
!movq [edi+48], mm2
!movq [edi+56], mm3
!ADD esi, 64 ; update source pointer
!ADD edi, 64 ; update destination pointer
!DEC ecx ; count down
!JNZ l_memcpy_ic_1 ; last 64-byte block?
!l_memcpy_ic_2:
!MOV ecx, ebx ; has valid low 6 bits of the byte count
!l_memcpy_ic_3:
!SHR ecx, 2 ; dword count
!And ecx, 15 ; %1111 ; only look at the "remainder" bits
!NEG ecx ; set up to jump into the array
!ADD ecx, l_memcpy_last_few
!JMP ecx ; jump to array of movsd's
!l_memcpy_uc_test:
!CMP ecx, 3152 ; big enough? use block prefetch copy (197K/64)
!JAE l_memcpy_bp_1
!l_memcpy_64_test:
!Or ecx, ecx ; tail end of block prefetch will jump here
!JZ l_memcpy_ic_2 ; no more 64-byte blocks left
!l_memcpy_uc_1: ; 64-byte blocks, uncached copy
!prefetchnta [esi+(200*64/34+192)] ; start reading ahead
!movq mm0, [esi+0] ; read 64 bits
!ADD edi, 64 ; update destination pointer
!movq mm1, [esi+8]
!ADD esi, 64 ; update source pointer
!movq mm2, [esi-48]
!movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
!movq mm0, [esi-40] ; note: !movntq also prevents the CPU
!movntq [edi-56], mm1 ; from READING the destination address
!movq mm1, [esi-32] ; into the cache, only to be over-written
!movntq [edi-48], mm2 ; so that also helps performance
!movq mm2, [esi-24]
!movntq [edi-40], mm0
!movq mm0, [esi-16]
!movntq [edi-32], mm1
!movq mm1, [esi-8]
!movntq [edi-24], mm2
!movntq [edi-16], mm0
!DEC ecx
!movntq [edi-8], mm1
!JNZ l_memcpy_uc_1 ; last 64-byte block?
!JMP l_memcpy_ic_2 ; almost done
!l_memcpy_bp_1: ; large blocks, block prefetch copy
!CMP ecx, 128 ; big enough to run another prefetch loop?
!JL l_memcpy_64_test ; no, back to regular uncached copy
!MOV eax, 64 ; block prefetch loop, unrolled 2X
!ADD esi, 8192 ; move to the top of the block
;!ALIGN 16
!l_memcpy_bp_2:
!MOV edx, [esi-64] ; grab one address per cache line
!MOV edx, [esi-128] ; grab one address per cache line
!SUB esi, 128 ; go reverse order
!DEC eax ; count down the cache lines
!JNZ l_memcpy_bp_2 ; keep grabbing more lines into cache
!MOV eax, 128 ; now that it's in cache, do the copy
;!ALIGN 16
!l_memcpy_bp_3:
!movq mm0, [esi] ; read 64 bits
!movq mm1, [esi+ 8]
!movq mm2, [esi+16]
!movq mm3, [esi+24]
!movq mm4, [esi+32]
!movq mm5, [esi+40]
!movq mm6, [esi+48]
!movq mm7, [esi+56]
!ADD esi, 64 ; update source pointer
!movntq [edi], mm0 ; write 64 bits, bypassing cache
!movntq [edi+ 8], mm1 ; note: !movntq also prevents the CPU
!movntq [edi+16], mm2 ; from READING the destination address
!movntq [edi+24], mm3 ; into the cache, only to be over-written,
!movntq [edi+32], mm4 ; so that also helps performance
!movntq [edi+40], mm5
!movntq [edi+48], mm6
!movntq [edi+56], mm7
!ADD edi, 64 ; update dest pointer
!DEC eax ; count down
!JNZ l_memcpy_bp_3 ; keep copying
!SUB ecx, 128 ; update the 64-byte block count
!JMP l_memcpy_bp_1 ; keep processing chunks
; The smallest copies use the x86 movsd instruction in an unrolled loop:
; the computed JMP above lands partway into this array, then the last
; few odd bytes are handled below.
!ALIGN 4
!movsd
!movsd ; perform last 1-15 dword copies
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!l_memcpy_last_few: ; dword aligned from before the movsd's
!MOV ecx, ebx ; has valid low 2 bits of the byte count
!And ecx, 3 ; %11 ; the last few cows must come home
!JZ l_memcpy_final ; no more, let's leave
!REP movsb ; the last 1, 2, or 3 bytes
!l_memcpy_final:
!emms ; empty the MMX state
!sfence ; flush the write-combining buffers
!POP ebx ; restore the registers PureBasic expects preserved
!POP edi
!POP esi
EndProcedure
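A quick way to sanity-check the routine (a minimal sketch: the 1 MB size,
the Random() fill and the byte-wise compare are arbitrary test choices,
not part of the original code):

#TestSize = 1024*1024
*a = AllocateMemory(#TestSize)
*b = AllocateMemory(#TestSize)
For i = 0 To #TestSize-1
  PokeB(*a+i, Random(255)) ; fill the source with random bytes
Next
CopyMemoryAMD(*a, *b, #TestSize)
ok = 1
For i = 0 To #TestSize-1
  If PeekB(*a+i) <> PeekB(*b+i)
    ok = 0
    Break
  EndIf
Next
Debug "copy verified: " + Str(ok) ; expect 1
FreeMemory(*a)
FreeMemory(*b)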
; Mirrors the header PureBasic keeps in front of every linked-list
; element on 32-bit x86 (undocumented internals: next pointer, then
; prev pointer, then the element data).
Structure LL_Pointer
  next_.l
  prev_.l
EndStructure
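The structure above encodes an assumption about PureBasic's internal,
undocumented list layout on 32-bit x86: every element is preceded by an
8-byte header whose next/prev fields point at the headers of the
neighbouring elements. A rough check of that assumption on your compiler
version:

NewList chk.l()
AddElement(chk())
*first = @chk() ; @ returns the address of the element's data
AddElement(chk())
*second = @chk()
; if the layout holds, both Debug lines print the same address
Debug PeekL(*first - SizeOf(LL_Pointer)) ; next_ of the first element
Debug *second - SizeOf(LL_Pointer)       ; header address of the second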
Procedure GiveMeLL(*LinkedList, Element, *Buffer, StructSize)
  If *LinkedList And *Buffer
    Protected LL.LL_Pointer, k
    *LinkedList - SizeOf(LL_Pointer) ; step back onto the hidden element header
    CopyMemoryAMD(*LinkedList, @LL, SizeOf(LL_Pointer))
    While LL\prev_ <> 0 ; rewind to the first element
      *LinkedList = LL\prev_
      CopyMemoryAMD(*LinkedList, @LL, SizeOf(LL_Pointer))
    Wend
    For k = 1 To Element ; walk forward; Element is a zero-based index
      If LL\next_ = 0
        Break
      EndIf
      *LinkedList = LL\next_
      CopyMemoryAMD(*LinkedList, @LL, SizeOf(LL_Pointer))
    Next
    ; shallow copy: string fields copy only the pointer, not the characters
    CopyMemoryAMD(*LinkedList + SizeOf(LL_Pointer), *Buffer, StructSize)
    ProcedureReturn 1
  EndIf
  ProcedureReturn 0
EndProcedure
Structure TEST_
a.l
b.b
c.s
EndStructure
NewList Testing.TEST_()
For k=0 To 10
AddElement(Testing())
Testing()\a = k
Testing()\b = 100-k
Testing()\c = "I am "+Str(k)
Debug Testing()\c
Debug "---"
Next
FirstElement(Testing()) ; not required: GiveMeLL rewinds the list itself
Debug "We fetch the element at index 5 (zero-based):"
Value.TEST_
GiveMeLL(@Testing(), 5, @Value, SizeOf(TEST_))
Debug Value\a
Debug Value\b
Debug Value\c
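Given the zero-based walk in GiveMeLL, the three Debug lines above should
print 5, 95 and "I am 5".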