Je balance le code tel quel : il doit rester plein d'erreurs, et il n'est pas mis en forme (d'autant que j'utilise des tabulations et que le forum les remplace par des espaces ! ARGH !). C'est juste pour les amateurs.

Si j'ai le temps un de ces quatre, je ferai un test purement en assembleur (ça ne sert pas à grand-chose à mon avis, car en assembleur chaque effet peut être optimisé différemment, et écrire un seul point à la fois, c'est ridicule !), et un autre en OpenGL.
Code : Tout sélectionner
;Original code by CplBator
; Drawing-output descriptor that FastImageOutput() fills in and returns for use with StartDrawing().
; NOTE(review): the field order/size presumably mirrors a PureBasic-internal layout - confirm
; against the targeted PureBasic version before reordering or resizing any field.
Structure DrawingInfoStruct
Type.l ; output kind; FastImageOutput() stores 7 here ("allows direct memory access")
Window.l ; never written by the code visible in this file
DC.l ; memory DC the image is selected into (0 when none is allocated)
ReleaseProcedure.l ; address of ___ReleaseFastImageOutput(), which deletes DC
PixelBuffer.l ; address of the scanline the drawing code starts from
Pitch.l ; byte step from one scanline to the next (FastImageOutput() stores a negative value)
Width.l ; image width in pixels (from the bitmap header)
Height.l ; image height in pixels (from the bitmap header)
Depth.l ; bits per pixel: 32, 24 or 15
PixelFormat.l ; one of the #PB_PixelFormat_* constants
StopDirectAccess.l ; address of ___StopDirectAccess()
StartDirectAccess.l ; address of ___StartDirectAccess()
EndStructure
; Single shared descriptor: every FastImageOutput() call overwrites it.
Global FastImgOutputID.DrawingInfoStruct
;By Rescator
; Returns a millisecond timestamp taken from the Windows high-resolution performance counter.
; Only differences between two return values are meaningful.
; When the performance counter is unavailable, or its frequency is below 1000 Hz (which would
; make the ticks-per-millisecond divisor zero), GetTickCount_() is used instead so the
; procedure never divides by zero.
Procedure.l Ticks_HQ()
  Static ticksPerMs.q ; 0 = not queried yet, -1 = no usable performance counter
  Protected now.q

  If ticksPerMs = 0
    If QueryPerformanceFrequency_(@ticksPerMs)
      ticksPerMs = ticksPerMs / 1000
    Else
      ticksPerMs = 0
    EndIf
    If ticksPerMs <= 0
      ticksPerMs = -1 ; remember the failure so the query is not repeated on every call
    EndIf
  EndIf

  If ticksPerMs < 0
    ProcedureReturn GetTickCount_()
  EndIf

  QueryPerformanceCounter_(@now)
  ProcedureReturn now / ticksPerMs
EndProcedure
; Release callback stored in FastImgOutputID\ReleaseProcedure:
; deletes the memory DC created by FastImageOutput(), if there is one, and clears the field.
Procedure ___ReleaseFastImageOutput()
  Protected hDC.l
  hDC = FastImgOutputID\DC
  If hDC <> 0
    DeleteDC_(hDC)
    FastImgOutputID\DC = 0
  EndIf
EndProcedure

; Callback stored in FastImgOutputID\StopDirectAccess: hands back the memory DC.
Procedure ___StopDirectAccess()
  Protected hDC.l
  hDC = FastImgOutputID\DC
  ProcedureReturn hDC
EndProcedure

; Callback stored in FastImgOutputID\StartDirectAccess: hands back the pixel buffer address.
Procedure ___StartDirectAccess()
  ; Reading one pixel through GDI first makes sure all pending GDI operations are finished
  ; before the caller touches the bits directly.
  GetPixel_(FastImgOutputID\DC, 0, 0)
  ProcedureReturn FastImgOutputID\PixelBuffer
EndProcedure
; FastImageOutput() provides faster pixel access for 32-, 24- and 15-bit images (DIB sections).
; For now only Plot(x,y,color) gets faster; Point(x,y) does not seem to be optimised for direct
; memory access (the PointFast() command from the E2D userlib is a faster alternative).
;
; Image   : PureBasic image number.
; Returns : the shared FastImgOutputID descriptor, to be passed to StartDrawing(), or 0 when the
;           image is not a DIB section, has an unsupported depth, or no memory DC can be created.
;           The shared descriptor is only modified when the call succeeds.
Procedure FastImageOutput(Image)
  Protected ds.DIBSECTION
  Protected MemDC.l, Depth.l, PixelFormat.l

  ; GetObject_() also succeeds for ordinary (non-DIB) bitmaps; in that case only the BITMAP part
  ; is filled in and bmBits stays null, so both conditions have to be checked.
  If GetObject_(ImageID(Image), SizeOf(DIBSECTION), @ds) = 0 Or ds\dsBm\bmBits = 0
    ProcedureReturn 0 ; no DIBSECTION
  EndIf

  ; Validate the depth before touching the shared descriptor.
  Select ds\dsBm\bmBitsPixel
    Case 32
      Depth = 32
      PixelFormat = #PB_PixelFormat_32Bits_BGR
    Case 24
      Depth = 24
      PixelFormat = #PB_PixelFormat_24Bits_BGR
    Case 16
      Depth = 15 ; 16 bpp bitmaps are reported to the drawing code as 15-bit
      PixelFormat = #PB_PixelFormat_15Bits
    Default
      ProcedureReturn 0 ; only 32-, 24- and 15-bit DIB sections are supported
  EndSelect

  ; Free a DC left over from an earlier call that was never released, so it does not leak
  ; (and no longer holds the bitmap) before a new one is created.
  ___ReleaseFastImageOutput()

  MemDC = CreateCompatibleDC_(0)
  If MemDC = 0
    ProcedureReturn 0 ; the memory DC cannot be created
  EndIf
  SelectObject_(MemDC, ImageID(Image))

  FastImgOutputID\Type = 7 ; allows direct memory access
  FastImgOutputID\ReleaseProcedure = @___ReleaseFastImageOutput()
  ; The DIB is stored bottom-up: the visually topmost row is the last one in memory, so start
  ; at the last scanline and walk backwards with a negative pitch.
  FastImgOutputID\PixelBuffer = ds\dsBm\bmBits + ds\dsBm\bmWidthBytes * (ds\dsBm\bmHeight - 1)
  FastImgOutputID\Pitch = -ds\dsBm\bmWidthBytes
  FastImgOutputID\Width = ds\dsBm\bmWidth
  FastImgOutputID\Height = ds\dsBm\bmHeight
  FastImgOutputID\Depth = Depth
  FastImgOutputID\PixelFormat = PixelFormat
  FastImgOutputID\DC = MemDC
  FastImgOutputID\StopDirectAccess = @___StopDirectAccess()
  FastImgOutputID\StartDirectAccess = @___StartDirectAccess()
  ProcedureReturn FastImgOutputID
EndProcedure
; --- Benchmark driver -------------------------------------------------------------------------
; Opens an 800x600x32 full screen, runs every drawing variant once and shows the timings (ms).

; Without a working sprite system or screen every benchmark below would write through invalid
; buffers, so stop right away when either call fails.
If InitSprite() = 0
  MessageRequester("", "InitSprite() failed")
  End
EndIf
If OpenScreen(800,600,32,"PEEK AND POKE VS POINTER :D ") = 0
  MessageRequester("", "OpenScreen(800,600,32) failed")
  End
EndIf

; One 32-bit pixel, written through the typed pointer *Ptr.
Structure Pixel
  Pixel.l
EndStructure

; Write cursors shared by the benchmark procedures and the PtrDrawMemCopy subroutine.
Global *Ptr.Pixel, *Poke.l

Declare PointerDraw()
Declare PokeDraw()
;Declare PtrDrawMemCopy()
Declare SpritePointerDraw()
Declare SpritePointerDrawFX()
Declare SpritePokeDraw()
Declare FastImg()

PTR.l = PointerDraw()
PKE.l = PokeDraw()
; PtrDrawMemCopy is a Gosub subroutine: it leaves its start/end timestamps in TA and TB.
Gosub PtrDrawMemCopy
PDC.l = TB-TA
SPR.l = SpritePointerDraw()
SPX.l = SpritePointerDrawFX()
SPD.l = SpritePokeDraw()
FIM.l = FastImg()

CloseScreen()
MessageRequester("","POINTEUR = "+Str(PTR)+"ms"+Chr(10)+"POKE = "+Str(PKE)+"ms"+Chr(10)+"MEMCOPY = "+Str(PDC)+"ms"+Chr(10)+"SPRPTR = "+Str(SPR)+"ms"+Chr(10)+"SPRPTRFX = "+Str(SPX)+"ms"+Chr(10)+"SPRPOKDR = "+Str(SPD)+"ms"+Chr(10)+"FASTIMG = "+Str(FIM)+"ms")
End
; Fills the 800x600 screen 300 times by writing every pixel through the typed pointer *Ptr.
; Returns the elapsed time in milliseconds (difference of two Ticks_HQ() values).
;
; The drawing buffer is fetched again for every frame, inside StartDrawing()/StopDrawing():
; the address is only valid while drawing is active, and FlipBuffers() swaps the buffer that
; is being drawn to.
; NOTE(review): pixels are written as 4-byte longs, which assumes the 32-bit screen opened by
; the main code.
Procedure PointerDraw()
  Protected DBuffer.l, DBufferP.l, PixelFormat.l
  Protected TA.l, TB.l
  Protected i.l, x.l, y.l, v.l, diff.l

  TA = Ticks_HQ()
  For i = 1 To 300
    If StartDrawing(ScreenOutput())
      DBuffer = DrawingBuffer()
      DBufferP = DrawingBufferPitch()
      PixelFormat = DrawingBufferPixelFormat()
      ; Turn the pixel format constant into a bytes-per-pixel count.
      Select PixelFormat
        Case #PB_PixelFormat_8Bits : PixelFormat=1
        Case #PB_PixelFormat_15Bits : PixelFormat=2
        Case #PB_PixelFormat_16Bits : PixelFormat=2
        Case #PB_PixelFormat_24Bits_RGB : PixelFormat=3
        Case #PB_PixelFormat_24Bits_BGR : PixelFormat=3
        Case #PB_PixelFormat_32Bits_RGB : PixelFormat=4
        Case #PB_PixelFormat_32Bits_BGR : PixelFormat=4
      EndSelect
      ; Padding bytes between the end of one 800-pixel row and the start of the next.
      diff = DBufferP-800*PixelFormat

      *Ptr = DBuffer
      For y = 0 To 600-1
        v = RGB(0,y*i,0)
        For x = 0 To 800-1
          *Ptr\Pixel = v
          *Ptr + 4
        Next
        *Ptr + diff
      Next
      StopDrawing()
    EndIf
    FlipBuffers(0)
  Next i
  TB = Ticks_HQ()

  ProcedureReturn TB-TA
EndProcedure
; Fills the 800x600 screen 300 times by writing every pixel with PokeL() at the cursor *Poke.
; Returns the elapsed time in milliseconds (difference of two Ticks_HQ() values).
;
; The drawing buffer is fetched again for every frame, inside StartDrawing()/StopDrawing():
; the address is only valid while drawing is active, and FlipBuffers() swaps the buffer that
; is being drawn to.
; NOTE(review): pixels are written as 4-byte longs, which assumes the 32-bit screen opened by
; the main code.
Procedure PokeDraw()
  Protected DBuffer.l, DBufferP.l, PixelFormat.l
  Protected TA.l, TB.l
  Protected i.l, x.l, y.l, v.l, diff.l

  TA = Ticks_HQ()
  For i = 1 To 300
    If StartDrawing(ScreenOutput())
      DBuffer = DrawingBuffer()
      DBufferP = DrawingBufferPitch()
      PixelFormat = DrawingBufferPixelFormat()
      ; Turn the pixel format constant into a bytes-per-pixel count.
      Select PixelFormat
        Case #PB_PixelFormat_8Bits : PixelFormat=1
        Case #PB_PixelFormat_15Bits : PixelFormat=2
        Case #PB_PixelFormat_16Bits : PixelFormat=2
        Case #PB_PixelFormat_24Bits_RGB : PixelFormat=3
        Case #PB_PixelFormat_24Bits_BGR : PixelFormat=3
        Case #PB_PixelFormat_32Bits_RGB : PixelFormat=4
        Case #PB_PixelFormat_32Bits_BGR : PixelFormat=4
      EndSelect
      ; Padding bytes between the end of one 800-pixel row and the start of the next.
      diff = DBufferP-800*PixelFormat

      *Poke = DBuffer
      For y = 0 To 600-1
        v = RGB(y*i,0,0)
        For x = 0 To 800-1
          PokeL(*Poke,v)
          *Poke + 4
        Next
        *Poke + diff
      Next
      StopDrawing()
    EndIf
    FlipBuffers(0)
  Next i
  TB = Ticks_HQ()

  ProcedureReturn TB-TA
EndProcedure
; Gosub subroutine (main scope): renders each frame into an off-screen memory block through
; *Ptr, then copies the whole block to the screen buffer with the CopyMemoryAMD routine.
; Leaves the start/end timestamps in TA and TB (both stay unchanged when the allocation fails).
; CopyMemoryAMD reads its arguments from the main-scope variables *src, *dst and len.
PtrDrawMemCopy:
StartDrawing(ScreenOutput())
*dst = DrawingBuffer()
DBufferP = DrawingBufferPitch()
PixelFormat = DrawingBufferPixelFormat()
StopDrawing()
; just for the sake of form ;)
Select PixelFormat
  Case #PB_PixelFormat_8Bits : PixelFormat=1
  Case #PB_PixelFormat_15Bits : PixelFormat=2
  Case #PB_PixelFormat_16Bits : PixelFormat=2
  Case #PB_PixelFormat_24Bits_RGB : PixelFormat=3
  Case #PB_PixelFormat_24Bits_BGR : PixelFormat=3
  Case #PB_PixelFormat_32Bits_RGB : PixelFormat=4
  Case #PB_PixelFormat_32Bits_BGR : PixelFormat=4
EndSelect
; Padding bytes between the end of one 800-pixel row and the start of the next.
diff.l = DBufferP-800*PixelFormat
; The off-screen block uses the same pitch as the screen so it can be copied in one go.
len = DBufferP*600
*src.l = AllocateMemory(len)
If *src<>0
  ; Jump over the section directive so the timed loop starts on an aligned address.
  Goto BCL
  !section '.text' code readable executable align 256
  BCL:
  TA=Ticks_HQ()
  For i = 1 To 300
    *Ptr = *src
    For y = 0 To 600-1
      v.l=RGB(y*i,0,0)
      For x = 0 To 800-1
        *Ptr\Pixel = v
        *Ptr + 4
      Next
      *Ptr + diff
    Next
    ; The screen buffer address is only valid while drawing is active and changes with every
    ; FlipBuffers(), so fetch it again for each frame before copying.
    If StartDrawing(ScreenOutput())
      *dst = DrawingBuffer()
      Gosub CopyMemoryAMD;(*src, *dst, len)
      StopDrawing()
    EndIf
    FlipBuffers(0)
  Next i
  TB=Ticks_HQ()
  ; Free the block that was actually allocated (the original freed an undefined variable).
  FreeMemory(*src)
EndIf
Return
; Fills an 800x600 system-memory sprite 300 times through the typed pointer *Ptr, displaying
; the sprite and flipping after every frame.
; Returns the elapsed time in milliseconds, or 0 when the sprite cannot be created.
;
; The sprite buffer is fetched inside StartDrawing()/StopDrawing() for every frame because the
; address is only valid while drawing is active; the sprite is freed before returning.
; NOTE(review): pixels are written as 4-byte longs, which assumes a 32-bit sprite.
Procedure SpritePointerDraw()
  Protected DBuffer.l, DBufferP.l, PixelFormat.l
  Protected TA.l, TB.l
  Protected mem_spr.l, i.l, x.l, y.l, v.l, diff.l

  mem_spr = CreateSprite(#PB_Any,800,600,#PB_Sprite_Memory|#PB_Sprite_Alpha)
  If mem_spr = 0
    ProcedureReturn 0
  EndIf

  TA = Ticks_HQ()
  For i = 1 To 300
    If StartDrawing(SpriteOutput(mem_spr))
      DBuffer = DrawingBuffer()
      DBufferP = DrawingBufferPitch()
      PixelFormat = DrawingBufferPixelFormat()
      ; Turn the pixel format constant into a bytes-per-pixel count.
      Select PixelFormat
        Case #PB_PixelFormat_8Bits : PixelFormat=1
        Case #PB_PixelFormat_15Bits : PixelFormat=2
        Case #PB_PixelFormat_16Bits : PixelFormat=2
        Case #PB_PixelFormat_24Bits_RGB : PixelFormat=3
        Case #PB_PixelFormat_24Bits_BGR : PixelFormat=3
        Case #PB_PixelFormat_32Bits_RGB : PixelFormat=4
        Case #PB_PixelFormat_32Bits_BGR : PixelFormat=4
      EndSelect
      ; Padding bytes between the end of one 800-pixel row and the start of the next.
      diff = DBufferP-800*PixelFormat

      *Ptr = DBuffer
      For y = 0 To 600-1
        v = RGB(0,y*i,0)
        For x = 0 To 800-1
          *Ptr\Pixel = v
          *Ptr + 4
        Next
        *Ptr + diff
      Next
      StopDrawing()
    EndIf
    DisplaySprite(mem_spr,0,0)
    FlipBuffers(0)
  Next i
  TB = Ticks_HQ()

  FreeSprite(mem_spr)
  ProcedureReturn TB-TA
EndProcedure
; Same as the plain sprite/pointer benchmark, but the 300 frames are displayed between
; StartSpecialFX() and StopSpecialFX().
; Returns the elapsed time in milliseconds, or 0 when the sprite cannot be created.
; The sprite is freed before returning.
;
; NOTE(review): the buffer address is obtained once, before StartSpecialFX(), and used after
; StopDrawing(); the drawing documentation says it is only valid while drawing is active.
; It is left this way because it is unclear whether StartDrawing() may be called while the
; SpecialFX mode is active - confirm and move the lookup into the frame loop if it is allowed.
; NOTE(review): pixels are written as 4-byte longs, which assumes a 32-bit sprite.
Procedure SpritePointerDrawFX()
  Protected DBuffer.l, DBufferP.l, PixelFormat.l
  Protected TA.l, TB.l
  Protected mem_spr.l, i.l, x.l, y.l, v.l, diff.l

  mem_spr = CreateSprite(#PB_Any,800,600,#PB_Sprite_Memory|#PB_Sprite_Alpha)
  If mem_spr = 0
    ProcedureReturn 0
  EndIf

  StartDrawing(SpriteOutput(mem_spr))
  DBuffer = DrawingBuffer()
  DBufferP = DrawingBufferPitch()
  PixelFormat = DrawingBufferPixelFormat()
  StopDrawing()
  ; Turn the pixel format constant into a bytes-per-pixel count.
  Select PixelFormat
    Case #PB_PixelFormat_8Bits : PixelFormat=1
    Case #PB_PixelFormat_15Bits : PixelFormat=2
    Case #PB_PixelFormat_16Bits : PixelFormat=2
    Case #PB_PixelFormat_24Bits_RGB : PixelFormat=3
    Case #PB_PixelFormat_24Bits_BGR : PixelFormat=3
    Case #PB_PixelFormat_32Bits_RGB : PixelFormat=4
    Case #PB_PixelFormat_32Bits_BGR : PixelFormat=4
  EndSelect
  ; Padding bytes between the end of one 800-pixel row and the start of the next.
  diff = DBufferP-800*PixelFormat

  StartSpecialFX()
  TA = Ticks_HQ()
  For i = 1 To 300
    *Ptr = DBuffer
    For y = 0 To 600-1
      v = RGB(0,0,y*i)
      For x = 0 To 800-1
        *Ptr\Pixel = v
        *Ptr + 4
      Next
      *Ptr + diff
    Next
    DisplaySprite(mem_spr,0,0)
    FlipBuffers(0)
  Next i
  TB = Ticks_HQ()
  StopSpecialFX()

  FreeSprite(mem_spr)
  ProcedureReturn TB-TA
EndProcedure
; Fills an 800x600 system-memory sprite 300 times with PokeL() at the cursor *Poke, displaying
; the sprite and flipping after every frame.
; Returns the elapsed time in milliseconds, or 0 when the sprite cannot be created.
;
; The sprite buffer is fetched inside StartDrawing()/StopDrawing() for every frame because the
; address is only valid while drawing is active; the sprite is freed before returning.
; NOTE(review): pixels are written as 4-byte longs, which assumes a 32-bit sprite.
Procedure SpritePokeDraw()
  Protected DBuffer.l, DBufferP.l, PixelFormat.l
  Protected TA.l, TB.l
  Protected mem_spr.l, i.l, x.l, y.l, v.l, diff.l

  mem_spr = CreateSprite(#PB_Any,800,600,#PB_Sprite_Memory|#PB_Sprite_Alpha)
  If mem_spr = 0
    ProcedureReturn 0
  EndIf

  TA = Ticks_HQ()
  For i = 1 To 300
    If StartDrawing(SpriteOutput(mem_spr))
      DBuffer = DrawingBuffer()
      DBufferP = DrawingBufferPitch()
      PixelFormat = DrawingBufferPixelFormat()
      ; Turn the pixel format constant into a bytes-per-pixel count.
      Select PixelFormat
        Case #PB_PixelFormat_8Bits : PixelFormat=1
        Case #PB_PixelFormat_15Bits : PixelFormat=2
        Case #PB_PixelFormat_16Bits : PixelFormat=2
        Case #PB_PixelFormat_24Bits_RGB : PixelFormat=3
        Case #PB_PixelFormat_24Bits_BGR : PixelFormat=3
        Case #PB_PixelFormat_32Bits_RGB : PixelFormat=4
        Case #PB_PixelFormat_32Bits_BGR : PixelFormat=4
      EndSelect
      ; Padding bytes between the end of one 800-pixel row and the start of the next.
      diff = DBufferP-800*PixelFormat

      *Poke = DBuffer
      For y = 0 To 600-1
        v = RGB(y*i,0,0)
        For x = 0 To 800-1
          PokeL(*Poke,v)
          *Poke + 4
        Next
        *Poke + diff
      Next
      StopDrawing()
    EndIf
    DisplaySprite(mem_spr,0,0)
    FlipBuffers(0)
  Next i
  TB = Ticks_HQ()

  FreeSprite(mem_spr)
  ProcedureReturn TB-TA
EndProcedure
; Fills an 800x600 32-bit image 300 times with Plot() through FastImageOutput(), drawing the
; image to the screen and flipping after every frame.
; Returns the elapsed time in milliseconds, or 0 when the image cannot be created.
Procedure FastImg()
  Protected TA.l, TB.l
  Protected i.l, x.l, y.l, v.l, Output.l

  If CreateImage(1,800,600,32) = 0 ; only 32bit seems to be really faster...
    ProcedureReturn 0
  EndIf

  TA = Ticks_HQ()
  For i = 1 To 300
    Output = FastImageOutput(1) ; replace this by ImageOutput(1) to compare with the standard output
    If Output ; 0 means the fast output is not available for this image
      If StartDrawing(Output)
        For y = 0 To 600-1
          v = RGB(y*i,0,0)
          For x = 0 To 800-1
            Plot(x,y,v)
          Next
        Next
        StopDrawing()
      EndIf
      ; Make sure the memory DC created by FastImageOutput() is gone; this is a no-op when
      ; StopDrawing() already triggered the release callback.
      ___ReleaseFastImageOutput()
    EndIf
    If StartDrawing(ScreenOutput())
      DrawImage(ImageID(1),0,0)
      StopDrawing()
    EndIf
    FlipBuffers(0)
  Next i
  TB = Ticks_HQ()

  ; The original called StopDrawing() once more here although no drawing was active.
  FreeImage(1)
  ProcedureReturn TB-TA
EndProcedure
; CopyMemoryAMD - Gosub routine that copies `len` bytes from *src to *dst using MMX.
; Arguments are taken from the main-scope variables *src, *dst and len, which the caller sets
; before the Gosub. len is a BYTE count: the code below shifts it right by 6 to get 64-byte
; blocks and copies the remaining dwords and bytes at the end.
; Strategy, chosen from the size:
;   - fewer than #TINY_BLOCK_COPY bytes : plain movsd/movsb tail only
;   - fits in cache                     : 64-byte MMX blocks (memcpy_ic_1)
;   - larger                            : MMX blocks with non-temporal stores (memcpy_uc_1)
;   - very large                        : block prefetch + non-temporal stores (memcpy_bp_*)
; ebx, esi and edi are saved on entry and restored before Return, since only eax, ecx and edx
; may be freely modified by inline assembly.
!section '.text' code readable executable align 256
CopyMemoryAMD:;(*src, *dst, len)
#CACHEBLOCK = $80
#CACHEBLOCKPREFETCH = #CACHEBLOCK/2
#CACHEBLOCKTOP = #CACHEBLOCK*64
#UNCACHED_COPY = 197*1024
#UNCACHED_COPYPREFETCH = #UNCACHED_COPY/64
#TINY_BLOCK_COPY = 64
#IN_CACHE_COPY = 64*1024
#IN_CACHE_COPYBIG = #IN_CACHE_COPY/64
PUSH ebx ; preserve the non-volatile registers used below
PUSH esi
PUSH edi
MOV esi, *src ; source array
MOV edi, *dst ; destination array
MOV ecx, len ; number of bytes to copy
MOV ebx, ecx ; keep a copy of count
CLD
CMP ecx, #TINY_BLOCK_COPY
JB l_memcpy_ic_3 ; tiny? skip mmx copy
CMP ecx, 32*1024 ; don't align between 32k-64k because
JBE l_memcpy_do_align ; it appears to be slower
CMP ecx, 64*1024
JBE l_memcpy_align_done
memcpy_do_align:
MOV ecx, 8 ; a trick that's faster than rep movsb...
SUB ecx, edi ; align destination to qword
AND ecx, 7 ; 111b ; get the low bits
SUB ebx, ecx ; update copy count
NEG ecx ; set up to jump into the array
ADD ecx, l_memcpy_align_done
JMP ecx ; jump to array of movsb's
!section '.text' code readable executable align 4
!movsb
!movsb
!movsb
!movsb
!movsb
!movsb
!movsb
!movsb
memcpy_align_done: ; destination is qword aligned
MOV ecx, ebx ; number of bytes left to copy
SHR ecx, 6 ; get 64-byte block count
JZ l_memcpy_ic_2 ; finish the last few bytes
CMP ecx, #IN_CACHE_COPYBIG ; too big 4 cache? use uncached copy
JAE l_memcpy_uc_test
JMP l_memcpy_ic_1
!section '.text' code readable executable align 16
memcpy_ic_1: ; 64-byte block copies, in-cache copy
!prefetchnta [esi+(200*64/34+192)] ; start reading ahead
!movq mm0, [esi+0] ; read 64 bits
!movq mm1, [esi+8]
!movq [edi+0], mm0 ; write 64 bits
!movq [edi+8], mm1 ; note: the normal !movq writes the
!movq mm2, [esi+16] ; data to cache; a cache line will be
!movq mm3, [esi+24] ; allocated as needed, to store the data
!movq [edi+16], mm2
!movq [edi+24], mm3
!movq mm0, [esi+32]
!movq mm1, [esi+40]
!movq [edi+32], mm0
!movq [edi+40], mm1
!movq mm2, [esi+48]
!movq mm3, [esi+56]
!movq [edi+48], mm2
!movq [edi+56], mm3
ADD esi, 64 ; update source pointer
ADD edi, 64 ; update destination pointer
DEC ecx ; count down
JNZ l_memcpy_ic_1 ; last 64-byte block?
memcpy_ic_2:
MOV ecx, ebx ; has valid low 6 bits of the byte count
memcpy_ic_3:
SHR ecx, 2 ; dword count
; Only 0..15 dwords can remain after the 64-byte blocks, and the movsd table below has exactly
; 16 one-byte entries: the mask must be 15 (a mask of 31 could jump in front of the table).
AND ecx, 15 ; %1111 ; only look at the "remainder" bits
NEG ecx ; set up to jump into the array
ADD ecx, l_memcpy_last_few
JMP ecx ; jump to array of movsd's
memcpy_uc_test:
CMP ecx, #UNCACHED_COPYPREFETCH ; big enough? use block prefetch copy
JAE l_memcpy_bp_1
memcpy_64_test:
OR ecx, ecx ; tail end of block prefetch will jump here
JZ l_memcpy_ic_2 ; no more 64-byte blocks left
memcpy_uc_1: ; 64-byte blocks, uncached copy
!prefetchnta [esi+(200*64/34+192)] ; start reading ahead
!movq mm0, [esi+0] ; read 64 bits
ADD edi, 64 ; update destination pointer
!movq mm1, [esi+8]
ADD esi, 64 ; update source pointer
!movq mm2, [esi-48]
!movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
!movq mm0, [esi-40] ; note: !movntq also prevents the CPU
!movntq [edi-56], mm1 ; from READING the destination address
!movq mm1, [esi-32] ; into the cache, only to be over-written
!movntq [edi-48], mm2 ; so that also helps performance
!movq mm2, [esi-24]
!movntq [edi-40], mm0
!movq mm0, [esi-16]
!movntq [edi-32], mm1
!movq mm1, [esi-8]
!movntq [edi-24], mm2
!movntq [edi-16], mm0
DEC ecx
!movntq [edi-8], mm1
JNZ l_memcpy_uc_1 ; last 64-byte block?
JMP l_memcpy_ic_2 ; almost done
memcpy_bp_1: ; large blocks, block prefetch copy
CMP ecx, #CACHEBLOCK ; big enough to run another prefetch loop?
JL l_memcpy_64_test ; no, back to regular uncached copy
MOV eax, #CACHEBLOCKPREFETCH ; block prefetch loop, unrolled 2X
ADD esi, #CACHEBLOCKTOP ; move to the top of the block
JMP l_memcpy_bp_2
!section '.text' code readable executable align 16
memcpy_bp_2:
MOV edx, [esi-64] ; grab one address per cache line
MOV edx, [esi-128] ; grab one address per cache line
SUB esi, 128 ; go reverse order
DEC eax ; count down the cache lines
JNZ l_memcpy_bp_2 ; keep grabbing more lines into cache
MOV eax, #CACHEBLOCK ; now that it's in cache, do the copy
JMP l_memcpy_bp_3
!section '.text' code readable executable align 16
memcpy_bp_3:
!movq mm0, [esi] ; read 64 bits
!movq mm1, [esi+ 8]
!movq mm2, [esi+16]
!movq mm3, [esi+24]
!movq mm4, [esi+32]
!movq mm5, [esi+40]
!movq mm6, [esi+48]
!movq mm7, [esi+56]
ADD esi, 64 ; update source pointer
!movntq [edi], mm0 ; write 64 bits, bypassing cache
!movntq [edi+ 8], mm1 ; note: !movntq also prevents the CPU
!movntq [edi+16], mm2 ; from READING the destination address
!movntq [edi+24], mm3 ; into the cache, only to be over-written,
!movntq [edi+32], mm4 ; so that also helps performance
!movntq [edi+40], mm5
!movntq [edi+48], mm6
!movntq [edi+56], mm7
ADD edi, 64 ; update dest pointer
DEC eax ; count down
JNZ l_memcpy_bp_3 ; keep copying
SUB ecx, #CACHEBLOCK ; update the 64-byte block count
JMP l_memcpy_bp_1 ; keep processing chunks
;The smallest copy uses the X86 "!movsd" instruction, in an optimized
;form which is an "unrolled loop". Then it handles the last few bytes.
!section '.text' code readable executable align 4
!movsd
!movsd ; perform last 1-15 dword copies
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd ; perform last 1-7 dword copies
!movsd
!movsd
!movsd
!movsd
!movsd
!movsd
memcpy_last_few: ; dword aligned from before !movsd's
MOV ecx, ebx ; has valid low 2 bits of the byte count
AND ecx, 3 ; %11 ; the last few cows must come home
JZ l_memcpy_final ; no more, let's leave
REP movsb ; the last 1, 2, or 3 bytes
memcpy_final:
!emms ; clean up the state
!sfence ; flush the write buffer
POP edi ; restore the registers saved on entry
POP esi
POP ebx
Return