Page 4 sur 4

Publié : lun. 02/oct./2006 20:03
par djes
Je me suis amusé un peu avec la fonction de copie. Je me suis rendu compte aussi que le test était foireux à cause du ... flipbuffers()!!! Personne ne me l'aurait dit, bien sûr!

Je balance le code comme ça, il doit rester plein d'erreurs, et ce n'est pas mis en forme (surtout que moi j'utilise des tabs et que le forum les remplace par des espaces! ARGH!). C'est juste pour les amateurs ;)

Si j'ai le temps un de ces 4, je ferai un test purement assembleur (ça ne sert pas à grand-chose à mon avis, car en assembleur chaque effet peut être optimisé différemment, et poker un point à la fois c'est ridicule !), et un autre en OpenGL.

Code : Tout sélectionner

;Original code by CplBator

; Descriptor of a custom drawing output; FastImageOutput() fills one in and returns it
; so that it can be handed to StartDrawing().
; NOTE(review): the field order presumably mirrors PureBasic's internal drawing-output
; layout - do not reorder or resize fields without confirming.
Structure DrawingInfoStruct
  Type.l               ; output type; FastImageOutput() sets 7 ("allows direct memory access")
  Window.l             ; not written anywhere in this file
  DC.l                 ; memory DC the image is selected into (0 once released)
  ReleaseProcedure.l   ; address of the procedure that frees the DC
  PixelBuffer.l        ; address of the first pixel row handed to the drawing code
  Pitch.l              ; bytes from one row to the next (negative in FastImageOutput())
  Width.l              ; image width in pixels
  Height.l             ; image height in pixels
  Depth.l              ; bits per pixel (16-bit images are reported as 15)
  PixelFormat.l        ; one of the #PB_PixelFormat_* constants
  StopDirectAccess.l   ; address of ___StopDirectAccess()
  StartDirectAccess.l  ; address of ___StartDirectAccess()
EndStructure

; Single shared descriptor: only one fast image output can be active at a time.
Global FastImgOutputID.DrawingInfoStruct

;By Rescator
; Returns the high-resolution performance counter converted to milliseconds.
; The counter frequency is queried once and cached as "ticks per millisecond".
Procedure.l Ticks_HQ()
  Static ticksPerMs.q
  Protected now.q

  If ticksPerMs = 0
    ; First call: fetch the counter frequency (ticks per second) and scale it to ms
    QueryPerformanceFrequency_(@ticksPerMs)
    ticksPerMs / 1000
  EndIf

  QueryPerformanceCounter_(@now)
  ProcedureReturn now / ticksPerMs
EndProcedure

; Release callback registered by FastImageOutput() in FastImgOutputID\ReleaseProcedure:
; destroys the memory DC created for the image, if there is one.
Procedure ___ReleaseFastImageOutput()
  If FastImgOutputID\DC
    DeleteDC_(FastImgOutputID\DC) ; free the created memory DC
    FastImgOutputID\DC = 0        ; remember that it is gone so it is not deleted twice
  EndIf
EndProcedure
; Callback registered by FastImageOutput() in FastImgOutputID\StopDirectAccess.
; Returns the memory DC the image is selected into.
; NOTE(review): presumably invoked by the drawing library when it switches from direct
; pixel access back to GDI drawing - confirm against the PureBasic internals.
Procedure ___StopDirectAccess()
  ProcedureReturn FastImgOutputID\DC
EndProcedure
; Callback registered by FastImageOutput() in FastImgOutputID\StartDirectAccess.
; Returns the address of the pixel buffer stored in the shared descriptor.
Procedure ___StartDirectAccess()
  GetPixel_(FastImgOutputID\DC,0,0) ; make sure all GDI operations are finished
  ProcedureReturn FastImgOutputID\PixelBuffer
EndProcedure
; FastImageOutput() provides faster pixel access for 32-, 24- and 15-bit images (DIB sections).
; However, for now only Plot(x,y,color) works faster. (Point(x,y) seems not to be optimized for
; direct memory access at the moment. You can use the PointFast() command from the E2D Userlib
; to get a faster point command.)
;
; Image  : PureBasic image number.
; Returns: the address of the shared FastImgOutputID descriptor, to be passed to StartDrawing(),
;          or 0 if the image is not a supported DIB section or the memory DC cannot be set up.
Procedure FastImageOutput(Image)
  Protected ds.DIBSECTION, hBitmap.l, MemDC.l

  hBitmap = ImageID(Image)
  If GetObject_(hBitmap,SizeOf(DIBSECTION),@ds)=0
    ProcedureReturn 0 ; not a valid bitmap object
  EndIf
  If ds\dsBm\bmBits=0
    ProcedureReturn 0 ; no DIBSECTION: the pixels are not directly addressable
  EndIf

  FastImgOutputID\Type=7 ; allows direct memory access
  FastImgOutputID\ReleaseProcedure=@___ReleaseFastImageOutput()
  ; The DIB is stored bottom-up: start at the last stored row and walk backwards with a
  ; negative pitch, so that y=0 addresses the top row of the picture.
  FastImgOutputID\PixelBuffer=ds\dsBm\bmBits+ds\dsBm\bmWidthBytes*(ds\dsBm\bmHeight-1)
  FastImgOutputID\Pitch=-ds\dsBm\bmWidthBytes
  FastImgOutputID\Width=ds\dsBm\bmWidth
  FastImgOutputID\Height=ds\dsBm\bmHeight
  FastImgOutputID\Depth=ds\dsBm\bmBitsPixel

  Select FastImgOutputID\Depth
    Case 32
      FastImgOutputID\PixelFormat=#PB_PixelFormat_32Bits_BGR
    Case 24
      FastImgOutputID\PixelFormat=#PB_PixelFormat_24Bits_BGR
    Case 16
      ; 16-bit DIB sections are handled as 15-bit (5-5-5) pixels
      FastImgOutputID\Depth=15
      FastImgOutputID\PixelFormat=#PB_PixelFormat_15Bits
    Default
      ProcedureReturn 0 ; only 32-, 24- and 15-bit DIB sections are supported
  EndSelect

  MemDC=CreateCompatibleDC_(0)
  If MemDC=0
    ProcedureReturn 0 ; the memory DC cannot be created
  EndIf
  If SelectObject_(MemDC,hBitmap)=0
    DeleteDC_(MemDC) ; the bitmap cannot be selected: do not leak the DC
    ProcedureReturn 0
  EndIf
  FastImgOutputID\DC=MemDC

  FastImgOutputID\StopDirectAccess=@___StopDirectAccess()
  FastImgOutputID\StartDirectAccess=@___StartDirectAccess()
  ProcedureReturn FastImgOutputID
EndProcedure


; ---------------------------------------------------------------------------
; Main program: opens an 800x600x32 full screen, runs every benchmark once and
; reports the elapsed time of each one (in ms) in a message box.
; A benchmark that could not run reports a negative time.
; ---------------------------------------------------------------------------
If InitSprite() = 0
  MessageRequester("","InitSprite() failed: the sprite environment is not available")
  End
EndIf
If OpenScreen(800,600,32,"PEEK AND POKE VS POINTER  :D ") = 0
  MessageRequester("","OpenScreen() failed: cannot open an 800x600x32 screen")
  End
EndIf

; 32-bit pixel accessed through a structured pointer
Structure Pixel
  Pixel.l
EndStructure
Global *Ptr.Pixel, *Poke.l

Declare PointerDraw()
Declare PokeDraw()
;Declare PtrDrawMemCopy()
Declare SpritePointerDraw()
Declare SpritePointerDrawFX()
Declare SpritePokeDraw()
Declare FastImg()

PTR.l = PointerDraw()
PKE.l = PokeDraw()

; PtrDrawMemCopy is a Gosub subroutine (not a procedure): it leaves its timing in TA/TB
Gosub PtrDrawMemCopy
PDC.l = TB-TA
SPR.l = SpritePointerDraw()
SPX.l = SpritePointerDrawFX()
SPD.l = SpritePokeDraw()
FIM.l = FastImg()

CloseScreen()

Report.s = "POINTEUR = "+Str(PTR)+"ms"+Chr(10)
Report + "POKE = "+Str(PKE)+"ms"+Chr(10)
Report + "MEMCOPY = "+Str(PDC)+"ms"+Chr(10)
Report + "SPRPTR = "+Str(SPR)+"ms"+Chr(10)
Report + "SPRPTRFX = "+Str(SPX)+"ms"+Chr(10)
Report + "SPRPOKDR = "+Str(SPD)+"ms"+Chr(10)
Report + "FASTIMG = "+Str(FIM)+"ms"
MessageRequester("",Report)
End

; Benchmark: fills the 800x600 screen buffer 300 times by writing 32-bit pixels through
; the global structured pointer *Ptr.
; The drawing buffer is only valid between StartDrawing() and StopDrawing(), and the back
; buffer changes at every FlipBuffers(), so its address and pitch are fetched every frame.
; Assumes a 32-bit screen (4 bytes per pixel), as requested by OpenScreen().
; Returns the elapsed time in ms, or -1 if the screen buffer cannot be accessed.
Procedure PointerDraw()
   Protected TA.l, TB.l
   Protected rowGap.l, v.l, i.l, x.l, y.l

   TA=Ticks_HQ()
   For i = 1 To 300
      If StartDrawing(ScreenOutput()) = 0
         ProcedureReturn -1
      EndIf
      *Ptr   = DrawingBuffer()
      rowGap = DrawingBufferPitch()-800*4 ; padding bytes at the end of each row
      For y = 0 To 600-1
         v = RGB(0,y*i,0)
         For x = 0 To 800-1
            *Ptr\Pixel = v
            *Ptr + 4
         Next
         *Ptr + rowGap
      Next
      StopDrawing()
      FlipBuffers(0)
   Next i
   TB=Ticks_HQ()

   ProcedureReturn TB-TA

EndProcedure

; Benchmark: fills the 800x600 screen buffer 300 times by writing 32-bit pixels with PokeL()
; at the address held in the global *Poke.
; The drawing buffer is only valid between StartDrawing() and StopDrawing(), and the back
; buffer changes at every FlipBuffers(), so its address and pitch are fetched every frame.
; Assumes a 32-bit screen (4 bytes per pixel), as requested by OpenScreen().
; Returns the elapsed time in ms, or -1 if the screen buffer cannot be accessed.
Procedure PokeDraw()
   Protected TA.l, TB.l
   Protected rowGap.l, v.l, i.l, x.l, y.l

   TA=Ticks_HQ()
   For i = 1 To 300
      If StartDrawing(ScreenOutput()) = 0
         ProcedureReturn -1
      EndIf
      *Poke  = DrawingBuffer()
      rowGap = DrawingBufferPitch()-800*4 ; padding bytes at the end of each row
      For y = 0 To 600-1
         v = RGB(y*i,0,0)
         For x = 0 To 800-1
            PokeL(*Poke,v)
            *Poke + 4
         Next
         *Poke + rowGap
      Next
      StopDrawing()
      FlipBuffers(0)
   Next i
   TB=Ticks_HQ()

   ProcedureReturn TB-TA

EndProcedure

; Subroutine (call it with Gosub): fills an off-screen buffer through the *Ptr pointer, then
; copies the whole buffer to the screen with the CopyMemoryAMD subroutine, 300 times.
; CopyMemoryAMD takes its arguments from the main-scope variables *src, *dst and len.
; Out: TA / TB = start / end time in ms. If the benchmark cannot run, TA=0 and TB=-1 so
;      that TB-TA reports -1.
; Assumes a 32-bit screen (4 bytes per pixel), as requested by OpenScreen().
PtrDrawMemCopy:

   TA = 0
   TB = -1

   ; The screen pitch is needed up front to size the off-screen buffer
   DBufferP = 0
   If StartDrawing(ScreenOutput())
      DBufferP = DrawingBufferPitch()
      StopDrawing()
   EndIf

   diff.l = DBufferP-800*4 ; padding bytes at the end of each row
   len    = DBufferP*600   ; size of the copy, in bytes
   *src   = 0
   If len > 0
      *src = AllocateMemory(len)
   EndIf

   If *src<>0
      ; Jump over the section directive, which only serves to align the loop on 256 bytes
      Goto BCL
      !section '.text' code readable executable align 256
      BCL:
      TA=Ticks_HQ()
      For i = 1 To 300
         *Ptr = *src
         For y = 0 To 600-1
            v.l=RGB(y*i,0,0)
            For x = 0 To 800-1
               *Ptr\Pixel = v
               *Ptr + 4
            Next
            *Ptr + diff
         Next
         ; The screen buffer is only valid between StartDrawing() and StopDrawing() and
         ; changes at every FlipBuffers(): fetch the destination for each frame.
         If StartDrawing(ScreenOutput())
            *dst = DrawingBuffer()
            Gosub CopyMemoryAMD ;(*src, *dst, len)
            StopDrawing()
         EndIf
         FlipBuffers(0)
      Next i
      TB=Ticks_HQ()

      FreeMemory(*src)
   EndIf
   Return

; Benchmark: fills an 800x600 system-memory sprite 300 times through the global structured
; pointer *Ptr, displaying the sprite and flipping after each fill.
; The sprite buffer is only accessed between StartDrawing() and StopDrawing(), and the
; sprite is freed before returning.
; Assumes 32-bit pixels (4 bytes per pixel), as requested by OpenScreen().
; Returns the elapsed time in ms, or -1 if the sprite cannot be created or accessed.
Procedure SpritePointerDraw()
   Protected TA.l, TB.l
   Protected mem_spr.l, rowGap.l, v.l, i.l, x.l, y.l

   mem_spr = CreateSprite(#PB_Any,800,600,#PB_Sprite_Memory|#PB_Sprite_Alpha)
   If mem_spr = 0
      ProcedureReturn -1
   EndIf

   TA=Ticks_HQ()
   For i = 1 To 300
      If StartDrawing(SpriteOutput(mem_spr)) = 0
         FreeSprite(mem_spr)
         ProcedureReturn -1
      EndIf
      *Ptr   = DrawingBuffer()
      rowGap = DrawingBufferPitch()-800*4 ; padding bytes at the end of each row
      For y = 0 To 600-1
         v = RGB(0,y*i,0)
         For x = 0 To 800-1
            *Ptr\Pixel = v
            *Ptr + 4
         Next
         *Ptr + rowGap
      Next
      StopDrawing()
      DisplaySprite(mem_spr,0,0)
      FlipBuffers(0)
   Next i
   TB=Ticks_HQ()

   FreeSprite(mem_spr)
   ProcedureReturn TB-TA

EndProcedure

; Benchmark: same as SpritePointerDraw(), but the sprite is displayed inside a
; StartSpecialFX()/StopSpecialFX() block.
; The sprite buffer is only accessed between StartDrawing() and StopDrawing(), and the
; sprite is freed before returning.
; NOTE(review): the special FX block is opened and closed for every frame, before
; FlipBuffers(), instead of once around the whole loop - confirm this matches the
; intended measurement.
; Assumes 32-bit pixels (4 bytes per pixel), as requested by OpenScreen().
; Returns the elapsed time in ms, or -1 if the sprite cannot be created or accessed.
Procedure SpritePointerDrawFX()
   Protected TA.l, TB.l
   Protected mem_spr.l, rowGap.l, v.l, i.l, x.l, y.l

   mem_spr = CreateSprite(#PB_Any,800,600,#PB_Sprite_Memory|#PB_Sprite_Alpha)
   If mem_spr = 0
      ProcedureReturn -1
   EndIf

   TA=Ticks_HQ()
   For i = 1 To 300
      If StartDrawing(SpriteOutput(mem_spr)) = 0
         FreeSprite(mem_spr)
         ProcedureReturn -1
      EndIf
      *Ptr   = DrawingBuffer()
      rowGap = DrawingBufferPitch()-800*4 ; padding bytes at the end of each row
      For y = 0 To 600-1
         v = RGB(0,0,y*i)
         For x = 0 To 800-1
            *Ptr\Pixel = v
            *Ptr + 4
         Next
         *Ptr + rowGap
      Next
      StopDrawing()

      StartSpecialFX()
      DisplaySprite(mem_spr,0,0)
      StopSpecialFX()
      FlipBuffers(0)
   Next i
   TB=Ticks_HQ()

   FreeSprite(mem_spr)
   ProcedureReturn TB-TA

EndProcedure

; Benchmark: fills an 800x600 system-memory sprite 300 times with PokeL() at the address
; held in the global *Poke, displaying the sprite and flipping after each fill.
; The sprite buffer is only accessed between StartDrawing() and StopDrawing(), and the
; sprite is freed before returning.
; Assumes 32-bit pixels (4 bytes per pixel), as requested by OpenScreen().
; Returns the elapsed time in ms, or -1 if the sprite cannot be created or accessed.
Procedure SpritePokeDraw()
   Protected TA.l, TB.l
   Protected mem_spr.l, rowGap.l, v.l, i.l, x.l, y.l

   mem_spr = CreateSprite(#PB_Any,800,600,#PB_Sprite_Memory|#PB_Sprite_Alpha)
   If mem_spr = 0
      ProcedureReturn -1
   EndIf

   TA=Ticks_HQ()
   For i = 1 To 300
      If StartDrawing(SpriteOutput(mem_spr)) = 0
         FreeSprite(mem_spr)
         ProcedureReturn -1
      EndIf
      *Poke  = DrawingBuffer()
      rowGap = DrawingBufferPitch()-800*4 ; padding bytes at the end of each row
      For y = 0 To 600-1
         v = RGB(y*i,0,0)
         For x = 0 To 800-1
            PokeL(*Poke,v)
            *Poke + 4
         Next
         *Poke + rowGap
      Next
      StopDrawing()
      DisplaySprite(mem_spr,0,0)
      FlipBuffers(0)
   Next i
   TB=Ticks_HQ()

   FreeSprite(mem_spr)
   ProcedureReturn TB-TA

EndProcedure

; Benchmark: plots every pixel of an 800x600 image through FastImageOutput(), then draws
; the image on the screen and flips, 300 times.
; Every StartDrawing() is paired with exactly one StopDrawing(), and image 1 is freed
; before returning.
; Returns the elapsed time in ms, or -1 if the image or its fast output cannot be created.
Procedure FastImg()
   Protected TA.l, TB.l
   Protected output.l, v.l, i.l, x.l, y.l

   If CreateImage(1,800,600,32) = 0 ; only 32bit seems to be really faster...
      ProcedureReturn -1
   EndIf

   TA=Ticks_HQ()
   For i = 1 To 300
      output = FastImageOutput(1) ; replace this by ImageOutput(1) to compare
      If output = 0
         FreeImage(1)
         ProcedureReturn -1
      EndIf
      If StartDrawing(output)
         For y = 0 To 600-1
            v = RGB(y*i,0,0)
            For x = 0 To 800-1
               Plot(x,y,v)
            Next
         Next
         StopDrawing()
      EndIf
      If StartDrawing(ScreenOutput())
         DrawImage(ImageID(1),0,0)
         StopDrawing()
      EndIf
      FlipBuffers(0)
   Next i
   TB=Ticks_HQ()

   FreeImage(1)
   ProcedureReturn TB-TA

EndProcedure

;-----------------------------------------------------------------------
; CopyMemoryAMD - MMX block copy, to be called with Gosub from the main scope.
; In   : *src = source address, *dst = destination address,
;        len  = number of BYTES to copy (main-scope variables)
; Out  : nothing
; Clobb: eax, ecx, edx, mm0-mm7, flags (ebx, esi and edi are saved and restored)
; Original note: this routine copies blocks that are multiples of 8.
; NOTE(review): the tail code at the end still copies leftover dwords and bytes.
; The computed jumps rely on the 1-byte movsb/movsd tables sitting immediately
; before the memcpy_align_done / memcpy_last_few labels.
;-----------------------------------------------------------------------
!section '.text' code readable executable align 256
CopyMemoryAMD:;(*src, *dst, len)
  #CACHEBLOCK = $80
  #CACHEBLOCKPREFETCH = #CACHEBLOCK/2
  #CACHEBLOCKTOP = #CACHEBLOCK*64
  #UNCACHED_COPY = 197*1024
  #UNCACHED_COPYPREFETCH = #UNCACHED_COPY/64
  #TINY_BLOCK_COPY = 64
  #IN_CACHE_COPY = 64*1024
  #IN_CACHE_COPYBIG = #IN_CACHE_COPY/64

  ; Only eax, ecx and edx may be freely modified: save the others
  PUSH ebx
  PUSH esi
  PUSH edi

  MOV esi, *src ; source array
  MOV edi, *dst ; destination array
  MOV ecx, len ; number of bytes to copy

  MOV  ebx, ecx  ; keep a copy of count
  CLD
  CMP  ecx, #TINY_BLOCK_COPY
  JB  l_memcpy_ic_3 ; tiny? skip mmx copy
  CMP  ecx, 32*1024  ; don't align between 32k-64k because
  JBE  l_memcpy_do_align ;  it appears to be slower
  CMP  ecx, 64*1024
  JBE  l_memcpy_align_done
  memcpy_do_align:
  MOV  ecx, 8   ; a trick that's faster than rep movsb...
  SUB  ecx, edi  ; align destination to qword
  AND  ecx, 7 ; 111b  ; get the low bits
  SUB  ebx, ecx  ; update copy count
  NEG  ecx    ; set up to jump into the array
  ADD  ecx, l_memcpy_align_done
  JMP  ecx    ; jump to array of movsb's
  !section '.text' code readable executable align 4
  !movsb
  !movsb
  !movsb
  !movsb
  !movsb
  !movsb
  !movsb
  !movsb
  memcpy_align_done:   ; destination is qword aligned (when the alignment step ran)
  MOV  ecx, ebx  ; number of bytes left to copy
  SHR  ecx, 6   ; get 64-byte block count
  JZ  l_memcpy_ic_2 ; finish the last few bytes
  CMP  ecx, #IN_CACHE_COPYBIG ; too big 4 cache? use uncached copy
  JAE  l_memcpy_uc_test

  JMP l_memcpy_ic_1 ; skip over any padding added by the alignment below
  !section '.text' code readable executable align 16
  memcpy_ic_1:   ; 64-byte block copies, in-cache copy
  !prefetchnta [esi+(200*64/34+192)]  ; start reading ahead
  !movq mm0, [esi+0] ; read 64 bits
  !movq mm1, [esi+8]
  !movq [edi+0], mm0 ; write 64 bits
  !movq [edi+8], mm1 ;    note:  the normal !movq writes the
  !movq mm2, [esi+16] ;    data to cache; a cache line will be
  !movq mm3, [esi+24] ;    allocated as needed, to store the data
  !movq [edi+16], mm2
  !movq [edi+24], mm3
  !movq mm0, [esi+32]
  !movq mm1, [esi+40]
  !movq [edi+32], mm0
  !movq [edi+40], mm1
  !movq mm2, [esi+48]
  !movq mm3, [esi+56]
  !movq [edi+48], mm2
  !movq [edi+56], mm3
  ADD  esi, 64   ; update source pointer
  ADD  edi, 64   ; update destination pointer
  DEC  ecx    ; count down
  JNZ  l_memcpy_ic_1 ; last 64-byte block?
  memcpy_ic_2:
  MOV  ecx, ebx  ; has valid low 6 bits of the byte count
  memcpy_ic_3:
  SHR  ecx, 2   ; dword count
  AND  ecx, 15 ; %1111  ; only the "remainder" dwords (0-15): the table below has 16 movsd's
  NEG  ecx    ; set up to jump into the array
  ADD  ecx, l_memcpy_last_few
  JMP  ecx    ; jump to array of movsd's
  memcpy_uc_test:
  CMP  ecx, #UNCACHED_COPYPREFETCH ; big enough? use block prefetch copy
  JAE  l_memcpy_bp_1
  memcpy_64_test:
  OR  ecx, ecx  ; tail end of block prefetch will jump here
  JZ  l_memcpy_ic_2 ; no more 64-byte blocks left

  memcpy_uc_1:    ; 64-byte blocks, uncached copy
  !prefetchnta [esi+(200*64/34+192)]  ; start reading ahead
  !movq mm0, [esi+0]  ; read 64 bits
  ADD  edi, 64   ; update destination pointer
  !movq mm1, [esi+8]
  ADD  esi, 64   ; update source pointer
  !movq mm2, [esi-48]
  !movntq [edi-64], mm0 ; write 64 bits, bypassing the cache
  !movq mm0, [esi-40] ;    note: !movntq also prevents the CPU
  !movntq [edi-56], mm1 ;    from READING the destination address
  !movq mm1, [esi-32] ;    into the cache, only to be over-written
  !movntq [edi-48], mm2 ;    so that also helps performance
  !movq mm2, [esi-24]
  !movntq [edi-40], mm0
  !movq mm0, [esi-16]
  !movntq [edi-32], mm1
  !movq mm1, [esi-8]
  !movntq [edi-24], mm2
  !movntq [edi-16], mm0
  DEC  ecx
  !movntq [edi-8], mm1
  JNZ  l_memcpy_uc_1 ; last 64-byte block?
  JMP  l_memcpy_ic_2  ; almost done

  memcpy_bp_1:   ; large blocks, block prefetch copy
  CMP  ecx, #CACHEBLOCK   ; big enough to run another prefetch loop?
  JL  l_memcpy_64_test   ; no, back to regular uncached copy
  MOV  eax, #CACHEBLOCKPREFETCH  ; block prefetch loop, unrolled 2X
  ADD  esi, #CACHEBLOCKTOP ; move to the top of the block

  JMP l_memcpy_bp_2 ; skip over any padding added by the alignment below
  !section '.text' code readable executable align 16
  memcpy_bp_2:
  MOV  edx, [esi-64]  ; grab one address per cache line
  MOV  edx, [esi-128]  ; grab one address per cache line
  SUB  esi, 128   ; go reverse order
  DEC  eax     ; count down the cache lines
  JNZ  l_memcpy_bp_2  ; keep grabbing more lines into cache
  MOV  eax, #CACHEBLOCK  ; now that it's in cache, do the copy

  JMP l_memcpy_bp_3 ; skip over any padding added by the alignment below
  !section '.text' code readable executable align 16
  memcpy_bp_3:
  !movq mm0, [esi]  ; read 64 bits
  !movq mm1, [esi+ 8]
  !movq mm2, [esi+16]
  !movq mm3, [esi+24]
  !movq mm4, [esi+32]
  !movq mm5, [esi+40]
  !movq mm6, [esi+48]
  !movq mm7, [esi+56]
  ADD  esi, 64    ; update source pointer
  !movntq [edi], mm0  ; write 64 bits, bypassing cache
  !movntq [edi+ 8], mm1  ;    note: !movntq also prevents the CPU
  !movntq [edi+16], mm2  ;    from READING the destination address
  !movntq [edi+24], mm3  ;    into the cache, only to be over-written,
  !movntq [edi+32], mm4  ;    so that also helps performance
  !movntq [edi+40], mm5
  !movntq [edi+48], mm6
  !movntq [edi+56], mm7
  ADD  edi, 64    ; update dest pointer
  DEC  eax     ; count down
  JNZ  l_memcpy_bp_3  ; keep copying
  SUB  ecx, #CACHEBLOCK  ; update the 64-byte block count
  JMP  l_memcpy_bp_1  ; keep processing chunks
  ;The smallest copy uses the X86 "!movsd" instruction, in an optimized
  ;form which is an "unrolled loop".   Then it handles the last few bytes.
  ;The table must hold exactly 16 one-byte movsd's and end at memcpy_last_few.
  !section '.text' code readable executable align 4
  !movsd
  !movsd   ; perform last 1-15 dword copies
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd   ; perform last 1-7 dword copies
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
  !movsd
  memcpy_last_few:  ; dword aligned from before !movsd's
  MOV  ecx, ebx ; has valid low 2 bits of the byte count
  AND  ecx, 3 ; %11 ; the last few cows must come home
  JZ  l_memcpy_final ; no more, let's leave
  REP  movsb  ; the last 1, 2, or 3 bytes
  memcpy_final:
  !emms    ; clean up the MMX state
  !sfence    ; flush the write buffer
  POP  edi   ; restore the saved registers, in reverse order
  POP  esi
  POP  ebx
  Return