Page 1 of 1

SSE/SSE2/SSE3/SSSE3 ASM - Optimization

Posted: Sun Jun 28, 2009 10:18 am
by cxAlex
Why doesn't PB use any of the SSE instruction sets? They could speed up PB executables enormously.

For example, the lib used in this wrapper for CopyMemory() switches at runtime between code paths for SSE through SSSE3, and here (with SSSE3) it is more than 10 times faster than PB.

http://www.purebasic.fr/german/viewtopic.php?t=20335

Posted: Sun Jun 28, 2009 11:56 am
by eriansa
I wonder how this lib gets such results.

Following is a benchmark...
Only "SSE-Aligned" is faster here....

Code: Select all

Procedure MyCopyMemory(In,Out,length)
; Copies 'length' blocks of 16 bytes each from In to Out using unaligned
; SSE moves (MOVUPS), so neither pointer has to be 16-byte aligned.
;   In     - source address
;   Out    - destination address
;   length - number of 16-byte blocks (NOT bytes); values <= 0 copy nothing
; 32-bit x86 inline assembly only.
!PUSH Edi
!PUSH Ebx
; -------
; The +8 on each parameter access presumably compensates for the two PUSHes
; above, which moved ESP by 8 bytes - confirm against the PB inline ASM docs.
; NOTE(review): Ebx is saved/restored although nothing below touches it; it
; is kept so the +8 offsets stay unchanged.
!MOV Eax,[p.v_In+8]
!MOV Edi,[p.v_Out+8]
!MOV Ecx,[p.v_length+8]
; LOOP decrements Ecx *before* testing it, so a count of 0 would wrap around
; and run ~2^32 iterations. Bail out early for zero or negative counts.
!TEST Ecx,Ecx
!JLE .Done_001
!.While_001:
!movups xmm0,[Eax]
!movups [Edi],xmm0
!ADD Eax,16
!ADD Edi,16
!LOOP .While_001
!.Done_001:
; -------
!POP Ebx
!POP Edi
EndProcedure

Procedure MyCopyMemoryAligned(In,Out,length)
; Copies 'length' blocks of 16 bytes each from In to Out using aligned
; SSE moves (MOVAPS).
;   In     - source address, must be 16-byte aligned
;   Out    - destination address, must be 16-byte aligned
;   length - number of 16-byte blocks (NOT bytes); values <= 0 copy nothing
; MOVAPS faults on a memory operand that is not 16-byte aligned, so callers
; have to align both buffers themselves. 32-bit x86 inline assembly only.
!PUSH Edi
!PUSH Ebx
; -------
; The +8 on each parameter access presumably compensates for the two PUSHes
; above, which moved ESP by 8 bytes - confirm against the PB inline ASM docs.
; NOTE(review): Ebx is saved/restored although nothing below touches it; it
; is kept so the +8 offsets stay unchanged.
!MOV Eax,[p.v_In+8]
!MOV Edi,[p.v_Out+8]
!MOV Ecx,[p.v_length+8]
; LOOP decrements Ecx *before* testing it, so a count of 0 would wrap around
; and run ~2^32 iterations. Bail out early for zero or negative counts.
!TEST Ecx,Ecx
!JLE .Done_002
!.While_002:
!movaps xmm0,[Eax]
!movaps [Edi],xmm0
!ADD Eax,16
!ADD Edi,16
!LOOP .While_002
!.Done_002:
; -------
!POP Ebx
!POP Edi
EndProcedure


; Benchmark: copies a 512-byte buffer (#Count+1) times with each of
;   1. PureBasic CopyMemory()
;   2. WinAPI    CopyMemory_()
;   3. MyCopyMemory()         (SSE, MOVUPS)
;   4. MyCopyMemoryAligned()  (SSE, MOVAPS)
; and reports the elapsed QueryPerformanceCounter ticks for each variant.
#Count=500000

; Over-allocate by 15 bytes so that a 16-byte aligned 512-byte window always
; fits inside each allocation. The raw pointers are kept for FreeMemory().
raw1=AllocateMemory(512+15)
raw2=AllocateMemory(512+15)
If raw1=0 Or raw2=0
  End ; allocation failed, nothing to measure
EndIf

; Round each address UP to the next multiple of 16. An address that is already
; aligned stays where it is; the former "addr+(16-(addr%16))" form pushed such
; an address 16 bytes forward, leaving only 511 usable bytes for the 512-byte
; copies (one byte past the end of the allocation).
mem1=(raw1+15)&(~15)
mem2=(raw2+15)&(~15)


qTimeA.q=0
qTimeZ.q=0

; --- 1. PureBasic CopyMemory() ---
QueryPerformanceCounter_(@qTimeA)
For i=0 To #Count
  CopyMemory(mem1,mem2,512)
Next  
QueryPerformanceCounter_(@qTimeZ)
qTime1.q=qTimeZ-qTimeA

qTimeA=0
qTimeZ=0

; --- 2. WinAPI CopyMemory_() (note: copies mem2 -> mem1) ---
QueryPerformanceCounter_(@qTimeA)
For i=0 To #Count
  CopyMemory_(mem2,mem1,512)
Next  
QueryPerformanceCounter_(@qTimeZ)
qTime2.q=qTimeZ-qTimeA

qTimeA=0
qTimeZ=0
l=512/16 ; the SSE procedures take a count of 16-byte blocks, not bytes

; --- 3. SSE, unaligned moves ---
QueryPerformanceCounter_(@qTimeA)
For i=0 To #Count
  MyCopyMemory(mem1,mem2,l)
Next  
QueryPerformanceCounter_(@qTimeZ)
qTime3.q=qTimeZ-qTimeA

qTimeA=0
qTimeZ=0
l=512/16

; --- 4. SSE, aligned moves (needs the 16-byte alignment established above) ---
QueryPerformanceCounter_(@qTimeA)
For i=0 To #Count
  MyCopyMemoryAligned(mem1,mem2,l)
Next  
QueryPerformanceCounter_(@qTimeZ)
qTime4.q=qTimeZ-qTimeA


MessageRequester("CopyMemory test","CopyMemory, CopyMemory_, SSE, SSE_Aligned" + Chr(13) + Chr(10) + Str(qTime1) + "/" + Str(qTime2) + "/" + Str(qTime3) + "/" + Str(qTime4))

FreeMemory(raw1)
FreeMemory(raw2)

Posted: Sun Jun 28, 2009 12:01 pm
by cxAlex
eriansa wrote:I wonder how this lib gets such results.
The lib uses SSE, SSE2 and SSSE3 if available.

Take a look at the source code:

http://www.agner.org/optimize/asmlib.zip

Posted: Sun Jun 28, 2009 1:34 pm
by Michael Vogel
I'm not sure whether CopyMemory could speed up array routines as well. I just made a simple test to see if handling small arrays of doubles could be made a little bit faster...

Code: Select all

; Sixteen doubles stored as four fixed-size arrays (x, y, z, t) of four
; elements each; used below as a 4x4 matrix.
Structure Mat
	x.d[4]
	y.d[4]
	z.d[4]
	t.d[4]
EndStructure

; Target matrix that the benchmark loops overwrite on every iteration.
Global Matrix.Mat
; Reference matrix ("Einheitsmatrix" = identity matrix); filled by InitNorm().
Global EinheitsMatrix.Mat
; Size in bytes of the reference matrix; the byte count handed to CopyMemory().
Global SizeMatrix=SizeOf(EinheitsMatrix)

Procedure InitNorm()
	; Fills the global EinheitsMatrix with the identity pattern: x[0], y[1],
	; z[2] and t[3] become 1, every other element becomes 0.
	Protected *unit.Mat=EinheitsMatrix

	; One line per field, elements 0..3 from left to right.
	*unit\x[0]=1 : *unit\x[1]=0 : *unit\x[2]=0 : *unit\x[3]=0
	*unit\y[0]=0 : *unit\y[1]=1 : *unit\y[2]=0 : *unit\y[3]=0
	*unit\z[0]=0 : *unit\z[1]=0 : *unit\z[2]=1 : *unit\z[3]=0
	*unit\t[0]=0 : *unit\t[1]=0 : *unit\t[2]=0 : *unit\t[3]=1
EndProcedure
Procedure SetNormNew(*m.mat)
	; Resets *m by copying SizeMatrix bytes from the global EinheitsMatrix
	; over it in a single block copy.
	CopyMemory(@EinheitsMatrix, *m, SizeMatrix)
EndProcedure
Procedure SetNormOld(*m.mat)
	; Resets *m to the identity pattern by assigning all sixteen elements
	; individually (x[0], y[1], z[2], t[3] = 1; everything else = 0).
	; One line per field, elements 0..3 from left to right.
	*m\x[0]=1 : *m\x[1]=0 : *m\x[2]=0 : *m\x[3]=0
	*m\y[0]=0 : *m\y[1]=1 : *m\y[2]=0 : *m\y[3]=0
	*m\z[0]=0 : *m\z[1]=0 : *m\z[2]=1 : *m\z[3]=0
	*m\t[0]=0 : *m\t[1]=0 : *m\t[2]=0 : *m\t[3]=1
EndProcedure

; Prepare the reference matrix once before any timing starts.
InitNorm()

; Highest loop index; each loop below runs from 0 to #n inclusive.
#n=9999999

; Time the block-copy variant; t ends up as elapsed GetTickCount_() ticks.
t=GetTickCount_()
For i=0 To #n
	SetNormNew(Matrix)
Next
t=GetTickCount_()-t

; Time the element-by-element variant the same way.
s=GetTickCount_()
For i=0 To #n
	SetNormOld(Matrix)
Next
s=GetTickCount_()-s

; Title shows the CopyMemory time, body shows the per-element time.
MessageRequester(Str(t),Str(s))
My result here: (at least the standard version of) CopyMemory is much slower than doing it the common way :(

Michael

Posted: Sun Jun 28, 2009 1:41 pm
by cxAlex
Here, copying the memory is the fastest way:
Norm: 359
CpyMem_PB: 265
CpyMem_FastMem: 94

Code: Select all

XIncludeFile "FastMem.pbi"

; Sixteen doubles stored as four fixed-size arrays (x, y, z, t) of four
; elements each; used below as a 4x4 matrix.
Structure Mat
  x.d[4]
  y.d[4]
  z.d[4]
  t.d[4]
EndStructure

; Target matrix that the benchmark loops overwrite on every iteration.
Global Matrix.Mat
; Reference matrix ("Einheitsmatrix" = identity matrix); filled by InitNorm().
Global EinheitsMatrix.Mat
; Size in bytes of the reference matrix; the byte count handed to the copy routines.
Global SizeMatrix = SizeOf(EinheitsMatrix)

Procedure InitNorm()
  ; Fills the global EinheitsMatrix with the identity pattern: x[0], y[1],
  ; z[2] and t[3] become 1, every other element becomes 0.
  Protected *unit.Mat = EinheitsMatrix

  ; One line per field, elements 0..3 from left to right.
  *unit\x[0] = 1 : *unit\x[1] = 0 : *unit\x[2] = 0 : *unit\x[3] = 0
  *unit\y[0] = 0 : *unit\y[1] = 1 : *unit\y[2] = 0 : *unit\y[3] = 0
  *unit\z[0] = 0 : *unit\z[1] = 0 : *unit\z[2] = 1 : *unit\z[3] = 0
  *unit\t[0] = 0 : *unit\t[1] = 0 : *unit\t[2] = 0 : *unit\t[3] = 1
EndProcedure
Procedure SetNormNew(*m.mat)
  ; Resets *m by copying SizeMatrix bytes from the global EinheitsMatrix
  ; over it with PureBasic's built-in CopyMemory().
  CopyMemory(@EinheitsMatrix, *m, SizeMatrix)
EndProcedure
Procedure SetNormNew_FastMem(*m.mat)
  ; Same reset as SetNormNew(), but the copy goes through FastMem_Copy()
  ; instead of CopyMemory().
  ; NOTE(review): FastMem_Copy is not defined in this listing - presumably it
  ; comes from the included "FastMem.pbi"; its argument order is assumed to
  ; mirror CopyMemory (source, destination, size) - confirm.
  FastMem_Copy(EinheitsMatrix, *m, SizeMatrix)
EndProcedure
Procedure SetNormOld(*m.mat)
  ; Resets *m to the identity pattern by assigning all sixteen elements
  ; individually (x[0], y[1], z[2], t[3] = 1; everything else = 0).
  ; One line per field, elements 0..3 from left to right.
  *m\x[0] = 1 : *m\x[1] = 0 : *m\x[2] = 0 : *m\x[3] = 0
  *m\y[0] = 0 : *m\y[1] = 1 : *m\y[2] = 0 : *m\y[3] = 0
  *m\z[0] = 0 : *m\z[1] = 0 : *m\z[2] = 1 : *m\z[3] = 0
  *m\t[0] = 0 : *m\t[1] = 0 : *m\t[2] = 0 : *m\t[3] = 1
EndProcedure

; Prepare the reference matrix once before any timing starts.
InitNorm()

; Highest loop index; each loop below runs from 0 to #n inclusive.
#n = 9999999

; Built-in CopyMemory() variant; t ends up as elapsed GetTickCount_() ticks.
t = GetTickCount_()
For i = 0 To #n
  SetNormNew(Matrix)
Next
t = GetTickCount_() - t

; NOTE(review): repeats the declaration above with the same value; it looks
; redundant but is kept as in the original listing.
#n = 9999999

; FastMem_Copy() variant.
t2 = GetTickCount_()
For i = 0 To #n
  SetNormNew_FastMem(Matrix)
Next
t2 = GetTickCount_() - t2

; Element-by-element variant.
s = GetTickCount_()
For i = 0 To #n
  SetNormOld(Matrix)
Next
s = GetTickCount_() - s

; Report all three timings, one per line.
MessageRequester("Test", "Norm: " + Str(s) + Chr(13) + "CpyMem_PB: " + Str(t) + Chr(13) + "CpyMem_FastMem: " + Str(t2))