Patches +, -, * and /.
The patched ops are ~40% faster than the x87 ops on my AMD.
I would be interested to see if there's any performance increase on Intel CPUs.
Compile with the debugger to verify it's working,
and compile without the debugger to rate its performance.
Code: Select all
; PatchSSE()
; Emits flat assembler macro definitions (via pass-through "!" lines) named
; FLD, FSTP, FADD, FSUB, FMUL and FDIV. Each macro inspects its operand:
;   operand starts with "dword" -> single-precision scalar SSE op (..ss) on xmm0
;   operand starts with "qword" -> double-precision scalar SSE op (..sd) on xmm0
; xmm0 is the only register used, so it acts as a single accumulator.
; NOTE(review): an operand that starts with neither "dword" nor "qword"
; matches no pattern, so the macro expands to nothing for it.
; NOTE(review): the macro names are upper case; presumably they are meant to
; catch the upper-case mnemonics in the compiler's generated code - confirm.
Macro PatchSSE()
  !macro FLD operand
  !{
  !  match =dword addr, operand \{ movss xmm0, operand \}
  !  match =qword addr, operand \{ movsd xmm0, operand \}
  !}
  !macro FSTP operand
  !{
  !  match =dword addr, operand \{ movss operand, xmm0 \}
  !  match =qword addr, operand \{ movsd operand, xmm0 \}
  !}
  !macro FADD operand
  !{
  !  match =dword addr, operand \{ addss xmm0, operand \}
  !  match =qword addr, operand \{ addsd xmm0, operand \}
  !}
  !macro FSUB operand
  !{
  !  match =dword addr, operand \{ subss xmm0, operand \}
  !  match =qword addr, operand \{ subsd xmm0, operand \}
  !}
  !macro FMUL operand
  !{
  !  match =dword addr, operand \{ mulss xmm0, operand \}
  !  match =qword addr, operand \{ mulsd xmm0, operand \}
  !}
  !macro FDIV operand
  !{
  !  match =dword addr, operand \{ divss xmm0, operand \}
  !  match =qword addr, operand \{ divsd xmm0, operand \}
  !}
EndMacro
; _Cos(result, angle)
; Inline x87 cosine: loads `angle` onto the FPU stack, applies fcos and pops
; the value into `result`.
; The mnemonics are written in lower case, unlike the upper-case macro names
; defined by PatchSSE() - presumably so they are not rewritten by that patch
; (TODO confirm).
Macro _Cos(result,angle)
  EnableASM
    fld angle
    fcos
    fstp result
  DisableASM
EndMacro
; _Sin(result, angle)
; Inline x87 sine: loads `angle` onto the FPU stack, applies fsin and pops
; the value into `result`.
; The mnemonics are written in lower case, unlike the upper-case macro names
; defined by PatchSSE() - presumably so they are not rewritten by that patch
; (TODO confirm).
Macro _Sin(result,angle)
  EnableASM
    fld angle
    fsin
    fstp result
  DisableASM
EndMacro
; _Tan(result, angle)
; Inline x87 tangent: loads `angle` onto the FPU stack and applies fptan.
; fptan leaves two values on the stack (the constant 1.0 on top, the tangent
; below it), so two pops are issued: the first store writes the 1.0 into
; `result`, the second overwrites it with the tangent and leaves the FPU
; stack balanced.
; The mnemonics are written in lower case, unlike the upper-case macro names
; defined by PatchSSE() - presumably so they are not rewritten by that patch
; (TODO confirm).
Macro _Tan(result,angle)
  EnableASM
    fld angle
    fptan
    fstp result
    fstp result
  DisableASM
EndMacro
; ---------------------------------------------------------------------------
; Test driver.
;   With the debugger   : prints the results of + - * / and cos/sin/tan first
;                         with the unpatched code, then again after
;                         PatchSSE(), so both halves can be compared by eye.
;   Without the debugger: times the same five arithmetic statements for
;                         float and double variables, before and after
;                         PatchSSE(), and shows the four averages.
; PatchSSE() is expanded in the middle of each branch; the statements placed
; after it are the ones intended to be assembled with the patched macros.
; ---------------------------------------------------------------------------

; Benchmark parameters (previously repeated as literals in every section).
#BenchRuns  = 10        ; number of timed runs that are averaged
#BenchIters = 9000000   ; inner loop counts from 0 to this value inclusive

Global a.f, b.f, c.f, aa.d, bb.d, cc.d, avg.i
Global t1.s, t2.s, t3.s, t4.s

CompilerIf #PB_Compiler_Debugger

  ; ---- reference output: unpatched code -------------------------------------
  a = #PI
  b = #PI
  c = a+b
  Debug c
  c * 2
  Debug c
  c = a-b
  Debug c
  c = a*b
  Debug c
  c = a/b
  Debug c
  a = Cos(c)
  Debug a
  b = Sin(c)
  Debug b
  a = Tan(b)
  Debug a
  Debug "============="

  ; ---- same sequence after the patch ----------------------------------------
  ; The trig calls go through the inline-asm macros here instead of the
  ; built-in Cos()/Sin()/Tan().
  PatchSSE()
  a = #PI
  b = #PI
  c = a+b
  Debug c
  c * 2
  Debug c
  c = a-b
  Debug c
  c = a*b
  Debug c
  c = a/b
  Debug c
  _Cos(a,c)
  Debug a
  _Sin(b,c)
  Debug b
  _Tan(a,b)
  Debug a

CompilerElse

  ; ---- x87, single precision ------------------------------------------------
  a = #PI
  b = #PI
  avg=0
  For j = 1 To #BenchRuns
    st = ElapsedMilliseconds()
    For i = 0 To #BenchIters
      c = a+b
      c * 2
      c = a-b
      c = a*b
      c = a/b
    Next
    avg + (ElapsedMilliseconds() -st)   ; accumulate this run's duration
  Next
  avg / #BenchRuns                      ; total -> average milliseconds per run
  t1.s = "x87 float " + Str(avg)

  ; ---- x87, double precision ------------------------------------------------
  aa = #PI
  bb = #PI
  avg=0
  For j = 1 To #BenchRuns
    st = ElapsedMilliseconds()
    For i = 0 To #BenchIters
      cc = aa+bb
      cc * 2
      cc = aa-bb
      cc = aa*bb
      cc = aa/bb
    Next
    avg + (ElapsedMilliseconds() -st)
  Next
  avg / #BenchRuns
  t2.s = "x87 double " + Str(avg)

  PatchSSE() ;test with sse patch

  ; ---- patched, single precision --------------------------------------------
  a = #PI
  b = #PI
  avg=0
  For j = 1 To #BenchRuns
    st = ElapsedMilliseconds()
    For i = 0 To #BenchIters
      c = a+b
      c * 2
      c = a-b
      c = a*b
      c = a/b
    Next
    avg + (ElapsedMilliseconds() -st)
  Next
  avg / #BenchRuns
  t3.s = "sse float " + Str(avg)

  ; ---- patched, double precision --------------------------------------------
  aa = #PI
  bb = #PI
  avg=0
  For j = 1 To #BenchRuns
    st = ElapsedMilliseconds()
    For i = 0 To #BenchIters
      cc = aa+bb
      cc * 2
      cc = aa-bb
      cc = aa*bb
      cc = aa/bb
    Next
    avg + (ElapsedMilliseconds() -st)
  Next
  avg / #BenchRuns
  t4.s = "sse double " + Str(avg)

  ; Show all four averages in one dialog.
  MessageRequester("times ",t1 + #CRLF$ + t2 + #CRLF$ + t3 + #CRLF$ + t4 )

CompilerEndIf