I had a chance to compare the ASM backend, the C backend (CBE), and the C backend with the optimizer enabled (CBE+opt).
I'll send code later.
Yes, I noticed that CBE+opt drove the test down to 0 ms. Hahaha
I had to feed a fresh random value into every iteration to keep the optimizer from collapsing the loop into a single iteration.
I was looking for the C __builtin byte-swap functions for the quad (Q), long (L), and word (W) sizes.
I use byte swapping in a high-speed scenario, so this is critical for me.
The conclusion is not different from expectations.
CBE+opt is always faster.
With CBE+opt, EndianW|L|Q() used shifts, while bswap16|32|64 used the __builtin_bswap functions.
The __builtin functions are ~5-10% faster, but I wonder whether they are supported on all microprocessors.
Straight ASM compiles are 50% slower than CBE+opt.
CBE without opt is always slower and not to be considered unless optimizer breaks the app.
Code: Select all
;-{ TEST SPEED
; Gotchas!
; There is a small bias where the 1st procedure tested is always slower.
; - Run some throwaway code BEFORE the speed test.
; Even with a raised process priority, antivirus and/or the microprocessor fan
; will impact the currently running procedure.
; - Run the test multiple times to get average results. Throw away extremes.
; Options:
; Comment out each procedure to run only 1 at a time.
CompilerIf #PB_Compiler_Debugger = 0
; ML_pcChange(y1, y2) - percent change going from y1 to y2, as a float expression.
;   y1 - baseline value (e.g. elapsed ms of the first test)
;   y2 - value compared against the baseline
; Result < 0 means y2 is smaller than y1 (for timings: y1 is slower).
; The sign is taken from (y2 - y1) and the magnitude from |(y2 - y1) / y1|;
; the 1e-16 term keeps a zero baseline from dividing by zero.
; Both arguments are parenthesized at every use so that compound expressions
; passed as arguments expand safely.
Macro ML_pcChange(y1, y2)
  (100.0 * Sign((y2) - (y1)) * Abs(((y2) - (y1)) / ((y1) + 1e-16)))
EndMacro
; --- Benchmark setup ----------------------------------------------------------
; Raise the process priority so that other processes disturb the timing less.
; SetPriorityClass_() is a Windows API import, so it is compiled on Windows only;
; on other systems the benchmark simply runs at the default priority.
; NOTE(review): the header comment speaks of "High Priority" while the realtime
; class is requested here - confirm that realtime is really intended.
CompilerIf #PB_Compiler_OS = #PB_OS_Windows
  SetPriorityClass_(GetCurrentProcess_(), #REALTIME_PRIORITY_CLASS)
CompilerEndIf

; Iterations per timed loop. Written as an integer literal (same value as 1e8)
; because it is used as a For/Next bound and printed with Str().
#Tries = 100000000 ;-! SET #TRIES

Define.i u                               ; loop counter shared by all test loops
Define.i time                            ; ElapsedMilliseconds() at the start of the running test
Define.i t1, t2, t3, t4, t5, t6, t7, t8  ; elapsed milliseconds, one per test slot
Define.i tw = 55                         ; label column width passed to LSet() in the report
Define.s r$                              ; report text
Define.s code1$, code2$, code3$, code4$  ; display name of each test slot
Define.s code5$, code6$, code7$, code8$
Define.i COMMMONVARIABLES_HERE           ; template placeholder: declare variables shared by the tests here
CompilerIf #PB_Compiler_Backend = #PB_Backend_Asm
; EndianW(x.w) - exchange the two bytes of a 16-bit word (ASM backend build).
;   x       - word to convert between little- and big-endian byte order
;   returns - x with its low and high byte swapped
; Shift-based equivalent: ((x & $FF) << 8) | ((x >> 8) & $FF)
Procedure.w EndianW(x.w)
  !MOV ax, word [p.v_x]   ; ax = x
  !ROL ax, 8              ; rotating 16 bits by 8 exchanges low and high byte
  !MOV word [p.v_x], ax   ; write the result back into x ...
  ProcedureReturn x       ; ... and return it the regular PureBasic way
EndProcedure
; EndianL(x.l) - reverse the byte order of a 32-bit long (ASM backend build).
;   x       - long to convert between little- and big-endian byte order
;   returns - the byte-reversed value, left in eax by the inline assembly
Procedure.l EndianL(x.l)
; Shift-based equivalent, kept for reference:
;ProcedureReturn (x & $FF) << 24 + (x & $FF00) << 8 + (x >> 8) & $FF00 + (x >> 24) & $FF
!MOV eax, dword[p.v_x]
!BSWAP eax ; 32 bit little endian <-> big endian
; ProcedureReturn without a value: nothing is loaded here, so whatever the
; assembly above left in eax is the result.
ProcedureReturn ; eax is returned by default
EndProcedure
; EndianQ(x.q) - reverse the byte order of a 64-bit quad (ASM backend build).
;   x       - quad to convert between little- and big-endian byte order
;   returns - the byte-reversed value, left in the return register(s) by the
;             inline assembly (rax on x64, the edx/eax pair otherwise)
Procedure.q EndianQ(x.q)
CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
; x64: a single 64-bit BSWAP reverses all eight bytes.
!MOV rax, qword[p.v_x]
!BSWAP rax
CompilerElse
; 32-bit: the two halves trade places as well as being byte-reversed -
; the dword at p.v_x ends up in edx, the dword at p.v_x+4 ends up in eax.
!MOV edx, dword[p.v_x]
!MOV eax, dword[p.v_x+4]
!BSWAP edx
!BSWAP eax
CompilerEndIf
; ProcedureReturn without a value: the registers set above are the result.
ProcedureReturn ; eax is returned by default with quad picking up edx also.
EndProcedure
; bswap16(x.w) - byte-swap a 16-bit word with shifts and masks only
; (plain PureBasic variant used in the ASM backend build).
;   x       - word to swap
;   returns - low byte of x moved to bits 8..15, high byte moved to bits 0..7
Procedure.w bswap16(x.w)
  ProcedureReturn ((x << 8) & $ff00) | ((x >> 8) & $00ff)
EndProcedure
; bswap32(x.l) - byte-swap a 32-bit long with shifts and masks only
; (plain PureBasic variant used in the ASM backend build).
;   x       - long to swap
;   returns - x with its four bytes in reverse order
; Each term moves one byte to its mirrored position; the mask removes
; everything else, including sign bits shifted in by >>.
Procedure.l bswap32(x.l)
  ProcedureReturn ((x << 24) & $ff000000) |
                  ((x <<  8) & $00ff0000) |
                  ((x >>  8) & $0000ff00) |
                  ((x >> 24) & $000000ff)
EndProcedure
; bswap64(x.q) - byte-swap a 64-bit quad with shifts and masks only
; (plain PureBasic variant used in the ASM backend build).
;   x       - quad to swap
;   returns - x with its eight bytes in reverse order
; Each term moves one byte to its mirrored position; the mask removes
; everything else, including sign bits shifted in by >>.
Procedure.q bswap64(x.q)
  ProcedureReturn ((x << 56) & $ff00000000000000) |
                  ((x << 40) & $00ff000000000000) |
                  ((x << 24) & $0000ff0000000000) |
                  ((x <<  8) & $000000ff00000000) |
                  ((x >>  8) & $00000000ff000000) |
                  ((x >> 24) & $0000000000ff0000) |
                  ((x >> 40) & $000000000000ff00) |
                  ((x >> 56) & $00000000000000ff)
EndProcedure
CompilerElse ;#PB_Compiler_Backend = #PB_Backend_C
; EndianW(x.w) - byte-swap a 16-bit word with shifts and masks (C backend build).
;   x       - word to swap
;   returns - low byte of x moved to bits 8..15, high byte moved to bits 0..7
; Intrinsic alternative, kept for reference:
;   !return __builtin_bswap16(v_x);
Procedure.w EndianW(x.w)
  ProcedureReturn ((x << 8) & $ff00) | ((x >> 8) & $00ff)
EndProcedure
; EndianL(x.l) - byte-swap a 32-bit long with shifts and masks (C backend build).
;   x       - long to swap
;   returns - x with its four bytes in reverse order
; Intrinsic alternative, kept for reference:
;   !return __builtin_bswap32(v_x);
Procedure.l EndianL(x.l)
  ProcedureReturn ((x << 24) & $ff000000) |
                  ((x <<  8) & $00ff0000) |
                  ((x >>  8) & $0000ff00) |
                  ((x >> 24) & $000000ff)
EndProcedure
; EndianQ(x.q) - byte-swap a 64-bit quad with shifts and masks (C backend build).
;   x       - quad to swap
;   returns - x with its eight bytes in reverse order
; Intrinsic alternative, kept for reference:
;   !return __builtin_bswap64(v_x);
Procedure.q EndianQ(x.q)
  ProcedureReturn ((x << 56) & $ff00000000000000) |
                  ((x << 40) & $00ff000000000000) |
                  ((x << 24) & $0000ff0000000000) |
                  ((x <<  8) & $000000ff00000000) |
                  ((x >>  8) & $00000000ff000000) |
                  ((x >> 24) & $0000000000ff0000) |
                  ((x >> 40) & $000000000000ff00) |
                  ((x >> 56) & $00000000000000ff)
EndProcedure
; bswap16(x.w) - byte-swap a 16-bit word through the C compiler's
; __builtin_bswap16 (C backend build).
;   x       - word to swap
;   returns - the value produced by __builtin_bswap16 for x
Procedure.w bswap16(x.w)
; Inline C: the parameter x is referred to as v_x. The C 'return' leaves the
; procedure directly, which is why no ProcedureReturn follows.
!return __builtin_bswap16(v_x);
EndProcedure
; bswap32(x.l) - byte-swap a 32-bit long through the C compiler's
; __builtin_bswap32 (C backend build).
;   x       - long to swap
;   returns - the value produced by __builtin_bswap32 for x
Procedure.l bswap32(x.l)
; Inline C: the parameter x is referred to as v_x. The C 'return' leaves the
; procedure directly, which is why no ProcedureReturn follows.
!return __builtin_bswap32(v_x);
EndProcedure
; bswap64(x.q) - byte-swap a 64-bit quad through the C compiler's
; __builtin_bswap64 (C backend build).
;   x       - quad to swap
;   returns - the value produced by __builtin_bswap64 for x
Procedure.q bswap64(x.q)
; Inline C: the parameter x is referred to as v_x. The C 'return' leaves the
; procedure directly, which is why no ProcedureReturn follows.
!return __builtin_bswap64(v_x);
EndProcedure
CompilerEndIf
; Working values for the tests: one input and one swapped result per width.
Define.w w, we   ; 16-bit input / result
Define.l l, le   ; 32-bit input / result
Define.q q, qe   ; 64-bit input / result

RandomSeed(123)  ; fixed seed, so every run draws the same random values

; Warm-up: 100 untimed passes through the same statements as the first test,
; so that the first timed loop does not carry the start-up bias.
u = 1
While u <= 100
  l = Random(1000)
  q = l
  w = l
  we = EndianW(w)
  le = EndianL(l)
  qe = EndianQ(q)
  u + 1
Wend
; START OF SPEED TESTS:
; Each test stores its display name in codeN$, runs #Tries passes and stores
; the elapsed milliseconds in tN.
; The random generator is reseeded with the same value right before every
; timed loop so that both tests are fed an identical input sequence; with a
; single seed at program start the second loop would continue the sequence
; where the first one stopped.

code1$ = "EndianW,L,Q"
RandomSeed(123)
time = ElapsedMilliseconds()
For u = 1 To #Tries
  ;-> CODE 1 HERE...
  l = Random(1000)   ; new input every pass
  w = l
  q = l
  we = EndianW(w)
  le = EndianL(l)
  qe = EndianQ(q)
Next u
t1 = ElapsedMilliseconds() - time

code2$ = "bswap16,32,64"
RandomSeed(123)
time = ElapsedMilliseconds()
For u = 1 To #Tries
  ;-> CODE 2 HERE...
  l = Random(1000)   ; new input every pass
  w = l
  q = l
  we = bswap16(w)
  le = bswap32(l)
  qe = bswap64(q)
Next u
t2 = ElapsedMilliseconds() - time
; Template for test slots 3..8. The whole section is excluded from compilation
; by 'CompilerIf 0'. Every slot has the same shape as tests 1 and 2: take a
; start time, loop #Tries times over the code under test, store the elapsed
; milliseconds in t3..t8.
CompilerIf 0
; Slot 3
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 3 HERE...
Next u
t3 = ElapsedMilliseconds()-time
; Slot 4
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 4 HERE...
Next u
t4 = ElapsedMilliseconds()-time
; Slot 5
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 5 HERE...
Next u
t5 = ElapsedMilliseconds()-time
; Slot 6
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 6 HERE...
Next u
t6 = ElapsedMilliseconds()-time
; Slot 7
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 7 HERE...
Next u
t7 = ElapsedMilliseconds()-time
; Slot 8
time = ElapsedMilliseconds()
For u = 1 To #Tries
;-> CODE 8 HERE...
Next u
t8 = ElapsedMilliseconds()-time
CompilerEndIf
; --- Report -------------------------------------------------------------------
; All timing is done at this point, so return to normal priority *before* the
; modal requester is shown: the process should not hold an elevated priority
; class while it merely waits for the user. SetPriorityClass_() is a Windows
; API import and is therefore compiled on Windows only.
CompilerIf #PB_Compiler_OS = #PB_OS_Windows
  SetPriorityClass_(GetCurrentProcess_(), #NORMAL_PRIORITY_CLASS)
CompilerEndIf

; One line per value: a label padded to tw columns with LSet(), then the number.
; Every line starts with "; " so the text can be pasted back into source as comments.
r$ = LSet("; Count(n),",tw) + Str(#Tries) + #CRLF$
r$ + LSet("; "+code1$+"(ms),",tw) + Str(t1) + #CRLF$
r$ + LSet("; "+code2$+"(ms),",tw) + Str(t2) + #CRLF$
; Enable together with the matching test slots:
; r$ + LSet("; "+code3$+"(ms),",tw) + Str(t3) + #CRLF$
; r$ + LSet("; "+code4$+"(ms),",tw) + Str(t4) + #CRLF$
; r$ + LSet("; "+code5$+"(ms),",tw) + Str(t5) + #CRLF$
; r$ + LSet("; "+code6$+"(ms),",tw) + Str(t6) + #CRLF$
; r$ + LSet("; "+code7$+"(ms),",tw) + Str(t7) + #CRLF$
; r$ + LSet("; "+code8$+"(ms),",tw) + Str(t8) + #CRLF$

; Relative comparison, rounded to whole percent by StrD(..., 0).
r$ + "; %Change(t1->t2): < 0 means t1 slower." + #CRLF$
r$ + LSet("; "+code1$+" : "+code2$+"(%),",tw) + StrD(ML_pcChange(t1,t2),0) + #CRLF$
; Enable together with the matching test slots:
; r$ + LSet("; "+code3$+" : "+code4$+"(%),",tw) + StrD(ML_pcChange(t3,t4),0) + #CRLF$
; r$ + LSet("; "+code1$+" : "+code3$+"(%),",tw) + StrD(ML_pcChange(t1,t3),0) + #CRLF$
; r$ + LSet("; "+code2$+" : "+code4$+"(%),",tw) + StrD(ML_pcChange(t2,t4),0) + #CRLF$
; r$ + LSet("; "+code1$+" : "+code5$+"(%),",tw) + StrD(ML_pcChange(t1,t5),0) + #CRLF$
; r$ + LSet("; "+code1$+" : "+code6$+"(%),",tw) + StrD(ML_pcChange(t1,t6),0) + #CRLF$
; r$ + LSet("; "+code1$+" : "+code7$+"(%),",tw) + StrD(ML_pcChange(t1,t7),0) + #CRLF$
; r$ + LSet("; "+code1$+" : "+code8$+"(%),",tw) + StrD(ML_pcChange(t1,t8),0) + #CRLF$

; Show the report; on "Yes" also copy it to the clipboard.
If MessageRequester("Speed Test - Copy To Clipboard?",r$,#PB_MessageRequester_YesNo) = #PB_MessageRequester_Yes
  SetClipboardText(r$)
EndIf
CompilerEndIf
;-} TEST SPEED
; CBE+opt
; Count(n), 100000000
; EndianW,L,Q(ms), 621
; bswap16,32,64(ms),(using builtin ASM on CBE) 554
; %Change(t1->t2): < 0 means t1 slower.
; EndianW,L,Q : bswap16,32,64(%), -11
; Count(n), 100000000
; EndianW,L,Q(ms), 584
; bswap16,32,64(ms),(using builtin ASM on CBE) 551
; %Change(t1->t2): < 0 means t1 slower.
; EndianW,L,Q : bswap16,32,64(%), -6
; ASM
; Count(n), 100000000
; EndianW,L,Q(ms), 910
; pbswap16,32,64(ms), 1346
; %Change(t1->t2): < 0 means t1 slower.
; EndianW,L,Q : bswap16,32,64(%), 48
; Count(n), 100000000
; EndianW,L,Q(ms), 943
; pbswap16,32,64(ms), 1447
; %Change(t1->t2): < 0 means t1 slower.
; EndianW,L,Q : bswap16,32,64(%), 53
; CBE-opt
; Count(n), 100000000
; EndianW,L,Q(ms), 1497
; bswap16,32,64(ms), 1102
; %Change(t1->t2): < 0 means t1 slower.
; EndianW,L,Q : bswap16,32,64(%), -26
; Count(n), 100000000
; EndianW,L,Q(ms), 1436
; bswap16,32,64(ms), 1086
; %Change(t1->t2): < 0 means t1 slower.
; EndianW,L,Q : bswap16,32,64(%), -24