Just had an idea to make a quick speed test.
My Ryzen 5700 needs ~540 million ticks for one Full HD picture when calling HSVadjustRBG() for each pixel.
That means ~7 fps with the slowest version. With 128-bit SSE vector commands I estimate ~21 fps, and with 256-bit SSE ~42 fps.
That's ~50% faster than my earlier calculation!
But a very strange behavior.
Float/ Int
240 / 160 Mio , 126 / 80ms: new Intel I7 8565U (Laptop Win10)
540 / 300 Mio , 142 / 80ms : Ryzen 5800X (Desktop Win 10)
780 / 370 Mio, 202 / 110 ms : old Intel I7 Laptop (2016, Win7) :
970 / 650 Mio : old AMD Laptop (2011, Win7)
It seems the new Intel i7 has a much faster floating-point unit (or there is a bug in ReadRuntimeCounter).
From this test I guess that fixed-point integer math could speed things up a lot on both AMD and Intel.
UPDATE: after adding time measurement and frequency calculation, it is clear!
The new Intel i7 isn't faster. It counts at 1.9 GHz and internally doubles that clock for the CPU core.
UPDATE 2023/09/09 with SSE Version
Download the additional modules from GitHub:
GitHub.
https://github.com/Maagic7/PureBasicFra ... in/Modules
"PbFw_Module_PbFw.pb"
"PbFw_Module_Debug.pb"
"PbFw_Module_VECTORf.pb"
Code: Select all
EnableExplicit
XIncludeFile "..\..\Modules\PbFw_Module_VECTORf.pb" ; VECf:: single precision Vector Modul
Procedure.q ReadRuntimeCounter() ; RDTSC
  ; ======================================================================
  ; NAME: ReadRuntimeCounter
  ; DESC: Reads the CPU cycle / time stamp counter.
  ; DESC: x86/x64: executes RDTSC and returns the 64 bit counter value.
  ; DESC: ARM (C-Backend only): reads the PMU cycle counter if user mode
  ; DESC: access and the counter itself are enabled, otherwise returns 0.
  ; RET.q : counter value in ticks (0 if the ARM counter is not readable)
  ; ======================================================================

  CompilerIf #PB_Compiler_Backend=#PB_Backend_C

    ; ----------------------------------------------------------------------
    ;  C-Backend
    ; ----------------------------------------------------------------------

    ; Result variable. It is declared here, outside of the processor
    ; selection, because BOTH branches below use it. (Declaring it only in
    ; the x86 branch breaks the ARM build when EnableExplicit is active.)
    Protected t.q

    CompilerIf #PB_Compiler_Processor = #PB_Processor_x64 Or #PB_Compiler_Processor = #PB_Processor_x86

      ; RDTSC delivers the counter as EDX:EAX. The LFENCE before and after
      ; are part of the original instruction sequence.
      !unsigned hi, lo;
      !__asm__ __volatile__ ("lfence\n rdtsc\n lfence" : "=a"(lo), "=d"(hi));
      ; combine the two 32 bit halves into the 64 bit PB variable t (C name v_t)
      !v_t =((unsigned long long)lo)|(((unsigned long long)hi)<<32 );
      ProcedureReturn t

    CompilerElseIf #PB_Compiler_Processor = #PB_Processor_Arm64 Or #PB_Compiler_Processor = #PB_Processor_Arm32
      ; ARM x32/x64
      ; NOTE(review): 'mrc p15, ...' is the 32 bit ARM coprocessor access form;
      ; whether it assembles for an Arm64 target should be confirmed.
      Protected pmuseren.l,pmcntenset.l,pmccntr.l

      ; read the user enable register
      !asm volatile("mrc p15, 0, %0, c9, c14, 0" : "=r"(v_pmuseren));

      If pmuseren & 1                     ; bit 0 set -> user mode may read the counters
        ; read the counter enable set register
        !asm volatile("mrc p15, 0, %0, c9, c12, 1" : "=r"(v_pmcntenset));

        If pmcntenset & $80000000         ; bit 31 set -> cycle counter is running
          !asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(v_pmccntr));
          t = pmccntr
          ; scale by 64 - presumably the counter ticks once per 64 cycles; TODO confirm
          ProcedureReturn t << 6
        EndIf
      EndIf

      ; counter not accessible
      ProcedureReturn 0

    CompilerEndIf

  CompilerElse

    ; ----------------------------------------------------------------------
    ;  ASM-Backend x64 / x32
    ; ----------------------------------------------------------------------

    CompilerIf #PB_Compiler_Processor = #PB_Processor_x64

      DisableDebugger
      ; RDTSC transfers TimeStampCounter to EDX, EAX (on x32 and x64)
      !RDTSC
      ; on x64 a Quad is returned in RAX, so hi and lo have to be combined in RAX
      !SHL RDX, 32    ; EDX to RDX_Hi
      !OR RAX, RDX
      ; ProcedureReturn without a value: the registers filled above are the result
      ProcedureReturn
      EnableDebugger

    CompilerElse    ; x32

      DisableDebugger
      ; RDTSC transfers TimeStampCounter to EDX, EAX (on x32 and x64)
      !RDTSC
      ; on x32 a Quad is returned as EDX, EAX - exactly what RDTSC delivered
      ProcedureReturn   ; return the TimeStampCounter [EDX, EAX]
      EnableDebugger

    CompilerEndIf

  CompilerEndIf

EndProcedure
Procedure.q ElapsedMicroSeconds()
  ; ======================================================================
  ; NAME : ElapsedMicroSeconds
  ; DESC : Reads the Windows performance counter and scales the reading
  ; DESC : with the performance counter frequency to microseconds.
  ; RET.q : current counter reading in microseconds
  ; ======================================================================
  Protected freq.Quad       ; counter frequency
  Protected counter.Quad    ; current counter reading

  QueryPerformanceFrequency_(@freq)
  QueryPerformanceCounter_(@counter)

  ; (freq / 1e6) = counts per microsecond : 1e6 = 1.000.000 = 1Mhz
  ProcedureReturn (counter\q / (freq\q /1e6) )
EndProcedure
Procedure.l HSVadjustRBG(RGB.l, h.f, s.f, v.f)
  ; ======================================================================
  ; NAME : HSVadjustRBG
  ; DESC : Do a HSV-Color Space adjust at a RGB color of 24/32 Bit
  ; DESC : it will keep the original Alpha
  ; DESC : The adjust is one 3x3 matrix multiplication: every output
  ; DESC : channel is a weighted sum of the three ORIGINAL input channels.
  ; VAR(RGB.l) : 24/32 Bit RGB Color
  ; VAR(h.f) : Hue shift (in degrees) [0..360]
  ; VAR(s.f) : saturation multiplier (scalar)
  ; VAR(v.f) : value multiplier (scalar)
  ; RET.l : The HSV adjusted color as RGB-Color-Value
  ; ======================================================================
  Protected.f vsu, vsw
  Protected.f r, g, b             ; input channels, never modified
  Protected.f rOut, gOut, bOut    ; transformed channels
  Protected.a A                   ; Alpha value

  r = Red(RGB)
  g = Green(RGB)
  b = Blue(RGB)
  A = Alpha(RGB)

  vsu = v * s * Cos(Radian(h))
  vsw = v * s * Sin(Radian(h))

  ; The results go to separate variables. Writing them back into r/g/b
  ; would feed the already transformed red into the green row and the
  ; transformed red and green into the blue row.
  rOut = (0.299*v + 0.701*vsu + 0.168*vsw) * r + (0.587*v - 0.587*vsu + 0.330*vsw) * g + (0.114*v - 0.114*vsu - 0.497*vsw) * b
  gOut = (0.299*v - 0.299*vsu - 0.328*vsw) * r + (0.587*v + 0.413*vsu + 0.035*vsw) * g + (0.114*v - 0.114*vsu + 0.292*vsw) * b
  bOut = (0.299*v - 0.300*vsu + 1.250*vsw) * r + (0.587*v - 0.588*vsu - 1.050*vsw) * g + (0.114*v + 0.886*vsu - 0.203*vsw) * b

  ; limit every channel to the 8 Bit range [0..255]
  If rOut > 255
    rOut = 255
  ElseIf rOut < 0
    rOut = 0
  EndIf

  If gOut > 255
    gOut = 255
  ElseIf gOut < 0
    gOut = 0
  EndIf

  If bOut > 255
    bOut = 255
  ElseIf bOut < 0
    bOut = 0
  EndIf

  ProcedureReturn RGBA(Int(rOut), Int(gOut), Int(bOut), A)
EndProcedure
Structure M3          ; 3x3 single precision matrix, stored row by row
  ; row 1
  m11.f
  m12.f
  m13.f
  ; row 2
  m21.f
  m22.f
  m23.f
  ; row 3
  m31.f
  m32.f
  m33.f
EndStructure
Procedure.i Get_HSVadjustRGB_Matrix (*m.M3, h.f, s.f, v.f)
  ; ======================================================================
  ; NAME : Get_HSVadjustRGB_Matrix
  ; DESC : Fills a 3x3 matrix with the coefficients of the HSV adjust
  ; DESC : for the given hue shift, saturation and value multiplier.
  ; VAR(*m.M3) : Pointer to the matrix that receives the 9 coefficients
  ; VAR(h.f) : Hue shift in degrees (converted with Radian())
  ; VAR(s.f) : saturation multiplier
  ; VAR(v.f) : value multiplier
  ; RET.i : *m
  ; ======================================================================
  Protected.f cosPart, sinPart

  cosPart = v * s * Cos(Radian(h))
  sinPart = v * s * Sin(Radian(h))

  ; row 1 : coefficients for the red output
  *m\m11 = 0.299*v + 0.701*cosPart + 0.168*sinPart
  *m\m12 = 0.587*v - 0.587*cosPart + 0.330*sinPart
  *m\m13 = 0.114*v - 0.114*cosPart - 0.497*sinPart

  ; row 2 : coefficients for the green output
  *m\m21 = 0.299*v - 0.299*cosPart - 0.328*sinPart
  *m\m22 = 0.587*v + 0.413*cosPart + 0.035*sinPart
  *m\m23 = 0.114*v - 0.114*cosPart + 0.292*sinPart

  ; row 3 : coefficients for the blue output
  *m\m31 = 0.299*v - 0.300*cosPart + 1.250*sinPart
  *m\m32 = 0.587*v - 0.588*cosPart - 1.050*sinPart
  *m\m33 = 0.114*v + 0.886*cosPart - 0.203*sinPart

  ProcedureReturn *m
EndProcedure
Procedure.i Get_HSVadjustRBG_VecMatrix (*m.VECf::TMatrix, h.f, s.f, v.f)
  ; ======================================================================
  ; NAME : Get_HSVadjustRBG_VecMatrix
  ; DESC : Writes the HSV adjust coefficients into the 3x3 part of a
  ; DESC : VECf::TMatrix. Besides these 9 elements only m34 is written
  ; DESC : (set to 0); m14, m24 and m41..m44 are left as they are.
  ; VAR(*m.VECf::TMatrix) : Pointer to the matrix to fill
  ; VAR(h.f) : Hue shift in degrees (converted with Radian())
  ; VAR(s.f) : saturation multiplier
  ; VAR(v.f) : value multiplier
  ; RET.i : *m
  ; ======================================================================
  Protected.f cosPart, sinPart

  cosPart = v * s * Cos(Radian(h))
  sinPart = v * s * Sin(Radian(h))

  ; row 1 (m14 is not written)
  *m\m11 = 0.299*v + 0.701*cosPart + 0.168*sinPart
  *m\m12 = 0.587*v - 0.587*cosPart + 0.330*sinPart
  *m\m13 = 0.114*v - 0.114*cosPart - 0.497*sinPart

  ; row 2 (m24 is not written)
  *m\m21 = 0.299*v - 0.299*cosPart - 0.328*sinPart
  *m\m22 = 0.587*v + 0.413*cosPart + 0.035*sinPart
  *m\m23 = 0.114*v - 0.114*cosPart + 0.292*sinPart

  ; row 3
  *m\m31 = 0.299*v - 0.300*cosPart + 1.250*sinPart
  *m\m32 = 0.587*v - 0.588*cosPart - 1.050*sinPart
  *m\m33 = 0.114*v + 0.886*cosPart - 0.203*sinPart
  *m\m34 = 0

  ; row 4 (m41..m44) is not written

  ProcedureReturn *m
EndProcedure
Procedure.l HSVadjustRBG_MX(RGB.l, *Matrix.M3)
  ; ======================================================================
  ; NAME : HSVadjustRBG_MX
  ; DESC : Do a HSV-Color Space adjust at a RGB color of 24/32 Bit
  ; DESC : with Matrix calculation. The Alpha value of the color is kept.
  ; DESC : If *Matrix is 0 the color is returned unchanged.
  ; VAR(RGB.l) : 24/32 Bit RGB Color
  ; VAR(*Matrix.M3) : Pointer to the precalculated 3x3 adjust Matrix
  ; RET.l : The HSV adjusted color as RGB-Color-Value
  ; ======================================================================
  Protected.f r, g, b             ; input channels, never modified
  Protected.f rOut, gOut, bOut    ; transformed channels

  r = Red(RGB)
  g = Green(RGB)
  b = Blue(RGB)

  ; without a matrix the input channels are passed through
  rOut = r
  gOut = g
  bOut = b

  ; The matrix elements are expected to be
  ;   m11 = 0.299*v + 0.701*vsu + 0.168*vsw
  ;   m12 = 0.587*v - 0.587*vsu + 0.330*vsw
  ;   m13 = 0.114*v - 0.114*vsu - 0.497*vsw
  ;   m21 = 0.299*v - 0.299*vsu - 0.328*vsw
  ;   m22 = 0.587*v + 0.413*vsu + 0.035*vsw
  ;   m23 = 0.114*v - 0.114*vsu + 0.292*vsw
  ;   m31 = 0.299*v - 0.300*vsu + 1.250*vsw
  ;   m32 = 0.587*v - 0.588*vsu - 1.050*vsw
  ;   m33 = 0.114*v + 0.886*vsu - 0.203*vsw

  If *Matrix

    ; Structure fields are accessed with the backslash. Every row is
    ; multiplied with the ORIGINAL r, g, b, so the results are stored in
    ; separate variables.
    rOut = *Matrix\m11 * r + *Matrix\m12 * g + *Matrix\m13 * b
    gOut = *Matrix\m21 * r + *Matrix\m22 * g + *Matrix\m23 * b
    bOut = *Matrix\m31 * r + *Matrix\m32 * g + *Matrix\m33 * b

    ; limit every channel to the 8 Bit range [0..255]
    If rOut > 255
      rOut = 255
    ElseIf rOut < 0
      rOut = 0
    EndIf

    If gOut > 255
      gOut = 255
    ElseIf gOut < 0
      gOut = 0
    EndIf

    If bOut > 255
      bOut = 255
    ElseIf bOut < 0
      bOut = 0
    EndIf

  EndIf

  ProcedureReturn RGBA(Int(rOut), Int(gOut), Int(bOut), Alpha(RGB))
EndProcedure
Procedure.l HSVadjustRBG_SSE(RGB.l, *Matrix.VECf::TMatrix)
  ; ======================================================================
  ; NAME : HSVadjustRBG_SSE
  ; DESC : Do a HSV-Color Space adjust at a RGB color of 24/32 Bit.
  ; DESC : The color is loaded into a VECf::TVector (x=Red, y=Green,
  ; DESC : z=Blue) and multiplied with the Matrix by
  ; DESC : VECf::Vector_X_Matrix() (presumably the SSE implementation of
  ; DESC : the VECf module). The Alpha value of the color is kept.
  ; VAR(RGB.l) : 24/32 Bit RGB Color
  ; VAR(*Matrix.VECf::TMatrix) : Pointer to the precalculated adjust Matrix
  ; RET.l : The HSV adjusted color as RGB-Color-Value
  ; ======================================================================
  Protected src.VECf::TVector   ; input color as vector
  Protected dst.VECf::TVector   ; transformed color

  src\x = Red(RGB)
  src\y = Green(RGB)
  src\z = Blue(RGB)

  VECf::Vector_X_Matrix(dst, src, *Matrix)

  ; limit every channel to the 8 Bit range [0..255]
  If dst\x > 255
    dst\x = 255
  ElseIf dst\x < 0
    dst\x = 0
  EndIf

  If dst\y > 255
    dst\y = 255
  ElseIf dst\y < 0
    dst\y = 0
  EndIf

  If dst\z > 255
    dst\z = 255
  ElseIf dst\z < 0
    dst\z = 0
  EndIf

  ProcedureReturn RGBA(Int(dst\x), Int(dst\y), Int(dst\z), Alpha(RGB))
EndProcedure
EnableExplicit

; ----------------------------------------------------------------------
;  Benchmark: HSV adjust of one FullHD picture with 3 implementations
; ----------------------------------------------------------------------

#Pixels = 1920*1080         ; number of pixels of a FullHD picture

; adjust parameters used by all 3 test runs
#HSVBench_Hue = 33
#HSVBench_Sat = 1.1
#HSVBench_Val = 0.7

Define.q ticks1, ticks2, ticks3, ticks4   ; CPU ticks per test run
Define.q time1, time2, time3, time4       ; microseconds per test run
Define.i I
Define.l col, newcol
Define.s msg
Define hsvMatrix.M3                       ; Matrix for test run 2
Define VecMatrix.VECf::TMatrix            ; Matrix for test run 3

col = RGB(120, 200, 99)                   ; the test color

; --------------------------------------------------------
; Run 1: Classic Version for adjusting 1 Pixel
;        (the Matrix calculation is done for every Pixel)
time1 = ElapsedMicroSeconds()
ticks1 = ReadRuntimeCounter()

For I = 1 To #Pixels
  newCol = HSVadjustRBG(col, #HSVBench_Hue, #HSVBench_Sat, #HSVBench_Val)
Next

ticks1 = ReadRuntimeCounter() - ticks1
time1 = ElapsedMicroSeconds() - time1

; --------------------------------------------------------
; Run 2: Classic Version with precalculated Matrix
;        this is much faster if adjusting more Pixels with same Parameters
;        (the Matrix calculation is part of the measured time)
time2 = ElapsedMicroSeconds()
ticks2 = ReadRuntimeCounter()

Get_HSVadjustRGB_Matrix(hsvMatrix, #HSVBench_Hue, #HSVBench_Sat, #HSVBench_Val)
For I = 1 To #Pixels
  newCol = HSVadjustRBG_MX(col, hsvMatrix)
Next

ticks2 = ReadRuntimeCounter() - ticks2
time2 = ElapsedMicroSeconds() - time2

; --------------------------------------------------------
; Run 3: Vector Module Version with precalculated Matrix
;        (the Matrix calculation is part of the measured time)
time3 = ElapsedMicroSeconds()
ticks3 = ReadRuntimeCounter()

Get_HSVadjustRBG_VecMatrix(VecMatrix, #HSVBench_Hue, #HSVBench_Sat, #HSVBench_Val)
For I = 1 To #Pixels
  newCol = HSVadjustRBG_SSE(col, VecMatrix)
Next

ticks3 = ReadRuntimeCounter() - ticks3
time3 = ElapsedMicroSeconds() - time3

; --------------------------------------------------------
; Report: million ticks, milliseconds and ticks per microsecond (= MHz)
msg = "Classic Float version: ticks = " + Str(ticks1 / 1e6) + " Mio ; " +
      StrF(time1/1000,1) + "ms ; " + Str(ticks1/time1) + " MHz" + #CRLF$

msg + "Classic Matrix version: ticks = " + Str(ticks2 / 1e6) + " Mio ; " +
      StrF(time2/1000,1) + "ms ; " + Str(ticks2/time2) + " MHz" + #CRLF$

msg + "SSE-VEC-Matrix version: ticks = " + Str(ticks3 / 1e6) + " Mio ; " +
      StrF(time3/1000,1) + "ms ; " + Str(ticks3/time3) + " MHz"

SetClipboardText(msg)
MessageRequester("CPU Ticks of HSVadjustRBG of a FullHD Picture = ", msg , #PB_MessageRequester_Info)

; Measured results (Float / Int)
;   240/160 Mio , 126/ 80ms : new Intel I7 8565U (Laptop, Win10)
;   540/300 Mio , 142/ 80ms : Ryzen 5800X (Desktop, Win10)
;   780/370 Mio , 202/110ms : old Intel I7 Laptop (2016, Win7)
;   970/650 Mio             : old AMD Laptop (2011, Win7)