I don't know.
In the end, you also need the SSE instructions in C for optimization.
For the tests, I need to use this strange code with random matrices, because the C optimizer eliminates the code when the result matrix is no longer used.
Code: Select all
; 4x4 matrix of 32-bit floats. Field Irc holds the element at row r, column c.
; The fields are declared column by column, so each group of four consecutive
; floats (16 bytes) in memory is one matrix column.
Structure UB2D_MATRIX4f
  ; column 1 (byte offsets 0..12)
  I11.f
  I21.f
  I31.f
  I41.f
  ; column 2 (byte offsets 16..28)
  I12.f
  I22.f
  I32.f
  I42.f
  ; column 3 (byte offsets 32..44)
  I13.f
  I23.f
  I33.f
  I43.f
  ; column 4 (byte offsets 48..60)
  I14.f
  I24.f
  I34.f
  I44.f
EndStructure
; Multiplies two 4x4 float matrices with SSE: *m4fResult = *m4fLeft * *m4fRight.
;
; Parameters:
;   *m4fResult - destination matrix (64 bytes are written)
;   *m4fLeft   - left operand
;   *m4fRight  - right operand
; Returns:
;   *m4fResult, matching UB2D_m4fMultiplicationNativ.
;
; The whole left matrix is loaded into xmm0-xmm3 before anything is stored, and
; each right-hand column is read before the matching result column is written,
; so *m4fResult may point to the same matrix as *m4fLeft or *m4fRight.
; All memory accesses use MOVUPS, so the matrices need not be 16-byte aligned.
Procedure.i UB2D_m4fMultiplicationASM( *m4fResult.UB2D_MATRIX4f, *m4fLeft.UB2D_MATRIX4f, *m4fRight.UB2D_MATRIX4f )
  ; Scratch area used to preserve the SSE registers clobbered below
  ; (only the first 48 of its 64 bytes are used).
  Protected Backup.UB2D_MATRIX4f

  CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
    ! MOV rax, [p.p_m4fResult]
    ! MOV rcx, [p.p_m4fLeft]
    ! MOV rdx, [p.p_m4fRight]
    ; Load the left matrix: xmm0..xmm3 = columns 1..4
    ! MOVUPS xmm0, [rcx+00]
    ! MOVUPS xmm1, [rcx+16]
    ! MOVUPS xmm2, [rcx+32]
    ! MOVUPS xmm3, [rcx+48]
    ; Save xmm4-xmm6, the working registers (xmm7 is not used)
    ! MOVUPS [p.v_Backup+00], xmm4
    ! MOVUPS [p.v_Backup+16], xmm5
    ! MOVUPS [p.v_Backup+32], xmm6
  CompilerElse
    ! MOV eax, [p.p_m4fResult]
    ! MOV ecx, [p.p_m4fLeft]
    ! MOV edx, [p.p_m4fRight]
    ; Load the left matrix: xmm0..xmm3 = columns 1..4
    ! MOVUPS xmm0, [ecx+00]
    ! MOVUPS xmm1, [ecx+16]
    ! MOVUPS xmm2, [ecx+32]
    ! MOVUPS xmm3, [ecx+48]
    ; Save xmm4-xmm6, the working registers (xmm7 is not used)
    ! MOVUPS [p.v_Backup+00], xmm4
    ! MOVUPS [p.v_Backup+16], xmm5
    ! MOVUPS [p.v_Backup+32], xmm6
  CompilerEndIf

  ; For every column c of the right matrix (held in xmm4), each of its four
  ; elements is broadcast with SHUFPS, multiplied with the matching left
  ; column, and the four products are summed in xmm6:
  ;   result column c = Left.col1 * R1c + Left.col2 * R2c + Left.col3 * R3c + Left.col4 * R4c
  CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
    ; Multiply with the right matrix (column 1)
    ! MOVUPS xmm4, [rdx+00]
    ! MOVAPS xmm6, xmm4
    ! SHUFPS xmm6, xmm6, 00000000b
    ! MULPS xmm6, xmm0
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 01010101b
    ! MULPS xmm5, xmm1
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 10101010b
    ! MULPS xmm5, xmm2
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 11111111b
    ! MULPS xmm5, xmm3
    ! ADDPS xmm6, xmm5
    ! MOVUPS [rax+00], xmm6
    ; Multiply with the right matrix (column 2)
    ! MOVUPS xmm4, [rdx+16]
    ! MOVAPS xmm6, xmm4
    ! SHUFPS xmm6, xmm6, 00000000b
    ! MULPS xmm6, xmm0
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 01010101b
    ! MULPS xmm5, xmm1
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 10101010b
    ! MULPS xmm5, xmm2
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 11111111b
    ! MULPS xmm5, xmm3
    ! ADDPS xmm6, xmm5
    ! MOVUPS [rax+16], xmm6
    ; Multiply with the right matrix (column 3)
    ! MOVUPS xmm4, [rdx+32]
    ! MOVAPS xmm6, xmm4
    ! SHUFPS xmm6, xmm6, 00000000b
    ! MULPS xmm6, xmm0
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 01010101b
    ! MULPS xmm5, xmm1
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 10101010b
    ! MULPS xmm5, xmm2
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 11111111b
    ! MULPS xmm5, xmm3
    ! ADDPS xmm6, xmm5
    ! MOVUPS [rax+32], xmm6
    ; Multiply with the right matrix (column 4)
    ! MOVUPS xmm4, [rdx+48]
    ! MOVAPS xmm6, xmm4
    ! SHUFPS xmm6, xmm6, 00000000b
    ! MULPS xmm6, xmm0
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 01010101b
    ! MULPS xmm5, xmm1
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 10101010b
    ! MULPS xmm5, xmm2
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 11111111b
    ! MULPS xmm5, xmm3
    ! ADDPS xmm6, xmm5
    ! MOVUPS [rax+48], xmm6
    ; Restore xmm4-xmm6
    ! MOVUPS xmm4, [p.v_Backup+00]
    ! MOVUPS xmm5, [p.v_Backup+16]
    ! MOVUPS xmm6, [p.v_Backup+32]
  CompilerElse
    ; Multiply with the right matrix (column 1)
    ! MOVUPS xmm4, [edx+00]
    ! MOVAPS xmm6, xmm4
    ! SHUFPS xmm6, xmm6, 00000000b
    ! MULPS xmm6, xmm0
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 01010101b
    ! MULPS xmm5, xmm1
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 10101010b
    ! MULPS xmm5, xmm2
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 11111111b
    ! MULPS xmm5, xmm3
    ! ADDPS xmm6, xmm5
    ! MOVUPS [eax+00], xmm6
    ; Multiply with the right matrix (column 2)
    ! MOVUPS xmm4, [edx+16]
    ! MOVAPS xmm6, xmm4
    ! SHUFPS xmm6, xmm6, 00000000b
    ! MULPS xmm6, xmm0
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 01010101b
    ! MULPS xmm5, xmm1
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 10101010b
    ! MULPS xmm5, xmm2
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 11111111b
    ! MULPS xmm5, xmm3
    ! ADDPS xmm6, xmm5
    ! MOVUPS [eax+16], xmm6
    ; Multiply with the right matrix (column 3)
    ! MOVUPS xmm4, [edx+32]
    ! MOVAPS xmm6, xmm4
    ! SHUFPS xmm6, xmm6, 00000000b
    ! MULPS xmm6, xmm0
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 01010101b
    ! MULPS xmm5, xmm1
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 10101010b
    ! MULPS xmm5, xmm2
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 11111111b
    ! MULPS xmm5, xmm3
    ! ADDPS xmm6, xmm5
    ! MOVUPS [eax+32], xmm6
    ; Multiply with the right matrix (column 4)
    ! MOVUPS xmm4, [edx+48]
    ! MOVAPS xmm6, xmm4
    ! SHUFPS xmm6, xmm6, 00000000b
    ! MULPS xmm6, xmm0
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 01010101b
    ! MULPS xmm5, xmm1
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 10101010b
    ! MULPS xmm5, xmm2
    ! ADDPS xmm6, xmm5
    ! MOVAPS xmm5, xmm4
    ! SHUFPS xmm5, xmm5, 11111111b
    ! MULPS xmm5, xmm3
    ! ADDPS xmm6, xmm5
    ! MOVUPS [eax+48], xmm6
    ; Restore xmm4-xmm6
    ! MOVUPS xmm4, [p.v_Backup+00]
    ! MOVUPS xmm5, [p.v_Backup+16]
    ! MOVUPS xmm6, [p.v_Backup+32]
  CompilerEndIf

  ; Hand the destination pointer back, like the native implementation does.
  ProcedureReturn *m4fResult
EndProcedure
; Plain PureBasic reference implementation of the 4x4 matrix product:
; *m4fResult = *m4fLeft * *m4fRight.
;
; The product is accumulated in a local matrix and copied to the destination
; only at the end, so *m4fResult may be the same matrix as either operand.
; Returns *m4fResult.
Procedure.i UB2D_m4fMultiplicationNativ( *m4fResult.UB2D_MATRIX4f, *m4fLeft.UB2D_MATRIX4f, *m4fRight.UB2D_MATRIX4f )
  Protected Product.UB2D_MATRIX4f

  ; Row 1 of the product
  Product\I11 = *m4fLeft\I11 * *m4fRight\I11 + *m4fLeft\I12 * *m4fRight\I21 + *m4fLeft\I13 * *m4fRight\I31 + *m4fLeft\I14 * *m4fRight\I41
  Product\I12 = *m4fLeft\I11 * *m4fRight\I12 + *m4fLeft\I12 * *m4fRight\I22 + *m4fLeft\I13 * *m4fRight\I32 + *m4fLeft\I14 * *m4fRight\I42
  Product\I13 = *m4fLeft\I11 * *m4fRight\I13 + *m4fLeft\I12 * *m4fRight\I23 + *m4fLeft\I13 * *m4fRight\I33 + *m4fLeft\I14 * *m4fRight\I43
  Product\I14 = *m4fLeft\I11 * *m4fRight\I14 + *m4fLeft\I12 * *m4fRight\I24 + *m4fLeft\I13 * *m4fRight\I34 + *m4fLeft\I14 * *m4fRight\I44

  ; Row 2 of the product
  Product\I21 = *m4fLeft\I21 * *m4fRight\I11 + *m4fLeft\I22 * *m4fRight\I21 + *m4fLeft\I23 * *m4fRight\I31 + *m4fLeft\I24 * *m4fRight\I41
  Product\I22 = *m4fLeft\I21 * *m4fRight\I12 + *m4fLeft\I22 * *m4fRight\I22 + *m4fLeft\I23 * *m4fRight\I32 + *m4fLeft\I24 * *m4fRight\I42
  Product\I23 = *m4fLeft\I21 * *m4fRight\I13 + *m4fLeft\I22 * *m4fRight\I23 + *m4fLeft\I23 * *m4fRight\I33 + *m4fLeft\I24 * *m4fRight\I43
  Product\I24 = *m4fLeft\I21 * *m4fRight\I14 + *m4fLeft\I22 * *m4fRight\I24 + *m4fLeft\I23 * *m4fRight\I34 + *m4fLeft\I24 * *m4fRight\I44

  ; Row 3 of the product
  Product\I31 = *m4fLeft\I31 * *m4fRight\I11 + *m4fLeft\I32 * *m4fRight\I21 + *m4fLeft\I33 * *m4fRight\I31 + *m4fLeft\I34 * *m4fRight\I41
  Product\I32 = *m4fLeft\I31 * *m4fRight\I12 + *m4fLeft\I32 * *m4fRight\I22 + *m4fLeft\I33 * *m4fRight\I32 + *m4fLeft\I34 * *m4fRight\I42
  Product\I33 = *m4fLeft\I31 * *m4fRight\I13 + *m4fLeft\I32 * *m4fRight\I23 + *m4fLeft\I33 * *m4fRight\I33 + *m4fLeft\I34 * *m4fRight\I43
  Product\I34 = *m4fLeft\I31 * *m4fRight\I14 + *m4fLeft\I32 * *m4fRight\I24 + *m4fLeft\I33 * *m4fRight\I34 + *m4fLeft\I34 * *m4fRight\I44

  ; Row 4 of the product
  Product\I41 = *m4fLeft\I41 * *m4fRight\I11 + *m4fLeft\I42 * *m4fRight\I21 + *m4fLeft\I43 * *m4fRight\I31 + *m4fLeft\I44 * *m4fRight\I41
  Product\I42 = *m4fLeft\I41 * *m4fRight\I12 + *m4fLeft\I42 * *m4fRight\I22 + *m4fLeft\I43 * *m4fRight\I32 + *m4fLeft\I44 * *m4fRight\I42
  Product\I43 = *m4fLeft\I41 * *m4fRight\I13 + *m4fLeft\I42 * *m4fRight\I23 + *m4fLeft\I43 * *m4fRight\I33 + *m4fLeft\I44 * *m4fRight\I43
  Product\I44 = *m4fLeft\I41 * *m4fRight\I14 + *m4fLeft\I42 * *m4fRight\I24 + *m4fLeft\I43 * *m4fRight\I34 + *m4fLeft\I44 * *m4fRight\I44

  ; Publish the finished product to the caller's matrix in one go.
  CopyMemory(@Product, *m4fResult, SizeOf(UB2D_MATRIX4f))

  ProcedureReturn *m4fResult
EndProcedure
; Fills all 16 floats of *m4fResult with pseudo-random values.
;
; Each element is Random(2147483647) scaled by 4.6566128752457969241e-10
; (about 1/2147483647), stretched by (fMax - fMin) and shifted by fMin.
; Note the parameter order: fMax comes before fMin.
; Returns *m4fResult.
Procedure.i UB2D_m4fRandom( *m4fResult.UB2D_MATRIX4f, fMax.f = 1.0, fMin.f = 0.0 )
  Protected Offset.i = 0
  Protected EndOffset.i = 16 * SizeOf(Float)   ; 16 elements of 4 bytes each

  ; Walk the matrix as a flat array of floats, one element per iteration.
  While Offset < EndOffset
    PokeF(*m4fResult + Offset, (fMax-fMin) * 4.6566128752457969241e-10 * Random(2147483647) + fMin )
    Offset + SizeOf(Float)
  Wend

  ProcedureReturn *m4fResult
EndProcedure
; Prints *m4fSource to the console as four lines, one per matrix row.
; Every element is formatted with 3 decimals and right-aligned in a
; 9-character column.
Procedure UB2D_m4fPrint( *m4fSource.UB2D_MATRIX4f )
  Protected Row.s

  Row = RSet(StrF(*m4fSource\I11, 3), 9)+RSet(StrF(*m4fSource\I12, 3), 9)+RSet(StrF(*m4fSource\I13, 3), 9)+RSet(StrF(*m4fSource\I14, 3), 9)
  PrintN( Row )

  Row = RSet(StrF(*m4fSource\I21, 3), 9)+RSet(StrF(*m4fSource\I22, 3), 9)+RSet(StrF(*m4fSource\I23, 3), 9)+RSet(StrF(*m4fSource\I24, 3), 9)
  PrintN( Row )

  Row = RSet(StrF(*m4fSource\I31, 3), 9)+RSet(StrF(*m4fSource\I32, 3), 9)+RSet(StrF(*m4fSource\I33, 3), 9)+RSet(StrF(*m4fSource\I34, 3), 9)
  PrintN( Row )

  Row = RSet(StrF(*m4fSource\I41, 3), 9)+RSet(StrF(*m4fSource\I42, 3), 9)+RSet(StrF(*m4fSource\I43, 3), 9)+RSet(StrF(*m4fSource\I44, 3), 9)
  PrintN( Row )
EndProcedure
; ---------------------------------------------------------------------------
; Benchmark driver: repeatedly computes A = A * B and reports the elapsed time.
; The result is fed back into A and printed at the end, so the multiplication
; result is actually used.
; ---------------------------------------------------------------------------
Define A.UB2D_MATRIX4f
Define B.UB2D_MATRIX4f
Define Elapsed.i
Define LoopOverhead.i
Define NetTime.i
Define Iter.i

#Count = 10000000

OpenConsole()

; Fixed seed so that every run multiplies the same matrices.
RandomSeed(1)
UB2D_m4fRandom(A, 0.6438257)
UB2D_m4fRandom(B, 0.6438257)

; Time an empty loop first; this overhead is subtracted from the measurement.
LoopOverhead = ElapsedMilliseconds()
For Iter = 1 To #Count
Next
LoopOverhead = ElapsedMilliseconds() - LoopOverhead

; Time the multiplication itself (swap the comment to benchmark the ASM variant).
Elapsed = ElapsedMilliseconds()
For Iter = 1 To #Count
  UB2D_m4fMultiplicationNativ(A, A, B)
  ;UB2D_m4fMultiplicationASM(A, A, B)
Next
Elapsed = ElapsedMilliseconds() - Elapsed

NetTime = Elapsed - LoopOverhead

UB2D_m4fPrint(A)
PrintN("Time: " + Str(NetTime)+" ms")
PrintN("Single Time: " + StrF(1.0e6*NetTime/#Count, 3)+" ns")

; Keep the console window open until the user presses Return.
Input()