Demo to show the effect of the CPU's code optimization
Posted: Fri Aug 02, 2024 8:02 am
Modern x64 CPUs are able to process two instructions simultaneously. The CPU's code optimization and branch prediction will do this for us if we prepare code that can be processed in parallel. In general we have to use different registers for the memory operations, so two 64-bit address operations are done simultaneously in one cycle. Additionally, we keep all data in registers, so we do not need any PUSH, POP or address reload.
The PB ASM compiler only does standard optimizations and is not prepared to exploit the CPU's automatic parallelisation!
With hand-optimized assembler code we can use these effects.
I prepared a simple example to show the effect: adding two 64 kB integer arrays 10,000 times.
; On Ryzen 5800
; PB_AddArray = 136ms
; ASM_AddArray = 26ms
; Demo to show the speed effect of hand-optimized assembler code
; when using the parallelisation capability of modern CPUs.
; For a function test, RUN with the Debugger.
; For the speed test, RUN without the Debugger.
; The example is a simple add of 2 integer arrays.
; On Ryzen 5800
; PB_AddArray = 136ms
; ASM_AddArray = 26ms
; Why is assembler so much faster than the PB compiler?
; Modern x64 CPUs are able to process 2 instructions simultaneously. The CPU's code
; optimization and branch prediction will do this for us if we prepare code that can be
; processed in parallel. In general we have to use different registers for the memory
; operations, so two 64-bit address operations are done simultaneously in 1 cycle.
; Additionally we keep all data in registers, so we do not need any PUSH, POP or address reload.
; That's one of the reasons the C backend is sometimes much faster than the PB ASM compiler:
; C compilers are able to do many such optimizations automatically!
EnableExplicit
#ArraySize = 8191
Global Dim A.q(#ArraySize)
Global Dim B.q(#ArraySize)
Global Dim C.q(#ArraySize)
Define J
For J = 0 To #ArraySize
A(J) = J
B(J) = J+1
Next
Debug "Address of A() = " + @A()
Debug "Address of B() = " + @B()
Debug "Address of C() = " + @C()
Debug ""
Procedure PB_AddArray(Array Out.q(1), Array In1.q(1), Array In2.q(1), ArraySize)
Protected I
For I = 0 To ArraySize
Out(I) = In1(I) + In2(I)
Next
ProcedureReturn 1
EndProcedure
Procedure ASM_AddArray1(Array Out.q(1), Array In1.q(1), Array In2.q(1), ArraySize)
; RAX, RDX ADD registers
; RCX Size
; R8 *Out
; R9 *In1
; R10 *In2
!MOV RCX, [p.v_ArraySize]
!MOV R8, [p.a_Out] ; *Out()
!MOV R8, [R8] ; Pointer to Data
!MOV R9, [p.a_In1] ; *In1()
!MOV R9, [R9] ; Pointer to Data
!MOV R10, [p.a_In2] ; *In2()
!MOV R10, [R10] ; Pointer to Data
!SHL RCX, 3 ; RCX = ArraySize * 8 : size in bytes
!ADD RCX, 8 ; +8 to compensate the first SUB in the loop
!@@:
!SUB RCX, 8
!JS @f ; Jump if Sign : If RCX < 0 Then jump to end
!MOV RAX, [R9 +RCX] ; both MOVs use different registers, so they execute simultaneously in 1 cycle
!MOV RDX, [R10+RCX]
!ADD RAX, RDX
!MOV [R8+RCX], RAX
!JMP @b
!@@:
!MOV RAX, 1 ; return value: ProcedureReturn keeps RAX
ProcedureReturn
EndProcedure
Procedure SpeedTest()
Protected J, res, msg$, t0, t1, t2
Protected ratio.f
#Loops = 10000
t0 = ElapsedMilliseconds()
For J=1 To #Loops
res= PB_AddArray(C(), A(), B(), #ArraySize)
Next
t0 = ElapsedMilliseconds() - t0
t1 = ElapsedMilliseconds()
For J=1 To #Loops
res= ASM_AddArray1(C(), A(), B(), #ArraySize)
Next
t1 = ElapsedMilliseconds() - t1
ratio = t0 / t1
msg$ = "PB_AddArray = " + Str(t0) +"ms" + #CRLF$
msg$ + "ASM_AddArray1 = " + Str(t1) +"ms" + #CRLF$
msg$ + "Ratio = " + StrF(ratio,2)
MessageRequester("Time", msg$)
EndProcedure
Procedure FunctionTest()
Protected res, J
res= ASM_AddArray1(C(), A(), B(), #ArraySize)
For J = 0 To #ArraySize
Debug Str(J) + " : " + Str(C(J))
Next
EndProcedure
CompilerIf #PB_Compiler_Debugger
FunctionTest()
CompilerElse
SpeedTest()
CompilerEndIf