SSE1/2 Include with example

Share your advanced PureBasic knowledge/code with the community.
Dreglor
Enthusiast
Enthusiast
Posts: 759
Joined: Sat Aug 02, 2003 11:22 pm
Location: OR, USA

SSE1/2 Include with example

Post by Dreglor »

I wrote this to help me when writing code that uses SSE, I thought I would release it to the public.
not everything has been tested completely so if you see any improvement or errors that need to be fixed please feel free to modify (or even add)
I made it a rule to comment alot to help let every one understand how everything works.
note that is does not include every instruction in SSE and SSE2 only the the ones I see are useful but your welcome to add to it
sorry for the long post...

Code: Select all

;////////////////////////////////////////////////////////////////
;//
;// Filename: SSE.pbi
;// Version: 1.0.0.0
;// Date: 10-19-06
;// Author: Steven (Dreglor) Garcia
;//
;////////////////////////////////////////////////////////////////

;
;This Include is handy tool for any SSE and/or SSE2 programmer it wraps the more useful SSE instructions to simple macros
;and procedures. It also includes some procedures that helpful when using SSE instructions.
;don't forget to enable inline ASM before compiling
;
  
;-Structures

;
;SSE Registers are 128 bit wide, they can be cut up in 2 ways so that muiltable data can fit inside the same register
;4 floats Or 2 doubles. this data can be operated on all at once, making it nice to get alot of stuff done quickly
;Note: That these structures self align some times but not always but it is a good idea to assume that there are not either poke them into
;the SSE register (which is slower then if pushed).
;

Structure SSE32 ;SSE, for 32 bit Handling
  StructureUnion
    Float1.f ;For set data
    x.f ;For vectors
    r.f ;For colors
  EndStructureUnion
  StructureUnion
    Float2.f
    y.f
    g.f
  EndStructureUnion
  StructureUnion
    Float3.f
    z.f
    b.f
  EndStructureUnion
  StructureUnion
    Float4.f
    w.f
    a.f
  EndStructureUnion
EndStructure

Structure SSE64 ;SSE2, for 64 bit Handling
  StructureUnion
    Double1.d ;For set data
  EndStructureUnion
  StructureUnion
    Double2.d
  EndStructureUnion
EndStructure

Structure SSE ;All-Purpose SSE Access
  StructureUnion
    Fields8.SSE64 ;it would be easier if we could nest structure unions
    Fields4.SSE32
  EndStructureUnion
EndStructure

;- Macros

;Force a pointer To be 16 byte Aligned
Macro Align16(pointer) 
  (((pointer) + 15) &~ $0F)
EndMacro
  
Macro FreeAlignedMemory(MemoryBlock, Size)
  FreeMemory(MemoryBlock - PeekB(MemoryBlock + Size + 1))
EndMacro

;Tranportation of SSE data
;Must be a Aligned Pointer
Macro PushSSE(SSEVariable, Register = 0) ;A fast version of PokeSSE
  MOV Eax, SSEVariable
  !movaps xmm#Register, [Eax]
EndMacro

Macro PopSSE(SSEVariable, Register = 0) ;A fast version of PeekSSE
  MOV Eax, SSEVariable
  !movaps [Eax], xmm#Register 
EndMacro

;If The Variable is not a direct pointer then you will have to use these
Macro PushSSEClean(SSEVariable, Register = 0) ;A fast version of PokeSSE
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movaps xmm#Register, [Eax]
EndMacro

Macro PopSSEClean(SSEVariable, Register = 0) ;A fast version of PeekSSE
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movaps [Eax], xmm#Register 
EndMacro

;the pointer doesn't have to be aligned
Macro PokeSSE(SSEVariable, Register = 0)
  MOV Eax, SSEVariable
  !movups xmm#Register, [Eax]
EndMacro

Macro PeekSSE(SSEVariable, Register = 0)
  MOV Eax, SSEVariable
  !movups [Eax], xmm#Register 
EndMacro

;If The Variable is not a direct pointer then you will have to use these
Macro PokeSSEClean(SSEVariable, Register = 0)
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movups xmm#Register, [Eax]
EndMacro

Macro PeekSSEClean(SSEVariable, Register = 0)
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movups [Eax], xmm#Register 
EndMacro

;the many ways to move data
;the low high stuff is really reversed here to make the SSE Structure correct when operating
;with low and high
Macro CopySSE(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro CopyLowSSE(SourceRegister, DestinationRegister)
  !movhps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro CopyHighSSE(SourceRegister, DestinationRegister)
  !movlps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro CopyLowToHighSSE(SourceRegister, DestinationRegister)
  !movhlps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro CopyHighToLowSSE(SourceRegister, DestinationRegister)
  !movlhps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro InterweaveFromLowSSE(SourceRegister, DestinationRegister)
  !unpckhps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro InterweaveFromHighSSE(SourceRegister, DestinationRegister)
  !unpcklps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro ShuffleSSE(SourceRegister, DestinationRegister, ShuffleMask) ;use binary (no % prefix)
  !shufps xmm#DestinationRegister, xmm#DestinationRegister, ShuffleMask#b
EndMacro

Macro BroadcastSSEFloat1(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 00000000)
EndMacro

Macro BroadcastSSEFloat2(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 01010101)
EndMacro

Macro BroadcastSSEFloat3(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 10101010)
EndMacro

Macro BroadcastSSEFloat4(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 11111111)
EndMacro

;here are some useful debugging procedures
Macro DebugSSERegisterAsFloat(Register = 0)
  _TempDebug32.SSE32
  PopSSEClean(_TempDebug32, Register)
  Debug StrF(_TempDebug32\Float1) + ", " + StrF(_TempDebug32\Float2) + ", " + StrF(_TempDebug32\Float3) + ", " + StrF(_TempDebug32\Float4)
EndMacro

Macro DebugSSERegisterAsDouble(Register = 0)
  _TempDebug64.SSE64
  PopSSEClean(_TempDebug64, Register)
  Debug StrD(_TempDebug64\Double1) + ", " +StrD(_TempDebug64\Double2)
EndMacro

;
;The Entire Instruction set is not represented here nor is it as flexable as writing them by hand
;the instructions take only 2 parameters; Destination and Source. the source register can an address
;pointer but these macros don't allow that to make it easier it will be couple cycles longer to push 
;then operate on the Register
;please not that what ever is in DestinationRegister will be destroyed and be over written with the result
;

;SSE Instructions
;Arithmetic
Macro AddSingleFloat(SourceRegister, DestinationRegister)
  !addss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractSingleFloat(SourceRegister, DestinationRegister)
  !subss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplySingleFloat(SourceRegister, DestinationRegister)
  !mulss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideSingleFloat(SourceRegister, DestinationRegister)
  !divss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrSingleFloat(SourceRegister, DestinationRegister)
  !sqrtss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeSingleFloat(SourceRegister, DestinationRegister)
  !maxss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeSingleFloat(SourceRegister, DestinationRegister)
  !minss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro AddFloats(SourceRegister, DestinationRegister)
  !addps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractFloats(SourceRegister, DestinationRegister)
  !subps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplyFloats(SourceRegister, DestinationRegister)
  !mulps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideFloats(SourceRegister, DestinationRegister)
  !divps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrFloats(SourceRegister, DestinationRegister)
  !sqrtps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeFloats(SourceRegister, DestinationRegister)
  !maxps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeFloats(SourceRegister, DestinationRegister)
  !minps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;Recipicol
Macro RcpSingleFloat(SourceRegister, DestinationRegister)
  !rcpss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RSqrSingleFloat(SourceRegister, DestinationRegister)
  !rsqrtss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpFloats(SourceRegister, DestinationRegister)
  !rcpps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpSqrFloats(SourceRegister, DestinationRegister)
  !rsqrtps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro CompareIsEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 0 
EndMacro

Macro CompareIsLesserSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 1 
EndMacro

Macro CompareIsLesserOrEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 2 
EndMacro

Macro CompareIsUnorderedSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 3 
EndMacro

Macro CompareIsNotEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 4 
EndMacro

Macro CompareIsNotLesserSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 5 
EndMacro

Macro CompareIsNotLesserOrEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 6 
EndMacro

Macro CompareIsOrderedSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 7 
EndMacro

Macro CompareIsEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 0 
EndMacro

Macro CompareIsLesserFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 1 
EndMacro

Macro CompareIsLesserOrEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 2 
EndMacro

Macro CompareIsUnorderedFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 3 
EndMacro

Macro CompareIsNotEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 4 
EndMacro

Macro CompareIsNotLesserFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 5 
EndMacro

Macro CompareIsNotLesserOrEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 6 
EndMacro

Macro CompareIsOrderedFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 7 
EndMacro

Macro AndSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !andps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro NandSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !andnps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro OrSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !orps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro XorSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !xorps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;SSE2 Instructions
;Arithmetic
Macro AddSingleDouble(SourceRegister, DestinationRegister)
  !addsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractSingleDouble(SourceRegister, DestinationRegister)
  !subsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplySingleDouble(SourceRegister, DestinationRegister)
  !mulsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideSingleDouble(SourceRegister, DestinationRegister)
  !divsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrSingleDouble(SourceRegister, DestinationRegister)
  !sqrtsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeSingleDouble(SourceRegister, DestinationRegister)
  !maxsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeSingleDouble(SourceRegister, DestinationRegister)
  !minsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro AddDoubles(SourceRegister, DestinationRegister)
  !addpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractDoubles(SourceRegister, DestinationRegister)
  !subpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplyDoubles(SourceRegister, DestinationRegister)
  !mulpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideDoubles(SourceRegister, DestinationRegister)
  !divpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrDoubles(SourceRegister, DestinationRegister)
  !sqrtpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeDoubles(SourceRegister, DestinationRegister)
  !maxpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeDoubles(SourceRegister, DestinationRegister)
  !minpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;Recipicol
Macro RcpSingleDouble(SourceRegister, DestinationRegister)
  !rcpsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RSqrSingleDouble(SourceRegister, DestinationRegister)
  !rsqrtsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpDoubles(SourceRegister, DestinationRegister)
  !rcppd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpSqrDoubles(SourceRegister, DestinationRegister)
  !rsqrtpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;Logical

Macro CompareIsEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 0 
EndMacro

Macro CompareIsLesserSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 1 
EndMacro

Macro CompareIsLesserOrEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 2 
EndMacro

Macro CompareIsUnorderedSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 3 
EndMacro

Macro CompareIsNotEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 4 
EndMacro

Macro CompareIsNotLesserSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 5 
EndMacro

Macro CompareIsNotLesserOrEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 6 
EndMacro

Macro CompareIsOrderedSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 7 
EndMacro

Macro CompareIsEqualDoubles(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 0 
EndMacro

Macro CompareIsLesserDoubles(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 1 
EndMacro

Macro CompareIsLesserOrEqualDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 2 
EndMacro

Macro CompareIsUnorderedDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 3 
EndMacro

Macro CompareIsNotEqualDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 4 
EndMacro

Macro CompareIsNotLesserDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 5 
EndMacro

Macro CompareIsNotLesserOrEqualDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 6 
EndMacro

Macro CompareIsOrderedDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 7 
EndMacro

Macro ShiftLeftSSE(SourceRegister, Value) ;Operates on all 128bits
  !pslldq xmm#DestinationRegister, Value
EndMacro

Macro ShiftRightSSE(SourceRegister, Value) ;Operates on all 128bits
  !psrldq xmm#DestinationRegister, Value
EndMacro
  
;- Procedures
  
Procedure IsAligned(pointer)
  If pointer % 16 = 0
    ProcedureReturn #True
  Else
    ProcedureReturn #False
  EndIf
EndProcedure

;  
;Because allocating memory and aligning it causes memory leaks when freed later,
;This is one way of handling 16 byte aligned memory blocks, althought proably not the best way to do so,
;what it does is it allocates memory with a 16 bytes extra then it aligns the memory block.
;it will always have at least one byte at the end of the memory block this is where the offset is stored
;so when we go to free the memory block it will read the ending byte push the pointer back to the orignal
;pointer was and free from there other wise it will free the aligned block and leave up to 15 bytes
;this may not seam like a lot but when you have alot of this happing the left overs stack up
;and it is not good practice to leave leaks such as theses. 
;

Procedure AllocateAlignedMemory(Size.l)
  pointer = AllocateMemory(Size + 16)
  ReturningPointer = Align16(pointer)
  PokeB(ReturningPointer + Size + 1, (pointer % 16))
  ProcedureReturn ReturningPointer
EndProcedure
  
Procedure ReAllocateAlignedMemory(MemoryBlock.l, Size.l)
  pointer = ReAllocateMemory(MemoryBlock, Size + 16)
  ReturningPointer = Align16(pointer)
  PokeB(ReturningPointer + Size + 1, pointer % 16)
  ProcedureReturn ReturningPointer
EndProcedure
  
;Instruction Set Checks
Procedure CheckSSE()
  FeatureFlags.l
  MOV Eax, 1
  !CPUID
  MOV FeatureFlags, Edx
  ProcedureReturn (FeatureFlags >> 25) & 1
EndProcedure
  
Procedure CheckSSE2()
  FeatureFlags.l
  MOV Eax, 1
  !CPUID
  MOV FeatureFlags, Edx
  ProcedureReturn (FeatureFlags >> 26) & 1
EndProcedure
you notice that i almost exclusivly use macros for several reasons they have no call overhead and they can be very flexable when apllied correctly but this can also be it's downside macors can screw with the code and aren't exactly isolated like procedures are so it can also be difficult when trying to use these in more dynamic situations.
now for the example

Code: Select all

;SSE Example

IncludeFile "SSE.pbi"

Structure MatrixRow
  Col.f[4]
EndStructure

Structure Matrix
  Row.MatrixRow[4]
EndStructure

Structure Vector
  x.f
  y.f
  z.f
  w.f
EndStructure

;matrix is represented as such
;  c1  c2  c3  c4
;r1 0   0   0   0
;r2 0   0   0   0
;r3 0   0   0   0
;r4 0   0   0   0
;

;Matrix Transform

;PureBasic method
Procedure MatrixTransformPB(*Matrix.Matrix, *Vector.Vector, *Output.Vector)
  ;Nothing special here 
  *Output\x = *Matrix\Row[0]\Col[0] * *Vector\x + *Matrix\Row[0]\Col[1] * *Vector\y + *Matrix\Row[0]\Col[2] * *Vector\z + *Matrix\Row[0]\Col[3] * *Vector\w
  *Output\y = *Matrix\Row[1]\Col[0] * *Vector\x + *Matrix\Row[1]\Col[1] * *Vector\y + *Matrix\Row[1]\Col[2] * *Vector\z + *Matrix\Row[1]\Col[3] * *Vector\w
  *Output\z = *Matrix\Row[2]\Col[0] * *Vector\x + *Matrix\Row[2]\Col[1] * *Vector\y + *Matrix\Row[2]\Col[2] * *Vector\z + *Matrix\Row[2]\Col[3] * *Vector\w
  *Output\w = *Matrix\Row[3]\Col[0] * *Vector\x + *Matrix\Row[3]\Col[1] * *Vector\y + *Matrix\Row[3]\Col[2] * *Vector\z + *Matrix\Row[3]\Col[3] * *Vector\w
EndProcedure

Procedure MatrixTransformSSE(*Matrix.Matrix, *Vector.Vector, *Output.Vector)
  ;
  ;I am no SSE wizard I have to look up 90% of the instructions to make sure what there doing it correct
  ;so this is possabily the worst way about doing this but, still, obtaining ~70% increase is impressive
  ;if you want to redo this procedure the correct way and get the best out of SSE the look no further than here
  ;http://www.cortstratton.org/articles/OptimizingForSSE.php
  ;
  
  
  PushSSEClean(*Matrix\Row[0], 0)
  PushSSEClean(*Matrix\Row[1], 1)
  PushSSEClean(*Matrix\Row[2], 2)
  PushSSEClean(*Matrix\Row[3], 3)
  
  PushSSE(*Vector, 4)
  
  ;Muiltiply the vector to the matrix rows
  MulitplyFloats(4, 0)
  MulitplyFloats(4, 1)
  MulitplyFloats(4, 2)
  MulitplyFloats(4, 3)
  
  ;add up the rows
  CopySSE(0, 4)
  ShuffleSSE(4, 0, 01001110)
  AddFloats(4, 0)
  CopySSE(0, 4)
  ShuffleSSE(4, 0, 00010001)
  AddFloats(4, 0)
  
  CopySSE(1, 4)
  ShuffleSSE(4, 1, 01001110)
  AddFloats(4, 1)
  CopySSE(1, 4)
  ShuffleSSE(4, 1, 00010001)
  AddFloats(4, 1)
  
  CopySSE(2, 4)
  ShuffleSSE(4, 2, 01001110)
  AddFloats(4, 2)
  CopySSE(2, 4)
  ShuffleSSE(4, 2, 00010001)
  AddFloats(4, 2)
  
  CopySSE(3, 4)
  ShuffleSSE(4, 3, 01001110)
  AddFloats(4, 3)
  CopySSE(3, 4)
  ShuffleSSE(4, 3, 00010001)
  AddFloats(4, 3)
  
  ;mash the rows sum of the rows into once SSE register 
  CopySSE(0, 4)
  CopySSE(2, 5)
  InterweaveFromLowSSE(1, 4)
  InterweaveFromLowSSE(3, 5)
  CopyHighSSE(4, 5)
  PopSSE(*Output, 5)
EndProcedure


;check to see if SSE is available if not end the program
If CheckSSE() = #False
  MessageRequester("Error", "SSE Instructions where not detected on this computer and are needed", #MB_ICONERROR)
  End
EndIf

;for the older hardware users,
;if SSE2 is not available then just show a message (because it isn't required to run this example)
If CheckSSE2() = #False
  MessageRequester("Info", "SSE Instructions where not detected on this computer but are not needed", #MB_ICONWARNING)
EndIf

;set up the test values
*TestMatrix.Matrix = AllocateAlignedMemory(SizeOf(Matrix))
*TestMatrix\Row[0]\Col[0] = 1
*TestMatrix\Row[0]\Col[1] = 2
*TestMatrix\Row[0]\Col[2] = 3
*TestMatrix\Row[0]\Col[3] = 4

*TestMatrix\Row[1]\Col[0] = 5
*TestMatrix\Row[1]\Col[1] = 6
*TestMatrix\Row[1]\Col[2] = 7
*TestMatrix\Row[1]\Col[3] = 8

*TestMatrix\Row[2]\Col[0] = 9
*TestMatrix\Row[2]\Col[1] = 10
*TestMatrix\Row[2]\Col[2] = 11
*TestMatrix\Row[2]\Col[3] = 12

*TestMatrix\Row[3]\Col[0] = 13
*TestMatrix\Row[3]\Col[1] = 14
*TestMatrix\Row[3]\Col[2] = 15
*TestMatrix\Row[3]\Col[3] = 16

*TestVector.Vector = AllocateAlignedMemory(SizeOf(Vector)) 
*TestVector\x = 1
*TestVector\y = 2
*TestVector\z = 3
*TestVector\w = 4

*Output.Vector = AllocateAlignedMemory(SizeOf(Vector)) 

startPB = ElapsedMilliseconds()
For i=0 To 100000000
  MatrixTransformPB(*TestMatrix, *TestVector, *Output)
Next
EndPB = ElapsedMilliseconds()

startSSE = ElapsedMilliseconds()
For i=0 To 100000000
  MatrixTransformSSE(*TestMatrix, *TestVector, *Output)
Next
EndSSE = ElapsedMilliseconds()

;it is good practice to free allocated memory before ending the program
FreeAlignedMemory(*TestMatrix, SizeOf(Matrix))
FreeAlignedMemory(*TestVector, SizeOf(Vector))
FreeAlignedMemory(*Output, SizeOf(Vector))

;show the advatage SSE has
MessageRequester("Info","Purebasic Timming: " + Str(EndPB - startPB) + Chr(10) + "SSE Timming: " + Str(EndSSE - startSSE))
~Dreglor
KarLKoX
Enthusiast
Enthusiast
Posts: 681
Joined: Mon Oct 06, 2003 7:13 pm
Location: France
Contact:

Post by KarLKoX »

Very usefull and clean code Image
"Qui baise trop bouffe un poil." P. Desproges

http://karlkox.blogspot.com/
va!n
Addict
Addict
Posts: 1104
Joined: Wed Apr 20, 2005 12:48 pm

Post by va!n »

nice! ;)
va!n aka Thorsten

Intel i7-980X Extreme Edition, 12 GB DDR3, Radeon 5870 2GB, Windows7 x64,
Ollivier
Enthusiast
Enthusiast
Posts: 281
Joined: Mon Jul 23, 2007 8:30 pm
Location: FR

Post by Ollivier »

It's late but thank you!
User avatar
grabiller
Enthusiast
Enthusiast
Posts: 309
Joined: Wed Jun 01, 2011 9:38 am
Location: France - 89220 Rogny-Les-Septs-Ecluses
Contact:

Re: SSE1/2 Include with example

Post by grabiller »

I was trying to run the test with PB v5.30b2 but I get these errors:

On x64:
PureBasic.asm [1494]:
MOV Eax,qword[rsp+PS12+8]
error: operand sizes do not match.

On x86:
PureBasic.asm [4481]:
movlps xmm5, xmm4
error: invalid operand

Any idea what should be done to make this work with the latest version of PureBasic ?
guy rabiller | radfac founder / ceo | raafal.org
Kuzmat
User
User
Posts: 13
Joined: Mon Jan 27, 2014 9:09 am

Re: SSE1/2 Include with example

Post by Kuzmat »

PB 5.31 (x32 only)
sse.pbi

Code: Select all

;////////////////////////////////////////////////////////////////
;//
;// Filename: SSE.pbi
;// Version: 1.0.0.0
;// Date: 10-19-06
;// Author: Steven (Dreglor) Garcia
;//
;////////////////////////////////////////////////////////////////

;
;This Include is handy tool for any SSE and/or SSE2 programmer it wraps the more useful SSE instructions to simple macros
;and procedures. It also includes some procedures that helpful when using SSE instructions.
;don't forget to enable inline ASM before compiling
;
 
;-Structures

;
;SSE Registers are 128 bit wide, they can be cut up in 2 ways so that muiltable data can fit inside the same register
;4 floats Or 2 doubles. this data can be operated on all at once, making it nice to get alot of stuff done quickly
;Note: That these structures self align some times but not always but it is a good idea to assume that there are not either poke them into
;the SSE register (which is slower then if pushed).
;
EnableASM

Structure SSE32 ;SSE, for 32 bit Handling
  StructureUnion
    Float1.f ;For set data
    x.f ;For vectors
    r.f ;For colors
  EndStructureUnion
  StructureUnion
    Float2.f
    y.f
    g.f
  EndStructureUnion
  StructureUnion
    Float3.f
    z.f
    b.f
  EndStructureUnion
  StructureUnion
    Float4.f
    w.f
    a.f
  EndStructureUnion
EndStructure

Structure SSE64 ;SSE2, for 64 bit Handling
  StructureUnion
    Double1.d ;For set data
  EndStructureUnion
  StructureUnion
    Double2.d
  EndStructureUnion
EndStructure

Structure SSE ;All-Purpose SSE Access
  StructureUnion
    Fields8.SSE64 ;it would be easier if we could nest structure unions
    Fields4.SSE32
  EndStructureUnion
EndStructure

;- Macros

;Force a pointer To be 16 byte Aligned
Macro Align16(pointer)
  (((pointer) + 15) &~ $0F)
EndMacro
 
Macro FreeAlignedMemory(MemoryBlock, Size)
  FreeMemory(MemoryBlock - PeekB(MemoryBlock + Size + 1))
EndMacro

;Tranportation of SSE data
;Must be a Aligned Pointer
Macro PushSSE(SSEVariable, Register = 0) ;A fast version of PokeSSE
  MOV Eax, SSEVariable
  !movaps xmm#Register, [Eax]
EndMacro

Macro PopSSE(SSEVariable, Register = 0) ;A fast version of PeekSSE
  MOV Eax, SSEVariable
  !movaps [Eax], xmm#Register
EndMacro

;If The Variable is not a direct pointer then you will have to use these
Macro PushSSEClean(SSEVariable, Register = 0) ;A fast version of PokeSSE
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movaps xmm#Register, [Eax]
EndMacro

Macro PopSSEClean(SSEVariable, Register = 0) ;A fast version of PeekSSE
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movaps [Eax], xmm#Register
EndMacro

;the pointer doesn't have to be aligned
Macro PokeSSE(SSEVariable, Register = 0)
  MOV Eax, SSEVariable
  !movups xmm#Register, [Eax]
EndMacro

Macro PeekSSE(SSEVariable, Register = 0)
  MOV Eax, SSEVariable
  !movups [Eax], xmm#Register
EndMacro

;If The Variable is not a direct pointer then you will have to use these
Macro PokeSSEClean(SSEVariable, Register = 0)
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movups xmm#Register, [Eax]
EndMacro

Macro PeekSSEClean(SSEVariable, Register = 0)
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movups [Eax], xmm#Register
EndMacro

;the many ways to move data
;the low high stuff is really reversed here to make the SSE Structure correct when operating
;with low and high
Macro CopySSE(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

; Macro CopyLowSSE(SourceRegister, DestinationRegister)
;   !movhps xmm#DestinationRegister, xmm#SourceRegister
; EndMacro
Macro PopLowSSE(SSEVariable, Register)
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movhps [Eax], xmm#Register
EndMacro

Macro PushLowSSE(SSEVariable, Register)
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer
  !movhps xmm#Register, [Eax]
EndMacro

; Macro CopyHighSSE(SourceRegister, DestinationRegister)
;   !movlps xmm#DestinationRegister, xmm#SourceRegister
; EndMacro
Macro PopHighSSE(SSEVariable, Register)
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer  
  !movlps [Eax], xmm#Register
EndMacro

Macro PushHighSSE(SSEVariable, Register)
  _Pointer.l = SSEVariable
  MOV Eax, _Pointer  
  !movlps xmm#Register, [Eax]
EndMacro


Macro CopyLowToHighSSE(SourceRegister, DestinationRegister)
  !movhlps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro CopyHighToLowSSE(SourceRegister, DestinationRegister)
  !movlhps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro InterweaveFromLowSSE(SourceRegister, DestinationRegister)
  !unpckhps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro InterweaveFromHighSSE(SourceRegister, DestinationRegister)
  !unpcklps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro ShuffleSSE(SourceRegister, DestinationRegister, ShuffleMask) ;use binary (no % prefix)
  !shufps xmm#DestinationRegister, xmm#DestinationRegister, ShuffleMask#b
EndMacro

Macro BroadcastSSEFloat1(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 00000000)
EndMacro

Macro BroadcastSSEFloat2(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 01010101)
EndMacro

Macro BroadcastSSEFloat3(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 10101010)
EndMacro

Macro BroadcastSSEFloat4(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 11111111)
EndMacro

;here are some useful debugging procedures
Macro DebugSSERegisterAsFloat(Register = 0)
  _TempDebug32.SSE32
  PopSSEClean(_TempDebug32, Register)
  Debug StrF(_TempDebug32\Float1) + ", " + StrF(_TempDebug32\Float2) + ", " + StrF(_TempDebug32\Float3) + ", " + StrF(_TempDebug32\Float4)
EndMacro

Macro DebugSSERegisterAsDouble(Register = 0)
  _TempDebug64.SSE64
  PopSSEClean(_TempDebug64, Register)
  Debug StrD(_TempDebug64\Double1) + ", " +StrD(_TempDebug64\Double2)
EndMacro

;
;The Entire Instruction set is not represented here nor is it as flexable as writing them by hand
;the instructions take only 2 parameters; Destination and Source. the source register can an address
;pointer but these macros don't allow that to make it easier it will be couple cycles longer to push
;then operate on the Register
;please not that what ever is in DestinationRegister will be destroyed and be over written with the result
;

;SSE Instructions
;Arithmetic
Macro AddSingleFloat(SourceRegister, DestinationRegister)
  !addss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractSingleFloat(SourceRegister, DestinationRegister)
  !subss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplySingleFloat(SourceRegister, DestinationRegister)
  !mulss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideSingleFloat(SourceRegister, DestinationRegister)
  !divss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrSingleFloat(SourceRegister, DestinationRegister)
  !sqrtss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeSingleFloat(SourceRegister, DestinationRegister)
  !maxss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeSingleFloat(SourceRegister, DestinationRegister)
  !minss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro AddFloats(SourceRegister, DestinationRegister)
  !addps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractFloats(SourceRegister, DestinationRegister)
  !subps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplyFloats(SourceRegister, DestinationRegister)
  !mulps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideFloats(SourceRegister, DestinationRegister)
  !divps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrFloats(SourceRegister, DestinationRegister)
  !sqrtps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeFloats(SourceRegister, DestinationRegister)
  !maxps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeFloats(SourceRegister, DestinationRegister)
  !minps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;Recipicol
Macro RcpSingleFloat(SourceRegister, DestinationRegister)
  !rcpss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RSqrSingleFloat(SourceRegister, DestinationRegister)
  !rsqrtss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpFloats(SourceRegister, DestinationRegister)
  !rcpps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpSqrFloats(SourceRegister, DestinationRegister)
  !rsqrtps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro CompareIsEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 0
EndMacro

Macro CompareIsLesserSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 1
EndMacro

Macro CompareIsLesserOrEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 2
EndMacro

Macro CompareIsUnorderedSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 3
EndMacro

Macro CompareIsNotEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 4
EndMacro

Macro CompareIsNotLesserSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 5
EndMacro

Macro CompareIsNotLesserOrEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 6
EndMacro

Macro CompareIsOrderedSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 7
EndMacro

Macro CompareIsEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 0
EndMacro

Macro CompareIsLesserFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 1
EndMacro

Macro CompareIsLesserOrEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 2
EndMacro

Macro CompareIsUnorderedFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 3
EndMacro

Macro CompareIsNotEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 4
EndMacro

Macro CompareIsNotLesserFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 5
EndMacro

Macro CompareIsNotLesserOrEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 6
EndMacro

Macro CompareIsOrderedFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 7
EndMacro

Macro AndSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !andps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro NandSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !andnps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro OrSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !orps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro XorSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !xorps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;SSE2 Instructions
;Arithmetic
Macro AddSingleDouble(SourceRegister, DestinationRegister)
  !addsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractSingleDouble(SourceRegister, DestinationRegister)
  !subsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplySingleDouble(SourceRegister, DestinationRegister)
  !mulsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideSingleDouble(SourceRegister, DestinationRegister)
  !divsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrSingleDouble(SourceRegister, DestinationRegister)
  !sqrtsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeSingleDouble(SourceRegister, DestinationRegister)
  !maxsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeSingleDouble(SourceRegister, DestinationRegister)
  !minsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro AddDoubles(SourceRegister, DestinationRegister)
  !addpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractDoubles(SourceRegister, DestinationRegister)
  !subpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplyDoubles(SourceRegister, DestinationRegister)
  !mulpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideDoubles(SourceRegister, DestinationRegister)
  !divpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrDoubles(SourceRegister, DestinationRegister)
  !sqrtpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeDoubles(SourceRegister, DestinationRegister)
  !maxpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeDoubles(SourceRegister, DestinationRegister)
  !minpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;Recipicol
Macro RcpSingleDouble(SourceRegister, DestinationRegister)
  !rcpsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RSqrSingleDouble(SourceRegister, DestinationRegister)
  !rsqrtsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpDoubles(SourceRegister, DestinationRegister)
  !rcppd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpSqrDoubles(SourceRegister, DestinationRegister)
  !rsqrtpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;Logical

Macro CompareIsEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 0
EndMacro

Macro CompareIsLesserSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 1
EndMacro

Macro CompareIsLesserOrEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 2
EndMacro

Macro CompareIsUnorderedSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 3
EndMacro

Macro CompareIsNotEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 4
EndMacro

Macro CompareIsNotLesserSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 5
EndMacro

Macro CompareIsNotLesserOrEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 6
EndMacro

Macro CompareIsOrderedSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 7
EndMacro

Macro CompareIsEqualDoubles(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 0
EndMacro

Macro CompareIsLesserDoubles(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 1
EndMacro

Macro CompareIsLesserOrEqualDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 2
EndMacro

Macro CompareIsUnorderedDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 3
EndMacro

Macro CompareIsNotEqualDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 4
EndMacro

Macro CompareIsNotLesserDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 5
EndMacro

Macro CompareIsNotLesserOrEqualDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 6
EndMacro

Macro CompareIsOrderedDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 7
EndMacro

Macro ShiftLeftSSE(SourceRegister, Value) ;Operates on all 128bits
  !pslldq xmm#DestinationRegister, Value
EndMacro

Macro ShiftRightSSE(SourceRegister, Value) ;Operates on all 128bits
  !psrldq xmm#DestinationRegister, Value
EndMacro
 
;- Procedures
 
Procedure IsAligned(pointer)
  If pointer % 16 = 0
    ProcedureReturn #True
  Else
    ProcedureReturn #False
  EndIf
EndProcedure

; 
;Because allocating memory and aligning it causes memory leaks when freed later,
;This is one way of handling 16 byte aligned memory blocks, althought proably not the best way to do so,
;what it does is it allocates memory with a 16 bytes extra then it aligns the memory block.
;it will always have at least one byte at the end of the memory block this is where the offset is stored
;so when we go to free the memory block it will read the ending byte push the pointer back to the orignal
;pointer was and free from there other wise it will free the aligned block and leave up to 15 bytes
;this may not seam like a lot but when you have alot of this happing the left overs stack up
;and it is not good practice to leave leaks such as theses.
;

Procedure AllocateAlignedMemory(Size.l)
  pointer = AllocateMemory(Size + 16)
  ReturningPointer = Align16(pointer)
  PokeB(ReturningPointer + Size + 1, ReturningPointer - pointer)
  ProcedureReturn ReturningPointer
EndProcedure
 
Procedure ReAllocateAlignedMemory(MemoryBlock.l, Size.l)
  pointer = ReAllocateMemory(MemoryBlock, Size + 16)
  ReturningPointer = Align16(pointer)
  PokeB(ReturningPointer + Size + 1, ReturningPointer - pointer)
  ProcedureReturn ReturningPointer
EndProcedure
 
;Instruction Set Checks
Procedure CheckSSE()
  FeatureFlags.l
  MOV Eax, 1
  !CPUID
  MOV FeatureFlags, Edx
  ProcedureReturn (FeatureFlags >> 25) & 1
EndProcedure
 
Procedure CheckSSE2()
  FeatureFlags.l
  MOV Eax, 1
  !CPUID
  MOV FeatureFlags, Edx
  ProcedureReturn (FeatureFlags >> 26) & 1
EndProcedure
sse_example.pb

Code: Select all

;SSE Example

XIncludeFile "SSE.pbi"

Structure MatrixRow
  Col.f[4]
EndStructure

Structure Matrix
  Row.MatrixRow[4]
EndStructure

Structure Vector
  x.f
  y.f
  z.f
  w.f
EndStructure

;matrix is represented as such
;  c1  c2  c3  c4
;r1 0   0   0   0
;r2 0   0   0   0
;r3 0   0   0   0
;r4 0   0   0   0
;

;Matrix Transform

;PureBasic method
Procedure MatrixTransformPB(*Matrix.Matrix, *Vector.Vector, *Output.Vector)
  ;Nothing special here
  *Output\x = *Matrix\Row[0]\Col[0] * *Vector\x + *Matrix\Row[0]\Col[1] * *Vector\y + *Matrix\Row[0]\Col[2] * *Vector\z + *Matrix\Row[0]\Col[3] * *Vector\w
  *Output\y = *Matrix\Row[1]\Col[0] * *Vector\x + *Matrix\Row[1]\Col[1] * *Vector\y + *Matrix\Row[1]\Col[2] * *Vector\z + *Matrix\Row[1]\Col[3] * *Vector\w
  *Output\z = *Matrix\Row[2]\Col[0] * *Vector\x + *Matrix\Row[2]\Col[1] * *Vector\y + *Matrix\Row[2]\Col[2] * *Vector\z + *Matrix\Row[2]\Col[3] * *Vector\w
  *Output\w = *Matrix\Row[3]\Col[0] * *Vector\x + *Matrix\Row[3]\Col[1] * *Vector\y + *Matrix\Row[3]\Col[2] * *Vector\z + *Matrix\Row[3]\Col[3] * *Vector\w
EndProcedure

Procedure MatrixTransformSSE(*Matrix.Matrix, *Vector.Vector, *Output.Vector)
  ;
  ;I am no SSE wizard I have to look up 90% of the instructions to make sure what there doing it correct
  ;so this is possabily the worst way about doing this but, still, obtaining ~70% increase is impressive
  ;if you want to redo this procedure the correct way and get the best out of SSE the look no further than here
  ;http://www.cortstratton.org/articles/OptimizingForSSE.php
  ;
 
 
  PushSSEClean(*Matrix\Row[0], 0)
  PushSSEClean(*Matrix\Row[1], 1)
  PushSSEClean(*Matrix\Row[2], 2)
  PushSSEClean(*Matrix\Row[3], 3)
 
  PushSSE(*Vector, 4)
 
  ;Muiltiply the vector to the matrix rows
  MulitplyFloats(4, 0)
  MulitplyFloats(4, 1)
  MulitplyFloats(4, 2)
  MulitplyFloats(4, 3)
 
  ;add up the rows
  CopySSE(0, 4)
  ShuffleSSE(4, 0, 01001110)
  AddFloats(4, 0)
  CopySSE(0, 4)
  ShuffleSSE(4, 0, 00010001)
  AddFloats(4, 0)
 
  CopySSE(1, 4)
  ShuffleSSE(4, 1, 01001110)
  AddFloats(4, 1)
  CopySSE(1, 4)
  ShuffleSSE(4, 1, 00010001)
  AddFloats(4, 1)
 
  CopySSE(2, 4)
  ShuffleSSE(4, 2, 01001110)
  AddFloats(4, 2)
  CopySSE(2, 4)
  ShuffleSSE(4, 2, 00010001)
  AddFloats(4, 2)
 
  CopySSE(3, 4)
  ShuffleSSE(4, 3, 01001110)
  AddFloats(4, 3)
  CopySSE(3, 4)
  ShuffleSSE(4, 3, 00010001)
  AddFloats(4, 3)
 
  ;mash the rows sum of the rows into once SSE register
  CopySSE(0, 4)
  CopySSE(2, 5)
  InterweaveFromLowSSE(1, 4)
  InterweaveFromLowSSE(3, 5)
  ;CopyHighSSE(4, 5)
  ;PopSSE(*Output, 5)
  PopHighSSE(*Output, 4)
  PopLowSSE(*Output+8, 5)
EndProcedure


;check to see if SSE is available if not end the program
If CheckSSE() = #False
  MessageRequester("Error", "SSE Instructions where not detected on this computer and are needed", #MB_ICONERROR)
  End
EndIf

;for the older hardware users,
;if SSE2 is not available then just show a message (because it isn't required to run this example)
If CheckSSE2() = #False
  MessageRequester("Info", "SSE Instructions where not detected on this computer but are not needed", #MB_ICONWARNING)
EndIf

;set up the test values
*TestMatrix.Matrix = AllocateAlignedMemory(SizeOf(Matrix))
*TestMatrix\Row[0]\Col[0] = 1
*TestMatrix\Row[0]\Col[1] = 2
*TestMatrix\Row[0]\Col[2] = 3
*TestMatrix\Row[0]\Col[3] = 4

*TestMatrix\Row[1]\Col[0] = 5
*TestMatrix\Row[1]\Col[1] = 6
*TestMatrix\Row[1]\Col[2] = 7
*TestMatrix\Row[1]\Col[3] = 8

*TestMatrix\Row[2]\Col[0] = 9
*TestMatrix\Row[2]\Col[1] = 10
*TestMatrix\Row[2]\Col[2] = 11
*TestMatrix\Row[2]\Col[3] = 12

*TestMatrix\Row[3]\Col[0] = 13
*TestMatrix\Row[3]\Col[1] = 14
*TestMatrix\Row[3]\Col[2] = 15
*TestMatrix\Row[3]\Col[3] = 16

*TestVector.Vector = AllocateAlignedMemory(SizeOf(Vector))
*TestVector\x = 1
*TestVector\y = 2
*TestVector\z = 3
*TestVector\w = 4

*Output.Vector = AllocateAlignedMemory(SizeOf(Vector))

startPB = ElapsedMilliseconds()
For i=0 To 100000000
  MatrixTransformPB(*TestMatrix, *TestVector, *Output)
Next
EndPB = ElapsedMilliseconds()

startSSE = ElapsedMilliseconds()
For i=0 To 100000000
  MatrixTransformSSE(*TestMatrix, *TestVector, *Output)
Next
EndSSE = ElapsedMilliseconds()

;it is good practice to free allocated memory before ending the program
FreeAlignedMemory(*TestMatrix, SizeOf(Matrix))
FreeAlignedMemory(*TestVector, SizeOf(Vector))
FreeAlignedMemory(*Output, SizeOf(Vector))

;show the advatage SSE has
MessageRequester("Info","Purebasic Timming: " + Str(EndPB - startPB) + Chr(10) + "SSE Timming: " + Str(EndSSE - startSSE))
Last edited by Kuzmat on Tue Mar 31, 2015 4:51 am, edited 1 time in total.
davido
Addict
Addict
Posts: 1890
Joined: Fri Nov 09, 2012 11:04 pm
Location: Uttoxeter, UK

Re: SSE1/2 Include with example

Post by davido »

Nice idea.

Tested with PB5.31 x64 on Windows 7 x64

Got exactly the same error as grabiller posted earlier.
DE AA EB
wilbert
PureBasic Expert
PureBasic Expert
Posts: 3942
Joined: Sun Aug 08, 2004 5:21 am
Location: Netherlands

Re: SSE1/2 Include with example

Post by wilbert »

davido wrote:Nice idea.

Tested with PB5.31 x64 on Windows 7 x64

Got exactly the same error as grabiller posted earlier.
The code as it is can't function on x64.
It wasn't programmed to support x64.
Windows (x64)
Raspberry Pi OS (Arm64)
davido
Addict
Addict
Posts: 1890
Joined: Fri Nov 09, 2012 11:04 pm
Location: Uttoxeter, UK

Re: SSE1/2 Include with example

Post by davido »

@wilbert,
Thank you for the explanation. :D
DE AA EB
Kuzmat
User
User
Posts: 13
Joined: Mon Jan 27, 2014 9:09 am

Re: SSE1/2 Include with example

Post by Kuzmat »

Sorry, forgot to specify that the x32.
For x64 requires significant modifications.
Kuzmat
User
User
Posts: 13
Joined: Mon Jan 27, 2014 9:09 am

Re: SSE1/2 Include with example

Post by Kuzmat »

Corrected for al x32-x64 version PB5.24, PB5.31

Code: Select all

;////////////////////////////////////////////////////////////////
;//
;// Filename: SSE.pbi
;// Version: 1.0.0.0
;// Date: 10-19-06
;// Author: Steven (Dreglor) Garcia
;//
;////////////////////////////////////////////////////////////////

;
;This Include is handy tool for any SSE and/or SSE2 programmer it wraps the more useful SSE instructions to simple macros
;and procedures. It also includes some procedures that helpful when using SSE instructions.
;don't forget to enable inline ASM before compiling
;
 
;-Structures

;
;SSE Registers are 128 bit wide, they can be cut up in 2 ways so that muiltable data can fit inside the same register
;4 floats Or 2 doubles. this data can be operated on all at once, making it nice to get alot of stuff done quickly
;Note: That these structures self align some times but not always but it is a good idea to assume that there are not either poke them into
;the SSE register (which is slower then if pushed).
;
EnableExplicit
EnableASM

Structure SSE32 ;SSE, for 32 bit Handling
  StructureUnion
    Float1.f ;For set data
    x.f ;For vectors
    r.f ;For colors
  EndStructureUnion
  StructureUnion
    Float2.f
    y.f
    g.f
  EndStructureUnion
  StructureUnion
    Float3.f
    z.f
    b.f
  EndStructureUnion
  StructureUnion
    Float4.f
    w.f
    a.f
  EndStructureUnion
EndStructure

Structure SSE64 ;SSE2, for 64 bit Handling
  StructureUnion
    Double1.d ;For set data
  EndStructureUnion
  StructureUnion
    Double2.d
  EndStructureUnion
EndStructure

Structure SSE ;All-Purpose SSE Access
  StructureUnion
    Fields8.SSE64 ;it would be easier if we could nest structure unions
    Fields4.SSE32
  EndStructureUnion
EndStructure

;- Macros
; Correct x64 register length
CompilerIf #PB_Compiler_Processor = #PB_Processor_x64
  Macro Eax
    Rax
  EndMacro
  Macro Edx
    Rdx
  EndMacro
  Macro Ecx
    Rcx
  EndMacro  
CompilerEndIf

;Tranportation of SSE data
;Must be a Aligned Pointer
Macro PushSSE(SSEVariable, Register = 0) ;A fast version of PokeSSE
  MOV Eax, SSEVariable
  !movaps xmm#Register, [Eax]
EndMacro

Macro PopSSE(SSEVariable, Register = 0) ;A fast version of PeekSSE
  MOV Eax, SSEVariable
  !movaps [Eax], xmm#Register
EndMacro

;If The Variable is not a direct pointer then you will have to use these
Macro PushSSEClean(SSEVariable, Register = 0) ;A fast version of PokeSSE
  Define _Pointer = SSEVariable
  MOV Eax, _Pointer
  !movaps xmm#Register, [Eax]
EndMacro

Macro PopSSEClean(SSEVariable, Register = 0) ;A fast version of PeekSSE
  Define _Pointer = SSEVariable
  MOV Eax, _Pointer
  !movaps [Eax], xmm#Register
EndMacro

;the pointer doesn't have to be aligned
Macro PokeSSE(SSEVariable, Register = 0)
  MOV Eax, SSEVariable
  !movups xmm#Register, [Eax]
EndMacro

Macro PeekSSE(SSEVariable, Register = 0)
  MOV Eax, SSEVariable
  !movups [Eax], xmm#Register
EndMacro

;If The Variable is not a direct pointer then you will have to use these
Macro PokeSSEClean(SSEVariable, Register = 0)
  Define _Pointer = SSEVariable
  MOV Eax, _Pointer
  !movups xmm#Register, [Eax]
EndMacro

Macro PeekSSEClean(SSEVariable, Register = 0)
  Define _Pointer = SSEVariable
  MOV Eax, _Pointer
  !movups [Eax], xmm#Register
EndMacro

;the many ways to move data
;the low high stuff is really reversed here to make the SSE Structure correct when operating
;with low and high
Macro CopySSE(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

; Macro CopyLowSSE(SourceRegister, DestinationRegister)
;   !movhps xmm#DestinationRegister, xmm#SourceRegister
; EndMacro
Macro PopLowSSE(SSEVariable, Register)
  Define _Pointer = SSEVariable
  MOV Eax, _Pointer
  !movhps [Eax], xmm#Register
EndMacro

Macro PushLowSSE(SSEVariable, Register)
  Define _Pointer = SSEVariable
  MOV Eax, _Pointer
  !movhps xmm#Register, [Eax]
EndMacro

; Macro CopyHighSSE(SourceRegister, DestinationRegister)
;   !movlps xmm#DestinationRegister, xmm#SourceRegister
; EndMacro
Macro PopHighSSE(SSEVariable, Register)
  Define _Pointer = SSEVariable
  MOV Eax, _Pointer  
  !movlps [Eax], xmm#Register
EndMacro

Macro PushHighSSE(SSEVariable, Register)
  Define _Pointer = SSEVariable
  MOV Eax, _Pointer  
  !movlps xmm#Register, [Eax]
EndMacro


Macro CopyLowToHighSSE(SourceRegister, DestinationRegister)
  !movhlps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro CopyHighToLowSSE(SourceRegister, DestinationRegister)
  !movlhps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro InterweaveFromLowSSE(SourceRegister, DestinationRegister)
  !unpckhps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro InterweaveFromHighSSE(SourceRegister, DestinationRegister)
  !unpcklps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro ShuffleSSE(SourceRegister, DestinationRegister, ShuffleMask) ;use binary (no % prefix)
  !shufps xmm#DestinationRegister, xmm#DestinationRegister, ShuffleMask#b
EndMacro

Macro BroadcastSSEFloat1(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 00000000)
EndMacro

Macro BroadcastSSEFloat2(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 01010101)
EndMacro

Macro BroadcastSSEFloat3(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 10101010)
EndMacro

Macro BroadcastSSEFloat4(SourceRegister, DestinationRegister)
  !movaps xmm#DestinationRegister, xmm#SourceRegister
  ShuffleSSE(DestinationRegister, DestinationRegister, 11111111)
EndMacro

;here are some useful debugging procedures
Macro DebugSSERegisterAsFloat(Register = 0)
  Define _TempDebug32.SSE32
  PopSSEClean(_TempDebug32, Register)
  Debug StrF(_TempDebug32\Float1) + ", " + StrF(_TempDebug32\Float2) + ", " + StrF(_TempDebug32\Float3) + ", " + StrF(_TempDebug32\Float4)
EndMacro

Macro DebugSSERegisterAsDouble(Register = 0)
  Define _TempDebug64.SSE64
  PopSSEClean(_TempDebug64, Register)
  Debug StrD(_TempDebug64\Double1) + ", " +StrD(_TempDebug64\Double2)
EndMacro

;
;The Entire Instruction set is not represented here nor is it as flexable as writing them by hand
;the instructions take only 2 parameters; Destination and Source. the source register can an address
;pointer but these macros don't allow that to make it easier it will be couple cycles longer to push
;then operate on the Register
;please not that what ever is in DestinationRegister will be destroyed and be over written with the result
;

;SSE Instructions
;Arithmetic
Macro AddSingleFloat(SourceRegister, DestinationRegister)
  !addss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractSingleFloat(SourceRegister, DestinationRegister)
  !subss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplySingleFloat(SourceRegister, DestinationRegister)
  !mulss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideSingleFloat(SourceRegister, DestinationRegister)
  !divss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrSingleFloat(SourceRegister, DestinationRegister)
  !sqrtss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeSingleFloat(SourceRegister, DestinationRegister)
  !maxss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeSingleFloat(SourceRegister, DestinationRegister)
  !minss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro AddFloats(SourceRegister, DestinationRegister)
  !addps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractFloats(SourceRegister, DestinationRegister)
  !subps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplyFloats(SourceRegister, DestinationRegister)
  !mulps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideFloats(SourceRegister, DestinationRegister)
  !divps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrFloats(SourceRegister, DestinationRegister)
  !sqrtps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeFloats(SourceRegister, DestinationRegister)
  !maxps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeFloats(SourceRegister, DestinationRegister)
  !minps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;Recipicol
Macro RcpSingleFloat(SourceRegister, DestinationRegister)
  !rcpss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RSqrSingleFloat(SourceRegister, DestinationRegister)
  !rsqrtss xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpFloats(SourceRegister, DestinationRegister)
  !rcpps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpSqrFloats(SourceRegister, DestinationRegister)
  !rsqrtps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro CompareIsEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 0
EndMacro

Macro CompareIsLesserSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 1
EndMacro

Macro CompareIsLesserOrEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 2
EndMacro

Macro CompareIsUnorderedSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 3
EndMacro

Macro CompareIsNotEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 4
EndMacro

Macro CompareIsNotLesserSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 5
EndMacro

Macro CompareIsNotLesserOrEqualSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 6
EndMacro

Macro CompareIsOrderedSingleFloat(SourceRegister, DestinationRegister)
  !cmpss xmm#DestinationRegister, xmm#SourceRegister, 7
EndMacro

Macro CompareIsEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 0
EndMacro

Macro CompareIsLesserFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 1
EndMacro

Macro CompareIsLesserOrEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 2
EndMacro

Macro CompareIsUnorderedFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 3
EndMacro

Macro CompareIsNotEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 4
EndMacro

Macro CompareIsNotLesserFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 5
EndMacro

Macro CompareIsNotLesserOrEqualFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 6
EndMacro

Macro CompareIsOrderedFloats(SourceRegister, DestinationRegister)
  !cmpps xmm#DestinationRegister, xmm#SourceRegister, 7
EndMacro

Macro AndSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !andps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro NandSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !andnps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro OrSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !orps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro XorSSE(SourceRegister, DestinationRegister) ;Operates on all 128bits
  !xorps xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;SSE2 Instructions
;Arithmetic
Macro AddSingleDouble(SourceRegister, DestinationRegister)
  !addsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractSingleDouble(SourceRegister, DestinationRegister)
  !subsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplySingleDouble(SourceRegister, DestinationRegister)
  !mulsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideSingleDouble(SourceRegister, DestinationRegister)
  !divsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrSingleDouble(SourceRegister, DestinationRegister)
  !sqrtsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeSingleDouble(SourceRegister, DestinationRegister)
  !maxsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeSingleDouble(SourceRegister, DestinationRegister)
  !minsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro AddDoubles(SourceRegister, DestinationRegister)
  !addpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SubtractDoubles(SourceRegister, DestinationRegister)
  !subpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MulitplyDoubles(SourceRegister, DestinationRegister)
  !mulpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro DivideDoubles(SourceRegister, DestinationRegister)
  !divpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro SqrDoubles(SourceRegister, DestinationRegister)
  !sqrtpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MaximizeDoubles(SourceRegister, DestinationRegister)
  !maxpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro MinimizeDoubles(SourceRegister, DestinationRegister)
  !minpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;Recipicol
Macro RcpSingleDouble(SourceRegister, DestinationRegister)
  !rcpsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RSqrSingleDouble(SourceRegister, DestinationRegister)
  !rsqrtsd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpDoubles(SourceRegister, DestinationRegister)
  !rcppd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

Macro RcpSqrDoubles(SourceRegister, DestinationRegister)
  !rsqrtpd xmm#DestinationRegister, xmm#SourceRegister
EndMacro

;Logical

Macro CompareIsEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 0
EndMacro

Macro CompareIsLesserSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 1
EndMacro

Macro CompareIsLesserOrEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 2
EndMacro

Macro CompareIsUnorderedSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 3
EndMacro

Macro CompareIsNotEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 4
EndMacro

Macro CompareIsNotLesserSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 5
EndMacro

Macro CompareIsNotLesserOrEqualSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 6
EndMacro

Macro CompareIsOrderedSingleDouble(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 7
EndMacro

Macro CompareIsEqualDoubles(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 0
EndMacro

Macro CompareIsLesserDoubles(SourceRegister, DestinationRegister)
  !cmpsd xmm#DestinationRegister, xmm#SourceRegister, 1
EndMacro

Macro CompareIsLesserOrEqualDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 2
EndMacro

Macro CompareIsUnorderedDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 3
EndMacro

Macro CompareIsNotEqualDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 4
EndMacro

Macro CompareIsNotLesserDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 5
EndMacro

Macro CompareIsNotLesserOrEqualDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 6
EndMacro

Macro CompareIsOrderedDoubles(SourceRegister, DestinationRegister)
  !cmppd xmm#DestinationRegister, xmm#SourceRegister, 7
EndMacro

Macro ShiftLeftSSE(SourceRegister, Value) ;Operates on all 128bits
  !pslldq xmm#DestinationRegister, Value
EndMacro

Macro ShiftRightSSE(SourceRegister, Value) ;Operates on all 128bits
  !psrldq xmm#DestinationRegister, Value
EndMacro
 
;- Procedures
 
Procedure IsAligned(pointer)
  If pointer % 16 = 0
    ProcedureReturn #True
  Else
    ProcedureReturn #False
  EndIf
EndProcedure

; 
;Because allocating memory and aligning it causes memory leaks when freed later,
;This is one way of handling 16 byte aligned memory blocks, althought proably not the best way to do so,
;what it does is it allocates memory with a 16 bytes extra then it aligns the memory block.
;it will always have at least one byte at the end of the memory block this is where the offset is stored
;so when we go to free the memory block it will read the ending byte push the pointer back to the orignal
;pointer was and free from there other wise it will free the aligned block and leave up to 15 bytes
;this may not seam like a lot but when you have alot of this happing the left overs stack up
;and it is not good practice to leave leaks such as theses.
;

Macro Align16(pointer)
  (((pointer) + 15) &~ $0F)
EndMacro

Procedure AllocateAlignedMemory(Size)
  Protected *pointer = AllocateMemory(Size + 16)
  Protected *ReturningPointer = Align16(*pointer)
  PokeB(*ReturningPointer + Size, *ReturningPointer - *pointer)
  ProcedureReturn *ReturningPointer
EndProcedure
 
Procedure ReAllocateAlignedMemory(*MemoryBlock, Size)
  Protected *pointer = ReAllocateMemory(*MemoryBlock, Size + 16)
  Protected *ReturningPointer = Align16(*pointer)
  PokeB(*ReturningPointer + Size, *ReturningPointer - *pointer)
  ProcedureReturn *ReturningPointer
EndProcedure

Procedure FreeAlignedMemory(*MemoryBlock, Size)
  FreeMemory(*MemoryBlock - PeekB(*MemoryBlock + Size))
EndProcedure


;Instruction Set Checks
Procedure CheckSSE()
  Protected FeatureFlags
  MOV Eax, 1
  !CPUID
  MOV FeatureFlags, Edx
  ProcedureReturn (FeatureFlags >> 25) & 1
EndProcedure
 
Procedure CheckSSE2()
  Protected FeatureFlags
  MOV Eax, 1
  !CPUID
  MOV FeatureFlags, Edx
  ProcedureReturn (FeatureFlags >> 26) & 1
EndProcedure

Procedure CheckSSE3()
  Protected FeatureFlags
  MOV Eax, 1
  !CPUID
  MOV FeatureFlags, Ecx
  ProcedureReturn FeatureFlags  & 1
EndProcedure
Post Reply