Page 3 of 3

Posted: Wed Dec 20, 2006 11:29 am
by Helle
@ Trond: Sorry!
For CPUs with SSE, but not SSE2:

Code: Select all

; Benchmark: scalar SSE square root (needs SSE only, not SSE2).
; Runs the loop for I = 0 .. #Tries inclusive and reports the elapsed
; GetTickCount_() ticks (milliseconds) together with the last result.
#Tries = 100000000 

z.f 
time = GetTickCount_() 
For I = 0 To #Tries 
  ;z = I 
  !cvtsi2ss xmm1,[v_I] ; xmm1 = (float) I  -- v_I is the asm symbol of the PB variable I
  !sqrtss xmm0,xmm1 ; xmm0 = sqrt(xmm1), scalar single precision
Next 
; xmm0 still holds the result of the last iteration; store it into z
!movss [v_z],xmm0 
; first line of the message: elapsed ticks, second line: the last square root
MessageRequester("SSE", Str(GetTickCount_()-time)+#CRLF$+StrF(z)) 
Test of SIMD:

Code: Select all

;- CPU-SIMD-Test, "Helle" Klaus Helbing, 20.12.2006, PB4.02
;
; Queries CPUID and shows, in a message box, which instruction-set
; extensions the CPU reports: MMX, 3DNow!, extended 3DNow!, CMOVcc,
; SSE, SSE2, SSE3 and SSSE3. Each one is displayed as "+" (supported)
; or "-" (not supported).
;
; 32-bit x86 only: the inline assembler uses PUSHFD/POPFD and the
; 32-bit general purpose registers.
;
; CPUID overwrites EAX, EBX, ECX and EDX. PureBasic inline ASM may only
; use EAX, ECX and EDX as scratch registers, so EBX is saved and restored
; around every CPUID instruction below.

; Result flags, one character per feature; preset to "-" and switched
; to "+" by the tests below.
Global mmx.c   = $2d         ;"-"
Global dnow.c  = $2d
Global ednow.c = $2d
Global cmov.c  = $2d
Global sse.c   = $2d
Global sse2.c  = $2d
Global sse3.c  = $2d
Global ssse3.c = $2d

; Bit masks used with TEST / XOR in the inline assembler
Global Bit0.l  = $1          ;CPUID(1).ECX bit 0  : SSE3
Global Bit9.l  = $200        ;CPUID(1).ECX bit 9  : SSSE3
Global Bit15.l = $8000       ;CPUID(1).EDX bit 15 : CMOVcc
Global Bit21.l = $200000     ;EFLAGS bit 21       : ID flag (CPUID available)
Global Bit23.l = $800000     ;CPUID(1).EDX bit 23 : MMX
Global Bit25.l = $2000000    ;CPUID(1).EDX bit 25 : SSE
Global Bit26.l = $4000000    ;CPUID(1).EDX bit 26 : SSE2
Global Bit30.l = $40000000   ;CPUID(80000001h).EDX bit 30 : extended 3DNow!
Global Bit31.l = $80000000   ;CPUID(80000001h).EDX bit 31 : 3DNow!

; Texts for the result dialog (trailing blanks align the +/- column)
Global Name$   = "The tested CPU supported :"
Global MMX$    = "MMX :            "
Global DNOW$   = "3DNow! :       "
Global EDNOW$  = "ext3DNow! :  "
Global CMOV$   = "CMOVcc :      "
Global SSE$    = "SSE :             "
Global SSE2$   = "SSE2 :           "
Global SSE3$   = "SSE3 :           "
Global SSSE3$  = "SSSE3 :         "

;-------- Test whether CPUID is available:
;-------- CPUID exists if bit 21 (ID) of EFLAGS can be toggled by software
    !pushfd                  ;EFLAGS register (32 bit) onto the stack
    !pop eax                 ;EAX = original EFLAGS
    !mov edx,eax             ;EDX = working copy
    !xor edx,[v_Bit21]       ;toggle bit 21
    !push edx
    !popfd                   ;try to write the modified value into EFLAGS
    !pushfd                  ;read EFLAGS back
    !pop edx                 ;EDX = value that was actually accepted
    !push eax
    !popfd                   ;restore the original EFLAGS
    !cmp eax,edx
    !jne l_iscpuid           ;not equal -> bit could be toggled -> CPUID is supported

 MessageRequester("Status", "The tested CPU give no support for CPUID; no MMX or SSE !")
End  

IsCPUID:
;------------------------------------------------------------------------------

;-------- Test of MMX, SSE, SSE2, SSE3 and SSSE3 (CPUID standard leaf 1)
    !push ebx                ;CPUID destroys EBX, which PB expects to be preserved
    !mov eax,1h
    !cpuid
    !pop ebx                 ;POP leaves EAX/ECX/EDX and the flags untouched
    !test edx,[v_Bit23]      ;MMX
    !jz l_nommx
    !mov [v_mmx],2bh         ;"+" 
NOMMX:
    !test edx,[v_Bit25]      ;SSE  
    !jz l_nosse
    !mov [v_sse],2bh
NOSSE:
    !test edx,[v_Bit26]      ;SSE2  
    !jz l_nosse2
    !mov [v_sse2],2bh
NOSSE2:
    !test ecx,[v_Bit0]       ;SSE3
    !jz l_nosse3
    !mov [v_sse3],2bh    
NOSSE3:
    !test ecx,[v_Bit9]       ;SSSE3 ("old" SSE4)
    !jz l_nossse3
    !mov [v_ssse3],2bh 
NOSSSE3:
 ;-------- Test of CMOVcc (conditional move); EDX is still the leaf-1 value
    !test edx,[v_Bit15]    
    !jz l_nocmov
    !mov [v_cmov],2bh
NOCMOV:       
;------------------------------------------------------------------------------    

;-------- Highest extended leaf (needed for the 3DNow! test)
;-------- CPUID(80000000h) returns the highest supported extended leaf in EAX
    !push ebx                ;preserve EBX across CPUID
    !mov eax,80000000h
    !cpuid
    !pop ebx
    !cmp eax,80000000h
    !jbe l_noext             ;no extended leaves, hence no 3DNow!
;------------------------------------------------------------------------------
    
;-------- Test of 3DNow! and extended 3DNow!
;-------- Original author's note: leaf 80000001h is the first extended leaf;
;-------- Intel CPUs do not implement it and return EAX=0
    !push ebx                ;preserve EBX across CPUID
    !mov eax,80000001h
    !cpuid
    !pop ebx
    !or eax,eax
    !je l_noext              ;EAX=0 -> treated as Intel processor, skip 3DNow! tests
    !test edx,[v_Bit31]      ;3DNow! 
    !jz l_noext
    !mov [v_dnow],2bh
    !test edx,[v_Bit30]      ;extended 3DNow! (only checked when 3DNow! is present)
    !jz l_noext
    !mov [v_ednow],2bh
NOEXT:    
;------------------------------------------------------------------------------
   
 ; One line per feature: label text followed by the "+" / "-" flag character
 MessageRequester(Name$,MMX$+Chr(mmx)+Chr(10)+DNOW$+Chr(dnow)+Chr(10)+EDNOW$+Chr(ednow)+Chr(10)+CMOV$+Chr(cmov)+Chr(10)+SSE$+Chr(sse)+Chr(10)+SSE2$+Chr(sse2)+Chr(10)+SSE3$+Chr(sse3)+Chr(10)+SSSE3$+Chr(ssse3))
End  
Gruss
Helle

Edit 10.01.2007: New detection for SSE3

Posted: Wed Dec 20, 2006 6:31 pm
by Helle
This is code only for an AMD-CPU:

Code: Select all

; Fast, approximate square root of a single-precision float.
;
; Works on the raw IEEE-754 bit pattern of N: the bit pattern of 1.0
; ($3F800000) is subtracted, the remainder is halved and the bit pattern
; of 1.0 is added back. Halving the exponent this way gives a coarse
; approximation of Sqr(N); exact for even powers of two (e.g. 4.0 -> 2.0,
; 0.25 -> 0.5).
;
; N      : value to take the root of (expected to be >= 0)
; Return : approximate square root, left in st0 as PB expects for .f results
Procedure.f Sqrt(N.f) 
  !mov eax, [p.v_N]          ; EAX = raw bits of N
  !sub eax, $3F800000        ; remove the bias (bit pattern of 1.0)
  !sar eax, 1                ; halve; arithmetic shift keeps the sign, which is
                             ; negative for N < 1.0 (a logical SHR gave wrong results there)
  !add eax, $3F800000        ; add the bias back
  !mov [p.v_N], eax          ; reuse the parameter slot as temporary instead of
                             ; writing below ESP, where the value is not protected
  !fld dword [p.v_N]         ; load the result into st0
  CompilerIf #PB_Compiler_Debugger 
    ; with the debugger enabled, leave through PB's regular procedure exit
    ProcedureReturn 
  CompilerElse 
    ; without debugger: return directly and remove the 4-byte parameter
    !ret 4 
  CompilerEndIf 
EndProcedure 

; Benchmark of three square-root variants; each loop runs for
; I = 0 .. #Tries inclusive and then shows the elapsed GetTickCount_()
; ticks (milliseconds) and the last computed value.
; Sqrt() is a user procedure that must be defined before this code.
#Tries = 50000000 

;- ONLY for AMD-CPU´s !!! AMD-K6-2 or better
;- Variant 1: 3DNow!  sqrt(x) computed as x * (1/sqrt(x))
z.f
time = GetTickCount_() 
For I = 0 To #Tries 
!femms                 ; fast clear of the MMX/3DNow! state
!movd mm0,[v_I]        ; mm0 (low dword) = integer I
!pi2fd mm1,mm0         ; mm1 = I converted to float
!movq mm0,mm1          ; mm0 = float copy of I
!pfrsqrt mm1,mm0       ; mm1 = approximation of 1/sqrt(mm0)
!punpckldq mm0,mm0     ; copy the low dword of mm0 into its high dword
!pfmul mm0,mm1         ; mm0 = x * 1/sqrt(x)
!movd [v_z],mm0        ; store the low dword as float z
!femms                 ; leave MMX state so the FPU can be used again
Next 

MessageRequester("3DNow!", Str(GetTickCount_()-time)+#CRLF$+StrF(z)) 

;- Variant 2: the Sqrt() procedure (result inside the loop is discarded)
z.f 
time = GetTickCount_() 
For I = 0 To #Tries 
  z = I 
  Sqrt(z) 
Next 
; z still holds the last loop value; compute the value that is displayed
z=Sqrt(z)
MessageRequester("Procedure", Str(GetTickCount_()-time)+#CRLF$+StrF(z)) 

;- Variant 3: PureBasic's built-in Sqr() (result inside the loop is discarded)
z.f 
time = GetTickCount_() 
For I = 0 To #Tries 
  z = I 
  Sqr(z) 
Next 
; z still holds the last loop value; compute the value that is displayed
z=Sqr(z)
MessageRequester("PB", Str(GetTickCount_()-time)+#CRLF$+StrF(z))
Gruss
Helle

P.S.: 3DNow! uses a lookup table on the chip and is very quick!

Posted: Sat Apr 07, 2007 5:28 pm
by gebe
Hi chaps...

Derek,
I finally got around to the test you suggested
and found results similar to those of the given test program.
I made some changes and saw that the normal Sqr is the same speed
and a lot more accurate.
my machine gives: 625 ,,,,,, 625
and 625 R=7147.7 not important ,,,,,, 625 r=7071.06

That is when I use the given Z=I
Sqr(Z)

If I use the different approach
Z=sqr(I)

I get 422 ,,,,,,, 609
and 422 ,,, r=7147.7 ,,,,,,l, 609 ,,, r=7071.06

That seems to prove the point that accuracy costs time,

and that such a test must be conducted very carefully.

(Not that I am a specialist. I have just tried a lot of times.)

gebe

here is the more fancy way.

Code: Select all

; Fast, approximate square root of a single-precision float.
;
; Works on the raw IEEE-754 bit pattern of N: the bit pattern of 1.0
; ($3F800000) is subtracted, the remainder is halved and the bit pattern
; of 1.0 is added back. Halving the exponent this way gives a coarse
; approximation of Sqr(N); exact for even powers of two (e.g. 4.0 -> 2.0,
; 0.25 -> 0.5).
;
; N      : value to take the root of (expected to be >= 0)
; Return : approximate square root, left in st0 as PB expects for .f results
Procedure.f Sqrt(N.f) 
  !mov eax, [p.v_N]          ; EAX = raw bits of N
  !sub eax, $3F800000        ; remove the bias (bit pattern of 1.0)
  !sar eax, 1                ; halve; arithmetic shift keeps the sign, which is
                             ; negative for N < 1.0 (a logical SHR gave wrong results there)
  !add eax, $3F800000        ; add the bias back
  !mov [p.v_N], eax          ; reuse the parameter slot as temporary instead of
                             ; writing below ESP, where the value is not protected
  !fld dword [p.v_N]         ; load the result into st0
  CompilerIf #PB_Compiler_Debugger 
    ; with the debugger enabled, leave through PB's regular procedure exit
    ProcedureReturn 
  CompilerElse 
    ; without debugger: return directly and remove the 4-byte parameter
    !ret 4 
  CompilerEndIf 
EndProcedure 


; Benchmark: user procedure Sqrt() against PureBasic's built-in Sqr().
; Each loop runs for I = 0 .. #Tries inclusive. The tick count is taken
; immediately after the loop and that stored duration is what gets
; displayed, so the extra call and the string formatting done afterwards
; are not included in the reported time.
; The dialog title shows the computed root, the text shows the duration.
#Tries = 50000000 

z.f 
time = GetTickCount_() 
For I = 0 To #Tries 
 ; z = I ;more accurate ????????????????????????????
 ;sqrt(z)
 z= Sqrt(i) ;more accurate ?????????????????????????
Next
stopTime.l= GetTickCount_() ;stop the clock right after the loop
Duration.l=stopTime - time
result.d=sqrt(i) ;i is #Tries + 1 here: square root of 50 000 001
;the measurement is finished, so formatting the answer costs no measured time
MessageRequester(StrD(result),"With Sqrt   "+ Str(Duration)+ " mSec.                    ") 
;the trailing blanks widen the dialog so the full result fits in the title
 
time = GetTickCount_() 
For I = 0 To #Tries 
 ; z = I 
 z= Sqr(i) 
Next
stopTime = GetTickCount_() 
Duration =stopTime  - time
result = Sqr(i)
MessageRequester(StrD(result),"With Sqr   "+ Str(Duration)+ " mSec.                   ") 
;we could also put the results o 20 full runs in an array And then inspect 
gebe :? :)
am I right ???? :?:

Posted: Sat Apr 07, 2007 5:35 pm
by gebe
Sorry, I did not see the more fancy stuff; I started at the beginning.
I am going to save and check the other codes.

Gebe :)

Posted: Sat Apr 07, 2007 6:10 pm
by gebe
Helle ,
That 3D thing SCREAMS !!!! That is SPEED with capital SSSSS !!

Nice to see speeding .. Careful of the cops :P

gebe

Posted: Sat Apr 07, 2007 6:59 pm
by Derek
@gebe, as you can see some people are interested in speed over size. :)

Posted: Sun Apr 08, 2007 4:36 am
by gebe
Yeah, the speed of <square root> is very important, even more so than accuracy...
I also checked without inlining the machine code, and it barely, if at all, changes the time (the machine-code procedure is called on every For/Next iteration).

That is interesting stuff. 8).
Thanks again for the tip.

gebe