Coding - FPU intrinsics

va!n · Post by **va!n** » Sat Sep 17, 2005 9:59 pm

just another interesting article by chaos/farbrausch! i share it here, because i think there is some interesting stuff inside... here we go...

If you are doing 64k intros, you cannot rely on the runtime library for some basic FPU functions. Most of them are provided as intrinsics by VC++, but some need to be implemented yourself.

Most of the fpu assembly code in here was written or found by ryg.

This is a header that gives you access to most basic fpu functions. Note that you don't need to include math.h or anything else to get this working.

Code: Select all

/****************************************************************************/
/***                                                                      ***/
/***   Intrinsics                                                         ***/
/***                                                                      ***/
/****************************************************************************/


// Hand-written prototypes for the C runtime functions used by this header,
// so that math.h / string.h do not have to be included.
// size_t is declared as a 32-bit unsigned int here.
typedef unsigned int size_t;
extern "C"
{
  // integer
  int __cdecl abs(int);

  // floating point, listed as "true intrinsic" in the #pragma intrinsic below
  double __cdecl atan(double);
  double __cdecl atan2(double,double);
  double __cdecl cos(double);
  double __cdecl exp(double);
  double __cdecl fabs(double);
  double __cdecl log(double);
  double __cdecl log10(double);
  double __cdecl sin(double);
  double __cdecl sqrt(double);
  double __cdecl tan(double); 

  // floating point, listed as "fake intrinsic" in the #pragma intrinsic below
  double __cdecl acos(double);
  double __cdecl asin(double);
  double __cdecl cosh(double);
  double __cdecl fmod(double,double);
  double __cdecl pow(double,double);
  double __cdecl sinh(double);
  double __cdecl tanh(double);

  // memory / string
  void * __cdecl memset( void *dest, int c, size_t count );
  void * __cdecl memcpy( void *dest, const void *src, size_t count );
  int __cdecl memcmp( const void *buf1, const void *buf2, size_t count );
  size_t __cdecl strlen( const char *string );
}

// Ask the compiler to expand the functions declared above as intrinsics
// instead of emitting calls into the runtime library.
// The "fake intrinsic" group is accepted by the pragma, but (as the article
// notes) the built-in versions still rely on the runtime library.
#pragma intrinsic (abs)                                       // int intrinsic
#pragma intrinsic (memset,memcpy,memcmp,strlen)               // memory intrinsic
#pragma intrinsic (atan,atan2,cos,exp,log,log10,sin,sqrt,tan,fabs) // true intrinsic
#pragma intrinsic (acos,asin,cosh,fmod,pow,sinh,tanh)         // fake intrinsic

// ---- integer / memory / string wrappers around the intrinsics ----

// Absolute value of i.
__forceinline sInt sAbs(sInt i)
{
  return abs(i);
}

// Fill c bytes starting at dd with the byte value s.
__forceinline void sSetMem(sPtr dd,sInt s,sInt c)
{
  memset(dd,s,c);
}

// Copy c bytes from ss to dd.
__forceinline void sCopyMem(sPtr dd,const void *ss,sInt c)
{
  memcpy(dd,ss,c);
}

// Compare c bytes of dd against ss; returns the memcmp() result.
__forceinline sInt sCmpMem(const sPtr dd,const void *ss,sInt c)
{
  return (sInt)memcmp(dd,ss,c);
}

// Length of the zero-terminated string s, as returned by strlen().
__forceinline sInt sGetStringLen(const sChar *s)
{
  return (sInt)strlen(s);
}

// ---- wrappers around the intrinsic math functions ----

// Arc tangent of f.
__forceinline sF64 sFATan(sF64 f)
{
  return atan(f);
}

// Two-argument arc tangent, forwarded to atan2(a,b).
__forceinline sF64 sFATan2(sF64 a,sF64 b)
{
  return atan2(a,b);
}

// Cosine of f.
__forceinline sF64 sFCos(sF64 f)
{
  return cos(f);
}

// Absolute value of f.
__forceinline sF64 sFAbs(sF64 f)
{
  return fabs(f);
}

// Natural logarithm of f.
__forceinline sF64 sFLog(sF64 f)
{
  return log(f);
}

// Base-10 logarithm of f.
__forceinline sF64 sFLog10(sF64 f)
{
  return log10(f);
}

// Sine of f.
__forceinline sF64 sFSin(sF64 f)
{
  return sin(f);
}

// Square root of f.
__forceinline sF64 sFSqrt(sF64 f)
{
  return sqrt(f);
}

// ---- wrappers around the functions the header labels "fake intrinsic" ----

// Arc cosine of f.
__forceinline sF64 sFACos(sF64 f)
{
  return acos(f);
}

// Arc sine of f.
__forceinline sF64 sFASin(sF64 f)
{
  return asin(f);
}

// Hyperbolic cosine of f.
__forceinline sF64 sFCosH(sF64 f)
{
  return cosh(f);
}

// Hyperbolic sine of f.
__forceinline sF64 sFSinH(sF64 f)
{
  return sinh(f);
}

// Hyperbolic tangent of f.
__forceinline sF64 sFTanH(sF64 f)
{
  return tanh(f);
}

// Reciprocal square root: 1 / sqrt(f).
__forceinline sF64 sFInvSqrt(sF64 f)
{
  return 1.0/sqrt(f);
}

// sFMod / sFExp / sFPow come in two flavours selected by sINTRO:
//  - sINTRO set:     only prototypes here; the bodies are supplied separately
//                    so that no runtime-library call is generated.
//  - sINTRO not set: inline forwards to fmod() / exp() / pow().
#if sINTRO
sF64 sFPow(sF64 a,sF64 b);
sF64 sFMod(sF64 a,sF64 b);
sF64 sFExp(sF64 f);
#else
__forceinline sF64 sFMod(sF64 a,sF64 b)
{
  return fmod(a,b);
}

__forceinline sF64 sFExp(sF64 f)
{
  return exp(f);
}

__forceinline sF64 sFPow(sF64 a,sF64 b)
{
  return pow(a,b);
}
#endif

/****************************************************************************/
/***                                                                      ***/
/***   asm                                                                ***/
/***                                                                      ***/
/****************************************************************************/

// C4035 is MSVC's "no return value" warning. Several functions below have no
// return statement and leave their result in eax or st(0) from inline asm.
#pragma warning (disable : 4035) 

// Clears pending FPU exceptions and loads the x87 control word 0x103f
// (per the original comment: round to nearest even, single precision).
__forceinline void sFloatFix()
{
  __asm
  {
    fclex;                  // clear pending FPU exception flags
    push    0103fh; // round to nearest even + single precision
    fldcw   [esp];          // load the control word from the value just pushed
    pop     eax;            // release the stack slot (eax is overwritten)
  }
}

// Clears pending FPU exceptions and loads the x87 control word 0x123f
// (per the original comment: round to nearest even, double precision).
__forceinline void sFloatDouble()
{
  __asm
  {
    fclex;                  // clear pending FPU exception flags
    push    0123fh; // round to nearest even + double precision
    fldcw   [esp];          // load the control word from the value just pushed
    pop     eax;            // release the stack slot (eax is overwritten)
  }
}

// Clears pending FPU exceptions and loads the x87 control word 0x141f.
// NOTE(review): the source does not say what this mode is for. The value
// differs from sFloatDen0 (0x143f) only in bit 0x20; by the x87 control-word
// layout that bit is the precision-exception mask and 0x0400 selects
// round-down. Confirm the intent against the callers.
__forceinline void sFloatDen1()
{
  __asm
  {
    fclex;                  // clear pending FPU exception flags
    push    0141fh;         // control word value, see note above
    fldcw   [esp];          // load the control word from the value just pushed
    pop     eax;            // release the stack slot (eax is overwritten)
  }
}
// Clears pending FPU exceptions and loads the x87 control word 0x143f.
// NOTE(review): the source does not say what this mode is for. The value
// differs from sFloatDen1 (0x141f) only in bit 0x20, and from sFloatFix
// (0x103f) only in bit 0x0400 (rounding-control field). Confirm the intent
// against the callers.
__forceinline void sFloatDen0()
{
  __asm
  {
    fclex;                  // clear pending FPU exception flags
    push    0143fh;         // control word value, see note above
    fldcw   [esp];          // load the control word from the value just pushed
    pop     eax;            // release the stack slot (eax is overwritten)
  }
}

// Converts f to a 32-bit integer with fistp, i.e. using whatever rounding
// mode the FPU control word currently selects (not C-style truncation unless
// that mode is active). The result is returned in eax; there is no return
// statement, which is why C4035 is disabled around this code.
__forceinline sInt sFtol (const float f)
{
  __asm 
  {
    fld f                   // st(0) = f
    push eax                // reserve 4 bytes of stack as a temporary
    fistp dword ptr [esp]   // store st(0) as int32 into the temporary, pop FPU stack
    pop eax                 // eax = converted value (the return value)
  }
}

// Rounds f to an integral value with frndint, i.e. according to the rounding
// mode currently set in the FPU control word. The result is left in st(0),
// which serves as the floating-point return value (no return statement).
__forceinline sF32 sFRound (const float f)
{
  __asm 
  {
    fld f                   // st(0) = f
    frndint                 // st(0) = f rounded to an integer value
  }
}

// Computes sine and cosine of x with a single fsincos instruction and writes
// them to the two reference parameters.
__forceinline void sFSinCos(const float x,sF32 &sine,sF32 &cosine)
{
  __asm
  {
    fld x;                  // st(0) = x
    fsincos;                // st(0) = cos(x), st(1) = sin(x)
    mov eax,[cosine];       // eax = address behind the 'cosine' reference
    fstp dword ptr [eax];   // cosine = st(0); pop, so sin(x) moves to st(0)
    mov eax,[sine];         // eax = address behind the 'sine' reference
    fstp dword ptr [eax];   // sine = st(0); pop, FPU stack is balanced again
  }
}

// Computes var_a * var_b / var_c with a 64-bit intermediate product.
// The quotient is returned in eax (no return statement).
// idiv raises a divide error if var_c is 0 or if the quotient does not fit
// into 32 bits; no check is made here.
__forceinline sInt sMulDiv(sInt var_a,sInt var_b,sInt var_c)
{
  __asm
  {
    mov eax,var_a           // eax = a
    imul var_b              // edx:eax = a * b (signed 64-bit product)
    idiv var_c              // eax = edx:eax / c, edx = remainder
  }
}

// Computes (var_a * var_b) >> 16 using the full 64-bit product and returns
// the low 32 bits of the shifted value in eax (no return statement).
// Presumably a 16.16 fixed-point multiply; confirm against the callers.
__forceinline sInt sMulShift(sInt var_a,sInt var_b)
{
  __asm
  {
    mov eax, var_a          // eax = a
    imul var_b              // edx:eax = a * b (signed 64-bit product)
    shrd eax, edx, 16       // eax = bits 16..47 of the product
  }
}

// Computes (var_a << 16) / var_b with a 64-bit dividend and returns the
// quotient in eax (no return statement).
// Presumably a 16.16 fixed-point divide; confirm against the callers.
// idiv raises a divide error if var_b is 0 or if the quotient does not fit
// into 32 bits; no check is made here.
__forceinline sInt sDivShift(sInt var_a,sInt var_b)
{
  __asm
  {
    mov eax,var_a           // eax = a
    mov edx,eax             // edx = a
    shl eax,16              // eax = low 32 bits of (a << 16)
    sar edx,16              // edx = high 32 bits of (a << 16), sign-extended
    idiv var_b              // eax = edx:eax / b
  }
}

// Restore warning C4035 ("no return value") to its default behavior.
#pragma warning (default : 4035) 

Some floating-point functions are not true intrinsics; even the built-in version relies on the runtime library. Here is an implementation that does not need it:


// Floating-point remainder of a / b (same sign as a), computed with the x87
// fprem instruction instead of the runtime library's fmod().
//
// fprem only performs a partial reduction when the exponents of a and b
// differ by 64 or more; it signals "not finished" by setting C2 in the FPU
// status word. The instruction is therefore repeated until C2 is clear, so
// that large a / small b also yields the true remainder.
sF64 sFMod(sF64 a,sF64 b)
{
  __asm
  {
    fld   qword ptr [b];    // st(0) = b
    fld   qword ptr [a];    // st(0) = a, st(1) = b

again:
    fprem;                  // st(0) = partial remainder of st(0) / st(1)
    fnstsw ax;              // ax = FPU status word
    sahf;                   // C2 (status bit 10) -> parity flag
    jp    again;            // C2 set: reduction incomplete, iterate

    fstp  st(1);            // drop b, keep the remainder in st(0)
    fstp  qword ptr [a];    // store the result in 'a' and pop
  }

  return a;
}

// Computes a raised to the power b as 2^(b*log2(a)) without the runtime
// library. The exponent y = b*log2(a) is split into an integer part, which is
// turned into a power of two by building an 80-bit extended float by hand,
// and a fractional part, which goes through f2xm1.
// The result is left in st(0) as the return value (no return statement).
// The parameter 'a' is reused as scratch memory for the integer part.
// Special cases visible in the code:
//   - a compares equal to 0 -> a is returned unchanged, regardless of b
//   - biased exponent <= 0       -> 0.0 is returned
//   - biased exponent >= 0x8000  -> +infinity is returned
// NOTE(review): a negative 'a' is not handled specially; fyl2x is not
// defined for negative inputs, so verify what callers expect in that case.
sF64 sFPow(sF64 a,sF64 b)
{
  // faster pow based on code by agner fog
  __asm
  {
    fld   qword ptr [b];            // st(0) = b
    fld   qword ptr [a];            // st(0) = a, st(1) = b

    ftst;                           // compare a with 0.0
    fstsw ax;                       // ax = FPU status word
    sahf;                           // C3 -> ZF
    jz    zero;                     // a == 0 (or unordered): return a

    fyl2x;                          // st(0) = y = b * log2(a)
    fist  dword ptr [a];            // low dword of a = y rounded to int (current rounding mode)
    sub   esp, 12;                  // reserve room for a 10-byte extended float
    mov   dword ptr [esp],0;        // mantissa low dword  = 0
    mov   dword ptr [esp+4],0x80000000; // mantissa high dword = 1.0
    fisub dword ptr [a];            // st(0) = fractional part of y
    mov   eax, dword ptr [a];       // eax = integer part of y
    add   eax, 0x3fff;              // add the extended-precision exponent bias
    mov   [esp+8], eax;             // exponent field of the temporary (mov keeps the flags of 'add')
    jle   underflow;                // biased exponent <= 0
    cmp   eax, 0x8000;
    jge   overflow;                 // biased exponent does not fit
    f2xm1;                          // st(0) = 2^frac - 1
    fld1;
    fadd;                           // st(0) = 2^frac
    fld   tbyte ptr [esp];          // st(0) = 2^int, st(1) = 2^frac
    add   esp, 12;                  // release the temporary
    fmul;                           // st(0) = 2^int * 2^frac
    jmp   end;

underflow:
    fstp  st;                       // discard the fractional part
    fldz;                           // result = 0.0
    add   esp, 12;                  // release the temporary
    jmp   end;

overflow:
    push  0x7f800000;               // bit pattern of single-precision +infinity
    fstp  st;                       // discard the fractional part
    fld   dword ptr [esp];          // result = +infinity
    add   esp, 16;                  // release the temporary (12) and the pushed dword (4)
    jmp   end;

zero:
    fstp  st(1);                    // drop b, leave a in st(0)

end:
  }
}


// Computes e^f as 2^(f*log2(e)) without the runtime library.
// y = f*log2(e) is split with fprem (divisor 1.0) into a fractional part r,
// which goes through f2xm1, and the remaining integer part, which fscale
// applies as a power of two.
// NOTE(review): fprem is executed once only; for very large |y| it may
// return just a partial remainder. Confirm the expected input range.
sF64 sFExp(sF64 f)
{
  __asm
  {
    fld   qword ptr [f];    // st(0) = f
    fldl2e;                 // st(0) = log2(e), st(1) = f
    fmulp st(1), st;        // st(0) = y = f * log2(e)

    fld1;                   // st(0) = 1, st(1) = y
    fld   st(1);            // st(0) = y, st(1) = 1, st(2) = y
    fprem;                  // st(0) = r = remainder of y / 1
    f2xm1;                  // st(0) = 2^r - 1
    faddp st(1), st;        // st(0) = 1 + (2^r - 1) = 2^r, st(1) = y
    fscale;                 // st(0) = 2^r * 2^trunc(y)

    fstp  st(1);            // drop y, keep the result in st(0)
    fstp  qword ptr [f];    // store the result in 'f' and pop
  }

  return f;
}

jack · Post by **jack** » Sat Sep 17, 2005 10:22 pm

good stuff

Rescator · Post by **Rescator** » Mon Sep 19, 2005 11:40 pm

This only uses x87 instructions right? so should be the most cpu compatible fpu instructions as opposed to MMX, SSE etc. variants. or?