Coding - FPU intrinsics
Posted: Sat Sep 17, 2005 9:59 pm
Just another interesting article by chaos/farbrausch! I am sharing it here because I think there is some interesting stuff inside. Here we go...
If you are doing 64k intros, you cannot rely on the runtime library for some basic FPU functions. Most of them are provided as intrinsics by VC++, but some need to be implemented yourself.
Most of the fpu assembly code in here was written or found by ryg.
This is a header that gives you access to most basic fpu functions. Note that you don't need to include math.h or anything else to get this working.
If you are doing 64k intros, you cannot rely on the runtime library for some basic FPU functions. Most of them are provided as intrinsics by VC++, but some need to be implemented yourself.
Most of the fpu assembly code in here was written or found by ryg.
This is a header that gives you access to most basic fpu functions. Note that you don't need to include math.h or anything else to get this working.
Code: Select all
/****************************************************************************/
/*** ***/
/*** Intrinsics ***/
/*** ***/
/****************************************************************************/
// Hand-written prototypes for the compiler intrinsics used below, so that
// neither math.h nor any other runtime header has to be included.
typedef unsigned int size_t;

extern "C"
{
// --- integer ---
int __cdecl abs(int);

// --- memory / string ---
void * __cdecl memset( void *dest, int c, size_t count );
void * __cdecl memcpy( void *dest, const void *src, size_t count );
int __cdecl memcmp( const void *buf1, const void *buf2, size_t count );
size_t __cdecl strlen( const char *string );

// --- floating point, listed as "true intrinsic" below ---
double __cdecl atan(double);
double __cdecl atan2(double,double);
double __cdecl cos(double);
double __cdecl exp(double);
double __cdecl log(double);
double __cdecl log10(double);
double __cdecl sin(double);
double __cdecl sqrt(double);
double __cdecl tan(double);
double __cdecl fabs(double);

// --- floating point, listed as "fake intrinsic" below ---
double __cdecl acos(double);
double __cdecl asin(double);
double __cdecl cosh(double);
double __cdecl fmod(double,double);
double __cdecl pow(double,double);
double __cdecl sinh(double);
double __cdecl tanh(double);
}

// Ask the compiler to treat every function declared above as an intrinsic.
#pragma intrinsic(abs)                                            // int intrinsic
#pragma intrinsic(memset, memcpy, memcmp, strlen)                 // memory intrinsic
#pragma intrinsic(atan, atan2, cos, exp, log, log10, sin, sqrt, tan, fabs) // true intrinsic
#pragma intrinsic(acos, asin, cosh, fmod, pow, sinh, tanh)        // fake intrinsic
// Absolute value of an integer (abs intrinsic).
__forceinline sInt sAbs(sInt i)
{
  return abs(i);
}

// Sets c bytes starting at dd to the byte value s (memset intrinsic).
__forceinline void sSetMem(sPtr dd,sInt s,sInt c)
{
  memset(dd,s,c);
}

// Copies c bytes from ss to dd (memcpy intrinsic).
__forceinline void sCopyMem(sPtr dd,const void *ss,sInt c)
{
  memcpy(dd,ss,c);
}

// Compares c bytes of dd and ss; returns the memcmp result as sInt.
__forceinline sInt sCmpMem(const sPtr dd,const void *ss,sInt c)
{
  const int order = memcmp(dd,ss,c);
  return (sInt)order;
}

// Length of the zero-terminated string s, as sInt (strlen intrinsic).
__forceinline sInt sGetStringLen(const sChar *s)
{
  const size_t length = strlen(s);
  return (sInt)length;
}
// Thin sF64 wrappers around the floating-point intrinsics declared above.
// Each one forwards its argument(s) unchanged and returns the result as is.

// arc tangent of f
__forceinline sF64 sFATan(sF64 f)
{
  return atan(f);
}

// two-argument arc tangent, forwarded as atan2(a,b)
__forceinline sF64 sFATan2(sF64 a,sF64 b)
{
  return atan2(a,b);
}

// cosine of f
__forceinline sF64 sFCos(sF64 f)
{
  return cos(f);
}

// absolute value of f
__forceinline sF64 sFAbs(sF64 f)
{
  return fabs(f);
}

// log(f)
__forceinline sF64 sFLog(sF64 f)
{
  return log(f);
}

// log10(f)
__forceinline sF64 sFLog10(sF64 f)
{
  return log10(f);
}

// sine of f
__forceinline sF64 sFSin(sF64 f)
{
  return sin(f);
}

// square root of f
__forceinline sF64 sFSqrt(sF64 f)
{
  return sqrt(f);
}

// arc cosine of f
__forceinline sF64 sFACos(sF64 f)
{
  return acos(f);
}

// arc sine of f
__forceinline sF64 sFASin(sF64 f)
{
  return asin(f);
}

// hyperbolic cosine of f
__forceinline sF64 sFCosH(sF64 f)
{
  return cosh(f);
}

// hyperbolic sine of f
__forceinline sF64 sFSinH(sF64 f)
{
  return sinh(f);
}

// hyperbolic tangent of f
__forceinline sF64 sFTanH(sF64 f)
{
  return tanh(f);
}

// reciprocal square root, computed as 1.0 divided by sqrt(f)
__forceinline sF64 sFInvSqrt(sF64 f)
{
  return 1.0/sqrt(f);
}
// sFMod / sFExp / sFPow come in two flavours, selected by sINTRO:
//  - sINTRO set:   only prototypes here; the bodies are provided separately.
//  - sINTRO unset: inline forwarders to fmod / exp / pow.
#if sINTRO
sF64 sFPow(sF64 a,sF64 b);
sF64 sFMod(sF64 a,sF64 b);
sF64 sFExp(sF64 f);
#else
__forceinline sF64 sFMod(sF64 a,sF64 b)
{
  return fmod(a,b);
}
__forceinline sF64 sFExp(sF64 f)
{
  return exp(f);
}
__forceinline sF64 sFPow(sF64 a,sF64 b)
{
  return pow(a,b);
}
#endif
/****************************************************************************/
/*** ***/
/*** asm ***/
/*** ***/
/****************************************************************************/
// Silence warning 4035 for this section: several functions below have no C++
// return statement and leave their result in eax or st(0) from inline asm.
#pragma warning (disable : 4035)
// Clears the pending x87 exception flags and loads the FPU control word
// 0x103f (round to nearest even, single precision).
// Uses 4 bytes of stack temporarily and overwrites eax.
__forceinline void sFloatFix()
{
__asm
{
fclex; // clear pending FPU exception flags
push 0103fh; // control word: round to nearest even + single precision
fldcw [esp]; // load the control word from the slot that was just pushed
pop eax; // release the stack slot; the value popped into eax is not used
}
}
// Clears the pending x87 exception flags and loads the FPU control word
// 0x123f (round to nearest even, double precision).
// Uses 4 bytes of stack temporarily and overwrites eax.
__forceinline void sFloatDouble()
{
__asm
{
fclex; // clear pending FPU exception flags
push 0123fh; // control word: round to nearest even + double precision
fldcw [esp]; // load the control word from the slot that was just pushed
pop eax; // release the stack slot; the value popped into eax is not used
}
}
// Clears the pending x87 exception flags and loads the FPU control word 0x141f.
// NOTE(review): the intent of this value is not documented here. Compared with
// sFloatFix (0x103f) it sets bit 10 and clears bit 5; per the x87 control-word
// layout that would select round-down and unmask the precision exception.
// Confirm against the Intel manual and the callers before relying on it.
// Uses 4 bytes of stack temporarily and overwrites eax.
__forceinline void sFloatDen1()
{
__asm
{
fclex; // clear pending FPU exception flags
push 0141fh; // control word 0x141f (meaning: see NOTE above)
fldcw [esp]; // load the control word from the slot that was just pushed
pop eax; // release the stack slot; the value popped into eax is not used
}
}
// Clears the pending x87 exception flags and loads the FPU control word 0x143f.
// NOTE(review): the intent of this value is not documented here. Compared with
// sFloatFix (0x103f) it only sets bit 10; per the x87 control-word layout that
// would select round-down with all exceptions masked. Confirm before relying on it.
// Uses 4 bytes of stack temporarily and overwrites eax.
__forceinline void sFloatDen0()
{
__asm
{
fclex; // clear pending FPU exception flags
push 0143fh; // control word 0x143f (meaning: see NOTE above)
fldcw [esp]; // load the control word from the slot that was just pushed
pop eax; // release the stack slot; the value popped into eax is not used
}
}
// Converts f to a 32-bit integer with fistp, i.e. rounded according to the
// FPU's current rounding mode rather than with a C-style cast.
// There is no C++ return statement: the result is left in eax.
__forceinline sInt sFtol (const float f)
{
__asm
{
fld f // st(0) = f
push eax // reserve a 4-byte stack slot (the pushed value is irrelevant)
fistp dword ptr [esp] // store st(0) as a 32-bit integer into the slot and pop the FPU stack
pop eax // eax = converted integer = return value
}
}
// Rounds f to an integral value with frndint (which follows the FPU's current
// rounding mode). There is no C++ return statement: the result is left in st(0).
__forceinline sF32 sFRound (const float f)
{
__asm
{
fld f // st(0) = f
frndint // st(0) = f rounded to an integral value; stays in st(0) as the return value
}
}
// Computes sine and cosine of x with a single fsincos instruction and stores
// them as single-precision values through the two reference parameters.
// Overwrites eax.
// NOTE(review): no check is made that fsincos accepted the argument (it leaves
// the operand untouched for very large |x|) - confirm callers stay in range.
__forceinline void sFSinCos(const float x,sF32 &sine,sF32 &cosine)
{
__asm
{
fld x; // st(0) = x
fsincos; // st(0) = cos(x), st(1) = sin(x)
mov eax,[cosine]; // eax = address behind the 'cosine' reference
fstp dword ptr [eax]; // cosine = st(0); pop -> st(0) = sin(x)
mov eax,[sine]; // eax = address behind the 'sine' reference
fstp dword ptr [eax]; // sine = st(0); pop -> FPU stack is balanced again
}
}
// Computes var_a * var_b / var_c using a 64-bit intermediate product, so the
// multiplication itself cannot overflow. Signed arithmetic throughout.
// There is no C++ return statement: the quotient is left in eax; edx is
// overwritten (it holds the remainder afterwards).
// Note: idiv raises a divide error if var_c is 0 or the quotient does not fit
// in 32 bits; nothing here guards against that.
__forceinline sInt sMulDiv(sInt var_a,sInt var_b,sInt var_c)
{
__asm
{
mov eax,var_a // eax = a
imul var_b // edx:eax = a * b (signed 64-bit product)
idiv var_c // eax = edx:eax / c (quotient = return value), edx = remainder
}
}
// Computes (var_a * var_b) >> 16 from the full signed 64-bit product, i.e.
// returns bits 16..47 of the product (presumably used as a 16.16 fixed-point
// multiply - verify against callers).
// There is no C++ return statement: the result is left in eax; edx is overwritten.
__forceinline sInt sMulShift(sInt var_a,sInt var_b)
{
__asm
{
mov eax, var_a // eax = a
imul var_b // edx:eax = a * b (signed 64-bit product)
shrd eax, edx, 16 // eax = (edx:eax >> 16), low 32 bits = return value
}
}
// Computes (var_a << 16) / var_b, with the shifted dividend held as a signed
// 64-bit value in edx:eax (presumably a 16.16 fixed-point divide - verify
// against callers).
// There is no C++ return statement: the quotient is left in eax; edx is
// overwritten (it holds the remainder afterwards).
// Note: idiv raises a divide error if var_b is 0 or the quotient does not fit
// in 32 bits; nothing here guards against that.
__forceinline sInt sDivShift(sInt var_a,sInt var_b)
{
__asm
{
mov eax,var_a // eax = a
mov edx,eax // edx = a
shl eax,16 // eax = low 32 bits of (a << 16)
sar edx,16 // edx = high 32 bits of (a << 16), sign-extended
idiv var_b // eax = edx:eax / b (quotient = return value), edx = remainder
}
}
// Restore warning 4035 to its default state after the asm-return functions.
#pragma warning (default : 4035)
Some floating-point functions are not true intrinsics; even the built-in version relies on the runtime library. Here is an implementation that does without it:
// Floating-point remainder of a / b, implemented without the runtime library.
// The result has the sign of a (fprem truncates the quotient toward zero).
//
// a, b    : dividend and divisor
// returns : a - q*b, where q is a/b truncated toward zero
//
// A single fprem only performs a *partial* reduction: when the exponents of
// a and b differ by 64 or more it stops early, sets the C2 status flag and
// leaves an intermediate value in st(0). The instruction therefore has to be
// repeated until C2 is clear, otherwise inputs such as sFMod(1e300,3.0)
// return a wrong result.
// Overwrites eax and the CPU flags.
sF64 sFMod(sF64 a,sF64 b)
{
  __asm
  {
    fld qword ptr [b];        // st(0) = b
    fld qword ptr [a];        // st(0) = a, st(1) = b
reduce:
    fprem;                    // st(0) = partial remainder of st(0) / st(1)
    fstsw ax;                 // ax = FPU status word
    sahf;                     // ah -> flags: C2 (status bit 10) lands in PF
    jp reduce;                // C2 set -> reduction incomplete, run fprem again
    fstp st(1);               // drop b, keep the remainder in st(0)
    fstp qword ptr [a];       // store the remainder back into a, FPU stack balanced
  }
  return a;
}
// Computes a^b without the runtime library as 2^(b*log2(a)).
// The exponent y = b*log2(a) is split into an integer part n and a fractional
// part; 2^fraction comes from f2xm1, and 2^n is built by hand as an 80-bit
// extended value on the stack (mantissa 1.0, biased exponent n+0x3fff).
// Special paths: a == 0 returns a unchanged; a result too small returns 0;
// a result too large returns +infinity.
// There is no C++ return statement: the result is left in st(0).
// The parameter a is reused as integer scratch space. Overwrites eax and flags.
// NOTE(review): negative a is not handled specially, and if y does not fit in
// a 32-bit integer the fist result is not a usable exponent - both cases look
// like they end up in the underflow path; confirm before relying on them.
sF64 sFPow(sF64 a,sF64 b)
{
// faster pow based on code by agner fog
__asm
{
fld qword ptr [b]; // st(0) = b
fld qword ptr [a]; // st(0) = a, st(1) = b
ftst; // compare a against 0.0
fstsw ax; // ax = FPU status word
sahf; // condition codes -> CPU flags (C3 -> ZF)
jz zero; // a == 0: skip the computation
fyl2x; // st(0) = y = b * log2(a)  (pops one entry)
fist dword ptr [a]; // n = y rounded to integer, stored in the low dword of a (no pop)
sub esp, 12; // reserve 12 bytes for an 80-bit temporary
mov dword ptr [esp],0; // mantissa, low 32 bits = 0
mov dword ptr [esp+4],0x80000000; // mantissa, high 32 bits: only the integer bit set -> 1.0
fisub dword ptr [a]; // st(0) = y - n, the fractional part
mov eax, dword ptr [a]; // eax = n
add eax, 0x3fff; // eax = n + extended-precision exponent bias; sets the flags for jle below
mov [esp+8], eax; // exponent field of the temporary (mov leaves the flags untouched)
jle underflow; // biased exponent <= 0: result too small
cmp eax, 0x8000;
jge overflow; // biased exponent >= 0x8000: result too large
f2xm1; // st(0) = 2^fraction - 1
fld1;
fadd; // st(0) = 2^fraction
fld tbyte ptr [esp]; // st(0) = 2^n, loaded from the hand-built temporary
add esp, 12; // release the temporary
fmul; // st(0) = 2^fraction * 2^n = a^b
jmp end;
underflow:
fstp st; // discard the fractional part
fldz; // result = 0.0
add esp, 12; // release the temporary
jmp end;
overflow:
push 0x7f800000; // bit pattern of single-precision +infinity
fstp st; // discard the fractional part
fld dword ptr [esp]; // result = +infinity
add esp, 16; // release the pushed dword (4) and the temporary (12)
jmp end;
zero:
fstp st(1); // drop b; st(0) = a (zero) is the result
end:
}
}
// Computes e^f without the runtime library as 2^(f*log2(e)).
// x = f*log2(e) is split with fprem into a fractional part (same sign as x,
// magnitude below 1) that goes through f2xm1, while fscale applies the
// integer part of x as a power of two.
// The parameter f is reused to carry the result out of the asm block.
sF64 sFExp(sF64 f)
{
__asm
{
fld qword ptr [f]; // st(0) = f
fldl2e; // st(0) = log2(e), st(1) = f
fmulp st(1), st; // st(0) = x = f * log2(e)
fld1; // st(0) = 1, st(1) = x
fld st(1); // st(0) = x, st(1) = 1, st(2) = x
fprem; // st(0) = remainder of x / 1 = fractional part of x
f2xm1; // st(0) = 2^fraction - 1
faddp st(1), st; // st(0) = 2^fraction (adds the 1.0 back), st(1) = x
fscale; // st(0) = 2^fraction * 2^trunc(x) = e^f
fstp st(1); // drop x, keep the result in st(0)
fstp qword ptr [f]; // store the result into f, FPU stack balanced
}
return f;
}