SSE exp() denkfehler??

zeusosc

Hi asm'ler,.

ich habe versucht eine etwas schnellere exp(x) function zu schreiben,.. für 0<x<1 scheint es auch zu funktionieren,... habe ich irgendwo einen denkfehler??
Ich habe den source code auch mit kommentaren versehen,.. danke für eventuelle hilfe,...

meine idee:

$exp(x)=exp(2^{l}I)=exp(I)^{2^{l}}=\left(2^kJ\right)^{2^{l}}=\left(2^{k+l}\right)\cdot J^{2^{l}}$

$=\left(2^{k+l}\right)\cdot \left(2^t\cdot S\right)=\left(2^{k+l+t}\right)\cdot S$

die durchführung:

double __cdecl _expf_test(double x)
{
//cpu_1.GetMilliseconds();
//	
//static const unsigned __int64 SignPosd =0x8000000000000000;
//static const unsigned __int64 ExponentPosd=0x7ff0000000000000;
//static const unsigned __int64 MantissaPosd=0xFFFFFFFFFFFFF
unsigned __int64 pre=0;
static const unsigned __int64 preFak	=0xfff0000000000000;
static const unsigned __int64 zerofak	=0x3fe0000000000000;
static const unsigned __int64 zerofak2	=0x3fe;
static const double one		=1.0f;
double debug_double=0.0f;

unsigned __int64 temp_x5=0;
	__asm{

		//set the register
			movsd xmm0,preFak  // store the sign and exponent mask
			movsd xmm1,x		// store the value
			movsd xmm2,MantissaPosd	//store the mantissa mask
			movsd xmm3,zerofak	//store the 2^0 exponent
			movsd xmm4,xmm2		//set the mantissa mask

		//get the prefactors (sign && exponent) to xmm0 from 2^k * I = x
			andpd xmm0,xmm1 //we have now the sign and exponent of x stored at xmm0

		//get the mantissa and set it zero fak, 2^0 * I
			andpd xmm2,xmm1
			orpd xmm2,xmm3 //we have now the zero based mantissa of x stored at xmm2
#ifdef _DEBUG
			movsd debug_double,xmm2
#endif

//#################################################################################
// talor row till n=10

		//calculate the e^I
			movsd	xmm3, facd10p
			mulsd	xmm3, xmm2

			addsd	xmm3, facd9p
			mulsd	xmm3,xmm2

			addsd	xmm3, facd8p
			mulsd	xmm3,xmm2

			addsd	xmm3, facd7p
			mulsd	xmm3,xmm2

			addsd	xmm3, facd6p
			mulsd	xmm3,xmm2

			addsd	xmm3, facd5p
			mulsd	xmm3,xmm2

			addsd	xmm3, facd4p
			mulsd	xmm3,xmm2

			addsd	xmm3, facd3p
			mulsd	xmm3,xmm2

			addsd	xmm3, facd2p
			mulsd	xmm3,xmm2

			addsd	xmm3,facd1p
			mulsd	xmm3,xmm2

			addsd	xmm3,facd1p //now we have the taylor till n=10 of e^I stored at xmm3
#ifdef _DEBUG
			movsd debug_double,xmm3
#endif

//#################################################################################
// prepare for power the taylor row result (exp(I))^2^(Exponent)
// we just square(root) the mantissa, and add the exponents together
//	exp(I)=2^k *J
// => exp(I)^(2^(Exponent))=2^(k+Exponent) * J^(2^(Exponent))
// the squares of J will causes an new exponent t we have to add also
//=> J^(2^(Exponent))=2^t * S =>exp(I)^(2^(Exponent))=2^(k+Exponent + t) *S
//#################################################################################

// get the mantissa of result
			movsd xmm2,xmm4
			movsd xmm5,zerofak
			andpd xmm2,xmm3
			orpd  xmm2,xmm5 //now we have the zerofak mantissa of xmm3 stored at xmm2
#ifdef _DEBUG
			movsd debug_double,xmm2
#endif

//get Exponent of x without the sign end shift to right, first pos of first int
			movsd xmm5,ExponentPosd
			andpd xmm5,xmm1
			psrlq xmm5,52
			movsd temp_x5,xmm5

//we have now the sign and exponent of x stored at xmm0
//we have stored the value x at xmm1
//now we have the zerofak mantissa of xmm3 stored at xmm2
//now we have the taylor till n=10 of e^I stored at xmm3
//we have also stored the MantissaPosd mask to xmm4
//and we have stored the Exponent of x stored at xmm5,xmm6 shifted to right

//compare if the extracted exponent is positiv or negative
			mov eax,dword ptr[temp_x5]
			cmp eax,0x3fe
			je SHORT _fin_power1
			jge SHORT _pos_fak
//#################################################################################
// negative exponent of x causes to take some square roots
_neg_fak:

_neg_fak_loop_start:
			cmp eax,0x3fe
			jge SHORT _neg_fak_loop_end

			sqrtpd xmm2,xmm2

			add eax,1
			jmp SHORT _neg_fak_loop_start
_neg_fak_loop_end:

			jmp SHORT _fin_power1

//#################################################################################
//positive exponent of x causes to take some squares
_pos_fak:

_pos_fak_loop_start:
		cmp eax,0x3fe
		jnge SHORT _pos_fak_loop_end

		mulsd xmm2,xmm2

		sub	eax,1
		jmp SHORT _pos_fak_loop_start
_pos_fak_loop_end:

//#################################################################################
//we are finish to power the xmm3 mantissa to 2^k, now we have to add the exponents

//we have now the sign and exponent of x stored at xmm0
//we have stored the value x at xmm1
//--we have now stored the J^(2^(Exponent)) at xmm2
//now we have the taylor till n=10 of e^I stored at xmm3
//we have also stored the MantissaPosd mask to xmm4
//--and we have stored the zerofak stored at xmm5 shifted to right
//and we have stored the Exponent of x stored at xmm6 shifted to right

//#################################################################################
// let us get the exponent of xmm2, xmm5 and add it to xmm3
_fin_power1:
#ifdef _DEBUG
			movsd debug_double,xmm2
#endif

			movsd	temp_x5,xmm5
			mov eax,dword ptr[temp_x5]
			sub eax,0x3fe

//get the exponent of xmm2
			movsd xmm6,ExponentPosd
			andpd xmm6,xmm2
			psrlq xmm6,52

//store it to temp_x5			
			movsd temp_x5,xmm6
//substract 0x3fe
			mov ebx,dword ptr[temp_x5]
			sub ebx,0x3fe
//add it to eax
			add eax,ebx

//get the exponent of xmm3
			movsd xmm6,ExponentPosd
			movsd xmm5,xmm3
			andpd xmm5,xmm6
			psrlq xmm5,52
//store it to temp_x5
			movsd temp_x5,xmm5
			mov ecx,dword ptr[temp_x5]
			sub ecx,0x3fe
//add it to eax
			add eax,ecx
//now add 0x3fe again
			add eax,0x3fe
//bis hier scheint es ok
//now only xmm2 is useful, we need the mantissa

// get the mantissa of result
			movsd xmm5,MantissaPosd
			movsd xmm6,xmm2
			andpd xmm6,xmm5 //mantissa of xmm2 is stored in xmm6
//#ifdef _DEBUG
//			movsd xmm5,zerofak
//			orpd xmm6,xmm5
//			movsd debug_double,xmm6
//
//#endif

//let us set the new Exponent
			mov dword ptr[temp_x5],eax
			movsd xmm1,temp_x5
			psllq xmm1,52
			orpd  xmm1,xmm6 //now we have the new exponent and S stored to xmm1

//now let us have an look if sgn has been set at xmm0
			movsd xmm3,SignPosd
			movsd xmm5,xmm0
			andpd xmm5,xmm3
			movsd temp_x5,xmm5
			mov eax,dword ptr[temp_x5]
			cmp eax,0
			je SHORT _end
//if sign has been set, we have to calculate the reciproque of xmm1
			movsd xmm2,one
			divsd xmm2,xmm1
			movsd xmm1,xmm2
_end:
			movsd x,xmm1
	};
return x;
};

keine direkte Hilfe, aber sehr gute Literatur zu diesem Thema:

Titel:  Math Toolkit for Real-Time Development
Autor:  Jack W. Crenshaw
ISBN:   1-929629-09-5

masm

zeusosc

Danke für den Tipp,.. ich werde es mir mal anschauen,... es wird aber noch rund 7 tage dauern bis es hier ist,...

ich denke aber das ich mit der liniearisierung der exponential funktion sowie die Portenzdarstellung nicht falsch liege:

$exp(x)=exp(2^lI)=exp(I)^{2^l}=:\left(\sum_{n}^{10}\frac{I^n}{n!}\right)^{2^l}\forall I\in \{0,\ldots,1\}\subset\mathbb{R}$

(ok,.. bei n=4 ist das Taylor poly eigentlich exakt genug,.... ich habe es dennoch mal bis 10 laufen lassen,...)

dabei nutze ich halt aus das jeder double typ (nach IEE754) in der form

$x:\{0,1\}\times\mathbb{N}^{\pm}_0\times\{0,\ldots,1\}\subset\mathbb{R}\to\mathbb{X}\subset\mathbb{R}$

$x(s,\lambda,a)\mapsto s\cdot2^\lambda \cdot a$

ist.

Für jede weitere hilfe wäre ich dankbar,..

zeusilein
-------------------------
ok, $$\mathbb{N}^\pm_0$$ stimmt nicht ganz eher eine kleine untermenge davon,.. ihr wisst aber was ich meine

zeusosc

jaaa,. ein fehler gefunden:

//###########################################################################
//positive exponent of x causes to take some squares
_pos_fak:

_pos_fak_loop_start:
        //cmp eax,0x3fe //<<<hier
        cmp eax,0x3ff
        jnge SHORT _pos_fak_loop_end

        mulsd xmm2,xmm2

        sub    eax,1
        jmp SHORT _pos_fak_loop_start
_pos_fak_loop_end:

setzt man den exponenten vorher nicht auf null, so liefert dieser schon den richtigen wert für postive ganze zahlen. Setzt man den exponenten auf 1, so liefert das schon mal die richtige mantisse.
---------------------------------------------
Edit:
enen zweiten fehler gefunden: bei negativen potenzen haben die einzelnen summanden des Taylor Poly wechselnde vorzeichen, wodurch sich natürlich die mantisse und s ändert,... d.h. das sign flag darf ich garnicht herausfiltern,..

zeusosc

in diesem teil des forums is ja nicht viel los

also ich habe das problem gelöst.

--------------
edit:
Aber leider nimmt die laufzeit wegen der schleifen für große exponenten linear zu. Damit ist für große und kleine Zahlen die laufzeit leider fürn popo.....
Während die exp() der math.h quasi eine konstante laufzeit aufweißt,....

-------------

Ich werde gleich mal den benchmark plus korregierte version posten, aber ich habe noch ne frage zur abschätzung des fehlers,..

Wie genau ist exp() ?

grüße

abc.w

zeusosc schrieb:

in diesem teil des forums is ja nicht viel los

Na, dann mach deine Funktionen "portabel", damit man sie unter Linux bauen und damit rumspielen kann

zeusosc

... sagen wir mal so,...
da ich ja jetzt einen guten lösungsansatz für eine konstante laufzeit habe, versuche ich die mathematische und konzeptionelle beschreibung so zu gestalten das jeder der will auch gerne auf ein anderes system (auf x64, avr,m430) portieren kann, wenn es äquivalente mnemonics gibt xD

zeusosc

Happy :xmas1: ,

also ich hatte mir das ja nochmal angeschaut usw...
und habe einen evtl. schon bekannten ansatz genommen, ich hoffe ihr versteht
diesen auf anhieb und könnt mich dann auch sofort korregieren

Zuerst schreibe ich die Aufgabe ein bissl um:

$e^{x}=y \Leftrightarrow y= 2^{x\cdot \text{Log}_2(e)}$

Da der VS2008 Compiler ein bissl doof ist,(der schneidet bei der 7ten
nachkommastelle gerne etwas ab) habe ich die double wert für

\text{Log}_2(e)$$ per hand ausgerechnet: ```cpp static const __int64 l2de =0x3FF71547652B82FE; //1,4426950408889634 ``` Nun kommt trick17: ich zerlege den exponenten $$ x\cdot\text{Log}_2(e)

in zwei Komponenten, einmal den nachkommaanteil und den Ganzzahlanteil:

$2^{x\cdot \text{Log}\_2(e)}=2^{F\_L + F\_H}=2^{F\_L}\cdot 2^{F_H}$

Dabei bezeichnet $$F_L,F_H$$ den "FracLow" und "FracHigh" des
exponenten.

Dann haben die Faktoren folgende Eigenschaften:
A) $$2^{F_L} \in \left[1,2\right] \Leftrightarrow F_L \in \left[0,1\right]$$
und für die double schreibweise:

$\Rightarrow \left(1+\frac{M_{F\_L}}{2^{52}}\right)\cdot 2^{E\_{F_L}}$

$$2^{F_H}\in \mathbb{N}_\pm \Leftrightarrow F_H \in \mathbb{N}_\pm$$
und für die double schreibweise:

$2^{F\_H} \in \mathbb{N}\_\pm \Rightarrow M_{F\_H}=0 \Rightarrow \left(1+\frac{0}{2^{52}}\right)\cdot 2^{E\_{F\_H}}\cdot (\pm 1\_{F_H})$

Die Folgerung währe:

$y=2^{F\_L}\cdot 2^{F\_H} \Rightarrow \left(1+\frac{M_{F\_L}}{2^{52}}\right)\cdot (\pm 1\_{F\_H}) \cdot 2^{E\_{F\_L} + E\_{F_H}}$

Würde man sich wohl mit 2-4 stellen genauigkeit nach dem komma genügen,
so würde es reichen den Fraclow als mantisse zu nehmen
(lineare regression von $$ \sqrt[n]{2}$$), ansonsten halt ne
Reihenentwicklung, so wie ich es gemacht habe....

Ich habe vergessen: $$ (\pm 1_{F_H}) \rightarrow 1 \wedge {E_{F_H}}\in \mathbb{N}_\pm$$
das vorzeichen von dem exponenten fließt ja in den Ganzkommaanteil

F_H$$ und somit in $$E_{F_H}$$ ein.... Comments are welcome

:xmas2:

zeusosc

Soo, ich denke ich bin fertig, mit ner akzeptablen abstand zur exp funktion der math.h...

Hier einmal die Ausgabe der testdurchläufe:
(//Compiled with /arch:SSE2)

Cycles:                 1e+009
Runtime at all: 80073.3
Runtime per Cycle: 8.00733e-005

-------          exp()   -------
Cycles:                 1e+009
Runtime:                        39642.5
Runtime per Cycle:      3.96425e-005
Result:                 6.05732e+021

-------          _expf()         -------
Cycles:                 1e+009
Runtime:                        37629.8
Runtime per Cycle:      3.76298e-005
Result:                 6.05732e+021

-----------------------------------------------------
Absolute Timedifference: 2012.65
Arithmated middle mantissa distance between exp() and _expf():  0.000355568
Please press enter....

Hier die Testfunktionen (in c/c++)

//Compiled with /arch:SSE2

std::string dts(double x)
{
	std::stringstream strRet;
	strRet<<x;
	return strRet.str();
};

std::string _mark(std::string strName,double e,double t, double div, double cycles)
{
	std::string strRet="------- \t "+strName+"\t -------\r\n";
	strRet+="Cycles:			"+dts(cycles)+"\r\n";
	strRet+="Runtime:			"+dts(t)+"\r\n";
	strRet+="Runtime per Cycle:	"+dts(t*div)+"\r\n";
	strRet+="Result:			"+dts(e)+"\r\n";
	return strRet;
};

std::string _print_bench_result(double er1, double er2, double t1, double t2,double loops_div, double cycles,double ts)
{

	std::string strRet;
	strRet+="Cycles:			"+dts(cycles)+"\r\n";
	strRet+="Runtime at all:	"+dts(ts)+"\r\n";
	strRet+="Runtime per Cycle: "+dts(ts*loops_div)+"\r\n\r\n";
	strRet+=_mark("exp()",er1,t1,loops_div,cycles)+"\r\n";
	strRet+=_mark("_expf()",er2,t2,loops_div,cycles)+"\r\n\r\n";
	strRet+="-----------------------------------------------------\r\n";
	strRet+="Absolute Timedifference: "+dts(fabs(t1-t2))+"\r\n";
	return strRet;
};

int _tmain(int argc, _TCHAR* argv[])
	{
_stde::_cpu_ticks_struct cpu,cpu1; //winapi QueryPerformanceCounter
cpu.GetMilliseconds();

static double ergebniss1=0;
static double ergebniss2=0;
static double x= 0.155555555555f;

static const unsigned int end=1000000;
static const unsigned int end2=1000;
static const double cycles=(((double)end)*((double) end2));
static const double divi=1.0f/cycles;
unsigned int i=0;
unsigned int j=0;
double time_exp=0.0f;
double time_expf_test=0.0f;
double diff=0;
static double adder=50.0f/end;
cpu_1.GetMilliseconds();
while(j<end)
{
	adder=adder*(-1.0f);
	x=x*(-1.0f);
	x=x+adder;

	i=0;
	cpu.GetMilliseconds();
	while(i<end2)
	{
			ergebniss1=exp(x);
		i++;
	}
	time_exp+=cpu.GetMilliseconds();

	i=0;
	cpu.GetMilliseconds();
	while(i<end2)
	{
			ergebniss2=_expf(x);
		i++;
	};
	time_expf_test+=cpu.GetMilliseconds();
int n=0;
double blub=frexp(ergebniss2,&n);
diff+= (ergebniss2-ergebniss1)/pow(2.0f,n);

j++;
};
diff=diff/cycles;
std::cout<<_print_bench_result(ergebniss1,ergebniss2,time_exp,time_expf_test,divi,cycles,cpu_1.GetMilliseconds());
std::cout<<"Arithmated middle mantissa distance between exp() and _expf():\t"<<diff<<"\r\nPlease press enter....";
std::cin>>j;

return 0;
};

sooo und nun mein hilight

inline double __fastcall _expf(double y)
{

	double debug_temp=0.0f;
__int64 blub=1;
int a1=0;

	__asm{

		movsd	xmm1, y				// y to xmm1
		movsd   xmm0, l2de		// log2(e) at xmm0
		//multiply it
		mulsd	xmm0,xmm1			//build the exponent +-y*log2(e)
		//convert
		cvttsd2si eax,xmm0
		cvtsi2sd  xmm2,eax

//movq blub,xmm1
//movsd debug_temp,xmm1
//movq blub,xmm0	
//movsd debug_temp,xmm0
//movq blub,xmm2
//movsd debug_temp,xmm2
		//now we splitt into frachigh(xmm1) and frac low part(xmm0)
		subsd	xmm0,xmm2
		//now we take our new polynom

		movsd xmm3,_pb6
		mulsd xmm3,xmm0

		addsd xmm3,_pb5
		mulsd xmm3,xmm0

		addsd xmm3,_pb4
		mulsd xmm3,xmm0

		addsd xmm3,_pb3
		mulsd xmm3,xmm0

		addsd xmm3,_pb2
		mulsd xmm3,xmm0

		addsd xmm3,_pb1
		mulsd xmm3,xmm0

		addsd xmm3,_pb0

//##############################################################
//xmm3,xmm2,xmm0,eax are reserved
//here we convert the frac high part to the exponentialform
		//movsd		xmm1,SignPosd

		//store the sign
		movsd		xmm1,xmm2

		psrlq		xmm1,63 //shift sign to right
		movq		blub,xmm1
		mov			ebx,dword ptr[blub]
		//test		ebx,ebx
		//jnz					

		//now we extract the exponent of xmm3
		movsd		xmm2,ExponentPosd
		andpd		xmm2,xmm3
		psrlq		xmm2,52
		movsd		blub,xmm2
		mov			ecx,dword ptr[blub]
		sub			ecx,0x3fe
	//	int 3
		//proof if sgn was set
		test		ebx,ebx
		jz			SHORT _no_sgn
		jmp			SHORT _sgn

_no_sgn:
		add		ecx,eax	//just add the exponents
		jmp SHORT _fin_sum_exponents
_sgn:
		sub		ecx,eax //just sub the exponents
_fin_sum_exponents:

		add ecx,0x3fe //we have to add the zeropoint again
		and	ecx,0x7ff //we have to make sure that only 7ff can be written

		//move back
		mov dword ptr[blub],ecx

		movsd	xmm2,invExponentPosd
		movsd	xmm1,blub
		andpd	xmm2,xmm3
		movsd   blub,xmm2
//int 3
		psllq	xmm1,52 //shift back left
		movsd   blub,xmm1
//int 3
		orpd	xmm2,xmm1 //and set the exponent

//##############################################################

		movsd y,xmm2	

	}

return y;
};

Also wenn ihr noch Optimierungspotenzial seht, bitte uuuuunbedingt bescheid sagen....
----------------------------------------------------------------------------
Edit:
hier noch ein paar präzise konstanten:

static const __int64 l2de	=0x3FF71547652B82FE; //1.4426950408889634
static const __int64 _pb0	=0x3FF0000000000000;//1.000000f
static const __int64 _pb1	=0x3FE62E429E0A41A2;//0.693147;
static const __int64 _pb2	=0x3FCEBFF47735C183;//0.240233;
static const __int64 _pb3	=0x3FAC6653F1EF9C05;//0.0554682f;
static const __int64 _pb4	=0x3F83E27EE5B92CDD;//0.00970935
static const __int64 _pb5	=0x3F53EBF0289C8156;//0.00121592
static const __int64 _pb6	=0x3F2DB52DDAEAD8E3;//0.000226652
static const __int64 _fhDiv	=0x10000000000000;// 2^52
/*1. + 0.693147 x + 0.240233 x^2 + 0.0554682 x^3 + 0.00970935 x^4 + 
 0.00121592 x^5 + 0.000226652 x^6*/

Und die BitMasken:

static const unsigned __int64 SignPosd			=0x8000000000000000;
static const unsigned __int64 ExponentPosd		=0x7ff0000000000000;
static const unsigned __int64 invExponentPosd	=0x800fffffffffffff;
static const unsigned __int64 MantissaPosd		=0x000FFFFFFFFFFFFF;

hi,
das ist gut! - habe es noch etwas verkürzt (masm-syntax). Könntest es ja mal mit deiner Version vergleichen (und hier posten ;-)). Wäre auch toll, wenn du mal die Formel für die Polynom-Koeffizienten zeigst.

; return through xmm0
expf proc x:REAL8

.const
	align 16
	l2de	dq 3FF71547652B82FEh	; 1.4426950408889634 
	_pb6	dq 3F2DB52DDAEAD8E3h	; 0.000226652 
	_pb5	dq 3F53EBF0289C8156h	; 0.00121592 
	_pb4	dq 3F83E27EE5B92CDDh	; 0.00970935
	_pb3	dq 3FAC6653F1EF9C05h	; 0.0554682f
	_pb2	dq 3FCEBFF47735C183h	; 0.240233
	_pb1	dq 3FE62E429E0A41A2h	; 0.693147
	_pb0	dq 3FF0000000000000h	; 1.000000f 
.code

	movsd xmm1,l2de        ; log2(e) at xmm0 
	mulsd xmm1,x           ; build the exponent +-y*log2(e) 

	cvttsd2si eax,xmm1 
	cvtsi2sd  xmm0,eax 

	lea eax,[eax*8]		;
	lea eax,[eax+eax]	; shl eax,4

	subsd xmm1,xmm0 

	movsd xmm0,_pb6 
	mulsd xmm0,xmm1 

	addsd xmm0,_pb5 
	mulsd xmm0,xmm1 

	addsd xmm0,_pb4 
	mulsd xmm0,xmm1 

	addsd xmm0,_pb3 
	mulsd xmm0,xmm1 

	addsd xmm0,_pb2 
	mulsd xmm0,xmm1 

	addsd xmm0,_pb1 
	mulsd xmm0,xmm1 

	addsd xmm0,_pb0 

	pextrw ecx,xmm0,3
	mov edx,ecx
	and edx,0800fh
	jns @1
	neg eax	
@1: 
	and ecx,07ff0h
	lea ecx,[ecx+eax]
	and ecx,07ff0h
	lea ecx,[ecx+edx]
	pinsrw xmm0,ecx,3
	; xmm0 holds result

	ret

expf endp

grüße, masm

zeusosc

Jo ich habe die verkürzte fassung noch nicht ausprobiert (mache ich aber heute noch)...

Zu den Koeffizienten,..
Das Polynom ist wie oben beschrieben die entwicklung von

\sqrt[\hat{x}]{2}=2^{\frac{1}{\hat{x}}}=2^{x}\forall x\in \left[0,..,1\right]\subset\mathbb{R}

Diese erhält man entweder durch eine Taylorapproximation um den Entwicklungspunkt

x_0=0$$ oder durch einen Polynomialen fit in Origin oder Mathematica für die genannte Funktion.... Um den Punkt 0.5 hat man zu große abstände an den Rändern 1,2 von der funktion,... greetz \-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\-\- Edit: ich hatte schon über eine lookup table nachgedacht, aber 2^52 werte reinklatschen,... dann noch die 64 bit addressierung...... und die ganze rumrechnerei erstmal :D, Da müsste ich n makro in Mathematica schreiben das mir die werte exakt umrechnet...(ok, letztes bit ungenauigkeit ist ja nach ieee 7x erlaubt....)

zeusosc

geilomat masm,...

deine modifikation ist ja mal richtig performant

Cycles:                 1e+009
Runtime at all: 70561.2
Runtime per Cycle: 7.05612e-005

-------          _expf_masm()   -------
Cycles:                 1e+009
Runtime:                        30499.4
Runtime per Cycle:      3.04994e-005
Result:                 6.05732e+021

-------          _expf()         -------
Cycles:                 1e+009
Runtime:                        37343.9
Runtime per Cycle:      3.73439e-005
Result:                 6.05732e+021

-----------------------------------------------------
Absolute Timedifference: 6844.58
Arithmated middle mantissa distance between _expf_masm() and _expf():  0.000355567
Please press enter....

leider weiß ich nicht woher die differenz der mantisse kommen soll, denn vorher habe ich drei zufallszahlen überprüft und die waren identisch bis zur letzten stelle....

Ich habe noch eine idee wie man evtl. eine höhere Präzision und geringere Laufzeit rausholt,... da würde die berechnung des fraclow von 13 auf 7 anweisungen schrumpfen...

greetz n thx
---------------------------------------------------------------------------
Edit:

Also ich habe den maximalen Absoluten fehler der 2^fraclow komponente durch
neue koeffizienten von rund $$|1.2E-7|$$
auf gerundet $$|5.5463E-9|$$ reduzieren können...

p(x)=1. + 0.693147 x + 0.240231 x^2 + 0.0554788 x^3 + 0.00968722 x^4 +
0.00123681 x^5 + 0.000219337 x^6,..

Mit $$2^x-p(x)$$ kann man die differenzfunktion plotten die die abweichung zeigt...

nachtfeuer

entschuldigung, ich habe da nur ein paar Verständnis Fragen:

wie war nochmal der eigentliche theoretisch/praktische Hintergrund?
welche Funktion, bzw. welcher Algo soll genau beschleunigt werden? bzw. welcher Algo ist wo genau zu langsam? (nicht der mathematische, der ist sowieso immer zu langsam , der in welchem asm-code).

Also z.B. ist der (zu langsame) Ausgangspunkt ein asm-Programm wie:

;standardrundungsverhalten der Fpu
fld exponent
fldl2e
fmulp
fld st(0)
frndint                 ;falls nötig, Control-Word anpassen
fsub st(1), st(0)
fxch
f2xm1
fld1
faddp st(1), st(0)
fscale
fstp st(1)
;Ergebnis in st(0)

?

zeusosc

Hi nachtfeuer,

also wenn nicht mit der option /arch:SSE2 /arch:SSE unter VS2008 kompiliert wird,
wird die FPU für die berechnung der exponentialfunktion (math.h) genutzt. Wird hingegen mit dieser Compilerflag kompiliert, wird die Kalkulation der exponentialfunktion mittels SSE vorgenommen, was eine beschleunigung bei rund 1e9 durchgängen von rund 7sec bedeutet.

Da ich die berechnung nun etwas weiter beschleunigen wollte , habe ich mir überlegt wie die berechnung sein müsste und versucht diese per hand zu optimieren.
(Danke an masm, der nochmal aus meinen source riiiichtig performance rausgeholt hat )

Nun, die methode wie die exponentialfunktion berechnet wird habe ich gepostet, wie groß die abweichung der fraclow komponente ist habe ich gepostet.

Nun habe ich eine funktion die bei 1e9 durchgängen gegenüber der fpu variante rund 17 sec spart,... und das bei einer genauigkeit von rund 8 stellen der Mantisse...

greetz
--------------------------------
Edit:
Benchmark @ Intel Core2Duo P8400 @ 2.66Ghz -VS2008 Pro-DebugMode