Algo mit Assembler noch weiter optimierbar?

Patrick

Moin Jungs,

ich berechne über eine Schleife Alphablending pro Pixel, doch bei 640x480 Pixeln braucht dieser "Algo" fast 60 Millisekunden! Das ist etwas kräftig, aber bekomme es nicht mehr weiter mit C++ optimiert, auch der Compiler hat sein Bestes gegeben.

Auch mit meinen Inline Assembler-Kenntnissen (nur Grundlagen, sowie etwas ISSE1[aber hier nichts tiefgründigeres]) habe ich es nicht besser optimieren können.

Daher die Frage an die jenigen die von inline Assembler etwas mehr Peil haben als ich: Würde sich folgender Code mit einer SIMD-Technik al á ISSE1/2/3, MMX oder 3DNow! erheblich (10-20 Millisekunden) beschleunigen lassen?

Aber hier erstmal Code:

// Bei Colorkey müssen die einzelnen Pixel überprüft werden
    if (colorkey)
    {
            // Durch die Farbdaten laufen (BGR)
        for (unsigned long i=0; i<tile_w*tile_h*3; i += 3)
        {
                // Testen ob Pixel die Farbe für die Transparenz hat, wenn nicht neue Farbe errechnen.
            if ((data_source[i  ] != static_cast<unsigned char>(color >>  0)) || 
                (data_source[i+1] != static_cast<unsigned char>(color >>  8)) || 
                (data_source[i+2] != static_cast<unsigned char>(color >> 16)))
            {
                    // Farbe errechnen
                data_dest[i  ] += ((data_source[i  ] - data_dest[i  ]) * alpha + 255) >> 8;
                data_dest[i+1] += ((data_source[i+1] - data_dest[i+1]) * alpha + 255) >> 8;
                data_dest[i+2] += ((data_source[i+2] - data_dest[i+2]) * alpha + 255) >> 8;
            }
        }  
    }
        // wenn kein Colorkey vorhanden ist, einfach durchrasseln.
    else
    {
            // Durch die Farbdaten laufen (BGR)
        for (unsigned long i=0; i<tile_w*tile_h*3; i += 3)
        {
                // Farbe errechnen
            data_dest[i  ] += ((data_source[i  ] - data_dest[i  ]) * alpha + 255) >> 8;
            data_dest[i+1] += ((data_source[i+1] - data_dest[i+1]) * alpha + 255) >> 8;
            data_dest[i+2] += ((data_source[i+2] - data_dest[i+2]) * alpha + 255) >> 8;
        }  
    }

Werte:
colorkey: ist ein bool der aussagt ob auf die transparenet Farbe getestet werden soll oder nicht.
tile_w und tile_h: ist ein unsigned long und sind die Dimensionen der Farbdaten.
data_source und data_dest: beides unsigned char*. Jedes enthält für eine Farbe 3 Farbkanäle, also BGR.
color: ist ein unsigned long, im 0xAARRGGBB Format für die transparente Farbe.

Aber zurück zur Frage: kann man da noch was raushauen? Wenn ja: Wie und hat jemand weiterführende Infos dazu?

Danke im voraus!
- Patrick

Hallo,

if ((data_source[i  ] != static_cast<unsigned char>(color >>  0)) || 
                (data_source[i+1] != static_cast<unsigned char>(color >>  8)) || 
                (data_source[i+2] != static_cast<unsigned char>(color >> 16)))

hast du vorher mal versucht, jedem Pixel 4 Byte zu spendieren und dann DWORDs miteinander zu vergleichen?

Patrick

SeppSchrot
Stimmt!

Na ja, mir geht es mehr um die Errechnung der Farbe, denn diese 60 Millisekunden sind nicht bei Colorkey sondern bei Grafiken ohne Colorkey (Klar mit Colorkey gehts richtig in die tiefe). Die Farberrechnung braucht leider sehr lange

camper

die tatsache, dass data_source[i]-data_dest[i] mal positiv, mal negativ sein kann, macht die sache unangenehm. ich hab mal zum spass die untere schleife mittels mmx und sse2 implementiert. auf meinem athlon64 ist sse2 nur minimal schneller als mmx, auf einem intel prozessor sieht das möglicherweise anders aus. allerdings habe ich mir auch keine besondere mühe bzgl. der instruktionsreihenfolge u.ä. gegeben, da kann man evtl. noch ein bisschen herausholen.

#include <iostream>
#include <cstddef>
#include <ctime>
#include <cmath>

using namespace std;

typedef unsigned char uchar;
typedef unsigned __int64 u64;

u64 rdtsc()
{
	__asm rdtsc
}

void f1(uchar* dest, const uchar* src, size_t width, size_t height, uchar alpha)
{
	for ( size_t i = 0, end = width * height * 3; i < end; ++i )
	{
		dest[ i ] = ( dest[ i ] * 256 + ( src[ i ] - dest[ i ] ) * alpha + 255 ) / 256;
	}
}

void f2(uchar* dest, const uchar* src, size_t width, size_t height, uchar alpha)
{
	__asm
	{
		movzx		eax, alpha
		movd		mm0, eax
		pshufw		mm0, mm0, 0
		psubw		mm1, mm1
		mov			eax, 255
		movd		mm2, eax
		pshufw		mm2, mm2, 0
		mov			eax, width
		imul		eax, height
		lea			eax, [ eax + eax * 2 ]
		shr			eax, 3
		mov			ecx, dest
		mov			edx, src
		lea			ecx, [ ecx + eax * 8 ]
		lea			edx, [ edx + eax * 8 ]
		neg			eax
loop_:
		movq		mm4, [ ecx + eax * 8 ]
		movq		mm5, mm4
		movq		mm6, [ edx + eax * 8 ]
		movq		mm7, mm6
		inc			eax
		punpcklbw	mm4, mm1
		punpckhbw	mm5, mm1
		punpcklbw	mm6, mm1
		punpckhbw	mm7, mm1
		psubw		mm6, mm4
		psubw		mm7, mm5
		psllw		mm4, 8
		psllw		mm5, 8
		pmullw		mm6, mm0
		pmullw		mm7, mm0
		paddw		mm6, mm2
		paddw		mm7, mm2
		paddw		mm6, mm4
		paddw		mm7, mm5
		psrlw		mm6, 8
		psrlw		mm7, 8
		packuswb	mm6, mm7
		movq		[ ecx + eax * 8 - 8 ], mm6
		jnz			loop_
		emms
	}
	for ( size_t i = width * height * 3 / 8 * 8, end = width * height * 3; i < end; ++i )
	{
		dest[ i ] = ( dest[ i ] * 256 + ( src[ i ] - dest[ i ] ) * alpha + 255 ) / 256;
	}
}
void f3(uchar* dest, const uchar* src, size_t width, size_t height, uchar alpha)
{
	__asm
	{
		movzx		eax, alpha
		movd		xmm0, eax
		pshuflw		xmm0, xmm0, 0
		movlhps		xmm0, xmm0
		psubw		xmm1, xmm1
		mov			eax, 255
		movd		xmm2, eax
		pshuflw		xmm2, xmm2, 0
		movlhps		xmm2, xmm2
		mov			eax, width
		imul		eax, height
		lea			eax, [ eax + eax * 2 ]
		and			eax, 0xfffffff0
		mov			ecx, dest
		mov			edx, src
		lea			ecx, [ ecx + eax ]
		lea			edx, [ edx + eax ]
		neg			eax
loop_:
		movdqa		xmm4, [ ecx + eax ]
		movdqa		xmm5, xmm4
		movdqa		xmm6, [ edx + eax ]
		movdqa		xmm7, xmm6
		add			eax, 16
		punpcklbw	xmm4, xmm1
		punpckhbw	xmm5, xmm1
		punpcklbw	xmm6, xmm1
		punpckhbw	xmm7, xmm1
		psubw		xmm6, xmm4
		psubw		xmm7, xmm5
		psllw		xmm4, 8
		psllw		xmm5, 8
		pmullw		xmm6, xmm0
		pmullw		xmm7, xmm0
		paddw		xmm6, xmm2
		paddw		xmm7, xmm2
		paddw		xmm6, xmm4
		paddw		xmm7, xmm5
		psrlw		xmm6, 8
		psrlw		xmm7, 8
		packuswb	xmm6, xmm7
		movdqa		[ ecx + eax - 16 ], xmm6
		jnz			loop_
	}
	for ( size_t i = width * height * 3 / 16 * 16, end = width * height * 3; i < end; ++i )
	{
		dest[ i ] = ( dest[ i ] * 256 + ( src[ i ] - dest[ i ] ) * alpha + 255 ) / 256;
	}
}
uchar* dest1;
uchar* dest2;
uchar* dest3;
uchar* src;
int main()
{
	srand( time( NULL ) );
	const size_t width = 100;
	const size_t height = 100;
	const uchar alpha = 10;
	uchar* d1 = new uchar[ width * height * 3 + 15 ];
	uchar* d2 = new uchar[ width * height * 3 + 15 ];
	uchar* d3 = new uchar[ width * height * 3 + 15 ];
	uchar* s = new uchar[ width * height * 3 + 15 ];
	dest1 = (uchar*)(( (size_t)d1 + 15 ) & 0xfffffff0);
	dest2 = (uchar*)(( (size_t)d2 + 15 ) & 0xfffffff0);
	dest3 = (uchar*)(( (size_t)d3 + 15 ) & 0xfffffff0);
	src = (uchar*)(( (size_t)s + 15 ) & 0xfffffff0);
	for ( size_t i = 0, end = width * height * 3; i < end; ++i )
	{
		dest1[ i ] = dest2[ i ] = dest3[ i ] = rand() % 256;
		src[ i ] = rand() % 256;
	}
	cout << memcmp( dest1, dest2, width * height * 3 ) << endl;
	f1( dest1, src, width, height, alpha );
	f2( dest2, src, width, height, alpha );
	cout << memcmp( dest1, dest2, width * height * 3 ) << endl;
	f3( dest3, src, width, height, alpha );
	cout << memcmp( dest1, dest3, width * height * 3 ) << endl;
	for ( size_t i = 0; i < width * height * 3; ++i )
	{
		if ( dest1[i] != dest2[i] )
			cout << i << '\t' << int(dest1[i]) << '\t' << int(dest2[i]) << endl;
	}
	u64 min1 = ~u64(0);
	u64 min2 = ~u64(0);
	u64 min3 = ~u64(0);
	for ( int i = 0; i < 10000; ++i )
	{
		u64 t = rdtsc();
		f1( dest1, src, width, height, alpha );
		t = rdtsc() - t;
		if ( t < min1 )
		{
			min1 = t;
			i = 0;
		}
	}
	cout << min1 << endl;
	for ( int i = 0; i < 10000; ++i )
	{
		u64 t = rdtsc();
		f2( dest1, src, width, height, alpha );
		t = rdtsc() - t;
		if ( t < min2 )
		{
			min2 = t;
			i = 0;
		}
	}
	cout << min2 << endl;
	for ( int i = 0; i < 10000; ++i )
	{
		u64 t = rdtsc();
		f3( dest1, src, width, height, alpha );
		t = rdtsc() - t;
		if ( t < min3 )
		{
			min3 = t;
			i = 0;
		}
	}
	cout << min3 << endl;
	delete [] s;
	delete [] d3;
	delete [] d2;
	delete [] d1;
}