C
Die Tatsache, dass data_source[i] - data_dest[i] mal positiv, mal negativ sein kann, macht die Sache unangenehm. Ich habe mal zum Spaß die untere Schleife mittels MMX und SSE2 implementiert. Auf meinem Athlon 64 ist SSE2 nur minimal schneller als MMX; auf einem Intel-Prozessor sieht das möglicherweise anders aus. Allerdings habe ich mir auch keine besondere Mühe bzgl. der Instruktionsreihenfolge u. Ä. gegeben, da kann man evtl. noch ein bisschen herausholen.
#include <cmath>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <iostream>
using namespace std;
// Byte type used for the source and destination buffers of the blend functions.
typedef unsigned char uchar;
// 64-bit unsigned integer used for time-stamp counter values.
// `__int64` is a Microsoft compiler extension.
typedef unsigned __int64 u64;
// Executes the x86 RDTSC instruction (MSVC inline-assembly syntax), which
// loads the processor's time-stamp counter into EDX:EAX.
// There is intentionally no `return` statement: the function relies on the
// value left in EDX:EAX by the instruction being taken as the u64 result.
// NOTE(review): this depends on the 32-bit MSVC return convention and will
// trigger a "no return value" warning - confirm for the compiler in use.
u64 rdtsc()
{
__asm rdtsc
}
// Scalar reference implementation of the blend.
//
// Processes width * height * 3 bytes in place. Every destination byte is
// replaced by
//     ( dest * 256 + ( src - dest ) * alpha + 255 ) / 256
// evaluated in int arithmetic, i.e. dest is moved towards src by alpha/256
// with a bias of 255 added before the division. alpha == 0 leaves dest
// unchanged.
//
// dest   - buffer that is read and overwritten
// src    - buffer that is only read
// width, height - their product times 3 is the number of bytes processed
// alpha  - blend weight, 0..255
void f1(uchar* dest, const uchar* src, size_t width, size_t height, uchar alpha)
{
    const size_t byteCount = width * height * 3;
    const int weight = alpha;

    for ( size_t pos = 0; pos != byteCount; ++pos )
    {
        const int before = dest[ pos ];
        // May be negative, which is why the arithmetic is done in int.
        const int delta = src[ pos ] - before;
        dest[ pos ] = static_cast<uchar>( ( before * 256 + delta * weight + 255 ) / 256 );
    }
}
// MMX implementation of the same blend as f1 (MSVC 32-bit inline assembly).
//
// Processes width * height * 3 bytes in place, computing for every byte
//     dest = ( dest * 256 + ( src - dest ) * alpha + 255 ) / 256
// Whole groups of 8 bytes are handled by the assembly loop; the remaining
// 0..7 bytes are handled by the scalar loop at the end.
//
// The vector code works in 16-bit lanes with wrap-around arithmetic. The
// exact value dest * ( 256 - alpha ) + src * alpha + 255 always lies in
// [255, 65535], so the modulo-65536 result equals the exact result.
//
// Note: pshufw is not part of the original MMX instruction set; it needs a
// CPU with the SSE (or AMD extended-MMX) integer extensions.
//
// dest   - buffer that is read and overwritten
// src    - buffer that is only read
// width, height - their product times 3 is the number of bytes processed
// alpha  - blend weight, 0..255
void f2(uchar* dest, const uchar* src, size_t width, size_t height, uchar alpha)
{
    const size_t total = width * height * 3;

    // The assembly loop tests its counter only at the bottom (do-while shape).
    // With zero whole groups the counter would start at 0, be incremented to 1
    // and never hit zero again in any reasonable time, reading and writing far
    // beyond the buffers. Skip the vector part entirely in that case.
    if ( total / 8 != 0 )
    {
        __asm
        {
            // mm0 = alpha replicated into all four 16-bit lanes
            movzx eax, alpha
            movd mm0, eax
            pshufw mm0, mm0, 0
            // mm1 = 0, used to widen bytes to words
            psubw mm1, mm1
            // mm2 = 255 replicated into all four 16-bit lanes (rounding bias)
            mov eax, 255
            movd mm2, eax
            pshufw mm2, mm2, 0
            // eax = number of whole 8-byte groups in width * height * 3
            mov eax, width
            imul eax, height
            lea eax, [ eax + eax * 2 ]
            shr eax, 3
            // ecx / edx point just past the vectorised part of dest / src;
            // eax then counts from -groups up to 0
            mov ecx, dest
            mov edx, src
            lea ecx, [ ecx + eax * 8 ]
            lea edx, [ edx + eax * 8 ]
            neg eax
        loop_:
            // mm4 = mm5 = 8 dest bytes, mm6 = mm7 = 8 src bytes
            movq mm4, [ ecx + eax * 8 ]
            movq mm5, mm4
            movq mm6, [ edx + eax * 8 ]
            movq mm7, mm6
            // Advance the counter now; the zero flag it sets is consumed by the
            // jnz at the bottom (the MMX instructions in between do not touch
            // the flags).
            inc eax
            // widen: low four bytes -> mm4 / mm6, high four bytes -> mm5 / mm7
            punpcklbw mm4, mm1
            punpckhbw mm5, mm1
            punpcklbw mm6, mm1
            punpckhbw mm7, mm1
            // src - dest
            psubw mm6, mm4
            psubw mm7, mm5
            // dest * 256
            psllw mm4, 8
            psllw mm5, 8
            // ( src - dest ) * alpha, low 16 bits
            pmullw mm6, mm0
            pmullw mm7, mm0
            // + 255
            paddw mm6, mm2
            paddw mm7, mm2
            // + dest * 256
            paddw mm6, mm4
            paddw mm7, mm5
            // / 256
            psrlw mm6, 8
            psrlw mm7, 8
            // narrow the eight words back to eight bytes
            packuswb mm6, mm7
            // eax was already incremented, hence the -8
            movq [ ecx + eax * 8 - 8 ], mm6
            jnz loop_
            // leave MMX state so the x87 FPU can be used again
            emms
        }
    }

    // Scalar tail: the bytes that do not fill a whole 8-byte group.
    for ( size_t i = total / 8 * 8; i < total; ++i )
    {
        dest[ i ] = ( dest[ i ] * 256 + ( src[ i ] - dest[ i ] ) * alpha + 255 ) / 256;
    }
}
// SSE2 implementation of the same blend as f1 (MSVC 32-bit inline assembly).
//
// Processes width * height * 3 bytes in place, computing for every byte
//     dest = ( dest * 256 + ( src - dest ) * alpha + 255 ) / 256
// Whole groups of 16 bytes are handled by the assembly loop; the remaining
// 0..15 bytes are handled by the scalar loop at the end.
//
// The vector code works in 16-bit lanes with wrap-around arithmetic. The
// exact value dest * ( 256 - alpha ) + src * alpha + 255 always lies in
// [255, 65535], so the modulo-65536 result equals the exact result.
//
// Precondition: whenever at least 16 bytes are processed, dest and src must
// be 16-byte aligned, because the loop uses movdqa (aligned load/store).
//
// dest   - buffer that is read and overwritten
// src    - buffer that is only read
// width, height - their product times 3 is the number of bytes processed
// alpha  - blend weight, 0..255
void f3(uchar* dest, const uchar* src, size_t width, size_t height, uchar alpha)
{
    const size_t total = width * height * 3;

    // The assembly loop tests its counter only at the bottom (do-while shape).
    // With zero whole groups the offset would start at 0, be advanced to 16
    // and never hit zero again in any reasonable time, reading and writing far
    // beyond the buffers. Skip the vector part entirely in that case.
    if ( total / 16 != 0 )
    {
        __asm
        {
            // xmm0 = alpha replicated into all eight 16-bit lanes:
            // pshuflw fills the low four lanes, movlhps copies them to the high half
            movzx eax, alpha
            movd xmm0, eax
            pshuflw xmm0, xmm0, 0
            movlhps xmm0, xmm0
            // xmm1 = 0, used to widen bytes to words
            psubw xmm1, xmm1
            // xmm2 = 255 replicated into all eight 16-bit lanes (rounding bias)
            mov eax, 255
            movd xmm2, eax
            pshuflw xmm2, xmm2, 0
            movlhps xmm2, xmm2
            // eax = width * height * 3 rounded down to a multiple of 16 bytes
            mov eax, width
            imul eax, height
            lea eax, [ eax + eax * 2 ]
            and eax, 0xfffffff0
            // ecx / edx point just past the vectorised part of dest / src;
            // eax then runs as a negative byte offset up to 0
            mov ecx, dest
            mov edx, src
            lea ecx, [ ecx + eax ]
            lea edx, [ edx + eax ]
            neg eax
        loop_:
            // xmm4 = xmm5 = 16 dest bytes, xmm6 = xmm7 = 16 src bytes
            movdqa xmm4, [ ecx + eax ]
            movdqa xmm5, xmm4
            movdqa xmm6, [ edx + eax ]
            movdqa xmm7, xmm6
            // Advance the offset now; the zero flag it sets is consumed by the
            // jnz at the bottom (the SSE2 instructions in between do not touch
            // the flags).
            add eax, 16
            // widen: low eight bytes -> xmm4 / xmm6, high eight bytes -> xmm5 / xmm7
            punpcklbw xmm4, xmm1
            punpckhbw xmm5, xmm1
            punpcklbw xmm6, xmm1
            punpckhbw xmm7, xmm1
            // src - dest
            psubw xmm6, xmm4
            psubw xmm7, xmm5
            // dest * 256
            psllw xmm4, 8
            psllw xmm5, 8
            // ( src - dest ) * alpha, low 16 bits
            pmullw xmm6, xmm0
            pmullw xmm7, xmm0
            // + 255
            paddw xmm6, xmm2
            paddw xmm7, xmm2
            // + dest * 256
            paddw xmm6, xmm4
            paddw xmm7, xmm5
            // / 256
            psrlw xmm6, 8
            psrlw xmm7, 8
            // narrow the sixteen words back to sixteen bytes
            packuswb xmm6, xmm7
            // eax was already advanced, hence the -16
            movdqa [ ecx + eax - 16 ], xmm6
            jnz loop_
        }
    }

    // Scalar tail: the bytes that do not fill a whole 16-byte group.
    for ( size_t i = total / 16 * 16; i < total; ++i )
    {
        dest[ i ] = ( dest[ i ] * 256 + ( src[ i ] - dest[ i ] ) * alpha + 255 ) / 256;
    }
}
// Working buffers for the test program. main() points each of these at a
// 16-byte-aligned address inside a separately allocated heap block; the
// blocks themselves are owned and freed by main().
// Destination buffers passed to f1, f2 and f3 respectively for the
// correctness comparison.
uchar* dest1;
uchar* dest2;
uchar* dest3;
// Source buffer shared by all three blend calls.
uchar* src;
int main()
{
srand( time( NULL ) );
const size_t width = 100;
const size_t height = 100;
const uchar alpha = 10;
uchar* d1 = new uchar[ width * height * 3 + 15 ];
uchar* d2 = new uchar[ width * height * 3 + 15 ];
uchar* d3 = new uchar[ width * height * 3 + 15 ];
uchar* s = new uchar[ width * height * 3 + 15 ];
dest1 = (uchar*)(( (size_t)d1 + 15 ) & 0xfffffff0);
dest2 = (uchar*)(( (size_t)d2 + 15 ) & 0xfffffff0);
dest3 = (uchar*)(( (size_t)d3 + 15 ) & 0xfffffff0);
src = (uchar*)(( (size_t)s + 15 ) & 0xfffffff0);
for ( size_t i = 0, end = width * height * 3; i < end; ++i )
{
dest1[ i ] = dest2[ i ] = dest3[ i ] = rand() % 256;
src[ i ] = rand() % 256;
}
cout << memcmp( dest1, dest2, width * height * 3 ) << endl;
f1( dest1, src, width, height, alpha );
f2( dest2, src, width, height, alpha );
cout << memcmp( dest1, dest2, width * height * 3 ) << endl;
f3( dest3, src, width, height, alpha );
cout << memcmp( dest1, dest3, width * height * 3 ) << endl;
for ( size_t i = 0; i < width * height * 3; ++i )
{
if ( dest1[i] != dest2[i] )
cout << i << '\t' << int(dest1[i]) << '\t' << int(dest2[i]) << endl;
}
u64 min1 = ~u64(0);
u64 min2 = ~u64(0);
u64 min3 = ~u64(0);
for ( int i = 0; i < 10000; ++i )
{
u64 t = rdtsc();
f1( dest1, src, width, height, alpha );
t = rdtsc() - t;
if ( t < min1 )
{
min1 = t;
i = 0;
}
}
cout << min1 << endl;
for ( int i = 0; i < 10000; ++i )
{
u64 t = rdtsc();
f2( dest1, src, width, height, alpha );
t = rdtsc() - t;
if ( t < min2 )
{
min2 = t;
i = 0;
}
}
cout << min2 << endl;
for ( int i = 0; i < 10000; ++i )
{
u64 t = rdtsc();
f3( dest1, src, width, height, alpha );
t = rdtsc() - t;
if ( t < min3 )
{
min3 = t;
i = 0;
}
}
cout << min3 << endl;
delete [] s;
delete [] d3;
delete [] d2;
delete [] d1;
}