Gesetzte Bits im Byte zählen

Evil Azrael

jo, ging mit der kapitelübersicht recht schnell.

da könnte man ja doch auf die idee kommen,per #ifdef für x86er die version zu benutzen, auch wenn ich die noch nicht so 100% verstanden habe.

Mr. N

für den MSVC:

__inline unsigned bitcnt(unsigned v) { unsigned r; __asm {
    mov eax, [v]
    mov edx, eax
    shr eax, 1
    and eax, 055555555h
    sub edx, eax
    mov eax, edx
    shr edx, 2
    and eax, 033333333h
    and edx, 033333333h
    add eax, edx
    mov edx, eax
    shr eax, 4
    add eax, edx
    and eax, 00F0F0F0Fh
   imul eax, 001010101h
    shr eax, 24
    mov r,   eax
   }
   return r;
}

[ Dieser Beitrag wurde am 27.08.2002 um 15:06 Uhr von Mr. n editiert. ]

volkard

gebaut vom MSVC:

8:    u32 countBitsTrue(u32 x)
9:    {//http://www.df.lth.se/~john_e/gems/gem002d.html
00401000   mov         eax,ecx
00401002   shr         eax,1
00401004   and         eax,55555555h
00401009   sub         ecx,eax
10:       x = x - ((x >> 1) & 0x55555555);
11:       x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
0040100B   mov         eax,ecx
0040100D   and         ecx,33333333h
00401013   shr         eax,2
00401016   and         eax,33333333h
0040101B   add         eax,ecx
12:       x = (x + (x >> 4)) & 0x0F0F0F0F;
0040101D   mov         ecx,eax
0040101F   shr         ecx,4
00401022   add         ecx,eax
00401024   and         ecx,0F0F0F0Fh
13:       x *= 0x001010101;
0040102A   mov         eax,ecx
0040102C   shl         eax,8
0040102F   add         eax,ecx
00401031   shl         eax,8
00401034   add         eax,ecx
00401036   shl         eax,8
00401039   add         eax,ecx
14:       x >>= 24;
0040103B   shr         eax,18h
15:       return x;
16:   }
0040103E   ret

Irgendwie hat der Weichling sich nicht getraut, IMUL zu verwenden.

Mr. N

es stimmt zwar, dass man DirectP vorziehen sollte, aber so... *g*

volkard

Original erstellt von Mr. n:
es stimmt zwar, dass man DirectP vorziehen sollte, aber so... *g*

DirectP?

Mr. N

DirectPath : Chapter 4 / Select DirectPath over VectorPath Instructions; Reader S. 70; Textseite 50

und in welchem dokument?

Mr. N

welches dokument wohl? was ist der link oben wohl?

phreaking

Hi!
Ich habe mir dazu auch etwas überlegt, das ist wahrscheinlich aber nur schneller, wenn gar keine Bits gesetzt sind

mov eax, [var]  ; die zu testende Zahl
xor bh, bh  ; die Anzahl der gesetzten Bits
next:
bsf eax, bl  ; nach gesetztem Bit suchen
jz end       ; 
inc bl
inc bh       ; ein weiteres gesetztes Bit gefunden
shr bl       ; bereits getestete Bits rausschieben
jmp next
end:         ; eax=0, bh=Anz. der gesetzten Bits in var

Ich hoffe, der Code ist zumindest etwas effizienter, als jedes Bit einzeln zu testen...

so long, phreaking

Mr. N

vielleicht hast du ja erkannt, dass die anderen codes _keine_ schleife benötigen und sogar in _konstanter_ (wenn der prozessor so gut ist) Zeit ablaufen... bei "meinem" code tippe ich auf < 20 Instruktionen. das ist _gut_. nichtsdestotrotz natürlich gut, dass du dir gedanken gemacht hast...

volkard

Ich hab auch ne Schleife.

6:    u32 countBitsTrue(u32 data)
7:    {
00401000   xor         eax,eax
8:        u32 result=0;
9:        while(data)
00401002   test        ecx,ecx
00401004   je          countBitsTrue+0Eh (0040100e)
10:       {
11:           ++result;
12:           data&=data-1;
00401006   lea         edx,[ecx-1]
00401009   inc         eax
0040100A   and         ecx,edx
0040100C   jne         countBitsTrue+6 (00401006)
13:       }
14:       return result;
15:   };
0040100E   ret

Über die fantasievolle Verwendung von lea freue ich mich immer wieder.

Evil Azrael

hilfe, hier sind ja alle noch bekloppter als ich *g*

was man bei den funktionen aber nicht vergessen sollte ist, das ja auch der funktionsaufruf in anweisungen umgesetzt werden. der stack wird ja gesichert und so ....

aber das noch keiner eine GCC version gepostet hat... ihr enttäuscht mich

volkard

Original erstellt von Evil Azrael:
aber das noch keiner eine GCC version gepostet hat... ihr enttäuscht mich

Ich hab C++ gepostet. Jag das einfach mal durch den GCC und wir gucken uns seinen Code an.

Mr. N

@volkard: irgendwie kommt mir dein (c++-)code bekannt vor...

volkard

meinen bescheidenen Messungen nach braucht die Version ohne Schleife 19 Takte, die mit Schleife mindesten 32 Takte. Gemessen an Zahlen -1 bis -10000000. Als Inline-Funktion 18 und 31.
Uih. Ist ein Funktionaufruf jeute echt so billig? Oder hab ich nur Mist gemessen?

Evil Azrael

wie misst man sowas? ich hätte jetzt im buch nachgeguckt, und von hand die takte ausgerechnet.

kann es sein, dass der GCC inline assembler mist ist, oder bin ich nur so dumm?

Mr. N

ach ja:

#include <stdio.h>

unsigned char bitcnt(unsigned char c) {
    unsigned char r = 0, x = c;
    while (c) {
        ++r;
        c &= c - 1;
    }
    return r;
}

int main(int argc, char **argv)
{
    unsigned char cnt[256];

    for (int i = 0; i < 256; i++)
        cnt[i] = bitcnt((unsigned char) i);

    FILE *f;

    if (argc < 2) f = stdout;
    else f = fopen(argv[1], "w");

    fprintf(f, "const int bitcount[256] = {\n");
    for (i = 0; i < 256; ++i) {
        fprintf(f, "\t0x%X%s%s", cnt[i], i < 255 ? "," : "", (i & 7) == 7 ? "\n" : " ");
    }
    fprintf(f, "};\n#define BITCOUNT(uc) (bitcount[(unsigned char) uc])\n");
    return 0;
}

volkard

Der hier ist 1 Takt bei mir schneller. Aber ich geh davon aus, daß auf moderneren Prozessoren als meinem Celeron400 IMUL schneller ist.

17:   u32 countBitsTrue4(u32 x)
18:   {//http://www.df.lth.se/~john_e/gems/gem002d.html
00401000   mov         eax,ecx
00401002   shr         eax,1
00401004   and         eax,55555555h
00401009   sub         ecx,eax
19:       x = x - ((x >> 1) & 0x55555555);
20:       x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
0040100B   mov         eax,ecx
0040100D   shr         eax,2
00401010   and         ecx,33333333h
00401016   and         eax,33333333h
0040101B   add         eax,ecx
21:       x = (x + (x >> 4)) & 0x0F0F0F0F;
0040101D   mov         ecx,eax
0040101F   shr         ecx,4
00401022   add         ecx,eax
00401024   and         ecx,0F0F0F0Fh
22:       x = x + (x >> 8);
0040102A   mov         edx,ecx
0040102C   shr         edx,8
0040102F   add         ecx,edx
23:       x = x + (x >> 16);
00401031   mov         eax,ecx
00401033   shr         eax,10h
00401036   add         eax,ecx
24:       x = x & 0x3f;
00401038   and         eax,3Fh
25:       return x;
26:   }
0040103B   ret

gemessen hab ich unter der Annahme, daß ich von vielen Messungen den mit der kürzesten Laufzeit nehmen muß, weil ich sonst cache-misses und das alles zu bezahlen hätte.
Als Uhr nahm ich rdtsc.

#include <iostream>
#include <ctime>
using namespace std;

typedef unsigned int u32;

u32 countBitsTrue1(u32 x) //17 takte
{//http://www.df.lth.se/~john_e/gems/gem002d.html
    x = x - ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x *= 0x001010101;
    x >>= 24;
    return x;
}

u32 countBitsTrue4(u32 x)//16 takte
{//http://www.df.lth.se/~john_e/gems/gem002d.html
    x = x - ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x = x + (x >> 8);
    x = x + (x >> 16);
    x = x & 0x3f;
    return x;
}

u32 countBitsTrue2(u32 data) //30 Takte
{
    u32 result=0;
    while(data)
    {
        ++result;
        data&=data-1;
    }
    return result;
};

unsigned countBitsTrue3(unsigned v) //24 takte
{ 
    unsigned r; 
    __asm {
    mov eax, [v]
    mov edx, eax
    shr eax, 1
    and eax, 055555555h
    sub edx, eax
    mov eax, edx
    shr edx, 2
    and eax, 033333333h
    and edx, 033333333h
    add eax, edx
    mov edx, eax
    shr eax, 4
    add eax, edx
    and eax, 00F0F0F0Fh
   imul eax, 001010101h
    shr eax, 24
    mov r,   eax
   }
   return r;
}

u32 countBitsTrue5(u32 c) //30 takte
{
    u32 r = 0, x = c;
    while (x) {
        ++r;
        x &= x - 1;
    }
    return r;
}

u32 countBitsTrue6(u32 c) //98 takte
{
    static char table[]={0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
    u32 r = 0;
    while (c) 
    {
        r+=table[c%16];
        c/=16;
    }
    return r;
}

u32 tunix(u32 x)
{
    return x;
}

#pragma warning(disable:4035)
inline u32 rdtsc()
{
    __asm rdtsc;
}

int __cdecl main()
{
    u32 min=u32(-1);
    u32 r=0;
    for(int i=0;i<10000000;++i)
    {
        u32 start=rdtsc()+36;
        r+=countBitsTrue6(~i);
        u32 stop=rdtsc();
        if(stop-start<min)
        {
            min=stop-start;
            cout<<min<<endl;
        }
    }
    cout<<"fertig"<<endl;
    cout<<r<<endl;
    return 0;
}

[ Dieser Beitrag wurde am 27.08.2002 um 20:58 Uhr von volkard editiert. ]

-bg-

Also wenn es echt auf Speed ankommt würde ich sagen:
nicht Rechnen,
sondern Nachschaun!

d.h. wir richten uns zunächst eine große Tabeller ein. Ich würde raten mit 256 Einträgen (für jedes byte einen).

dann kann man die Ergebnisse über einfache lookups holen.
Zur Generierung der Tabellen sollte man wohl einen der oben genannte Algorythmen benutzen.

mady

Hi all ...

ich habe -bg-'s Vorschlag mal umgesetzt ...

#include <time.h>
#include <stdio.h>

typedef unsigned int u32;

u32 countBitsTrue4(u32 x)//16 takte
{
    x = x - ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x + (x >> 4)) & 0x0F0F0F0F;
    x = x + (x >> 8);
    x = x + (x >> 16);
    x = x & 0x3f;
    return x;
}

int tab[0xFFFF];

u32 count_bits(u32 x)
{
    x = tab[x & 0xFFFF] + tab[x>>16];

    return x;
}

#define TEST_PATTERN   0x12345678

int main(void)
{
    int i;
    clock_t before;
    double elapsed;

    for (i = 0; i < 0xFFFF; i++)
        tab[i] = countBitsTrue4(i);

    printf("x = %d\n", count_bits(TEST_PATTERN));

    before = clock();
    for (i = 0; i < 100000000; i++)
        count_bits(TEST_PATTERN);
        // countBitsTrue4(TEST_PATTERN);

    elapsed = clock() - before;
    printf("Funktion benötigte %.3lf Sekunden\n", elapsed/CLOCKS_PER_SEC);

    return 0;
}

Die Tabellen-Variante ist um einiges schneller ... dafür muss man halt zuerst die Tabelle initialisieren ...