K
Nun, meine Ergebnisse moechte ich euch nicht vorenthalten. Als Vergleich dient die einfachste Implementierung, die ich mir vorstellen kann. Es ist daher nicht ganz fair. Es wird fuer Tests immer angenommen, dass das Positionsarray immer genug Platz bietet.
long line_thresh(unsigned char* line, unsigned long width, unsigned char th, unsigned short* pos, unsigned long size)
{
bool above = true;
unsigned long count = 0;
for(unsigned long i=0; i<width; ++i)
{
bool tmp = line[i]>=th;
if (above != tmp)
{
if (count<size) pos[count] = i;
above = tmp;
++count;
}
}
return count;
}
Fuer SSE bedarf es einiger Vorbereitung, wie das Initialisieren einer Lookup-Tabelle:
__m128i above_lut[256];
unsigned char above_count[256];
unsigned int calc_entry(uint8_t b, bool above, unsigned short pos[8], bool* ab)
{
std::bitset <8> bs(b);
unsigned int count = 0;
for(unsigned int i=0; i<8; ++i)
{
bool tmp = bs.test(i);
if (above != tmp)
{
pos[count] = i;
++count;
above = tmp;
}
}
*ab = above;
return count;
}
void init_above(__m128i above_lut[256], unsigned char above_count[256])
{
bool dummy;
for(unsigned int i=0; i<256; ++i)
{
above_count[i] = calc_entry(i, true, (unsigned short*)&above_lut[i], &dummy);
}
}
Und schliesslich mittels SSE:
long line_thresh_sse(unsigned char* pbytes, unsigned int length, unsigned char th, unsigned short* pos, unsigned int size)
{
// check alignment
unsigned int a = 16-((unsigned int)pbytes)%16;
bool above = true;
unsigned int count = 0;
unsigned short num = 0;
while((((unsigned int)pbytes)%16) && num<length)
{
bool tmp = (*pbytes)>=th;
if (above != tmp)
{
if (count<size) pos[count] = num;
above = tmp;
++count;
}
++pbytes;
++num;
}
length -= num;
__m128i idx = _mm_set1_epi16(num);
__m128i t = _mm_set1_epi8(th);
__m128i eight = _mm_set1_epi16(8);
__m128i p;
for(unsigned int i=0; i<length-16; i+=16)
{
__m128i v = _mm_load_si128((__m128i*)(pbytes));
v = _mm_min_epu8(v, t);
v = _mm_cmpeq_epi8(v, t);
uint16_t res = _mm_movemask_epi8(v);
unsigned int lut_idx = (above) ? (res&0x00FF) : (res&0x00FF)^0xFF; // symmetry of lookup table
if (lut_idx != 255) // same as above_count[lut_idx] != 0
{
p = _mm_load_si128(&above_lut[lut_idx]);
p = _mm_add_epi16(p, idx);
_mm_storeu_si128((__m128i*)(pos+count), p);
count += above_count[lut_idx];
}
above = (res&0x0080);
idx = _mm_add_epi16(idx, eight);
lut_idx = (above) ? (res>>8) : (res>>8)^0xFF;
if (lut_idx != 255)
{
p = _mm_load_si128(&above_lut[lut_idx]);
p = _mm_add_epi16(p, idx);
_mm_storeu_si128((__m128i*)(pos+count), p);
count += above_count[lut_idx];
}
above = (res&0x8000);
idx = _mm_add_epi16(idx, eight);
pbytes += 16;
}
// do something for trailing bytes
return count;
}
Zeitmessungen ergeben, dass die simple Implementierung etwa 3 bis 5 mal mehr Zeit benoetigt. Nicht gerade der Brueller. Leider ist SSE recht unflexibel beispielsweise bei Shifts zu benutzen, so dass ein SSE-Register schlecht als Positionsakkumulator dienen kann.