C
Nochmals Code etwas aufgeräumt und Rechenfehler beseitigt. Durch die Berechnung von jeweils 4 Zellen, die übereinander liegen, werden im Vergleich zum naiven linearen Algorithmus nur 8 statt 10 Integeroperationen pro Zelle benötigt. Dieser Unterschied schlägt bei der Geschwindigkeit voll durch.
Ein Vergrößerung von stripe_size würde die Anzahl der nötigen Operation weiter veringern, allerdings ist der Registersatz mit nur 16 Registern zu klein um dann alle Zwischenergebnisse halten zu können, so dass sich keine Verbesserung der Laufzeit ergibt.
Benötigt bei mir mit einem einzigen Thread und generischem Code (nur SSE2, kein AVX) ca. 0.92ms für 2000x2000 Feld, bzw. ca. 0.8 Takte/Zelle. Ich nehme daher meine Behauptung zurück, der Algorithmus wäre durch den Speicherdurchsatz begrenzt, eine Verbesserung durch kompaktere Speicherung (bits oder nibbles) ist daher wahrscheinlich nicht zu erwarten. AVX bringt keine echte Verbesserung. Mit AVX2 sollte sich die Geschwindigkeit allerdings nahezu verdoppeln.
Ein zweiter Thread verbessert das Ergebnis bei mir nur um ca. 10-15%.
#include <algorithm>
#include <atomic>
#include <cassert>
#include <chrono>
#include <cstddef>
#include <exception>
#include <iostream>
#include <random>
#include <string>
#include <thread>
#include <utility>
#include <vector>
#include <type_traits>
#ifdef __AVX2__
#define block_size 32
#else
#define block_size 16
#endif
#define stripe_size 4
template <typename T> const T& as_const(T& x) noexcept { return x; }
struct gameoflife_state {
std::ptrdiff_t width, height, stride, stripes;
std::vector<char> data;
char* origin;
gameoflife_state(std::ptrdiff_t w, std::ptrdiff_t h, char v = 0) : width(w), height(h), stride((w+2+block_size-1)/block_size*block_size),
stripes((height+stripe_size-1)/stripe_size), data(internal_size(), v) {
assert((width == 0) == (height == 0));
void* p = &data[stride+1];
std::size_t size = data.size();
origin = static_cast<char*>(std::align( block_size, block_size, p, size ) );
}
gameoflife_state(const gameoflife_state&) = delete;
gameoflife_state& operator=(const gameoflife_state&) = delete;
std::size_t size() const noexcept { return width * height; }
std::size_t internal_size() const noexcept {
return stride * (stripes * stripe_size + 2) + block_size - 1;
}
char& operator()(std::ptrdiff_t x, std::ptrdiff_t y) noexcept { return const_cast<char&>(as_const(*this)(x, y)); }
const char& operator()(std::ptrdiff_t x, std::ptrdiff_t y) const noexcept { return origin[x + stride * y]; }
void swap(gameoflife_state& rhs) noexcept {
using std::swap;
swap(width, rhs.width);
swap(height, rhs.height);
swap(stride, rhs.stride);
swap(stripes, rhs.stripes);
swap(data, rhs.data);
swap(origin, rhs.origin);
}
void normalize() noexcept {
(*this)(-1, -1) = (*this)(width-1, height-1);
for (std::ptrdiff_t x = 0; x < width; ++x) {
(*this)(x, -1) = (*this)(x, height-1);
}
(*this)(width,-1) = (*this)(0, height-1);
for (std::ptrdiff_t y = 0; y < height; ++y) {
(*this)(-1, y) = (*this)(width-1, y);
(*this)(width, y) = (*this)(0, y);
}
(*this)(-1, height) = (*this)(width-1, 0);
for (std::ptrdiff_t x = 0; x < width; ++x) {
(*this)(x, height) = (*this)(x, 0);
}
(*this)(width, height) = (*this)(0, 0);
}
};
typedef char vec_type __attribute__((vector_size(block_size),aligned(1),__may_alias__));
vec_type load(const void* src, int offset) noexcept {
return *static_cast<const vec_type*>(__builtin_assume_aligned(src, block_size, offset));
}
char store(void* dst, vec_type v) noexcept {
*static_cast<vec_type*>(__builtin_assume_aligned(dst, block_size)) = -(v == 3);
return 0;
}
template <int... I, int... I2, int... I_2> // 0..stripe_size-1, 0..stripe_size+1, 0..stripe_size/2
void compute_cells(char* dst, const char* src, int stripes, int stride,
std::integer_sequence<int, I...>, std::integer_sequence<int, I2...>, std::integer_sequence<int, I_2...>) noexcept {
for (; stripes--;) {
for (auto blocks = stride / block_size; blocks--; ) {
vec_type row[] = { load(&src[(I2-1)*stride-1], block_size-1) + load(&src[(I2-1)*stride+1], 1) ... };
vec_type pair[] = { row[I_2*2+1] + row[I_2*2+2] ... };
vec_type sum[] = { pair[I/2] + row[I+(I%2?2:0)] + load(&src[(I-1)*stride], 0) + load(&src[(I+1)*stride], 0)... };
char dummy[] = { store(&dst[I*stride], sum[I] | load(&src[I*stride], 0))... };
(void)dummy;
src += block_size;
dst += block_size;
}
src += stride * (stripe_size - 1);
dst += stride * (stripe_size - 1);
}
}
void compute_cells(char* __restrict__ dst, const char* __restrict__ src, int stripes, int stride) noexcept {
compute_cells(dst, src, stripes, stride,
std::make_integer_sequence<int, stripe_size>{},
std::make_integer_sequence<int, stripe_size+2>{},
std::make_integer_sequence<int, stripe_size/2>{});
}
void compute_gen(gameoflife_state& current_gen, gameoflife_state& next_gen, std::size_t num_threads) {
assert(current_gen.width == next_gen.width);
assert(current_gen.height == next_gen.height);
auto stride = current_gen.stride;
std::vector<std::thread> threads;
threads.reserve(num_threads - 1);
char* src = current_gen.origin;
char* dst = next_gen.origin;
auto stripes = current_gen.stripes;
auto task_stripes = (stripes + num_threads - 1) / num_threads;
for (std::size_t i = 1; i < num_threads; ++i) {
threads.emplace_back([=] { compute_cells(dst, src, task_stripes, stride); });
src += stride * task_stripes * stripe_size;
dst += stride * task_stripes * stripe_size;
stripes -= task_stripes;
}
compute_cells(dst, src, stripes, stride);
for (auto& i : threads)
i.join();
next_gen.normalize();
}
void show(gameoflife_state& s) {
for ( std::ptrdiff_t y = 0; y < s.height; ++y ) {
for ( std::ptrdiff_t x = 0; x < s.width; ++x ) {
std::cout << (s(x,y)?'x':'.');
}
std::cout << '\n';
}
std::cout << '\n';
}
#if 0
#define SHOW(x) show(x)
#else
#define SHOW(x)
#endif
int main(int argc, char** argv) {
std::ptrdiff_t w = 512, h = 512, num_threads = std::max<std::ptrdiff_t>(std::thread::hardware_concurrency(), 1), gens = -1;
switch (argc) {
case 0:
case 1: break;
case 3:
case 4:
case 5:
try {
w = std::stol(argv[1]);
h = std::stol(argv[2]);
if ( argc >= 4 )
num_threads = std::stol(argv[3]);
if ( argc >= 5 )
gens = std::stol(argv[4]);
} catch (std::exception const&) {
std::cerr << "invalid arguments (must be integral)\n";
return -1;
}
break;
default: std::cerr << "invalid number of arguments\n"; return -1;
}
gameoflife_state current_gen{w, h};
gameoflife_state next_gen{w, h};
{
std::minstd_rand prng;
std::bernoulli_distribution dist{0.3};
for (std::ptrdiff_t y = 0; y < current_gen.height; ++y)
for (std::ptrdiff_t x = 0; x < current_gen.width; ++x)
current_gen(x, y) = dist(prng) ? 1 : 0;
current_gen.normalize();
}
SHOW(current_gen);
using clock_t = std::chrono::high_resolution_clock;
const auto start = clock_t::now();
if ( gens == -1 )
gens = 30000000000/current_gen.height/current_gen.width;
for (auto i = gens; i--; ) {
compute_gen(current_gen, next_gen, num_threads);
current_gen.swap(next_gen);
SHOW(current_gen);
}
const auto end = clock_t::now();
const std::chrono::duration<double, std::milli> elapsed = end - start;
std::cout << "average: " << elapsed.count() / gens << " ms per generation\n";
}