Stream Benchmark

Ich wollte aus Übungszwecken mal den Stream-Benchmark nachprogrammieren und muss nun feststellen, dass mein Programm wesentlich höhere Bandbreiten berechnet als das Original.

Ich kann keinen allzu großen Unterschied mehr feststellen. Kann jemand sagen, was hier schief läuft:

#include <iostream>
#include <chrono>
#include <vector>
#include <algorithm>
#include <omp.h>

#ifndef STREAM_TYPE
#define STREAM_TYPE double
#endif

#ifndef ARRAY_SIZE
#define ARRAY_SIZE 100000000
#endif

#ifndef REP
#define REP 10
#endif

class Benchmark
{
  typedef std::chrono::high_resolution_clock time;

private:
  decltype(time::now()) start_time;
  decltype(time::now()) end;
  std::chrono::duration<double, std::milli> ms;
  unsigned int factor;
  std::string name;

public:
  Benchmark(std::string name, int factor) : name(name), factor(factor) {}

  std::vector<double> values;

  void start()
  {
    start_time = time::now();
  }

  void stop()
  {
    end = time::now();
    ms = end - start_time;
    values.push_back(ms.count());
  }

  void eval()
  {
    std::cout << "number of values: " << values.size() << std::endl;
    double min = *std::min_element(values.begin() + 1, values.end());
    double max = *std::max_element(values.begin() + 1, values.end());
    double avg = accumulate(values.begin() + 1, values.end(), 0.0) / values.size();
    std::cout << name << " "
              << (factor * ARRAY_SIZE * 8. / (1024 * 1024)) / (min / 1000) << " MiB/s"
              << std::endl;
    std::cout << "Min time: " << min << std::endl;
    std::cout << "Avg time: " << avg << std::endl;
    std::cout << "Max time: " << max << std::endl;
  }
};

int main()
{
  typedef std::chrono::high_resolution_clock time;
  std::chrono::duration<double, std::milli> ms;

  std::vector<double> values(REP);
  double min, max, avg;

  STREAM_TYPE *a = new STREAM_TYPE[ARRAY_SIZE];
  STREAM_TYPE *b = new STREAM_TYPE[ARRAY_SIZE];
  STREAM_TYPE *c = new STREAM_TYPE[ARRAY_SIZE];

  STREAM_TYPE scalar = 3.0;

  //////////////////////////////////////////////////
  // User information
  //////////////////////////////////////////////////

  /*
   * set up benchmarks
   */
  Benchmark copy("copy", 2), scale("scale", 2), add("add", 3), triad("triad", 3);
  /*
   * prints the number of threads
   */
#pragma omp parallel
  {
#pragma omp master
    {
      std::cout << "Number of threads: " << omp_get_num_threads() << std::endl;
    }
  }

  /*
   * prints the array size and estimated memory usage
   */
  std::cout << "Array size: " << ARRAY_SIZE << std::endl;
  std::cout << "Memory usage per array: " << (ARRAY_SIZE * 8. / (1024 * 1024)) << " MiB" << std::endl;
  std::cout << "Estimated total memory usage: " << (3 * ARRAY_SIZE * 8. / (1024 * 1024)) << " MiB" << std::endl;

  /*
   * copy Benchmark
   */
  for (int r = 0; r < REP; r++)
  {

    /*
     * copy benchmark
     */
    copy.start();
    #pragma omp parallel for
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
      a[i] = b[i];
    }
    copy.stop();

    /*
     * scale benchmark
     */
    scale.start();
    #pragma omp parallel for
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
      a[i] = scalar * b[i];
    }
    scale.stop();
    /*
     * add benchmark
     */
    add.start();
    #pragma omp parallel for
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
      a[i] = b[i] + c[i];
    }
    add.stop();
    /*
     * triad benchmark
     */
    triad.start();
    #pragma omp parallel for
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
      a[i] = b[i] + scalar * c[i];
    }
    triad.stop();
  }

  copy.eval();
  scale.eval();
  add.eval();
  triad.eval();

  delete a;
  delete b;
  delete c;

  return 0;
}

hustbaer

Hast du dir den erzeugten Code angesehen - nur zwecks sicherstellen dass von dem Code überhaupt 'was übrig bleibt?
Weil a[i] halt nirgends ausgewertet wird - im Prinzip kann der Compiler den Inhalt der ganzen Schleifen einfach verwerfen.

Ein einfacher Fix sollte sein die einzelnen Benchmark-Schleifen in je eine Funktion auszulagern, welche die Arrays als Parameter mitbekommt, und wo z.B. mit BOOST_NOINLINE verhindert wird dass sie inline erweitert (und dann wegoptimiert) wird.