Funktion optimieren

Lennox

Bei all deinen Sortierfunktionen ist immer n eine globale Konstante. Wenn du normale Sortierfunktionen haben willst, würde ich das n auf jeden Fall noch als Parameter übergeben.

Eigentlich wollte ich nur etwas Inline-Assembler lernen... Hätte nicht gedacht, dass das Thema so Fahrt aufnimmt.

Lennox

@wob
So besser?

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

volatile size_t initial_seed = 123;

size_t total_hash = 23;

size_t rand1(size_t seed, size_t max)
{
    size_t a = 16807;
    return (a * seed) % max;
}

size_t my_rand(size_t max)
{
    size_t r = rand1(initial_seed, max);
    initial_seed = rand1(initial_seed, -1);
    return r;
}

void rfill(size_t n, size_t m, size_t a[m][n])
{
    for (size_t i = 0; i < m; i++)
    {
        for (size_t j = 0; j < n; j++)
        {
            a[i][j] = my_rand(51);
        }
    }
}

double average(size_t n, size_t a[])
{
    // The average should be around 25.
    double average = 0;
    for (size_t i = 0; i < n; i++)
    {
        average += (double)a[i] / (double)n;
    }
    return average;
}

// Fast, combining hash function with few collisions and h(a,b) often not equal to h(b,a)...
size_t my_hash(size_t a, size_t b)
{
    return a * 31 + b;
}

void add_to_total_hash(size_t n, size_t m, size_t a[m][n])
{
    for (size_t i = 0; i < m; i++)
    {
        for (size_t j = 0; j < n; j++)
        {
            total_hash = my_hash(total_hash, a[i][j]);
        }
    }
}

void print(size_t n, size_t m, size_t a[m][n])
{
    add_to_total_hash(n, m, a);
    for (size_t i = 0; i < n; i++)
    {
        printf("%zu ", a[0][i]);
    }
    printf("\na: %f\n...\n", average(n, a[0]));
    // ...
    for (size_t i = 0; i < n; i++)
    {
        printf("%zu ", a[m - 1][i]);
    }
    printf("\na: %f\nh: %zu\n\n", average(n, a[m - 1]), total_hash);
}

// Comparison function for sort_a
int compare(const void *a, const void *b)
{
    return (*(size_t *)a - *(size_t *)b);
}

// Quicksort?
void sort_a(size_t n, size_t arr[])
{
    qsort(arr, n, sizeof(size_t), compare);
}

// Bubble sort
void sort_b(size_t n, size_t arr[])
{
    size_t i = 0, j = 1, a, b;
    while (j < n)
    {
        a = arr[i];
        b = arr[j];
        if (a > b)
        {
            arr[i] = b;
            arr[j] = a;
            i = 0;
            j = 1;
            continue;
        }
        i++;
        j++;
    }
}

// Selection sort
void sort_c(size_t n, size_t arr[])
{
    for (size_t i = 0; i < n - 1; i++)
    {
        size_t a = i;
        size_t b = arr[i];
        for (size_t j = i + 1; j < n; j++)
        {
            if (arr[j] < b)
            {
                a = j;
                b = arr[j];
            }
        }
        arr[a] = arr[i];
        arr[i] = b;
    }
}

// Insertion sort
void sort_d(size_t n, size_t arr[])
{
    size_t i = 0, j = 1, k, l, a, b;
    while (j < n)
    {
        a = arr[i];
        b = arr[j];
        if (a > b)
        {
            k = i;
            l = j;
            do
            {
                arr[k] = b;
                arr[l] = a;
                if (k == 0)
                {
                    break;
                }
                k--;
                l--;
                a = arr[k];
                b = arr[l];
            } while (a > b);
        }
        i++;
        j++;
    }
}

#pragma GCC push_options
#pragma GCC optimize("O0")
void sort_optimized(size_t n, size_t arr[])
{
    asm volatile(
        "  movq  %%rdi, -40(%%rbp)    \n"
        "  movq  %%rsi, -48(%%rbp)    \n"
        "  movq  $0, -8(%%rbp)        \n"
        "  movq  $1, -16(%%rbp)       \n"
        "  jmp  3f                    \n"
        "1:                           \n"
        "  movq  -8(%%rbp), %%rax     \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -48(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -24(%%rbp)    \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -48(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -32(%%rbp)    \n"
        "  movq  -24(%%rbp), %%rax    \n"
        "  cmpq  %%rax, -32(%%rbp)    \n"
        "  jnb  2f                    \n"
        "  movq  -8(%%rbp), %%rax     \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -48(%%rbp), %%rax    \n"
        "  addq  %%rax, %%rdx         \n"
        "  movq  -32(%%rbp), %%rax    \n"
        "  movq  %%rax, (%%rdx)       \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -48(%%rbp), %%rax    \n"
        "  addq  %%rax, %%rdx         \n"
        "  movq  -24(%%rbp), %%rax    \n"
        "  movq  %%rax, (%%rdx)       \n"
        "  movq  $0, -8(%%rbp)        \n"
        "  movq  $1, -16(%%rbp)       \n"
        "  jmp  3f                    \n"
        "2:                           \n"
        "  addq  $1, -8(%%rbp)        \n"
        "  addq  $1, -16(%%rbp)       \n"
        "3:                           \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  cmpq  -40(%%rbp), %%rax    \n"
        "  jb  1b                     \n"
        : /* No outputs. */
        : /* No inputs. */
        : "memory");
}
#pragma GCC pop_options

double profile_sort_func(void (*f)(size_t, size_t *), size_t n, size_t m, size_t arr[m][n])
{
    rfill(n, m, arr);
    print(n, m, arr);
    clock_t c1 = clock();
    for (size_t i = 0; i < m; i++)
    {
        (*f)(n, arr[i]);
    }
    clock_t c2 = clock();
    print(n, m, arr);
    return (double)(c2 - c1) / CLOCKS_PER_SEC;
}

int main(int argc, char const *argv[])
{
    size_t n = 10;
    size_t m = 100000;
    size_t arr[m][n];
    double secs[5] = {
        profile_sort_func(sort_a, n, m, arr),
        profile_sort_func(sort_b, n, m, arr),
        profile_sort_func(sort_c, n, m, arr),
        profile_sort_func(sort_d, n, m, arr),
        profile_sort_func(sort_optimized, n, m, arr),
    };
    double max = 0;
    for (size_t i = 0; i < 5; i++)
    {
        if (secs[i] > max)
        {
            max = secs[i];
        }
    }
    printf("%fs %f%%\n%fs %f%%\n%fs %f%%\n%fs %f%%\n%fs %f%%\n",
           secs[0], secs[0] * 100 / max,
           secs[1], secs[1] * 100 / max,
           secs[2], secs[2] * 100 / max,
           secs[3], secs[3] * 100 / max,
           secs[4], secs[4] * 100 / max);
    return 0;
}

gcc -Wall main.c -o main
./main 
27 42 3 33 40 14 19 12 36 43 
a: 26.900000
...
24 36 36 26 49 36 47 1 48 40 
a: 34.300000
h: 5939366454565110528

3 12 14 19 27 33 36 40 42 43 
a: 26.900000
...
1 24 26 36 36 36 40 47 48 49 
a: 34.300000
h: 4429705970288659611

33 44 43 0 24 15 46 36 2 11 
a: 25.400000
...
28 20 39 12 2 25 18 43 16 50 
a: 25.300000
h: 6352222085557659981

0 2 11 15 24 33 36 43 44 46 
a: 25.400000
...
2 12 16 18 20 25 28 39 43 50 
a: 25.300000
h: 4505934733348363633

9 39 48 42 46 11 50 24 15 37 
a: 32.100000
...
1 23 45 46 29 44 24 3 7 40 
a: 26.200000
h: 4225079418598471750

9 11 15 24 37 39 42 46 48 50 
a: 32.100000
...
1 3 7 23 24 29 40 44 45 46 
a: 26.200000
h: 1630903755648978609

3 26 21 15 26 27 35 13 32 43 
a: 24.100000
...
45 8 6 21 47 31 28 6 45 24 
a: 26.100000
h: 4076123578238199228

3 13 15 21 26 26 27 32 35 43 
a: 24.100000
...
6 6 8 21 24 28 31 45 45 47 
a: 26.100000
h: 10126057256555243605

13 30 14 4 11 21 10 24 4 19 
a: 15.000000
...
32 12 45 33 31 9 41 32 21 46 
a: 30.200000
h: 14143251785556203417

4 4 10 11 13 14 19 21 24 30 
a: 15.000000
...
9 12 21 31 32 32 33 41 45 46 
a: 30.200000
h: 10117127439067026547

0.019759s 66.492798%
0.029716s 100.000000%
0.019451s 65.456320%
0.014100s 47.449186%
0.028555s 96.093014%

... Ich habe noch nicht angefangen, das Assembly zu optimieren.

Finnegan

@Lennox sagte in Funktion optimieren:

... Ich habe noch nicht angefangen, das Assembly zu optimieren.

Zu n = 10 muss man allerdings anmerken, dass es keine Neuigkeit ist, dass Bubblesort für sehr kleine Arrays im Allgemeinen ziemlich gut abschneidet. Gründe hierfür können u.a. sein:

kleinerer Overhead, weniger Code
weniger umständliche Pointer-Manipulationen, direkt in-place
weniger Branches und besser vorhersagbares Branching-Verhalten (Branch Prediction)
gute Cache-Lokalität kleinerer Arrays

Insertion Sort hat allerdings auch diese Vorteile und meines Wissens ist das der Algorithmus der Wahl für kleine Arrays.

Auch ist der Code natürlich nicht optimiert mit gcc -Wall main.c -o main. Meines Wissens ist -O0 Default. Aber das kann ja durchaus deine Absicht sein, wenn du allgemein über Optimierung lernen willst und eben nicht vorhast, den Compiler mit -O3 zu schlagen

Edit: Noch ein Tip: Gib doch auch mal den Namen der Algorithmen in deinem Output aus, auch wenn das für dich klar ist. Als Außenstehender finde ich es echt mühsam das korrekt zuzuordnen. Erst über den Array-Index, dann über den Funktionsnamen und dann haben die auch noch Namen wie sort_a oder sort_b und man erfährt eigentlich nur aus den Kommentaren, was da eigentlich passiert.

Lennox

Mit der O3-Option sollten die Ergebnisse ähnlich sein. Bubblesort ist sehr einfach zu implementieren, schneidet aber ca. um 1/3 schlechter ab als qsort. Insertionsort ist auch einfach zu implementieren, benötigt für kleine n aber ca. nur die Hälfte der Zeit von qsort. Mit den vergleichsbasierten Verfahren, die wiederum auf Vertausch, Selektion oder Einfügung (oder Teilung) basieren, haben wir damit alle wichtigen Verfahren einmal dem teilenden qsort-Verfahren gegenübergestellt.

Lennox

@Finnegan
Etwas aus dem Nähkästchen, dieses Thema sollte keine Überraschungen liefern... Vielmehr ging es mir darum, den gcc und Asm etwas besser zu verstehen... Sortierverfahren sind ja so ziemlich das erste, womit sich kluge Leute in der Informatik sehr ausgiebig beschäftigt hatten. Es würde einem Wunder gleichen, wenn ich dabei zu bahnbrechenden, neuen Erkenntnissen gelangt wäre... Aber immerhin konnte ich etwas (Neues) lernen.

Schlangenmensch

@Lennox Darum habe ich am Anfang gefragt, was Ziel der ganzen Aktion ist. Hättest du das da schon gesagt, hätte hier keiner was zum Thema Optimierungen gesagt. Aber seinerzeit hast du davon gesprochen dich mit Optimierungsverfahren auseinandersetzen zu wollen; Loop unrolling etc (etwas was man übrigens auch ohne Asm machen kann )

Lennox

@Schlangenmensch
Es ging mir um beides: Übersetzungs- und Optimierungstechniken in C und Asm... Außerdem kann es ja nicht schaden... wenn man das (also den Benchmark) dabei von Anfang an richtig macht.

Ich muss leider für eine so genannte "Mixed-Prüfung" lernen, angefangen bei Strom/Elektrik, dann weiter mit Komponenten der Elektrotechnik, dann weiter mit Modellierung und Diagrammen, und dann weiter mit Low-Level-Programmierung und Optimierung. Verstehe auch nicht... warum das so thematisch breit, anstatt tief gefächert ist.

Lennox

Jetzt habe ich ein wenig optimiert...

Das Ergebnis ist, dass bis ca. n=28 Insertion-Sort schneller ist als Quicksort für -O0, und bis ca. n=72 Insertion-Sort schneller ist als Quicksort für -O3.

Die Überraschung ist also, dass -O3 den Code langsamer anstatt schneller macht.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

volatile size_t initial_seed = 123;

size_t total_hash = 23;

size_t rand1(size_t seed, size_t max)
{
    size_t a = 16807;
    return (a * seed) % max;
}

size_t my_rand(size_t max)
{
    size_t r = rand1(initial_seed, max);
    initial_seed = rand1(initial_seed, -1);
    return r;
}

void rfill(size_t n, size_t m, size_t a[m][n])
{
    for (size_t i = 0; i < m; i++)
    {
        for (size_t j = 0; j < n; j++)
        {
            a[i][j] = my_rand(51);
        }
    }
}

double average(size_t n, size_t a[])
{
    // The average should be around 25.
    double average = 0;
    for (size_t i = 0; i < n; i++)
    {
        average += (double)a[i] / (double)n;
    }
    return average;
}

// Fast, combining hash function with few collisions and h(a,b) often not equal to h(b,a)...
size_t my_hash(size_t a, size_t b)
{
    return a * 31 + b;
}

void add_to_total_hash(size_t n, size_t m, size_t a[m][n])
{
    for (size_t i = 0; i < m; i++)
    {
        for (size_t j = 0; j < n; j++)
        {
            total_hash = my_hash(total_hash, a[i][j]);
        }
    }
}

void print(size_t n, size_t m, size_t a[m][n])
{
    add_to_total_hash(n, m, a);
    for (size_t i = 0; i < n; i++)
    {
        printf("%zu ", a[0][i]);
    }
    printf("\na: %f\n...\n", average(n, a[0]));
    // ...
    for (size_t i = 0; i < n; i++)
    {
        printf("%zu ", a[m - 1][i]);
    }
    printf("\na: %f\nh: %zu\n\n", average(n, a[m - 1]), total_hash);
}

// Comparison function for sort_a
int compare(const void *a, const void *b)
{
    return (*(size_t *)a - *(size_t *)b);
}

// Quicksort
void sort_a_quick(size_t n, size_t arr[])
{
    qsort(arr, n, sizeof(size_t), compare);
}

// Bubble sort
void sort_b_bubble(size_t n, size_t arr[])
{
    size_t i = 0, j = 1, a, b;
    while (j < n)
    {
        a = arr[i];
        b = arr[j];
        if (a > b)
        {
            arr[i] = b;
            arr[j] = a;
            i = 0;
            j = 1;
            continue;
        }
        i++;
        j++;
    }
}

// Selection sort
void sort_c_selection(size_t n, size_t arr[])
{
    for (size_t i = 0; i < n - 1; i++)
    {
        size_t a = i;
        size_t b = arr[i];
        for (size_t j = i + 1; j < n; j++)
        {
            if (arr[j] < b)
            {
                a = j;
                b = arr[j];
            }
        }
        arr[a] = arr[i];
        arr[i] = b;
    }
}

// Insertion sort
void sort_d_insertion(size_t n, size_t arr[])
{
    size_t i = 0, j = 1, k, l, a, b;
    while (j < n)
    {
        a = arr[i];
        b = arr[j];
        if (a > b)
        {
            k = i;
            l = j;
            do
            {
                arr[k] = b;
                arr[l] = a;
                if (k == 0)
                {
                    break;
                }
                k--;
                l--;
                a = arr[k];
                b = arr[l];
            } while (a > b);
        }
        i++;
        j++;
    }
}

#pragma GCC push_options
#pragma GCC optimize("O0")
void sort_e_insertion_optimized(size_t n, size_t arr[])
{
    asm volatile(
        "  movq  %%rdi, -56(%%rbp)    \n"
        "  movq  %%rsi, -64(%%rbp)    \n"
        "  movq  $0, -8(%%rbp)        \n"
        "  movq  $1, -16(%%rbp)       \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "1:                           \n"
        "  movq  -8(%%rbp), %%rax     \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -40(%%rbp)    \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -48(%%rbp)    \n"
        "  movq  -40(%%rbp), %%rax    \n"
        "  cmpq  %%rax, -48(%%rbp)    \n"
        "  jnb  3f                    \n"
        "  movq  -8(%%rbp), %%rax     \n"
        "  movq  %%rax, -24(%%rbp)    \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  movq  %%rax, -32(%%rbp)    \n"
        "2:                           \n"
        "  movq  -24(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rax, %%rdx         \n"
        "  movq  -48(%%rbp), %%rax    \n"
        "  movq  %%rax, (%%rdx)       \n"
        "  movq  -32(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rax, %%rdx         \n"
        "  movq  -40(%%rbp), %%rax    \n"
        "  movq  %%rax, (%%rdx)       \n"
        "  cmpq  $0, -24(%%rbp)       \n"
        "  je   3f                    \n"
        "  subq  $1, -24(%%rbp)       \n"
        "  subq  $1, -32(%%rbp)       \n"
        "  movq  -24(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -40(%%rbp)    \n"
        "  movq  -32(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -48(%%rbp)    \n"
        "  movq  -40(%%rbp), %%rax    \n"
        "  cmpq  %%rax, -48(%%rbp)    \n"
        "  jb   2b                    \n"
        "3:                           \n"
        "  addq  $1, -8(%%rbp)        \n"
        "  addq  $1, -16(%%rbp)       \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  cmpq  -56(%%rbp), %%rax    \n"
        "  jb   1b                    \n"
        : /* No outputs. */
        : /* No inputs. */
        : "memory");
}
#pragma GCC pop_options

double profile_sort_func(void (*f)(size_t, size_t *), size_t n, size_t m, size_t arr[m][n])
{
    rfill(n, m, arr);
    print(n, m, arr);
    clock_t c1 = clock();
    for (size_t i = 0; i < m; i++)
    {
        (*f)(n, arr[i]);
    }
    clock_t c2 = clock();
    print(n, m, arr);
    return (double)(c2 - c1) / CLOCKS_PER_SEC;
}

int get_sort_winner(size_t n, size_t m)
{
    size_t arr[m][n];
    double secs[5] = {
        profile_sort_func(sort_a_quick, n, m, arr),
        profile_sort_func(sort_b_bubble, n, m, arr),
        profile_sort_func(sort_c_selection, n, m, arr),
        profile_sort_func(sort_d_insertion, n, m, arr),
        profile_sort_func(sort_e_insertion_optimized, n, m, arr),
    };
    int min_index = 0;
    double min = secs[0], max = secs[0];
    for (size_t i = 0; i < 5; i++)
    {
        if (secs[i] < min)
        {
            min_index = i;
            min = secs[i];
        }
        if (secs[i] > max)
        {
            max = secs[i];
        }
    }
    printf("%fs %f%%\n%fs %f%%\n%fs %f%%\n%fs %f%%\n%fs %f%%\n",
           secs[0], secs[0] * 100 / max,
           secs[1], secs[1] * 100 / max,
           secs[2], secs[2] * 100 / max,
           secs[3], secs[3] * 100 / max,
           secs[4], secs[4] * 100 / max);
    return min_index;
}

int main(int argc, char const *argv[])
{
    size_t n = 5;
    size_t m = 10000;
    while (get_sort_winner(n, m) != 0)
    {
        n++;
    }
}

Edit: In der Assemblercodeoptimierung habe ich aus der äußeren while-Schleife eine do-while-Schleife gemacht... das ist natürlich gefährlich. Außerdem habe ich nop's und unnötige Labels bzw. Sprünge entfernt.

Lennox

@Lennox sagte in Funktion optimieren:

" movq -16(%%rbp), %%rax \n"

Habe gerade noch etwas gesehen... Diese Zeile (170) scheint überflüssig zu sein, denn in 172 bekommt rax ja schon wieder einen neuen Wert, Typo.

Lennox

Weiter optimiert... Jetzt bin ich in den meisten Fällen schneller als der gcc:

    asm volatile(
        "  movq  %%rdi, -56(%%rbp)    \n"
        "  movq  %%rsi, -64(%%rbp)    \n"
        "  movq  $0, -8(%%rbp)        \n"
        "  subq  $1, -8(%%rbp)        \n"
        "  movq  $0, -16(%%rbp)       \n"
        "1:                           \n"
        "  addq  $1, -16(%%rbp)       \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  cmpq  -56(%%rbp), %%rax    \n"
        "  jnb  3f                    \n"
        "  addq  $1, -8(%%rbp)        \n"
        "  movq  -8(%%rbp), %%rax     \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -40(%%rbp)    \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -48(%%rbp)    \n"
        "  movq  -40(%%rbp), %%rax    \n"
        "  cmpq  %%rax, -48(%%rbp)    \n"
        "  jnb  1b                    \n"
        "  movq  -8(%%rbp), %%rax     \n"
        "  movq  %%rax, -24(%%rbp)    \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  movq  %%rax, -32(%%rbp)    \n"
        "2:                           \n"
        "  movq  -24(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rax, %%rdx         \n"
        "  movq  -48(%%rbp), %%rax    \n"
        "  movq  %%rax, (%%rdx)       \n"
        "  movq  -32(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rax, %%rdx         \n"
        "  movq  -40(%%rbp), %%rax    \n"
        "  movq  %%rax, (%%rdx)       \n"
        "  cmpq  $0, -24(%%rbp)       \n"
        "  je   1b                    \n"
        "  subq  $1, -24(%%rbp)       \n"
        "  subq  $1, -32(%%rbp)       \n"
        "  movq  -24(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -40(%%rbp)    \n"
        "  movq  -32(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -48(%%rbp)    \n"
        "  movq  -40(%%rbp), %%rax    \n"
        "  cmpq  %%rax, -48(%%rbp)    \n"
        "  jnb  1b                    \n"
        "  jmp  2b                    \n"
        "3:                           \n"
        : /* No outputs. */
        : /* No inputs. */
        : "memory");

Ich habe ein paar Vermutungen:

jnb ist schneller als jb
Viele Vergleichsoperationen sind relativ teuer
In der Summe sind viele kleine Sprünge schneller, als lange Sprünge, insofern es nicht zu viele Labels gibt

Stimmt das?

Lennox

Für eine bessere Übersicht habe ich noch ein kleines Bewertungssystem hinzugefügt. In jeder Runde bekommt der erstplatzierte 5 Punkte, der zweitplatzierte 4 usw. - der letztplatzierte bekommt 1 Punkt. Es werden die Runden n=5, n=6 usw. gespielt - bis Quicksort in einer Runde am schnellsten ist.

Long story short, danach wird ausgegeben, wer die meisten Punkte gesammelt hat:

...
(0) 0.008103s 15.380965% (71 points)
(1) 0.052682s 100.000000% (27 points)
(2) 0.011104s 21.077408% (55 points)
(3) 0.008565s 16.257925% (104 points)
(4) 0.008230s 15.622034% (118 points)
n_max=29
(4.) 118 points
(3.) 104 points
(0.) 71 points
(2.) 55 points
(1.) 27 points

Nach diesem Punktesystem, wäre mein optimiertes Verfahren auf Platz 1 und Quicksort auf Platz 3 - und das einfache Bubblesort-Verfahren ist am langsamsten.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

volatile size_t initial_seed = 123;

size_t total_hash = 23;

// A structure for the future rating system.
typedef struct score
{
    size_t index;
    double seconds;
    size_t points;
} SCORE;

size_t rand1(size_t seed, size_t max)
{
    size_t a = 16807;
    return (a * seed) % max;
}

size_t my_rand(size_t max)
{
    size_t r = rand1(initial_seed, max);
    initial_seed = rand1(initial_seed, -1);
    return r;
}

void rfill(size_t n, size_t m, size_t a[m][n])
{
    for (size_t i = 0; i < m; i++)
    {
        for (size_t j = 0; j < n; j++)
        {
            a[i][j] = my_rand(51);
        }
    }
}

double average(size_t n, size_t a[])
{
    // The average should be around 25.
    double average = 0;
    for (size_t i = 0; i < n; i++)
    {
        average += (double)a[i] / (double)n;
    }
    return average;
}

// Fast, combining hash function with few collisions and h(a,b) often not equal to h(b,a)...
size_t my_hash(size_t a, size_t b)
{
    return a * 31 + b;
}

void add_to_total_hash(size_t n, size_t m, size_t a[m][n])
{
    for (size_t i = 0; i < m; i++)
    {
        for (size_t j = 0; j < n; j++)
        {
            total_hash = my_hash(total_hash, a[i][j]);
        }
    }
}

void print(size_t n, size_t m, size_t a[m][n])
{
    add_to_total_hash(n, m, a);
    for (size_t i = 0; i < n; i++)
    {
        printf("%zu ", a[0][i]);
    }
    printf("\na: %f\n...\n", average(n, a[0]));
    // ...
    for (size_t i = 0; i < n; i++)
    {
        printf("%zu ", a[m - 1][i]);
    }
    printf("\na: %f\nh: %zu\n\n", average(n, a[m - 1]), total_hash);
}

// Comparison function for sort_a
int compare(const void *a, const void *b)
{
    return (*(size_t *)a - *(size_t *)b);
}

// Comparison function for score seconds
int compare_score_seconds(const void *a, const void *b)
{
    double a_dbl = ((SCORE *)a)->seconds;
    double b_dbl = ((SCORE *)b)->seconds;
    return a_dbl < b_dbl ? -1 : a_dbl > b_dbl ? +1
                                              : 0;
}

// Comparison function for score points
int compare_score_points(const void *a, const void *b)
{
    int a_int = ((SCORE *)a)->points;
    int b_int = ((SCORE *)b)->points;
    return b_int - a_int;
}

// Quicksort
void sort_a_quick(size_t n, size_t arr[])
{
    qsort(arr, n, sizeof(size_t), compare);
}

// Bubble sort
void sort_b_bubble(size_t n, size_t arr[])
{
    size_t i = 0, j = 1, a, b;
    while (j < n)
    {
        a = arr[i];
        b = arr[j];
        if (a > b)
        {
            arr[i] = b;
            arr[j] = a;
            i = 0;
            j = 1;
            continue;
        }
        i++;
        j++;
    }
}

// Selection sort
void sort_c_selection(size_t n, size_t arr[])
{
    for (size_t i = 0; i < n - 1; i++)
    {
        size_t a = i;
        size_t b = arr[i];
        for (size_t j = i + 1; j < n; j++)
        {
            if (arr[j] < b)
            {
                a = j;
                b = arr[j];
            }
        }
        arr[a] = arr[i];
        arr[i] = b;
    }
}

// Insertion sort
void sort_d_insertion(size_t n, size_t arr[])
{
    size_t i = 0, j = 1, k, l, a, b;
    while (j < n)
    {
        a = arr[i];
        b = arr[j];
        if (a > b)
        {
            k = i;
            l = j;
            do
            {
                arr[k] = b;
                arr[l] = a;
                if (k == 0)
                {
                    break;
                }
                k--;
                l--;
                a = arr[k];
                b = arr[l];
            } while (a > b);
        }
        i++;
        j++;
    }
}

#pragma GCC push_options
#pragma GCC optimize("O0")
void sort_e_insertion_optimized(size_t n, size_t arr[])
{
    asm volatile(
        "  movq  %%rdi, -56(%%rbp)    \n"
        "  movq  %%rsi, -64(%%rbp)    \n"
        "  movq  $0, -8(%%rbp)        \n"
        "  subq  $1, -8(%%rbp)        \n"
        "  movq  $0, -16(%%rbp)       \n"
        "1:                           \n"
        "  addq  $1, -16(%%rbp)       \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  cmpq  -56(%%rbp), %%rax    \n"
        "  jnb  3f                    \n"
        "  addq  $1, -8(%%rbp)        \n"
        "  movq  -8(%%rbp), %%rax     \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -40(%%rbp)    \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -48(%%rbp)    \n"
        "  movq  -40(%%rbp), %%rax    \n"
        "  cmpq  %%rax, -48(%%rbp)    \n"
        "  jnb  1b                    \n"
        "  movq  -8(%%rbp), %%rax     \n"
        "  movq  %%rax, -24(%%rbp)    \n"
        "  movq  -16(%%rbp), %%rax    \n"
        "  movq  %%rax, -32(%%rbp)    \n"
        "2:                           \n"
        "  movq  -24(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rax, %%rdx         \n"
        "  movq  -48(%%rbp), %%rax    \n"
        "  movq  %%rax, (%%rdx)       \n"
        "  movq  -32(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rax, %%rdx         \n"
        "  movq  -40(%%rbp), %%rax    \n"
        "  movq  %%rax, (%%rdx)       \n"
        "  cmpq  $0, -24(%%rbp)       \n"
        "  je   1b                    \n"
        "  subq  $1, -24(%%rbp)       \n"
        "  subq  $1, -32(%%rbp)       \n"
        "  movq  -24(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -40(%%rbp)    \n"
        "  movq  -32(%%rbp), %%rax    \n"
        "  leaq  0(,%%rax,8), %%rdx   \n"
        "  movq  -64(%%rbp), %%rax    \n"
        "  addq  %%rdx, %%rax         \n"
        "  movq  (%%rax), %%rax       \n"
        "  movq  %%rax, -48(%%rbp)    \n"
        "  movq  -40(%%rbp), %%rax    \n"
        "  cmpq  %%rax, -48(%%rbp)    \n"
        "  jnb  1b                    \n"
        "  jmp  2b                    \n"
        "3:                           \n"
        : /* No outputs. */
        : /* No inputs. */
        : "memory");
}
#pragma GCC pop_options

double profile_sort_func(void (*f)(size_t, size_t *), size_t n, size_t m, size_t arr[m][n])
{
    rfill(n, m, arr);
    print(n, m, arr);
    clock_t c1 = clock();
    for (size_t i = 0; i < m; i++)
    {
        (*f)(n, arr[i]);
    }
    clock_t c2 = clock();
    print(n, m, arr);
    return (double)(c2 - c1) / CLOCKS_PER_SEC;
}

size_t num_of_funcs = 5;
size_t get_sort_scores(size_t n, size_t m, SCORE *scores)
{
    size_t arr[m][n];
    scores[0].seconds = profile_sort_func(sort_a_quick, n, m, arr);
    scores[1].seconds = profile_sort_func(sort_b_bubble, n, m, arr);
    scores[2].seconds = profile_sort_func(sort_c_selection, n, m, arr);
    scores[3].seconds = profile_sort_func(sort_d_insertion, n, m, arr);
    scores[4].seconds = profile_sort_func(sort_e_insertion_optimized, n, m, arr);
    SCORE scores_cpy[num_of_funcs];
    for (size_t i = 0; i < num_of_funcs; i++)
    {
        scores_cpy[i] = scores[i];
    }
    qsort(scores_cpy, num_of_funcs, sizeof(SCORE), compare_score_seconds);
    double factor = 100 / scores_cpy[num_of_funcs - 1].seconds;
    for (size_t i = 0; i < num_of_funcs; i++)
    {
        scores[scores_cpy[i].index].points += num_of_funcs - i;
    }
    for (size_t i = 0; i < num_of_funcs; i++)
    {
        printf("(%zu) %fs %f%% (%zu points)\n", i, scores[i].seconds, scores[i].seconds * factor, scores[i].points);
    }
    return scores_cpy[0].index;
}

int main(int argc, char const *argv[])
{
    size_t n = 5;
    size_t m = 10000;
    SCORE scores[] = {
        {0, 0, 0},
        {1, 0, 0},
        {2, 0, 0},
        {3, 0, 0},
        {4, 0, 0},
    };
    while (get_sort_scores(n++, m, scores) != 0)
        ;
    printf("n_max=%zu\n", n - 1);
    qsort(scores, 5, sizeof(SCORE), compare_score_points);
    for (size_t i = 0; i < num_of_funcs; i++)
    {
        printf("(%zu.) %zu points\n", scores[i].index, scores[i].points);
    }
}

Btw., wäre die Nähe der profile_sort_func zur Sortierungsfunktionsdefinition (also dem Aufruf...) eigentlich auch relevant? Also, können lange "Call"-Strecken das Ergebnis auch beeinflussen?

Lennox

Hmm, unter den Blinden ist der einäugige König... Mit -O3 ist wieder der Compiler bzw. das unmodifizierte Insertionsort-Verfahren schneller.

Finnegan

@Lennox sagte in Funktion optimieren:

Btw., wäre die Nähe der profile_sort_func zur Sortierungsfunktionsdefinition (also dem Aufruf...) eigentlich auch relevant? Also, können lange "Call"-Strecken das Ergebnis auch beeinflussen?

Ich würde vermuten dass das höchstens mit -O0 auftritt und der Effekt dann auch nur minimal ist. Funktionsaufrufe sind nicht sonderlich teuer und auf höheren Optimierungstufen dürfte das auch ge-inlined werden. Soweit ich sehe ist das ja nur ein zusätzlicher Call je Sortierung. Das ist vernachlässigbar. Wäre das z.B. ein extra Call je "swap"-Operation in den Algorithmen würde ich mir mehr Gedanken machen.

Wo das aber schadet ist, wenn ich als Außenstehender herausfinden will, welche dieser Zahlen überhaupt für welchen Algorithmus steht. Ich hatte ja schonmal angeregt, den Namen des Algorithmus mit auszugeben (das wäre eine tiefer hängende Frucht für die "Übersichtlichkeit" gewesen als ein Punktesystem). Ich mache mir jetzt zumindest nicht mehr die Mühe herauszufinden, welche Ergebnisse für welchen Algo stehen und denke mir nur "meh, irgendwelche Algorithmen laufen unterschiedlich schnell, nett"

Ansonsten: Nochmal, wenn du schon so viel in die Richtung machst, schau dir Google Benchmark an. Braucht vielleicht etwas Einarbeitung, aber du sparst dir eine Menge von dem Zeug drumherum und kannst dich auf die Algorithmen konzentrieren. Und es vermeidet jede Menge Probleme, die eventuell dein Ergebnis verfälschen. Wenn ich mich recht entsinne, kannst du die ganzen unterschiedlichen Parameter wie zu testende Array-Längen direkt beim Benchmark-Aufruf übergeben (z.B. via Skript) und ihm z.B. so Sachen sagen wie "teste mal für dieses Intervall von Array-Längen mit dieser Schrittweite" oder "erhöhe die Array-Länge exponentiell". Du kannst dir die Ergebnisse dann auch nach JSON oder CSV exportieren lassen und die Daten dann nutzen, um damit Diagramme zeichnen zu lassen (z.B. mit Excel oder anderen Tools). Da erkennt man dann oft ein bisschen mehr, z.B. die unterschiedlichen Stärken der verschiedenen Algorithmen abhängig von der Array-Größe. Da kann man dann auch gut ablesen, wann sich der eine Algorithmus, und wann der andere eher lohnt. Und das beste: Es zeigt auch die Test-Funktionsnamen in der Übersicht. Wenn die ordentlich benannt sind, können wir auch alle mehr mit den Zahlen anfangen