Wie beeinflußt man den Compiler so, daß er ohne Assemblercode die SSE- und AVX-Einheiten effizient nutzt?

hustbaer schrieb:

aber wie stellt ihr nun sicher, daß der Compiler den Code so compiliert, daß dieser die SSE- und AVX-Einheiten auch möglichst parallel mit den Matrizendaten füllt, so daß diese möglichst effizient und parallel berechnet werden?

Sicherstellen kannst du das nur, indem du SSE Intrinsics verwendest - so dein Compiler welche hat. Oder eben Inline-Assembler.

Danke für den Hinweis, das mit den SSE-Intrinsics ist Gold wert.

Sind SSE-Intrinsics compilerunabhängig, sofern die Compiler nicht uralt sind und so etwas unterstützen?
Also kann ich den gleichen Intrinsics-Code mit dem Compiler von VS und dem GNU GCC compilieren oder gibt es da Unterschiede?

Zu VS habe ich jetzt hier etwas gefunden, aber wie geht es beim gcc?
http://www.codeproject.com/KB/recipes/sseintro.aspx

SeppJ schrieb:

In C++ nimmste ein valarray.

Danke, ich werde mir die valarrays mal anschauen.
Aber für C gibt es wohl nichts Vergleichbares, oder?

volkard

using namespace std;

// Plain aggregate holding 16 floats in one contiguous array
// (presumably a 4x4 matrix -- this snippet never interprets the layout).
struct Matrix{
    float data[16];
};

// Test program for inspecting the compiler's code generation:
// reads 5000 matrices (16 floats each) from std::cin, then one scalar,
// multiplies every element by that scalar and writes all elements to
// std::cout.
// NOTE(review): the snippet does not show an #include <iostream>; cin/cout
// need it.
int main() {
    // Heap-allocated array; it is never released with delete[] before
    // main returns.
    Matrix* kram=new Matrix[5000];
    // Input phase: fill every element of every matrix from std::cin.
    for(int i=0;i!=5000;++i)
        for(int j=0;j!=16;++j)
            cin>>kram[i].data[j];
    float skalar;
    cin>>skalar;
    // Compute phase: the element-wise scalar multiplication is the loop
    // whose generated code is of interest.
    for(int i=0;i!=5000;++i)
        for(int j=0;j!=16;++j)
            kram[i].data[j]*=skalar;
    // Output phase: values are streamed back to back, with no separator
    // written between them.
    for(int i=0;i!=5000;++i)
        for(int j=0;j!=16;++j)
            cout<<kram[i].data[j];
}

wird zu

# GCC-generated x86-64 assembly (AT&T syntax: op src, dst) for main().
# Register roles as set up below:
#   %r12 = address one past the end of the 320000-byte array
#   %rbp = cursor of the input loop    (.L2)
#   %r13 = cursor of the multiply loop (.L3)
#   %rbx = cursor of the output loop   (.L5)
#   12(%rsp) = stack slot that receives the scalar
.file	"main.cpp"
	.text
	.p2align 4,,15
.globl main
	.type	main, @function
main:
.LFB1237:
	.cfi_startproc
	# Prologue: save the four callee-saved registers used below; the
	# argument setup for the allocation call is interleaved with the pushes.
	pushq	%r13
	.cfi_def_cfa_offset 16
	# %edi = 320000 = byte count passed to the allocation call
	movl	$320000, %edi
	pushq	%r12
	.cfi_def_cfa_offset 24
	pushq	%rbp
	.cfi_def_cfa_offset 32
	pushq	%rbx
	.cfi_def_cfa_offset 40
	# 24 bytes of locals; CFA offset becomes 64
	subq	$24, %rsp
	.cfi_def_cfa_offset 64
	# DWARF register numbers 3, 6, 12, 13 are rbx, rbp, r12, r13 in the
	# x86-64 psABI numbering; these record where each one was saved.
	.cfi_offset 3, -40
	.cfi_offset 6, -32
	.cfi_offset 12, -24
	.cfi_offset 13, -16
	# _Znam is the mangled name of operator new[](unsigned long);
	# the array base comes back in %rax.
	call	_Znam
	# %r12 = base + 320000 (end marker); the base is copied into the
	# three loop cursors.
	leaq	320000(%rax), %r12
	movq	%rax, %r13
	movq	%rax, %rbx
	movq	%rax, %rbp
	.p2align 4,,7
	.p2align 3
	# ---- Input loop -----------------------------------------------------
	# One iteration handles 64 bytes (16 floats); the body is fully
	# unrolled into 16 calls.  Each call passes %edi = address of
	# _ZSt3cin (std::cin) and %rsi = address of the float to fill.
	# _ZNSi10_M_extractIfEERSiRT_ demangles to
	# std::istream::_M_extract<float>(float&).
.L2:
	movq	%rbp, %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	4(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	8(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	12(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	16(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	20(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	24(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	28(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	32(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	36(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	40(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	44(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	48(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	52(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	56(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	leaq	60(%rbp), %rsi
	movl	$_ZSt3cin, %edi
	# The cursor is advanced before the last call; %rsi already holds the
	# address of the 16th float.
	addq	$64, %rbp
	call	_ZNSi10_M_extractIfEERSiRT_
	# Repeat until the cursor reaches the end marker.
	cmpq	%r12, %rbp
	jne	.L2
	# ---- Read the scalar ------------------------------------------------
	# Same extraction call, this time into the stack slot at 12(%rsp).
	leaq	12(%rsp), %rsi
	movl	$_ZSt3cin, %edi
	call	_ZNSi10_M_extractIfEERSiRT_
	# Load the scalar into the low lane of %xmm4; shufps with immediate 0
	# then copies lane 0 into all four lanes.
	movss	12(%rsp), %xmm4
	# %rcx, %rdx, %rax = %r13 + 16 / + 32 / + 48: together with %r13 they
	# address the four 16-byte quarters of one 64-byte block.
	leaq	16(%r13), %rcx
	shufps	$0, %xmm4, %xmm4
	leaq	32(%r13), %rdx
	leaq	48(%r13), %rax
	.p2align 4,,7
	.p2align 3
	# ---- Multiply loop (vectorised) -------------------------------------
	# Per iteration: 64 bytes = four vectors of four floats.  Each vector
	# is loaded and stored as two 8-byte halves (movlps = low half,
	# movhps = high half) and multiplied by the broadcast scalar with one
	# mulps.  The pointer increments are interleaved with the stores.
.L3:
	movlps	0(%r13), %xmm3
	movlps	(%rcx), %xmm2
	movlps	(%rdx), %xmm1
	movlps	(%rax), %xmm0
	movhps	8(%r13), %xmm3
	movhps	8(%rcx), %xmm2
	movhps	8(%rdx), %xmm1
	movhps	8(%rax), %xmm0
	mulps	%xmm4, %xmm3
	mulps	%xmm4, %xmm2
	mulps	%xmm4, %xmm1
	movlps	%xmm3, 0(%r13)
	mulps	%xmm4, %xmm0
	movhps	%xmm3, 8(%r13)
	addq	$64, %r13
	movlps	%xmm2, (%rcx)
	movhps	%xmm2, 8(%rcx)
	addq	$64, %rcx
	movlps	%xmm1, (%rdx)
	movhps	%xmm1, 8(%rdx)
	addq	$64, %rdx
	movlps	%xmm0, (%rax)
	movhps	%xmm0, 8(%rax)
	addq	$64, %rax
	# Repeat until the first-quarter pointer reaches the end marker.
	cmpq	%r12, %r13
	jne	.L3
	.p2align 4,,7
	.p2align 3
	# ---- Output loop ----------------------------------------------------
	# Fully unrolled like the input loop: 16 calls per 64-byte block.
	# Each float is converted to double (cvtss2sd) into %xmm0 and passed
	# with %edi = address of _ZSt4cout (std::cout).
	# _ZNSo9_M_insertIdEERSoT_ demangles to
	# std::ostream::_M_insert<double>(double).
.L5:
	cvtss2sd	(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	4(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	8(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	12(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	16(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	20(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	24(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	28(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	32(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	36(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	40(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	44(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	48(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	52(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	56(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	call	_ZNSo9_M_insertIdEERSoT_
	cvtss2sd	60(%rbx), %xmm0
	movl	$_ZSt4cout, %edi
	# Cursor advanced before the last call; the value is already in %xmm0.
	addq	$64, %rbx
	call	_ZNSo9_M_insertIdEERSoT_
	cmpq	%r12, %rbx
	jne	.L5
	# ---- Epilogue -------------------------------------------------------
	# No deallocation call appears anywhere in this function.  Release the
	# locals, set the return value to 0 and restore the saved registers in
	# reverse push order.
	addq	$24, %rsp
	.cfi_def_cfa_offset 40
	xorl	%eax, %eax
	popq	%rbx
	.cfi_def_cfa_offset 32
	popq	%rbp
	.cfi_def_cfa_offset 24
	popq	%r12
	.cfi_def_cfa_offset 16
	popq	%r13
	.cfi_def_cfa_offset 8
	ret
	.cfi_endproc
.LFE1237:
	.size	main, .-main
	# Compiler-generated initialisation function for this translation unit:
	# it constructs the object _ZStL8__ioinit and registers that object's
	# destructor with __cxa_atexit.
	.p2align 4,,15
	.type	_GLOBAL__I_main, @function
_GLOBAL__I_main:
.LFB1393:
	.cfi_startproc
	subq	$8, %rsp
	.cfi_def_cfa_offset 16
	# %edi = address of _ZStL8__ioinit; _ZNSt8ios_base4InitC1Ev demangles
	# to std::ios_base::Init::Init().
	movl	$_ZStL8__ioinit, %edi
	call	_ZNSt8ios_base4InitC1Ev
	# Arguments for __cxa_atexit: %edi = function to run
	# (_ZNSt8ios_base4InitD1Ev, i.e. std::ios_base::Init::~Init()),
	# %esi = the object it is applied to, %edx = __dso_handle.
	movl	$__dso_handle, %edx
	movl	$_ZStL8__ioinit, %esi
	movl	$_ZNSt8ios_base4InitD1Ev, %edi
	# The stack adjustment is undone before the jump, so the jmp acts as a
	# tail call: __cxa_atexit returns directly to this function's caller.
	addq	$8, %rsp
	.cfi_def_cfa_offset 8
	jmp	__cxa_atexit
	.cfi_endproc
.LFE1393:
	.size	_GLOBAL__I_main, .-_GLOBAL__I_main
	# 8-byte pointer to _GLOBAL__I_main, placed in the .ctors section.
	.section	.ctors,"aw",@progbits
	.align 8
	.quad	_GLOBAL__I_main
	# File-local common symbol: 1 byte, alignment 1.  This is the object
	# that _GLOBAL__I_main constructs.
	.local	_ZStL8__ioinit
	.comm	_ZStL8__ioinit,1,1
	# Weak references: each __gthrw_* name on the left is bound to the
	# pthread/sched function named on the right.  None of them is
	# referenced by the code in this listing.
	.weakref	_ZL20__gthrw_pthread_oncePiPFvvE,pthread_once
	.weakref	_ZL27__gthrw_pthread_getspecificj,pthread_getspecific
	.weakref	_ZL27__gthrw_pthread_setspecificjPKv,pthread_setspecific
	.weakref	_ZL22__gthrw_pthread_createPmPK14pthread_attr_tPFPvS3_ES3_,pthread_create
	.weakref	_ZL20__gthrw_pthread_joinmPPv,pthread_join
	.weakref	_ZL21__gthrw_pthread_equalmm,pthread_equal
	.weakref	_ZL20__gthrw_pthread_selfv,pthread_self
	.weakref	_ZL22__gthrw_pthread_detachm,pthread_detach
	.weakref	_ZL22__gthrw_pthread_cancelm,pthread_cancel
	.weakref	_ZL19__gthrw_sched_yieldv,sched_yield
	.weakref	_ZL26__gthrw_pthread_mutex_lockP15pthread_mutex_t,pthread_mutex_lock
	.weakref	_ZL29__gthrw_pthread_mutex_trylockP15pthread_mutex_t,pthread_mutex_trylock
	.weakref	_ZL31__gthrw_pthread_mutex_timedlockP15pthread_mutex_tPK8timespec,pthread_mutex_timedlock
	.weakref	_ZL28__gthrw_pthread_mutex_unlockP15pthread_mutex_t,pthread_mutex_unlock
	.weakref	_ZL26__gthrw_pthread_mutex_initP15pthread_mutex_tPK19pthread_mutexattr_t,pthread_mutex_init
	.weakref	_ZL29__gthrw_pthread_mutex_destroyP15pthread_mutex_t,pthread_mutex_destroy
	.weakref	_ZL30__gthrw_pthread_cond_broadcastP14pthread_cond_t,pthread_cond_broadcast
	.weakref	_ZL27__gthrw_pthread_cond_signalP14pthread_cond_t,pthread_cond_signal
	.weakref	_ZL25__gthrw_pthread_cond_waitP14pthread_cond_tP15pthread_mutex_t,pthread_cond_wait
	.weakref	_ZL30__gthrw_pthread_cond_timedwaitP14pthread_cond_tP15pthread_mutex_tPK8timespec,pthread_cond_timedwait
	.weakref	_ZL28__gthrw_pthread_cond_destroyP14pthread_cond_t,pthread_cond_destroy
	.weakref	_ZL26__gthrw_pthread_key_createPjPFvPvE,pthread_key_create
	.weakref	_ZL26__gthrw_pthread_key_deletej,pthread_key_delete
	.weakref	_ZL30__gthrw_pthread_mutexattr_initP19pthread_mutexattr_t,pthread_mutexattr_init
	.weakref	_ZL33__gthrw_pthread_mutexattr_settypeP19pthread_mutexattr_ti,pthread_mutexattr_settype
	.weakref	_ZL33__gthrw_pthread_mutexattr_destroyP19pthread_mutexattr_t,pthread_mutexattr_destroy
	# Compiler identification string recorded in the object file.
	.ident	"GCC: (Gentoo 4.5.2 p1.1, pie-0.4.5) 4.5.2"
	# Empty section with no flags; by GNU toolchain convention this marks
	# the object as not needing an executable stack.
	.section	.note.GNU-stack,"",@progbits

nur mit -O3 -march=native auf meinem

flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 syscall nx mmxext fxsr_opt lm 3dnowext 3dnow up rep_good nopl pni lahf_lm

Wo sollte man über mehr MMX, SSE oder so nachdenken? Also ich lese dauernd mulps. Dachte, damit sei der Kram in den allermeisten Fällen schon für mich erledigt. Und ich müßte nur noch gelegentlich ins Compilat schauen, ob da oft genug mmx drin steht, und wenn's ein paar mal drinsteht, wird schon alles recht sein.

volkard schrieb:

Wo sollte man über mehr MMX, SSE oder so nachdenken? Also ich lese dauernd mulps. Dachte, damit sei der Kram in den allermeisten Fällen schon für mich erledigt. Und ich müßte nur noch gelegentlich ins Compilat schauen, ob da oft genug mmx drin steht, und wenn's ein paar mal drinsteht, wird schon alles recht sein.

Also die Seite sagt dazu folgendes:

This doesn't show the function using inline Assembly. Anyone who is interested may read it in the demo project. Calculation times on my computer:

* C++ code - 26 ms
* C++ with SSE Intrinsics - 9 ms
* Inline Assembly with SSE instructions - 9 ms

Execution time should be estimated in the Release configuration, with compiler optimizations.

Obwohl also die SIMD Einheiten benutzt werden, kann man nicht davon ausgehen, daß der Compiler bei normalem C++ Code sie auch effizient zu nutzen weiß.

Das Ergebnis 26 ms vs. 9 ms ist schon erheblich.

Oder auch:

The program also calculates the minimum and maximum values in the result array. ARRAY_SIZE is defined as 100000. Result array is shown in the listbox. Calculation time (ms) for each way is shown in the dialog:

* C++ code - 6 ms on my computer;
* C++ code with SSE Intrinsics - 3 ms;
* Inline Assembly with SSE instructions - 2 ms.

Assembly code performs better because of intensive using of the SSX registers. However, usually C++ code with SSE Intrinsics performs like Assembly code or better, because it is difficult to write an Assembly code which runs faster than optimized code generated by C++ compiler.

Der Code ist da:
http://www.codeproject.com/KB/recipes/sseintro.aspx

volkard

SIMD.this schrieb:

Also die Seite sagt dazu folgendes:

This doesn't show the function using inline Assembly. Anyone who is interested may read it in the demo project. Calculation times on my computer:

* C++ code - 26 ms
* C++ with SSE Intrinsics - 9 ms
* Inline Assembly with SSE instructions - 9 ms

Execution time should be estimated in the Release configuration, with compiler optimizations.

Das war am 10. Jul 2003...
Ich würde erstmal schauen, ob das noch so ist.

Zum GCC habe ich das hier gefunden:

http://ds9a.nl/gcc-simd/index.html

Das mit dem 2003 müßte man mal testen, der Code ist ja da, nur komme ich jetzt nicht dazu. Freiwillige vor.

SeppJ

SIMD.this schrieb:

Aber für C gibt es wohl nichts vergleichbares, oder?

Es gibt da noch die Integrated Performance Primitives von Intel, was ungefähr deren Variante der SSE Intrinsics ist. Die müssten auch in C benutzbar sein, wenn ich mich nicht irre.

Ich weiß gerade nicht wie das mit der Lizenz ist, da musst du selber gucken. Intel ist recht interessiert daran, dass Programmierer ihre Prozessoren optimal ausnutzen, daher wird die Lizenz sicher frei oder zumindest nicht sehr restriktiv sein.

Da hat wohl einer schon die Arbeit gemacht:

Anderer dafür aktueller Test 2009 mit gcc 4.x und VS08:

Lately I have been playing a lot with SSE optimizations and I really enjoy it so far – using functions to tell the compiler what instructions to use makes you feel the power in your finger tips. At first I was naive and thought the compiler will do exactly what it’s being told, assuming that you know what you’re doing – looking at the SSE intrinsic header file was mostly a bunch of calls to internal GCC functions or ‘extern’ in MSVC, suggesting that the compiler will simply follow your leadership.

I assumed wrong – the compiler will take the liberty to optimized your code even further – at points you wouldn’t even think about, though I have noticed that is not always the case with MSVC. MSVC will sometimes behave too trusting at the coder even when optimizations obviously could be made. After grasping the concept of SSE and what it could do, I quickly realized MSVC won’t optimize as good as GCC 4.x or ICC would.

http://www.liranuna.com/sse-intrinsics-optimizations-in-popular-compilers/

Hier auch ganz interessant: http://www.g-truc.net/post-0369.html#menu

Marc++us

Wem das so wichtig ist, daß der Compiler extrem optimalen Code auf der CPU erzeugt, wird um den Intel-Compiler nicht herumkommen... also von Intel mal die Testversion laden, damit compilieren, und Benchmark machen.

Oder für die Rechenoperationen gleich eine entsprechende Bibliothek nehmen.

nachtfeuer

Marc++us schrieb:

Wem das so wichtig ist, daß der Compiler extrem optimalen Code auf der CPU erzeugt, wird um den Intel-Compiler nicht herumkommen...

Auf die Frage oben hin glaube ich das auch, aber...
http://developer.amd.com/tools/open64/Pages/default.aspx
http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx
http://developer.nvidia.com/cuda-downloads

Marc++us

... angeblich...

http://magazin.c-plusplus.net/artikel/Intel%20ISTEP%202011%20Software%20Conference%20(Teil%20I)

Abschnitt "AMD und Intel"

nachtfeuer

Bestreitet keiner. Auch die dazugehörigen Tools von Intel sind nicht so schlecht. Aber nur 30 Tage zum Testen. Und danach? Was ich im Hinterkopf hatte, war etwa diese Richtung:
Erst diesen Thread gelesen...
http://www.c-plusplus.net/forum/287155?sid=43a79aa65747d6bdf241beaf652396d4

...und sowas gedacht wie
http://www.heise.de/preisvergleich/a613014.html
(man bekommt zwei zum Preis von einer...)

Blue-Tiger

Hiho!

Viele Compiler heutzutage besitzen AutoVectorizer, die genau dafuer gemacht sind! Damit der richtig gut arbeiten kann, muss der Compiler wissen, dass deine Arrays nicht aliased sind (sich also im Speicher nicht ueberlappen)! Leider ists fuer einen Compiler superschwer, das selbst rauszufinden.

Unter C gibts dafuer seit C99 das "restrict"-Keyword (http://en.wikipedia.org/wiki/Restrict). Ich wuerd davon ausgehen, dass du mit Hilfe von restrict deine Funktion einfach in reinem C schreiben kannst, und der AutoVectorizer wird optimalen SSE-Code draus basteln. Wichtig vllt. noch: achte darauf, dass deine Arrays auf 16 Bytes (die Breite eines SSE-Registers) aligned sind. Unaligned loads sind bei SSE-Einheiten schrecklich, schrecklich langsam (oder waren es zumindest 2009, als ich mich zuletzt damit befasst habe).

In C++ gibts leider kein "restrict" (mein groesster Kritikpunkt am neuen Standard). Wie SeppJ schon gesagt hat, wird valarray dir vllt. helfen koennen, weil die Implementierungen dieser Klasse vermutlich Compiler-Spezifische Tricks und Erweiterungen verwenden, um die Wirkung von restrict zu erzielen.

Wenn du bereit bist, auf "Standard C++" zu verzichten und auf Compiler-Erweiterungen zurueckzugreifen: wie schon gesagt gibts SSE-Intrinsics, die gibts soweit ich weiss fuer mehrere Prozessoren, aber die sind doch ziemlich (EXTREM) low-level.

Besser gehts mit Compiler-Spezifischen Erweiterungen. Hier kann ich dir aber leider nur zum GCC Auskunft geben:

1. GCC kennt auch im C++ Modus "restrict" (IIRC)
2. Es gibt eine GCC-Erweiterung extra fuer sowas: http://gcc.gnu.org/onlinedocs/gcc-4.6.0/gcc/Vector-Extensions.html (wichtig: du musst selbst dafuer sorgen, dass diese Vector-Datentypen im Speicher richtig aligned sind, sonst kanns sein dass du dir 'nen SegFault einfaengst (zumindest war das noch 2009 so, als ich damit rumgespielt hab))

hustbaer

BTW: bei MSVC gibt es __declspec(restrict) und __restrict

http://msdn.microsoft.com/en-us/library/8bcxafdh.aspx
http://msdn.microsoft.com/en-us/library/5ft82fed.aspx