Problem mit SSE Instruktionen :-(

Ishildur

Hallo zusammen
Ich habe folgende Methode geschrieben:

// ----------------------------------------- public operator "*" -----------------------------------------
// Computes the matrix product (*this) * M and returns the result by value; neither operand is
// modified. If the "SSE" preprocessor constant is defined, the product is computed row by row with
// hand-written SSE inline assembly; otherwise it is computed with scalar C++ expressions.
// NOTE(review): the SSE path uses movaps for every load and store, which requires 16-byte aligned
// addresses for *this, M and the local result - presumably guaranteed by the declaration of
// Matrix; confirm this also holds for heap-allocated instances.
// NOTE(review): the original header claimed the SSE path doubles the execution speed; nothing in
// this function measures that - confirm with a benchmark.
// Author: Samuel Lörtscher
// -------------------------------------------------------------------------------------------------------
Matrix Matrix::operator*(const Matrix &M) const{
 // result matrix; all four rows are written by whichever branch is compiled below
 Matrix m;

 // select the implementation at compile time
 #ifdef SSE
  // Hand-written on purpose: according to the author, the code the compiler generated
  // for this routine was not usable.
  __asm{
   // esi = this (left operand)
   // edi = dword loaded from the slot of the reference M (right operand)
   // eax = byte offset of the row currently being computed, starting at 0
   mov esi, dword ptr [this]
   mov edi, dword ptr [M]
   xor eax, eax

   L1: // loop head: the instructions below are repeated for each row of the destination matrix
   // xmm0 = (element 0 of the current row, copied to all four lanes) * (M at offset 00h)
   movaps xmm0, xmmword ptr [esi+eax]
   shufps xmm0, xmm0, 000h
   mulps  xmm0, xmmword ptr [edi]
   // xmm0 += (element 1 copied to all lanes) * (M at offset 10h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 055h
   mulps  xmm1, xmmword ptr [edi+10h]
   addps  xmm0, xmm1
   // xmm0 += (element 2 copied to all lanes) * (M at offset 20h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 0aah
   mulps  xmm1, xmmword ptr [edi+20h]
   addps  xmm0, xmm1
   // xmm0 += (element 3 copied to all lanes) * (M at offset 30h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 0ffh
   mulps  xmm1, xmmword ptr [edi+30h]
   addps  xmm0, xmm1
   // store the finished row into the local result at the same row offset
   // NOTE(review): the author reports that this symbol-plus-register store gives wrong
   // results in Release builds while working in Debug builds; the cause is not established.
   movaps xmmword ptr [m+eax], xmm0
   // advance to the next row; loop while the offset is <= 30h (offsets 00h, 10h, 20h, 30h)
   add    eax, 10h
   cmp    eax, 30h
   jbe    L1
  }
 // scalar fallback when SSE is not defined
 #else
  // element m._rc is the dot product of row r of *this with column c of M
  m._11 = this->_11*M._11 + this->_12*M._21 + this->_13*M._31 + this->_14*M._41;
  m._12 = this->_11*M._12 + this->_12*M._22 + this->_13*M._32 + this->_14*M._42;
  m._13 = this->_11*M._13 + this->_12*M._23 + this->_13*M._33 + this->_14*M._43;
  m._14 = this->_11*M._14 + this->_12*M._24 + this->_13*M._34 + this->_14*M._44;
  m._21 = this->_21*M._11 + this->_22*M._21 + this->_23*M._31 + this->_24*M._41;
  m._22 = this->_21*M._12 + this->_22*M._22 + this->_23*M._32 + this->_24*M._42;
  m._23 = this->_21*M._13 + this->_22*M._23 + this->_23*M._33 + this->_24*M._43;
  m._24 = this->_21*M._14 + this->_22*M._24 + this->_23*M._34 + this->_24*M._44;
  m._31 = this->_31*M._11 + this->_32*M._21 + this->_33*M._31 + this->_34*M._41;
  m._32 = this->_31*M._12 + this->_32*M._22 + this->_33*M._32 + this->_34*M._42;
  m._33 = this->_31*M._13 + this->_32*M._23 + this->_33*M._33 + this->_34*M._43;
  m._34 = this->_31*M._14 + this->_32*M._24 + this->_33*M._34 + this->_34*M._44;
  m._41 = this->_41*M._11 + this->_42*M._21 + this->_43*M._31 + this->_44*M._41;
  m._42 = this->_41*M._12 + this->_42*M._22 + this->_43*M._32 + this->_44*M._42;
  m._43 = this->_41*M._13 + this->_42*M._23 + this->_43*M._33 + this->_44*M._43;
  m._44 = this->_41*M._14 + this->_42*M._24 + this->_43*M._34 + this->_44*M._44;
 #endif

 // return the product by value
 return m;
}
// -------------------------------------------------------------------------------------------------------

Diese funktioniert im Debug Mode hervorragend, jedoch bekomme ich im Release Mode nur noch Pixelfragmente auf den Bildschirm (das Programm stürzt allerdings nicht ab).

Mir ist auch folgendes aufgefallen:

// ----------------------------------------- public operator "*" -----------------------------------------
// Debugging variant of the matrix product (*this) * M, used to isolate a Release-build problem.
// With "SSE" defined, only the first result row is computed by the SSE inline assembly (the row
// loop is commented out) and rows 2-4 are computed by the scalar C++ expressions, which are
// compiled in the same branch because the "#else" directive is commented out.
// Without "SSE" defined, the whole body between #ifdef and #endif is skipped and m is returned
// exactly as its declaration left it.
// In this variant the assembly stores the first row through [m+eax] with eax == 0.
// Author: Samuel Lörtscher
// -------------------------------------------------------------------------------------------------------
Matrix Matrix::operator*(const Matrix &M) const{
 // result matrix
 Matrix m;

 // select the implementation at compile time
 #ifdef SSE
  // Hand-written on purpose: according to the author, the code the compiler generated
  // for this routine was not usable.
  __asm{
   // esi = this (left operand)
   // edi = dword loaded from the slot of the reference M (right operand)
   // eax = row byte offset; stays 0 here because the loop below is disabled
   mov esi, dword ptr [this]
   mov edi, dword ptr [M]
   xor eax, eax

   //L1: // those instructions will be repeated for each row of the destination matrix
   // xmm0 = (element 0 of the first row, copied to all four lanes) * (M at offset 00h)
   movaps xmm0, xmmword ptr [esi+eax]
   shufps xmm0, xmm0, 000h
   mulps  xmm0, xmmword ptr [edi]
   // xmm0 += (element 1 copied to all lanes) * (M at offset 10h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 055h
   mulps  xmm1, xmmword ptr [edi+10h]
   addps  xmm0, xmm1
   // xmm0 += (element 2 copied to all lanes) * (M at offset 20h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 0aah
   mulps  xmm1, xmmword ptr [edi+20h]
   addps  xmm0, xmm1
   // xmm0 += (element 3 copied to all lanes) * (M at offset 30h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 0ffh
   mulps  xmm1, xmmword ptr [edi+30h]
   addps  xmm0, xmm1
   // store the first row using the symbol-plus-register form under test
   // NOTE(review): the author reports this variant works in Debug builds only.
   movaps xmmword ptr [m+eax], xmm0
   // loop control disabled for this experiment
   //add    eax, 10h
   //cmp    eax, 30h
   //jbe    L1
  }
 // "#else" is disabled, so the scalar code below belongs to the SSE branch as well
 //#else
  // scalar code for row 1 is commented out (the assembly above provides it);
  // rows 2-4 are computed as dot products of a row of *this with a column of M
  /*m._11 = this->_11*M._11 + this->_12*M._21 + this->_13*M._31 + this->_14*M._41;
  m._12 = this->_11*M._12 + this->_12*M._22 + this->_13*M._32 + this->_14*M._42;
  m._13 = this->_11*M._13 + this->_12*M._23 + this->_13*M._33 + this->_14*M._43;
  m._14 = this->_11*M._14 + this->_12*M._24 + this->_13*M._34 + this->_14*M._44;
  */m._21 = this->_21*M._11 + this->_22*M._21 + this->_23*M._31 + this->_24*M._41;
  m._22 = this->_21*M._12 + this->_22*M._22 + this->_23*M._32 + this->_24*M._42;
  m._23 = this->_21*M._13 + this->_22*M._23 + this->_23*M._33 + this->_24*M._43;
  m._24 = this->_21*M._14 + this->_22*M._24 + this->_23*M._34 + this->_24*M._44;
  m._31 = this->_31*M._11 + this->_32*M._21 + this->_33*M._31 + this->_34*M._41;
  m._32 = this->_31*M._12 + this->_32*M._22 + this->_33*M._32 + this->_34*M._42;
  m._33 = this->_31*M._13 + this->_32*M._23 + this->_33*M._33 + this->_34*M._43;
  m._34 = this->_31*M._14 + this->_32*M._24 + this->_33*M._34 + this->_34*M._44;
  m._41 = this->_41*M._11 + this->_42*M._21 + this->_43*M._31 + this->_44*M._41;
  m._42 = this->_41*M._12 + this->_42*M._22 + this->_43*M._32 + this->_44*M._42;
  m._43 = this->_41*M._13 + this->_42*M._23 + this->_43*M._33 + this->_44*M._43;
  m._44 = this->_41*M._14 + this->_42*M._24 + this->_43*M._34 + this->_44*M._44;
 #endif

 // return the result by value
 return m;
}
// -------------------------------------------------------------------------------------------------------

Funktioniert nur im Debug Mode, hingegen:

// ----------------------------------------- public operator "*" -----------------------------------------
// Debugging variant of the matrix product (*this) * M, used to isolate a Release-build problem.
// With "SSE" defined, only the first result row is computed by the SSE inline assembly (the row
// loop is commented out) and rows 2-4 are computed by the scalar C++ expressions, which are
// compiled in the same branch because the "#else" directive is commented out.
// Without "SSE" defined, the whole body between #ifdef and #endif is skipped and m is returned
// exactly as its declaration left it.
// In this variant the assembly stores the first row through the plain symbol form [m],
// without a register offset.
// Author: Samuel Lörtscher
// -------------------------------------------------------------------------------------------------------
Matrix Matrix::operator*(const Matrix &M) const{
 // result matrix
 Matrix m;

 // select the implementation at compile time
 #ifdef SSE
  // Hand-written on purpose: according to the author, the code the compiler generated
  // for this routine was not usable.
  __asm{
   // esi = this (left operand)
   // edi = dword loaded from the slot of the reference M (right operand)
   // eax = row byte offset; stays 0 here because the loop below is disabled
   mov esi, dword ptr [this]
   mov edi, dword ptr [M]
   xor eax, eax

   //L1: // those instructions will be repeated for each row of the destination matrix
   // xmm0 = (element 0 of the first row, copied to all four lanes) * (M at offset 00h)
   movaps xmm0, xmmword ptr [esi+eax]
   shufps xmm0, xmm0, 000h
   mulps  xmm0, xmmword ptr [edi]
   // xmm0 += (element 1 copied to all lanes) * (M at offset 10h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 055h
   mulps  xmm1, xmmword ptr [edi+10h]
   addps  xmm0, xmm1
   // xmm0 += (element 2 copied to all lanes) * (M at offset 20h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 0aah
   mulps  xmm1, xmmword ptr [edi+20h]
   addps  xmm0, xmm1
   // xmm0 += (element 3 copied to all lanes) * (M at offset 30h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 0ffh
   mulps  xmm1, xmmword ptr [edi+30h]
   addps  xmm0, xmm1
   // store the first row using the plain symbol form (no register offset)
   // NOTE(review): the author reports this variant works in both Debug and Release builds.
   movaps xmmword ptr [m], xmm0
   // loop control disabled for this experiment
   //add    eax, 10h
   //cmp    eax, 30h
   //jbe    L1
  }
 // "#else" is disabled, so the scalar code below belongs to the SSE branch as well
 //#else
  // scalar code for row 1 is commented out (the assembly above provides it);
  // rows 2-4 are computed as dot products of a row of *this with a column of M
  /*m._11 = this->_11*M._11 + this->_12*M._21 + this->_13*M._31 + this->_14*M._41;
  m._12 = this->_11*M._12 + this->_12*M._22 + this->_13*M._32 + this->_14*M._42;
  m._13 = this->_11*M._13 + this->_12*M._23 + this->_13*M._33 + this->_14*M._43;
  m._14 = this->_11*M._14 + this->_12*M._24 + this->_13*M._34 + this->_14*M._44;
  */m._21 = this->_21*M._11 + this->_22*M._21 + this->_23*M._31 + this->_24*M._41;
  m._22 = this->_21*M._12 + this->_22*M._22 + this->_23*M._32 + this->_24*M._42;
  m._23 = this->_21*M._13 + this->_22*M._23 + this->_23*M._33 + this->_24*M._43;
  m._24 = this->_21*M._14 + this->_22*M._24 + this->_23*M._34 + this->_24*M._44;
  m._31 = this->_31*M._11 + this->_32*M._21 + this->_33*M._31 + this->_34*M._41;
  m._32 = this->_31*M._12 + this->_32*M._22 + this->_33*M._32 + this->_34*M._42;
  m._33 = this->_31*M._13 + this->_32*M._23 + this->_33*M._33 + this->_34*M._43;
  m._34 = this->_31*M._14 + this->_32*M._24 + this->_33*M._34 + this->_34*M._44;
  m._41 = this->_41*M._11 + this->_42*M._21 + this->_43*M._31 + this->_44*M._41;
  m._42 = this->_41*M._12 + this->_42*M._22 + this->_43*M._32 + this->_44*M._42;
  m._43 = this->_41*M._13 + this->_42*M._23 + this->_43*M._33 + this->_44*M._43;
  m._44 = this->_41*M._14 + this->_42*M._24 + this->_43*M._34 + this->_44*M._44;
 #endif

 // return the result by value
 return m;
}
// -------------------------------------------------------------------------------------------------------

Funktioniert sowohl im Debug als auch im Release Mode hervorragend. Also:

movaps xmmword ptr [m], xmm0 Funktioniert
movaps xmmword ptr [m+eax], xmm0 Funktioniert nicht!!

Aber ich habe ja xor eax, eax gemacht und verändere dieses Register nicht mehr, also was soll denn das?

Mfg Samuel

Ishildur

Also ich dachte bisher eigentlich, ich könne ziemlich gut mit Assembler umgehen, aber das hier ergibt bei mir wirklich nur noch Fragezeichen:

movaps xmmword ptr [m], xmm0 :OK im Release

lea eax, [m]
movaps xmmword ptr [eax], xmm0 :Fehler, rendert nur noch trash im Release

Im Debug Mode funktionieren beide Varianten einwandfrei. Also versteht das jemand von euch?

Ishildur

Jetzt geht er...

// ----------------------------------------- public operator "*" -----------------------------------------
// Computes the matrix product (*this) * M and returns the result by value; neither operand is
// modified. If the "SSE" preprocessor constant is defined, the product is computed row by row with
// hand-written SSE inline assembly into a local array of four float128 values, which is then
// reinterpreted as a Matrix for the return; otherwise it is computed with scalar C++ expressions.
// NOTE(review): the SSE path uses movaps for every load and store, which requires 16-byte aligned
// addresses for *this, M and the local buffer - confirm this also holds for heap-allocated
// Matrix instances.
// NOTE(review): the reinterpret_cast assumes Matrix has the same size and layout as four
// consecutive float128 rows - TODO confirm against the declaration of Matrix.
// Author: Samuel Lörtscher
// -------------------------------------------------------------------------------------------------------
Matrix Matrix::operator*(const Matrix &M) const{
 // select the implementation at compile time
 #ifdef SSE
  // local result buffer: one float128 per result row
  float128 m[4];

  // Hand-written on purpose: according to the author, the code the compiler generated
  // for this routine was not usable.
  __asm{
   // esi = this (left operand)
   // edi = dword loaded from the slot of the reference M (right operand)
   // eax = byte offset of the row currently being computed, starting at 0
   mov esi, dword ptr [this]
   mov edi, dword ptr [M]
   xor eax, eax

   L1: // loop head: the instructions below are repeated for each row of the destination matrix
   // xmm0 = (element 0 of the current row, copied to all four lanes) * (M at offset 00h)
   movaps xmm0, xmmword ptr [esi+eax]
   shufps xmm0, xmm0, 000h
   mulps  xmm0, xmmword ptr [edi]
   // xmm0 += (element 1 copied to all lanes) * (M at offset 10h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 055h
   mulps  xmm1, xmmword ptr [edi+10h]
   addps  xmm0, xmm1
   // xmm0 += (element 2 copied to all lanes) * (M at offset 20h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 0aah
   mulps  xmm1, xmmword ptr [edi+20h]
   addps  xmm0, xmm1
   // xmm0 += (element 3 copied to all lanes) * (M at offset 30h)
   movaps xmm1, xmmword ptr [esi+eax]
   shufps xmm1, xmm1, 0ffh
   mulps  xmm1, xmmword ptr [edi+30h]
   addps  xmm0, xmm1
   // store the finished row into the local buffer at the same row offset
   // NOTE(review): the author reports that with the float128 buffer this store works in
   // Release builds as well; why it differs from a Matrix local is not established.
   movaps xmmword ptr [m+eax], xmm0
   // advance to the next row; loop while the offset is <= 30h (offsets 00h, 10h, 20h, 30h)
   add    eax, 10h
   cmp    eax, 30h
   jbe    L1
  }

  // return a Matrix copied from the bytes of the local buffer
  return *reinterpret_cast<Matrix*>(&m);
 // scalar fallback when SSE is not defined
 #else
  // result matrix; every element is assigned below
  Matrix m;

  // element m._rc is the dot product of row r of *this with column c of M
  m._11 = this->_11*M._11 + this->_12*M._21 + this->_13*M._31 + this->_14*M._41;
  m._12 = this->_11*M._12 + this->_12*M._22 + this->_13*M._32 + this->_14*M._42;
  m._13 = this->_11*M._13 + this->_12*M._23 + this->_13*M._33 + this->_14*M._43;
  m._14 = this->_11*M._14 + this->_12*M._24 + this->_13*M._34 + this->_14*M._44;
  m._21 = this->_21*M._11 + this->_22*M._21 + this->_23*M._31 + this->_24*M._41;
  m._22 = this->_21*M._12 + this->_22*M._22 + this->_23*M._32 + this->_24*M._42;
  m._23 = this->_21*M._13 + this->_22*M._23 + this->_23*M._33 + this->_24*M._43;
  m._24 = this->_21*M._14 + this->_22*M._24 + this->_23*M._34 + this->_24*M._44;
  m._31 = this->_31*M._11 + this->_32*M._21 + this->_33*M._31 + this->_34*M._41;
  m._32 = this->_31*M._12 + this->_32*M._22 + this->_33*M._32 + this->_34*M._42;
  m._33 = this->_31*M._13 + this->_32*M._23 + this->_33*M._33 + this->_34*M._43;
  m._34 = this->_31*M._14 + this->_32*M._24 + this->_33*M._34 + this->_34*M._44;
  m._41 = this->_41*M._11 + this->_42*M._21 + this->_43*M._31 + this->_44*M._41;
  m._42 = this->_41*M._12 + this->_42*M._22 + this->_43*M._32 + this->_44*M._42;
  m._43 = this->_41*M._13 + this->_42*M._23 + this->_43*M._33 + this->_44*M._43;
  m._44 = this->_41*M._14 + this->_42*M._24 + this->_43*M._34 + this->_44*M._44;

  // return the product by value
  return m;
 #endif
}
// -------------------------------------------------------------------------------------------------------

Ich habe also die Matrix:

// ******************************************* struct "Matrix" *******************************************
// This structure represents a 128-bit aligned 4x4 row major matrix.
// Author: Samuel Lörtscher
// *******************************************************************************************************
__declspec(align(16)) struct Matrix{

durch float128:

typedef __m128           float128; // 128-bit (SSE) floating point variable

ersetzt und plötzlich funktioniert es auch im Release. Aber was zum Teufel...
Ich verstehe nur noch Bahnhof

Corsair*01

Ich glaube du hattest dich hier etwas im Forum vergriffen. XP

Ishildur

Ja vielleicht gehört es eher ins Assemblerforum aber ich dachte, dass die Spieleentwickler sich ja vielleicht sogar besser mit den Eigenheiten des SSE Instruktionssets auskennen? Ansonsten habe ich auch nichts dagegen einzuwenden, wenn es ins Assemblerforum verschoben würde.

rapso

allokierst du irgendwo etwas mit new, was aligned sein sollte? das ist beim normalen new nicht der fall. falls du glueck hast, stuerzt es ab; falls du pech hast, wird irgendwo der ptr angepasst und es kommt nur crap hinten raus.

dir bleibt nichts besseres als im release mode zu debuggen, fuehre einmal die sse und einmal die c++ version aus und vergleiche die werte und falls !=, machst du ein debug break.

Ishildur

@rapso
Ja also im Debug Mode habe ich sämtliche Adressen auf das 128bit Alignment hin überprüft und sämtliche Adressen enden mit 0, wie erforderlich. Auch habe ich das Disassembly sowohl von der Release als auch von der Debug Version verglichen und konnte auch da keine Unterschiede ausmachen, welche dieses Verhalten erklären. Die resultierenden Matrizen jedoch unterscheiden sich im Release Mode deutlich zwischen SSE und C++, im Debug Mode jedoch nicht...

Das mit dem New ist eine interessante Frage. Ich habe eben erst auf SSE umgestellt und habe bisher in der gesamten Engine nicht darauf geachtet.
Frage: Achtet der new operator auf __declspec(align(16)) oder funktioniert dies nur bei statisch alloziertem Speicher?

Falls nein, was findest du sinnvoller:
Variante 1:
Sämtliche SSE Funktionen erwarten aligned Daten und delegieren die Verantwortung an den Caller:
Vorteil: deutlich effizienter
Nachteil: deutlich höherer Memory Footprint sowie viel Arbeit, weil vermutlich die halbe Engine darauf angepasst werden muss.

Variante 2:
Verantwortung liegt innerhalb der Funktion:
Vorteil: geringerer Memoryfootprint sowie keine Anpassung der Engine erforderlich
Nachteil: sämtliche Daten müssen innerhalb der entsprechenden Funktion in jedem Frame neu aligned werden (was eigentlich immer in einem Kopiervorgang endet), vermutlich extreeeem ineffizient.

rapso

Ishildur schrieb:

@rapso
Ja also im Debug Mode habe ich sämtliche Adressen auf das 128bit Alignment hin überprüft und sämtliche Adressen enden mit 0, wie erforderlich.

wie schoen, dann wird es im debug ja sicherlich richtig laufen
wie gesagt, pruef die allokationen im release nach. wenn du von pixeln sprichst und diese mit new allokierst, ist es wahrscheinlich dass new dir keine alignte addresse zurueckliefert.

Auch habe ich das Disassembly sowohl von der Release als auch von der Debug Version verglichen und konnte auch da keine Unterschiede ausmachen, welche dieses Verhalten erklären. Die resultierenden Matrizen jedoch unterscheiden sich im Release Mode deutlich zwischen SSE und C++, im Debug Mode jedoch nicht...

schon beim input oder erst nach dem ausrechnen?

Ishildur

Kann ich nicht sagen, ich habe ja im Release Mode keinen Zugriff auf den Inhalt der XMM Register...