?
Ausgehend von deinem letzten Thread hier mal ein Algorithmus (SSE3: haddps, MASM-Syntax) zur Multiplikation von 4x4-Matrizen (single precision). Die Matrizen müssen auf 16 Byte ausgerichtet sein (align 16), damit es klappt:
;-----------------------------------------------------------------------
; mul4x4 - multiply two 4x4 single-precision float matrices (SSE3).
;
; Syntax:  MASM, 32-bit x86. The calling convention is whatever the
;          enclosing .model / OPTION LANGUAGE selects for PROC.
;
; In:      pmA    -> 16 floats, left operand
;          pmB    -> 16 floats, right operand
;          pmDest -> 16 floats, receives the product
;
; Layout:  every consecutive group of four floats is treated as one
;          row, i.e. Dest[i][j] = sum over k of A[i][k] * B[k][j].
;          (For column-major data the same code yields Dest = B * A.)
;
; Align:   pmA, pmB and pmDest must all be 16-byte aligned, otherwise
;          movaps faults. Alternatively replace every movaps that has
;          a memory operand with movups.
;
; Alias:   pmDest may be identical to pmA or to pmB: B is loaded
;          completely before the first store, and each row of A is
;          loaded before the matching row of Dest is written.
;          Partially overlapping buffers are not supported.
;
; Clobb:   eax, ecx, edx, xmm0-xmm7. No meaningful return value.
; Needs:   SSE3 (haddps).
;
; Register roles after the transpose:
;          xmm0..xmm3 = columns 0..3 of B (live until the last row)
;          xmm4..xmm7 = per-row scratch
;-----------------------------------------------------------------------
mul4x4 proc pmA:PVOID,pmB:PVOID,pmDest:PVOID
        mov      eax,pmA
        mov      edx,pmB
        mov      ecx,pmDest

        ; ---- load B and transpose it so that each register holds one
        ;      column; a column can then be multiplied lane by lane
        ;      with a row of A.
        movaps   xmm0,OWORD ptr [edx+0*16]      ; b00 b01 b02 b03
        movaps   xmm6,OWORD ptr [edx+1*16]      ; b10 b11 b12 b13
        movaps   xmm4,OWORD ptr [edx+2*16]      ; b20 b21 b22 b23
        movaps   xmm7,OWORD ptr [edx+3*16]      ; b30 b31 b32 b33
        movaps   xmm2,xmm0
        movaps   xmm5,xmm4
        unpcklps xmm0,xmm6                      ; b00 b10 b01 b11
        unpcklps xmm4,xmm7                      ; b20 b30 b21 b31
        unpckhps xmm2,xmm6                      ; b02 b12 b03 b13
        unpckhps xmm5,xmm7                      ; b22 b32 b23 b33
        movaps   xmm1,xmm0
        movaps   xmm3,xmm2
        unpcklpd xmm0,xmm4                      ; column 0: b00 b10 b20 b30
        unpckhpd xmm1,xmm4                      ; column 1: b01 b11 b21 b31
        unpcklpd xmm2,xmm5                      ; column 2: b02 b12 b22 b32
        unpckhpd xmm3,xmm5                      ; column 3: b03 b13 b23 b33

        ; ---- rows 0..2 of Dest.
        ; xmm4..xmm7 = row * column j (lane-wise products). The three
        ; haddps form a reduction tree: the first two leave pair sums
        ; (x0+x1, x2+x3) of two product vectors each, the third adds
        ; those pairs, giving [dot0 dot1 dot2 dot3] directly.

        ; row 0
        movaps   xmm4,OWORD ptr [eax+0*16]
        movaps   xmm5,xmm4
        movaps   xmm6,xmm4
        movaps   xmm7,xmm4
        mulps    xmm4,xmm0
        mulps    xmm5,xmm1
        mulps    xmm6,xmm2
        mulps    xmm7,xmm3
        haddps   xmm4,xmm5                      ; pair sums of products 0,1
        haddps   xmm6,xmm7                      ; pair sums of products 2,3
        haddps   xmm4,xmm6                      ; d00 d01 d02 d03
        movaps   OWORD ptr [ecx+0*16],xmm4

        ; row 1
        movaps   xmm4,OWORD ptr [eax+1*16]
        movaps   xmm5,xmm4
        movaps   xmm6,xmm4
        movaps   xmm7,xmm4
        mulps    xmm4,xmm0
        mulps    xmm5,xmm1
        mulps    xmm6,xmm2
        mulps    xmm7,xmm3
        haddps   xmm4,xmm5
        haddps   xmm6,xmm7
        haddps   xmm4,xmm6                      ; d10 d11 d12 d13
        movaps   OWORD ptr [ecx+1*16],xmm4

        ; row 2
        movaps   xmm4,OWORD ptr [eax+2*16]
        movaps   xmm5,xmm4
        movaps   xmm6,xmm4
        movaps   xmm7,xmm4
        mulps    xmm4,xmm0
        mulps    xmm5,xmm1
        mulps    xmm6,xmm2
        mulps    xmm7,xmm3
        haddps   xmm4,xmm5
        haddps   xmm6,xmm7
        haddps   xmm4,xmm6                      ; d20 d21 d22 d23
        movaps   OWORD ptr [ecx+2*16],xmm4

        ; ---- row 3: the columns of B are not needed afterwards, so
        ;      multiply into them in place and skip the three copies.
        movaps   xmm4,OWORD ptr [eax+3*16]
        mulps    xmm0,xmm4
        mulps    xmm1,xmm4
        mulps    xmm2,xmm4
        mulps    xmm3,xmm4
        haddps   xmm0,xmm1
        haddps   xmm2,xmm3
        haddps   xmm0,xmm2                      ; d30 d31 d32 d33
        movaps   OWORD ptr [ecx+3*16],xmm0

        ret
mul4x4 endp