;
; sse_mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, su3_vector *b1, 
;                                su3_vector *b2, su3_vector *b3, su3_vector *c)
; 
; Multiply an array of 4 su4 matrices, a[4], by an array of vectors, b[4], summing
; the result into an su3 vector c.
;

global sse_mult_su3_mat_vec_sum_4dir
sse_mult_su3_mat_vec_sum_4dir:
	push		ebp
	mov		ebp,esp
	push		eax
	push		ebx
	push		ecx
	mov		eax,[ebp+8]			; su3_matrix *a
	mov		ebx,[ebp+12]			; su3_vector *b
	mov		ecx,[ebp+28]			; su3_vector *c

	;  bring in real and imaginary b[0] vector
	movlps		xmm0,[ebx]			; x,x,b0i,b0r		<(bb0)->c[0]>
	movlps		xmm1,[ebx+8]			; x,x,b1i,b1r		<(bb0)->c[1]>
	movlps		xmm2,[ebx+16]			; x,x,b2i,b2r		<(bb0)->c[2]>
	shufps		xmm0,xmm0,0x44			; b0i,b0r,b0i,b0r
	shufps		xmm1,xmm1,0x44			; b1i,b1r,b1i,b1r
	shufps		xmm2,xmm2,0x44			; b2i,b2r,b2i,b2r

	; bring in real components of first two rows of matrix a[0]
	movss		xmm3,[eax]			; x,x,x,c00r		<(aa)[0].e[0][0].real>
	movss		xmm7,[eax+24]			; x,x,x,c10r		<(aa)[0].e[1][0].real>
	shufps		xmm3,xmm7,0x00			; c10r,c10r,c00r,c00r
	movss		xmm4,[eax+8]			; x,x,x,c01r		<(aa)[0].e[0][1].real>
	movss		xmm7,[eax+32]			; x,x,x,c11r		<(aa)[0].e[1][1].real>
	shufps		xmm4,xmm7,0x00			; c11r,c11r,c01r,c01r
	mulps		xmm3,xmm0
	mulps		xmm4,xmm1
	addps		xmm3,xmm4
	movss		xmm5,[eax+16]			; x,x,x,c02r		<(aa)[0].e[0][2].real>
	movss		xmm7,[eax+40]			; x,x,x,c12r		<(aa)[0].e[1][2].real>
	shufps		xmm5,xmm7,0x00			; c12r,c12r,c02r,c02r
	mulps		xmm5,xmm2
	addps		xmm3,xmm5
	
	; special handling of the 3rd row of matrix a[0]
	shufps		xmm1,xmm0,0x44			; b0i,b0r,b1i,b1r
	movss		xmm7,[eax+48]			; x,x,x,c20r		<(aa)[0].e[2][0].real>
	movss		xmm6,[eax+56]			; x,x,x,c21r		<(aa)[0].e[2][1].real>
	shufps		xmm6,xmm7,0x00			; c20r,c20r,c21r,c21r
	mulps		xmm6,xmm1

	; shuffle b vector for imaginary components of matrix a[0]
	shufps		xmm0,xmm0,0xB1			; b0r,b0i,b0r,b0i
	xorps		xmm0,[negate]			; b0r,-b0i,b0r,-b0i	<_sse_sgn13>
	shufps		xmm1,xmm1,0x11			; b1r,b1i,b1r,b1i
	xorps		xmm1,[negate]			; b1r,-b1i,b1r,-b1i	<_sse_sgn13>
	shufps		xmm2,xmm2,0xB1			; b2r,b2i,b2r,b2i
	xorps		xmm2,[negate]			; b2r,-b2i,b2r,-b2i	<_sse_sgn13>

	; bring in imaginary components of first two rows of matrix b[0]
	movss		xmm4,[eax+4]			; x,x,x,c00i		<(aa)[0].e[0][0].imag>
	movss		xmm7,[eax+28]			; x,x,x,c10i		<(aa)[0].e[1][0].imag>
	shufps		xmm4,xmm7,0x00			; c10i,c10i,c00i,c00i
	mulps		xmm4,xmm0
	addps		xmm3,xmm4
	movss		xmm5,[eax+12]			; x,x,x,c01i		<(aa)[0].e[0][1].imag>
	movss		xmm7,[eax+36]			; x,x,x,c11i		<(aa)[0].e[1][1].imag>
	shufps		xmm5,xmm7,0x00			; c11i,c11i,c01i,c01i
	mulps		xmm5,xmm1
	addps		xmm3,xmm5
	movss		xmm5,[eax+20]			; x,x,x,c02i		<(aa)[0].e[0][2].imag>
	movss		xmm7,[eax+44]			; x,x,x,c12i		<(aa)[0].e[1][2].imag>
	shufps		xmm5,xmm7,0x00			; c12i,c12i,c02i,c02i
	mulps		xmm5,xmm2
	addps		xmm3,xmm5			; d1i,d1r,d0i,d0r
	movups		[ecx],xmm3			; store result		<(cc)->c[0]>

	; more special handling of the 3rd row of matrix a[0]
	shufps		xmm1,xmm0,0x44			; b0r,-b0i,b1r,-b1i
	movss		xmm7,[eax+52]			; x,x,x,c20i		<(aa)[0].e[2][0].imag>
	movss		xmm5,[eax+60]			; x,x,x,c21i		<(aa)[0].e[2][1].imag>
	shufps		xmm5,xmm7,0x00			; c20i,c20i,c21i,c21i
	mulps		xmm5,xmm1
	addps		xmm6,xmm5
	shufps		xmm2,xmm2,0xB4			; -b2i,b2r,b2r,-b2i
	xorps		xmm2,[neg2]			; b2i,b2r,b2r,-b2i	<_sse_sgn4>
	movlps		xmm7,[eax+64]			; x,x,c22i,c22r		<(aa)[0].e[2][2]>
	shufps		xmm7,xmm7,0x05			; c22r,c22r,c22i,c22i
	mulps		xmm7,xmm2
	addps		xmm6,xmm7
	movaps		xmm7,xmm6
	shufps		xmm7,xmm7,0xEE
	addps		xmm6,xmm7
	movlps		[ecx+16],xmm6			;			<(cc)->c[2]>
	
	; *******************************************************************	

        mov             ebx,[ebp+16]                    ; su3_vector *b
        add             eax,72                          ; su3_matrix *a[1]
	;  bring in real and imaginary b[1] vector
	movlps		xmm0,[ebx]			; x,x,b0i,b0r		<(bb1)->c[0]>
	movlps		xmm1,[ebx+8]			; x,x,b1i,b1r		<(bb1)->c[1]>
	movlps		xmm2,[ebx+16]			; x,x,b2i,b2r		<(bb1)->c[2]>
	shufps		xmm0,xmm0,0x44			; b0i,b0r,b0i,b0r
	shufps		xmm1,xmm1,0x44			; b1i,b1r,b1i,b1r
	shufps		xmm2,xmm2,0x44			; b2i,b2r,b2i,b2r

	; bring in real components of first two rows of matrix a[1]
	movss		xmm3,[eax]			; x,x,x,c00r		<(aa)[1].e[0][0].real>
	movss		xmm7,[eax+24]			; x,x,x,c10r		<(aa)[1].e[1][0].real>
	shufps		xmm3,xmm7,0x00			; c10r,c10r,c00r,c00r
	movss		xmm4,[eax+8]			; x,x,x,c01r		<(aa)[1].e[0][1].real>
	movss		xmm7,[eax+32]			; x,x,x,c11r		<(aa)[1].e[1][1].real>
	shufps		xmm4,xmm7,0x00			; c11r,c11r,c01r,c01r
	mulps		xmm3,xmm0
	mulps		xmm4,xmm1
	addps		xmm3,xmm4
	movss		xmm5,[eax+16]			; x,x,x,c02r		<(aa)[1].e[0][2].real>
	movss		xmm7,[eax+40]			; x,x,x,c12r		<(aa)[1].e[1][2].real>
	shufps		xmm5,xmm7,0x00			; c12r,c12r,c02r,c02r
	mulps		xmm5,xmm2
	addps		xmm3,xmm5

	; special handling of the 3rd row of matrix a[1]
	shufps		xmm1,xmm0,0x44			; b0i,b0r,b1i,b1r
	movss		xmm7,[eax+48]			; x,x,x,c20r		<(aa)[1].e[2][0].real>
	movss		xmm6,[eax+56]			; x,x,x,c21r		<(aa)[1].e[2][1].real>
	shufps		xmm6,xmm7,0x00			; c20r,c20r,c21r,c21r
	mulps		xmm6,xmm1

	; shuffle b vector for imaginary components of matrix a[1]
	shufps		xmm0,xmm0,0xB1			; b0r,b0i,b0r,b0i
	xorps		xmm0,[negate]			; b0r,-b0i,b0r,-b0i	<_sse_sgn13>
	shufps		xmm1,xmm1,0x11			; b1r,b1i,b1r,b1i
	xorps		xmm1,[negate]			; b1r,-b1i,b1r,-b1i	<_sse_sgn13>
	shufps		xmm2,xmm2,0xB1			; b2r,b2i,b2r,b2i
	xorps		xmm2,[negate]			; b2r,-b2i,b2r,-b2i	<_sse_sgn13>

	; bring in imaginary components of first two rows of matrix b[1]
	movss		xmm4,[eax+4]			; x,x,x,c00i		<(aa)[1].e[0][0].imag>
	movss		xmm7,[eax+28]			; x,x,x,c10i		<(aa)[1].e[1][0].imag>
	shufps		xmm4,xmm7,0x00			; c10i,c10i,c00i,c00i
	mulps		xmm4,xmm0
	addps		xmm3,xmm4
	movss		xmm5,[eax+12]			; x,x,x,c01i		<(aa)[1].e[0][1].imag>
	movss		xmm7,[eax+36]			; x,x,x,c11i		<(aa)[1].e[1][1].imag>
	shufps		xmm5,xmm7,0x00			; c11i,c11i,c01i,c01i
	mulps		xmm5,xmm1
	addps		xmm3,xmm5
	movss		xmm5,[eax+20]			; x,x,x,c02i		<(aa)[1].e[0][2].imag>
	movss		xmm7,[eax+44]			; x,x,x,c12i		<(aa)[1].e[1][2].imag>
	shufps		xmm5,xmm7,0x00			; c12i,c12i,c02i,c02i
	mulps		xmm5,xmm2
	addps		xmm3,xmm5			; d1i,d1r,d0i,d0r
	movups		xmm7,[ecx]			;			<(cc)->c[0]>
	addps		xmm7,xmm3
	movups		[ecx],xmm7			; store result		<(cc)->c[0]>

	; more special handling of the 3rd row of matrix a[1]
	shufps		xmm1,xmm0,0x44			; b0r,-b0i,b1r,-b1i
	movss		xmm7,[eax+52]			; x,x,x,c20i		<(aa)[1].e[2][0].imag>
	movss		xmm5,[eax+60]			; x,x,x,c21i		<(aa)[1].e[2][1].imag>
	shufps		xmm5,xmm7,0x00			; c20i,c20i,c21i,c21i
	mulps		xmm5,xmm1
	addps		xmm6,xmm5
	shufps		xmm2,xmm2,0xB4			; -b2i,b2r,b2r,-b2i
	xorps		xmm2,[neg2]			; b2i,b2r,b2r,-b2i	<_sse_sgn4>
	movlps		xmm7,[eax+64]			; x,x,c22i,c22r		<(aa)[1].e[2][2]>
	shufps		xmm7,xmm7,0x05			; c22r,c22r,c22i,c22i
	mulps		xmm7,xmm2
	addps		xmm6,xmm7
	movaps		xmm7,xmm6
	shufps		xmm7,xmm7,0xEE
	addps		xmm6,xmm7
	movlps		xmm7,[ecx+16]			;			<(cc)->c[2]>
	addps		xmm7,xmm6
	movlps		[ecx+16],xmm7			;			<(cc)->c[2]>

	
	; *******************************************************************	


        mov             ebx,[ebp+20]                    ; su3_vector *b
        add             eax,72                          ; su3_matrix *a[1]
	;  bring in real and imaginary b[2] vector
	movlps		xmm0,[ebx]			; x,x,b0i,b0r		<(bb2)->c[0]>
	movlps		xmm1,[ebx+8]			; x,x,b1i,b1r		<(bb2)->c[1]>
	movlps		xmm2,[ebx+16]			; x,x,b2i,b2r		<(bb2)->c[2]>
	shufps		xmm0,xmm0,0x44			; b0i,b0r,b0i,b0r
	shufps		xmm1,xmm1,0x44			; b1i,b1r,b1i,b1r
	shufps		xmm2,xmm2,0x44			; b2i,b2r,b2i,b2r

	; bring in real components of first two rows of matrix a[2]
	movss		xmm3,[eax]			; x,x,x,c00r		<(aa)[2].e[0][0].real>
	movss		xmm7,[eax+24]			; x,x,x,c10r		<(aa)[2].e[1][0].real>
	shufps		xmm3,xmm7,0x00			; c10r,c10r,c00r,c00r
	movss		xmm4,[eax+8]			; x,x,x,c01r		<(aa)[2].e[0][1].real>
	movss		xmm7,[eax+32]			; x,x,x,c11r		<(aa)[2].e[1][1].real>
	shufps		xmm4,xmm7,0x00			; c11r,c11r,c01r,c01r
	mulps		xmm3,xmm0
	mulps		xmm4,xmm1
	addps		xmm3,xmm4
	movss		xmm5,[eax+16]			; x,x,x,c02r		<(aa)[2].e[0][2].real>
	movss		xmm7,[eax+40]			; x,x,x,c12r		<(aa)[2].e[1][2].real>
	shufps		xmm5,xmm7,0x00			; c12r,c12r,c02r,c02r
	mulps		xmm5,xmm2
	addps		xmm3,xmm5

	; special handling of the 3rd row of matrix a[2]
	shufps		xmm1,xmm0,0x44			; b0i,b0r,b1i,b1r
	movss		xmm7,[eax+48]			; x,x,x,c20r		<(aa)[2].e[2][0].real>
	movss		xmm6,[eax+56]			; x,x,x,c21r		<(aa)[2].e[2][1].real>
	shufps		xmm6,xmm7,0x00			; c20r,c20r,c21r,c21r
	mulps		xmm6,xmm1

	; shuffle b vector for imaginary components of matrix a[2]
	shufps		xmm0,xmm0,0xB1			; b0r,b0i,b0r,b0i
	xorps		xmm0,[negate]			; b0r,-b0i,b0r,-b0i	<_sse_sgn13>
	shufps		xmm1,xmm1,0x11			; b1r,b1i,b1r,b1i
	xorps		xmm1,[negate]			; b1r,-b1i,b1r,-b1i	<_sse_sgn13>
	shufps		xmm2,xmm2,0xB1			; b2r,b2i,b2r,b2i
	xorps		xmm2,[negate]			; b2r,-b2i,b2r,-b2i	<_sse_sgn13>

	; bring in imaginary components of first two rows of matrix b[2]
	movss		xmm4,[eax+4]			; x,x,x,c00i		<(aa)[2].e[0][0].imag>
	movss		xmm7,[eax+28]			; x,x,x,c10i		<(aa)[2].e[1][0].imag>
	shufps		xmm4,xmm7,0x00			; c10i,c10i,c00i,c00i
	mulps		xmm4,xmm0
	addps		xmm3,xmm4
	movss		xmm5,[eax+12]			; x,x,x,c01i		<(aa)[2].e[0][1].imag>
	movss		xmm7,[eax+36]			; x,x,x,c11i		<(aa)[2].e[1][1].imag>
	shufps		xmm5,xmm7,0x00			; c11i,c11i,c01i,c01i
	mulps		xmm5,xmm1
	addps		xmm3,xmm5
	movss		xmm5,[eax+20]			; x,x,x,c02i		<(aa)[2].e[0][2].imag>
	movss		xmm7,[eax+44]			; x,x,x,c12i		<(aa)[2].e[1][2].imag>
	shufps		xmm5,xmm7,0x00			; c12i,c12i,c02i,c02i
	mulps		xmm5,xmm2
	addps		xmm3,xmm5			; d1i,d1r,d0i,d0r
	movups		xmm7,[ecx]			;			<(cc)->c[0]>
	addps		xmm7,xmm3
	movups		[ecx],xmm7			; store result		<(cc)->c[0]>

	; more special handling of the 3rd row of matrix a[2]
	shufps		xmm1,xmm0,0x44			; b0r,-b0i,b1r,-b1i
	movss		xmm7,[eax+52]			; x,x,x,c20i		<(aa)[2].e[2][0].imag>
	movss		xmm5,[eax+60]			; x,x,x,c21i		<(aa)[2].e[2][1].imag>
	shufps		xmm5,xmm7,0x00			; c20i,c20i,c21i,c21i
	mulps		xmm5,xmm1
	addps		xmm6,xmm5
	shufps		xmm2,xmm2,0xB4			; -b2i,b2r,b2r,-b2i
	xorps		xmm2,[neg2]			; b2i,b2r,b2r,-b2i	<_sse_sgn4>
	movlps		xmm7,[eax+64]			; x,x,c22i,c22r		<(aa)[2].e[2][2]>
	shufps		xmm7,xmm7,0x05			; c22r,c22r,c22i,c22i
	mulps		xmm7,xmm2
	addps		xmm6,xmm7
	movaps		xmm7,xmm6
	shufps		xmm7,xmm7,0xEE
	addps		xmm6,xmm7
	movlps		xmm7,[ecx+16]			;			<(cc)->c[2]>
	addps		xmm7,xmm6
	movlps		[ecx+16],xmm7			;			<(cc)->c[2]>

	
	; *******************************************************************	

	

        mov             ebx,[ebp+24]                    ; su3_vector *b
        add             eax,72                          ; su3_matrix *a[1]
	;  bring in real and imaginary b[3] vector
	movlps		xmm0,[ebx]			; x,x,b0i,b0r		<(bb3)->c[0]>
	movlps		xmm1,[ebx+8]			; x,x,b1i,b1r		<(bb3)->c[1]>
	movlps		xmm2,[ebx+16]			; x,x,b2i,b2r		<(bb3)->c[2]>
	shufps		xmm0,xmm0,0x44			; b0i,b0r,b0i,b0r
	shufps		xmm1,xmm1,0x44			; b1i,b1r,b1i,b1r
	shufps		xmm2,xmm2,0x44			; b2i,b2r,b2i,b2r

	; bring in real components of first two rows of matrix a[3]
	movss		xmm3,[eax]			; x,x,x,c00r		<(aa)[3].e[0][0].real>
	movss		xmm7,[eax+24]			; x,x,x,c10r		<(aa)[3].e[1][0].real>
	shufps		xmm3,xmm7,0x00			; c10r,c10r,c00r,c00r
	movss		xmm4,[eax+8]			; x,x,x,c01r		<(aa)[3].e[0][1].real>
	movss		xmm7,[eax+32]			; x,x,x,c11r		<(aa)[3].e[1][1].real>
	shufps		xmm4,xmm7,0x00			; c11r,c11r,c01r,c01r
	mulps		xmm3,xmm0
	mulps		xmm4,xmm1
	addps		xmm3,xmm4
	movss		xmm5,[eax+16]			; x,x,x,c02r		<(aa)[3].e[0][2].real>
	movss		xmm7,[eax+40]			; x,x,x,c12r		<(aa)[3].e[1][2].real>
	shufps		xmm5,xmm7,0x00			; c12r,c12r,c02r,c02r
	mulps		xmm5,xmm2
	addps		xmm3,xmm5

	; special handling of the 3rd row of matrix a[3]
	shufps		xmm1,xmm0,0x44			; b0i,b0r,b1i,b1r
	movss		xmm7,[eax+48]			; x,x,x,c20r		<(aa)[3].e[2][0].real>
	movss		xmm6,[eax+56]			; x,x,x,c21r		<(aa)[3].e[2][1].real>
	shufps		xmm6,xmm7,0x00			; c20r,c20r,c21r,c21r
	mulps		xmm6,xmm1

	; shuffle b vector for imaginary components of matrix a[3]
	shufps		xmm0,xmm0,0xB1			; b0r,b0i,b0r,b0i
	xorps		xmm0,[negate]			; b0r,-b0i,b0r,-b0i	<_sse_sgn13>
	shufps		xmm1,xmm1,0x11			; b1r,b1i,b1r,b1i
	xorps		xmm1,[negate]			; b1r,-b1i,b1r,-b1i	<_sse_sgn13>
	shufps		xmm2,xmm2,0xB1			; b2r,b2i,b2r,b2i
	xorps		xmm2,[negate]			; b2r,-b2i,b2r,-b2i	<_sse_sgn13>

	; bring in imaginary components of first two rows of matrix b[3]
	movss		xmm4,[eax+4]			; x,x,x,c00i		<(aa)[3].e[0][0].imag>
	movss		xmm7,[eax+28]			; x,x,x,c10i		<(aa)[3].e[1][0].imag>
	shufps		xmm4,xmm7,0x00			; c10i,c10i,c00i,c00i
	mulps		xmm4,xmm0
	addps		xmm3,xmm4
	movss		xmm5,[eax+12]			; x,x,x,c01i		<(aa)[3].e[0][1].imag>
	movss		xmm7,[eax+36]			; x,x,x,c11i		<(aa)[3].e[1][1].imag>
	shufps		xmm5,xmm7,0x00			; c11i,c11i,c01i,c01i
	mulps		xmm5,xmm1
	addps		xmm3,xmm5
	movss		xmm5,[eax+20]			; x,x,x,c02i		<(aa)[3].e[0][2].imag>
	movss		xmm7,[eax+44]			; x,x,x,c12i		<(aa)[3].e[1][2].imag>
	shufps		xmm5,xmm7,0x00			; c12i,c12i,c02i,c02i
	mulps		xmm5,xmm2
	addps		xmm3,xmm5			; d1i,d1r,d0i,d0r
	movups		xmm7,[ecx]			;			<(cc)->c[0]>
	addps		xmm7,xmm3
	movups		[ecx],xmm7			; store result		<(cc)->c[0]>

	; more special handling of the 3rd row of matrix a[3]
	shufps		xmm1,xmm0,0x44			; b0r,-b0i,b1r,-b1i
	movss		xmm7,[eax+52]			; x,x,x,c20i		<(aa)[3].e[2][0].imag>
	movss		xmm5,[eax+60]			; x,x,x,c21i		<(aa)[3].e[2][1].imag>
	shufps		xmm5,xmm7,0x00			; c20i,c20i,c21i,c21i
	mulps		xmm5,xmm1
	addps		xmm6,xmm5
	shufps		xmm2,xmm2,0xB4			; -b2i,b2r,b2r,-b2i
	xorps		xmm2,[neg2]			; b2i,b2r,b2r,-b2i	<_sse_sgn4>
	movlps		xmm7,[eax+64]			; x,x,c22i,c22r		<(aa)[3].e[2][2]>
	shufps		xmm7,xmm7,0x05			; c22r,c22r,c22i,c22i
	mulps		xmm7,xmm2
	addps		xmm6,xmm7
	movaps		xmm7,xmm6
	shufps		xmm7,xmm7,0xEE
	addps		xmm6,xmm7
	movlps		xmm7,[ecx+16]			;			<(cc)->c[2]>
	addps		xmm7,xmm6
	movlps		[ecx+16],xmm7			;			<(cc)->c[2]>
	
	; *******************************************************************	

here:	pop	ecx
	pop	ebx
	pop	eax
	mov	esp,ebp
	pop	ebp
	ret
	
	align		16
negate:	dd		0x80000000
	dd		0x00000000
	dd		0x80000000
	dd		0x00000000
	
	align		16
neg2:   dd		0x00000000
	dd		0x00000000
	dd		0x00000000
	dd		0x80000000
	
