;
; sse_mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c)
;

global sse_mult_su3_nn
sse_mult_su3_nn:
	push		ebp
	mov		ebp,esp
	push		eax
	push		ebx
	push		ecx
	mov		eax,[ebp+8]			; su3_matrix *a
	mov		ebx,[ebp+12]			; su3_vector *b
	mov		ecx,[ebp+16]			; su3_vector *c

	; bring in "half wilson vector" (first two columns)
	movlps		xmm0,[ebx]			;			<(bb)->e[0][0]>
	movlps		xmm1,[ebx+24]			;			<(bb)->e[1][0]>
	movlps		xmm2,[ebx+48]			;			<(bb)->e[2][0]>
	movhps		xmm0,[ebx+8]			; b[1]1 b[0]1		<(bb)->e[0][1]>
	movhps		xmm1,[ebx+32]			; b[1]2 b[0]2		<(bb)->e[1][1]>
	movhps		xmm2,[ebx+56]			; b[1]3 b[0]3		<(bb)->e[2][1]>

	; multiply vectors by real components of a
	movss		xmm3,[eax]			; a(1,1)		<(aa)->e[0][0].real>
	movss		xmm6,[eax+8]			; a(1,2)		<(aa)->e[0][1].real>
	movss		xmm4,[eax+24]			; a(2,1)		<(aa)->e[1][0].real>
	movss		xmm7,[eax+40]			; a(2,3)		<(aa)->e[1][2].real>
	movss		xmm5,[eax+48]			; a(3,1)		<(aa)->e[2][0].real>
	shufps		xmm3,xmm3,0x00
	shufps		xmm6,xmm6,0x00
	shufps		xmm4,xmm4,0x00
	mulps		xmm3,xmm0
	shufps		xmm7,xmm7,0x00
	mulps		xmm6,xmm1
	shufps		xmm5,xmm5,0x00
	mulps		xmm4,xmm0
	addps		xmm3,xmm6
	mulps		xmm7,xmm2
	mulps		xmm5,xmm0
	addps		xmm4,xmm7
	movss		xmm6,[eax+56]			; a(3,2)		<(aa)->e[2][1].real>
	movss		xmm7,[eax+16]			; a(1,3)		<(aa)->e[0][2].real>
	shufps		xmm6,xmm6,0x00
	shufps		xmm7,xmm7,0x00
	mulps		xmm6,xmm1
	mulps		xmm7,xmm2
	addps		xmm5,xmm6
	addps		xmm3,xmm7
	movss		xmm6,[eax+32]			; a(2,2)		<(aa)->e[1][1].real>
	movss		xmm7,[eax+64]			; a(3,3)		<(aa)->e[2][2].real>
	shufps		xmm6,xmm6,0x00
	shufps		xmm7,xmm7,0x00
	mulps		xmm6,xmm1
	mulps		xmm7,xmm2
	addps		xmm4,xmm6
	addps		xmm5,xmm7
	
	; multiply vectors by imaginary components of a
	movss		xmm6,[eax+4]			; a(1,1)		<(aa)->e[0][0].imag>
	movss		xmm7,[eax+36]			; a(2,2)		<(aa)->e[1][1].imag>
	shufps		xmm0,xmm0,0xb1
	shufps		xmm1,xmm1,0xb1
	shufps		xmm2,xmm2,0xb1
	shufps		xmm6,xmm6,0x00
	shufps		xmm7,xmm7,0x00
	xorps		xmm0,[negate]			;			<_sse_sgn13>
	xorps		xmm1,[negate]			;			<_sse_sgn13>
	xorps		xmm2,[negate]			;			<_sse_sgn13>
	mulps		xmm6,xmm0
	mulps		xmm7,xmm1
	addps		xmm3,xmm6
	addps		xmm4,xmm7
	movss		xmm6,[eax+68]			; a(3,3)		<(aa)->e[2][2].imag>
	movss		xmm7,[eax+28]			; a(2,1)		<(aa)->e[1][0].imag>
	shufps		xmm6,xmm6,0x00
	shufps		xmm7,xmm7,0x00
	mulps		xmm6,xmm2
	mulps		xmm7,xmm0
	addps		xmm5,xmm6
	addps		xmm4,xmm7
	movss		xmm6,[eax+12]			; a(1,2)		<(aa)->e[0][1].imag>
	movss		xmm7,[eax+52]			; a(3,1)		<(aa)->e[2][0].imag>
	shufps		xmm6,xmm6,0x00
	shufps		xmm7,xmm7,0x00
	mulps		xmm6,xmm1
	mulps		xmm7,xmm0
	addps		xmm3,xmm6
	addps		xmm5,xmm7
	movss		xmm0,[eax+20]			; a(1,3)		<(aa)->e[0][2].imag>
	movss		xmm6,[eax+60]			; a(3,2)		<(aa)->e[2][1].imag>
	movss		xmm7,[eax+44]			; a(2,3)		<(aa)->e[1][2].imag>
	shufps		xmm0,xmm0,0x00
	shufps		xmm6,xmm6,0x00
	shufps		xmm7,xmm7,0x00
	mulps		xmm0,xmm2
	mulps		xmm6,xmm1
	mulps		xmm7,xmm2
	addps		xmm3,xmm0
	addps		xmm5,xmm6
	addps		xmm4,xmm7

	; move results into output
	movlps		[ecx],xmm3			;			<(cc)->e[0][0]>
	movlps		[ecx+24],xmm4			;			<(cc)->e[1][0]>
	movlps		[ecx+48],xmm5			;			<(cc)->e[2][0]>
	movhps		[ecx+8],xmm3			;			<(cc)->e[0][1]>
	movhps		[ecx+32],xmm4			;			<(cc)->e[1][1]>
	movhps		[ecx+56],xmm5			;			<(cc)->e[2][1]>

	;  Now handle 3rd column
	movlps		xmm0,[ebx+16]			; x,x,b0i,b0r		<(bb)->e[0][2]>
	movlps		xmm1,[ebx+40]			; x,x,b1i,b1r		<(bb)->e[1][2]>
	movlps		xmm2,[ebx+64]			; x,x,b2i,b2r		<(bb)->e[2][2]>
	shufps		xmm0,xmm0,0x44			; b0i,b0r,b0i,b0r
	shufps		xmm1,xmm1,0x44			; b1i,b1r,b1i,b1r
	shufps		xmm2,xmm2,0x44			; b2i,b2r,b2i,b2r

	; bring in real components of first two rows of matrix a
	movss		xmm3,[eax]			; x,x,x,c00r		<(aa)->e[0][0].real>
	movss		xmm7,[eax+24]			; x,x,x,c10r		<(aa)->e[1][0].real>
	shufps		xmm3,xmm7,0x00			; c10r,c10r,c00r,c00r
	movss		xmm4,[eax+8]			; x,x,x,c01r		<(aa)->e[0][1].real>
	movss		xmm7,[eax+32]			; x,x,x,c11r		<(aa)->e[1][1].real>
	shufps		xmm4,xmm7,0x00			; c11r,c11r,c01r,c01r
	mulps		xmm3,xmm0
	mulps		xmm4,xmm1
	addps		xmm3,xmm4
	movss		xmm5,[eax+16]			; x,x,x,c02r		<(aa)->e[0][2].real>
	movss		xmm7,[eax+40]			; x,x,x,c12r		<(aa)->e[1][2].real>
	shufps		xmm5,xmm7,0x00			; c12r,c12r,c02r,c02r
	mulps		xmm5,xmm2
	addps		xmm3,xmm5

	; special handling of the 3rd row of matrix a
	shufps		xmm1,xmm0,0x44			; b0i,b0r,b1i,b1r
	movss		xmm7,[eax+48]			; x,x,x,c20r		<(aa)->e[2][0].real>
	movss		xmm6,[eax+56]			; x,x,x,c21r		<(aa)->e[2][1].real>
	shufps		xmm6,xmm7,0x00			; c20r,c20r,c21r,c21r
	mulps		xmm6,xmm1

	; shuffle b vector for imaginary components of matrix a
	shufps		xmm0,xmm0,0xB1			; b0r,b0i,b0r,b0i
	xorps		xmm0,[negate]			; b0r,-b0i,b0r,-b0i	<_sse_sgn13>
	shufps		xmm1,xmm1,0x11			; b1r,b1i,b1r,b1i
	xorps		xmm1,[negate]			; b1r,-b1i,b1r,-b1i	<_sse_sgn13>
	shufps		xmm2,xmm2,0xB1			; b2r,b2i,b2r,b2i
	xorps		xmm2,[negate]			; b2r,-b2i,b2r,-b2i	<_sse_sgn13>

	; bring in imaginary components of first two rows of matrix b
	movss		xmm4,[eax+4]			; x,x,x,c00i		<(aa)->e[0][0].imag>
	movss		xmm7,[eax+28]			; x,x,x,c10i		<(aa)->e[1][0].imag>
	shufps		xmm4,xmm7,0x00			; c10i,c10i,c00i,c00i
	mulps		xmm4,xmm0
	addps		xmm3,xmm4
	movss		xmm5,[eax+12]			; x,x,x,c01i		<(aa)->e[0][1].imag>
	movss		xmm7,[eax+36]			; x,x,x,c11i		<(aa)->e[1][1].imag>
	shufps		xmm5,xmm7,0x00			; c11i,c11i,c01i,c01i
	mulps		xmm5,xmm1
	addps		xmm3,xmm5
	movss		xmm5,[eax+20]			; x,x,x,c02i		<(aa)->e[0][2].imag>
	movss		xmm7,[eax+44]			; x,x,x,c12i		<(aa)->e[1][2].imag>
	shufps		xmm5,xmm7,0x00			; c12i,c12i,c02i,c02i
	mulps		xmm5,xmm2
	addps		xmm3,xmm5			; d1i,d1r,d0i,d0r
	movlps		[ecx+16],xmm3			; store result		<(cc)->e[0][2]>
	movhps		[ecx+40],xmm3			;			<(cc)->e[1][2]>

	; more special handling of the 3rd row of matrix a
	shufps		xmm1,xmm0,0x44			; b0r,-b0i,b1r,-b1i
	movss		xmm7,[eax+52]			; x,x,x,c20i		<(aa)->e[2][0].imag>
	movss		xmm5,[eax+60]			; x,x,x,c21i		<(aa)->e[2][1].imag>
	shufps		xmm5,xmm7,0x00			; c20i,c20i,c21i,c21i
	mulps		xmm5,xmm1
	addps		xmm6,xmm5
	shufps		xmm2,xmm2,0xB4			; -b2i,b2r,b2r,-b2i
	xorps		xmm2,[neg2]			; b2i,b2r,b2r,-b2i	<_sse_sgn4>
	movlps		xmm7,[eax+64]			; x,x,c22i,c22r		<(aa)->e[2][2]>
	shufps		xmm7,xmm7,0x05			; c22r,c22r,c22i,c22i
	mulps		xmm7,xmm2
	addps		xmm6,xmm7
	movaps		xmm7,xmm6
	shufps		xmm7,xmm7,0xEE
	addps		xmm6,xmm7
	movlps		[ecx+64],xmm6			;			<(cc)->e[2][2]>

	
	; *******************************************************************	

here:	pop	ecx
	pop	ebx
	pop	eax
	mov	esp,ebp
	pop	ebp
	ret
	
	align		16
negate:	dd		0x80000000
	dd		0x00000000
	dd		0x80000000
	dd		0x00000000
	
        align           16
neg2:   dd              0x00000000
        dd              0x00000000
        dd              0x00000000
        dd              0x80000000
