;-----------------------------------------------------------------------
; sse_mult_adj_su3_mat_vec_4dir(su3_matrix *a, su3_vector *b, su3_vector *c)
;
; For k = 0..3 computes  c[k] = adjoint(a[k]) * b, i.e.
;       c[k].c[i] = sum over j of conj(a[k].e[j][i]) * b->c[j]
;
; Syntax: NASM (Intel operand order), 32-bit code, packed single-precision SSE.
;
; In (stack-passed, read relative to ebp after the prologue):
;       [ebp+8]   a : address of the first of four matrices.  The code reads
;                     the matrix data directly at this address and steps
;                     72 bytes to reach the next one, so the four matrices
;                     are taken to be stored back to back (not an array of
;                     pointers).
;       [ebp+12]  b : address of the input vector (24 bytes are read).
;       [ebp+16]  c : address of the first of four result vectors, stepped
;                     by 24 bytes, so also stored back to back.
; Layout used by the offsets below: a complex number is {real, imag} as two
; 4-byte floats; a matrix is 3 rows of 3 complex numbers, row after row;
; a vector is 3 complex numbers.
;
; Out:   c[0..3] written.  No return value is produced.
; Saved: eax, ebx, ecx, ebp are restored before returning.
; Clobb: xmm0-xmm7, flags (the pointer increments use add).
; Align: a, b and c are only accessed with movups/movlps/movhps, which have
;        no alignment requirement.  The sign mask at negate is used as a
;        memory operand of xorps and therefore must be 16-byte aligned.
;
; Lane lists in the comments are written high lane first
; (lane3, lane2, lane1, lane0).  "ar/ai" are the real/imag parts of a matrix
; element, "br/bi" of a vector element.
;-----------------------------------------------------------------------

; Byte sizes implied by the addressing in this routine.
%define AMV4_COMPLEX_BYTES      8
%define AMV4_ROW_BYTES          24
%define AMV4_MATRIX_BYTES       72
%define AMV4_VECTOR_BYTES       24

;-----------------------------------------------------------------------
; AMV4_COLUMN_SUMS  col, acc
;       col : column index (0, 1 or 2) of the matrix at [eax]
;       acc : xmm register that receives the result (xmm4 or xmm7 here)
; In:   eax  = address of the current matrix
;       xmm0 = b1i, b1r, b0i, b0r
;       xmm1 = b1r, b1i, b0r, b0i
;       xmm2 = b2i, b2r, b2r, b2i
; Out:  acc  = sum(ai*bi), sum(ar*br), sum(ai*br), sum(ar*bi)
;              where each sum runs over the three rows of the column.
; Clobb: xmm3, xmm5, xmm6
;-----------------------------------------------------------------------
%macro AMV4_COLUMN_SUMS 2
        ; Gather the column: rows 0 and 1 into xmm3, row 2 into acc.
        movlps  xmm3, [eax + 0*AMV4_ROW_BYTES + %1*AMV4_COMPLEX_BYTES]
        movhps  xmm3, [eax + 1*AMV4_ROW_BYTES + %1*AMV4_COMPLEX_BYTES]
        movhps  %2,   [eax + 2*AMV4_ROW_BYTES + %1*AMV4_COMPLEX_BYTES]
        ; Only the high half of acc was loaded; the shuffle reads lanes 2
        ; and 3 exclusively, so the stale low half never reaches the result.
        shufps  %2, %2, 0xEE            ; acc  = a2i, a2r, a2i, a2r

        movaps  xmm5, xmm3              ; xmm3 = a1i, a1r, a0i, a0r
        mulps   xmm3, xmm0              ; xmm3 = a1i*b1i, a1r*b1r, a0i*b0i, a0r*b0r
        mulps   xmm5, xmm1              ; xmm5 = a1i*b1r, a1r*b1i, a0i*b0r, a0r*b0i
        mulps   %2, xmm2                ; acc  = a2i*b2i, a2r*b2r, a2i*b2r, a2r*b2i

        ; Line up like products in the same lanes as acc.
        movaps  xmm6, xmm5
        shufps  xmm6, xmm3, 0x4E        ; xmm6 = a0i*b0i, a0r*b0r, a1i*b1r, a1r*b1i
        shufps  xmm5, xmm3, 0xE4        ; xmm5 = a1i*b1i, a1r*b1r, a0i*b0r, a0r*b0i
        addps   %2, xmm5
        addps   %2, xmm6
%endmacro

;-----------------------------------------------------------------------
; AMV4_ONE_MATRIX
; In:   eax = address of the current matrix, ecx = address of the current
;       result vector, xmm0/xmm1/xmm2 prepared as for AMV4_COLUMN_SUMS.
; Out:  the three complex results are stored at [ecx] .. [ecx+23].
; Clobb: xmm3-xmm7
;-----------------------------------------------------------------------
%macro AMV4_ONE_MATRIX 0
        AMV4_COLUMN_SUMS 0, xmm4
        AMV4_COLUMN_SUMS 1, xmm7

        ; Results 0 and 1 together:  real = sum(ar*br) + sum(ai*bi)
        ;                            imag = sum(ar*bi) - sum(ai*br)
        movaps  xmm5, xmm4
        shufps  xmm5, xmm7, 0x22        ; xmm5 = ar*bi[1], ar*br[1], ar*bi[0], ar*br[0]
        shufps  xmm4, xmm7, 0x77        ; xmm4 = ai*br[1], ai*bi[1], ai*br[0], ai*bi[0]
        xorps   xmm4, [negate]          ; flip the sign of lanes 1 and 3 (the ai*br sums)
        addps   xmm5, xmm4              ; xmm5 = c1.imag, c1.real, c0.imag, c0.real
        movups  [ecx], xmm5             ; store result components 0 and 1

        AMV4_COLUMN_SUMS 2, xmm4

        ; Result 2: same combination, produced in both halves of xmm5.
        movaps  xmm5, xmm4
        shufps  xmm4, xmm4, 0x77        ; xmm4 = ai*br, ai*bi, ai*br, ai*bi
        shufps  xmm5, xmm5, 0x22        ; xmm5 = ar*bi, ar*br, ar*bi, ar*br
        xorps   xmm4, [negate]
        addps   xmm5, xmm4              ; xmm5 = c2.imag, c2.real, c2.imag, c2.real
        movhps  [ecx + 2*AMV4_COMPLEX_BYTES], xmm5      ; store result component 2
%endmacro

        global  sse_mult_adj_su3_mat_vec_4dir

sse_mult_adj_su3_mat_vec_4dir:
        push    ebp
        mov     ebp, esp
        push    eax
        push    ebx
        push    ecx

        mov     eax, [ebp+8]            ; eax = a, address of matrix 0
        mov     ebx, [ebp+12]           ; ebx = b, address of the input vector
        mov     ecx, [ebp+16]           ; ecx = c, address of result vector 0

        ; Prepare the input vector once; it is reused for all four matrices.
        movups  xmm0, [ebx]             ; xmm0 = b1i, b1r, b0i, b0r
        movaps  xmm1, xmm0
        shufps  xmm1, xmm1, 0xB1        ; xmm1 = b1r, b1i, b0r, b0i
        movups  xmm2, [ebx + AMV4_COMPLEX_BYTES] ; b2i, b2r, b1i, b1r
        shufps  xmm2, xmm2, 0xEB        ; xmm2 = b2i, b2r, b2r, b2i

        ; Matrix 0 uses the pointers as passed in.
        AMV4_ONE_MATRIX

        ; Matrices 1..3: advance both pointers by one element, then repeat.
%rep 3
        add     eax, AMV4_MATRIX_BYTES
        add     ecx, AMV4_VECTOR_BYTES
        AMV4_ONE_MATRIX
%endrep

        ; Exit path.  Nothing in this routine jumps to the label; it is kept
        ; because it is a file-scope name that other code may refer to.
here:
        pop     ecx
        pop     ebx
        pop     eax
        mov     esp, ebp
        pop     ebp
        ret

        ; Sign-bit mask for lanes 1 and 3.  xorps with it negates those two
        ; lanes and leaves lanes 0 and 2 unchanged.
        align   16
negate:
        dd      0x00000000
        dd      0x80000000
        dd      0x00000000
        dd      0x80000000