;-----------------------------------------------------------------------
; sse_mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c)
;
; Complex 3x3 matrix times complex 3-vector, single precision:
;
;       c[i] = a[i][0]*b[0] + a[i][1]*b[1] + a[i][2]*b[2]     (i = 0..2)
;
; Syntax : NASM (Intel operand order), 32-bit x86 with SSE.
; In     : three pointers passed on the stack; after the prologue they
;          are read from [ebp+8] (a), [ebp+12] (b) and [ebp+16] (c).
; Layout : as accessed here, every complex number is two 32-bit floats
;          (re, im).  a is read as 3 rows of 3 complex numbers, rows
;          24 bytes apart (72 bytes total); 24 bytes are read from b
;          and 24 bytes are written to c.
; Align  : a, b and c need no particular alignment (movups / movhps).
; Alias  : b is fully loaded before the first store, so c may alias b.
;          c must not overlap the third row of a, which is read after
;          c[0] and c[1] have been stored.
; Out    : nothing is returned; eax, ebx, ecx and ebp are restored.
; Clobb  : xmm0-xmm7.
;
; Lane notation used in the comments: a register is written
; "lane3, lane2, lane1, lane0" (highest lane first); lane0 is the float
; at the lowest memory address.  "ar.bi" means Re(a)*Im(b), and so on.
;-----------------------------------------------------------------------

%define SU3_ROW_STRIDE  24              ; bytes per matrix row: 3 x (re, im) floats

;-----------------------------------------------------------------------
; SU3_ROW_PARTIALS rowoff, acc
;
; Forms the four partial sums of one matrix row dotted with b.
;
;   %1 (rowoff) : byte offset of the row from eax
;   %2 (acc)    : xmm register that receives the result
;
; Expects : eax  = matrix base
;           xmm0 = b1i, b1r, b0i, b0r
;           xmm1 = b1r, b1i, b0r, b0i
;           xmm2 = b2i, b2r, b2r, b2i
; Result  : acc  = sum(ai.bi), sum(ar.br), sum(ai.br), sum(ar.bi)
;           (each sum runs over the three columns of the row)
; Scratch : xmm3, xmm5, xmm6
;-----------------------------------------------------------------------
%macro SU3_ROW_PARTIALS 2
        movups  xmm3, [eax + (%1)]      ; a1i, a1r, a0i, a0r
        movups  %2, [eax + (%1) + 8]    ; a2i, a2r, a1i, a1r
        shufps  %2, %2, 0xEE            ; a2i, a2r, a2i, a2r

        movaps  xmm5, xmm3              ; second copy of a1/a0 for the cross terms
        mulps   xmm3, xmm0              ; a1i.b1i, a1r.b1r, a0i.b0i, a0r.b0r
        mulps   xmm5, xmm1              ; a1i.b1r, a1r.b1i, a0i.b0r, a0r.b0i
        mulps   %2, xmm2                ; a2i.b2i, a2r.b2r, a2i.b2r, a2r.b2i

        ; Regroup the column-0 and column-1 products so that every lane
        ; holds the same kind of product as the matching lane of acc.
        movaps  xmm6, xmm5
        shufps  xmm6, xmm3, 0x4E        ; a0i.b0i, a0r.b0r, a1i.b1r, a1r.b1i
        shufps  xmm5, xmm3, 0xE4        ; a1i.b1i, a1r.b1r, a0i.b0r, a0r.b0i

        addps   %2, xmm5
        addps   %2, xmm6                ; sum(ai.bi), sum(ar.br), sum(ai.br), sum(ar.bi)
%endmacro

        global  sse_mult_su3_mat_vec

sse_mult_su3_mat_vec:
        push    ebp
        mov     ebp, esp
        ; eax and ecx are saved as well as ebx, so all three pointer
        ; registers come back unchanged to the caller.
        push    eax
        push    ebx
        push    ecx

        mov     eax, [ebp + 8]          ; su3_matrix *a
        mov     ebx, [ebp + 12]         ; su3_vector *b
        mov     ecx, [ebp + 16]         ; su3_vector *c

        ; ---- load b once, in the three arrangements the rows need ----
        movups  xmm0, [ebx]             ; b1i, b1r, b0i, b0r
        movaps  xmm1, xmm0
        shufps  xmm1, xmm1, 0xB1        ; b1r, b1i, b0r, b0i  (re/im swapped per pair)
        movhps  xmm2, [ebx + 16]        ; b2i, b2r, x, x  (low half is don't-care)
        shufps  xmm2, xmm2, 0xEB        ; b2i, b2r, b2r, b2i  (uses only the high half)

        ; ---- rows 0 and 1 -------------------------------------------
        SU3_ROW_PARTIALS 0, xmm4
        SU3_ROW_PARTIALS SU3_ROW_STRIDE, xmm7

        ; xmm4: ii[0], rr[0], ir[0], ri[0]
        ; xmm7: ii[1], rr[1], ir[1], ri[1]
        ; Interleave both rows so one add yields (re, im) for c[0], c[1].
        movaps  xmm5, xmm4
        shufps  xmm5, xmm7, 0x22        ; ri[1], rr[1], ri[0], rr[0]
        shufps  xmm4, xmm7, 0x77        ; ir[1], ii[1], ir[0], ii[0]
        xorps   xmm4, [negate]          ; flip sign of the ii lanes: Re = rr - ii
        addps   xmm5, xmm4              ; c1i, c1r, c0i, c0r
        movups  [ecx], xmm5

        ; ---- row 2 --------------------------------------------------
        SU3_ROW_PARTIALS 2*SU3_ROW_STRIDE, xmm4

        ; xmm4: ii, rr, ir, ri
        movaps  xmm5, xmm4
        shufps  xmm4, xmm4, 0x77        ; ir, ii, ir, ii
        shufps  xmm5, xmm5, 0x22        ; ri, rr, ri, rr
        xorps   xmm4, [negate]          ; flip sign of the ii lanes
        addps   xmm5, xmm4              ; c2i, c2r, c2i, c2r
        movhps  [ecx + 16], xmm5        ; store one (re, im) pair as c[2]

        ; ---- epilogue -----------------------------------------------
        pop     ecx
        pop     ebx
        pop     eax
        mov     esp, ebp
        pop     ebp
        ret

        ; Sign-bit mask for lanes 0 and 2 (the ii lanes after the 0x77
        ; shuffles above).  It is used as a memory operand of xorps,
        ; which requires a 16-byte aligned address, hence the align.
        align   16
negate:
        dd      0x80000000
        dd      0x00000000
        dd      0x80000000
        dd      0x00000000