/*
 * _inline_sse_su3_projector(aa, bb, cc)
 *
 * SSE kernel that fills the 3x3 complex matrix *cc from the two 3-component
 * complex vectors *aa and *bb.
 *
 *   Reads : (aa)->c[0..2], (bb)->c[0..2]  (each with .real / .imag floats),
 *           _sse_sgn24 and _sse_sgn4 (sign masks defined elsewhere).
 *   Writes: (cc)->e[i][0] for i = 0..2 with a 16-byte movups each (so the
 *           8 bytes following e[i][0] are written too -- presumably e[i][1];
 *           confirm against the matrix type), plus e[0][2], e[1][2], e[2][2]
 *           with 8-byte stores.
 *   Clobbers: xmm0-xmm3 and memory.
 *
 * NOTE(review): if _sse_sgn24 flips the sign bit of lanes 1 and 3 and
 * _sse_sgn4 flips lane 3 (their definitions are not visible here -- confirm),
 * the result is cc->e[i][j] = aa->c[i] * conj(bb->c[j]).
 *
 * Both masks are used as xorps memory operands and therefore must be
 * 16-byte aligned.
 *
 * Everything lives in ONE asm statement: xmm0/xmm1 are loaded once and
 * reused by later steps, and the compiler gives no guarantee that registers
 * survive between separate asm statements or that such statements stay
 * adjacent.  The "memory" clobber covers the movups stores, which are wider
 * than the single e[i][0] element named as the output operand.
 */
#define _inline_sse_su3_projector(aa,bb,cc) \
{ \
__asm__ __volatile__ ( \
    /* xmm0 = (b0.re, b0.im, b1.re, b1.im) */ \
    "movlps %[b_c0], %%xmm0 \n\t" \
    "movhps %[b_c1], %%xmm0 \n\t" \
    /* xmm1 = xmm0 with re/im swapped inside each pair, then sign-masked */ \
    "movaps %%xmm0, %%xmm1 \n\t" \
    "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
    "xorps %[sgn24], %%xmm1 \n\t" \
    /* row 0, columns 0-1: a0.re * xmm0 + a0.im * xmm1, sign-masked */ \
    "movss %[a_re0], %%xmm2 \n\t" \
    "shufps $0x00, %%xmm2, %%xmm2 \n\t" \
    "movss %[a_im0], %%xmm3 \n\t" \
    "shufps $0x00, %%xmm3, %%xmm3 \n\t" \
    "mulps %%xmm0, %%xmm2 \n\t" \
    "mulps %%xmm1, %%xmm3 \n\t" \
    "addps %%xmm3, %%xmm2 \n\t" \
    "xorps %[sgn24], %%xmm2 \n\t" \
    "movups %%xmm2, %[dst_e00] \n\t" \
    /* row 1, columns 0-1: same recipe with a1 */ \
    "movss %[a_re1], %%xmm2 \n\t" \
    "shufps $0x00, %%xmm2, %%xmm2 \n\t" \
    "movss %[a_im1], %%xmm3 \n\t" \
    "shufps $0x00, %%xmm3, %%xmm3 \n\t" \
    "mulps %%xmm0, %%xmm2 \n\t" \
    "mulps %%xmm1, %%xmm3 \n\t" \
    "addps %%xmm3, %%xmm2 \n\t" \
    "xorps %[sgn24], %%xmm2 \n\t" \
    "movups %%xmm2, %[dst_e10] \n\t" \
    /* row 2, columns 0-1: same recipe with a2 */ \
    "movss %[a_re2], %%xmm2 \n\t" \
    "shufps $0x00, %%xmm2, %%xmm2 \n\t" \
    "movss %[a_im2], %%xmm3 \n\t" \
    "shufps $0x00, %%xmm3, %%xmm3 \n\t" \
    "mulps %%xmm0, %%xmm2 \n\t" \
    "mulps %%xmm1, %%xmm3 \n\t" \
    "addps %%xmm3, %%xmm2 \n\t" \
    "xorps %[sgn24], %%xmm2 \n\t" \
    "movups %%xmm2, %[dst_e20] \n\t" \
    /* column 2, rows 0-1: roles swapped -- xmm0 = (a0, a1), scalars from b2; */ \
    /* no trailing sign mask is applied in this step */ \
    "movlps %[a_c0], %%xmm0 \n\t" \
    "movhps %[a_c1], %%xmm0 \n\t" \
    "movaps %%xmm0, %%xmm1 \n\t" \
    "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \
    "xorps %[sgn24], %%xmm1 \n\t" \
    "movss %[b_re2], %%xmm2 \n\t" \
    "shufps $0x00, %%xmm2, %%xmm2 \n\t" \
    "movss %[b_im2], %%xmm3 \n\t" \
    "shufps $0x00, %%xmm3, %%xmm3 \n\t" \
    "mulps %%xmm0, %%xmm2 \n\t" \
    "mulps %%xmm1, %%xmm3 \n\t" \
    "addps %%xmm3, %%xmm2 \n\t" \
    "movlps %%xmm2, %[dst_e02] \n\t" \
    "movhps %%xmm2, %[dst_e12] \n\t" \
    /* element [2][2]: all four partial products of a2 and b2 in one mulps, */ \
    /* then shuffled so the upper half of xmm2 holds the summed pair */ \
    "movlps %[a_c2], %%xmm0 \n\t" \
    "shufps $0x14, %%xmm0, %%xmm0 \n\t" \
    "movlps %[b_c2], %%xmm2 \n\t" \
    "shufps $0x44, %%xmm2, %%xmm2 \n\t" \
    "xorps %[sgn4], %%xmm2 \n\t" \
    "mulps %%xmm0, %%xmm2 \n\t" \
    "movaps %%xmm2, %%xmm1 \n\t" \
    "shufps $0xd4, %%xmm1, %%xmm1 \n\t" \
    "shufps $0x8c, %%xmm2, %%xmm2 \n\t" \
    "addps %%xmm1, %%xmm2 \n\t" \
    "movhps %%xmm2, %[dst_e22]" \
    : \
    [dst_e00] "=m" ((cc)->e[0][0]), \
    [dst_e10] "=m" ((cc)->e[1][0]), \
    [dst_e20] "=m" ((cc)->e[2][0]), \
    [dst_e02] "=m" ((cc)->e[0][2]), \
    [dst_e12] "=m" ((cc)->e[1][2]), \
    [dst_e22] "=m" ((cc)->e[2][2]) \
    : \
    [a_c0] "m" ((aa)->c[0]), \
    [a_c1] "m" ((aa)->c[1]), \
    [a_c2] "m" ((aa)->c[2]), \
    [a_re0] "m" ((aa)->c[0].real), \
    [a_im0] "m" ((aa)->c[0].imag), \
    [a_re1] "m" ((aa)->c[1].real), \
    [a_im1] "m" ((aa)->c[1].imag), \
    [a_re2] "m" ((aa)->c[2].real), \
    [a_im2] "m" ((aa)->c[2].imag), \
    [b_c0] "m" ((bb)->c[0]), \
    [b_c1] "m" ((bb)->c[1]), \
    [b_c2] "m" ((bb)->c[2]), \
    [b_re2] "m" ((bb)->c[2].real), \
    [b_im2] "m" ((bb)->c[2].imag), \
    [sgn24] "m" (_sse_sgn24), \
    [sgn4] "m" (_sse_sgn4) \
    : \
    "xmm0", "xmm1", "xmm2", "xmm3", "memory"); \
}