/******************************************************************************* * * File sse.h * * Macros for Dirac spinors, SU(3) vectors and SU(3) matrices using * inline assembly SSE and SSE2 instructions * * Needs gcc version 2.95.2 or later, and binutils snapshot 010122 or later * if the SSE2 instructions are used * * Version: 2.1 * Author: Martin Luescher * Date: 15.03.2001 * *******************************************************************************/ typedef struct { float c1,c2,c3,c4; } sse_float __attribute__ ((aligned (16))); typedef struct { sse_float c1,c2,c3; } sse_vector __attribute__ ((aligned (16))); static sse_float _sse_sgn12 __attribute__ ((unused)) ={-1.0f,-1.0f,1.0f,1.0f}; static sse_float _sse_sgn13 __attribute__ ((unused)) ={-1.0f,1.0f,-1.0f,1.0f}; static sse_float _sse_sgn14 __attribute__ ((unused)) ={-1.0f,1.0f,1.0f,-1.0f}; static sse_float _sse_sgn23 __attribute__ ((unused)) ={1.0f,-1.0f,-1.0f,1.0f}; static sse_float _sse_sgn24 __attribute__ ((unused)) ={1.0f,-1.0f,1.0f,-1.0f}; static sse_float _sse_sgn34 __attribute__ ((unused)) ={1.0f,1.0f,-1.0f,-1.0f}; /******************************************************************************* * * Cache manipulation macros * *******************************************************************************/ #if defined P4 #define _prefetch_spinor(addr) \ __asm__ __volatile__ ("prefetcht0 %0 \n\t" \ "prefetcht0 %1" \ : \ : \ "m" (*(((char*)(((unsigned int)(addr))&~0x7f)))), \ "m" (*(((char*)(((unsigned int)(addr))&~0x7f))+128))) #define _prefetch_su3(addr) \ __asm__ __volatile__ ("prefetcht0 %0 \n\t" \ "prefetcht0 %1" \ : \ : \ "m" (*(((char*)(((unsigned int)(addr))&~0x7f)))), \ "m" (*(((char*)(((unsigned int)(addr))&~0x7f))+128))) #else #define _prefetch_spinor(addr) \ __asm__ __volatile__ ("prefetcht0 %0 \n\t" \ "prefetcht0 %1 \n\t" \ "prefetcht0 %2" \ : \ : \ "m" (*(((char*)(addr)))), \ "m" (*(((char*)(addr))+32)), \ "m" (*(((char*)(addr))+64))) #define _prefetch_su3(addr) \ __asm__ __volatile__ ("prefetcht0 %0 \n\t" \ "prefetcht0 %1 \n\t" \ "prefetcht0 %2" \ : \ : \ "m" (*(((char*)(((unsigned int)(addr))&~0x1f)))), \ "m" (*(((char*)(((unsigned int)(addr))&~0x1f))+32)), \ "m" (*(((char*)(((unsigned int)(addr))&~0x1f))+64))) #endif /******************************************************************************* * * Macros for su3 vectors used in D_psi version 2.1 * * Most of these macros operate on pairs of su3 vectors that are stored * in the low and high words of xmm0,xmm1,xmm2 or xmm3,xmm4,xmm5. For example, * * xmm0 -> sl.c1.re,sl.c1.im,sh.c1.re,sh.c1.im * xmm1 -> sl.c2.re,sl.c2.im,sh.c2.re,sh.c2.im * xmm2 -> sl.c3.re,sl.c3.im,sh.c3.re,sh.c3.im * * (where sl and sh are of type su3_vector). This can also be interpreted as * an sse_vector s that is stored in these registers according to * * xmm0 -> s.c1.c1,s.c1.c2,s.c1.c3,s.c1.c4 * xmm1 -> s.c2.c1,s.c2.c2,s.c2.c3,s.c2.c4 * xmm2 -> s.c3.c1,s.c3.c2,s.c3.c3,s.c3.c4 * * The load and store macros can be used to move data in either format * from and to the xmm registers * *******************************************************************************/ /* * Loads two su3 vectors sl and sh to the low and high words of xmm0,xmm1,xmm2 */ #if defined SSE2 #define _sse_pair_load(sl,sh) \ __asm__ __volatile__ ("movsd %0, %%xmm0 \n\t" \ "movsd %1, %%xmm1 \n\t" \ "movsd %2, %%xmm2 \n\t" \ "movhps %3, %%xmm0 \n\t" \ "movhps %4, %%xmm1 \n\t" \ "movhps %5, %%xmm2" \ : \ : \ "m" ((sl).c1), \ "m" ((sl).c2), \ "m" ((sl).c3), \ "m" ((sh).c1), \ "m" ((sh).c2), \ "m" ((sh).c3)) #else #define _sse_pair_load(sl,sh) \ __asm__ __volatile__ ("movlps %0, %%xmm0 \n\t" \ "movlps %1, %%xmm1 \n\t" \ "movlps %2, %%xmm2 \n\t" \ "movhps %3, %%xmm0 \n\t" \ "movhps %4, %%xmm1 \n\t" \ "movhps %5, %%xmm2" \ : \ : \ "m" ((sl).c1), \ "m" ((sl).c2), \ "m" ((sl).c3), \ "m" ((sh).c1), \ "m" ((sh).c2), \ "m" ((sh).c3)) #endif /* * Loads two su3 vectors sl and sh to the low and high words of xmm3,xmm4,xmm5 */ #if defined SSE2 #define _sse_pair_load_up(sl,sh) \ __asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \ "movsd %1, %%xmm4 \n\t" \ "movsd %2, %%xmm5 \n\t" \ "movhps %3, %%xmm3 \n\t" \ "movhps %4, %%xmm4 \n\t" \ "movhps %5, %%xmm5" \ : \ : \ "m" ((sl).c1), \ "m" ((sl).c2), \ "m" ((sl).c3), \ "m" ((sh).c1), \ "m" ((sh).c2), \ "m" ((sh).c3)) #else #define _sse_pair_load_up(sl,sh) \ __asm__ __volatile__ ("movlps %0, %%xmm3 \n\t" \ "movlps %1, %%xmm4 \n\t" \ "movlps %2, %%xmm5 \n\t" \ "movhps %3, %%xmm3 \n\t" \ "movhps %4, %%xmm4 \n\t" \ "movhps %5, %%xmm5" \ : \ : \ "m" ((sl).c1), \ "m" ((sl).c2), \ "m" ((sl).c3), \ "m" ((sh).c1), \ "m" ((sh).c2), \ "m" ((sh).c3)) #endif /* * Stores the low and high words of xmm0,xmm1,xmm2 to the su3 vectors rl and rh */ #define _sse_pair_store(rl,rh) \ __asm__ __volatile__ ("movlps %%xmm0, %0 \n\t" \ "movlps %%xmm1, %1 \n\t" \ "movlps %%xmm2, %2 \n\t" \ "movhps %%xmm0, %3 \n\t" \ "movhps %%xmm1, %4 \n\t" \ "movhps %%xmm2, %5" \ : \ "=m" ((rl).c1), \ "=m" ((rl).c2), \ "=m" ((rl).c3), \ "=m" ((rh).c1), \ "=m" ((rh).c2), \ "=m" ((rh).c3)) /* * Stores the low and high words of xmm3,xmm4,xmm5 to the su3 vectors rl and rh */ #define _sse_pair_store_up(rl,rh) \ __asm__ __volatile__ ("movlps %%xmm3, %0 \n\t" \ "movlps %%xmm4, %1 \n\t" \ "movlps %%xmm5, %2 \n\t" \ "movhps %%xmm3, %3 \n\t" \ "movhps %%xmm4, %4 \n\t" \ "movhps %%xmm5, %5" \ : \ "=m" ((rl).c1), \ "=m" ((rl).c2), \ "=m" ((rl).c3), \ "=m" ((rh).c1), \ "=m" ((rh).c2), \ "=m" ((rh).c3)) /* * Loads the components s.c1,s.c2,s.c3 of an _sse_vector s to xmm0,xmm1,xmm2 */ #define _sse_vector_load(s) \ __asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ "movaps %1, %%xmm1 \n\t" \ "movaps %2, %%xmm2" \ : \ : \ "m" ((s).c1), \ "m" ((s).c2), \ "m" ((s).c3)) /* * Stores xmm0,xmm1,xmm2 to the components r.c1,r.c2,r.c3 of an _sse_vector r */ #define _sse_vector_store(r) \ __asm__ __volatile__ ("movaps %%xmm0, %0 \n\t" \ "movaps %%xmm1, %1 \n\t" \ "movaps %%xmm2, %2" \ : \ "=m" ((r).c1), \ "=m" ((r).c2), \ "=m" ((r).c3)) /* * Multiplies xmm0,xmm1,xmm2 with a constant sse_float c */ #define _sse_vector_mul(c) \ __asm__ __volatile__ ("mulps %0, %%xmm0 \n\t" \ "mulps %0, %%xmm1 \n\t" \ "mulps %0, %%xmm2" \ : \ : \ "m" (c)) /* * Adds xmm3,xmm4,xmm5 to xmm1,xmm2,xmm3 */ #define _sse_vector_add() \ __asm__ __volatile__ ("addps %%xmm3, %%xmm0 \n\t" \ "addps %%xmm4, %%xmm1 \n\t" \ "addps %%xmm5, %%xmm2" \ : \ :) /* * Subtracts xmm3,xmm4,xmm5 from xmm1,xmm2,xmm3 */ #define _sse_vector_sub() \ __asm__ __volatile__ ("subps %%xmm3, %%xmm0 \n\t" \ "subps %%xmm4, %%xmm1 \n\t" \ "subps %%xmm5, %%xmm2" \ : \ :) /* * Multiplies the high words xmm3,xmm4,xmm5 with -1 and adds these registers * to xmm0,xmm1,xmm2 */ #define _sse_vector_addsub() \ __asm__ __volatile__ ("mulps %0, %%xmm3 \n\t" \ "mulps %0, %%xmm4 \n\t" \ "mulps %0, %%xmm5 \n\t" \ "addps %%xmm3, %%xmm0 \n\t" \ "addps %%xmm4, %%xmm1 \n\t" \ "addps %%xmm5, %%xmm2" \ : \ : \ "m" (_sse_sgn34)) /* * Multiplies the low words xmm3,xmm4,xmm5 with -1 and adds these registers * to xmm0,xmm1,xmm2 */ #define _sse_vector_subadd() \ __asm__ __volatile__ ("mulps %0, %%xmm3 \n\t" \ "mulps %0, %%xmm4 \n\t" \ "mulps %0, %%xmm5 \n\t" \ "addps %%xmm3, %%xmm0 \n\t" \ "addps %%xmm4, %%xmm1 \n\t" \ "addps %%xmm5, %%xmm2" \ : \ : \ "m" (_sse_sgn12)) /* * Multiplies xmm3,xmm4,xmm5 with i and adds them to xmm1,xmm2,xmm3 */ #define _sse_vector_i_add() \ __asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ "mulps %0, %%xmm3 \n\t" \ "mulps %0, %%xmm4 \n\t" \ "mulps %0, %%xmm5 \n\t" \ "addps %%xmm3, %%xmm0 \n\t" \ "addps %%xmm4, %%xmm1 \n\t" \ "addps %%xmm5, %%xmm2" \ : \ : \ "m" (_sse_sgn13)) /* * Multiplies xmm3,xmm4,xmm5 with i and subtracts them from xmm1,xmm2,xmm3 */ #define _sse_vector_i_sub() \ __asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ "mulps %0, %%xmm3 \n\t" \ "mulps %0, %%xmm4 \n\t" \ "mulps %0, %%xmm5 \n\t" \ "addps %%xmm3, %%xmm0 \n\t" \ "addps %%xmm4, %%xmm1 \n\t" \ "addps %%xmm5, %%xmm2" \ : \ : \ "m" (_sse_sgn24)) /* * Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i * and adds the result to xmm1,xmm2,xmm3 */ #define _sse_vector_xch_i_add() \ __asm__ __volatile__ ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \ "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \ "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \ "mulps %0, %%xmm3 \n\t" \ "mulps %0, %%xmm4 \n\t" \ "mulps %0, %%xmm5 \n\t" \ "addps %%xmm3, %%xmm0 \n\t" \ "addps %%xmm4, %%xmm1 \n\t" \ "addps %%xmm5, %%xmm2" \ : \ : \ "m" (_sse_sgn13)) /* * Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i * and subtracts the result from xmm1,xmm2,xmm3 */ #define _sse_vector_xch_i_sub() \ __asm__ __volatile__ ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \ "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \ "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \ "mulps %0, %%xmm3 \n\t" \ "mulps %0, %%xmm4 \n\t" \ "mulps %0, %%xmm5 \n\t" \ "addps %%xmm3, %%xmm0 \n\t" \ "addps %%xmm4, %%xmm1 \n\t" \ "addps %%xmm5, %%xmm2" \ : \ : \ "m" (_sse_sgn24)) /* * Multiplies the low and high words of xmm3,xmm4,xmm5 with i and -i * respectively and adds these registers to xmm1,xmm2,xmm3 */ #define _sse_vector_i_addsub() \ __asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ "mulps %0, %%xmm3 \n\t" \ "mulps %0, %%xmm4 \n\t" \ "mulps %0, %%xmm5 \n\t" \ "addps %%xmm3, %%xmm0 \n\t" \ "addps %%xmm4, %%xmm1 \n\t" \ "addps %%xmm5, %%xmm2" \ : \ : \ "m" (_sse_sgn14)) /* * Multiplies the low and high words of xmm3,xmm4,xmm5 with -i and i * respectively and adds these registers to xmm1,xmm2,xmm3 */ #define _sse_vector_i_subadd() \ __asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ "mulps %0, %%xmm3 \n\t" \ "mulps %0, %%xmm4 \n\t" \ "mulps %0, %%xmm5 \n\t" \ "addps %%xmm3, %%xmm0 \n\t" \ "addps %%xmm4, %%xmm1 \n\t" \ "addps %%xmm5, %%xmm2" \ : \ : \ "m" (_sse_sgn23)) /* * Exchanges the high and low words in xmm3,xmm4,xmm5 */ #define _sse_vector_xch() \ __asm__ __volatile__ ("shufps $0x4e, %%xmm3, %%xmm3 \n\t" \ "shufps $0x4e, %%xmm4, %%xmm4 \n\t" \ "shufps $0x4e, %%xmm5, %%xmm5" \ : \ :) /* * Multiplies a pair sl,sh of su3 vectors with an su3 matrix u, * assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2 * * On output the result is in xmm3,xmm4,xmm5 and the registers * xmm0,xmm1,xmm2 are changed */ #define _sse_su3_multiply(u) \ __asm__ __volatile__ ("movss %0, %%xmm3 \n\t" \ "movss %1, %%xmm6 \n\t" \ "movss %2, %%xmm4 \n\t" \ "movss %3, %%xmm7 \n\t" \ "movss %4, %%xmm5 \n\t" \ "shufps $0x0, %%xmm3, %%xmm3 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm4, %%xmm4 \n\t" \ "mulps %%xmm0, %%xmm3 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "shufps $0x0, %%xmm5, %%xmm5 \n\t" \ "mulps %%xmm0, %%xmm4 \n\t" \ "addps %%xmm6, %%xmm3 \n\t" \ "mulps %%xmm2, %%xmm7 \n\t" \ "mulps %%xmm0, %%xmm5 \n\t" \ "addps %%xmm7, %%xmm4 \n\t" \ "movss %5, %%xmm6 \n\t" \ "movss %6, %%xmm7 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "mulps %%xmm2, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm5 \n\t" \ "addps %%xmm7, %%xmm3 \n\t" \ "movss %7, %%xmm6 \n\t" \ "movss %8, %%xmm7 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "mulps %%xmm2, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm4 \n\t" \ "addps %%xmm7, %%xmm5" \ : \ : \ "m" ((u).c11.re), \ "m" ((u).c12.re), \ "m" ((u).c21.re), \ "m" ((u).c23.re), \ "m" ((u).c31.re), \ "m" ((u).c32.re), \ "m" ((u).c13.re), \ "m" ((u).c22.re), \ "m" ((u).c33.re)); \ __asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ "movss %1, %%xmm7 \n\t" \ "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \ "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \ "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %9, %%xmm0 \n\t" \ "mulps %9, %%xmm1 \n\t" \ "mulps %9, %%xmm2 \n\t" \ "mulps %%xmm0, %%xmm6 \n\t" \ "mulps %%xmm1, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm3 \n\t" \ "addps %%xmm7, %%xmm4 \n\t" \ "movss %2, %%xmm6 \n\t" \ "movss %3, %%xmm7 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm2, %%xmm6 \n\t" \ "mulps %%xmm0, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm5 \n\t" \ "addps %%xmm7, %%xmm4 \n\t" \ "movss %4, %%xmm6 \n\t" \ "movss %5, %%xmm7 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "mulps %%xmm0, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm3 \n\t" \ "addps %%xmm7, %%xmm5 \n\t" \ "movss %6, %%xmm0 \n\t" \ "movss %7, %%xmm6 \n\t" \ "movss %8, %%xmm7 \n\t" \ "shufps $0x0, %%xmm0, %%xmm0 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm2, %%xmm0 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "mulps %%xmm2, %%xmm7 \n\t" \ "addps %%xmm0, %%xmm3 \n\t" \ "addps %%xmm6, %%xmm5 \n\t" \ "addps %%xmm7, %%xmm4" \ : \ : \ "m" ((u).c11.im), \ "m" ((u).c22.im), \ "m" ((u).c33.im), \ "m" ((u).c21.im), \ "m" ((u).c12.im), \ "m" ((u).c31.im), \ "m" ((u).c13.im), \ "m" ((u).c32.im), \ "m" ((u).c23.im), \ "m" (_sse_sgn13)) /* * Multiplies a pair sl,sh of su3 vectors with an su3 matrix u^dagger, * assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2 * * On output the result is in xmm3,xmm4,xmm5 and the registers * xmm0,xmm1,xmm2 are changed */ #define _sse_su3_inverse_multiply(u) \ __asm__ __volatile__ ("movss %0, %%xmm3 \n\t" \ "movss %1, %%xmm6 \n\t" \ "movss %2, %%xmm4 \n\t" \ "movss %3, %%xmm7 \n\t" \ "movss %4, %%xmm5 \n\t" \ "shufps $0x0, %%xmm3, %%xmm3 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm4, %%xmm4 \n\t" \ "mulps %%xmm0, %%xmm3 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "shufps $0x0, %%xmm5, %%xmm5 \n\t" \ "mulps %%xmm0, %%xmm4 \n\t" \ "addps %%xmm6, %%xmm3 \n\t" \ "mulps %%xmm2, %%xmm7 \n\t" \ "mulps %%xmm0, %%xmm5 \n\t" \ "addps %%xmm7, %%xmm4 \n\t" \ "movss %5, %%xmm6 \n\t" \ "movss %6, %%xmm7 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "mulps %%xmm2, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm5 \n\t" \ "addps %%xmm7, %%xmm3 \n\t" \ "movss %7, %%xmm6 \n\t" \ "movss %8, %%xmm7 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "mulps %%xmm2, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm4 \n\t" \ "addps %%xmm7, %%xmm5" \ : \ : \ "m" ((u).c11.re), \ "m" ((u).c21.re), \ "m" ((u).c12.re), \ "m" ((u).c32.re), \ "m" ((u).c13.re), \ "m" ((u).c23.re), \ "m" ((u).c31.re), \ "m" ((u).c22.re), \ "m" ((u).c33.re)); \ __asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ "movss %1, %%xmm7 \n\t" \ "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \ "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \ "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %9, %%xmm0 \n\t" \ "mulps %9, %%xmm1 \n\t" \ "mulps %9, %%xmm2 \n\t" \ "mulps %%xmm0, %%xmm6 \n\t" \ "mulps %%xmm1, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm3 \n\t" \ "addps %%xmm7, %%xmm4 \n\t" \ "movss %2, %%xmm6 \n\t" \ "movss %3, %%xmm7 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm2, %%xmm6 \n\t" \ "mulps %%xmm0, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm5 \n\t" \ "addps %%xmm7, %%xmm4 \n\t" \ "movss %4, %%xmm6 \n\t" \ "movss %5, %%xmm7 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "mulps %%xmm0, %%xmm7 \n\t" \ "addps %%xmm6, %%xmm3 \n\t" \ "addps %%xmm7, %%xmm5 \n\t" \ "movss %6, %%xmm0 \n\t" \ "movss %7, %%xmm6 \n\t" \ "movss %8, %%xmm7 \n\t" \ "shufps $0x0, %%xmm0, %%xmm0 \n\t" \ "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ "mulps %%xmm2, %%xmm0 \n\t" \ "mulps %%xmm1, %%xmm6 \n\t" \ "mulps %%xmm2, %%xmm7 \n\t" \ "addps %%xmm0, %%xmm3 \n\t" \ "addps %%xmm6, %%xmm5 \n\t" \ "addps %%xmm7, %%xmm4" \ : \ : \ "m" ((u).c11.im), \ "m" ((u).c22.im), \ "m" ((u).c33.im), \ "m" ((u).c12.im), \ "m" ((u).c21.im), \ "m" ((u).c13.im), \ "m" ((u).c31.im), \ "m" ((u).c23.im), \ "m" ((u).c32.im), \ "m" (_sse_sgn24));