/*
 * _inline_sse_sub_four_su3_vecs(aa, bb0, bb1, bb2, bb3)
 *
 * In-place SSE subtraction:  *aa = (((*aa - *bb0) - *bb1) - *bb2) - *bb3
 *
 * Each argument is a pointer to an object with an array member `c`.
 * c[0] and c[1] are accessed together as one unaligned 16-byte packed-float
 * vector (movups); c[2] is accessed as an 8-byte half vector (movlps).
 * NOTE(review): presumably these are su3_vector objects holding three
 * single-precision complex numbers -- confirm against the type definition.
 *
 * Arguments are macro parameters and are expanded more than once; do not
 * pass expressions with side effects.
 *
 * `aa` may alias any of the `bb` operands: every load of `aa` happens before
 * the final store.
 *
 * Everything is done in ONE asm statement.  Register contents are not
 * guaranteed to survive between separate asm statements, so the running
 * result in xmm0/xmm1 must never cross a statement boundary.
 *
 * Clobbers: xmm0-xmm3 (declared), plus "memory" because movups touches c[1],
 * which is not covered by the c[0] operand.
 */
#define _inline_sse_sub_four_su3_vecs(aa,bb0,bb1,bb2,bb3) \
{ \
__asm__ __volatile__ (/* xmm0 = aa->c[0..1], xmm1 = aa->c[2] (duplicated) */ \
                      "movups %0, %%xmm0 \n\t" \
                      "movlps %1, %%xmm1 \n\t" \
                      /* 0x44 copies the low half into the high half so */ \
                      /* subps never operates on stale upper lanes.     */ \
                      "shufps $0x44, %%xmm1, %%xmm1 \n\t" \
                      /* -= bb0 */ \
                      "movups %2, %%xmm2 \n\t" \
                      "movlps %3, %%xmm3 \n\t" \
                      "shufps $0x44, %%xmm3, %%xmm3 \n\t" \
                      "subps %%xmm2, %%xmm0 \n\t" \
                      "subps %%xmm3, %%xmm1 \n\t" \
                      /* -= bb1 */ \
                      "movups %4, %%xmm2 \n\t" \
                      "movlps %5, %%xmm3 \n\t" \
                      "shufps $0x44, %%xmm3, %%xmm3 \n\t" \
                      "subps %%xmm2, %%xmm0 \n\t" \
                      "subps %%xmm3, %%xmm1 \n\t" \
                      /* -= bb2 */ \
                      "movups %6, %%xmm2 \n\t" \
                      "movlps %7, %%xmm3 \n\t" \
                      "shufps $0x44, %%xmm3, %%xmm3 \n\t" \
                      "subps %%xmm2, %%xmm0 \n\t" \
                      "subps %%xmm3, %%xmm1 \n\t" \
                      /* -= bb3 */ \
                      "movups %8, %%xmm2 \n\t" \
                      "movlps %9, %%xmm3 \n\t" \
                      "shufps $0x44, %%xmm3, %%xmm3 \n\t" \
                      "subps %%xmm2, %%xmm0 \n\t" \
                      "subps %%xmm3, %%xmm1 \n\t" \
                      /* store the result back into aa */ \
                      "movups %%xmm0, %0 \n\t" \
                      "movlps %%xmm1, %1" \
                      : \
                      "+m" ((aa)->c[0]), \
                      "+m" ((aa)->c[2]) \
                      : \
                      "m" ((bb0)->c[0]), \
                      "m" ((bb0)->c[2]), \
                      "m" ((bb1)->c[0]), \
                      "m" ((bb1)->c[2]), \
                      "m" ((bb2)->c[0]), \
                      "m" ((bb2)->c[2]), \
                      "m" ((bb3)->c[0]), \
                      "m" ((bb3)->c[2]) \
                      : \
                      "xmm0", "xmm1", "xmm2", "xmm3", "memory"); \
}
