/* 
 * QCDSTREAM
 *  This benchmark is derived directly from John McCalpin's STREAM.
 *  Differences:
 *    - no scale, add, or triad measurements
 *    - measurements of float and long double transfers
 *    - malloc'd instead of static arrays
 *    - timings of SU3 (complex) matrix-vector and matrix-matrix routines
 *      from the MILC lattice QCD codes
 *    - timings of inline SSE versions of matrix routines
 *    - investigations of data access patterns (in cache, sequential,
 *      strided, and mapped)
 *    - investigations of precaching
 *
 *  Don Holmgren
 *  Fermilab
 *  djholm@fnal.gov
 */

# include <stdio.h>
# include <stdlib.h>
# include <getopt.h>
# include <math.h>
# include <float.h>
# include <limits.h>
# include <sys/time.h>
# include <unistd.h>
# include <string.h>
# include <sched.h>
# include <error.h>
# include "su3.h"
# include "inline_sse.h"
# include "prefetch.h"

# define N	4000000
# define NTIMES	10
# define OFFSET	0
# define PAGE 4096
# define ALIGN 128L
# define MATVEC_FLOPS 66
# define MATHWVEC_FLOPS 132
# define MATMAT_FLOPS 198
# define STRIDE 347


# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif


static float *a, *b, *c;
static unsigned int map1[N], map2[N];

extern double second();
void anal(double *, int, char *);
void anal2(double *, int, int, char *);
void analstd(double *, double *, double *);


int main(int argc, char **argv) {
  
  register int	j, k;
  float *apt, *bpt, *cpt;
  double		times[NTIMES];
  void               *temp;

  int opt_mem = 1, opt_sse = 1, opt_prefetch = 1;
  int opt_mapped = 1, opt_strided = 1, opt_incache = 1, opt_seq = 1;
  int opt_matvec = 0, opt_mathwvec = 0, opt_matmat = 0;
  int parsed_opt, option_index;

  struct option long_options[] =
  {
    {"mem",      no_argument, &opt_mem,      1},
    {"sse",      no_argument, &opt_sse,      1},
    {"prefetch", no_argument, &opt_prefetch, 1},
    {"mapped",   no_argument, &opt_mapped,   1},
    {"strided",  no_argument, &opt_strided,  1},
    {"incache",  no_argument, &opt_incache,  1},
    {"seq",      no_argument, &opt_seq,      1},
    {"matvec",   no_argument, &opt_matvec,   1},
    {"mathwvec", no_argument, &opt_mathwvec, 1},
    {"matmat",   no_argument, &opt_matmat,   1},
    {"all",      no_argument, 0,           'a'},
    {"nosse",    no_argument, 0,           'n'},
    {"allmat",   no_argument, 0,           'm'},
    {0, 0, 0, 0}
  };

  if (argc == 1) {
    /* "all" is default */
    opt_mem = opt_sse = opt_prefetch = 1;
    opt_mapped = opt_strided = opt_incache = opt_seq = 1;
    opt_matvec = opt_mathwvec = opt_matmat = 1;
  }
  else {
    opt_mem = opt_sse = opt_prefetch = 0;
    opt_mapped = opt_strided = opt_incache = opt_seq = 0;
    opt_matvec = opt_mathwvec = opt_matmat = 0;
    while (1) {
      parsed_opt = getopt_long(argc, argv, "", long_options, &option_index);
      if (parsed_opt == -1) break;
      switch (parsed_opt) 
	{
	case 0:
	  /* no action necessary, flag set in getopt_long */
	  break;
	case 'a':
	  /* "all" option */
	  opt_mem = opt_sse = opt_prefetch = 1;
	  opt_mapped = opt_strided = opt_incache = opt_seq = 1;
	  opt_matvec = opt_mathwvec = opt_matmat = 1;
	  break;
	case 'm':
	  /* "allmat" option */
	  opt_prefetch = 1;
	  opt_mapped = opt_strided = opt_incache = opt_seq = 1;
	  opt_matvec = opt_mathwvec = opt_matmat = 1;
	  break;
	case 'n':
	  /* "nosse" option */
	  opt_mem = 1;
	  opt_mapped = opt_strided = opt_incache = opt_seq = 1;
	  opt_matvec = opt_mathwvec = opt_matmat = 1;
	  break;
	default:
	  printf("Usage: %s   with the following optional switches:\n", argv[0]);
	  printf("  --mem         Do memory tests\n");
	  printf("  --sse         Do SSE tests\n");
	  printf("  --prefetch    Do prefetch tests\n");
	  printf("  --mapped      Do mapped linear algebra timings\n");
	  printf("  --strided     Do strided linear algebra timings\n");
	  printf("  --incache     Do in-cache linear algebra timings\n");
	  printf("  --seq         Do sequential linear algebra timings\n");
	  printf("  --matvec      Do matrix-vector timings\n");
	  printf("  --mathwvec    Do matrix-half-wilson-vector timings\n");
	  printf("  --matmat      Do matrix-matrix timings\n");
	  printf("  --all         Do all measurements (memory + linear algebra)\n");
	  printf("  --nosse       Do all measurements, but no SSE (including prefetch)\n");
	  printf("  --allmat      Do all linear algebra measurements\n");
	  exit(0);
	}
    }
    if (optind < argc) {
      fprintf (stderr, "Extra arguments found: ");
      while (optind < argc) fprintf (stderr, "%s ", argv[optind++]);
      putchar ('\n');
    }
  }

#if 0
  struct sched_param param={sched_priority:20};

  if (sched_setscheduler(0, SCHED_FIFO, &param) < 0)
    perror("Attempt to put in real time queue");
#endif

  /* --- allocate arrays ---- */

  temp = malloc(N*sizeof(float) + PAGE);
  if (temp == NULL) {
    printf("Malloc failure\n");
    exit(1);
  }
  a = (float *)(((unsigned int)temp + ALIGN) & ~(ALIGN - 1L));
  temp = malloc(N*sizeof(float) + PAGE);
  if (temp == NULL) {
    printf("Malloc failure\n");
    exit(1);
  }
  b = (float *)(((unsigned int)temp + ALIGN) & ~(ALIGN - 1L));
  temp = malloc(N*sizeof(float) + PAGE);
  if (temp == NULL) {
    printf("Malloc failure\n");
    exit(1);
  }
  c = (float *)(((unsigned int)temp + ALIGN) & ~(ALIGN - 1L));
  
  /* --- initialize source arrays (a, b) --- */
  for (j=0; j < N; j++) {
    a[j] = (float)(rand() - RAND_MAX/2)/(float)(RAND_MAX/2)*2.0;
    b[j] = (float)(rand() - RAND_MAX/2)/(float)(RAND_MAX/2)*2.0;
  }
  for (j=0; j<N; j++) {
    c[j] = a[j];
  }

  /* --- initialize map array (map) ---- */
  for (j=0; j < N; j++) {
    map1[j] = (j*STRIDE*3) % N;
    map2[j] = (j*STRIDE*3+50) % N;
  }
  
  if (opt_mem) {
    printf("\nFunction\tRate (MB/s)\t\tMean time\t\tMin time\tMax time\n");
    printf(  "--------\t-----------\t\t---------\t\t--------\t--------\n");
    
    /*  Float Stream COPY */
    for (k=0; k<NTIMES; k++) {
      times[k] = second();
      for (j=0; j<N; j++) {
	c[j] = a[j];
      }
      times[k] = second() - times[k];
    }
    anal(times, 2 * sizeof(float) * N, "Float Copy:");
    
    /*  Standard Stream COPY */
    for (k=0; k<NTIMES; k++) {
      times[k] = second();
      for (j=0; j<N; j+=2) {
	*(double *)&c[j] = *(double *)&a[j];
      }
      times[k] = second() - times[k];
    }
    anal(times, 2 * sizeof(double) * N/2, "Double Copy:");
    
    if (opt_sse) {
      /*  SSE optimized Stream COPY, should work on PIII, P4, newer Athlon */
      for (k=0; k<NTIMES; k++) {
	times[k] = second();
	for (j=0; j<N; j+=4) {
	  __asm__ __volatile__ ("movaps   %1, %%xmm0 \n\t" \
				"movntps  %%xmm0, %0" \
				: \
				"=m" (c[j]) \
				: \
				"m" (a[j]));
	}
	times[k] = second() - times[k];
      }
      anal(times, 2 * 4*sizeof(float) * N/4, "SSE Copy:");
    } /* endif opt_sse */
  }  /* endif opt_mem */
    
  if (opt_matvec || opt_mathwvec || opt_matmat) {
    printf("\n\nFunction\tRate (MFlop/s)\t\tMean time\t\tMin time\tMax time\n");
    printf(    "--------\t--------------\t\t---------\t\t--------\t--------\n");
    
    if (opt_matvec) {
      printf("MILC MatVec\n");
      
      if (opt_incache) {
	/* MILC Mat-Vec In Cache */
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_mat_vec((su3_matrix *)(a), (su3_vector *)(b), (su3_vector *)(c));
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATVEC_FLOPS, "  In Cache:");
      }

      if (opt_seq) {
	/* MILC Mat-Vec Sequential */
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  apt = a;
	  bpt = b;
	  cpt = c;
	  for (j=0; j < N/18; j++) {
	    mult_su3_mat_vec((su3_matrix *)(apt), (su3_vector *)(bpt), (su3_vector *)(cpt));
	    apt += 18;
	    bpt += 6;
	    cpt += 6;
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATVEC_FLOPS, "  Sequential:");
      }

      if (opt_strided) {
	/* MILC Mat-Vec Strided */
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_mat_vec((su3_matrix *)(a+(j*STRIDE)%N), (su3_vector *)(b+(j*STRIDE)%N), (su3_vector *)(c+(j*STRIDE)%N));
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATVEC_FLOPS, "  Strided:");

	if (opt_prefetch) {
	  /* MILC Mat-Vec Prefetched Strided*/
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      prefetch_matrix(a+((j+1)*STRIDE)%N);
	      prefetch_vector(b+((j+1)*STRIDE)%N);
	      mult_su3_mat_vec((su3_matrix *)(a+(j*STRIDE)%N), (su3_vector *)(b+(j*STRIDE)%N), (su3_vector *)(c+(j*STRIDE)%N));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATVEC_FLOPS, "  PF+Strided:");
	}
      }

      if (opt_mapped) {
	/* MILC Mat-Vec Mapped */
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_mat_vec((su3_matrix *)(a+map1[j]), (su3_vector *)(b+map1[j]), (su3_vector *)(c+map1[j]));
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATVEC_FLOPS, "  Mapped:");

	if (opt_prefetch) {
	  /* MILC Mat-Vec Prefetched + Mapped */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      prefetch_matrix(a+map2[j+1]);
	      prefetch_vector(b+map2[j+1]);
	      mult_su3_mat_vec((su3_matrix *)(a+map2[j]), (su3_vector *)(b+map2[j]), (su3_vector *)(c+map2[j]));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATVEC_FLOPS, "  PF+Mapped:");
	}
      }

      if (opt_sse) {
	printf("SSE MatVec\n");
	
	if (opt_incache) {
	  /* SSE Mat-Vec In Cache */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_mat_vec((su3_matrix *)(a), (su3_vector *)(b), (su3_vector *)(c));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATVEC_FLOPS, "  In Cache:");
	}

	if (opt_seq) {
	  /* SSE Mat-Vec Sequential */
	  for (k=0; k<NTIMES; k++) {
	    apt = a;
	    bpt = b;
	    cpt = c;
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_mat_vec((su3_matrix *)(apt), (su3_vector *)(bpt), (su3_vector *)(cpt));
	      apt += 18;
	      bpt += 6;
	      cpt += 6;
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATVEC_FLOPS, "  Sequential:");
	}

	if (opt_strided) {
	  /* SSE Mat-Vec Strided */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_mat_vec((su3_matrix *)(a+(j*STRIDE)%N), (su3_vector *)(b+(j*STRIDE)%N), (su3_vector *)(c+(j*STRIDE)%N));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATVEC_FLOPS, "  Strided:");

	  if (opt_prefetch) {
	    /* SSE Mat-Vec Prefetched Strided*/
	    for (k=0; k<NTIMES; k++) {
	      times[k] = second();
	      for (j=0; j < N/18; j++) {
		prefetch_matrix(a+((j+1)*STRIDE)%N);
		prefetch_vector(b+((j+1)*STRIDE)%N);
		_inline_sse_mult_su3_mat_vec((su3_matrix *)(a+(j*STRIDE)%N), (su3_vector *)(b+(j*STRIDE)%N), (su3_vector *)(c+(j*STRIDE)%N));
	      }
	      times[k] = second() - times[k];
	    }
	    anal2(times, (int)(N/18), MATVEC_FLOPS, "  PF+Strided:");
	  }
	}

	if (opt_mapped) {
	  /* SSE Mat-Vec Mapped */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_mat_vec((su3_matrix *)(a+map1[j]), (su3_vector *)(b+map1[j]), (su3_vector *)(c+map1[j]));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATVEC_FLOPS, "  Mapped:");

	  if (opt_prefetch) {
	    /* SSE Mat-Vec Prefetched + Mapped */
	    for (k=0; k<NTIMES; k++) {
	      times[k] = second();
	      for (j=0; j < N/18; j++) {
		prefetch_matrix(a+map2[j+1]);
		prefetch_vector(b+map2[j+1]);
		_inline_sse_mult_su3_mat_vec((su3_matrix *)(a+map2[j]), (su3_vector *)(b+map2[j]), (su3_vector *)(c+map2[j]));
	      }
	      times[k] = second() - times[k];
	    }
	    anal2(times, (int)(N/18), MATVEC_FLOPS, "  PF+Mapped:");
	  } /* endif opt_prefetch */
	} /* endif opt_mapped */
      } /* endif opt_sse */
    } /* endif mat_vec */


    if (opt_mathwvec) {
      printf("\nMILC MatHWVec\n");

      if (opt_incache) {
	/* MILC Mat-HWVec In Cache */
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_mat_hwvec((su3_matrix *)(a), (half_wilson_vector *)(b), (half_wilson_vector *)(c));
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  In Cache:");
      }

      if (opt_seq) {
	/* MILC Mat-HWVec Sequential */
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  apt = a;
	  bpt = b;
	  cpt = c;
	  for (j=0; j < N/18; j++) {
	    mult_su3_mat_hwvec((su3_matrix *)(apt), (half_wilson_vector *)(bpt), (half_wilson_vector *)(cpt));
	    apt += 18;
	    bpt += 12;
	    cpt += 12;
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  Sequential:");
      }

      if (opt_strided) {
	/* MILC Mat-HWVec Strided*/
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_mat_hwvec((su3_matrix *)(a+(j*STRIDE)%N), (half_wilson_vector *)(b+(j*STRIDE)%N), (half_wilson_vector *)(c+(j*STRIDE)%N));
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  Strided:");
	
	if (opt_prefetch) {
	  /* MILC Mat-HWVec Prefetched Strided*/
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      prefetch_matrix(a+((j+1)*STRIDE)%N);
	      prefetch_vector(b+((j+1)*STRIDE)%N);
	      mult_su3_mat_hwvec((su3_matrix *)(a+(j*STRIDE)%N), (half_wilson_vector *)(b+(j*STRIDE)%N), (half_wilson_vector *)(c+(j*STRIDE)%N));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  PF+Strided:");
	} /* endif prefetch */
      } /* endif strided */

      if (opt_mapped) {
	/* MILC Mat-HWVec Mapped */
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_mat_hwvec((su3_matrix *)(a+map1[j]), (half_wilson_vector *)(b+map1[j]), (half_wilson_vector *)(c+map1[j]));
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  Mapped:");
	
	if (opt_prefetch) {
	  /* MILC Mat-HWVec Prefetched Mapped */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      prefetch_matrix(a+map2[j]);
	      prefetch_vector(b+map2[j]);
	      mult_su3_mat_hwvec((su3_matrix *)(a+map2[j]), (half_wilson_vector *)(b+map2[j]), (half_wilson_vector *)(c+map2[j]));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  PF+Mapped:");
	}
      }

      if (opt_sse) {
	printf("SSE MatHWVec\n");
	
	if (opt_incache) {
	  /* SSE Mat-HWVec In Cache */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_mat_hwvec((su3_matrix *)(a), (half_wilson_vector *)(b), (half_wilson_vector *)(c));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  In Cache:");
	}

	if (opt_seq) {
	  /* SSE Mat-HWVec Sequential */
	  for (k=0; k<NTIMES; k++) {
	    apt = a;
	    bpt = b;
	    cpt = c;
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_mat_hwvec((su3_matrix *)(apt), (half_wilson_vector *)(bpt), (half_wilson_vector *)(cpt));
	      apt += 18;
	      bpt += 12;
	      cpt += 12;
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  Sequential:");
	}

	if (opt_strided) {
	  /* SSE Mat-HWVec Strided*/
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_mat_hwvec((su3_matrix *)(a+(j*STRIDE)%N), (half_wilson_vector *)(b+(j*STRIDE)%N), (half_wilson_vector *)(c+(j*STRIDE)%N));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  Strided:");
	  
	  if (opt_prefetch) {
	    /* SSE Mat-HWVec Prefetched Strided*/
	    for (k=0; k<NTIMES; k++) {
	      times[k] = second();
	      for (j=0; j < N/18; j++) {
		prefetch_matrix(a+((j+1)*STRIDE)%N);
		prefetch_vector(b+((j+1)*STRIDE)%N);
		_inline_sse_mult_su3_mat_hwvec((su3_matrix *)(a+(j*STRIDE)%N), (half_wilson_vector *)(b+(j*STRIDE)%N), (half_wilson_vector *)(c+(j*STRIDE)%N));
	      }
	      times[k] = second() - times[k];
	    }
	    anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  PF+Strided:");
	  }
	}
	
	if (opt_mapped) {
	  /* SSE Mat-HWVec Mapped */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_mat_hwvec((su3_matrix *)(a+map1[j]), (half_wilson_vector *)(b+map1[j]), (half_wilson_vector *)(c+map1[j]));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  Mapped:");
	  
	  if (opt_prefetch) {
	    /* SSE Mat-HWVec Prefetched Mapped */
	    for (k=0; k<NTIMES; k++) {
	      times[k] = second();
	      for (j=0; j < N/18; j++) {
		prefetch_matrix(a+map2[j]);
		prefetch_vector(b+map2[j]);
		_inline_sse_mult_su3_mat_hwvec((su3_matrix *)(a+map2[j]), (half_wilson_vector *)(b+map2[j]), (half_wilson_vector *)(c+map2[j]));
	      }
	      times[k] = second() - times[k];
	    }
	    anal2(times, (int)(N/18), MATHWVEC_FLOPS, "  PF+Mapped:");
	  } /* endif opt_prefetch */
	} /* endif opt_mapped */
      } /* endif opt_sse */
    } /* endif opt_mathwvec */

    if (opt_matmat) {
      printf("\nMILC MatMat\n");

      if (opt_incache) {
	/* MILC Mat-Mat nn In Cache */
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_nn((su3_matrix *)(a), (su3_matrix *)(b), (su3_matrix *)(c));
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATMAT_FLOPS, "  In Cache:");
      }

      if (opt_seq) {
	/* MILC Mat-Mat nn Sequential */
	for (k=0; k<NTIMES; k++) {
	  apt = a;
	  bpt = b;
	  cpt = c;
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_nn((su3_matrix *)(apt), (su3_matrix *)(bpt), (su3_matrix *)(cpt));
	    apt += 18;
	    bpt += 18;
	    cpt += 18;
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATMAT_FLOPS, "  Sequential:");
      }

      if (opt_strided) {
	/* MILC Mat-Mat nn Strided*/
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_nn((su3_matrix *)(a+(j*STRIDE)%N), (su3_matrix *)(b+(j*STRIDE)%N), (su3_matrix *)(c+(j*STRIDE)%N));
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATMAT_FLOPS, "  Strided:");

	if (opt_prefetch) {
	  /* MILC Mat-Mat nn Prefetched + Strided */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      prefetch_matrix(a+((j+1)*STRIDE)%N);
	      prefetch_matrix(b+((j+1)*STRIDE)%N);
	      mult_su3_nn((su3_matrix *)(a+(j*STRIDE)%N), (su3_matrix *)(b+(j*STRIDE)%N), (su3_matrix *)(c+(j*STRIDE)%N));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATMAT_FLOPS, "  PF+Strided:");
	}
      }

      if (opt_mapped) {
	/* MILC Mat-Mat nn Mapped */
	for (k=0; k<NTIMES; k++) {
	  times[k] = second();
	  for (j=0; j < N/18; j++) {
	    mult_su3_nn((su3_matrix *)(a+map1[j]), (su3_matrix *)(b+map1[j]), (su3_matrix *)(c+map1[j]));
	  }
	  times[k] = second() - times[k];
	}
	anal2(times, (int)(N/18), MATMAT_FLOPS, "  Mapped:");

	if (opt_prefetch) {
	  /* MILC Mat-Mat nn Prefetched + Mapped */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      prefetch_matrix(a+map2[j+1]);
	      prefetch_matrix(b+map2[j+1]);
	      mult_su3_nn((su3_matrix *)(a+map2[j]), (su3_matrix *)(b+map2[j]), (su3_matrix *)(c+map2[j]));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATMAT_FLOPS, "  PF+Mapped:");
	} /* endif opt_prefetch */
      } /* endif opt_mapped */

      if (opt_sse) {
	printf("SSE MatMat\n");

	if (opt_incache) {
	  /* SSE Mat-Mat nn In Cache */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_nn((su3_matrix *)(a), (su3_matrix *)(b), (su3_matrix *)(c));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATMAT_FLOPS, "  In Cache:");
	}

	if (opt_seq) {
	  /* SSE Mat-Mat nn Sequential */
	  for (k=0; k<NTIMES; k++) {
	    apt = a;
	    bpt = b;
	    cpt = c;
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_nn((su3_matrix *)(apt), (su3_matrix *)(bpt), (su3_matrix *)(cpt));
	      apt += 18;
	      bpt += 18;
	      cpt += 18;
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATMAT_FLOPS, "  Sequential:");
	}

	if (opt_strided) {
	  /* SSE Mat-Mat nn Strided*/
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_nn((su3_matrix *)(a+(j*STRIDE)%N), (su3_matrix *)(b+(j*STRIDE)%N), (su3_matrix *)(c+(j*STRIDE)%N));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATMAT_FLOPS, "  Strided:");
	  
	  if (opt_prefetch) {
	    /* SSE Mat-Mat nn Prefetched + Strided */
	    for (k=0; k<NTIMES; k++) {
	      times[k] = second();
	      for (j=0; j < N/18; j++) {
		prefetch_matrix(a+((j+1)*STRIDE)%N);
		prefetch_matrix(b+((j+1)*STRIDE)%N);
		_inline_sse_mult_su3_nn((su3_matrix *)(a+(j*STRIDE)%N), (su3_matrix *)(b+(j*STRIDE)%N), (su3_matrix *)(c+(j*STRIDE)%N));
	      }
	      times[k] = second() - times[k];
	    }
	    anal2(times, (int)(N/18), MATMAT_FLOPS, "  PF+Strided:");
	  } /*endif opt_prefetch */
	} /* endif opt_strided */
	    
	if (opt_mapped) {
	  /* SSE Mat-Mat nn Mapped */
	  for (k=0; k<NTIMES; k++) {
	    times[k] = second();
	    for (j=0; j < N/18; j++) {
	      _inline_sse_mult_su3_nn((su3_matrix *)(a+map1[j]), (su3_matrix *)(b+map1[j]), (su3_matrix *)(c+map1[j]));
	    }
	    times[k] = second() - times[k];
	  }
	  anal2(times, (int)(N/18), MATMAT_FLOPS, "  Mapped:");
	  
	  if (opt_prefetch) {
	    /* SSE Mat-Mat nn Prefetched + Mapped */
	    for (k=0; k<NTIMES; k++) {
	      times[k] = second();
	      for (j=0; j < N/18; j++) {
		prefetch_matrix(a+map2[j+1]);
		prefetch_matrix(b+map2[j+1]);
		_inline_sse_mult_su3_nn((su3_matrix *)(a+map2[j]), (su3_matrix *)(b+map2[j]), (su3_matrix *)(c+map2[j]));
	      }
	      times[k] = second() - times[k];
	    }
	    anal2(times, (int)(N/18), MATMAT_FLOPS, "  PF+Mapped:");
	  } /* endif opt_prefetch */
	} /* endif opt_mapped */
      } /* endif opt_sse */
    } /* endif opt_matmat */
  } /* endif opt_matmat || ... */
  printf ("\n");
  return 0;
}
    
void anal(double *times, int bytes, char *label) {
  double maxtime = 0;
  double mintime = FLT_MAX;
  int k;
  double mean, std;

  for (k=0; k<NTIMES; k++) {
      mintime = MIN(mintime, times[k]);
      maxtime = MAX(maxtime, times[k]);
  }
  
  analstd(times, &mean, &std);
  printf("%s\t%6.1f +/-%5.1f\t\t%.4f +/- %.5f\t%.4f\t\t%.4f\n", label,
	 1.0E-06 * bytes/mintime,
	 1.0E-06 * bytes/mintime/mintime*std,
	 mean,
	 std,
	 mintime,
	 maxtime);
}

void anal2(double *times, int cnt, int flops, char *label) {
  double maxtime = 0;
  double mintime = FLT_MAX;
  int k;
  double mean, std;

  for (k=0; k<NTIMES; k++) {
      mintime = MIN(mintime, times[k]);
      maxtime = MAX(maxtime, times[k]);
  }
  
  analstd(times, &mean, &std);
  printf("%s\t%6.1f +/-%5.1f\t\t%.4f +/- %.5f\t%.4f\t\t%.4f\n", label,
	 1.0E-06 * cnt * flops/mean,
	 1.0E-06 * cnt * flops/mean/mean*std,
	 mean,
	 std,
	 mintime,
	 maxtime);
}

void analstd(double *times, double *mean, double *std) {
  int k, excluded=0;
  double meantot=0, stdtot=0;

  /*
   * Calculate mean, standard deviation
   */
  *mean = *std = 0;
  for (k=0; k<NTIMES; k++) meantot += times[k];
  *mean = meantot/NTIMES;
  for (k=0; k<NTIMES; k++) stdtot += (times[k] - *mean) * (times[k] - *mean);
  *std = sqrt(stdtot/(NTIMES-1));

  /*
   * Get rid of outliers (more than 1.75 std away from mean)
   */
  for (k=0; k<NTIMES; k++) {
    if (abs(times[k] - *mean) > (1.75 * *std)) {
      meantot -= times[k];
      excluded++;
    }
  }

  /*
   * If there are still enough data points, recalculate the mean and std.
   * Otherwise, return with original values.
   */
  if ((NTIMES - excluded) > 5) {
    *mean = meantot/NTIMES;
    stdtot = 0;
    for (k=0; k<NTIMES; k++) stdtot += (times[k] - *mean) * (times[k] - *mean);
    *std = sqrt(stdtot/(NTIMES-excluded-1));
  }
}