/*
 * QCDSTREAM
 * This benchmark is derived directly from John McCalpin's STREAM.
 * Differences:
 *  - no scale, add, or triad measurements
 *  - measurements of float and long double transfers
 *  - malloc'd instead of static arrays
 *  - timings of SU3 (complex) matrix-vector and matrix-matrix routines
 *    from the MILC lattice QCD codes
 *  - timings of inline SSE versions of matrix routines
 *  - investigations of data access patterns (in cache, sequential,
 *    strided, and mapped)
 *  - investigations of precaching
 *
 * Don Holmgren
 * Fermilab
 * djholm@fnal.gov
 */

/*
 * NOTE(review): the next eleven include directives have lost their header
 * names in this copy of the file (most likely "<...>" was stripped during
 * extraction).  They are left exactly as found; restore them from a pristine
 * copy rather than guessing.
 */
# include
# include
# include
# include
# include
# include
# include
# include
# include
# include
# include
# include "su3.h"
# include "inline_sse.h"
# include "prefetch.h"

/* Number of floats in each of the arrays a, b, c; also the length of map1/map2. */
# define N 4000000
/* Length of the times[] array declared in main(). */
# define NTIMES 10
/* Not referenced in the visible portion of the file. */
# define OFFSET 0
/* Extra bytes added to every malloc() below so the aligned pointer still has room for N floats. */
# define PAGE 4096
/* Alignment boundary (bytes) applied to a, b and c; must be a power of two for the mask in main(). */
# define ALIGN 128L
/* Presumably floating-point operation counts per kernel call -- not used in the visible code, TODO confirm. */
# define MATVEC_FLOPS 66
# define MATHWVEC_FLOPS 132
# define MATMAT_FLOPS 198
/* Presumably the element stride for the "strided" access tests -- not used in the visible code, TODO confirm. */
# define STRIDE 347
/* Classic min/max macros; each argument may be evaluated twice, so avoid side effects in arguments. */
# ifndef MIN
# define MIN(x,y) ((x)<(y)?(x):(y))
# endif
# ifndef MAX
# define MAX(x,y) ((x)>(y)?(x):(y))
# endif

/* Working arrays; main() points these at ALIGN-aligned addresses inside malloc'd buffers. */
static float *a, *b, *c;
/* Index tables of N entries each; not referenced in the visible portion (presumably used by the "mapped" tests). */
static unsigned int map1[N], map2[N];

/* Defined elsewhere; presumably returns a timestamp in seconds -- TODO confirm. */
extern double second();
/* Result-analysis helpers; their definitions are not (fully) visible in this copy. */
void anal(double *, int, char *);
void anal2(double *, int, int, char *);
void analstd(double *, double *, double *);

/*
 * Program entry point.
 *
 * Visible behaviour: parse the long command-line options into the opt_*
 * flags, allocate and align the three float arrays a, b, c, and fill a and b
 * with pseudo-random values.
 *
 * NOTE(review): the remainder of main() is missing from this copy.  The text
 * breaks off inside a for-loop header and resumes in the middle of what looks
 * like a different routine, so nothing after the initialisation loop can be
 * documented reliably.
 */
int main(int argc, char **argv)
{
    /* j is the loop index in the visible code; k, apt/bpt/cpt and times[] are not used before the text breaks off. */
    register int j, k;
    float *apt, *bpt, *cpt;
    double times[NTIMES];
    /* Raw malloc() result; overwritten by each subsequent allocation. */
    void *temp;
    /*
     * One flag per test group.  The initial values written here are always
     * overwritten: both branches of the "argc == 1" test below assign every
     * flag before it is read.
     */
    int opt_mem = 1, opt_sse = 1, opt_prefetch = 1;
    int opt_mapped = 1, opt_strided = 1, opt_incache = 1, opt_seq = 1;
    int opt_matvec = 0, opt_mathwvec = 0, opt_matmat = 0;
    int parsed_opt, option_index;
    /*
     * Long-option table for getopt_long().  Entries with a non-NULL flag
     * pointer make getopt_long() store 1 in that int and return 0; the last
     * three entries have no flag pointer and are returned as 'a', 'n' and 'm'
     * for the switch below.  The all-zero entry terminates the table.
     */
    struct option long_options[] = {
        {"mem", no_argument, &opt_mem, 1},
        {"sse", no_argument, &opt_sse, 1},
        {"prefetch", no_argument, &opt_prefetch, 1},
        {"mapped", no_argument, &opt_mapped, 1},
        {"strided", no_argument, &opt_strided, 1},
        {"incache", no_argument, &opt_incache, 1},
        {"seq", no_argument, &opt_seq, 1},
        {"matvec", no_argument, &opt_matvec, 1},
        {"mathwvec", no_argument, &opt_mathwvec, 1},
        {"matmat", no_argument, &opt_matmat, 1},
        {"all", 
no_argument, 0, 'a'},
        {"nosse", no_argument, 0, 'n'},
        {"allmat", no_argument, 0, 'm'},
        {0, 0, 0, 0}
    };

    if (argc == 1) {
        /* No arguments at all: behave as if "--all" had been given. */
        opt_mem = opt_sse = opt_prefetch = 1;
        opt_mapped = opt_strided = opt_incache = opt_seq = 1;
        opt_matvec = opt_mathwvec = opt_matmat = 1;
    } else {
        /* Arguments present: start with everything off and enable only what is requested. */
        opt_mem = opt_sse = opt_prefetch = 0;
        opt_mapped = opt_strided = opt_incache = opt_seq = 0;
        opt_matvec = opt_mathwvec = opt_matmat = 0;
        while (1) {
            /* Empty short-option string: only long options are recognised. */
            parsed_opt = getopt_long(argc, argv, "", long_options, &option_index);
            if (parsed_opt == -1)
                break;
            switch (parsed_opt) {
            case 0:
                /* Flag-style option: getopt_long() has already set the corresponding opt_* variable. */
                break;
            case 'a':
                /* "--all": enable every test group. */
                opt_mem = opt_sse = opt_prefetch = 1;
                opt_mapped = opt_strided = opt_incache = opt_seq = 1;
                opt_matvec = opt_mathwvec = opt_matmat = 1;
                break;
            case 'm':
                /* "--allmat": prefetch, all access patterns and all matrix kernels; opt_mem and opt_sse are not set here. */
                opt_prefetch = 1;
                opt_mapped = opt_strided = opt_incache = opt_seq = 1;
                opt_matvec = opt_mathwvec = opt_matmat = 1;
                break;
            case 'n':
                /* "--nosse": memory tests, all access patterns and all matrix kernels; opt_sse and opt_prefetch are not set here. */
                opt_mem = 1;
                opt_mapped = opt_strided = opt_incache = opt_seq = 1;
                opt_matvec = opt_mathwvec = opt_matmat = 1;
                break;
            default:
                /*
                 * Any other return value (e.g. an unrecognised option):
                 * print the usage text and stop.
                 * NOTE(review): this exits with status 0 even though it is
                 * reached on a bad option -- confirm whether a non-zero
                 * status is wanted.
                 */
                printf("Usage: %s with the following optional switches:\n", argv[0]);
                printf(" --mem Do memory tests\n");
                printf(" --sse Do SSE tests\n");
                printf(" --prefetch Do prefetch tests\n");
                printf(" --mapped Do mapped linear algebra timings\n");
                printf(" --strided Do strided linear algebra timings\n");
                printf(" --incache Do in-cache linear algebra timings\n");
                printf(" --seq Do sequential linear algebra timings\n");
                printf(" --matvec Do matrix-vector timings\n");
                printf(" --mathwvec Do matrix-half-wilson-vector timings\n");
                printf(" --matmat Do matrix-matrix timings\n");
                printf(" --all Do all measurements (memory + linear algebra)\n");
                printf(" --nosse Do all measurements, but no SSE (including prefetch)\n");
                printf(" --allmat Do all linear algebra measurements\n");
                exit(0);
            }
        }
        /* Report (but otherwise ignore) any non-option arguments left over after parsing. */
        if (optind < argc) {
            fprintf (stderr, "Extra arguments found: ");
            /*
             * NOTE(review): in this copy the format string below is split
             * across a physical line break; it is left untouched here.
             */
            while (optind < argc)
                fprintf (stderr, "%s 
", argv[optind++]);
            /* NOTE(review): the message above goes to stderr but this newline goes to stdout. */
            putchar ('\n');
        }
    }

    /*
     * Disabled code: would request SCHED_FIFO real-time scheduling at
     * priority 20 for this process.
     * NOTE(review): the third argument reads as a mis-encoded "&param" in
     * this copy; harmless while the block stays under "#if 0".
     */
#if 0
    struct sched_param param={sched_priority:20};
    if (sched_setscheduler(0, SCHED_FIFO, ¶m) < 0)
        perror("Attempt to put in real time queue");
#endif

    /* --- allocate arrays ---- */
    /*
     * Each array is allocated with PAGE spare bytes, then the pointer is
     * advanced by ALIGN and masked down to a multiple of ALIGN, giving an
     * ALIGN-aligned address inside the buffer.
     *
     * NOTE(review): the address is cast through "unsigned int"; on platforms
     * where pointers are wider than unsigned int this discards the upper
     * bits -- uintptr_t would be the portable type.  Also, temp is
     * overwritten by the next malloc(), so the original pointers are not
     * retained for a later free() in the visible code.
     */
    temp = malloc(N*sizeof(float) + PAGE);
    if (temp == NULL) {
        printf("Malloc failure\n");
        exit(1);
    }
    a = (float *)(((unsigned int)temp + ALIGN) & ~(ALIGN - 1L));
    temp = malloc(N*sizeof(float) + PAGE);
    if (temp == NULL) {
        printf("Malloc failure\n");
        exit(1);
    }
    b = (float *)(((unsigned int)temp + ALIGN) & ~(ALIGN - 1L));
    temp = malloc(N*sizeof(float) + PAGE);
    if (temp == NULL) {
        printf("Malloc failure\n");
        exit(1);
    }
    c = (float *)(((unsigned int)temp + ALIGN) & ~(ALIGN - 1L));

    /* --- initialize source arrays (a, b) --- */
    /* rand() is centred on RAND_MAX/2, normalised by RAND_MAX/2 and doubled, giving values in roughly [-2, 2]. */
    for (j=0; j < N; j++) {
        a[j] = (float)(rand() - RAND_MAX/2)/(float)(RAND_MAX/2)*2.0;
        b[j] = (float)(rand() - RAND_MAX/2)/(float)(RAND_MAX/2)*2.0;
    }

    /*
     * NOTE(review): the source text is truncated at this point.  The loop
     * header below is cut off, and everything that follows refers to names
     * (std, mean, meantot, stdtot, excluded) that are not declared in
     * main() -- presumably this is the interior of analstd(), TODO confirm
     * against a pristine copy.  What remains visible: samples exceeding
     * 1.75 * *std (the left-hand side of the comparison is lost) are
     * subtracted from meantot and counted in excluded.
     */
    for (j=0; j (1.75 * *std)) {
            meantot -= times[k];
            excluded++;
        }
    }
    /*
     * If more than five samples remain after the exclusions, recalculate the
     * mean and std.  Otherwise, return with the original values.
     *
     * NOTE(review): the recalculated mean still divides by NTIMES rather
     * than (NTIMES - excluded), although the excluded samples were removed
     * from meantot above -- confirm this is intended.
     */
    if ((NTIMES - excluded) > 5) {
        *mean = meantot/NTIMES;
        stdtot = 0;
        for (k=0; k
/* NOTE(review): the visible source ends here, in the middle of a for-loop header. */