Merge branch 'develop'

2025-04-23 21:39:10 -04:00 · 2023-12-31 11:26:48 -08:00 · 2023-12-31 11:26:48 -08:00 · ec2a35def9
commit ec2a35def9
parent 645ca62c91 031d24e695
65 changed files with 5705 additions and 18904 deletions
--- a/hw/rtl/VX_cluster.sv
+++ b/hw/rtl/VX_cluster.sv
@ -82,64 +82,9 @@ module VX_cluster import VX_gpu_pkg::*; #(
 `endif

    VX_mem_bus_if #(
-        .DATA_SIZE (L2_WORD_SIZE),
-        .TAG_WIDTH (L2_TAG_WIDTH)
-    ) l2_mem_bus_if[L2_NUM_REQS]();
-
-    VX_mem_bus_if #(
-        .DATA_SIZE (ICACHE_LINE_SIZE),
-        .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
-    ) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
-
-    VX_mem_bus_if #(
-        .DATA_SIZE (DCACHE_LINE_SIZE),
-        .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
-    ) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
-
-    VX_mem_bus_if #(
-        .DATA_SIZE (ICACHE_LINE_SIZE),
-        .TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
-    ) icache_mem_bus_if[1]();
-
-    VX_mem_bus_if #(
-        .DATA_SIZE (DCACHE_LINE_SIZE),
-        .TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
-    ) dcache_mem_bus_if[1]();
-
-    `RESET_RELAY (l1_mem_arb_reset, reset);
-
-    VX_mem_arb #(
-        .NUM_INPUTS  (`NUM_SOCKETS),
-        .DATA_SIZE   (ICACHE_LINE_SIZE),
-        .TAG_WIDTH   (ICACHE_MEM_TAG_WIDTH),
-        .TAG_SEL_IDX (1), // Skip 0 for NC flag
-        .ARBITER     ("R"),
-        .OUT_REG_REQ (2),
-        .OUT_REG_RSP (2)
-    ) icache_mem_arb (
-        .clk        (clk),
-        .reset      (l1_mem_arb_reset),
-        .bus_in_if  (per_socket_icache_mem_bus_if),
-        .bus_out_if (icache_mem_bus_if)
-    );
-
-    VX_mem_arb #(
-        .NUM_INPUTS  (`NUM_SOCKETS),
-        .DATA_SIZE   (DCACHE_LINE_SIZE),
-        .TAG_WIDTH   (DCACHE_MEM_TAG_WIDTH),
-        .TAG_SEL_IDX (1), // Skip 0 for NC flag
-        .ARBITER     ("R"),
-        .OUT_REG_REQ (2),
-        .OUT_REG_RSP (2)
-    ) dcache_mem_arb (
-        .clk        (clk),
-        .reset      (l1_mem_arb_reset),
-        .bus_in_if  (per_socket_dcache_mem_bus_if),
-        .bus_out_if (dcache_mem_bus_if)
-    );
-
-    `ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[ICACHE_MEM_ARB_IDX], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
-    `ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[DCACHE_MEM_ARB_IDX], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);    
+        .DATA_SIZE (`L1_LINE_SIZE),
+        .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
+    ) per_socket_mem_bus_if[`NUM_SOCKETS]();

    `RESET_RELAY (l2_reset, reset);

@ -155,7 +100,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
        .MSHR_SIZE      (`L2_MSHR_SIZE),
        .MRSQ_SIZE      (`L2_MRSQ_SIZE),
        .MREQ_SIZE      (`L2_MREQ_SIZE),
-        .TAG_WIDTH      (L1_MEM_TAG_WIDTH),
+        .TAG_WIDTH      (L2_TAG_WIDTH),
        .WRITE_ENABLE   (1),
        .UUID_WIDTH     (`UUID_WIDTH),  
        .CORE_OUT_REG   (2),
@ -168,7 +113,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
    `ifdef PERF_ENABLE
        .cache_perf     (mem_perf_tmp_if.l2cache),
    `endif
-        .core_bus_if    (l2_mem_bus_if),
+        .core_bus_if    (per_socket_mem_bus_if),
        .mem_bus_if     (mem_bus_if)
    );

@ -209,8 +154,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
            
            .dcr_bus_if     (socket_dcr_bus_if),

-            .icache_mem_bus_if (per_socket_icache_mem_bus_if[i]),
-            .dcache_mem_bus_if (per_socket_dcache_mem_bus_if[i]),
+            .mem_bus_if     (per_socket_mem_bus_if[i]),

        `ifdef GBAR_ENABLE
            .gbar_bus_if    (per_socket_gbar_bus_if[i]),
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -292,8 +292,8 @@
 // Floating-Point Units ///////////////////////////////////////////////////////

 // Size of FPU Request Queue
-`ifndef FPU_REQ_QUEUE_SIZE
-`define FPU_REQ_QUEUE_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
+`ifndef FPUQ_SIZE
+`define FPUQ_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
 `endif

 // FNCP Latency
--- a/hw/rtl/VX_gpu_pkg.sv
+++ b/hw/rtl/VX_gpu_pkg.sv
@ -141,10 +141,9 @@ package VX_gpu_pkg;

    /////////////////////////////// L1 Parameters /////////////////////////////

-    localparam ICACHE_MEM_ARB_TAG_WIDTH = (ICACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS));
-    localparam DCACHE_MEM_ARB_TAG_WIDTH = (DCACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS));
-    localparam L1_MEM_TAG_WIDTH         = `MAX(ICACHE_MEM_ARB_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
-
+    localparam L1_MEM_TAG_WIDTH     = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
+    localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
+    
    /////////////////////////////// L2 Parameters /////////////////////////////

    localparam ICACHE_MEM_ARB_IDX = 0;
@ -154,10 +153,10 @@ package VX_gpu_pkg;
    localparam L2_WORD_SIZE	        = `L1_LINE_SIZE;

    // Input request size
-    localparam L2_NUM_REQS	        = 2;
+    localparam L2_NUM_REQS	        = `NUM_SOCKETS;

    // Core request tag bits
-    localparam L2_TAG_WIDTH	        = L1_MEM_TAG_WIDTH;
+    localparam L2_TAG_WIDTH	        = L1_MEM_ARB_TAG_WIDTH;

    // Memory request data bits
    localparam L2_MEM_DATA_WIDTH	= (`L2_LINE_SIZE * 8);
--- a/hw/rtl/VX_socket.sv
+++ b/hw/rtl/VX_socket.sv
@ -30,8 +30,7 @@ module VX_socket import VX_gpu_pkg::*; #(
    VX_dcr_bus_if.slave     dcr_bus_if,

    // Memory
-    VX_mem_bus_if.master    icache_mem_bus_if,
-    VX_mem_bus_if.master    dcache_mem_bus_if,
+    VX_mem_bus_if.master    mem_bus_if,

 `ifdef GBAR_ENABLE
    // Barrier
@ -79,6 +78,11 @@ module VX_socket import VX_gpu_pkg::*; #(
        .TAG_WIDTH (ICACHE_TAG_WIDTH)
    ) per_core_icache_bus_if[`SOCKET_SIZE]();

+    VX_mem_bus_if #(
+        .DATA_SIZE (ICACHE_LINE_SIZE),
+        .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
+    ) icache_mem_bus_if();
+
    `RESET_RELAY (icache_reset, reset);

    VX_cache_cluster #(
@ -117,6 +121,11 @@ module VX_socket import VX_gpu_pkg::*; #(
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
    ) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
+    
+    VX_mem_bus_if #(
+        .DATA_SIZE (DCACHE_LINE_SIZE),
+        .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
+    ) dcache_mem_bus_if();

    `RESET_RELAY (dcache_reset, reset);

@ -151,6 +160,40 @@ module VX_socket import VX_gpu_pkg::*; #(
        .mem_bus_if     (dcache_mem_bus_if)
    );

+    ///////////////////////////////////////////////////////////////////////////  
+
+    VX_mem_bus_if #(
+        .DATA_SIZE (`L1_LINE_SIZE),
+        .TAG_WIDTH (L1_MEM_TAG_WIDTH)
+    ) l1_mem_bus_if[2]();
+
+    VX_mem_bus_if #(
+        .DATA_SIZE (`L1_LINE_SIZE),
+        .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
+    ) l1_mem_arb_bus_if[1]();
+
+    `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
+    `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
+
+    `RESET_RELAY (mem_arb_reset, reset);
+
+    VX_mem_arb #(
+        .NUM_INPUTS   (2),
+        .DATA_SIZE    (`L1_LINE_SIZE),
+        .TAG_WIDTH    (L1_MEM_TAG_WIDTH),
+        .TAG_SEL_IDX  (1), // Skip 0 for NC flag
+        .ARBITER      ("R"),
+        .OUT_REG_REQ  (2),
+        .OUT_REG_RSP  (2)
+    ) mem_arb (
+        .clk        (clk),
+        .reset      (mem_arb_reset),
+        .bus_in_if  (l1_mem_bus_if),
+        .bus_out_if (l1_mem_arb_bus_if)
+    );
+
+    `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
+
    ///////////////////////////////////////////////////////////////////////////

    wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
--- a/hw/rtl/core/VX_fpu_unit.sv
+++ b/hw/rtl/core/VX_fpu_unit.sv
@ -30,7 +30,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
    localparam NUM_LANES  = `NUM_FPU_LANES;
    localparam PID_BITS   = `CLOG2(`NUM_THREADS / NUM_LANES);
    localparam PID_WIDTH  = `UP(PID_BITS);
-    localparam TAG_WIDTH  = `LOG2UP(`FPU_REQ_QUEUE_SIZE);
+    localparam TAG_WIDTH  = `LOG2UP(`FPUQ_SIZE);
    localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);

    VX_execute_if #(
@ -87,7 +87,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(

        VX_index_buffer #(
            .DATAW  (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
-            .SIZE   (`FPU_REQ_QUEUE_SIZE)
+            .SIZE   (`FPUQ_SIZE)
        ) tag_store (
            .clk          (clk),
            .reset        (reset),
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@ -15,10 +15,6 @@ all:
 	$(MAKE) -C blackscholes
 	$(MAKE) -C transpose
 	$(MAKE) -C convolution
-#	$(MAKE) -C cutcp
-#	$(MAKE) -C sgemm2
-#	$(MAKE) -C vectorhypot
-#	$(MAKE) -C mri-q run-simx

 run-simx:
 	$(MAKE) -C vecadd run-simx
@ -37,10 +33,6 @@ run-simx:
 	$(MAKE) -C blackscholes run-simx
 	$(MAKE) -C transpose run-simx
 	$(MAKE) -C convolution run-simx
-#	$(MAKE) -C cutcp run-simx
-#	$(MAKE) -C sgemm2 run-simx
-#	$(MAKE) -C vectorhypot run-simx
-#	$(MAKE) -C mri-q run-simx

 run-rtlsim:
 	$(MAKE) -C vecadd run-rtlsim
@ -59,10 +51,6 @@ run-rtlsim:
 	$(MAKE) -C oclprintf run-rtlsim
 	$(MAKE) -C blackscholes run-rtlsim
 	$(MAKE) -C convolution run-rtlsim
-#	$(MAKE) -C cutcp run-rtlsim
-#	$(MAKE) -C sgemm2 run-rtlsim
-#	$(MAKE) -C vectorhypot run-rtlsim
-#	$(MAKE) -C mri-q run-rtlsim

 run-opae:
 	$(MAKE) -C vecadd run-opae
@ -81,10 +69,6 @@ run-opae:
 	$(MAKE) -C oclprintf run-opae
 	$(MAKE) -C blackscholes run-opae
 	$(MAKE) -C convolution run-opae
-#	$(MAKE) -C cutcp run-opae
-#	$(MAKE) -C sgemm2 run-opae
-#	$(MAKE) -C vectorhypot run-opae
-#	$(MAKE) -C mri-q run-opae

 clean:
 	$(MAKE) -C vecadd clean
@ -103,10 +87,6 @@ clean:
 	$(MAKE) -C oclprintf clean
 	$(MAKE) -C blackscholes clean
 	$(MAKE) -C convolution clean
-#	$(MAKE) -C cutcp clean
-#	$(MAKE) -C sgemm2 clean
-#	$(MAKE) -C vectorhypot clean
-#	$(MAKE) -C mri-q clean

 clean-all:
 	$(MAKE) -C vecadd clean-all
@ -124,8 +104,4 @@ clean-all:
 	$(MAKE) -C lbm clean-all
 	$(MAKE) -C oclprintf clean-all
 	$(MAKE) -C blackscholes clean-all
-	$(MAKE) -C convolution clean-all
-#	$(MAKE) -C cutcp clean-all
-#	$(MAKE) -C sgemm2 clean-all
-#	$(MAKE) -C vectorhypot clean-all
-#	$(MAKE) -C mri-q clean-all
+	$(MAKE) -C convolution clean-all
--- a/tests/opencl/cutcp/Makefile
+++ b/tests/opencl/cutcp/Makefile
@ -1,9 +0,0 @@
-PROJECT = cutcp
-
-SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
-
-CXXFLAGS += -I.
-
-OPTS ?= 
-
-include ../common.mk
--- a/tests/opencl/cutcp/args.c
+++ b/tests/opencl/cutcp/args.c
@ -1,617 +0,0 @@
-
-#include <parboil.h>
-#include <errno.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
-/*****************************************************************************/
-/* Memory management routines */
-
-/* Free an array of owned strings. */
-void
-pb_FreeStringArray(char **string_array)
-{
-  char **p;
-
-  if (!string_array) return;
-  for (p = string_array; *p; p++) free(*p);
-  free(string_array);
-}
-
-struct pb_PlatformParam *
-pb_PlatformParam(char *name, char *version)
-{
-  if (name == NULL) {
-    fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
-    exit(-1);
-  }
-
-  struct pb_PlatformParam *ret =
-    (struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
-
-  ret->name = name;
-  ret->version = version;
-  return ret;
-}
-
-void
-pb_FreePlatformParam(struct pb_PlatformParam *p)
-{
-  if (p == NULL) return;
-
-  free(p->name);
-  free(p->version);
-  free(p);
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_index(int index)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_INDEX;
-  ret->index = index;
-  return ret;
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_cpu(void)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_CPU;
-  return ret;
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_gpu(void)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_GPU;
-  return ret;
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_accelerator(void)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_ACCELERATOR;
-  return ret;
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_name(char *name)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_NAME;
-  ret->name = name;
-  return ret;
-}
-
-void
-pb_FreeDeviceParam(struct pb_DeviceParam *p)
-{
-  if (p == NULL) return;
-
-  switch(p->criterion) {
-  case pb_Device_NAME:
-    free(p->name);
-    break;
-  case pb_Device_INDEX:
-  case pb_Device_CPU:
-  case pb_Device_ACCELERATOR:
-    break;
-  default:
-    fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
-    exit(-1);
-  }
-}
-
-void
-pb_FreeParameters(struct pb_Parameters *p)
-{
-  free(p->outFile);
-  pb_FreeStringArray(p->inpFiles);
-  pb_FreePlatformParam(p->platform);
-  pb_FreeDeviceParam(p->device);
-  free(p);
-}
-
-/*****************************************************************************/
-
-/* Parse a comma-delimited list of strings into an
- * array of strings. */
-static char ** 
-read_string_array(char *in)
-{
-  char **ret;
-  int i;
-  int count;			/* Number of items in the input */
-  char *substring;		/* Current substring within 'in' */
-
-  /* Count the number of items in the string */
-  count = 1;
-  for (i = 0; in[i]; i++) if (in[i] == ',') count++;
-
-  /* Allocate storage */
-  ret = (char **)malloc((count + 1) * sizeof(char *));
-
-  /* Create copies of the strings from the list */
-  substring = in;
-  for (i = 0; i < count; i++) {
-    char *substring_end;
-    int substring_length;
-
-    /* Find length of substring */
-    for (substring_end = substring;
-	 (*substring_end != ',') && (*substring_end != 0);
-	 substring_end++);
-
-    substring_length = substring_end - substring;
-
-    /* Allocate memory and copy the substring */
-    ret[i] = (char *)malloc(substring_length + 1);
-    memcpy(ret[i], substring, substring_length);
-    ret[i][substring_length] = 0;
-
-    /* go to next substring */
-    substring = substring_end + 1;
-  }
-  ret[i] = NULL;		/* Write the sentinel value */
-
-  return ret;
-}
-
-static void
-report_parse_error(const char *str)
-{
-  fputs(str, stderr);
-}
-
-/* Interpret a string as a 'pb_DeviceParam' value.
- * Return a pointer to a new value, or NULL on failure.
- */
-static struct pb_DeviceParam *
-read_device_param(char *str)
-{  
-  /* Try different ways of interpreting 'device_string' until one works */
-
-  /* If argument is an integer, then interpret it as a device index */
-  errno = 0;
-  char *end;
-  long device_int = strtol(str, &end, 10);
-  if (!errno) {
-    /* Negative numbers are not valid */
-    if (device_int < 0 || device_int > INT_MAX) return NULL;
-
-    return pb_DeviceParam_index(device_int);
-  }
-
-  /* Match against predefined strings */
-  if (strcmp(str, "CPU") == 0)
-    return pb_DeviceParam_cpu();
-  if (strcmp(str, "GPU") == 0)
-    return pb_DeviceParam_gpu();
-  if (strcmp(str, "ACCELERATOR") == 0)
-    return pb_DeviceParam_accelerator();
-
-  /* Assume any other string is a device name */
-  return pb_DeviceParam_name(strdup(str));
-}
-
-/* Interpret a string as a 'pb_PlatformParam' value.
- * Return a pointer to a new value, or NULL on failure.
- */
-static struct pb_PlatformParam *
-read_platform_param(char *str)
-{
-  int separator_index;          /* Index of the '-' character separating
-                                 * name and version number.  It's -1 if
-                                 * there's no '-' character. */
-
-  /* Find the last occurrence of '-' in 'str' */
-  {
-    char *cur;
-    separator_index = -1;
-    for (cur = str; *cur; cur++) {
-      if (*cur == '-') separator_index = cur - str;
-    }
-  }
-
-  /* The platform name is either the entire string, or all characters before
-   * the separator */
-  int name_length = separator_index == -1 ? strlen(str) : separator_index;
-  char *name_str = (char *)malloc(name_length + 1);
-  memcpy(name_str, str, name_length);
-  name_str[name_length] = 0;
-
-  /* The version is either NULL, or all characters after the separator */
-  char *version_str;
-  if (separator_index == -1) {
-    version_str = NULL;
-  }
-  else {
-    const char *version_input_str = str + separator_index + 1;
-    int version_length = strlen(version_input_str);
-
-    version_str = (char *)malloc(version_length + 1);
-    memcpy(version_str, version_input_str, version_length);
-    version_str[version_length] = 0;
-  }
-
-  /* Create output structure */
-  return pb_PlatformParam(name_str, version_str);
-}
-
-/****************************************************************************/
-/* Argument parsing state */
-
-/* Argument parsing state.
- *
- * Arguments that are interpreted by the argument parser are removed from
- * the list.  Variables 'argc' and 'argn' do not count arguments that have
- * been removed.
- *
- * During argument parsing, the array of arguments is compacted, overwriting
- * the erased arguments.  Variable 'argv_put' points to the array element
- * where the next argument will be written.  Variable 'argv_get' points to
- * the array element where the next argument will be read from.
- */
-struct argparse {
-  int argc;			/* Number of arguments.  Mutable. */
-  int argn;			/* Current argument index. */
-  char **argv_get;		/* Argument value being read. */
-  char **argv_put;		/* Argument value being written.
-				 * argv_put <= argv_get. */
-};
-
-static void
-initialize_argparse(struct argparse *ap, int argc, char **argv)
-{
-  ap->argc = argc;
-  ap->argn = 0;
-  ap->argv_get = ap->argv_put = argv;
-}
-
-/* Finish argument parsing, without processing the remaining arguments.
- * Write new argument count into _argc. */
-static void
-finalize_argparse(struct argparse *ap, int *_argc, char **argv)
-{
-  /* Move the remaining arguments */
-  for(; ap->argn < ap->argc; ap->argn++)
-    *ap->argv_put++ = *ap->argv_get++;
-
-  /* Update the argument count */
-  *_argc = ap->argc;
-
-  /* Insert a terminating NULL */
-  argv[ap->argc] = NULL;
-}
-
-/* Delete the current argument.  The argument will not be visible
- * when argument parsing is done. */
-static void
-delete_argument(struct argparse *ap)
-{
-  if (ap->argn >= ap->argc) {
-    fprintf(stderr, "delete_argument\n");
-  }
-  ap->argc--;
-  ap->argv_get++;
-}
-
-/* Go to the next argument.  Also, move the current argument to its
- * final location in argv. */
-static void
-next_argument(struct argparse *ap)
-{
-  if (ap->argn >= ap->argc) {
-    fprintf(stderr, "next_argument\n");
-  }
-  /* Move argument to its new location. */
-  *ap->argv_put++ = *ap->argv_get++;
-  ap->argn++;
-}
-
-static int
-is_end_of_arguments(struct argparse *ap)
-{
-  return ap->argn == ap->argc;
-}
-
-/* Get the current argument */
-static char *
-get_argument(struct argparse *ap)
-{
-  return *ap->argv_get;
-}
-
-/* Get the current argument, and also delete it */
-static char *
-consume_argument(struct argparse *ap)
-{
-  char *ret = get_argument(ap);
-  delete_argument(ap);
-  return ret;
-}
-
-/****************************************************************************/
-
-/* The result of parsing a command-line argument */
-typedef enum {
-  ARGPARSE_OK,                  /* Success */
-  ARGPARSE_ERROR,               /* Error */
-  ARGPARSE_DONE                 /* Success, and do not continue parsing */
-} result;
-
-typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
-
-
-/* A command-line option */
-struct option {
-  char short_name;              /* If not 0, the one-character
-                                 * name of this option */
-  const char *long_name;        /* If not NULL, the long name of this option */
-  parse_action *action;       /* What to do when this option occurs.
-                               * Sentinel value is NULL.
-                               */
-};
-
-/* Output file
- *
- * -o FILE
- */
-static result
-parse_output_file(struct argparse *ap, struct pb_Parameters *params)
-{
-  if (is_end_of_arguments(ap))
-    {
-      report_parse_error("Expecting file name after '-o'\n");
-      return ARGPARSE_ERROR;
-    }
-
-  /* Replace the output file name */
-  free(params->outFile);
-  params->outFile = strdup(consume_argument(ap));
-
-  return ARGPARSE_OK;
-}
-
-/* Input files
- *
- * -i FILE,FILE,...
- */
-static result
-parse_input_files(struct argparse *ap, struct pb_Parameters *params)
-{
-  if (is_end_of_arguments(ap))
-    {
-      report_parse_error("Expecting file name after '-i'\n");
-      return ARGPARSE_ERROR;
-    }
-
-  /* Replace the input file list */
-  pb_FreeStringArray(params->inpFiles);
-  params->inpFiles = read_string_array(consume_argument(ap));
-  return ARGPARSE_OK;
-}
-
-/* End of options
- *
- * --
- */
-
-static result
-parse_end_options(struct argparse *ap, struct pb_Parameters *params)
-{
-  return ARGPARSE_DONE;
-}
-
-/* OpenCL device
- *
- * --device X
- */
-
-static result
-parse_device(struct argparse *ap, struct pb_Parameters *params)
-{
-  /* Read the next argument, which specifies a device */
-
-  if (is_end_of_arguments(ap))
-    {
-      report_parse_error("Expecting device specification after '--device'\n");
-      return ARGPARSE_ERROR;
-    }
-
-  char *device_string = consume_argument(ap);
-  struct pb_DeviceParam *device_param = read_device_param(device_string);
-
-  if (!device_param) {
-    report_parse_error("Unrecognized device specification format on command line\n");
-    return ARGPARSE_ERROR;
-  }
-
-  /* Save the result */
-  pb_FreeDeviceParam(params->device);
-  params->device = device_param;
-
-  return ARGPARSE_OK;
-}
-
-static result
-parse_platform(struct argparse *ap, struct pb_Parameters *params)
-{
-  /* Read the next argument, which specifies a platform */
-
-  if (is_end_of_arguments(ap))
-    {
-      report_parse_error("Expecting device specification after '--platform'\n");
-      return ARGPARSE_ERROR;
-    }
-
-  char *platform_string = consume_argument(ap);
-  struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
-
-  if (!platform_param) {
-    report_parse_error("Unrecognized platform specification format on command line\n");
-    return ARGPARSE_ERROR;
-  }
-
-  /* Save the result */
-  pb_FreePlatformParam(params->platform);
-  params->platform = platform_param;
-
-  return ARGPARSE_OK;
-}
-  
-
-static struct option options[] = {
-  { 'o', NULL, &parse_output_file },
-  { 'i', NULL, &parse_input_files },
-  { '-', NULL, &parse_end_options },
-  { 0, "device", &parse_device },
-  { 0, "platform", &parse_platform },
-  { 0, NULL, NULL }
-};
-
-static int
-is_last_option(struct option *op)
-{
-  return op->action == NULL;
-}
-
-/****************************************************************************/
-
-/* Parse command-line parameters.
- * Return zero on error, nonzero otherwise.
- * On error, the other outputs may be invalid.
- *
- * The information collected from parameters is used to update
- * 'ret'.  'ret' should be initialized.
- *
- * '_argc' and 'argv' are updated to contain only the unprocessed arguments.
- */
-static int
-pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
-{
-  char *err_message;
-  struct argparse ap;
-
-  /* Each argument */
-  initialize_argparse(&ap, *_argc, argv);
-  while(!is_end_of_arguments(&ap)) {
-    result arg_result;          /* Result of parsing this option */
-    char *arg = get_argument(&ap);
-
-    /* Process this argument */
-    if (arg[0] == '-') {
-      /* Single-character flag */
-      if ((arg[1] != 0) && (arg[2] == 0)) {
-        delete_argument(&ap);	/* This argument is consumed here */
-
-        /* Find a matching short option */
-        struct option *op;
-        for (op = options; !is_last_option(op); op++) {
-          if (op->short_name == arg[1]) {
-            arg_result = (*op->action)(&ap, ret);
-            goto option_was_processed;
-          }
-        }
-
-        /* No option matches */
-        report_parse_error("Unexpected command-line parameter\n");
-        arg_result = ARGPARSE_ERROR;
-        goto option_was_processed;
-      }
-
-      /* Long flag */
-      if (arg[1] == '-') {
-        delete_argument(&ap);	/* This argument is consumed here */
-
-        /* Find a matching long option */
-        struct option *op;
-        for (op = options; !is_last_option(op); op++) {
-          if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
-            arg_result = (*op->action)(&ap, ret);
-            goto option_was_processed;
-          }
-        }
-
-        /* No option matches */
-        report_parse_error("Unexpected command-line parameter\n");
-        arg_result = ARGPARSE_ERROR;
-        goto option_was_processed;
-      }
-    }
-    else {
-      /* Other arguments are ignored */
-      next_argument(&ap);
-      arg_result = ARGPARSE_OK;
-      goto option_was_processed;
-    }
-
-  option_was_processed:
-    /* Decide what to do next based on 'arg_result' */
-    switch(arg_result) {
-    case ARGPARSE_OK:
-      /* Continue processing */
-      break;
-
-    case ARGPARSE_ERROR:
-      /* Error exit from the function */
-      return 0;
-
-    case ARGPARSE_DONE:
-      /* Normal exit from the argument parsing loop */
-      goto end_of_options;
-    }
-  } /* end for each argument */
-
-  /* If all arguments were processed, then normal exit from the loop */
-
- end_of_options:
-  finalize_argparse(&ap, _argc, argv);
-  return 1;
-}
-
-/*****************************************************************************/
-/* Other exported functions */
-
-struct pb_Parameters *
-pb_ReadParameters(int *_argc, char **argv)
-{
-  struct pb_Parameters *ret =
-    (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
-
-  /* Initialize the parameters structure */
-  ret->outFile = NULL;
-  ret->inpFiles = (char **)malloc(sizeof(char *));
-  ret->inpFiles[0] = NULL;
-  ret->platform = NULL;
-  ret->device = NULL;
-
-  /* Read parameters and update _argc, argv */
-  if (!pb_ParseParameters(ret, _argc, argv)) {
-    /* Parse error */
-    pb_FreeParameters(ret);
-    return NULL;
-  }
-
-  return ret;
-}
-
-int
-pb_Parameters_CountInputs(struct pb_Parameters *p)
-{
-  int n;
-
-  for (n = 0; p->inpFiles[n]; n++);
-  return n;
-}
-
--- a/tests/opencl/cutcp/atom.h
+++ b/tests/opencl/cutcp/atom.h
@ -1,37 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2008-2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef ATOM_H
-#define ATOM_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-  typedef struct Atom_t {
-    float x, y, z, q;
-  } Atom;
-
-  typedef struct Atoms_t {
-    Atom *atoms;
-    int size;
-  } Atoms;
-
-  typedef struct Vec3_t {
-    float x, y, z;
-  } Vec3;
-
-  Atoms *read_atom_file(const char *fname);
-  void free_atom(Atoms *atom);
-  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* ATOM_H */
--- a/tests/opencl/cutcp/cutcpu.c
+++ b/tests/opencl/cutcp/cutcpu.c
@ -1,195 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2008-2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <parboil.h>
-#include "atom.h"
-#include "cutoff.h"
-
-#undef DEBUG_PASS_RATE
-#define CHECK_CYLINDER_CPU
-
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
-
-extern int cpu_compute_cutoff_potential_lattice(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
-  int nx = lattice->dim.nx;
-  int ny = lattice->dim.ny;
-  int nz = lattice->dim.nz;
-  float xlo = lattice->dim.lo.x;
-  float ylo = lattice->dim.lo.y;
-  float zlo = lattice->dim.lo.z;
-  float gridspacing = lattice->dim.h;
-  int natoms = atoms->size;
-  Atom *atom = atoms->atoms;
-
-  const float a2 = cutoff * cutoff;
-  const float inv_a2 = 1.f / a2;
-  float s;
-  const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
-
-  int n;
-  int i, j, k;
-  int ia, ib, ic;
-  int ja, jb, jc;
-  int ka, kb, kc;
-  int index;
-  int koff, jkoff;
-
-  float x, y, z, q;
-  float dx, dy, dz;
-  float dz2, dydz2, r2;
-  float e;
-  float xstart, ystart;
-
-  float *pg;
-
-  int gindex;
-  int ncell, nxcell, nycell, nzcell;
-  int *first, *next;
-  float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;		/* Extent of atom bounding box */
-  float xmin, ymin, zmin;
-  float xmax, ymax, zmax;
-
-#if DEBUG_PASS_RATE
-  unsigned long long pass_count = 0;
-  unsigned long long fail_count = 0;
-#endif
-
-  /* find min and max extent */
-  get_atom_extent(&minext, &maxext, atoms);
-
-  /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
-  ncell = nxcell * nycell * nzcell;
-
-  /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    first[gindex] = -1;
-  }
-  next = (int *) malloc(natoms * sizeof(int));
-  for (n = 0;  n < natoms;  n++) {
-    next[n] = -1;
-  }
-
-  /* geometric hashing */
-  for (n = 0;  n < natoms;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
-    next[n] = first[gindex];
-    first[gindex] = n;
-  }
-
-  /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
-      x = atom[n].x - xlo;
-      y = atom[n].y - ylo;
-      z = atom[n].z - zlo;
-      q = atom[n].q;
-
-      /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
-
-      /* find extent of surrounding box of grid points */
-      ia = ic - radius;
-      ib = ic + radius + 1;
-      ja = jc - radius;
-      jb = jc + radius + 1;
-      ka = kc - radius;
-      kb = kc + radius + 1;
-
-      /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
-
-      /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
-        dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
-#ifdef CHECK_CYLINDER_CPU
-          if (dydz2 >= a2) continue;
-#endif
-
-          dx = xstart;
-          index = jkoff + ia;
-          pg = lattice->lattice + index;
-
-#if defined(__INTEL_COMPILER)
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s;
-            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
-          }
-#else
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-            if (r2 >= a2)
-		{
-#ifdef DEBUG_PASS_RATE
-		  fail_count++;
-#endif
-		  continue;
-		}
-#ifdef DEBUG_PASS_RATE
-	    pass_count++;
-#endif
-            s = (1.f - r2 * inv_a2);
-            e = q * (1/sqrtf(r2)) * s * s;
-            *pg += e;
-          }
-#endif
-        }
-      } /* end loop over surrounding grid points */
-
-    } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
-
-  /* free memory */
-  free(next);
-  free(first);
-
-  /* For debugging: print the number of times that the test passed/failed */
-#ifdef DEBUG_PASS_RATE
-  printf ("Pass :%lld\n", pass_count);
-  printf ("Fail :%lld\n", fail_count);
-#endif
-
-  return 0;
-}
--- a/tests/opencl/cutcp/cutoff.c
+++ b/tests/opencl/cutcp/cutoff.c
@ -1,508 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2008-2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <CL/cl.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <parboil.h>
-
-#include "atom.h"
-#include "cutoff.h"
-#include "macros.h"
-#include "ocl.h"
-
-static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
-  if (NULL == filename || NULL == data || 0 == size)
-    return CL_INVALID_VALUE;
-
-  FILE* fp = fopen(filename, "r");
-  if (NULL == fp) {
-    fprintf(stderr, "Failed to load kernel.");
-    return CL_INVALID_VALUE;
-  }
-  fseek(fp , 0 , SEEK_END);
-  long fsize = ftell(fp);
-  rewind(fp);
-
-  *data = (uint8_t*)malloc(fsize);
-  *size = fread(*data, 1, fsize, fp);
-  
-  fclose(fp);
-  
-  return CL_SUCCESS;
-}
-
-// OpenCL 1.1 support for int3 is not uniform on all implementations, so
-// we use int4 instead.  Only the 'x', 'y', and 'z' fields of xyz are used.
-typedef cl_int4 xyz;
-
-//extern "C" int gpu_compute_cutoff_potential_lattice(
-int gpu_compute_cutoff_potential_lattice(
-    struct pb_TimerSet *timers,
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* cutoff distance */
-    Atoms *atoms,                      /* array of atoms */
-    int verbose,                        /* print info/debug messages */
-    struct pb_Parameters *parameters
-    )
-{
-  int nx = lattice->dim.nx;
-  int ny = lattice->dim.ny;
-  int nz = lattice->dim.nz;
-  float xlo = lattice->dim.lo.x;
-  float ylo = lattice->dim.lo.y;
-  float zlo = lattice->dim.lo.z;
-  float h = lattice->dim.h;
-  int natoms = atoms->size;
-  Atom *atom = atoms->atoms;
-
-  xyz nbrlist[NBRLIST_MAXLEN];
-  int nbrlistlen = 0;
-
-  int binHistoFull[BIN_DEPTH+1] = { 0 };   /* clear every array element */
-  int binHistoCover[BIN_DEPTH+1] = { 0 };  /* clear every array element */
-  int num_excluded = 0;
-
-  int xRegionDim, yRegionDim, zRegionDim;
-  int xRegionIndex, yRegionIndex, zRegionIndex;
-  int xOffset, yOffset, zOffset;
-  int lnx, lny, lnz, lnall;
-  float *regionZeroAddr, *thisRegion;
-  cl_mem regionZeroCl;
-  int index, indexRegion;
-
-  int c;
-  xyz binDim;
-  int nbins;
-  cl_float4 *binBaseAddr, *binZeroAddr;
-  cl_mem binBaseCl, binZeroCl;
-  int *bincntBaseAddr, *bincntZeroAddr;
-  Atoms *extra = NULL;
-
-  cl_mem NbrListLen;
-  cl_mem NbrList;
-
-  int i, j, k, n;
-  int sum, total;
-
-  float avgFillFull, avgFillCover;
-  const float cutoff2 = cutoff * cutoff;
-  const float inv_cutoff2 = 1.f / cutoff2;
-
-  size_t gridDim[3], blockDim[3];
-
-  // The "compute" timer should be active upon entry to this function
-
-  /* pad lattice to be factor of 8 in each dimension */
-  xRegionDim = (int) ceilf(nx/8.f);
-  yRegionDim = (int) ceilf(ny/8.f);
-  zRegionDim = (int) ceilf(nz/8.f);
-
-  lnx = 8 * xRegionDim;
-  lny = 8 * yRegionDim;
-  lnz = 8 * zRegionDim;
-  lnall = lnx * lny * lnz;
-
-  /* will receive energies from OpenCL */
-  regionZeroAddr = (float *) malloc(lnall * sizeof(float));
-
-  /* create bins */
-  c = (int) ceil(cutoff * BIN_INVLEN);  /* count extra bins around lattice */
-  binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
-  binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
-  binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
-  nbins = binDim.x * binDim.y * binDim.z;
-  binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
-  binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
-
-  bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
-  bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
-
-  /* create neighbor list */
-  if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
-    float s = sqrtf(3);
-    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
-    int cnt = 0;
-    /* develop neighbor list around 1 cell */
-    if (2*c + 1 > NBRLIST_DIM) {
-      fprintf(stderr, "must have cutoff <= %f\n",
-          (NBRLIST_DIM-1)/2 * BIN_LENGTH);
-      return -1;
-    }
-    for (k = -c;  k <= c;  k++) {
-      for (j = -c;  j <= c;  j++) {
-        for (i = -c;  i <= c;  i++) {
-          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
-          nbrlist[cnt].x = i;
-          nbrlist[cnt].y = j;
-          nbrlist[cnt].z = k;
-          cnt++;
-        }
-      }
-    }
-    nbrlistlen = cnt;
-  }
-  else if (8*h <= 2*BIN_LENGTH) {
-    float s = 2.f*sqrtf(3);
-    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
-    int cnt = 0;
-    /* develop neighbor list around 3-cube of cells */
-    if (2*c + 3 > NBRLIST_DIM) {
-      fprintf(stderr, "must have cutoff <= %f\n",
-          (NBRLIST_DIM-3)/2 * BIN_LENGTH);
-      return -1;
-    }
-    for (k = -c;  k <= c;  k++) {
-      for (j = -c;  j <= c;  j++) {
-        for (i = -c;  i <= c;  i++) {
-          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
-          nbrlist[cnt].x = i;
-          nbrlist[cnt].y = j;
-          nbrlist[cnt].z = k;
-          cnt++;
-        }
-      }
-    }
-    nbrlistlen = cnt;
-  }
-  else {
-    fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
-    return -1;
-  }
-
-  /* perform geometric hashing of atoms into bins */
-  {
-    /* array of extra atoms, permit average of one extra per bin */
-    Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
-    int extra_len = 0;
-    
-    for (n = 0;  n < natoms;  n++) {
-      cl_float4 p;
-      p.x = atom[n].x - xlo;
-      p.y = atom[n].y - ylo;
-      p.z = atom[n].z - zlo;
-      p.w = atom[n].q;
-      i = (int) floorf(p.x * BIN_INVLEN);
-      j = (int) floorf(p.y * BIN_INVLEN);
-      k = (int) floorf(p.z * BIN_INVLEN);
-      if (i >= -c && i < binDim.x - c &&
-	  j >= -c && j < binDim.y - c &&
-	  k >= -c && k < binDim.z - c &&
-	  atom[n].q != 0) {
-	int index = (k * binDim.y + j) * binDim.x + i;
-	cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
-	int bindex = bincntZeroAddr[index];
-	if (bindex < BIN_DEPTH) {
-	  /* copy atom into bin and increase counter for this bin */
-	  bin[bindex] = p;
-	  bincntZeroAddr[index]++;
-	}
-	else {
-	  /* add index to array of extra atoms to be computed with CPU */
-	  if (extra_len >= nbins) {
-	    fprintf(stderr, "exceeded space for storing extra atoms\n");
-	    return -1;
-	  }
-	  extra_atoms[extra_len] = atom[n];
-	  extra_len++;
-	}
-      }
-      else {
-	/* excluded atoms are either outside bins or neutrally charged */
-	num_excluded++;
-      }
-    }
-
-    /* Save result */
-    extra = (Atoms *)malloc(sizeof(Atoms));
-    extra->atoms = extra_atoms;
-    extra->size = extra_len;
-  }
-
-  /* bin stats */
-  sum = total = 0;
-  for (n = 0;  n < nbins;  n++) {
-    binHistoFull[ bincntBaseAddr[n] ]++;
-    sum += bincntBaseAddr[n];
-    total += BIN_DEPTH;
-  }
-  avgFillFull = sum / (float) total;
-  sum = total = 0;
-  for (k = 0;  k < binDim.z - 2*c;  k++) {
-    for (j = 0;  j < binDim.y - 2*c;  j++) {
-      for (i = 0;  i < binDim.x - 2*c;  i++) {
-        int index = (k * binDim.y + j) * binDim.x + i;
-        binHistoCover[ bincntZeroAddr[index] ]++;
-        sum += bincntZeroAddr[index];
-        total += BIN_DEPTH;
-      }
-    }
-  }
-  avgFillCover = sum / (float) total;
-
-  if (verbose) {
-    /* report */
-    printf("number of atoms = %d\n", natoms);
-    printf("lattice spacing = %g\n", h);
-    printf("cutoff distance = %g\n", cutoff);
-    printf("\n");
-    printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
-    printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
-    printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
-    printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
-    printf("number of bytes for lattice data = %u\n", (unsigned int) (lnall*sizeof(float)));
-    printf("\n");
-    printf("bin padding thickness = %d\n", c);
-    printf("bin cover dimensions = %d %d %d\n",
-        binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
-    printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
-    printf("number of bins = %d\n", nbins);
-    printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
-    printf("%% overhead space = %g\n",
-        (natoms / (double) (nbins * BIN_DEPTH)) * 100);
-    printf("number of bytes for bin data = %u\n",
-        (unsigned int)(nbins * BIN_DEPTH * sizeof(cl_float4)));
-    printf("\n");
-    printf("bin histogram with padding:\n");
-    sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
-      printf("     number of bins with %d atoms:  %d\n", n, binHistoFull[n]);
-      sum += binHistoFull[n];
-    }
-    printf("     total number of bins:  %d\n", sum);
-    printf("     %% average fill:  %g\n", avgFillFull * 100);
-    printf("\n");
-    printf("bin histogram excluding padding:\n");
-    sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
-      printf("     number of bins with %d atoms:  %d\n", n, binHistoCover[n]);
-      sum += binHistoCover[n];
-    }
-    printf("     total number of bins:  %d\n", sum);
-    printf("     %% average fill:  %g\n", avgFillCover * 100);
-    printf("\n");
-    printf("number of extra atoms = %d\n", extra->size);
-    printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
-    printf("\n");
-
-    /* sanity check on bins */
-    sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
-      sum += n * binHistoFull[n];
-    }
-    sum += extra->size + num_excluded;
-    printf("sanity check on bin histogram with edges:  "
-        "sum + others = %d\n", sum);
-    sum = 0;
-    for (n = 0;  n <= BIN_DEPTH;  n++) {
-      sum += n * binHistoCover[n];
-    }
-    sum += extra->size + num_excluded;
-    printf("sanity check on bin histogram excluding edges:  "
-        "sum + others = %d\n", sum);
-    printf("\n");
-
-    /* neighbor list */
-    printf("neighbor list length = %d\n", nbrlistlen);
-    printf("\n");
-  }
-
-  pb_Context* pb_context;
-  pb_context = pb_InitOpenCLContext(parameters);
-  if (pb_context == NULL) {
-    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
-    return -1;
-  }
-
-  cl_int clStatus;
-  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
-  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
-  cl_context clContext = (cl_context) pb_context->clContext;
-
-  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-  CHECK_ERROR("clCreateCommandQueue")
-
-  pb_SetOpenCL(&clContext, &clCommandQueue);
-  
-  //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
-  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
-  uint8_t *kernel_bin = NULL;
-  size_t kernel_size;
-  cl_int binary_status = 0;  
-  clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
-  CHECK_ERROR("read_kernel_file")  
-	cl_program clProgram = clCreateProgramWithBinary(
-      clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
-  CHECK_ERROR("clCreateProgramWithSource")
-
-  char clOptions[50];
-  sprintf(clOptions,"-I src/opencl_base");  //-cl-nv-verbose
-
-  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
-  if (clStatus != CL_SUCCESS) {
-    size_t string_size = 0;
-    clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 
-                          0, NULL, &string_size);
-    char* string = (char*)malloc(string_size*sizeof(char));
-    clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 
-                          string_size, string, NULL);
-    puts(string);
-  }
-
-  CHECK_ERROR("clBuildProgram")
-
-  cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
-  CHECK_ERROR("clCreateKernel")
-
-  /* setup OpenCL kernel parameters */
-  blockDim[0] = 8;
-  blockDim[1] = 8;
-  blockDim[2] = 2;
-  gridDim[0] = 4 * xRegionDim * blockDim[0];
-  gridDim[1] = yRegionDim * blockDim[1];
-  gridDim[2] = 1 * blockDim[2];
-
-  /* allocate and initialize memory on OpenCL device */
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-  if (verbose) {
-    printf("Allocating %.2fMB on OpenCL device for potentials\n",
-           lnall * sizeof(float) / (double) (1024*1024));
-  }
-
-  regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer")
-
-  // clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));
-
-  if (verbose) {
-    printf("Allocating %.2fMB on OpenCL device for atom bins\n",
-           nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
-  }
-
-  binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer")
- 
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  //Sub buffers are not supported in OpenCL v1.0
-  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;  
-
-  NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer")
-  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueWriteBuffer")
-
-  if (verbose) 
-    printf("\n");
-
-  clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
-  clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
-  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
-  clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
-  clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
-  clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
-  clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
-  clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl);
-  clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
-  clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
-  CHECK_ERROR("clSetKernelArg")
-
-  /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
-  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
-  printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
-  for (zRegionIndex = 0;  zRegionIndex < zRegionDim;  zRegionIndex++) {
-    printf("  computing plane %d\r", zRegionIndex);
-    fflush(stdout);
-
-    clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
-    CHECK_ERROR("clSetKernelArg")
-
-    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
-
-    CHECK_ERROR("clEnqueueNDRangeKernel")
-
-    clStatus = clFinish(clCommandQueue);
-
-    CHECK_ERROR("clFinish")
-  }
-
-  printf("Finished OpenCL kernel calls\n");
-
-  /* copy result regions from OpenCL device */
-  pb_SwitchToTimer(timers, pb_TimerID_COPY);
-  clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueReadBuffer")
-
-  /* free OpenCL memory allocations */
-  clStatus = clReleaseMemObject(regionZeroCl);
-  clStatus = clReleaseMemObject(binBaseCl);
-  clStatus = clReleaseMemObject(NbrListLen);
-  clStatus = clReleaseMemObject(NbrList);
-  CHECK_ERROR("clReleaseMemObject")
-
-  clStatus = clReleaseKernel(clKernel);
-  clStatus = clReleaseProgram(clProgram);
-  clStatus = clReleaseCommandQueue(clCommandQueue);
-  clStatus = clReleaseContext(clContext);
-
-  //free((void*)clSource[0]);
-
-  /* transpose regions back into lattice */
-  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
-  for (k = 0;  k < nz;  k++) {
-    zRegionIndex = (k >> 3);
-    zOffset = (k & 7);
-
-    for (j = 0;  j < ny;  j++) {
-      yRegionIndex = (j >> 3);
-      yOffset = (j & 7);
-
-      for (i = 0;  i < nx;  i++) {
-        xRegionIndex = (i >> 3);
-        xOffset = (i & 7);
-
-        thisRegion = regionZeroAddr
-          + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
-              + xRegionIndex) * REGION_SIZE;
-
-        indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
-        index = (k * ny + j) * nx + i;
-
-        lattice->lattice[index] = thisRegion[indexRegion];
-      }
-    }
-  }
-
-  /* handle extra atoms */
-  if (extra->size > 0) {
-    printf("computing extra atoms on CPU\n");
-    if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
-      fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
-          "for extra atoms\n");
-      return -1;
-    }
-    printf("\n");
-  }
-
-  /* cleanup memory allocations */
-  free(regionZeroAddr);
-  free(binBaseAddr);
-  free(bincntBaseAddr);
-  free_atom(extra);
-
-  return 0;
-}
--- a/tests/opencl/cutcp/cutoff.h
+++ b/tests/opencl/cutcp/cutoff.h
@ -1,72 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2008-2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef CUTOFF_H
-#define CUTOFF_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define SHIFTED
-
-  /* A structure to record how points in 3D space map to array
-     elements.  Array element (z, y, x)
-     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
-     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
-  */
-  typedef struct LatticeDim_t {
-    /* Number of lattice points in x, y, z dimensions */
-    int nx, ny, nz;
-
-    /* Lowest corner of lattice */
-    Vec3 lo;
-
-    /* Lattice spacing */
-    float h;
-  } LatticeDim;
-
-  /* An electric potential field sampled on a regular grid.  The
-     lattice size and grid point positions are specified by 'dim'.
-  */
-  typedef struct Lattice_t {
-    LatticeDim dim;
-    float *lattice;
-  } Lattice;
-
-  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
-
-  Lattice *create_lattice(LatticeDim dim);
-  void destroy_lattice(Lattice *);
-
-  int gpu_compute_cutoff_potential_lattice(
-      struct pb_TimerSet *timers,
-      Lattice *lattice,
-      float cutoff,                      /* cutoff distance */
-      Atoms *atom,                       /* array of atoms */
-      int verbose,                        /* print info/debug messages */
-      struct pb_Parameters *parameters
-    );
-
-  int cpu_compute_cutoff_potential_lattice(
-      Lattice *lattice,                  /* the lattice */
-      float cutoff,                      /* cutoff distance */
-      Atoms *atoms                       /* array of atoms */
-    );
-
-  int remove_exclusions(
-      Lattice *lattice,                  /* the lattice */
-      float exclcutoff,                  /* exclusion cutoff distance */
-      Atoms *atom                        /* array of atoms */
-    );
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* CUTOFF_H */
--- a/tests/opencl/cutcp/excl.c
+++ b/tests/opencl/cutcp/excl.c
@ -1,157 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2008-2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <parboil.h>
-
-#include "atom.h"
-#include "cutoff.h"
-
-#define CELLEN      4.f
-#define INV_CELLEN  (1.f/CELLEN)
-
-extern int remove_exclusions(
-    Lattice *lattice,                  /* the lattice */
-    float cutoff,                      /* exclusion cutoff distance */
-    Atoms *atoms                       /* array of atoms */
-    )
-{
-  int nx = lattice->dim.nx;
-  int ny = lattice->dim.ny;
-  int nz = lattice->dim.nz;
-  float xlo = lattice->dim.lo.x;
-  float ylo = lattice->dim.lo.y;
-  float zlo = lattice->dim.lo.z;
-  float gridspacing = lattice->dim.h;
-  Atom *atom = atoms->atoms;
-
-  const float a2 = cutoff * cutoff;
-  const float inv_gridspacing = 1.f / gridspacing;
-  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
-    /* lattice point radius about each atom */
-
-  int n;
-  int i, j, k;
-  int ia, ib, ic;
-  int ja, jb, jc;
-  int ka, kb, kc;
-  int index;
-  int koff, jkoff;
-
-  float x, y, z, q;
-  float dx, dy, dz;
-  float dz2, dydz2, r2;
-  float e;
-  float xstart, ystart;
-
-  float *pg;
-
-  int gindex;
-  int ncell, nxcell, nycell, nzcell;
-  int *first, *next;
-  float inv_cellen = INV_CELLEN;
-  Vec3 minext, maxext;
-
-  /* find min and max extent */
-  get_atom_extent(&minext, &maxext, atoms);
-
-  /* number of cells in each dimension */
-  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
-  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
-  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
-  ncell = nxcell * nycell * nzcell;
-
-  /* allocate for cursor link list implementation */
-  first = (int *) malloc(ncell * sizeof(int));
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    first[gindex] = -1;
-  }
-  next = (int *) malloc(atoms->size * sizeof(int));
-  for (n = 0;  n < atoms->size;  n++) {
-    next[n] = -1;
-  }
-
-  /* geometric hashing */
-  for (n = 0;  n < atoms->size;  n++) {
-    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
-    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
-    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
-    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
-    gindex = (k*nycell + j)*nxcell + i;
-    next[n] = first[gindex];
-    first[gindex] = n;
-  }
-
-  /* traverse the grid cells */
-  for (gindex = 0;  gindex < ncell;  gindex++) {
-    for (n = first[gindex];  n != -1;  n = next[n]) {
-      x = atom[n].x - xlo;
-      y = atom[n].y - ylo;
-      z = atom[n].z - zlo;
-      q = atom[n].q;
-
-      /* find closest grid point with position less than or equal to atom */
-      ic = (int) (x * inv_gridspacing);
-      jc = (int) (y * inv_gridspacing);
-      kc = (int) (z * inv_gridspacing);
-
-      /* find extent of surrounding box of grid points */
-      ia = ic - radius;
-      ib = ic + radius + 1;
-      ja = jc - radius;
-      jb = jc + radius + 1;
-      ka = kc - radius;
-      kb = kc + radius + 1;
-
-      /* trim box edges so that they are within grid point lattice */
-      if (ia < 0)   ia = 0;
-      if (ib >= nx) ib = nx-1;
-      if (ja < 0)   ja = 0;
-      if (jb >= ny) jb = ny-1;
-      if (ka < 0)   ka = 0;
-      if (kb >= nz) kb = nz-1;
-
-      /* loop over surrounding grid points */
-      xstart = ia*gridspacing - x;
-      ystart = ja*gridspacing - y;
-      dz = ka*gridspacing - z;
-      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
-        koff = k*ny;
-        dz2 = dz*dz;
-
-        dy = ystart;
-        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
-          jkoff = (koff + j)*nx;
-          dydz2 = dy*dy + dz2;
-
-          dx = xstart;
-          index = jkoff + ia;
-          pg = lattice->lattice + index;
-
-          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
-            r2 = dx*dx + dydz2;
-
-	    /* If atom and lattice point are too close, set the lattice value
-	     * to zero */
-            if (r2 < a2) *pg = 0;
-          }
-        }
-      } /* end loop over surrounding grid points */
-
-    } /* end loop over atoms in a gridcell */
-  } /* end loop over gridcells */
-
-  /* free memory */
-  free(next);
-  free(first);
-
-  return 0;
-}
--- a/tests/opencl/cutcp/gpu_info.c
+++ b/tests/opencl/cutcp/gpu_info.c
@ -1,55 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-//#include <endian.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <inttypes.h>
-
-#include "gpu_info.h"
-
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
-
-	thread[0]=_thread;
-	grid[0]=_grid;
-}
--- a/tests/opencl/cutcp/gpu_info.h
+++ b/tests/opencl/cutcp/gpu_info.h
@ -1,28 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef __GPUINFOH__
-#define __GPUINFOH__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/tests/opencl/cutcp/kernel.cl
+++ b/tests/opencl/cutcp/kernel.cl
@ -1,104 +0,0 @@
-/*
- * potential lattice is decomposed into size 8^3 lattice point "regions"
- *
- * THIS IMPLEMENTATION:  one thread per lattice point
- * thread block size 128 gives 4 thread blocks per region
- * kernel is invoked for each x-y plane of regions,
- * where gridDim.x is 4*(x region dimension) so that blockIdx.x 
- * can absorb the z sub-region index in its 2 lowest order bits
- *
- * Regions are stored contiguously in memory in row-major order
- *
- * The bins have to not only cover the region, but they need to surround
- * the outer edges so that region sides and corners can still use
- * neighbor list stencil.  The binZeroAddr is actually a shifted pointer into
- * the bin array (binZeroAddr = binBaseAddr + (c*binDim_y + c)*binDim_x + c)
- * where c = ceil(cutoff / binsize).  This allows for negative offsets to
- * be added to myBinIndex.
- *
- * The (0,0,0) spatial origin corresponds to lower left corner of both
- * regionZeroAddr and binZeroAddr.  The atom coordinates are translated
- * during binning to enforce this assumption.
- */
-
-#include "macros.h"
-
-// OpenCL 1.1 support for int3 is not uniform on all implementations, so
-// we use int4 instead.  Only the 'x', 'y', and 'z' fields of xyz are used.
-typedef int4 xyz;
-
-__kernel void opencl_cutoff_potential_lattice(
-    int binDim_x,
-    int binDim_y,
-    __global float4 *binBaseAddr,
-    int offset,
-    float h,                /* lattice spacing */
-    float cutoff2,          /* square of cutoff distance */
-    float inv_cutoff2,
-    __global float *regionZeroAddr,  /* address of lattice regions starting at origin */
-    int zRegionIndex,
-    __constant int *NbrListLen,
-    __constant xyz *NbrList
-    )
-{
-  __global float4* binZeroAddr = binBaseAddr + offset;
-
-  __global float *myRegionAddr;
-  int Bx, By, Bz;
-
-  /* thread id */
-  const int tid = (get_local_id(2)*get_local_size(1) + 
-                      get_local_id(1))*get_local_size(0) + get_local_id(0);
-
-  /* this is the start of the sub-region indexed by tid */
-  myRegionAddr = regionZeroAddr + ((zRegionIndex*get_num_groups(1)
-	+ get_group_id(1))*(get_num_groups(0)>>2) + (get_group_id(0)>>2))*REGION_SIZE
-	+ (get_group_id(0)&3)*SUB_REGION_SIZE;
-
-  /* spatial coordinate of this lattice point */
-  float x = (8 * (get_group_id(0) >> 2) + get_local_id(0)) * h;
-  float y = (8 * get_group_id(1) + get_local_id(1)) * h;
-  float z = (8 * zRegionIndex + 2*(get_group_id(0)&3) + get_local_id(2)) * h;
-
-  float dx;
-  float dy;
-  float dz;
-  float r2;
-  float s;
-
-  int totalbins = 0;
-
-  /* bin number determined by center of region */
-  Bx = (int) floor((8 * (get_group_id(0) >> 2) + 4) * h * BIN_INVLEN);
-  By = (int) floor((8 * get_group_id(1) + 4) * h * BIN_INVLEN);
-  Bz = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);
-
-  float energy = 0.f;
-  int bincnt;
-  for (bincnt = 0; bincnt < *NbrListLen;  bincnt++) {
-    int i = Bx + NbrList[bincnt].x;
-    int j = By + NbrList[bincnt].y;
-    int k = Bz + NbrList[bincnt].z;
-
-  	__global float4* p_global = binZeroAddr + 
-  	                       (((k*binDim_y + j)*binDim_x + i) * BIN_DEPTH);
-
-    int m;
-    for (m = 0;  m < BIN_DEPTH;  m++) {
-    	float aq = p_global[m].w;
-      if (0.f != aq) {
-        dx = p_global[m].x - x;
-        dy = p_global[m].y - y;
-        dz = p_global[m].z - z;
-        r2 = dx*dx + dy*dy + dz*dz;
-        if (r2 < cutoff2) {
-          s = (1.f - r2 * inv_cutoff2);
-          energy += aq * rsqrt(r2) * s * s;
-        }
-      }
-    } /* end loop over atoms in bin */
-  } /* end loop over neighbor list */
-
-  /* store into global memory */
-  myRegionAddr[tid+0] = energy;
-}
--- a/tests/opencl/cutcp/macros.h
+++ b/tests/opencl/cutcp/macros.h
@ -1,69 +0,0 @@
-#ifndef __MACROSH__
-#define __MACROSH__
-
-#ifdef __DEVICE_EMULATION__
-#define DEBUG
-/* define which grid block and which thread to examine */
-#define BX  0
-#define BY  0
-#define TX  0
-#define TY  0
-#define TZ  0
-#define EMU(code) do { \
-  if (blockIdx.x==BX && blockIdx.y==BY && \
-      threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
-    code; \
-  } \
-} while (0)
-#define INT(n)    printf("%s = %d\n", #n, n)
-#define FLOAT(f)  printf("%s = %g\n", #f, (double)(f))
-#define INT3(n)   printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
-#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
-    (double)(f).y, (double)(f).z, (double)(f).w)
-#else
-#define EMU(code)
-#define INT(n)
-#define FLOAT(f)
-#define INT3(n)
-#define FLOAT4(f)
-#endif
-
-/* report error from OpenCL */
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
-  }
-
-/*
- * neighbor list:
- * stored in constant memory as table of offsets
- * flat index addressing is computed by kernel
- *
- * reserve enough memory for 11^3 stencil of grid cells
- * this fits within 16K of memory
- */
-#define NBRLIST_DIM  11
-#define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
-
-/*
- * atom bins cached into shared memory for processing
- *
- * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
- * should permit scheduling of up to 3 thread blocks per SM
- */
-#define BIN_DEPTH         8  /* max number of atoms per bin */
-#define BIN_SIZE         32  /* size of bin in floats */
-#define BIN_CACHE_MAXLEN 32  /* max number of atom bins to cache */
-
-#define BIN_LENGTH      4.f  /* spatial length in Angstroms */
-#define BIN_INVLEN  (1.f / BIN_LENGTH)
-/* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
- * so that bin fill should be 80% (for non-empty regions of space) */
-
-#define REGION_SIZE     512  /* number of floats in lattice region */
-#define SUB_REGION_SIZE 128  /* number of floats in lattice sub-region */
-
-#endif
--- a/tests/opencl/cutcp/main.cc
+++ b/tests/opencl/cutcp/main.cc
@ -1,190 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2008-2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <parboil.h>
-
-#include "atom.h"
-#include "cutoff.h"
-#include "output.h"
-
-#define ERRTOL 1e-4f
-
-#define NOKERNELS             0
-#define CUTOFF1               1
-#define CUTOFF6              32
-#define CUTOFF6OVERLAP       64
-#define CUTOFFCPU         16384
-
-
-int appenddata(const char *filename, int size, double time) {
-  FILE *fp;
-  fp=fopen(filename, "a");
-  if (fp == NULL) {
-    printf("error appending to file %s..\n", filename);
-    return -1;
-  }
-  fprintf(fp, "%d  %.3f\n", size, time);
-  fclose(fp);
-  return 0;
-}
-
-LatticeDim
-lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
-{
-  LatticeDim ret;
-
-  ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
-  ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
-  ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
-  ret.lo = lo;
-  ret.h = h;
-
-  return ret;
-}
-
-Lattice *
-create_lattice(LatticeDim dim)
-{
-  int size;
-  Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
-
-  if (lat == NULL) {
-    fprintf(stderr, "Out of memory\n");
-    exit(1);
-  }
-
-  lat->dim = dim;
-
-  /* Round up the allocated size to a multiple of 8 */
-  size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7;
-  lat->lattice = (float *)calloc(size, sizeof(float));
-
-  if (lat->lattice == NULL) {
-    fprintf(stderr, "Out of memory\n");
-    exit(1);
-  }
-
-  return lat;
-}
-
-
-void
-destroy_lattice(Lattice *lat)
-{
-  if (lat) {
-    free(lat->lattice);
-    free(lat);
-  }
-}
-
-int main(int argc, char *argv[]) {
-  Atoms *atom;
-
-  LatticeDim lattice_dim;
-  Lattice *gpu_lattice;
-  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
-  Vec3 lo, hi;			/* Bounding box with padding  */
-
-  float h = 0.5f;		/* Lattice spacing */
-  float cutoff = 12.f;		/* Cutoff radius */
-  float exclcutoff = 1.f;	/* Radius for exclusion */
-  float padding = 0.5f;		/* Bounding box padding distance */
-
-  int n;
-
-  struct pb_Parameters *parameters;
-  struct pb_TimerSet timers;
-
-  /* Read input parameters */
-  parameters = pb_ReadParameters(&argc, argv);
-  if (parameters == NULL) {
-    exit(1);
-  }
-
-  parameters->inpFiles = (char **)malloc(sizeof(char *) * 2);
-  parameters->inpFiles[0] = (char *)malloc(100);
-  parameters->inpFiles[1] = NULL;
-  strncpy(parameters->inpFiles[0], "watbox.sl40.pqr", 100);
-
-  /* Expect one input file */
-  if (pb_Parameters_CountInputs(parameters) != 1) {
-    fprintf(stderr, "Expecting one input file\n");
-    exit(1);
-  }
-
-  pb_InitializeTimerSet(&timers);
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  {
-    const char *pqrfilename = parameters->inpFiles[0];
-
-    if (!(atom = read_atom_file(pqrfilename))) {
-      fprintf(stderr, "read_atom_file() failed\n");
-      exit(1);
-    }
-    printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
-  }
-
-  /* find extent of domain */
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-  get_atom_extent(&min_ext, &max_ext, atom);
-  printf("extent of domain is:\n");
-  printf("  minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
-  printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
-
-  printf("padding domain by %g Angstroms\n", padding);
-  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
-  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
-  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
-
-  lattice_dim = lattice_from_bounding_box(lo, hi, h);
-  gpu_lattice = create_lattice(lattice_dim);
-  printf("\n");
-
-  /*
-   *  Run OpenCL kernel
-   *  (Begin and end with COMPUTE timer active)
-   */
-  if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0, parameters)) {
-    fprintf(stderr, "Computation failed\n");
-    exit(1);
-  }
-
-  /*
-   * Zero the lattice points that are too close to an atom.  This is
-   * necessary for numerical stability.
-   */
-  if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
-    fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
-    exit(1);
-  }
-
-  printf("\n");
-
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-
-  /* Print output */
-  if (parameters->outFile) {
-    //write_lattice_summary(parameters->outFile, gpu_lattice);
-  }
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  /* Cleanup */
-  destroy_lattice(gpu_lattice);
-  free_atom(atom);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-  pb_FreeParameters(parameters);
-
-  return 0;
-}
--- a/tests/opencl/cutcp/ocl.c
+++ b/tests/opencl/cutcp/ocl.c
@ -1,49 +0,0 @@
-#include <CL/cl.h>
-#include <stdio.h>
-#include <string.h>
-#include "ocl.h"
-
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
-
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
-
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
-}
-
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
-}
--- a/tests/opencl/cutcp/ocl.h
+++ b/tests/opencl/cutcp/ocl.h
@ -1,25 +0,0 @@
-#ifndef __OCLH__
-#define __OCLH__
-
-#include <stdlib.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
-  }
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/tests/opencl/cutcp/output.c
+++ b/tests/opencl/cutcp/output.c
@ -1,67 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2008-2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <inttypes.h>
-#include <math.h>
-#include <parboil.h>
-
-#include "atom.h"
-#include "cutoff.h"
-
-void
-write_lattice_summary(const char *filename, Lattice *lattice)
-{
-  float *lattice_data = lattice->lattice;
-  int nx = lattice->dim.nx;
-  int ny = lattice->dim.ny;
-  int nz = lattice->dim.nz;
-
-  /* Open output file */
-  FILE *outfile = fopen(filename, "w");
-
-  if (outfile == NULL) {
-    fprintf(stderr, "Cannot open output file\n");
-    exit(1);
-  }
-
-  /* Write the sum of the the absolute values of all lattice potentials */
-  {
-    double abspotential = 0.0;
-    float tmp;
-    int i;
-
-    for (i = 0; i < nx * ny * nz; i++)
-      abspotential += fabs((double) lattice_data[i]);
-
-    tmp = (float) abspotential;
-
-    fwrite(&tmp, 1, sizeof(float), outfile);
-  }
-
-  /* Write the size of a lattice plane */
-  {
-    uint32_t tmp;
-
-    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
-    fwrite(&tmp, 1, sizeof(uint32_t), outfile);
-  }
-
-  /* Write the plane of lattice data at z=0 and z = nz-1 */
-  {
-    int plane_size = nx * ny;
-
-    fwrite(lattice_data, plane_size, sizeof(float), outfile);
-    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
-	   outfile);
-  }
-
-  /* Cleanup */
-  fclose(outfile);
-}
--- a/tests/opencl/cutcp/output.h
+++ b/tests/opencl/cutcp/output.h
@ -1,25 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2008-2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef OUTPUT_H
-#define OUTPUT_H
-
-#include "cutoff.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void
-write_lattice_summary(const char *filename, Lattice *lattice);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/tests/opencl/cutcp/parboil.h
+++ b/tests/opencl/cutcp/parboil.h
@ -1,348 +0,0 @@
-/*
- * (c) 2010 The Board of Trustees of the University of Illinois.
- */
-#ifndef PARBOIL_HEADER
-#define PARBOIL_HEADER
-
-#include <stdio.h>
-#include <string.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <unistd.h>
-
-/* A platform as specified by the user on the command line */
-struct pb_PlatformParam {
-  char *name;                   /* The platform name.  This string is owned. */
-  char *version;                /* The platform version; may be NULL.
-                                 * This string is owned. */
-};
-
-/* Create a PlatformParam from the given strings.
- * 'name' must not be NULL.  'version' may be NULL.
- * If not NULL, the strings should have been allocated by malloc(),
- * and they will be owned by the returned object.
- */
-struct pb_PlatformParam *
-pb_PlatformParam(char *name, char *version);
-
-void
-pb_FreePlatformParam(struct pb_PlatformParam *);
-
-/* A criterion for how to select a device */
-enum pb_DeviceSelectionCriterion {
-  pb_Device_INDEX,             /* Enumerate the devices and select one
-                                * by its number */
-  pb_Device_CPU,                /* Select a CPU device */
-  pb_Device_GPU,                /* Select a GPU device  */
-  pb_Device_ACCELERATOR,        /* Select an accelerator device */
-  pb_Device_NAME                /* Select a device by name */
-};
-
-/* A device as specified by the user on the command line */
-struct pb_DeviceParam {
-  enum pb_DeviceSelectionCriterion criterion;
-  union {
-    int index;                  /* If criterion == pb_Device_INDEX,
-                                 * the index of the device */
-    char *name;                 /* If criterion == pb_Device_NAME,
-                                 * the name of the device.
-                                 * This string is owned. */
-  };
-};
-
-struct pb_DeviceParam *
-pb_DeviceParam_index(int index);
-
-struct pb_DeviceParam *
-pb_DeviceParam_cpu(void);
-
-struct pb_DeviceParam *
-pb_DeviceParam_gpu(void);
-
-struct pb_DeviceParam *
-pb_DeviceParam_accelerator(void);
-
-/* Create a by-name device selection criterion.
- * The string should have been allocated by malloc(), and it will will be
- * owned by the returned object.
- */
-struct pb_DeviceParam *
-pb_DeviceParam_name(char *name);
-
-void
-pb_FreeDeviceParam(struct pb_DeviceParam *);
-
-/* Command line parameters for benchmarks */
-struct pb_Parameters {
-  char *outFile;		/* If not NULL, the raw output of the
-				 * computation should be saved to this
-				 * file. The string is owned. */
-  char **inpFiles;		/* A NULL-terminated array of strings
-				 * holding the input file(s) for the
-				 * computation.  The array and strings
-				 * are owned. */
-  struct pb_PlatformParam *platform; /* If not NULL, the platform
-                                      * specified on the command line. */
-  struct pb_DeviceParam *device; /* If not NULL, the device
-                                      * specified on the command line. */
-};
-
-/* Read command-line parameters.
- *
- * The argc and argv parameters to main are read, and any parameters
- * interpreted by this function are removed from the argument list.
- *
- * A new instance of struct pb_Parameters is returned.
- * If there is an error, then an error message is printed on stderr
- * and NULL is returned.
- */
-struct pb_Parameters *
-pb_ReadParameters(int *_argc, char **argv);
-
-/* Free an instance of struct pb_Parameters.
- */
-void
-pb_FreeParameters(struct pb_Parameters *p);
-
-void
-pb_FreeStringArray(char **);
-
-/* Count the number of input files in a pb_Parameters instance.
- */
-int
-pb_Parameters_CountInputs(struct pb_Parameters *p);
-
-/* A time or duration. */
-//#if _POSIX_VERSION >= 200112L
-typedef unsigned long long pb_Timestamp; /* time in microseconds */
-//#else
-//# error "Timestamps not implemented"
-//#endif
-
-enum pb_TimerState {
-  pb_Timer_STOPPED,
-  pb_Timer_RUNNING,
-};
-
-struct pb_Timer {
-  enum pb_TimerState state;
-  pb_Timestamp elapsed;		/* Amount of time elapsed so far */
-  pb_Timestamp init;		/* Beginning of the current time interval,
-				 * if state is RUNNING.  End of the last 
-				 * recorded time interfal otherwise.  */
-};
-
-/* Reset a timer.
- * Use this to initialize a timer or to clear
- * its elapsed time.  The reset timer is stopped.
- */
-void
-pb_ResetTimer(struct pb_Timer *timer);
-
-/* Start a timer.  The timer is set to RUNNING mode and
- * time elapsed while the timer is running is added to
- * the timer.
- * The timer should not already be running.
- */
-void
-pb_StartTimer(struct pb_Timer *timer);
-
-/* Stop a timer.
- * This stops adding elapsed time to the timer.
- * The timer should not already be stopped.
- */
-void
-pb_StopTimer(struct pb_Timer *timer);
-
-/* Get the elapsed time in seconds. */
-double
-pb_GetElapsedTime(struct pb_Timer *timer);
-
-/* Execution time is assigned to one of these categories. */
-enum pb_TimerID {
-  pb_TimerID_NONE = 0,
-  pb_TimerID_IO,		/* Time spent in input/output */
-  pb_TimerID_KERNEL,		/* Time spent computing on the device, 
-				 * recorded asynchronously */
-  pb_TimerID_COPY,		/* Time spent synchronously moving data 
-				 * to/from device and allocating/freeing 
-				 * memory on the device */
-  pb_TimerID_DRIVER,		/* Time spent in the host interacting with the 
-				 * driver, primarily for recording the time 
-                                 * spent queueing asynchronous operations */
-  pb_TimerID_COPY_ASYNC,	/* Time spent in asynchronous transfers */
-  pb_TimerID_COMPUTE,		/* Time for all program execution other
-				 * than parsing command line arguments,
-				 * I/O, kernel, and copy */
-  pb_TimerID_OVERLAP,		/* Time double-counted in asynchronous and 
-				 * host activity: automatically filled in, 
-				 * not intended for direct usage */
-  pb_TimerID_LAST		/* Number of timer IDs */
-};
-
-/* Dynamic list of asynchronously tracked times between events */
-struct pb_async_time_marker_list {
-  char *label; // actually just a pointer to a string
-  enum pb_TimerID timerID;	/* The ID to which the interval beginning 
-                                 * with this marker should be attributed */
-  void * marker; 
-  //cudaEvent_t marker; 		/* The driver event for this marker */
-  struct pb_async_time_marker_list *next; 
-};
-
-struct pb_SubTimer {
-  char *label;
-  struct pb_Timer timer;
-  struct pb_SubTimer *next;
-};
-
-struct pb_SubTimerList {
-  struct pb_SubTimer *current;
-  struct pb_SubTimer *subtimer_list;
-};
-
-/* A set of timers for recording execution times. */
-struct pb_TimerSet {
-  enum pb_TimerID current;
-  struct pb_async_time_marker_list* async_markers;
-  pb_Timestamp async_begin;
-  pb_Timestamp wall_begin;
-  struct pb_Timer timers[pb_TimerID_LAST];
-  struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
-};
-
-/* Reset all timers in the set. */
-void
-pb_InitializeTimerSet(struct pb_TimerSet *timers);
-
-void
-pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
-
-/* Select which timer the next interval of time should be accounted
- * to. The selected timer is started and other timers are stopped.
- * Using pb_TimerID_NONE stops all timers. */
-void
-pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
-
-void
-pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
-
-/* Print timer values to standard output. */
-void
-pb_PrintTimerSet(struct pb_TimerSet *timers);
-
-/* Release timer resources */
-void
-pb_DestroyTimerSet(struct pb_TimerSet * timers);
-
-void
-pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
-
-
-typedef struct pb_Device_tag {
-  char* name;
-  void* clDevice;
-  int id;
-  unsigned int in_use;
-  unsigned int available;
-} pb_Device;
-
-struct pb_Context_tag;
-typedef struct pb_Context_tag pb_Context;
-
-typedef struct pb_Platform_tag {
-  char* name;
-  char* version;
-  void* clPlatform;
-  unsigned int in_use;
-  pb_Context** contexts;
-  pb_Device** devices;
-} pb_Platform;
-
-struct pb_Context_tag {
-  void* clPlatformId;
-  void* clContext;
-  void* clDeviceId;
-  pb_Platform* pb_platform;
-  pb_Device* pb_device;
-};
-
-// verbosely print out list of platforms and their devices to the console.
-pb_Platform**
-pb_GetPlatforms();
-
-// Choose a platform according to the given platform specification
-pb_Platform*
-pb_GetPlatform(struct pb_PlatformParam *platform);
-
-// choose a platform: by name, name & version
-pb_Platform*
-pb_GetPlatformByName(const char* name);
-
-pb_Platform*
-pb_GetPlatformByNameAndVersion(const char* name, const char* version);
-
-// Choose a device according to the given device specification
-pb_Device*
-pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
-
-pb_Device**
-pb_GetDevices(pb_Platform* pb_platform);
-
-// choose a device by name.
-pb_Device*
-pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
-
-pb_Platform*
-pb_GetPlatformByEnvVars();
-
-pb_Context*
-pb_InitOpenCLContext(struct pb_Parameters* parameters);
-
-void
-pb_ReleasePlatforms();
-
-void
-pb_ReleaseContext(pb_Context* c);
-
-void
-pb_PrintPlatformInfo(pb_Context* c);
-
-void
-perf_init();
-
-//#define MEASURE_KERNEL_TIME
-
-#include <CL/cl.h>
-
-#ifdef MEASURE_KERNEL_TIME
-#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
-cl_int
-pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
-                       cl_kernel        /* kernel */,
-                       cl_uint          /* work_dim */,
-                       const size_t *   /* global_work_offset */,
-                       const size_t *   /* global_work_size */,
-                       const size_t *   /* local_work_size */,
-                       cl_uint          /* num_events_in_wait_list */,
-                       const cl_event * /* event_wait_list */,
-                       cl_event *       /* event */);
-#endif
-
-enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
-void pb_sig_float(char*, float*, int);
-void pb_sig_double(char*, double*, int);
-void pb_sig_short(char*, short*, int);
-void pb_sig_int(char*, int*, int);
-void pb_sig_uchar(char*, unsigned char*, unsigned int);
-void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif //PARBOIL_HEADER
-
--- a/tests/opencl/cutcp/parboil_opencl.c
+++ b/tests/opencl/cutcp/parboil_opencl.c
--- a/tests/opencl/cutcp/readatom.c
+++ b/tests/opencl/cutcp/readatom.c
@ -1,139 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2008-2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include "atom.h"
-
-
-#define LINELEN 96
-#define INITLEN 20
-
-
-Atoms *read_atom_file(const char *fname)
-{
-  FILE *file;
-  char line[LINELEN];
-
-  Atom *atom;			/* Atom array */
-  int len = INITLEN;		/* Size of atom array */
-  int cnt = 0;			/* Number of atoms read */
-
-  /* allocate initial atom array */
-  atom = (Atom *) malloc(len * sizeof(Atom));
-  if (NULL==atom) {
-    fprintf(stderr, "can't allocate memory\n");
-    return NULL;
-  }
-
-  int i;
-  for (i = 0; i < len; ++i) {
-    atom[i].x = i+0;
-    atom[i].y = i+1;
-    atom[i].z = i+2;
-    atom[i].q = 1;
-  }
-
-#if 0
-  /* open atom "pqr" file */
-  file = fopen(fname, "r");
-  if (NULL==file) {
-    fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
-    return NULL;
-  }
-
-  /* loop to read pqr file line by line */
-  while (fgets(line, LINELEN, file) != NULL) {
-
-    if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
-      continue;  /* skip anything that isn't an atom record */
-    }
-
-    if (cnt==len) {  /* extend atom array */
-      void *tmp = realloc(atom, 2*len*sizeof(Atom));
-      if (NULL==tmp) {
-        fprintf(stderr, "can't allocate more memory\n");
-        return NULL;
-      }
-      atom = (Atom *) tmp;
-      len *= 2;
-    }
-
-    /* read position coordinates and charge from atom record */
-    if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
-          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
-      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
-      return NULL;
-    }
-
-    cnt++;  /* count atoms as we store them */
-  }
-
-  /* verify EOF and close file */
-  if ( !feof(file) ) {
-    fprintf(stderr, "did not find EOF\n");
-    return NULL;
-  }
-  if (fclose(file)) {
-    fprintf(stderr, "can't close file\n");
-    return NULL;
-  }
-#endif
-
-  /* Build the output data structure */
-  {
-    Atoms *out = (Atoms *)malloc(sizeof(Atoms));
-
-    if (NULL == out) {
-      fprintf(stderr, "can't allocate memory\n");
-      return NULL;
-    }
-
-    out->size = cnt;
-    out->atoms = atom;
-
-    return out;
-  }
-}
-
-
-void free_atom(Atoms *atom)
-{
-  if (atom) {
-    free(atom->atoms);
-    free(atom);
-  }
-}
-
-void
-get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
-{
-  Atom *atoms = atom->atoms;
-  int natoms = atom->size;
-  Vec3 lo;
-  Vec3 hi;
-  int n;
-
-  hi.x = lo.x = atoms[0].x;
-  hi.y = lo.y = atoms[0].y;
-  hi.z = lo.z = atoms[0].z;
-
-  for (n = 1; n < natoms; n++) {
-    lo.x = fminf(lo.x, atoms[n].x);
-    hi.x = fmaxf(hi.x, atoms[n].x);
-    lo.y = fminf(lo.y, atoms[n].y);
-    hi.y = fmaxf(hi.y, atoms[n].y);
-    lo.z = fminf(lo.z, atoms[n].z);
-    hi.z = fmaxf(hi.z, atoms[n].z);
-  }
-
-  *out_lo = lo;
-  *out_hi = hi;
-}
--- a/tests/opencl/cutcp/watbox.sl40.pqr
+++ b/tests/opencl/cutcp/watbox.sl40.pqr
--- a/tests/opencl/fft/.depend
+++ b/tests/opencl/fft/.depend
@ -0,0 +1,7 @@
+main.o: main.cc /opt/pocl/runtime/include/CL/opencl.h \
+ /opt/pocl/runtime/include/CL/cl.h \
+ /opt/pocl/runtime/include/CL/cl_version.h \
+ /opt/pocl/runtime/include/CL/cl_platform.h \
+ /opt/pocl/runtime/include/CL/cl_gl.h \
+ /opt/pocl/runtime/include/CL/cl_gl_ext.h \
+ /opt/pocl/runtime/include/CL/cl_ext.h common.h
--- a/tests/opencl/sgemm2/Makefile
+++ b/tests/opencl/sgemm2/Makefile
@ -1,4 +1,4 @@
-PROJECT = sgemm2
+PROJECT = fft4

 SRCS = main.cc

--- a/tests/opencl/fft/common.h
+++ b/tests/opencl/fft/common.h
@ -0,0 +1,3 @@
+#pragma once
+
+#define LOCAL_SIZE 16
--- a/tests/opencl/fft/fft4
+++ b/tests/opencl/fft/fft4
--- a/tests/opencl/fft/fft_radix4.dump
+++ b/tests/opencl/fft/fft_radix4.dump
--- a/tests/opencl/fft/kernel.cl
+++ b/tests/opencl/fft/kernel.cl
@ -0,0 +1,63 @@
+#include "common.h"
+
+__kernel void fft_radix4(__global float2* input, __global float2* output, const unsigned int N) {
+    int globalId = get_global_id(0);
+    int localId = get_local_id(0);
+    int groupId = get_group_id(0);
+
+    // Allocate local memory to store intermediate results and twiddle factors
+    __local float2 localData[LOCAL_SIZE];
+    __local float2 twiddleFactors[LOCAL_SIZE / 4];
+
+    // Calculate twiddle factors for this FFT stage and store in local memory
+    if (localId < LOCAL_SIZE / 4) {
+        float angle = -2 * M_PI * localId / LOCAL_SIZE;
+        twiddleFactors[localId] = (float2)(cos(angle), sin(angle));
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Calculate the offset for the data this work-group will process
+    int offset = groupId * LOCAL_SIZE;
+
+    // Load a chunk of input into local memory for faster access
+    if (globalId < N) {
+        localData[localId] = input[globalId];
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // Perform the Radix-4 FFT on the data chunk in local memory
+    for (unsigned int stride = 1; stride < LOCAL_SIZE; stride *= 4) {
+        int twiddleIndex = (localId / stride) % 4;
+        float2 twiddle = twiddleFactors[twiddleIndex * (LOCAL_SIZE / (4 * stride))];
+
+        // Load data
+        float2 data0 = localData[localId];
+        float2 data1 = localData[localId + stride];
+        float2 data2 = localData[localId + 2 * stride];
+        float2 data3 = localData[localId + 3 * stride];
+
+        // Apply twiddle factors
+        data1 *= twiddle;
+        data2 *= twiddle * twiddle;
+        data3 *= twiddle * twiddle * twiddle;
+
+        // Radix-4 butterfly operations
+        float2 t0 = data0 + data2;
+        float2 t1 = data0 - data2;
+        float2 t2 = data1 + data3;
+        float2 t3 = (data1 - data3) * (float2)(0, -1);
+
+        // Store results
+        localData[localId] = t0 + t2;
+        localData[localId + stride] = t1 + t3;
+        localData[localId + 2 * stride] = t0 - t2;
+        localData[localId + 3 * stride] = t1 - t3;
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // Write the results back to global memory
+    if (globalId < N) {
+        output[globalId] = localData[localId];
+    }
+}
--- a/tests/opencl/fft/kernel.pocl
+++ b/tests/opencl/fft/kernel.pocl
--- a/tests/opencl/sgemm2/main.cc
+++ b/tests/opencl/sgemm2/main.cc
@ -7,12 +7,31 @@
 #include <unistd.h> 
 #include <chrono>
 #include <vector>
+#include <cmath>
+#include "common.h"

-#define LOCAL_SIZE 16
+#define KERNEL_NAME "fft_radix4"

 #define FLOAT_ULP 6

-#define KERNEL_NAME "sgemm2"
+struct float2 {
+    float x;
+    float y;
+
+    float2(float real = 0.0f, float imag = 0.0f) : x(real), y(imag) {}
+
+    float2 operator+(const float2& other) const {
+        return {x + other.x, y + other.y};
+    }
+
+    float2 operator-(const float2& other) const {
+        return {x - other.x, y - other.y};
+    }
+
+    float2 operator*(const float2& other) const {
+        return {x * other.x - y * other.y, x * other.y + y * other.x};
+    }
+};

 #define CL_CHECK(_expr)                                                \
   do {                                                                \
@ -45,7 +64,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }
-  
  fseek(fp , 0 , SEEK_END);
  long fsize = ftell(fp);
  rewind(fp);
@ -58,25 +76,33 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
  return 0;
 }

-static bool compare_equal(float a, float b) {
-  union fi_t { float f; int32_t i; };
-  fi_t fa, fb;
-  fa.f = a;
-  fb.f = b;
-  auto d = std::abs(fa.i - fb.i);
-  return d <= FLOAT_ULP;
+static std::vector<float2> referenceDFT(const std::vector<float2>& input) {
+  std::vector<float2> output(input.size());
+  for (unsigned int k = 0; k < input.size(); ++k) { // For each output element
+    output[k] = {0, 0}; // Initialize to zero
+    for (unsigned int n = 0; n < input.size(); ++n) { // For each input element
+      float angle = -2 * M_PI * k * n / input.size();
+      float2 twiddle = {cos(angle), sin(angle)};
+      output[k].x += input[n].x * twiddle.x - input[n].y * twiddle.y;
+      output[k].y += input[n].x * twiddle.y + input[n].y * twiddle.x;
+    }
+  }
+  return output;
 }

-static void matmul_cpu(float *C, float *A, float *B, int N) {
-    for (int i = 0; i < N; i++) {
-        for (int j = 0; j < N; j++) {
-            float sum = 0.0f;
-            for (int k = 0; k < N; k++) {
-                sum += A[i * N + k] * B[k * N + j];
-            }
-            C[i * N + j] = sum;
-        }
+static int verifyOutput(const std::vector<float2>& output, 
+                         const std::vector<float2>& reference, 
+                         unsigned int N) {
+  int errors = 0;
+  for (unsigned int i = 0; i < N; ++i) {
+    float2 diff = {output[i].x - reference[i].x, output[i].y - reference[i].y};
+    float error = sqrt(diff.x * diff.x + diff.y * diff.y);
+    if (error > 1e-5) {
+      printf("*** error: [%d] expected=(%f,%f), actual=(%f,%f)\n", i, reference[i].x, reference[i].y, output[i].x, output[i].y);
+      ++errors;
    }
+  }
+  return errors;
 }

 cl_device_id device_id = NULL;
@ -84,24 +110,22 @@ cl_context context = NULL;
 cl_command_queue commandQueue = NULL;
 cl_program program = NULL;
 cl_kernel kernel = NULL;
-cl_mem a_memobj = NULL;
-cl_mem b_memobj = NULL;
-cl_mem c_memobj = NULL;
+cl_mem i_memobj = NULL;
+cl_mem o_memobj = NULL;
 uint8_t *kernel_bin = NULL;

 static void cleanup() {
  if (commandQueue) clReleaseCommandQueue(commandQueue);
  if (kernel) clReleaseKernel(kernel);
  if (program) clReleaseProgram(program);
-  if (a_memobj) clReleaseMemObject(a_memobj);
-  if (b_memobj) clReleaseMemObject(b_memobj);
-  if (c_memobj) clReleaseMemObject(c_memobj);
+  if (i_memobj) clReleaseMemObject(i_memobj);
+  if (o_memobj) clReleaseMemObject(o_memobj);
  if (context) clReleaseContext(context);
  if (device_id) clReleaseDevice(device_id);  
  if (kernel_bin) free(kernel_bin);
 }

-int size = 32;
+int size = 64;

 static void show_usage() {
  printf("Usage: [-n size] [-h: help]\n");
@ -124,19 +148,13 @@ static void parse_args(int argc, char **argv) {
      exit(-1);
    }
  }
+
+  printf("Workload size=%d\n", size);
 }

 int main (int argc, char **argv) {
  // parse command arguments
  parse_args(argc, argv);
-
-  uint32_t num_points = size * size;
-
-  printf("Matrix size=%d\n", size);
-  if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
-    printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
-    return -1;
-  }
  
  cl_platform_id platform_id;
  size_t kernel_size;
@ -148,15 +166,10 @@ int main (int argc, char **argv) {
  printf("Create context\n");
  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));

-  char device_string[1024];
-  clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
-  printf("Using device: %s\n", device_string);
-
  printf("Allocate device buffers\n");
-  size_t nbytes = num_points * sizeof(float);
-  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
-  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
-  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
+  size_t nbytes = size * sizeof(float2);
+  i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));

  printf("Create program from kernel source\n");
 #ifdef HOSTGPU
@ -169,11 +182,7 @@ int main (int argc, char **argv) {
    return -1;
  program = CL_CHECK2(clCreateProgramWithBinary(
    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
-#endif  
-  if (program == NULL) {
-    cleanup();
-    return -1;
-  }
+#endif

  // Build program
  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -181,61 +190,47 @@ int main (int argc, char **argv) {
  // Create kernel
  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

-  size_t global_size[2] = {size, size};
-  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};  
-
  // Set kernel arguments
-  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
-  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
-  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
-  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
-  CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
-  CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&i_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&o_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), (void *)&size));

- // Allocate memories for input arrays and output arrays.
- std::vector<float> h_a(num_points);
- std::vector<float> h_b(num_points);
- std::vector<float> h_c(num_points);
+  // Allocate memories for input arrays and output arrays.
+  std::vector<float2> h_i(size);
+  std::vector<float2> h_o(size);
 	
  // Generate input values
-  for (uint32_t i = 0; i < num_points; ++i) {
-    h_a[i] = static_cast<float>(rand()) / RAND_MAX;
-    h_b[i] = static_cast<float>(rand()) / RAND_MAX;
+  for (int i = 0; i < size; ++i) {
+    h_i[i].x = sin(2 * M_PI * i / size); // Sine wave as an example
+    h_i[i].y = 0.0f; // Zero imaginary part
  }

  // Creating command queue
  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));  

 	printf("Upload source buffers\n");
-  CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a.data(), 0, NULL, NULL));
-  CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b.data(), 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, nbytes, h_i.data(), 0, NULL, NULL));

  printf("Execute the kernel\n");
+  size_t global_work_size[1] = {size};
+  size_t local_work_size[1] = {LOCAL_SIZE};
  auto time_start = std::chrono::high_resolution_clock::now();
-  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL));
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
  CL_CHECK(clFinish(commandQueue));
  auto time_end = std::chrono::high_resolution_clock::now();
  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
  printf("Elapsed time: %lg ms\n", elapsed);

  printf("Download destination buffer\n");
-  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, nbytes, h_o.data(), 0, NULL, NULL));

  printf("Verify result\n");
-  std::vector<float> ref_vec(num_points);
-  matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size);
-  int errors = 0;
-  for (uint32_t i = 0; i < num_points; ++i) {
-    if (!compare_equal(h_c[i], ref_vec[i])) {
-      if (errors < 100) 
-        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
-      ++errors;
-    }
-  }
-  if (errors != 0) {
-    printf("FAILED! - %d errors\n", errors);    
-  } else {
+  std::vector<float2> reference = referenceDFT(h_i);
+  auto errors = verifyOutput(h_o, reference, size);
+  if (0 == errors) {
    printf("PASSED!\n");
+  } else {
+    printf("FAILED! - %d errors\n", errors);    
  }

  // Clean up		
--- a/tests/opencl/fft/main.cc.o
+++ b/tests/opencl/fft/main.cc.o
--- a/tests/opencl/mri-q/32_32_32_dataset.bin
+++ b/tests/opencl/mri-q/32_32_32_dataset.bin
--- a/tests/opencl/mri-q/Makefile
+++ b/tests/opencl/mri-q/Makefile
@ -1,9 +0,0 @@
-PROJECT = mri-q
-
-SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.cc computeQ.c
-
-CXXFLAGS += -I.
-
-OPTS ?= 
-
-include ../common.mk
--- a/tests/opencl/mri-q/args.c
+++ b/tests/opencl/mri-q/args.c
@ -1,617 +0,0 @@
-
-#include <parboil.h>
-#include <errno.h>
-#include <limits.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-
-/*****************************************************************************/
-/* Memory management routines */
-
-/* Free an array of owned strings. */
-void
-pb_FreeStringArray(char **string_array)
-{
-  char **p;
-
-  if (!string_array) return;
-  for (p = string_array; *p; p++) free(*p);
-  free(string_array);
-}
-
-struct pb_PlatformParam *
-pb_PlatformParam(char *name, char *version)
-{
-  if (name == NULL) {
-    fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
-    exit(-1);
-  }
-
-  struct pb_PlatformParam *ret =
-    (struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
-
-  ret->name = name;
-  ret->version = version;
-  return ret;
-}
-
-void
-pb_FreePlatformParam(struct pb_PlatformParam *p)
-{
-  if (p == NULL) return;
-
-  free(p->name);
-  free(p->version);
-  free(p);
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_index(int index)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_INDEX;
-  ret->index = index;
-  return ret;
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_cpu(void)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_CPU;
-  return ret;
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_gpu(void)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_GPU;
-  return ret;
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_accelerator(void)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_ACCELERATOR;
-  return ret;
-}
-
-struct pb_DeviceParam *
-pb_DeviceParam_name(char *name)
-{
-  struct pb_DeviceParam *ret =
-    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
-  ret->criterion = pb_Device_NAME;
-  ret->name = name;
-  return ret;
-}
-
-void
-pb_FreeDeviceParam(struct pb_DeviceParam *p)
-{
-  if (p == NULL) return;
-
-  switch(p->criterion) {
-  case pb_Device_NAME:
-    free(p->name);
-    break;
-  case pb_Device_INDEX:
-  case pb_Device_CPU:
-  case pb_Device_ACCELERATOR:
-    break;
-  default:
-    fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
-    exit(-1);
-  }
-}
-
-void
-pb_FreeParameters(struct pb_Parameters *p)
-{
-  free(p->outFile);
-  pb_FreeStringArray(p->inpFiles);
-  pb_FreePlatformParam(p->platform);
-  pb_FreeDeviceParam(p->device);
-  free(p);
-}
-
-/*****************************************************************************/
-
-/* Parse a comma-delimited list of strings into an
- * array of strings. */
-static char ** 
-read_string_array(char *in)
-{
-  char **ret;
-  int i;
-  int count;			/* Number of items in the input */
-  char *substring;		/* Current substring within 'in' */
-
-  /* Count the number of items in the string */
-  count = 1;
-  for (i = 0; in[i]; i++) if (in[i] == ',') count++;
-
-  /* Allocate storage */
-  ret = (char **)malloc((count + 1) * sizeof(char *));
-
-  /* Create copies of the strings from the list */
-  substring = in;
-  for (i = 0; i < count; i++) {
-    char *substring_end;
-    int substring_length;
-
-    /* Find length of substring */
-    for (substring_end = substring;
-	 (*substring_end != ',') && (*substring_end != 0);
-	 substring_end++);
-
-    substring_length = substring_end - substring;
-
-    /* Allocate memory and copy the substring */
-    ret[i] = (char *)malloc(substring_length + 1);
-    memcpy(ret[i], substring, substring_length);
-    ret[i][substring_length] = 0;
-
-    /* go to next substring */
-    substring = substring_end + 1;
-  }
-  ret[i] = NULL;		/* Write the sentinel value */
-
-  return ret;
-}
-
-static void
-report_parse_error(const char *str)
-{
-  fputs(str, stderr);
-}
-
-/* Interpret a string as a 'pb_DeviceParam' value.
- * Return a pointer to a new value, or NULL on failure.
- */
-static struct pb_DeviceParam *
-read_device_param(char *str)
-{  
-  /* Try different ways of interpreting 'device_string' until one works */
-
-  /* If argument is an integer, then interpret it as a device index */
-  errno = 0;
-  char *end;
-  long device_int = strtol(str, &end, 10);
-  if (!errno) {
-    /* Negative numbers are not valid */
-    if (device_int < 0 || device_int > INT_MAX) return NULL;
-
-    return pb_DeviceParam_index(device_int);
-  }
-
-  /* Match against predefined strings */
-  if (strcmp(str, "CPU") == 0)
-    return pb_DeviceParam_cpu();
-  if (strcmp(str, "GPU") == 0)
-    return pb_DeviceParam_gpu();
-  if (strcmp(str, "ACCELERATOR") == 0)
-    return pb_DeviceParam_accelerator();
-
-  /* Assume any other string is a device name */
-  return pb_DeviceParam_name(strdup(str));
-}
-
-/* Interpret a string as a 'pb_PlatformParam' value.
- * Return a pointer to a new value, or NULL on failure.
- */
-static struct pb_PlatformParam *
-read_platform_param(char *str)
-{
-  int separator_index;          /* Index of the '-' character separating
-                                 * name and version number.  It's -1 if
-                                 * there's no '-' character. */
-
-  /* Find the last occurrence of '-' in 'str' */
-  {
-    char *cur;
-    separator_index = -1;
-    for (cur = str; *cur; cur++) {
-      if (*cur == '-') separator_index = cur - str;
-    }
-  }
-
-  /* The platform name is either the entire string, or all characters before
-   * the separator */
-  int name_length = separator_index == -1 ? strlen(str) : separator_index;
-  char *name_str = (char *)malloc(name_length + 1);
-  memcpy(name_str, str, name_length);
-  name_str[name_length] = 0;
-
-  /* The version is either NULL, or all characters after the separator */
-  char *version_str;
-  if (separator_index == -1) {
-    version_str = NULL;
-  }
-  else {
-    const char *version_input_str = str + separator_index + 1;
-    int version_length = strlen(version_input_str);
-
-    version_str = (char *)malloc(version_length + 1);
-    memcpy(version_str, version_input_str, version_length);
-    version_str[version_length] = 0;
-  }
-
-  /* Create output structure */
-  return pb_PlatformParam(name_str, version_str);
-}
-
-/****************************************************************************/
-/* Argument parsing state */
-
-/* Argument parsing state.
- *
- * Arguments that are interpreted by the argument parser are removed from
- * the list.  Variables 'argc' and 'argn' do not count arguments that have
- * been removed.
- *
- * During argument parsing, the array of arguments is compacted, overwriting
- * the erased arguments.  Variable 'argv_put' points to the array element
- * where the next argument will be written.  Variable 'argv_get' points to
- * the array element where the next argument will be read from.
- */
-struct argparse {
-  int argc;			/* Number of arguments.  Mutable. */
-  int argn;			/* Current argument index. */
-  char **argv_get;		/* Argument value being read. */
-  char **argv_put;		/* Argument value being written.
-				 * argv_put <= argv_get. */
-};
-
-static void
-initialize_argparse(struct argparse *ap, int argc, char **argv)
-{
-  ap->argc = argc;
-  ap->argn = 0;
-  ap->argv_get = ap->argv_put = argv;
-}
-
-/* Finish argument parsing, without processing the remaining arguments.
- * Write new argument count into _argc. */
-static void
-finalize_argparse(struct argparse *ap, int *_argc, char **argv)
-{
-  /* Move the remaining arguments */
-  for(; ap->argn < ap->argc; ap->argn++)
-    *ap->argv_put++ = *ap->argv_get++;
-
-  /* Update the argument count */
-  *_argc = ap->argc;
-
-  /* Insert a terminating NULL */
-  argv[ap->argc] = NULL;
-}
-
-/* Delete the current argument.  The argument will not be visible
- * when argument parsing is done. */
-static void
-delete_argument(struct argparse *ap)
-{
-  if (ap->argn >= ap->argc) {
-    fprintf(stderr, "delete_argument\n");
-  }
-  ap->argc--;
-  ap->argv_get++;
-}
-
-/* Go to the next argument.  Also, move the current argument to its
- * final location in argv. */
-static void
-next_argument(struct argparse *ap)
-{
-  if (ap->argn >= ap->argc) {
-    fprintf(stderr, "next_argument\n");
-  }
-  /* Move argument to its new location. */
-  *ap->argv_put++ = *ap->argv_get++;
-  ap->argn++;
-}
-
-static int
-is_end_of_arguments(struct argparse *ap)
-{
-  return ap->argn == ap->argc;
-}
-
-/* Get the current argument */
-static char *
-get_argument(struct argparse *ap)
-{
-  return *ap->argv_get;
-}
-
-/* Get the current argument, and also delete it */
-static char *
-consume_argument(struct argparse *ap)
-{
-  char *ret = get_argument(ap);
-  delete_argument(ap);
-  return ret;
-}
-
-/****************************************************************************/
-
-/* The result of parsing a command-line argument */
-typedef enum {
-  ARGPARSE_OK,                  /* Success */
-  ARGPARSE_ERROR,               /* Error */
-  ARGPARSE_DONE                 /* Success, and do not continue parsing */
-} result;
-
-typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
-
-
-/* A command-line option */
-struct option {
-  char short_name;              /* If not 0, the one-character
-                                 * name of this option */
-  const char *long_name;        /* If not NULL, the long name of this option */
-  parse_action *action;       /* What to do when this option occurs.
-                               * Sentinel value is NULL.
-                               */
-};
-
-/* Output file
- *
- * -o FILE
- */
-static result
-parse_output_file(struct argparse *ap, struct pb_Parameters *params)
-{
-  if (is_end_of_arguments(ap))
-    {
-      report_parse_error("Expecting file name after '-o'\n");
-      return ARGPARSE_ERROR;
-    }
-
-  /* Replace the output file name */
-  free(params->outFile);
-  params->outFile = strdup(consume_argument(ap));
-
-  return ARGPARSE_OK;
-}
-
-/* Input files
- *
- * -i FILE,FILE,...
- */
-static result
-parse_input_files(struct argparse *ap, struct pb_Parameters *params)
-{
-  if (is_end_of_arguments(ap))
-    {
-      report_parse_error("Expecting file name after '-i'\n");
-      return ARGPARSE_ERROR;
-    }
-
-  /* Replace the input file list */
-  pb_FreeStringArray(params->inpFiles);
-  params->inpFiles = read_string_array(consume_argument(ap));
-  return ARGPARSE_OK;
-}
-
-/* End of options
- *
- * --
- */
-
-static result
-parse_end_options(struct argparse *ap, struct pb_Parameters *params)
-{
-  return ARGPARSE_DONE;
-}
-
-/* OpenCL device
- *
- * --device X
- */
-
-static result
-parse_device(struct argparse *ap, struct pb_Parameters *params)
-{
-  /* Read the next argument, which specifies a device */
-
-  if (is_end_of_arguments(ap))
-    {
-      report_parse_error("Expecting device specification after '--device'\n");
-      return ARGPARSE_ERROR;
-    }
-
-  char *device_string = consume_argument(ap);
-  struct pb_DeviceParam *device_param = read_device_param(device_string);
-
-  if (!device_param) {
-    report_parse_error("Unrecognized device specification format on command line\n");
-    return ARGPARSE_ERROR;
-  }
-
-  /* Save the result */
-  pb_FreeDeviceParam(params->device);
-  params->device = device_param;
-
-  return ARGPARSE_OK;
-}
-
-static result
-parse_platform(struct argparse *ap, struct pb_Parameters *params)
-{
-  /* Read the next argument, which specifies a platform */
-
-  if (is_end_of_arguments(ap))
-    {
-      report_parse_error("Expecting device specification after '--platform'\n");
-      return ARGPARSE_ERROR;
-    }
-
-  char *platform_string = consume_argument(ap);
-  struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
-
-  if (!platform_param) {
-    report_parse_error("Unrecognized platform specification format on command line\n");
-    return ARGPARSE_ERROR;
-  }
-
-  /* Save the result */
-  pb_FreePlatformParam(params->platform);
-  params->platform = platform_param;
-
-  return ARGPARSE_OK;
-}
-  
-
-static struct option options[] = {
-  { 'o', NULL, &parse_output_file },
-  { 'i', NULL, &parse_input_files },
-  { '-', NULL, &parse_end_options },
-  { 0, "device", &parse_device },
-  { 0, "platform", &parse_platform },
-  { 0, NULL, NULL }
-};
-
-static int
-is_last_option(struct option *op)
-{
-  return op->action == NULL;
-}
-
-/****************************************************************************/
-
-/* Parse command-line parameters.
- * Return zero on error, nonzero otherwise.
- * On error, the other outputs may be invalid.
- *
- * The information collected from parameters is used to update
- * 'ret'.  'ret' should be initialized.
- *
- * '_argc' and 'argv' are updated to contain only the unprocessed arguments.
- */
-static int
-pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
-{
-  char *err_message;
-  struct argparse ap;
-
-  /* Each argument */
-  initialize_argparse(&ap, *_argc, argv);
-  while(!is_end_of_arguments(&ap)) {
-    result arg_result;          /* Result of parsing this option */
-    char *arg = get_argument(&ap);
-
-    /* Process this argument */
-    if (arg[0] == '-') {
-      /* Single-character flag */
-      if ((arg[1] != 0) && (arg[2] == 0)) {
-        delete_argument(&ap);	/* This argument is consumed here */
-
-        /* Find a matching short option */
-        struct option *op;
-        for (op = options; !is_last_option(op); op++) {
-          if (op->short_name == arg[1]) {
-            arg_result = (*op->action)(&ap, ret);
-            goto option_was_processed;
-          }
-        }
-
-        /* No option matches */
-        report_parse_error("Unexpected command-line parameter\n");
-        arg_result = ARGPARSE_ERROR;
-        goto option_was_processed;
-      }
-
-      /* Long flag */
-      if (arg[1] == '-') {
-        delete_argument(&ap);	/* This argument is consumed here */
-
-        /* Find a matching long option */
-        struct option *op;
-        for (op = options; !is_last_option(op); op++) {
-          if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
-            arg_result = (*op->action)(&ap, ret);
-            goto option_was_processed;
-          }
-        }
-
-        /* No option matches */
-        report_parse_error("Unexpected command-line parameter\n");
-        arg_result = ARGPARSE_ERROR;
-        goto option_was_processed;
-      }
-    }
-    else {
-      /* Other arguments are ignored */
-      next_argument(&ap);
-      arg_result = ARGPARSE_OK;
-      goto option_was_processed;
-    }
-
-  option_was_processed:
-    /* Decide what to do next based on 'arg_result' */
-    switch(arg_result) {
-    case ARGPARSE_OK:
-      /* Continue processing */
-      break;
-
-    case ARGPARSE_ERROR:
-      /* Error exit from the function */
-      return 0;
-
-    case ARGPARSE_DONE:
-      /* Normal exit from the argument parsing loop */
-      goto end_of_options;
-    }
-  } /* end for each argument */
-
-  /* If all arguments were processed, then normal exit from the loop */
-
- end_of_options:
-  finalize_argparse(&ap, _argc, argv);
-  return 1;
-}
-
-/*****************************************************************************/
-/* Other exported functions */
-
-struct pb_Parameters *
-pb_ReadParameters(int *_argc, char **argv)
-{
-  struct pb_Parameters *ret =
-    (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
-
-  /* Initialize the parameters structure */
-  ret->outFile = NULL;
-  ret->inpFiles = (char **)malloc(sizeof(char *));
-  ret->inpFiles[0] = NULL;
-  ret->platform = NULL;
-  ret->device = NULL;
-
-  /* Read parameters and update _argc, argv */
-  if (!pb_ParseParameters(ret, _argc, argv)) {
-    /* Parse error */
-    pb_FreeParameters(ret);
-    return NULL;
-  }
-
-  return ret;
-}
-
-int
-pb_Parameters_CountInputs(struct pb_Parameters *p)
-{
-  int n;
-
-  for (n = 0; p->inpFiles[n]; n++);
-  return n;
-}
-
--- a/tests/opencl/mri-q/computeQ.c
+++ b/tests/opencl/mri-q/computeQ.c
@ -1,118 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#include <stdio.h>
-#include <malloc.h>
-#include <CL/cl.h>
-#include "ocl.h"
-#include "macros.h"
-#include "computeQ.h"
-#include "parboil.h"
-
-#define NC 1
-
-void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm)
-{
-  int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-  if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
-    phiMagBlocks++;
-  
-  size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-  size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;
-
-  cl_int clStatus;
-  clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(cl_mem),&phiR_d);
-  clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(cl_mem),&phiI_d);
-  clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&phiMag_d);
-  clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(int),&numK);
-  CHECK_ERROR("clSetKernelArg")
-
-  clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimPhiMagGrid,&DimPhiMagBlock,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueNDRangeKernel")
-}
-
-static
-unsigned long long int
-readElapsedTime(cl_event internal)
-{
-  cl_int status;
-  cl_ulong t_begin, t_end;
-  status = clGetEventProfilingInfo(internal, CL_PROFILING_COMMAND_START,
-    sizeof(cl_ulong), &t_begin, NULL);
-  if (status != CL_SUCCESS) return 0;
-  status = clGetEventProfilingInfo(internal, CL_PROFILING_COMMAND_END,
-  sizeof(cl_ulong), &t_end, NULL);
-  if (status != CL_SUCCESS) return 0;
-  return (unsigned long long int)(t_end - t_begin);
-}
-
-
-void computeQ_GPU (int numK,int numX,
-		   cl_mem x_d, cl_mem y_d, cl_mem z_d,
-		   struct kValues* kVals,
-		   cl_mem Qr_d, cl_mem Qi_d,
-		   clPrmtr* clPrm)
-{
-  int QGrids = numK / KERNEL_Q_K_ELEMS_PER_GRID;
-  if (numK % KERNEL_Q_K_ELEMS_PER_GRID)
-    QGrids++;
-  int QBlocks = numX / KERNEL_Q_THREADS_PER_BLOCK;
-  if (numX % KERNEL_Q_THREADS_PER_BLOCK)
-    QBlocks++;
-
-  size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC;
-  size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC;
-
-  cl_int clStatus;
-  cl_mem ck;
-  ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
-
-  int QGrid;
-  for (QGrid = 0; QGrid < QGrids; QGrid++) {
-    // Put the tile of K values into constant mem
-    int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
-    struct kValues* kValsTile = kVals + QGridBase;
-    int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
-
-    clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,ck,CL_TRUE,0,numElems*sizeof(struct kValues),kValsTile,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueWriteBuffer")
-    
-    clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(int),&numK);
-    clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(int),&QGridBase);
-    clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&x_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(cl_mem),&y_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,4,sizeof(cl_mem),&z_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,5,sizeof(cl_mem),&Qr_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,6,sizeof(cl_mem),&Qi_d);
-    clStatus = clSetKernelArg(clPrm->clKernel,7,sizeof(cl_mem),&ck);
-    CHECK_ERROR("clSetKernelArg")
-
-    printf ("Grid: %d, Block: %d\n", DimQGrid, DimQBlock);
-
-    #define TIMED_EXECUTION
-    #ifdef TIMED_EXECUTION
-    cl_event e;
-    clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,&e);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
-    clWaitForEvents(1, &e);
-    printf ("%llu\n", readElapsedTime(e));
-    #else
-    clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,NULL);
-    CHECK_ERROR("clEnqueueNDRangeKernel")
-    #endif
-  }
-}
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 float** Qr, float** Qi)
-{
-  *phiMag = (float* ) memalign(16, numK * sizeof(float));
-  *Qr = (float*) memalign(16, numX * sizeof (float));
-  *Qi = (float*) memalign(16, numX * sizeof (float));
-}
-
--- a/tests/opencl/mri-q/computeQ.h
+++ b/tests/opencl/mri-q/computeQ.h
@ -1,22 +0,0 @@
-#ifndef __COMPUTEQ__
-#define __COMPUTEQ__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm);
-void computeQ_GPU (int numK,int numX,
-		   cl_mem x_d, cl_mem y_d, cl_mem z_d,
-		   struct kValues* kVals,
-		   cl_mem Qr_d, cl_mem Qi_d,
-		   clPrmtr* clPrm);
-
-void createDataStructsCPU(int numK, int numX, float** phiMag,
-	 		  float** Qr, float** Qi);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/tests/opencl/mri-q/file.cc
+++ b/tests/opencl/mri-q/file.cc
@ -1,78 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-//#include <endian.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <inttypes.h>
-
-#include "file.h"
-
-#if __BYTE_ORDER != __LITTLE_ENDIAN
-# error "File I/O is not implemented for this system: wrong endianness."
-#endif
-
-extern "C"
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI)
-{
-  int numK, numX;
-  FILE* fid = fopen(fName, "r");
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open input file\n");
-      exit(-1);
-    }
-  fread (&numK, sizeof (int), 1, fid);
-  *_numK = numK;
-  fread (&numX, sizeof (int), 1, fid);
-  *_numX = numX;
-  *kx = (float *) memalign(16, numK * sizeof (float));
-  fread (*kx, sizeof (float), numK, fid);
-  *ky = (float *) memalign(16, numK * sizeof (float));
-  fread (*ky, sizeof (float), numK, fid);
-  *kz = (float *) memalign(16, numK * sizeof (float));
-  fread (*kz, sizeof (float), numK, fid);
-  *x = (float *) memalign(16, numX * sizeof (float));
-  fread (*x, sizeof (float), numX, fid);
-  *y = (float *) memalign(16, numX * sizeof (float));
-  fread (*y, sizeof (float), numX, fid);
-  *z = (float *) memalign(16, numX * sizeof (float));
-  fread (*z, sizeof (float), numX, fid);
-  *phiR = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiR, sizeof (float), numK, fid);
-  *phiI = (float *) memalign(16, numK * sizeof (float));
-  fread (*phiI, sizeof (float), numK, fid);
-  fclose (fid); 
-}
-
-extern "C"
-void outputData(char* fName, float* outR, float* outI, int numX)
-{
-  FILE* fid = fopen(fName, "w");
-  uint32_t tmp32;
-
-  if (fid == NULL)
-    {
-      fprintf(stderr, "Cannot open output file\n");
-      exit(-1);
-    }
-
-  /* Write the data size */
-  tmp32 = numX;
-  fwrite(&tmp32, sizeof(uint32_t), 1, fid);
-
-  /* Write the reconstructed data */
-  fwrite (outR, sizeof (float), numX, fid);
-  fwrite (outI, sizeof (float), numX, fid);
-  fclose (fid);
-}
--- a/tests/opencl/mri-q/file.h
+++ b/tests/opencl/mri-q/file.h
@ -1,22 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void inputData(char* fName, int* _numK, int* _numX,
-               float** kx, float** ky, float** kz,
-               float** x, float** y, float** z,
-               float** phiR, float** phiI);
-
-void outputData(char* fName, float* outR, float* outI, int numX);
-
-#ifdef __cplusplus
-}
-#endif
--- a/tests/opencl/mri-q/gpu_info.c
+++ b/tests/opencl/mri-q/gpu_info.c
@ -1,55 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-//#include <endian.h>
-#include <stdlib.h>
-#include <malloc.h>
-#include <stdio.h>
-#include <inttypes.h>
-
-#include "gpu_info.h"
-
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm)
-{
-	int max_thread;
-	int max_block=8;
-	if(major==1)
-	{
-		if(minor>=2)
-			max_thread=1024;
-		else
-			max_thread=768;
-	}
-	else if(major==2)
-		max_thread=1536;
-	else
-		//newer GPU  //keep using 2.0
-		max_thread=1536;
-	
-	int _grid;
-	int _thread;
-	
-	if(task*pad>sm*max_thread)
-	{
-		_thread=max_thread/max_block;
-		_grid = ((task*pad+_thread-1)/_thread)*_thread;
-	}
-	else
-	{
-		_thread=pad;
-		_grid=task*pad;
-	}
-
-	thread[0]=_thread;
-	grid[0]=_grid;
-}
--- a/tests/opencl/mri-q/gpu_info.h
+++ b/tests/opencl/mri-q/gpu_info.h
@ -1,28 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2010 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-#ifndef __GPUINFOH__
-#define __GPUINFOH__
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void compute_active_thread(size_t *thread,
-			   size_t *grid,
-			   int task,
-			   int pad,
-			   int major,
-			   int minor,
-			   int sm);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/tests/opencl/mri-q/kernel.cl
+++ b/tests/opencl/mri-q/kernel.cl
@ -1,51 +0,0 @@
-#include "macros.h"
-
-__kernel void
-ComputePhiMag_GPU(__global float* phiR, __global float* phiI, __global float* phiMag, int numK) {
-  int indexK = get_global_id(0);
-  float real = indexK;
-  float imag = indexK;
-  if (indexK < numK) {
-    /*float*/ real = phiR[indexK];
-    /*float*/ imag = phiI[indexK];
-    phiMag[indexK] = real*real + imag*imag;
-  }
-}
-
-__kernel void
-ComputeQ_GPU(int numK, int kGlobalIndex,
-	     __global float* x, __global float* y, __global float* z,
-	     __global float* Qr, __global float* Qi, __global struct kValues* ck) 
-{
-  float sX;
-  float sY;
-  float sZ;
-  float sQr;
-  float sQi;
-
-  // Determine the element of the X arrays computed by this thread
-  int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + get_local_id(0);
-
-  // Read block's X values from global mem to shared mem
-  sX = x[xIndex];
-  sY = y[xIndex];
-  sZ = z[xIndex];
-  sQr = Qr[xIndex];
-  sQi = Qi[xIndex];
-
-  int kIndex = 0;
-  for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID); kIndex++) {
-    if (kGlobalIndex < numK) {
-      float expArg;
-      expArg = PIx2 * (ck[kIndex].Kx * sX +
-			   ck[kIndex].Ky * sY +
-			   ck[kIndex].Kz * sZ);
-      sQr = sQr + ck[kIndex].PhiMag * cos(expArg); // native_cos(expArg);
-      sQi = sQi + ck[kIndex].PhiMag * sin(expArg); // native_sin(expArg);
-    }
-    kGlobalIndex++;
-  }
-
-  Qr[xIndex] = sQr;
-  Qi[xIndex] = sQi;
-}
--- a/tests/opencl/mri-q/macros.h
+++ b/tests/opencl/mri-q/macros.h
@ -1,21 +0,0 @@
-#ifndef __MACROS__
-#define __MACROS__
-
-#define PI   3.1415926535897932384626433832795029f
-#define PIx2 6.2831853071795864769252867665590058f
-
-#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
-#define K_ELEMS_PER_GRID 2048
-
-#define KERNEL_PHI_MAG_THREADS_PER_BLOCK 256
-#define KERNEL_Q_THREADS_PER_BLOCK 256
-#define KERNEL_Q_K_ELEMS_PER_GRID 1024
-
-struct kValues {
-  float Kx;
-  float Ky;
-  float Kz;
-  float PhiMag;
-};
-
-#endif
--- a/tests/opencl/mri-q/main.cc
+++ b/tests/opencl/mri-q/main.cc
@ -1,321 +0,0 @@
-/***************************************************************************
- *cr
- *cr            (C) Copyright 2007 The Board of Trustees of the
- *cr                        University of Illinois
- *cr                         All Rights Reserved
- *cr
- ***************************************************************************/
-
-/* 
- * C code for creating the Q data structure for fast convolution-based 
- * Hessian multiplication for arbitrary k-space trajectories.
- *
- * Inputs:
- * kx - VECTOR of kx values, same length as ky and kz
- * ky - VECTOR of ky values, same length as kx and kz
- * kz - VECTOR of kz values, same length as kx and ky
- * x  - VECTOR of x values, same length as y and z
- * y  - VECTOR of y values, same length as x and z
- * z  - VECTOR of z values, same length as x and y
- * phi - VECTOR of the Fourier transform of the spatial basis 
- *      function, evaluated at [kx, ky, kz].  Same length as kx, ky, and kz.
- *
- * recommended g++ options:
- *  -O3 -lm -ffast-math -funroll-all-loops
- */
-
-#include <stdio.h>
-#include <sys/time.h>
-#include <parboil.h>
-#include <CL/cl.h>
-
-#include "ocl.h"
-#include "file.h"
-#include "macros.h"
-#include "computeQ.h"
-
-static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
-  if (nullptr == filename || nullptr == data || 0 == size)
-    return CL_INVALID_VALUE;
-
-  FILE* fp = fopen(filename, "r");
-  if (NULL == fp) {
-    fprintf(stderr, "Failed to load kernel.");
-    return CL_INVALID_VALUE;
-  }
-  fseek(fp , 0 , SEEK_END);
-  long fsize = ftell(fp);
-  rewind(fp);
-
-  *data = (uint8_t*)malloc(fsize);
-  *size = fread(*data, 1, fsize, fp);
-  
-  fclose(fp);
-  
-  return CL_SUCCESS;
-}
-
-static void
-setupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr,clPrmtr* clPrm)
-{
-  cl_int clStatus;
-  *dev_ptr = clCreateBuffer(clPrm->clContext,CL_MEM_READ_ONLY,num*size,NULL,&clStatus);
-  CHECK_ERROR("clCreateBuffer");
-  clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
-  CHECK_ERROR("clEnequeueWriteBuffer");
-}
-
-static void
-cleanupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr, clPrmtr* clPrm)
-{
-  cl_int clStatus;
-  clStatus = clEnqueueReadBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
-  CHECK_ERROR("clEnqueueReadBuffer")
-  clStatus = clReleaseMemObject(*dev_ptr);
-  CHECK_ERROR("clReleaseMemObject")
-}
-
-int
-main (int argc, char *argv[]) {
-  int numX, numK;		/* Number of X and K values */
-  int original_numK;		/* Number of K values in input file */
-  float *kx, *ky, *kz;		/* K trajectory (3D vectors) */
-  float *x, *y, *z;		/* X coordinates (3D vectors) */
-  float *phiR, *phiI;		/* Phi values (complex) */
-  float *phiMag;		/* Magnitude of Phi */
-  float *Qr, *Qi;		/* Q signal (complex) */
-
-  struct kValues* kVals;
-
-  struct pb_Parameters *params;
-  struct pb_TimerSet timers;
-
-  pb_InitializeTimerSet(&timers);
-
-  /* Read command line */
-  params = pb_ReadParameters(&argc, argv);
-  
-  params->inpFiles = (char **)malloc(sizeof(char *) * 2);
-  params->inpFiles[0] = (char *)malloc(100);
-  params->inpFiles[1] = NULL;
-  strncpy(params->inpFiles[0], "32_32_32_dataset.bin", 100);
-
-  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
-    {
-      fprintf(stderr, "Expecting one input filename\n");
-      exit(-1);
-    }
-    
-  /* Read in data */
-  pb_SwitchToTimer(&timers, pb_TimerID_IO);
-  inputData(params->inpFiles[0],
-	    &original_numK, &numX,
-	    &kx, &ky, &kz,
-	    &x, &y, &z,
-	    &phiR, &phiI);
-
-  /* Reduce the number of k-space samples if a number is given
-   * on the command line */
-  if (argc < 2)
-    numK = original_numK;
-  else
-    {
-      int inputK;
-      char *end;
-      inputK = strtol(argv[1], &end, 10);
-      if (end == argv[1])
-	{
-	  fprintf(stderr, "Expecting an integer parameter\n");
-	  exit(-1);
-	}
-
-      numK = MIN(inputK, original_numK);
-    }
-
-  printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
-         numX, original_numK, numK);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  clPrmtr clPrm;
-
-  pb_Context* pb_context;
-  pb_context = pb_InitOpenCLContext(params);
-  if (pb_context == NULL) {
-    fprintf (stderr, "Error: No OpenCL platform/device can be found.");
-    return -1;
-  }
-
-  cl_int clStatus;
-  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
-  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
-  clPrm.clContext = (cl_context) pb_context->clContext;
-
-  clPrm.clCommandQueue = clCreateCommandQueue(clPrm.clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
-  CHECK_ERROR("clCreateCommandQueue")
-
-  pb_SetOpenCL(&(clPrm.clContext), &(clPrm.clCommandQueue));
-
-#ifdef HOSTGPU
-  const char* clSource[] = {readFile("kernel.cl")};
-  CHECK_ERROR("clCreateProgramWithSource")
-  cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
-#else
-  uint8_t *kernel_bin = NULL;
-  size_t kernel_size;
-  cl_int binary_status = 0;  
-  CHECK_ERROR("read_kernel_file")
-  clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);  
-  CHECK_ERROR("clCreateProgramWithSource")
-	cl_program clProgram = clCreateProgramWithBinary(
-      clPrm.clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);  
-#endif
-
-  char options[50];
-  sprintf(options,"-I src/opencl_nvidia");
-  clStatus = clBuildProgram(clProgram,0,NULL,options,NULL,NULL);
-  if (clStatus != CL_SUCCESS) {
-    char buf[4096];
-    clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 4096, buf, NULL);
-    printf ("%s\n", buf);
-    CHECK_ERROR("clBuildProgram")
-  }
-
-  /* Create CPU data structures */
-  createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);
-
-  /* GPU section 1 (precompute PhiMag) */
-  {
-    clPrm.clKernel = clCreateKernel(clProgram,"ComputePhiMag_GPU",&clStatus);
-    CHECK_ERROR("clCreateKernel")    
-
-    /* Mirror several data structures on the device */
-    cl_mem phiR_d;
-    cl_mem phiI_d;
-    cl_mem phiMag_d;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-    
-    setupMemoryGPU(numK,sizeof(float),&phiR_d,phiR,&clPrm);
-    setupMemoryGPU(numK,sizeof(float),&phiI_d,phiI,&clPrm);
-    phiMag_d = clCreateBuffer(clPrm.clContext,CL_MEM_WRITE_ONLY,numK*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    computePhiMag_GPU(numK, phiR_d, phiI_d, phiMag_d, &clPrm);
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    cleanupMemoryGPU(numK,sizeof(float),&phiMag_d,phiMag,&clPrm);
-
-    clStatus = clReleaseMemObject(phiR_d);
-    CHECK_ERROR("clReleaseMemObject")
-    clStatus = clReleaseMemObject(phiI_d);
-    CHECK_ERROR("clReleaseMemObject")
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));
-
-  int k;
-  for (k = 0; k < numK; k++) {
-    kVals[k].Kx = kx[k];
-    kVals[k].Ky = ky[k];
-    kVals[k].Kz = kz[k];
-    kVals[k].PhiMag = phiMag[k];
-  }
-
-  free(phiMag);
-  
-  clStatus = clReleaseKernel(clPrm.clKernel);
-
-  /* GPU section 2 */
-  {
-    clPrm.clKernel = clCreateKernel(clProgram,"ComputeQ_GPU",&clStatus);
-    CHECK_ERROR("clCreateKernel")
-
-    cl_mem x_d;
-    cl_mem y_d;
-    cl_mem z_d;
-    cl_mem Qr_d;
-    cl_mem Qi_d;
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    setupMemoryGPU(numX,sizeof(float),&x_d,x,&clPrm);
-    setupMemoryGPU(numX,sizeof(float),&y_d,y,&clPrm);
-    setupMemoryGPU(numX,sizeof(float),&z_d,z,&clPrm);
-
-    Qr_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    clMemSet(&clPrm,Qr_d,0,numX*sizeof(float));
-    Qi_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
-    CHECK_ERROR("clCreateBuffer")
-    clMemSet(&clPrm,Qi_d,0,numX*sizeof(float));
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
-
-    computeQ_GPU(numK, numX, x_d, y_d, z_d, kVals, Qr_d, Qi_d, &clPrm);
-
-    clStatus = clFinish(clPrm.clCommandQueue);
-    CHECK_ERROR("clFinish")
-
-    pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-    clStatus = clReleaseMemObject(x_d);
-    CHECK_ERROR("clReleaseMemObject")
-    clStatus = clReleaseMemObject(y_d);
-    CHECK_ERROR("clReleaseMemObject")
-    clStatus = clReleaseMemObject(z_d);
-    CHECK_ERROR("clReleaseMemObject")
-    cleanupMemoryGPU(numX,sizeof(float),&Qr_d,Qr,&clPrm);
-    cleanupMemoryGPU(numX,sizeof(float),&Qi_d,Qi,&clPrm);
-  }
-
-  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-
-  if (params->outFile)
-    {
-      /* Write Q to file */
-      pb_SwitchToTimer(&timers, pb_TimerID_IO);
-      outputData(params->outFile, Qr, Qi, numX);
-      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
-    }
-
-  free (kx);
-  free (ky);
-  free (kz);
-  free (x);
-  free (y);
-  free (z);
-  free (phiR);
-  free (phiI);
-  free (kVals);
-  free (Qr);
-  free (Qi);
-
-  //free((void*)clSource[0]);
-
-  clStatus = clReleaseKernel(clPrm.clKernel);
-  clStatus = clReleaseProgram(clProgram);
-  clStatus = clReleaseCommandQueue(clPrm.clCommandQueue);
-  clStatus = clReleaseContext(clPrm.clContext);
-
-  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
-  pb_PrintTimerSet(&timers);
-
-  pb_FreeParameters(params);
-
-  return 0;
-}
--- a/tests/opencl/mri-q/ocl
+++ b/tests/opencl/mri-q/ocl
@ -1,50 +0,0 @@
-#include <CL/cl.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include "ocl.h"
-
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
-
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
-
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
-}
-
-void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
-}
--- a/tests/opencl/mri-q/ocl
+++ b/tests/opencl/mri-q/ocl
@ -1,21 +0,0 @@
-#ifndef __OCLH__
-#define __OCLH__
-
-typedef struct {
-	cl_uint major;
-	cl_uint minor;
-	cl_uint multiProcessorCount;
-} OpenCLDeviceProp;
-
-void clMemSet(cl_command_queue, cl_mem, int, size_t);
-char* readFile(const char*);
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
-  }
-
-#endif
--- a/tests/opencl/mri-q/ocl.c
+++ b/tests/opencl/mri-q/ocl.c
@ -1,50 +0,0 @@
-#include <CL/cl.h>
-#include <stdio.h>
-#include <string.h>
-#include "ocl.h"
-#include <parboil.h>
-
-char* readFile(const char* fileName)
-{
-        FILE* fp;
-        fp = fopen(fileName,"r");
-        if(fp == NULL)
-        {
-                printf("Error 1!\n");
-                exit(1);
-        }
-
-        fseek(fp,0,SEEK_END);
-        long size = ftell(fp);
-        rewind(fp);
-
-        char* buffer = (char*)malloc(sizeof(char)*(size+1));
-        if(buffer  == NULL)
-        {
-                printf("Error 2!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-        size_t res = fread(buffer,1,size,fp);
-        if(res != size)
-        {
-                printf("Error 3!\n");
-                fclose(fp);
-                exit(1);
-        }
-
-	buffer[size] = 0;
-        fclose(fp);
-        return buffer;
-}
-
-void clMemSet(clPrmtr* clPrm, cl_mem buf, int val, size_t size)
-{
-	cl_int clStatus;
-	char* temp = (char*)malloc(size);
-	memset(temp,val,size);
-	clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
-	CHECK_ERROR("clEnqueueWriteBuffer")
-	free(temp);
-}
--- a/tests/opencl/mri-q/ocl.h
+++ b/tests/opencl/mri-q/ocl.h
@ -1,31 +0,0 @@
-#ifndef __OCLH__
-#define __OCLH__
-
-#include <stdlib.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef struct {
-	cl_context clContext;
-	cl_command_queue clCommandQueue;
-	cl_kernel clKernel;
-} clPrmtr;
-
-void clMemSet(clPrmtr*, cl_mem, int, size_t);
-char* readFile(const char*);
-
-#define CHECK_ERROR(errorMessage)           \
-  if(clStatus != CL_SUCCESS)                \
-  {                                         \
-     printf("Error: %s!\n",errorMessage);   \
-     printf("Line: %d\n",__LINE__);         \
-     exit(1);                               \
-  }
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif
--- a/tests/opencl/mri-q/parboil.h
+++ b/tests/opencl/mri-q/parboil.h
@ -1,348 +0,0 @@
-/*
- * (c) 2010 The Board of Trustees of the University of Illinois.
- */
-#ifndef PARBOIL_HEADER
-#define PARBOIL_HEADER
-
-#include <stdio.h>
-#include <string.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include <unistd.h>
-
-/* A platform as specified by the user on the command line */
-struct pb_PlatformParam {
-  char *name;                   /* The platform name.  This string is owned. */
-  char *version;                /* The platform version; may be NULL.
-                                 * This string is owned. */
-};
-
-/* Create a PlatformParam from the given strings.
- * 'name' must not be NULL.  'version' may be NULL.
- * If not NULL, the strings should have been allocated by malloc(),
- * and they will be owned by the returned object.
- */
-struct pb_PlatformParam *
-pb_PlatformParam(char *name, char *version);
-
-void
-pb_FreePlatformParam(struct pb_PlatformParam *);
-
-/* A criterion for how to select a device */
-enum pb_DeviceSelectionCriterion {
-  pb_Device_INDEX,             /* Enumerate the devices and select one
-                                * by its number */
-  pb_Device_CPU,                /* Select a CPU device */
-  pb_Device_GPU,                /* Select a GPU device  */
-  pb_Device_ACCELERATOR,        /* Select an accelerator device */
-  pb_Device_NAME                /* Select a device by name */
-};
-
-/* A device as specified by the user on the command line */
-struct pb_DeviceParam {
-  enum pb_DeviceSelectionCriterion criterion;
-  union {
-    int index;                  /* If criterion == pb_Device_INDEX,
-                                 * the index of the device */
-    char *name;                 /* If criterion == pb_Device_NAME,
-                                 * the name of the device.
-                                 * This string is owned. */
-  };
-};
-
-struct pb_DeviceParam *
-pb_DeviceParam_index(int index);
-
-struct pb_DeviceParam *
-pb_DeviceParam_cpu(void);
-
-struct pb_DeviceParam *
-pb_DeviceParam_gpu(void);
-
-struct pb_DeviceParam *
-pb_DeviceParam_accelerator(void);
-
-/* Create a by-name device selection criterion.
- * The string should have been allocated by malloc(), and it will will be
- * owned by the returned object.
- */
-struct pb_DeviceParam *
-pb_DeviceParam_name(char *name);
-
-void
-pb_FreeDeviceParam(struct pb_DeviceParam *);
-
-/* Command line parameters for benchmarks */
-struct pb_Parameters {
-  char *outFile;		/* If not NULL, the raw output of the
-				 * computation should be saved to this
-				 * file. The string is owned. */
-  char **inpFiles;		/* A NULL-terminated array of strings
-				 * holding the input file(s) for the
-				 * computation.  The array and strings
-				 * are owned. */
-  struct pb_PlatformParam *platform; /* If not NULL, the platform
-                                      * specified on the command line. */
-  struct pb_DeviceParam *device; /* If not NULL, the device
-                                      * specified on the command line. */
-};
-
-/* Read command-line parameters.
- *
- * The argc and argv parameters to main are read, and any parameters
- * interpreted by this function are removed from the argument list.
- *
- * A new instance of struct pb_Parameters is returned.
- * If there is an error, then an error message is printed on stderr
- * and NULL is returned.
- */
-struct pb_Parameters *
-pb_ReadParameters(int *_argc, char **argv);
-
-/* Free an instance of struct pb_Parameters.
- */
-void
-pb_FreeParameters(struct pb_Parameters *p);
-
-void
-pb_FreeStringArray(char **);
-
-/* Count the number of input files in a pb_Parameters instance.
- */
-int
-pb_Parameters_CountInputs(struct pb_Parameters *p);
-
-/* A time or duration. */
-//#if _POSIX_VERSION >= 200112L
-typedef unsigned long long pb_Timestamp; /* time in microseconds */
-//#else
-//# error "Timestamps not implemented"
-//#endif
-
-enum pb_TimerState {
-  pb_Timer_STOPPED,
-  pb_Timer_RUNNING,
-};
-
-struct pb_Timer {
-  enum pb_TimerState state;
-  pb_Timestamp elapsed;		/* Amount of time elapsed so far */
-  pb_Timestamp init;		/* Beginning of the current time interval,
-				 * if state is RUNNING.  End of the last 
-				 * recorded time interfal otherwise.  */
-};
-
-/* Reset a timer.
- * Use this to initialize a timer or to clear
- * its elapsed time.  The reset timer is stopped.
- */
-void
-pb_ResetTimer(struct pb_Timer *timer);
-
-/* Start a timer.  The timer is set to RUNNING mode and
- * time elapsed while the timer is running is added to
- * the timer.
- * The timer should not already be running.
- */
-void
-pb_StartTimer(struct pb_Timer *timer);
-
-/* Stop a timer.
- * This stops adding elapsed time to the timer.
- * The timer should not already be stopped.
- */
-void
-pb_StopTimer(struct pb_Timer *timer);
-
-/* Get the elapsed time in seconds. */
-double
-pb_GetElapsedTime(struct pb_Timer *timer);
-
-/* Execution time is assigned to one of these categories. */
-enum pb_TimerID {
-  pb_TimerID_NONE = 0,
-  pb_TimerID_IO,		/* Time spent in input/output */
-  pb_TimerID_KERNEL,		/* Time spent computing on the device, 
-				 * recorded asynchronously */
-  pb_TimerID_COPY,		/* Time spent synchronously moving data 
-				 * to/from device and allocating/freeing 
-				 * memory on the device */
-  pb_TimerID_DRIVER,		/* Time spent in the host interacting with the 
-				 * driver, primarily for recording the time 
-                                 * spent queueing asynchronous operations */
-  pb_TimerID_COPY_ASYNC,	/* Time spent in asynchronous transfers */
-  pb_TimerID_COMPUTE,		/* Time for all program execution other
-				 * than parsing command line arguments,
-				 * I/O, kernel, and copy */
-  pb_TimerID_OVERLAP,		/* Time double-counted in asynchronous and 
-				 * host activity: automatically filled in, 
-				 * not intended for direct usage */
-  pb_TimerID_LAST		/* Number of timer IDs */
-};
-
-/* Dynamic list of asynchronously tracked times between events */
-struct pb_async_time_marker_list {
-  char *label; // actually just a pointer to a string
-  enum pb_TimerID timerID;	/* The ID to which the interval beginning 
-                                 * with this marker should be attributed */
-  void * marker; 
-  //cudaEvent_t marker; 		/* The driver event for this marker */
-  struct pb_async_time_marker_list *next; 
-};
-
-struct pb_SubTimer {
-  char *label;
-  struct pb_Timer timer;
-  struct pb_SubTimer *next;
-};
-
-struct pb_SubTimerList {
-  struct pb_SubTimer *current;
-  struct pb_SubTimer *subtimer_list;
-};
-
-/* A set of timers for recording execution times. */
-struct pb_TimerSet {
-  enum pb_TimerID current;
-  struct pb_async_time_marker_list* async_markers;
-  pb_Timestamp async_begin;
-  pb_Timestamp wall_begin;
-  struct pb_Timer timers[pb_TimerID_LAST];
-  struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
-};
-
-/* Reset all timers in the set. */
-void
-pb_InitializeTimerSet(struct pb_TimerSet *timers);
-
-void
-pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
-
-/* Select which timer the next interval of time should be accounted
- * to. The selected timer is started and other timers are stopped.
- * Using pb_TimerID_NONE stops all timers. */
-void
-pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
-
-void
-pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
-
-/* Print timer values to standard output. */
-void
-pb_PrintTimerSet(struct pb_TimerSet *timers);
-
-/* Release timer resources */
-void
-pb_DestroyTimerSet(struct pb_TimerSet * timers);
-
-void
-pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
-
-
-typedef struct pb_Device_tag {
-  char* name;
-  void* clDevice;
-  int id;
-  unsigned int in_use;
-  unsigned int available;
-} pb_Device;
-
-struct pb_Context_tag;
-typedef struct pb_Context_tag pb_Context;
-
-typedef struct pb_Platform_tag {
-  char* name;
-  char* version;
-  void* clPlatform;
-  unsigned int in_use;
-  pb_Context** contexts;
-  pb_Device** devices;
-} pb_Platform;
-
-struct pb_Context_tag {
-  void* clPlatformId;
-  void* clContext;
-  void* clDeviceId;
-  pb_Platform* pb_platform;
-  pb_Device* pb_device;
-};
-
-// verbosely print out list of platforms and their devices to the console.
-pb_Platform**
-pb_GetPlatforms();
-
-// Choose a platform according to the given platform specification
-pb_Platform*
-pb_GetPlatform(struct pb_PlatformParam *platform);
-
-// choose a platform: by name, name & version
-pb_Platform*
-pb_GetPlatformByName(const char* name);
-
-pb_Platform*
-pb_GetPlatformByNameAndVersion(const char* name, const char* version);
-
-// Choose a device according to the given device specification
-pb_Device*
-pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
-
-pb_Device**
-pb_GetDevices(pb_Platform* pb_platform);
-
-// choose a device by name.
-pb_Device*
-pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
-
-pb_Platform*
-pb_GetPlatformByEnvVars();
-
-pb_Context*
-pb_InitOpenCLContext(struct pb_Parameters* parameters);
-
-void
-pb_ReleasePlatforms();
-
-void
-pb_ReleaseContext(pb_Context* c);
-
-void
-pb_PrintPlatformInfo(pb_Context* c);
-
-void
-perf_init();
-
-//#define MEASURE_KERNEL_TIME
-
-#include <CL/cl.h>
-
-#ifdef MEASURE_KERNEL_TIME
-#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
-cl_int
-pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
-                       cl_kernel        /* kernel */,
-                       cl_uint          /* work_dim */,
-                       const size_t *   /* global_work_offset */,
-                       const size_t *   /* global_work_size */,
-                       const size_t *   /* local_work_size */,
-                       cl_uint          /* num_events_in_wait_list */,
-                       const cl_event * /* event_wait_list */,
-                       cl_event *       /* event */);
-#endif
-
-enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
-void pb_sig_float(char*, float*, int);
-void pb_sig_double(char*, double*, int);
-void pb_sig_short(char*, short*, int);
-void pb_sig_int(char*, int*, int);
-void pb_sig_uchar(char*, unsigned char*, unsigned int);
-void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif //PARBOIL_HEADER
-
--- a/tests/opencl/mri-q/parboil_opencl.c
+++ b/tests/opencl/mri-q/parboil_opencl.c
--- a/tests/opencl/sgemm2/kernel.cl
+++ b/tests/opencl/sgemm2/kernel.cl
@ -1,73 +0,0 @@
-__kernel void sgemm2(__global float *A, 
-                     __global float *B, 
-                     __global float *C, 
-                     const unsigned int N, 
-                     __local float *localA, 
-                     __local float *localB)
-{
-    int globalRow = get_global_id(1);
-    int globalCol = get_global_id(0);
-    int localRow  = get_local_id(1);
-    int localCol  = get_local_id(0);
-    int localSize = get_local_size(0);  // assuming square local size
-
-    float sum = 0.0f;
-
-    // Loop over all blocks of both matrices
-    for (int k = 0; k < N; k += localSize) {
-        // Load block of matrix A to local memory
-        localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
-
-        // Load block of matrix B to local memory, adjusting for column-major access
-        localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
-
-        // Synchronize to make sure the tiles are loaded
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // Multiply the two matrix blocks and accumulate result
-        for (int j = 0; j < localSize; j++) {
-            sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
-        }
-
-        // Ensure computation is done before loading next block
-        barrier(CLK_LOCAL_MEM_FENCE);
-    }
-
-    C[globalRow * N + globalCol] = sum;
-}
-
-/*__kernel void sgemm2(__global float *A, 
-                       __global float *B, 
-                       __global float *C, 
-                       const unsigned int N)
-{
-    int globalRow = get_global_id(1);
-    int globalCol = get_global_id(0);
-    int localRow  = get_local_id(1);
-    int localCol  = get_local_id(0);
-
-    // Static local memory declaration
-    __local float localA[16][16];
-    __local float localB[16][16];
-
-    float sum = 0.0f;
-
-    // Iterate over blocks
-    for (int k = 0; k < N; k += 16) {
-        // Load a block of matrix A into local memory
-        localA[localRow][localCol] = A[globalRow * N + k + localCol];
-
-        // Load a block of matrix B into local memory
-        localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
-
-        // Ensure the entire block is loaded
-        barrier(CLK_LOCAL_MEM_FENCE);
-
-        // Compute multiplication for this block
-        for (int j = 0; j < 16; j++) {
-            sum += localA[localRow][j] * localB[j][localCol];
-        }
-    }
-
-    C[globalRow * N + globalCol] = sum;
-}*/
--- a/tests/opencl/vectorhypot/Makefile
+++ b/tests/opencl/vectorhypot/Makefile
@ -1,10 +0,0 @@
-PROJECT = vectorhypot
-
-SRCS = main.cc oclUtils.cpp shrUtils.cpp cmd_arg_reader.cpp
-
-CXXFLAGS += -I.
-
-OPTS ?= 
-
-include ../common.mk
-
--- a/tests/opencl/vectorhypot/cmd_arg_reader.cpp
+++ b/tests/opencl/vectorhypot/cmd_arg_reader.cpp
@ -1,152 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-/* CUda UTility Library */
-
-// includes, file
-#include "cmd_arg_reader.h"
-
-// includes, system
-#include <vector>
-
-// internal unnamed namespace
-
-namespace 
-{
-    // types, internal (class, enum, struct, union, typedef)
-
-    // variables, internal
-
-} // namespace {
-
-// variables, exported
-
-/*static*/ CmdArgReader* CmdArgReader::self;
-/*static*/ char** CmdArgReader::rargv;
-/*static*/ int CmdArgReader::rargc;
-
-// functions, exported
-
-////////////////////////////////////////////////////////////////////////////////
-//! Public construction interface
-//! @return a handle to the class instance
-//! @param argc number of command line arguments (as given to main())
-//! @param argv command line argument string (as given to main())
-////////////////////////////////////////////////////////////////////////////////
-/*static*/ void
-CmdArgReader::init( const int argc, const char** argv) 
-{  
-    if ( NULL != self) 
-    {
-        return;
-    }
-
-    // command line arguments 
-    if (( 0 == argc) || ( 0 == argv)) 
-    {
-        LOGIC_EXCEPTION( "No command line arguments given.");
-    }
-
-    self = new CmdArgReader();
-
-    self->createArgsMaps( argc, argv);
-
-    rargc = argc;
-    rargv = const_cast<char**>( argv);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Constructor, default
-////////////////////////////////////////////////////////////////////////////////
-CmdArgReader::CmdArgReader() :
-    args(),
-    unprocessed(),
-    iter(),
-    iter_unprocessed()
-{  }
-
-////////////////////////////////////////////////////////////////////////////////
-//! Destructor
-////////////////////////////////////////////////////////////////////////////////
-CmdArgReader::~CmdArgReader() 
-{
-    for( iter = args.begin(); iter != args.end(); ++iter) 
-    {
-        if( *(iter->second.first) == typeid( int)) 
-        {
-            delete static_cast<int*>( iter->second.second);
-            break;
-        }
-        else if( *(iter->second.first) == typeid( bool)) 
-        {
-            delete static_cast<bool*>( iter->second.second);
-            break;
-        }
-        else if( *(iter->second.first) == typeid( std::string)) 
-        {
-            delete static_cast<std::string*>( iter->second.second);
-            break;
-        }
-        else if( *(iter->second.first) == typeid( std::vector< std::string>) ) 
-        {
-            delete static_cast< std::vector< std::string>* >( iter->second.second);
-            break;
-        }
-        else if( *(iter->second.first) == typeid( std::vector<int>) ) 
-        {
-            delete static_cast< std::vector<int>* >( iter->second.second);
-            break;
-        }
-    }
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Read args as token value pair into map for better processing (Even the 
-//! values remain strings until the parameter values is requested by the
-//! program.)
-//! @param argc the argument count (as given to 'main')
-//! @param argv the char* array containing the command line arguments
-////////////////////////////////////////////////////////////////////////////////
-void
-CmdArgReader::createArgsMaps( const int argc, const char** argv) {
-
-    std::string token;
-    std::string val_str;
-
-    std::map< std::string, std::string> args;
-
-    std::string::size_type pos;
-    std::string arg;
-    for( int i=1; i<argc; ++i) 
-    {
-        arg = argv[i];
-
-        // check if valid command line argument: all arguments begin with - or --
-        if (arg[0] != '-') 
-        {
-            RUNTIME_EXCEPTION("Invalid command line argument.");
-        }
-
-        int numDashes = (arg[1] == '-' ? 2 : 1);
-
-        // check if only flag or if a value is given
-        if ( (pos = arg.find( '=')) == std::string::npos) 
-        {  
-            unprocessed[ std::string( arg, numDashes, arg.length()-numDashes)] = "FLAG";                                  
-        }
-        else 
-        {
-            unprocessed[ std::string( arg, numDashes, pos-numDashes)] = 
-                                      std::string( arg, pos+1, arg.length()-1);
-        }
-    }
-}
-
--- a/tests/opencl/vectorhypot/cmd_arg_reader.h
+++ b/tests/opencl/vectorhypot/cmd_arg_reader.h
@ -1,488 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-/* CUda UTility Library */
-
-#ifndef _CMDARGREADER_H_
-#define _CMDARGREADER_H_
-
-// includes, system
-#include <map>
-#include <iostream>
-#include <sstream>
-#include <algorithm>
-#include <typeinfo>
-
-// includes, project
-#include "exception.h"
-
-//! Preprocessed command line arguments
-//! @note Lazy evaluation: The arguments are converted from strings to
-//!       the correct data type upon request. Converted values are stored 
-//!       in an additonal map so that no additional conversion is
-//!       necessary. Arrays of command line arguments are stored in 
-//!       std::vectors 
-//! @note Usage: 
-//!          const std::string* file = 
-//!                       CmdArgReader::getArg< std::string>( "model")
-//!          const std::vector< std::string>* files = 
-//!          CmdArgReader::getArg< std::vector< std::string> >( "model")
-//! @note All command line arguments begin with '--' followed by the token; 
-//!   token and value are seperated by '='; example --samples=50
-//! @note Arrays have the form --model=[one.obj,two.obj,three.obj] 
-//!       (without whitespaces)
-
-//! Command line argument parser
-class CmdArgReader 
-{
-    template<class> friend class TestCmdArgReader;
-
-protected:
-
-    //! @param self handle to the only instance of this class
-    static  CmdArgReader*  self;
-
-public:
-
-    //! Public construction interface
-    //! @return a handle to the class instance
-    //! @param argc number of command line arguments (as given to main())
-    //! @param argv command line argument string (as given to main())
-    static void init( const int argc, const char** argv);
-
-public:
-
-    //! Get the value of the command line argument with given name
-    //! @return A const handle to the requested argument.
-    //! If the argument does not exist or if it 
-    //!  is not from type T NULL is returned
-    //! @param name the name of the requested argument
-    //! @note T the type of the argument requested
-    template<class T>
-        static inline const T* getArg( const std::string& name);
-
-    //! Check if a command line argument with the given name exists
-    //! @return  true if a command line argument with name \a name exists,
-    //!          otherwise false
-    //! @param name  name of the command line argument in question
-    static inline bool existArg( const std::string& name);
-
-    //! Get the original / raw argc program argument
-    static inline int& getRArgc();
-
-    //! Get the original / raw argv program argument
-    static inline char**& getRArgv();
-
-public:
-
-    //! Destructor
-    ~CmdArgReader();
-
-protected:
-
-    //! Constructor, default
-    CmdArgReader();
-
-private:
-
-    // private helper functions
-
-    //! Get the value of the command line argument with given name
-    //! @note Private helper function for 'getArg' to work on the members
-    //! @return A const handle to the requested argument. If the argument
-    //!         does not exist or if it  is not from type T a NULL pointer
-    //!         is returned.
-    //! @param name the name of the requested argument
-    //! @note T the type of the argument requested
-    template<class T>
-        inline const T* getArgHelper( const std::string& name);
-
-    //! Check if a command line argument with name \a name exists
-    //! @return true if a command line argument of name \a name exists, 
-    //!         otherwise false
-    //! @param name the name of the requested argument
-    inline bool existArgHelper( const std::string& name) const;
-
-    //! Read args as token value pair into map for better processing
-    //!  (Even the values remain strings until the parameter values is 
-    //!   requested by the program.)
-    //! @param argc the argument count (as given to 'main')
-    //! @param argv the char* array containing the command line arguments
-    void  createArgsMaps( const int argc, const char** argv);
-
-    //! Helper for "casting" the strings from the map with the unprocessed 
-    //! values to the correct 
-    //!  data type.
-    //! @return true if conversion succeeded, otherwise false
-    //! @param element the value as string
-    //! @param val the value as type T
-    template<class T>
-        static inline bool convertToT( const std::string& element, T& val);
-
-public:
-
-    // typedefs internal
-
-    //! container for a processed command line argument
-    //! typeid is used to easily be able to decide if a re-requested token-value
-    //! pair match the type of the first conversion
-    typedef std::pair< const std::type_info*, void*>  ValType;
-    //! map of already converted values
-    typedef std::map< std::string, ValType >          ArgsMap;
-    //! iterator for the map of already converted values
-    typedef ArgsMap::iterator                         ArgsMapIter;
-    typedef ArgsMap::const_iterator                   ConstArgsMapIter;
-
-    //! map of unprocessed (means unconverted) token-value pairs
-    typedef std::map< std::string, std::string>            UnpMap;
-    //! iterator for the map of unprocessed (means unconverted) token-value pairs
-    typedef std::map< std::string, std::string>::iterator  UnpMapIter;
-
-private:
-
-#ifdef _WIN32
-#  pragma warning( disable: 4251)
-#endif
-
-    //! rargc original value of argc
-    static  int  rargc;
-
-    //! rargv contains command line arguments in raw format
-    static char**  rargv;
-
-    //! args Map containing the already converted token-value pairs
-    ArgsMap     args;
-
-    //! args Map containing the unprocessed / unconverted token-value pairs
-    UnpMap     unprocessed;
-
-    //! iter Iterator for the map with the already converted token-value 
-    //!  pairs (to avoid frequent reallocation)
-    ArgsMapIter iter;
-
-    //! iter Iterator for the map with the unconverted token-value 
-    //!  pairs (to avoid frequent reallocation)
-    UnpMapIter iter_unprocessed;
-
-#ifdef _WIN32
-#  pragma warning( default: 4251)
-#endif
-
-private:
-
-    //! Constructor, copy (not implemented)
-    CmdArgReader( const CmdArgReader&);
-
-    //! Assignment operator (not implemented)
-    CmdArgReader& operator=( const CmdArgReader&);
-};
-
-// variables, exported (extern)
-
-// functions, inlined (inline)
-
-////////////////////////////////////////////////////////////////////////////////
-//! Conversion function for command line argument arrays
-//! @note This function is used each type for which no template specialization
-//!  exist (which will cause errors if the type does not fulfill the std::vector
-//!  interface).
-////////////////////////////////////////////////////////////////////////////////
-template<class T>
-/*static*/ inline bool
-CmdArgReader::convertToT( const std::string& element, T& val)
-{
-    // preallocate storage
-    val.resize( std::count( element.begin(), element.end(), ',') + 1);
-
-    unsigned int i = 0;
-    std::string::size_type pos_start = 1;  // leave array prefix '['
-    std::string::size_type pos_end = 0;
-
-    // do for all elements of the comma seperated list
-    while( std::string::npos != ( pos_end = element.find(',', pos_end+1)) ) 
-    {
-        // convert each element by the appropriate function
-        if ( ! convertToT< typename T::value_type >( 
-            std::string( element, pos_start, pos_end - pos_start), val[i])) 
-        {
-            return false;
-        }
-
-        pos_start = pos_end + 1;
-        ++i;
-    }
-
-    std::string tmp1(  element, pos_start, element.length() - pos_start - 1);
-
-    // process last element (leave array postfix ']')
-    if ( ! convertToT< typename T::value_type >( std::string( element,
-        pos_start,
-        element.length() - pos_start - 1),
-        val[i])) 
-    {
-        return false;
-    }
-
-    // possible to process all elements?
-    return true;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Conversion function for command line arguments of type int
-////////////////////////////////////////////////////////////////////////////////
-template<>
-inline bool
-CmdArgReader::convertToT<int>( const std::string& element, int& val) 
-{
-    std::istringstream ios( element);
-    ios >> val;
-
-    bool ret_val = false;
-    if ( ios.eof()) 
-    {
-        ret_val = true;
-    }
-
-    return ret_val;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Conversion function for command line arguments of type float
-////////////////////////////////////////////////////////////////////////////////
-template<>
-inline bool
-CmdArgReader::convertToT<float>( const std::string& element, float& val) 
-{
-    std::istringstream ios( element);
-    ios >> val;
-
-    bool ret_val = false;
-    if ( ios.eof()) 
-    {
-        ret_val = true;
-    }
-
-    return ret_val;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Conversion function for command line arguments of type double
-////////////////////////////////////////////////////////////////////////////////
-template<>
-inline bool
-CmdArgReader::convertToT<double>( const std::string& element, double& val) 
-{
-    std::istringstream ios( element);
-    ios >> val;
-
-    bool ret_val = false;
-    if ( ios.eof()) 
-    {
-        ret_val = true;
-    }
-
-    return ret_val;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Conversion function for command line arguments of type string
-////////////////////////////////////////////////////////////////////////////////
-template<>
-inline bool
-CmdArgReader::convertToT<std::string>( const std::string& element, 
-                                      std::string& val)
-{
-    val = element;
-    return true;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Conversion function for command line arguments of type bool
-////////////////////////////////////////////////////////////////////////////////
-template<>
-inline bool
-CmdArgReader::convertToT<bool>( const std::string& element, bool& val) 
-{
-    // check if value is given as string-type { true | false }
-    if ( "true" == element) 
-    {
-        val = true;
-        return true;
-    }
-    else if ( "false" == element) 
-    {
-        val = false;
-        return true;
-    }
-    // check if argument is given as integer { 0 | 1 }
-    else 
-    {
-        int tmp;
-        if ( convertToT<int>( element, tmp)) 
-        {
-            if ( 1 == tmp) 
-            {
-                val = true;
-                return true;
-            }
-            else if ( 0 == tmp) 
-            {
-                val = false;
-                return true;
-            }
-        }
-    }
-
-    return false;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Get the value of the command line argument with given name
-//! @return A const handle to the requested argument. If the argument does
-//!  not exist or if it is not from type T NULL is returned
-//! @param T the type of the argument requested
-//! @param name the name of the requested argument
-////////////////////////////////////////////////////////////////////////////////
-template<class T>
-/*static*/ const T*
-CmdArgReader::getArg( const std::string& name) 
-{
-    if( ! self) 
-    {
-        RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
-        return NULL;
-    }
-
-    return self->getArgHelper<T>( name);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Check if a command line argument with the given name exists
-//! @return  true if a command line argument with name \a name exists,
-//!          otherwise false
-//! @param name  name of the command line argument in question
-////////////////////////////////////////////////////////////////////////////////
-/*static*/ inline bool 
-CmdArgReader::existArg( const std::string& name) 
-{
-    if( ! self) 
-    {
-        RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
-        return false;
-    }
-
-    return self->existArgHelper( name);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! @brief Get the value of the command line argument with given name
-//! @return A const handle to the requested argument. If the argument does
-//!  not exist or if it is not from type T NULL is returned
-//! @param T the type of the argument requested
-//! @param name the name of the requested argument
-////////////////////////////////////////////////////////////////////////////////
-template<class T>
-const T*
-CmdArgReader::getArgHelper( const std::string& name) 
-{
-    // check if argument already processed and stored in correct type
-    if ( args.end() != (iter = args.find( name))) 
-    {
-        if ( (*(iter->second.first)) == typeid( T) ) 
-        {
-            return (T*) iter->second.second;
-        }
-    }
-    else 
-    {
-        T* tmp = new T;
-
-        // check the array with unprocessed values
-        if ( unprocessed.end() != (iter_unprocessed = unprocessed.find( name))) 
-        {
-            // try to "cast" the string to the type requested
-            if ( convertToT< T >( iter_unprocessed->second, *tmp)) 
-            {
-                // add the token element pair to map of already converted values
-                args[name] = std::make_pair( &(typeid( T)), (void*) tmp);
-
-                return tmp;
-            }
-        }
-
-        // not used while not inserted into the map -> cleanup
-        delete tmp;
-    }
-
-    // failed, argument not available
-    return NULL;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Check if a command line argument with name \a name exists
-//! @return true if a command line argument of name \a name exists, 
-//!         otherwise false
-//! @param name the name of the requested argument
-////////////////////////////////////////////////////////////////////////////////
-inline bool
-CmdArgReader::existArgHelper( const std::string& name) const 
-{
-    bool ret_val = false;
-
-    // check if argument already processed and stored in correct type
-    if( args.end() != args.find( name)) 
-    {
-        ret_val = true;
-    }
-    else 
-    {
-
-        // check the array with unprocessed values
-        if ( unprocessed.end() != unprocessed.find( name)) 
-        {
-            ret_val = true; 
-        }
-    }
-
-    return ret_val;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Get the original / raw argc program argument
-////////////////////////////////////////////////////////////////////////////////
-/*static*/ inline int&
-CmdArgReader::getRArgc() 
-{
-    if( ! self) 
-    {
-        RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
-    }
-
-    return rargc;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Get the original / raw argv program argument
-////////////////////////////////////////////////////////////////////////////////
-/*static*/ inline char**&
-CmdArgReader::getRArgv() 
-{
-    if( ! self) 
-    {
-        RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
-    }
-
-    return rargv;
-}
-
-// functions, exported (extern)
-
-#endif // #ifndef _CMDARGREADER_H_
--- a/tests/opencl/vectorhypot/exception.h
+++ b/tests/opencl/vectorhypot/exception.h
@ -1,151 +0,0 @@
-/*
-* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
-*
-* Please refer to the NVIDIA end user license agreement (EULA) associated
-* with this source code for terms and conditions that govern your use of
-* this software. Any use, reproduction, disclosure, or distribution of
-* this software and related documentation outside the terms of the EULA
-* is strictly prohibited.
-*
-*/
-
-/* CUda UTility Library */
-#ifndef _EXCEPTION_H_
-#define _EXCEPTION_H_
-
-// includes, system
-#include <exception>
-#include <stdexcept>
-#include <iostream>
-#include <stdlib.h>
-
-//! Exception wrapper.
-//! @param Std_Exception Exception out of namespace std for easy typing.
-template<class Std_Exception>
-class Exception : public Std_Exception 
-{
-public:
-
-    //! @brief Static construction interface
-    //! @return Alwayss throws ( Located_Exception<Exception>)
-    //! @param file file in which the Exception occurs
-    //! @param line line in which the Exception occurs
-    //! @param detailed details on the code fragment causing the Exception
-    static void throw_it( const char* file, 
-                          const int line,
-                          const char* detailed = "-" );  
-
-    //! Static construction interface
-    //! @return Alwayss throws ( Located_Exception<Exception>)
-    //! @param file file in which the Exception occurs
-    //! @param line line in which the Exception occurs
-    //! @param detailed details on the code fragment causing the Exception
-    static void throw_it( const char* file, 
-                          const int line,      
-                          const std::string& detailed);  
-
-    //! Destructor
-    virtual ~Exception() throw(); 
-
-private:
-
-    //! Constructor, default (private)
-    Exception(); 
-
-    //! Constructor, standard
-    //! @param str string returned by what()
-    Exception( const std::string& str); 
-
-};
-
-////////////////////////////////////////////////////////////////////////////////
-//! Exception handler function for arbitrary exceptions
-//! @param ex exception to handle
-////////////////////////////////////////////////////////////////////////////////
-template<class Exception_Typ>
-inline void
-handleException( const Exception_Typ& ex) 
-{
-    std::cerr << ex.what() << std::endl;
-
-    exit( EXIT_FAILURE);
-}
-
-//! Convenience macros
-
-//! Exception caused by dynamic program behavior, e.g. file does not exist
-#define RUNTIME_EXCEPTION( msg) \
-    Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
-
-//! Logic exception in program, e.g. an assert failed
-#define LOGIC_EXCEPTION( msg) \
-    Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
-
-//! Out of range exception
-#define RANGE_EXCEPTION( msg) \
-    Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
-
-////////////////////////////////////////////////////////////////////////////////
-//! Implementation
-
-// includes, system
-#include <sstream>
-
-////////////////////////////////////////////////////////////////////////////////
-//! Static construction interface.
-//! @param  Exception causing code fragment (file and line) and detailed infos.
-////////////////////////////////////////////////////////////////////////////////
-/*static*/ template<class Std_Exception>
-void
-Exception<Std_Exception>::
-throw_it( const char* file, const int line, const char* detailed) 
-{
-    std::stringstream s;
-
-    // Quiet heavy-weight but exceptions are not for 
-    // performance / release versions
-    s << "Exception in file '" << file << "' in line " << line << "\n"
-      << "Detailed description: " << detailed << "\n";
-
-    throw Exception( s.str());
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Static construction interface.
-//! @param  Exception causing code fragment (file and line) and detailed infos.
-////////////////////////////////////////////////////////////////////////////////
-/*static*/ template<class Std_Exception>
-void
-Exception<Std_Exception>::
-throw_it( const char* file, const int line, const std::string& msg) 
-{
-    throw_it( file, line, msg.c_str());
-}
-
-////////////////////////////////////////////////////////////////////////////////
-//! Constructor, default (private).
-////////////////////////////////////////////////////////////////////////////////
-template<class Std_Exception>
-Exception<Std_Exception>::Exception() :
- Exception("Unknown Exception.\n")
-{ }
-
-////////////////////////////////////////////////////////////////////////////////
-//! Constructor, standard (private).
-//! String returned by what().
-////////////////////////////////////////////////////////////////////////////////
-template<class Std_Exception>
-Exception<Std_Exception>::Exception( const std::string& s) :
- Std_Exception( s)
-{ }   
-
-////////////////////////////////////////////////////////////////////////////////
-//! Destructor
-////////////////////////////////////////////////////////////////////////////////
-template<class Std_Exception>
-Exception<Std_Exception>::~Exception() throw() { }
-
-// functions, exported
-
-#endif // #ifndef _EXCEPTION_H_
-
--- a/tests/opencl/vectorhypot/kernel.cl
+++ b/tests/opencl/vectorhypot/kernel.cl
@ -1,41 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
- 
-// OpenCL Kernel Function Naive Implementation for hyptenuse
-__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
-{
-    // get index into global data array
-    size_t szGlobalOffset = get_global_id(0) + uiOffset;
-
-    // bound check 
-    if (szGlobalOffset >= uiNumElements)
-    {   
-        return; 
-    }
-
-    // Processing 4 elements per work item, so read fgA and fgB source values from GMEM
-    float4 f4A = fg4A[szGlobalOffset];
-    float4 f4B = fg4B[szGlobalOffset];
-    float4 f4H = (float4)0.0f;
-     
-    // Get the hypotenuses the vectors of 'legs', but exaggerate the time needed with loop  
-    for (int i = 0; i < iInnerLoopCount; i++)  
-    {
-        // compute the 4 hypotenuses using built-in function
-        f4H.x = hypot (f4A.x, f4B.x);
-        f4H.y = hypot (f4A.y, f4B.y);
-        f4H.z = hypot (f4A.z, f4B.z);
-        f4H.w = hypot (f4A.w, f4B.w);
-    }
-    
-    // Write 4 result values back out to GMEM
-    fg4Hypot[szGlobalOffset] = f4H;
-}
--- a/tests/opencl/vectorhypot/main.cc
+++ b/tests/opencl/vectorhypot/main.cc
@ -1,702 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-// *********************************************************************
-// oclCopyComputeOverlap Notes:  
-//
-// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
-// element by element vector hyptenuse computation using 2 input float arrays
-// and 1 output float array.
-//
-// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
-// with respect to GPU computation (and with respect to host thread). 
-//
-// Because the overlap acheivable for this computation and data set on a given system depends upon the GPU being used and the
-// GPU/Host bandwidth, the sample adjust the computation duration to test the most ideal case and test against a consistent standard.
-// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
-//
-// After setup, warmup and calibration to the system, the sample runs 4 scenarios:  
-//      A) Computations with 2 command queues on GPU
-//         A multiple-cycle sequence is executed, timed and compared against the host
-//      B) Computations with 1 command queue on GPU
-//         A multiple-cycle sequence is executed, timed and compared against the host
-//
-//      The 2-command queue approach ought to be substantially faster
-//
-// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently 
-// increases compute time without increasing data size (via a loop inside the kernel)
-//
-//      At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
-//      (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
-//
-//      If we name the time to copy single input vector H2D (or outpute vector D2H) as "T", then the optimum comparison case is:
-//        
-//          Single Queue with all the data and all the work  
-//             Ttot (serial)        = 4T + 4T + 2T      = 10T    
-//
-//          Dual Queue, where each queue has 1/2 the data and 1/2 the work 
-//             Tq0  (overlap)       = 2T + 2T + T .... 
-//             Tq1  (overlap)       = .... 2T + 2T + T
-//
-//             Ttot (elapsed, wall) = 2T + 2T + 2T + T  = 7T
-//
-//          Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 %	(Tesla arch 1.2 or 1.3, single copy engine)
-//
-//			For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
-//			This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.			
-// *********************************************************************
-
-// common SDK header for standard utilities and system libs 
-#include <oclUtils.h>
-#include <shrQATest.h>
-#include <iostream>
-
-// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
-// values greater than 0.0f represent a speed-up relative to non-overlapped
-#define EXPECTED_OVERLAP 30.0f
-#define EXPECTED_OVERLAP_FERMI 45.0f
-#define PASS_FACTOR 0.60f
-#define RETRIES_ON_FAILURE 1
-
-// Base sizes for parameters manipulated dynamically or on the command line
-#define BASE_WORK_ITEMS 64
-#define BASE_ARRAY_LENGTH 40000
-#define BASE_LOOP_COUNT 32
-
-static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
-  if (nullptr == filename || nullptr == data || 0 == size)
-    return CL_INVALID_VALUE;
-
-  FILE* fp = fopen(filename, "r");
-  if (NULL == fp) {
-    fprintf(stderr, "Failed to load kernel.");
-    return CL_INVALID_VALUE;
-  }
-  fseek(fp , 0 , SEEK_END);
-  long fsize = ftell(fp);
-  rewind(fp);
-
-  *data = (uint8_t*)malloc(fsize);
-  *size = fread(*data, 1, fsize, fp);
-  
-  fclose(fp);
-  
-  return CL_SUCCESS;
-}
-
-// Vars
-// *********************************************************************
-cl_platform_id cpPlatform;                          // OpenCL platform
-cl_context cxGPUContext;                            // OpenCL context
-cl_command_queue cqCommandQueue[2];                 // OpenCL command queues
-cl_device_id* cdDevices;                            // OpenCL device list  
-cl_program cpProgram;                               // OpenCL program
-cl_kernel ckKernel[2];                              // OpenCL kernel, 1 per queue
-cl_mem cmPinnedSrcA;                                // OpenCL pinned host source buffer A
-cl_mem cmPinnedSrcB;                                // OpenCL pinned host source buffer B 
-cl_mem cmPinnedResult;                              // OpenCL pinned host result buffer 
-float* fSourceA = NULL;                             // Mapped pointer for pinned Host source A buffer
-float* fSourceB = NULL;                             // Mapped pointer for pinned Host source B buffer
-float* fResult = NULL;                              // Mapped pointer for pinned Host result buffer 
-cl_mem cmDevSrcA;                                   // OpenCL device source buffer A
-cl_mem cmDevSrcB;                                   // OpenCL device source buffer B 
-cl_mem cmDevResult;                                 // OpenCL device result buffer 
-size_t szBuffBytes;                                 // Size of main buffers
-size_t szGlobalWorkSize;                            // 1D var for Total # of work items in the launched ND range
-size_t szLocalWorkSize = BASE_WORK_ITEMS;           // initial # of work items in the work group	
-cl_int ciErrNum;			                        // Error code var
-char* cPathAndName = NULL;                          // Var for full paths to data, src, etc.
-char* cSourceCL = NULL;                             // Buffer to hold source for compilation 
-const char* cExecutableName = NULL;
-
-// demo config vars
-const char* cSourceFile = "kernel.cl";         // OpenCL computation kernel source code
-float* Golden = NULL;                               // temp buffer to hold golden results for cross check    
-bool bNoPrompt = false;                             // Command line switch to skip exit prompt
-bool bQATest = false;                               // Command line switch to test
-
-// Forward Declarations
-// *********************************************************************
-double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
-double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
-int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles); 
-void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
-void Cleanup (int iExitCode);
-void (*pCleanup)(int) = &Cleanup;
-
-int *gp_argc = 0;
-const char *** gp_argv = NULL;
-
-// Main function 
-// *********************************************************************
-int main(int argc, const char **argv)
-{
-    //Locals
-    size_t szKernelLength;                      // Byte size of kernel code
-    double dBuildTime;                          // Compile time
-    cl_uint uiTargetDevice = 0;	                // Default Device to compute on
-    cl_uint uiNumDevsUsed = 1;                  // Number of devices used in this sample   
-    cl_uint uiNumDevices;                       // Number of devices available 
-    int iDevCap = -1;                           // Capability of device
-    int iInnerLoopCount = BASE_LOOP_COUNT;      // Varies "compute intensity" per data within the kernel 
-    const int iTestCycles = 10;                 // How many times to run the external test loop 
-    const int iWarmupCycles = 8;                // How many times to run the warmup sequence 
-    cl_uint uiWorkGroupMultiple = 4;            // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
-    cl_uint uiNumElements = BASE_ARRAY_LENGTH;  // initial # of elements per array to process (note: procesing 4 per work item)
-    cl_uint uiSizeMultiple = 4;                 // Command line var (using "sizemult=<n>") to optionally increase vector sizes
-    bool bPassFlag = false;                     // Var to accumulate test pass/fail
-    shrBOOL bMatch = shrFALSE;                  // Cross check result
-	shrBOOL bTestOverlap = shrFALSE;
-	double dAvgGPUTime[2] = {0.0, 0.0};         // Average time of iTestCycles calls for 2-Queue and 1-Queue test
-    double dHostTime[2] = {0.0, 0.0};           // Host computation time (2nd test is redundant but a good stability indicator)
-    float fMinPassCriteria[2] = {0.0f, 0.0f};	// Test pass cireria, adjusted dependant on GPU arch
-
-    gp_argc = &argc;
-    gp_argv = &argv;
-
-    shrQAStart(argc, (char **)argv);
-
-    // start logs 
-	cExecutableName = argv[0];
-    shrSetLogFileName ("oclCopyComputeOverlap.txt");
-    shrLog("%s Starting...\n\n", argv[0]); 
-
-    // get basic command line args 
-    bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
-    bQATest   = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
-    shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
-
-    // Optional Command-line multiplier for vector size 
-    //   Default val of 4 gives 10.24 million float elements per vector
-    //   Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
-    shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
-    uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);  
-    uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
-    shrLog("Array sizes = %u float elements\n", uiNumElements); 
-
-    // Optional Command-line multiplier for workgroup size (x 64 work items)
-    //   Default val of 4 gives szLocalWorkSize of 256.
-    //   Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
-    shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
-    uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10); 
-    szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
-    shrLog("Workgroup Size = %u\n\n", szLocalWorkSize); 
-
-    // Get the NVIDIA platform if available, otherwise use default
-    shrLog("Get the Platform ID...\n\n");
-    ciErrNum = oclGetPlatformID(&cpPlatform);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-
-    // Get OpenCL platform name and version
-    char cBuffer[256];
-    ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    shrLog("Platform Name = %s\n\n", cBuffer);
-
-    // Get all the devices
-    shrLog("Get the Device info and select Device...\n");
-    uiNumDevices = 1;
-    cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
-    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, cdDevices, NULL);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);   
-
-    // Set target device and check capabilities 
-    shrLog(" # of Devices Available = %u\n", uiNumDevices); 
-    uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
-    shrLog(" Using Device %u, ", uiTargetDevice); 
-    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);  
-    /*iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
-    if (iDevCap > 0) {
-       shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
-    } else {
-       shrLog("\n\n", iDevCap); 
-    }
-    if (strstr(cBuffer, "NVIDIA") != NULL)
-    {
-        if (iDevCap < 12)
-        {
-            shrLog("Device doesn't have overlap capability.  Skipping test...\n"); 
-            Cleanup (EXIT_SUCCESS);
-        }
-
-		// Device and Platform eligible for overlap testing
-		bTestOverlap = shrTRUE;
-
-        // If device has overlap capability, proceed
-        fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP;               // 1st cycle overlap is same for 1 or 2 copy engines
-        if (iDevCap != 20) 
-        {
-            // Single copy engine
-            fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP;            // avg of many cycles
-        }
-        else 
-        {
-            char cDevName[1024];
-            clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
-            if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
-            {
-                // Tesla or Quadro (arch = 2.0) ... Dual copy engine 
-                fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI;  // average of many cycles
-            }
-            else
-            {
-                // Geforce ... Single copy engine
-                fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP;        // average of many cycles
-            }
-        }
-    }*/ 
-
-    // Create the context
-    shrLog("clCreateContext...\n"); 
-    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    
-    // Create 2 command-queues
-    cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    shrLog("clCreateCommandQueue [0]...\n"); 
-    cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    shrLog("clCreateCommandQueue [1]...\n"); 
-
-    // Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
-    szBuffBytes = sizeof(cl_float) * uiNumElements;
-    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements); 
-
-    // Allocate pinned source and result host buffers:  
-    //   Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
-    cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements); 
-
-    // Get mapped pointers to pinned input host buffers
-    //   Note:  This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
-    fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
-    oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
-    shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n"); 
-
-    // Alloc temp golden buffer for cross checks
-    Golden = (float*)malloc(szBuffBytes);
-    oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
-
-#ifdef HOSTGPU
-    // Read the OpenCL kernel in from source file
-    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
-    oclCheckError(cPathAndName != NULL, shrTRUE);
-    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
-    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);    
-    // Create the program object
-    shrLog("clCreateProgramWithSource...\n");
-    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-#else    
-    uint8_t *kernel_bin = NULL;
-	size_t kernel_size;
-	cl_int binary_status = 0;  
-	ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
-	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-	cpProgram = clCreateProgramWithBinary(
-		cxGPUContext, 1, &cdDevices[uiTargetDevice], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
-	oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-#endif
-    // Build the program for the target device
-    clFinish(cqCommandQueue[0]);
-    shrDeltaT(0);
-    ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
-    shrLog("clBuildProgram..."); 
-    if (ciErrNum != CL_SUCCESS)
-    {
-        // write out standard error, Build Log and PTX, then cleanup and exit
-        shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
-        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
-        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
-        Cleanup(EXIT_FAILURE);
-    }
-    dBuildTime = shrDeltaT(0);
-
-    // Create the kernel
-    ckKernel[0] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    ckKernel[1] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    shrLog("clCreateKernel (ckKernel[2])...\n"); 
-
-    // Offsets for 2 queues
-    cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
-
-    // Set the Argument values for the 1st kernel instance (queue 0)
-    ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
-    ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
-    ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
-    ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
-    ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
-    ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
-    shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n"); 
-
-    // Set the Argument values for the 2d kernel instance (queue 1)
-    ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
-    ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
-    ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
-    ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
-    ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
-    ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n"); 
-
-    //*******************************************
-    // Warmup the driver with dual queue sequence
-    //*******************************************
-
-    // Warmup with dual queue sequence for iTestCycles
-    shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
-    DualQueueSequence(iWarmupCycles, uiNumElements, false);
-
-    // Use single queue config to adjust compute intensity 
-    shrLog("Adjust compute for GPU / system...\n");
-    iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles); 
-    shrLog("  Kernel inner loop count = %d\n", iInnerLoopCount); 
-
-    //*******************************************
-    // Run and time with 2 command-queues
-    //*******************************************
-	for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
-	
-	// Run the sequence iTestCycles times
-    dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
-
-    // Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) 
-    shrLog("  Device vs Host Result Comparison\t: "); 
-    VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);   
-    shrDeltaT(0);
-    for (int i = 0; i < iTestCycles; i++)
-    {
-        VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);   
-    }
-    dHostTime[0] = shrDeltaT(0)/iTestCycles;
-
-    // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) 
-    bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
-    shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); 
-    bPassFlag = (bMatch == shrTRUE);
-
-    //*******************************************
-    // Run and time with 1 command queue
-    //*******************************************
-    // Run the sequence iTestCycles times
-    dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
-
-    // Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) 
-    shrLog("  Device vs Host Result Comparison\t: "); 
-    shrDeltaT(0);
-    for (int i = 0; i < iTestCycles; i++)
-    {
-        VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);   
-    }
-    dHostTime[1] = shrDeltaT(0)/iTestCycles;
-
-    // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) 
-    bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
-    shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); 
-    bPassFlag &= (bMatch == shrTRUE);
-
-    //*******************************************
-
-    // Compare Single and Dual queue timing 
-    shrLog("\nResult Summary:\n"); 
-
-    // Log GPU and CPU Time for 2-queue scenario
-    shrLog("  Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
-    shrLog("  Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
-
-    // Log GPU and CPU Time for 1-queue scenario
-    shrLog("  Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
-    shrLog("  Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
-
-    // Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
-    double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
-    
-	if( bTestOverlap ) {
-		bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
-		if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
-			shrLog("  Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%)  -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
-
-			// Log info to master log in standard format
-			shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", 
-			dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize); 
-			
-			bPassFlag &= bAvgOverlapOK;
-			break;
-		}
-	} 
-
-		shrLog("  Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%)  -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
-	}
-
-
-    //*******************************************
-    // Report pass/fail, cleanup and exit
-    Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
-
-    return 0;
-}
-
-// Run 1 queue sequence for n cycles
-// *********************************************************************
-double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
-{
-    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer) 
-    shrFillArray(fSourceA, (int)uiNumElements);
-    shrFillArray(fSourceB, (int)uiNumElements);
-
-    // Reset Global work size for 1 command-queue, and log work sizes & dimensions
-    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
-
-    // *** Make sure queues are empty and then start timer 
-    double dAvgTime = 0.0;
-    clFinish(cqCommandQueue[0]);
-    clFinish(cqCommandQueue[1]);  
-    shrDeltaT(0);
-
-    // Run the sequence iCycles times
-    for (int i = 0; i < iCycles; i++)
-    {
-        // Nonblocking Write of all of input data from host to device in command-queue 0 
-        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
-        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
-        shrCheckError(ciErrNum, CL_SUCCESS);
-
-        // Launch kernel computation, command-queue 0 
-        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
-        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-
-        // Non Blocking Read of output data from device to host, command-queue 0 
-        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
-        shrCheckError(ciErrNum, CL_SUCCESS);
-
-        // Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
-        clFlush(cqCommandQueue[0]);
-    }
-    
-    // *** Assure sync to host and return average sequence time
-    clFinish(cqCommandQueue[0]);
-    dAvgTime = shrDeltaT(0)/(double)iCycles;
-
-    // Log config if asked for
-    if (bShowConfig)
-    {
-        shrLog("\n1-Queue sequence Configuration:\n");
-        shrLog("  Global Work Size (per command-queue)\t= %u\n  Local Work Size \t\t\t= %u\n  # of Work Groups (per command-queue)\t= %u\n  # of command-queues\t\t\t= 1\n", 
-           szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize); 
-    }
-    return dAvgTime;
-}
-
-// Run 2 queue sequence for n cycles
-// *********************************************************************
-double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
-{
-    // Locals
-    size_t szHalfBuffer = szBuffBytes / 2;
-    size_t szHalfOffset = szHalfBuffer / sizeof(float);
-    double dAvgTime = 0.0;
-
-    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer) 
-    shrFillArray(fSourceA, (int)uiNumElements);
-    shrFillArray(fSourceB, (int)uiNumElements);
-
-    // Set Global work size for 2 command-queues, and log work sizes & dimensions
-    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
-
-    // Make sure queues are empty and then start timer 
-    clFinish(cqCommandQueue[0]);
-    clFinish(cqCommandQueue[1]);
-    shrDeltaT(0);
-
-    for (int i = 0; i < iCycles; i++)
-    {
-        // Mid Phase 0 
-        // Nonblocking Write of 1st half of input data from host to device in command-queue 0 
-        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
-        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
-        shrCheckError(ciErrNum, CL_SUCCESS);
-
-        // Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver 
-        // (not necessary on Linux, Mac OSX or WinXP)
-        clFlush(cqCommandQueue[0]);
-        clFlush(cqCommandQueue[1]);
-
-        // Start Phase 1 ***********************************
-
-        // Launch kernel computation, command-queue 0
-        // (Note:  The order MATTERS here on Fermi !  THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
-        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
-        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-
-        // Nonblocking Write of 2nd half of input data from host to device in command-queue 1 
-        // (Note:  The order MATTERS here on Fermi !  THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
-        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
-        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
-        shrCheckError(ciErrNum, CL_SUCCESS);
-
-        // Push out the compute for queue 0 and write for queue 1 to the driver
-        // (not necessary on Linux, Mac OSX or WinXP)
-        clFlush(cqCommandQueue[0]);
-        clFlush(cqCommandQueue[1]);
-
-        // Start Phase 2 ***********************************
-
-        // Launch kernel computation, command-queue 1 
-        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
-        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-
-        // Non Blocking Read of 1st half of output data from device to host, command-queue 0 
-        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
-        shrCheckError(ciErrNum, CL_SUCCESS);
-
-        // Push out the compute for queue 1 and the read for queue 0 to the driver 
-        // (not necessary on Linux, Mac OSX or WinXP)
-        clFlush(cqCommandQueue[0]);
-        clFlush(cqCommandQueue[1]);
-
-        // Start Phase 0 (Rolls over) ***********************************
-
-        // Non Blocking Read of 2nd half of output data from device to host, command-queue 1 
-        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
-        shrCheckError(ciErrNum, CL_SUCCESS);
-    }
-
-    // *** Sync to host and get average sequence time
-    clFinish(cqCommandQueue[0]);
-    clFinish(cqCommandQueue[1]);    
-    dAvgTime = shrDeltaT(0)/(double)iCycles;
-
-    // Log config if asked for
-    if (bShowConfig)
-    {
-        shrLog("\n2-Queue sequence Configuration:\n");
-        shrLog("  Global Work Size (per command-queue)\t= %u\n  Local Work Size \t\t\t= %u\n  # of Work Groups (per command-queue)\t= %u\n  # of command-queues\t\t\t= 2\n", 
-           szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize); 
-    }
-
-    return dAvgTime;
-}
-
-// Function to adjust compute task according to device capability
-// This allows a consistent overlap % across a wide variety of GPU's for test purposes
-// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
-// *********************************************************************
-int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
-{
-    // Locals
-    double dCopyTime, dComputeTime;
-    int iComputedLoopCount; 
-
-    // Change Source Data
-    shrFillArray(fSourceA, (int)uiNumElements);
-    shrFillArray(fSourceB, (int)uiNumElements);
-
-    // Reset Global work size for 1 command-queue, and log work sizes & dimensions
-    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
-
-    // *** Make sure queues are empty and then start timer 
-    clFinish(cqCommandQueue[0]);
-    clFinish(cqCommandQueue[1]);  
-    shrDeltaT(0);
-
-    // Run the copy iCycles times and measure copy time on this system
-    for (int i = 0; i < iCycles; i++)
-    {
-        // Nonblocking Write of all of input data from host to device in command-queue 0 
-        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
-        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
-        ciErrNum |= clFlush(cqCommandQueue[0]);
-        shrCheckError(ciErrNum, CL_SUCCESS);
-    }
-    clFinish(cqCommandQueue[0]);
-    dCopyTime = shrDeltaT(0);
-
-    // Run the compute iCycles times and measure compute time on this system
-    for (int i = 0; i < iCycles; i++)
-    {
-        // Launch kernel computation, command-queue 0 
-        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
-        ciErrNum |= clFlush(cqCommandQueue[0]);
-        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    }
-    clFinish(cqCommandQueue[0]);
-    dComputeTime = shrDeltaT(0);
-
-    // Determine number of core loop cycles proportional to copy/compute time ratio
-    dComputeTime = MAX(dComputeTime, 1.0e-6);
-    iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
-    ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
-    ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
-    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
-    return (iComputedLoopCount);
-} 
-
-// Cleanup/Exit function 
-// *********************************************************************
-void Cleanup (int iExitCode)
-{
-    // Cleanup allocated objects
-    shrLog("Starting Cleanup...\n\n");
-    if(cPathAndName)free(cPathAndName);
-    if(cSourceCL)free(cSourceCL);
-    if(Golden)free(Golden);
-    if(ckKernel[0])clReleaseKernel(ckKernel[0]);
-    if(ckKernel[1])clReleaseKernel(ckKernel[1]);
-    if(cpProgram)clReleaseProgram(cpProgram);
-    if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
-    if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
-    if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
-    if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
-    if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
-    if(cmDevResult)clReleaseMemObject(cmDevResult);
-    if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
-    if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
-    if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
-    if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
-    if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
-    if(cxGPUContext)clReleaseContext(cxGPUContext);
-    if(cdDevices)free(cdDevices);
-
-    // Master status Pass/Fail (all tests)
-    shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
-}
-
-// "Golden" Host processing vector hyptenuse function for comparison purposes
-// *********************************************************************
-void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
-{
-    for (unsigned int i = 0; i < uiNumElements; i++) 
-    {
-        float fA = pfData1[i];
-        float fB = pfData2[i];
-        float fC = sqrtf(fA * fA + fB * fB);
-
-        pfResult[i] = fC;
-    }
-}
--- a/tests/opencl/vectorhypot/oclUtils.cpp
+++ b/tests/opencl/vectorhypot/oclUtils.cpp
@ -1,806 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-// *********************************************************************
-// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
-// *********************************************************************
-
-#include <fstream>
-#include <vector>
-#include <iostream>
-#include <algorithm>
-#include <stdarg.h>
-#include "oclUtils.h"
-
-//////////////////////////////////////////////////////////////////////////////
-//! Gets the platform ID for NVIDIA if available, otherwise default
-//!
-//! @return the id 
-//! @param clSelectedPlatformID         OpenCL platoform ID
-//////////////////////////////////////////////////////////////////////////////
-cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
-{
-    char chBuffer[1024];
-    cl_uint num_platforms; 
-    cl_platform_id* clPlatformIDs;
-    cl_int ciErrNum;
-    *clSelectedPlatformID = NULL;
-
-    // Get OpenCL platform count
-    ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
-    if (ciErrNum != CL_SUCCESS)
-    {
-        shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
-        return -1000;
-    }
-    else 
-    {
-        if(num_platforms == 0)
-        {
-            shrLog("No OpenCL platform found!\n\n");
-            return -2000;
-        }
-        else 
-        {
-            // if there's a platform or more, make space for ID's
-            if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL)
-            {
-                shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
-                return -3000;
-            }
-
-            // get platform info for each platform and trap the NVIDIA platform if found
-            ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
-            for(cl_uint i = 0; i < num_platforms; ++i)
-            {
-                ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL);
-                if(ciErrNum == CL_SUCCESS)
-                {
-                    if(strstr(chBuffer, "NVIDIA") != NULL)
-                    {
-                        *clSelectedPlatformID = clPlatformIDs[i];
-                        break;
-                    }
-                }
-            }
-
-            // default to zeroeth platform if NVIDIA not found
-            if(*clSelectedPlatformID == NULL)
-            {
-                shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
-                *clSelectedPlatformID = clPlatformIDs[0];
-            }
-
-            free(clPlatformIDs);
-        }
-    }
-
-    return CL_SUCCESS;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Print the device name
-//!
-//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
-//! @param device         OpenCL id of the device
-//////////////////////////////////////////////////////////////////////////////
-void oclPrintDevName(int iLogMode, cl_device_id device)
-{
-    char device_string[1024];
-    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
-    shrLogEx(iLogMode, 0, "%s\n", device_string);
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Print info about the device
-//!
-//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
-//! @param device         OpenCL id of the device
-//////////////////////////////////////////////////////////////////////////////
-void oclPrintDevInfo(int iLogMode, cl_device_id device)
-{
-    char device_string[1024];
-    bool nv_device_attibute_query = false;
-
-    // CL_DEVICE_NAME
-    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_NAME: \t\t\t%s\n", device_string);
-
-    // CL_DEVICE_VENDOR
-    clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_VENDOR: \t\t\t%s\n", device_string);
-
-    // CL_DRIVER_VERSION
-    clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DRIVER_VERSION: \t\t\t%s\n", device_string);
-
-    // CL_DEVICE_VERSION
-    clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_VERSION: \t\t\t%s\n", device_string);
-
-    // CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0)
-    if(strncmp("OpenCL 1.0", device_string, 10) != 0) 
-    {
-        // This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers 
-        // This constant isn't #defined in 1.0
-        #ifndef CL_DEVICE_OPENCL_C_VERSION
-            #define CL_DEVICE_OPENCL_C_VERSION 0x103D   
-        #endif
-
-        clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL);
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string);
-    }
-
-    // CL_DEVICE_TYPE
-    cl_device_type type;
-    clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
-    if( type & CL_DEVICE_TYPE_CPU )
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
-    if( type & CL_DEVICE_TYPE_GPU )
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
-    if( type & CL_DEVICE_TYPE_ACCELERATOR )
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
-    if( type & CL_DEVICE_TYPE_DEFAULT )
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
-    
-    // CL_DEVICE_MAX_COMPUTE_UNITS
-    cl_uint compute_units;
-    clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units);
-
-	// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
-    size_t workitem_dims;
-    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
-
-    // CL_DEVICE_MAX_WORK_ITEM_SIZES
-    size_t workitem_size[3];
-    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
-    
-    // CL_DEVICE_MAX_WORK_GROUP_SIZE
-    size_t workgroup_size;
-    clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
-
-    // CL_DEVICE_MAX_CLOCK_FREQUENCY
-    cl_uint clock_frequency;
-    clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);
-
-    // CL_DEVICE_ADDRESS_BITS
-    cl_uint addr_bits;
-    clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits);
-
-    // CL_DEVICE_MAX_MEM_ALLOC_SIZE
-    cl_ulong max_mem_alloc_size;
-    clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024)));
-
-    // CL_DEVICE_GLOBAL_MEM_SIZE
-    cl_ulong mem_size;
-    clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024)));
-
-    // CL_DEVICE_ERROR_CORRECTION_SUPPORT
-    cl_bool error_correction_support;
-    clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no");
-
-    // CL_DEVICE_LOCAL_MEM_TYPE
-    cl_device_local_mem_type local_mem_type;
-    clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global");
-
-    // CL_DEVICE_LOCAL_MEM_SIZE
-    clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024));
-
-    // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
-    clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024));
-
-    // CL_DEVICE_QUEUE_PROPERTIES
-    cl_command_queue_properties queue_properties;
-    clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL);
-    if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");    
-    if( queue_properties & CL_QUEUE_PROFILING_ENABLE )
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
-
-    // CL_DEVICE_IMAGE_SUPPORT
-    cl_bool image_support;
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support);
-
-    // CL_DEVICE_MAX_READ_IMAGE_ARGS
-    cl_uint max_read_image_args;
-    clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args);
-
-    // CL_DEVICE_MAX_WRITE_IMAGE_ARGS
-    cl_uint max_write_image_args;
-    clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args);
-
-    // CL_DEVICE_SINGLE_FP_CONFIG
-    cl_device_fp_config fp_config;
-    clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL);
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n",
-        fp_config & CL_FP_DENORM ? "denorms " : "",
-        fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "",
-        fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "",
-        fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "",
-        fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "",
-        fp_config & CL_FP_FMA ? "fma " : "");
-    
-    // CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
-    size_t szMaxDims[5];
-    shrLogEx(iLogMode, 0, "\n  CL_DEVICE_IMAGE <dim>"); 
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL);
-    shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]);
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL);
-    shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]);
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL);
-    shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]);
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL);
-    shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]);
-    clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL);
-    shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]);
-
-    // CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
-    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL);
-    if (device_string != 0) 
-    {
-        shrLogEx(iLogMode, 0, "\n  CL_DEVICE_EXTENSIONS:");
-        std::string stdDevString;
-        stdDevString = std::string(device_string);
-        size_t szOldPos = 0;
-        size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
-        while (szSpacePos != stdDevString.npos)
-        {
-            if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
-                nv_device_attibute_query = true;
-
-            if (szOldPos > 0)
-            {
-                shrLogEx(iLogMode, 0, "\t\t");
-            }
-            shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str());
-            
-            do {
-                szOldPos = szSpacePos + 1;
-                szSpacePos = stdDevString.find(' ', szOldPos);
-            } while (szSpacePos == szOldPos);
-        }
-        shrLogEx(iLogMode, 0, "\n");
-    }
-    else 
-    {
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_EXTENSIONS: None\n");
-    }
-
-    if(nv_device_attibute_query) 
-    {
-        cl_uint compute_capability_major, compute_capability_minor;
-        clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL);
-        clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL);
-        shrLogEx(iLogMode, 0, "\n  CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor);        
-
-        shrLogEx(iLogMode, 0, "  NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS
-        shrLogEx(iLogMode, 0, "  NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units);
-
-        cl_uint regs_per_block;
-        clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), &regs_per_block, NULL);
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block);        
-
-        cl_uint warp_size;
-        clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL);
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size);        
-
-        cl_bool gpu_overlap;
-        clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL);
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE");        
-
-        cl_bool exec_timeout;
-        clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL);
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE");        
-
-        cl_bool integrated_memory;
-        clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL);
-        shrLogEx(iLogMode, 0, "  CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE");        
-    }
-
-    // CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
-    shrLogEx(iLogMode, 0, "  CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t"); 
-    cl_uint vec_width [6];
-    clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL);
-    clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL);
-    clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL);
-    clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL);
-    clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL);
-    clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL);
-    shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n", 
-           vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]); 
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Get and return device capability
-//!
-//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
-//! @param device         OpenCL id of the device
-//////////////////////////////////////////////////////////////////////////////
-int oclGetDevCap(cl_device_id device)
-{
-    char cDevString[1024];
-    bool bDevAttributeQuery = false;
-    int iDevArch = -1;
-
-    // Get device extensions, and if any then search for cl_nv_device_attribute_query
-    clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(cDevString), &cDevString, NULL);
-    if (cDevString != 0) 
-    {
-        std::string stdDevString;
-        stdDevString = std::string(cDevString);
-        size_t szOldPos = 0;
-        size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
-        while (szSpacePos != stdDevString.npos)
-        {
-            if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
-            {
-                bDevAttributeQuery = true;
-            }
-            
-            do {
-                szOldPos = szSpacePos + 1;
-                szSpacePos = stdDevString.find(' ', szOldPos);
-            } while (szSpacePos == szOldPos);
-        }
-    }
-
-    // if search succeeded, get device caps
-    if(bDevAttributeQuery) 
-    {
-        cl_int iComputeCapMajor, iComputeCapMinor;
-        clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), (void*)&iComputeCapMajor, NULL);
-        clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), (void*)&iComputeCapMinor, NULL);
-        iDevArch = (10 * iComputeCapMajor) + iComputeCapMinor;
-    }
-
-    return iDevArch;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Gets the id of the first device from the context
-//!
-//! @return the id 
-//! @param cxGPUContext         OpenCL context
-//////////////////////////////////////////////////////////////////////////////
-cl_device_id oclGetFirstDev(cl_context cxGPUContext)
-{
-    size_t szParmDataBytes;
-    cl_device_id* cdDevices;
-
-    // get the list of GPU devices associated with context
-    clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
-    cdDevices = (cl_device_id*) malloc(szParmDataBytes);
-
-    clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
-
-    cl_device_id first = cdDevices[0];
-    free(cdDevices);
-
-    return first;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Gets the id of device with maximal FLOPS from the context
-//!
-//! @return the id 
-//! @param cxGPUContext         OpenCL context
-//////////////////////////////////////////////////////////////////////////////
-cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext)
-{
-    size_t szParmDataBytes;
-    cl_device_id* cdDevices;
-
-    // get the list of GPU devices associated with context
-    clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
-    cdDevices = (cl_device_id*) malloc(szParmDataBytes);
-    size_t device_count = szParmDataBytes / sizeof(cl_device_id);
-
-    clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
-
-    cl_device_id max_flops_device = cdDevices[0];
-    int max_flops = 0;
-    
-    size_t current_device = 0;
-    
-    // CL_DEVICE_MAX_COMPUTE_UNITS
-    cl_uint compute_units;
-    clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
-
-    // CL_DEVICE_MAX_CLOCK_FREQUENCY
-    cl_uint clock_frequency;
-    clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
-    
-    max_flops = compute_units * clock_frequency;
-    ++current_device;
-
-    while( current_device < device_count )
-    {
-        // CL_DEVICE_MAX_COMPUTE_UNITS
-        cl_uint compute_units;
-        clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
-
-        // CL_DEVICE_MAX_CLOCK_FREQUENCY
-        cl_uint clock_frequency;
-        clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
-        
-        int flops = compute_units * clock_frequency;
-        if( flops > max_flops )
-        {
-            max_flops        = flops;
-            max_flops_device = cdDevices[current_device];
-        }
-        ++current_device;
-    }
-
-    free(cdDevices);
-
-    return max_flops_device;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Loads a Program file and prepends the cPreamble to the code.
-//!
-//! @return the source string if succeeded, 0 otherwise
-//! @param cFilename        program filename
-//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
-//! @param szFinalLength    returned length of the code string
-//////////////////////////////////////////////////////////////////////////////
-char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
-{
-    // locals 
-    FILE* pFileStream = NULL;
-    size_t szSourceLength;
-
-    // open the OpenCL source code file
-    #ifdef _WIN32   // Windows version
-        if(fopen_s(&pFileStream, cFilename, "rb") != 0) 
-        {       
-            return NULL;
-        }
-    #else           // Linux version
-        pFileStream = fopen(cFilename, "rb");
-        if(pFileStream == 0) 
-        {       
-            return NULL;
-        }
-    #endif
-
-    size_t szPreambleLength = strlen(cPreamble);
-
-    // get the length of the source code
-    fseek(pFileStream, 0, SEEK_END); 
-    szSourceLength = ftell(pFileStream);
-    fseek(pFileStream, 0, SEEK_SET); 
-
-    // allocate a buffer for the source code string and read it in
-    char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); 
-    memcpy(cSourceString, cPreamble, szPreambleLength);
-    if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
-    {
-        fclose(pFileStream);
-        free(cSourceString);
-        return 0;
-    }
-
-    // close the file and return the total length of the combined (preamble + source) string
-    fclose(pFileStream);
-    if(szFinalLength != 0)
-    {
-        *szFinalLength = szSourceLength + szPreambleLength;
-    }
-    cSourceString[szSourceLength + szPreambleLength] = '\0';
-
-    return cSourceString;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Gets the id of the nth device from the context
-//!
-//! @return the id or -1 when out of range
-//! @param cxGPUContext         OpenCL context
-//! @param device_idx            index of the device of interest
-//////////////////////////////////////////////////////////////////////////////
-cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int nr)
-{
-    size_t szParmDataBytes;
-    cl_device_id* cdDevices;
-
-    // get the list of GPU devices associated with context
-    clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
-    
-    if( szParmDataBytes / sizeof(cl_device_id) <= nr ) {
-      return (cl_device_id)-1;
-    }
-    
-    cdDevices = (cl_device_id*) malloc(szParmDataBytes);
-
-    clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
-    
-    cl_device_id device = cdDevices[nr];
-    free(cdDevices);
-
-    return device;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Get the binary (PTX) of the program associated with the device
-//!
-//! @param cpProgram    OpenCL program
-//! @param cdDevice     device of interest
-//! @param binary       returned code
-//! @param length       length of returned code
-//////////////////////////////////////////////////////////////////////////////
-void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length)
-{
-    // Grab the number of devices associated witht the program
-    cl_uint num_devices;
-    clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
-
-    // Grab the device ids
-    cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
-    clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
-
-    // Grab the sizes of the binaries
-    size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));    
-    clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
-
-    // Now get the binaries
-    char** ptx_code = (char**) malloc(num_devices * sizeof(char*));
-    for( unsigned int i=0; i<num_devices; ++i) {
-        ptx_code[i]= (char*)malloc(binary_sizes[i]);
-    }
-    clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
-
-    // Find the index of the device of interest
-    unsigned int idx = 0;
-    while( idx<num_devices && devices[idx] != cdDevice ) ++idx;
-    
-    // If it is associated prepare the result
-    if( idx < num_devices )
-    {
-        *binary = ptx_code[idx];
-        *length = binary_sizes[idx];
-    }
-
-    // Cleanup
-    free( devices );
-    free( binary_sizes );
-    for( unsigned int i=0; i<num_devices; ++i) {
-        if( i != idx ) free(ptx_code[i]);
-    }
-    free( ptx_code );
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
-//!
-//! @param cpProgram                   OpenCL program
-//! @param cdDevice                    device of interest
-//! @param const char*  cPtxFileName   optional PTX file name
-//////////////////////////////////////////////////////////////////////////////
-void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName)
-{
-    // Grab the number of devices associated with the program
-    cl_uint num_devices;
-    clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
-
-    // Grab the device ids
-    cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
-    clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
-
-    // Grab the sizes of the binaries
-    size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));    
-    clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
-
-    // Now get the binaries
-    char** ptx_code = (char**)malloc(num_devices * sizeof(char*));
-    for( unsigned int i=0; i<num_devices; ++i)
-    {
-        ptx_code[i] = (char*)malloc(binary_sizes[i]);
-    }
-    clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
-
-    // Find the index of the device of interest
-    unsigned int idx = 0;
-    while((idx < num_devices) && (devices[idx] != cdDevice)) 
-    {
-        ++idx;
-    }
-    
-    // If the index is associated, log the result
-    if(idx < num_devices)
-    {
-         
-        // if a separate filename is supplied, dump ptx there 
-        if (NULL != cPtxFileName)
-        {
-            shrLog("\nWriting ptx to separate file: %s ...\n\n", cPtxFileName);
-            FILE* pFileStream = NULL;
-            #ifdef _WIN32
-                fopen_s(&pFileStream, cPtxFileName, "wb");
-            #else
-                pFileStream = fopen(cPtxFileName, "wb");
-            #endif
-
-            fwrite(ptx_code[idx], binary_sizes[idx], 1, pFileStream);
-            fclose(pFileStream);        
-        }
-        else // log to logfile and console if no ptx file specified
-        {
-           shrLog("\n%s\nProgram Binary:\n%s\n%s\n", HDASHLINE, ptx_code[idx], HDASHLINE);
-        }
-    }
-
-    // Cleanup
-    free(devices);
-    free(binary_sizes);
-    for(unsigned int i = 0; i < num_devices; ++i)
-    {
-        free(ptx_code[i]);
-    }
-    free( ptx_code );
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
-//!
-//! @param cpProgram    OpenCL program
-//! @param cdDevice     device of interest
-//////////////////////////////////////////////////////////////////////////////
-void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
-{
-    // write out the build log and ptx, then exit
-    char cBuildLog[10240];
-    clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 
-                          sizeof(cBuildLog), cBuildLog, NULL );
-    shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE);
-}
-
-// Helper function for De-allocating cl objects
-// *********************************************************************
-void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs)
-{
-    int i;
-    for (i = 0; i < iNumObjs; i++)
-    {
-        if (cmMemObjs[i])clReleaseMemObject(cmMemObjs[i]);
-    }
-}  
-
-// Helper function to get OpenCL error string from constant
-// *********************************************************************
-const char* oclErrorString(cl_int error)
-{
-    static const char* errorString[] = {
-        "CL_SUCCESS",
-        "CL_DEVICE_NOT_FOUND",
-        "CL_DEVICE_NOT_AVAILABLE",
-        "CL_COMPILER_NOT_AVAILABLE",
-        "CL_MEM_OBJECT_ALLOCATION_FAILURE",
-        "CL_OUT_OF_RESOURCES",
-        "CL_OUT_OF_HOST_MEMORY",
-        "CL_PROFILING_INFO_NOT_AVAILABLE",
-        "CL_MEM_COPY_OVERLAP",
-        "CL_IMAGE_FORMAT_MISMATCH",
-        "CL_IMAGE_FORMAT_NOT_SUPPORTED",
-        "CL_BUILD_PROGRAM_FAILURE",
-        "CL_MAP_FAILURE",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "",
-        "CL_INVALID_VALUE",
-        "CL_INVALID_DEVICE_TYPE",
-        "CL_INVALID_PLATFORM",
-        "CL_INVALID_DEVICE",
-        "CL_INVALID_CONTEXT",
-        "CL_INVALID_QUEUE_PROPERTIES",
-        "CL_INVALID_COMMAND_QUEUE",
-        "CL_INVALID_HOST_PTR",
-        "CL_INVALID_MEM_OBJECT",
-        "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
-        "CL_INVALID_IMAGE_SIZE",
-        "CL_INVALID_SAMPLER",
-        "CL_INVALID_BINARY",
-        "CL_INVALID_BUILD_OPTIONS",
-        "CL_INVALID_PROGRAM",
-        "CL_INVALID_PROGRAM_EXECUTABLE",
-        "CL_INVALID_KERNEL_NAME",
-        "CL_INVALID_KERNEL_DEFINITION",
-        "CL_INVALID_KERNEL",
-        "CL_INVALID_ARG_INDEX",
-        "CL_INVALID_ARG_VALUE",
-        "CL_INVALID_ARG_SIZE",
-        "CL_INVALID_KERNEL_ARGS",
-        "CL_INVALID_WORK_DIMENSION",
-        "CL_INVALID_WORK_GROUP_SIZE",
-        "CL_INVALID_WORK_ITEM_SIZE",
-        "CL_INVALID_GLOBAL_OFFSET",
-        "CL_INVALID_EVENT_WAIT_LIST",
-        "CL_INVALID_EVENT",
-        "CL_INVALID_OPERATION",
-        "CL_INVALID_GL_OBJECT",
-        "CL_INVALID_BUFFER_SIZE",
-        "CL_INVALID_MIP_LEVEL",
-        "CL_INVALID_GLOBAL_WORK_SIZE",
-    };
-
-    const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
-
-    const int index = -error;
-
-    return (index >= 0 && index < errorCount) ? errorString[index] : "Unspecified Error";
-}
-
-// Helper function to get OpenCL image format string (channel order and type) from constant
-// *********************************************************************
-const char* oclImageFormatString(cl_uint uiImageFormat)
-{
-    // cl_channel_order 
-    if (uiImageFormat == CL_R)return "CL_R";  
-    if (uiImageFormat == CL_A)return "CL_A";  
-    if (uiImageFormat == CL_RG)return "CL_RG";  
-    if (uiImageFormat == CL_RA)return "CL_RA";  
-    if (uiImageFormat == CL_RGB)return "CL_RGB";
-    if (uiImageFormat == CL_RGBA)return "CL_RGBA";  
-    if (uiImageFormat == CL_BGRA)return "CL_BGRA";  
-    if (uiImageFormat == CL_ARGB)return "CL_ARGB";  
-    if (uiImageFormat == CL_INTENSITY)return "CL_INTENSITY";  
-    if (uiImageFormat == CL_LUMINANCE)return "CL_LUMINANCE";  
-
-    // cl_channel_type 
-    if (uiImageFormat == CL_SNORM_INT8)return "CL_SNORM_INT8";
-    if (uiImageFormat == CL_SNORM_INT16)return "CL_SNORM_INT16";
-    if (uiImageFormat == CL_UNORM_INT8)return "CL_UNORM_INT8";
-    if (uiImageFormat == CL_UNORM_INT16)return "CL_UNORM_INT16";
-    if (uiImageFormat == CL_UNORM_SHORT_565)return "CL_UNORM_SHORT_565";
-    if (uiImageFormat == CL_UNORM_SHORT_555)return "CL_UNORM_SHORT_555";
-    if (uiImageFormat == CL_UNORM_INT_101010)return "CL_UNORM_INT_101010";
-    if (uiImageFormat == CL_SIGNED_INT8)return "CL_SIGNED_INT8";
-    if (uiImageFormat == CL_SIGNED_INT16)return "CL_SIGNED_INT16";
-    if (uiImageFormat == CL_SIGNED_INT32)return "CL_SIGNED_INT32";
-    if (uiImageFormat == CL_UNSIGNED_INT8)return "CL_UNSIGNED_INT8";
-    if (uiImageFormat == CL_UNSIGNED_INT16)return "CL_UNSIGNED_INT16";
-    if (uiImageFormat == CL_UNSIGNED_INT32)return "CL_UNSIGNED_INT32";
-    if (uiImageFormat == CL_HALF_FLOAT)return "CL_HALF_FLOAT";
-    if (uiImageFormat == CL_FLOAT)return "CL_FLOAT";
-
-    // unknown constant
-    return "Unknown";
-}
--- a/tests/opencl/vectorhypot/oclUtils.h
+++ b/tests/opencl/vectorhypot/oclUtils.h
@ -1,198 +0,0 @@
-/*
- * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
- 
-#ifndef OCL_UTILS_H
-#define OCL_UTILS_H
-
-// *********************************************************************
-// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
-// *********************************************************************
-
-// Common headers:  Cross-API utililties and OpenCL header
-#include <shrUtils.h>
-
-// All OpenCL headers
-#if defined (__APPLE__) || defined(MACOSX)
-    #include <OpenCL/opencl.h>
-#else
-    #include <CL/opencl.h>
-#endif 
-
-// Includes
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-// For systems with CL_EXT that are not updated with these extensions, we copied these
-// extensions from <CL/cl_ext.h>
-#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
-  /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
-  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
-  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
-  #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
-  #define CL_DEVICE_WARP_SIZE_NV                      0x4003
-  #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
-  #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
-  #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
-#endif
-
-// reminders for build output window and log
-#ifdef _WIN32
-    #pragma message ("Note: including shrUtils.h")
-    #pragma message ("Note: including opencl.h")
-#endif
-
-// SDK Revision #
-#define OCL_SDKREVISION "7027912"
-
-// Error and Exit Handling Macros... 
-// *********************************************************************
-// Full error handling macro with Cleanup() callback (if supplied)... 
-// (Companion Inline Function lower on page)
-#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
-
-// Short version without Cleanup() callback pointer
-// Both Input (a) and Reference (b) are specified as args
-#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) 
-
-//////////////////////////////////////////////////////////////////////////////
-//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
-//!
-//! @return the id 
-//! @param clSelectedPlatformID         OpenCL platform ID
-//////////////////////////////////////////////////////////////////////////////
-extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Print info about the device
-//!
-//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
-//! @param device         OpenCL id of the device
-//////////////////////////////////////////////////////////////////////////////
-extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Get and return device capability
-//!
-//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
-//! @param device         OpenCL id of the device
-//////////////////////////////////////////////////////////////////////////////
-extern "C" int oclGetDevCap(cl_device_id device);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Print the device name
-//!
-//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
-//! @param device         OpenCL id of the device
-//////////////////////////////////////////////////////////////////////////////
-extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Gets the id of the first device from the context
-//!
-//! @return the id 
-//! @param cxGPUContext         OpenCL context
-//////////////////////////////////////////////////////////////////////////////
-extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Gets the id of the nth device from the context
-//!
-//! @return the id or -1 when out of range
-//! @param cxGPUContext         OpenCL context
-//! @param device_idx            index of the device of interest
-//////////////////////////////////////////////////////////////////////////////
-extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Gets the id of device with maximal FLOPS from the context
-//!
-//! @return the id 
-//! @param cxGPUContext         OpenCL context
-//////////////////////////////////////////////////////////////////////////////
-extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Loads a Program file and prepends the cPreamble to the code.
-//!
-//! @return the source string if succeeded, 0 otherwise
-//! @param cFilename        program filename
-//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
-//! @param szFinalLength    returned length of the code string
-//////////////////////////////////////////////////////////////////////////////
-extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Get the binary (PTX) of the program associated with the device
-//!
-//! @param cpProgram    OpenCL program
-//! @param cdDevice     device of interest
-//! @param binary       returned code
-//! @param length       length of returned code
-//////////////////////////////////////////////////////////////////////////////
-extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
-//!
-//! @param cpProgram                   OpenCL program
-//! @param cdDevice                    device of interest
-//! @param const char*  cPtxFileName   optional PTX file name
-//////////////////////////////////////////////////////////////////////////////
-extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
-
-//////////////////////////////////////////////////////////////////////////////
-//! Get and log the Build Log from the OpenCL compiler for the requested program & device
-//!
-//! @param cpProgram    OpenCL program
-//! @param cdDevice     device of interest
-//////////////////////////////////////////////////////////////////////////////
-extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
-
-// Helper function for De-allocating cl objects
-// *********************************************************************
-extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
-
-// Helper function to get OpenCL error string from constant
-// *********************************************************************
-extern "C" const char* oclErrorString(cl_int error);
-
-// Helper function to get OpenCL image format string (channel order and type) from constant
-// *********************************************************************
-extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
-
-// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
-// *********************************************************************
-inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
-{
-    // An error condition is defined by the sample/test value not equal to the reference
-    if (iReference != iSample)
-    {
-        // If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
-        iSample = (iSample == 0) ? -9999 : iSample; 
-
-        // Log the error info
-        shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
-
-        // Cleanup and exit, or just exit if no cleanup function pointer provided.  Use iSample (error code in this case) as process exit code.
-        if (pCleanup != NULL)
-        {
-            pCleanup(iSample);
-        }
-        else 
-        {
-            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
-            exit(iSample);
-        }
-    }
-}
-
-#endif
--- a/tests/opencl/vectorhypot/shrQATest.h
+++ b/tests/opencl/vectorhypot/shrQATest.h
@ -1,238 +0,0 @@
-/*
-* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
-*
-* Please refer to the NVIDIA end user license agreement (EULA) associated
-* with this source code for terms and conditions that govern your use of
-* this software. Any use, reproduction, disclosure, or distribution of
-* this software and related documentation outside the terms of the EULA
-* is strictly prohibited.
-*
-*/
-
-#ifndef SHR_QATEST_H
-#define SHR_QATEST_H
-
-// *********************************************************************
-// Generic utilities for NVIDIA GPU Computing SDK 
-// *********************************************************************
-
-// OS dependent includes
-#ifdef _WIN32
-    #pragma message ("Note: including windows.h")
-    #pragma message ("Note: including math.h")
-    #pragma message ("Note: including assert.h")
-    #pragma message ("Note: including time.h")
-
-// Headers needed for Windows
-    #include <windows.h>
-	#include <time.h>
-#else
-    // Headers needed for Linux
-    #include <sys/stat.h>
-    #include <sys/types.h>
-    #include <sys/time.h>
-    #include <stdio.h>
-    #include <stdlib.h>
-    #include <string.h>
-    #include <stdarg.h>
-    #include <unistd.h>
-    #include <time.h>
-#endif
-
-#ifndef STRCASECMP
-#ifdef _WIN32
-#define STRCASECMP _stricmp
-#else
-#define STRCASECMP strcasecmp
-#endif
-#endif
-
-#ifndef STRNCASECMP
-#ifdef _WIN32
-#define STRNCASECMP _strnicmp
-#else
-#define STRNCASECMP strncasecmp
-#endif
-#endif
-
-
-// Standardized QA Start/Finish for CUDA SDK tests
-#define shrQAStart(a, b)      __shrQAStart(a, b)
-#define shrQAFinish(a, b, c)  __shrQAFinish(a, b, c)
-#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
-
-inline int findExeNameStart(const char *exec_name)
-{
-    int exename_start = (int)strlen(exec_name);
-
-    while( (exename_start > 0) && 
-            (exec_name[exename_start] != '\\') && 
-            (exec_name[exename_start] != '/') )
-    {
-        exename_start--;
-    }
-    if (exec_name[exename_start] == '\\' || 
-        exec_name[exename_start] == '/')
-    {
-        return exename_start+1;
-    } else {
-        return exename_start;
-    }
-}
-
-inline int __shrQAStart(int argc, char **argv)
-{
-    bool bQATest = false;
-    // First clear the output buffer
-    fflush(stdout);
-    fflush(stdout);
-
-    for (int i=1; i < argc; i++) {
-        int string_start = 0;
-        while (argv[i][string_start] == '-')
-           string_start++;
-        char *string_argv = &argv[i][string_start];
-
-        if (!STRCASECMP(string_argv, "qatest")) {
-           bQATest = true;
-        }
-    }
-    
-    // We don't want to print the entire path, so we search for the first 
-    int exename_start = findExeNameStart(argv[0]);
-    if (bQATest) {
-        fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
-        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
-        fprintf(stdout, "\n");
-    } else {
-        fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
-    }
-    fflush(stdout);
-    printf("\n"); fflush(stdout);
-    return exename_start;
-}
-
-enum eQAstatus {
-    QA_FAILED = 0,
-    QA_PASSED = 1,
-    QA_WAIVED = 2
-};
-
-inline void __ExitInTime(int seconds)
-{
-    fprintf(stdout, "> exiting in %d seconds: ", seconds);
-    fflush(stdout);
-    time_t t;
-    int count;
-    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
-        fprintf(stdout, "%d...", count);
-#ifdef WIN32
-        Sleep(1000);
-#else
-        sleep(1);
-#endif
-    }
-    fprintf(stdout,"done!\n\n"); 
-	fflush(stdout);
-}
-
-
-inline void __shrQAFinish(int argc, const char **argv, int iStatus)
-{
-    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
-    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
-    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
-	
-    for (int i=1; i < argc; i++) {
-        int string_start = 0;
-        while (argv[i][string_start] == '-')
-           string_start++;
-
-        const char *string_argv = &argv[i][string_start];
-        if (!STRCASECMP(string_argv, "qatest")) {
-           bQATest = true;
-        }	
-        // For SDK individual samples that don't specify -noprompt or -prompt, 
-        // a 3 second delay will happen before exiting, giving a user time to view results
-        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
-            bNoPrompt = true;
-            bQuitInTime = false;
-        }
-        if (!STRCASECMP(string_argv, "prompt")) {
-            bNoPrompt = false;
-            bQuitInTime = false;
-        }
-    }
-
-    int exename_start = findExeNameStart(argv[0]);
-    if (bQATest) {
-        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
-        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
-        fprintf(stdout, "\n");
-    } else {
-        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
-    }
-    fflush(stdout);
-    printf("\n"); fflush(stdout);
-    if (bQuitInTime) {
-        __ExitInTime(3);
-    } else {
-        if (!bNoPrompt) {
-            fprintf(stdout, "\nPress <Enter> to exit...\n");
-            fflush(stdout);
-            getchar();
-        }
-    }
-}
-
-inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
-{
-    bool bQuitInTime = true;
-    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
-	
-    for (int i=1; i < argc; i++) {
-        int string_start = 0;
-        while (argv[i][string_start] == '-')
-           string_start++;
-
-        const char *string_argv = &argv[i][string_start];
-        // For SDK individual samples that don't specify -noprompt or -prompt, 
-        // a 3 second delay will happen before exiting, giving a user time to view results
-        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
-            bQuitInTime = false;
-        }
-        if (!STRCASECMP(string_argv, "prompt")) {
-            bQuitInTime = false;
-        }
-    }
-
-    int exename_start = findExeNameStart(argv[0]);
-    if (bQATest) {
-        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
-        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
-        fprintf(stdout, "\n");
-    } else {
-        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
-    }
-    fflush(stdout);
-    
-    if (bQuitInTime) {
-        __ExitInTime(3);
-    }
-}
-
-inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
-{
-    __shrQAFinish(argc, argv, iStatus);
-
-    exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); 
-}
-
-inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
-{
-    __shrQAFinish2(bQAtest, argc, argv, iStatus);
-
-    exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
-}
-
-#endif
--- a/tests/opencl/vectorhypot/shrUtils.cpp
+++ b/tests/opencl/vectorhypot/shrUtils.cpp
--- a/tests/opencl/vectorhypot/shrUtils.h
+++ b/tests/opencl/vectorhypot/shrUtils.h
@ -1,642 +0,0 @@
-/*
-* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
-*
-* Please refer to the NVIDIA end user license agreement (EULA) associated
-* with this source code for terms and conditions that govern your use of
-* this software. Any use, reproduction, disclosure, or distribution of
-* this software and related documentation outside the terms of the EULA
-* is strictly prohibited.
-*
-*/
-
-#ifndef SHR_UTILS_H
-#define SHR_UTILS_H
-
-// *********************************************************************
-// Generic utilities for NVIDIA GPU Computing SDK 
-// *********************************************************************
-
-// reminders for output window and build log
-#ifdef _WIN32
-    #pragma message ("Note: including windows.h")
-    #pragma message ("Note: including math.h")
-    #pragma message ("Note: including assert.h")
-#endif
-
-// OS dependent includes
-#ifdef _WIN32
-    // Headers needed for Windows
-    #include <windows.h>
-#else
-    // Headers needed for Linux
-    #include <sys/stat.h>
-    #include <sys/types.h>
-    #include <sys/time.h>
-    #include <stdio.h>
-    #include <stdlib.h>
-    #include <string.h>
-    #include <stdarg.h>
-#endif
-
-// Other headers needed for both Windows and Linux
-#include <math.h>
-#include <assert.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-
-// Un-comment the following #define to enable profiling code in SDK apps
-//#define GPU_PROFILING
-
-// Beginning of GPU Architecture definitions
-inline int ConvertSMVer2Cores(int major, int minor)
-{
-	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
-	typedef struct {
-		int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
-		int Cores;
-	} sSMtoCores;
-
-	sSMtoCores nGpuArchCoresPerSM[] = 
-	{ { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
-	  { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
-	  { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
-	  { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
-	  { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
-	  { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
-	  { 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
-	  {   -1, -1 }
-	};
-
-	int index = 0;
-	while (nGpuArchCoresPerSM[index].SM != -1) {
-		if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
-			return nGpuArchCoresPerSM[index].Cores;
-		}
-		index++;
-	}
-	printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
-	return -1;
-}
-// end of GPU Architecture definitions
-
-
-// Defines and enum for use with logging functions
-// *********************************************************************
-#define DEFAULTLOGFILE "SdkConsoleLog.txt"
-#define MASTERLOGFILE "SdkMasterLog.csv"
-enum LOGMODES 
-{
-    LOGCONSOLE = 1, // bit to signal "log to console" 
-    LOGFILE    = 2, // bit to signal "log to file" 
-    LOGBOTH    = 3, // convenience union of first 2 bits to signal "log to both"
-    APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
-    MASTER     = 8, // bit to signal master .csv log output
-    ERRORMSG   = 16, // bit to signal "pre-pend Error" 
-    CLOSELOG   = 32  // bit to close log file, if open, after any requested file write
-};
-#define HDASHLINE "-----------------------------------------------------------\n"
-
-// Standardized boolean
-enum shrBOOL
-{
-    shrFALSE = 0,
-    shrTRUE = 1
-};
-
-// Standardized MAX, MIN and CLAMP
-#define MAX(a, b) ((a > b) ? a : b)
-#define MIN(a, b) ((a < b) ? a : b)
-#define CLAMP(a, b, c) MIN(MAX(a, b), c)    // double sided clip of input a
-#define TOPCLAMP(a, b) (a < b ? a:b)	    // single top side clip of input a
-
-// Error and Exit Handling Macros... 
-// *********************************************************************
-// Full error handling macro with Cleanup() callback (if supplied)... 
-// (Companion Inline Function lower on page)
-#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
-
-// Short version without Cleanup() callback pointer
-// Both Input (a) and Reference (b) are specified as args
-#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) 
-
-// Standardized Exit Macro for leaving main()... extended version
-// (Companion Inline Function lower on page)
-#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
-
-// Standardized Exit Macro for leaving main()... short version
-// (Companion Inline Function lower on page)
-#define shrEXIT(a, b)        __shrExitEX(a, b, EXIT_SUCCESS)
-
-// Simple argument checker macro
-#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE 
-
-// Define for user-customized error handling
-#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
-
-// Function to deallocate memory allocated within shrUtils
-// *********************************************************************
-extern "C" void shrFree(void* ptr);
-
-// *********************************************************************
-// Helper function to log standardized information to Console, to File or to both
-//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); 
-//!         : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
-//! 
-//! Automatically opens file and stores handle if needed and not done yet
-//! Closes file and nulls handle on request
-//! 
-//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.  
-//!          LOGFILE and LOGBOTH may be | 'd  with APPENDMODE to select file append mode instead of overwrite mode 
-//!          LOGFILE and LOGBOTH may be | 'd  with CLOSELOG to "write and close" 
-//!          First 3 options may be | 'd  with MASTER to enable independent write to master data log file
-//!          First 3 options may be | 'd  with ERRORMSG to start line with standard error message
-//! @param 2 dValue:    
-//!          Positive val = double value for time in secs to be formatted to 6 decimals. 
-//!          Negative val is an error code and this give error preformatting.
-//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.  
-//!          ALL printf flags, width, precision and type specifiers are supported with this exception: 
-//!              Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
-//!              Single byte char type specifiers (%s and %c) ARE supported 
-//! @param 4... variable args: like printf or fprintf.  Must match format specifer type above.  
-//! @return 0 if OK, negative value on error or if error occurs or was passed in. 
-// *********************************************************************
-extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
-
-// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, 
-// *********************************************************************
-extern "C" int shrLog(const char* cFormatString, ...);
-
-// *********************************************************************
-// Delta timer function for up to 3 independent timers using host high performance counters 
-// Maintains state for 3 independent counters
-//! Example: double dElapsedTime = shrDeltaTime(0);
-//! 
-//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
-//! @return delta time of specified counter since last call in seconds.  Otherwise -9999.0 if error
-// *********************************************************************
-extern "C" double shrDeltaT(int iCounterID);
-
-// Optional LogFileNameOverride function
-// *********************************************************************
-extern "C" void shrSetLogFileName (const char* cOverRideName);
-
-// Helper function to init data arrays 
-// *********************************************************************
-extern "C" void shrFillArray(float* pfData, int iSize);
-
-// Helper function to print data arrays 
-// *********************************************************************
-extern "C" void shrPrintArray(float* pfData, int iSize);
-
-////////////////////////////////////////////////////////////////////////////
-//! Find the path for a filename
-//! @return the path if succeeded, otherwise 0
-//! @param filename        name of the file
-//! @param executablePath  optional absolute path of the executable
-////////////////////////////////////////////////////////////////////////////
-extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing single precision floating point data
-//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is initialized 
-//!       within shrUtils, then free() has to be used to deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, 
-              bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing double precision floating point data
-//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is
-//! @note If a NULL pointer is passed to this function and it is initialized 
-//!       within shrUtils, then free() has to be used to deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, 
-              bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing integer data
-//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is
-//! @note If a NULL pointer is passed to this function and it is initialized 
-//!       within shrUtils, then free() has to be used to deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing unsigned integer data
-//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is 
-//! @note If a NULL pointer is passed to this function and it is initialized 
-//!       within shrUtils, then free() has to be used to deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, 
-               unsigned int* len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing char / byte data
-//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is 
-//! @note If a NULL pointer is passed to this function and it is initialized 
-//!       within shrUtils, then free() has to be used to deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, 
-              bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Read file \filename containing unsigned char / byte data
-//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
-//! @param filename name of the source file
-//! @param data  uninitialized pointer, returned initialized and pointing to
-//!        the data read
-//! @param len  number of data elements in data, -1 on error
-//! @note If a NULL pointer is passed to this function and it is
-//! @note If a NULL pointer is passed to this function and it is initialized 
-//!       within shrUtils, then free() has to be used to deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, 
-               unsigned int* len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing single precision floating point 
-//! data
-//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-//! @param epsilon  epsilon for comparison
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
-               const float epsilon, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing double precision floating point 
-//! data
-//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-//! @param epsilon  epsilon for comparison
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
-               const double epsilon, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing integer data
-//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
-               bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing unsigned integer data
-//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, 
-                unsigned int len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing char / byte data
-//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, 
-               bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Write a data file \filename containing unsigned char / byte data
-//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
-//! @param filename name of the file to write
-//! @param data  pointer to data to write
-//! @param len  number of data elements in data, -1 on error
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
-                unsigned int len, bool verbose = false);
-
-////////////////////////////////////////////////////////////////////////////
-//! Load PPM image file (with unsigned char as data element type), padding 
-//! 4th component
-//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
-//! @param file  name of the image file
-//! @param OutData  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-//! 
-//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, 
-                             unsigned int *w, unsigned int *h);
-
-////////////////////////////////////////////////////////////////////////////
-//! Save PPM image file (with unsigned char as data element type, padded to 
-//! 4 bytes)
-//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, 
-               unsigned int w, unsigned int h);
-
-////////////////////////////////////////////////////////////////////////////////
-//! Save PGM image file (with unsigned char as data element type)
-//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-////////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, 
-              unsigned int w, unsigned int h); 
-
-////////////////////////////////////////////////////////////////////////////
-//! Load PGM image file (with unsigned char as data element type)
-//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
-//! @param file  name of the image file
-//! @param data  handle to the data read
-//! @param w     width of the image
-//! @param h     height of the image
-//! @note If a NULL pointer is passed to this function and it is initialized 
-//!       within shrUtils, then free() has to be used to deallocate the memory
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
-                  unsigned int *w,unsigned int *h);
-
-////////////////////////////////////////////////////////////////////////////
-// Command line arguments: General notes
-// * All command line arguments begin with '--' followed by the token; 
-//   token and value are seperated by '='; example --samples=50
-// * Arrays have the form --model=[one.obj,two.obj,three.obj] 
-//   (without whitespaces)
-////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////
-//! Check if command line argument \a flag-name is given
-//! @return shrTRUE if command line argument \a flag_name has been given, 
-//!         otherwise shrFALSE
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param flag_name  name of command line flag
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, 
-                     const char* flag_name);
-
-////////////////////////////////////////////////////////////////////////////
-//! Get the value of a command line argument of type int
-//! @return shrTRUE if command line argument \a arg_name has been given and
-//!         is of the requested type, otherwise shrFALSE
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param arg_name  name of the command line argument
-//! @param val  value of the command line argument
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, 
-                        const char* arg_name, int* val);
-
-////////////////////////////////////////////////////////////////////////////
-//! Get the value of a command line argument of type unsigned int
-//! @return shrTRUE if command line argument \a arg_name has been given and
-//!         is of the requested type, otherwise shrFALSE
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param arg_name  name of the command line argument
-//! @param val  value of the command line argument
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, 
-                        const char* arg_name, unsigned int* val);
-
-////////////////////////////////////////////////////////////////////////////
-//! Get the value of a command line argument of type float
-//! @return shrTRUE if command line argument \a arg_name has been given and
-//!         is of the requested type, otherwise shrFALSE
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param arg_name  name of the command line argument
-//! @param val  value of the command line argument
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, 
-                        const char* arg_name, float* val);
-
-////////////////////////////////////////////////////////////////////////////
-//! Get the value of a command line argument of type string
-//! @return shrTRUE if command line argument \a arg_name has been given and
-//!         is of the requested type, otherwise shrFALSE
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param arg_name  name of the command line argument
-//! @param val  value of the command line argument
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, 
-                          const char* arg_name, char** val);
-
-////////////////////////////////////////////////////////////////////////////
-//! Get the value of a command line argument list those element are strings
-//! @return shrTRUE if command line argument \a arg_name has been given and
-//!         is of the requested type, otherwise shrFALSE
-//! @param argc  argc as passed to main()
-//! @param argv  argv as passed to main()
-//! @param arg_name  name of the command line argument
-//! @param val  command line argument list
-//! @param len  length of the list / number of elements
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, 
-                              const char* arg_name, char** val, 
-                              unsigned int* len);
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two float arrays
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrComparef( const float* reference, const float* data,
-             const unsigned int len);
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two integer arrays
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrComparei( const int* reference, const int* data, 
-             const unsigned int len ); 
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two unsigned integer arrays, with epsilon and threshold
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
-////////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
-            const unsigned int len, const float epsilon, const float threshold );
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two unsigned char arrays
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
-              const unsigned int len ); 
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two integers with a tolernance for # of byte errors
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
-////////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
-             const unsigned int len, const float epsilon, const float threshold );
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two integer arrays witha n epsilon tolerance for equality
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-////////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
-             const unsigned int len, const float epsilon );
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two float arrays with an epsilon tolerance for equality
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
-              const unsigned int len, const float epsilon );
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two float arrays with an epsilon tolerance for equality and a 
-//!     threshold for # pixel errors
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-////////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
-             const unsigned int len, const float epsilon, const float threshold );
-
-////////////////////////////////////////////////////////////////////////////
-//! Compare two float arrays using L2-norm with an epsilon tolerance for 
-//! equality
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param reference  handle to the reference data / gold image
-//! @param data       handle to the computed data
-//! @param len        number of elements in reference and data
-//! @param epsilon    epsilon to use for the comparison
-////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
-                const unsigned int len, const float epsilon );
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two PPM image files with an epsilon tolerance for equality
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param src_file   filename for the image to be compared
-//! @param data       filename for the reference data / gold image
-//! @param epsilon    epsilon to use for the comparison
-//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
-//! $param verboseErrors output details of image mismatch to std::err
-////////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
-
-////////////////////////////////////////////////////////////////////////////////
-//! Compare two PGM image files with an epsilon tolerance for equality
-//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
-//! @param src_file   filename for the image to be compared
-//! @param data       filename for the reference data / gold image
-//! @param epsilon    epsilon to use for the comparison
-//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
-//! $param verboseErrors output details of image mismatch to std::err
-////////////////////////////////////////////////////////////////////////////////
-extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
-
-extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
-
-extern "C" size_t shrRoundUp(int group_size, int global_size);
-
-// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
-// *********************************************************************
-inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
-{
-    if (iReference != iSample)
-    {
-        shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); 
-        if (pCleanup != NULL)
-        {
-            pCleanup(EXIT_FAILURE);
-        }
-        else 
-        {
-            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
-            exit(EXIT_FAILURE);
-        }
-    }
-}
-
-// Standardized Exit
-// *********************************************************************
-inline void __shrExitEX(int argc, const char** argv, int iExitCode)
-{
-#ifdef WIN32
-    if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
-#else 
-    if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
-#endif
-    {
-        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");                  
-        getchar();                                                           
-    }       
-    else 
-    {
-        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]); 
-    }
-    fflush(stderr);                                                         
-    exit(iExitCode);
-}
-
-#endif