Merge branch 'master' of https://github.gatech.edu/casl/Vortex

2025-04-23 13:27:29 -04:00 · 2019-11-25 02:52:22 -05:00 · 2019-11-25 02:52:22 -05:00 · b752ce5485
commit b752ce5485
parent 44b8cdf5c1 a38987e7af
76 changed files with 386716 additions and 14 deletions
--- a/benchmarks/opencl/BlackScholes/BlackScholes.cl
+++ b/benchmarks/opencl/BlackScholes/BlackScholes.cl
@ -0,0 +1,101 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// Math wrappers used by the kernel code below. With the condition left at 0
+// they expand to the standard exp/log/sqrt built-ins; change it to 1 to map
+// them onto the native_* variants instead (accuracy of the native_* built-ins
+// is implementation-defined per the OpenCL C specification).
+#if(0)
+    #define EXP(a) native_exp(a)
+    #define LOG(a) native_log(a)
+    #define SQRT(a) native_sqrt(a)
+#else
+    #define EXP(a) exp(a)
+    #define LOG(a) log(a)
+    #define SQRT(a) sqrt(a)
+#endif
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Predefine functions to avoid bug in OpenCL compiler on Mac OSX 10.7 systems
+///////////////////////////////////////////////////////////////////////////////
+float CND(float d);
+void BlackScholesBody(__global float *call, __global float *put,  float S,
+					  float X, float T, float R, float V);
+
+///////////////////////////////////////////////////////////////////////////////
+// Rational approximation of cumulative normal distribution function
+///////////////////////////////////////////////////////////////////////////////
+float CND(float d){
+    // Polynomial coefficients and the 1/sqrt(2*pi) scale factor.
+    const float c1 = 0.31938153f;
+    const float c2 = -0.356563782f;
+    const float c3 = 1.781477937f;
+    const float c4 = -1.821255978f;
+    const float c5 = 1.330274429f;
+    const float invSqrt2Pi = 0.39894228040143267793994605993438f;
+
+    const float k = 1.0f / (1.0f + 0.2316419f * fabs(d));
+
+    // Horner evaluation of c1 + k*(c2 + k*(c3 + k*(c4 + k*c5))), innermost first.
+    float poly = c4 + k * c5;
+    poly = c3 + k * poly;
+    poly = c2 + k * poly;
+    poly = c1 + k * poly;
+
+    const float tail = invSqrt2Pi * EXP(- 0.5f * d * d) * (k * poly);
+
+    // The approximation is computed from |d|; mirror it for positive arguments.
+    return (d > 0) ? (1.0f - tail) : tail;
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Black-Scholes formula for both call and put
+///////////////////////////////////////////////////////////////////////////////
+void BlackScholesBody(
+    __global float *call, //Call option price
+    __global float *put,  //Put option price
+    float S,              //Current stock price
+    float X,              //Option strike price
+    float T,              //Option years
+    float R,              //Riskless rate of return
+    float V               //Stock volatility
+){
+    // Terms shared by the call and the put price.
+    const float rootT    = SQRT(T);
+    const float discount = EXP(- R * T);
+
+    const float d1 = (LOG(S / X) + (R + 0.5f * V * V) * T) / (V * rootT);
+    const float d2 = d1 - V * rootT;
+
+    const float nd1 = CND(d1);
+    const float nd2 = CND(d2);
+
+    // Write both prices through the output pointers.
+    *call = (S * nd1 - X * discount * nd2);
+    *put  = (X * discount * (1.0f - nd2) - S * (1.0f - nd1));
+}
+
+
+
+__kernel void BlackScholes(
+    __global float *d_Call, //Call option price
+    __global float *d_Put,  //Put option price
+    __global float *d_S,    //Current stock price
+    __global float *d_X,    //Option strike price
+    __global float *d_T,    //Option years
+    float R,                //Riskless rate of return
+    float V,                //Stock volatility
+    unsigned int optN
+){
+    // Each work-item starts at its global id and advances by the global
+    // size, so any launch size covers all optN options.
+    const size_t stride = get_global_size(0);
+    unsigned int opt = get_global_id(0);
+
+    while(opt < optN){
+        BlackScholesBody(&d_Call[opt], &d_Put[opt], d_S[opt], d_X[opt], d_T[opt], R, V);
+        opt += stride;
+    }
+}
--- a/benchmarks/opencl/BlackScholes/Makefile
+++ b/benchmarks/opencl/BlackScholes/Makefile
@ -0,0 +1,66 @@
+# Builds the BlackScholes OpenCL benchmark: the kernel is compiled by poclcc
+# into lib$(PROJECT).a and linked with the host sources into an ELF image
+# (plus .hex/.dump) or into a QEMU user-mode executable.
+PROJECT = BlackScholes
+
+# Tool and library locations. $(wildcard ...) expands to nothing when the
+# path does not exist; := evaluates each lookup once at parse time.
+RISCV_TOOL_PATH := $(wildcard ~/dev/riscv-gnu-toolchain/drops)
+POCL_CC_PATH    := $(wildcard ~/dev/pocl/drops_riscv_cc)
+POCL_INC_PATH   := $(wildcard ../include)
+POCL_LIB_PATH   := $(wildcard ../lib)
+VX_RT_PATH      := $(wildcard ../../../runtime)
+VX_SIMX_PATH    := $(wildcard ../../../simX/obj_dir)
+
+CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
+CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
+DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
+HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
+GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
+
+# Host-side sources of this benchmark and the headers they include.
+# Headers are prerequisites only; they must not be passed to the compiler
+# as inputs.
+APP_SRCS = main.cpp oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp
+APP_HDRS = oclBlackScholes_common.h oclUtils.h
+
+# Runtime sources compiled into the bare-metal image.
+VX_SRCS =  $(VX_RT_PATH)/newlib/newlib.c
+VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
+VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
+VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
+VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
+VX_SRCS += $(VX_RT_PATH)/tests/tests.c
+VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
+# None of the variables below is defined in this file, so they expand to
+# nothing unless supplied from the command line or the environment.
+VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
+
+VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
+
+CXXFLAGS =  -g -O0 -march=rv32im -mabi=ilp32
+CXXFLAGS += -ffreestanding # program may not begin at main()
+CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
+CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
+CXXFLAGS += -I$(POCL_INC_PATH) -I.
+
+VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
+QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
+
+# Command-style targets that never correspond to files.
+.PHONY: all run qemu gdb-s gdb-c clean
+
+# Remove a half-written target (e.g. a truncated .dump) when its recipe fails.
+.DELETE_ON_ERROR:
+
+all: $(PROJECT).dump $(PROJECT).hex
+
+# Compile the kernel that is actually listed as the prerequisite.
+lib$(PROJECT).a: BlackScholes.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<
+
+$(PROJECT).elf: $(APP_SRCS) $(APP_HDRS) lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(APP_SRCS) $(VX_LIBS) -o $@
+
+# main.cpp calls into the launcher and gold sources, so link all of them.
+$(PROJECT).qemu: $(APP_SRCS) $(APP_HDRS) lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) $(APP_SRCS) $(QEMU_LIBS) -o $@
+
+$(PROJECT).hex: $(PROJECT).elf
+	$(HEX) -O ihex $< $@
+
+$(PROJECT).dump: $(PROJECT).elf
+	$(DMP) -D $< > $@
+
+run: $(PROJECT).hex
+	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
+
+qemu: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
+
+gdb-s: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
+
+gdb-c: $(PROJECT).qemu
+	$(GDB) $(PROJECT).qemu
+
+# Remove everything the rules above produce, including the kernel archive,
+# the QEMU binary and the log files written by run/qemu.
+clean:
+	$(RM) *.elf *.dump *.hex *.qemu lib$(PROJECT).a emulator.debug debug.log
--- a/benchmarks/opencl/BlackScholes/main.cpp
+++ b/benchmarks/opencl/BlackScholes/main.cpp
@ -0,0 +1,248 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// standard utilities and systems includes
+#include <oclUtils.h>
+#include <shrQATest.h>
+#include "oclBlackScholes_common.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// Helper functions
+////////////////////////////////////////////////////////////////////////////////
+double executionTime(cl_event &event){
+    cl_ulong endNs, startNs;
+
+    // Query the end and start profiling counters of the event.
+    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(endNs), &endNs, NULL);
+    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(startNs), &startNs, NULL);
+
+    const cl_ulong elapsedNs = endNs - startNs;
+    return 1.0e-9 * elapsedNs; // nanoseconds -> seconds
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Random float helper
+////////////////////////////////////////////////////////////////////////////////
+float randFloat(float low, float high){
+    // blend lies in [0, 1]; 0 selects low, 1 selects high.
+    const float blend = (float)rand() / (float)RAND_MAX;
+    return low * (1.0f - blend) + high * blend;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Main program
+////////////////////////////////////////////////////////////////////////////////
+// Host driver: selects an OpenCL device, fills the option arrays with
+// pseudo-random inputs, runs the BlackScholes kernel, reads the results back
+// and compares them against the host reference (BlackScholesCPU) using a
+// relative L1 error with a 1E-6 pass threshold.
+int main(int argc, char **argv)
+{
+    cl_platform_id   cpPlatform;       //OpenCL platform
+    cl_device_id*    cdDevices = NULL; //OpenCL devices list (array)
+    cl_context       cxGPUContext;     //OpenCL context
+    cl_command_queue cqCommandQueue;   //OpenCL command queue
+    cl_mem                             //OpenCL memory buffer objects
+        d_Call,
+        d_Put,
+        d_S,
+        d_X,
+        d_T;
+
+    cl_int ciErrNum;
+
+    float
+        *h_CallCPU,
+        *h_PutCPU,
+        *h_CallGPU,
+        *h_PutGPU,
+        *h_S,
+        *h_X,
+        *h_T;
+
+    const unsigned int   optionCount = 4000000;
+    const float                    R = 0.02f;
+    const float                    V = 0.30f;
+
+    shrQAStart(argc, argv);
+
+    // Get the NVIDIA platform
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+    shrLog("clGetPlatformID...\n");
+
+    //Get all the devices
+    cl_uint uiNumDevices = 0;           // Number of devices available
+    cl_uint uiTargetDevice = 0;         // Default Device to compute on
+    cl_uint uiNumComputeUnits;          // Number of compute units (SM's on NV GPU)
+    shrLog("Get the Device info and select Device...\n");
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+
+    // With zero devices the list allocation below would be empty and
+    // (uiNumDevices - 1) would wrap around (unsigned), so stop here.
+    if(uiNumDevices == 0)
+    {
+        shrLog("No OpenCL devices found, exiting...\n");
+        shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
+        return 1;
+    }
+
+    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
+    if(cdDevices == NULL)
+    {
+        shrLog("Device list allocation failed, exiting...\n");
+        shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
+        return 1;
+    }
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+
+    // Get command line device options and config accordingly
+    shrLog("  # of Devices Available = %u\n", uiNumDevices);
+    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
+    {
+        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
+    }
+    shrLog("  Using Device %u: ", uiTargetDevice);
+    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
+    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits);
+
+    // set logfile name and start logs
+    shrSetLogFileName ("oclBlackScholes.txt");
+    shrLog("%s Starting...\n\n", argv[0]);
+
+    shrLog("Allocating and initializing host memory...\n");
+        h_CallCPU = (float *)malloc(optionCount * sizeof(float));
+        h_PutCPU  = (float *)malloc(optionCount * sizeof(float));
+        h_CallGPU = (float *)malloc(optionCount * sizeof(float));
+        h_PutGPU  = (float *)malloc(optionCount * sizeof(float));
+        h_S       = (float *)malloc(optionCount * sizeof(float));
+        h_X       = (float *)malloc(optionCount * sizeof(float));
+        h_T       = (float *)malloc(optionCount * sizeof(float));
+
+        // Every array is written or read below; bail out instead of
+        // dereferencing a NULL pointer when any allocation failed.
+        if(!h_CallCPU || !h_PutCPU || !h_CallGPU || !h_PutGPU || !h_S || !h_X || !h_T)
+        {
+            shrLog("Host memory allocation failed, exiting...\n");
+            shrQAFinishExit(argc, (const char **)argv, QA_FAILED);
+            return 1;
+        }
+
+        // Fixed seed so every run uses the same input data.
+        srand(2009);
+        for(unsigned int i = 0; i < optionCount; i++){
+            h_CallCPU[i] = -1.0f;
+            h_PutCPU[i]  = -1.0f;
+            h_S[i]       = randFloat(5.0f, 30.0f);
+            h_X[i]       = randFloat(1.0f, 100.0f);
+            h_T[i]       = randFloat(0.25f, 10.0f);
+        }
+
+    shrLog("Initializing OpenCL...\n");
+        // Get the NVIDIA platform
+        ciErrNum = oclGetPlatformID(&cpPlatform);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        // Get a GPU device
+        ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        // Create the context
+        cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        //Create a command-queue
+        cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    shrLog("Creating OpenCL memory objects...\n");
+        d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+        d_Put  = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+        d_S    = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+        d_X    = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+        d_T    = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    shrLog("Starting up BlackScholes...\n");
+        initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
+
+    shrLog("Running OpenCL BlackScholes...\n\n");
+        //Just a single run or a warmup iteration
+        //(a NULL queue makes the launcher use the queue given to initBlackScholes)
+        BlackScholes(
+            NULL,
+            d_Call,
+            d_Put,
+            d_S,
+            d_X,
+            d_T,
+            R,
+            V,
+            optionCount
+        );
+
+#ifdef GPU_PROFILING
+    const int numIterations = 16;
+    cl_event startMark, endMark;
+    ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
+    ciErrNum |= clFinish(cqCommandQueue);
+    shrCheckError(ciErrNum, CL_SUCCESS);
+    shrDeltaT(0);
+
+    for(int i = 0; i < numIterations; i++){
+        BlackScholes(
+            cqCommandQueue,
+            d_Call,
+            d_Put,
+            d_S,
+            d_X,
+            d_T,
+            R,
+            V,
+            optionCount
+        );
+    }
+
+    ciErrNum  = clEnqueueMarker(cqCommandQueue, &endMark);
+    ciErrNum |= clFinish(cqCommandQueue);
+    shrCheckError(ciErrNum, CL_SUCCESS);
+
+    //Calculate performance metrics by wallclock time
+    double gpuTime = shrDeltaT(0) / numIterations;
+    shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n",
+        (double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0);
+
+    //Get profiling info
+    cl_ulong startTime = 0, endTime = 0;
+    ciErrNum  = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
+    ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
+    shrCheckError(ciErrNum, CL_SUCCESS);
+    shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
+#endif
+
+    shrLog("\nReading back OpenCL BlackScholes results...\n");
+        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+        ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    shrLog("Comparing against Host/C++ computation...\n");
+        BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
+        double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0;
+        double L1call, L1put;
+        for(unsigned int i = 0; i < optionCount; i++)
+        {
+            sumCall += fabs(h_CallCPU[i]);
+            sumPut  += fabs(h_PutCPU[i]);
+            deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]);
+            deltaPut  += fabs(h_PutCPU[i] - h_PutGPU[i]);
+        }
+        // Relative L1 error: summed absolute difference over summed reference magnitude.
+        L1call = deltaCall / sumCall;
+        L1put = deltaPut / sumPut;
+        shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put);
+
+    shrLog("Shutting down...\n");
+        closeBlackScholes();
+        ciErrNum  = clReleaseMemObject(d_T);
+        ciErrNum |= clReleaseMemObject(d_X);
+        ciErrNum |= clReleaseMemObject(d_S);
+        ciErrNum |= clReleaseMemObject(d_Put);
+        ciErrNum |= clReleaseMemObject(d_Call);
+        ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
+        ciErrNum |= clReleaseContext(cxGPUContext);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        free(h_T);
+        free(h_X);
+        free(h_S);
+        free(h_PutGPU);
+        free(h_CallGPU);
+        free(h_PutCPU);
+        free(h_CallCPU);
+
+       if(cdDevices)free(cdDevices);
+
+        shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED );
+}
--- a/benchmarks/opencl/BlackScholes/oclBlackScholes.pdf
+++ b/benchmarks/opencl/BlackScholes/oclBlackScholes.pdf
--- a/benchmarks/opencl/BlackScholes/oclBlackScholes_common.h
+++ b/benchmarks/opencl/BlackScholes/oclBlackScholes_common.h
@ -0,0 +1,50 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+
+#include <oclUtils.h>
+
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of optN options on CPU
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void BlackScholesCPU(
+    float *h_Call, //Call option price
+    float *h_Put,  //Put option price
+    float *h_S,    //Current stock price
+    float *h_X,    //Option strike price
+    float *h_T,    //Option years
+    float R,       //Riskless rate of return
+    float V,       //Stock volatility
+    unsigned int optionCount
+);
+
+
+////////////////////////////////////////////////////////////////////////////////
+// OpenCL Black-Scholes kernel launcher
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQue, const char **argv);
+
+extern "C" void closeBlackScholes(void);
+
+extern "C" void BlackScholes(
+    cl_command_queue cqCommandQueue,
+    cl_mem d_Call, //Call option price
+    cl_mem d_Put,  //Put option price
+    cl_mem d_S,    //Current stock price
+    cl_mem d_X,    //Option strike price
+    cl_mem d_T,    //Option years
+    cl_float R,    //Riskless rate of return
+    cl_float V,    //Stock volatility
+    cl_uint optionCount
+);
--- a/benchmarks/opencl/BlackScholes/oclBlackScholes_gold.cpp
+++ b/benchmarks/opencl/BlackScholes/oclBlackScholes_gold.cpp
@ -0,0 +1,92 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+
+#include <math.h>
+#include "oclBlackScholes_common.h"
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Rational approximation of cumulative normal distribution function
+///////////////////////////////////////////////////////////////////////////////
+static double CND(double d){
+    // Polynomial coefficients and the 1/sqrt(2*pi) scale factor.
+    const double c1 = 0.31938153;
+    const double c2 = -0.356563782;
+    const double c3 = 1.781477937;
+    const double c4 = -1.821255978;
+    const double c5 = 1.330274429;
+    const double invSqrt2Pi = 0.39894228040143267793994605993438;
+
+    const double k = 1.0 / (1.0 + 0.2316419 * fabs(d));
+
+    // Horner evaluation of c1 + k*(c2 + k*(c3 + k*(c4 + k*c5))), innermost first.
+    double poly = c4 + k * c5;
+    poly = c3 + k * poly;
+    poly = c2 + k * poly;
+    poly = c1 + k * poly;
+
+    const double tail = invSqrt2Pi * exp(- 0.5 * d * d) * (k * poly);
+
+    // The approximation is computed from |d|; mirror it for positive arguments.
+    return (d > 0) ? (1.0 - tail) : tail;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Black-Scholes formula for both call and put
+///////////////////////////////////////////////////////////////////////////////
+static void BlackScholesBodyCPU(
+    float& call, //Call option price
+    float& put,  //Put option price
+    float Sf,    //Current stock price
+    float Xf,    //Option strike price
+    float Tf,    //Option years
+    float Rf,    //Riskless rate of return
+    float Vf     //Stock volatility
+){
+    // Promote every input to double; all arithmetic below is done in double
+    // and only the two results are narrowed back to float.
+    const double S = Sf;
+    const double X = Xf;
+    const double T = Tf;
+    const double R = Rf;
+    const double V = Vf;
+
+    // Terms shared by the call and the put price.
+    const double rootT    = sqrt(T);
+    const double discount = exp(- R * T);
+
+    const double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * rootT);
+    const double d2 = d1 - V * rootT;
+
+    const double nd1 = CND(d1);
+    const double nd2 = CND(d2);
+
+    call = (float)(S * nd1 - X * discount * nd2);
+    put  = (float)(X * discount * (1.0 - nd2) - S * (1.0 - nd1));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Process an array of optN options
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void BlackScholesCPU(
+    float *h_Call, //Call option price
+    float *h_Put,  //Put option price
+    float *h_S,    //Current stock price
+    float *h_X,    //Option strike price
+    float *h_T,    //Option years
+    float R,       //Riskless rate of return
+    float V,       //Stock volatility
+    unsigned int optionCount
+){
+    // Price the options one by one; element opt of every array belongs to
+    // the same option.
+    unsigned int opt = 0;
+    while(opt != optionCount){
+        BlackScholesBodyCPU(h_Call[opt], h_Put[opt], h_S[opt], h_X[opt], h_T[opt], R, V);
+        ++opt;
+    }
+}
--- a/benchmarks/opencl/BlackScholes/oclBlackScholes_launcher.cpp
+++ b/benchmarks/opencl/BlackScholes/oclBlackScholes_launcher.cpp
@ -0,0 +1,125 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+
+
+#include <oclUtils.h>
+#include "oclBlackScholes_common.h"
+
+static cl_program cpBlackScholes;   //OpenCL program
+static cl_kernel  ckBlackScholes;   //OpenCL kernel
+//Queue stored by initBlackScholes(); BlackScholes() uses it when called with a NULL queue
+static cl_command_queue cqDefaultCommandQueue;
+
+// Creates and builds the BlackScholes program for the first device of
+// cxGPUContext, creates the kernel object and remembers cqParamCommandQueue
+// as the default queue for later BlackScholes() launches.
+// argv[0] is used to locate BlackScholes.cl on disk.
+extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
+    cl_int ciErrNum;
+    size_t kernelLength;
+
+    shrLog("...loading BlackScholes.cl\n");
+        char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
+        shrCheckError(cPathAndName != NULL, shrTRUE);
+        // NOTE(review): the loaded source is only needed by the disabled
+        // clCreateProgramWithSource path below; it is still loaded (and
+        // checked) here to keep the original behaviour.
+        char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
+        shrCheckError(cBlackScholes != NULL, shrTRUE);
+
+    shrLog("...creating BlackScholes program\n");
+        //cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
+        // The kernel is requested by name as a built-in kernel of the
+        // context's first device. The error code is requested explicitly so
+        // the check below tests a value that was actually written.
+        cl_device_id cdDevice = oclGetFirstDev(cxGPUContext);
+        ciErrNum = CL_SUCCESS;
+        cpBlackScholes = clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &cdDevice, "BlackScholes", &ciErrNum);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+    shrLog("...building BlackScholes program\n");
+        ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);
+
+        // clBuildProgram reports success as CL_SUCCESS.
+        if(ciErrNum != CL_SUCCESS){
+            shrLog("*** Compilation failure ***\n");
+
+            // clGetContextInfo returns the size of the device list in bytes.
+            size_t devicesBytes;
+            cl_device_id *cdDevices;
+            ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &devicesBytes);
+            shrCheckError(ciErrNum, CL_SUCCESS);
+
+            cdDevices = (cl_device_id *)malloc(devicesBytes);
+            shrCheckError(cdDevices != NULL, shrTRUE);
+
+            ciErrNum = clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, devicesBytes, cdDevices, NULL);
+            shrCheckError(ciErrNum, CL_SUCCESS);
+
+            // Fetch the build log of the first device: size first, then text.
+            size_t logSize;
+            char *logTxt;
+
+            ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+            shrCheckError(ciErrNum, CL_SUCCESS);
+
+            logTxt = (char *)malloc(logSize);
+            shrCheckError(logTxt != NULL, shrTRUE);
+
+            ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevices[0], CL_PROGRAM_BUILD_LOG, logSize, logTxt, NULL);
+            shrCheckError(ciErrNum, CL_SUCCESS);
+
+            shrLog("%s\n", logTxt);
+            shrLog("*** Exiting ***\n");
+            free(logTxt);
+            free(cdDevices);
+            exit(666);
+        }
+
+    //Save ptx code to separate file
+    oclLogPtx(cpBlackScholes, oclGetFirstDev(cxGPUContext), "BlackScholes.ptx");
+
+    shrLog("...creating BlackScholes kernels\n");
+        ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+    cqDefaultCommandQueue = cqParamCommandQueue;
+    free(cBlackScholes);
+    free(cPathAndName);
+}
+
+extern "C" void closeBlackScholes(void){
+    // Release the kernel, then the program, and check both results at once.
+    const cl_int kernelStatus  = clReleaseKernel(ckBlackScholes);
+    const cl_int programStatus = clReleaseProgram(cpBlackScholes);
+    const cl_int combinedStatus = kernelStatus | programStatus;
+    shrCheckError(combinedStatus, CL_SUCCESS);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// OpenCL Black-Scholes kernel launcher
+////////////////////////////////////////////////////////////////////////////////
+extern "C" void BlackScholes(
+    cl_command_queue cqCommandQueue,
+    cl_mem d_Call, //Call option price
+    cl_mem d_Put,  //Put option price
+    cl_mem d_S,    //Current stock price
+    cl_mem d_X,    //Option strike price
+    cl_mem d_T,    //Option years
+    cl_float R,    //Riskless rate of return
+    cl_float V,    //Stock volatility
+    cl_uint optionCount
+){
+    // A NULL queue selects the one stored by initBlackScholes().
+    cl_command_queue queue = cqCommandQueue ? cqCommandQueue : cqDefaultCommandQueue;
+
+    // Bind the arguments in kernel-signature order; the status codes are
+    // OR-ed together so a single check covers all eight calls.
+    cl_uint argIdx = 0;
+    cl_int status;
+    status  = clSetKernelArg(ckBlackScholes, argIdx++, sizeof(cl_mem),   (void *)&d_Call);
+    status |= clSetKernelArg(ckBlackScholes, argIdx++, sizeof(cl_mem),   (void *)&d_Put);
+    status |= clSetKernelArg(ckBlackScholes, argIdx++, sizeof(cl_mem),   (void *)&d_S);
+    status |= clSetKernelArg(ckBlackScholes, argIdx++, sizeof(cl_mem),   (void *)&d_X);
+    status |= clSetKernelArg(ckBlackScholes, argIdx++, sizeof(cl_mem),   (void *)&d_T);
+    status |= clSetKernelArg(ckBlackScholes, argIdx++, sizeof(cl_float), (void *)&R);
+    status |= clSetKernelArg(ckBlackScholes, argIdx++, sizeof(cl_float), (void *)&V);
+    status |= clSetKernelArg(ckBlackScholes, argIdx++, sizeof(cl_uint),  (void *)&optionCount);
+    shrCheckError(status, CL_SUCCESS);
+
+    // One-dimensional launch with fixed global and work-group sizes.
+    const size_t globalWorkSize = 60 * 1024;
+    const size_t localWorkSize  = 128;
+    status = clEnqueueNDRangeKernel(queue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
+    shrCheckError(status, CL_SUCCESS);
+}
--- a/benchmarks/opencl/BlackScholes/oclUtils.h
+++ b/benchmarks/opencl/BlackScholes/oclUtils.h
@ -0,0 +1,198 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+#ifndef OCL_UTILS_H
+#define OCL_UTILS_H
+
+// *********************************************************************
+// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// Common headers:  Cross-API utilities and OpenCL header
+#include <shrUtils.h>
+
+// All OpenCL headers
+#if defined (__APPLE__) || defined(MACOSX)
+    #include <OpenCL/opencl.h>
+#else
+    #include <CL/opencl.h>
+#endif 
+
+// Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// For systems with CL_EXT that are not updated with these extensions, we copied these
+// extensions from <CL/cl_ext.h>
+#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+  /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+  #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+  #define CL_DEVICE_WARP_SIZE_NV                      0x4003
+  #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+  #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+  #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#endif
+
+// reminders for build output window and log
+#ifdef _WIN32
+    #pragma message ("Note: including shrUtils.h")
+    #pragma message ("Note: including opencl.h")
+#endif
+
+// SDK Revision #
+#define OCL_SDKREVISION "7027912"
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) 
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
+//!
+//! @return the id 
+//! @param clSelectedPlatformID         OpenCL platform ID
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print info about the device
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and return device capability
+//!
+//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" int oclGetDevCap(cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print the device name
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the first device from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the id or -1 when out of range
+//! @param cxGPUContext         OpenCL context
+//! @param device_idx            index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of device with maximal FLOPS from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file and prepends the cPreamble to the code.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get the binary (PTX) of the program associated with the device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//! @param binary       returned code
+//! @param length       length of returned code
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram                   OpenCL program
+//! @param cdDevice                    device of interest
+//! @param cPtxFileName                optional PTX file name
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the Build Log from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
+
+// Helper function for De-allocating cl objects
+// *********************************************************************
+extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
+
+// Helper function to get OpenCL error string from constant
+// *********************************************************************
+extern "C" const char* oclErrorString(cl_int error);
+
+// Helper function to get OpenCL image format string (channel order and type) from constant
+// *********************************************************************
+extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
+
+// Companion inline function behind oclCheckErrorEX / oclCheckError: compares a
+// sample value against a reference and, on mismatch, logs the error and then
+// either invokes the supplied cleanup callback or exits the process.
+// *********************************************************************
+//! @param iSample     value under test
+//! @param iReference  value iSample is expected to equal
+//! @param pCleanup    optional callback invoked with the error code; may be NULL
+//! @param cFile       source file name reported in the log message
+//! @param iLine       source line number reported in the log message
+inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // Matching values mean there is nothing to report
+    if (iReference == iSample)
+    {
+        return;
+    }
+
+    // A mismatching sample of 0 carries no usable error code, so substitute -9999
+    if (iSample == 0)
+    {
+        iSample = -9999;
+    }
+
+    // Log the error info
+    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
+
+    // Without a cleanup callback: close the log and exit, using the error code as process exit code
+    if (pCleanup == NULL)
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+        exit(iSample);
+    }
+
+    // Otherwise hand the error code to the caller-supplied cleanup routine
+    pCleanup(iSample);
+}
+
+#endif
--- a/benchmarks/opencl/BlackScholes/shrQATest.h
+++ b/benchmarks/opencl/BlackScholes/shrQATest.h
@ -0,0 +1,238 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_QATEST_H
+#define SHR_QATEST_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// OS dependent includes
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+    #pragma message ("Note: including time.h")
+
+// Headers needed for Windows
+    #include <windows.h>
+	#include <time.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+    #include <unistd.h>
+    #include <time.h>
+#endif
+
+#ifndef STRCASECMP
+#ifdef _WIN32
+#define STRCASECMP _stricmp
+#else
+#define STRCASECMP strcasecmp
+#endif
+#endif
+
+#ifndef STRNCASECMP
+#ifdef _WIN32
+#define STRNCASECMP _strnicmp
+#else
+#define STRNCASECMP strncasecmp
+#endif
+#endif
+
+
+// Standardized QA Start/Finish for CUDA SDK tests
+#define shrQAStart(a, b)      __shrQAStart(a, b)
+#define shrQAFinish(a, b, c)  __shrQAFinish(a, b, c)
+#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
+
+// Locate where the bare executable name begins inside a path.
+// Scans backwards from the terminating NUL for the last '\\' or '/'.
+//! @return index of the character following the last path separator, or 0 if there is none
+//! @param exec_name  NUL-terminated path of the executable (e.g. argv[0])
+inline int findExeNameStart(const char *exec_name)
+{
+    int pos = (int)strlen(exec_name);
+
+    // Walk towards the front until a separator is hit or index 0 is reached
+    for (; pos > 0; --pos)
+    {
+        const char ch = exec_name[pos];
+        if (ch == '\\' || ch == '/')
+        {
+            break;
+        }
+    }
+
+    // Index 0 is not examined by the loop, so test the stopping position explicitly
+    const bool onSeparator = (exec_name[pos] == '\\') || (exec_name[pos] == '/');
+    return onSeparator ? pos + 1 : pos;
+}
+
+// Announce the start of a sample run on stdout.
+// Prints "&&&& RUNNING <exe> <args...>" when a "qatest" argument is present
+// (leading dashes ignored, case-insensitive), otherwise "[<exe>] starting...".
+//! @return offset into argv[0] at which the executable name (path stripped) begins
+//! @param argc  argument count as passed to main()
+//! @param argv  argument vector as passed to main()
+inline int __shrQAStart(int argc, char **argv)
+{
+    // First clear the output buffer
+    fflush(stdout);
+    fflush(stdout);
+
+    // Look for the qatest switch among the arguments
+    bool qaMode = false;
+    for (int arg = 1; arg < argc; ++arg) {
+        const char *token = argv[arg];
+        while (*token == '-') {
+            ++token;
+        }
+        if (STRCASECMP(token, "qatest") == 0) {
+            qaMode = true;
+        }
+    }
+
+    // We don't want to print the entire path, so locate where the executable name starts
+    const int nameOffset = findExeNameStart(argv[0]);
+    const char *exeName = argv[0] + nameOffset;
+
+    if (!qaMode) {
+        fprintf(stdout, "[%s] starting...\n", exeName);
+    } else {
+        fprintf(stdout, "&&&& RUNNING %s", exeName);
+        for (int arg = 1; arg < argc; ++arg) {
+            fprintf(stdout, " %s", argv[arg]);
+        }
+        fprintf(stdout, "\n");
+    }
+    fflush(stdout);
+    printf("\n");
+    fflush(stdout);
+    return nameOffset;
+}
+
+// Test outcome codes. The numeric values line up with the indices of the
+// sStatus label tables in __shrQAFinish / __shrQAFinish2.
+enum eQAstatus {
+    QA_FAILED = 0,
+    QA_PASSED = 1,
+    QA_WAIVED = 2
+};
+
+// Print a countdown on stdout while blocking for the requested number of seconds.
+// One "<n>..." tick is printed per loop iteration, followed by "done!".
+//! @param seconds  how long to wait before returning
+inline void __ExitInTime(int seconds)
+{
+    fprintf(stdout, "> exiting in %d seconds: ", seconds);
+    fflush(stdout);
+    time_t t;
+    int count;
+    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
+        fprintf(stdout, "%d...", count);
+        // The tick has no newline, so flush explicitly; otherwise a line-buffered
+        // stdout shows the whole countdown only after the wait is over
+        fflush(stdout);
+#ifdef _WIN32   // same platform macro that guards the <windows.h> include in this header
+        Sleep(1000);
+#else
+        sleep(1);
+#endif
+    }
+    fprintf(stdout,"done!\n\n");
+    fflush(stdout);
+}
+
+
+// Report the final status of a sample run on stdout, then optionally pause.
+// Recognized arguments (leading dashes ignored, case-insensitive):
+//   qatest          - print the "&&&& <STATUS> <exe> <args...>" form
+//   noprompt, help  - return right after reporting
+//   prompt          - wait for <Enter> before returning
+// With none of noprompt/help/prompt given, waits via __ExitInTime(3).
+//! @param argc     argument count as passed to main()
+//! @param argv     argument vector as passed to main()
+//! @param iStatus  0 (FAILED), 1 (PASSED) or 2 (WAIVED); any other value is reported as FAILED
+inline void __shrQAFinish(int argc, const char **argv, int iStatus)
+{
+    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
+    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    // Only 0..2 index a real label. Anything else would read outside sStatus or
+    // hand NULL to "%s", so an invalid status is reported as FAILED instead.
+    const char *sResult = (iStatus >= 0 && iStatus <= 2) ? sStatus[iStatus] : sStatus[0];
+
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }
+        // For SDK individual samples that don't specify -noprompt or -prompt,
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bNoPrompt = true;
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bNoPrompt = false;
+            bQuitInTime = false;
+        }
+    }
+
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sResult, &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sResult);
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    } else {
+        if (!bNoPrompt) {
+            fprintf(stdout, "\nPress <Enter> to exit...\n");
+            fflush(stdout);
+            getchar();
+        }
+    }
+}
+
+// Variant of __shrQAFinish that takes the QA-test flag from the caller instead
+// of parsing it from the arguments, and never prompts for <Enter>.
+// With none of noprompt/help/prompt given (leading dashes ignored,
+// case-insensitive), waits via __ExitInTime(3) before returning.
+//! @param bQATest  true to print the "&&&& <STATUS> <exe> <args...>" form
+//! @param argc     argument count as passed to main()
+//! @param argv     argument vector as passed to main()
+//! @param iStatus  0 (FAILED), 1 (PASSED) or 2 (WAIVED); any other value is reported as FAILED
+inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
+{
+    bool bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    // Only 0..2 index a real label. Anything else would read outside sStatus or
+    // hand NULL to "%s", so an invalid status is reported as FAILED instead.
+    const char *sResult = (iStatus >= 0 && iStatus <= 2) ? sStatus[iStatus] : sStatus[0];
+
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        // For SDK individual samples that don't specify -noprompt or -prompt,
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bQuitInTime = false;
+        }
+    }
+
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sResult, &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sResult);
+    }
+    fflush(stdout);
+
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    }
+}
+
+// Report the final status via __shrQAFinish(), then terminate the process.
+// Exits with EXIT_FAILURE when iStatus is 0 and EXIT_SUCCESS for any other value.
+inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish(argc, argv, iStatus);
+
+    int exitCode = EXIT_FAILURE;
+    if (iStatus != 0) {
+        exitCode = EXIT_SUCCESS;
+    }
+    exit(exitCode);
+}
+
+// Report the final status via __shrQAFinish2(), then terminate the process.
+// Exits with EXIT_FAILURE when iStatus is 0 and EXIT_SUCCESS for any other value.
+inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish2(bQAtest, argc, argv, iStatus);
+
+    int exitCode = EXIT_FAILURE;
+    if (iStatus != 0) {
+        exitCode = EXIT_SUCCESS;
+    }
+    exit(exitCode);
+}
+
+#endif
--- a/benchmarks/opencl/BlackScholes/shrUtils.h
+++ b/benchmarks/opencl/BlackScholes/shrUtils.h
@ -0,0 +1,642 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_UTILS_H
+#define SHR_UTILS_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// reminders for output window and build log
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+#endif
+
+// OS dependent includes
+#ifdef _WIN32
+    // Headers needed for Windows
+    #include <windows.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+#endif
+
+// Other headers needed for both Windows and Linux
+#include <math.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Un-comment the following #define to enable profiling code in SDK apps
+//#define GPU_PROFILING
+
+// Beginning of GPU Architecture definitions
+// Map an SM version to the number of cores per SM.
+//! @return cores per SM for the given version, or -1 (after printing a notice) if the version is not in the table
+//! @param major  SM major version
+//! @param minor  SM minor version
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+    // Lookup entry pairing an SM version with its cores-per-SM count
+    typedef struct {
+        int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+        int Cores;
+    } sSMtoCores;
+
+    // static const: the table is built once instead of on every call
+    static const sSMtoCores nGpuArchCoresPerSM[] =
+    { { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
+      { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
+      { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
+      { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
+      { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+      { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+      { 0x30, 192}  // Kepler Generation (SM 3.0) GK10x class
+    };
+    const int numEntries = (int)(sizeof(nGpuArchCoresPerSM) / sizeof(nGpuArchCoresPerSM[0]));
+
+    // Pack major/minor into the same 0xMm form used by the table keys
+    const int smVersion = (major << 4) + minor;
+    for (int index = 0; index < numEntries; index++) {
+        if (nGpuArchCoresPerSM[index].SM == smVersion) {
+            return nGpuArchCoresPerSM[index].Cores;
+        }
+    }
+    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
+    return -1;
+}
+// end of GPU Architecture definitions
+
+
+// Defines and enum for use with logging functions
+// *********************************************************************
+#define DEFAULTLOGFILE "SdkConsoleLog.txt"
+#define MASTERLOGFILE "SdkMasterLog.csv"
+// Bit flags for the iLogMode argument of shrLogEx; values may be combined with |
+enum LOGMODES 
+{
+    LOGCONSOLE = 1, // bit to signal "log to console" 
+    LOGFILE    = 2, // bit to signal "log to file" 
+    LOGBOTH    = 3, // convenience union of first 2 bits to signal "log to both"
+    APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
+    MASTER     = 8, // bit to signal master .csv log output
+    ERRORMSG   = 16, // bit to signal "pre-pend Error" 
+    CLOSELOG   = 32  // bit to close log file, if open, after any requested file write
+};
+#define HDASHLINE "-----------------------------------------------------------\n"
+
+// Standardized boolean, used as the success/failure return type of the
+// shrReadFile*, shrWriteFile*, image and command line helpers declared in this header
+enum shrBOOL
+{
+    shrFALSE = 0,
+    shrTRUE = 1
+};
+
+// Standardized MAX, MIN and CLAMP
+// Arguments are fully parenthesized so that expression arguments (e.g. `x | 1`)
+// group correctly. Each argument may still be evaluated more than once, so
+// avoid passing expressions with side effects.
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define CLAMP(a, b, c) MIN(MAX(a, b), c)    // double sided clip of input a
+#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))	    // single top side clip of input a
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) 
+
+// Standardized Exit Macro for leaving main()... extended version
+// (Companion Inline Function lower on page)
+#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
+
+// Standardized Exit Macro for leaving main()... short version
+// (Companion Inline Function lower on page)
+#define shrEXIT(a, b)        __shrExitEX(a, b, EXIT_SUCCESS)
+
+// Simple argument checker macro
+#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE 
+
+// Define for user-customized error handling
+#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
+
+// Function to deallocate memory allocated within shrUtils
+// *********************************************************************
+extern "C" void shrFree(void* ptr);
+
+// *********************************************************************
+// Helper function to log standardized information to Console, to File or to both
+//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); 
+//!         : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+//! 
+//! Automatically opens file and stores handle if needed and not done yet
+//! Closes file and nulls handle on request
+//! 
+//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.  
+//!          LOGFILE and LOGBOTH may be | 'd  with APPENDMODE to select file append mode instead of overwrite mode 
+//!          LOGFILE and LOGBOTH may be | 'd  with CLOSELOG to "write and close" 
+//!          First 3 options may be | 'd  with MASTER to enable independent write to master data log file
+//!          First 3 options may be | 'd  with ERRORMSG to start line with standard error message
+//! @param 2 dValue:    
+//!          Positive val = double value for time in secs to be formatted to 6 decimals. 
+//!          Negative val is an error code and this give error preformatting.
+//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.  
+//!          ALL printf flags, width, precision and type specifiers are supported with this exception: 
+//!              Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
+//!              Single byte char type specifiers (%s and %c) ARE supported 
+//! @param 4... variable args: like printf or fprintf.  Must match format specifer type above.  
+//! @return 0 if OK, negative value on error or if error occurs or was passed in. 
+// *********************************************************************
+extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
+
+// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, 
+// *********************************************************************
+extern "C" int shrLog(const char* cFormatString, ...);
+
+// *********************************************************************
+// Delta timer function for up to 3 independent timers using host high performance counters 
+// Maintains state for 3 independent counters
+//! Example: double dElapsedTime = shrDeltaT(0);
+//! 
+//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
+//! @return delta time of specified counter since last call in seconds.  Otherwise -9999.0 if error
+// *********************************************************************
+extern "C" double shrDeltaT(int iCounterID);
+
+// Optional LogFileNameOverride function
+// *********************************************************************
+extern "C" void shrSetLogFileName (const char* cOverRideName);
+
+// Helper function to init data arrays 
+// *********************************************************************
+extern "C" void shrFillArray(float* pfData, int iSize);
+
+// Helper function to print data arrays 
+// *********************************************************************
+extern "C" void shrPrintArray(float* pfData, int iSize);
+
+////////////////////////////////////////////////////////////////////////////
+//! Find the path for a filename
+//! @return the path if succeeded, otherwise 0
+//! @param filename        name of the file
+//! @param executablePath  optional absolute path of the executable
+////////////////////////////////////////////////////////////////////////////
+extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing single precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing double precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing single precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
+               const float epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing double precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
+               const double epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, 
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, 
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PPM image file (with unsigned char as data element type), padding 
+//! 4th component
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param OutData  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! 
+//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, 
+                             unsigned int *w, unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+//! Save PPM image file (with unsigned char as data element type, padded to 
+//! 4 bytes)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, 
+               unsigned int w, unsigned int h);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Save PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, 
+              unsigned int w, unsigned int h); 
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
+                  unsigned int *w,unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+// Command line arguments: General notes
+// * All command line arguments begin with '--' followed by the token;
+//   token and value are separated by '='; example --samples=50
+// * Arrays have the form --model=[one.obj,two.obj,three.obj]
+//   (without whitespaces)
+////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////
+//! Check if command line argument \a flag_name is given
+//! @return shrTRUE if command line argument \a flag_name has been given,
+//!         otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param flag_name  name of command line flag
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, 
+                     const char* flag_name);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument (output)
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, 
+                        const char* arg_name, int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type unsigned int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument (output)
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, 
+                        const char* arg_name, unsigned int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type float
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument (output)
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, 
+                        const char* arg_name, float* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type string
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument (output)
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, 
+                          const char* arg_name, char** val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument list whose elements are strings
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  command line argument list
+//! @param len  length of the list / number of elements
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, 
+                              const char* arg_name, char** val, 
+                              unsigned int* len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparef( const float* reference, const float* data,
+             const unsigned int len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparei( const int* reference, const int* data, 
+             const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned integer arrays, with epsilon and threshold
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
+            const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
+              const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with a tolerance for # of byte errors
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
+              const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality and a
+//!     threshold for # pixel errors
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold for # pixel errors
+//!                   (NOTE(review): presumably a fraction like the other *t variants -- confirm)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays using L2-norm with an epsilon tolerance for
+//! equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
+                const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PPM image files with an epsilon tolerance for equality
+//! @return shrTRUE if the two images are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! @note this declaration has no verboseErrors parameter, although earlier
+//!       documentation mentioned one
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PGM image files with an epsilon tolerance for equality
+//! @return shrTRUE if the two images are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! @note this declaration has no verboseErrors parameter, although earlier
+//!       documentation mentioned one
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+//! NOTE(review): presumably loads \a size bytes of \a filename into a buffer
+//! owned by the caller -- confirm against the implementation
+extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
+
+//! NOTE(review): presumably rounds \a global_size up to the nearest multiple
+//! of \a group_size (that is how callers describe it) -- confirm against the
+//! implementation
+extern "C" size_t shrRoundUp(int group_size, int global_size);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    if (iReference != iSample)
+    {
+        shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); 
+        if (pCleanup != NULL)
+        {
+            pCleanup(EXIT_FAILURE);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+}
+
+// Standardized Exit: optionally wait for <Enter>, flush stderr and exit with iExitCode
+// *********************************************************************
+inline void __shrExitEX(int argc, const char** argv, int iExitCode)
+{
+    // "qatest" always suppresses the prompt. Otherwise WIN32 builds prompt unless
+    // "noprompt" is given, and all other builds prompt only when "prompt" is given.
+#ifdef WIN32
+    const bool bWaitForKey = !shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
+#else
+    const bool bWaitForKey = shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
+#endif
+
+    if (!bWaitForKey)
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
+    }
+    else
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
+        getchar();
+    }
+
+    fflush(stderr);
+    exit(iExitCode);
+}
+
+#endif
--- a/benchmarks/opencl/DotProduct/DotProduct.cl
+++ b/benchmarks/opencl/DotProduct/DotProduct.cl
@ -0,0 +1,29 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+ __kernel void DotProduct (__global float* a, __global float* b, __global float* c, int iNumElements)
+{
+    // One work-item per output element: c[i] is the dot product of the four
+    // consecutive floats starting at a[4*i] and b[4*i]
+    const int iGID = get_global_id(0);
+
+    // Work-items at or beyond iNumElements have nothing to compute
+    if (iGID < iNumElements)
+    {
+        const int iBase = iGID << 2;
+        __global const float* pA = a + iBase;
+        __global const float* pB = b + iBase;
+
+        // Kept as one left-to-right sum so the floating-point evaluation is unchanged
+        c[iGID] = pA[0] * pB[0]
+                   + pA[1] * pB[1]
+                   + pA[2] * pB[2]
+                   + pA[3] * pB[3];
+    }
+}
--- a/benchmarks/opencl/DotProduct/Makefile
+++ b/benchmarks/opencl/DotProduct/Makefile
@ -0,0 +1,66 @@
+# Builds the DotProduct OpenCL benchmark: the kernel is compiled by poclcc into a
+# static library, which is linked with main.cc either against the Vortex runtime
+# ($(PROJECT).elf/.hex/.dump) or for QEMU user mode ($(PROJECT).qemu).
+
+# Delete a target whose recipe failed, so a truncated file (e.g. a partial
+# .dump written through '>') is never mistaken for an up-to-date one.
+.DELETE_ON_ERROR:
+
+PROJECT = DotProduct
+
+# Tool and library locations. $(wildcard) expands '~' and yields an empty
+# string when the path does not exist; ':=' evaluates each lookup only once.
+RISCV_TOOL_PATH := $(wildcard ~/dev/riscv-gnu-toolchain/drops)
+POCL_CC_PATH := $(wildcard ~/dev/pocl/drops_riscv_cc)
+POCL_INC_PATH := $(wildcard ../include)
+POCL_LIB_PATH := $(wildcard ../lib)
+VX_RT_PATH := $(wildcard ../../../runtime)
+VX_SIMX_PATH := $(wildcard ../../../simX/obj_dir)
+
+CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
+CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
+DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
+HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
+GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
+
+# Vortex runtime sources compiled into the bare-metal image
+VX_SRCS =  $(VX_RT_PATH)/newlib/newlib.c
+VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
+VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
+VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
+VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
+VX_SRCS += $(VX_RT_PATH)/tests/tests.c
+VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
+# These variables are not set in this file, so they add nothing unless supplied
+# from the command line or the environment.
+VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
+
+VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
+
+CXXFLAGS =  -g -O0 -march=rv32im -mabi=ilp32 
+CXXFLAGS += -ffreestanding # program may not begin at main()
+CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
+CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
+CXXFLAGS += -I$(POCL_INC_PATH) -I.
+
+VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
+QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
+
+# Command targets that never correspond to files
+.PHONY: all run qemu gdb-s gdb-c clean
+
+all: $(PROJECT).dump $(PROJECT).hex
+
+# Compile the kernel named in the prerequisite list ($<), not a hard-coded file
+lib$(PROJECT).a: $(PROJECT).cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<
+
+# The runtime sources are prerequisites so that editing them relinks the image
+$(PROJECT).elf: main.cc lib$(PROJECT).a $(VX_SRCS)
+	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $@
+
+$(PROJECT).qemu: main.cc lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $@
+
+$(PROJECT).hex: $(PROJECT).elf
+	$(HEX) -O ihex $< $@
+
+$(PROJECT).dump: $(PROJECT).elf
+	$(DMP) -D $< > $@
+
+# Run the image in the simX emulator; its stdout goes to emulator.debug
+run: $(PROJECT).hex
+	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
+
+qemu: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
+
+# gdb-s starts QEMU waiting for a debugger on port 1234; gdb-c starts gdb on the same binary
+gdb-s: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
+
+gdb-c: $(PROJECT).qemu
+	$(GDB) $(PROJECT).qemu
+
+# Removes only the ELF image and the files derived from it; the kernel library is kept
+clean:
+	rm -rf *.elf *.dump *.hex
--- a/benchmarks/opencl/DotProduct/main.cc
+++ b/benchmarks/opencl/DotProduct/main.cc
@ -0,0 +1,270 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// *********************************************************************
+// oclDotProduct Notes:  
+//
+// A simple OpenCL API demo application that implements a
+// vector dot product computation between 2 float arrays. 
+//
+// Runs computations with OpenCL on the GPU device and then checks results 
+// against basic host CPU/C++ computation.
+//
+// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness. 
+// But these are NOT required libs for OpenCL development in general.
+// *********************************************************************
+
+// standard utilities and systems includes
+#include <oclUtils.h>
+#include <shrQATest.h>
+
+// Name of the file with the source code for the computation kernel
+// *********************************************************************
+const char* cSourceFile = "DotProduct.cl";
+
+// Host buffers for demo
+// *********************************************************************
+void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
+void* Golden;                   // Host buffer for host golden processing cross check
+
+// OpenCL Vars
+cl_platform_id cpPlatform;      // OpenCL platform
+cl_device_id   *cdDevices;      // Array of OpenCL device IDs (malloc'd in main, freed in Cleanup)
+cl_context cxGPUContext;        // OpenCL context
+cl_command_queue cqCommandQueue;// OpenCL command queue
+cl_program program;           // OpenCL program (released in Cleanup)
+cl_kernel ckKernel;             // OpenCL kernel
+cl_mem cmDevSrcA;               // OpenCL device source buffer A
+cl_mem cmDevSrcB;               // OpenCL device source buffer B 
+cl_mem cmDevDst;                // OpenCL device destination buffer 
+size_t szGlobalWorkSize;        // Total # of work items in the 1D range
+size_t szLocalWorkSize;		    // # of work items in the 1D work group	
+size_t szParmDataBytes;			// Byte size of context information
+size_t szKernelLength;			// Byte size of kernel code
+cl_int ciErrNum;			    // Error code var
+char* cPathAndName = NULL;      // var for full paths to data, src, etc.
+char* cSourceCL = NULL;         // Buffer to hold source for compilation 
+const char* cExecutableName = NULL;   // set to argv[0] in main
+
+// demo config vars
+int iNumElements= 1277944;	    // Number of dot products to compute; each input array holds 4 floats per element
+shrBOOL bNoPrompt = shrFALSE;   // set from the "noprompt" command line flag in main
+
+// Forward Declarations
+// *********************************************************************
+void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements);
+void Cleanup (int iExitCode);
+void (*pCleanup)(int) = &Cleanup;   // callback form of Cleanup for the error-check helpers
+
+// Pointers to main's argc/argv, set at the start of main and dereferenced in Cleanup
+int *gp_argc = NULL;
+char ***gp_argv = NULL;
+
+// Main function 
+// *********************************************************************
+// Runs the DotProduct kernel on the selected OpenCL device, recomputes the
+// result on the host, compares both and leaves through Cleanup() with a status
+// that reflects the comparison.
+// @param argc  argc as passed by the runtime
+// @param argv  argv as passed by the runtime; the "device" and "noprompt"
+//              command line arguments are looked up below
+// @return EXIT_FAILURE on an early failure if Cleanup() returns; control is not
+//         expected to come back from Cleanup() (NOTE(review): shrQAFinishExit
+//         presumably exits -- confirm)
+int main(int argc, char **argv)
+{
+    gp_argc = &argc;
+    gp_argv = &argv;
+
+    shrQAStart(argc, argv);
+
+    // Get the platform (queried once; the result is reused for every later call)
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+    shrLog("clGetPlatformID...\n");
+
+    //Get all the devices
+    cl_uint uiNumDevices = 0;           // Number of devices available
+    cl_uint uiTargetDevice = 0;         // Default Device to compute on
+    cl_uint uiNumComputeUnits = 0;      // Number of compute units (SM's on NV GPU)
+    shrLog("Get the Device info and select Device...\n");
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+
+    // Without at least one device every cdDevices[...] access below is out of
+    // bounds and (uiNumDevices - 1) wraps around, so stop here.
+    if (uiNumDevices == 0)
+    {
+        shrLog("No OpenCL device available (error %i)\n", ciErrNum);
+        Cleanup(EXIT_FAILURE);
+        return EXIT_FAILURE;
+    }
+    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
+    if (cdDevices == NULL)
+    {
+        shrLog("Failed to allocate the device list\n");
+        Cleanup(EXIT_FAILURE);
+        return EXIT_FAILURE;
+    }
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+
+    // Get command line device options and config accordingly
+    shrLog("  # of Devices Available = %u\n", uiNumDevices);
+    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
+    {
+        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
+    }
+    shrLog("  Using Device %u: ", uiTargetDevice);
+    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
+    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits);
+
+    // get command line arg for quick test, if provided
+    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
+
+    // start logs
+    cExecutableName = argv[0];
+    shrSetLogFileName ("oclDotProduct.txt");
+    shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
+
+    // set and log Global and Local work size dimensions
+    szLocalWorkSize = 256;
+    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
+    // size_t values are cast because the format string uses %u
+    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
+           (unsigned int)szGlobalWorkSize, (unsigned int)szLocalWorkSize,
+           (unsigned int)(szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
+
+    // Allocate and initialize host arrays
+    shrLog( "Allocate and Init Host Mem...\n");
+    srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
+    srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
+    dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
+    Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
+    if (srcA == NULL || srcB == NULL || dst == NULL || Golden == NULL)
+    {
+        // Cleanup() frees whichever of the buffers were allocated
+        shrLog("Failed to allocate host buffers\n");
+        Cleanup(EXIT_FAILURE);
+        return EXIT_FAILURE;
+    }
+    shrFillArray((float*)srcA, 4 * iNumElements);
+    shrFillArray((float*)srcB, 4 * iNumElements);
+
+    // The platform and device list obtained above are reused here. Querying a
+    // single device into &cdDevices[uiTargetDevice] again would overwrite the
+    // selected entry with the first device of the platform.
+
+    // Create the context
+    cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Create a command-queue
+    shrLog("clCreateCommandQueue...\n");
+    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
+    shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
+    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Read the OpenCL kernel in from source file. The text is not used to create
+    // the program below; it is only loaded when the file can be located, so a
+    // failed lookup no longer passes NULL on to the loader.
+    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
+    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
+    //oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
+    if (cPathAndName != NULL)
+    {
+        cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
+    }
+    //oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
+
+    // Create the program
+    shrLog("clCreateProgramWithSource...\n");
+    //program = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
+    // Assign the file-scope 'program' (no shadowing local) so Cleanup() releases
+    // it, and request the same kernel name that clCreateKernel() asks for below.
+    program = clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &cdDevices[uiTargetDevice], "DotProduct", &ciErrNum);
+
+    // Build the program (no build options are passed)
+    shrLog("clBuildProgram...\n");
+    ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        // write out standard error, Build Log and PTX, then cleanup and exit
+        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+        oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
+        oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
+        Cleanup(EXIT_FAILURE);
+        return EXIT_FAILURE;
+    }
+
+    // Create the kernel
+    shrLog("clCreateKernel (DotProduct)...\n");
+    ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);
+
+    // Set the Argument values
+    shrLog("clSetKernelArg 0 - 3...\n\n");
+    ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
+    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
+    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
+    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // --------------------------------------------------------
+    // Core sequence... copy input data to GPU, compute, copy results back
+
+    // Asynchronous write of data to GPU device
+    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
+    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
+    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Launch kernel
+    shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
+    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Read back results (blocking read)
+    shrLog("clEnqueueReadBuffer (Dst)...\n\n");
+    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Compute and compare results for golden-host and report errors and pass/fail
+    shrLog("Comparing against Host/C++ computation...\n\n");
+    DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
+    shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
+
+    // Cleanup and leave; the exit status now reflects the comparison result
+    Cleanup ((bMatch == shrTRUE) ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+// Host reference ("golden") implementation used to cross-check the device result:
+// pfResult[i] is the dot product of the four consecutive floats starting at
+// pfData1[4*i] and pfData2[4*i], for i in [0, iNumElements)
+// *********************************************************************
+void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
+{
+    for (int iOut = 0; iOut < iNumElements; ++iOut)
+    {
+        const int iFirst = 4 * iOut;    // index of the first float of this 4-wide group
+
+        // accumulate directly in the output slot, one product at a time
+        pfResult[iOut] = 0.0f;
+        for (int iLane = 0; iLane < 4; ++iLane)
+        {
+            pfResult[iOut] += pfData1[iFirst + iLane] * pfData2[iFirst + iLane];
+        }
+    }
+}
+
+// Cleanup and exit code: releases every OpenCL object and host buffer held in the
+// file-scope variables, then reports QA_PASSED or QA_FAILED through shrQAFinishExit
+// *********************************************************************
+void Cleanup(int iExitCode)
+{
+    shrLog("Starting Cleanup...\n\n");
+
+    // Strings obtained while locating / loading the kernel source
+    if (cPathAndName != NULL)
+    {
+        free(cPathAndName);
+    }
+    if (cSourceCL != NULL)
+    {
+        free(cSourceCL);
+    }
+
+    // OpenCL objects; each one is released only if it was created
+    if (ckKernel != NULL)
+    {
+        clReleaseKernel(ckKernel);
+    }
+    if (program != NULL)
+    {
+        clReleaseProgram(program);
+    }
+    if (cqCommandQueue != NULL)
+    {
+        clReleaseCommandQueue(cqCommandQueue);
+    }
+    if (cxGPUContext != NULL)
+    {
+        clReleaseContext(cxGPUContext);
+    }
+    if (cmDevSrcA != NULL)
+    {
+        clReleaseMemObject(cmDevSrcA);
+    }
+    if (cmDevSrcB != NULL)
+    {
+        clReleaseMemObject(cmDevSrcB);
+    }
+    if (cmDevDst != NULL)
+    {
+        clReleaseMemObject(cmDevDst);
+    }
+
+    // Host buffers
+    free(srcA);
+    free(srcB);
+    free(dst);
+    free(Golden);
+
+    if (cdDevices != NULL)
+    {
+        free(cdDevices);
+    }
+
+    // gp_argc / gp_argv point at main's argc / argv
+    shrQAFinishExit(*gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED);
+}
--- a/benchmarks/opencl/DotProduct/oclUtils.h
+++ b/benchmarks/opencl/DotProduct/oclUtils.h
@ -0,0 +1,198 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+#ifndef OCL_UTILS_H
+#define OCL_UTILS_H
+
+// *********************************************************************
+// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// Common headers:  Cross-API utilities and OpenCL header
+#include <shrUtils.h>
+
+// All OpenCL headers
+#if defined (__APPLE__) || defined(MACOSX)
+    #include <OpenCL/opencl.h>
+#else
+    #include <CL/opencl.h>
+#endif 
+
+// Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// For systems with CL_EXT that are not updated with these extensions, we copied these
+// extensions from <CL/cl_ext.h>
+#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+  /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+  #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+  #define CL_DEVICE_WARP_SIZE_NV                      0x4003
+  #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+  #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+  #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#endif
+
+// reminders for build output window and log
+#ifdef _WIN32
+    #pragma message ("Note: including shrUtils.h")
+    #pragma message ("Note: including opencl.h")
+#endif
+
+// SDK Revision #
+#define OCL_SDKREVISION "7027912"
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) 
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
+//!
+//! @return the id 
+//! @param clSelectedPlatformID         OpenCL platform ID
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print info about the device
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and return device capability
+//!
+//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" int oclGetDevCap(cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print the device name
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the first device from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the id or -1 when out of range
+//! @param cxGPUContext         OpenCL context
+//! @param device_idx            index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of device with maximal FLOPS from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file and prepends the cPreamble to the code.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get the binary (PTX) of the program associated with the device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//! @param binary       returned code
+//! @param length       length of returned code
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram                   OpenCL program
+//! @param cdDevice                    device of interest
+//! @param cPtxFileName                optional PTX file name
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the Build Log from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
+
+// Helper function for De-allocating cl objects
+// *********************************************************************
+extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
+
+// Helper function to get OpenCL error string from constant
+// *********************************************************************
+extern "C" const char* oclErrorString(cl_int error);
+
+// Helper function to get OpenCL image format string (channel order and type) from constant
+// *********************************************************************
+extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // Compares a sample/test value with its reference; a mismatch is treated as an error.
+    //   iSample     value under test (typically an OpenCL return code)
+    //   iReference  value iSample is expected to equal
+    //   pCleanup    optional callback invoked with the error code; may be NULL
+    //   cFile/iLine location reported in the log message
+
+    // Matching values: nothing to report
+    if (iSample == iReference)
+    {
+        return;
+    }
+
+    // A mismatch is an error by definition, so a zero sample is replaced by a non-zero code
+    if (iSample == 0)
+    {
+        iSample = -9999;
+    }
+
+    // Log the error info
+    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
+
+    // Without a cleanup callback, close the log and exit using the error code as the process exit code
+    if (pCleanup == NULL)
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+        exit(iSample);
+    }
+
+    // Otherwise the callback decides what happens; if it returns, so does this function
+    pCleanup(iSample);
+}
+
+#endif
--- a/benchmarks/opencl/DotProduct/shrQATest.h
+++ b/benchmarks/opencl/DotProduct/shrQATest.h
@ -0,0 +1,238 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_QATEST_H
+#define SHR_QATEST_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// OS dependent includes
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+    #pragma message ("Note: including time.h")
+
+// Headers needed for Windows
+    #include <windows.h>
+	#include <time.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+    #include <unistd.h>
+    #include <time.h>
+#endif
+
+#ifndef STRCASECMP
+#ifdef _WIN32
+#define STRCASECMP _stricmp
+#else
+#define STRCASECMP strcasecmp
+#endif
+#endif
+
+#ifndef STRNCASECMP
+#ifdef _WIN32
+#define STRNCASECMP _strnicmp
+#else
+#define STRNCASECMP strncasecmp
+#endif
+#endif
+
+
+// Standardized QA Start/Finish for CUDA SDK tests
+#define shrQAStart(a, b)      __shrQAStart(a, b)
+#define shrQAFinish(a, b, c)  __shrQAFinish(a, b, c)
+#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
+
+inline int findExeNameStart(const char *exec_name)
+{
+    // Returns the index of the first character after the last '\\' or '/' in
+    // exec_name, or 0 when the string contains no path separator.
+    // The scan runs backwards, starting at the terminating NUL.
+    for (int pos = (int)strlen(exec_name); pos >= 0; --pos)
+    {
+        const char ch = exec_name[pos];
+        if (ch == '\\' || ch == '/')
+        {
+            return pos + 1;
+        }
+    }
+    return 0;
+}
+
+inline int __shrQAStart(int argc, char **argv)
+{
+    // Prints the standardized start banner for a sample and returns the index
+    // in argv[0] at which the executable's file name begins.
+
+    // First clear the output buffer
+    fflush(stdout);
+    fflush(stdout);
+
+    // Look for a "qatest" argument (case-insensitive, any number of leading dashes)
+    bool bQATest = false;
+    for (int iArg = 1; iArg < argc; ++iArg) {
+        const char *pArg = argv[iArg];
+        while (*pArg == '-') {
+            ++pArg;
+        }
+        if (STRCASECMP(pArg, "qatest") == 0) {
+            bQATest = true;
+        }
+    }
+
+    // We don't want to print the entire path, so we search for the first
+    // character after the last path separator
+    const int exename_start = findExeNameStart(argv[0]);
+    const char *pExeName = argv[0] + exename_start;
+
+    if (!bQATest) {
+        fprintf(stdout, "[%s] starting...\n", pExeName);
+    } else {
+        // QA mode echoes the executable name followed by the full argument list
+        fprintf(stdout, "&&&& RUNNING %s", pExeName);
+        for (int iArg = 1; iArg < argc; ++iArg) {
+            fprintf(stdout, " %s", argv[iArg]);
+        }
+        fprintf(stdout, "\n");
+    }
+    fflush(stdout);
+    printf("\n");
+    fflush(stdout);
+    return exename_start;
+}
+
+// Result codes passed as iStatus to the shrQAFinish* functions.
+// The numeric values are used directly as indices into the sStatus[] string
+// tables ("FAILED", "PASSED", "WAIVED") in __shrQAFinish and __shrQAFinish2,
+// so they must stay in step with those tables.
+enum eQAstatus {
+    QA_FAILED = 0,
+    QA_PASSED = 1,
+    QA_WAIVED = 2
+};
+
+// Prints a countdown to stdout and returns once `seconds` seconds of wall-clock
+// time have elapsed. One "N..." tick is printed per loop iteration, each followed
+// by a one-second sleep.
+//
+// @param seconds  length of the delay in seconds; with 0 or a negative value
+//                 the loop body never runs and only the header and "done!" are printed
+inline void __ExitInTime(int seconds)
+{
+    fprintf(stdout, "> exiting in %d seconds: ", seconds);
+    fflush(stdout);
+    time_t t;
+    int count;
+    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
+        fprintf(stdout, "%d...", count);
+        // Flush each tick so the countdown is visible while waiting instead of
+        // appearing all at once when stdout is line- or fully-buffered
+        fflush(stdout);
+        // Use _WIN32, the same macro that selects <windows.h> / <unistd.h> at the
+        // top of this header, so the sleep call always matches the included headers
+#ifdef _WIN32
+        Sleep(1000);
+#else
+        sleep(1);
+#endif
+    }
+    fprintf(stdout,"done!\n\n"); 
+    fflush(stdout);
+}
+
+
+inline void __shrQAFinish(int argc, const char **argv, int iStatus)
+{
+    // Prints the standardized finish banner for a sample, then either waits
+    // 3 seconds, waits for <Enter>, or returns immediately depending on argv.
+    //   iStatus  index into the status names below (QA_FAILED / QA_PASSED / QA_WAIVED)
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    // Defaults: QA output off, no prompt, timed exit on
+    bool bQATest = false;
+    bool bNoPrompt = true;
+    bool bQuitInTime = true;
+
+    // Options are matched case-insensitively after stripping any leading dashes
+    for (int iArg = 1; iArg < argc; ++iArg) {
+        const char *pArg = argv[iArg];
+        while (*pArg == '-') {
+            ++pArg;
+        }
+
+        if (STRCASECMP(pArg, "qatest") == 0) {
+            bQATest = true;
+        } else if (STRCASECMP(pArg, "noprompt") == 0 || STRCASECMP(pArg, "help") == 0) {
+            // Explicit no-prompt (or help): return straight away, no delay and no prompt
+            bNoPrompt = true;
+            bQuitInTime = false;
+        } else if (STRCASECMP(pArg, "prompt") == 0) {
+            // Explicit prompt: wait for <Enter> instead of the timed exit
+            bNoPrompt = false;
+            bQuitInTime = false;
+        }
+    }
+
+    // Only the executable's file name is printed, not its full path
+    const char *pExeName = argv[0] + findExeNameStart(argv[0]);
+    if (!bQATest) {
+        fprintf(stdout, "[%s] test results...\n%s\n", pExeName, sStatus[iStatus]);
+    } else {
+        // QA mode echoes the status, the executable name and the full argument list
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], pExeName);
+        for (int iArg = 1; iArg < argc; ++iArg) {
+            fprintf(stdout, " %s", argv[iArg]);
+        }
+        fprintf(stdout, "\n");
+    }
+    fflush(stdout);
+    printf("\n");
+    fflush(stdout);
+
+    // Samples that specify neither -noprompt nor -prompt get a 3 second delay
+    // before returning, giving a user time to view results
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    } else if (!bNoPrompt) {
+        fprintf(stdout, "\nPress <Enter> to exit...\n");
+        fflush(stdout);
+        getchar();
+    }
+}
+
+inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
+{
+    // Variant of the finish banner where the caller supplies the QA-mode flag.
+    // It never prompts for <Enter>: it either waits 3 seconds or returns immediately.
+    //   iStatus  index into the status names below (QA_FAILED / QA_PASSED / QA_WAIVED)
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    // The timed exit is on unless noprompt, help or prompt is given
+    // (case-insensitive, any number of leading dashes)
+    bool bQuitInTime = true;
+    for (int iArg = 1; iArg < argc; ++iArg) {
+        const char *pArg = argv[iArg];
+        while (*pArg == '-') {
+            ++pArg;
+        }
+
+        if (STRCASECMP(pArg, "noprompt") == 0 ||
+            STRCASECMP(pArg, "help") == 0 ||
+            STRCASECMP(pArg, "prompt") == 0) {
+            bQuitInTime = false;
+        }
+    }
+
+    // Only the executable's file name is printed, not its full path
+    const char *pExeName = argv[0] + findExeNameStart(argv[0]);
+    if (!bQATest) {
+        fprintf(stdout, "[%s] test results...\n%s\n", pExeName, sStatus[iStatus]);
+    } else {
+        // QA mode echoes the status, the executable name and the full argument list
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], pExeName);
+        for (int iArg = 1; iArg < argc; ++iArg) {
+            fprintf(stdout, " %s", argv[iArg]);
+        }
+        fprintf(stdout, "\n");
+    }
+    fflush(stdout);
+
+    // Samples that specify none of the options above get a 3 second delay
+    // before returning, giving a user time to view results
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    }
+}
+
+inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
+{
+    // Print the finish banner (including any delay or prompt), then terminate the process.
+    __shrQAFinish(argc, argv, iStatus);
+
+    // Any non-zero status (QA_PASSED and QA_WAIVED) exits with EXIT_SUCCESS;
+    // only a zero status (QA_FAILED) exits with EXIT_FAILURE.
+    const int iExitCode = (iStatus != 0) ? EXIT_SUCCESS : EXIT_FAILURE;
+    exit(iExitCode);
+}
+
+inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
+{
+    // Print the finish banner with a caller-supplied QA-mode flag, then terminate the process.
+    __shrQAFinish2(bQAtest, argc, argv, iStatus);
+
+    // Any non-zero status (QA_PASSED and QA_WAIVED) exits with EXIT_SUCCESS;
+    // only a zero status (QA_FAILED) exits with EXIT_FAILURE.
+    const int iExitCode = (iStatus != 0) ? EXIT_SUCCESS : EXIT_FAILURE;
+    exit(iExitCode);
+}
+
+#endif
--- a/benchmarks/opencl/DotProduct/shrUtils.h
+++ b/benchmarks/opencl/DotProduct/shrUtils.h
@ -0,0 +1,642 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_UTILS_H
+#define SHR_UTILS_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// reminders for output window and build log
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+#endif
+
+// OS dependent includes
+#ifdef _WIN32
+    // Headers needed for Windows
+    #include <windows.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+#endif
+
+// Other headers needed for both Windows and Linux
+#include <math.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Un-comment the following #define to enable profiling code in SDK apps
+//#define GPU_PROFILING
+
+// Beginning of GPU Architecture definitions
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+	// Maps an SM (compute capability) version to the number of cores per SM.
+	// Returns -1, after printing a message, when the version is not in the table.
+	typedef struct {
+		int iVersion;    // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
+		int iCoresPerSM;
+	} sSMCoreEntry;
+
+	const sSMCoreEntry aKnownArchs[] =
+	{ { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
+	  { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
+	  { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
+	  { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
+	  { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
+	  { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
+	  { 0x30, 192 }  // Kepler Generation (SM 3.0) GK10x class
+	};
+	const int iNumArchs = (int)(sizeof(aKnownArchs) / sizeof(aKnownArchs[0]));
+
+	// Pack the requested version the same way the table does and look it up
+	const int iWanted = (major << 4) + minor;
+	for (int i = 0; i < iNumArchs; ++i) {
+		if (aKnownArchs[i].iVersion == iWanted) {
+			return aKnownArchs[i].iCoresPerSM;
+		}
+	}
+
+	printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
+	return -1;
+}
+// end of GPU Architecture definitions
+
+
+// Defines and enum for use with logging functions
+// *********************************************************************
+#define DEFAULTLOGFILE "SdkConsoleLog.txt"
+#define MASTERLOGFILE "SdkMasterLog.csv"
+// Bit flags for the iLogMode argument of shrLogEx. Each value is a distinct bit
+// (LOGBOTH is LOGCONSOLE | LOGFILE), so flags can be combined with |,
+// e.g. LOGBOTH | CLOSELOG.
+enum LOGMODES 
+{
+    LOGCONSOLE = 1, // bit to signal "log to console" 
+    LOGFILE    = 2, // bit to signal "log to file" 
+    LOGBOTH    = 3, // convenience union of first 2 bits to signal "log to both"
+    APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
+    MASTER     = 8, // bit to signal master .csv log output
+    ERRORMSG   = 16, // bit to signal "pre-pend Error" 
+    CLOSELOG   = 32  // bit to close log file, if open, after any requested file write
+};
+#define HDASHLINE "-----------------------------------------------------------\n"
+
+// Standardized boolean
+// Boolean result type returned by the shrReadFile* / shrWriteFile* helpers
+// declared in this header and tested by the ARGCHECK macro.
+enum shrBOOL
+{
+    shrFALSE = 0,
+    shrTRUE = 1
+};
+
+// Standardized MAX, MIN and CLAMP
+// Every macro argument is wrapped in parentheses so that an argument containing
+// a lower-precedence operator (e.g. MIN(x & mask, limit)) is compared as a whole.
+// NOTE: arguments may still be evaluated more than once, so avoid passing
+// expressions with side effects (i++, function calls with side effects, ...).
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define CLAMP(a, b, c) MIN(MAX(a, b), c)    // double sided clip of input a: lower bound b, upper bound c
+#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))    // single top side clip of input a
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) 
+
+// Standardized Exit Macro for leaving main()... extended version
+// (Companion Inline Function lower on page)
+#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
+
+// Standardized Exit Macro for leaving main()... short version
+// (Companion Inline Function lower on page)
+#define shrEXIT(a, b)        __shrExitEX(a, b, EXIT_SUCCESS)
+
+// Simple argument checker macro: makes the enclosing function return shrFALSE
+// unless (a) equals shrTRUE.
+// NOTE: expands to a bare `if` statement (no braces, no do/while wrapper), so
+// take care when using it as the unbraced body of another if/else.
+#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE 
+
+// Define for user-customized error handling.
+// Expands to a format string followed by its two arguments (__FILE__ and __LINE__),
+// so it can be used as the trailing arguments of a printf-style call,
+// e.g. shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
+
+// Function to deallocate memory allocated within shrUtils
+// *********************************************************************
+extern "C" void shrFree(void* ptr);
+
+// *********************************************************************
+// Helper function to log standardized information to Console, to File or to both
+//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); 
+//!         : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+//! 
+//! Automatically opens file and stores handle if needed and not done yet
+//! Closes file and nulls handle on request
+//! 
+//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.  
+//!          LOGFILE and LOGBOTH may be | 'd  with APPENDMODE to select file append mode instead of overwrite mode 
+//!          LOGFILE and LOGBOTH may be | 'd  with CLOSELOG to "write and close" 
+//!          First 3 options may be | 'd  with MASTER to enable independent write to master data log file
+//!          First 3 options may be | 'd  with ERRORMSG to start line with standard error message
+//! @param 2 dValue:    
+//!          Positive val = double value for time in secs to be formatted to 6 decimals. 
+//!          Negative val is an error code and this give error preformatting.
+//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.  
+//!          ALL printf flags, width, precision and type specifiers are supported with this exception: 
+//!              Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
+//!              Single byte char type specifiers (%s and %c) ARE supported 
+//! @param 4... variable args: like printf or fprintf.  Must match format specifier type above.  
+//! @return 0 if OK, negative value on error or if error occurs or was passed in. 
+// *********************************************************************
+extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
+
+// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, 
+// *********************************************************************
+extern "C" int shrLog(const char* cFormatString, ...);
+
+// *********************************************************************
+// Delta timer function for up to 3 independent timers using host high performance counters 
+// Maintains state for 3 independent counters
+//! Example: double dElapsedTime = shrDeltaT(0);
+//! 
+//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
+//! @return delta time of specified counter since last call in seconds.  Otherwise -9999.0 if error
+// *********************************************************************
+extern "C" double shrDeltaT(int iCounterID);
+
+// Optional LogFileNameOverride function
+// *********************************************************************
+extern "C" void shrSetLogFileName (const char* cOverRideName);
+
+// Helper function to init data arrays 
+// *********************************************************************
+extern "C" void shrFillArray(float* pfData, int iSize);
+
+// Helper function to print data arrays 
+// *********************************************************************
+extern "C" void shrPrintArray(float* pfData, int iSize);
+
+////////////////////////////////////////////////////////////////////////////
+//! Find the path for a filename
+//! @return the path if succeeded, otherwise 0
+//! @param filename        name of the file
+//! @param executablePath  optional absolute path of the executable
+////////////////////////////////////////////////////////////////////////////
+extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing single precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing double precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing single precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
+               const float epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \a filename containing double precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//!              NOTE(review): declared as const float* although the summary
+//!              says double precision -- confirm against the implementation
+//! @param len  number of data elements in data
+//! @param epsilon  epsilon for comparison
+//!              NOTE(review): how a write routine uses epsilon is not visible here
+//! @param verbose  optional flag, defaults to false (presumably enables extra
+//!              output -- confirm against the implementation)
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
+               const double epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \a filename containing integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data
+//! @param verbose  optional flag, defaults to false (presumably enables extra
+//!              output -- confirm against the implementation)
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \a filename containing unsigned integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data
+//! @param verbose  optional flag, defaults to false (presumably enables extra
+//!              output -- confirm against the implementation)
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, 
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \a filename containing char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data
+//! @param verbose  optional flag, defaults to false (presumably enables extra
+//!              output -- confirm against the implementation)
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, 
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \a filename containing unsigned char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data
+//! @param verbose  optional flag, defaults to false (presumably enables extra
+//!              output -- confirm against the implementation)
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PPM image file (with unsigned char as data element type), padding 
+//! 4th component
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param OutData  handle to the data read
+//! @param w     pointer to the width of the image (presumably filled in by the
+//!              loader -- confirm against the implementation)
+//! @param h     pointer to the height of the image (presumably filled in by the
+//!              loader -- confirm against the implementation)
+//! 
+//! Note: If *OutData is NULL this function allocates a buffer that must be freed by the caller
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, 
+                             unsigned int *w, unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+//! Save PPM image file (with unsigned char as data element type, padded to 
+//! 4 bytes)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  pointer to the image data to save
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, 
+               unsigned int w, unsigned int h);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Save PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  pointer to the image data to save
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, 
+              unsigned int w, unsigned int h); 
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     pointer to the width of the image (presumably filled in by the
+//!              loader -- confirm against the implementation)
+//! @param h     pointer to the height of the image (presumably filled in by the
+//!              loader -- confirm against the implementation)
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
+                  unsigned int *w,unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+// Command line arguments: General notes
+// * All command line arguments begin with '--' followed by the token; 
+//   token and value are separated by '='; example --samples=50
+// * Arrays have the form --model=[one.obj,two.obj,three.obj] 
+//   (without whitespace)
+////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////
+//! Check if command line argument \a flag_name is given
+//! @return shrTRUE if command line argument \a flag_name has been given, 
+//!         otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param flag_name  name of command line flag
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, 
+                     const char* flag_name);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  pointer through which the value of the command line argument is passed
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, 
+                        const char* arg_name, int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type unsigned int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  pointer through which the value of the command line argument is passed
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, 
+                        const char* arg_name, unsigned int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type float
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  pointer through which the value of the command line argument is passed
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, 
+                        const char* arg_name, float* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type string
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  pointer through which the value of the command line argument is passed
+//!             NOTE(review): ownership of the returned string is not visible here
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, 
+                          const char* arg_name, char** val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument list whose elements are strings
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  command line argument list
+//! @param len  pointer to the length of the list / number of elements
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, 
+                              const char* arg_name, char** val, 
+                              unsigned int* len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparef( const float* reference, const float* data,
+             const unsigned int len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparei( const int* reference, const int* data, 
+             const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned integer arrays, with epsilon and threshold
+//! @return shrTRUE if \a reference and \a data match within the given
+//!         tolerances, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
+            const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
+              const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with a tolerance for # of byte errors
+//! @return shrTRUE if \a reference and \a data match within the given
+//!         tolerances, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data match within \a epsilon,
+//!         otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data match within \a epsilon,
+//!         otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
+              const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality and a 
+//!     threshold for # pixel errors
+//! @return shrTRUE if \a reference and \a data match within the given
+//!         tolerances, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays using L2-norm with an epsilon tolerance for 
+//! equality
+//! @return shrTRUE if \a reference and \a data match within \a epsilon,
+//!         otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
+                const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PPM image files with an epsilon tolerance for equality
+//! @return shrTRUE if the two images match within the given tolerances,
+//!         otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  fraction of pixels that can still mismatch to pass (e.g. 0.15f = 15%)
+//!                   NOTE(review): the earlier wording said "15% must pass" -- confirm
+//!                   which sense the implementation uses
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PGM image files with an epsilon tolerance for equality
+//! @return shrTRUE if the two images match within the given tolerances,
+//!         otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  fraction of pixels that can still mismatch to pass (e.g. 0.15f = 15%)
+//!                   NOTE(review): the earlier wording said "15% must pass" -- confirm
+//!                   which sense the implementation uses
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+// NOTE(review): undocumented in this header. Presumably loads \a size bytes from
+// \a filename and returns a buffer; ownership of that buffer is not visible here -- confirm.
+extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
+
+// NOTE(review): undocumented in this header. Presumably rounds \a global_size up to a
+// multiple of \a group_size -- confirm against the implementation.
+extern "C" size_t shrRoundUp(int group_size, int global_size);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    if (iReference != iSample)
+    {
+        shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); 
+        if (pCleanup != NULL)
+        {
+            pCleanup(EXIT_FAILURE);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+}
+
+// Standardized exit: optionally wait for <Enter>, close the log, flush stderr
+// and terminate the process with iExitCode.
+// *********************************************************************
+inline void __shrExitEX(int argc, const char** argv, int iExitCode)
+{
+    // "qatest" always suppresses the prompt. WIN32 builds prompt unless "noprompt"
+    // is given; all other builds prompt only when "prompt" is given.
+#ifdef WIN32
+    const bool bWaitForEnter = !shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
+#else 
+    const bool bWaitForEnter = shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
+#endif
+
+    if (!bWaitForEnter)
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
+    }
+    else
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
+        getchar();
+    }
+
+    fflush(stderr);
+    exit(iExitCode);
+}
+
+#endif
--- a/benchmarks/opencl/VectorHypot/Makefile
+++ b/benchmarks/opencl/VectorHypot/Makefile
@ -0,0 +1,66 @@
+# Makefile for the VectorHypot OpenCL benchmark (Vortex RISC-V target and QEMU).
+PROJECT = VectorHypot
+
+# Tool and library locations; $(wildcard ...) expands to nothing when the path is absent.
+RISCV_TOOL_PATH := $(wildcard ~/dev/riscv-gnu-toolchain/drops)
+POCL_CC_PATH    := $(wildcard ~/dev/pocl/drops_riscv_cc)
+POCL_INC_PATH   := $(wildcard ../include)
+POCL_LIB_PATH   := $(wildcard ../lib)
+VX_RT_PATH      := $(wildcard ../../../runtime)
+VX_SIMX_PATH    := $(wildcard ../../../simX/obj_dir)
+
+CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
+CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
+DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
+HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
+GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
+
+# Runtime sources compiled into the bare-metal image
+VX_SRCS =  $(VX_RT_PATH)/newlib/newlib.c
+VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
+VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
+VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
+VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
+VX_SRCS += $(VX_RT_PATH)/tests/tests.c
+VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
+# Not defined in this Makefile; empty unless supplied by the caller or environment
+VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
+
+VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
+
+CXXFLAGS =  -g -O0 -march=rv32im -mabi=ilp32
+# program may not begin at main()
+CXXFLAGS += -ffreestanding
+# enable garbage collection of unused input sections
+CXXFLAGS += -Wl,--gc-sections
+# disable RTTI and exceptions
+CXXFLAGS += -fno-rtti -fno-non-call-exceptions
+CXXFLAGS += -I$(POCL_INC_PATH) -I.
+
+VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
+QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
+
+# Remove a half-written target (e.g. the redirected .dump) when its recipe fails
+.DELETE_ON_ERROR:
+
+# These targets are commands, not files
+.PHONY: all run qemu gdb-s gdb-c clean
+
+all: $(PROJECT).dump $(PROJECT).hex
+
+# Compile the kernel named as the prerequisite (the recipe used to hard-code kernel.cl)
+lib$(PROJECT).a: VectorHypot.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<
+
+# Relink when the host program, the kernel archive or any runtime source changes
+$(PROJECT).elf: main.cc lib$(PROJECT).a $(VX_SRCS)
+	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $@
+
+$(PROJECT).qemu: main.cc lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $@
+
+$(PROJECT).hex: $(PROJECT).elf
+	$(HEX) -O ihex $< $@
+
+$(PROJECT).dump: $(PROJECT).elf
+	$(DMP) -D $< > $@
+
+run: $(PROJECT).hex
+	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
+
+qemu: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
+
+gdb-s: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
+
+gdb-c: $(PROJECT).qemu
+	$(GDB) $(PROJECT).qemu
+
+# Remove everything this Makefile produces, including the kernel archive and run logs
+clean:
+	rm -rf *.elf *.dump *.hex *.qemu lib$(PROJECT).a emulator.debug debug.log
--- a/benchmarks/opencl/VectorHypot/VectorHypot.cl
+++ b/benchmarks/opencl/VectorHypot/VectorHypot.cl
@ -0,0 +1,41 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+// OpenCL kernel: naive element-wise hypotenuse of two float4 arrays.
+// The result is recomputed iInnerLoopCount times to lengthen the kernel's run time.
+__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
+{
+    // index of this work item into the global arrays, shifted by the caller's offset
+    const size_t szIndex = get_global_id(0) + uiOffset;
+
+    // work items outside the bounds do nothing
+    if (szIndex < uiNumElements)
+    {
+        // each work item handles one float4 (4 values) from each source array
+        const float4 f4LegA = fg4A[szIndex];
+        const float4 f4LegB = fg4B[szIndex];
+        float4 f4Result = (float4)0.0f;
+
+        // repeat the same computation to exaggerate the time needed
+        int iPass = 0;
+        while (iPass < iInnerLoopCount)
+        {
+            // the 4 hypotenuses, one component at a time, via the built-in function
+            f4Result.x = hypot (f4LegA.x, f4LegB.x);
+            f4Result.y = hypot (f4LegA.y, f4LegB.y);
+            f4Result.z = hypot (f4LegA.z, f4LegB.z);
+            f4Result.w = hypot (f4LegA.w, f4LegB.w);
+            ++iPass;
+        }
+
+        // store the 4 results back to global memory
+        fg4Hypot[szIndex] = f4Result;
+    }
+}
--- a/benchmarks/opencl/VectorHypot/main.cc
+++ b/benchmarks/opencl/VectorHypot/main.cc
@ -0,0 +1,686 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// *********************************************************************
+// oclCopyComputeOverlap Notes:  
+//
+// OpenCL API demo application for NVIDIA CUDA GPU's that implements an
+// element by element vector hypotenuse computation using 2 input float arrays
+// and 1 output float array.
+//
+// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
+// with respect to GPU computation (and with respect to host thread). 
+//
+// Because the overlap achievable for this computation and data set on a given system depends upon the GPU being used and the
+// GPU/Host bandwidth, the sample adjusts the computation duration to test the most ideal case and test against a consistent standard.
+// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
+//
+// After setup, warmup and calibration to the system, the sample runs 4 scenarios:  
+//      A) Computations with 2 command queues on GPU
+//         A multiple-cycle sequence is executed, timed and compared against the host
+//      B) Computations with 1 command queue on GPU
+//         A multiple-cycle sequence is executed, timed and compared against the host
+//
+//      The 2-command queue approach ought to be substantially faster
+//
+// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently 
+// increases compute time without increasing data size (via a loop inside the kernel)
+//
+//      At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
+//      (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
+//
+//      If we name the time to copy a single input vector H2D (or output vector D2H) as "T", then the optimum comparison case is:
+//        
+//          Single Queue with all the data and all the work  
+//             Ttot (serial)        = 4T + 4T + 2T      = 10T    
+//
+//          Dual Queue, where each queue has 1/2 the data and 1/2 the work 
+//             Tq0  (overlap)       = 2T + 2T + T .... 
+//             Tq1  (overlap)       = .... 2T + 2T + T
+//
+//             Ttot (elapsed, wall) = 2T + 2T + 2T + T  = 7T
+//
+//          Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 %	(Tesla arch 1.2 or 1.3, single copy engine)
+//
+//			For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
+//			This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.			
+// *********************************************************************
+
+// common SDK header for standard utilities and system libs 
+#include <oclUtils.h>
+#include <shrQATest.h>
+
+// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
+// values greater than 0.0f represent a speed-up relative to non-overlapped
+#define EXPECTED_OVERLAP 30.0f
+#define EXPECTED_OVERLAP_FERMI 45.0f
+#define PASS_FACTOR 0.60f
+#define RETRIES_ON_FAILURE 1
+
+// Base sizes for parameters manipulated dynamically or on the command line
+#define BASE_WORK_ITEMS 64
+#define BASE_ARRAY_LENGTH 40000
+#define BASE_LOOP_COUNT 32
+
+// Vars
+// *********************************************************************
+cl_platform_id cpPlatform;                          // OpenCL platform
+cl_context cxGPUContext;                            // OpenCL context
+cl_command_queue cqCommandQueue[2];                 // OpenCL command queues
+cl_device_id* cdDevices;                            // OpenCL device list  
+cl_program cpProgram;                               // OpenCL program
+cl_kernel ckKernel[2];                              // OpenCL kernel, 1 per queue
+cl_mem cmPinnedSrcA;                                // OpenCL pinned host source buffer A
+cl_mem cmPinnedSrcB;                                // OpenCL pinned host source buffer B 
+cl_mem cmPinnedResult;                              // OpenCL pinned host result buffer 
+float* fSourceA = NULL;                             // Mapped pointer for pinned Host source A buffer
+float* fSourceB = NULL;                             // Mapped pointer for pinned Host source B buffer
+float* fResult = NULL;                              // Mapped pointer for pinned Host result buffer 
+cl_mem cmDevSrcA;                                   // OpenCL device source buffer A
+cl_mem cmDevSrcB;                                   // OpenCL device source buffer B 
+cl_mem cmDevResult;                                 // OpenCL device result buffer 
+size_t szBuffBytes;                                 // Size of main buffers
+size_t szGlobalWorkSize;                            // 1D var for Total # of work items in the launched ND range
+size_t szLocalWorkSize = BASE_WORK_ITEMS;           // initial # of work items in the work group	
+cl_int ciErrNum;			                        // Error code var
+char* cPathAndName = NULL;                          // Var for full paths to data, src, etc.
+char* cSourceCL = NULL;                             // Buffer to hold source for compilation 
+const char* cExecutableName = NULL;                 // Set to argv[0] at the start of main()
+
+// demo config vars
+const char* cSourceFile = "VectorHypot.cl";         // OpenCL computation kernel source code
+float* Golden = NULL;                               // temp buffer to hold golden results for cross check    
+bool bNoPrompt = false;                             // Command line switch to skip exit prompt
+bool bQATest = false;                               // Command line switch to test
+
+// Forward Declarations
+// *********************************************************************
+// NOTE(review): the bodies are not visible here. Going by the names and the notes at the
+// top of this file, the two *QueueSequence functions appear to be the timed 2-queue and
+// 1-queue GPU runs, and VectorHypotHost the host-side counterpart of the kernel -- confirm.
+double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
+double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
+int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles); 
+void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
+void Cleanup (int iExitCode);
+// Function pointer to Cleanup, in the void(*)(int) form taken by the error-check helpers
+void (*pCleanup)(int) = &Cleanup;
+
+// Pointers to main()'s argc and argv; main() assigns both before doing anything else
+int *gp_argc = 0;
+const char *** gp_argv = NULL;
+
+// Main function 
+// *********************************************************************
+int main(int argc, const char **argv)
+{
+    //Locals
+    size_t szKernelLength;                      // Byte size of kernel code
+    double dBuildTime;                          // Compile time
+    cl_uint uiTargetDevice = 0;	                // Default Device to compute on
+    cl_uint uiNumDevsUsed = 1;                  // Number of devices used in this sample   
+    cl_uint uiNumDevices;                       // Number of devices available 
+    int iDevCap = -1;                           // Capability of device
+    int iInnerLoopCount = BASE_LOOP_COUNT;      // Varies "compute intensity" per data within the kernel 
+    const int iTestCycles = 10;                 // How many times to run the external test loop 
+    const int iWarmupCycles = 8;                // How many times to run the warmup sequence 
+    cl_uint uiWorkGroupMultiple = 4;            // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
+    cl_uint uiNumElements = BASE_ARRAY_LENGTH;  // initial # of elements per array to process (note: procesing 4 per work item)
+    cl_uint uiSizeMultiple = 4;                 // Command line var (using "sizemult=<n>") to optionally increase vector sizes
+    bool bPassFlag = false;                     // Var to accumulate test pass/fail
+    shrBOOL bMatch = shrFALSE;                  // Cross check result
+	shrBOOL bTestOverlap = shrFALSE;
+	double dAvgGPUTime[2] = {0.0, 0.0};         // Average time of iTestCycles calls for 2-Queue and 1-Queue test
+    double dHostTime[2] = {0.0, 0.0};           // Host computation time (2nd test is redundant but a good stability indicator)
+    float fMinPassCriteria[2] = {0.0f, 0.0f};	// Test pass cireria, adjusted dependant on GPU arch
+
+    gp_argc = &argc;
+    gp_argv = &argv;
+
+    shrQAStart(argc, (char **)argv);
+
+    // start logs 
+	cExecutableName = argv[0];
+    shrSetLogFileName ("oclCopyComputeOverlap.txt");
+    shrLog("%s Starting...\n\n", argv[0]); 
+
+    // get basic command line args 
+    bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
+    bQATest   = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
+    shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
+
+    // Optional Command-line multiplier for vector size 
+    //   Default val of 4 gives 10.24 million float elements per vector
+    //   Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
+    shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
+    uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);  
+    uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
+    shrLog("Array sizes = %u float elements\n", uiNumElements); 
+
+    // Optional Command-line multiplier for workgroup size (x 64 work items)
+    //   Default val of 4 gives szLocalWorkSize of 256.
+    //   Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
+    shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
+    uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10); 
+    szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
+    shrLog("Workgroup Size = %u\n\n", szLocalWorkSize); 
+
+    // Get the NVIDIA platform if available, otherwise use default
+    shrLog("Get the Platform ID...\n\n");
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Get OpenCL platform name and version
+    char cBuffer[256];
+    ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("Platform Name = %s\n\n", cBuffer);
+
+    // Get all the devices
+    shrLog("Get the Device info and select Device...\n");
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
+
+    // Ethans changes
+    CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+    CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+    //ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Set target device and check capabilities 
+    shrLog(" # of Devices Available = %u\n", uiNumDevices); 
+    uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
+    shrLog(" Using Device %u, ", uiTargetDevice); 
+    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);  
+    iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
+    if (iDevCap > 0) {
+       shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
+    } else {
+       shrLog("\n\n", iDevCap); 
+    }
+    if (strstr(cBuffer, "NVIDIA") != NULL)
+    {
+        if (iDevCap < 12)
+        {
+            shrLog("Device doesn't have overlap capability.  Skipping test...\n"); 
+            Cleanup (EXIT_SUCCESS);
+        }
+
+		// Device and Platform eligible for overlap testing
+		bTestOverlap = shrTRUE;
+
+        // If device has overlap capability, proceed
+        fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP;               // 1st cycle overlap is same for 1 or 2 copy engines
+        if (iDevCap != 20) 
+        {
+            // Single copy engine
+            fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP;            // avg of many cycles
+        }
+        else 
+        {
+            char cDevName[1024];
+            clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
+            if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
+            {
+                // Tesla or Quadro (arch = 2.0) ... Dual copy engine 
+                fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI;  // average of many cycles
+            }
+            else
+            {
+                // Geforce ... Single copy engine
+                fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP;        // average of many cycles
+            }
+        }
+    }   
+
+    // Create the context
+    shrLog("clCreateContext...\n"); 
+    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    
+    // Create 2 command-queues
+    cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateCommandQueue [0]...\n"); 
+    cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateCommandQueue [1]...\n"); 
+
+    // Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
+    szBuffBytes = sizeof(cl_float) * uiNumElements;
+    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements); 
+
+    // Allocate pinned source and result host buffers:  
+    //   Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
+    cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements); 
+
+    // Get mapped pointers to pinned input host buffers
+    //   Note:  This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
+    fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n"); 
+
+    // Alloc temp golden buffer for cross checks
+    Golden = (float*)malloc(szBuffBytes);
+    //oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
+
+    // Read the OpenCL kernel in from source file
+    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
+    //oclCheckError(cPathAndName != NULL, shrTRUE);
+    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
+   // oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
+
+    // Create the program object
+    //cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateProgramWithSource...\n"); 
+    cl_program program =
+      clCreateProgramWithBuiltInKernels(context, 1, &device_id, "VectorHypot", NULL);
+    // Build the program for the target device
+    clFinish(cqCommandQueue[0]);
+    shrDeltaT(0);
+    ciErrNum = clBuildProgram(program, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
+    shrLog("clBuildProgram..."); 
+    if (ciErrNum != CL_SUCCESS)
+    {
+        // write out standard error, Build Log and PTX, then cleanup and exit
+        shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
+        oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
+        oclLogPtx(program, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
+        Cleanup(EXIT_FAILURE);
+    }
+    dBuildTime = shrDeltaT(0);
+
+    // Ethan - Kernel Addition
+
+    if (program == NULL) {
+        std::cerr << "Failed to write program binary" << std::endl;
+        Cleanup(context, queue, program, kernel, memObjects);
+        return 1;
+    } else {
+        std::cout << "Read program from binary." << std::endl;
+    }
+
+    // Create the kernel
+    ckKernel[0] = clCreateKernel(program, "VectorHypot", &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    ckKernel[1] = clCreateKernel(program, "VectorHypot", &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateKernel (ckKernel[2])...\n"); 
+
+    // Offsets for 2 queues
+    cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
+
+    // Set the Argument values for the 1st kernel instance (queue 0)
+    ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
+    shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n"); 
+
+    // Set the Argument values for the 2d kernel instance (queue 1)
+    ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
+    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n"); 
+
+    //*******************************************
+    // Warmup the driver with dual queue sequence
+    //*******************************************
+
+    // Warmup with dual queue sequence for iTestCycles
+    shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
+    DualQueueSequence(iWarmupCycles, uiNumElements, false);
+
+    // Use single queue config to adjust compute intensity 
+    shrLog("Adjust compute for GPU / system...\n");
+    iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles); 
+    shrLog("  Kernel inner loop count = %d\n", iInnerLoopCount); 
+
+    //*******************************************
+    // Run and time with 2 command-queues
+    //*******************************************
+	for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
+	
+	// Run the sequence iTestCycles times
+    dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
+
+    // Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) 
+    shrLog("  Device vs Host Result Comparison\t: "); 
+    VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);   
+    shrDeltaT(0);
+    for (int i = 0; i < iTestCycles; i++)
+    {
+        VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);   
+    }
+    dHostTime[0] = shrDeltaT(0)/iTestCycles;
+
+    // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) 
+    bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
+    shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); 
+    bPassFlag = (bMatch == shrTRUE);
+
+    //*******************************************
+    // Run and time with 1 command queue
+    //*******************************************
+    // Run the sequence iTestCycles times
+    dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
+
+    // Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) 
+    shrLog("  Device vs Host Result Comparison\t: "); 
+    shrDeltaT(0);
+    for (int i = 0; i < iTestCycles; i++)
+    {
+        VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);   
+    }
+    dHostTime[1] = shrDeltaT(0)/iTestCycles;
+
+    // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) 
+    bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
+    shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); 
+    bPassFlag &= (bMatch == shrTRUE);
+
+    //*******************************************
+
+    // Compare Single and Dual queue timing 
+    shrLog("\nResult Summary:\n"); 
+
+    // Log GPU and CPU Time for 2-queue scenario
+    shrLog("  Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
+    shrLog("  Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
+
+    // Log GPU and CPU Time for 1-queue scenario
+    shrLog("  Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
+    shrLog("  Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
+
+    // Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
+    double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
+    
+	if( bTestOverlap ) {
+		bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
+		if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
+			shrLog("  Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%)  -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
+
+			// Log info to master log in standard format
+			shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", 
+			dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize); 
+			
+			bPassFlag &= bAvgOverlapOK;
+			break;
+		}
+	} 
+
+		shrLog("  Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%)  -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
+	}
+
+
+    //*******************************************
+    // Report pass/fail, cleanup and exit
+    Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+// Run 1 queue sequence for n cycles
+// *********************************************************************
+// Runs the write -> kernel -> read sequence iCycles times on command-queue 0 only.
+//
+// Parameters:
+//   iCycles        number of times the sequence is enqueued (callers in this file pass a positive count)
+//   uiNumElements  number of float elements in each source/result array
+//   bShowConfig    when true, logs the work sizes used for this run
+//
+// Returns the elapsed time reported by shrDeltaT(0) divided by iCycles.
+//
+// Side effects: refills the fSourceA/fSourceB host arrays, overwrites the file-level
+// szGlobalWorkSize and ciErrNum, and leaves the last results in fResult.
+double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
+{
+    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
+    shrFillArray(fSourceA, (int)uiNumElements);
+    shrFillArray(fSourceB, (int)uiNumElements);
+
+    // Reset Global work size for 1 command-queue (element count / 4, rounded up to a multiple of the local size)
+    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
+
+    // *** Make sure queues are empty and then start timer
+    double dAvgTime = 0.0;
+    clFinish(cqCommandQueue[0]);
+    clFinish(cqCommandQueue[1]);
+    shrDeltaT(0);
+
+    // Run the sequence iCycles times
+    for (int i = 0; i < iCycles; i++)
+    {
+        // Nonblocking Write of all of input data from host to device in command-queue 0
+        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
+        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Launch kernel computation, command-queue 0
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+
+        // Non Blocking Read of output data from device to host, command-queue 0.
+        // The status is OR-ed into the launch status (instead of overwriting it) so that a
+        // failed kernel launch is reported by the check below rather than silently dropped.
+        ciErrNum |= clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
+        clFlush(cqCommandQueue[0]);
+    }
+
+    // *** Assure sync to host and return average sequence time
+    clFinish(cqCommandQueue[0]);
+    dAvgTime = shrDeltaT(0)/(double)iCycles;
+
+    // Log config if asked for
+    if (bShowConfig)
+    {
+        // The work sizes are size_t; cast them so the variadic arguments match the %u specifiers
+        shrLog("\n1-Queue sequence Configuration:\n");
+        shrLog("  Global Work Size (per command-queue)\t= %u\n  Local Work Size \t\t\t= %u\n  # of Work Groups (per command-queue)\t= %u\n  # of command-queues\t\t\t= 1\n",
+           (unsigned int)szGlobalWorkSize, (unsigned int)szLocalWorkSize, (unsigned int)(szGlobalWorkSize/szLocalWorkSize));
+    }
+    return dAvgTime;
+}
+
+// Run 2 queue sequence for n cycles
+// *********************************************************************
+// Runs the copy/compute sequence iCycles times, split in halves across command-queues 0 and 1.
+// Queue 0 handles the first half of each buffer and queue 1 the second half, with the
+// enqueue order interleaved between the two queues.
+//
+// Parameters:
+//   iCycles        number of times the sequence is enqueued (callers in this file pass a positive count)
+//   uiNumElements  number of float elements in each source/result array
+//   bShowConfig    when true, logs the work sizes used for this run
+//
+// Returns the elapsed time reported by shrDeltaT(0) divided by iCycles.
+//
+// Side effects: refills the fSourceA/fSourceB host arrays, overwrites the file-level
+// szGlobalWorkSize and ciErrNum, and leaves the last results in fResult.
+double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
+{
+    // Locals
+    size_t szHalfBuffer = szBuffBytes / 2;                  // byte size (and byte offset) of each half
+    size_t szHalfOffset = szHalfBuffer / sizeof(float);     // same offset expressed in float elements
+    double dAvgTime = 0.0;
+
+    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
+    shrFillArray(fSourceA, (int)uiNumElements);
+    shrFillArray(fSourceB, (int)uiNumElements);
+
+    // Set Global work size for 2 command-queues (half of the single-queue item count, rounded up to the local size)
+    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
+
+    // Make sure queues are empty and then start timer
+    clFinish(cqCommandQueue[0]);
+    clFinish(cqCommandQueue[1]);
+    shrDeltaT(0);
+
+    for (int i = 0; i < iCycles; i++)
+    {
+        // Mid Phase 0
+        // Nonblocking Write of 1st half of input data from host to device in command-queue 0
+        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
+        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
+        // (not necessary on Linux, Mac OSX or WinXP)
+        clFlush(cqCommandQueue[0]);
+        clFlush(cqCommandQueue[1]);
+
+        // Start Phase 1 ***********************************
+
+        // Launch kernel computation, command-queue 0
+        // (Note:  The order MATTERS here on Fermi !  THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+        // Nonblocking Write of 2nd half of input data from host to device in command-queue 1
+        // (Note:  The order MATTERS here on Fermi !  THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
+        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
+        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Push out the compute for queue 0 and write for queue 1 to the driver
+        // (not necessary on Linux, Mac OSX or WinXP)
+        clFlush(cqCommandQueue[0]);
+        clFlush(cqCommandQueue[1]);
+
+        // Start Phase 2 ***********************************
+
+        // Launch kernel computation, command-queue 1
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+
+        // Non Blocking Read of 1st half of output data from device to host, command-queue 0.
+        // The status is OR-ed into the queue-1 launch status (instead of overwriting it) so that a
+        // failed kernel launch is reported by the check below rather than silently dropped.
+        ciErrNum |= clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Push out the compute for queue 1 and the read for queue 0 to the driver
+        // (not necessary on Linux, Mac OSX or WinXP)
+        clFlush(cqCommandQueue[0]);
+        clFlush(cqCommandQueue[1]);
+
+        // Start Phase 0 (Rolls over) ***********************************
+
+        // Non Blocking Read of 2nd half of output data from device to host, command-queue 1
+        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+    }
+
+    // *** Sync to host and get average sequence time
+    clFinish(cqCommandQueue[0]);
+    clFinish(cqCommandQueue[1]);
+    dAvgTime = shrDeltaT(0)/(double)iCycles;
+
+    // Log config if asked for
+    if (bShowConfig)
+    {
+        // The work sizes are size_t; cast them so the variadic arguments match the %u specifiers
+        shrLog("\n2-Queue sequence Configuration:\n");
+        shrLog("  Global Work Size (per command-queue)\t= %u\n  Local Work Size \t\t\t= %u\n  # of Work Groups (per command-queue)\t= %u\n  # of command-queues\t\t\t= 2\n",
+           (unsigned int)szGlobalWorkSize, (unsigned int)szLocalWorkSize, (unsigned int)(szGlobalWorkSize/szLocalWorkSize));
+    }
+
+    return dAvgTime;
+}
+
+// Function to adjust compute task according to device capability
+// This allows a consistent overlap % across a wide variety of GPU's for test purposes
+// It also implicitly illustrates the relationship between compute capability and overlap at fixed work size
+// *********************************************************************
+int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
+{
+    // Refresh the contents of both pinned host source arrays
+    shrFillArray(fSourceA, (int)uiNumElements);
+    shrFillArray(fSourceB, (int)uiNumElements);
+
+    // Single-queue global work size: element count / 4, rounded up to a multiple of the local size
+    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
+
+    // Drain both queues before restarting the timer
+    clFinish(cqCommandQueue[0]);
+    clFinish(cqCommandQueue[1]);
+    shrDeltaT(0);
+
+    // Phase 1: enqueue iCycles rounds of full host-to-device writes on queue 0 and time them
+    int iPass = 0;
+    while (iPass < iCycles)
+    {
+        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
+        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
+        ciErrNum |= clFlush(cqCommandQueue[0]);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+        ++iPass;
+    }
+    clFinish(cqCommandQueue[0]);
+    const double dCopyTime = shrDeltaT(0);
+
+    // Phase 2: enqueue iCycles kernel launches on queue 0 and time them
+    iPass = 0;
+    while (iPass < iCycles)
+    {
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+        ciErrNum |= clFlush(cqCommandQueue[0]);
+        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+        ++iPass;
+    }
+    clFinish(cqCommandQueue[0]);
+    double dComputeTime = shrDeltaT(0);
+
+    // Keep the divisor away from zero, then scale the initial loop count by the copy/compute time ratio
+    dComputeTime = MAX(dComputeTime, 1.0e-6);
+    int iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
+
+    // Install the new loop count as argument 4 of both kernel instances
+    ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
+    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    return iComputedLoopCount;
+}
+
+// Cleanup/Exit function 
+// *********************************************************************
+void Cleanup (int iExitCode)
+{
+    shrLog("Starting Cleanup...\n\n");
+
+    // Buffers released with free()
+    if (cPathAndName) { free(cPathAndName); }
+    if (cSourceCL)    { free(cSourceCL); }
+    if (Golden)       { free(Golden); }
+
+    // Both kernel instances, then the program
+    for (int iKernel = 0; iKernel < 2; ++iKernel)
+    {
+        if (ckKernel[iKernel]) { clReleaseKernel(ckKernel[iKernel]); }
+    }
+    if (program) { clReleaseProgram(program); }
+
+    // Unmap the host pointers obtained from the pinned buffers (enqueued on queue 0)
+    if (fSourceA) { clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL); }
+    if (fSourceB) { clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL); }
+    if (fResult)  { clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL); }
+
+    // Device buffers, then pinned host buffers
+    if (cmDevSrcA)      { clReleaseMemObject(cmDevSrcA); }
+    if (cmDevSrcB)      { clReleaseMemObject(cmDevSrcB); }
+    if (cmDevResult)    { clReleaseMemObject(cmDevResult); }
+    if (cmPinnedSrcA)   { clReleaseMemObject(cmPinnedSrcA); }
+    if (cmPinnedSrcB)   { clReleaseMemObject(cmPinnedSrcB); }
+    if (cmPinnedResult) { clReleaseMemObject(cmPinnedResult); }
+
+    // Both command queues, then the context and the device list
+    for (int iQueue = 0; iQueue < 2; ++iQueue)
+    {
+        if (cqCommandQueue[iQueue]) { clReleaseCommandQueue(cqCommandQueue[iQueue]); }
+    }
+    if (cxGPUContext) { clReleaseContext(cxGPUContext); }
+    if (cdDevices)    { free(cdDevices); }
+
+    // Report the overall QA status: passed only when the exit code is EXIT_SUCCESS
+    shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
+}
+
+// "Golden" Host processing vector hypotenuse function for comparison purposes
+// *********************************************************************
+void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
+{
+    // iInnerLoopCount is part of the signature but does not affect the host result
+    (void)iInnerLoopCount;
+
+    // For every element pair, store sqrt(a^2 + b^2) in the result array
+    for (unsigned int uiIdx = 0; uiIdx != uiNumElements; ++uiIdx)
+    {
+        const float fSide1 = pfData1[uiIdx];
+        const float fSide2 = pfData2[uiIdx];
+        pfResult[uiIdx] = sqrtf(fSide1 * fSide1 + fSide2 * fSide2);
+    }
+}
--- a/benchmarks/opencl/VectorHypot/oclUtils.h
+++ b/benchmarks/opencl/VectorHypot/oclUtils.h
@ -0,0 +1,198 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+#ifndef OCL_UTILS_H
+#define OCL_UTILS_H
+
+// *********************************************************************
+// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// Common headers:  Cross-API utilities and OpenCL header
+#include <shrUtils.h>
+
+// All OpenCL headers
+#if defined (__APPLE__) || defined(MACOSX)
+    #include <OpenCL/opencl.h>
+#else
+    #include <CL/opencl.h>
+#endif 
+
+// Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// For systems with CL_EXT that are not updated with these extensions, we copied these
+// extensions from <CL/cl_ext.h>
+#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+  /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+  #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+  #define CL_DEVICE_WARP_SIZE_NV                      0x4003
+  #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+  #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+  #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#endif
+
+// reminders for build output window and log
+#ifdef _WIN32
+    #pragma message ("Note: including shrUtils.h")
+    #pragma message ("Note: including opencl.h")
+#endif
+
+// SDK Revision #
+#define OCL_SDKREVISION "7027912"
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) 
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
+//!
+//! @return the id 
+//! @param clSelectedPlatformID         OpenCL platform ID
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print info about the device
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and return device capability
+//!
+//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" int oclGetDevCap(cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print the device name
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the first device from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the id or -1 when out of range
+//! @param cxGPUContext         OpenCL context
+//! @param device_idx            index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of device with maximal FLOPS from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file and prepends the cPreamble to the code.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get the binary (PTX) of the program associated with the device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//! @param binary       returned code
+//! @param length       length of returned code
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram                   OpenCL program
+//! @param cdDevice                    device of interest
+//! @param const char*  cPtxFileName   optional PTX file name
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the Build Log from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
+
+// Helper function for De-allocating cl objects
+// *********************************************************************
+extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
+
+// Helper function to get OpenCL error string from constant
+// *********************************************************************
+extern "C" const char* oclErrorString(cl_int error);
+
+// Helper function to get OpenCL image format string (channel order and type) from constant
+// *********************************************************************
+extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // Nothing to do when the sample/test value agrees with the reference
+    if (iSample == iReference)
+    {
+        return;
+    }
+
+    // A mismatch is an error by definition, so a 0 sample/test value is replaced with a non-zero code
+    if (iSample == 0)
+    {
+        iSample = -9999;
+    }
+
+    // Report the error code, its string form and where the check was made
+    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
+
+    // Without a cleanup callback: close the log and exit with the error code as the process exit code
+    if (pCleanup == NULL)
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+        exit(iSample);
+    }
+
+    // Otherwise hand the error code to the supplied cleanup callback
+    pCleanup(iSample);
+}
+
+#endif
--- a/benchmarks/opencl/VectorHypot/shrQATest.h
+++ b/benchmarks/opencl/VectorHypot/shrQATest.h
@ -0,0 +1,238 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_QATEST_H
+#define SHR_QATEST_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// OS dependent includes
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+    #pragma message ("Note: including time.h")
+
+// Headers needed for Windows
+    #include <windows.h>
+	#include <time.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+    #include <unistd.h>
+    #include <time.h>
+#endif
+
+#ifndef STRCASECMP
+#ifdef _WIN32
+#define STRCASECMP _stricmp
+#else
+#define STRCASECMP strcasecmp
+#endif
+#endif
+
+#ifndef STRNCASECMP
+#ifdef _WIN32
+#define STRNCASECMP _strnicmp
+#else
+#define STRNCASECMP strncasecmp
+#endif
+#endif
+
+
+// Standardized QA Start/Finish for CUDA SDK tests
+#define shrQAStart(a, b)      __shrQAStart(a, b)
+#define shrQAFinish(a, b, c)  __shrQAFinish(a, b, c)
+#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
+
+// Returns the index in exec_name of the first character after the last path
+// separator ('\\' or '/'), or 0 when the string contains no separator.
+inline int findExeNameStart(const char *exec_name)
+{
+    // Walk backwards from the terminating NUL until a separator is found
+    for (int pos = (int)strlen(exec_name); pos >= 0; --pos)
+    {
+        const char ch = exec_name[pos];
+        if (ch == '\\' || ch == '/')
+        {
+            // The name starts right after the separator
+            return pos + 1;
+        }
+    }
+
+    // No separator anywhere: the whole string is the executable name
+    return 0;
+}
+
+// Prints the standardized start-of-test banner to stdout.
+//   argc, argv : command line as passed to main()
+// Returns the index within argv[0] at which the executable name (without its path) starts.
+inline int __shrQAStart(int argc, char **argv)
+{
+    // First clear the output buffer
+    fflush(stdout);
+    fflush(stdout);
+
+    // Look for a "qatest" switch: leading dashes are skipped and the comparison ignores case
+    bool bQATest = false;
+    for (int iArg = 1; iArg < argc; ++iArg)
+    {
+        char *pcOption = argv[iArg];
+        while (*pcOption == '-')
+        {
+            ++pcOption;
+        }
+        if (STRCASECMP(pcOption, "qatest") == 0)
+        {
+            bQATest = true;
+        }
+    }
+
+    // We don't want to print the entire path, so we locate where the executable name starts
+    const int exename_start = findExeNameStart(argv[0]);
+    const char *pcExeName = argv[0] + exename_start;
+
+    if (bQATest)
+    {
+        // QA mode: marker line followed by every command line argument
+        fprintf(stdout, "&&&& RUNNING %s", pcExeName);
+        for (int iArg = 1; iArg < argc; ++iArg)
+        {
+            fprintf(stdout, " %s", argv[iArg]);
+        }
+        fprintf(stdout, "\n");
+    }
+    else
+    {
+        fprintf(stdout, "[%s] starting...\n", pcExeName);
+    }
+    fflush(stdout);
+
+    printf("\n");
+    fflush(stdout);
+
+    return exename_start;
+}
+
+// Test outcome codes. The numeric values line up with the indices of the
+// { "FAILED", "PASSED", "WAIVED" } label tables used by __shrQAFinish / __shrQAFinish2.
+enum eQAstatus {
+    QA_FAILED = 0,
+    QA_PASSED = 1,
+    QA_WAIVED = 2
+};
+
+// Prints a one-tick-per-second countdown to stdout and returns once `seconds`
+// seconds of wall-clock time have passed. A non-positive `seconds` skips the wait loop.
+inline void __ExitInTime(int seconds)
+{
+    fprintf(stdout, "> exiting in %d seconds: ", seconds);
+    fflush(stdout);
+    time_t t;
+    int count;
+    // t is the wall-clock deadline; count is only the number shown to the user
+    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
+        fprintf(stdout, "%d...", count);
+        // Flush every tick so the countdown is visible while waiting, even when stdout is buffered
+        fflush(stdout);
+        // Test the compiler-defined _WIN32 (as the include section of this header does) rather than
+        // WIN32, which is only defined when the project settings happen to add it
+#ifdef _WIN32
+        Sleep(1000);
+#else
+        sleep(1);
+#endif
+    }
+    fprintf(stdout,"done!\n\n"); 
+    fflush(stdout);
+}
+
+
+// Prints the standardized end-of-test banner to stdout and then optionally waits before returning.
+//   argc, argv : command line as passed to main(); argv[0] supplies the executable name
+//   iStatus    : index into the status labels (0 = FAILED, 1 = PASSED, 2 = WAIVED);
+//                any other value is reported as FAILED
+// Recognized switches (leading '-' characters are skipped, comparison ignores case):
+//   qatest         : print the "&&&& <STATUS> <exe> <args>" line instead of the bracketed form
+//   noprompt, help : return without the timed delay and without prompting
+//   prompt         : wait for <Enter> instead of the timed delay
+inline void __shrQAFinish(int argc, const char **argv, int iStatus)
+{
+    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
+    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    // iStatus is caller supplied: never index outside the label table (or hand the NULL sentinel to %s).
+    // An unrecognized status is conservatively reported as FAILED.
+    const int iNumStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0])) - 1;
+    const char *sResult = (iStatus >= 0 && iStatus < iNumStatus) ? sStatus[iStatus] : sStatus[0];
+
+    for (int i=1; i < argc; i++) {
+        // Skip any leading dashes so "-qatest" and "--qatest" are treated alike
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }
+        // For SDK individual samples that don't specify -noprompt or -prompt, 
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bNoPrompt = true;
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bNoPrompt = false;
+            bQuitInTime = false;
+        }
+    }
+
+    // Only the executable name is printed, not its full path
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sResult, &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sResult);
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    } else {
+        if (!bNoPrompt) {
+            fprintf(stdout, "\nPress <Enter> to exit...\n");
+            fflush(stdout);
+            getchar();
+        }
+    }
+}
+
+// Variant of the end-of-test banner in which the caller states explicitly whether QA formatting is wanted.
+//   bQATest    : true prints the "&&&& <STATUS> <exe> <args>" line, false the bracketed form
+//   argc, argv : command line as passed to main(); argv[0] supplies the executable name
+//   iStatus    : index into the status labels (0 = FAILED, 1 = PASSED, 2 = WAIVED);
+//                any other value is reported as FAILED
+// The switches noprompt, help and prompt (leading '-' skipped, case ignored) suppress the timed delay.
+inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
+{
+    bool bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    // iStatus is caller supplied: never index outside the label table (or hand the NULL sentinel to %s).
+    // An unrecognized status is conservatively reported as FAILED.
+    const int iNumStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0])) - 1;
+    const char *sResult = (iStatus >= 0 && iStatus < iNumStatus) ? sStatus[iStatus] : sStatus[0];
+
+    for (int i=1; i < argc; i++) {
+        // Skip any leading dashes so "-prompt" and "--prompt" are treated alike
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        // For SDK individual samples that don't specify -noprompt or -prompt, 
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bQuitInTime = false;
+        }
+    }
+
+    // Only the executable name is printed, not its full path
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sResult, &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sResult);
+    }
+    fflush(stdout);
+    
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    }
+}
+
+// Prints the end-of-test banner via __shrQAFinish and then terminates the process.
+// A status of 0 exits with EXIT_FAILURE; every non-zero status exits with EXIT_SUCCESS.
+inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish(argc, argv, iStatus);
+
+    const int iExitCode = (iStatus == 0) ? EXIT_FAILURE : EXIT_SUCCESS;
+    exit(iExitCode);
+}
+
+// Prints the end-of-test banner via __shrQAFinish2 and then terminates the process.
+// A status of 0 exits with EXIT_FAILURE; every non-zero status exits with EXIT_SUCCESS.
+inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish2(bQAtest, argc, argv, iStatus);
+
+    const int iExitCode = (iStatus == 0) ? EXIT_FAILURE : EXIT_SUCCESS;
+    exit(iExitCode);
+}
+
+#endif
--- a/benchmarks/opencl/VectorHypot/shrUtils.h
+++ b/benchmarks/opencl/VectorHypot/shrUtils.h
@ -0,0 +1,642 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_UTILS_H
+#define SHR_UTILS_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// reminders for output window and build log
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+#endif
+
+// OS dependent includes
+#ifdef _WIN32
+    // Headers needed for Windows
+    #include <windows.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+#endif
+
+// Other headers needed for both Windows and Linux
+#include <math.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Un-comment the following #define to enable profiling code in SDK apps
+//#define GPU_PROFILING
+
+// Beginning of GPU Architecture definitions
+// Maps an SM version (major.minor) to the number of cores per SM.
+// Returns the core count from the table below, or -1 (after printing a
+// diagnostic to stdout) when the version is not listed.
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+	// Table entry pairing an SM version with its number of cores per SM
+	typedef struct {
+		int SM; // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+		int Cores;
+	} sSMtoCores;
+
+	// Known architectures; the { -1, -1 } entry terminates the table
+	sSMtoCores nGpuArchCoresPerSM[] = 
+	{ { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
+	  { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
+	  { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
+	  { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
+	  { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+	  { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+	  { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+	  {   -1, -1 }
+	};
+
+	// Pack major/minor into the same 0xMm form used by the table keys
+	const int iSMVersion = (major << 4) + minor;
+
+	for (int i = 0; nGpuArchCoresPerSM[i].SM != -1; ++i) {
+		if (nGpuArchCoresPerSM[i].SM == iSMVersion) {
+			return nGpuArchCoresPerSM[i].Cores;
+		}
+	}
+
+	// Reached the terminator without a match
+	printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
+	return -1;
+}
+// end of GPU Architecture definitions
+
+
+// Defines and enum for use with logging functions
+// *********************************************************************
+#define DEFAULTLOGFILE "SdkConsoleLog.txt"
+#define MASTERLOGFILE "SdkMasterLog.csv"
+// Bit flags for the iLogMode argument of shrLogEx; individual bits are combined with |
+enum LOGMODES 
+{
+    LOGCONSOLE = 1, // bit to signal "log to console" 
+    LOGFILE    = 2, // bit to signal "log to file" 
+    LOGBOTH    = 3, // convenience union of first 2 bits to signal "log to both" (LOGCONSOLE | LOGFILE)
+    APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
+    MASTER     = 8, // bit to signal master .csv log output
+    ERRORMSG   = 16, // bit to signal "pre-pend Error" 
+    CLOSELOG   = 32  // bit to close log file, if open, after any requested file write
+};
+#define HDASHLINE "-----------------------------------------------------------\n"
+
+// Standardized boolean: the success/failure result type of the shr* helper functions declared in this header
+enum shrBOOL
+{
+    shrFALSE = 0,   // failure / condition not met
+    shrTRUE = 1     // success / condition met
+};
+
+// Standardized MAX, MIN and CLAMP
+// Every macro argument is parenthesized so that operands containing operators of lower
+// precedence than the comparison (e.g. MAX(x & 1, y) or MIN(a | b, c)) evaluate as intended.
+// NOTE: arguments are still evaluated more than once, so avoid side effects such as MAX(i++, j).
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define CLAMP(a, b, c) MIN(MAX(a, b), c)    // double sided clip of input a: lower bound b, upper bound c
+#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))	    // single top side clip of input a
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) 
+
+// Standardized Exit Macro for leaving main()... extended version
+// (Companion Inline Function lower on page)
+#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
+
+// Standardized Exit Macro for leaving main()... short version
+// (Companion Inline Function lower on page)
+#define shrEXIT(a, b)        __shrExitEX(a, b, EXIT_SUCCESS)
+
+// Simple argument checker macro
+#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE 
+
+// Define for user-customized error handling
+#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
+
+// Function to deallocate memory allocated within shrUtils
+// *********************************************************************
+extern "C" void shrFree(void* ptr);
+
+// *********************************************************************
+// Helper function to log standardized information to Console, to File or to both
+//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); 
+//!         : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+//! 
+//! Automatically opens file and stores handle if needed and not done yet
+//! Closes file and nulls handle on request
+//! 
+//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.  
+//!          LOGFILE and LOGBOTH may be | 'd  with APPENDMODE to select file append mode instead of overwrite mode 
+//!          LOGFILE and LOGBOTH may be | 'd  with CLOSELOG to "write and close" 
+//!          First 3 options may be | 'd  with MASTER to enable independent write to master data log file
+//!          First 3 options may be | 'd  with ERRORMSG to start line with standard error message
+//! @param 1 iErrNum (named "dValue" in earlier versions of this comment):    
+//!          Positive val = double value for time in secs to be formatted to 6 decimals. 
+//!          Negative val is an error code and this gives error preformatting.
+//!          NOTE(review): the declaration below takes an int, so the "double value" wording looks stale - confirm against the implementation.
+//! @param 2 cFormatString: String with formatting specifiers like printf or fprintf.  
+//!          ALL printf flags, width, precision and type specifiers are supported with this exception: 
+//!              Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
+//!              Single byte char type specifiers (%s and %c) ARE supported 
+//! @param 3... variable args: like printf or fprintf.  Must match format specifier type above.  
+//! @return 0 if OK, negative value on error or if error occurs or was passed in. 
+// *********************************************************************
+extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
+
+// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, cFormatString, ...)
+// *********************************************************************
+extern "C" int shrLog(const char* cFormatString, ...);
+
+// *********************************************************************
+// Delta timer function for up to 3 independent timers using host high performance counters 
+// Maintains state for 3 independent counters
+//! Example: double dElapsedTime = shrDeltaT(0);
+//! 
+//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
+//! @return delta time of specified counter since last call in seconds.  Otherwise -9999.0 if error
+// *********************************************************************
+extern "C" double shrDeltaT(int iCounterID);
+
+// Optional LogFileNameOverride function
+// *********************************************************************
+extern "C" void shrSetLogFileName (const char* cOverRideName);
+
+// Helper function to init data arrays 
+// *********************************************************************
+extern "C" void shrFillArray(float* pfData, int iSize);
+
+// Helper function to print data arrays 
+// *********************************************************************
+extern "C" void shrPrintArray(float* pfData, int iSize);
+
+////////////////////////////////////////////////////////////////////////////
+//! Find the path for a filename
+//! @return the path if succeeded, otherwise 0
+//! @param filename        name of the file
+//! @param executablePath  optional absolute path of the executable
+////////////////////////////////////////////////////////////////////////////
+extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing single precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing double precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing single precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
+               const float epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing double precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
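+//! @param data  pointer to data to write (NOTE(review): declared as const float* below although this is the double precision writer - confirm against the implementation)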
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
+               const double epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, 
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, 
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PPM image file (with unsigned char as data element type), padding 
+//! 4th component
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param OutData  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! 
+//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, 
+                             unsigned int *w, unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+//! Save PPM image file (with unsigned char as data element type, padded to 
+//! 4 bytes)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, 
+               unsigned int w, unsigned int h);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Save PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, 
+              unsigned int w, unsigned int h); 
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
+                  unsigned int *w,unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+// Command line arguments: General notes
+// * All command line arguments begin with '--' followed by the token; 
+//   token and value are separated by '='; example --samples=50
+// * Arrays have the form --model=[one.obj,two.obj,three.obj] 
+//   (without whitespaces)
+////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////
+//! Check if command line argument \a flag-name is given
+//! @return shrTRUE if command line argument \a flag_name has been given, 
+//!         otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param flag_name  name of command line flag
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, 
+                     const char* flag_name);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, 
+                        const char* arg_name, int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type unsigned int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, 
+                        const char* arg_name, unsigned int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type float
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, 
+                        const char* arg_name, float* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type string
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, 
+                          const char* arg_name, char** val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument list whose elements are strings
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  command line argument list
+//! @param len  length of the list / number of elements
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, 
+                              const char* arg_name, char** val, 
+                              unsigned int* len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparef( const float* reference, const float* data,
+             const unsigned int len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparei( const int* reference, const int* data, 
+             const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned integer arrays, with epsilon and threshold
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
+            const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
+              const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with a tolerance for # of byte errors
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
+              const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality and a 
+//!     threshold for # pixel errors
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays using L2-norm with an epsilon tolerance for 
+//! equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
+                const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PPM image files with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! @note  verboseErrors (output details of image mismatch to std::err) is not a parameter of this function
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PGM image files with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! @note  verboseErrors (output details of image mismatch to std::err) is not a parameter of this function
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
+
+extern "C" size_t shrRoundUp(int group_size, int global_size);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    if (iReference != iSample)
+    {
+        shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); 
+        if (pCleanup != NULL)
+        {
+            pCleanup(EXIT_FAILURE);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+}
+
+// Standardized Exit
+// *********************************************************************
+// Closes the log, optionally waits for <Enter>, flushes stderr and exits with
+// iExitCode. On WIN32 the prompt is shown unless "noprompt" is given; on other
+// platforms it is shown only when "prompt" is given. "qatest" always
+// suppresses the prompt.
+inline void __shrExitEX(int argc, const char** argv, int iExitCode)
+{
+#ifdef WIN32
+    const bool bWaitForKey = !shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
+#else
+    const bool bWaitForKey = shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
+#endif
+
+    if (!bWaitForKey)
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
+    }
+    else
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
+        getchar();
+    }
+
+    fflush(stderr);
+    exit(iExitCode);
+}
+
+#endif
--- a/benchmarks/opencl/reduce0/Makefile
+++ b/benchmarks/opencl/reduce0/Makefile
@ -0,0 +1,66 @@
+# Build rules for the reduce0 OpenCL benchmark: the kernel is compiled into a
+# static library with poclcc, the host program is cross-compiled with the
+# RISC-V GNU toolchain, and the resulting image is run on simX or under QEMU.
+
+# Remove a half-written target (e.g. a truncated .dump) when its recipe fails.
+.DELETE_ON_ERROR:
+
+# Command targets that do not correspond to files.
+.PHONY: all run qemu gdb-s gdb-c clean
+
+PROJECT = reduce0
+
+# Tool and library locations. $(wildcard ...) yields an empty string when the
+# path does not exist; ":=" evaluates each lookup once instead of on every use.
+RISCV_TOOL_PATH := $(wildcard ~/dev/riscv-gnu-toolchain/drops)
+POCL_CC_PATH    := $(wildcard ~/dev/pocl/drops_riscv_cc)
+POCL_INC_PATH   := $(wildcard ../include)
+POCL_LIB_PATH   := $(wildcard ../lib)
+VX_RT_PATH      := $(wildcard ../../../runtime)
+VX_SIMX_PATH    := $(wildcard ../../../simX/obj_dir)
+
+CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
+CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
+DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
+HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
+GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
+
+# Runtime sources linked into the bare-metal image.
+VX_SRCS =  $(VX_RT_PATH)/newlib/newlib.c
+VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
+VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
+VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
+VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
+VX_SRCS += $(VX_RT_PATH)/tests/tests.c
+VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
+# The variables below are not set in this file; they expand to nothing unless
+# supplied by the caller (command line or environment).
+VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
+
+VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
+
+CXXFLAGS =  -g -O0 -march=rv32im -mabi=ilp32
+CXXFLAGS += -ffreestanding # program may not begin at main()
+CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
+CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
+CXXFLAGS += -I$(POCL_INC_PATH) -I.
+
+# --whole-archive keeps every object of the kernel library in the link.
+VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
+QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
+
+all: $(PROJECT).dump $(PROJECT).hex
+
+# Kernel library. The recipe compiles the declared prerequisite ($<); it used
+# to name "kernel.cl", which this rule does not depend on.
+lib$(PROJECT).a: oclReduction_kernel.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<
+
+# Bare-metal image for simX: host program plus the Vortex runtime sources.
+$(PROJECT).elf: main.cc lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $< $(VX_LIBS) -o $@
+
+# User-mode binary for QEMU.
+$(PROJECT).qemu: main.cc lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) $< $(QEMU_LIBS) -o $@
+
+$(PROJECT).hex: $(PROJECT).elf
+	$(HEX) -O ihex $< $@
+
+$(PROJECT).dump: $(PROJECT).elf
+	$(DMP) -D $< > $@
+
+# Run on the simX emulator; stdout is captured in emulator.debug.
+run: $(PROJECT).hex
+	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $< -s -b 1> emulator.debug
+
+qemu: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $<
+
+# gdb-s starts QEMU with its gdb stub on port 1234; gdb-c starts the debugger.
+gdb-s: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $<
+
+gdb-c: $(PROJECT).qemu
+	$(GDB) $<
+
+clean:
+	rm -rf *.elf *.dump *.hex
--- a/benchmarks/opencl/reduce0/main.cc
+++ b/benchmarks/opencl/reduce0/main.cc
@ -0,0 +1,638 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+/*
+    Parallel reduction
+
+    This sample shows how to perform a reduction operation on an array of values
+    to produce a single value.
+
+    Reductions are a very common computation in parallel algorithms.  Any time
+    an array of values needs to be reduced to a single value using a binary 
+    associative operator, a reduction can be used.  Example applications include
+    statistics computations such as mean and standard deviation, and image 
+    processing applications such as finding the total luminance of an
+    image.
+
+    This code performs sum reductions, but any associative operator such as
+    min() or max() could also be used.
+
+    It assumes the input size is a power of 2.
+
+    COMMAND LINE ARGUMENTS
+
+    "--shmoo":         Test performance for 1 to 32M elements with each of the 7 different kernels
+    "--n=<N>":         Specify the number of elements to reduce (default 16777216)
+    "--threads=<N>":   Specify the number of threads per block (default 128)
+    "--kernel=<N>":    Specify which kernel to run (0-6, default 6)
+    "--maxblocks=<N>": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64)
+    "--cpufinal":      Read back the per-block results and do final sum of block sums on CPU (default false)
+    "--cputhresh=<N>": The threshold of number of blocks sums below which to perform a CPU final reduction (default 1)
+    
+*/
+
+// Common system and utility includes 
+#include <oclUtils.h>
+#include <shrQATest.h>
+
+// additional includes
+#include <sstream>
+#include <oclReduction.h>
+
+// Forward declarations and sample-specific defines
+// *********************************************************************
+// Element type of the array being reduced. main() derives it from the
+// "type" command line argument and passes it on to runTest() and
+// getReductionKernel().
+enum ReduceType
+{
+    REDUCE_INT,      // default; also used for any unrecognised type string
+    REDUCE_FLOAT,
+    REDUCE_DOUBLE    // parsed in main(), but the switches in this file have no dedicated case for it
+};
+
+// Runs the reduction test for element type T; defined further below.
+template <class T>
+bool runTest( int argc, const char** argv, ReduceType datatype);
+
+// Largest block count for which shmoo() still launches a measurement.
+#define MAX_BLOCK_DIM_SIZE 65535
+
+// Returns true when x has at most one bit set, i.e. x is a power of two.
+// Note that x == 0 also yields true.
+extern "C"
+bool isPow2(unsigned int x)
+{
+    // x & (x - 1) clears the lowest set bit of x.
+    const unsigned int remaining = x & (x - 1);
+    return remaining == 0;
+}
+
+cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2);
+
+// Main function 
+// *********************************************************************
+// Parses the "type" argument, initialises the OpenCL globals declared in
+// oclReduction.h (platform, context, device, command queue, kernel source
+// path), runs the reduction test for the selected element type and reports
+// the verdict through shrQAFinishExit().
+int main( int argc, const char** argv) 
+{
+    shrQAStart(argc, (char **)argv);
+
+    // start logs 
+    shrSetLogFileName ("oclReduction.txt");
+    shrLog("%s Starting...\n\n", argv[0]); 
+
+    // Initialised so that the "not given" test below does not depend on the
+    // helper writing *val when the argument is absent.
+    char *typeChoice = NULL;
+    shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice);
+
+    // determine type of array from command line args
+    if (0 == typeChoice)
+    {
+        // Default to "int". The buffer is 7 bytes, enough for the longest
+        // accepted name ("double") plus its terminator.
+        typeChoice = (char*)malloc(7 * sizeof(char));
+        #ifdef WIN32
+            // The size passed to strcpy_s must not exceed the allocation.
+            strcpy_s(typeChoice, 7 * sizeof(char), "int");
+        #else
+            strcpy(typeChoice, "int");
+        #endif
+    }
+    ReduceType datatype = REDUCE_INT;
+
+    // Case-insensitive match on WIN32, exact match elsewhere; anything that is
+    // neither "float" nor "double" selects REDUCE_INT.
+    #ifdef WIN32
+        if (!_strcmpi(typeChoice, "float"))
+            datatype = REDUCE_FLOAT;
+        else if (!_strcmpi(typeChoice, "double"))
+            datatype = REDUCE_DOUBLE;
+        else
+            datatype = REDUCE_INT;
+    #else
+        if (!strcmp(typeChoice, "float"))
+            datatype = REDUCE_FLOAT;
+        else if (!strcmp(typeChoice, "double"))
+            datatype = REDUCE_DOUBLE;
+        else
+            datatype = REDUCE_INT;
+    #endif
+
+    shrLog("Reducing array of type %s.\n", typeChoice);
+
+    //Get the NVIDIA platform
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    //Get the devices
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+    // NOTE(review): this local shadows the global cdDevices declared in
+    // oclReduction.h, which therefore stays unset.
+    cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    //Create the context
+    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    // get and log the device info
+    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
+        int device_nr = 0;
+        shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr);
+        // device_nr is converted to unsigned for this comparison, so a
+        // negative request is rejected as well.
+        if( device_nr < uiNumDevices ) {
+            device = oclGetDev(cxGPUContext, device_nr);
+        } else {
+            shrLog("Invalid Device %d Requested.\n", device_nr);
+            shrExitEX(argc, argv, EXIT_FAILURE);
+        }
+    } else {
+        device = oclGetMaxFlopsDev(cxGPUContext);
+    }
+    oclPrintDevName(LOGBOTH, device);
+    shrLog("\n");
+
+    // create a command-queue
+    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]);
+
+    // REDUCE_DOUBLE has no case of its own and therefore takes the default
+    // (int) path.
+    bool bSuccess = false;
+    switch (datatype)
+    {
+    default:
+    case REDUCE_INT:
+        bSuccess = runTest<int>( argc, argv, datatype);
+        break;
+    case REDUCE_FLOAT:
+        bSuccess = runTest<float>( argc, argv, datatype);
+        break;
+    }
+    
+    // finish
+    shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Host reference implementation of the sum reduction.
+//! Uses Kahan (compensated) summation so that large arrays are summed
+//! accurately.
+//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
+//! 
+//! @param data       pointer to input data (data[0] is always read)
+//! @param size       number of input data elements
+//! @return sum of the first \a size elements of \a data
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+T reduceCPU(T *data, int size)
+{
+    T total = data[0];
+    T compensation = (T)0.0;   // running estimate of the low-order bits lost so far
+
+    for (int idx = 1; idx < size; ++idx)
+    {
+        const T corrected = data[idx] - compensation;
+        const T updated = total + corrected;
+        // (updated - total) is what was actually added; the difference to
+        // 'corrected' is the rounding error carried into the next step.
+        compensation = (updated - total) - corrected;
+        total = updated;
+    }
+
+    return total;
+}
+
+// Rounds x up to the next power of two; a power of two is returned unchanged.
+// x == 0 yields 0 (the decrement wraps around and the final increment wraps
+// back).
+unsigned int nextPow2( unsigned int x ) {
+    x -= 1;
+    // Smear the highest set bit into every lower position (shifts 1,2,4,8,16).
+    for (unsigned int shift = 1; shift <= 16; shift <<= 1)
+    {
+        x |= x >> shift;
+    }
+    return x + 1;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Chooses the launch configuration for a reduction kernel.
+// Kernels < 3 get min(maxThreads, nextPow2(n)) threads per block; kernels >= 3
+// get min(maxThreads, nextPow2((n + 1) / 2)). The block count is the number of
+// blocks needed to cover n at that rate. For kernel 6 the block count is
+// additionally capped at maxBlocks.
+// Results are returned through the 'blocks' and 'threads' references.
+////////////////////////////////////////////////////////////////////////////////
+void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
+{
+    // Divisor applied to n when sizing the launch: 1 for kernels 0-2, 2 otherwise.
+    const int perThread = (whichKernel < 3) ? 1 : 2;
+
+    threads = (n < maxThreads * perThread) ? nextPow2((n + (perThread - 1)) / perThread) : maxThreads;
+
+    const int perBlock = threads * perThread;
+    blocks = (n + (perBlock - 1)) / perBlock;
+
+    if (whichKernel == 6)
+    {
+        blocks = MIN(maxBlocks, blocks);
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// This function performs a reduction of the input data multiple times and 
+// measures the average reduction time.
+//
+// datatype          - element type, forwarded to getReductionKernel()
+// n                 - number of input elements (kernel argument 2)
+// numThreads        - work-group size of the first pass
+// numBlocks         - number of work-groups of the first pass
+// maxThreads        - limit forwarded to getNumBlocksAndThreads() for follow-up passes
+// maxBlocks         - limit forwarded to getNumBlocksAndThreads() for follow-up passes
+// whichKernel       - index N of the "reduceN" kernel; follow-up passes use 5 when this is 6
+// testIterations    - number of times the complete reduction is executed
+// cpuFinalReduction - when true, the per-block sums are read back and added on the host
+// cpuFinalThreshold - follow-up device passes run while more than this many partial sums remain
+// dTotalTime        - accumulator; the shrDeltaT(1) value of every iteration except the
+//                     first is added to it (it is not reset here)
+// h_odata           - host buffer that receives the partial sums (up to numBlocks elements)
+// d_idata, d_odata  - device input buffer and device buffer for partial sums
+//
+// Returns the reduction result computed in the last iteration.
+////////////////////////////////////////////////////////////////////////////////
+template <class T>
+T profileReduce(ReduceType datatype,
+                  cl_int  n, 
+                  int  numThreads,
+                  int  numBlocks,
+                  int  maxThreads,
+                  int  maxBlocks,
+                  int  whichKernel, 
+                  int  testIterations,
+                  bool cpuFinalReduction,
+                  int  cpuFinalThreshold,
+                  double* dTotalTime,
+                  T* h_odata,
+                  cl_mem d_idata, 
+                  cl_mem d_odata)
+{
+
+
+    T gpu_result = 0;
+    // Stays true only if no host-side summation produced gpu_result.
+    bool needReadBack = true;
+    // One kernel object per follow-up pass. NOTE(review): the setup loop below
+    // does not check finalReductionIterations against this capacity of 10.
+    cl_kernel finalReductionKernel[10];
+    int finalReductionIterations=0;
+
+    //shrLog("Profile Kernel %d\n", whichKernel);
+
+    // First-pass kernel: input buffer, output buffer, element count and a
+    // local-memory argument of numThreads elements.
+    cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) );
+    clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata);
+    clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata);
+    clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n);
+    clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL);
+
+    // Pre-build the kernels for the follow-up passes so that kernel creation
+    // is not part of the timed loop.
+    if( !cpuFinalReduction ) {
+        int s=numBlocks;
+        int threads = 0, blocks = 0;
+        int kernel = (whichKernel == 6) ? 5 : whichKernel;
+        
+        while(s > cpuFinalThreshold) 
+        {
+            getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
+
+            finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) );
+            // Follow-up passes reduce d_odata in place.
+            clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata);
+            clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata);
+            // NOTE(review): the element count passed is n and the local-memory
+            // size uses numThreads, although this pass works on s elements with
+            // 'threads' threads per block - confirm this is intended.
+            clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n);
+            clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL);
+            
+            // Number of partial sums left after this pass.
+            if (kernel < 3)
+                s = (s + threads - 1) / threads;
+            else
+                s = (s + (threads*2-1)) / (threads*2);
+
+            finalReductionIterations++;
+        }
+    }
+    
+    size_t globalWorkSize[1];
+    size_t localWorkSize[1];
+
+    for (int i = 0; i < testIterations; ++i)
+    {
+        gpu_result = 0;
+
+        clFinish(cqCommandQueue);
+        // The first iteration (i == 0) is excluded from the timing; presumably
+        // shrDeltaT(1) restarts timer 1 here - see shrUtils for its contract.
+        if(i>0) shrDeltaT(1);
+
+        // execute the kernel
+        globalWorkSize[0] = numBlocks * numThreads;
+        localWorkSize[0] = numThreads;
+	
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue,reductionKernel, 1, 0, globalWorkSize, localWorkSize,
+                                          0, NULL, NULL);               
+
+        // check if kernel execution generated an error        
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        if (cpuFinalReduction)
+        {
+            // sum partial sums from each block on CPU        
+            // copy result from device to host
+            clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T), 
+                                h_odata, 0, NULL, NULL);
+
+            // This 'i' shadows the iteration counter of the enclosing loop.
+            for(int i=0; i<numBlocks; i++) 
+            {
+                gpu_result += h_odata[i];
+            }
+
+            needReadBack = false;
+        }
+        else
+        {
+            // sum partial block sums on GPU
+            int s=numBlocks;
+            int kernel = (whichKernel == 6) ? 5 : whichKernel;
+            int it = 0;
+            
+
+            // Replays the same sequence of pass sizes that was used above to
+            // create finalReductionKernel[0..finalReductionIterations-1].
+            while(s > cpuFinalThreshold) 
+            {
+                int threads = 0, blocks = 0;
+                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
+
+                globalWorkSize[0] = threads * blocks;
+                localWorkSize[0] = threads;
+                
+                ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0,
+                                                  globalWorkSize, localWorkSize, 0, NULL, NULL);               
+                //oclCheckError(ciErrNum, CL_SUCCESS);
+                
+                if (kernel < 3)
+                    s = (s + threads - 1) / threads;
+                else
+                    s = (s + (threads*2-1)) / (threads*2);
+
+                it++;
+            }
+
+            // More than one partial sum is left (cpuFinalThreshold > 1):
+            // finish the summation on the host.
+            if (s > 1)
+            {
+                // copy result from device to host
+                clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T), 
+                                    h_odata, 0, NULL, NULL);
+
+                // This 'i' shadows the iteration counter of the enclosing loop.
+                for(int i=0; i < s; i++) 
+                {
+                    gpu_result += h_odata[i];
+                }
+
+                needReadBack = false;
+            }
+        }
+
+        clFinish(cqCommandQueue);
+        if(i>0) *dTotalTime += shrDeltaT(1); 
+    }
+
+    if (needReadBack)
+    {
+        // copy final sum from device to host
+        clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T), 
+                            &gpu_result, 0, NULL, NULL);
+    }
+
+    // Release the kernels
+    clReleaseKernel(reductionKernel);
+    if( !cpuFinalReduction ) {
+        for(int it=0; it<finalReductionIterations; ++it) {
+            clReleaseKernel(finalReductionKernel[it]);
+        }
+        
+    }
+
+    return gpu_result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// This function calls profileReduce multiple times for a range of array sizes
+// and prints a report in CSV (comma-separated value) format that can be used for
+// generating a "shmoo" plot showing the performance for each kernel variation
+// over a wide range of input sizes.
+//
+// minN, maxN  - smallest and largest element count; sizes double each step
+// maxThreads  - limit forwarded to getNumBlocksAndThreads()/profileReduce()
+// maxBlocks   - limit forwarded to getNumBlocksAndThreads()/profileReduce()
+// datatype    - element type; selects how the random input is generated
+//
+// Configurations needing more than MAX_BLOCK_DIM_SIZE blocks are not run and
+// are reported as -1.
+////////////////////////////////////////////////////////////////////////////////
+template <class T>
+void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
+{ 
+    // create random input data on CPU
+    unsigned int bytes = maxN * sizeof(T);
+
+    T* h_idata = (T*)malloc(bytes);
+
+    for(int i = 0; i < maxN; i++) {
+        // Keep the numbers small so we don't get truncation error in the sum
+        if (datatype == REDUCE_INT)
+            h_idata[i] = (T)(rand() & 0xFF);
+        else
+            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
+    }
+
+    int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);
+
+    // allocate mem for the result on host side
+    T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));
+
+    // allocate device memory and data
+    cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
+    cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL);
+
+    int testIterations = 100;
+    double dTotalTime = 0.0;
+    
+    // print headers
+    shrLog("Time in seconds for various numbers of elements for each kernel\n");
+    shrLog("\n\n");
+    shrLog("Kernel");
+    for (int i = minN; i <= maxN; i *= 2)
+    {
+        shrLog(", %d", i);
+    }
+   
+    // one CSV row per kernel, one column per element count
+    for (int kernel = 0; kernel < 7; kernel++)
+    {
+        shrLog("\n");
+        shrLog("%d", kernel);
+        for (int i = minN; i <= maxN; i *= 2)
+        {
+            int numBlocks = 0;
+            int numThreads = 0;
+            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);
+            
+            double reduceTime;
+            if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
+                // profileReduce only adds to the accumulator, so it has to be
+                // cleared here; otherwise each cell would include the time of
+                // every measurement taken before it.
+                dTotalTime = 0.0;
+                profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, 
+                                testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata);
+                reduceTime = dTotalTime/(double)testIterations;
+            } else {                
+                reduceTime = -1.0;
+            }
+            shrLog(", %.4f m", reduceTime);
+        }
+    }
+
+    // cleanup
+    free(h_idata);
+    free(h_odata);
+    clReleaseMemObject(d_idata);
+    clReleaseMemObject(d_odata);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// The main function which runs the reduction test.
+//
+// Reads the "n", "threads", "kernel", "maxblocks", "cpufinal", "cputhresh"
+// and "shmoo" command line arguments, fills a host array with random data,
+// reduces it on the device through profileReduce() and compares the result
+// with the host reference from reduceCPU().
+//
+// Returns true when the device result passes the comparison: exact equality
+// for REDUCE_INT, the logged tolerance otherwise, so the return value always
+// agrees with the PASSED/FAILED line in the log. With GPU_PROFILING defined
+// and "shmoo" given, shmoo() is run instead and true is returned.
+////////////////////////////////////////////////////////////////////////////////
+template <class T>
+bool
+runTest( int argc, const char** argv, ReduceType datatype) 
+{
+    int size = 1<<24;    // number of elements to reduce
+    int maxThreads;
+
+    // Creating a kernel once updates the global smallBlock (set inside
+    // getReductionKernel), which picks the default work-group size below.
+    cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1);        
+    clReleaseKernel(reductionKernel);
+
+    if (smallBlock) 
+      maxThreads = 64;  // number of threads per block
+    else
+      maxThreads = 128;
+
+    int whichKernel = 6;
+    int maxBlocks = 64;
+    bool cpuFinalReduction = false;
+    int cpuFinalThreshold = 1;
+
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
+    
+    shrLog(" %d elements\n", size);
+    shrLog(" %d threads (max)\n", maxThreads);
+
+    cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE);
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);
+
+    bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE);
+
+#ifdef GPU_PROFILING
+    if (runShmoo)
+    {
+        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
+        return true;
+    }
+    else
+#endif
+    {
+        // create random input data on CPU
+        unsigned int bytes = size * sizeof(T);
+        T* h_idata = (T*)malloc(bytes);
+
+        for(int i=0; i<size; i++) 
+        {
+            // Keep the numbers small so we don't get truncation error in the sum
+            if (datatype == REDUCE_INT)
+                h_idata[i] = (T)(rand() & 0xFF);
+            else
+                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
+        }
+
+        int numBlocks = 0;
+        int numThreads = 0;
+        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
+        if (numBlocks == 1) cpuFinalThreshold = 1;
+        shrLog(" %d blocks\n\n", numBlocks);
+
+        // allocate mem for the result on host side
+        T* h_odata = (T*)malloc(numBlocks * sizeof(T));
+
+        // allocate device memory and data
+        cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
+        cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL);
+      
+        int testIterations = 100;
+        double dTotalTime = 0.0;
+        T gpu_result = 0;
+        gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks,
+                                        whichKernel, testIterations, cpuFinalReduction, 
+                                        cpuFinalThreshold, &dTotalTime,
+                                        h_odata, d_idata, d_odata);
+
+#ifdef GPU_PROFILING
+        double reduceTime = dTotalTime/(double)testIterations;
+        shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 
+               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
+#endif
+
+        // compute reference solution
+        shrLog("\nComparing against Host/C++ computation...\n"); 
+        T cpu_result = reduceCPU<T>(h_idata, size);
+
+        // Single verdict used for both the log line and the return value.
+        bool bPassed = false;
+        if (datatype == REDUCE_INT)
+        {
+            shrLog(" GPU result = %d\n", gpu_result);
+            shrLog(" CPU result = %d\n\n", cpu_result);
+            bPassed = (gpu_result == cpu_result);
+        }
+        else
+        {
+            shrLog(" GPU result = %.9f\n", gpu_result);
+            shrLog(" CPU result = %.9f\n\n", cpu_result);
+
+            double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12;
+            // Absolute difference computed by hand: an unqualified abs() may
+            // resolve to the integer overload and truncate the value.
+            double diff = (double)gpu_result - (double)cpu_result;
+            if (diff < 0.0)
+                diff = -diff;
+            bPassed = (diff < threshold);
+        }
+        shrLog("%s\n\n", bPassed ? "PASSED" : "FAILED");
+      
+        // cleanup
+        free(h_idata);
+        free(h_odata);
+        clReleaseMemObject(d_idata);
+        clReleaseMemObject(d_odata);
+
+        return bPassed;
+    }
+}
+
+// Helper function to create and build program and kernel
+// *********************************************************************
+// Creates the kernel object "reduce<whichKernel>" for the global device.
+//
+// datatype    - element type; selects the "#define T ..." line of the preamble
+// whichKernel - index N used to form the kernel name "reduceN"
+// blockSize   - value emitted as "#define blockSize" in the preamble
+// isPowOf2    - value emitted as "#define nIsPow2" in the preamble
+//
+// Returns the kernel from clCreateKernel(); error codes are only stored in the
+// global ciErrNum. As a side effect the global smallBlock is set to true when
+// the kernel's CL_KERNEL_WORK_GROUP_SIZE is 64 and to false otherwise.
+cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2)
+{
+    // compile cl program
+    size_t program_length;
+    char *source; 
+
+    std::ostringstream preamble;   
+
+    // create the program
+    // with type specification depending on datatype argument
+    switch (datatype)
+    {
+    default:
+    case REDUCE_INT:
+        preamble << "#define T int" << std::endl;
+        break;
+    case REDUCE_FLOAT:
+        preamble << "#define T float" << std::endl;
+        break;
+    }
+    
+    // set blockSize at compile time
+    preamble << "#define blockSize " << blockSize << std::endl;
+    
+    // set isPow2 at compile time
+    preamble << "#define nIsPow2 " << isPowOf2 << std::endl;
+    
+    // Load the source code and prepend the preamble
+    source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length);
+    //oclCheckError(source != NULL, shrTRUE);
+    
+    // The program is created from the runtime's built-in kernel rather than
+    // from the loaded source (the clCreateProgramWithSource path is disabled).
+    // It uses the context and device globals from oclReduction.h; the handle
+    // is declared here because every call below refers to rv_program.
+    // NOTE(review): only "reduce0" is requested, while the kernel created
+    // below is "reduce<whichKernel>" - confirm the runtime provides it.
+    cl_program rv_program =
+      clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &device, "reduce0", &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+    free(source);
+
+    // build the program
+    ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        // write out standard error, Build Log and PTX, then cleanup and exit
+        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+        oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
+        oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclReduction.ptx");
+        //oclCheckError(ciErrNum, CL_SUCCESS); 
+    }
+    
+    // create Kernel    
+    std::ostringstream kernelName;
+    kernelName << "reduce" << whichKernel;    
+    cl_kernel ckKernel = clCreateKernel(rv_program, kernelName.str().c_str(), &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    // Initialised so that a failed query is not followed by a read of an
+    // indeterminate value.
+    size_t wgSize = 0;
+    ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
+    smallBlock = (wgSize == 64);
+
+    // NOTE: the program will get deleted when the kernel is also released
+    clReleaseProgram(rv_program);
+    
+    return ckKernel;
+}
--- a/benchmarks/opencl/reduce0/oclReduction.h
+++ b/benchmarks/opencl/reduce0/oclReduction.h
@ -0,0 +1,34 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+ #ifndef __REDUCTION_H__
+#define __REDUCTION_H__
+
+// Host-side reduction entry points, declared as templates over the element
+// type T.  Only the declarations live in this header; no definition is
+// provided here.
+// NOTE(review): the sm10/sm13 suffixes and the parameter list look inherited
+// from the CUDA reduction sample -- confirm where (and whether) these are
+// defined and instantiated in the OpenCL port.
+template <class T>
+void reduce_sm10(int size, int threads, int blocks, 
+                 int whichKernel, T *d_idata, T *d_odata);
+
+template <class T>
+void reduce_sm13(int size, int threads, int blocks, 
+                 int whichKernel, T *d_idata, T *d_odata);
+
+// CL objects
+// NOTE(review): these are definitions (not 'extern' declarations), so this
+// header can only be included from a single translation unit without causing
+// duplicate-symbol link errors -- confirm that is the intended usage.
+cl_platform_id cpPlatform;          // presumably the selected OpenCL platform -- TODO confirm
+cl_uint uiNumDevices;               // presumably the number of entries in cdDevices -- TODO confirm
+cl_device_id* cdDevices; 
+cl_context cxGPUContext;            // context handed to oclGetFirstDev() when logging build failures
+cl_command_queue cqCommandQueue;
+cl_device_id device;                // device queried with clGetKernelWorkGroupInfo() for the kernel work-group size
+cl_int ciErrNum;                    // receives the status code of the most recent OpenCL call
+const char* source_path;            // path of the kernel source file passed to oclLoadProgSource()
+bool smallBlock = true;             // set to true when the kernel's reported work-group size is 64, false otherwise
+
+#endif
--- a/benchmarks/opencl/reduce0/oclReduction_kernel.cl
+++ b/benchmarks/opencl/reduce0/oclReduction_kernel.cl
@ -0,0 +1,273 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/*
+    Parallel reduction kernels
+*/
+
+// The following defines are set during runtime compilation, see reduction.cpp
+// #define T float
+// #define blockSize 128
+// #define nIsPow2 1
+
+#ifndef _REDUCE_KERNEL_H_
+#define _REDUCE_KERNEL_H_
+
+/*
+    Parallel sum reduction using shared memory
+    - takes log(n) steps for n input elements
+    - uses n threads
+    - only works for power-of-2 arrays
+*/
+
+/* This reduction interleaves which threads are active by using the modulo
+   operator.  This operator is very expensive on GPUs, and the interleaved 
+   inactivity means that no whole warps are active, which is also very 
+   inefficient */
+__kernel void reduce0(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    const unsigned int lid = get_local_id(0);
+    const unsigned int gid = get_global_id(0);
+    const size_t groupSize = get_local_size(0);
+
+    // stage one element per work-item in local memory; work-items past the
+    // end of the input contribute zero
+    if (gid < n) {
+        sdata[lid] = g_idata[gid];
+    } else {
+        sdata[lid] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // interleaved-addressing tree reduction: the stride doubles on every pass
+    // and only work-items whose id is a multiple of twice the stride add
+    unsigned int stride = 1;
+    while (stride < groupSize) {
+        const unsigned int span = 2 * stride;
+        if ((lid % span) == 0) {
+            sdata[lid] += sdata[lid + stride];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        stride = span;
+    }
+
+    // work-item 0 publishes the partial sum of this work-group
+    if (lid == 0) {
+        g_odata[get_group_id(0)] = sdata[0];
+    }
+}
+
+
+/* This version uses contiguous threads, but its interleaved 
+   addressing results in many shared memory bank conflicts. */
+__kernel void reduce1(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    const unsigned int lid = get_local_id(0);
+    const unsigned int gid = get_global_id(0);
+    const size_t groupSize = get_local_size(0);
+
+    // stage one element per work-item in local memory; work-items past the
+    // end of the input contribute zero
+    if (gid < n) {
+        sdata[lid] = g_idata[gid];
+    } else {
+        sdata[lid] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // tree reduction with contiguous active work-items: work-item k owns
+    // slot 2*stride*k and folds in the slot 'stride' further along
+    unsigned int stride = 1;
+    while (stride < groupSize) {
+        int slot = 2 * stride * lid;
+
+        if (slot < groupSize) {
+            sdata[slot] += sdata[slot + stride];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+        stride *= 2;
+    }
+
+    // work-item 0 publishes the partial sum of this work-group
+    if (lid == 0) {
+        g_odata[get_group_id(0)] = sdata[0];
+    }
+}
+
+/*
+    This version uses sequential addressing -- no divergence or bank conflicts.
+*/
+__kernel void reduce2(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    const unsigned int lid = get_local_id(0);
+    const unsigned int gid = get_global_id(0);
+    const size_t groupSize = get_local_size(0);
+
+    // stage one element per work-item in local memory; work-items past the
+    // end of the input contribute zero
+    if (gid < n) {
+        sdata[lid] = g_idata[gid];
+    } else {
+        sdata[lid] = 0;
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // sequential-addressing tree reduction: on each pass the lower half of
+    // the active range accumulates the upper half, then the range is halved
+    unsigned int half = groupSize / 2;
+    while (half > 0) {
+        if (lid < half) {
+            sdata[lid] += sdata[lid + half];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        half /= 2;
+    }
+
+    // work-item 0 publishes the partial sum of this work-group
+    if (lid == 0) {
+        g_odata[get_group_id(0)] = sdata[0];
+    }
+}
+
+/*
+    This version uses n/2 threads --
+    it performs the first level of reduction when reading from global memory
+*/
+__kernel void reduce3(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    const size_t groupSize = get_local_size(0);
+    const unsigned int lid = get_local_id(0);
+
+    // each work-group covers 2*groupSize consecutive input elements
+    const unsigned int first = get_group_id(0)*(groupSize*2) + get_local_id(0);
+
+    // first reduction level happens while loading: every work-item sums the
+    // element at 'first' and the one groupSize further on (when in range)
+    if (first < n) {
+        sdata[lid] = g_idata[first];
+    } else {
+        sdata[lid] = 0;
+    }
+    if (first + groupSize < n) {
+        sdata[lid] += g_idata[first+groupSize];
+    }
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // sequential-addressing tree reduction in local memory
+    unsigned int half = groupSize / 2;
+    while (half > 0) {
+        if (lid < half) {
+            sdata[lid] += sdata[lid + half];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+        half /= 2;
+    }
+
+    // work-item 0 publishes the partial sum of this work-group
+    if (lid == 0) {
+        g_odata[get_group_id(0)] = sdata[0];
+    }
+}
+
+/*
+    This version unrolls the last steps of the reduction (strides 32 down to 1)
+    and performs them without barriers, to avoid synchronization where it
+    isn't needed.
+
+    g_idata : input array of n elements
+    g_odata : one partial sum is written per work-group, indexed by group id
+    n       : number of valid elements in g_idata
+    sdata   : local scratch buffer, declared volatile so that every access in
+              the barrier-free tail is actually performed on local memory
+
+    NOTE(review): the barrier-free tail appears to assume that the 32
+    lowest-numbered work-items of a group execute in lockstep (NVIDIA warp
+    behaviour) -- confirm this holds on the target device.
+    NOTE(review): for blockSize <= 32 the tail reads sdata at indices at or
+    beyond blockSize; presumably the host allocates a larger local buffer in
+    that case -- verify against the caller.
+*/
+__kernel void reduce4(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    // each work-group covers 2*get_local_size(0) consecutive input elements
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    if (i + get_local_size(0) < n) 
+        sdata[tid] += g_idata[i+get_local_size(0)];  
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    // (strides above 32 only; every pass is followed by a barrier)
+    #pragma unroll 1
+    for(unsigned int s=get_local_size(0)/2; s>32; s>>=1) 
+    {
+        if (tid < s) 
+        {
+            sdata[tid] += sdata[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // remaining strides, selected by the compile-time blockSize define;
+    // no barrier is issued between these steps
+    if (tid < 32)
+    {
+        if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; }
+        if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; }
+        if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; }
+        if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; }
+        if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; }
+        if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; }
+    }
+
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version is completely unrolled.  It uses the blockSize preprocessor
+    define (set when the program is compiled at run time, see the defines
+    listed at the top of this file) so that only the steps needed for the
+    chosen (power of 2) work-group size remain.
+
+    g_idata : input array of n elements
+    g_odata : one partial sum is written per work-group, indexed by group id
+    n       : number of valid elements in g_idata
+    sdata   : local scratch buffer, volatile for the barrier-free tail
+
+    NOTE(review): the starting index uses get_local_size(0) while the second
+    load uses blockSize, so the kernel presumably requires the launch
+    work-group size to equal blockSize -- confirm against the host code.
+    NOTE(review): the tail below 'tid < 32' issues no barriers and appears to
+    assume 32 work-items run in lockstep -- confirm for the target device.
+*/
+__kernel void reduce5(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    if (i + blockSize < n) 
+        sdata[tid] += g_idata[i+blockSize];  
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    // (each blockSize test is a compile-time constant; the barrier sits outside
+    // the tid test so every work-item of the group reaches it)
+    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    
+    // strides 32 down to 1: no barrier between these steps
+    if (tid < 32)
+    {
+        if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; }
+        if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; }
+        if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; }
+        if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; }
+        if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; }
+        if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; }
+    }
+    
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version adds multiple elements per thread sequentially.  This reduces the overall
+    cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
+    (Brent's Theorem optimization)
+
+    g_idata : input array of n elements
+    g_odata : one partial sum is written per work-group, indexed by group id
+    n       : number of valid elements in g_idata
+    sdata   : local scratch buffer, volatile for the barrier-free tail
+
+    blockSize and nIsPow2 are preprocessor defines set when the program is
+    compiled at run time (see the defines listed at the top of this file).
+
+    NOTE(review): when nIsPow2 is non-zero the second load is not range
+    checked; this presumably relies on the host only setting nIsPow2 for
+    launch configurations where i + blockSize < n always holds -- confirm.
+    NOTE(review): the tail below 'tid < 32' issues no barriers and appears to
+    assume 32 work-items run in lockstep -- confirm for the target device.
+*/
+__kernel void reduce6(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+    // distance each work-item advances per loop iteration: two blocks' worth
+    // of elements for every work-group in the launch
+    unsigned int gridSize = blockSize*2*get_num_groups(0);
+    sdata[tid] = 0;
+
+    // we reduce multiple elements per thread.  The number is determined by the 
+    // number of active work-groups (via get_num_groups).  More groups will result
+    // in a larger gridSize and therefore fewer elements per thread
+    while (i < n)
+    {         
+        sdata[tid] += g_idata[i];
+        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
+        if (nIsPow2 || i + blockSize < n) 
+            sdata[tid] += g_idata[i+blockSize];  
+        i += gridSize;
+    } 
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    // (each blockSize test is a compile-time constant; the barrier sits outside
+    // the tid test so every work-item of the group reaches it)
+    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    
+    // strides 32 down to 1: no barrier between these steps
+    if (tid < 32)
+    {
+        if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; }
+        if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; }
+        if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; }
+        if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; }
+        if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; }
+        if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; }
+    }
+    
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+#endif // #ifndef _REDUCE_KERNEL_H_
--- a/benchmarks/opencl/reduce0/oclUtils.h
+++ b/benchmarks/opencl/reduce0/oclUtils.h
@ -0,0 +1,198 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+#ifndef OCL_UTILS_H
+#define OCL_UTILS_H
+
+// *********************************************************************
+// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// Common headers:  Cross-API utilities and OpenCL header
+#include <shrUtils.h>
+
+// All OpenCL headers
+#if defined (__APPLE__) || defined(MACOSX)
+    #include <OpenCL/opencl.h>
+#else
+    #include <CL/opencl.h>
+#endif 
+
+// Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// For systems with CL_EXT that are not updated with these extensions, we copied these
+// extensions from <CL/cl_ext.h>
+#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+  /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+  #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+  #define CL_DEVICE_WARP_SIZE_NV                      0x4003
+  #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+  #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+  #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#endif
+
+// reminders for build output window and log
+#ifdef _WIN32
+    #pragma message ("Note: including shrUtils.h")
+    #pragma message ("Note: including opencl.h")
+#endif
+
+// SDK Revision #
+#define OCL_SDKREVISION "7027912"
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) 
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
+//!
+//! @return the id 
+//! @param clSelectedPlatformID         OpenCL platform ID
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print info about the device
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and return device capability
+//!
+//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" int oclGetDevCap(cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print the device name
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the first device from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the id or -1 when out of range
+//! @param cxGPUContext         OpenCL context
+//! @param device_idx            index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of device with maximal FLOPS from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file and prepends the cPreamble to the code.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get the binary (PTX) of the program associated with the device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//! @param binary       returned code
+//! @param length       length of returned code
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram                   OpenCL program
+//! @param cdDevice                    device of interest
+//! @param const char*  cPtxFileName   optional PTX file name
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the Build Log from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
+
+// Helper function for De-allocating cl objects
+// *********************************************************************
+extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
+
+// Helper function to get OpenCL error string from constant
+// *********************************************************************
+extern "C" const char* oclErrorString(cl_int error);
+
+// Helper function to get OpenCL image format string (channel order and type) from constant
+// *********************************************************************
+extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+// Compares a sample/test value against a reference value and treats any
+// mismatch as a fatal error.
+//
+// @param iSample     value under test (typically an OpenCL status code)
+// @param iReference  value iSample is expected to equal
+// @param pCleanup    optional callback invoked with the error code; when NULL
+//                    the process exits with the error code instead
+// @param cFile       source file name reported in the log message
+// @param iLine       source line reported in the log message
+inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // Matching values mean there is nothing to report
+    if (iSample == iReference)
+    {
+        return;
+    }
+
+    // A mismatch is an error by definition, so a sample value of 0 is
+    // replaced by a non-zero error code
+    if (iSample == 0)
+    {
+        iSample = -9999;
+    }
+
+    // Log the error info
+    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
+
+    // Without a cleanup callback: close the log and exit, using the error
+    // code as the process exit code
+    if (pCleanup == NULL)
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+        exit(iSample);
+    }
+
+    // Otherwise hand the error code to the caller-supplied cleanup routine
+    pCleanup(iSample);
+}
+
+#endif
--- a/benchmarks/opencl/reduce0/shrQATest.h
+++ b/benchmarks/opencl/reduce0/shrQATest.h
@ -0,0 +1,238 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_QATEST_H
+#define SHR_QATEST_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// OS dependent includes
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+    #pragma message ("Note: including time.h")
+
+// Headers needed for Windows
+    #include <windows.h>
+	#include <time.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+    #include <unistd.h>
+    #include <time.h>
+#endif
+
+#ifndef STRCASECMP
+#ifdef _WIN32
+#define STRCASECMP _stricmp
+#else
+#define STRCASECMP strcasecmp
+#endif
+#endif
+
+#ifndef STRNCASECMP
+#ifdef _WIN32
+#define STRNCASECMP _strnicmp
+#else
+#define STRNCASECMP strncasecmp
+#endif
+#endif
+
+
+// Standardized QA Start/Finish for CUDA SDK tests
+#define shrQAStart(a, b)      __shrQAStart(a, b)
+#define shrQAFinish(a, b, c)  __shrQAFinish(a, b, c)
+#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
+
+// Returns the index in exec_name at which the bare executable name begins:
+// one past the last '\\' or '/' separator, or 0 when the path has none.
+inline int findExeNameStart(const char *exec_name)
+{
+    // Walk backwards from the terminating NUL until a separator is found
+    for (int pos = (int)strlen(exec_name); pos >= 0; --pos)
+    {
+        const char ch = exec_name[pos];
+        if (ch == '\\' || ch == '/')
+        {
+            return pos + 1;
+        }
+    }
+
+    // No separator anywhere: the whole string is the executable name
+    return 0;
+}
+
+// Prints the standardized QA "start" banner for a sample and returns the
+// index in argv[0] at which the executable name (without directory) begins.
+//
+// If any argument equals "qatest" (case-insensitive, leading '-' characters
+// ignored) the banner is "&&&& RUNNING <exe> <args...>"; otherwise it is
+// "[<exe>] starting...".
+//
+// @param argc  number of entries in argv
+// @param argv  program arguments; argv[0] is the executable path
+// @return      offset of the executable name inside argv[0] (0 when argv[0]
+//              is missing)
+inline int __shrQAStart(int argc, char **argv)
+{
+    bool bQATest = false;
+    // First clear both output streams so the banner is not interleaved with
+    // output buffered earlier
+    fflush(stdout);
+    fflush(stderr);
+
+    for (int i=1; i < argc; i++) {
+        // skip any leading dashes so "-qatest" and "--qatest" both match
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+        char *string_argv = &argv[i][string_start];
+
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }
+    }
+
+    // Tolerate a missing program name (argc == 0 or argv[0] == NULL)
+    const char *exe_path = (argc > 0 && argv != NULL && argv[0] != NULL) ? argv[0] : "";
+
+    // We don't want to print the entire path, so we search for the start of
+    // the bare executable name (just past the last path separator)
+    int exename_start = findExeNameStart(exe_path);
+    if (bQATest) {
+        fprintf(stdout, "&&&& RUNNING %s", &(exe_path[exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] starting...\n", &(exe_path[exename_start]));
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    return exename_start;
+}
+
+// Final status of a QA test run.  The numeric values are used as indices
+// into the { "FAILED", "PASSED", "WAIVED" } string tables of the QA finish
+// helpers; shrQAFinishExit()/shrQAFinishExit2() map any non-zero status to
+// EXIT_SUCCESS and zero to EXIT_FAILURE.
+enum eQAstatus {
+    QA_FAILED = 0,   // reported as "FAILED"
+    QA_PASSED = 1,   // reported as "PASSED"
+    QA_WAIVED = 2    // reported as "WAIVED"
+};
+
+// Prints a countdown to stdout and returns once 'seconds' seconds have
+// elapsed.  Despite its name this function does not terminate the process;
+// it only delays the caller.
+//
+// @param seconds  length of the delay; values <= 0 return without sleeping
+inline void __ExitInTime(int seconds)
+{
+    fprintf(stdout, "> exiting in %d seconds: ", seconds);
+    fflush(stdout);
+    time_t t;
+    int count;
+    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
+        fprintf(stdout, "%d...", count);
+        // flush each tick, otherwise a buffered stdout shows the whole
+        // countdown only after the delay is over
+        fflush(stdout);
+        // _WIN32 is the macro that selects <windows.h> vs <unistd.h> at the
+        // top of this header, so the same macro must select Sleep() vs sleep()
+#ifdef _WIN32
+        Sleep(1000);
+#else
+        sleep(1);
+#endif
+    }
+    fprintf(stdout,"done!\n\n");
+    fflush(stdout);
+}
+
+
+// Prints the standardized QA "finish" banner for a sample and then either
+// waits 3 seconds, waits for <Enter>, or returns immediately, depending on
+// the command line.
+//
+// Recognized arguments (case-insensitive, leading '-' characters ignored):
+//   qatest          print "&&&& <STATUS> <exe> <args...>" instead of the
+//                   "[<exe>] test results..." form
+//   noprompt, help  return immediately after printing
+//   prompt          wait for <Enter> before returning
+// When none of noprompt/help/prompt is given, a 3 second countdown runs.
+//
+// @param argc     number of entries in argv
+// @param argv     program arguments; argv[0] is the executable path
+// @param iStatus  QA status (0 = FAILED, 1 = PASSED, 2 = WAIVED); any other
+//                 value is reported as FAILED rather than indexing outside
+//                 the status table
+inline void __shrQAFinish(int argc, const char **argv, int iStatus)
+{
+    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
+    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    // Number of real entries (the trailing NULL is a sentinel, not a status)
+    const int nStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0])) - 1;
+    const char *sResult = (iStatus >= 0 && iStatus < nStatus) ? sStatus[iStatus] : sStatus[0];
+
+    for (int i=1; i < argc; i++) {
+        // skip any leading dashes so "-opt" and "--opt" both match
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }
+        // For SDK individual samples that don't specify -noprompt or -prompt,
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bNoPrompt = true;
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bNoPrompt = false;
+            bQuitInTime = false;
+        }
+    }
+
+    // Tolerate a missing program name (argc == 0 or argv[0] == NULL)
+    const char *exe_path = (argc > 0 && argv != NULL && argv[0] != NULL) ? argv[0] : "";
+
+    int exename_start = findExeNameStart(exe_path);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sResult, &(exe_path[exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(exe_path[exename_start]), sResult);
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    } else {
+        if (!bNoPrompt) {
+            fprintf(stdout, "\nPress <Enter> to exit...\n");
+            fflush(stdout);
+            getchar();
+        }
+    }
+}
+
+// Variant of the QA "finish" banner in which the caller states explicitly
+// whether QA-test formatting is wanted, instead of it being parsed from the
+// command line.  Never prompts for <Enter>.
+//
+// If any argument equals noprompt, help or prompt (case-insensitive, leading
+// '-' characters ignored) the function returns right after printing;
+// otherwise a 3 second countdown runs first.
+//
+// @param bQATest  true to print "&&&& <STATUS> <exe> <args...>", false to
+//                 print the "[<exe>] test results..." form
+// @param argc     number of entries in argv
+// @param argv     program arguments; argv[0] is the executable path
+// @param iStatus  QA status (0 = FAILED, 1 = PASSED, 2 = WAIVED); any other
+//                 value is reported as FAILED rather than indexing outside
+//                 the status table
+inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
+{
+    bool bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    // Number of real entries (the trailing NULL is a sentinel, not a status)
+    const int nStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0])) - 1;
+    const char *sResult = (iStatus >= 0 && iStatus < nStatus) ? sStatus[iStatus] : sStatus[0];
+
+    for (int i=1; i < argc; i++) {
+        // skip any leading dashes so "-opt" and "--opt" both match
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        // For SDK individual samples that don't specify -noprompt or -prompt,
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bQuitInTime = false;
+        }
+    }
+
+    // Tolerate a missing program name (argc == 0 or argv[0] == NULL)
+    const char *exe_path = (argc > 0 && argv != NULL && argv[0] != NULL) ? argv[0] : "";
+
+    int exename_start = findExeNameStart(exe_path);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sResult, &(exe_path[exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(exe_path[exename_start]), sResult);
+    }
+    fflush(stdout);
+
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    }
+}
+
+// Reports the final QA status through __shrQAFinish() and then terminates the
+// process: a non-zero iStatus exits with EXIT_SUCCESS, zero with EXIT_FAILURE.
+inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish(argc, argv, iStatus);
+
+    if (iStatus != 0)
+    {
+        exit(EXIT_SUCCESS);
+    }
+    exit(EXIT_FAILURE);
+}
+
+// Reports the final QA status through __shrQAFinish2() and then terminates
+// the process: a non-zero iStatus exits with EXIT_SUCCESS, zero with
+// EXIT_FAILURE.
+inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish2(bQAtest, argc, argv, iStatus);
+
+    if (iStatus != 0)
+    {
+        exit(EXIT_SUCCESS);
+    }
+    exit(EXIT_FAILURE);
+}
+
+#endif
--- a/benchmarks/opencl/reduce0/shrUtils.h
+++ b/benchmarks/opencl/reduce0/shrUtils.h
@ -0,0 +1,642 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_UTILS_H
+#define SHR_UTILS_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// reminders for output window and build log
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+#endif
+
+// OS dependent includes
+#ifdef _WIN32
+    // Headers needed for Windows
+    #include <windows.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+#endif
+
+// Other headers needed for both Windows and Linux
+#include <math.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Un-comment the following #define to enable profiling code in SDK apps
+//#define GPU_PROFILING
+
+// Beginning of GPU Architecture definitions
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+	// Lookup row: SM version packed as 0xMm (hexadecimal; M = SM major version,
+	// m = SM minor version) and the number of cores per SM for that version.
+	struct sSMtoCores {
+		int SM;
+		int Cores;
+	};
+
+	sSMtoCores nGpuArchCoresPerSM[] = 
+	{ { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
+	  { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
+	  { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
+	  { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
+	  { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+	  { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+	  { 0x30, 192}, // Kepler Generation (SM 3.0) GK10x class
+	  {   -1, -1 }  // end-of-table sentinel
+	};
+
+	// Pack the requested version the same way the table keys are packed.
+	const int smKey = (major << 4) + minor;
+	for (int row = 0; nGpuArchCoresPerSM[row].SM != -1; ++row) {
+		if (nGpuArchCoresPerSM[row].SM == smKey) {
+			return nGpuArchCoresPerSM[row].Cores;
+		}
+	}
+
+	// No matching row: report it and return -1.
+	printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
+	return -1;
+}
+// end of GPU Architecture definitions
+
+
+// Defines and enum for use with logging functions
+// *********************************************************************
+#define DEFAULTLOGFILE "SdkConsoleLog.txt"
+#define MASTERLOGFILE "SdkMasterLog.csv"
+enum LOGMODES 
+{
+    LOGCONSOLE = 1 << 0,               // bit to signal "log to console"
+    LOGFILE    = 1 << 1,               // bit to signal "log to file"
+    LOGBOTH    = LOGCONSOLE | LOGFILE, // convenience union of the first 2 bits: "log to both"
+    APPENDMODE = 1 << 2,               // bit to select "file append" mode instead of "replace" mode on open
+    MASTER     = 1 << 3,               // bit to signal master .csv log output
+    ERRORMSG   = 1 << 4,               // bit to signal "pre-pend Error"
+    CLOSELOG   = 1 << 5                // bit to close the log file, if open, after any requested file write
+};
+#define HDASHLINE "-----------------------------------------------------------\n"
+
+// Standardized boolean used as the return type of the shr* helper declarations below
+enum shrBOOL { shrFALSE = 0, shrTRUE = 1 };
+
+// Standardized MAX, MIN and CLAMP
+// Every argument is parenthesized so that expression arguments containing
+// lower-precedence operators (e.g. MAX(x & 1, y)) are compared as a whole.
+// Note: arguments may still be evaluated more than once, so avoid passing
+// expressions with side effects (e.g. i++).
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define CLAMP(a, b, c) MIN(MAX(a, b), c)    // double sided clip of input a
+#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))    // single top side clip of input a
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) 
+
+// Standardized Exit Macro for leaving main()... extended version
+// (Companion Inline Function lower on page)
+#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
+
+// Standardized Exit Macro for leaving main()... short version
+// (Companion Inline Function lower on page)
+#define shrEXIT(a, b)        __shrExitEX(a, b, EXIT_SUCCESS)
+
+// Simple argument checker macro
+#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE 
+
+// Define for user-customized error handling
+#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
+
+// Function to deallocate memory allocated within shrUtils
+// *********************************************************************
+extern "C" void shrFree(void* ptr);
+
+// *********************************************************************
+// Helper function to log standardized information to Console, to File or to both
+//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); 
+//!         : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+//! 
+//! Automatically opens file and stores handle if needed and not done yet
+//! Closes file and nulls handle on request
+//! 
+//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.  
+//!          LOGFILE and LOGBOTH may be | 'd  with APPENDMODE to select file append mode instead of overwrite mode 
+//!          LOGFILE and LOGBOTH may be | 'd  with CLOSELOG to "write and close" 
+//!          First 3 options may be | 'd  with MASTER to enable independent write to master data log file
+//!          First 3 options may be | 'd  with ERRORMSG to start line with standard error message
+//! @param 1 iErrNum:
+//!          Negative val is an error code and gives error pre-formatting.
+//!          NOTE(review): this entry used to call the argument "dValue" (positive val = double time in secs,
+//!          formatted to 6 decimals), but the declaration below takes an int -- confirm against the implementation.
+//! @param 2 cFormatString: String with formatting specifiers like printf or fprintf.  
+//!          ALL printf flags, width, precision and type specifiers are supported with this exception: 
+//!              Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
+//!              Single byte char type specifiers (%s and %c) ARE supported 
+//! @param 3... variable args: like printf or fprintf.  Must match the format specifier types above.  
+//! @return 0 if OK, negative value if an error occurs or an error code was passed in. 
+// *********************************************************************
+extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
+
+// Short version of shrLogEx, defaulting to shrLogEx(LOGBOTH, 0, ...)
+// *********************************************************************
+extern "C" int shrLog(const char* cFormatString, ...);
+
+// *********************************************************************
+// Delta timer function for up to 3 independent timers using host high performance counters 
+// Maintains state for 3 independent counters
+//! Example: double dElapsedTime = shrDeltaT(0);
+//! 
+//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
+//! @return delta time of specified counter since last call in seconds.  Otherwise -9999.0 if error
+// *********************************************************************
+extern "C" double shrDeltaT(int iCounterID);
+
+// Optional LogFileNameOverride function
+// *********************************************************************
+extern "C" void shrSetLogFileName (const char* cOverRideName);
+
+// Helper function to init data arrays 
+// *********************************************************************
+extern "C" void shrFillArray(float* pfData, int iSize);
+
+// Helper function to print data arrays 
+// *********************************************************************
+extern "C" void shrPrintArray(float* pfData, int iSize);
+
+////////////////////////////////////////////////////////////////////////////
+//! Find the path for a filename
+//! @return the path if succeeded, otherwise 0
+//! @param filename        name of the file
+//! @param executablePath  optional absolute path of the executable
+////////////////////////////////////////////////////////////////////////////
+extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing single precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing double precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing single precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
+               const float epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing double precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
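+//! @param data  pointer to data to write
+//!        NOTE(review): declared as const float* although this is the double precision writer -- confirm against the implementation
+//! @param len  number of data elements in data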
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
+               const double epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, 
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, 
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PPM image file (with unsigned char as data element type), padding 
+//! 4th component
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param OutData  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! 
+//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, 
+                             unsigned int *w, unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+//! Save PPM image file (with unsigned char as data element type, padded to 
+//! 4 bytes)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  pointer to the image data to save
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, 
+               unsigned int w, unsigned int h);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Save PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  pointer to the image data to save
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, 
+              unsigned int w, unsigned int h); 
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
+                  unsigned int *w,unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+// Command line arguments: General notes
+// * All command line arguments begin with '--' followed by the token; 
+//   token and value are separated by '='; example --samples=50
+// * Arrays have the form --model=[one.obj,two.obj,three.obj] 
+//   (without whitespaces)
+////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////
+//! Check if command line argument \a flag_name is given
+//! @return shrTRUE if command line argument \a flag_name has been given, 
+//!         otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param flag_name  name of command line flag
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, 
+                     const char* flag_name);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, 
+                        const char* arg_name, int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type unsigned int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, 
+                        const char* arg_name, unsigned int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type float
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, 
+                        const char* arg_name, float* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type string
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, 
+                          const char* arg_name, char** val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument list whose elements are strings
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  command line argument list
+//! @param len  length of the list / number of elements
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, 
+                              const char* arg_name, char** val, 
+                              unsigned int* len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparef( const float* reference, const float* data,
+             const unsigned int len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparei( const int* reference, const int* data, 
+             const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned integer arrays, with epsilon and threshold
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
+            const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
+              const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with a tolerance for # of byte errors
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
+              const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality and a 
+//!     threshold for # pixel errors
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays using L2-norm with an epsilon tolerance for 
+//! equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
+                const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PPM image files with an epsilon tolerance for equality
+//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! NOTE(review): "verboseErrors" (output details of image mismatch to std::err) was listed here as a parameter but is not part of the declaration below
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PGM image files with an epsilon tolerance for equality
+//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! NOTE(review): "verboseErrors" (output details of image mismatch to std::err) was listed here as a parameter but is not part of the declaration below
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+// NOTE(review): undocumented in this header. Presumably reads `size` bytes of `filename`
+// and returns them in a buffer -- confirm the behavior and who frees the result
+// against the shrUtils implementation.
+extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
+
+// NOTE(review): undocumented in this header. Presumably rounds `global_size` up to a
+// multiple of `group_size` -- confirm against the shrUtils implementation.
+extern "C" size_t shrRoundUp(int group_size, int global_size);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // Matching values mean there is nothing to report.
+    if (iSample == iReference)
+    {
+        return;
+    }
+
+    // Log the mismatch together with the call site recorded by the macro.
+    shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
+
+    // Without a cleanup callback, close the log and terminate here.
+    if (pCleanup == NULL)
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Otherwise hand the failure code to the caller-supplied cleanup routine.
+    pCleanup(EXIT_FAILURE);
+}
+
+// Standardized Exit
+// *********************************************************************
+// Logs a closing message (optionally waiting for <Enter>), flushes stderr and
+// exits the process with iExitCode.
+//   argc/argv  command line as passed to main(); scanned for the prompt flags
+//   iExitCode  value handed to exit()
+inline void __shrExitEX(int argc, const char** argv, int iExitCode)
+{
+    // Platform test uses the compiler-predefined _WIN32, matching the other
+    // platform checks in this header, instead of the project-defined WIN32.
+#ifdef _WIN32
+    // Windows: prompt unless "noprompt" or "qatest" was given
+    if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#else 
+    // Elsewhere: prompt only when "prompt" was given and "qatest" was not
+    if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#endif
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
+        getchar();
+    }
+    else 
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]); 
+    }
+    fflush(stderr);
+    exit(iExitCode);
+}
+
+#endif
--- a/benchmarks/opencl/transpose/Makefile
+++ b/benchmarks/opencl/transpose/Makefile
@ -0,0 +1,66 @@
+# Project name; every output file name below is derived from it, so it is
+# defined before its first use.
+PROJECT = transpose
+
+# Tool, library and runtime locations.  $(wildcard ...) expands "~" and yields
+# an empty string when the path does not exist.  ":=" evaluates each lookup
+# once instead of on every reference.
+RISCV_TOOL_PATH := $(wildcard ~/dev/riscv-gnu-toolchain/drops)
+POCL_CC_PATH := $(wildcard ~/dev/pocl/drops_riscv_cc)
+POCL_INC_PATH := $(wildcard ../include)
+POCL_LIB_PATH := $(wildcard ../lib)
+VX_RT_PATH := $(wildcard ../../../runtime)
+VX_SIMX_PATH := $(wildcard ../../../simX/obj_dir)
+
+# RISC-V cross toolchain
+CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
+CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
+DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
+HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
+GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
+
+# Runtime sources compiled into the .elf image
+VX_SRCS =  $(VX_RT_PATH)/newlib/newlib.c
+VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
+VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
+VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
+VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
+VX_SRCS += $(VX_RT_PATH)/tests/tests.c
+VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
+# None of the variables below is set in this Makefile; they expand to nothing
+# unless supplied from the command line or the environment.
+VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
+
+# Link options used only for the .elf image (custom startup code and linker script)
+VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
+
+CXXFLAGS =  -g -O0 -march=rv32im -mabi=ilp32
+CXXFLAGS += -ffreestanding # program may not begin at main()
+CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
+CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
+CXXFLAGS += -I$(POCL_INC_PATH) -I.
+
+# The kernel archive is linked whole; libOpenCL comes from the POCL lib directory
+VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
+QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
+
+# Command targets: none of these names is a file produced by the build.
+.PHONY: all run qemu gdb-s gdb-c clean
+
+# Remove a target whose recipe failed so a partial file (for example a truncated
+# .dump written through a redirect) is never mistaken for an up-to-date one.
+.DELETE_ON_ERROR:
+
+all: $(PROJECT).dump $(PROJECT).hex
+
+# Compile the OpenCL kernel into a static library with poclcc.  The recipe
+# compiles the declared prerequisite ($<) so the dependency and the compiler
+# input cannot drift apart.
+lib$(PROJECT).a: transpose.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<
+
+# Image for the simulator: host code plus runtime sources and the kernel archive.
+# NOTE(review): main.cc calls computeGold(), but transpose_gold.cpp is only
+# compiled by the .qemu rule - confirm how this image is expected to link.
+$(PROJECT).elf: main.cc lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $@
+
+# User-mode QEMU build; transpose_gold.cpp is listed because the recipe compiles it.
+$(PROJECT).qemu: main.cc transpose_gold.cpp lib$(PROJECT).a
+	$(CXX) $(CXXFLAGS) main.cc transpose_gold.cpp $(QEMU_LIBS) -o $@
+
+$(PROJECT).hex: $(PROJECT).elf
+	$(HEX) -O ihex $< $@
+
+$(PROJECT).dump: $(PROJECT).elf
+	$(DMP) -D $< > $@
+
+# Run the hex image on the simX emulator; stdout is captured in emulator.debug
+run: $(PROJECT).hex
+	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
+
+# Run under user-mode QEMU, logging translated instructions to debug.log
+qemu: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
+
+# Start QEMU with a gdb server on port 1234
+gdb-s: $(PROJECT).qemu
+	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
+
+# Start the gdb client on the QEMU binary
+gdb-c: $(PROJECT).qemu
+	$(GDB) $(PROJECT).qemu
+
+clean:
+	rm -rf *.elf *.dump *.hex
--- a/benchmarks/opencl/transpose/main.cc
+++ b/benchmarks/opencl/transpose/main.cc
@ -0,0 +1,365 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+/* Matrix transpose with Cuda 
+ * Host code.
+
+ * This example transposes arbitrary-size matrices.  It compares a naive
+ * transpose kernel that suffers from non-coalesced writes, to an optimized
+ * transpose with fully coalesced memory access and no bank conflicts.  On 
+ * a G80 GPU, the optimized transpose can be more than 10x faster for large
+ * matrices.
+ */
+
+// standard utility and system includes
+#include <oclUtils.h>
+#include <shrQATest.h>
+
+#define BLOCK_DIM 16
+
+// max GPU's to manage for multi-GPU parallel compute
+const unsigned int MAX_GPU_COUNT = 8;
+
+// global variables
+// OpenCL state set up in runTest() and shared with transposeGPU().
+// platform obtained through oclGetPlatformID()
+cl_platform_id cpPlatform;
+// number of devices reported by clGetDeviceIDs()
+cl_uint uiNumDevices;
+// malloc'ed device list filled by clGetDeviceIDs()
+cl_device_id* cdDevices;
+// context created over cdDevices
+cl_context cxGPUContext;
+// NOTE(review): transposeGPU() declares a local array with the same name, which
+// shadows this one there
+cl_kernel ckKernel[MAX_GPU_COUNT];
+// one command queue per device in use
+cl_command_queue commandQueue[MAX_GPU_COUNT];
+// program created in runTest(); transposeGPU() creates its kernels from it
+cl_program rv_program;
+
+// forward declarations
+// *********************************************************************
+int runTest( int argc, const char** argv);
+extern "C" void computeGold( float* reference, float* idata, 
+                         const unsigned int size_x, const unsigned int size_y );
+
+// Main Program
+// Starts QA reporting and logging, runs the transpose test and uses its
+// status as the process exit code.
+// *********************************************************************
+int main( int argc, const char** argv)
+{
+    shrQAStart(argc, (char **)argv);
+
+    // set logfile name and start logs
+    shrSetLogFileName ("oclTranspose.txt");
+    shrLog("%s Starting...\n\n", argv[0]);
+
+    // run the main test
+    // runTest() ends the process through shrQAFinishExit() when it runs to
+    // completion, so it only returns here after an early failure.  Propagate
+    // that status instead of falling off the end of main() with 0.
+    return runTest(argc, argv);
+}
+
+// Run one kernel of rv_program on every device and return the average time per
+// iteration as measured with shrDeltaT().
+//
+// kernelName     name of the kernel to create from rv_program
+// useLocalMem    when true, kernel argument 5 is set to a local buffer of
+//                (BLOCK_DIM + 1) * BLOCK_DIM floats
+// ciDeviceCount  number of devices / command queues to use (at most MAX_GPU_COUNT)
+// h_idata        host input matrix, size_x * size_y floats
+// h_odata        host buffer that receives the result read back from the devices
+// size_x, size_y matrix dimensions
+//
+// Returns 0.0 without running anything when ciDeviceCount is 0.
+double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
+{
+    // The row split below divides by the device count
+    if (ciDeviceCount == 0)
+    {
+        shrLog(" transposeGPU: no devices to run on\n\n");
+        return 0.0;
+    }
+    // The per-device arrays (and the global commandQueue) hold MAX_GPU_COUNT entries
+    if (ciDeviceCount > MAX_GPU_COUNT)
+    {
+        ciDeviceCount = MAX_GPU_COUNT;
+    }
+
+    cl_mem d_odata[MAX_GPU_COUNT];
+    cl_mem d_idata[MAX_GPU_COUNT];
+    cl_kernel ckKernel[MAX_GPU_COUNT];
+
+    size_t szGlobalWorkSize[2];
+    size_t szLocalWorkSize[2];
+    cl_int ciErrNum = CL_SUCCESS;
+
+    // Create buffers for each GPU
+    // Each GPU will compute sizePerGPU rows of the result
+    size_t sizePerGPU = shrRoundUp(BLOCK_DIM, (size_x+ciDeviceCount-1) / ciDeviceCount);
+
+    // size of memory required to store the matrix
+    const size_t mem_size = sizeof(float) * size_x * size_y;
+
+    for(unsigned int i = 0; i < ciDeviceCount; ++i){
+        // allocate device memory and copy host to device memory
+        d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                    mem_size, h_idata, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        // create buffer to store output
+        d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
+                                    sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        // create the kernel
+        ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        // set the args values for the kernel
+        // Argument 2 is passed with an int-sized value: handing over the address
+        // of a size_t together with sizeof(int) only works where the two types
+        // happen to share size or byte order.
+        cl_int offset = (cl_int)(i * sizePerGPU);
+        ciErrNum  = clSetKernelArg(ckKernel[i], 0, sizeof(cl_mem), (void *) &d_odata[i]);
+        ciErrNum |= clSetKernelArg(ckKernel[i], 1, sizeof(cl_mem), (void *) &d_idata[0]);
+        ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(cl_int), &offset);
+        ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
+        ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
+        if(useLocalMem)
+        {
+            ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
+        }
+    }
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    // set up execution configuration
+    szLocalWorkSize[0] = BLOCK_DIM;
+    szLocalWorkSize[1] = BLOCK_DIM;
+    szGlobalWorkSize[0] = sizePerGPU;
+    szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
+
+    // execute the kernel numIterations times
+    int numIterations = 100;
+    shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
+    for (int i = -1; i < numIterations; ++i)
+    {
+        // Start time measurement after warmup (iteration -1)
+        if( i == 0 ) shrDeltaT(0);
+
+        for(unsigned int k=0; k < ciDeviceCount; ++k){
+            ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL,
+                                szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
+        }
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+    }
+
+    // Block CPU till GPU is done
+    for(unsigned int k=0; k < ciDeviceCount; ++k){
+        ciErrNum |= clFinish(commandQueue[k]);
+    }
+    double time = shrDeltaT(0)/(double)numIterations;
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    // Copy back to host
+    for(unsigned int i = 0; i < ciDeviceCount; ++i){
+        size_t offset = i * sizePerGPU;
+
+        // A device whose slice starts at or past the last row has nothing to
+        // return.  Without this check the unsigned subtraction below wraps
+        // around and the read would land outside h_odata.
+        if (offset >= size_x)
+        {
+            continue;
+        }
+        size_t remaining = size_x - offset;
+        size_t size = (remaining < sizePerGPU) ? remaining : sizePerGPU;
+
+        ciErrNum |= clEnqueueReadBuffer(commandQueue[i], d_odata[i], CL_TRUE, 0,
+                                size * size_y * sizeof(float), &h_odata[offset * size_y],
+                                0, NULL, NULL);
+    }
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    for(unsigned int i = 0; i < ciDeviceCount; ++i){
+        ciErrNum |= clReleaseMemObject(d_idata[i]);
+        ciErrNum |= clReleaseMemObject(d_odata[i]);
+        ciErrNum |= clReleaseKernel(ckKernel[i]);
+    }
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    return time;
+}
+
+//! Run the OpenCL transpose test: set up the platform, devices, command queues
+//! and program, run the naive and optimized kernels, and compare the result
+//! with the CPU reference computed by computeGold().
+//!
+//! @return a non-zero code when setup fails.  When the test runs to completion
+//!         the process is ended through shrQAFinishExit(), so the final return
+//!         is not reached.
+//! @param argc  number of command line arguments
+//! @param argv  command line arguments ("width", "height" and "device" are read)
+// *********************************************************************
+int runTest( const int argc, const char** argv)
+{
+    cl_int ciErrNum;
+    cl_uint ciDeviceCount;
+    unsigned int size_x = 2048;
+    unsigned int size_y = 2048;
+
+    int temp;
+    if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
+        size_x = temp;
+    }
+
+    if( shrGetCmdLineArgumenti( argc, argv,"height", &temp) ){
+        size_y = temp;
+    }
+
+    // size of memory required to store the matrix
+    const size_t mem_size = sizeof(float) * size_x * size_y;
+
+    //Get the platform
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    //Get the devices
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    //Create the context
+    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
+    {
+        // User specified GPUs
+        char* deviceList = NULL;
+        char* deviceStr = NULL;
+
+        shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);
+
+        // Only tokenize when a list was actually handed back
+        #ifdef WIN32
+            char* next_token = NULL;
+            if (deviceList != NULL) deviceStr = strtok_s (deviceList," ,.-", &next_token);
+        #else
+            if (deviceList != NULL) deviceStr = strtok (deviceList," ,.-");
+        #endif
+        ciDeviceCount = 0;
+        while(deviceStr != NULL)
+        {
+            // commandQueue[] holds at most MAX_GPU_COUNT entries
+            if (ciDeviceCount >= MAX_GPU_COUNT)
+            {
+                shrLog(" Ignoring devices beyond the first %u\n\n", MAX_GPU_COUNT);
+                break;
+            }
+
+            // get and print the device for this queue
+            cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
+            if( device == (cl_device_id)-1 ) {
+                shrLog(" Invalid Device: %s\n\n", deviceStr);
+                free(deviceList);
+                return -1;
+            }
+
+            shrLog("Device %d: ", atoi(deviceStr));
+            oclPrintDevName(LOGBOTH, device);
+            shrLog("\n");
+
+            // create command queue
+            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
+            if (ciErrNum != CL_SUCCESS)
+            {
+                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
+                free(deviceList);
+                return ciErrNum;
+            }
+
+            ++ciDeviceCount;
+
+            #ifdef WIN32
+                deviceStr = strtok_s (NULL," ,.-", &next_token);
+            #else
+                deviceStr = strtok (NULL," ,.-");
+            #endif
+        }
+
+        free(deviceList);
+
+        // An empty list would leave transposeGPU() with nothing to run on
+        if (ciDeviceCount == 0)
+        {
+            shrLog(" No devices were selected with the device option\n\n");
+            return -1;
+        }
+    }
+    else
+    {
+        // Compute on all devices of the context
+        size_t nDeviceBytes = 0;
+        ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
+        ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);
+
+        if (ciErrNum != CL_SUCCESS)
+        {
+            shrLog(" Error %i in clGetContextInfo call !!!\n\n", ciErrNum);
+            return ciErrNum;
+        }
+        else if (ciDeviceCount == 0)
+        {
+            shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
+            return -1;
+        }
+
+        // commandQueue[] holds at most MAX_GPU_COUNT entries
+        if (ciDeviceCount > MAX_GPU_COUNT)
+        {
+            shrLog(" Using only the first %u devices\n\n", MAX_GPU_COUNT);
+            ciDeviceCount = MAX_GPU_COUNT;
+        }
+
+        // create command-queues
+        for(unsigned int i = 0; i < ciDeviceCount; ++i)
+        {
+            // get and print the device for this queue
+            cl_device_id device = oclGetDev(cxGPUContext, i);
+            shrLog("Device %d: ", i);
+            oclPrintDevName(LOGBOTH, device);
+            shrLog("\n");
+
+            // create command queue
+            commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
+            if (ciErrNum != CL_SUCCESS)
+            {
+                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
+                return ciErrNum;
+            }
+        }
+    }
+
+    // allocate and initialize host memory
+    float* h_idata = (float*)malloc(mem_size);
+    float* h_odata = (float*) malloc(mem_size);
+    if (h_idata == NULL || h_odata == NULL)
+    {
+        shrLog(" Failed to allocate %u bytes of host memory\n\n", (unsigned int)mem_size);
+        free(h_idata);
+        free(h_odata);
+        return -1;
+    }
+    srand(15235911);
+    shrFillArray(h_idata, (size_x * size_y));
+
+    // Program Setup
+    // The source is only loaded when the file was found; the program itself is
+    // created from built-in kernels below, so a missing file is not an error.
+    size_t program_length = 0;
+    char* source_path = shrFindFilePath("transpose.cl", argv[0]);
+    //oclCheckError(source_path != NULL, shrTRUE);
+    char *source = (source_path != NULL) ? oclLoadProgSource(source_path, "", &program_length) : NULL;
+    //oclCheckError(source != NULL, shrTRUE);
+
+    // create the program
+    // Uses the context and device list set up above and captures the error code.
+    // NOTE(review): only "transpose" is requested here, while transposeGPU() is
+    // also called with other kernel names - confirm the built-in kernel list.
+    rv_program =
+      clCreateProgramWithBuiltInKernels(cxGPUContext, uiNumDevices, cdDevices, "transpose", &ciErrNum);
+    //rv_program = clCreateProgramWithSource(cxGPUContext, 1,
+                     // (const char **)&source, &program_length, &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+    if (ciErrNum != CL_SUCCESS || rv_program == NULL)
+    {
+        shrLog(" Error %i in clCreateProgramWithBuiltInKernels call !!!\n\n", ciErrNum);
+        return(EXIT_FAILURE);
+    }
+
+    // build the program
+    ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        // write out standard error, Build Log and PTX, then return error
+        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+        oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
+        oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclTranspose.ptx");
+        return(EXIT_FAILURE);
+    }
+
+    // Run Naive Kernel
+#ifdef GPU_PROFILING
+    // Matrix Copy kernel runs to measure reference performance.
+    double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+#endif
+
+    double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+
+#ifdef GPU_PROFILING
+    // log times
+
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-simple copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
+           (1.0e-9 * double(size_x * size_y * sizeof(float))/simpleCopyTime), simpleCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
+
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-shared memory copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
+           (1.0e-9 * double(size_x * size_y * sizeof(float))/sharedCopyTime), sharedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
+
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-uncoalesced copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
+           (1.0e-9 * double(size_x * size_y * sizeof(float))/uncoalescedCopyTime), uncoalescedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
+
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-naive, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
+           (1.0e-9 * double(size_x * size_y * sizeof(float))/naiveTime), naiveTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
+
+    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-optimized, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
+          (1.0e-9 * double(size_x * size_y * sizeof(float))/optimizedTime), optimizedTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
+
+#endif
+
+    // compute reference solution and cross check results
+    float* reference = (float*)malloc( mem_size);
+    computeGold( reference, h_idata, size_x, size_y);
+    shrLog("\nComparing results with CPU computation... \n\n");
+    shrBOOL res = shrComparef( reference, h_odata, size_x * size_y);
+
+    // cleanup memory (free(NULL) is a no-op, so a missing source is fine)
+    free(h_idata);
+    free(h_odata);
+    free(reference);
+    free(source);
+    free(source_path);
+
+    // cleanup OpenCL
+    ciErrNum = clReleaseProgram(rv_program);
+    for(unsigned int i = 0; i < ciDeviceCount; ++i)
+    {
+        ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
+    }
+    ciErrNum |= clReleaseContext(cxGPUContext);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    // pass or fail (cumulative... all tests in the loop)
+    shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);
+
+    return 0;
+}
--- a/benchmarks/opencl/transpose/oclUtils.h
+++ b/benchmarks/opencl/transpose/oclUtils.h
@ -0,0 +1,198 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+#ifndef OCL_UTILS_H
+#define OCL_UTILS_H
+
+// *********************************************************************
+// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// Common headers:  Cross-API utilities and OpenCL header
+#include <shrUtils.h>
+
+// All OpenCL headers
+#if defined (__APPLE__) || defined(MACOSX)
+    #include <OpenCL/opencl.h>
+#else
+    #include <CL/opencl.h>
+#endif 
+
+// Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// For systems with CL_EXT that are not updated with these extensions, we copied these
+// extensions from <CL/cl_ext.h>
+#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+  /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+  #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+  #define CL_DEVICE_WARP_SIZE_NV                      0x4003
+  #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+  #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+  #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#endif
+
+// reminders for build output window and log
+#ifdef _WIN32
+    #pragma message ("Note: including shrUtils.h")
+    #pragma message ("Note: including opencl.h")
+#endif
+
+// SDK Revision #
+#define OCL_SDKREVISION "7027912"
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) 
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
+//!
+//! @return the id 
+//! @param clSelectedPlatformID         OpenCL platform ID
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print info about the device
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and return device capability
+//!
+//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" int oclGetDevCap(cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print the device name
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the first device from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the id or -1 when out of range
+//! @param cxGPUContext         OpenCL context
+//! @param device_idx            index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of device with maximal FLOPS from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file and prepends the cPreamble to the code.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get the binary (PTX) of the program associated with the device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//! @param binary       returned code
+//! @param length       length of returned code
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram                   OpenCL program
+//! @param cdDevice                    device of interest
+//! @param const char*  cPtxFileName   optional PTX file name
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the Build Log from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
+
+// Helper function for De-allocating cl objects
+// *********************************************************************
+extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
+
+// Helper function to get OpenCL error string from constant
+// *********************************************************************
+extern "C" const char* oclErrorString(cl_int error);
+
+// Helper function to get OpenCL image format string (channel order and type) from constant
+// *********************************************************************
+extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // An error condition is defined by the sample/test value not equal to the reference
+    if (iReference != iSample)
+    {
+        // If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
+        iSample = (iSample == 0) ? -9999 : iSample; 
+
+        // Log the error info
+        shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
+
+        // Cleanup and exit, or just exit if no cleanup function pointer provided.  Use iSample (error code in this case) as process exit code.
+        if (pCleanup != NULL)
+        {
+            pCleanup(iSample);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(iSample);
+        }
+    }
+}
+
+#endif
--- a/benchmarks/opencl/transpose/shrQATest.h
+++ b/benchmarks/opencl/transpose/shrQATest.h
@ -0,0 +1,238 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_QATEST_H
+#define SHR_QATEST_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// OS dependent includes
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+    #pragma message ("Note: including time.h")
+
+// Headers needed for Windows
+    #include <windows.h>
+	#include <time.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+    #include <unistd.h>
+    #include <time.h>
+#endif
+
+#ifndef STRCASECMP
+#ifdef _WIN32
+#define STRCASECMP _stricmp
+#else
+#define STRCASECMP strcasecmp
+#endif
+#endif
+
+#ifndef STRNCASECMP
+#ifdef _WIN32
+#define STRNCASECMP _strnicmp
+#else
+#define STRNCASECMP strncasecmp
+#endif
+#endif
+
+
+// Standardized QA Start/Finish for CUDA SDK tests
+#define shrQAStart(a, b)      __shrQAStart(a, b)
+#define shrQAFinish(a, b, c)  __shrQAFinish(a, b, c)
+#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
+
+// Return the index in exec_name where the file name begins, i.e. one past the
+// last '\\' or '/' separator, or 0 when the string contains no separator.
+inline int findExeNameStart(const char *exec_name)
+{
+    // Walk backwards from the terminating NUL until a separator or index 0
+    int pos = (int)strlen(exec_name);
+    for (; pos > 0; --pos)
+    {
+        const char c = exec_name[pos];
+        if (c == '\\' || c == '/')
+        {
+            break;
+        }
+    }
+
+    // Index 0 may itself hold a separator, so test the stopping position
+    const char found = exec_name[pos];
+    const bool bOnSeparator = (found == '\\') || (found == '/');
+    return bOnSeparator ? pos + 1 : pos;
+}
+
+// Print the standardized QA start banner on stdout.
+// argv[1..] is scanned for "qatest" (leading '-' characters are skipped and the
+// comparison is case-insensitive); with it, the "&&&& RUNNING" line including
+// all arguments is printed, otherwise a short "[name] starting..." line.
+// Returns the index in argv[0] at which the executable name starts.
+inline int __shrQAStart(int argc, char **argv)
+{
+    // First clear the output buffer
+    fflush(stdout);
+    fflush(stdout);
+
+    bool bQATest = false;
+    for (int iArg = 1; iArg < argc; ++iArg) {
+        const char *pszArg = argv[iArg];
+        while (*pszArg == '-') {
+            ++pszArg;
+        }
+        if (STRCASECMP(pszArg, "qatest") == 0) {
+            bQATest = true;
+        }
+    }
+
+    // Only the executable name is printed, not the whole path
+    const int exename_start = findExeNameStart(argv[0]);
+    const char *pszExeName = argv[0] + exename_start;
+    if (!bQATest) {
+        fprintf(stdout, "[%s] starting...\n", pszExeName);
+    } else {
+        fprintf(stdout, "&&&& RUNNING %s", pszExeName);
+        for (int iArg = 1; iArg < argc; ++iArg) {
+            fprintf(stdout, " %s", argv[iArg]);
+        }
+        fprintf(stdout, "\n");
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    return exename_start;
+}
+
+// Test outcome codes.  The numeric values are used as indices into the
+// sStatus string tables ("FAILED", "PASSED", "WAIVED") in __shrQAFinish and
+// __shrQAFinish2.
+enum eQAstatus {
+    QA_FAILED = 0,
+    QA_PASSED = 1,
+    QA_WAIVED = 2
+};
+
+// Print a countdown on stdout, sleeping one second per step, until `seconds`
+// seconds have elapsed according to time().
+inline void __ExitInTime(int seconds)
+{
+    fprintf(stdout, "> exiting in %d seconds: ", seconds);
+    fflush(stdout);
+
+    const time_t deadline = time(0) + seconds;
+    int remaining = seconds;
+    while (time(0) < deadline) {
+        fprintf(stdout, "%d...", remaining);
+#ifdef WIN32
+        Sleep(1000);
+#else
+        sleep(1);
+#endif
+        --remaining;
+    }
+
+    fprintf(stdout,"done!\n\n");
+    fflush(stdout);
+}
+
+
+// Print the standardized QA result line for iStatus (an index into
+// FAILED / PASSED / WAIVED) and then decide how to pause before returning:
+//   - no prompt option given : timed 3 second countdown
+//   - "noprompt" or "help"   : return immediately
+//   - "prompt"               : wait for <Enter>
+// "qatest" selects the "&&&&" output format that echoes all arguments.
+// Options are matched case-insensitively with leading '-' characters skipped.
+inline void __shrQAFinish(int argc, const char **argv, int iStatus)
+{
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    bool bQATest = false;
+    bool bWaitForKey = false;   // set by "prompt", cleared by "noprompt"/"help"
+    bool bTimedExit = true;     // default when no prompt option is given
+
+    for (int iArg = 1; iArg < argc; ++iArg) {
+        const char *pszArg = argv[iArg];
+        while (*pszArg == '-') {
+            ++pszArg;
+        }
+
+        if (STRCASECMP(pszArg, "qatest") == 0) {
+            bQATest = true;
+        }
+        // For SDK individual samples that don't specify -noprompt or -prompt,
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (STRCASECMP(pszArg, "noprompt") == 0 || STRCASECMP(pszArg, "help") == 0) {
+            bWaitForKey = false;
+            bTimedExit = false;
+        }
+        if (STRCASECMP(pszArg, "prompt") == 0) {
+            bWaitForKey = true;
+            bTimedExit = false;
+        }
+    }
+
+    const char *pszExeName = argv[0] + findExeNameStart(argv[0]);
+    if (!bQATest) {
+        fprintf(stdout, "[%s] test results...\n%s\n", pszExeName, sStatus[iStatus]);
+    } else {
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], pszExeName);
+        for (int iArg = 1; iArg < argc; ++iArg) {
+            fprintf(stdout, " %s", argv[iArg]);
+        }
+        fprintf(stdout, "\n");
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+
+    if (bTimedExit) {
+        __ExitInTime(3);
+    } else if (bWaitForKey) {
+        fprintf(stdout, "\nPress <Enter> to exit...\n");
+        fflush(stdout);
+        getchar();
+    }
+}
+
+// Variant of __shrQAFinish where the caller supplies the QA-test flag.
+// Prints the result line for iStatus and, unless "noprompt", "help" or
+// "prompt" appears among the arguments, runs the 3 second exit countdown.
+inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
+{
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+
+    // For SDK individual samples that don't specify -noprompt or -prompt,
+    // a 3 second delay will happen before exiting, giving a user time to view results
+    bool bTimedExit = true;
+    for (int iArg = 1; iArg < argc; ++iArg) {
+        const char *pszArg = argv[iArg];
+        while (*pszArg == '-') {
+            ++pszArg;
+        }
+
+        const bool bPromptOption = (STRCASECMP(pszArg, "noprompt") == 0) ||
+                                   (STRCASECMP(pszArg, "help") == 0) ||
+                                   (STRCASECMP(pszArg, "prompt") == 0);
+        if (bPromptOption) {
+            bTimedExit = false;
+        }
+    }
+
+    const char *pszExeName = argv[0] + findExeNameStart(argv[0]);
+    if (!bQATest) {
+        fprintf(stdout, "[%s] test results...\n%s\n", pszExeName, sStatus[iStatus]);
+    } else {
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], pszExeName);
+        for (int iArg = 1; iArg < argc; ++iArg) {
+            fprintf(stdout, " %s", argv[iArg]);
+        }
+        fprintf(stdout, "\n");
+    }
+    fflush(stdout);
+
+    if (bTimedExit) {
+        __ExitInTime(3);
+    }
+}
+
+// Report the QA result through __shrQAFinish, then end the process:
+// EXIT_FAILURE for a status of 0, EXIT_SUCCESS for any other status.
+inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish(argc, argv, iStatus);
+
+    const int iExitCode = (iStatus != 0) ? EXIT_SUCCESS : EXIT_FAILURE;
+    exit(iExitCode);
+}
+
+// Report the QA result through __shrQAFinish2 (caller-supplied QA-test flag),
+// then end the process: EXIT_FAILURE for a status of 0, EXIT_SUCCESS otherwise.
+inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish2(bQAtest, argc, argv, iStatus);
+
+    const int iExitCode = (iStatus != 0) ? EXIT_SUCCESS : EXIT_FAILURE;
+    exit(iExitCode);
+}
+
+#endif
--- a/benchmarks/opencl/transpose/shrUtils.h
+++ b/benchmarks/opencl/transpose/shrUtils.h
@ -0,0 +1,642 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_UTILS_H
+#define SHR_UTILS_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// reminders for output window and build log
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+#endif
+
+// OS dependent includes
+#ifdef _WIN32
+    // Headers needed for Windows
+    #include <windows.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+#endif
+
+// Other headers needed for both Windows and Linux
+#include <math.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Un-comment the following #define to enable profiling code in SDK apps
+//#define GPU_PROFILING
+
+// Beginning of GPU Architecture definitions
+// Map an SM (compute capability) version to the number of cores per SM.
+// @param major  SM major version
+// @param minor  SM minor version
+// @return cores per SM for a version listed in the table below; otherwise a
+//         message is printed to stdout and -1 is returned
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+	// One table row per known architecture
+	typedef struct {
+		int SM;    // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
+		int Cores; // number of cores per SM
+	} sSMtoCores;
+
+	const sSMtoCores nGpuArchCoresPerSM[] =
+	{
+		{ 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
+		{ 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
+		{ 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
+		{ 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
+		{ 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
+		{ 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
+		{ 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
+		{   -1,  -1 }  // end-of-table sentinel
+	};
+
+	// pack the requested version the same way the table keys are written
+	const int smVersion = (major << 4) + minor;
+
+	for (int row = 0; nGpuArchCoresPerSM[row].SM != -1; ++row) {
+		if (nGpuArchCoresPerSM[row].SM == smVersion) {
+			return nGpuArchCoresPerSM[row].Cores;
+		}
+	}
+
+	printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
+	return -1;
+}
+// end of GPU Architecture definitions
+
+
+// Defines and enum for use with logging functions
+// *********************************************************************
+#define DEFAULTLOGFILE "SdkConsoleLog.txt"
+#define MASTERLOGFILE "SdkMasterLog.csv"
+// Bit flags for the iLogMode argument of shrLogEx(); combine with |
+enum LOGMODES
+{
+    LOGCONSOLE = 1 << 0,               // bit to signal "log to console"
+    LOGFILE    = 1 << 1,               // bit to signal "log to file"
+    LOGBOTH    = LOGCONSOLE | LOGFILE, // convenience union of first 2 bits to signal "log to both"
+    APPENDMODE = 1 << 2,               // bit to set "file append" mode instead of "replace mode" on open
+    MASTER     = 1 << 3,               // bit to signal master .csv log output
+    ERRORMSG   = 1 << 4,               // bit to signal "pre-pend Error"
+    CLOSELOG   = 1 << 5                // bit to close log file, if open, after any requested file write
+};
+#define HDASHLINE "-----------------------------------------------------------\n"
+
+// Standardized boolean: result type of the shr* helper functions declared in this header
+enum shrBOOL { shrFALSE = 0, shrTRUE = 1 };
+
+// Standardized MAX, MIN and CLAMP
+// Every argument and every expansion is fully parenthesized so that operands
+// containing lower-precedence operators (e.g. MAX(x & mask, y)) group as written.
+// NOTE: arguments are still evaluated more than once - do not pass expressions
+// with side effects (i++, calls that change state).
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define CLAMP(a, b, c) MIN(MAX((a), (b)), (c))    // double sided clip of input a: lower bound b, upper bound c
+#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))	    // single top side clip of input a
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+//   a = sample value being checked, b = reference value it is compared against,
+//   c = cleanup callback of type void (*)(int), or 0 for none.
+//   The call site's __FILE__ and __LINE__ are forwarded for the error log.
+#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) 
+
+// Standardized Exit Macro for leaving main()... extended version
+// (Companion Inline Function lower on page)
+//   a = argc, b = argv, c = exit code handed to exit()
+#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
+
+// Standardized Exit Macro for leaving main()... short version
+// (Companion Inline Function lower on page)
+//   Same as shrExitEX with the exit code fixed to EXIT_SUCCESS
+#define shrEXIT(a, b)        __shrExitEX(a, b, EXIT_SUCCESS)
+
+// Simple argument checker macro
+#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE 
+
+// Define for user-customized error handling
+#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
+
+// Function to deallocate memory allocated within shrUtils
+// *********************************************************************
+extern "C" void shrFree(void* ptr);
+
+// *********************************************************************
+// Helper function to log standardized information to Console, to File or to both
+//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); 
+//!         : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+//! 
+//! Automatically opens file and stores handle if needed and not done yet
+//! Closes file and nulls handle on request
+//! 
+//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.  
+//!          LOGFILE and LOGBOTH may be | 'd  with APPENDMODE to select file append mode instead of overwrite mode 
+//!          LOGFILE and LOGBOTH may be | 'd  with CLOSELOG to "write and close" 
+//!          First 3 options may be | 'd  with MASTER to enable independent write to master data log file
+//!          First 3 options may be | 'd  with ERRORMSG to start line with standard error message
+//! @param 1 iErrNum (this comment previously called it "dValue"):
+//!          Positive val = value for time in secs to be formatted to 6 decimals 
+//!          (NOTE: the declared parameter is an int, not a double - verify against the implementation). 
+//!          Negative val is an error code and this gives error preformatting.
+//! @param 2 cFormatString: String with formatting specifiers like printf or fprintf.  
+//!          ALL printf flags, width, precision and type specifiers are supported with this exception: 
+//!              Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
+//!              Single byte char type specifiers (%s and %c) ARE supported 
+//! @param 3... variable args: like printf or fprintf.  Must match format specifier type above.  
+//! @return 0 if OK, negative value on error or if error occurs or was passed in. 
+// *********************************************************************
+extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
+
+// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, cFormatString, ...)
+// *********************************************************************
+extern "C" int shrLog(const char* cFormatString, ...);
+
+// *********************************************************************
+// Delta timer function for up to 3 independent timers using host high performance counters 
+// Maintains state for 3 independent counters
+//! Example: double dElapsedTime = shrDeltaT(0);
+//! 
+//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
+//! @return delta time of specified counter since last call in seconds.  Otherwise -9999.0 if error
+// *********************************************************************
+extern "C" double shrDeltaT(int iCounterID);
+
+// Optional LogFileNameOverride function
+// *********************************************************************
+extern "C" void shrSetLogFileName (const char* cOverRideName);
+
+// Helper function to init data arrays 
+// *********************************************************************
+extern "C" void shrFillArray(float* pfData, int iSize);
+
+// Helper function to print data arrays 
+// *********************************************************************
+extern "C" void shrPrintArray(float* pfData, int iSize);
+
+////////////////////////////////////////////////////////////////////////////
+//! Find the path for a filename
+//! @return the path if succeeded, otherwise 0
+//! @param filename        name of the file
+//! @param executablePath  optional absolute path of the executable
+////////////////////////////////////////////////////////////////////////////
+extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing single precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing double precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing single precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
+               const float epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing double precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
+               const double epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, 
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, 
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PPM image file (with unsigned char as data element type), padding 
+//! 4th component
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param OutData  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! 
+//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, 
+                             unsigned int *w, unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+//! Save PPM image file (with unsigned char as data element type, padded to 
+//! 4 bytes)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, 
+               unsigned int w, unsigned int h);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Save PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, 
+              unsigned int w, unsigned int h); 
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
+                  unsigned int *w,unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+// Command line arguments: General notes
+// * All command line arguments begin with '--' followed by the token; 
+//   token and value are separated by '='; example --samples=50
+// * Arrays have the form --model=[one.obj,two.obj,three.obj] 
+//   (without whitespaces)
+////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////
+//! Check if command line argument \a flag-name is given
+//! @return shrTRUE if command line argument \a flag_name has been given, 
+//!         otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param flag_name  name of command line flag
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, 
+                     const char* flag_name);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, 
+                        const char* arg_name, int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type unsigned int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, 
+                        const char* arg_name, unsigned int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type float
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, 
+                        const char* arg_name, float* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type string
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, 
+                          const char* arg_name, char** val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument list whose elements are strings
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  command line argument list
+//! @param len  length of the list / number of elements
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, 
+                              const char* arg_name, char** val, 
+                              unsigned int* len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparef( const float* reference, const float* data,
+             const unsigned int len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparei( const int* reference, const int* data, 
+             const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned integer arrays, with epsilon and threshold
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
+            const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
+              const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with a tolerance for # of byte errors
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
+              const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality and a 
+//!     threshold for # pixel errors
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays using L2-norm with an epsilon tolerance for 
+//! equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
+                const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PPM image files with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! @note verboseErrors (output details of image mismatch to std::err) is documented here but is not a parameter of the declaration below
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PGM image files with an epsilon tolerance for equality
+//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param ref_file   filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! @note verboseErrors (output details of image mismatch to std::err) is documented here but is not a parameter of the declaration below
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
+
+extern "C" size_t shrRoundUp(int group_size, int global_size);
+
+// Companion inline function of the shrCheckErrorEX / shrCheckError macros.
+// Compares iSample with iReference; on a mismatch it logs the source location
+// as an error and then either hands control to the cleanup callback or exits.
+//   iSample     value being checked (also passed to shrLogEx as the error number)
+//   iReference  value iSample is expected to equal
+//   pCleanup    optional callback, called with EXIT_FAILURE on mismatch; when NULL
+//               the process logs "Exiting..." and calls exit(EXIT_FAILURE)
+//   cFile/iLine source location reported in the log
+// *********************************************************************
+inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // values agree: nothing to report
+    if (iSample == iReference)
+    {
+        return;
+    }
+
+    shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
+
+    // no callback supplied: close the log and terminate here
+    if (pCleanup == NULL)
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+        exit(EXIT_FAILURE);
+    }
+
+    pCleanup(EXIT_FAILURE);
+}
+
+// Standardized Exit
+// Companion inline function of the shrExitEX / shrEXIT macros.
+// Logs a final message with CLOSELOG set, optionally waits for <Enter>, then
+// calls exit(iExitCode).
+//   Windows (_WIN32): waits unless "noprompt" or "qatest" is on the command line
+//   elsewhere:        waits only if "prompt" is given and "qatest" is not
+//   argc, argv  command line as passed to main(); argv[0] is logged in the non-waiting path
+//   iExitCode   value handed to exit()
+// *********************************************************************
+inline void __shrExitEX(int argc, const char** argv, int iExitCode)
+{
+// _WIN32 is the compiler-predefined macro and is what the other platform checks
+// in this header test; plain WIN32 is only defined by project settings / SDK headers
+#ifdef _WIN32
+    if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#else 
+    if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#endif
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
+        getchar();
+    }
+    else 
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]); 
+    }
+    fflush(stderr);
+    exit(iExitCode);
+}
+
+#endif
--- a/benchmarks/opencl/transpose/transpose.cl
+++ b/benchmarks/opencl/transpose/transpose.cl
@ -0,0 +1,108 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* Matrix transpose with OpenCL
+* Device code.
+*/
+
+#define BLOCK_DIM 16
+
+// This kernel is optimized to ensure all global reads and writes are coalesced,
+// and to avoid bank conflicts in shared memory.  This kernel is up to 11x faster
+// than the naive kernel below.  Note that the shared memory array is sized to 
+// (BLOCK_DIM+1)*BLOCK_DIM.  This pads each row of the 2D block in shared memory 
+// so that bank conflicts do not occur when threads address the array column-wise.
+//
+// Arguments:
+//   odata  - output buffer, written at yIndex * height + xIndex (transposed layout)
+//   idata  - input buffer, read at yIndex * width + xIndex + offset
+//   offset - added to the x coordinate on the input side
+//   width, height - extents used for the bounds checks and row strides
+//   block  - __local scratch tile, indexed with a row stride of BLOCK_DIM+1
+__kernel void transpose(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
+{
+	// read the matrix tile into shared memory
+	unsigned int xIndex = get_global_id(0);
+	unsigned int yIndex = get_global_id(1);
+
+	if((xIndex + offset < width) && (yIndex < height))
+	{
+		unsigned int index_in = yIndex * width + xIndex + offset;
+		// tile element [local y][local x]
+		block[get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0)] = idata[index_in];
+	}
+
+	// every work-item reaches this barrier (it is outside both conditionals), so the
+	// tile writes above are visible before the reads below
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// write the transposed matrix tile to global memory
+	// the group ids are swapped relative to the read phase, the local ids are not
+	xIndex = get_group_id(1) * BLOCK_DIM + get_local_id(0);
+	yIndex = get_group_id(0) * BLOCK_DIM + get_local_id(1);
+	if((xIndex < height) && (yIndex + offset < width))
+    {
+		unsigned int index_out = yIndex * height + xIndex;
+		// tile element [local x][local y]: the local ids are swapped on the read side
+		odata[index_out] = block[get_local_id(0)*(BLOCK_DIM+1)+get_local_id(1)];
+	}
+}
+
+
+
+// Straightforward transpose: each work-item moves one element from
+// idata[row * width + col + offset] to odata[col * height + row].
+// The writes are completely non-coalesced, so this can be up to 10x slower
+// than the tiled kernel above for large matrices.
+__kernel void transpose_naive(__global float *odata, __global float* idata, int offset, int width, int height)
+{
+    const unsigned int col = get_global_id(0);
+    const unsigned int row = get_global_id(1);
+    const unsigned int srcCol = col + offset;
+
+    // Work-items that fall outside the matrix have nothing to move.
+    if (srcCol >= width || row >= height)
+    {
+        return;
+    }
+
+    odata[row + height * col] = idata[srcCol + width * row];
+}
+
+
+// Plain element-wise copy: odata[i] = idata[i] with i = row * width + col + offset.
+__kernel void simple_copy(__global float *odata, __global float* idata, int offset, int width, int height)
+{
+    const unsigned int col = get_global_id(0);
+    const unsigned int row = get_global_id(1);
+    const unsigned int srcCol = col + offset;
+
+    // Work-items that fall outside the matrix have nothing to copy.
+    if (srcCol >= width || row >= height)
+    {
+        return;
+    }
+
+    const unsigned int index = srcCol + width * row;
+    odata[index] = idata[index];
+}
+
+// Element-wise copy staged through local memory: each work-item loads one element of
+// idata into the padded __local tile, waits at a barrier, then writes that same
+// element to odata at the same linear index (no transpose).
+//
+// Arguments:
+//   odata, idata  - output / input buffers, both addressed at row * width + col + offset
+//   offset        - added to the x coordinate
+//   width, height - matrix extents used for the bounds check and row stride
+//   block         - __local scratch tile, indexed with a row stride of BLOCK_DIM+1
+__kernel void shared_copy(__global float *odata, __global float *idata, int offset, int width, int height, __local float* block)
+{
+	unsigned int xIndex = get_global_id(0);
+	unsigned int yIndex = get_global_id(1);
+
+	unsigned int index_in = yIndex * width + xIndex + offset;
+	unsigned int tile_index = get_local_id(1)*(BLOCK_DIM+1)+get_local_id(0);
+
+	// The element that is read is the element that is written, so one bounds test must
+	// guard both sides. The previous write guard used the transposed bounds
+	// (xIndex < height, yIndex + offset < width), which skipped valid elements or allowed
+	// out-of-range writes whenever width != height or offset != 0.
+	const int in_bounds = (xIndex + offset < width) && (yIndex < height);
+
+	// read the matrix tile into shared memory
+	if(in_bounds)
+	{
+		block[tile_index] = idata[index_in];
+	}
+
+	// kept outside the conditionals so every work-item in the group reaches it
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	if(in_bounds)
+	{
+		odata[index_in] = block[tile_index];
+	}
+}
+
+
+// Element-wise copy that addresses both buffers column-major:
+// index = row + height * (col + offset), so work-items adjacent in x are
+// `height` elements apart in memory.
+__kernel void uncoalesced_copy(__global float *odata, __global float* idata, int offset, int width, int height)
+{
+    const unsigned int col = get_global_id(0);
+    const unsigned int row = get_global_id(1);
+    const unsigned int srcCol = col + offset;
+
+    // Work-items that fall outside the matrix have nothing to copy.
+    if (srcCol >= width || row >= height)
+    {
+        return;
+    }
+
+    const unsigned int index = row + height * srcCol;
+    odata[index] = idata[index];
+}
--- a/benchmarks/opencl/transpose/transpose_gold.cpp
+++ b/benchmarks/opencl/transpose/transpose_gold.cpp
@ -0,0 +1,38 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/* Small Matrix transpose with Cuda (Example for a 16x16 matrix)
+* Reference solution.
+*/
+
+////////////////////////////////////////////////////////////////////////////////
+// export C interface
+extern "C" 
+void computeGold( float* reference, float* idata, 
+                  const unsigned int size_x, const unsigned int size_y );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute reference data set: CPU transpose of a row-major matrix
+//! @param reference  output, size_x rows of size_y elements
+//! @param idata      input, size_y rows of size_x elements
+//! @param size_x     number of columns in the input
+//! @param size_y     number of rows in the input
+////////////////////////////////////////////////////////////////////////////////
+void
+computeGold( float* reference, float* idata, 
+            const unsigned int size_x, const unsigned int size_y ) 
+{
+    // walk the input row by row; element (row, col) lands at (col, row)
+    for( unsigned int row = 0; row < size_y; ++row) 
+    {
+        for( unsigned int col = 0; col < size_x; ++col) 
+        {
+            const unsigned int src = (row * size_x) + col;
+            const unsigned int dst = (col * size_y) + row;
+            reference[dst] = idata[src];
+        }
+    }
+}
+
--- a/benchmarks/vector/saxpy/Makefile
+++ b/benchmarks/vector/saxpy/Makefile
@ -0,0 +1,33 @@
+# Builds the vector saxpy benchmark (ELF image, Intel-hex image and disassembly).
+
+LIB_PATH = ../../../runtime
+
+# Root of the RISC-V GNU toolchain install. The default is the original author's
+# location; override it from the command line or the environment, e.g.
+#   make RISCV_TOOLCHAIN_PATH=/opt/riscv
+RISCV_TOOLCHAIN_PATH ?= /nethome/ekim79/riscv-gnu-toolchain/drops
+
+COMP     = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
+
+CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/mains/vortex_link.ld -march=rv32imv -mabi=ilp32
+
+DMP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
+CPY  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
+
+NEWLIB  = $(LIB_PATH)/newlib/newlib.c
+VX_STR  = $(LIB_PATH)/startup/vx_start.s
+VX_INT  = $(LIB_PATH)/intrinsics/vx_intrinsics.s
+VX_IO   = $(LIB_PATH)/io/vx_io.s $(LIB_PATH)/io/vx_io.c
+VX_API  = $(LIB_PATH)/vx_api/vx_api.c
+VX_TEST = $(LIB_PATH)/tests/tests.c
+VX_FIO  = $(LIB_PATH)/fileio/fileio.s
+# Vector kernel (float --> int variant). The note lives on its own line so that no
+# trailing whitespace ends up in the variable's value.
+VX_VEC  = vx_vec_saxpy.s
+LIBS    = $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf/lib/libc.a $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
+
+VX_MAIN = vx_vec_saxpy
+
+# None of these goals is the name of a file its recipe creates; declaring them phony
+# keeps a stray file called ELF, HEX, DUMP or all from silently disabling the build.
+.PHONY: all DUMP HEX ELF
+
+all: HEX DUMP ELF
+
+# Full disassembly of the linked image.
+DUMP: ELF
+	$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump
+
+# Intel-hex image of the linked program.
+HEX: ELF
+	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
+
+# Compile and link the runtime sources, the vector kernel and the benchmark main.
+ELF:
+	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude  -o $(VX_MAIN).elf
--- a/benchmarks/vector/saxpy/vx_vec_saxpy.c
+++ b/benchmarks/vector/saxpy/vx_vec_saxpy.c
@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "../../../runtime/intrinsics/vx_intrinsics.h"
+#include "vx_vec_saxpy.h"
+
+//---------------------------------------------------------------
+/* # void saxpy(size_t n, const float a, const float *x, float *y)
+   # ==> convert to int!!
+   # void saxpy(size_t n, const int a, const int *x, int *y)
+   # { size_t i;
+   #   for (i=0; i<n; i++) y[i] = a * x[i] + y[i];  }           */
+//---------------------------------------------------------------
+
+// Test driver for the integer saxpy kernel: fills a[] with pseudo-random values,
+// runs vx_vec_saxpy(n, factor, a, b) and checks b[i] == a[i] * factor + c[i].
+// Returns 0 on success and 1 at the first mismatching index.
+int main()
+{
+    vx_tmc(1);
+
+    int n = 4; //#define NUM_DATA 65536
+
+    int *a = (int*)malloc(sizeof(int) * n); 
+    int *b = (int*)malloc(sizeof(int) * n); 
+    int *c = (int*)malloc(sizeof(int) * n); //verification
+
+    //  float factor = ((float)rand()/(float)(RAND_MAX)) * 100.0;
+    // the floating-point expression is truncated to int on assignment
+    int factor = ((float)rand()/(RAND_MAX)) * 100.0;
+
+    // b and c both start at zero, so c keeps b's pre-call contents for the check below
+    for (int i = 0; i < n; ++i) { 
+        a[i] = ((float)rand()/(RAND_MAX)) * 100.0;
+        b[i] = 0; 
+        c[i] = 0;
+    }
+  //; c[i] = 2;}
+
+#if 1
+    printf("saxpy\nfactor: %d\na[%d]: ", factor, n);
+    for(int i = 0; i < n; ++i) printf("%d ", a[i]);
+//    printf("\nb[%d]: ", n);
+//    for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
+#endif
+
+    vx_vec_saxpy(n, factor, a, b);
+
+#if 1
+    printf("\nsaxpy\na[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d ", a[i]);
+    printf("\n\nb[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d ", b[i]);
+#endif
+
+    // compare the kernel's output with the scalar formula
+    for(int i = 0; i < n; ++i) 
+    {
+        if(b[i] != ((a[i] * factor) + c[i])) 
+        {
+           printf("\n<saxpy> FAILED at <index: %d>! \n", i);
+           // NOTE(review): this path returns without free() or vx_tmc(0) - confirm that is intended
+           return 1;   
+        }
+    }
+    
+    printf("\nPASSED.......................... <saxpy> \n");
+
+
+    free(a); free(b); free(c);
+
+    vx_tmc(0);
+
+    return 0;
+
+}
--- a/benchmarks/vector/saxpy/vx_vec_saxpy.dump
+++ b/benchmarks/vector/saxpy/vx_vec_saxpy.dump
--- a/benchmarks/vector/saxpy/vx_vec_saxpy.elf
+++ b/benchmarks/vector/saxpy/vx_vec_saxpy.elf
--- a/benchmarks/vector/saxpy/vx_vec_saxpy.h
+++ b/benchmarks/vector/saxpy/vx_vec_saxpy.h
@ -0,0 +1,12 @@
+#pragma once
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Integer saxpy kernel implemented in vx_vec_saxpy.s:
+//   b[i] = scalar * a[i] + b[i]   for i in [0, n)
+// n: element count, scalar: multiplier, a: input vector, b: input/output vector.
+void vx_vec_saxpy(int n, int scalar, int* a, int* b);
+
+#ifdef __cplusplus
+}
+#endif                   
--- a/benchmarks/vector/saxpy/vx_vec_saxpy.hex
+++ b/benchmarks/vector/saxpy/vx_vec_saxpy.hex
--- a/benchmarks/vector/saxpy/vx_vec_saxpy.s
+++ b/benchmarks/vector/saxpy/vx_vec_saxpy.s
@ -0,0 +1,26 @@
+.type vx_vec_saxpy, @function
+.global vx_vec_saxpy
+# void vx_vec_saxpy(int n, int factor, int *a, int *b)
+# {  for (int i=0; i<n; i++) { b[i] = factor * a[i] + b[i]; }  }
+#
+# register arguments:
+#     a0      n       (elements still to process)
+#     a1      factor
+#     a2      a
+#     a3      b
+#
+# Strip-mined loop: vsetvli must run on every pass because a4 is rescaled from an
+# element count to a byte count inside the loop body. With vsetvli outside the loop,
+# the second and later passes subtracted and advanced by stale, already-shifted values.
+vx_vec_saxpy:
+loop:
+    vsetvli a4, a0, e32     # a4 = elements handled in this pass
+    vlw.v v0, (a2)          # v0 = a[...]
+    sub a0, a0, a4          # elements remaining
+    slli a4, a4, 2          # element count -> byte count
+    add a2, a2, a4          # advance a
+    vlw.v v1, (a3)          # v1 = b[...]
+    vmul.vx v0, v0, a1      # v0 = a[...] * factor
+    vadd.vv v1, v0, v1      # v1 = a[...] * factor + b[...]
+#   vmacc.vx v1, rs1, v0
+    vsw.v v1, (a3)          # store back into b
+    add a3, a3, a4          # advance b
+    bnez a0, loop
+    ret
--- a/benchmarks/vector/sgemm_nn/Makefile
+++ b/benchmarks/vector/sgemm_nn/Makefile
@ -0,0 +1,33 @@
+# Builds the vector sgemm_nn benchmark (ELF image, Intel-hex image and disassembly).
+
+LIB_PATH = ../../../runtime
+
+# Root of the RISC-V GNU toolchain install. The default is the original author's
+# location; override it from the command line or the environment, e.g.
+#   make RISCV_TOOLCHAIN_PATH=/opt/riscv
+RISCV_TOOLCHAIN_PATH ?= /home/priya/dev/riscv_vec/riscv-gnu
+
+COMP     = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
+
+CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/mains/vortex_link.ld -march=rv32imv -mabi=ilp32
+
+DMP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
+CPY  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
+
+NEWLIB  = $(LIB_PATH)/newlib/newlib.c
+VX_STR  = $(LIB_PATH)/startup/vx_start.s
+VX_INT  = $(LIB_PATH)/intrinsics/vx_intrinsics.s
+VX_IO   = $(LIB_PATH)/io/vx_io.s $(LIB_PATH)/io/vx_io.c
+VX_API  = $(LIB_PATH)/vx_api/vx_api.c
+VX_TEST = $(LIB_PATH)/tests/tests.c
+VX_FIO  = $(LIB_PATH)/fileio/fileio.s
+# Vector kernel (float --> int variant). The note lives on its own line so that no
+# trailing whitespace ends up in the variable's value.
+VX_VEC  = vx_vec_sgemm_nn.s
+LIBS    = $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf/lib/libc.a $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
+
+VX_MAIN = vx_vec_sgemm_nn
+
+# None of these goals is the name of a file its recipe creates; declaring them phony
+# keeps a stray file called ELF, HEX, DUMP or all from silently disabling the build.
+.PHONY: all DUMP HEX ELF
+
+all: HEX DUMP ELF
+
+# Full disassembly of the linked image.
+DUMP: ELF
+	$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump
+
+# Intel-hex image of the linked program.
+HEX: ELF
+	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
+
+# Compile and link the runtime sources, the vector kernel and the benchmark main.
+ELF:
+	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude  -o $(VX_MAIN).elf
--- a/benchmarks/vector/sgemm_nn/test_asm.s
+++ b/benchmarks/vector/sgemm_nn/test_asm.s
@ -0,0 +1,38 @@
+.type vx_vec_sgemm_nn, @function
+.global vx_vec_sgemm_nn
+#
+# One vector step of the integer sgemm_nn benchmark, matching the prototype in
+# vx_vec_sgemm_nn.h:
+#   void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int* b1, int* c1, int ldc, int vsize);
+#
+# register arguments:
+#   a0 = first index  (start column of the vector slice)
+#   a1 = second index
+#   a2 = third index
+#   a3 = a, a4 = b, a5 = c
+#   a6 = ldc (row stride in elements), a7 = vsize (requested vector length)
+#
+# effect, for the vl elements starting at column a0:
+#   c[a2*ldc + a0 ...] += a[a2*ldc + a1] * b[a1*ldc + a0 ...]
+#
+# Only caller-saved temporaries t0-t6 are used as scratch. The earlier draft used
+# x1-x4, which are ra/sp/gp/tp, so its final `ret` jumped through a clobbered ra.
+# It also added element indices to pointers without scaling by 4, multiplied the
+# never-loaded v1, and never stored the result.
+vx_vec_sgemm_nn:
+    vsetvli t0, a7, e32
+    mul t1, a6, a2      # a2*ldc
+    add t2, t1, a1      # a2*ldc + a1
+    slli t2, t2, 2      # element index -> byte offset
+    add a3, t2, a3      # &a[a2*ldc + a1]
+    lw t3, (a3)         # scalar multiplier
+
+    mul t4, a1, a6      # a1*ldc
+    add t5, a0, t4      # a1*ldc + a0
+    slli t5, t5, 2      # element index -> byte offset
+    add a4, t5, a4      # &b[a1*ldc + a0]
+
+    vlw.v v0, (a4)      # v0 = b slice
+    vmul.vx v1, v0, t3  # v1 = b slice * scalar
+
+    mul t6, a2, a6      # a2*ldc
+    add t0, a0, t6      # a2*ldc + a0 (t0 is free again after vsetvli)
+    slli t0, t0, 2      # element index -> byte offset
+    add a5, t0, a5      # &c[a2*ldc + a0]
+
+    vlw.v v2, (a5)      # v2 = c slice
+    vadd.vv v2, v2, v1  # accumulate
+    vsw.v v2, (a5)      # write the slice back to c
+
+    ret
--- a/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.c
+++ b/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.c
@ -0,0 +1,110 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "../../../runtime/intrinsics/vx_intrinsics.h"
+#include "vx_vec_sgemm_nn.h"
+
+//---------------------------------------------------------------
+/* # void sgemm_nn(size_t n, size_t m, size_t k, 
+#          int *a,   // m * k matri size_t lda, 
+#          int *b,   // k * n matrix  size_t ldb, 
+#          int *c,   // m * n matrix  size_t ldc)
+#  c += a*b (alpha=1, no transpose on input matrices)
+#  matrices stored in C row-major order */
+//---------------------------------------------------------------
+
+// Test driver for the integer sgemm_nn vector kernel.
+// Builds 4x4 inputs (a1[i] = i, b1 all ones), accumulates into c1 through repeated
+// vx_vec_sgemm_nn() calls, computes the same sums into d1 with a scalar loop, and
+// compares the two. Returns 0 on success and 1 at the first mismatching index.
+int main()
+{
+    vx_tmc(1);
+
+    int m = 4;
+    int k = 4;
+    int n = 4;
+
+    int* a1 = (int*)malloc(sizeof(int) * m * k);
+    int* b1 = (int*)malloc(sizeof(int) * k * n);
+    int* c1 = (int*)malloc(sizeof(int) * m * n);
+    int* d1 = (int*)malloc(sizeof(int) * m * n); //verification
+
+    for (int i = 0; i < (m * k); ++i) a1[i] = i;
+    for (int i = 0; i < (k * n); ++i) b1[i] = 1;
+    for (int i = 0; i < (m * n); ++i) c1[i] = 0;
+    for (int i = 0; i < (m * n); ++i) d1[i] = 0;
+
+
+#if 1
+    printf("sgemm_nn\na[%d]:", m*k);
+    for (int i = 0; i < m*k; ++i) {
+        if(!(i % k)) printf("\n");
+        printf("%d ", a1[i]);
+    }
+    printf("\n\nb[%d]:", k*n);
+    for (int i = 0; i < k*n; ++i) {
+        if (!(i % n)) printf("\n");
+        printf("%d ", b1[i]);
+    }
+#endif
+
+    // The kernel takes a single row stride (ldc) for all three matrices; the unused
+    // lda/ldb locals were dropped.
+    int ldc = 4; //64;
+    int vsize = 4; // elements requested per vector call
+
+    // vectorised accumulation: every call handles vsize consecutive columns starting at i
+    for (int r = 0; r < m; r++) {
+        for (int c = 0; c < n; c++) {
+            for (int i = 0; i < k;) {
+                vx_vec_sgemm_nn(i, r, c, a1, b1, c1, ldc, vsize);
+                i = i + vsize;
+            }
+        }
+    }
+
+#if 1
+    printf("\n\nc[%d]:", m*n);
+    for (int i = 0; i < m*n; ++i) {
+        if (!(i % n)) printf("\n");
+        printf("%d ", c1[i]);
+    }
+#endif
+
+    // scalar reference using the same index pattern as the kernel
+    for (int r = 0; r < m; r++) {
+        for (int c = 0; c < n; c++) {
+            for (int i = 0; i < k; i++) {
+                d1[c*ldc+i] += a1[c*ldc+r]*b1[i + (r*ldc)];
+            }
+        }
+    }
+
+#if 1
+    // d1 is an m x n row-major matrix, so its row stride is n (the old code used m,
+    // which is only right while m == n)
+    printf("\n\nc[%d]:\n", m*n);
+    for(int i = 0; i < m; ++i) {
+        for(int j = 0; j < n; ++j) {
+            printf("%d ", d1[i*n+j]);
+        }
+        printf("\n");
+    }
+#endif
+
+
+    for(int i = 0; i < m*n; ++i)
+    {
+        if(c1[i] != d1[i])
+        {
+           printf("\n<sgemm_nn> FAILED at <index: %d>! \n", i);
+           return 1;
+        }
+    }
+
+    printf("\nPASS.......................... <sgemm_nn> \n");
+
+
+    // d1 was previously leaked
+    free(a1); free(b1); free(c1); free(d1);
+
+    vx_tmc(0);
+
+    return 0;
+
+}
--- a/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.dump
+++ b/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.dump
--- a/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.elf
+++ b/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.elf
--- a/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.h
+++ b/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.h
@ -0,0 +1,13 @@
+#pragma once
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
+// One vector step of the integer sgemm_nn benchmark (implemented in vx_vec_sgemm_nn.s).
+// NOTE(review): the driver in vx_vec_sgemm_nn.c passes (i, r, c, ...) for the first three
+// arguments, i.e. loop indices rather than matrix dimensions - confirm the intended names.
+// a1, b1, c1: matrices; ldc: row stride in elements; vsize: requested vector length.
+void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int* b1, int* c1, int ldc, int vsize);
+//void vx_vec_sgemm_nn(int n, int* a1, int* b1, int* c1);
+#ifdef __cplusplus
+}
+#endif                   
--- a/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.hex
+++ b/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.hex
--- a/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.s
+++ b/benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.s
@ -0,0 +1,42 @@
+.type vx_vec_sgemm_nn, @function
+.global vx_vec_sgemm_nn
+#
+# One vector step of the integer sgemm_nn benchmark. The driver in
+# vx_vec_sgemm_nn.c calls it as
+#     vx_vec_sgemm_nn(i, r, c, a1, b1, c1, ldc, vsize);
+# inside loops over r, c and i (i advancing by vsize).
+#
+# For the vl elements starting at column a0, each call performs
+#     c[a2*ldc + a0 ...] += a[a2*ldc + a1] * b[a1*ldc + a0 ...]
+#
+# register arguments:
+# a0 = first index, a1 = second index, a2 = third index
+# a3 = a, a4 = b, a5 = c
+# a6 = ldc, a7 = requested vector length
+vx_vec_sgemm_nn:
+    vsetvli t0, a7, e32
+     mul t1, a6, a2  # a2*ldc
+     add t2, t1, a1  # a2*ldc + a1
+     slli t2, t2, 2  # element index -> byte offset
+     add a3, t2, a3  # &a[a2*ldc + a1]
+     lw t3, (a3)     # scalar multiplier
+
+     mul t4, a1, a6  # a1*ldc
+     add t5, a0, t4  # a0 + a1*ldc
+     slli t5, t5, 2  # element index -> byte offset
+     add a4, t5, a4  # &b[a0 + a1*ldc]
+ #   lw x6, (a4)
+
+     vlw.v v0, (a4)  # v0 = b slice
+     vmul.vx v1, v0, t3  # v1 = b slice * scalar
+ 
+     mul t6, a2, a6  # a2*ldc
+     add t0, a0, t6  # a0 + a2*ldc (overwrites the vl value left in t0 by vsetvli)
+     slli t0, t0, 2  # element index -> byte offset
+     add a5, t0, a5  # &c[a0 + a2*ldc]
+
+     vlw.v v2, (a5) # v2 = c slice
+     vadd.vv v2, v2, v1  # accumulate
+     vsw.v v2, (a5)  # write the slice back to c
+
+    ret
--- a/benchmarks/vector/vecadd/Makefile
+++ b/benchmarks/vector/vecadd/Makefile
@ -0,0 +1,41 @@
+# Builds the vector vecadd benchmark (ELF image, Intel-hex image and disassembly).
+
+LIB_PATH = ../../../runtime
+
+# Root of the RISC-V GNU toolchain install. The default is the original author's
+# location; override it from the command line or the environment, e.g.
+#   make RISCV_TOOLCHAIN_PATH=/opt/riscv
+RISCV_TOOLCHAIN_PATH ?= /nethome/ekim79/riscv-gnu-toolchain/drops
+
+COMP     = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
+
+CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/mains/vortex_link.ld -march=rv32imv -mabi=ilp32
+
+DMP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
+CPY  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
+
+NEWLIB  = $(LIB_PATH)/newlib/newlib.c
+VX_STR  = $(LIB_PATH)/startup/vx_start.s
+VX_INT  = $(LIB_PATH)/intrinsics/vx_intrinsics.s
+VX_IO   = $(LIB_PATH)/io/vx_io.s $(LIB_PATH)/io/vx_io.c
+VX_API  = $(LIB_PATH)/vx_api/vx_api.c
+VX_TEST = $(LIB_PATH)/tests/tests.c
+VX_FIO  = $(LIB_PATH)/fileio/fileio.s
+# Vector kernel linked into this benchmark. The commented-out VX_VEC2..VX_VEC5
+# alternatives (saxpy, sgemm, vsadd, memcpy) were removed as dead configuration.
+VX_VEC1 = vx_vec_vvaddint32.s
+LIBS    = $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf/lib/libc.a $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
+
+VX_MAIN = vx_vec_vecadd
+
+# None of these goals is the name of a file its recipe creates; declaring them phony
+# keeps a stray file called ELF, HEX, DUMP or all from silently disabling the build.
+.PHONY: all DUMP HEX ELF
+
+all: HEX DUMP ELF
+
+# Full disassembly of the linked image.
+DUMP: ELF
+	$(DMP) -D $(VX_MAIN).elf > $(VX_MAIN).dump
+
+# Intel-hex image of the linked program.
+HEX: ELF
+	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex
+
+# Compile and link the runtime sources, the vector kernel and the benchmark main.
+ELF:
+	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude  -o $(VX_MAIN).elf
--- a/benchmarks/vector/vecadd/vx_vec_vecadd.c
+++ b/benchmarks/vector/vecadd/vx_vec_vecadd.c
@ -0,0 +1,57 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "../../../runtime/intrinsics/vx_intrinsics.h"
+#include "vx_vec_vecadd.h"
+
+//---------------------------------------------------------------
+/* vvaddint32
+ * # vector-vector add routine of 32-bit integers
+ * # void vvaddint32(size_t n, const int*x, const int*y, int*z)
+ * # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } } */
+//---------------------------------------------------------------
+
+// Test driver for the 32-bit integer vector add kernel: fills a[] and b[], runs
+// vx_vec_vvaddint32(n, a, b, c) and checks c[i] == a[i] + b[i].
+// Returns 0 on success and 1 at the first mismatching index.
+int main()
+{
+    vx_tmc(1);
+
+    int n = 4; //SIZE
+
+    int *a = (int*)malloc(sizeof(int) * n); 
+    int *b = (int*)malloc(sizeof(int) * n); 
+    int *c = (int*)malloc(sizeof(int) * n); 
+
+    // Initialize values for array members.  
+    // a holds the even numbers, b the odd numbers, c starts cleared
+    for (int i = 0; i < n; ++i) {
+       a[i] = i * 2 + 0;
+       b[i] = i * 2 + 1;
+       c[i] = 0;
+    }
+
+#if 0
+    printf("vvaddint...\na[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d ", a[i]);
+    printf("\nb[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d ", b[i]);
+    printf("\nc[%d] = a[%d] + b[%d]: ", n, n, n);
+    for(int i = 0; i < n; ++i) printf("%d ", c[i]);
+#endif
+
+    vx_vec_vvaddint32(n, a, b, c);
+
+    // compare the kernel's output with the scalar sum
+    for(int i = 0; i < n; ++i) 
+    {
+        if(c[i] != (a[i]+b[i])) 
+        {
+           printf("\n<vddint32> FAILED at <index: %d>! \n", i);
+           // NOTE(review): this path returns without free() or vx_tmc(0) - confirm that is intended
+           return 1;   
+        }
+    }
+    printf("\nPASSED.......................... <vddint32> \n");
+
+    free(a); free(b); free(c);
+
+    vx_tmc(0);
+
+    return 0;
+
+}
--- a/benchmarks/vector/vecadd/vx_vec_vecadd.dump
+++ b/benchmarks/vector/vecadd/vx_vec_vecadd.dump
--- a/benchmarks/vector/vecadd/vx_vec_vecadd.elf
+++ b/benchmarks/vector/vecadd/vx_vec_vecadd.elf
--- a/benchmarks/vector/vecadd/vx_vec_vecadd.h
+++ b/benchmarks/vector/vecadd/vx_vec_vecadd.h
@ -0,0 +1,17 @@
+#pragma once
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// 32-bit integer vector add implemented in vx_vec_vvaddint32.s:
+//   c[i] = a[i] + b[i]   for i in [0, n)
+void vx_vec_vvaddint32(int n, int* a, int* b, int *c);
+// The prototypes below are disabled; they name kernels that are not part of this benchmark.
+//void vx_vec_vsadd(int n, int* a, int scalar);
+//void vx_vec_memcpy(int* a, int* b, int n);
+//void vx_vec_saxpy(int n, int scalar, int* a, int* b);
+//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int lda, int* b1, int ldb, int* c1, int ldc);
+//void vx_vec_sgemm_nn(int n, int m, int k, int* a1, int* b1, int* c1);
+//void vx_vec_sgemm_nn(int n, int* a1, int* b1, int* c1);
+#ifdef __cplusplus
+}
+#endif                   
--- a/benchmarks/vector/vecadd/vx_vec_vecadd.hex
+++ b/benchmarks/vector/vecadd/vx_vec_vecadd.hex
--- a/benchmarks/vector/vecadd/vx_vec_vvaddint32.s
+++ b/benchmarks/vector/vecadd/vx_vec_vvaddint32.s
@ -0,0 +1,22 @@
+.type vx_vec_vvaddint32, @function
+.global vx_vec_vvaddint32
+# vector-vector add routine of 32-bit integers
+# void vvaddint32(size_t n, const int*x, const int*y, int*z)
+# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
+#
+# a0 = n, a1 = x, a2 = y, a3 = z
+# Non-vector instructions are indented
+#
+# vsetvli is the first instruction of the loop: t0 is rescaled to a byte count inside
+# the body, so it has to be recomputed from the remaining element count on every pass.
+# (The .type directive previously named vx_vec_vvaddi32, a symbol that does not exist.)
+vx_vec_vvaddint32:
+loop:
+    vsetvli t0, a0, e32 # Set vector length based on 32-bit vectors
+    vlw.v v0, (a1)           # Get first vector
+      sub a0, a0, t0         # Decrement number done
+      slli t0, t0, 2         # Multiply number done by 4 bytes
+      add a1, a1, t0         # Bump pointer
+    vlw.v v1, (a2)           # Get second vector
+      add a2, a2, t0         # Bump pointer
+    vadd.vv v2, v0, v1        # Sum vectors
+    vsw.v v2, (a3)           # Store result
+      add a3, a3, t0         # Bump pointer
+      bnez a0, loop   # Loop back 
+    ret                    # Finished
--- a/runtime/mains/vector_test/vx_vec_original.s
+++ b/runtime/mains/vector_test/vx_vec_original.s
@ -0,0 +1,22 @@
+.type vx_vec_test, @function
+.global vx_vec_test
+# vector-vector add routine of 32-bit integers
+# void vvaddint32(size_t n, const int*x, const int*y, int*z)
+# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
+#
+# a0 = n, a1 = x, a2 = y, a3 = z
+# Non-vector instructions are indented
+#
+# vsetvli is the first instruction of the loop: t0 is rescaled to a byte count inside
+# the body, so it has to be recomputed from the remaining element count on every pass.
+vx_vec_test:
+loop:
+    vsetvli t0, a0, e32 # Set vector length based on 32-bit vectors
+    vlw.v v0, (a1)           # Get first vector
+      sub a0, a0, t0         # Decrement number done
+      slli t0, t0, 2         # Multiply number done by 4 bytes
+      add a1, a1, t0         # Bump pointer
+    vlw.v v1, (a2)           # Get second vector
+      add a2, a2, t0         # Bump pointer
+    vadd.vv v2, v0, v1        # Sum vectors
+    vsw.v v2, (a3)           # Store result
+      add a3, a3, t0         # Bump pointer
+      bnez a0, loop         # Loop back 
+    ret                    # Finished
--- a/rvvector/basic/_1_vx_vec.s
+++ b/rvvector/basic/_1_vx_vec.s
@ -0,0 +1,30 @@
+
+
+
+.type vx_vec_test, @function
+.global vx_vec_test
+# Minimal smoke-test routine: stores the constant 7 to the word addressed by a0.
+# a0 = pointer to the destination word; a1 is used as scratch.
+vx_vec_test:
+	li a1, 7
+	sw a1, 0(a0)
+	ret
+
+
+
+
+# Disabled sketch of a predicated stripmine loop kept below for reference; it is
+# entirely commented out and is not assembled.
+# 	slli a0, a0, 2
+# 	add a0, a0, a3
+# 	vmv.v.x vv0, a2
+# 	# vsplat4 vv0, a2
+# stripmine_loop:
+# 	vlb4 vv1, (a1)
+# 	vcmpez4 vp0, vv1
+# 	!vp0 vlw4 vv1, (a3)
+# 	!vp0 vlw4 vv2, (a4)
+# 	!vp0 vfma4 vv1, vv0, vv1, vv2
+# 	!vp0 vsw4 vv1, (a4)
+# 	addi a1, a1, 4
+# 	addi a3, a3, 16
+# 	addi a4, a4, 16
+# 	bleu a3, a0, stripmine_loop
+	# handle edge cases
+	# when (n % 4) != 0 ...
--- a/rvvector/basic/_1_vx_vector_main.c
+++ b/rvvector/basic/_1_vx_vector_main.c
@ -0,0 +1,32 @@
+
+#include "../../runtime/intrinsics/vx_intrinsics.h"
+#include "vx_vec.h"
+
+// Smoke test: allocates one int, sets it to 5, calls vx_vec_test() on it and prints
+// the value before and after the call.
+// NOTE(review): malloc and printf are used here but this file only includes
+// vx_intrinsics.h and vx_vec.h - presumably one of them provides the declarations; verify.
+int main()
+{
+	vx_tmc(1);
+	// int * a = malloc(4);
+	// int * b = malloc(4);
+	// int * c = malloc(4);
+
+
+	// 4 bytes: room for a single 32-bit int
+	int * a = malloc(4);
+	*a = 5;
+	printf("Value of a: %d\n", *a);
+
+	vx_vec_test(a);
+
+	printf("Value of a: %d\n", *a);
+
+
+	// for (int i = 0; i < 4; i++)
+	// {
+	// 	if (c[i] != (a[i] + b[i]))
+	// 	{
+	// 		printf("Fail\n");
+	// 		break;
+	// 	}
+	// }
+
+	// NOTE(review): a is never freed and main has no explicit return statement
+	vx_tmc(0);
+}
--- a/rvvector/basic/__vx_vector_main.c
+++ b/rvvector/basic/__vx_vector_main.c
@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "../../runtime/intrinsics/vx_intrinsics.h"
+#include "vx_vec.h"
+
+// Scratch driver for vx_vec_test(). Only the "#if 1" section is compiled: it fills
+// three n-element arrays with ones, calls vx_vec_test(n, d, b, c) and prints a[]
+// before and after the call. The "#if 0" sections are earlier experiments.
+int main()
+{
+	vx_tmc(1);
+#if 0
+    # vector-vector add routine of 32-bit integers
+    # void vvaddint32(size_t n, const int*x, const int*y, int*z)
+    # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
+    #
+    # a0 = n, a1 = x, a2 = y, a3 = z
+    # Non-vector instructions are indented
+#endif   
+#if 1      
+        int n = 5;
+        int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
+        int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
+        int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
+
+        for(int i = 0; i < n; ++i)
+        {
+           a[i] = b[i] = c[i] = 1;
+        }
+
+        for(int i = 0; i < n; ++i) printf("%d, ", a[i]);
+        printf("\n");
+//        for(int i = 0; i < n; ++i) printf("%d, ", b[i]);
+//        printf("\n");
+//        for(int i = 0; i < n; ++i) printf("%d, ", c[i]);
+
+        // d used to be an uninitialised pointer that was written through (*d = 1),
+        // which is undefined behaviour. It is now backed by real storage; n elements
+        // are allocated because d is passed alongside the element count n.
+        int *d = (int*)malloc(sizeof(int) * n);
+        for(int i = 0; i < n; ++i) d[i] = 1;
+	vx_vec_test(n, d, b, c);
+
+
+        printf("(after: n = %d, %d)\n", n, *d);
+        for(int i = 0; i < n; ++i) printf("%d, ", a[i]);
+//        printf("\n");
+//        for(int i = 0; i < n; ++i) printf("%d, ", b[i]);
+//        printf("\n");
+//        for(int i = 0; i < n; ++i) printf("%d, ", c[i]);
+
+        free(d);
+#endif
+#if 0
+	int * a = malloc(sizeof(int) * 10);
+	for(int i = 0; i < 10; ++i) a[i] = 5;
+   
+       
+	for(int i = 0; i < 10; ++i)
+	    printf("%d, ", a[i]);
+
+	vx_vec_test(a);
+	//vx_vec_test(2, a, a, a);
+
+	printf("after--------\n");
+        for(int i = 0; i < 10; ++i) 
+            printf("%d, ", a[i]);
+#endif
+#if 0
+        int n = 5;
+        int *a = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};
+        int *b = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1};
+        int *c = (int*)malloc(sizeof(int) * 5); //{1, 1, 1, 1, 1}; 
+        
+        for(int i = 0; i < n; ++i)
+        {
+            a[i] = 1; 
+            b[i] = 1;
+            c[i] = 0;
+        }
+
+        printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);
+        vx_vec_test(n, a, b, c);
+        printf("Value of a: %d, b: %d, c: %d, n: %d\n", a[0], b[0], c[0], n);
+        
+#endif
+
+	// for (int i = 0; i < 4; i++)
+	// {
+	// 	if (c[i] != (a[i] + b[i]))
+	// 	{
+	// 		printf("Fail\n");
+	// 		break;
+	// 	}
+	// }
+
+	vx_tmc(0);
+}
--- a/rvvector/basic/vx_vec_main.c
+++ b/rvvector/basic/vx_vec_main.c
@ -0,0 +1,27 @@
+#include "../../runtime/intrinsics/vx_intrinsics.h"
+#include "vx_vec.h"
+
+// Smoke test for vx_vec_test(): fills three 8-element arrays with ones, calls
+// vx_vec_test(n, a, b, c) and prints c[] afterwards.
+// NOTE(review): malloc and printf are used here but this file only includes
+// vx_intrinsics.h and vx_vec.h - presumably one of them provides the declarations; verify.
+int main()
+{
+	vx_tmc(1);
+        printf("----------------hello!!! \n");
+
+        int n = 8;
+        int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
+        int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
+        int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
+         
+        printf("hello!!! \n");
+
+        for(int i = 0; i < n; ++i)
+        {
+           a[i] = b[i] = c[i] = 1;
+        }
+
+	vx_vec_test(n, a, b, c);
+
+        // only c is printed; there is no pass/fail check in this driver
+        for(int i = 0; i < n; ++i)
+           printf("%d ", c[i]);
+
+	// NOTE(review): a, b and c are never freed and main has no explicit return statement
+	vx_tmc(0);
+}
--- a/rvvector/benchmark_temp/1
+++ b/rvvector/benchmark_temp/1
@ -0,0 +1,166 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "../../runtime/intrinsics/vx_intrinsics.h"
+#include "vx_vec_benchmark.h"
+
+// Combined benchmark driver. Exactly one "#if 1" section is compiled (currently the
+// saxpy test); the "#if 0" sections hold the vvaddint32, vsadd, memcpy and sgemm_nn
+// variants. Returns 0 on success and 1 at the first verification failure.
+int main()
+{
+    vx_tmc(1);
+
+    int n = 65536;
+    int scalar = 10;
+
+    int *a = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
+    int *b = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
+    int *c = (int*)malloc(sizeof(int) * n); //{1, 1, 1, 1, 1};
+
+    for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2; c[i] = 5; }
+
+#if 0
+//---------------------------------------------------------------
+/* vvaddint32
+ * # vector-vector add routine of 32-bit integers
+ * # void vvaddint32(size_t n, const int*x, const int*y, int*z)
+ * # { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } } */
+    printf("vvaddint...\na[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d ", a[i]);
+    printf("\nb[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d ", b[i]);
+    printf("\nc[%d] = a[%d] + b[%d]: ", n, n, n);
+    for(int i = 0; i < n; ++i) printf("%d ", c[i]);
+
+    vx_vec_vvaddint32(n, a, b, c);
+
+    for(int i = 0; i < n; ++i) 
+    {
+        if(c[i] != (a[i]+b[i])) 
+        {
+           printf("\n<vddint32> failed at <index: %d>! \n", i);
+           return 1;   
+        }
+    }
+    printf("\nPASSED.......................... <vddint32> \n");
+#endif
+#if 0
+//---------------------------------------------------------------
+/* #  vector-scalar add
+   # for (i=0; i<N; i++) { C[i] = A[i] + B; } // 32-bit ints */
+    for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 1;}
+    printf("vsadd...scalar:%d\na[%d]: ", scalar, n);
+    for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
+    printf("\nb: %d", scalar);
+    
+    vx_vec_vsadd(n, a, scalar);
+
+    // NOTE(review): the operation is described above as an add, yet this check compares
+    // against b[i] * scalar - verify before enabling this section
+    for(int i = 0; i < n; ++i) 
+    {
+        if(a[i] != (b[i] * scalar)) 
+        {
+           printf("\n<vsadd> failed at <index: %d>! \n", i);
+           return 1;   
+        }
+    }
+    printf("\nPASSED.......................... <vsadd> \n");
+
+#endif
+#if 0
+//---------------------------------------------------------------
+/*  # memory copy
+    # void *memcpy(void* dest, const void* src, size_t n) */
+    for (int i = 0; i < n; ++i) { a[i] = 1; b[i] = 2;}
+    printf("memcpy\na[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
+    printf("\nb[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
+
+    vx_vec_memcpy(a, b, n);
+
+    for(int i = 0; i < n; ++i) 
+    {
+        if(a[i] != b[i]) 
+        {
+           printf("\n<memcpy> failed at <index: %d>! \n", i);
+           return 1;   
+        }
+    }
+    printf("\nPASSED.......................... <memcpy> \n");
+#endif
+#if 1
+//---------------------------------------------------------------
+/* # void saxpy(size_t n, const float a, const float *x, float *y)
+   # ==> convert to int!!
+   # void saxpy(size_t n, const int a, const int *x, int *y)
+   # {
+   #   size_t i;
+   #   for (i=0; i<n; i++) y[i] = a * x[i] + y[i];
+   # } */
+    // b and c start with the same value, so c keeps b's pre-call contents for the check
+    for (int i = 0; i < n; ++i) { a[i] = 4; b[i] = 2; c[i] = 2;}
+
+    printf("saxpy\na[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
+    printf("\nb[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
+
+    vx_vec_saxpy(n, scalar, a, b);
+
+    printf("saxpy\na[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d \n", a[i]);
+    printf("\nb[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d \n", b[i]);
+
+    for(int i = 0; i < n; ++i) 
+    {
+        if(b[i] != ((a[i] * scalar) + c[i])) 
+        {
+           printf("\n<saxpy> failed at <index: %d>! \n", i);
+           return 1;   
+        }
+    }
+    printf("\nPASSED.......................... <saxpy> \n");
+#endif
+#if 0
+//---------------------------------------------------------------
+/* # void sgemm_nn(size_t n, size_t m, size_t k, const float*a,   // m * k matrix
+#          size_t lda, const float*b,   // k * n matrix 
+#          size_t ldb, float*c,         // m * n matrix
+#          size_t ldc)
+#  c += a*b (alpha=1, no transpose on input matrices)
+#  matrices stored in C row-major order */
+
+    // NOTE(review): this disabled section would not build as written - `int n = 8` lacks
+    // its semicolon and redeclares the n defined at the top of main.
+    int m = 8;
+    int k = 8;
+    int n = 8
+    int lda = 4;
+    int ldb = 4;
+    int ldc = 4;
+
+    // NOTE(review): sizeof(m * k) is the size of one int, not m * k elements; the
+    // intended size is presumably sizeof(int) * m * k (likewise for b1 and c1).
+    int* a1 = (int*)malloc(sizeof(m * k));
+    int* b1 = (int*)malloc(sizeof(k * n));
+    int* c1 = (int*)malloc(sizeof(m * n));
+
+    for(int i = 0; i < (m * k); ++i) a1[i] = 1;
+    for(int i = 0; i < (k * n); ++i) b1[i] = 1;
+    for(int i = 0; i < (m * n); ++i) c1[i] = 1;    
+
+    printf("sgemm_nn\na[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d \n", a1[i]);
+    printf("\nb[%d]: ", n);
+    for(int i = 0; i < n; ++i) printf("%d \n", b1[i]);
+
+    vx_vec_sgemm_nn(n, m, k, a1, lda, b1, ldb, c1, ldc);
+
+    //for(int i = 0; i < n; ++i) 
+    //{
+    //    if(b[i] != ((a[i] * scalar) + c[i])) 
+    //    {
+    //       printf("\n<sgemm_nn> failed at <index: %d>! \n", i);
+    //       return;   
+    //    }
+    //}
+    printf("\nNOT TESTED.......................... <sgemm_nn> \n");
+//---------------------------------------------------------------
+#endif
+    
+    // NOTE(review): a, b and c are never freed on any path
+    vx_tmc(0);
+    return 0;
+}
--- a/rvvector/benchmark_temp/Makefile
+++ b/rvvector/benchmark_temp/Makefile
@ -1,11 +1,14 @
 LIB_PATH = ../../runtime

-COMP     = /nethome/ekim79/riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-gcc
+# Root of the RISC-V GNU toolchain install. Override per machine instead of
+# editing this file, e.g.:  make RISCV_TOOLCHAIN_PATH=/opt/riscv
+RISCV_TOOLCHAIN_PATH ?= /home/priya/dev/riscv_vec/riscv-gnu
+COMP     = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc

 CC_FLAGS = -ffreestanding -O0 -Wl,--gc-sections -nostartfiles -nostdlib -nostartfiles -nodefaultlibs -Wl,-Bstatic,-T,$(LIB_PATH)/mains/vortex_link.ld -march=rv32imv -mabi=ilp32

-DMP  = /nethome/ekim79/riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objdump
-CPY  = /nethome/ekim79/riscv-gnu-toolchain/drops/bin/riscv32-unknown-elf-objcopy
+DMP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
+CPY  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy

 # VX_STR  = ../../startup/vx_start.s

@ -21,7 +24,7 @ VX_VEC2  = vx_vec_saxpy.s       #float --> int
 VX_VEC3  = vx_vec_sgemm_float.s #float --> int
 VX_VEC4  = vx_vec_vsadd.s 
 VX_VEC5  = vx_vec_memcpy.s 
-LIBS    = /nethome/ekim79/riscv-gnu-toolchain/drops/riscv32-unknown-elf/lib/libc.a /nethome/ekim79/riscv-gnu-toolchain/drops/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
+LIBS    = $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf/lib/libc.a $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc

 VX_MAIN = vx_vec_benchmark

@ -34,7 +37,6 @ HEX: ELF
 	$(CPY) -O ihex $(VX_MAIN).elf $(VX_MAIN).hex

 ELF:
-#	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC1) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude  -o $(VX_MAIN).elf
 	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC2) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude  -o $(VX_MAIN).elf
 #	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC3) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude  -o $(VX_MAIN).elf
 #	$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC4) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) $(VX_MAIN).c $(LIBS) -Iinclude  -o $(VX_MAIN).elf
--- a/rvvector/benchmark_temp/vx_vec_benchmark.c
+++ b/rvvector/benchmark_temp/vx_vec_benchmark.c
@ -80,6 +80,6 @ int main()
        if(a[i] != b[i]) 
        {
           printf("\n<memcpy> failed at <index: %d>! \n", i);
           return 1;   
        }
    }
@ -113,6 +113,6 @ int main()
        if(b[i] != ((a[i] * scalar) + c[i])) 
        {
           printf("\n<saxpy> failed at <index: %d>! \n", i);
           return 1;   
        }
    }
--- a/rvvector/benchmark_temp/vx_vec_benchmark.dump
+++ b/rvvector/benchmark_temp/vx_vec_benchmark.dump
--- a/rvvector/benchmark_temp/vx_vec_benchmark.elf
+++ b/rvvector/benchmark_temp/vx_vec_benchmark.elf
--- a/rvvector/benchmark_temp/vx_vec_benchmark.hex
+++ b/rvvector/benchmark_temp/vx_vec_benchmark.hex
--- a/rvvector/benchmark_temp/vx_vec_saxpy.s
+++ b/rvvector/benchmark_temp/vx_vec_saxpy.s
@ -13,6 +13,19 @@
 #     fa0     a
 #     a1      x
 #     a2      y
+#vx_vec_saxpy:                    # float variant, disabled: the integer routine
+#    vsetvli a4, a0, e32, m8       # further down defines vx_vec_saxpy/saxpy too,
+#saxpy:                           # and a label may only be defined once per file.
+#    vlw.v v0, (a1)
+#    sub a0, a0, a4
+#    slli a4, a4, 2
+#    add a1, a1, a4
+#    vlw.v v8, (a2)
+#    vfmacc.vf v8, fa0, v0
+#    vsw.v v8, (a2)
+#    add a2, a2, a4
+#    bnez a0, saxpy
+#    ret
 #vx_vec_saxpy:
 #    vsetvli a4, a0, e32, m8
 #saxpy:
@ -28,15 +41,21 @@
 #    ret

 # a0 n, rs1 a, a2 x, a3 y
+
+# a0 n, a1 a, a2 x, a3 y
 vx_vec_saxpy:
-    vsetvli a4, a0, e32, m8
+    vsetvli a4, a0, e32, m1
 saxpy:
    vlw.v v0, (a2)
    sub a0, a0, a4
    slli a4, a4, 2
    add a2, a2, a4
    vlw.v v1, (a3)
-    vmacc.vx v1, rs1, v0
+    vmul.vx v0, v0, a1
+#    vmul.vv v0, v0, v1
+#    li x1, 10
+#    vmul.vx v0, v0, x1
+    vadd.vv v1, v0, v1
    vsw.v v1, (a3)
    add a3, a3, a4
    bnez a0, saxpy
--- a/simX/enc.cpp
+++ b/simX/enc.cpp
@ -285,8 +285,8 @@ Instruction *WordDecoder::decode(const std::vector<Byte> &v, Size &idx, trace_in
          inst.setVlsWidth((code>>shift_func3)  & func3_mask);
          inst.setSrcReg((code>>shift_rs2)   & reg_mask);
          inst.setVmask((code>>shift_vmask));
-          inst.setVmop((code>>shift_vmop) && func3_mask);
-          inst.setVnf((code>>shift_vnf) && func3_mask);
+          inst.setVmop((code>>shift_vmop) & func3_mask);
+          inst.setVnf((code>>shift_vnf) & func3_mask);

          trace_inst->valid_inst = true;
          trace_inst->rs1        = ((code>>shift_rs1)   & reg_mask);
@ -300,8 +300,8 @@ Instruction *WordDecoder::decode(const std::vector<Byte> &v, Size &idx, trace_in
          inst.setVlsWidth((code>>shift_func3)  & func3_mask);
          inst.setSrcReg((code>>shift_rs2)   & reg_mask);
          inst.setVmask((code>>shift_vmask));
-          inst.setVmop((code>>shift_vmop) && func3_mask);
-          inst.setVnf((code>>shift_vnf) && func3_mask);
+          inst.setVmop((code>>shift_vmop) & func3_mask);
+          inst.setVnf((code>>shift_vnf) & func3_mask);

          trace_inst->valid_inst = true;
          trace_inst->rs1        = ((code>>shift_rs1)   & reg_mask);
--- a/simX/instruction.cpp
+++ b/simX/instruction.cpp
@ -2016,6 +2016,64 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
                }
              }              
              break; 
+              case 37: //vmul: vd[i] = vreg[rsrc[0]][i] * vreg[rsrc[1]][i]
+              {
+                D(3, "vmul");
+                uint8_t *result_ptr;
+                // Element width is selected by vtype.vsew; elements from vl up to VLMAX are written as zero.
+                vector<Reg<char *>> & vr1 = c.vreg[rsrc[0]];
+                vector<Reg<char *>> & vr2 = c.vreg[rsrc[1]];
+                vector<Reg<char *>> & vd  = c.vreg[rdest];
+                if(c.vtype.vsew == 8){
+                  for(Word i = 0; i < c.vl; i++){ // Word counter, as in the 32-bit tail loop; not capped at 255
+                    uint8_t *first_ptr = (uint8_t *)vr1[i].val;
+                    uint8_t *second_ptr = (uint8_t *)vr2[i].val;
+                    uint8_t result =  (*first_ptr * *second_ptr);
+                    // uint8_t streams as a character, so widen the values for the trace output.
+                    D(3,"Comparing " << (unsigned)*first_ptr << " * " << (unsigned)*second_ptr << " = " << (unsigned)result);
+                    result_ptr = (uint8_t *) vd[i].val;
+                    *result_ptr = result;
+                  }
+                  for(Word i = c.vl; i < VLMAX; i++){
+                    result_ptr = (uint8_t *) vd[i].val;
+                    *result_ptr = 0;
+                  }
+                } 
+                else if(c.vtype.vsew == 16) {
+                  uint16_t *result_ptr;
+                  for(Word i = 0; i < c.vl; i++){
+                    uint16_t *first_ptr = (uint16_t *)vr1[i].val;
+                    uint16_t *second_ptr = (uint16_t *)vr2[i].val;
+                    uint16_t result = (*first_ptr * *second_ptr);
+                    D(3,"Comparing " << *first_ptr << " * " << *second_ptr << " = " << result);
+
+                    result_ptr = (uint16_t *) vd[i].val;
+                    *result_ptr = result;
+                  }
+                  for(Word i = c.vl; i < VLMAX; i++){
+                    result_ptr = (uint16_t *) vd[i].val;
+                    *result_ptr = 0;
+                  }
+
+                } else if(c.vtype.vsew == 32) {
+                  uint32_t *result_ptr;
+
+                  for(Word i = 0; i < c.vl; i++){
+                    uint32_t *first_ptr = (uint32_t *)vr1[i].val;
+                    uint32_t *second_ptr = (uint32_t *)vr2[i].val;
+                    uint32_t result = (*first_ptr * *second_ptr);
+                    D(3,"Comparing " << *first_ptr << " * " << *second_ptr << " = " << result);
+
+                    result_ptr = (uint32_t *) vd[i].val;
+                    *result_ptr = result;
+                  }
+                  for(Word i = c.vl; i < VLMAX; i++){
+                    result_ptr = (uint32_t *) vd[i].val;
+                    *result_ptr = 0;
+                  }
+                }
+              }
+              break;
              case 45: //vmacc   
              {
                D(3, "vmacc");
@ -2077,6 +2135,129 @@ void Instruction::executeOn(Warp &c, trace_inst_t * trace_inst) {
            }
          }
          break;
+          case 6: 
+          {
+            switch(func6)
+            {
+              case 0: // vd[i] = reg[rsrc[0]] + vreg[rsrc[1]][i]
+              {
+                D(3, "vmadd.vx");
+                uint8_t *result_ptr;
+                // NOTE(review): traced as "vmadd.vx", but the arithmetic below is only scalar + element -- confirm the mnemonic.
+                //vector<Reg<char *>> & vr1 = c.vreg[rsrc[0]];
+                vector<Reg<char *>> & vr2 = c.vreg[rsrc[1]];
+                vector<Reg<char *>> & vd  = c.vreg[rdest];
+                if(c.vtype.vsew == 8){
+                  for(Word i = 0; i < c.vl; i++){ // Word counter, as in the 32-bit tail loop; not capped at 255
+                    //uint8_t *first_ptr = (uint8_t *)vr1[i].val;
+                    uint8_t *second_ptr = (uint8_t *)vr2[i].val;
+                    uint8_t result =  (reg[rsrc[0]] + *second_ptr);
+                    // uint8_t streams as a character, so widen the values for the trace output.
+                    D(3,"Comparing " << reg[rsrc[0]] << " + " << (unsigned)*second_ptr << " = " << (unsigned)result);
+                    result_ptr = (uint8_t *) vd[i].val;
+                    *result_ptr = result;
+                  }
+                  for(Word i = c.vl; i < VLMAX; i++){
+                    result_ptr = (uint8_t *) vd[i].val;
+                    *result_ptr = 0;
+                  }
+                } 
+                else if(c.vtype.vsew == 16) {
+                  uint16_t *result_ptr;
+                  for(Word i = 0; i < c.vl; i++){
+                    //uint16_t *first_ptr = (uint16_t *)vr1[i].val;
+                    uint16_t *second_ptr = (uint16_t *)vr2[i].val;
+                    uint16_t result = (reg[rsrc[0]] + *second_ptr);
+                    D(3,"Comparing " << reg[rsrc[0]] << " + " << *second_ptr << " = " << result);
+
+                    result_ptr = (uint16_t *) vd[i].val;
+                    *result_ptr = result;
+                  }
+                  for(Word i = c.vl; i < VLMAX; i++){
+                    result_ptr = (uint16_t *) vd[i].val;
+                    *result_ptr = 0;
+                  }
+
+                } else if(c.vtype.vsew == 32) {
+                  uint32_t *result_ptr;
+
+                  for(Word i = 0; i < c.vl; i++){
+                    //uint32_t *first_ptr = (uint32_t *)vr1[i].val;
+                    uint32_t *second_ptr = (uint32_t *)vr2[i].val;
+                    uint32_t result = (reg[rsrc[0]] + *second_ptr);
+                    D(3,"Comparing " << reg[rsrc[0]] << " + " << *second_ptr << " = " << result);
+
+                    result_ptr = (uint32_t *) vd[i].val;
+                    *result_ptr = result;
+                  }
+                  for(Word i = c.vl; i < VLMAX; i++){
+                    result_ptr = (uint32_t *) vd[i].val;
+                    *result_ptr = 0;
+                  }
+                }                
+              }
+              break;
+              case 37: //vmul.vx: vd[i] = reg[rsrc[0]] * vreg[rsrc[1]][i]
+              {
+                D(3, "vmul.vx");
+                uint8_t *result_ptr;
+                // Element width is selected by vtype.vsew; elements from vl up to VLMAX are written as zero.
+                //vector<Reg<char *>> & vr1 = c.vreg[rsrc[0]];
+                vector<Reg<char *>> & vr2 = c.vreg[rsrc[1]];
+                vector<Reg<char *>> & vd  = c.vreg[rdest];
+                if(c.vtype.vsew == 8){
+                  for(Word i = 0; i < c.vl; i++){ // Word counter, as in the 32-bit tail loop; not capped at 255
+                    //uint8_t *first_ptr = (uint8_t *)vr1[i].val;
+                    uint8_t *second_ptr = (uint8_t *)vr2[i].val;
+                    uint8_t result =  (reg[rsrc[0]] * *second_ptr);
+                    // uint8_t streams as a character, so widen the values for the trace output.
+                    D(3,"Comparing " << reg[rsrc[0]] << " * " << (unsigned)*second_ptr << " = " << (unsigned)result);
+                    result_ptr = (uint8_t *) vd[i].val;
+                    *result_ptr = result;
+                  }
+                  for(Word i = c.vl; i < VLMAX; i++){
+                    result_ptr = (uint8_t *) vd[i].val;
+                    *result_ptr = 0;
+                  }
+                } 
+                else if(c.vtype.vsew == 16) {
+                  uint16_t *result_ptr;
+                  for(Word i = 0; i < c.vl; i++){
+                    //uint16_t *first_ptr = (uint16_t *)vr1[i].val;
+                    uint16_t *second_ptr = (uint16_t *)vr2[i].val;
+                    uint16_t result = (reg[rsrc[0]] * *second_ptr);
+                    D(3,"Comparing " << reg[rsrc[0]] << " * " << *second_ptr << " = " << result);
+
+                    result_ptr = (uint16_t *) vd[i].val;
+                    *result_ptr = result;
+                  }
+                  for(Word i = c.vl; i < VLMAX; i++){
+                    result_ptr = (uint16_t *) vd[i].val;
+                    *result_ptr = 0;
+                  }
+
+                } else if(c.vtype.vsew == 32) {
+                  uint32_t *result_ptr;
+
+                  for(Word i = 0; i < c.vl; i++){
+                    //uint32_t *first_ptr = (uint32_t *)vr1[i].val;
+                    uint32_t *second_ptr = (uint32_t *)vr2[i].val;
+                    uint32_t result = (reg[rsrc[0]] * *second_ptr);
+                    D(3,"Comparing " << reg[rsrc[0]] << " * " << *second_ptr << " = " << result);
+
+                    result_ptr = (uint32_t *) vd[i].val;
+                    *result_ptr = result;
+                  }
+                  for(Word i = c.vl; i < VLMAX; i++){
+                    result_ptr = (uint32_t *) vd[i].val;
+                    *result_ptr = 0;
+                  }
+                }                
+              }
+              break;
+            }
+          }
+          break;
          case 7:
          {
            is_vec = true;
--- a/simX/out
+++ b/simX/out
@ -0,0 +1,2 @@
+verilator --compiler gcc -cc cache_simX.v -I. -I../rtl/shared_memory -I../rtl/cache -I../rtl/interfaces -Isimulate -I../rtl --exe simX.cpp args.cpp mem.cpp core.cpp instruction.cpp enc.cpp util.cpp  -CFLAGS '-std=c++11 -fPIC -O3' -Wno-UNOPTFLAT -Wno-WIDTH --trace -DVL_DEBUG=1 
+Makefile:26: recipe for target 'simX' failed
--- a/simX/test_benchmark.sh
+++ b/simX/test_benchmark.sh
@ -1,6 +1,7 @@
 echo start > results.txt

-# echo ../kernel/vortex_test.hex
 make
 printf "Fasten your seatbelts ladies and gentelmen!!\n\n\n\n"
-cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../rvvector/benchmark_temp/vx_vec_benchmark.hex  -s -b 1> emulator.debug
+#cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../benchmarks/vector/vecadd/vx_vec_vecadd.hex  -s -b 1> emulator.debug
+#cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../benchmarks/vector/saxpy/vx_vec_saxpy.hex  -s -b 1> emulator.debug
+cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../benchmarks/vector/sgemm_nn/vx_vec_sgemm_nn.hex  -s -b 1> emulator.debug
--- a/simX/test_riscv.sh
+++ b/simX/test_riscv.sh
@ -3,4 +3,5 @@ echo start > results.txt
 # echo ../kernel/vortex_test.hex
 make
 printf "Fasten your seatbelts ladies and gentelmen!!\n\n\n\n"
-cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../runtime/mains/simple/vx_simple_main.hex  -s -b 1> emulator.debug
+#cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../runtime/mains/simple/vx_simple_main.hex  -s -b 1> emulator.debug
+cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../rvvector/benchmark_temp/vx_vec_benchmark.hex -s -b 1> emulator.debug
--- a/simX/test_vec.sh
+++ b/simX/test_vec.sh
@ -0,0 +1,6 @@
+echo start > results.txt
+
+# Rebuild the simulator first; stop if the build fails so a stale or missing binary is never run.
+make || exit 1
+printf "Fasten your seatbelts ladies and gentelmen!!\n\n\n\n"
+cd obj_dir && ./Vcache_simX -E -a rv32i --core ../../rvvector/basic/vx_vector_main.hex  -s -b 1> emulator.debug