Add files via upload

2025-04-23 13:27:29 -04:00 · 2019-11-25 01:38:35 -05:00 · 2019-11-25 01:38:35 -05:00 · 925b387990
commit 925b387990
parent aab920b1a2
6 changed files with 2023 additions and 0 deletions
--- a/benchmarks/opencl/reduce0/main.cc
+++ b/benchmarks/opencl/reduce0/main.cc
@ -0,0 +1,638 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+/*
+    Parallel reduction
+
+    This sample shows how to perform a reduction operation on an array of values
+    to produce a single value.
+
+    Reductions are a very common computation in parallel algorithms.  Any time
+    an array of values needs to be reduced to a single value using a binary 
+    associative operator, a reduction can be used.  Example applications include
+    statistics computaions such as mean and standard deviation, and image 
+    processing applications such as finding the total luminance of an
+    image.
+
+    This code performs sum reductions, but any associative operator such as
+    min() or max() could also be used.
+
+    It assumes the input size is a power of 2.
+
+    COMMAND LINE ARGUMENTS
+
+    "--shmoo":         Test performance for 1 to 32M elements with each of the 7 different kernels
+    "--n=<N>":         Specify the number of elements to reduce (default 1048576)
+    "--threads=<N>":   Specify the number of threads per block (default 128)
+    "--kernel=<N>":    Specify which kernel to run (0-6, default 6)
+    "--maxblocks=<N>": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64)
+    "--cpufinal":      Read back the per-block results and do final sum of block sums on CPU (default false)
+    "--cputhresh=<N>": The threshold of number of blocks sums below which to perform a CPU final reduction (default 1)
+    
+*/
+
+// Common system and utility includes 
+#include <oclUtils.h>
+#include <shrQATest.h>
+
+// additional includes
+#include <sstream>
+#include <oclReduction.h>
+
+// Forward declarations and sample-specific defines
+// *********************************************************************
+enum ReduceType
+{
+    REDUCE_INT,
+    REDUCE_FLOAT,
+    REDUCE_DOUBLE
+};
+
+template <class T>
+bool runTest( int argc, const char** argv, ReduceType datatype);
+
+#define MAX_BLOCK_DIM_SIZE 65535
+
+extern "C"
+bool isPow2(unsigned int x)
+{
+    return ((x&(x-1))==0);
+}
+
+cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2);
+
+// Main function 
+// *********************************************************************
+int main( int argc, const char** argv) 
+{
+    shrQAStart(argc, (char **)argv);
+
+    // start logs 
+    shrSetLogFileName ("oclReduction.txt");
+    shrLog("%s Starting...\n\n", argv[0]); 
+
+    char *typeChoice;
+    shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice);
+
+    // determine type of array from command line args
+    if (0 == typeChoice)
+    {
+        typeChoice = (char*)malloc(7 * sizeof(char));
+        #ifdef WIN32
+            strcpy_s(typeChoice, 7 * sizeof(char) + 1, "int");
+        #else
+            strcpy(typeChoice, "int");
+        #endif
+    }
+    ReduceType datatype = REDUCE_INT;
+
+    #ifdef WIN32
+        if (!_strcmpi(typeChoice, "float"))
+            datatype = REDUCE_FLOAT;
+        else if (!_strcmpi(typeChoice, "double"))
+            datatype = REDUCE_DOUBLE;
+        else
+            datatype = REDUCE_INT;
+    #else
+        if (!strcmp(typeChoice, "float"))
+            datatype = REDUCE_FLOAT;
+        else if (!strcmp(typeChoice, "double"))
+            datatype = REDUCE_DOUBLE;
+        else
+            datatype = REDUCE_INT;
+    #endif
+
+    shrLog("Reducing array of type %s.\n", typeChoice);
+
+    //Get the NVIDIA platform
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    //Get the devices
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+    cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    //Create the context
+    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    // get and log the device info
+    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
+      int device_nr = 0;
+      shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr);
+	  if( device_nr < uiNumDevices ) {
+		device = oclGetDev(cxGPUContext, device_nr);
+	  } else {
+		shrLog("Invalid Device %d Requested.\n", device_nr);
+		shrExitEX(argc, argv, EXIT_FAILURE);
+	  }
+    } else {
+      device = oclGetMaxFlopsDev(cxGPUContext);
+    }
+    oclPrintDevName(LOGBOTH, device);
+    shrLog("\n");
+
+    // create a command-queue
+    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]);
+
+    bool bSuccess = false;
+    switch (datatype)
+    {
+    default:
+    case REDUCE_INT:
+        bSuccess = runTest<int>( argc, argv, datatype);
+        break;
+    case REDUCE_FLOAT:
+        bSuccess = runTest<float>( argc, argv, datatype);
+        break;
+    }
+    
+    // finish
+    shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute sum reduction on CPU
+//! We use Kahan summation for an accurate sum of large arrays.
+//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
+//! 
+//! @param data       pointer to input data
+//! @param size       number of input data elements
+////////////////////////////////////////////////////////////////////////////////
+template<class T>
+T reduceCPU(T *data, int size)
+{
+    T sum = data[0];
+    T c = (T)0.0;              
+    for (int i = 1; i < size; i++)
+    {
+        T y = data[i] - c;  
+        T t = sum + y;      
+        c = (t - sum) - y;  
+        sum = t;            
+    }
+    return sum;
+}
+
+unsigned int nextPow2( unsigned int x ) {
+    --x;
+    x |= x >> 1;
+    x |= x >> 2;
+    x |= x >> 4;
+    x |= x >> 8;
+    x |= x >> 16;
+    return ++x;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Compute the number of threads and blocks to use for the given reduction kernel
+// For the kernels >= 3, we set threads / block to the minimum of maxThreads and
+// n/2. For kernels < 3, we set to the minimum of maxThreads and n.  For kernel 
+// 6, we observe the maximum specified number of blocks, because each thread in 
+// that kernel can process a variable number of elements.
+////////////////////////////////////////////////////////////////////////////////
+void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
+{
+    if (whichKernel < 3)
+    {
+        threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
+        blocks = (n + threads - 1) / threads;
+    }
+    else
+    {
+        threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
+        blocks = (n + (threads * 2 - 1)) / (threads * 2);
+    }
+        
+
+    if (whichKernel == 6)
+        blocks = MIN(maxBlocks, blocks);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// This function performs a reduction of the input data multiple times and 
+// measures the average reduction time.
+////////////////////////////////////////////////////////////////////////////////
+template <class T>
+T profileReduce(ReduceType datatype,
+                  cl_int  n, 
+                  int  numThreads,
+                  int  numBlocks,
+                  int  maxThreads,
+                  int  maxBlocks,
+                  int  whichKernel, 
+                  int  testIterations,
+                  bool cpuFinalReduction,
+                  int  cpuFinalThreshold,
+                  double* dTotalTime,
+                  T* h_odata,
+                  cl_mem d_idata, 
+                  cl_mem d_odata)
+{
+
+
+    T gpu_result = 0;
+    bool needReadBack = true;
+    cl_kernel finalReductionKernel[10];
+    int finalReductionIterations=0;
+
+    //shrLog("Profile Kernel %d\n", whichKernel);
+
+    cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) );
+    clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata);
+    clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata);
+    clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n);
+    clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL);
+
+    if( !cpuFinalReduction ) {
+        int s=numBlocks;
+        int threads = 0, blocks = 0;
+        int kernel = (whichKernel == 6) ? 5 : whichKernel;
+        
+        while(s > cpuFinalThreshold) 
+        {
+            getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
+
+            finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) );
+            clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata);
+            clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata);
+            clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n);
+            clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL);
+            
+            if (kernel < 3)
+                s = (s + threads - 1) / threads;
+            else
+                s = (s + (threads*2-1)) / (threads*2);
+
+            finalReductionIterations++;
+        }
+    }
+    
+    size_t globalWorkSize[1];
+    size_t localWorkSize[1];
+
+    for (int i = 0; i < testIterations; ++i)
+    {
+        gpu_result = 0;
+
+        clFinish(cqCommandQueue);
+        if(i>0) shrDeltaT(1);
+
+        // execute the kernel
+        globalWorkSize[0] = numBlocks * numThreads;
+        localWorkSize[0] = numThreads;
+	
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue,reductionKernel, 1, 0, globalWorkSize, localWorkSize,
+                                          0, NULL, NULL);               
+
+        // check if kernel execution generated an error        
+        //oclCheckError(ciErrNum, CL_SUCCESS);
+
+        if (cpuFinalReduction)
+        {
+            // sum partial sums from each block on CPU        
+            // copy result from device to host
+            clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T), 
+                                h_odata, 0, NULL, NULL);
+
+            for(int i=0; i<numBlocks; i++) 
+            {
+                gpu_result += h_odata[i];
+            }
+
+            needReadBack = false;
+        }
+        else
+        {
+            // sum partial block sums on GPU
+            int s=numBlocks;
+            int kernel = (whichKernel == 6) ? 5 : whichKernel;
+            int it = 0;
+            
+
+            while(s > cpuFinalThreshold) 
+            {
+                int threads = 0, blocks = 0;
+                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
+
+                globalWorkSize[0] = threads * blocks;
+                localWorkSize[0] = threads;
+                
+                ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0,
+                                                  globalWorkSize, localWorkSize, 0, NULL, NULL);               
+                //oclCheckError(ciErrNum, CL_SUCCESS);
+                
+                if (kernel < 3)
+                    s = (s + threads - 1) / threads;
+                else
+                    s = (s + (threads*2-1)) / (threads*2);
+
+                it++;
+            }
+
+            if (s > 1)
+            {
+                // copy result from device to host
+                clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T), 
+                                    h_odata, 0, NULL, NULL);
+
+                for(int i=0; i < s; i++) 
+                {
+                    gpu_result += h_odata[i];
+                }
+
+                needReadBack = false;
+            }
+        }
+
+        clFinish(cqCommandQueue);
+        if(i>0) *dTotalTime += shrDeltaT(1); 
+    }
+
+    if (needReadBack)
+    {
+        // copy final sum from device to host
+        clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T), 
+                            &gpu_result, 0, NULL, NULL);
+    }
+
+    // Release the kernels
+    clReleaseKernel(reductionKernel);
+    if( !cpuFinalReduction ) {
+        for(int it=0; it<finalReductionIterations; ++it) {
+            clReleaseKernel(finalReductionKernel[it]);
+        }
+        
+    }
+
+    return gpu_result;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// This function calls profileReduce multple times for a range of array sizes
+// and prints a report in CSV (comma-separated value) format that can be used for
+// generating a "shmoo" plot showing the performance for each kernel variation
+// over a wide range of input sizes.
+////////////////////////////////////////////////////////////////////////////////
+template <class T>
+void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
+{ 
+    // create random input data on CPU
+    unsigned int bytes = maxN * sizeof(T);
+
+    T* h_idata = (T*)malloc(bytes);
+
+    for(int i = 0; i < maxN; i++) {
+        // Keep the numbers small so we don't get truncation error in the sum
+        if (datatype == REDUCE_INT)
+            h_idata[i] = (T)(rand() & 0xFF);
+        else
+            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
+    }
+
+    int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);
+
+    // allocate mem for the result on host side
+    T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));
+
+    // allocate device memory and data
+    cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
+    cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL);
+
+    int testIterations = 100;
+    double dTotalTime = 0.0;
+    
+    // print headers
+    shrLog("Time in seconds for various numbers of elements for each kernel\n");
+    shrLog("\n\n");
+    shrLog("Kernel");
+    for (int i = minN; i <= maxN; i *= 2)
+    {
+        shrLog(", %d", i);
+    }
+   
+    for (int kernel = 0; kernel < 7; kernel++)
+    {
+        shrLog("\n");
+        shrLog("%d", kernel);
+        for (int i = minN; i <= maxN; i *= 2)
+        {
+            int numBlocks = 0;
+            int numThreads = 0;
+            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);
+            
+            double reduceTime;
+            if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
+                profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel, 
+                                testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata);
+                reduceTime = dTotalTime/(double)testIterations;
+            } else {                
+                reduceTime = -1.0;
+            }
+            shrLog(", %.4f m", reduceTime);
+        }
+    }
+
+    // cleanup
+    free(h_idata);
+    free(h_odata);
+    clReleaseMemObject(d_idata);
+    clReleaseMemObject(d_odata);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// The main function whihc runs the reduction test.
+////////////////////////////////////////////////////////////////////////////////
+template <class T>
+bool
+runTest( int argc, const char** argv, ReduceType datatype) 
+{
+    int size = 1<<24;    // number of elements to reduce
+    int maxThreads;
+
+    cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1);        
+    clReleaseKernel(reductionKernel);
+
+    if (smallBlock) 
+      maxThreads = 64;  // number of threads per block
+    else
+      maxThreads = 128;
+
+    int whichKernel = 6;
+    int maxBlocks = 64;
+    bool cpuFinalReduction = false;
+    int cpuFinalThreshold = 1;
+
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
+    
+    shrLog(" %d elements\n", size);
+    shrLog(" %d threads (max)\n", maxThreads);
+
+    cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE);
+    shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);
+
+    bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE);
+
+#ifdef GPU_PROFILING
+    if (runShmoo)
+    {
+        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
+        return true;
+    }
+    else
+#endif
+    {
+        // create random input data on CPU
+        unsigned int bytes = size * sizeof(T);
+        T* h_idata = (T*)malloc(bytes);
+
+        for(int i=0; i<size; i++) 
+        {
+            // Keep the numbers small so we don't get truncation error in the sum
+            if (datatype == REDUCE_INT)
+                h_idata[i] = (T)(rand() & 0xFF);
+            else
+                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
+        }
+
+        int numBlocks = 0;
+        int numThreads = 0;
+        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
+        if (numBlocks == 1) cpuFinalThreshold = 1;
+        shrLog(" %d blocks\n\n", numBlocks);
+
+        // allocate mem for the result on host side
+        T* h_odata = (T*)malloc(numBlocks * sizeof(T));
+
+        // allocate device memory and data
+        cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
+        cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL);
+      
+        int testIterations = 100;
+        double dTotalTime = 0.0;
+        T gpu_result = 0;
+        gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks,
+                                        whichKernel, testIterations, cpuFinalReduction, 
+                                        cpuFinalThreshold, &dTotalTime,
+                                        h_odata, d_idata, d_odata);
+
+#ifdef GPU_PROFILING
+        double reduceTime = dTotalTime/(double)testIterations;
+        shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 
+               1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
+#endif
+
+        // compute reference solution
+        shrLog("\nComparing against Host/C++ computation...\n"); 
+        T cpu_result = reduceCPU<T>(h_idata, size);
+        if (datatype == REDUCE_INT)
+        {
+            shrLog(" GPU result = %d\n", gpu_result);
+            shrLog(" CPU result = %d\n\n", cpu_result);
+            shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED");
+        }
+        else
+        {
+            shrLog(" GPU result = %.9f\n", gpu_result);
+            shrLog(" CPU result = %.9f\n\n", cpu_result);
+
+            double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12;
+            double diff = abs((double)gpu_result - (double)cpu_result);
+            shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED");
+        }
+      
+        // cleanup
+        free(h_idata);
+        free(h_odata);
+        clReleaseMemObject(d_idata);
+        clReleaseMemObject(d_odata);
+
+        return (gpu_result == cpu_result);
+    }
+}
+
+// Helper function to create and build program and kernel
+// *********************************************************************
+cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2)
+{
+    // compile cl program
+    size_t program_length;
+    char *source; 
+
+    std::ostringstream preamble;   
+
+    // create the program
+    // with type specification depending on datatype argument
+    switch (datatype)
+    {
+    default:
+    case REDUCE_INT:
+        preamble << "#define T int" << std::endl;
+        break;
+    case REDUCE_FLOAT:
+        preamble << "#define T float" << std::endl;
+        break;
+    }
+    
+    // set blockSize at compile time
+    preamble << "#define blockSize " << blockSize << std::endl;
+    
+    // set isPow2 at compile time
+    preamble << "#define nIsPow2 " << isPowOf2 << std::endl;
+    
+    // Load the source code and prepend the preamble
+    source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length);
+    //oclCheckError(source != NULL, shrTRUE);
+    
+    program =
+      clCreateProgramWithBuiltInKernels(context, 1, &device_id, "reduce0", NULL);
+    //cl_program rv_program = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, 
+     //                                                &program_length, &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+    free(source);
+
+    // build the program
+    ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        // write out standard error, Build Log and PTX, then cleanup and exit
+        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+        oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
+        oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclReduction.ptx");
+        //oclCheckError(ciErrNum, CL_SUCCESS); 
+    }
+    
+    // create Kernel    
+    std::ostringstream kernelName;
+    kernelName << "reduce" << whichKernel;    
+    cl_kernel ckKernel = clCreateKernel(rv_program, kernelName.str().c_str(), &ciErrNum);
+    //oclCheckError(ciErrNum, CL_SUCCESS);
+
+    size_t wgSize;
+    ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
+    if (wgSize == 64) 
+      smallBlock = true;
+    else smallBlock = false;
+
+    // NOTE: the program will get deleted when the kernel is also released
+    clReleaseProgram(rv_program);
+    
+    return ckKernel;
+}
--- a/benchmarks/opencl/reduce0/oclReduction.h
+++ b/benchmarks/opencl/reduce0/oclReduction.h
@ -0,0 +1,34 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+ #ifndef __REDUCTION_H__
+#define __REDUCTION_H__
+
+template <class T>
+void reduce_sm10(int size, int threads, int blocks, 
+                 int whichKernel, T *d_idata, T *d_odata);
+
+template <class T>
+void reduce_sm13(int size, int threads, int blocks, 
+                 int whichKernel, T *d_idata, T *d_odata);
+
+// CL objects
+cl_platform_id cpPlatform;
+cl_uint uiNumDevices;
+cl_device_id* cdDevices; 
+cl_context cxGPUContext;
+cl_command_queue cqCommandQueue;
+cl_device_id device;
+cl_int ciErrNum;
+const char* source_path;
+bool smallBlock = true;
+
+#endif
--- a/benchmarks/opencl/reduce0/oclReduction_kernel.cl
+++ b/benchmarks/opencl/reduce0/oclReduction_kernel.cl
@ -0,0 +1,273 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+/*
+    Parallel reduction kernels
+*/
+
+// The following defines are set during runtime compilation, see reduction.cpp
+// #define T float
+// #define blockSize 128
+// #define nIsPow2 1
+
+#ifndef _REDUCE_KERNEL_H_
+#define _REDUCE_KERNEL_H_
+
+/*
+    Parallel sum reduction using shared memory
+    - takes log(n) steps for n input elements
+    - uses n threads
+    - only works for power-of-2 arrays
+*/
+
+/* This reduction interleaves which threads are active by using the modulo
+   operator.  This operator is very expensive on GPUs, and the interleaved 
+   inactivity means that no whole warps are active, which is also very 
+   inefficient */
+__kernel void reduce0(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    // load shared mem
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_global_id(0);
+    
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s=1; s < get_local_size(0); s *= 2) {
+        // modulo arithmetic is slow!
+        if ((tid % (2*s)) == 0) {
+            sdata[tid] += sdata[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+
+/* This version uses contiguous threads, but its interleaved 
+   addressing results in many shared memory bank conflicts. */
+__kernel void reduce1(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    // load shared mem
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_global_id(0);
+    
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s=1; s < get_local_size(0); s *= 2) 
+    {
+        int index = 2 * s * tid;
+
+        if (index < get_local_size(0)) 
+        {
+            sdata[index] += sdata[index + s];
+        }
+
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version uses sequential addressing -- no divergence or bank conflicts.
+*/
+__kernel void reduce2(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    // load shared mem
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_global_id(0);
+    
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s=get_local_size(0)/2; s>0; s>>=1) 
+    {
+        if (tid < s) 
+        {
+            sdata[tid] += sdata[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version uses n/2 threads --
+    it performs the first level of reduction when reading from global memory
+*/
+__kernel void reduce3(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    if (i + get_local_size(0) < n) 
+        sdata[tid] += g_idata[i+get_local_size(0)];  
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    for(unsigned int s=get_local_size(0)/2; s>0; s>>=1) 
+    {
+        if (tid < s) 
+        {
+            sdata[tid] += sdata[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version unrolls the last warp to avoid synchronization where it 
+    isn't needed
+*/
+__kernel void reduce4(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    if (i + get_local_size(0) < n) 
+        sdata[tid] += g_idata[i+get_local_size(0)];  
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    #pragma unroll 1
+    for(unsigned int s=get_local_size(0)/2; s>32; s>>=1) 
+    {
+        if (tid < s) 
+        {
+            sdata[tid] += sdata[tid + s];
+        }
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    if (tid < 32)
+    {
+        if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; }
+        if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; }
+        if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; }
+        if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; }
+        if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; }
+        if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; }
+    }
+
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version is completely unrolled.  It uses a template parameter to achieve 
+    optimal code for any (power of 2) number of threads.  This requires a switch 
+    statement in the host code to handle all the different thread block sizes at 
+    compile time.
+*/
+__kernel void reduce5(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+
+    sdata[tid] = (i < n) ? g_idata[i] : 0;
+    if (i + blockSize < n) 
+        sdata[tid] += g_idata[i+blockSize];  
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    
+    if (tid < 32)
+    {
+        if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; }
+        if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; }
+        if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; }
+        if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; }
+        if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; }
+        if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; }
+    }
+    
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+/*
+    This version adds multiple elements per thread sequentially.  This reduces the overall
+    cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
+    (Brent's Theorem optimization)
+*/
+__kernel void reduce6(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
+{
+    // perform first level of reduction,
+    // reading from global memory, writing to shared memory
+    unsigned int tid = get_local_id(0);
+    unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
+    unsigned int gridSize = blockSize*2*get_num_groups(0);
+    sdata[tid] = 0;
+
+    // we reduce multiple elements per thread.  The number is determined by the 
+    // number of active thread blocks (via gridDim).  More blocks will result
+    // in a larger gridSize and therefore fewer elements per thread
+    while (i < n)
+    {         
+        sdata[tid] += g_idata[i];
+        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
+        if (nIsPow2 || i + blockSize < n) 
+            sdata[tid] += g_idata[i+blockSize];  
+        i += gridSize;
+    } 
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // do reduction in shared mem
+    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    if (blockSize >= 128) { if (tid <  64) { sdata[tid] += sdata[tid +  64]; } barrier(CLK_LOCAL_MEM_FENCE); }
+    
+    if (tid < 32)
+    {
+        if (blockSize >=  64) { sdata[tid] += sdata[tid + 32]; }
+        if (blockSize >=  32) { sdata[tid] += sdata[tid + 16]; }
+        if (blockSize >=  16) { sdata[tid] += sdata[tid +  8]; }
+        if (blockSize >=   8) { sdata[tid] += sdata[tid +  4]; }
+        if (blockSize >=   4) { sdata[tid] += sdata[tid +  2]; }
+        if (blockSize >=   2) { sdata[tid] += sdata[tid +  1]; }
+    }
+    
+    // write result for this block to global mem 
+    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
+}
+
+#endif // #ifndef _REDUCE_KERNEL_H_
--- a/benchmarks/opencl/reduce0/oclUtils.h
+++ b/benchmarks/opencl/reduce0/oclUtils.h
@ -0,0 +1,198 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+#ifndef OCL_UTILS_H
+#define OCL_UTILS_H
+
+// *********************************************************************
+// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// Common headers:  Cross-API utililties and OpenCL header
+#include <shrUtils.h>
+
+// All OpenCL headers
+#if defined (__APPLE__) || defined(MACOSX)
+    #include <OpenCL/opencl.h>
+#else
+    #include <CL/opencl.h>
+#endif 
+
+// Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// For systems with CL_EXT that are not updated with these extensions, we copied these
+// extensions from <CL/cl_ext.h>
+#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+  /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+  #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+  #define CL_DEVICE_WARP_SIZE_NV                      0x4003
+  #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+  #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+  #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#endif
+
+// reminders for build output window and log
+#ifdef _WIN32
+    #pragma message ("Note: including shrUtils.h")
+    #pragma message ("Note: including opencl.h")
+#endif
+
+// SDK Revision #
+#define OCL_SDKREVISION "7027912"
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) 
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
+//!
+//! @return the id 
+//! @param clSelectedPlatformID         OpenCL platform ID
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print info about the device
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and return device capability
+//!
+//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" int oclGetDevCap(cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print the device name
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the first device from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the id or -1 when out of range
+//! @param cxGPUContext         OpenCL context
+//! @param device_idx            index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of device with maximal FLOPS from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file and prepends the cPreamble to the code.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get the binary (PTX) of the program associated with the device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//! @param binary       returned code
+//! @param length       length of returned code
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram                   OpenCL program
+//! @param cdDevice                    device of interest
+//! @param const char*  cPtxFileName   optional PTX file name
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the Build Log from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
+
+// Helper function for De-allocating cl objects
+// *********************************************************************
+extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
+
+// Helper function to get OpenCL error string from constant
+// *********************************************************************
+extern "C" const char* oclErrorString(cl_int error);
+
+// Helper function to get OpenCL image format string (channel order and type) from constant
+// *********************************************************************
+extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // An error condition is defined by the sample/test value not equal to the reference
+    if (iReference != iSample)
+    {
+        // If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
+        iSample = (iSample == 0) ? -9999 : iSample; 
+
+        // Log the error info
+        shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
+
+        // Cleanup and exit, or just exit if no cleanup function pointer provided.  Use iSample (error code in this case) as process exit code.
+        if (pCleanup != NULL)
+        {
+            pCleanup(iSample);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(iSample);
+        }
+    }
+}
+
+#endif
--- a/benchmarks/opencl/reduce0/shrQATest.h
+++ b/benchmarks/opencl/reduce0/shrQATest.h
@ -0,0 +1,238 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_QATEST_H
+#define SHR_QATEST_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// OS dependent includes
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+    #pragma message ("Note: including time.h")
+
+// Headers needed for Windows
+    #include <windows.h>
+	#include <time.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+    #include <unistd.h>
+    #include <time.h>
+#endif
+
+#ifndef STRCASECMP
+#ifdef _WIN32
+#define STRCASECMP _stricmp
+#else
+#define STRCASECMP strcasecmp
+#endif
+#endif
+
+#ifndef STRNCASECMP
+#ifdef _WIN32
+#define STRNCASECMP _strnicmp
+#else
+#define STRNCASECMP strncasecmp
+#endif
+#endif
+
+
+// Standardized QA Start/Finish for CUDA SDK tests
+#define shrQAStart(a, b)      __shrQAStart(a, b)
+#define shrQAFinish(a, b, c)  __shrQAFinish(a, b, c)
+#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
+
+inline int findExeNameStart(const char *exec_name)
+{
+    int exename_start = (int)strlen(exec_name);
+
+    while( (exename_start > 0) && 
+            (exec_name[exename_start] != '\\') && 
+            (exec_name[exename_start] != '/') )
+    {
+        exename_start--;
+    }
+    if (exec_name[exename_start] == '\\' || 
+        exec_name[exename_start] == '/')
+    {
+        return exename_start+1;
+    } else {
+        return exename_start;
+    }
+}
+
+inline int __shrQAStart(int argc, char **argv)
+{
+    bool bQATest = false;
+    // First clear the output buffer
+    fflush(stdout);
+    fflush(stdout);
+
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+        char *string_argv = &argv[i][string_start];
+
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }
+    }
+    
+    // We don't want to print the entire path, so we search for the first 
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    return exename_start;
+}
+
+enum eQAstatus {
+    QA_FAILED = 0,
+    QA_PASSED = 1,
+    QA_WAIVED = 2
+};
+
+inline void __ExitInTime(int seconds)
+{
+    fprintf(stdout, "> exiting in %d seconds: ", seconds);
+    fflush(stdout);
+    time_t t;
+    int count;
+    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
+        fprintf(stdout, "%d...", count);
+#ifdef WIN32
+        Sleep(1000);
+#else
+        sleep(1);
+#endif
+    }
+    fprintf(stdout,"done!\n\n"); 
+	fflush(stdout);
+}
+
+
+inline void __shrQAFinish(int argc, const char **argv, int iStatus)
+{
+    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
+    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+	
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }	
+        // For SDK individual samples that don't specify -noprompt or -prompt, 
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bNoPrompt = true;
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bNoPrompt = false;
+            bQuitInTime = false;
+        }
+    }
+
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    } else {
+        if (!bNoPrompt) {
+            fprintf(stdout, "\nPress <Enter> to exit...\n");
+            fflush(stdout);
+            getchar();
+        }
+    }
+}
+
+inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
+{
+    bool bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+	
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        // For SDK individual samples that don't specify -noprompt or -prompt, 
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bQuitInTime = false;
+        }
+    }
+
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
+    }
+    fflush(stdout);
+    
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    }
+}
+
+inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish(argc, argv, iStatus);
+
+    exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); 
+}
+
+inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish2(bQAtest, argc, argv, iStatus);
+
+    exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+#endif
--- a/benchmarks/opencl/reduce0/shrUtils.h
+++ b/benchmarks/opencl/reduce0/shrUtils.h
@ -0,0 +1,642 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_UTILS_H
+#define SHR_UTILS_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// reminders for output window and build log
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+#endif
+
+// OS dependent includes
+#ifdef _WIN32
+    // Headers needed for Windows
+    #include <windows.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+#endif
+
+// Other headers needed for both Windows and Linux
+#include <math.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Un-comment the following #define to enable profiling code in SDK apps
+//#define GPU_PROFILING
+
+// Beginning of GPU Architecture definitions
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+	typedef struct {
+		int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
+		int Cores;
+	} sSMtoCores;
+
+	sSMtoCores nGpuArchCoresPerSM[] = 
+	{ { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
+	  { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
+	  { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
+	  { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
+	  { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+	  { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+	  { 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
+	  {   -1, -1 }
+	};
+
+	int index = 0;
+	while (nGpuArchCoresPerSM[index].SM != -1) {
+		if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
+			return nGpuArchCoresPerSM[index].Cores;
+		}
+		index++;
+	}
+	printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
+	return -1;
+}
+// end of GPU Architecture definitions
+
+
+// Defines and enum for use with logging functions
+// *********************************************************************
+#define DEFAULTLOGFILE "SdkConsoleLog.txt"
+#define MASTERLOGFILE "SdkMasterLog.csv"
+enum LOGMODES 
+{
+    LOGCONSOLE = 1, // bit to signal "log to console" 
+    LOGFILE    = 2, // bit to signal "log to file" 
+    LOGBOTH    = 3, // convenience union of first 2 bits to signal "log to both"
+    APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
+    MASTER     = 8, // bit to signal master .csv log output
+    ERRORMSG   = 16, // bit to signal "pre-pend Error" 
+    CLOSELOG   = 32  // bit to close log file, if open, after any requested file write
+};
+#define HDASHLINE "-----------------------------------------------------------\n"
+
+// Standardized boolean
+enum shrBOOL
+{
+    shrFALSE = 0,
+    shrTRUE = 1
+};
+
+// Standardized MAX, MIN and CLAMP
+#define MAX(a, b) ((a > b) ? a : b)
+#define MIN(a, b) ((a < b) ? a : b)
+#define CLAMP(a, b, c) MIN(MAX(a, b), c)    // double sided clip of input a
+#define TOPCLAMP(a, b) (a < b ? a:b)	    // single top side clip of input a
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) 
+
+// Standardized Exit Macro for leaving main()... extended version
+// (Companion Inline Function lower on page)
+#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
+
+// Standardized Exit Macro for leaving main()... short version
+// (Companion Inline Function lower on page)
+#define shrEXIT(a, b)        __shrExitEX(a, b, EXIT_SUCCESS)
+
+// Simple argument checker macro
+#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE 
+
+// Define for user-customized error handling
+#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
+
+// Function to deallocate memory allocated within shrUtils
+// *********************************************************************
+extern "C" void shrFree(void* ptr);
+
+// *********************************************************************
+// Helper function to log standardized information to Console, to File or to both
+//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); 
+//!         : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+//! 
+//! Automatically opens file and stores handle if needed and not done yet
+//! Closes file and nulls handle on request
+//! 
+//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.  
+//!          LOGFILE and LOGBOTH may be | 'd  with APPENDMODE to select file append mode instead of overwrite mode 
+//!          LOGFILE and LOGBOTH may be | 'd  with CLOSELOG to "write and close" 
+//!          First 3 options may be | 'd  with MASTER to enable independent write to master data log file
+//!          First 3 options may be | 'd  with ERRORMSG to start line with standard error message
+//! @param 2 dValue:    
+//!          Positive val = double value for time in secs to be formatted to 6 decimals. 
+//!          Negative val is an error code and this give error preformatting.
+//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.  
+//!          ALL printf flags, width, precision and type specifiers are supported with this exception: 
+//!              Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
+//!              Single byte char type specifiers (%s and %c) ARE supported 
+//! @param 4... variable args: like printf or fprintf.  Must match format specifer type above.  
+//! @return 0 if OK, negative value on error or if error occurs or was passed in. 
+// *********************************************************************
+extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
+
+// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, 
+// *********************************************************************
+extern "C" int shrLog(const char* cFormatString, ...);
+
+// *********************************************************************
+// Delta timer function for up to 3 independent timers using host high performance counters 
+// Maintains state for 3 independent counters
+//! Example: double dElapsedTime = shrDeltaTime(0);
+//! 
+//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
+//! @return delta time of specified counter since last call in seconds.  Otherwise -9999.0 if error
+// *********************************************************************
+extern "C" double shrDeltaT(int iCounterID);
+
+// Optional LogFileNameOverride function
+// *********************************************************************
+extern "C" void shrSetLogFileName (const char* cOverRideName);
+
+// Helper function to init data arrays 
+// *********************************************************************
+extern "C" void shrFillArray(float* pfData, int iSize);
+
+// Helper function to print data arrays 
+// *********************************************************************
+extern "C" void shrPrintArray(float* pfData, int iSize);
+
+////////////////////////////////////////////////////////////////////////////
+//! Find the path for a filename
+//! @return the path if succeeded, otherwise 0
+//! @param filename        name of the file
+//! @param executablePath  optional absolute path of the executable
+////////////////////////////////////////////////////////////////////////////
+extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing single precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing double precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is 
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is 
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing single precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
+               const float epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing double precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
+               const double epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, 
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, 
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PPM image file (with unsigned char as data element type), padding 
+//! 4th component
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param OutData  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! 
+//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, 
+                             unsigned int *w, unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+//! Save PPM image file (with unsigned char as data element type, padded to 
+//! 4 bytes)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, 
+               unsigned int w, unsigned int h);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Save PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, 
+              unsigned int w, unsigned int h); 
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
+                  unsigned int *w,unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+// Command line arguments: General notes
+// * All command line arguments begin with '--' followed by the token; 
+//   token and value are seperated by '='; example --samples=50
+// * Arrays have the form --model=[one.obj,two.obj,three.obj] 
+//   (without whitespaces)
+////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////
+//! Check if command line argument \a flag-name is given
+//! @return shrTRUE if command line argument \a flag_name has been given, 
+//!         otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param flag_name  name of command line flag
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, 
+                     const char* flag_name);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, 
+                        const char* arg_name, int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type unsigned int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, 
+                        const char* arg_name, unsigned int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type float
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, 
+                        const char* arg_name, float* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type string
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, 
+                          const char* arg_name, char** val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument list those element are strings
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  command line argument list
+//! @param len  length of the list / number of elements
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, 
+                              const char* arg_name, char** val, 
+                              unsigned int* len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparef( const float* reference, const float* data,
+             const unsigned int len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparei( const int* reference, const int* data, 
+             const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned integer arrays, with epsilon and threshold
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
+            const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
+              const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two integers with a tolernance for # of byte errors
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays witha n epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
+              const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality and a 
+//!     threshold for # pixel errors
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays using L2-norm with an epsilon tolerance for 
+//! equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
+                const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PPM image files with an epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param data       filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! $param verboseErrors output details of image mismatch to std::err
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PGM image files with an epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param data       filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! $param verboseErrors output details of image mismatch to std::err
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
+
+extern "C" size_t shrRoundUp(int group_size, int global_size);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    if (iReference != iSample)
+    {
+        shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); 
+        if (pCleanup != NULL)
+        {
+            pCleanup(EXIT_FAILURE);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+}
+
+// Standardized Exit
+// *********************************************************************
+inline void __shrExitEX(int argc, const char** argv, int iExitCode)
+{
+#ifdef WIN32
+    if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#else 
+    if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#endif
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");                  
+        getchar();                                                           
+    }       
+    else 
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]); 
+    }
+    fflush(stderr);                                                         
+    exit(iExitCode);
+}
+
+#endif