Add files via upload

2025-04-24 05:47:35 -04:00 · 2019-11-24 23:55:42 -05:00 · 2019-11-24 23:55:42 -05:00 · 6f26b68760
commit 6f26b68760
parent 54429d0017
5 changed files with 1805 additions and 0 deletions
--- a/benchmarks/opencl/VectorHypot/VectorHypot.cl
+++ b/benchmarks/opencl/VectorHypot/VectorHypot.cl
@ -0,0 +1,41 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+// OpenCL Kernel Function Naive Implementation for hyptenuse
+__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
+{
+    // get index into global data array
+    size_t szGlobalOffset = get_global_id(0) + uiOffset;
+
+    // bound check 
+    if (szGlobalOffset >= uiNumElements)
+    {   
+        return; 
+    }
+
+    // Processing 4 elements per work item, so read fgA and fgB source values from GMEM
+    float4 f4A = fg4A[szGlobalOffset];
+    float4 f4B = fg4B[szGlobalOffset];
+    float4 f4H = (float4)0.0f;
+     
+    // Get the hypotenuses the vectors of 'legs', but exaggerate the time needed with loop  
+    for (int i = 0; i < iInnerLoopCount; i++)  
+    {
+        // compute the 4 hypotenuses using built-in function
+        f4H.x = hypot (f4A.x, f4B.x);
+        f4H.y = hypot (f4A.y, f4B.y);
+        f4H.z = hypot (f4A.z, f4B.z);
+        f4H.w = hypot (f4A.w, f4B.w);
+    }
+    
+    // Write 4 result values back out to GMEM
+    fg4Hypot[szGlobalOffset] = f4H;
+}
--- a/benchmarks/opencl/VectorHypot/main.cc
+++ b/benchmarks/opencl/VectorHypot/main.cc
@ -0,0 +1,686 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// *********************************************************************
+// oclCopyComputeOverlap Notes:  
+//
+// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
+// element by element vector hyptenuse computation using 2 input float arrays
+// and 1 output float array.
+//
+// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
+// with respect to GPU computation (and with respect to host thread). 
+//
+// Because the overlap acheivable for this computation and data set on a given system depends upon the GPU being used and the
+// GPU/Host bandwidth, the sample adjust the computation duration to test the most ideal case and test against a consistent standard.
+// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
+//
+// After setup, warmup and calibration to the system, the sample runs 4 scenarios:  
+//      A) Computations with 2 command queues on GPU
+//         A multiple-cycle sequence is executed, timed and compared against the host
+//      B) Computations with 1 command queue on GPU
+//         A multiple-cycle sequence is executed, timed and compared against the host
+//
+//      The 2-command queue approach ought to be substantially faster
+//
+// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently 
+// increases compute time without increasing data size (via a loop inside the kernel)
+//
+//      At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
+//      (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
+//
+//      If we name the time to copy single input vector H2D (or outpute vector D2H) as "T", then the optimum comparison case is:
+//        
+//          Single Queue with all the data and all the work  
+//             Ttot (serial)        = 4T + 4T + 2T      = 10T    
+//
+//          Dual Queue, where each queue has 1/2 the data and 1/2 the work 
+//             Tq0  (overlap)       = 2T + 2T + T .... 
+//             Tq1  (overlap)       = .... 2T + 2T + T
+//
+//             Ttot (elapsed, wall) = 2T + 2T + 2T + T  = 7T
+//
+//          Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 %	(Tesla arch 1.2 or 1.3, single copy engine)
+//
+//			For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
+//			This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.			
+// *********************************************************************
+
+// common SDK header for standard utilities and system libs 
+#include <oclUtils.h>
+#include <shrQATest.h>
+
+// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
+// values greater than 0.0f represent a speed-up relative to non-overlapped
+#define EXPECTED_OVERLAP 30.0f
+#define EXPECTED_OVERLAP_FERMI 45.0f
+#define PASS_FACTOR 0.60f
+#define RETRIES_ON_FAILURE 1
+
+// Base sizes for parameters manipulated dynamically or on the command line
+#define BASE_WORK_ITEMS 64
+#define BASE_ARRAY_LENGTH 40000
+#define BASE_LOOP_COUNT 32
+
+// Vars
+// *********************************************************************
+cl_platform_id cpPlatform;                          // OpenCL platform
+cl_context cxGPUContext;                            // OpenCL context
+cl_command_queue cqCommandQueue[2];                 // OpenCL command queues
+cl_device_id* cdDevices;                            // OpenCL device list  
+cl_program cpProgram;                               // OpenCL program
+cl_kernel ckKernel[2];                              // OpenCL kernel, 1 per queue
+cl_mem cmPinnedSrcA;                                // OpenCL pinned host source buffer A
+cl_mem cmPinnedSrcB;                                // OpenCL pinned host source buffer B 
+cl_mem cmPinnedResult;                              // OpenCL pinned host result buffer 
+float* fSourceA = NULL;                             // Mapped pointer for pinned Host source A buffer
+float* fSourceB = NULL;                             // Mapped pointer for pinned Host source B buffer
+float* fResult = NULL;                              // Mapped pointer for pinned Host result buffer 
+cl_mem cmDevSrcA;                                   // OpenCL device source buffer A
+cl_mem cmDevSrcB;                                   // OpenCL device source buffer B 
+cl_mem cmDevResult;                                 // OpenCL device result buffer 
+size_t szBuffBytes;                                 // Size of main buffers
+size_t szGlobalWorkSize;                            // 1D var for Total # of work items in the launched ND range
+size_t szLocalWorkSize = BASE_WORK_ITEMS;           // initial # of work items in the work group	
+cl_int ciErrNum;			                        // Error code var
+char* cPathAndName = NULL;                          // Var for full paths to data, src, etc.
+char* cSourceCL = NULL;                             // Buffer to hold source for compilation 
+const char* cExecutableName = NULL;
+
+// demo config vars
+const char* cSourceFile = "VectorHypot.cl";         // OpenCL computation kernel source code
+float* Golden = NULL;                               // temp buffer to hold golden results for cross check    
+bool bNoPrompt = false;                             // Command line switch to skip exit prompt
+bool bQATest = false;                               // Command line switch to test
+
+// Forward Declarations
+// *********************************************************************
+double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
+double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
+int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles); 
+void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
+void Cleanup (int iExitCode);
+void (*pCleanup)(int) = &Cleanup;
+
+int *gp_argc = 0;
+const char *** gp_argv = NULL;
+
+// Main function 
+// *********************************************************************
+int main(int argc, const char **argv)
+{
+    //Locals
+    size_t szKernelLength;                      // Byte size of kernel code
+    double dBuildTime;                          // Compile time
+    cl_uint uiTargetDevice = 0;	                // Default Device to compute on
+    cl_uint uiNumDevsUsed = 1;                  // Number of devices used in this sample   
+    cl_uint uiNumDevices;                       // Number of devices available 
+    int iDevCap = -1;                           // Capability of device
+    int iInnerLoopCount = BASE_LOOP_COUNT;      // Varies "compute intensity" per data within the kernel 
+    const int iTestCycles = 10;                 // How many times to run the external test loop 
+    const int iWarmupCycles = 8;                // How many times to run the warmup sequence 
+    cl_uint uiWorkGroupMultiple = 4;            // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
+    cl_uint uiNumElements = BASE_ARRAY_LENGTH;  // initial # of elements per array to process (note: procesing 4 per work item)
+    cl_uint uiSizeMultiple = 4;                 // Command line var (using "sizemult=<n>") to optionally increase vector sizes
+    bool bPassFlag = false;                     // Var to accumulate test pass/fail
+    shrBOOL bMatch = shrFALSE;                  // Cross check result
+	shrBOOL bTestOverlap = shrFALSE;
+	double dAvgGPUTime[2] = {0.0, 0.0};         // Average time of iTestCycles calls for 2-Queue and 1-Queue test
+    double dHostTime[2] = {0.0, 0.0};           // Host computation time (2nd test is redundant but a good stability indicator)
+    float fMinPassCriteria[2] = {0.0f, 0.0f};	// Test pass cireria, adjusted dependant on GPU arch
+
+    gp_argc = &argc;
+    gp_argv = &argv;
+
+    shrQAStart(argc, (char **)argv);
+
+    // start logs 
+	cExecutableName = argv[0];
+    shrSetLogFileName ("oclCopyComputeOverlap.txt");
+    shrLog("%s Starting...\n\n", argv[0]); 
+
+    // get basic command line args 
+    bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
+    bQATest   = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
+    shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
+
+    // Optional Command-line multiplier for vector size 
+    //   Default val of 4 gives 10.24 million float elements per vector
+    //   Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
+    shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
+    uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);  
+    uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
+    shrLog("Array sizes = %u float elements\n", uiNumElements); 
+
+    // Optional Command-line multiplier for workgroup size (x 64 work items)
+    //   Default val of 4 gives szLocalWorkSize of 256.
+    //   Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
+    shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
+    uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10); 
+    szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
+    shrLog("Workgroup Size = %u\n\n", szLocalWorkSize); 
+
+    // Get the NVIDIA platform if available, otherwise use default
+    shrLog("Get the Platform ID...\n\n");
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Get OpenCL platform name and version
+    char cBuffer[256];
+    ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("Platform Name = %s\n\n", cBuffer);
+
+    // Get all the devices
+    shrLog("Get the Device info and select Device...\n");
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
+
+    // Ethans changes
+    CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+    CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+    //ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Set target device and check capabilities 
+    shrLog(" # of Devices Available = %u\n", uiNumDevices); 
+    uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
+    shrLog(" Using Device %u, ", uiTargetDevice); 
+    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);  
+    iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
+    if (iDevCap > 0) {
+       shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
+    } else {
+       shrLog("\n\n", iDevCap); 
+    }
+    if (strstr(cBuffer, "NVIDIA") != NULL)
+    {
+        if (iDevCap < 12)
+        {
+            shrLog("Device doesn't have overlap capability.  Skipping test...\n"); 
+            Cleanup (EXIT_SUCCESS);
+        }
+
+		// Device and Platform eligible for overlap testing
+		bTestOverlap = shrTRUE;
+
+        // If device has overlap capability, proceed
+        fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP;               // 1st cycle overlap is same for 1 or 2 copy engines
+        if (iDevCap != 20) 
+        {
+            // Single copy engine
+            fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP;            // avg of many cycles
+        }
+        else 
+        {
+            char cDevName[1024];
+            clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
+            if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
+            {
+                // Tesla or Quadro (arch = 2.0) ... Dual copy engine 
+                fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI;  // average of many cycles
+            }
+            else
+            {
+                // Geforce ... Single copy engine
+                fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP;        // average of many cycles
+            }
+        }
+    }   
+
+    // Create the context
+    shrLog("clCreateContext...\n"); 
+    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    
+    // Create 2 command-queues
+    cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateCommandQueue [0]...\n"); 
+    cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateCommandQueue [1]...\n"); 
+
+    // Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
+    szBuffBytes = sizeof(cl_float) * uiNumElements;
+    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements); 
+
+    // Allocate pinned source and result host buffers:  
+    //   Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
+    cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements); 
+
+    // Get mapped pointers to pinned input host buffers
+    //   Note:  This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
+    fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n"); 
+
+    // Alloc temp golden buffer for cross checks
+    Golden = (float*)malloc(szBuffBytes);
+    //oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
+
+    // Read the OpenCL kernel in from source file
+    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
+    //oclCheckError(cPathAndName != NULL, shrTRUE);
+    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
+   // oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
+
+    // Create the program object
+    cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateProgramWithSource...\n"); 
+
+    // Build the program for the target device
+    clFinish(cqCommandQueue[0]);
+    shrDeltaT(0);
+    ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
+    shrLog("clBuildProgram..."); 
+    if (ciErrNum != CL_SUCCESS)
+    {
+        // write out standard error, Build Log and PTX, then cleanup and exit
+        shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
+        oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
+        oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
+        Cleanup(EXIT_FAILURE);
+    }
+    dBuildTime = shrDeltaT(0);
+
+    // Ethan - Kernel Addition
+    cl_program program =
+      clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
+    if (program == NULL) {
+        std::cerr << "Failed to write program binary" << std::endl;
+        Cleanup(context, queue, program, kernel, memObjects);
+        return 1;
+    } else {
+        std::cout << "Read program from binary." << std::endl;
+    }
+
+    // Create the kernel
+    ckKernel[0] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    ckKernel[1] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clCreateKernel (ckKernel[2])...\n"); 
+
+    // Offsets for 2 queues
+    cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
+
+    // Set the Argument values for the 1st kernel instance (queue 0)
+    ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
+    ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
+    shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n"); 
+
+    // Set the Argument values for the 2d kernel instance (queue 1)
+    ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
+    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n"); 
+
+    //*******************************************
+    // Warmup the driver with dual queue sequence
+    //*******************************************
+
+    // Warmup with dual queue sequence for iTestCycles
+    shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
+    DualQueueSequence(iWarmupCycles, uiNumElements, false);
+
+    // Use single queue config to adjust compute intensity 
+    shrLog("Adjust compute for GPU / system...\n");
+    iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles); 
+    shrLog("  Kernel inner loop count = %d\n", iInnerLoopCount); 
+
+    //*******************************************
+    // Run and time with 2 command-queues
+    //*******************************************
+	for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
+	
+	// Run the sequence iTestCycles times
+    dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
+
+    // Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) 
+    shrLog("  Device vs Host Result Comparison\t: "); 
+    VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);   
+    shrDeltaT(0);
+    for (int i = 0; i < iTestCycles; i++)
+    {
+        VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);   
+    }
+    dHostTime[0] = shrDeltaT(0)/iTestCycles;
+
+    // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) 
+    bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
+    shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); 
+    bPassFlag = (bMatch == shrTRUE);
+
+    //*******************************************
+    // Run and time with 1 command queue
+    //*******************************************
+    // Run the sequence iTestCycles times
+    dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
+
+    // Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) 
+    shrLog("  Device vs Host Result Comparison\t: "); 
+    shrDeltaT(0);
+    for (int i = 0; i < iTestCycles; i++)
+    {
+        VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);   
+    }
+    dHostTime[1] = shrDeltaT(0)/iTestCycles;
+
+    // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) 
+    bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
+    shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); 
+    bPassFlag &= (bMatch == shrTRUE);
+
+    //*******************************************
+
+    // Compare Single and Dual queue timing 
+    shrLog("\nResult Summary:\n"); 
+
+    // Log GPU and CPU Time for 2-queue scenario
+    shrLog("  Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
+    shrLog("  Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
+
+    // Log GPU and CPU Time for 1-queue scenario
+    shrLog("  Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
+    shrLog("  Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
+
+    // Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
+    double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
+    
+	if( bTestOverlap ) {
+		bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
+		if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
+			shrLog("  Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%)  -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
+
+			// Log info to master log in standard format
+			shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", 
+			dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize); 
+			
+			bPassFlag &= bAvgOverlapOK;
+			break;
+		}
+	} 
+
+		shrLog("  Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%)  -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
+	}
+
+
+    //*******************************************
+    // Report pass/fail, cleanup and exit
+    Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+// Run 1 queue sequence for n cycles
+// *********************************************************************
+double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
+{
+    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer) 
+    shrFillArray(fSourceA, (int)uiNumElements);
+    shrFillArray(fSourceB, (int)uiNumElements);
+
+    // Reset Global work size for 1 command-queue, and log work sizes & dimensions
+    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
+
+    // *** Make sure queues are empty and then start timer 
+    double dAvgTime = 0.0;
+    clFinish(cqCommandQueue[0]);
+    clFinish(cqCommandQueue[1]);  
+    shrDeltaT(0);
+
+    // Run the sequence iCycles times
+    for (int i = 0; i < iCycles; i++)
+    {
+        // Nonblocking Write of all of input data from host to device in command-queue 0 
+        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
+        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Launch kernel computation, command-queue 0 
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+        // Non Blocking Read of output data from device to host, command-queue 0 
+        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
+        clFlush(cqCommandQueue[0]);
+    }
+    
+    // *** Assure sync to host and return average sequence time
+    clFinish(cqCommandQueue[0]);
+    dAvgTime = shrDeltaT(0)/(double)iCycles;
+
+    // Log config if asked for
+    if (bShowConfig)
+    {
+        shrLog("\n1-Queue sequence Configuration:\n");
+        shrLog("  Global Work Size (per command-queue)\t= %u\n  Local Work Size \t\t\t= %u\n  # of Work Groups (per command-queue)\t= %u\n  # of command-queues\t\t\t= 1\n", 
+           szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize); 
+    }
+    return dAvgTime;
+}
+
+// Run 2 queue sequence for n cycles
+// *********************************************************************
+double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
+{
+    // Locals
+    size_t szHalfBuffer = szBuffBytes / 2;
+    size_t szHalfOffset = szHalfBuffer / sizeof(float);
+    double dAvgTime = 0.0;
+
+    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer) 
+    shrFillArray(fSourceA, (int)uiNumElements);
+    shrFillArray(fSourceB, (int)uiNumElements);
+
+    // Set Global work size for 2 command-queues, and log work sizes & dimensions
+    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
+
+    // Make sure queues are empty and then start timer 
+    clFinish(cqCommandQueue[0]);
+    clFinish(cqCommandQueue[1]);
+    shrDeltaT(0);
+
+    for (int i = 0; i < iCycles; i++)
+    {
+        // Mid Phase 0 
+        // Nonblocking Write of 1st half of input data from host to device in command-queue 0 
+        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
+        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver 
+        // (not necessary on Linux, Mac OSX or WinXP)
+        clFlush(cqCommandQueue[0]);
+        clFlush(cqCommandQueue[1]);
+
+        // Start Phase 1 ***********************************
+
+        // Launch kernel computation, command-queue 0
+        // (Note:  The order MATTERS here on Fermi !  THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+        // Nonblocking Write of 2nd half of input data from host to device in command-queue 1 
+        // (Note:  The order MATTERS here on Fermi !  THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
+        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
+        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Push out the compute for queue 0 and write for queue 1 to the driver
+        // (not necessary on Linux, Mac OSX or WinXP)
+        clFlush(cqCommandQueue[0]);
+        clFlush(cqCommandQueue[1]);
+
+        // Start Phase 2 ***********************************
+
+        // Launch kernel computation, command-queue 1 
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+        // Non Blocking Read of 1st half of output data from device to host, command-queue 0 
+        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+
+        // Push out the compute for queue 1 and the read for queue 0 to the driver 
+        // (not necessary on Linux, Mac OSX or WinXP)
+        clFlush(cqCommandQueue[0]);
+        clFlush(cqCommandQueue[1]);
+
+        // Start Phase 0 (Rolls over) ***********************************
+
+        // Non Blocking Read of 2nd half of output data from device to host, command-queue 1 
+        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+    }
+
+    // *** Sync to host and get average sequence time
+    clFinish(cqCommandQueue[0]);
+    clFinish(cqCommandQueue[1]);    
+    dAvgTime = shrDeltaT(0)/(double)iCycles;
+
+    // Log config if asked for
+    if (bShowConfig)
+    {
+        shrLog("\n2-Queue sequence Configuration:\n");
+        shrLog("  Global Work Size (per command-queue)\t= %u\n  Local Work Size \t\t\t= %u\n  # of Work Groups (per command-queue)\t= %u\n  # of command-queues\t\t\t= 2\n", 
+           szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize); 
+    }
+
+    return dAvgTime;
+}
+
+// Function to adjust compute task according to device capability
+// This allows a consistent overlap % across a wide variety of GPU's for test purposes
+// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
+// *********************************************************************
+int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
+{
+    // Locals
+    double dCopyTime, dComputeTime;
+    int iComputedLoopCount; 
+
+    // Change Source Data
+    shrFillArray(fSourceA, (int)uiNumElements);
+    shrFillArray(fSourceB, (int)uiNumElements);
+
+    // Reset Global work size for 1 command-queue, and log work sizes & dimensions
+    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
+
+    // *** Make sure queues are empty and then start timer 
+    clFinish(cqCommandQueue[0]);
+    clFinish(cqCommandQueue[1]);  
+    shrDeltaT(0);
+
+    // Run the copy iCycles times and measure copy time on this system
+    for (int i = 0; i < iCycles; i++)
+    {
+        // Nonblocking Write of all of input data from host to device in command-queue 0 
+        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
+        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
+        ciErrNum |= clFlush(cqCommandQueue[0]);
+        shrCheckError(ciErrNum, CL_SUCCESS);
+    }
+    clFinish(cqCommandQueue[0]);
+    dCopyTime = shrDeltaT(0);
+
+    // Run the compute iCycles times and measure compute time on this system
+    for (int i = 0; i < iCycles; i++)
+    {
+        // Launch kernel computation, command-queue 0 
+        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+        ciErrNum |= clFlush(cqCommandQueue[0]);
+        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    }
+    clFinish(cqCommandQueue[0]);
+    dComputeTime = shrDeltaT(0);
+
+    // Determine number of core loop cycles proportional to copy/compute time ratio
+    dComputeTime = MAX(dComputeTime, 1.0e-6);
+    iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
+    ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
+    ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
+    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    return (iComputedLoopCount);
+} 
+
+// Cleanup/Exit function 
+// *********************************************************************
+void Cleanup (int iExitCode)
+{
+    // Cleanup allocated objects
+    shrLog("Starting Cleanup...\n\n");
+    if(cPathAndName)free(cPathAndName);
+    if(cSourceCL)free(cSourceCL);
+    if(Golden)free(Golden);
+    if(ckKernel[0])clReleaseKernel(ckKernel[0]);
+    if(ckKernel[1])clReleaseKernel(ckKernel[1]);
+    if(cpProgram)clReleaseProgram(cpProgram);
+    if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
+    if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
+    if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
+    if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
+    if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
+    if(cmDevResult)clReleaseMemObject(cmDevResult);
+    if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
+    if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
+    if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
+    if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
+    if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
+    if(cxGPUContext)clReleaseContext(cxGPUContext);
+    if(cdDevices)free(cdDevices);
+
+    // Master status Pass/Fail (all tests)
+    shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
+}
+
+// "Golden" Host processing vector hyptenuse function for comparison purposes
+// *********************************************************************
+void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
+{
+    for (unsigned int i = 0; i < uiNumElements; i++) 
+    {
+        float fA = pfData1[i];
+        float fB = pfData2[i];
+        float fC = sqrtf(fA * fA + fB * fB);
+
+        pfResult[i] = fC;
+    }
+}
--- a/benchmarks/opencl/VectorHypot/oclUtils.h
+++ b/benchmarks/opencl/VectorHypot/oclUtils.h
@ -0,0 +1,198 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+ 
+#ifndef OCL_UTILS_H
+#define OCL_UTILS_H
+
+// *********************************************************************
+// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// Common headers:  Cross-API utililties and OpenCL header
+#include <shrUtils.h>
+
+// All OpenCL headers
+#if defined (__APPLE__) || defined(MACOSX)
+    #include <OpenCL/opencl.h>
+#else
+    #include <CL/opencl.h>
+#endif 
+
+// Includes
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// For systems with CL_EXT that are not updated with these extensions, we copied these
+// extensions from <CL/cl_ext.h>
+#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+  /* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+  #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+  #define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+  #define CL_DEVICE_WARP_SIZE_NV                      0x4003
+  #define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+  #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+  #define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#endif
+
+// reminders for build output window and log
+#ifdef _WIN32
+    #pragma message ("Note: including shrUtils.h")
+    #pragma message ("Note: including opencl.h")
+#endif
+
+// SDK Revision #
+#define OCL_SDKREVISION "7027912"
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0) 
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
+//!
+//! @return the id 
+//! @param clSelectedPlatformID         OpenCL platform ID
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print info about the device
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and return device capability
+//!
+//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA 
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" int oclGetDevCap(cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Print the device name
+//!
+//! @param iLogMode       enum LOGBOTH, LOGCONSOLE, LOGFILE
+//! @param device         OpenCL id of the device
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the first device from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of the nth device from the context
+//!
+//! @return the id or -1 when out of range
+//! @param cxGPUContext         OpenCL context
+//! @param device_idx            index of the device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Gets the id of device with maximal FLOPS from the context
+//!
+//! @return the id 
+//! @param cxGPUContext         OpenCL context
+//////////////////////////////////////////////////////////////////////////////
+extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Loads a Program file and prepends the cPreamble to the code.
+//!
+//! @return the source string if succeeded, 0 otherwise
+//! @param cFilename        program filename
+//! @param cPreamble        code that is prepended to the loaded file, typically a set of #defines or a header
+//! @param szFinalLength    returned length of the code string
+//////////////////////////////////////////////////////////////////////////////
+extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get the binary (PTX) of the program associated with the device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//! @param binary       returned code
+//! @param length       length of returned code
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram                   OpenCL program
+//! @param cdDevice                    device of interest
+//! @param const char*  cPtxFileName   optional PTX file name
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
+
+//////////////////////////////////////////////////////////////////////////////
+//! Get and log the Build Log from the OpenCL compiler for the requested program & device
+//!
+//! @param cpProgram    OpenCL program
+//! @param cdDevice     device of interest
+//////////////////////////////////////////////////////////////////////////////
+extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
+
+// Helper function for De-allocating cl objects
+// *********************************************************************
+extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
+
+// Helper function to get OpenCL error string from constant
+// *********************************************************************
+extern "C" const char* oclErrorString(cl_int error);
+
+// Helper function to get OpenCL image format string (channel order and type) from constant
+// *********************************************************************
+extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    // An error condition is defined by the sample/test value not equal to the reference
+    if (iReference != iSample)
+    {
+        // If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
+        iSample = (iSample == 0) ? -9999 : iSample; 
+
+        // Log the error info
+        shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
+
+        // Cleanup and exit, or just exit if no cleanup function pointer provided.  Use iSample (error code in this case) as process exit code.
+        if (pCleanup != NULL)
+        {
+            pCleanup(iSample);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(iSample);
+        }
+    }
+}
+
+#endif
--- a/benchmarks/opencl/VectorHypot/shrQATest.h
+++ b/benchmarks/opencl/VectorHypot/shrQATest.h
@ -0,0 +1,238 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_QATEST_H
+#define SHR_QATEST_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// OS dependent includes
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+    #pragma message ("Note: including time.h")
+
+// Headers needed for Windows
+    #include <windows.h>
+	#include <time.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+    #include <unistd.h>
+    #include <time.h>
+#endif
+
+#ifndef STRCASECMP
+#ifdef _WIN32
+#define STRCASECMP _stricmp
+#else
+#define STRCASECMP strcasecmp
+#endif
+#endif
+
+#ifndef STRNCASECMP
+#ifdef _WIN32
+#define STRNCASECMP _strnicmp
+#else
+#define STRNCASECMP strncasecmp
+#endif
+#endif
+
+
+// Standardized QA Start/Finish for CUDA SDK tests
+#define shrQAStart(a, b)      __shrQAStart(a, b)
+#define shrQAFinish(a, b, c)  __shrQAFinish(a, b, c)
+#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
+
+inline int findExeNameStart(const char *exec_name)
+{
+    int exename_start = (int)strlen(exec_name);
+
+    while( (exename_start > 0) && 
+            (exec_name[exename_start] != '\\') && 
+            (exec_name[exename_start] != '/') )
+    {
+        exename_start--;
+    }
+    if (exec_name[exename_start] == '\\' || 
+        exec_name[exename_start] == '/')
+    {
+        return exename_start+1;
+    } else {
+        return exename_start;
+    }
+}
+
+inline int __shrQAStart(int argc, char **argv)
+{
+    bool bQATest = false;
+    // First clear the output buffer
+    fflush(stdout);
+    fflush(stdout);
+
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+        char *string_argv = &argv[i][string_start];
+
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }
+    }
+    
+    // We don't want to print the entire path, so we search for the first 
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    return exename_start;
+}
+
+enum eQAstatus {
+    QA_FAILED = 0,
+    QA_PASSED = 1,
+    QA_WAIVED = 2
+};
+
+inline void __ExitInTime(int seconds)
+{
+    fprintf(stdout, "> exiting in %d seconds: ", seconds);
+    fflush(stdout);
+    time_t t;
+    int count;
+    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
+        fprintf(stdout, "%d...", count);
+#ifdef WIN32
+        Sleep(1000);
+#else
+        sleep(1);
+#endif
+    }
+    fprintf(stdout,"done!\n\n"); 
+	fflush(stdout);
+}
+
+
+inline void __shrQAFinish(int argc, const char **argv, int iStatus)
+{
+    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
+    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+	
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        if (!STRCASECMP(string_argv, "qatest")) {
+           bQATest = true;
+        }	
+        // For SDK individual samples that don't specify -noprompt or -prompt, 
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bNoPrompt = true;
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bNoPrompt = false;
+            bQuitInTime = false;
+        }
+    }
+
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
+    }
+    fflush(stdout);
+    printf("\n"); fflush(stdout);
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    } else {
+        if (!bNoPrompt) {
+            fprintf(stdout, "\nPress <Enter> to exit...\n");
+            fflush(stdout);
+            getchar();
+        }
+    }
+}
+
+inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
+{
+    bool bQuitInTime = true;
+    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
+	
+    for (int i=1; i < argc; i++) {
+        int string_start = 0;
+        while (argv[i][string_start] == '-')
+           string_start++;
+
+        const char *string_argv = &argv[i][string_start];
+        // For SDK individual samples that don't specify -noprompt or -prompt, 
+        // a 3 second delay will happen before exiting, giving a user time to view results
+        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
+            bQuitInTime = false;
+        }
+        if (!STRCASECMP(string_argv, "prompt")) {
+            bQuitInTime = false;
+        }
+    }
+
+    int exename_start = findExeNameStart(argv[0]);
+    if (bQATest) {
+        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
+        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
+        fprintf(stdout, "\n");
+    } else {
+        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
+    }
+    fflush(stdout);
+    
+    if (bQuitInTime) {
+        __ExitInTime(3);
+    }
+}
+
+inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish(argc, argv, iStatus);
+
+    exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE); 
+}
+
+inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
+{
+    __shrQAFinish2(bQAtest, argc, argv, iStatus);
+
+    exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+#endif
--- a/benchmarks/opencl/VectorHypot/shrUtils.h
+++ b/benchmarks/opencl/VectorHypot/shrUtils.h
@ -0,0 +1,642 @@
+/*
+* Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+*
+* Please refer to the NVIDIA end user license agreement (EULA) associated
+* with this source code for terms and conditions that govern your use of
+* this software. Any use, reproduction, disclosure, or distribution of
+* this software and related documentation outside the terms of the EULA
+* is strictly prohibited.
+*
+*/
+
+#ifndef SHR_UTILS_H
+#define SHR_UTILS_H
+
+// *********************************************************************
+// Generic utilities for NVIDIA GPU Computing SDK 
+// *********************************************************************
+
+// reminders for output window and build log
+#ifdef _WIN32
+    #pragma message ("Note: including windows.h")
+    #pragma message ("Note: including math.h")
+    #pragma message ("Note: including assert.h")
+#endif
+
+// OS dependent includes
+#ifdef _WIN32
+    // Headers needed for Windows
+    #include <windows.h>
+#else
+    // Headers needed for Linux
+    #include <sys/stat.h>
+    #include <sys/types.h>
+    #include <sys/time.h>
+    #include <stdio.h>
+    #include <stdlib.h>
+    #include <string.h>
+    #include <stdarg.h>
+#endif
+
+// Other headers needed for both Windows and Linux
+#include <math.h>
+#include <assert.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+// Un-comment the following #define to enable profiling code in SDK apps
+//#define GPU_PROFILING
+
+// Beginning of GPU Architecture definitions
+inline int ConvertSMVer2Cores(int major, int minor)
+{
+	// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
+	typedef struct {
+		int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
+		int Cores;
+	} sSMtoCores;
+
+	sSMtoCores nGpuArchCoresPerSM[] = 
+	{ { 0x10,  8 }, // Tesla Generation (SM 1.0) G80 class
+	  { 0x11,  8 }, // Tesla Generation (SM 1.1) G8x class
+	  { 0x12,  8 }, // Tesla Generation (SM 1.2) G9x class
+	  { 0x13,  8 }, // Tesla Generation (SM 1.3) GT200 class
+	  { 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
+	  { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
+	  { 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
+	  {   -1, -1 }
+	};
+
+	int index = 0;
+	while (nGpuArchCoresPerSM[index].SM != -1) {
+		if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
+			return nGpuArchCoresPerSM[index].Cores;
+		}
+		index++;
+	}
+	printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
+	return -1;
+}
+// end of GPU Architecture definitions
+
+
+// Defines and enum for use with logging functions
+// *********************************************************************
+#define DEFAULTLOGFILE "SdkConsoleLog.txt"
+#define MASTERLOGFILE "SdkMasterLog.csv"
+enum LOGMODES 
+{
+    LOGCONSOLE = 1, // bit to signal "log to console" 
+    LOGFILE    = 2, // bit to signal "log to file" 
+    LOGBOTH    = 3, // convenience union of first 2 bits to signal "log to both"
+    APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
+    MASTER     = 8, // bit to signal master .csv log output
+    ERRORMSG   = 16, // bit to signal "pre-pend Error" 
+    CLOSELOG   = 32  // bit to close log file, if open, after any requested file write
+};
+#define HDASHLINE "-----------------------------------------------------------\n"
+
+// Standardized boolean
+enum shrBOOL
+{
+    shrFALSE = 0,
+    shrTRUE = 1
+};
+
+// Standardized MAX, MIN and CLAMP
+#define MAX(a, b) ((a > b) ? a : b)
+#define MIN(a, b) ((a < b) ? a : b)
+#define CLAMP(a, b, c) MIN(MAX(a, b), c)    // double sided clip of input a
+#define TOPCLAMP(a, b) (a < b ? a:b)	    // single top side clip of input a
+
+// Error and Exit Handling Macros... 
+// *********************************************************************
+// Full error handling macro with Cleanup() callback (if supplied)... 
+// (Companion Inline Function lower on page)
+#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__) 
+
+// Short version without Cleanup() callback pointer
+// Both Input (a) and Reference (b) are specified as args
+#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0) 
+
+// Standardized Exit Macro for leaving main()... extended version
+// (Companion Inline Function lower on page)
+#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
+
+// Standardized Exit Macro for leaving main()... short version
+// (Companion Inline Function lower on page)
+#define shrEXIT(a, b)        __shrExitEX(a, b, EXIT_SUCCESS)
+
+// Simple argument checker macro
+#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE 
+
+// Define for user-customized error handling
+#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
+
+// Function to deallocate memory allocated within shrUtils
+// *********************************************************************
+extern "C" void shrFree(void* ptr);
+
+// *********************************************************************
+// Helper function to log standardized information to Console, to File or to both
+//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n"); 
+//!         : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+//! 
+//! Automatically opens file and stores handle if needed and not done yet
+//! Closes file and nulls handle on request
+//! 
+//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.  
+//!          LOGFILE and LOGBOTH may be | 'd  with APPENDMODE to select file append mode instead of overwrite mode 
+//!          LOGFILE and LOGBOTH may be | 'd  with CLOSELOG to "write and close" 
+//!          First 3 options may be | 'd  with MASTER to enable independent write to master data log file
+//!          First 3 options may be | 'd  with ERRORMSG to start line with standard error message
+//! @param 2 dValue:    
+//!          Positive val = double value for time in secs to be formatted to 6 decimals. 
+//!          Negative val is an error code and this give error preformatting.
+//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.  
+//!          ALL printf flags, width, precision and type specifiers are supported with this exception: 
+//!              Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
+//!              Single byte char type specifiers (%s and %c) ARE supported 
+//! @param 4... variable args: like printf or fprintf.  Must match format specifer type above.  
+//! @return 0 if OK, negative value on error or if error occurs or was passed in. 
+// *********************************************************************
+extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
+
+// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, 
+// *********************************************************************
+extern "C" int shrLog(const char* cFormatString, ...);
+
+// *********************************************************************
+// Delta timer function for up to 3 independent timers using host high performance counters 
+// Maintains state for 3 independent counters
+//! Example: double dElapsedTime = shrDeltaTime(0);
+//! 
+//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
+//! @return delta time of specified counter since last call in seconds.  Otherwise -9999.0 if error
+// *********************************************************************
+extern "C" double shrDeltaT(int iCounterID);
+
+// Optional LogFileNameOverride function
+// *********************************************************************
+extern "C" void shrSetLogFileName (const char* cOverRideName);
+
+// Helper function to init data arrays 
+// *********************************************************************
+extern "C" void shrFillArray(float* pfData, int iSize);
+
+// Helper function to print data arrays 
+// *********************************************************************
+extern "C" void shrPrintArray(float* pfData, int iSize);
+
+////////////////////////////////////////////////////////////////////////////
+//! Find the path for a filename
+//! @return the path if succeeded, otherwise 0
+//! @param filename        name of the file
+//! @param executablePath  optional absolute path of the executable
+////////////////////////////////////////////////////////////////////////////
+extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing single precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing double precision floating point data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned integer data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is 
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is 
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len, 
+              bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Read file \filename containing unsigned char / byte data
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param filename name of the source file
+//! @param data  uninitialized pointer, returned initialized and pointing to
+//!        the data read
+//! @param len  number of data elements in data, -1 on error
+//! @note If a NULL pointer is passed to this function and it is
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data, 
+               unsigned int* len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing single precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
+               const float epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing double precision floating point 
+//! data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+//! @param epsilon  epsilon for comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
+               const double epsilon, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned integer data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data, 
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len, 
+               bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename containing unsigned char / byte data
+//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
+//! @param filename name of the file to write
+//! @param data  pointer to data to write
+//! @param len  number of data elements in data, -1 on error
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
+                unsigned int len, bool verbose = false);
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PPM image file (with unsigned char as data element type), padding 
+//! 4th component
+//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param OutData  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! 
+//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData, 
+                             unsigned int *w, unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+//! Save PPM image file (with unsigned char as data element type, padded to 
+//! 4 bytes)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data, 
+               unsigned int w, unsigned int h);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Save PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data, 
+              unsigned int w, unsigned int h); 
+
+////////////////////////////////////////////////////////////////////////////
+//! Load PGM image file (with unsigned char as data element type)
+//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
+//! @param file  name of the image file
+//! @param data  handle to the data read
+//! @param w     width of the image
+//! @param h     height of the image
+//! @note If a NULL pointer is passed to this function and it is initialized 
+//!       within shrUtils, then free() has to be used to deallocate the memory
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
+                  unsigned int *w,unsigned int *h);
+
+////////////////////////////////////////////////////////////////////////////
+// Command line arguments: General notes
+// * All command line arguments begin with '--' followed by the token; 
+//   token and value are seperated by '='; example --samples=50
+// * Arrays have the form --model=[one.obj,two.obj,three.obj] 
+//   (without whitespaces)
+////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////
+//! Check if command line argument \a flag-name is given
+//! @return shrTRUE if command line argument \a flag_name has been given, 
+//!         otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param flag_name  name of command line flag
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv, 
+                     const char* flag_name);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv, 
+                        const char* arg_name, int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type unsigned int
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv, 
+                        const char* arg_name, unsigned int* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type float
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv, 
+                        const char* arg_name, float* val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument of type string
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  value of the command line argument
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv, 
+                          const char* arg_name, char** val);
+
+////////////////////////////////////////////////////////////////////////////
+//! Get the value of a command line argument list those element are strings
+//! @return shrTRUE if command line argument \a arg_name has been given and
+//!         is of the requested type, otherwise shrFALSE
+//! @param argc  argc as passed to main()
+//! @param argv  argv as passed to main()
+//! @param arg_name  name of the command line argument
+//! @param val  command line argument list
+//! @param len  length of the list / number of elements
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv, 
+                              const char* arg_name, char** val, 
+                              unsigned int* len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparef( const float* reference, const float* data,
+             const unsigned int len);
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparei( const int* reference, const int* data, 
+             const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned integer arrays, with epsilon and threshold
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
+            const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two unsigned char arrays
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
+              const unsigned int len ); 
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two integers with a tolernance for # of byte errors
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  tolerance % # of comparison errors (0.15f = 15%)
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two integer arrays witha n epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
+             const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
+              const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays with an epsilon tolerance for equality and a 
+//!     threshold for # pixel errors
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
+             const unsigned int len, const float epsilon, const float threshold );
+
+////////////////////////////////////////////////////////////////////////////
+//! Compare two float arrays using L2-norm with an epsilon tolerance for 
+//! equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison
+////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
+                const unsigned int len, const float epsilon );
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PPM image files with an epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param data       filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! $param verboseErrors output details of image mismatch to std::err
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compare two PGM image files with an epsilon tolerance for equality
+//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
+//! @param src_file   filename for the image to be compared
+//! @param data       filename for the reference data / gold image
+//! @param epsilon    epsilon to use for the comparison
+//! @param threshold  threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
+//! $param verboseErrors output details of image mismatch to std::err
+////////////////////////////////////////////////////////////////////////////////
+extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
+
+extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
+
+extern "C" size_t shrRoundUp(int group_size, int global_size);
+
+// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
+// *********************************************************************
+inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
+{
+    if (iReference != iSample)
+    {
+        shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile); 
+        if (pCleanup != NULL)
+        {
+            pCleanup(EXIT_FAILURE);
+        }
+        else 
+        {
+            shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
+            exit(EXIT_FAILURE);
+        }
+    }
+}
+
+// Standardized Exit
+// *********************************************************************
+inline void __shrExitEX(int argc, const char** argv, int iExitCode)
+{
+#ifdef WIN32
+    if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#else 
+    if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest")) 
+#endif
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");                  
+        getchar();                                                           
+    }       
+    else 
+    {
+        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]); 
+    }
+    fflush(stderr);                                                         
+    exit(iExitCode);
+}
+
+#endif