mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
Add files via upload
This commit is contained in:
parent
54429d0017
commit
6f26b68760
5 changed files with 1805 additions and 0 deletions
41
benchmarks/opencl/VectorHypot/VectorHypot.cl
Normal file
41
benchmarks/opencl/VectorHypot/VectorHypot.cl
Normal file
|
@ -0,0 +1,41 @@
|
|||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// OpenCL Kernel Function Naive Implementation for hyptenuse
|
||||
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
|
||||
{
|
||||
// get index into global data array
|
||||
size_t szGlobalOffset = get_global_id(0) + uiOffset;
|
||||
|
||||
// bound check
|
||||
if (szGlobalOffset >= uiNumElements)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Processing 4 elements per work item, so read fgA and fgB source values from GMEM
|
||||
float4 f4A = fg4A[szGlobalOffset];
|
||||
float4 f4B = fg4B[szGlobalOffset];
|
||||
float4 f4H = (float4)0.0f;
|
||||
|
||||
// Get the hypotenuses the vectors of 'legs', but exaggerate the time needed with loop
|
||||
for (int i = 0; i < iInnerLoopCount; i++)
|
||||
{
|
||||
// compute the 4 hypotenuses using built-in function
|
||||
f4H.x = hypot (f4A.x, f4B.x);
|
||||
f4H.y = hypot (f4A.y, f4B.y);
|
||||
f4H.z = hypot (f4A.z, f4B.z);
|
||||
f4H.w = hypot (f4A.w, f4B.w);
|
||||
}
|
||||
|
||||
// Write 4 result values back out to GMEM
|
||||
fg4Hypot[szGlobalOffset] = f4H;
|
||||
}
|
686
benchmarks/opencl/VectorHypot/main.cc
Normal file
686
benchmarks/opencl/VectorHypot/main.cc
Normal file
|
@ -0,0 +1,686 @@
|
|||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// oclCopyComputeOverlap Notes:
|
||||
//
|
||||
// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
|
||||
// element by element vector hyptenuse computation using 2 input float arrays
|
||||
// and 1 output float array.
|
||||
//
|
||||
// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
|
||||
// with respect to GPU computation (and with respect to host thread).
|
||||
//
|
||||
// Because the overlap acheivable for this computation and data set on a given system depends upon the GPU being used and the
|
||||
// GPU/Host bandwidth, the sample adjust the computation duration to test the most ideal case and test against a consistent standard.
|
||||
// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
|
||||
//
|
||||
// After setup, warmup and calibration to the system, the sample runs 4 scenarios:
|
||||
// A) Computations with 2 command queues on GPU
|
||||
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||
// B) Computations with 1 command queue on GPU
|
||||
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||
//
|
||||
// The 2-command queue approach ought to be substantially faster
|
||||
//
|
||||
// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently
|
||||
// increases compute time without increasing data size (via a loop inside the kernel)
|
||||
//
|
||||
// At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
|
||||
// (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
|
||||
//
|
||||
// If we name the time to copy single input vector H2D (or outpute vector D2H) as "T", then the optimum comparison case is:
|
||||
//
|
||||
// Single Queue with all the data and all the work
|
||||
// Ttot (serial) = 4T + 4T + 2T = 10T
|
||||
//
|
||||
// Dual Queue, where each queue has 1/2 the data and 1/2 the work
|
||||
// Tq0 (overlap) = 2T + 2T + T ....
|
||||
// Tq1 (overlap) = .... 2T + 2T + T
|
||||
//
|
||||
// Ttot (elapsed, wall) = 2T + 2T + 2T + T = 7T
|
||||
//
|
||||
// Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 % (Tesla arch 1.2 or 1.3, single copy engine)
|
||||
//
|
||||
// For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
|
||||
// This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.
|
||||
// *********************************************************************
|
||||
|
||||
// common SDK header for standard utilities and system libs
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
|
||||
// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
|
||||
// values greater than 0.0f represent a speed-up relative to non-overlapped
|
||||
#define EXPECTED_OVERLAP 30.0f
|
||||
#define EXPECTED_OVERLAP_FERMI 45.0f
|
||||
#define PASS_FACTOR 0.60f
|
||||
#define RETRIES_ON_FAILURE 1
|
||||
|
||||
// Base sizes for parameters manipulated dynamically or on the command line
|
||||
#define BASE_WORK_ITEMS 64
|
||||
#define BASE_ARRAY_LENGTH 40000
|
||||
#define BASE_LOOP_COUNT 32
|
||||
|
||||
// Vars
|
||||
// *********************************************************************
|
||||
cl_platform_id cpPlatform; // OpenCL platform
|
||||
cl_context cxGPUContext; // OpenCL context
|
||||
cl_command_queue cqCommandQueue[2]; // OpenCL command queues
|
||||
cl_device_id* cdDevices; // OpenCL device list
|
||||
cl_program cpProgram; // OpenCL program
|
||||
cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue
|
||||
cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A
|
||||
cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B
|
||||
cl_mem cmPinnedResult; // OpenCL pinned host result buffer
|
||||
float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer
|
||||
float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer
|
||||
float* fResult = NULL; // Mapped pointer for pinned Host result buffer
|
||||
cl_mem cmDevSrcA; // OpenCL device source buffer A
|
||||
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||
cl_mem cmDevResult; // OpenCL device result buffer
|
||||
size_t szBuffBytes; // Size of main buffers
|
||||
size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range
|
||||
size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group
|
||||
cl_int ciErrNum; // Error code var
|
||||
char* cPathAndName = NULL; // Var for full paths to data, src, etc.
|
||||
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||
const char* cExecutableName = NULL;
|
||||
|
||||
// demo config vars
|
||||
const char* cSourceFile = "VectorHypot.cl"; // OpenCL computation kernel source code
|
||||
float* Golden = NULL; // temp buffer to hold golden results for cross check
|
||||
bool bNoPrompt = false; // Command line switch to skip exit prompt
|
||||
bool bQATest = false; // Command line switch to test
|
||||
|
||||
// Forward Declarations
|
||||
// *********************************************************************
|
||||
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles);
|
||||
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
|
||||
void Cleanup (int iExitCode);
|
||||
void (*pCleanup)(int) = &Cleanup;
|
||||
|
||||
int *gp_argc = 0;
|
||||
const char *** gp_argv = NULL;
|
||||
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
//Locals
|
||||
size_t szKernelLength; // Byte size of kernel code
|
||||
double dBuildTime; // Compile time
|
||||
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
|
||||
cl_uint uiNumDevices; // Number of devices available
|
||||
int iDevCap = -1; // Capability of device
|
||||
int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel
|
||||
const int iTestCycles = 10; // How many times to run the external test loop
|
||||
const int iWarmupCycles = 8; // How many times to run the warmup sequence
|
||||
cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
|
||||
cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item)
|
||||
cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=<n>") to optionally increase vector sizes
|
||||
bool bPassFlag = false; // Var to accumulate test pass/fail
|
||||
shrBOOL bMatch = shrFALSE; // Cross check result
|
||||
shrBOOL bTestOverlap = shrFALSE;
|
||||
double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test
|
||||
double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator)
|
||||
float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch
|
||||
|
||||
gp_argc = &argc;
|
||||
gp_argv = &argv;
|
||||
|
||||
shrQAStart(argc, (char **)argv);
|
||||
|
||||
// start logs
|
||||
cExecutableName = argv[0];
|
||||
shrSetLogFileName ("oclCopyComputeOverlap.txt");
|
||||
shrLog("%s Starting...\n\n", argv[0]);
|
||||
|
||||
// get basic command line args
|
||||
bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
|
||||
bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
|
||||
shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
|
||||
|
||||
// Optional Command-line multiplier for vector size
|
||||
// Default val of 4 gives 10.24 million float elements per vector
|
||||
// Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
|
||||
shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
|
||||
uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);
|
||||
uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
|
||||
shrLog("Array sizes = %u float elements\n", uiNumElements);
|
||||
|
||||
// Optional Command-line multiplier for workgroup size (x 64 work items)
|
||||
// Default val of 4 gives szLocalWorkSize of 256.
|
||||
// Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
|
||||
shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
|
||||
uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10);
|
||||
szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
|
||||
shrLog("Workgroup Size = %u\n\n", szLocalWorkSize);
|
||||
|
||||
// Get the NVIDIA platform if available, otherwise use default
|
||||
shrLog("Get the Platform ID...\n\n");
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Get OpenCL platform name and version
|
||||
char cBuffer[256];
|
||||
ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("Platform Name = %s\n\n", cBuffer);
|
||||
|
||||
// Get all the devices
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
|
||||
|
||||
// Ethans changes
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
|
||||
//ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Set target device and check capabilities
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||
shrLog(" Using Device %u, ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
|
||||
if (iDevCap > 0) {
|
||||
shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
|
||||
} else {
|
||||
shrLog("\n\n", iDevCap);
|
||||
}
|
||||
if (strstr(cBuffer, "NVIDIA") != NULL)
|
||||
{
|
||||
if (iDevCap < 12)
|
||||
{
|
||||
shrLog("Device doesn't have overlap capability. Skipping test...\n");
|
||||
Cleanup (EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
// Device and Platform eligible for overlap testing
|
||||
bTestOverlap = shrTRUE;
|
||||
|
||||
// If device has overlap capability, proceed
|
||||
fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines
|
||||
if (iDevCap != 20)
|
||||
{
|
||||
// Single copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles
|
||||
}
|
||||
else
|
||||
{
|
||||
char cDevName[1024];
|
||||
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
|
||||
if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
|
||||
{
|
||||
// Tesla or Quadro (arch = 2.0) ... Dual copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles
|
||||
}
|
||||
else
|
||||
{
|
||||
// Geforce ... Single copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create the context
|
||||
shrLog("clCreateContext...\n");
|
||||
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create 2 command-queues
|
||||
cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateCommandQueue [0]...\n");
|
||||
cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateCommandQueue [1]...\n");
|
||||
|
||||
// Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
|
||||
szBuffBytes = sizeof(cl_float) * uiNumElements;
|
||||
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements);
|
||||
|
||||
// Allocate pinned source and result host buffers:
|
||||
// Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
|
||||
cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements);
|
||||
|
||||
// Get mapped pointers to pinned input host buffers
|
||||
// Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
|
||||
fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n");
|
||||
|
||||
// Alloc temp golden buffer for cross checks
|
||||
Golden = (float*)malloc(szBuffBytes);
|
||||
//oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
|
||||
|
||||
// Read the OpenCL kernel in from source file
|
||||
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
|
||||
//oclCheckError(cPathAndName != NULL, shrTRUE);
|
||||
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
|
||||
// oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||
|
||||
// Create the program object
|
||||
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateProgramWithSource...\n");
|
||||
|
||||
// Build the program for the target device
|
||||
clFinish(cqCommandQueue[0]);
|
||||
shrDeltaT(0);
|
||||
ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
|
||||
shrLog("clBuildProgram...");
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
// write out standard error, Build Log and PTX, then cleanup and exit
|
||||
shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
|
||||
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
|
||||
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
|
||||
Cleanup(EXIT_FAILURE);
|
||||
}
|
||||
dBuildTime = shrDeltaT(0);
|
||||
|
||||
// Ethan - Kernel Addition
|
||||
cl_program program =
|
||||
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
|
||||
if (program == NULL) {
|
||||
std::cerr << "Failed to write program binary" << std::endl;
|
||||
Cleanup(context, queue, program, kernel, memObjects);
|
||||
return 1;
|
||||
} else {
|
||||
std::cout << "Read program from binary." << std::endl;
|
||||
}
|
||||
|
||||
// Create the kernel
|
||||
ckKernel[0] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
ckKernel[1] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateKernel (ckKernel[2])...\n");
|
||||
|
||||
// Offsets for 2 queues
|
||||
cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
|
||||
|
||||
// Set the Argument values for the 1st kernel instance (queue 0)
|
||||
ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||
shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n");
|
||||
|
||||
// Set the Argument values for the 2d kernel instance (queue 1)
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n");
|
||||
|
||||
//*******************************************
|
||||
// Warmup the driver with dual queue sequence
|
||||
//*******************************************
|
||||
|
||||
// Warmup with dual queue sequence for iTestCycles
|
||||
shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
|
||||
DualQueueSequence(iWarmupCycles, uiNumElements, false);
|
||||
|
||||
// Use single queue config to adjust compute intensity
|
||||
shrLog("Adjust compute for GPU / system...\n");
|
||||
iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles);
|
||||
shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount);
|
||||
|
||||
//*******************************************
|
||||
// Run and time with 2 command-queues
|
||||
//*******************************************
|
||||
for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
|
||||
|
||||
// Run the sequence iTestCycles times
|
||||
dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
|
||||
|
||||
// Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrLog(" Device vs Host Result Comparison\t: ");
|
||||
VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||
shrDeltaT(0);
|
||||
for (int i = 0; i < iTestCycles; i++)
|
||||
{
|
||||
VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||
}
|
||||
dHostTime[0] = shrDeltaT(0)/iTestCycles;
|
||||
|
||||
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||
bPassFlag = (bMatch == shrTRUE);
|
||||
|
||||
//*******************************************
|
||||
// Run and time with 1 command queue
|
||||
//*******************************************
|
||||
// Run the sequence iTestCycles times
|
||||
dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
|
||||
|
||||
// Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrLog(" Device vs Host Result Comparison\t: ");
|
||||
shrDeltaT(0);
|
||||
for (int i = 0; i < iTestCycles; i++)
|
||||
{
|
||||
VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);
|
||||
}
|
||||
dHostTime[1] = shrDeltaT(0)/iTestCycles;
|
||||
|
||||
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||
bPassFlag &= (bMatch == shrTRUE);
|
||||
|
||||
//*******************************************
|
||||
|
||||
// Compare Single and Dual queue timing
|
||||
shrLog("\nResult Summary:\n");
|
||||
|
||||
// Log GPU and CPU Time for 2-queue scenario
|
||||
shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
|
||||
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
|
||||
|
||||
// Log GPU and CPU Time for 1-queue scenario
|
||||
shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
|
||||
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
|
||||
|
||||
// Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
|
||||
double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
|
||||
|
||||
if( bTestOverlap ) {
|
||||
bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
|
||||
if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
|
||||
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
|
||||
|
||||
// Log info to master log in standard format
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||
dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize);
|
||||
|
||||
bPassFlag &= bAvgOverlapOK;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
|
||||
}
|
||||
|
||||
|
||||
//*******************************************
|
||||
// Report pass/fail, cleanup and exit
|
||||
Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Run 1 queue sequence for n cycles
|
||||
// *********************************************************************
|
||||
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||
{
|
||||
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||
|
||||
// *** Make sure queues are empty and then start timer
|
||||
double dAvgTime = 0.0;
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
// Run the sequence iCycles times
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Launch kernel computation, command-queue 0
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Non Blocking Read of output data from device to host, command-queue 0
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
}
|
||||
|
||||
// *** Assure sync to host and return average sequence time
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||
|
||||
// Log config if asked for
|
||||
if (bShowConfig)
|
||||
{
|
||||
shrLog("\n1-Queue sequence Configuration:\n");
|
||||
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||
}
|
||||
return dAvgTime;
|
||||
}
|
||||
|
||||
// Run 2 queue sequence for n cycles
|
||||
// *********************************************************************
|
||||
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||
{
|
||||
// Locals
|
||||
size_t szHalfBuffer = szBuffBytes / 2;
|
||||
size_t szHalfOffset = szHalfBuffer / sizeof(float);
|
||||
double dAvgTime = 0.0;
|
||||
|
||||
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Set Global work size for 2 command-queues, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
|
||||
|
||||
// Make sure queues are empty and then start timer
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Mid Phase 0
|
||||
// Nonblocking Write of 1st half of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 1 ***********************************
|
||||
|
||||
// Launch kernel computation, command-queue 0
|
||||
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Nonblocking Write of 2nd half of input data from host to device in command-queue 1
|
||||
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the compute for queue 0 and write for queue 1 to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 2 ***********************************
|
||||
|
||||
// Launch kernel computation, command-queue 1
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Non Blocking Read of 1st half of output data from device to host, command-queue 0
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the compute for queue 1 and the read for queue 0 to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 0 (Rolls over) ***********************************
|
||||
|
||||
// Non Blocking Read of 2nd half of output data from device to host, command-queue 1
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
|
||||
// *** Sync to host and get average sequence time
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||
|
||||
// Log config if asked for
|
||||
if (bShowConfig)
|
||||
{
|
||||
shrLog("\n2-Queue sequence Configuration:\n");
|
||||
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||
}
|
||||
|
||||
return dAvgTime;
|
||||
}
|
||||
|
||||
// Function to adjust compute task according to device capability
|
||||
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
|
||||
// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
|
||||
// *********************************************************************
|
||||
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
|
||||
{
|
||||
// Locals
|
||||
double dCopyTime, dComputeTime;
|
||||
int iComputedLoopCount;
|
||||
|
||||
// Change Source Data
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||
|
||||
// *** Make sure queues are empty and then start timer
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
// Run the copy iCycles times and measure copy time on this system
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dCopyTime = shrDeltaT(0);
|
||||
|
||||
// Run the compute iCycles times and measure compute time on this system
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Launch kernel computation, command-queue 0
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
}
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dComputeTime = shrDeltaT(0);
|
||||
|
||||
// Determine number of core loop cycles proportional to copy/compute time ratio
|
||||
dComputeTime = MAX(dComputeTime, 1.0e-6);
|
||||
iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
return (iComputedLoopCount);
|
||||
}
|
||||
|
||||
// Cleanup/Exit function
|
||||
// *********************************************************************
|
||||
void Cleanup (int iExitCode)
|
||||
{
|
||||
// Cleanup allocated objects
|
||||
shrLog("Starting Cleanup...\n\n");
|
||||
if(cPathAndName)free(cPathAndName);
|
||||
if(cSourceCL)free(cSourceCL);
|
||||
if(Golden)free(Golden);
|
||||
if(ckKernel[0])clReleaseKernel(ckKernel[0]);
|
||||
if(ckKernel[1])clReleaseKernel(ckKernel[1]);
|
||||
if(cpProgram)clReleaseProgram(cpProgram);
|
||||
if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
|
||||
if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
|
||||
if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
|
||||
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
|
||||
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
|
||||
if(cmDevResult)clReleaseMemObject(cmDevResult);
|
||||
if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
|
||||
if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
|
||||
if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
|
||||
if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
|
||||
if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
|
||||
if(cxGPUContext)clReleaseContext(cxGPUContext);
|
||||
if(cdDevices)free(cdDevices);
|
||||
|
||||
// Master status Pass/Fail (all tests)
|
||||
shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
|
||||
}
|
||||
|
||||
// "Golden" Host processing vector hyptenuse function for comparison purposes
|
||||
// *********************************************************************
|
||||
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
|
||||
{
|
||||
for (unsigned int i = 0; i < uiNumElements; i++)
|
||||
{
|
||||
float fA = pfData1[i];
|
||||
float fB = pfData2[i];
|
||||
float fC = sqrtf(fA * fA + fB * fB);
|
||||
|
||||
pfResult[i] = fC;
|
||||
}
|
||||
}
|
198
benchmarks/opencl/VectorHypot/oclUtils.h
Normal file
198
benchmarks/opencl/VectorHypot/oclUtils.h
Normal file
|
@ -0,0 +1,198 @@
|
|||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef OCL_UTILS_H
|
||||
#define OCL_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers: Cross-API utililties and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// Includes
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||
// extensions from <CL/cl_ext.h>
|
||||
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
#endif
|
||||
|
||||
// reminders for build output window and log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including shrUtils.h")
|
||||
#pragma message ("Note: including opencl.h")
|
||||
#endif
|
||||
|
||||
// SDK Revision #
|
||||
#define OCL_SDKREVISION "7027912"
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" int oclGetDevCap(cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclErrorString(cl_int error);
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
// An error condition is defined by the sample/test value not equal to the reference
|
||||
if (iReference != iSample)
|
||||
{
|
||||
// If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
|
||||
iSample = (iSample == 0) ? -9999 : iSample;
|
||||
|
||||
// Log the error info
|
||||
shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
|
||||
|
||||
// Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code.
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(iSample);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(iSample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
238
benchmarks/opencl/VectorHypot/shrQATest.h
Normal file
238
benchmarks/opencl/VectorHypot/shrQATest.h
Normal file
|
@ -0,0 +1,238 @@
|
|||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_QATEST_H
|
||||
#define SHR_QATEST_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#pragma message ("Note: including time.h")
|
||||
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRCASECMP _stricmp
|
||||
#else
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef STRNCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRNCASECMP _strnicmp
|
||||
#else
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Standardized QA Start/Finish for CUDA SDK tests
|
||||
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||
|
||||
inline int findExeNameStart(const char *exec_name)
|
||||
{
|
||||
int exename_start = (int)strlen(exec_name);
|
||||
|
||||
while( (exename_start > 0) &&
|
||||
(exec_name[exename_start] != '\\') &&
|
||||
(exec_name[exename_start] != '/') )
|
||||
{
|
||||
exename_start--;
|
||||
}
|
||||
if (exec_name[exename_start] == '\\' ||
|
||||
exec_name[exename_start] == '/')
|
||||
{
|
||||
return exename_start+1;
|
||||
} else {
|
||||
return exename_start;
|
||||
}
|
||||
}
|
||||
|
||||
inline int __shrQAStart(int argc, char **argv)
|
||||
{
|
||||
bool bQATest = false;
|
||||
// First clear the output buffer
|
||||
fflush(stdout);
|
||||
fflush(stdout);
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
char *string_argv = &argv[i][string_start];
|
||||
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't want to print the entire path, so we search for the first
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
return exename_start;
|
||||
}
|
||||
|
||||
enum eQAstatus {
|
||||
QA_FAILED = 0,
|
||||
QA_PASSED = 1,
|
||||
QA_WAIVED = 2
|
||||
};
|
||||
|
||||
inline void __ExitInTime(int seconds)
|
||||
{
|
||||
fprintf(stdout, "> exiting in %d seconds: ", seconds);
|
||||
fflush(stdout);
|
||||
time_t t;
|
||||
int count;
|
||||
for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
|
||||
fprintf(stdout, "%d...", count);
|
||||
#ifdef WIN32
|
||||
Sleep(1000);
|
||||
#else
|
||||
sleep(1);
|
||||
#endif
|
||||
}
|
||||
fprintf(stdout,"done!\n\n");
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
|
||||
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bNoPrompt = true;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bNoPrompt = false;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
} else {
|
||||
if (!bNoPrompt) {
|
||||
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||
fflush(stdout);
|
||||
getchar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
bool bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
}
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish(argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#endif
|
642
benchmarks/opencl/VectorHypot/shrUtils.h
Normal file
642
benchmarks/opencl/VectorHypot/shrUtils.h
Normal file
|
@ -0,0 +1,642 @@
|
|||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_UTILS_H
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#endif
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
// Other headers needed for both Windows and Linux
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Un-comment the following #define to enable profiling code in SDK apps
|
||||
//#define GPU_PROFILING
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
inline int ConvertSMVer2Cores(int major, int minor)
|
||||
{
|
||||
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
|
||||
typedef struct {
|
||||
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
|
||||
int Cores;
|
||||
} sSMtoCores;
|
||||
|
||||
sSMtoCores nGpuArchCoresPerSM[] =
|
||||
{ { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
|
||||
{ 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
|
||||
{ 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
|
||||
{ 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
|
||||
{ 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
|
||||
{ 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
|
||||
{ 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
|
||||
{ -1, -1 }
|
||||
};
|
||||
|
||||
int index = 0;
|
||||
while (nGpuArchCoresPerSM[index].SM != -1) {
|
||||
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
|
||||
return nGpuArchCoresPerSM[index].Cores;
|
||||
}
|
||||
index++;
|
||||
}
|
||||
printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
|
||||
return -1;
|
||||
}
|
||||
// end of GPU Architecture definitions
|
||||
|
||||
|
||||
// Defines and enum for use with logging functions
|
||||
// *********************************************************************
|
||||
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||
enum LOGMODES
|
||||
{
|
||||
LOGCONSOLE = 1, // bit to signal "log to console"
|
||||
LOGFILE = 2, // bit to signal "log to file"
|
||||
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
|
||||
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
|
||||
MASTER = 8, // bit to signal master .csv log output
|
||||
ERRORMSG = 16, // bit to signal "pre-pend Error"
|
||||
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
|
||||
};
|
||||
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||
|
||||
// Standardized boolean
|
||||
enum shrBOOL
|
||||
{
|
||||
shrFALSE = 0,
|
||||
shrTRUE = 1
|
||||
};
|
||||
|
||||
// Standardized MAX, MIN and CLAMP
|
||||
#define MAX(a, b) ((a > b) ? a : b)
|
||||
#define MIN(a, b) ((a < b) ? a : b)
|
||||
#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a
|
||||
#define TOPCLAMP(a, b) (a < b ? a:b) // single top side clip of input a
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... extended version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... short version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||
|
||||
// Simple argument checker macro
|
||||
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||
|
||||
// Define for user-customized error handling
|
||||
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||
|
||||
// Function to deallocate memory allocated within shrUtils
|
||||
// *********************************************************************
|
||||
extern "C" void shrFree(void* ptr);
|
||||
|
||||
// *********************************************************************
|
||||
// Helper function to log standardized information to Console, to File or to both
|
||||
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
//!
|
||||
//! Automatically opens file and stores handle if needed and not done yet
|
||||
//! Closes file and nulls handle on request
|
||||
//!
|
||||
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||
//! @param 2 dValue:
|
||||
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||
//! Negative val is an error code and this give error preformatting.
|
||||
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
|
||||
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||
// *********************************************************************
|
||||
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||
|
||||
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||
// *********************************************************************
|
||||
extern "C" int shrLog(const char* cFormatString, ...);
|
||||
|
||||
// *********************************************************************
|
||||
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||
// Maintains state for 3 independent counters
|
||||
//! Example: double dElapsedTime = shrDeltaTime(0);
|
||||
//!
|
||||
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||
// *********************************************************************
|
||||
extern "C" double shrDeltaT(int iCounterID);
|
||||
|
||||
// Optional LogFileNameOverride function
|
||||
// *********************************************************************
|
||||
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||
|
||||
// Helper function to init data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||
|
||||
// Helper function to print data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param OutData handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//!
|
||||
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are seperated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag-name is given
|
||||
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||
//! otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type unsigned int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||
const char* arg_name, unsigned int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list those element are strings
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integers with a tolernance for # of byte errors
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays witha n epsilon tolerance for equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param data filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param data filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||
|
||||
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
if (iReference != iSample)
|
||||
{
|
||||
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Standardized Exit
|
||||
// *********************************************************************
|
||||
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
{
|
||||
#ifdef WIN32
|
||||
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#else
|
||||
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#endif
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||
getchar();
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||
}
|
||||
fflush(stderr);
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Add table
Add a link
Reference in a new issue