Add files via upload

This commit is contained in:
Lyons, Ethan Tyler 2019-11-25 01:38:35 -05:00 committed by GitHub Enterprise
parent aab920b1a2
commit 925b387990
6 changed files with 2023 additions and 0 deletions

View file

@ -0,0 +1,638 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
Parallel reduction
This sample shows how to perform a reduction operation on an array of values
to produce a single value.
Reductions are a very common computation in parallel algorithms. Any time
an array of values needs to be reduced to a single value using a binary
associative operator, a reduction can be used. Example applications include
statistics computaions such as mean and standard deviation, and image
processing applications such as finding the total luminance of an
image.
This code performs sum reductions, but any associative operator such as
min() or max() could also be used.
It assumes the input size is a power of 2.
COMMAND LINE ARGUMENTS
"--shmoo": Test performance for 1 to 32M elements with each of the 7 different kernels
"--n=<N>": Specify the number of elements to reduce (default 1048576)
"--threads=<N>": Specify the number of threads per block (default 128)
"--kernel=<N>": Specify which kernel to run (0-6, default 6)
"--maxblocks=<N>": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64)
"--cpufinal": Read back the per-block results and do final sum of block sums on CPU (default false)
"--cputhresh=<N>": The threshold of number of blocks sums below which to perform a CPU final reduction (default 1)
*/
// Common system and utility includes
#include <oclUtils.h>
#include <shrQATest.h>
// additional includes
#include <sstream>
#include <oclReduction.h>
// Forward declarations and sample-specific defines
// *********************************************************************
enum ReduceType
{
REDUCE_INT,
REDUCE_FLOAT,
REDUCE_DOUBLE
};
template <class T>
bool runTest( int argc, const char** argv, ReduceType datatype);
#define MAX_BLOCK_DIM_SIZE 65535
extern "C"
bool isPow2(unsigned int x)
{
return ((x&(x-1))==0);
}
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2);
// Main function
// *********************************************************************
int main( int argc, const char** argv)
{
shrQAStart(argc, (char **)argv);
// start logs
shrSetLogFileName ("oclReduction.txt");
shrLog("%s Starting...\n\n", argv[0]);
char *typeChoice;
shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice);
// determine type of array from command line args
if (0 == typeChoice)
{
typeChoice = (char*)malloc(7 * sizeof(char));
#ifdef WIN32
strcpy_s(typeChoice, 7 * sizeof(char) + 1, "int");
#else
strcpy(typeChoice, "int");
#endif
}
ReduceType datatype = REDUCE_INT;
#ifdef WIN32
if (!_strcmpi(typeChoice, "float"))
datatype = REDUCE_FLOAT;
else if (!_strcmpi(typeChoice, "double"))
datatype = REDUCE_DOUBLE;
else
datatype = REDUCE_INT;
#else
if (!strcmp(typeChoice, "float"))
datatype = REDUCE_FLOAT;
else if (!strcmp(typeChoice, "double"))
datatype = REDUCE_DOUBLE;
else
datatype = REDUCE_INT;
#endif
shrLog("Reducing array of type %s.\n", typeChoice);
//Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckError(ciErrNum, CL_SUCCESS);
//Get the devices
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
//Create the context
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
// get and log the device info
if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
int device_nr = 0;
shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr);
if( device_nr < uiNumDevices ) {
device = oclGetDev(cxGPUContext, device_nr);
} else {
shrLog("Invalid Device %d Requested.\n", device_nr);
shrExitEX(argc, argv, EXIT_FAILURE);
}
} else {
device = oclGetMaxFlopsDev(cxGPUContext);
}
oclPrintDevName(LOGBOTH, device);
shrLog("\n");
// create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]);
bool bSuccess = false;
switch (datatype)
{
default:
case REDUCE_INT:
bSuccess = runTest<int>( argc, argv, datatype);
break;
case REDUCE_FLOAT:
bSuccess = runTest<float>( argc, argv, datatype);
break;
}
// finish
shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED);
}
////////////////////////////////////////////////////////////////////////////////
//! Compute sum reduction on CPU
//! We use Kahan summation for an accurate sum of large arrays.
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
//!
//! @param data pointer to input data
//! @param size number of input data elements
////////////////////////////////////////////////////////////////////////////////
template<class T>
T reduceCPU(T *data, int size)
{
T sum = data[0];
T c = (T)0.0;
for (int i = 1; i < size; i++)
{
T y = data[i] - c;
T t = sum + y;
c = (t - sum) - y;
sum = t;
}
return sum;
}
unsigned int nextPow2( unsigned int x ) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}
////////////////////////////////////////////////////////////////////////////////
// Compute the number of threads and blocks to use for the given reduction kernel
// For the kernels >= 3, we set threads / block to the minimum of maxThreads and
// n/2. For kernels < 3, we set to the minimum of maxThreads and n. For kernel
// 6, we observe the maximum specified number of blocks, because each thread in
// that kernel can process a variable number of elements.
////////////////////////////////////////////////////////////////////////////////
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
if (whichKernel < 3)
{
threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
blocks = (n + threads - 1) / threads;
}
else
{
threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
blocks = (n + (threads * 2 - 1)) / (threads * 2);
}
if (whichKernel == 6)
blocks = MIN(maxBlocks, blocks);
}
////////////////////////////////////////////////////////////////////////////////
// This function performs a reduction of the input data multiple times and
// measures the average reduction time.
////////////////////////////////////////////////////////////////////////////////
template <class T>
T profileReduce(ReduceType datatype,
cl_int n,
int numThreads,
int numBlocks,
int maxThreads,
int maxBlocks,
int whichKernel,
int testIterations,
bool cpuFinalReduction,
int cpuFinalThreshold,
double* dTotalTime,
T* h_odata,
cl_mem d_idata,
cl_mem d_odata)
{
T gpu_result = 0;
bool needReadBack = true;
cl_kernel finalReductionKernel[10];
int finalReductionIterations=0;
//shrLog("Profile Kernel %d\n", whichKernel);
cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) );
clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata);
clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata);
clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n);
clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL);
if( !cpuFinalReduction ) {
int s=numBlocks;
int threads = 0, blocks = 0;
int kernel = (whichKernel == 6) ? 5 : whichKernel;
while(s > cpuFinalThreshold)
{
getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) );
clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata);
clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata);
clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n);
clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL);
if (kernel < 3)
s = (s + threads - 1) / threads;
else
s = (s + (threads*2-1)) / (threads*2);
finalReductionIterations++;
}
}
size_t globalWorkSize[1];
size_t localWorkSize[1];
for (int i = 0; i < testIterations; ++i)
{
gpu_result = 0;
clFinish(cqCommandQueue);
if(i>0) shrDeltaT(1);
// execute the kernel
globalWorkSize[0] = numBlocks * numThreads;
localWorkSize[0] = numThreads;
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue,reductionKernel, 1, 0, globalWorkSize, localWorkSize,
0, NULL, NULL);
// check if kernel execution generated an error
//oclCheckError(ciErrNum, CL_SUCCESS);
if (cpuFinalReduction)
{
// sum partial sums from each block on CPU
// copy result from device to host
clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T),
h_odata, 0, NULL, NULL);
for(int i=0; i<numBlocks; i++)
{
gpu_result += h_odata[i];
}
needReadBack = false;
}
else
{
// sum partial block sums on GPU
int s=numBlocks;
int kernel = (whichKernel == 6) ? 5 : whichKernel;
int it = 0;
while(s > cpuFinalThreshold)
{
int threads = 0, blocks = 0;
getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);
globalWorkSize[0] = threads * blocks;
localWorkSize[0] = threads;
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0,
globalWorkSize, localWorkSize, 0, NULL, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
if (kernel < 3)
s = (s + threads - 1) / threads;
else
s = (s + (threads*2-1)) / (threads*2);
it++;
}
if (s > 1)
{
// copy result from device to host
clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T),
h_odata, 0, NULL, NULL);
for(int i=0; i < s; i++)
{
gpu_result += h_odata[i];
}
needReadBack = false;
}
}
clFinish(cqCommandQueue);
if(i>0) *dTotalTime += shrDeltaT(1);
}
if (needReadBack)
{
// copy final sum from device to host
clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T),
&gpu_result, 0, NULL, NULL);
}
// Release the kernels
clReleaseKernel(reductionKernel);
if( !cpuFinalReduction ) {
for(int it=0; it<finalReductionIterations; ++it) {
clReleaseKernel(finalReductionKernel[it]);
}
}
return gpu_result;
}
////////////////////////////////////////////////////////////////////////////////
// This function calls profileReduce multple times for a range of array sizes
// and prints a report in CSV (comma-separated value) format that can be used for
// generating a "shmoo" plot showing the performance for each kernel variation
// over a wide range of input sizes.
////////////////////////////////////////////////////////////////////////////////
template <class T>
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
// create random input data on CPU
unsigned int bytes = maxN * sizeof(T);
T* h_idata = (T*)malloc(bytes);
for(int i = 0; i < maxN; i++) {
// Keep the numbers small so we don't get truncation error in the sum
if (datatype == REDUCE_INT)
h_idata[i] = (T)(rand() & 0xFF);
else
h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
}
int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);
// allocate mem for the result on host side
T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));
// allocate device memory and data
cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL);
int testIterations = 100;
double dTotalTime = 0.0;
// print headers
shrLog("Time in seconds for various numbers of elements for each kernel\n");
shrLog("\n\n");
shrLog("Kernel");
for (int i = minN; i <= maxN; i *= 2)
{
shrLog(", %d", i);
}
for (int kernel = 0; kernel < 7; kernel++)
{
shrLog("\n");
shrLog("%d", kernel);
for (int i = minN; i <= maxN; i *= 2)
{
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);
double reduceTime;
if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata);
reduceTime = dTotalTime/(double)testIterations;
} else {
reduceTime = -1.0;
}
shrLog(", %.4f m", reduceTime);
}
}
// cleanup
free(h_idata);
free(h_odata);
clReleaseMemObject(d_idata);
clReleaseMemObject(d_odata);
}
////////////////////////////////////////////////////////////////////////////////
// The main function whihc runs the reduction test.
////////////////////////////////////////////////////////////////////////////////
template <class T>
bool
runTest( int argc, const char** argv, ReduceType datatype)
{
int size = 1<<24; // number of elements to reduce
int maxThreads;
cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1);
clReleaseKernel(reductionKernel);
if (smallBlock)
maxThreads = 64; // number of threads per block
else
maxThreads = 128;
int whichKernel = 6;
int maxBlocks = 64;
bool cpuFinalReduction = false;
int cpuFinalThreshold = 1;
shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);
shrLog(" %d elements\n", size);
shrLog(" %d threads (max)\n", maxThreads);
cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE);
shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);
bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE);
#ifdef GPU_PROFILING
if (runShmoo)
{
shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
return true;
}
else
#endif
{
// create random input data on CPU
unsigned int bytes = size * sizeof(T);
T* h_idata = (T*)malloc(bytes);
for(int i=0; i<size; i++)
{
// Keep the numbers small so we don't get truncation error in the sum
if (datatype == REDUCE_INT)
h_idata[i] = (T)(rand() & 0xFF);
else
h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
}
int numBlocks = 0;
int numThreads = 0;
getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
if (numBlocks == 1) cpuFinalThreshold = 1;
shrLog(" %d blocks\n\n", numBlocks);
// allocate mem for the result on host side
T* h_odata = (T*)malloc(numBlocks * sizeof(T));
// allocate device memory and data
cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL);
int testIterations = 100;
double dTotalTime = 0.0;
T gpu_result = 0;
gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks,
whichKernel, testIterations, cpuFinalReduction,
cpuFinalThreshold, &dTotalTime,
h_odata, d_idata, d_odata);
#ifdef GPU_PROFILING
double reduceTime = dTotalTime/(double)testIterations;
shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
#endif
// compute reference solution
shrLog("\nComparing against Host/C++ computation...\n");
T cpu_result = reduceCPU<T>(h_idata, size);
if (datatype == REDUCE_INT)
{
shrLog(" GPU result = %d\n", gpu_result);
shrLog(" CPU result = %d\n\n", cpu_result);
shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED");
}
else
{
shrLog(" GPU result = %.9f\n", gpu_result);
shrLog(" CPU result = %.9f\n\n", cpu_result);
double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12;
double diff = abs((double)gpu_result - (double)cpu_result);
shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED");
}
// cleanup
free(h_idata);
free(h_odata);
clReleaseMemObject(d_idata);
clReleaseMemObject(d_odata);
return (gpu_result == cpu_result);
}
}
// Helper function to create and build program and kernel
// *********************************************************************
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2)
{
// compile cl program
size_t program_length;
char *source;
std::ostringstream preamble;
// create the program
// with type specification depending on datatype argument
switch (datatype)
{
default:
case REDUCE_INT:
preamble << "#define T int" << std::endl;
break;
case REDUCE_FLOAT:
preamble << "#define T float" << std::endl;
break;
}
// set blockSize at compile time
preamble << "#define blockSize " << blockSize << std::endl;
// set isPow2 at compile time
preamble << "#define nIsPow2 " << isPowOf2 << std::endl;
// Load the source code and prepend the preamble
source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length);
//oclCheckError(source != NULL, shrTRUE);
program =
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "reduce0", NULL);
//cl_program rv_program = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source,
// &program_length, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
free(source);
// build the program
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclReduction.ptx");
//oclCheckError(ciErrNum, CL_SUCCESS);
}
// create Kernel
std::ostringstream kernelName;
kernelName << "reduce" << whichKernel;
cl_kernel ckKernel = clCreateKernel(rv_program, kernelName.str().c_str(), &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
size_t wgSize;
ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
if (wgSize == 64)
smallBlock = true;
else smallBlock = false;
// NOTE: the program will get deleted when the kernel is also released
clReleaseProgram(rv_program);
return ckKernel;
}

View file

@ -0,0 +1,34 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef __REDUCTION_H__
#define __REDUCTION_H__
template <class T>
void reduce_sm10(int size, int threads, int blocks,
int whichKernel, T *d_idata, T *d_odata);
template <class T>
void reduce_sm13(int size, int threads, int blocks,
int whichKernel, T *d_idata, T *d_odata);
// CL objects
cl_platform_id cpPlatform;
cl_uint uiNumDevices;
cl_device_id* cdDevices;
cl_context cxGPUContext;
cl_command_queue cqCommandQueue;
cl_device_id device;
cl_int ciErrNum;
const char* source_path;
bool smallBlock = true;
#endif

View file

@ -0,0 +1,273 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
Parallel reduction kernels
*/
// The following defines are set during runtime compilation, see reduction.cpp
// #define T float
// #define blockSize 128
// #define nIsPow2 1
#ifndef _REDUCE_KERNEL_H_
#define _REDUCE_KERNEL_H_
/*
Parallel sum reduction using shared memory
- takes log(n) steps for n input elements
- uses n threads
- only works for power-of-2 arrays
*/
/* This reduction interleaves which threads are active by using the modulo
operator. This operator is very expensive on GPUs, and the interleaved
inactivity means that no whole warps are active, which is also very
inefficient */
__kernel void reduce0(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
// load shared mem
unsigned int tid = get_local_id(0);
unsigned int i = get_global_id(0);
sdata[tid] = (i < n) ? g_idata[i] : 0;
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
for(unsigned int s=1; s < get_local_size(0); s *= 2) {
// modulo arithmetic is slow!
if ((tid % (2*s)) == 0) {
sdata[tid] += sdata[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write result for this block to global mem
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
/* This version uses contiguous threads, but its interleaved
addressing results in many shared memory bank conflicts. */
__kernel void reduce1(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
// load shared mem
unsigned int tid = get_local_id(0);
unsigned int i = get_global_id(0);
sdata[tid] = (i < n) ? g_idata[i] : 0;
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
for(unsigned int s=1; s < get_local_size(0); s *= 2)
{
int index = 2 * s * tid;
if (index < get_local_size(0))
{
sdata[index] += sdata[index + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write result for this block to global mem
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
/*
This version uses sequential addressing -- no divergence or bank conflicts.
*/
__kernel void reduce2(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
// load shared mem
unsigned int tid = get_local_id(0);
unsigned int i = get_global_id(0);
sdata[tid] = (i < n) ? g_idata[i] : 0;
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
for(unsigned int s=get_local_size(0)/2; s>0; s>>=1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write result for this block to global mem
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
/*
This version uses n/2 threads --
it performs the first level of reduction when reading from global memory
*/
__kernel void reduce3(__global T *g_idata, __global T *g_odata, unsigned int n, __local T* sdata)
{
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = get_local_id(0);
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
sdata[tid] = (i < n) ? g_idata[i] : 0;
if (i + get_local_size(0) < n)
sdata[tid] += g_idata[i+get_local_size(0)];
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
for(unsigned int s=get_local_size(0)/2; s>0; s>>=1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// write result for this block to global mem
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
/*
This version unrolls the last warp to avoid synchronization where it
isn't needed
*/
__kernel void reduce4(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
{
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = get_local_id(0);
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
sdata[tid] = (i < n) ? g_idata[i] : 0;
if (i + get_local_size(0) < n)
sdata[tid] += g_idata[i+get_local_size(0)];
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
#pragma unroll 1
for(unsigned int s=get_local_size(0)/2; s>32; s>>=1)
{
if (tid < s)
{
sdata[tid] += sdata[tid + s];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (tid < 32)
{
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
}
// write result for this block to global mem
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
/*
This version is completely unrolled. It uses a template parameter to achieve
optimal code for any (power of 2) number of threads. This requires a switch
statement in the host code to handle all the different thread block sizes at
compile time.
*/
__kernel void reduce5(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
{
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = get_local_id(0);
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
sdata[tid] = (i < n) ? g_idata[i] : 0;
if (i + blockSize < n)
sdata[tid] += g_idata[i+blockSize];
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
if (tid < 32)
{
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
}
// write result for this block to global mem
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
/*
This version adds multiple elements per thread sequentially. This reduces the overall
cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
(Brent's Theorem optimization)
*/
__kernel void reduce6(__global T *g_idata, __global T *g_odata, unsigned int n, __local volatile T* sdata)
{
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = get_local_id(0);
unsigned int i = get_group_id(0)*(get_local_size(0)*2) + get_local_id(0);
unsigned int gridSize = blockSize*2*get_num_groups(0);
sdata[tid] = 0;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n)
{
sdata[tid] += g_idata[i];
// ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
if (nIsPow2 || i + blockSize < n)
sdata[tid] += g_idata[i+blockSize];
i += gridSize;
}
barrier(CLK_LOCAL_MEM_FENCE);
// do reduction in shared mem
if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } barrier(CLK_LOCAL_MEM_FENCE); }
if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } barrier(CLK_LOCAL_MEM_FENCE); }
if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } barrier(CLK_LOCAL_MEM_FENCE); }
if (tid < 32)
{
if (blockSize >= 64) { sdata[tid] += sdata[tid + 32]; }
if (blockSize >= 32) { sdata[tid] += sdata[tid + 16]; }
if (blockSize >= 16) { sdata[tid] += sdata[tid + 8]; }
if (blockSize >= 8) { sdata[tid] += sdata[tid + 4]; }
if (blockSize >= 4) { sdata[tid] += sdata[tid + 2]; }
if (blockSize >= 2) { sdata[tid] += sdata[tid + 1]; }
}
// write result for this block to global mem
if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
#endif // #ifndef _REDUCE_KERNEL_H_

View file

@ -0,0 +1,198 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef OCL_UTILS_H
#define OCL_UTILS_H
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
// Common headers: Cross-API utililties and OpenCL header
#include <shrUtils.h>
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Includes
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// For systems with CL_EXT that are not updated with these extensions, we copied these
// extensions from <CL/cl_ext.h>
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#endif
// reminders for build output window and log
#ifdef _WIN32
#pragma message ("Note: including shrUtils.h")
#pragma message ("Note: including opencl.h")
#endif
// SDK Revision #
#define OCL_SDKREVISION "7027912"
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" int oclGetDevCap(cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param const char* cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
// Helper function for De-allocating cl objects
// *********************************************************************
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
// Helper function to get OpenCL error string from constant
// *********************************************************************
extern "C" const char* oclErrorString(cl_int error);
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
// *********************************************************************
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
// An error condition is defined by the sample/test value not equal to the reference
if (iReference != iSample)
{
// If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
iSample = (iSample == 0) ? -9999 : iSample;
// Log the error info
shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
// Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code.
if (pCleanup != NULL)
{
pCleanup(iSample);
}
else
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
exit(iSample);
}
}
}
#endif

View file

@ -0,0 +1,238 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_QATEST_H
#define SHR_QATEST_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// OS dependent includes
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#pragma message ("Note: including time.h")
// Headers needed for Windows
#include <windows.h>
#include <time.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <time.h>
#endif
#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP _stricmp
#else
#define STRCASECMP strcasecmp
#endif
#endif
#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
// Standardized QA Start/Finish for CUDA SDK tests
#define shrQAStart(a, b) __shrQAStart(a, b)
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
inline int findExeNameStart(const char *exec_name)
{
int exename_start = (int)strlen(exec_name);
while( (exename_start > 0) &&
(exec_name[exename_start] != '\\') &&
(exec_name[exename_start] != '/') )
{
exename_start--;
}
if (exec_name[exename_start] == '\\' ||
exec_name[exename_start] == '/')
{
return exename_start+1;
} else {
return exename_start;
}
}
inline int __shrQAStart(int argc, char **argv)
{
bool bQATest = false;
// First clear the output buffer
fflush(stdout);
fflush(stdout);
for (int i=1; i < argc; i++) {
int string_start = 0;
while (argv[i][string_start] == '-')
string_start++;
char *string_argv = &argv[i][string_start];
if (!STRCASECMP(string_argv, "qatest")) {
bQATest = true;
}
}
// We don't want to print the entire path, so we search for the first
int exename_start = findExeNameStart(argv[0]);
if (bQATest) {
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
fprintf(stdout, "\n");
} else {
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
}
fflush(stdout);
printf("\n"); fflush(stdout);
return exename_start;
}
enum eQAstatus {
QA_FAILED = 0,
QA_PASSED = 1,
QA_WAIVED = 2
};
inline void __ExitInTime(int seconds)
{
fprintf(stdout, "> exiting in %d seconds: ", seconds);
fflush(stdout);
time_t t;
int count;
for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
fprintf(stdout, "%d...", count);
#ifdef WIN32
Sleep(1000);
#else
sleep(1);
#endif
}
fprintf(stdout,"done!\n\n");
fflush(stdout);
}
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
{
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
for (int i=1; i < argc; i++) {
int string_start = 0;
while (argv[i][string_start] == '-')
string_start++;
const char *string_argv = &argv[i][string_start];
if (!STRCASECMP(string_argv, "qatest")) {
bQATest = true;
}
// For SDK individual samples that don't specify -noprompt or -prompt,
// a 3 second delay will happen before exiting, giving a user time to view results
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
bNoPrompt = true;
bQuitInTime = false;
}
if (!STRCASECMP(string_argv, "prompt")) {
bNoPrompt = false;
bQuitInTime = false;
}
}
int exename_start = findExeNameStart(argv[0]);
if (bQATest) {
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
fprintf(stdout, "\n");
} else {
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
}
fflush(stdout);
printf("\n"); fflush(stdout);
if (bQuitInTime) {
__ExitInTime(3);
} else {
if (!bNoPrompt) {
fprintf(stdout, "\nPress <Enter> to exit...\n");
fflush(stdout);
getchar();
}
}
}
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
{
bool bQuitInTime = true;
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
for (int i=1; i < argc; i++) {
int string_start = 0;
while (argv[i][string_start] == '-')
string_start++;
const char *string_argv = &argv[i][string_start];
// For SDK individual samples that don't specify -noprompt or -prompt,
// a 3 second delay will happen before exiting, giving a user time to view results
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
bQuitInTime = false;
}
if (!STRCASECMP(string_argv, "prompt")) {
bQuitInTime = false;
}
}
int exename_start = findExeNameStart(argv[0]);
if (bQATest) {
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
fprintf(stdout, "\n");
} else {
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
}
fflush(stdout);
if (bQuitInTime) {
__ExitInTime(3);
}
}
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
{
__shrQAFinish(argc, argv, iStatus);
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
}
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
{
__shrQAFinish2(bQAtest, argc, argv, iStatus);
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
}
#endif

View file

@ -0,0 +1,642 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_UTILS_H
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#endif
// OS dependent includes
#ifdef _WIN32
// Headers needed for Windows
#include <windows.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#endif
// Other headers needed for both Windows and Linux
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Un-comment the following #define to enable profiling code in SDK apps
//#define GPU_PROFILING
// Beginning of GPU Architecture definitions
inline int ConvertSMVer2Cores(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
typedef struct {
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
int Cores;
} sSMtoCores;
sSMtoCores nGpuArchCoresPerSM[] =
{ { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
{ 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
{ 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
{ 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
{ 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
{ 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
{ 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
{ -1, -1 }
};
int index = 0;
while (nGpuArchCoresPerSM[index].SM != -1) {
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
return nGpuArchCoresPerSM[index].Cores;
}
index++;
}
printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
return -1;
}
// end of GPU Architecture definitions
// Defines and enum for use with logging functions
// *********************************************************************
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
#define MASTERLOGFILE "SdkMasterLog.csv"
enum LOGMODES
{
LOGCONSOLE = 1, // bit to signal "log to console"
LOGFILE = 2, // bit to signal "log to file"
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
MASTER = 8, // bit to signal master .csv log output
ERRORMSG = 16, // bit to signal "pre-pend Error"
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
};
#define HDASHLINE "-----------------------------------------------------------\n"
// Standardized boolean
enum shrBOOL
{
shrFALSE = 0,
shrTRUE = 1
};
// Standardized MAX, MIN and CLAMP
#define MAX(a, b) ((a > b) ? a : b)
#define MIN(a, b) ((a < b) ? a : b)
#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a
#define TOPCLAMP(a, b) (a < b ? a:b) // single top side clip of input a
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
// Standardized Exit Macro for leaving main()... extended version
// (Companion Inline Function lower on page)
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
// Standardized Exit Macro for leaving main()... short version
// (Companion Inline Function lower on page)
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
// Simple argument checker macro
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
// Define for user-customized error handling
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
// Function to deallocate memory allocated within shrUtils
// *********************************************************************
extern "C" void shrFree(void* ptr);
// *********************************************************************
// Helper function to log standardized information to Console, to File or to both
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
//!
//! Automatically opens file and stores handle if needed and not done yet
//! Closes file and nulls handle on request
//!
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
//! @param 2 dValue:
//! Positive val = double value for time in secs to be formatted to 6 decimals.
//! Negative val is an error code and this give error preformatting.
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
//! Single byte char type specifiers (%s and %c) ARE supported
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
// *********************************************************************
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
// *********************************************************************
extern "C" int shrLog(const char* cFormatString, ...);
// *********************************************************************
// Delta timer function for up to 3 independent timers using host high performance counters
// Maintains state for 3 independent counters
//! Example: double dElapsedTime = shrDeltaTime(0);
//!
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
// *********************************************************************
extern "C" double shrDeltaT(int iCounterID);
// Optional LogFileNameOverride function
// *********************************************************************
extern "C" void shrSetLogFileName (const char* cOverRideName);
// Helper function to init data arrays
// *********************************************************************
extern "C" void shrFillArray(float* pfData, int iSize);
// Helper function to print data arrays
// *********************************************************************
extern "C" void shrPrintArray(float* pfData, int iSize);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param OutData handle to the data read
//! @param w width of the image
//! @param h height of the image
//!
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are seperated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag-name is given
//! @return shrTRUE if command line argument \a flag_name has been given,
//! otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type unsigned int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
const char* arg_name, unsigned int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list those element are strings
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned integer arrays, with epsilon and threshold
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integers with a tolernance for # of byte errors
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays witha n epsilon tolerance for equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality and a
//! threshold for # pixel errors
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two PPM image files with an epsilon tolerance for equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param data filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
////////////////////////////////////////////////////////////////////////////////
//! Compare two PGM image files with an epsilon tolerance for equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param data filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
extern "C" size_t shrRoundUp(int group_size, int global_size);
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
// *********************************************************************
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
if (iReference != iSample)
{
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
if (pCleanup != NULL)
{
pCleanup(EXIT_FAILURE);
}
else
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
exit(EXIT_FAILURE);
}
}
}
// Standardized Exit
// *********************************************************************
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
{
#ifdef WIN32
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
#else
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
#endif
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
getchar();
}
else
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
}
fflush(stderr);
exit(iExitCode);
}
#endif