/*
 * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
    Parallel reduction

    This sample shows how to perform a reduction operation on an array of values
    to produce a single value.

    Reductions are a very common computation in parallel algorithms. Any time
    an array of values needs to be reduced to a single value using a binary
    associative operator, a reduction can be used. Example applications include
    statistics computations such as mean and standard deviation, and image
    processing applications such as finding the total luminance of an image.

    This code performs sum reductions, but any associative operator such as
    min() or max() could also be used.

    It assumes the input size is a power of 2.

    COMMAND LINE ARGUMENTS

    "--shmoo":         Test performance for 1 to 32M elements with each of the 7 different kernels
    "--n=<N>":         Specify the number of elements to reduce (default 1048576)
    "--threads=<N>":   Specify the number of threads per block (default 128)
    "--kernel=<N>":    Specify which kernel to run (0-6, default 6)
    "--maxblocks=<N>": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64)
    "--cpufinal":      Read back the per-block results and do final sum of block sums on CPU (default false)
    "--cputhresh=<N>": The threshold of number of block sums below which to perform a CPU final reduction (default 1)

*/
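
/*
    For intuition, a sketch of the tree-based approach used by the kernels
    (illustrative only; the exact access pattern differs between kernels 0-6):
    summing 8 values takes log2(8) = 3 passes, halving the number of active
    work-items each pass:

        [5  2  7  1  3  8  4  6]
        [5+3  2+8  7+4  1+6]      ->  [ 8 10 11  7]
        [8+11  10+7]              ->  [19 17]
        [19+17]                   ->  [36]

    Each work-group reduces its chunk in local memory this way and writes one
    partial sum; the partial sums are then reduced again on the GPU (or on the
    CPU when --cpufinal is given) until a single value remains.
*/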

// Common system and utility includes
#include <oclUtils.h>
#include <shrQATest.h>

// additional includes
#include <sstream>
#include <oclReduction.h>

// Forward declarations and sample-specific defines
// *********************************************************************
enum ReduceType
{
    REDUCE_INT,
    REDUCE_FLOAT,
    REDUCE_DOUBLE
};

template <class T>
bool runTest( int argc, const char** argv, ReduceType datatype);

#define MAX_BLOCK_DIM_SIZE 65535

extern "C"
bool isPow2(unsigned int x)
{
    return ((x&(x-1))==0);
}
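
// Note: (x & (x-1)) clears the lowest set bit of x, so the test above is zero
// exactly when x has at most one bit set, e.g. isPow2(64) is true and
// isPow2(96) is false. As written, isPow2(0) also reports true.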

cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2);

// Main function
// *********************************************************************
int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);

    // start logs
    shrSetLogFileName ("oclReduction.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    char *typeChoice;
    shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice);

    // determine type of array from command line args
    if (0 == typeChoice)
    {
        typeChoice = (char*)malloc(7 * sizeof(char));
#ifdef WIN32
        strcpy_s(typeChoice, 7, "int");
#else
        strcpy(typeChoice, "int");
#endif
    }
    ReduceType datatype = REDUCE_INT;

#ifdef WIN32
    if (!_strcmpi(typeChoice, "float"))
        datatype = REDUCE_FLOAT;
    else if (!_strcmpi(typeChoice, "double"))
        datatype = REDUCE_DOUBLE;
    else
        datatype = REDUCE_INT;
#else
    if (!strcmp(typeChoice, "float"))
        datatype = REDUCE_FLOAT;
    else if (!strcmp(typeChoice, "double"))
        datatype = REDUCE_DOUBLE;
    else
        datatype = REDUCE_INT;
#endif

    shrLog("Reducing array of type %s.\n", typeChoice);

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
    //oclCheckError(ciErrNum, CL_SUCCESS);
    cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    //Create the context
    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    // get and log the device info
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") ) {
        int device_nr = 0;
        shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr);
        if( device_nr < uiNumDevices ) {
            device = oclGetDev(cxGPUContext, device_nr);
        } else {
            shrLog("Invalid Device %d Requested.\n", device_nr);
            shrExitEX(argc, argv, EXIT_FAILURE);
        }
    } else {
        device = oclGetMaxFlopsDev(cxGPUContext);
    }
    oclPrintDevName(LOGBOTH, device);
    shrLog("\n");

    // create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]);

    bool bSuccess = false;
    switch (datatype)
    {
        default:
        case REDUCE_INT:
            bSuccess = runTest<int>( argc, argv, datatype);
            break;
        case REDUCE_FLOAT:
            bSuccess = runTest<float>( argc, argv, datatype);
            break;
    }

    // finish
    shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED);
}

////////////////////////////////////////////////////////////////////////////////
//! Compute sum reduction on CPU
//! We use Kahan summation for an accurate sum of large arrays.
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
//!
//! @param data       pointer to input data
//! @param size       number of input data elements
////////////////////////////////////////////////////////////////////////////////
template<class T>
T reduceCPU(T *data, int size)
{
    T sum = data[0];
    T c = (T)0.0;
    for (int i = 1; i < size; i++)
    {
        T y = data[i] - c;
        T t = sum + y;
        c = (t - sum) - y;
        sum = t;
    }
    return sum;
}
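
// How the compensation above works: c carries the low-order part lost when a
// small y is folded into a much larger running sum. In toy 4-digit decimal
// arithmetic, 1000 + 3.141 rounds to 1003; then (t - sum) - y = 3 - 3.141 =
// -0.141, and subtracting c on the next iteration re-injects the lost 0.141.
// Naive summation of ~2^24 same-sign floats can lose several significant
// digits, so this keeps the CPU reference value accurate.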

unsigned int nextPow2( unsigned int x ) {
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}
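
// The shifts above smear the highest set bit into every lower position, so after
// the decrement/increment pair x is rounded up to the next power of two, e.g.
// nextPow2(1024) == 1024, nextPow2(1025) == 2048, nextPow2(1) == 1.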

////////////////////////////////////////////////////////////////////////////////
// Compute the number of threads and blocks to use for the given reduction kernel
// For the kernels >= 3, we set threads / block to the minimum of maxThreads and
// n/2. For kernels < 3, we set to the minimum of maxThreads and n. For kernel
// 6, we observe the maximum specified number of blocks, because each thread in
// that kernel can process a variable number of elements.
////////////////////////////////////////////////////////////////////////////////
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
    if (whichKernel < 3)
    {
        threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
        blocks = (n + threads - 1) / threads;
    }
    else
    {
        threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
        blocks = (n + (threads * 2 - 1)) / (threads * 2);
    }

    if (whichKernel == 6)
        blocks = MIN(maxBlocks, blocks);
}
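
// Example: with n = 1048576, maxThreads = 128, maxBlocks = 64 and kernel 6,
// threads = 128, a single pass would need (n + 255) / 256 = 4096 blocks, and the
// kernel-6 cap lowers that to MIN(64, 4096) = 64 blocks, so each work-item
// accumulates several elements before the in-block tree reduction.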

////////////////////////////////////////////////////////////////////////////////
// This function performs a reduction of the input data multiple times and
// measures the average reduction time.
////////////////////////////////////////////////////////////////////////////////
template <class T>
T profileReduce(ReduceType datatype,
                cl_int n,
                int numThreads,
                int numBlocks,
                int maxThreads,
                int maxBlocks,
                int whichKernel,
                int testIterations,
                bool cpuFinalReduction,
                int cpuFinalThreshold,
                double* dTotalTime,
                T* h_odata,
                cl_mem d_idata,
                cl_mem d_odata)
{
    T gpu_result = 0;
    bool needReadBack = true;
    cl_kernel finalReductionKernel[10];
    int finalReductionIterations=0;

    //shrLog("Profile Kernel %d\n", whichKernel);

    cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) );
    clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata);
    clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata);
    clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n);
    clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL);

    if( !cpuFinalReduction ) {
        int s=numBlocks;
        int threads = 0, blocks = 0;
        int kernel = (whichKernel == 6) ? 5 : whichKernel;

        while(s > cpuFinalThreshold)
        {
            getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

            finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) );
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL);

            if (kernel < 3)
                s = (s + threads - 1) / threads;
            else
                s = (s + (threads*2-1)) / (threads*2);

            finalReductionIterations++;
        }
    }

    size_t globalWorkSize[1];
    size_t localWorkSize[1];

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;

        clFinish(cqCommandQueue);
        if(i>0) shrDeltaT(1);

        // execute the kernel
        globalWorkSize[0] = numBlocks * numThreads;
        localWorkSize[0] = numThreads;

        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, reductionKernel, 1, 0, globalWorkSize, localWorkSize,
                                          0, NULL, NULL);

        // check if kernel execution generated an error
        //oclCheckError(ciErrNum, CL_SUCCESS);

        if (cpuFinalReduction)
        {
            // sum partial sums from each block on CPU
            // copy result from device to host
            clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T),
                                h_odata, 0, NULL, NULL);

            for(int i=0; i<numBlocks; i++)
            {
                gpu_result += h_odata[i];
            }

            needReadBack = false;
        }
        else
        {
            // sum partial block sums on GPU
            int s=numBlocks;
            int kernel = (whichKernel == 6) ? 5 : whichKernel;
            int it = 0;

            while(s > cpuFinalThreshold)
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

                globalWorkSize[0] = threads * blocks;
                localWorkSize[0] = threads;

                ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0,
                                                  globalWorkSize, localWorkSize, 0, NULL, NULL);
                //oclCheckError(ciErrNum, CL_SUCCESS);

                if (kernel < 3)
                    s = (s + threads - 1) / threads;
                else
                    s = (s + (threads*2-1)) / (threads*2);

                it++;
            }

            if (s > 1)
            {
                // copy result from device to host
                clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T),
                                    h_odata, 0, NULL, NULL);

                for(int i=0; i < s; i++)
                {
                    gpu_result += h_odata[i];
                }

                needReadBack = false;
            }
        }

        clFinish(cqCommandQueue);
        if(i>0) *dTotalTime += shrDeltaT(1);
    }

    if (needReadBack)
    {
        // copy final sum from device to host
        clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T),
                            &gpu_result, 0, NULL, NULL);
    }

    // Release the kernels
    clReleaseKernel(reductionKernel);
    if( !cpuFinalReduction ) {
        for(int it=0; it<finalReductionIterations; ++it) {
            clReleaseKernel(finalReductionKernel[it]);
        }
    }

    return gpu_result;
}
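
// Example of the multi-pass scheme above: reducing 1048576 elements with 64
// work-groups of 128 work-items leaves 64 partial sums in d_odata after the
// first launch; one further launch (64 -> 1) finishes the sum on the GPU, or,
// with --cpufinal, the 64 partial sums are read back and added on the host.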

////////////////////////////////////////////////////////////////////////////////
// This function calls profileReduce multiple times for a range of array sizes
// and prints a report in CSV (comma-separated values) format that can be used
// for generating a "shmoo" plot showing the performance for each kernel
// variation over a wide range of input sizes.
////////////////////////////////////////////////////////////////////////////////
template <class T>
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);

    T* h_idata = (T*)malloc(bytes);

    for(int i = 0; i < maxN; i++) {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T* h_odata = (T*) malloc(maxNumBlocks*sizeof(T));

    // allocate device memory and data
    cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
    cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, maxNumBlocks * sizeof(T), NULL, NULL);

    int testIterations = 100;
    double dTotalTime = 0.0;

    // print headers
    shrLog("Time in seconds for various numbers of elements for each kernel\n");
    shrLog("\n\n");
    shrLog("Kernel");
    for (int i = minN; i <= maxN; i *= 2)
    {
        shrLog(", %d", i);
    }

    for (int kernel = 0; kernel < 7; kernel++)
    {
        shrLog("\n");
        shrLog("%d", kernel);
        for (int i = minN; i <= maxN; i *= 2)
        {
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);

            double reduceTime;
            if( numBlocks <= MAX_BLOCK_DIM_SIZE ) {
                profileReduce(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks, kernel,
                              testIterations, false, 1, &dTotalTime, h_odata, d_idata, d_odata);
                reduceTime = dTotalTime/(double)testIterations;
            } else {
                reduceTime = -1.0;
            }
            shrLog(", %.4f m", reduceTime);
        }
    }

    // cleanup
    free(h_idata);
    free(h_odata);
    clReleaseMemObject(d_idata);
    clReleaseMemObject(d_odata);
}

////////////////////////////////////////////////////////////////////////////////
// The main function which runs the reduction test.
////////////////////////////////////////////////////////////////////////////////
template <class T>
bool
runTest( int argc, const char** argv, ReduceType datatype)
{
    int size = 1<<24;    // number of elements to reduce
    int maxThreads;

    cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1);
    clReleaseKernel(reductionKernel);

    if (smallBlock)
        maxThreads = 64;  // number of threads per block
    else
        maxThreads = 128;

    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);

    shrLog(" %d elements\n", size);
    shrLog(" %d threads (max)\n", maxThreads);

    cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE);

#ifdef GPU_PROFILING
    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
        return true;
    }
    else
#endif
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);
        T* h_idata = (T*)malloc(bytes);

        for(int i=0; i<size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);
        if (numBlocks == 1) cpuFinalThreshold = 1;
        shrLog(" %d blocks\n\n", numBlocks);

        // allocate mem for the result on host side
        T* h_odata = (T*)malloc(numBlocks * sizeof(T));

        // allocate device memory and data
        cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL);
        cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL);

        int testIterations = 100;
        double dTotalTime = 0.0;
        T gpu_result = 0;
        gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks,
                                      whichKernel, testIterations, cpuFinalReduction,
                                      cpuFinalThreshold, &dTotalTime,
                                      h_odata, d_idata, d_odata);

#ifdef GPU_PROFILING
        double reduceTime = dTotalTime/(double)testIterations;
        shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
                 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
#endif

        // compute reference solution
        shrLog("\nComparing against Host/C++ computation...\n");
        T cpu_result = reduceCPU<T>(h_idata, size);
        if (datatype == REDUCE_INT)
        {
            shrLog(" GPU result = %d\n", gpu_result);
            shrLog(" CPU result = %d\n\n", cpu_result);
            shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED");
        }
        else
        {
            shrLog(" GPU result = %.9f\n", gpu_result);
            shrLog(" CPU result = %.9f\n\n", cpu_result);

            double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12;
            double diff = fabs((double)gpu_result - (double)cpu_result);
            shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED");
        }

        // cleanup
        free(h_idata);
        free(h_odata);
        clReleaseMemObject(d_idata);
        clReleaseMemObject(d_odata);

        return (gpu_result == cpu_result);
    }
}

// Helper function to create and build program and kernel
// *********************************************************************
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2)
{
    // compile cl program
    size_t program_length;
    char *source;

    std::ostringstream preamble;

    // create the program
    // with type specification depending on datatype argument
    switch (datatype)
    {
        default:
        case REDUCE_INT:
            preamble << "#define T int" << std::endl;
            break;
        case REDUCE_FLOAT:
            preamble << "#define T float" << std::endl;
            break;
    }

    // set blockSize at compile time
    preamble << "#define blockSize " << blockSize << std::endl;

    // set isPow2 at compile time
    preamble << "#define nIsPow2 " << isPowOf2 << std::endl;
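
    // For example, getReductionKernel(REDUCE_FLOAT, 6, 128, 1) prepends the
    // following three lines to oclReduction_kernel.cl before it is built:
    //
    //     #define T float
    //     #define blockSize 128
    //     #define nIsPow2 1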

    // Load the source code and prepend the preamble
    source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length);
    //oclCheckError(source != NULL, shrTRUE);

    //program = clCreateProgramWithBuiltInKernels(context, 1, &device_id, "reduce0", NULL);
    cl_program rv_program = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source,
                                                      &program_length, &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);
    free(source);

    // build the program
    ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
        oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclReduction.ptx");
        //oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // create Kernel
    std::ostringstream kernelName;
    kernelName << "reduce" << whichKernel;
    cl_kernel ckKernel = clCreateKernel(rv_program, kernelName.str().c_str(), &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    size_t wgSize;
    ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL);
    if (wgSize == 64)
        smallBlock = true;
    else
        smallBlock = false;

    // NOTE: the program will get deleted when the kernel is also released
    clReleaseProgram(rv_program);

    return ckKernel;
}