/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

/*
    Parallel reduction

    This sample shows how to perform a reduction operation on an array of values
    to produce a single value.

    Reductions are a very common computation in parallel algorithms. Any time
    an array of values needs to be reduced to a single value using a binary
    associative operator, a reduction can be used. Example applications include
    statistics computations such as mean and standard deviation, and image
    processing applications such as finding the total luminance of an image.

    This code performs sum reductions, but any associative operator such as
    min() or max() could also be used.

    It assumes the input size is a power of 2.

    COMMAND LINE ARGUMENTS

    "--shmoo":         Test performance for 1 to 32M elements with each of the 7 different kernels
    "--n=<N>":         Specify the number of elements to reduce (default 1048576)
    "--threads=<N>":   Specify the number of threads per block (default 128)
    "--kernel=<N>":    Specify which kernel to run (0-6, default 6)
    "--maxblocks=<N>": Specify the maximum number of thread blocks to launch (kernel 6 only, default 64)
    "--cpufinal":      Read back the per-block results and do final sum of block sums on CPU (default false)
    "--cputhresh=<N>": The threshold of number of block sums below which to perform a CPU final reduction (default 1)
*/

// Common system and utility includes
#include <oclUtils.h>
#include <shrQATest.h>

// additional includes
#include <sstream>
#include <cstdlib>

// Forward declarations and sample-specific defines
// *********************************************************************
enum ReduceType
{
    REDUCE_INT,
    REDUCE_FLOAT,
    REDUCE_DOUBLE
};

template <class T>
bool runTest( int argc, const char** argv, ReduceType datatype);

#define MAX_BLOCK_DIM_SIZE 65535

extern "C"
bool isPow2(unsigned int x)
{
    return ((x&(x-1))==0);
}

cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2);

// Globals shared by the functions below
// *********************************************************************
cl_platform_id cpPlatform;
cl_uint uiNumDevices;
cl_context cxGPUContext;
cl_command_queue cqCommandQueue;
cl_device_id device;
cl_int ciErrNum;
const char* source_path;
bool smallBlock = true;
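// Illustrative sketch only: the kernels this sample actually benchmarks live in
// oclReduction_kernel.cl (reduce0 through reduce6) and are built at runtime by
// getReductionKernel() below. The minimal tree-based reduction pattern they all
// share looks roughly like the hypothetical kernel in this comment block; the
// shipped kernels additionally specialize on the blockSize and nIsPow2 macros
// that getReductionKernel() prepends to the source.
/*
__kernel void reduce_sketch(__global const T* g_idata, __global T* g_odata,
                            unsigned int n, __local T* sdata)
{
    unsigned int tid = get_local_id(0);
    unsigned int i   = get_global_id(0);

    // each work-item loads one element (or the identity, 0, past the end)
    sdata[tid] = (i < n) ? g_idata[i] : 0;
    barrier(CLK_LOCAL_MEM_FENCE);

    // pairwise tree reduction in local memory: halve the stride each step
    for (unsigned int s = get_local_size(0) / 2; s > 0; s >>= 1)
    {
        if (tid < s)
            sdata[tid] += sdata[tid + s];
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    // work-item 0 writes this work-group's partial sum
    if (tid == 0) g_odata[get_group_id(0)] = sdata[0];
}
*/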
// Main function
// *********************************************************************
int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);

    // start logs
    shrSetLogFileName ("oclReduction.txt");
    shrLog("%s Starting...\n\n", argv[0]);

    char *typeChoice;
    shrGetCmdLineArgumentstr(argc, argv, "type", &typeChoice);

    // determine type of array from command line args
    if (0 == typeChoice)
    {
        typeChoice = (char*)malloc(7 * sizeof(char));
#ifdef WIN32
        strcpy_s(typeChoice, 7 * sizeof(char), "int");
#else
        strcpy(typeChoice, "int");
#endif
    }

    ReduceType datatype = REDUCE_INT;

#ifdef WIN32
    if (!_strcmpi(typeChoice, "float"))
        datatype = REDUCE_FLOAT;
    else if (!_strcmpi(typeChoice, "double"))
        datatype = REDUCE_DOUBLE;
    else
        datatype = REDUCE_INT;
#else
    if (!strcmp(typeChoice, "float"))
        datatype = REDUCE_FLOAT;
    else if (!strcmp(typeChoice, "double"))
        datatype = REDUCE_DOUBLE;
    else
        datatype = REDUCE_INT;
#endif

    shrLog("Reducing array of type %s.\n", typeChoice);

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
    //oclCheckError(ciErrNum, CL_SUCCESS);
    cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    //Create the context
    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    // get and log the device info
    if( shrCheckCmdLineFlag(argc, (const char**)argv, "device") )
    {
        int device_nr = 0;
        shrGetCmdLineArgumenti(argc, (const char**)argv, "device", &device_nr);
        if( device_nr < (int)uiNumDevices )
        {
            device = oclGetDev(cxGPUContext, device_nr);
        }
        else
        {
            shrLog("Invalid Device %d Requested.\n", device_nr);
            shrExitEX(argc, argv, EXIT_FAILURE);
        }
    }
    else
    {
        device = oclGetMaxFlopsDev(cxGPUContext);
    }
    oclPrintDevName(LOGBOTH, device);
    shrLog("\n");

    // create a command-queue
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    source_path = shrFindFilePath("oclReduction_kernel.cl", argv[0]);

    bool bSuccess = false;
    switch (datatype)
    {
        default:
        case REDUCE_INT:
            bSuccess = runTest<int>( argc, argv, datatype);
            break;
        case REDUCE_FLOAT:
            bSuccess = runTest<float>( argc, argv, datatype);
            break;
    }

    // finish
    shrQAFinishExit(argc, (const char **)argv, bSuccess ? QA_PASSED : QA_FAILED);
}

////////////////////////////////////////////////////////////////////////////////
//! Compute sum reduction on CPU
//! We use Kahan summation for an accurate sum of large arrays.
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
//!
//! @param data       pointer to input data
//! @param size       number of input data elements
////////////////////////////////////////////////////////////////////////////////
template <class T>
T reduceCPU(T *data, int size)
{
    T sum = data[0];
    T c = (T)0.0;
    for (int i = 1; i < size; i++)
    {
        T y = data[i] - c;
        T t = sum + y;
        c = (t - sum) - y;
        sum = t;
    }
    return sum;
}

unsigned int nextPow2( unsigned int x )
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

////////////////////////////////////////////////////////////////////////////////
// Compute the number of threads and blocks to use for the given reduction kernel
// For the kernels >= 3, we set threads / block to the minimum of maxThreads and
// n/2. For kernels < 3, we set to the minimum of maxThreads and n. For kernel
// 6, we observe the maximum specified number of blocks, because each thread in
// that kernel can process a variable number of elements.
////////////////////////////////////////////////////////////////////////////////
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks, int maxThreads, int &blocks, int &threads)
{
    if (whichKernel < 3)
    {
        threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
        blocks = (n + threads - 1) / threads;
    }
    else
    {
        threads = (n < maxThreads*2) ? nextPow2((n + 1)/ 2) : maxThreads;
        blocks = (n + (threads * 2 - 1)) / (threads * 2);
    }

    if (whichKernel == 6)
        blocks = MIN(maxBlocks, blocks);
}
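// Worked example of the rules above: for n = 1<<22 (4194304 elements),
// maxThreads = 128, maxBlocks = 64 and kernel 6, each work-item loads two
// elements, so threads = 128 and the unclamped block count is
// (4194304 + 255) / 256 = 16384; kernel 6 then clamps this to
// MIN(64, 16384) = 64 blocks, and each of the 64*128 work-items accumulates
// 4194304 / (64*128) = 512 elements before the in-group tree reduction.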
////////////////////////////////////////////////////////////////////////////////
// This function performs a reduction of the input data multiple times and
// measures the average reduction time.
////////////////////////////////////////////////////////////////////////////////
template <class T>
T profileReduce(ReduceType datatype,
                cl_int  n,
                int  numThreads,
                int  numBlocks,
                int  maxThreads,
                int  maxBlocks,
                int  whichKernel,
                int  testIterations,
                bool cpuFinalReduction,
                int  cpuFinalThreshold,
                double* dTotalTime,
                T* h_odata,
                cl_mem d_idata,
                cl_mem d_odata)
{
    T gpu_result = 0;
    bool needReadBack = true;
    cl_kernel finalReductionKernel[10];
    int finalReductionIterations = 0;

    //shrLog("Profile Kernel %d\n", whichKernel);

    cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) );
    clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata);
    clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata);
    clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n);
    clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL);

    if( !cpuFinalReduction )
    {
        int s = numBlocks;
        int threads = 0, blocks = 0;
        int kernel = (whichKernel == 6) ? 5 : whichKernel;

        while(s > cpuFinalThreshold)
        {
            getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

            finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) );
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n);
            clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL);

            if (kernel < 3)
                s = (s + threads - 1) / threads;
            else
                s = (s + (threads*2-1)) / (threads*2);

            finalReductionIterations++;
        }
    }

    size_t globalWorkSize[1];
    size_t localWorkSize[1];

    for (int i = 0; i < testIterations; ++i)
    {
        gpu_result = 0;
        clFinish(cqCommandQueue);
        if(i>0) shrDeltaT(1);

        // execute the kernel
        globalWorkSize[0] = numBlocks * numThreads;
        localWorkSize[0] = numThreads;
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, reductionKernel, 1, 0, globalWorkSize, localWorkSize,
                                          0, NULL, NULL);

        // check if kernel execution generated an error
        //oclCheckError(ciErrNum, CL_SUCCESS);

        if (cpuFinalReduction)
        {
            // sum partial sums from each block on CPU
            // copy result from device to host
            clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T),
                                h_odata, 0, NULL, NULL);

            for(int i=0; i<numBlocks; i++)
            {
                gpu_result += h_odata[i];
            }

            needReadBack = false;
        }
        else
        {
            // sum partial block sums on GPU
            int s = numBlocks;
            int kernel = (whichKernel == 6) ? 5 : whichKernel;
            int it = 0;

            while(s > cpuFinalThreshold)
            {
                int threads = 0, blocks = 0;
                getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads);

                globalWorkSize[0] = threads * blocks;
                localWorkSize[0] = threads;

                ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0,
                                                  globalWorkSize, localWorkSize, 0, NULL, NULL);
                //oclCheckError(ciErrNum, CL_SUCCESS);

                if (kernel < 3)
                    s = (s + threads - 1) / threads;
                else
                    s = (s + (threads*2-1)) / (threads*2);

                it++;
            }

            if (s > 1)
            {
                // copy result from device to host
                clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T),
                                    h_odata, 0, NULL, NULL);

                for(int i=0; i < s; i++)
                {
                    gpu_result += h_odata[i];
                }

                needReadBack = false;
            }
        }

        clFinish(cqCommandQueue);
        if(i>0) *dTotalTime += shrDeltaT(1);
    }

    if (needReadBack)
    {
        // copy final sum from device to host
        clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T),
                            &gpu_result, 0, NULL, NULL);
    }

    // Release the kernels
    clReleaseKernel(reductionKernel);
    if( !cpuFinalReduction )
    {
        for(int it=0; it<finalReductionIterations; ++it)
        {
            clReleaseKernel(finalReductionKernel[it]);
        }
    }

    return gpu_result;
}
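// Worked example of the multi-pass path above (cpuFinalReduction == false,
// cpuFinalThreshold == 1): a first pass with numBlocks = 64 leaves 64 partial
// sums in d_odata; getNumBlocksAndThreads(5, 64, ...) then yields threads = 32
// and blocks = 1, so a single additional launch of kernel 5 reduces s = 64 down
// to s = (64 + 63) / 64 = 1, and the final sum is read back with one
// clEnqueueReadBuffer of sizeof(T) bytes.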
////////////////////////////////////////////////////////////////////////////////
// This function calls profileReduce() for a range of array sizes and prints
// the timing for each of the 7 kernels in comma-separated form, suitable for
// producing a "shmoo" plot of performance versus input size.
////////////////////////////////////////////////////////////////////////////////
template <class T>
void shmoo(int minN, int maxN, int maxThreads, int maxBlocks, ReduceType datatype)
{
    // create random input data on CPU
    unsigned int bytes = maxN * sizeof(T);
    T* h_idata = (T*)malloc(bytes);

    for(int i = 0; i < maxN; i++)
    {
        // Keep the numbers small so we don't get truncation error in the sum
        if (datatype == REDUCE_INT)
            h_idata[i] = (T)(rand() & 0xFF);
        else
            h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
    }

    int maxNumBlocks = MIN( maxN / maxThreads, MAX_BLOCK_DIM_SIZE);

    // allocate mem for the result on host side
    T* h_odata = (T*)malloc(maxNumBlocks * sizeof(T));

    // allocate device memory and data
    cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                    bytes, h_idata, NULL);
    cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
                                    maxNumBlocks * sizeof(T), NULL, NULL);

    int testIterations = 100;
    double dTotalTime = 0.0;

    // print headers
    shrLog("Time in seconds for various numbers of elements for each kernel\n");
    shrLog("\n\n");
    shrLog("Kernel");
    for (int i = minN; i <= maxN; i *= 2)
    {
        shrLog(", %d", i);
    }

    for (int kernel = 0; kernel < 7; kernel++)
    {
        shrLog("\n");
        shrLog("%d", kernel);
        for (int i = minN; i <= maxN; i *= 2)
        {
            int numBlocks = 0;
            int numThreads = 0;
            getNumBlocksAndThreads(kernel, i, maxBlocks, maxThreads, numBlocks, numThreads);

            double reduceTime;
            if( numBlocks <= MAX_BLOCK_DIM_SIZE )
            {
                dTotalTime = 0.0;   // reset the accumulated time for this measurement
                profileReduce<T>(datatype, i, numThreads, numBlocks, maxThreads, maxBlocks,
                                 kernel, testIterations, false, 1, &dTotalTime,
                                 h_odata, d_idata, d_odata);
                reduceTime = dTotalTime / (double)testIterations;
            }
            else
            {
                reduceTime = -1.0;
            }
            shrLog(", %.4f", reduceTime);
        }
    }

    // cleanup
    free(h_idata);
    free(h_odata);
    clReleaseMemObject(d_idata);
    clReleaseMemObject(d_odata);
}

////////////////////////////////////////////////////////////////////////////////
// The main function which runs the reduction test.
////////////////////////////////////////////////////////////////////////////////
template <class T>
bool runTest( int argc, const char** argv, ReduceType datatype)
{
    int size = 1<<24;       // number of elements to reduce
    int maxThreads;

    cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1);
    clReleaseKernel(reductionKernel);

    if (smallBlock)
        maxThreads = 64;    // number of threads per block
    else
        maxThreads = 128;

    int whichKernel = 6;
    int maxBlocks = 64;
    bool cpuFinalReduction = false;
    int cpuFinalThreshold = 1;

    shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks);

    shrLog(" %d elements\n", size);
    shrLog(" %d threads (max)\n", maxThreads);

    cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE);
    shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold);

    bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE);

#ifdef GPU_PROFILING
    if (runShmoo)
    {
        shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype);
        return true;
    }
    else
#endif
    {
        // create random input data on CPU
        unsigned int bytes = size * sizeof(T);
        T* h_idata = (T*)malloc(bytes);

        for(int i=0; i<size; i++)
        {
            // Keep the numbers small so we don't get truncation error in the sum
            if (datatype == REDUCE_INT)
                h_idata[i] = (T)(rand() & 0xFF);
            else
                h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX;
        }

        int numBlocks = 0;
        int numThreads = 0;
        getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads);

        // allocate mem for the result on host side
        T* h_odata = (T*)malloc(numBlocks * sizeof(T));

        // allocate device memory and copy the input data
        cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                        bytes, h_idata, NULL);
        cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
                                        numBlocks * sizeof(T), NULL, NULL);

        int testIterations = 100;
        double dTotalTime = 0.0;
        T gpu_result = 0;
        gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks,
                                      whichKernel, testIterations, cpuFinalReduction,
                                      cpuFinalThreshold, &dTotalTime, h_odata, d_idata, d_odata);

#ifdef GPU_PROFILING
        double reduceTime = dTotalTime/(double)testIterations;
        shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n",
                 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads);
#endif

        // compute reference solution
        shrLog("\nComparing against Host/C++ computation...\n");
        T cpu_result = reduceCPU<T>(h_idata, size);
        if (datatype == REDUCE_INT)
        {
            shrLog(" GPU result = %d\n", gpu_result);
            shrLog(" CPU result = %d\n\n", cpu_result);
            shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED");
        }
        else
        {
            shrLog(" GPU result = %.9f\n", gpu_result);
            shrLog(" CPU result = %.9f\n\n", cpu_result);

            double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12;
            double diff = fabs((double)gpu_result - (double)cpu_result);
            shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED");
        }

        // cleanup
        free(h_idata);
        free(h_odata);

        clReleaseMemObject(d_idata);
        clReleaseMemObject(d_odata);

        return (gpu_result == cpu_result);
    }
}
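// Note on the float PASS threshold used in runTest() above: the tolerance
// scales with the element count because floating-point round-off accumulates
// with input size. For example, at the default size = 1<<24 the float
// threshold is 1e-8 * 16777216, approximately 0.168, while the Kahan-summed
// CPU reference keeps its own error far below that.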
// Helper function to create and build program and kernel
// *********************************************************************
cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2)
{
    // compile cl program
    size_t program_length;
    char *source;

    std::ostringstream preamble;

    // create the program
    // with type specification depending on datatype argument
    switch (datatype)
    {
        default:
        case REDUCE_INT:
            preamble << "#define T int" << std::endl;
            break;
        case REDUCE_FLOAT:
            preamble << "#define T float" << std::endl;
            break;
    }

    // set blockSize at compile time
    preamble << "#define blockSize " << blockSize << std::endl;

    // set isPow2 at compile time
    preamble << "#define nIsPow2 " << isPowOf2 << std::endl;

    // Load the source code and prepend the preamble
    source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length);
    //oclCheckError(source != NULL, shrTRUE);

    cl_program rv_program = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source,
                                                      &program_length, &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);
    free(source);

    // build the program
    ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
        oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclReduction.ptx");
        //oclCheckError(ciErrNum, CL_SUCCESS);
    }

    // create Kernel
    std::ostringstream kernelName;
    kernelName << "reduce" << whichKernel;
    cl_kernel ckKernel = clCreateKernel(rv_program, kernelName.str().c_str(), &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    size_t wgSize;
    ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                                        sizeof(size_t), &wgSize, NULL);
    if (wgSize == 64)
        smallBlock = true;
    else
        smallBlock = false;

    // NOTE: the program will get deleted when the kernel is also released
    clReleaseProgram(rv_program);

    return ckKernel;
}
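// For example, getReductionKernel(REDUCE_FLOAT, 6, 128, 1) prepends
//     #define T float
//     #define blockSize 128
//     #define nIsPow2 1
// to oclReduction_kernel.cl and builds the kernel named "reduce6".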