bfs benchmark

This commit is contained in:
Blaise Tine 2019-11-21 23:32:06 -05:00
parent d95506edd2
commit 1788207aa6
9 changed files with 1447 additions and 2 deletions

814
benchmarks/opencl/bfs/CLHelper.h Executable file
View file

@ -0,0 +1,814 @@
//------------------------------------------
//--cambine:helper function for OpenCL
//--programmer: Jianbin Fang
//--date: 27/12/2010
//------------------------------------------
#ifndef _CL_HELPER_
#define _CL_HELPER_
#include <CL/cl.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <string>
using std::string;
using std::ifstream;
using std::cerr;
using std::endl;
using std::cout;
//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
#define WORK_DIM 2 //work-items dimensions
struct oclHandleStruct
{
cl_context context;
cl_device_id *devices;
cl_command_queue queue;
cl_program program;
cl_int cl_status;
std::string error_str;
std::vector<cl_kernel> kernel;
};
struct oclHandleStruct oclHandles;
char kernel_file[100] = "Kernels.cl";
int total_kernels = 2;
string kernel_names[2] = {"BFS_1", "BFS_2"};
int work_group_size = 512;
int device_id_inused = 0; //deviced id used (default : 0)
/*
* Converts the contents of a file into a string
*/
string FileToString(const string fileName)
{
ifstream f(fileName.c_str(), ifstream::in | ifstream::binary);
try
{
size_t size;
char* str;
string s;
if(f.is_open())
{
size_t fileSize;
f.seekg(0, ifstream::end);
size = fileSize = f.tellg();
f.seekg(0, ifstream::beg);
str = new char[size+1];
if (!str) throw(string("Could not allocate memory"));
f.read(str, fileSize);
f.close();
str[size] = '\0';
s = str;
delete [] str;
return s;
}
}
catch(std::string msg)
{
cerr << "Exception caught in FileToString(): " << msg << endl;
if(f.is_open())
f.close();
}
catch(...)
{
cerr << "Exception caught in FileToString()" << endl;
if(f.is_open())
f.close();
}
string errorMsg = "FileToString()::Error: Unable to open file "
+ fileName;
throw(errorMsg);
}
//---------------------------------------
//Read command line parameters
//
void _clCmdParams(int argc, char* argv[]){
for (int i =0; i < argc; ++i)
{
switch (argv[i][1])
{
case 'g': //--g stands for size of work group
if (++i < argc)
{
sscanf(argv[i], "%u", &work_group_size);
}
else
{
std::cerr << "Could not read argument after option " << argv[i-1] << std::endl;
throw;
}
break;
case 'd': //--d stands for device id used in computaion
if (++i < argc)
{
sscanf(argv[i], "%u", &device_id_inused);
}
else
{
std::cerr << "Could not read argument after option " << argv[i-1] << std::endl;
throw;
}
break;
default:
;
}
}
}
//---------------------------------------
//Initlize CL objects
//--description: there are 5 steps to initialize all the OpenCL objects needed
//--revised on 04/01/2011: get the number of devices and
// devices have no relationship with context
void _clInit()
{
int DEVICE_ID_INUSED = device_id_inused;
cl_int resultCL;
oclHandles.context = NULL;
oclHandles.devices = NULL;
oclHandles.queue = NULL;
oclHandles.program = NULL;
cl_uint deviceListSize;
//-----------------------------------------------
//--cambine-1: find the available platforms and select one
cl_uint numPlatforms;
cl_platform_id targetPlatform = NULL;
resultCL = clGetPlatformIDs(0, NULL, &numPlatforms);
if (resultCL != CL_SUCCESS)
throw (string("InitCL()::Error: Getting number of platforms (clGetPlatformIDs)"));
//printf("number of platforms:%d\n",numPlatforms); //by cambine
if (!(numPlatforms > 0))
throw (string("InitCL()::Error: No platforms found (clGetPlatformIDs)"));
cl_platform_id* allPlatforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id));
resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
if (resultCL != CL_SUCCESS)
throw (string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
/* Select the target platform. Default: first platform */
targetPlatform = allPlatforms[0];
for (int i = 0; i < numPlatforms; i++)
{
char pbuff[128];
resultCL = clGetPlatformInfo( allPlatforms[i],
CL_PLATFORM_VENDOR,
sizeof(pbuff),
pbuff,
NULL);
if (resultCL != CL_SUCCESS)
throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
//printf("vedor is %s\n",pbuff);
}
free(allPlatforms);
//-----------------------------------------------
//--cambine-2: create an OpenCL context
cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)targetPlatform, 0 };
oclHandles.context = clCreateContextFromType(cprops,
CL_DEVICE_TYPE_GPU,
NULL,
NULL,
&resultCL);
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
throw (string("InitCL()::Error: Creating Context (clCreateContextFromType)"));
//-----------------------------------------------
//--cambine-3: detect OpenCL devices
/* First, get the size of device list */
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceListSize);
if(oclHandles.cl_status!=CL_SUCCESS){
throw(string("exception in _clInit -> clGetDeviceIDs"));
}
if (deviceListSize == 0)
throw(string("InitCL()::Error: No devices found."));
//std::cout<<"device number:"<<deviceListSize<<std::endl;
/* Now, allocate the device list */
oclHandles.devices = (cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
if (oclHandles.devices == 0)
throw(string("InitCL()::Error: Could not allocate memory."));
/* Next, get the device list data */
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, deviceListSize, \
oclHandles.devices, NULL);
if(oclHandles.cl_status!=CL_SUCCESS){
throw(string("exception in _clInit -> clGetDeviceIDs-2"));
}
//-----------------------------------------------
//--cambine-4: Create an OpenCL command queue
oclHandles.queue = clCreateCommandQueue(oclHandles.context,
oclHandles.devices[DEVICE_ID_INUSED],
0,
&resultCL);
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
//-----------------------------------------------
//--cambine-5: Load CL file, build CL program object, create CL kernel object
std::string source_str = FileToString(kernel_file);
const char * source = source_str.c_str();
size_t sourceSize[] = { source_str.length() };
oclHandles.program = clCreateProgramWithSource(oclHandles.context,
1,
&source,
sourceSize,
&resultCL);
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)"));
//insert debug information
//std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
//options += " -cl-nv-opt-level=3";
resultCL = clBuildProgram(oclHandles.program, deviceListSize, oclHandles.devices, NULL, NULL,NULL);
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
{
cerr << "InitCL()::Error: In clBuildProgram" << endl;
size_t length;
resultCL = clGetProgramBuildInfo(oclHandles.program,
oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG,
0,
NULL,
&length);
if(resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
char* buffer = (char*)malloc(length);
resultCL = clGetProgramBuildInfo(oclHandles.program,
oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG,
length,
buffer,
NULL);
if(resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
cerr << buffer << endl;
free(buffer);
throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
}
//get program information in intermediate representation
#ifdef PTX_MSG
size_t binary_sizes[deviceListSize];
char * binaries[deviceListSize];
//figure out number of devices and the sizes of the binary for each device.
oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*deviceListSize, &binary_sizes, NULL );
if(oclHandles.cl_status!=CL_SUCCESS){
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
}
std::cout<<"--cambine:"<<binary_sizes<<std::endl;
//copy over all of the generated binaries.
for(int i=0;i<deviceListSize;i++)
binaries[i] = (char *)malloc( sizeof(char)*(binary_sizes[i]+1));
oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES, sizeof(char *)*deviceListSize, binaries, NULL );
if(oclHandles.cl_status!=CL_SUCCESS){
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
}
for(int i=0;i<deviceListSize;i++)
binaries[i][binary_sizes[i]] = '\0';
std::cout<<"--cambine:writing ptd information..."<<std::endl;
FILE * ptx_file = fopen("cl.ptx","w");
if(ptx_file==NULL){
throw(string("exceptions in allocate ptx file."));
}
fprintf(ptx_file,"%s",binaries[DEVICE_ID_INUSED]);
fclose(ptx_file);
std::cout<<"--cambine:writing ptd information done."<<std::endl;
for(int i=0;i<deviceListSize;i++)
free(binaries[i]);
#endif
for (int nKernel = 0; nKernel < total_kernels; nKernel++)
{
/* get a kernel object handle for a kernel with the given name */
cl_kernel kernel = clCreateKernel(oclHandles.program,
(kernel_names[nKernel]).c_str(),
&resultCL);
if ((resultCL != CL_SUCCESS) || (kernel == NULL))
{
string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" + kernel_names[nKernel] + "\"";
throw(errorMsg);
}
oclHandles.kernel.push_back(kernel);
}
//get resource alocation information
#ifdef RES_MSG
char * build_log;
size_t ret_val_size;
oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
if(oclHandles.cl_status!=CL_SUCCESS){
throw(string("exceptions in _InitCL -> getting resource information"));
}
build_log = (char *)malloc(ret_val_size+1);
oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
if(oclHandles.cl_status!=CL_SUCCESS){
throw(string("exceptions in _InitCL -> getting resources allocation information-2"));
}
build_log[ret_val_size] = '\0';
std::cout<<"--cambine:"<<build_log<<std::endl;
free(build_log);
#endif
}
//---------------------------------------
//release CL objects
void _clRelease()
{
char errorFlag = false;
for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++)
{
if (oclHandles.kernel[nKernel] != NULL)
{
cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
if (resultCL != CL_SUCCESS)
{
cerr << "ReleaseCL()::Error: In clReleaseKernel" << endl;
errorFlag = true;
}
oclHandles.kernel[nKernel] = NULL;
}
oclHandles.kernel.clear();
}
if (oclHandles.program != NULL)
{
cl_int resultCL = clReleaseProgram(oclHandles.program);
if (resultCL != CL_SUCCESS)
{
cerr << "ReleaseCL()::Error: In clReleaseProgram" << endl;
errorFlag = true;
}
oclHandles.program = NULL;
}
if (oclHandles.queue != NULL)
{
cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
if (resultCL != CL_SUCCESS)
{
cerr << "ReleaseCL()::Error: In clReleaseCommandQueue" << endl;
errorFlag = true;
}
oclHandles.queue = NULL;
}
free(oclHandles.devices);
if (oclHandles.context != NULL)
{
cl_int resultCL = clReleaseContext(oclHandles.context);
if (resultCL != CL_SUCCESS)
{
cerr << "ReleaseCL()::Error: In clReleaseContext" << endl;
errorFlag = true;
}
oclHandles.context = NULL;
}
if (errorFlag) throw(string("ReleaseCL()::Error encountered."));
}
//--------------------------------------------------------
//--cambine:create buffer and then copy data from host to device
cl_mem _clCreateAndCpyMem(int size, void * h_mem_source) throw(string){
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, \
size, h_mem_source, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem()"));
#endif
return d_mem;
}
//-------------------------------------------------------
//--cambine: create read only buffer for devices
//--date: 17/01/2011
cl_mem _clMallocRW(int size, void * h_mem_ptr) throw(string){
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, h_mem_ptr, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clMallocRW"));
#endif
return d_mem;
}
//-------------------------------------------------------
//--cambine: create read and write buffer for devices
//--date: 17/01/2011
cl_mem _clMalloc(int size, void * h_mem_ptr) throw(string){
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, size, h_mem_ptr, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clMalloc"));
#endif
return d_mem;
}
//-------------------------------------------------------
//--cambine: transfer data from host to device
//--date: 17/01/2011
void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string){
oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_ptr, 0, NULL, NULL);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clMemcpyH2D"));
#endif
}
//--------------------------------------------------------
//--cambine:create buffer and then copy data from host to device with pinned
// memory
cl_mem _clCreateAndCpyPinnedMem(int size, float* h_mem_source) throw(string){
cl_mem d_mem, d_mem_pinned;
float * h_mem_pinned = NULL;
d_mem_pinned = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, \
size, NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
#endif
//------------
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, \
size, NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
#endif
//----------
h_mem_pinned = (cl_float *)clEnqueueMapBuffer(oclHandles.queue, d_mem_pinned, CL_TRUE, \
CL_MAP_WRITE, 0, size, 0, NULL, \
NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
#endif
int element_number = size/sizeof(float);
#pragma omp parallel for
for(int i=0;i<element_number;i++){
h_mem_pinned[i] = h_mem_source[i];
}
//----------
oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem, \
CL_TRUE, 0, size, h_mem_pinned, \
0, NULL, NULL);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
#endif
return d_mem;
}
//--------------------------------------------------------
//--cambine:create write only buffer on device
cl_mem _clMallocWO(int size) throw(string){
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateMem()"));
#endif
return d_mem;
}
//--------------------------------------------------------
//transfer data from device to host
void _clMemcpyD2H(cl_mem d_mem, int size, void * h_mem) throw(string){
oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem, 0,0,0);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
switch(oclHandles.cl_status){
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_INVALID_VALUE:
oclHandles.error_str += "CL_INVALID_VALUE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
}
//--------------------------------------------------------
//set kernel arguments
void _clSetArgs(int kernel_id, int arg_idx, void * d_mem, int size = 0) throw(string){
if(!size){
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, sizeof(d_mem), &d_mem);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
switch(oclHandles.cl_status){
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_ARG_INDEX:
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
break;
case CL_INVALID_ARG_VALUE:
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
break;
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_INVALID_SAMPLER:
oclHandles.error_str += "CL_INVALID_SAMPLER";
break;
case CL_INVALID_ARG_SIZE:
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
}
else{
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
switch(oclHandles.cl_status){
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_ARG_INDEX:
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
break;
case CL_INVALID_ARG_VALUE:
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
break;
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_INVALID_SAMPLER:
oclHandles.error_str += "CL_INVALID_SAMPLER";
break;
case CL_INVALID_ARG_SIZE:
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
}
}
void _clFinish() throw(string){
oclHandles.cl_status = clFinish(oclHandles.queue);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clFinish";
switch(oclHandles.cl_status){
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reasons";
break;
}
if(oclHandles.cl_status!=CL_SUCCESS){
throw(oclHandles.error_str);
}
#endif
}
//--------------------------------------------------------
//--cambine:enqueue kernel
void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string){
cl_uint work_dim = WORK_DIM;
cl_event e[1];
if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size
work_items = work_items + (work_group_size-(work_items%work_group_size));
size_t local_work_size[] = {work_group_size, 1};
size_t global_work_size[] = {work_items, 1};
oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
global_work_size, local_work_size, 0 , 0, &(e[0]) );
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
switch(oclHandles.cl_status)
{
case CL_INVALID_PROGRAM_EXECUTABLE:
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
break;
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_KERNEL_ARGS:
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
break;
case CL_INVALID_WORK_DIMENSION:
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
break;
case CL_INVALID_GLOBAL_WORK_SIZE:
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
break;
case CL_INVALID_WORK_GROUP_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
break;
case CL_INVALID_WORK_ITEM_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
break;
case CL_INVALID_GLOBAL_OFFSET:
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
//_clFinish();
// oclHandles.cl_status = clWaitForEvents(1, &e[0]);
// #ifdef ERRMSG
// if (oclHandles.cl_status!= CL_SUCCESS)
// throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
// #endif
}
void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, int group_y) throw(string){
cl_uint work_dim = WORK_DIM;
size_t local_work_size[] = {group_x, group_y};
size_t global_work_size[] = {range_x, range_y};
cl_event e[1];
/*if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size
work_items = work_items + (work_group_size-(work_items%work_group_size));*/
oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
global_work_size, local_work_size, 0 , 0, &(e[0]) );
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
switch(oclHandles.cl_status)
{
case CL_INVALID_PROGRAM_EXECUTABLE:
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
break;
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_KERNEL_ARGS:
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
break;
case CL_INVALID_WORK_DIMENSION:
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
break;
case CL_INVALID_GLOBAL_WORK_SIZE:
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
break;
case CL_INVALID_WORK_GROUP_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
break;
case CL_INVALID_WORK_ITEM_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
break;
case CL_INVALID_GLOBAL_OFFSET:
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
//_clFinish();
/*oclHandles.cl_status = clWaitForEvents(1, &e[0]);
#ifdef ERRMSG
if (oclHandles.cl_status!= CL_SUCCESS)
throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
#endif*/
}
//--------------------------------------------------------
//release OpenCL objects
void _clFree(cl_mem ob) throw(string){
if(ob!=NULL)
oclHandles.cl_status = clReleaseMemObject(ob);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clFree() ->";
switch(oclHandles.cl_status)
{
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if (oclHandles.cl_status!= CL_SUCCESS)
throw(oclHandles.error_str);
#endif
}
#endif //_CL_HELPER_

View file

@ -29,12 +29,12 @@ CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sectio
LIBS = -lOpenCL
PROJECT=saxpy
PROJECT=bfs
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf

53
benchmarks/opencl/bfs/kernel.cl Executable file
View file

@ -0,0 +1,53 @@
/* ============================================================
//--cambine: kernel funtion of Breadth-First-Search
//--author: created by Jianbin Fang
//--date: 06/12/2010
============================================================ */
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
//Structure to hold a node information
typedef struct{
int starting;
int no_of_edges;
} Node;
//--7 parameters
__kernel void BFS_1( const __global Node* g_graph_nodes,
const __global int* g_graph_edges,
__global char* g_graph_mask,
__global char* g_updating_graph_mask,
__global char* g_graph_visited,
__global int* g_cost,
const int no_of_nodes){
int tid = get_global_id(0);
if( tid<no_of_nodes && g_graph_mask[tid]){
g_graph_mask[tid]=false;
for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++){
int id = g_graph_edges[i];
if(!g_graph_visited[id]){
g_cost[id]=g_cost[tid]+1;
g_updating_graph_mask[id]=true;
}
}
}
}
//--5 parameters
__kernel void BFS_2(__global char* g_graph_mask,
__global char* g_updating_graph_mask,
__global char* g_graph_visited,
__global char* g_over,
const int no_of_nodes
) {
int tid = get_global_id(0);
if( tid<no_of_nodes && g_updating_graph_mask[tid]){
g_graph_mask[tid]=true;
g_graph_visited[tid]=true;
*g_over=true;
g_updating_graph_mask[tid]=false;
}
}

Binary file not shown.

299
benchmarks/opencl/bfs/main.cc Executable file
View file

@ -0,0 +1,299 @@
//--by Jianbin Fang
#define __CL_ENABLE_EXCEPTIONS
#include <cstdlib>
#include <iostream>
#include <string>
#include <cstring>
#ifdef PROFILING
#include "timer.h"
#endif
#include "CLHelper.h"
#include "util.h"
#define MAX_THREADS_PER_BLOCK 256
//Structure to hold a node information
struct Node
{
int starting;
int no_of_edges;
};
//----------------------------------------------------------
//--bfs on cpu
//--programmer: jianbin
//--date: 26/01/2011
//--note: width is changed to the new_width
//----------------------------------------------------------
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
char *h_graph_visited, int *h_cost_ref){
char stop;
int k = 0;
do{
//if no thread changes this value then the loop stops
stop=false;
for(int tid = 0; tid < no_of_nodes; tid++ )
{
if (h_graph_mask[tid] == true){
h_graph_mask[tid]=false;
for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){
int id = h_graph_edges[i]; //--cambine: node id is connected with node tid
if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below
h_cost_ref[id]=h_cost_ref[tid]+1;
h_updating_graph_mask[id]=true;
}
}
}
}
for(int tid=0; tid< no_of_nodes ; tid++ )
{
if (h_updating_graph_mask[tid] == true){
h_graph_mask[tid]=true;
h_graph_visited[tid]=true;
stop=true;
h_updating_graph_mask[tid]=false;
}
}
k++;
}
while(stop);
}
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
char *h_graph_visited, int *h_cost)
throw(std::string){
//int number_elements = height*width;
char h_over;
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \
d_graph_visited, d_cost, d_over;
try{
//--1 transfer data from host to device
_clInit();
d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask);
d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask);
d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited);
d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
d_over = _clMallocRW(sizeof(char), &h_over);
_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask);
_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);
_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);
//--2 invoke kernel
#ifdef PROFILING
timer kernel_timer;
double kernel_time = 0.0;
kernel_timer.reset();
kernel_timer.start();
#endif
do{
h_over = false;
_clMemcpyH2D(d_over, sizeof(char), &h_over);
//--kernel 0
int kernel_id = 0;
int kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_cost);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//int work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
//--kernel 1
kernel_id = 1;
kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_over);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
_clMemcpyD2H(d_over,sizeof(char), &h_over);
}while(h_over);
_clFinish();
#ifdef PROFILING
kernel_timer.stop();
kernel_time = kernel_timer.getTimeInSeconds();
#endif
//--3 transfer data from device to host
_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
//--statistics
#ifdef PROFILING
std::cout<<"kernel time(s):"<<kernel_time<<std::endl;
#endif
//--4 release cl resources.
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
}
catch(std::string msg){
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
std::string e_str = "in run_transpose_gpu -> ";
e_str += msg;
throw(e_str);
}
return ;
}
void Usage(int argc, char**argv){
fprintf(stderr,"Usage: %s <input_file>\n", argv[0]);
}
//----------------------------------------------------------
//--cambine: main function
//--author: created by Jianbin Fang
//--date: 25/01/2011
//----------------------------------------------------------
int main(int argc, char * argv[])
{
int no_of_nodes;
int edge_list_size;
FILE *fp;
Node* h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
try{
char *input_f;
if(argc!=2){
Usage(argc, argv);
exit(0);
}
input_f = argv[1];
printf("Reading File\n");
//Read in Graph from a file
fp = fopen(input_f,"r");
if(!fp){
printf("Error Reading graph file\n");
return 0;
}
int source = 0;
fscanf(fp,"%d",&no_of_nodes);
int num_of_blocks = 1;
int num_of_threads_per_block = no_of_nodes;
//Make execution Parameters according to the number of nodes
//Distribute threads across multiple Blocks if necessary
if(no_of_nodes>MAX_THREADS_PER_BLOCK){
num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
}
work_group_size = num_of_threads_per_block;
// allocate host memory
h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes);
int start, edgeno;
// initalize the memory
for(int i = 0; i < no_of_nodes; i++){
fscanf(fp,"%d %d",&start,&edgeno);
h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i]=false;
h_updating_graph_mask[i]=false;
h_graph_visited[i]=false;
}
//read the source node from the file
fscanf(fp,"%d",&source);
source=0;
//set the source node as true in the mask
h_graph_mask[source]=true;
h_graph_visited[source]=true;
fscanf(fp,"%d",&edge_list_size);
int id,cost;
int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
for(int i=0; i < edge_list_size ; i++){
fscanf(fp,"%d",&id);
fscanf(fp,"%d",&cost);
h_graph_edges[i] = id;
}
if(fp)
fclose(fp);
// allocate mem for the result on host side
int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes);
int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes);
for(int i=0;i<no_of_nodes;i++){
h_cost[i]=-1;
h_cost_ref[i] = -1;
}
h_cost[source]=0;
h_cost_ref[source]=0;
//---------------------------------------------------------
//--gpu entry
run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
//---------------------------------------------------------
//--cpu entry
// initalize the memory again
for(int i = 0; i < no_of_nodes; i++){
h_graph_mask[i]=false;
h_updating_graph_mask[i]=false;
h_graph_visited[i]=false;
}
//set the source node as true in the mask
source=0;
h_graph_mask[source]=true;
h_graph_visited[source]=true;
run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref);
//---------------------------------------------------------
//--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
//release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
}
catch(std::string msg){
std::cout<<"--cambine: exception in main ->"<<msg<<std::endl;
//release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
}
return 0;
}

1
benchmarks/opencl/bfs/run Executable file
View file

@ -0,0 +1 @@
./bfs ../../data/bfs/graph1MW_6.txt

78
benchmarks/opencl/bfs/timer.cc Executable file
View file

@ -0,0 +1,78 @@
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include "timer.h"
using namespace std;
double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
double timer::get_CPU_speed_in_MHz()
{
#if defined __linux__
ifstream infile("/proc/cpuinfo");
char buffer[256], *colon;
while (infile.good()) {
infile.getline(buffer, 256);
if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
return atof(colon + 2);
}
#endif
return 0.0;
}
void timer::print_time(ostream &str, const char *which, double time) const
{
static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 };
const char **unit = units;
time = 1000.0 * time / CPU_speed_in_MHz;
while (time >= 999.5 && unit[1] != 0) {
time /= 1000.0;
++ unit;
}
str << which << " = " << setprecision(3) << setw(4) << time << *unit;
}
ostream &timer::print(ostream &str)
{
str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right;
if (CPU_speed_in_MHz == 0)
str << "could not determine CPU speed\n";
else if (count > 0) {
double total = static_cast<double>(total_time);
print_time(str, "avg", total / static_cast<double>(count));
print_time(str, ", total", total);
str << ", count = " << setw(9) << count << '\n';
}
else
str << "not used\n";
return str;
}
ostream &operator << (ostream &str, class timer &timer)
{
return timer.print(str);
}
double timer::getTimeInSeconds()
{
double total = static_cast<double>(total_time);
double res = (total / 1000000.0) / CPU_speed_in_MHz;
return res;
}

128
benchmarks/opencl/bfs/timer.h Executable file
View file

@ -0,0 +1,128 @@
#ifndef timer_h
#define timer_h
#include <iostream>
class timer {
public:
timer(const char *name = 0);
timer(const char *name, std::ostream &write_on_exit);
~timer();
void start(), stop();
void reset();
std::ostream &print(std::ostream &);
double getTimeInSeconds();
private:
void print_time(std::ostream &, const char *which, double time) const;
union {
long long total_time;
struct {
#if defined __PPC__
int high, low;
#else
int low, high;
#endif
};
};
unsigned long long count;
const char *const name;
std::ostream *const write_on_exit;
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
};
std::ostream &operator << (std::ostream &, class timer &);
inline void timer::reset()
{
total_time = 0;
count = 0;
}
inline timer::timer(const char *name)
:
name(name),
write_on_exit(0)
{
reset();
}
inline timer::timer(const char *name, std::ostream &write_on_exit)
:
name(name),
write_on_exit(&write_on_exit)
{
reset();
}
inline timer::~timer()
{
if (write_on_exit != 0)
print(*write_on_exit);
}
inline void timer::start()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
total_time -= ((unsigned long long) edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
asm volatile
(
"rdtsc\n\t"
"subl %%eax, %0\n\t"
"sbbl %%edx, %1"
:
"+m" (low), "+m" (high)
:
:
"eax", "edx"
);
#else
#error Compiler/Architecture not recognized
#endif
}
inline void timer::stop()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
total_time += ((unsigned long long) edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
asm volatile
(
"rdtsc\n\t"
"addl %%eax, %0\n\t"
"adcl %%edx, %1"
:
"+m" (low), "+m" (high)
:
:
"eax", "edx"
);
#endif
++ count;
}
#endif

72
benchmarks/opencl/bfs/util.h Executable file
View file

@ -0,0 +1,72 @@
#ifndef _C_UTIL_
#define _C_UTIL_
#include <math.h>
#include <iostream>
//-------------------------------------------------------------------
//--initialize array with maximum limit
//-------------------------------------------------------------------
template<typename datatype>
void fill(datatype *A, const int n, const datatype maxi){
for (int j = 0; j < n; j++)
{
A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
}
}
//--print matrix
template<typename datatype>
void print_matrix(datatype *A, int height, int width){
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int idx = i*width + j;
std::cout<<A[idx]<<" ";
}
std::cout<<std::endl;
}
return;
}
//-------------------------------------------------------------------
//--verify results
//-------------------------------------------------------------------
#define MAX_RELATIVE_ERROR .002
template<typename datatype>
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
char passed = true;
#pragma omp parallel for
for (int i=0; i<size; i++){
if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
passed = false;
}
}
if (passed){
std::cout << "--cambine:passed:-)" << endl;
}
else{
std::cout << "--cambine: failed:-(" << endl;
}
return ;
}
template<typename datatype>
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
char passed = true;
//#pragma omp parallel for
for (int i=0; i<size; i++){
if (cpu_results[i]!=gpu_results[i]){
passed = false;
}
}
if (passed){
std::cout << "--cambine:passed:-)" << endl;
}
else{
std::cout << "--cambine: failed:-(" << endl;
}
return ;
}
#endif