bfs benchmark

2025-04-23 21:39:10 -04:00 · 2019-11-21 23:32:06 -05:00 · 2019-11-21 23:32:06 -05:00 · 1788207aa6
commit 1788207aa6
parent d95506edd2
9 changed files with 1447 additions and 2 deletions
--- a/benchmarks/opencl/bfs/CLHelper.h
+++ b/benchmarks/opencl/bfs/CLHelper.h
@ -0,0 +1,814 @@
+//------------------------------------------
+//--cambine:helper function for OpenCL
+//--programmer:	Jianbin Fang
+//--date:	27/12/2010
+//------------------------------------------
+#ifndef _CL_HELPER_
+#define _CL_HELPER_
+
+#include <CL/cl.h>
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <string>
+
+using std::string;
+using std::ifstream;
+using std::cerr;
+using std::endl;
+using std::cout;
+//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
+#define WORK_DIM 2	//work-items dimensions
+
+struct oclHandleStruct
+{
+    cl_context              context;
+    cl_device_id            *devices;
+    cl_command_queue        queue;
+    cl_program              program;
+    cl_int		cl_status;
+    std::string error_str;
+    std::vector<cl_kernel>  kernel;
+};
+
+struct oclHandleStruct oclHandles;
+
+char kernel_file[100]  = "Kernels.cl";
+int total_kernels = 2;
+string kernel_names[2] = {"BFS_1", "BFS_2"};
+int work_group_size = 512;
+int device_id_inused = 0; //deviced id used (default : 0)
+
+/*
+ * Converts the contents of a file into a string
+ */
+string FileToString(const string fileName)
+{
+    ifstream f(fileName.c_str(), ifstream::in | ifstream::binary);
+
+    try
+    {
+        size_t size;
+        char*  str;
+        string s;
+
+        if(f.is_open())
+        {
+            size_t fileSize;
+            f.seekg(0, ifstream::end);
+            size = fileSize = f.tellg();
+            f.seekg(0, ifstream::beg);
+
+            str = new char[size+1];
+            if (!str) throw(string("Could not allocate memory"));
+
+            f.read(str, fileSize);
+            f.close();
+            str[size] = '\0';
+        
+            s = str;
+            delete [] str;
+            return s;
+        }
+    }
+    catch(std::string msg)
+    {
+        cerr << "Exception caught in FileToString(): " << msg << endl;
+        if(f.is_open())
+            f.close();
+    }
+    catch(...)
+    {
+        cerr << "Exception caught in FileToString()" << endl;
+        if(f.is_open())
+            f.close();
+    }
+    string errorMsg = "FileToString()::Error: Unable to open file "
+                            + fileName;
+    throw(errorMsg);
+}
+//---------------------------------------
+//Read command line parameters
+//
+void _clCmdParams(int argc, char* argv[]){
+	for (int i =0; i < argc; ++i)
+	{
+		switch (argv[i][1])
+		{
+		case 'g':	//--g stands for size of work group
+			if (++i < argc)
+			{
+				sscanf(argv[i], "%u", &work_group_size);
+			}
+			else
+			{
+				std::cerr << "Could not read argument after option " << argv[i-1] << std::endl;
+				throw;
+			}
+			break;
+		  case 'd':	 //--d stands for device id used in computaion
+			if (++i < argc)
+			{
+				sscanf(argv[i], "%u", &device_id_inused);
+			}
+			else
+			{
+				std::cerr << "Could not read argument after option " << argv[i-1] << std::endl;
+				throw;
+			}
+			break;
+		default:
+			;
+		}
+	}
+	
+}
+
+//---------------------------------------
+//Initlize CL objects
+//--description: there are 5 steps to initialize all the OpenCL objects needed
+//--revised on 04/01/2011: get the number of devices  and 
+//  devices have no relationship with context
+void _clInit()
+{
+    int DEVICE_ID_INUSED = device_id_inused;
+    cl_int resultCL;
+    
+    oclHandles.context = NULL;
+    oclHandles.devices = NULL;
+    oclHandles.queue = NULL;
+    oclHandles.program = NULL;
+
+    cl_uint deviceListSize;
+
+    //-----------------------------------------------
+    //--cambine-1: find the available platforms and select one
+
+    cl_uint numPlatforms;
+    cl_platform_id targetPlatform = NULL;
+
+    resultCL = clGetPlatformIDs(0, NULL, &numPlatforms);
+    if (resultCL != CL_SUCCESS)
+        throw (string("InitCL()::Error: Getting number of platforms (clGetPlatformIDs)"));
+    //printf("number of platforms:%d\n",numPlatforms);	//by cambine
+
+    if (!(numPlatforms > 0))
+        throw (string("InitCL()::Error: No platforms found (clGetPlatformIDs)"));
+
+    cl_platform_id* allPlatforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id));
+
+    resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
+    if (resultCL != CL_SUCCESS)
+        throw (string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
+
+    /* Select the target platform. Default: first platform */
+    targetPlatform = allPlatforms[0];
+    for (int i = 0; i < numPlatforms; i++)
+    {
+        char pbuff[128];
+        resultCL = clGetPlatformInfo( allPlatforms[i],
+                                        CL_PLATFORM_VENDOR,
+                                        sizeof(pbuff),
+                                        pbuff,
+                                        NULL);
+        if (resultCL != CL_SUCCESS)
+            throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
+
+		//printf("vedor is %s\n",pbuff);
+
+    }
+    free(allPlatforms);
+
+    //-----------------------------------------------
+    //--cambine-2: create an OpenCL context
+    cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)targetPlatform, 0 };
+    oclHandles.context = clCreateContextFromType(cprops, 
+                                                CL_DEVICE_TYPE_GPU, 
+                                                NULL, 
+                                                NULL, 
+                                                &resultCL);
+
+    if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
+        throw (string("InitCL()::Error: Creating Context (clCreateContextFromType)"));
+    //-----------------------------------------------
+    //--cambine-3: detect OpenCL devices	
+    /* First, get the size of device list */
+   oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceListSize);
+   if(oclHandles.cl_status!=CL_SUCCESS){
+   	throw(string("exception in _clInit -> clGetDeviceIDs"));   	
+   }
+   if (deviceListSize == 0)
+        throw(string("InitCL()::Error: No devices found."));
+
+    //std::cout<<"device number:"<<deviceListSize<<std::endl;
+
+    /* Now, allocate the device list */
+    oclHandles.devices = (cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
+
+    if (oclHandles.devices == 0)
+        throw(string("InitCL()::Error: Could not allocate memory."));
+
+    /* Next, get the device list data */
+   oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, deviceListSize, \
+								oclHandles.devices, NULL);
+   if(oclHandles.cl_status!=CL_SUCCESS){
+   	throw(string("exception in _clInit -> clGetDeviceIDs-2"));   	
+   }
+   //-----------------------------------------------
+   //--cambine-4: Create an OpenCL command queue    
+    oclHandles.queue = clCreateCommandQueue(oclHandles.context, 
+                                            oclHandles.devices[DEVICE_ID_INUSED], 
+                                            0, 
+                                            &resultCL);
+
+    if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
+        throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
+    //-----------------------------------------------
+    //--cambine-5: Load CL file, build CL program object, create CL kernel object
+    std::string  source_str = FileToString(kernel_file);
+    const char * source    = source_str.c_str();
+    size_t sourceSize[]    = { source_str.length() };
+
+    oclHandles.program = clCreateProgramWithSource(oclHandles.context, 
+                                                    1, 
+                                                    &source,
+                                                    sourceSize,
+                                                    &resultCL);
+
+    if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
+        throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)"));    
+    //insert debug information
+    //std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
+    //options += " -cl-nv-opt-level=3";
+    resultCL = clBuildProgram(oclHandles.program, deviceListSize, oclHandles.devices, NULL, NULL,NULL);
+
+    if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
+    {
+        cerr << "InitCL()::Error: In clBuildProgram" << endl;
+
+		size_t length;
+        resultCL = clGetProgramBuildInfo(oclHandles.program, 
+                                        oclHandles.devices[DEVICE_ID_INUSED], 
+                                        CL_PROGRAM_BUILD_LOG, 
+                                        0, 
+                                        NULL, 
+                                        &length);
+        if(resultCL != CL_SUCCESS) 
+            throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
+
+		char* buffer = (char*)malloc(length);
+        resultCL = clGetProgramBuildInfo(oclHandles.program, 
+                                        oclHandles.devices[DEVICE_ID_INUSED], 
+                                        CL_PROGRAM_BUILD_LOG, 
+                                        length, 
+                                        buffer, 
+                                        NULL);
+        if(resultCL != CL_SUCCESS) 
+            throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
+
+		cerr << buffer << endl;
+        free(buffer);
+
+        throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
+    } 
+
+    //get program information in intermediate representation
+    #ifdef PTX_MSG    
+    size_t binary_sizes[deviceListSize];
+    char * binaries[deviceListSize];
+    //figure out number of devices and the sizes of the binary for each device. 
+    oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*deviceListSize, &binary_sizes, NULL );
+    if(oclHandles.cl_status!=CL_SUCCESS){
+        throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
+    }
+
+    std::cout<<"--cambine:"<<binary_sizes<<std::endl;
+    //copy over all of the generated binaries. 
+    for(int i=0;i<deviceListSize;i++)
+	binaries[i] = (char *)malloc( sizeof(char)*(binary_sizes[i]+1));
+    oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES, sizeof(char *)*deviceListSize, binaries, NULL );
+    if(oclHandles.cl_status!=CL_SUCCESS){
+        throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
+    }
+    for(int i=0;i<deviceListSize;i++)
+      binaries[i][binary_sizes[i]] = '\0';
+    std::cout<<"--cambine:writing ptd information..."<<std::endl;
+    FILE * ptx_file = fopen("cl.ptx","w");
+    if(ptx_file==NULL){
+	throw(string("exceptions in allocate ptx file."));
+    }
+    fprintf(ptx_file,"%s",binaries[DEVICE_ID_INUSED]);
+    fclose(ptx_file);
+    std::cout<<"--cambine:writing ptd information done."<<std::endl;
+    for(int i=0;i<deviceListSize;i++)
+	free(binaries[i]);
+    #endif
+
+    for (int nKernel = 0; nKernel < total_kernels; nKernel++)
+    {
+        /* get a kernel object handle for a kernel with the given name */
+        cl_kernel kernel = clCreateKernel(oclHandles.program,
+                                            (kernel_names[nKernel]).c_str(),
+                                            &resultCL);
+
+        if ((resultCL != CL_SUCCESS) || (kernel == NULL))
+        {
+            string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" + kernel_names[nKernel] + "\"";
+            throw(errorMsg);
+        }
+
+        oclHandles.kernel.push_back(kernel);
+    }
+  //get resource alocation information
+    #ifdef RES_MSG
+    char * build_log;
+    size_t ret_val_size;
+    oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+    if(oclHandles.cl_status!=CL_SUCCESS){
+	throw(string("exceptions in _InitCL -> getting resource information"));
+    }    
+
+    build_log = (char *)malloc(ret_val_size+1);
+    oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+    if(oclHandles.cl_status!=CL_SUCCESS){
+	throw(string("exceptions in _InitCL -> getting resources allocation information-2"));
+    }
+    build_log[ret_val_size] = '\0';
+    std::cout<<"--cambine:"<<build_log<<std::endl;
+    free(build_log);
+    #endif
+}
+
+//---------------------------------------
+//release CL objects
+void _clRelease()
+{
+    char errorFlag = false;
+
+    for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++)
+    {
+        if (oclHandles.kernel[nKernel] != NULL)
+        {
+            cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
+            if (resultCL != CL_SUCCESS)
+            {
+                cerr << "ReleaseCL()::Error: In clReleaseKernel" << endl;
+                errorFlag = true;
+            }
+            oclHandles.kernel[nKernel] = NULL;
+        }
+        oclHandles.kernel.clear();
+    }
+
+    if (oclHandles.program != NULL)
+    {
+        cl_int resultCL = clReleaseProgram(oclHandles.program);
+        if (resultCL != CL_SUCCESS)
+        {
+            cerr << "ReleaseCL()::Error: In clReleaseProgram" << endl;
+            errorFlag = true;
+        }
+        oclHandles.program = NULL;
+    }
+
+    if (oclHandles.queue != NULL)
+    {
+        cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
+        if (resultCL != CL_SUCCESS)
+        {
+            cerr << "ReleaseCL()::Error: In clReleaseCommandQueue" << endl;
+            errorFlag = true;
+        }
+        oclHandles.queue = NULL;
+    }
+
+    free(oclHandles.devices);
+
+    if (oclHandles.context != NULL)
+    {
+        cl_int resultCL = clReleaseContext(oclHandles.context);
+        if (resultCL != CL_SUCCESS)
+        {
+            cerr << "ReleaseCL()::Error: In clReleaseContext" << endl;
+            errorFlag = true;
+        }
+        oclHandles.context = NULL;
+    }
+
+    if (errorFlag) throw(string("ReleaseCL()::Error encountered."));
+}
+//--------------------------------------------------------
+//--cambine:create buffer and then copy data from host to device
+cl_mem _clCreateAndCpyMem(int size, void * h_mem_source) throw(string){
+	cl_mem d_mem;
+	d_mem = clCreateBuffer(oclHandles.context,	CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,  \
+									size, h_mem_source, &oclHandles.cl_status);
+	#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(string("excpetion in _clCreateAndCpyMem()"));
+	#endif
+	return d_mem;
+}
+//-------------------------------------------------------
+//--cambine:	create read only  buffer for devices
+//--date:	17/01/2011	
+cl_mem _clMallocRW(int size, void * h_mem_ptr) throw(string){
+ 	cl_mem d_mem;
+	d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size, h_mem_ptr, &oclHandles.cl_status);
+	#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(string("excpetion in _clMallocRW"));
+	#endif
+	return d_mem;
+}
+//-------------------------------------------------------
+//--cambine:	create read and write buffer for devices
+//--date:	17/01/2011	
+cl_mem _clMalloc(int size, void * h_mem_ptr) throw(string){
+ 	cl_mem d_mem;
+	d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, size, h_mem_ptr, &oclHandles.cl_status);
+	#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(string("excpetion in _clMalloc"));
+	#endif
+	return d_mem;
+}
+
+//-------------------------------------------------------
+//--cambine:	transfer data from host to device
+//--date:	17/01/2011
+void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string){
+	oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_ptr, 0, NULL, NULL);
+	#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(string("excpetion in _clMemcpyH2D"));
+	#endif
+}
+//--------------------------------------------------------
+//--cambine:create buffer and then copy data from host to device with pinned 
+// memory
+cl_mem _clCreateAndCpyPinnedMem(int size, float* h_mem_source) throw(string){
+	cl_mem d_mem, d_mem_pinned;
+	float * h_mem_pinned = NULL;
+	d_mem_pinned = clCreateBuffer(oclHandles.context,	CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,  \
+									size, NULL, &oclHandles.cl_status);
+	#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
+	#endif
+	//------------
+	d_mem = clCreateBuffer(oclHandles.context,	CL_MEM_READ_ONLY,  \
+									size, NULL, &oclHandles.cl_status);
+	#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
+	#endif
+	//----------
+	h_mem_pinned = (cl_float *)clEnqueueMapBuffer(oclHandles.queue, d_mem_pinned, CL_TRUE,  \
+										CL_MAP_WRITE, 0, size, 0, NULL,  \
+										NULL,  &oclHandles.cl_status);
+	#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
+	#endif
+	int element_number = size/sizeof(float);
+	#pragma omp parallel for
+	for(int i=0;i<element_number;i++){
+		h_mem_pinned[i] = h_mem_source[i];
+	}
+	//----------
+	oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem, 	\
+									CL_TRUE, 0, size, h_mem_pinned,  \
+									0, NULL, NULL);
+	#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
+	#endif
+	
+	return d_mem;
+}
+
+
+//--------------------------------------------------------
+//--cambine:create write only buffer on device
+cl_mem _clMallocWO(int size) throw(string){
+	cl_mem d_mem;
+	d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0, &oclHandles.cl_status);
+	#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(string("excpetion in _clCreateMem()"));
+	#endif
+	return d_mem;
+}
+
+//--------------------------------------------------------
+//transfer data from device to host
+void _clMemcpyD2H(cl_mem d_mem, int size, void * h_mem) throw(string){
+	oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem, 0,0,0);
+	#ifdef ERRMSG
+		oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
+		switch(oclHandles.cl_status){
+			case CL_INVALID_COMMAND_QUEUE:
+				oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+				break;
+			case CL_INVALID_CONTEXT:
+				oclHandles.error_str += "CL_INVALID_CONTEXT";
+				break;	
+			case CL_INVALID_MEM_OBJECT:
+				oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+				break;
+			case CL_INVALID_VALUE:
+				oclHandles.error_str += "CL_INVALID_VALUE";
+				break;	
+			case CL_INVALID_EVENT_WAIT_LIST:
+				oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
+				break;
+			case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+				oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+				break;	
+			case CL_OUT_OF_HOST_MEMORY:
+				oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+				break;		
+			default:
+				oclHandles.error_str += "Unknown reason";
+				break;
+		}
+		if(oclHandles.cl_status != CL_SUCCESS)
+			throw(oclHandles.error_str);
+	#endif
+}
+
+//--------------------------------------------------------
+//set kernel arguments
+void _clSetArgs(int kernel_id, int arg_idx, void * d_mem, int size = 0) throw(string){
+	if(!size){
+		oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, sizeof(d_mem), &d_mem);
+		#ifdef ERRMSG
+		oclHandles.error_str = "excpetion in _clSetKernelArg() ";
+		switch(oclHandles.cl_status){
+			case CL_INVALID_KERNEL:
+				oclHandles.error_str += "CL_INVALID_KERNEL";
+				break;
+			case CL_INVALID_ARG_INDEX:
+				oclHandles.error_str += "CL_INVALID_ARG_INDEX";
+				break;	
+			case CL_INVALID_ARG_VALUE:
+				oclHandles.error_str += "CL_INVALID_ARG_VALUE";
+				break;
+			case CL_INVALID_MEM_OBJECT:
+				oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+				break;	
+			case CL_INVALID_SAMPLER:
+				oclHandles.error_str += "CL_INVALID_SAMPLER";
+				break;
+			case CL_INVALID_ARG_SIZE:
+				oclHandles.error_str += "CL_INVALID_ARG_SIZE";
+				break;	
+			case CL_OUT_OF_RESOURCES:
+				oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+				break;
+			case CL_OUT_OF_HOST_MEMORY:
+				oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+				break;
+			default:
+				oclHandles.error_str += "Unknown reason";
+				break;
+		}
+		if(oclHandles.cl_status != CL_SUCCESS)
+			throw(oclHandles.error_str);
+		#endif
+	}
+	else{
+		oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
+		#ifdef ERRMSG
+		oclHandles.error_str = "excpetion in _clSetKernelArg() ";
+		switch(oclHandles.cl_status){
+			case CL_INVALID_KERNEL:
+				oclHandles.error_str += "CL_INVALID_KERNEL";
+				break;
+			case CL_INVALID_ARG_INDEX:
+				oclHandles.error_str += "CL_INVALID_ARG_INDEX";
+				break;	
+			case CL_INVALID_ARG_VALUE:
+				oclHandles.error_str += "CL_INVALID_ARG_VALUE";
+				break;
+			case CL_INVALID_MEM_OBJECT:
+				oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+				break;	
+			case CL_INVALID_SAMPLER:
+				oclHandles.error_str += "CL_INVALID_SAMPLER";
+				break;
+			case CL_INVALID_ARG_SIZE:
+				oclHandles.error_str += "CL_INVALID_ARG_SIZE";
+				break;	
+			case CL_OUT_OF_RESOURCES:
+				oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+				break;
+			case CL_OUT_OF_HOST_MEMORY:
+				oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+				break;
+			default:
+				oclHandles.error_str += "Unknown reason";
+				break;
+		}
+		if(oclHandles.cl_status != CL_SUCCESS)
+			throw(oclHandles.error_str);
+		#endif
+	}
+}
+void _clFinish() throw(string){
+	oclHandles.cl_status = clFinish(oclHandles.queue);	
+	#ifdef ERRMSG
+	oclHandles.error_str = "excpetion in _clFinish";
+	switch(oclHandles.cl_status){
+		case CL_INVALID_COMMAND_QUEUE:
+			oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+			break;
+		case CL_OUT_OF_RESOURCES:
+			oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+			break;		
+		case CL_OUT_OF_HOST_MEMORY:
+			oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+			break;
+		default:
+			oclHandles.error_str += "Unknown reasons";
+			break;
+
+	}
+	if(oclHandles.cl_status!=CL_SUCCESS){
+		throw(oclHandles.error_str);
+	}
+	#endif
+}
+//--------------------------------------------------------
+//--cambine:enqueue kernel
+void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string){
+	cl_uint work_dim = WORK_DIM;
+	cl_event e[1];
+	if(work_items%work_group_size != 0)	//process situations that work_items cannot be divided by work_group_size
+	  work_items = work_items + (work_group_size-(work_items%work_group_size));
+  	size_t local_work_size[] = {work_group_size, 1};
+	size_t global_work_size[] = {work_items, 1};
+	oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
+											global_work_size, local_work_size, 0 , 0, &(e[0]) );	
+	#ifdef ERRMSG
+	oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
+	switch(oclHandles.cl_status)
+	{
+		case CL_INVALID_PROGRAM_EXECUTABLE:
+			oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
+			break;
+		case CL_INVALID_COMMAND_QUEUE:
+			oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+			break;
+		case CL_INVALID_KERNEL:
+			oclHandles.error_str += "CL_INVALID_KERNEL";
+			break;
+		case CL_INVALID_CONTEXT:
+			oclHandles.error_str += "CL_INVALID_CONTEXT";
+			break;
+		case CL_INVALID_KERNEL_ARGS:
+			oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
+			break;
+		case CL_INVALID_WORK_DIMENSION:
+			oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
+			break;
+		case CL_INVALID_GLOBAL_WORK_SIZE:
+			oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
+			break;
+		case CL_INVALID_WORK_GROUP_SIZE:
+			oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
+			break;
+		case CL_INVALID_WORK_ITEM_SIZE:
+			oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
+			break;
+		case CL_INVALID_GLOBAL_OFFSET:
+			oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
+			break;
+		case CL_OUT_OF_RESOURCES:
+			oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+			break;
+		case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+			oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+			break;
+		case CL_INVALID_EVENT_WAIT_LIST:
+			oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
+			break;
+		case CL_OUT_OF_HOST_MEMORY:
+			oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+			break;
+		default: 
+			oclHandles.error_str += "Unkown reseason";
+			break;		
+	}
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(oclHandles.error_str);	
+	#endif
+	//_clFinish();
+	// oclHandles.cl_status = clWaitForEvents(1, &e[0]);
+	// #ifdef ERRMSG
+        // if (oclHandles.cl_status!= CL_SUCCESS)
+        //     throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
+	// #endif
+}
+void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, int group_y) throw(string){
+	cl_uint work_dim = WORK_DIM;
+	size_t local_work_size[] = {group_x, group_y};
+	size_t global_work_size[] = {range_x, range_y};
+	cl_event e[1];
+	/*if(work_items%work_group_size != 0)	//process situations that work_items cannot be divided by work_group_size
+	  work_items = work_items + (work_group_size-(work_items%work_group_size));*/
+	oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
+											global_work_size, local_work_size, 0 , 0, &(e[0]) );	
+	#ifdef ERRMSG
+	oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
+	switch(oclHandles.cl_status)
+	{
+		case CL_INVALID_PROGRAM_EXECUTABLE:
+			oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
+			break;
+		case CL_INVALID_COMMAND_QUEUE:
+			oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+			break;
+		case CL_INVALID_KERNEL:
+			oclHandles.error_str += "CL_INVALID_KERNEL";
+			break;
+		case CL_INVALID_CONTEXT:
+			oclHandles.error_str += "CL_INVALID_CONTEXT";
+			break;
+		case CL_INVALID_KERNEL_ARGS:
+			oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
+			break;
+		case CL_INVALID_WORK_DIMENSION:
+			oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
+			break;
+		case CL_INVALID_GLOBAL_WORK_SIZE:
+			oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
+			break;
+		case CL_INVALID_WORK_GROUP_SIZE:
+			oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
+			break;
+		case CL_INVALID_WORK_ITEM_SIZE:
+			oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
+			break;
+		case CL_INVALID_GLOBAL_OFFSET:
+			oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
+			break;
+		case CL_OUT_OF_RESOURCES:
+			oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+			break;
+		case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+			oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+			break;
+		case CL_INVALID_EVENT_WAIT_LIST:
+			oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
+			break;
+		case CL_OUT_OF_HOST_MEMORY:
+			oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+			break;
+		default: 
+			oclHandles.error_str += "Unkown reseason";
+			break;		
+	}
+	if(oclHandles.cl_status != CL_SUCCESS)
+		throw(oclHandles.error_str);	
+	#endif
+	//_clFinish();
+	/*oclHandles.cl_status = clWaitForEvents(1, &e[0]);
+
+	#ifdef ERRMSG
+
+        if (oclHandles.cl_status!= CL_SUCCESS)
+
+            throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
+
+	#endif*/
+}
+
+//--------------------------------------------------------
+//release OpenCL objects
+void _clFree(cl_mem ob) throw(string){
+	if(ob!=NULL)
+		oclHandles.cl_status = clReleaseMemObject(ob);	
+	#ifdef ERRMSG
+	oclHandles.error_str = "excpetion in _clFree() ->";
+	switch(oclHandles.cl_status)
+	{
+		case CL_INVALID_MEM_OBJECT:
+			oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+			break;
+		case CL_OUT_OF_RESOURCES:
+			oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+			break;
+		case CL_OUT_OF_HOST_MEMORY:
+			oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+			break;			
+		default: 
+			oclHandles.error_str += "Unkown reseason";
+			break;		
+	}        
+    if (oclHandles.cl_status!= CL_SUCCESS)
+       throw(oclHandles.error_str);
+	#endif
+}
+#endif //_CL_HELPER_
--- a/benchmarks/opencl/bfs/Makefile
+++ b/benchmarks/opencl/bfs/Makefile
@ -29,12 +29,12 @@ CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sectio

 LIBS = -lOpenCL

-PROJECT=saxpy
+PROJECT=bfs

 all: $(PROJECT).dump $(PROJECT).hex

 lib$(PROJECT).a: kernel.cl
-	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
+	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl

 $(PROJECT).elf: main.cc lib$(PROJECT).a
 	$(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf
--- a/benchmarks/opencl/bfs/kernel.cl
+++ b/benchmarks/opencl/bfs/kernel.cl
@ -0,0 +1,53 @@
+/* ============================================================
+//--cambine: kernel funtion of Breadth-First-Search
+//--author:	created by Jianbin Fang
+//--date:	06/12/2010
+============================================================ */
+
+//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+
+//Structure to hold a node information
+typedef struct{
+	int starting;
+	int no_of_edges;
+} Node;
+
+//--7 parameters
+__kernel void BFS_1( const __global Node* g_graph_nodes,
+					const __global int* g_graph_edges, 
+					__global char* g_graph_mask, 
+					__global char* g_updating_graph_mask, 
+					__global char* g_graph_visited, 
+					__global int* g_cost, 
+					const  int no_of_nodes){
+	int tid = get_global_id(0);
+	if( tid<no_of_nodes && g_graph_mask[tid]){
+		g_graph_mask[tid]=false;
+		for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++){
+			int id = g_graph_edges[i];
+			if(!g_graph_visited[id]){
+				g_cost[id]=g_cost[tid]+1;
+				g_updating_graph_mask[id]=true;
+				}
+			}
+	}	
+}
+
+//--5 parameters
+__kernel void BFS_2(__global char* g_graph_mask, 
+					__global char* g_updating_graph_mask, 
+					__global char* g_graph_visited, 
+					__global char* g_over,
+					const  int no_of_nodes
+					) {
+	int tid = get_global_id(0);
+	if( tid<no_of_nodes && g_updating_graph_mask[tid]){
+
+		g_graph_mask[tid]=true;
+		g_graph_visited[tid]=true;
+		*g_over=true;
+		g_updating_graph_mask[tid]=false;
+	}
+}
+
+
--- a/benchmarks/opencl/bfs/libbfs.a
+++ b/benchmarks/opencl/bfs/libbfs.a
--- a/benchmarks/opencl/bfs/main.cc
+++ b/benchmarks/opencl/bfs/main.cc
@ -0,0 +1,299 @@
+//--by Jianbin Fang
+
+#define __CL_ENABLE_EXCEPTIONS
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <cstring>
+
+#ifdef  PROFILING
+#include "timer.h"
+#endif
+
+#include "CLHelper.h"
+#include "util.h"
+
+#define MAX_THREADS_PER_BLOCK 256
+
+//Structure to hold a node information
+struct Node
+{
+	int starting;
+	int no_of_edges;
+};
+
+
+//----------------------------------------------------------
+//--bfs on cpu
+//--programmer:	jianbin
+//--date:	26/01/2011
+//--note: width is changed to the new_width
+//----------------------------------------------------------
+void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
+		int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
+		char *h_graph_visited, int *h_cost_ref){
+	char stop;
+	int k = 0;
+	do{
+		//if no thread changes this value then the loop stops
+		stop=false;
+		for(int tid = 0; tid < no_of_nodes; tid++ )
+		{
+			if (h_graph_mask[tid] == true){ 
+				h_graph_mask[tid]=false;
+				for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){
+					int id = h_graph_edges[i];	//--cambine: node id is connected with node tid
+					if(!h_graph_visited[id]){	//--cambine: if node id has not been visited, enter the body below
+						h_cost_ref[id]=h_cost_ref[tid]+1;
+						h_updating_graph_mask[id]=true;
+					}
+				}
+			}		
+		}
+
+  		for(int tid=0; tid< no_of_nodes ; tid++ )
+		{
+			if (h_updating_graph_mask[tid] == true){
+			h_graph_mask[tid]=true;
+			h_graph_visited[tid]=true;
+			stop=true;
+			h_updating_graph_mask[tid]=false;
+			}
+		}
+		k++;
+	}
+	while(stop);
+}
+//----------------------------------------------------------
+//--breadth first search on GPUs
+//----------------------------------------------------------
+void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
+		int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
+		char *h_graph_visited, int *h_cost) 
+					throw(std::string){
+
+	//int number_elements = height*width;
+	char h_over;
+	cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \
+			d_graph_visited, d_cost, d_over;
+	try{
+		//--1 transfer data from host to device
+		_clInit();			
+		d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
+		d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
+		d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask);
+		d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask);
+		d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited);
+
+
+		d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
+		d_over = _clMallocRW(sizeof(char), &h_over);
+		
+		_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
+		_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);	
+		_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask);	
+		_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask);	
+		_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);	
+		_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);	
+			
+		//--2 invoke kernel
+#ifdef	PROFILING
+		timer kernel_timer;
+		double kernel_time = 0.0;		
+		kernel_timer.reset();
+		kernel_timer.start();
+#endif
+		do{
+			h_over = false;
+			_clMemcpyH2D(d_over, sizeof(char), &h_over);
+			//--kernel 0
+			int kernel_id = 0;
+			int kernel_idx = 0;
+			_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
+			_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
+			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
+			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
+			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
+			_clSetArgs(kernel_id, kernel_idx++, d_cost);
+			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
+			
+			//int work_items = no_of_nodes;
+			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
+			
+			//--kernel 1
+			kernel_id = 1;
+			kernel_idx = 0;			
+			_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
+			_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
+			_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
+			_clSetArgs(kernel_id, kernel_idx++, d_over);
+			_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
+			
+			//work_items = no_of_nodes;
+			_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);			
+			
+			_clMemcpyD2H(d_over,sizeof(char), &h_over);
+			}while(h_over);
+			
+		_clFinish();
+#ifdef	PROFILING
+		kernel_timer.stop();
+		kernel_time = kernel_timer.getTimeInSeconds();
+#endif
+		//--3 transfer data from device to host
+		_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
+		//--statistics
+#ifdef	PROFILING
+		std::cout<<"kernel time(s):"<<kernel_time<<std::endl;		
+#endif
+		//--4 release cl resources.
+		_clFree(d_graph_nodes);
+		_clFree(d_graph_edges);
+		_clFree(d_graph_mask);
+		_clFree(d_updating_graph_mask);
+		_clFree(d_graph_visited);
+		_clFree(d_cost);
+		_clFree(d_over);
+		_clRelease();
+	}
+	catch(std::string msg){		
+		_clFree(d_graph_nodes);
+		_clFree(d_graph_edges);
+		_clFree(d_graph_mask);
+		_clFree(d_updating_graph_mask);
+		_clFree(d_graph_visited);
+		_clFree(d_cost);
+		_clFree(d_over);
+		_clRelease();
+		std::string e_str = "in run_transpose_gpu -> ";
+		e_str += msg;
+		throw(e_str);
+	}
+	return ;
+}
+void Usage(int argc, char**argv){
+
+fprintf(stderr,"Usage: %s <input_file>\n", argv[0]);
+
+}
+//----------------------------------------------------------
+//--cambine:	main function
+//--author:		created by Jianbin Fang
+//--date:		25/01/2011
+//----------------------------------------------------------
+int main(int argc, char * argv[])
+{
+	int no_of_nodes;
+	int edge_list_size;
+	FILE *fp;
+	Node* h_graph_nodes;
+	char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
+	try{
+		char *input_f;
+		if(argc!=2){
+		  Usage(argc, argv);
+		  exit(0);
+		}
+	
+		input_f = argv[1];
+		printf("Reading File\n");
+		//Read in Graph from a file
+		fp = fopen(input_f,"r");
+		if(!fp){
+		  printf("Error Reading graph file\n");
+		  return 0;
+		}
+
+		int source = 0;
+
+		fscanf(fp,"%d",&no_of_nodes);
+
+		int num_of_blocks = 1;
+		int num_of_threads_per_block = no_of_nodes;
+
+		//Make execution Parameters according to the number of nodes
+		//Distribute threads across multiple Blocks if necessary
+		if(no_of_nodes>MAX_THREADS_PER_BLOCK){
+			num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK); 
+			num_of_threads_per_block = MAX_THREADS_PER_BLOCK; 
+		}
+		work_group_size = num_of_threads_per_block;
+		// allocate host memory
+		h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
+		h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
+		h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
+		h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes);
+	
+		int start, edgeno;   
+		// initalize the memory
+		for(int i = 0; i < no_of_nodes; i++){
+			fscanf(fp,"%d %d",&start,&edgeno);
+			h_graph_nodes[i].starting = start;
+			h_graph_nodes[i].no_of_edges = edgeno;
+			h_graph_mask[i]=false;
+			h_updating_graph_mask[i]=false;
+			h_graph_visited[i]=false;
+		}
+		//read the source node from the file
+		fscanf(fp,"%d",&source);
+		source=0;
+		//set the source node as true in the mask
+		h_graph_mask[source]=true;
+		h_graph_visited[source]=true;
+    	fscanf(fp,"%d",&edge_list_size);
+   		int id,cost;
+		int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
+		for(int i=0; i < edge_list_size ; i++){
+			fscanf(fp,"%d",&id);
+			fscanf(fp,"%d",&cost);
+			h_graph_edges[i] = id;
+		}
+
+		if(fp)
+			fclose(fp);    
+		// allocate mem for the result on host side
+		int	*h_cost = (int*) malloc(sizeof(int)*no_of_nodes);
+		int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes);
+		for(int i=0;i<no_of_nodes;i++){
+			h_cost[i]=-1;
+			h_cost_ref[i] = -1;
+		}
+		h_cost[source]=0;
+		h_cost_ref[source]=0;		
+		//---------------------------------------------------------
+		//--gpu entry
+		run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);	
+		//---------------------------------------------------------
+		//--cpu entry
+		// initalize the memory again
+		for(int i = 0; i < no_of_nodes; i++){
+			h_graph_mask[i]=false;
+			h_updating_graph_mask[i]=false;
+			h_graph_visited[i]=false;
+		}
+		//set the source node as true in the mask
+		source=0;
+		h_graph_mask[source]=true;
+		h_graph_visited[source]=true;
+		run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref);
+		//---------------------------------------------------------
+		//--result varification
+		compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
+		//release host memory		
+		free(h_graph_nodes);
+		free(h_graph_mask);
+		free(h_updating_graph_mask);
+		free(h_graph_visited);
+
+	}
+	catch(std::string msg){
+		std::cout<<"--cambine: exception in main ->"<<msg<<std::endl;
+		//release host memory
+		free(h_graph_nodes);
+		free(h_graph_mask);
+		free(h_updating_graph_mask);
+		free(h_graph_visited);		
+	}
+		
+    return 0;
+}
--- a/benchmarks/opencl/bfs/run
+++ b/benchmarks/opencl/bfs/run
@ -0,0 +1 @@
+./bfs ../../data/bfs/graph1MW_6.txt
--- a/benchmarks/opencl/bfs/timer.cc
+++ b/benchmarks/opencl/bfs/timer.cc
@ -0,0 +1,78 @@
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+
+#include "timer.h"
+
+
+using namespace std;
+
+double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
+
+
+double timer::get_CPU_speed_in_MHz()
+{
+#if defined __linux__
+    ifstream infile("/proc/cpuinfo");
+    char     buffer[256], *colon;
+
+    while (infile.good()) {
+	infile.getline(buffer, 256);
+
+	if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
+	    return atof(colon + 2);
+    }
+#endif
+
+    return 0.0;
+}
+
+
+void timer::print_time(ostream &str, const char *which, double time) const
+{
+    static const char *units[] = { " ns", " us", " ms", "  s", " ks", 0 };
+    const char	      **unit   = units;
+
+    time = 1000.0 * time / CPU_speed_in_MHz;
+
+    while (time >= 999.5 && unit[1] != 0) {
+	time /= 1000.0;
+	++ unit;
+    }
+
+    str << which << " = " << setprecision(3) << setw(4) << time << *unit;
+}
+
+
+ostream &timer::print(ostream &str)
+{
+    str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right;
+
+    if (CPU_speed_in_MHz == 0)
+	str << "could not determine CPU speed\n";
+    else if (count > 0) {
+	double total = static_cast<double>(total_time);
+
+	print_time(str, "avg", total / static_cast<double>(count));
+	print_time(str, ", total", total);
+	str << ", count = " << setw(9) << count << '\n';
+    }
+    else
+	str << "not used\n";
+
+    return str;
+}
+
+
+ostream &operator << (ostream &str, class timer &timer)
+{
+    return timer.print(str);
+}
+
+double timer::getTimeInSeconds()
+{
+    double total = static_cast<double>(total_time);
+    double res = (total / 1000000.0) / CPU_speed_in_MHz;
+    return res;
+}
--- a/benchmarks/opencl/bfs/timer.h
+++ b/benchmarks/opencl/bfs/timer.h
@ -0,0 +1,128 @@
+#ifndef timer_h
+#define timer_h
+
+#include <iostream>
+
+
+class timer {
+    public:
+			   timer(const char *name = 0);
+			   timer(const char *name, std::ostream &write_on_exit);
+
+			   ~timer();
+
+	void		   start(), stop();
+	void		   reset();
+	std::ostream   	   &print(std::ostream &);
+
+	double             getTimeInSeconds();
+
+    private:
+	void		   print_time(std::ostream &, const char *which, double time) const;
+
+	union {
+	    long long	   total_time;
+	    struct {
+#if defined __PPC__
+		int	   high, low;
+#else
+		int	   low, high;
+#endif
+	    };
+	};
+
+	unsigned long long count;
+	const char	   *const name;
+	std::ostream	   *const write_on_exit;
+
+	static double	   CPU_speed_in_MHz, get_CPU_speed_in_MHz();
+};
+
+
+std::ostream &operator << (std::ostream &, class timer &);
+
+
+inline void timer::reset()
+{
+    total_time = 0;
+    count      = 0;
+}
+
+
+inline timer::timer(const char *name)
+:
+    name(name),
+    write_on_exit(0)
+{
+    reset();
+}
+
+
+inline timer::timer(const char *name, std::ostream &write_on_exit)
+:
+    name(name),
+    write_on_exit(&write_on_exit)
+{
+    reset();
+}
+
+
+inline timer::~timer()
+{
+    if (write_on_exit != 0)
+	print(*write_on_exit);
+}
+
+
+inline void timer::start()
+{
+#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
+    unsigned eax, edx;
+
+    asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
+
+    total_time -= ((unsigned long long) edx << 32) + eax;
+#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
+    asm volatile
+    (
+	"rdtsc\n\t"
+	"subl %%eax, %0\n\t"
+	"sbbl %%edx, %1"
+    :
+	"+m" (low), "+m" (high)
+    :
+    :
+	"eax", "edx"
+    );
+#else
+#error Compiler/Architecture not recognized
+#endif
+}
+
+
+inline void timer::stop()
+{
+#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
+    unsigned eax, edx;
+
+    asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
+
+    total_time += ((unsigned long long) edx << 32) + eax;
+#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
+    asm volatile
+    (
+	"rdtsc\n\t"
+	"addl %%eax, %0\n\t"
+	"adcl %%edx, %1"
+    :
+	"+m" (low), "+m" (high)
+    :
+    :
+	"eax", "edx"
+    );
+#endif
+
+    ++ count;
+}
+
+#endif
--- a/benchmarks/opencl/bfs/util.h
+++ b/benchmarks/opencl/bfs/util.h
@ -0,0 +1,72 @@
+#ifndef _C_UTIL_
+#define _C_UTIL_
+#include <math.h>
+#include <iostream>
+
+//-------------------------------------------------------------------
+//--initialize array with maximum limit
+//-------------------------------------------------------------------
+template<typename datatype>
+void fill(datatype *A, const int n, const datatype maxi){
+    for (int j = 0; j < n; j++) 
+    {
+        A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
+    }
+}
+
+//--print matrix
+template<typename datatype>
+void print_matrix(datatype *A, int height, int width){
+	for(int i=0; i<height; i++){
+		for(int j=0; j<width; j++){
+			int idx = i*width + j;
+			std::cout<<A[idx]<<" ";
+		}
+		std::cout<<std::endl;
+	}
+
+	return;
+}
+//-------------------------------------------------------------------
+//--verify results
+//-------------------------------------------------------------------
+#define MAX_RELATIVE_ERROR  .002
+template<typename datatype>
+void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
+
+    char passed = true; 
+#pragma omp parallel for
+    for (int i=0; i<size; i++){
+      if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
+         passed = false; 
+      }
+    }
+    if (passed){
+        std::cout << "--cambine:passed:-)" << endl;
+    }
+    else{
+        std::cout << "--cambine: failed:-(" << endl;
+    }
+    return ;
+}
+template<typename datatype>
+void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
+
+    char passed = true; 
+//#pragma omp parallel for
+    for (int i=0; i<size; i++){
+      if (cpu_results[i]!=gpu_results[i]){
+         passed = false; 
+      }
+    }
+    if (passed){
+        std::cout << "--cambine:passed:-)" << endl;
+    }
+    else{
+        std::cout << "--cambine: failed:-(" << endl;
+    }
+    return ;
+}
+
+#endif
+