opencl tests GPU support

This commit is contained in:
Blaise Tine 2024-04-09 18:20:24 -07:00
parent de66d2ec3e
commit 3de8075636
15 changed files with 112 additions and 57 deletions

View file

@ -36,7 +36,7 @@ struct oclHandleStruct oclHandles;
char kernel_file[100] = "Kernels.cl";
int total_kernels = 2;
string kernel_names[2] = {"BFS_1", "BFS_2"};
int work_group_size = 512;
int work_group_size = 1; // 512
int device_id_inused = 0; // deviced id used (default : 0)
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
@ -255,13 +255,19 @@ free(allPlatforms);*/
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
std::abort();
oclHandles.program = clCreateProgramWithSource(
oclHandles.context, 1, (const char**)&kernel_bin, &kernel_size, &resultCL);
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
std::abort();
oclHandles.program = clCreateProgramWithBinary(
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
oclHandles.program = clCreateProgramWithBinary(
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
#endif
free(kernel_bin);
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. "
"(clCreateProgramWithBinary)"));

View file

@ -182,7 +182,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int main(int argc, char *argv[]) {
printf("enter demo main\n");
int errors = 0;
int errors = -1;
int no_of_nodes;
int edge_list_size;
FILE *fp;
@ -289,7 +289,6 @@ int main(int argc, char *argv[]) {
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
} catch (std::string msg) {
printf("--cambine: exception in main ->%s\n", msg);
// release host memory

View file

@ -48,17 +48,21 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
shrCheckError(cBlackScholes != NULL, shrTRUE);*/
shrLog("...creating BlackScholes program\n");
//cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
shrCheckError(ciErrNum, CL_SUCCESS);
cl_device_id device_id = oclGetFirstDev(cxGPUContext);
cpBlackScholes = clCreateProgramWithBinary(
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
shrCheckError(ciErrNum, CL_SUCCESS);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
cl_device_id device_id = oclGetFirstDev(cxGPUContext);
#ifdef HOSTGPU
ciErrNum = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
shrCheckError(ciErrNum, CL_SUCCESS);
cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernel_bin, &kernel_size, &ciErrNum);
#else
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
shrCheckError(ciErrNum, CL_SUCCESS);
cpBlackScholes = clCreateProgramWithBinary(
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
#endif
shrCheckError(ciErrNum, CL_SUCCESS);
shrLog("...building BlackScholes program\n");
ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);
@ -144,8 +148,8 @@ extern "C" void BlackScholes(
shrCheckError(ciErrNum, CL_SUCCESS);
//Run the kernel
size_t globalWorkSize = 16;//60 * 1024;
size_t localWorkSize = 16;//128;
size_t globalWorkSize = 128;//60 * 1024;
size_t localWorkSize = 1;//128;
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
}

View file

@ -54,7 +54,8 @@ endif
endif
endif
OBJS := $(addsuffix .o, $(filter-out main.cc,$(notdir $(SRCS))))
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
OBJS_HOST := $(addsuffix .host.o, $(notdir $(SRCS)))
.DEFAULT_GOAL := all
all: $(PROJECT) kernel.pocl
@ -74,20 +75,23 @@ kernel.pocl: $(SRC_DIR)/kernel.cl
%.c.o: $(SRC_DIR)/%.c
$(CC) $(CXXFLAGS) -c $< -o $@
main.cc.o: $(SRC_DIR)/main.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
main.cc.host.o: $(SRC_DIR)/main.cc
%.cc.host.o: $(SRC_DIR)/%.cc
$(CXX) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
%.cpp.host.o: $(SRC_DIR)/%.cpp
$(CXX) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
%.c.host.o: $(SRC_DIR)/%.c
$(CC) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
ifndef USE_SETUP
setup:
endif
$(PROJECT): setup main.cc.o $(OBJS)
$(PROJECT): setup $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out setup, $^) $(LDFLAGS) -L$(ROOT_DIR)/runtime/stub -lvortex -L$(POCL_RT_PATH)/lib -lOpenCL -o $@
$(PROJECT).host: setup main.cc.host.o $(OBJS)
$(PROJECT).host: setup $(OBJS_HOST)
$(CXX) $(CXXFLAGS) $(filter-out setup, $^) $(LDFLAGS) -lOpenCL -o $@
run-gpu: $(PROJECT).host kernel.cl

View file

@ -247,7 +247,7 @@ int main (int argc, char **argv) {
}
}
if (errors != 0) {
printf("FAILED! - %d errors\n", errors);
printf("FAILED! - %d errors\n", errors);
} else {
printf("PASSED!\n");
}

View file

@ -120,7 +120,7 @@ int main(int argc, char **argv)
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
// set and log Global and Local work size dimensions
szLocalWorkSize = 16;
szLocalWorkSize = 1; // 16
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

View file

@ -410,6 +410,10 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
cl_errChk(status, "Oops!", true);
context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status);
cl_errChk(status, "Oops!", true);
char device_string[1024];
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
printf("Using device: %s\n", device_string);
device=devices[device_touse];
@ -876,13 +880,19 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
cl_program clProgramReturn;
// Create the program object
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
cl_program clProgramReturn = clCreateProgramWithBinary(
#ifdef HOSTGPU
int err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, &kernel_size, &status);
#else
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
clProgramReturn = clCreateProgramWithBinary(
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
#endif
free(kernel_bin);
cl_errChk(status, "Creating program", true);

View file

@ -172,6 +172,6 @@ float** kmeans_clustering(float **feature, /* in: [npoints][nfeatures] */
free(new_centers_len);
free(initial);
return clusters;
return clusters;
}

View file

@ -35,7 +35,7 @@ double gettime() {
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 256
#define BLOCK_SIZE 1 //256
#endif
#ifdef RD_WG_SIZE_1_0
@ -45,7 +45,7 @@ double gettime() {
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE2 RD_WG_SIZE
#else
#define BLOCK_SIZE2 256
#define BLOCK_SIZE2 1 //256
#endif
// local variables
@ -200,18 +200,28 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
cl_program prog;
#ifdef HOSTGPU
err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
if (err != CL_SUCCESS) {
printf("ERROR: read_kernel_file() => %d\n", err);
return -1;
}
prog = clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &err);
#else
err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
if (err != CL_SUCCESS) {
printf("ERROR: read_kernel_file() => %d\n", err);
return -1;
}
cl_program prog = clCreateProgramWithBinary(
prog = clCreateProgramWithBinary(
context, 1, device_list, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateProgramWithBinary() => %d\n", err);
return -1;
}
#endif
free(kernel_bin);

View file

@ -259,7 +259,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
return 0;
}
cl_context cl_init_context(int platform, int dev,int quiet) {
int printInfo=1;
if (platform >= 0 && dev >= 0) printInfo = 0;
@ -410,6 +409,10 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
cl_errChk(status, "Oops!", true);
context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status);
cl_errChk(status, "Oops!", true);
char device_string[1024];
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
printf("Using device: %s\n", device_string);
device=devices[device_touse];
@ -876,13 +879,19 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
cl_program clProgramReturn;
// Create the program object
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
cl_program clProgramReturn = clCreateProgramWithBinary(
#ifdef HOSTGPU
int err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, &kernel_size, &status);
#else
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
clProgramReturn = clCreateProgramWithBinary(
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
#endif
free(kernel_bin);
cl_errChk(status, "Creating program", true);

View file

@ -139,14 +139,18 @@ static void parse_args(int argc, char **argv) {
exit(-1);
}
}
printf("Workload size=%d\n", size);
}
int main(int argc, char **argv) {
// parse command arguments
parse_args(argc, argv);
printf("input size=%d\n", size);
if (size < 3) {
printf("Error: input size must be >= 3\n");
return -1;
}
cl_platform_id platform_id;
cl_device_id device_id;
cl_program program;
@ -223,9 +227,9 @@ int main(int argc, char **argv) {
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(m7), (&m7)));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(m8), (&m8)));
size_t global_offset[2] = {1, 1};
size_t global_offset[2] = {1, 1};
size_t global_work_size[2] = {size - 2, size - 2};
size_t local_work_size[2] = {size - 2, 1};
size_t local_work_size[2] = {1, 1}; // {size-2,1}
printf("enqueue write buffer\n");
std::vector<float> ref_vec(size * size);

View file

@ -299,11 +299,10 @@ int main(int argc, char **argv) {
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
size_t grid;
size_t block;
compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
clDeviceProp.minor, clDeviceProp.multiProcessorCount);
size_t grid = nzcnt_len * pad;
size_t block = 1;
/*compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
clDeviceProp.minor, clDeviceProp.multiProcessorCount);*/
printf("grid size=%ld, block size=%ld, dim=%d\n", grid, block, dim);
clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);

View file

@ -270,8 +270,8 @@ int main(int argc, char** argv) {
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
//only use 1D thread block
int tx = 128;
size_t block[3] = {tx,1,1};
int tx = 128;
size_t block[3] = {1,1,1}; // {tx,1,1}
size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
//size_t grid[3] = {nx-2,ny-2,nz-2};
size_t offset[3] = {1,1,1};

View file

@ -1363,7 +1363,7 @@ void pb_sig_clmem(char* s, cl_command_queue command_queue, cl_mem memobj, int ty
printf ("Something wrong.\n");
assert(0);
} else {
printf ("size = %d\n", sz);
printf ("size = %ld\n", sz);
}
char* hp; // = (char*) malloc(sz);
//posix_memalign((void**)&hp, 64, sz);

View file

@ -193,6 +193,16 @@ int runTest( const int argc, const char** argv)
size_y = temp;
}
if ((size_x / BLOCK_DIM) * BLOCK_DIM != size_x) {
printf("Error: size_x must be a multiple of %d\n", BLOCK_DIM);
return -1;
}
if ((size_y / BLOCK_DIM) * BLOCK_DIM != size_y) {
printf("Error: size_y must be a multiple of %d\n", BLOCK_DIM);
return -1;
}
// size of memory required to store the matrix
const size_t mem_size = sizeof(float) * size_x * size_y;