mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
opencl tests GPU support
This commit is contained in:
parent
de66d2ec3e
commit
3de8075636
15 changed files with 112 additions and 57 deletions
|
@ -36,7 +36,7 @@ struct oclHandleStruct oclHandles;
|
|||
char kernel_file[100] = "Kernels.cl";
|
||||
int total_kernels = 2;
|
||||
string kernel_names[2] = {"BFS_1", "BFS_2"};
|
||||
int work_group_size = 512;
|
||||
int work_group_size = 1; // 512
|
||||
int device_id_inused = 0; // deviced id used (default : 0)
|
||||
|
||||
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
|
@ -255,13 +255,19 @@ free(allPlatforms);*/
|
|||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
std::abort();
|
||||
oclHandles.program = clCreateProgramWithSource(
|
||||
oclHandles.context, 1, (const char**)&kernel_bin, &kernel_size, &resultCL);
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
std::abort();
|
||||
|
||||
oclHandles.program = clCreateProgramWithBinary(
|
||||
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
|
||||
oclHandles.program = clCreateProgramWithBinary(
|
||||
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
|
||||
#endif
|
||||
free(kernel_bin);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
|
||||
throw(string("InitCL()::Error: Loading Binary into cl_program. "
|
||||
"(clCreateProgramWithBinary)"));
|
||||
|
|
|
@ -182,7 +182,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
|
|||
int main(int argc, char *argv[]) {
|
||||
printf("enter demo main\n");
|
||||
|
||||
int errors = 0;
|
||||
int errors = -1;
|
||||
int no_of_nodes;
|
||||
int edge_list_size;
|
||||
FILE *fp;
|
||||
|
@ -289,7 +289,6 @@ int main(int argc, char *argv[]) {
|
|||
free(h_graph_mask);
|
||||
free(h_updating_graph_mask);
|
||||
free(h_graph_visited);
|
||||
|
||||
} catch (std::string msg) {
|
||||
printf("--cambine: exception in main ->%s\n", msg);
|
||||
// release host memory
|
||||
|
|
|
@ -48,17 +48,21 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
|
|||
shrCheckError(cBlackScholes != NULL, shrTRUE);*/
|
||||
|
||||
shrLog("...creating BlackScholes program\n");
|
||||
//cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
cl_device_id device_id = oclGetFirstDev(cxGPUContext);
|
||||
cpBlackScholes = clCreateProgramWithBinary(
|
||||
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
cl_device_id device_id = oclGetFirstDev(cxGPUContext);
|
||||
#ifdef HOSTGPU
|
||||
ciErrNum = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernel_bin, &kernel_size, &ciErrNum);
|
||||
#else
|
||||
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
cpBlackScholes = clCreateProgramWithBinary(
|
||||
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
|
||||
#endif
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
shrLog("...building BlackScholes program\n");
|
||||
ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);
|
||||
|
||||
|
@ -144,8 +148,8 @@ extern "C" void BlackScholes(
|
|||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Run the kernel
|
||||
size_t globalWorkSize = 16;//60 * 1024;
|
||||
size_t localWorkSize = 16;//128;
|
||||
size_t globalWorkSize = 128;//60 * 1024;
|
||||
size_t localWorkSize = 1;//128;
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
|
|
|
@ -54,7 +54,8 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
OBJS := $(addsuffix .o, $(filter-out main.cc,$(notdir $(SRCS))))
|
||||
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
|
||||
OBJS_HOST := $(addsuffix .host.o, $(notdir $(SRCS)))
|
||||
|
||||
.DEFAULT_GOAL := all
|
||||
all: $(PROJECT) kernel.pocl
|
||||
|
@ -74,20 +75,23 @@ kernel.pocl: $(SRC_DIR)/kernel.cl
|
|||
%.c.o: $(SRC_DIR)/%.c
|
||||
$(CC) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
main.cc.o: $(SRC_DIR)/main.cc
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
main.cc.host.o: $(SRC_DIR)/main.cc
|
||||
%.cc.host.o: $(SRC_DIR)/%.cc
|
||||
$(CXX) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
|
||||
|
||||
%.cpp.host.o: $(SRC_DIR)/%.cpp
|
||||
$(CXX) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
|
||||
|
||||
%.c.host.o: $(SRC_DIR)/%.c
|
||||
$(CC) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
|
||||
|
||||
ifndef USE_SETUP
|
||||
setup:
|
||||
endif
|
||||
|
||||
$(PROJECT): setup main.cc.o $(OBJS)
|
||||
$(PROJECT): setup $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out setup, $^) $(LDFLAGS) -L$(ROOT_DIR)/runtime/stub -lvortex -L$(POCL_RT_PATH)/lib -lOpenCL -o $@
|
||||
|
||||
$(PROJECT).host: setup main.cc.host.o $(OBJS)
|
||||
$(PROJECT).host: setup $(OBJS_HOST)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out setup, $^) $(LDFLAGS) -lOpenCL -o $@
|
||||
|
||||
run-gpu: $(PROJECT).host kernel.cl
|
||||
|
|
|
@ -247,7 +247,7 @@ int main (int argc, char **argv) {
|
|||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
printf("FAILED! - %d errors\n", errors);
|
||||
printf("FAILED! - %d errors\n", errors);
|
||||
} else {
|
||||
printf("PASSED!\n");
|
||||
}
|
||||
|
|
|
@ -120,7 +120,7 @@ int main(int argc, char **argv)
|
|||
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
|
||||
|
||||
// set and log Global and Local work size dimensions
|
||||
szLocalWorkSize = 16;
|
||||
szLocalWorkSize = 1; // 16
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
|
||||
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
|
||||
|
|
|
@ -410,6 +410,10 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
cl_errChk(status, "Oops!", true);
|
||||
context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
|
||||
char device_string[1024];
|
||||
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
printf("Using device: %s\n", device_string);
|
||||
|
||||
device=devices[device_touse];
|
||||
|
||||
|
@ -876,13 +880,19 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
|
|||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
|
||||
cl_program clProgramReturn;
|
||||
|
||||
// Create the program object
|
||||
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
|
||||
cl_program clProgramReturn = clCreateProgramWithBinary(
|
||||
#ifdef HOSTGPU
|
||||
int err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, &kernel_size, &status);
|
||||
#else
|
||||
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
clProgramReturn = clCreateProgramWithBinary(
|
||||
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
|
||||
#endif
|
||||
free(kernel_bin);
|
||||
cl_errChk(status, "Creating program", true);
|
||||
|
||||
|
|
|
@ -172,6 +172,6 @@ float** kmeans_clustering(float **feature, /* in: [npoints][nfeatures] */
|
|||
free(new_centers_len);
|
||||
free(initial);
|
||||
|
||||
return clusters;
|
||||
return clusters;
|
||||
}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ double gettime() {
|
|||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE 256
|
||||
#define BLOCK_SIZE 1 //256
|
||||
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_1_0
|
||||
|
@ -45,7 +45,7 @@ double gettime() {
|
|||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE2 RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE2 256
|
||||
#define BLOCK_SIZE2 1 //256
|
||||
#endif
|
||||
|
||||
// local variables
|
||||
|
@ -200,18 +200,28 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
|||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
cl_program prog;
|
||||
#ifdef HOSTGPU
|
||||
err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: read_kernel_file() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
prog = clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &err);
|
||||
#else
|
||||
err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: read_kernel_file() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
cl_program prog = clCreateProgramWithBinary(
|
||||
prog = clCreateProgramWithBinary(
|
||||
context, 1, device_list, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateProgramWithBinary() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
free(kernel_bin);
|
||||
|
||||
|
|
|
@ -259,7 +259,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
|
|||
return 0;
|
||||
}
|
||||
|
||||
|
||||
cl_context cl_init_context(int platform, int dev,int quiet) {
|
||||
int printInfo=1;
|
||||
if (platform >= 0 && dev >= 0) printInfo = 0;
|
||||
|
@ -410,6 +409,10 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
cl_errChk(status, "Oops!", true);
|
||||
context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
|
||||
char device_string[1024];
|
||||
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
printf("Using device: %s\n", device_string);
|
||||
|
||||
device=devices[device_touse];
|
||||
|
||||
|
@ -876,13 +879,19 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
|
|||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
|
||||
cl_program clProgramReturn;
|
||||
|
||||
// Create the program object
|
||||
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
|
||||
cl_program clProgramReturn = clCreateProgramWithBinary(
|
||||
#ifdef HOSTGPU
|
||||
int err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, &kernel_size, &status);
|
||||
#else
|
||||
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
clProgramReturn = clCreateProgramWithBinary(
|
||||
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
|
||||
#endif
|
||||
free(kernel_bin);
|
||||
cl_errChk(status, "Creating program", true);
|
||||
|
||||
|
|
|
@ -139,14 +139,18 @@ static void parse_args(int argc, char **argv) {
|
|||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
printf("Workload size=%d\n", size);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
printf("input size=%d\n", size);
|
||||
if (size < 3) {
|
||||
printf("Error: input size must be >= 3\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
cl_platform_id platform_id;
|
||||
cl_device_id device_id;
|
||||
cl_program program;
|
||||
|
@ -223,9 +227,9 @@ int main(int argc, char **argv) {
|
|||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(m7), (&m7)));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(m8), (&m8)));
|
||||
|
||||
size_t global_offset[2] = {1, 1};
|
||||
size_t global_offset[2] = {1, 1};
|
||||
size_t global_work_size[2] = {size - 2, size - 2};
|
||||
size_t local_work_size[2] = {size - 2, 1};
|
||||
size_t local_work_size[2] = {1, 1}; // {size-2,1}
|
||||
|
||||
printf("enqueue write buffer\n");
|
||||
std::vector<float> ref_vec(size * size);
|
||||
|
|
|
@ -299,11 +299,10 @@ int main(int argc, char **argv) {
|
|||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
|
||||
size_t grid;
|
||||
size_t block;
|
||||
|
||||
compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
|
||||
clDeviceProp.minor, clDeviceProp.multiProcessorCount);
|
||||
size_t grid = nzcnt_len * pad;
|
||||
size_t block = 1;
|
||||
/*compute_active_thread(&block, &grid, nzcnt_len, pad, clDeviceProp.major,
|
||||
clDeviceProp.minor, clDeviceProp.multiProcessorCount);*/
|
||||
printf("grid size=%ld, block size=%ld, dim=%d\n", grid, block, dim);
|
||||
|
||||
clStatus = clSetKernelArg(clKernel, 0, sizeof(cl_mem), &d_Ax_vector);
|
||||
|
|
|
@ -270,8 +270,8 @@ int main(int argc, char** argv) {
|
|||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
|
||||
//only use 1D thread block
|
||||
int tx = 128;
|
||||
size_t block[3] = {tx,1,1};
|
||||
int tx = 128;
|
||||
size_t block[3] = {1,1,1}; // {tx,1,1}
|
||||
size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
|
||||
//size_t grid[3] = {nx-2,ny-2,nz-2};
|
||||
size_t offset[3] = {1,1,1};
|
||||
|
|
|
@ -1363,7 +1363,7 @@ void pb_sig_clmem(char* s, cl_command_queue command_queue, cl_mem memobj, int ty
|
|||
printf ("Something wrong.\n");
|
||||
assert(0);
|
||||
} else {
|
||||
printf ("size = %d\n", sz);
|
||||
printf ("size = %ld\n", sz);
|
||||
}
|
||||
char* hp; // = (char*) malloc(sz);
|
||||
//posix_memalign((void**)&hp, 64, sz);
|
||||
|
|
|
@ -193,6 +193,16 @@ int runTest( const int argc, const char** argv)
|
|||
size_y = temp;
|
||||
}
|
||||
|
||||
if ((size_x / BLOCK_DIM) * BLOCK_DIM != size_x) {
|
||||
printf("Error: size_x must be a multiple of %d\n", BLOCK_DIM);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ((size_y / BLOCK_DIM) * BLOCK_DIM != size_y) {
|
||||
printf("Error: size_y must be a multiple of %d\n", BLOCK_DIM);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// size of memory required to store the matrix
|
||||
const size_t mem_size = sizeof(float) * size_x * size_y;
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue