Replaced the NULL local work size passed to clEnqueueNDRangeKernel with an explicit local work size, so that the OpenCL runtime no longer automatically moves the global group into local groups, which is inefficient on Vortex.

This commit is contained in:
Blaise Tine 2024-03-04 20:30:32 -08:00
parent de8453d0be
commit 274e6a4c52
4 changed files with 13 additions and 12 deletions

View file

@ -189,6 +189,7 @@ int main (int argc, char **argv) {
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
size_t global_size[2] = {size, size};
size_t local_size[2] = {1, 1};
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj));
@ -225,7 +226,7 @@ int main (int argc, char **argv) {
printf("Execute the kernel\n");
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL));
CL_CHECK(clFinish(commandQueue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();

View file

@ -155,12 +155,10 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
// 3. Determine block sizes
size_t globalWorksizeFan1[1];
size_t globalWorksizeFan2[2];
globalWorksizeFan1[0] = size;
globalWorksizeFan2[0] = size;
globalWorksizeFan2[1] = size;
size_t globalWorksizeFan1[1] = {size};
size_t globalWorksizeFan2[2] = {size, size};
size_t localWorksizeFan1[1] = {1};
size_t localWorksizeFan2[2] = {1, 1};
int t;
// 4. Setup and Run kernels
@ -178,7 +176,7 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
// launch kernel
error =
clEnqueueNDRangeKernel(command_queue, fan1_kernel, 1, 0,
globalWorksizeFan1, NULL, 0, NULL, &kernelEvent);
globalWorksizeFan1, localWorksizeFan1, 0, NULL, &kernelEvent);
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
if (timing) {
@ -202,7 +200,7 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
// launch kernel
error =
clEnqueueNDRangeKernel(command_queue, fan2_kernel, 2, 0,
globalWorksizeFan2, NULL, 0, NULL, &kernelEvent);
globalWorksizeFan2, localWorksizeFan2, 0, NULL, &kernelEvent);
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
if (timing) {

View file

@ -107,13 +107,14 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
// 4. enqueue kernel
size_t globalWorkSize[1];
size_t localWorkSize[1] = {1};
globalWorkSize[0] = numRecords;
if (numRecords % 64)
globalWorkSize[0] += 64 - (numRecords % 64);
// printf("Global Work Size: %zu\n",globalWorkSize[0]);
error = clEnqueueNDRangeKernel(command_queue, NN_kernel, 1, 0, globalWorkSize,
NULL, 0, NULL, &kernelEvent);
localWorkSize, 0, NULL, &kernelEvent);
cl_errChk(error, "ERROR in Executing Kernel NearestNeighbor", true);

View file

@ -179,9 +179,10 @@ int main (int argc, char **argv) {
CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b, 0, NULL, NULL));
printf("Execute the kernel\n");
size_t global_work_size[1] = {size};
size_t global_work_size[1] = {size};
size_t local_work_size[1] = {1};
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
CL_CHECK(clFinish(commandQueue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();