mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
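Replaced the NULL local work size passed to clEnqueueNDRangeKernel with an explicit local work size of 1 per dimension, so that the OpenCL runtime no longer picks a work-group size automatically and folds global work-items into local groups, which is inefficient on Vortex.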
This commit is contained in:
parent
de8453d0be
commit
274e6a4c52
4 changed files with 13 additions and 12 deletions
|
@ -189,6 +189,7 @@ int main (int argc, char **argv) {
|
|||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
size_t global_size[2] = {size, size};
|
||||
size_t local_size[2] = {1, 1};
|
||||
|
||||
// Set kernel arguments
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj));
|
||||
|
@ -225,7 +226,7 @@ int main (int argc, char **argv) {
|
|||
|
||||
printf("Execute the kernel\n");
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(commandQueue));
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
|
|
|
@ -155,12 +155,10 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
|
|||
writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
|
||||
|
||||
// 3. Determine block sizes
|
||||
size_t globalWorksizeFan1[1];
|
||||
size_t globalWorksizeFan2[2];
|
||||
|
||||
globalWorksizeFan1[0] = size;
|
||||
globalWorksizeFan2[0] = size;
|
||||
globalWorksizeFan2[1] = size;
|
||||
size_t globalWorksizeFan1[1] = {size};
|
||||
size_t globalWorksizeFan2[2] = {size, size};
|
||||
size_t localWorksizeFan1[1] = {1};
|
||||
size_t localWorksizeFan2[2] = {1, 1};
|
||||
|
||||
int t;
|
||||
// 4. Setup and Run kernels
|
||||
|
@ -178,7 +176,7 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
|
|||
// launch kernel
|
||||
error =
|
||||
clEnqueueNDRangeKernel(command_queue, fan1_kernel, 1, 0,
|
||||
globalWorksizeFan1, NULL, 0, NULL, &kernelEvent);
|
||||
globalWorksizeFan1, localWorksizeFan1, 0, NULL, &kernelEvent);
|
||||
|
||||
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
|
||||
if (timing) {
|
||||
|
@ -202,7 +200,7 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
|
|||
// launch kernel
|
||||
error =
|
||||
clEnqueueNDRangeKernel(command_queue, fan2_kernel, 2, 0,
|
||||
globalWorksizeFan2, NULL, 0, NULL, &kernelEvent);
|
||||
globalWorksizeFan2, localWorksizeFan2, 0, NULL, &kernelEvent);
|
||||
|
||||
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
|
||||
if (timing) {
|
||||
|
|
|
@ -107,13 +107,14 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
|
|||
|
||||
// 4. enqueue kernel
|
||||
size_t globalWorkSize[1];
|
||||
size_t localWorkSize[1] = {1};
|
||||
globalWorkSize[0] = numRecords;
|
||||
if (numRecords % 64)
|
||||
globalWorkSize[0] += 64 - (numRecords % 64);
|
||||
// printf("Global Work Size: %zu\n",globalWorkSize[0]);
|
||||
|
||||
error = clEnqueueNDRangeKernel(command_queue, NN_kernel, 1, 0, globalWorkSize,
|
||||
NULL, 0, NULL, &kernelEvent);
|
||||
localWorkSize, 0, NULL, &kernelEvent);
|
||||
|
||||
cl_errChk(error, "ERROR in Executing Kernel NearestNeighbor", true);
|
||||
|
||||
|
|
|
@ -179,9 +179,10 @@ int main (int argc, char **argv) {
|
|||
CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b, 0, NULL, NULL));
|
||||
|
||||
printf("Execute the kernel\n");
|
||||
size_t global_work_size[1] = {size};
|
||||
size_t global_work_size[1] = {size};
|
||||
size_t local_work_size[1] = {1};
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(commandQueue));
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue