k1x-gpu-test/openCLDemo/add_demo.c
2024-10-08 11:14:06 +08:00

299 lines
9.9 KiB
C

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <sys/time.h>
#include <CL/cl.h>
// OpenCL kernel to perform an element-wise addition
#if 1
const char *programSource =
"__kernel \n"
"void vec_add(__global float *A, \n"
" __global float *B, \n"
" __global float *C) \n"
"{ \n"
" // Get the work-item's unique ID \n"
" int idx = (get_global_id(0) << 2) & (16384 - 1); \n"
" float4 vector_a = vload4(0, A + idx); \n"
" float4 vector_b = vload4(0, B + idx); \n"
" float4 vector_c = vector_a + vector_b; \n"
" \n"
" // Add the corresponding locations of \n"
" // 'A' and 'B', and store the reasult in 'C' \n"
" vstore4(vector_c, 0, C + idx); \n"
"} \n";
#else
// OpenCL kernel to perform an element-wise addition
const char *programSource =
"__kernel \n"
"void vec_add(__global float *A, \n"
" __global float *B, \n"
" __global float *C) \n"
"{ \n"
" // Get the work-item's unique ID \n"
" int idx = get_global_id(0); \n"
" \n"
" // Add the corresponding locations of \n"
" // 'A' and 'B', and store the reasult in 'C' \n"
" C[idx] = A[idx] + B[idx]; \n"
"} \n";
#endif
// Choose OpenCL platform and create a context
cl_context CreateContext()
{
/* 1. get platform information */
cl_uint num_platforms;
cl_platform_id first_platform_id;
cl_int err_num;
// get num_platforms and platforms ID
err_num = clGetPlatformIDs(1, &first_platform_id, &num_platforms);
if (err_num != CL_SUCCESS || num_platforms <= 0)
{
fprintf(stderr, "Failed to get number of platforms: %d\n", err_num);
return NULL;
}
/* 2. create context */
cl_context context = NULL;
cl_context_properties context_prop[] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)first_platform_id,
0};
context = clCreateContextFromType(context_prop, CL_DEVICE_TYPE_ALL, NULL, NULL, &err_num);
if (err_num != CL_SUCCESS)
{
fprintf(stderr, "Failed to create context: %d\n", err_num);
return NULL;
}
return context;
}
/* Create command queue */
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
{
cl_int err_num;
cl_device_id *devices_arr;
cl_command_queue command_queue = NULL;
size_t deviceBufferSize = -1;
// Get device buffer size
err_num = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
if (deviceBufferSize <= 0)
{
fprintf(stderr, "No devices available: %d\n", err_num);
return NULL;
}
// Allocate cache space for devices
devices_arr = malloc(deviceBufferSize / sizeof(cl_device_id));
err_num = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices_arr, NULL);
// Select the first available device
command_queue = clCreateCommandQueueWithProperties(context, devices_arr[0], 0, NULL);
*device = devices_arr[0];
free(devices_arr);
return command_queue;
}
// Create and build program objects
cl_program CreateProgram(cl_context context, cl_device_id device, const char *srcStr)
{
cl_int err_num;
cl_program program;
program = clCreateProgramWithSource(context, 1, (const char **)&srcStr, NULL, &err_num);
if (err_num != CL_SUCCESS)
{
fprintf(stderr, "Failed to create program: %d\n", err_num);
return NULL;
}
// err_num = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
err_num = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (err_num != CL_SUCCESS)
{
fprintf(stderr, "Failed to build program: %d\n", err_num);
return NULL;
}
return program;
}
// Create memory object
bool CreateMemObjects(cl_context context, cl_mem mem_objects[3], float *a, float *b, size_t datasize)
{
mem_objects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
datasize, a, NULL);
mem_objects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
datasize, b, NULL);
mem_objects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
datasize, NULL, NULL);
return true;
}
// Clean up
void Cleanup(cl_context context, cl_command_queue command_queue,
cl_program program, cl_kernel kernel, cl_mem mem_objects[3])
{
for (int i = 0; i < 3; i++)
{
if (mem_objects[i] != 0)
clReleaseMemObject(mem_objects[i]);
}
if (command_queue != 0)
clReleaseCommandQueue(command_queue);
if (kernel != 0)
clReleaseKernel(kernel);
if (program != 0)
clReleaseProgram(program);
if (context != 0)
clReleaseContext(context);
}
int main(void)
{
struct timeval tv_start, tv_end;
uint64_t time_start, time_end;
cl_int err_num = 0;
cl_context context = 0;
cl_command_queue command_queue = 0;
cl_program program = 0;
cl_device_id device = 0;
cl_kernel kernel = 0;
cl_mem mem_objects[3] = {0, 0, 0};
/* 1. Choose OpenCL platform and create a context */
context = CreateContext();
fprintf(stderr, "Create context success.\n");
/* 2. Create command queue */
command_queue = CreateCommandQueue(context, &device);
fprintf(stderr, "Create command queue success.\n");
/* 3. Create and build program objects */
program = CreateProgram(context, device, programSource);
fprintf(stderr, "Create program success.\n");
/* 4. create kernel */
kernel = clCreateKernel(program, "vec_add", &err_num);
if (err_num != CL_SUCCESS)
{
fprintf(stderr, "Failed to create kernel: %d\n", err_num);
return -1;
}
/* 5. set input data && create memory object */
// Elements in each array
const int elements = 102400000;
// Compute the size of the data
size_t datasize = sizeof(float) * (16384);
// Allocate space for input/output host data
float *A = (float *)malloc(datasize); // Input array
float *B = (float *)malloc(datasize); // Input array
float *C = (float *)malloc(datasize); // Output array
// Initialize the input data
for (int i = 0; i < (16384); i++)
{
A[i] = (float)i;
B[i] = 3.0 * (float)i;
}
// Create MemObjects
if (!CreateMemObjects(context, mem_objects, A, B, datasize))
{
Cleanup(context, command_queue, program, kernel, mem_objects);
return 1;
}
// Write data from the input arrays to the buffers
err_num = clEnqueueWriteBuffer(command_queue, mem_objects[0], CL_FALSE, 0, datasize, A, 0, NULL, NULL);
err_num = clEnqueueWriteBuffer(command_queue, mem_objects[1], CL_FALSE, 0, datasize, B, 0, NULL, NULL);
/* 6. set kernel argument */
err_num = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_objects[0]);
if (err_num != CL_SUCCESS)
{
fprintf(stderr, "Failed to set kernel argument 0: %d\n", err_num);
Cleanup(context, command_queue, program, kernel, mem_objects);
}
err_num = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_objects[1]);
if (err_num != CL_SUCCESS)
{
fprintf(stderr, "Failed to set kernel argument 1: %d\n", err_num);
Cleanup(context, command_queue, program, kernel, mem_objects);
}
err_num = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_objects[2]);
if (err_num != CL_SUCCESS)
{
fprintf(stderr, "Failed to set kernel argument 2: %d\n", err_num);
Cleanup(context, command_queue, program, kernel, mem_objects);
}
gettimeofday(&tv_start, NULL);
/* 7. send kernel to execute */
size_t globalWorkSize[] = {elements / 4, 1, 1};
size_t localWorkSize[] = {32, 1, 1};
int num_loops = 1;
for (int batch = 0; batch < num_loops; batch += 1)
{
err_num = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalWorkSize,
localWorkSize, 0, NULL, NULL);
if (err_num != CL_SUCCESS)
{
fprintf(stderr, "Failed to enqueue kernel: %d\n", err_num);
Cleanup(context, command_queue, program, kernel, mem_objects);
}
}
#if 1
/* 8. read data from output */
err_num = clEnqueueReadBuffer(command_queue, mem_objects[2], CL_TRUE, 0, datasize, C, 0, NULL, NULL);
if (err_num != CL_SUCCESS)
{
fprintf(stderr, "Failed to read buffer: %d\n", err_num);
Cleanup(context, command_queue, program, kernel, mem_objects);
}
#endif
clFinish(command_queue);
gettimeofday(&tv_end, NULL);
#if 1
for (int i = 0; i < 16384; i++)
{
printf("%f ", C[i]);
}
printf("\n");
#endif
long int start_time_us = tv_start.tv_sec * 1000000L + tv_start.tv_usec;
long int end_time_us = tv_end.tv_sec * 1000000L + tv_end.tv_usec;
long int diff_time_us = end_time_us - start_time_us;
printf("Start time: %ld sec, %ld usec\n", tv_start.tv_sec, tv_start.tv_usec);
printf("End time: %ld sec, %ld usec\n", tv_end.tv_sec, tv_end.tv_usec);
printf("Calculate time for %d addition operations: %ld us\n", elements * num_loops, diff_time_us);
/* 9. clean up */
free(A);
free(B);
free(C);
Cleanup(context, command_queue, program, kernel, mem_objects);
return 0;
}