mirror of
https://gitee.com/bianbu-linux/k1x-gpu-test
synced 2025-04-24 05:57:27 -04:00
299 lines
9.9 KiB
C
299 lines
9.9 KiB
C
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <stdbool.h>
|
|
#include <sys/time.h>
|
|
#include <CL/cl.h>
|
|
|
|
// OpenCL kernel to perform an element-wise addition
|
|
#if 1
|
|
const char *programSource =
|
|
"__kernel \n"
|
|
"void vec_add(__global float *A, \n"
|
|
" __global float *B, \n"
|
|
" __global float *C) \n"
|
|
"{ \n"
|
|
" // Get the work-item's unique ID \n"
|
|
" int idx = (get_global_id(0) << 2) & (16384 - 1); \n"
|
|
" float4 vector_a = vload4(0, A + idx); \n"
|
|
" float4 vector_b = vload4(0, B + idx); \n"
|
|
" float4 vector_c = vector_a + vector_b; \n"
|
|
" \n"
|
|
" // Add the corresponding locations of \n"
|
|
" // 'A' and 'B', and store the reasult in 'C' \n"
|
|
" vstore4(vector_c, 0, C + idx); \n"
|
|
"} \n";
|
|
#else
|
|
// OpenCL kernel to perform an element-wise addition
|
|
const char *programSource =
|
|
"__kernel \n"
|
|
"void vec_add(__global float *A, \n"
|
|
" __global float *B, \n"
|
|
" __global float *C) \n"
|
|
"{ \n"
|
|
" // Get the work-item's unique ID \n"
|
|
" int idx = get_global_id(0); \n"
|
|
" \n"
|
|
" // Add the corresponding locations of \n"
|
|
" // 'A' and 'B', and store the reasult in 'C' \n"
|
|
" C[idx] = A[idx] + B[idx]; \n"
|
|
"} \n";
|
|
#endif
|
|
|
|
// Choose OpenCL platform and create a context
|
|
cl_context CreateContext()
|
|
{
|
|
/* 1. get platform information */
|
|
cl_uint num_platforms;
|
|
cl_platform_id first_platform_id;
|
|
cl_int err_num;
|
|
|
|
// get num_platforms and platforms ID
|
|
err_num = clGetPlatformIDs(1, &first_platform_id, &num_platforms);
|
|
if (err_num != CL_SUCCESS || num_platforms <= 0)
|
|
{
|
|
fprintf(stderr, "Failed to get number of platforms: %d\n", err_num);
|
|
return NULL;
|
|
}
|
|
|
|
/* 2. create context */
|
|
cl_context context = NULL;
|
|
cl_context_properties context_prop[] = {
|
|
CL_CONTEXT_PLATFORM,
|
|
(cl_context_properties)first_platform_id,
|
|
0};
|
|
|
|
context = clCreateContextFromType(context_prop, CL_DEVICE_TYPE_ALL, NULL, NULL, &err_num);
|
|
|
|
if (err_num != CL_SUCCESS)
|
|
{
|
|
fprintf(stderr, "Failed to create context: %d\n", err_num);
|
|
return NULL;
|
|
}
|
|
|
|
return context;
|
|
}
|
|
|
|
/* Create command queue */
|
|
cl_command_queue CreateCommandQueue(cl_context context, cl_device_id *device)
|
|
{
|
|
cl_int err_num;
|
|
cl_device_id *devices_arr;
|
|
cl_command_queue command_queue = NULL;
|
|
size_t deviceBufferSize = -1;
|
|
|
|
// Get device buffer size
|
|
err_num = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceBufferSize);
|
|
|
|
if (deviceBufferSize <= 0)
|
|
{
|
|
fprintf(stderr, "No devices available: %d\n", err_num);
|
|
return NULL;
|
|
}
|
|
|
|
// Allocate cache space for devices
|
|
devices_arr = malloc(deviceBufferSize / sizeof(cl_device_id));
|
|
err_num = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceBufferSize, devices_arr, NULL);
|
|
|
|
// Select the first available device
|
|
command_queue = clCreateCommandQueueWithProperties(context, devices_arr[0], 0, NULL);
|
|
|
|
*device = devices_arr[0];
|
|
free(devices_arr);
|
|
return command_queue;
|
|
}
|
|
|
|
// Create and build program objects
|
|
cl_program CreateProgram(cl_context context, cl_device_id device, const char *srcStr)
|
|
{
|
|
cl_int err_num;
|
|
cl_program program;
|
|
|
|
program = clCreateProgramWithSource(context, 1, (const char **)&srcStr, NULL, &err_num);
|
|
|
|
if (err_num != CL_SUCCESS)
|
|
{
|
|
fprintf(stderr, "Failed to create program: %d\n", err_num);
|
|
return NULL;
|
|
}
|
|
|
|
// err_num = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
|
|
err_num = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
|
|
if (err_num != CL_SUCCESS)
|
|
{
|
|
fprintf(stderr, "Failed to build program: %d\n", err_num);
|
|
return NULL;
|
|
}
|
|
|
|
return program;
|
|
}
|
|
|
|
// Create memory object
|
|
bool CreateMemObjects(cl_context context, cl_mem mem_objects[3], float *a, float *b, size_t datasize)
|
|
{
|
|
mem_objects[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
|
datasize, a, NULL);
|
|
mem_objects[1] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
|
datasize, b, NULL);
|
|
mem_objects[2] = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
|
datasize, NULL, NULL);
|
|
return true;
|
|
}
|
|
|
|
// Clean up
|
|
void Cleanup(cl_context context, cl_command_queue command_queue,
|
|
cl_program program, cl_kernel kernel, cl_mem mem_objects[3])
|
|
{
|
|
for (int i = 0; i < 3; i++)
|
|
{
|
|
if (mem_objects[i] != 0)
|
|
clReleaseMemObject(mem_objects[i]);
|
|
}
|
|
if (command_queue != 0)
|
|
clReleaseCommandQueue(command_queue);
|
|
|
|
if (kernel != 0)
|
|
clReleaseKernel(kernel);
|
|
|
|
if (program != 0)
|
|
clReleaseProgram(program);
|
|
|
|
if (context != 0)
|
|
clReleaseContext(context);
|
|
}
|
|
|
|
int main(void)
|
|
{
|
|
struct timeval tv_start, tv_end;
|
|
uint64_t time_start, time_end;
|
|
|
|
cl_int err_num = 0;
|
|
cl_context context = 0;
|
|
cl_command_queue command_queue = 0;
|
|
cl_program program = 0;
|
|
cl_device_id device = 0;
|
|
cl_kernel kernel = 0;
|
|
cl_mem mem_objects[3] = {0, 0, 0};
|
|
|
|
/* 1. Choose OpenCL platform and create a context */
|
|
context = CreateContext();
|
|
fprintf(stderr, "Create context success.\n");
|
|
/* 2. Create command queue */
|
|
command_queue = CreateCommandQueue(context, &device);
|
|
fprintf(stderr, "Create command queue success.\n");
|
|
/* 3. Create and build program objects */
|
|
program = CreateProgram(context, device, programSource);
|
|
fprintf(stderr, "Create program success.\n");
|
|
|
|
/* 4. create kernel */
|
|
kernel = clCreateKernel(program, "vec_add", &err_num);
|
|
if (err_num != CL_SUCCESS)
|
|
{
|
|
fprintf(stderr, "Failed to create kernel: %d\n", err_num);
|
|
return -1;
|
|
}
|
|
|
|
/* 5. set input data && create memory object */
|
|
|
|
// Elements in each array
|
|
const int elements = 102400000;
|
|
// Compute the size of the data
|
|
size_t datasize = sizeof(float) * (16384);
|
|
// Allocate space for input/output host data
|
|
float *A = (float *)malloc(datasize); // Input array
|
|
float *B = (float *)malloc(datasize); // Input array
|
|
float *C = (float *)malloc(datasize); // Output array
|
|
// Initialize the input data
|
|
for (int i = 0; i < (16384); i++)
|
|
{
|
|
A[i] = (float)i;
|
|
B[i] = 3.0 * (float)i;
|
|
}
|
|
|
|
// Create MemObjects
|
|
if (!CreateMemObjects(context, mem_objects, A, B, datasize))
|
|
{
|
|
Cleanup(context, command_queue, program, kernel, mem_objects);
|
|
return 1;
|
|
}
|
|
|
|
// Write data from the input arrays to the buffers
|
|
err_num = clEnqueueWriteBuffer(command_queue, mem_objects[0], CL_FALSE, 0, datasize, A, 0, NULL, NULL);
|
|
err_num = clEnqueueWriteBuffer(command_queue, mem_objects[1], CL_FALSE, 0, datasize, B, 0, NULL, NULL);
|
|
|
|
/* 6. set kernel argument */
|
|
err_num = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_objects[0]);
|
|
if (err_num != CL_SUCCESS)
|
|
{
|
|
fprintf(stderr, "Failed to set kernel argument 0: %d\n", err_num);
|
|
Cleanup(context, command_queue, program, kernel, mem_objects);
|
|
}
|
|
|
|
err_num = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_objects[1]);
|
|
if (err_num != CL_SUCCESS)
|
|
{
|
|
fprintf(stderr, "Failed to set kernel argument 1: %d\n", err_num);
|
|
Cleanup(context, command_queue, program, kernel, mem_objects);
|
|
}
|
|
|
|
err_num = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_objects[2]);
|
|
if (err_num != CL_SUCCESS)
|
|
{
|
|
fprintf(stderr, "Failed to set kernel argument 2: %d\n", err_num);
|
|
Cleanup(context, command_queue, program, kernel, mem_objects);
|
|
}
|
|
|
|
gettimeofday(&tv_start, NULL);
|
|
/* 7. send kernel to execute */
|
|
size_t globalWorkSize[] = {elements / 4, 1, 1};
|
|
size_t localWorkSize[] = {32, 1, 1};
|
|
int num_loops = 1;
|
|
for (int batch = 0; batch < num_loops; batch += 1)
|
|
{
|
|
err_num = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, globalWorkSize,
|
|
localWorkSize, 0, NULL, NULL);
|
|
if (err_num != CL_SUCCESS)
|
|
{
|
|
fprintf(stderr, "Failed to enqueue kernel: %d\n", err_num);
|
|
Cleanup(context, command_queue, program, kernel, mem_objects);
|
|
}
|
|
}
|
|
|
|
#if 1
|
|
/* 8. read data from output */
|
|
err_num = clEnqueueReadBuffer(command_queue, mem_objects[2], CL_TRUE, 0, datasize, C, 0, NULL, NULL);
|
|
if (err_num != CL_SUCCESS)
|
|
{
|
|
fprintf(stderr, "Failed to read buffer: %d\n", err_num);
|
|
Cleanup(context, command_queue, program, kernel, mem_objects);
|
|
}
|
|
#endif
|
|
clFinish(command_queue);
|
|
gettimeofday(&tv_end, NULL);
|
|
|
|
#if 1
|
|
for (int i = 0; i < 16384; i++)
|
|
{
|
|
printf("%f ", C[i]);
|
|
}
|
|
printf("\n");
|
|
#endif
|
|
|
|
long int start_time_us = tv_start.tv_sec * 1000000L + tv_start.tv_usec;
|
|
long int end_time_us = tv_end.tv_sec * 1000000L + tv_end.tv_usec;
|
|
|
|
long int diff_time_us = end_time_us - start_time_us;
|
|
|
|
printf("Start time: %ld sec, %ld usec\n", tv_start.tv_sec, tv_start.tv_usec);
|
|
printf("End time: %ld sec, %ld usec\n", tv_end.tv_sec, tv_end.tv_usec);
|
|
printf("Calculate time for %d addition operations: %ld us\n", elements * num_loops, diff_time_us);
|
|
|
|
/* 9. clean up */
|
|
free(A);
|
|
free(B);
|
|
free(C);
|
|
Cleanup(context, command_queue, program, kernel, mem_objects);
|
|
|
|
return 0;
|
|
}
|
|
|