mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
adding opencl convolution benchmark
This commit is contained in:
parent
4e7a536918
commit
61e3442ef8
16 changed files with 490 additions and 170 deletions
|
@ -18,6 +18,7 @@ all:
|
|||
$(MAKE) -C oclprintf
|
||||
$(MAKE) -C blackscholes
|
||||
$(MAKE) -C matmul
|
||||
$(MAKE) -C convolution
|
||||
|
||||
run-simx:
|
||||
$(MAKE) -C vecadd run-simx
|
||||
|
@ -37,6 +38,7 @@ run-simx:
|
|||
$(MAKE) -C blackscholes run-simx
|
||||
$(MAKE) -C matmul run-simx
|
||||
$(MAKE) -C transpose run-simx
|
||||
$(MAKE) -C convolution run-simx
|
||||
# $(MAKE) -C vectorhypot run-simx
|
||||
# $(MAKE) -C mri-q run-simx
|
||||
|
||||
|
@ -58,6 +60,7 @@ run-rtlsim:
|
|||
$(MAKE) -C oclprintf run-rtlsim
|
||||
$(MAKE) -C blackscholes run-rtlsim
|
||||
$(MAKE) -C matmul run-rtlsim
|
||||
$(MAKE) -C convolution run-rtlsim
|
||||
# $(MAKE) -C vectorhypot run-rtlsim
|
||||
# $(MAKE) -C mri-q run-rtlsim
|
||||
|
||||
|
@ -79,6 +82,7 @@ run-opae:
|
|||
$(MAKE) -C oclprintf run-opae
|
||||
$(MAKE) -C blackscholes run-opae
|
||||
$(MAKE) -C matmul run-opae
|
||||
$(MAKE) -C convolution run-opae
|
||||
# $(MAKE) -C vectorhypot run-opae
|
||||
# $(MAKE) -C mri-q run-opae
|
||||
|
||||
|
@ -102,6 +106,7 @@ clean:
|
|||
$(MAKE) -C oclprintf clean
|
||||
$(MAKE) -C blackscholes clean
|
||||
$(MAKE) -C matmul clean
|
||||
$(MAKE) -C convolution clean
|
||||
|
||||
clean-all:
|
||||
$(MAKE) -C vecadd clean-all
|
||||
|
@ -124,3 +129,4 @@ clean-all:
|
|||
$(MAKE) -C oclprintf clean-all
|
||||
$(MAKE) -C blackscholes clean-all
|
||||
$(MAKE) -C matmul clean-all
|
||||
$(MAKE) -C convolution clean-all
|
||||
|
|
7
tests/opencl/convolution/Makefile
Normal file
7
tests/opencl/convolution/Makefile
Normal file
|
@ -0,0 +1,7 @@
|
|||
PROJECT = convolution
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
OPTS ?= -n32
|
||||
|
||||
include ../common.mk
|
32
tests/opencl/convolution/kernel.cl
Normal file
32
tests/opencl/convolution/kernel.cl
Normal file
|
@ -0,0 +1,32 @@
|
|||
// 3x3 convolution over a single-channel image stored with a one-pixel border.
//
//   output  : width * height floats, row-major; one element written per work-item.
//   input   : row-major image with (width + 2) floats per row. Rows y .. y + 2
//             are read for output row y, so it must hold height + 2 rows.
//   weights : the 9 filter taps, row-major (top-left first, bottom-right last).
//   width   : number of output columns.
//   height  : number of output rows.
//
// Work-item (x, y) = (get_global_id(0), get_global_id(1)) produces output
// pixel (x, y).
__kernel void conv3x3(__global float* output,
                      __global float* input,
                      __global float* weights,
                      const int width,
                      const int height)
{
  const int x = get_global_id(0);
  const int y = get_global_id(1);

  // A launch whose global size was rounded up (e.g. to a multiple of the
  // work-group size) has work-items outside the image; they must not touch
  // memory.
  if (x >= width || y >= height)
    return;

  // Adjust for padded borders: output pixel (x, y) is centred on padded
  // pixel (x + 1, y + 1), so its 3x3 window starts at padded pixel (x, y).
  const int paddedWidth = width + 2;
  __global const float* window = input + y * paddedWidth + x;

  // Accumulate the taps in row-major order (top-left ... bottom-right).
  float sum = 0.0f;
  for (int ky = 0; ky < 3; ++ky) {
    for (int kx = 0; kx < 3; ++kx) {
      sum += window[kx] * weights[ky * 3 + kx];
    }
    window += paddedWidth;
  }

  // Store the result in the output array
  output[y * width + x] = sum;
}
|
258
tests/opencl/convolution/main.cc
Normal file
258
tests/opencl/convolution/main.cc
Normal file
|
@ -0,0 +1,258 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <CL/opencl.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define KERNEL_NAME "conv3x3"
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
||||
cl_int _err = _expr; \
|
||||
if (_err == CL_SUCCESS) \
|
||||
break; \
|
||||
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} while (0)
|
||||
|
||||
#define CL_CHECK2(_expr) \
|
||||
({ \
|
||||
cl_int _err = CL_INVALID_VALUE; \
|
||||
decltype(_expr) _ret = _expr; \
|
||||
if (_err != CL_SUCCESS) { \
|
||||
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} \
|
||||
_ret; \
|
||||
})
|
||||
|
||||
/**
 * Reads a whole kernel file (OpenCL source or precompiled binary) into a
 * newly malloc'ed buffer.
 *
 * @param filename  path of the file to load.
 * @param data      on success receives the buffer; release it with free().
 * @param size      on success receives the number of bytes stored in *data.
 * @return 0 on success; -1 if an argument is null or the file cannot be
 *         opened, sized, allocated for, or read completely. On failure
 *         *data and *size are left untouched.
 */
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // Binary mode: the precompiled kernel is not text, and text mode may
  // translate or truncate bytes on some platforms.
  FILE* fp = fopen(filename, "rb");
  if (nullptr == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  // Determine the file length by seeking to the end and back.
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END)) {
    fsize = ftell(fp);
  }
  if (fsize < 0 || 0 != fseek(fp, 0, SEEK_SET)) {
    fprintf(stderr, "Failed to determine kernel file size.\n");
    fclose(fp);
    return -1;
  }

  const size_t nbytes = static_cast<size_t>(fsize);

  // malloc(0) is allowed to return NULL; request at least one byte so an
  // empty file is not mistaken for an allocation failure.
  uint8_t* buffer = static_cast<uint8_t*>(malloc(nbytes != 0 ? nbytes : 1));
  if (nullptr == buffer) {
    fprintf(stderr, "Failed to allocate kernel buffer.\n");
    fclose(fp);
    return -1;
  }

  const size_t nread = fread(buffer, 1, nbytes, fp);
  fclose(fp);

  // A short read would hand a truncated kernel to the OpenCL runtime.
  if (nread != nbytes) {
    fprintf(stderr, "Failed to read kernel file.\n");
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = nread;
  return 0;
}
|
||||
|
||||
/**
 * Approximate float equality measured in units in the last place (ULP).
 *
 * @param a  first value.
 * @param b  second value.
 * @return true when the two values are at most FLOAT_ULP representable
 *         floats apart.
 *
 * The raw bit patterns are sign-magnitude, so they are first mapped onto a
 * monotonically ordered integer line (negative values mirrored below zero,
 * +0.0 and -0.0 both landing on 0). The distance is then taken in 64 bits:
 * subtracting the raw 32-bit patterns overflows for operands of opposite
 * sign, which made a value compare equal to its own negation.
 */
static bool compare_equal(float a, float b) {
  int32_t bits_a, bits_b;
  // memcpy is the well-defined way to reinterpret the object representation.
  memcpy(&bits_a, &a, sizeof(bits_a));
  memcpy(&bits_b, &b, sizeof(bits_b));

  const int64_t ordered_a = (bits_a < 0) ? (static_cast<int64_t>(INT32_MIN) - bits_a)
                                         : static_cast<int64_t>(bits_a);
  const int64_t ordered_b = (bits_b < 0) ? (static_cast<int64_t>(INT32_MIN) - bits_b)
                                         : static_cast<int64_t>(bits_b);

  int64_t d = ordered_a - ordered_b;
  if (d < 0)
    d = -d;
  return d <= FLOAT_ULP;
}
|
||||
|
||||
/**
 * Host-side reference 3x3 convolution used to validate the device result.
 *
 * @param O       output image, width * height floats, row-major.
 * @param I       input image with a one-pixel border: (width + 2) floats per
 *                row; rows y .. y + 2 are read for output row y.
 * @param W       9 filter taps, row-major.
 * @param width   number of output columns.
 * @param height  number of output rows.
 */
static void convolution_cpu(float *O, float *I, float *W, int32_t width, int32_t height) {
  const int stride = width + 2;
  for (int32_t row = 0; row < height; ++row) {
    for (int32_t col = 0; col < width; ++col) {
      float acc = 0.0f;
      // Visit the window row-major; tap k sits dy rows and dx columns from
      // the window's top-left corner, which is padded pixel (col, row).
      for (int32_t tap = 0; tap < 9; ++tap) {
        const int32_t dy = tap / 3;
        const int32_t dx = tap % 3;
        const float sample = I[(row + dy) * stride + (col + dx)];
        acc += sample * W[tap];
      }
      O[row * width + col] = acc;
    }
  }
}
|
||||
|
||||
cl_device_id device_id = NULL;
|
||||
cl_context context = NULL;
|
||||
cl_command_queue commandQueue = NULL;
|
||||
cl_program program = NULL;
|
||||
cl_kernel kernel = NULL;
|
||||
cl_mem i_memobj = NULL;
|
||||
cl_mem w_memobj = NULL;
|
||||
cl_mem o_memobj = NULL;
|
||||
uint8_t* kernel_bin = NULL;
|
||||
|
||||
static void cleanup() {
|
||||
if (commandQueue) clReleaseCommandQueue(commandQueue);
|
||||
if (kernel) clReleaseKernel(kernel);
|
||||
if (program) clReleaseProgram(program);
|
||||
if (i_memobj) clReleaseMemObject(i_memobj);
|
||||
if (w_memobj) clReleaseMemObject(w_memobj);
|
||||
if (o_memobj) clReleaseMemObject(o_memobj);
|
||||
if (context) clReleaseContext(context);
|
||||
if (device_id) clReleaseDevice(device_id);
|
||||
if (kernel_bin) free(kernel_bin);
|
||||
}
|
||||
|
||||
int size = 32;
|
||||
|
||||
// Writes the one-line command-line synopsis to standard output.
static void show_usage() {
  fputs("Usage: [-n size] [-h: help]\n", stdout);
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main (int argc, char **argv) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
printf("Matrix size=%d\n", size);
|
||||
|
||||
uint32_t o_points = size * size;
|
||||
uint32_t i_points = (size+2) * (size+2);
|
||||
uint32_t w_points = 3 * 3;
|
||||
|
||||
cl_platform_id platform_id;
|
||||
size_t kernel_size;
|
||||
|
||||
// Getting platform and device information
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
|
||||
|
||||
printf("Create context\n");
|
||||
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
|
||||
|
||||
char device_string[1024];
|
||||
clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
printf("Using device: %s\n", device_string);
|
||||
|
||||
printf("Allocate device buffers\n");
|
||||
size_t i_nbytes = i_points * sizeof(float);
|
||||
size_t w_nbytes = w_points * sizeof(float);
|
||||
size_t o_nbytes = o_points * sizeof(float);
|
||||
i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, i_nbytes, NULL, &_err));
|
||||
w_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, w_nbytes, NULL, &_err));
|
||||
o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
||||
// Create kernel
|
||||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
size_t global_size[2] = {size, size};
|
||||
|
||||
// Set kernel arguments
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&i_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&w_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(uint32_t), &size));
|
||||
|
||||
// Allocate memories for input arrays and output arrays.
|
||||
std::vector<float> h_i(i_points);
|
||||
std::vector<float> h_w(w_points);
|
||||
std::vector<float> h_o(o_points, 0.0f);
|
||||
|
||||
// Generate input values
|
||||
for (int32_t y = -1; y < size+1; ++y) {
|
||||
for (int32_t x = -1; x < size+1; ++x) {
|
||||
if (x >= 0 && x < size && y >= 0 && y < size) {
|
||||
h_i[(y+1) * (size+2) + (x+1)] = static_cast<float>(rand()) / RAND_MAX;
|
||||
} else {
|
||||
h_i[(y+1) * (size+2) + (x+1)] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (uint32_t i = 0; i < w_points; ++i) {
|
||||
h_w[i] = static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
|
||||
// Creating command queue
|
||||
commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
|
||||
|
||||
printf("Upload source buffers\n");
|
||||
CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, i_nbytes, h_i.data(), 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueWriteBuffer(commandQueue, w_memobj, CL_TRUE, 0, w_nbytes, h_w.data(), 0, NULL, NULL));
|
||||
|
||||
printf("Execute the kernel\n");
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(commandQueue));
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
printf("Download destination buffer\n");
|
||||
CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, o_nbytes, h_o.data(), 0, NULL, NULL));
|
||||
|
||||
printf("Verify result\n");
|
||||
std::vector<float> ref_vec(o_points);
|
||||
convolution_cpu(ref_vec.data(), h_i.data(), h_w.data(), size, size);
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < o_points; ++i) {
|
||||
if (!compare_equal(h_o[i], ref_vec[i])) {
|
||||
if (errors < 100)
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_o[i]);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
printf("FAILED! - %d errors\n", errors);
|
||||
} else {
|
||||
printf("PASSED!\n");
|
||||
}
|
||||
|
||||
// Clean up
|
||||
cleanup();
|
||||
|
||||
return errors;
|
||||
}
|
|
@ -2,6 +2,6 @@ PROJECT = matmul
|
|||
|
||||
SRCS = main.cc
|
||||
|
||||
OPTS ?= -n16
|
||||
OPTS ?= -n32
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -7,43 +7,41 @@ __kernel void matmul(__global float *A,
|
|||
{
|
||||
int globalRow = get_global_id(1);
|
||||
int globalCol = get_global_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int localCol = get_local_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int localCol = get_local_id(0);
|
||||
int localSize = get_local_size(0); // assuming square local size
|
||||
|
||||
float sum = 0.0f;
|
||||
|
||||
// Load initial blocks of A and B into local memory
|
||||
int k = 0;
|
||||
localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
|
||||
localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
|
||||
// Loop over all blocks of both matrices
|
||||
for (int k = 0; k < N; k += localSize) {
|
||||
// Load block of matrix A to local memory
|
||||
localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
|
||||
|
||||
// Iterate over blocks
|
||||
for (k = 0; k < N; k += 16) {
|
||||
// Ensure the initial block is loaded
|
||||
// Load block of matrix B to local memory, adjusting for column-major access
|
||||
localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
|
||||
|
||||
// Synchronize to make sure the tiles are loaded
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Compute multiplication for this block
|
||||
for (int j = 0; j < 16; j++) {
|
||||
// Multiply the two matrix blocks and accumulate result
|
||||
for (int j = 0; j < localSize; j++) {
|
||||
sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
|
||||
}
|
||||
|
||||
// Load the next block of matrix A into local memory
|
||||
if (k + 16 < N) {
|
||||
localA[localRow * localSize + localCol] = A[globalRow * N + k + 16 + localCol];
|
||||
localB[localRow * localSize + localCol] = B[(k + 16 + localRow) * N + globalCol];
|
||||
}
|
||||
}
|
||||
|
||||
C[globalRow * N + globalCol] = sum;
|
||||
}
|
||||
|
||||
/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
|
||||
/*__kernel void matmul(__global float *A,
|
||||
__global float *B,
|
||||
__global float *C,
|
||||
const unsigned int N)
|
||||
{
|
||||
int globalRow = get_global_id(1);
|
||||
int globalCol = get_global_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int localCol = get_local_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int localCol = get_local_id(0);
|
||||
|
||||
// Static local memory declaration
|
||||
__local float localA[16][16];
|
||||
|
@ -51,26 +49,21 @@ __kernel void matmul(__global float *A,
|
|||
|
||||
float sum = 0.0f;
|
||||
|
||||
// Load initial blocks of A and B into local memory
|
||||
int k = 0;
|
||||
localA[localRow][localCol] = A[globalRow * N + k + localCol];
|
||||
localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
|
||||
|
||||
// Iterate over blocks
|
||||
for (k = 0; k < N; k += 16) {
|
||||
// Ensure the initial block is loaded
|
||||
for (int k = 0; k < N; k += 16) {
|
||||
// Load a block of matrix A into local memory
|
||||
localA[localRow][localCol] = A[globalRow * N + k + localCol];
|
||||
|
||||
// Load a block of matrix B into local memory
|
||||
localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
|
||||
|
||||
// Ensure the entire block is loaded
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Compute multiplication for this block
|
||||
for (int j = 0; j < 16; j++) {
|
||||
sum += localA[localRow][j] * localB[j][localCol];
|
||||
}
|
||||
|
||||
// Load the next block of matrix A into local memory
|
||||
if (k + 16 < N) {
|
||||
localA[localRow][localCol] = A[globalRow * N + k + 16 + localCol];
|
||||
localB[localRow][localCol] = B[(k + 16 + localRow) * N + globalCol];
|
||||
}
|
||||
}
|
||||
|
||||
C[globalRow * N + globalCol] = sum;
|
||||
|
|
|
@ -10,6 +10,8 @@
|
|||
|
||||
#define LOCAL_SIZE 16
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define KERNEL_NAME "matmul"
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
|
@ -56,15 +58,16 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static bool compare_equal(float a, float b, int ulp = 21) {
|
||||
union fi_t { int i; float f; };
|
||||
static bool compare_equal(float a, float b) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
return std::abs(fa.i - fb.i) <= ulp;
|
||||
auto d = std::abs(fa.i - fb.i);
|
||||
return d <= FLOAT_ULP;
|
||||
}
|
||||
|
||||
static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
|
||||
static void matmul_cpu(float *C, float *A, float *B, int N) {
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < N; j++) {
|
||||
float sum = 0.0f;
|
||||
|
@ -98,7 +101,7 @@ static void cleanup() {
|
|||
if (kernel_bin) free(kernel_bin);
|
||||
}
|
||||
|
||||
int size = 64;
|
||||
int size = 32;
|
||||
|
||||
static void show_usage() {
|
||||
printf("Usage: [-n size] [-h: help]\n");
|
||||
|
@ -106,7 +109,7 @@ static void show_usage() {
|
|||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "fn:h?")) != -1) {
|
||||
while ((c = getopt(argc, argv, "n:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
|
@ -127,6 +130,8 @@ int main (int argc, char **argv) {
|
|||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
uint32_t num_points = size * size;
|
||||
|
||||
printf("Matrix size=%d\n", size);
|
||||
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
|
||||
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
|
||||
|
@ -148,7 +153,7 @@ int main (int argc, char **argv) {
|
|||
printf("Using device: %s\n", device_string);
|
||||
|
||||
printf("Allocate device buffers\n");
|
||||
size_t nbytes = size * size * sizeof(float);
|
||||
size_t nbytes = num_points * sizeof(float);
|
||||
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
@ -176,32 +181,26 @@ int main (int argc, char **argv) {
|
|||
// Create kernel
|
||||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
|
||||
size_t global_size[2] = {size, size};
|
||||
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
|
||||
|
||||
// Set kernel arguments
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
|
||||
//CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
|
||||
//CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
|
||||
|
||||
// Allocate memories for input arrays and output arrays.
|
||||
std::vector<float> h_a(size * size);
|
||||
std::vector<float> h_b(size * size);
|
||||
std::vector<float> h_c(size * size);
|
||||
std::vector<float> h_a(num_points);
|
||||
std::vector<float> h_b(num_points);
|
||||
std::vector<float> h_c(num_points);
|
||||
|
||||
// Initialize values for array members.
|
||||
for (int i = 0; i < (size * size); ++i) {
|
||||
#ifdef USE_FLOAT
|
||||
h_a[i] = (float)rand() / (float)RAND_MAX;
|
||||
h_b[i] = (float)rand() / (float)RAND_MAX;
|
||||
#else
|
||||
h_a[i] = rand();
|
||||
h_b[i] = rand();
|
||||
#endif
|
||||
h_c[i] = 0xdeadbeef;
|
||||
// Generate input values
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_a[i] = static_cast<float>(rand()) / RAND_MAX;
|
||||
h_b[i] = static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
|
||||
// Creating command queue
|
||||
|
@ -223,10 +222,10 @@ int main (int argc, char **argv) {
|
|||
CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
|
||||
|
||||
printf("Verify result\n");
|
||||
std::vector<float> ref_vec(size * size);
|
||||
matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
|
||||
std::vector<float> ref_vec(num_points);
|
||||
matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size);
|
||||
int errors = 0;
|
||||
for (int i = 0; i < (size * size); i++) {
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
if (!compare_equal(h_c[i], ref_vec[i])) {
|
||||
if (errors < 100)
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
|
||||
|
|
|
@ -143,7 +143,7 @@ int main (int argc, char **argv) {
|
|||
// Allocate memories for input arrays and output arrays.
|
||||
h_a = (int*)malloc(nbytes);
|
||||
|
||||
// Initialize values for array members.
|
||||
// Generate input values
|
||||
for (int i = 0; i < size; ++i) {
|
||||
h_a[i] = -1 + i;
|
||||
}
|
||||
|
|
|
@ -155,9 +155,8 @@ int main (int argc, char **argv) {
|
|||
h_a = (int*)malloc(nbytes);
|
||||
h_c = (int*)malloc(nbytes);
|
||||
|
||||
// Initialize values for array members.
|
||||
// Generate input values
|
||||
for (int i = 0; i < size; ++i) {
|
||||
h_c[i] = 0xdeadbeef;
|
||||
if (float_enable) {
|
||||
float value = sinf(i)*sinf(i);
|
||||
h_a[i] = *(int*)&value;
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
#ifndef COMMON_H
|
||||
#define COMMON_H
|
||||
|
||||
#define USE_FLOAT
|
||||
|
||||
#ifdef USE_FLOAT
|
||||
#ifndef TYPE
|
||||
#define TYPE float
|
||||
#else
|
||||
#define TYPE int
|
||||
#endif
|
||||
|
||||
#endif // COMMON_H
|
|
@ -11,6 +11,8 @@
|
|||
|
||||
#define KERNEL_NAME "sgemm"
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
||||
cl_int _err = _expr; \
|
||||
|
@ -33,6 +35,66 @@
|
|||
_ret; \
|
||||
})
|
||||
|
||||
// Per-element-type test helpers. Each specialization provides:
//   type_str() - a printable name for the element type,
//   generate() - one pseudo-random input value,
//   compare()  - result check that prints a diagnostic for the first 100
//                mismatches.
// The primary template is intentionally empty; only the specializations
// below are usable.
template <typename Type>
class Comparator {};

template <>
class Comparator<int> {
public:
  static const char* type_str() {
    return "integer";
  }

  // Returns the next rand() value unchanged.
  static int generate() {
    return rand();
  }

  // Exact comparison.
  //   a      : value produced by the device (callers pass the result first),
  //   b      : reference value,
  //   index  : element index, used only in the diagnostic,
  //   errors : mismatches counted so far; printing stops once it reaches 100.
  // Returns true when the two values are equal.
  static bool compare(int a, int b, int index, int errors) {
    if (a != b) {
      if (errors < 100) {
        // Callers pass (actual, reference), so b is the expected value.
        printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
      }
      return false;
    }
    return true;
  }
};

template <>
class Comparator<float> {
public:
  static const char* type_str() {
    return "float";
  }

  // Returns a pseudo-random value in [0, 1]. The return type must be float:
  // returning int truncated nearly every sample to 0.
  static float generate() {
    return static_cast<float>(rand()) / RAND_MAX;
  }

  // Approximate comparison: equal when at most FLOAT_ULP representable
  // floats apart.
  //   a      : value produced by the device (callers pass the result first),
  //   b      : reference value,
  //   index  : element index, used only in the diagnostic,
  //   errors : mismatches counted so far; printing stops once it reaches 100.
  // Returns true when the values are within tolerance.
  static bool compare(float a, float b, int index, int errors) {
    // 64-bit distance: subtracting the raw 32-bit patterns overflows when the
    // operands have opposite signs.
    int64_t d = ordered_bits(a) - ordered_bits(b);
    if (d < 0)
      d = -d;
    if (d > FLOAT_ULP) {
      if (errors < 100) {
        // Callers pass (actual, reference), so b is the expected value.
        printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
      }
      return false;
    }
    return true;
  }

private:
  // Maps a float's sign-magnitude bit pattern onto a monotonically ordered
  // integer line: negative values are mirrored below zero and both +0.0 and
  // -0.0 map to 0.
  static int64_t ordered_bits(float value) {
    union fi_t { float f; int32_t i; };
    fi_t u;
    u.f = value;
    return (u.i < 0) ? (static_cast<int64_t>(INT32_MIN) - u.i)
                     : static_cast<int64_t>(u.i);
  }
};
|
||||
|
||||
/*static void sgemm_cpu(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
|
||||
for (int m = 0; m < M; ++m) {
|
||||
for (int n = 0; n < N; ++n) {
|
||||
TYPE acc = 0;
|
||||
for (int k = 0; k < K; ++k) {
|
||||
acc += A[k * M + m] * B[n * K + k];
|
||||
}
|
||||
C[n * M + m] = acc;
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (nullptr == filename || nullptr == data || 0 == size)
|
||||
return -1;
|
||||
|
@ -54,32 +116,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*static void matmul(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
|
||||
for (int m = 0; m < M; ++m) {
|
||||
for (int n = 0; n < N; ++n) {
|
||||
TYPE acc = 0;
|
||||
for (int k = 0; k < K; ++k) {
|
||||
acc += A[k * M + m] * B[n * K + k];
|
||||
}
|
||||
C[n * M + m] = acc;
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
#ifdef USE_FLOAT
|
||||
static bool compare_equal(float a, float b, int ulp = 21) {
|
||||
union fi_t { int i; float f; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
return std::abs(fa.i - fb.i) <= ulp;
|
||||
}
|
||||
#else
|
||||
static bool compare_equal(int a, int b, int ulp = 21) {
|
||||
return (a == b);
|
||||
}
|
||||
#endif
|
||||
|
||||
cl_device_id device_id = NULL;
|
||||
cl_context context = NULL;
|
||||
cl_command_queue commandQueue = NULL;
|
||||
|
@ -145,6 +181,8 @@ int main (int argc, char **argv) {
|
|||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
uint32_t num_points = size * size;
|
||||
|
||||
cl_platform_id platform_id;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status;
|
||||
|
@ -163,7 +201,7 @@ int main (int argc, char **argv) {
|
|||
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
|
||||
|
||||
// Allocate device buffers
|
||||
size_t nbytes = size * size * sizeof(TYPE);
|
||||
size_t nbytes = num_points * sizeof(TYPE);
|
||||
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
@ -194,23 +232,17 @@ int main (int argc, char **argv) {
|
|||
h_b = (TYPE*)malloc(nbytes);
|
||||
h_c = (TYPE*)malloc(nbytes);
|
||||
|
||||
// Initialize values for array members.
|
||||
for (int i = 0; i < (size * size); ++i) {
|
||||
#ifdef USE_FLOAT
|
||||
h_a[i] = (float)rand() / (float)RAND_MAX;
|
||||
h_b[i] = (float)rand() / (float)RAND_MAX;
|
||||
#else
|
||||
h_a[i] = rand();
|
||||
h_b[i] = rand();
|
||||
#endif
|
||||
h_c[i] = 0xdeadbeef;
|
||||
// Generate input values
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_a[i] = Comparator<TYPE>::generate();
|
||||
h_b[i] = Comparator<TYPE>::generate();
|
||||
}
|
||||
|
||||
size_t global_offset[2] = {0, 0};
|
||||
size_t global_work_size[2] = {size, size};
|
||||
size_t local_work_size[2] = {1, 1};
|
||||
|
||||
std::vector<float> ref_vec(size * size);
|
||||
std::vector<float> ref_vec(num_points);
|
||||
|
||||
// reference generation
|
||||
size_t num_groups_y = global_work_size[1] / local_work_size[1];
|
||||
|
@ -228,12 +260,7 @@ int main (int argc, char **argv) {
|
|||
TYPE acc = 0;
|
||||
for (int k = 0; k < width; k++) {
|
||||
acc += h_a[k * width + r] * h_b[c * width + k];
|
||||
}
|
||||
/*#ifdef USE_FLOAT
|
||||
printf("*** r=%d, c=%d, v=%f\n", r, c, acc);
|
||||
#else
|
||||
printf("*** r=%d, c=%d, v=%d\n", r, c, acc);
|
||||
#endif*/
|
||||
}
|
||||
ref_vec[c * width + r] = acc;
|
||||
}
|
||||
}
|
||||
|
@ -260,14 +287,8 @@ int main (int argc, char **argv) {
|
|||
|
||||
printf("Verify result\n");
|
||||
int errors = 0;
|
||||
for (int i = 0; i < (size * size); i++) {
|
||||
if (!compare_equal(h_c[i], ref_vec[i])) {
|
||||
if (errors < 100)
|
||||
#ifdef USE_FLOAT
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
|
||||
#else
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", i, ref_vec[i], h_c[i]);
|
||||
#endif
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
if (!Comparator<TYPE>::compare(h_c[i], ref_vec[i], i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -166,12 +166,10 @@ int main (int argc, char **argv) {
|
|||
h_b = (float*)malloc(nbytes);
|
||||
h_c = (float*)malloc(nbytes);
|
||||
|
||||
// Initialize values for array members.
|
||||
// Generate input values
|
||||
for (int i = 0; i < size; ++i) {
|
||||
h_a[i] = sinf(i)*sinf(i);
|
||||
h_b[i] = cosf(i)*cosf(i);
|
||||
h_c[i] = 0xdeadbeef;
|
||||
//printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
|
||||
}
|
||||
|
||||
// Creating command queue
|
||||
|
|
|
@ -19,16 +19,6 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
union Float_t {
|
||||
float f;
|
||||
int i;
|
||||
struct {
|
||||
uint32_t man : 23;
|
||||
uint32_t exp : 8;
|
||||
uint32_t sign : 1;
|
||||
} parts;
|
||||
};
|
||||
|
||||
template <typename Type>
|
||||
class Comparator {};
|
||||
|
||||
|
@ -38,22 +28,41 @@ public:
|
|||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static bool compare(int a, int b) {
|
||||
return a == b;
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class Comparator<float> {
|
||||
private:
|
||||
union Float_t { float f; int i; };
|
||||
public:
|
||||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static bool compare(float a, float b) {
|
||||
Float_t fa{a}, fb{b};
|
||||
// Returns a pseudo-random value in [0, 1]. The return type must be float:
// returning int truncated nearly every sample to 0.
static float generate() {
  return static_cast<float>(rand()) / RAND_MAX;
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
auto d = std::abs(fa.i - fb.i);
|
||||
if (d > FLOAT_ULP) {
|
||||
std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -127,9 +136,7 @@ int run_test(const kernel_arg_t& kernel_arg,
|
|||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref)) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
@ -196,8 +203,7 @@ int main(int argc, char *argv[]) {
|
|||
// generate source data
|
||||
source_data.resize(2 * num_points);
|
||||
for (uint32_t i = 0; i < source_data.size(); ++i) {
|
||||
auto r = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
source_data[i] = static_cast<TYPE>(r * 2 * num_points);
|
||||
source_data[i] = Comparator<TYPE>::generate();
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
|
|
|
@ -4,6 +4,6 @@ SRCS = main.cpp
|
|||
|
||||
VX_SRCS = kernel.cpp
|
||||
|
||||
OPTS ?= -s16
|
||||
OPTS ?= -n32
|
||||
|
||||
include ../common.mk
|
|
@ -12,10 +12,10 @@ inline uint32_t log2_fast(uint32_t x) {
|
|||
}
|
||||
|
||||
void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto size = arg->size;
|
||||
auto A = reinterpret_cast<TYPE*>(arg->A_addr);
|
||||
auto A = reinterpret_cast<TYPE*>(arg->A_addr);
|
||||
auto B = reinterpret_cast<TYPE*>(arg->B_addr);
|
||||
auto C = reinterpret_cast<TYPE*>(arg->C_addr);
|
||||
auto size = arg->size;
|
||||
|
||||
uint32_t row, col;
|
||||
if (is_log2(size)) {
|
||||
|
|
|
@ -19,16 +19,6 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
union Float_t {
|
||||
float f;
|
||||
int i;
|
||||
struct {
|
||||
uint32_t man : 23;
|
||||
uint32_t exp : 8;
|
||||
uint32_t sign : 1;
|
||||
} parts;
|
||||
};
|
||||
|
||||
template <typename Type>
|
||||
class Comparator {};
|
||||
|
||||
|
@ -38,8 +28,17 @@ public:
|
|||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static bool compare(int a, int b) {
|
||||
return a == b;
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -49,18 +48,26 @@ public:
|
|||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static bool compare(float a, float b) {
|
||||
Float_t fa{a}, fb{b};
|
||||
// Returns a pseudo-random value in [0, 1]. The return type must be float:
// returning int truncated nearly every sample to 0.
static float generate() {
  return static_cast<float>(rand()) / RAND_MAX;
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
auto d = std::abs(fa.i - fb.i);
|
||||
if (d > FLOAT_ULP) {
|
||||
std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
|
||||
static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
|
||||
for (uint32_t row = 0; row < height; ++row) {
|
||||
for (uint32_t col = 0; col < width; ++col) {
|
||||
TYPE sum(0);
|
||||
|
@ -73,7 +80,7 @@ static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t
|
|||
}
|
||||
|
||||
const char* kernel_file = "kernel.bin";
|
||||
uint32_t size = 16;
|
||||
uint32_t size = 32;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
|
@ -81,14 +88,14 @@ kernel_arg_t kernel_arg = {};
|
|||
|
||||
static void show_usage() {
|
||||
std::cout << "Vortex Test." << std::endl;
|
||||
std::cout << "Usage: [-k: kernel] [-s size] [-h: help]" << std::endl;
|
||||
std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl;
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "s:k:h?")) != -1) {
|
||||
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 's':
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'k':
|
||||
|
@ -138,9 +145,7 @@ int run_test(const kernel_arg_t& kernel_arg,
|
|||
for (uint32_t i = 0; i < refs.size(); ++i) {
|
||||
auto ref = refs[i];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref)) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
@ -208,7 +213,7 @@ int main(int argc, char *argv[]) {
|
|||
src_A[i] = static_cast<TYPE>(a * size);
|
||||
src_B[i] = static_cast<TYPE>(b * size);
|
||||
}
|
||||
cpuMatrixMultiply(refs.data(), src_A.data(), src_B.data(), size, size);
|
||||
matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size);
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue