adding opencl convolution benchmark

2025-04-23 21:39:10 -04:00 · 2023-11-14 22:31:30 -08:00 · 2023-11-14 22:31:30 -08:00 · 61e3442ef8
commit 61e3442ef8
parent 4e7a536918
16 changed files with 490 additions and 170 deletions
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@ -18,6 +18,7 @@ all:
 	$(MAKE) -C oclprintf
 	$(MAKE) -C blackscholes
 	$(MAKE) -C matmul
+	$(MAKE) -C convolution

 run-simx:
 	$(MAKE) -C vecadd run-simx
@ -37,6 +38,7 @@ run-simx:
 	$(MAKE) -C blackscholes run-simx
 	$(MAKE) -C matmul run-simx
 	$(MAKE) -C transpose run-simx
+	$(MAKE) -C convolution run-simx
 #	$(MAKE) -C vectorhypot run-simx	
 #	$(MAKE) -C mri-q run-simx

@ -58,6 +60,7 @@ run-rtlsim:
 	$(MAKE) -C oclprintf run-rtlsim
 	$(MAKE) -C blackscholes run-rtlsim
 	$(MAKE) -C matmul run-rtlsim
+	$(MAKE) -C convolution run-rtlsim
 #	$(MAKE) -C vectorhypot run-rtlsim
 #	$(MAKE) -C mri-q run-rtlsim

@ -79,6 +82,7 @@ run-opae:
 	$(MAKE) -C oclprintf run-opae
 	$(MAKE) -C blackscholes run-opae
 	$(MAKE) -C matmul run-opae
+	$(MAKE) -C convolution run-opae
 #	$(MAKE) -C vectorhypot run-opae
 #	$(MAKE) -C mri-q run-opae

@ -102,6 +106,7 @@ clean:
 	$(MAKE) -C oclprintf clean
 	$(MAKE) -C blackscholes clean
 	$(MAKE) -C matmul clean
+	$(MAKE) -C convolution clean

 clean-all:
 	$(MAKE) -C vecadd clean-all
@ -124,3 +129,4 @@ clean-all:
 	$(MAKE) -C oclprintf clean-all
 	$(MAKE) -C blackscholes clean-all
 	$(MAKE) -C matmul clean-all
+	$(MAKE) -C convolution clean-all
--- a/tests/opencl/convolution/Makefile
+++ b/tests/opencl/convolution/Makefile
@ -0,0 +1,7 @@
+# Build settings for the OpenCL 3x3 convolution test.
+PROJECT = convolution
+
+# Host-side source files.
+SRCS = main.cc
+
+# Default options (-n is the size option parsed by main.cc); presumably handed to the
+# test binary by common.mk — confirm there. Can be overridden because of `?=`.
+OPTS ?= -n32
+
+# Shared rules for the OpenCL tests.
+include ../common.mk
--- a/tests/opencl/convolution/kernel.cl
+++ b/tests/opencl/convolution/kernel.cl
@ -0,0 +1,32 @@
+// 3x3 convolution over a padded image; one work-item produces one output pixel.
+//   output  : width x height result, row-major.
+//   input   : (width + 2) x (height + 2) image, row-major, i.e. the image plus a
+//             one-pixel border on every side.
+//   weights : 9 filter coefficients, row-major (top-left first).
+//   width, height : size of the output image in pixels.
+// x is taken from NDRange dimension 0 and y from dimension 1.
+__kernel void conv3x3(__global float* output,
+                      __global float* input,
+                      __global float* weights,
+                      const int width,
+                      const int height)
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    // Work-items outside the image do nothing, so the kernel stays in bounds
+    // even if the NDRange is larger than width x height.
+    if (x >= width || y >= height)
+        return;
+
+    // Adjust for padded borders
+    int paddedWidth = width + 2;
+    int paddedX = x + 1;
+    int paddedY = y + 1;
+
+    // Compute the convolution sum
+    float sum = 0.0f;
+
+    sum += input[(paddedY - 1) * paddedWidth + (paddedX - 1)] * weights[0]; // Top-left
+    sum += input[(paddedY - 1) * paddedWidth + paddedX] * weights[1];       // Top-center
+    sum += input[(paddedY - 1) * paddedWidth + (paddedX + 1)] * weights[2]; // Top-right
+
+    sum += input[paddedY * paddedWidth + (paddedX - 1)] * weights[3];       // Middle-left
+    sum += input[paddedY * paddedWidth + paddedX] * weights[4];             // Center
+    sum += input[paddedY * paddedWidth + (paddedX + 1)] * weights[5];       // Middle-right
+
+    sum += input[(paddedY + 1) * paddedWidth + (paddedX - 1)] * weights[6]; // Bottom-left
+    sum += input[(paddedY + 1) * paddedWidth + paddedX] * weights[7];       // Bottom-center
+    sum += input[(paddedY + 1) * paddedWidth + (paddedX + 1)] * weights[8]; // Bottom-right
+
+    // Store the result in the output array
+    output[y * width + x] = sum;
+}
--- a/tests/opencl/convolution/main.cc
+++ b/tests/opencl/convolution/main.cc
@ -0,0 +1,258 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <CL/opencl.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h> 
+#include <chrono>
+#include <vector>
+
+#define FLOAT_ULP 6
+
+#define KERNEL_NAME "conv3x3"
+
+// Evaluates an OpenCL call that returns a cl_int status. For any status other than
+// CL_SUCCESS it prints the failing expression and its code, calls cleanup() and
+// terminates the process.
+#define CL_CHECK(_expr)                                                  \
+  do {                                                                   \
+    cl_int _err = _expr;                                                 \
+    if (_err != CL_SUCCESS) {                                            \
+      printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);    \
+      cleanup();                                                         \
+      exit(-1);                                                          \
+    }                                                                    \
+  } while (0)
+
+// Variant for OpenCL calls that return an object and report their status through an
+// out-parameter: the wrapped expression must pass `&_err` for that parameter.
+// Written as a statement expression (a GNU extension) so the created object is the
+// value of the macro. On failure it prints the expression, calls cleanup() and exits.
+#define CL_CHECK2(_expr)                                                 \
+  ({                                                                     \
+    cl_int _err = CL_INVALID_VALUE;                                      \
+    decltype(_expr) _ret = _expr;                                        \
+    if (CL_SUCCESS != _err) {                                            \
+      printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);    \
+      cleanup();                                                         \
+      exit(-1);                                                          \
+    }                                                                    \
+    _ret;                                                                \
+  })
+
+// Reads the whole file `filename` into a newly malloc'ed buffer.
+//   filename : path of the file to load.
+//   data     : receives the buffer; the caller releases it with free().
+//   size     : receives the number of bytes read.
+// Returns 0 on success and -1 on failure. On failure no buffer is handed out:
+// *data is NULL and *size is 0 (when those pointers are valid).
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || nullptr == size)
+    return -1;
+
+  *data = nullptr;
+  *size = 0;
+
+  // Binary mode: the file may be a precompiled kernel image rather than text.
+  FILE* fp = fopen(filename, "rb");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+
+  uint8_t* buffer = nullptr;
+  size_t nread = 0;
+  bool ok = false;
+
+  // fseek/ftell report errors through their return values; check them before
+  // using the length as an allocation size.
+  if (0 == fseek(fp, 0, SEEK_END)) {
+    long fsize = ftell(fp);
+    if (fsize >= 0) {
+      rewind(fp);
+      // Allocate at least one byte so an empty file still yields a valid pointer.
+      buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
+      if (buffer != nullptr) {
+        nread = fread(buffer, 1, (size_t)fsize, fp);
+        ok = (nread == (size_t)fsize);
+      }
+    }
+  }
+
+  fclose(fp);
+
+  if (!ok) {
+    free(buffer);
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+
+  *data = buffer;
+  *size = nread;
+  return 0;
+}
+
+// Returns true when `a` and `b` are at most FLOAT_ULP representable floats apart.
+static bool compare_equal(float a, float b) {
+  union fi_t { float f; int32_t i; };
+  fi_t fa, fb;
+  fa.f = a;
+  fb.f = b;
+  // A float's bit pattern is sign-magnitude. Map it to a signed ordinal (negative
+  // values become -magnitude) in 64 bits, so the distance is also meaningful across
+  // zero and the subtraction cannot overflow as it did in 32-bit arithmetic.
+  int64_t oa = (fa.i < 0) ? -static_cast<int64_t>(fa.i & 0x7fffffff) : static_cast<int64_t>(fa.i);
+  int64_t ob = (fb.i < 0) ? -static_cast<int64_t>(fb.i & 0x7fffffff) : static_cast<int64_t>(fb.i);
+  int64_t d = (oa > ob) ? (oa - ob) : (ob - oa);
+  return d <= FLOAT_ULP;
+}
+
+// CPU reference for the 3x3 convolution.
+//   O : width x height output, row-major.
+//   I : (width + 2) x (height + 2) padded input, row-major.
+//   W : 9 filter weights, row-major.
+static void convolution_cpu(float *O, float *I, float *W, int32_t width, int32_t height) {
+  const int stride = width + 2;  // row pitch of the padded input
+  for (int32_t row = 0; row < height; ++row) {
+    float *out_row = O + row * width;
+    for (int32_t col = 0; col < width; ++col) {
+      // Top-left element of the 3x3 window, in padded coordinates.
+      const float *window = I + row * stride + col;
+      float acc = 0.0f;
+      // k walks the window row by row, the same order the weights are stored in.
+      for (int32_t k = 0; k < 9; ++k) {
+        acc += window[(k / 3) * stride + (k % 3)] * W[k];
+      }
+      out_row[col] = acc;
+    }
+  }
+}
+
+// OpenCL objects and the host copy of the kernel file. They live at file scope so
+// cleanup() can release whatever has been created so far; NULL means "not created".
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+// Device buffers: padded input image, filter weights and output image.
+cl_mem i_memobj = NULL;
+cl_mem w_memobj = NULL;
+cl_mem o_memobj = NULL;
+// Contents of the kernel file, allocated by read_kernel_file().
+uint8_t* kernel_bin = NULL;
+
+// Releases every OpenCL object that has been created and frees the kernel file buffer.
+// Handles that are still NULL are skipped.
+static void cleanup() {
+  if (commandQueue != NULL) {
+    clReleaseCommandQueue(commandQueue);
+  }
+  if (kernel != NULL) {
+    clReleaseKernel(kernel);
+  }
+  if (program != NULL) {
+    clReleaseProgram(program);
+  }
+  if (i_memobj != NULL) {
+    clReleaseMemObject(i_memobj);
+  }
+  if (w_memobj != NULL) {
+    clReleaseMemObject(w_memobj);
+  }
+  if (o_memobj != NULL) {
+    clReleaseMemObject(o_memobj);
+  }
+  if (context != NULL) {
+    clReleaseContext(context);
+  }
+  if (device_id != NULL) {
+    clReleaseDevice(device_id);
+  }
+  free(kernel_bin);  // free(NULL) is a no-op, so no guard is needed
+}
+
+// Width and height of the output image in pixels; set with the -n option.
+int size = 32;
+
+// Prints the one-line command-line synopsis to stdout.
+static void show_usage() {
+  fputs("Usage: [-n size] [-h: help]\n", stdout);
+}
+
+// Parses the command line.
+//   -n <size> : sets the global `size`; must be a positive decimal integer.
+//   -h, -?    : prints the usage text and exits with status 0.
+// Any invalid size or unexpected getopt result prints the usage text and exits with -1.
+static void parse_args(int argc, char **argv) {
+  // Largest n for which (n + 2) * (n + 2), the padded input size, fits in a signed 32-bit int.
+  const long max_size = 46338;
+  int c;
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+    switch (c) {
+    case 'n': {
+      // strtol rather than atoi, so non-numeric, non-positive and oversized values
+      // are rejected here instead of turning into huge buffer sizes later.
+      char *end = NULL;
+      long value = strtol(optarg, &end, 10);
+      if (end == optarg || *end != '\0' || value <= 0 || value > max_size) {
+        printf("Error: invalid size '%s'\n", optarg);
+        show_usage();
+        exit(-1);
+      }
+      size = static_cast<int>(value);
+    } break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+// Runs the 3x3 convolution kernel on the default OpenCL device and checks the result
+// against convolution_cpu().
+// Returns 0 when every output matches, 1 when mismatches were found and -1 when the
+// kernel file could not be loaded or the size is invalid. OpenCL failures terminate
+// the process from inside CL_CHECK / CL_CHECK2.
+int main (int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  printf("Matrix size=%d\n", size);
+  if (size <= 0) {
+    printf("Error: size must be a positive integer\n");
+    return -1;
+  }
+
+  // The output is size x size; the input carries a one-pixel border on every side.
+  uint32_t o_points = size * size;
+  uint32_t i_points = (size+2) * (size+2);
+  uint32_t w_points = 3 * 3;
+
+  cl_platform_id platform_id;
+  size_t kernel_size = 0;
+
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  char device_string[1024];
+  CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL));
+  printf("Using device: %s\n", device_string);
+
+  printf("Allocate device buffers\n");
+  size_t i_nbytes = i_points * sizeof(float);
+  size_t w_nbytes = w_points * sizeof(float);
+  size_t o_nbytes = o_points * sizeof(float);
+  i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, i_nbytes, NULL, &_err));
+  w_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, w_nbytes, NULL, &_err));
+  o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+  // On a load failure the objects created above are released before returning.
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) {
+    cleanup();
+    return -1;
+  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) {
+    cleanup();
+    return -1;
+  }
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  // One work-item per output pixel. The casts are explicit because int -> size_t
+  // is a narrowing conversion inside a braced initializer.
+  size_t global_size[2] = {static_cast<size_t>(size), static_cast<size_t>(size)};
+
+  // Set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&i_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&w_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
+  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(uint32_t), &size));
+
+  // Allocate memories for input arrays and output arrays.
+  std::vector<float> h_i(i_points);
+  std::vector<float> h_w(w_points);
+  std::vector<float> h_o(o_points, 0.0f);
+
+  // Generate input values: random data inside the image, zeros in the border.
+  for (int32_t y = -1; y < size+1; ++y) {
+    for (int32_t x = -1; x < size+1; ++x) {
+      if (x >= 0 && x < size && y >= 0 && y < size) {
+        h_i[(y+1) * (size+2) + (x+1)] = static_cast<float>(rand()) / RAND_MAX;
+      } else {
+        h_i[(y+1) * (size+2) + (x+1)] = 0;
+      }
+    }
+  }
+  for (uint32_t i = 0; i < w_points; ++i) {
+    h_w[i] = static_cast<float>(rand()) / RAND_MAX;
+  }
+
+  // Creating command queue
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, i_nbytes, h_i.data(), 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, w_memobj, CL_TRUE, 0, w_nbytes, h_w.data(), 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  // Floating-point milliseconds, so runs shorter than 1 ms are not reported as 0.
+  double elapsed = std::chrono::duration<double, std::milli>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, o_nbytes, h_o.data(), 0, NULL, NULL));
+
+  printf("Verify result\n");
+  std::vector<float> ref_vec(o_points);
+  convolution_cpu(ref_vec.data(), h_i.data(), h_w.data(), size, size);
+  int errors = 0;
+  for (uint32_t i = 0; i < o_points; ++i) {
+    if (!compare_equal(h_o[i], ref_vec[i])) {
+      // Only the first 100 mismatches are printed; all of them are counted.
+      if (errors < 100)
+        printf("*** error: [%u] expected=%f, actual=%f\n", i, ref_vec[i], h_o[i]);
+      ++errors;
+    }
+  }
+  if (errors != 0) {
+    printf("FAILED! - %d errors\n", errors);
+  } else {
+    printf("PASSED!\n");
+  }
+
+  // Clean up
+  cleanup();
+
+  // The exit status keeps only its low 8 bits, so returning the raw count would
+  // report success for 256, 512, ... mismatches (e.g. all 1024 outputs wrong at -n32).
+  return (errors != 0) ? 1 : 0;
+}
--- a/tests/opencl/matmul/Makefile
+++ b/tests/opencl/matmul/Makefile
@ -2,6 +2,6 @@ PROJECT = matmul

 SRCS = main.cc

-OPTS ?= -n16
+OPTS ?= -n32

 include ../common.mk
--- a/tests/opencl/matmul/kernel.cl
+++ b/tests/opencl/matmul/kernel.cl
@ -7,43 +7,41 @@ __kernel void matmul(__global float *A,
 {
    int globalRow = get_global_id(1);
    int globalCol = get_global_id(0);
-    int localRow = get_local_id(1);
-    int localCol = get_local_id(0);
+    int localRow  = get_local_id(1);
+    int localCol  = get_local_id(0);
    int localSize = get_local_size(0);  // assuming square local size

    float sum = 0.0f;

-    // Load initial blocks of A and B into local memory
-    int k = 0;
-    localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
-    localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
+    // Loop over all blocks of both matrices
+    for (int k = 0; k < N; k += localSize) {
+        // Load block of matrix A to local memory
+        localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];

-    // Iterate over blocks
-    for (k = 0; k < N; k += 16) {
-        // Ensure the initial block is loaded
+        // Load block of matrix B to local memory, adjusting for column-major access
+        localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
+
+        // Synchronize to make sure the tiles are loaded
        barrier(CLK_LOCAL_MEM_FENCE);

-        // Compute multiplication for this block
-        for (int j = 0; j < 16; j++) {
+        // Multiply the two matrix blocks and accumulate result
+        for (int j = 0; j < localSize; j++) {
            sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
        }
-
-        // Load the next block of matrix A into local memory
-        if (k + 16 < N) {
-            localA[localRow * localSize + localCol] = A[globalRow * N + k + 16 + localCol];
-            localB[localRow * localSize + localCol] = B[(k + 16 + localRow) * N + globalCol];
-        }
    }

    C[globalRow * N + globalCol] = sum;
 }

-/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
+/*__kernel void matmul(__global float *A, 
+                       __global float *B, 
+                       __global float *C, 
+                       const unsigned int N)
 {
    int globalRow = get_global_id(1);
    int globalCol = get_global_id(0);
-    int localRow = get_local_id(1);
-    int localCol = get_local_id(0);
+    int localRow  = get_local_id(1);
+    int localCol  = get_local_id(0);

    // Static local memory declaration
    __local float localA[16][16];
@ -51,26 +49,21 @@ __kernel void matmul(__global float *A,

    float sum = 0.0f;

-    // Load initial blocks of A and B into local memory
-    int k = 0;
-    localA[localRow][localCol] = A[globalRow * N + k + localCol];
-    localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
-
    // Iterate over blocks
-    for (k = 0; k < N; k += 16) {
-        // Ensure the initial block is loaded
+    for (int k = 0; k < N; k += 16) {
+        // Load a block of matrix A into local memory
+        localA[localRow][localCol] = A[globalRow * N + k + localCol];
+
+        // Load a block of matrix B into local memory
+        localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
+
+        // Ensure the entire block is loaded
        barrier(CLK_LOCAL_MEM_FENCE);

        // Compute multiplication for this block
        for (int j = 0; j < 16; j++) {
            sum += localA[localRow][j] * localB[j][localCol];
        }
-
-        // Load the next block of matrix A into local memory
-        if (k + 16 < N) {
-            localA[localRow][localCol] = A[globalRow * N + k + 16 + localCol];
-            localB[localRow][localCol] = B[(k + 16 + localRow) * N + globalCol];
-        }
    }

    C[globalRow * N + globalCol] = sum;
--- a/tests/opencl/matmul/main.cc
+++ b/tests/opencl/matmul/main.cc
@ -10,6 +10,8 @@

 #define LOCAL_SIZE 16

+#define FLOAT_ULP 6
+
 #define KERNEL_NAME "matmul"

 #define CL_CHECK(_expr)                                                \
@ -56,15 +58,16 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
  return 0;
 }

-static bool compare_equal(float a, float b, int ulp = 21) {
-  union fi_t { int i; float f; };
+static bool compare_equal(float a, float b) {
+  union fi_t { float f; int32_t i; };
  fi_t fa, fb;
  fa.f = a;
  fb.f = b;
-  return std::abs(fa.i - fb.i) <= ulp;
+  auto d = std::abs(fa.i - fb.i);
+  return d <= FLOAT_ULP;
 }

-static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
+static void matmul_cpu(float *C, float *A, float *B, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
@ -98,7 +101,7 @@ static void cleanup() {
  if (kernel_bin) free(kernel_bin);
 }

-int size = 64;
+int size = 32;

 static void show_usage() {
  printf("Usage: [-n size] [-h: help]\n");
@ -106,7 +109,7 @@ static void show_usage() {

 static void parse_args(int argc, char **argv) {
  int c;
-  while ((c = getopt(argc, argv, "fn:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
    switch (c) {
    case 'n':
      size = atoi(optarg);
@ -127,6 +130,8 @@ int main (int argc, char **argv) {
  // parse command arguments
  parse_args(argc, argv);

+  uint32_t num_points = size * size;
+
  printf("Matrix size=%d\n", size);
  if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
    printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
@ -148,7 +153,7 @@ int main (int argc, char **argv) {
  printf("Using device: %s\n", device_string);

  printf("Allocate device buffers\n");
-  size_t nbytes = size * size * sizeof(float);
+  size_t nbytes = num_points * sizeof(float);
  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
@ -176,32 +181,26 @@ int main (int argc, char **argv) {
  // Create kernel
  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

-  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
  size_t global_size[2] = {size, size};
+  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};  

  // Set kernel arguments
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
-  //CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
-  //CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
+  CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
+  CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));

 // Allocate memories for input arrays and output arrays.
- std::vector<float> h_a(size * size);
- std::vector<float> h_b(size * size);
- std::vector<float> h_c(size * size);
+ std::vector<float> h_a(num_points);
+ std::vector<float> h_b(num_points);
+ std::vector<float> h_c(num_points);
 	
-  // Initialize values for array members.  
-  for (int i = 0; i < (size * size); ++i) {
-  #ifdef USE_FLOAT
-    h_a[i] = (float)rand() / (float)RAND_MAX;
-    h_b[i] = (float)rand() / (float)RAND_MAX;
-  #else
-    h_a[i] = rand();
-    h_b[i] = rand();
-  #endif
-    h_c[i] = 0xdeadbeef;
+  // Generate input values
+  for (uint32_t i = 0; i < num_points; ++i) {
+    h_a[i] = static_cast<float>(rand()) / RAND_MAX;
+    h_b[i] = static_cast<float>(rand()) / RAND_MAX;
  }

  // Creating command queue
@ -223,10 +222,10 @@ int main (int argc, char **argv) {
  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));

  printf("Verify result\n");
-  std::vector<float> ref_vec(size * size);
-  matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
+  std::vector<float> ref_vec(num_points);
+  matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size);
  int errors = 0;
-  for (int i = 0; i < (size * size); i++) {
+  for (uint32_t i = 0; i < num_points; ++i) {
    if (!compare_equal(h_c[i], ref_vec[i])) {
      if (errors < 100) 
        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
--- a/tests/opencl/oclprintf/main.cc
+++ b/tests/opencl/oclprintf/main.cc
@ -143,7 +143,7 @@ int main (int argc, char **argv) {
  // Allocate memories for input arrays and output arrays.    
  h_a = (int*)malloc(nbytes);
 	
-  // Initialize values for array members.  
+  // Generate input values
  for (int i = 0; i < size; ++i) {
    h_a[i] = -1 + i;
  }
--- a/tests/opencl/psort/main.cc
+++ b/tests/opencl/psort/main.cc
@ -155,9 +155,8 @@ int main (int argc, char **argv) {
  h_a = (int*)malloc(nbytes);
  h_c = (int*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
+  // Generate input values
  for (int i = 0; i < size; ++i) {
-    h_c[i] = 0xdeadbeef;
    if (float_enable) {
      float value = sinf(i)*sinf(i);
      h_a[i] = *(int*)&value;
--- a/tests/opencl/sgemm/common.h
+++ b/tests/opencl/sgemm/common.h
@ -1,12 +1,8 @@
 #ifndef COMMON_H
 #define COMMON_H

-#define USE_FLOAT
-
-#ifdef USE_FLOAT
+#ifndef TYPE
 #define TYPE float
-#else
-#define TYPE int
 #endif

 #endif // COMMON_H
--- a/tests/opencl/sgemm/main.cc
+++ b/tests/opencl/sgemm/main.cc
@ -11,6 +11,8 @@

 #define KERNEL_NAME "sgemm"

+#define FLOAT_ULP 6
+
 #define CL_CHECK(_expr)                                                \
   do {                                                                \
     cl_int _err = _expr;                                              \
@ -33,6 +35,66 @@
     _ret;                                                             \
   })

+// Per-type helpers for generating test inputs and checking results.
+// The primary template is empty; only the specializations below are usable.
+template <typename Type>
+class Comparator {};
+
+// Integer flavour: full-range random inputs, exact comparison.
+template <>
+class Comparator<int> {
+public:
+  // Name of the element type, for log output.
+  static const char* type_str() {
+    return "integer";
+  }
+  // Returns the next value from rand().
+  static int generate() {
+    return rand();
+  }
+  // Compares a computed value `a` with the reference `b` (the order the caller in
+  // main() uses). Returns true when they are equal; otherwise prints the mismatch
+  // while `errors` (mismatches reported so far) is below 100, and returns false.
+  static bool compare(int a, int b, int index, int errors) {
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+// Float flavour: random inputs in [0, 1], comparison within FLOAT_ULP units in the last place.
+template <>
+class Comparator<float> {
+public:
+  // Name of the element type, for log output.
+  static const char* type_str() {
+    return "float";
+  }
+  // Returns a pseudo-random value in [0, 1]. The return type must be float:
+  // returning int would truncate every value to 0 or 1.
+  static float generate() {
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  // Compares a computed value `a` with the reference `b` (the order the caller in
+  // main() uses). Returns true when they are at most FLOAT_ULP representable floats
+  // apart; otherwise prints the mismatch while `errors` is below 100, and returns false.
+  static bool compare(float a, float b, int index, int errors) {
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
+    // Map the sign-magnitude bit patterns to signed ordinals in 64 bits, so the
+    // distance is meaningful across zero and the subtraction cannot overflow.
+    int64_t oa = (fa.i < 0) ? -static_cast<int64_t>(fa.i & 0x7fffffff) : static_cast<int64_t>(fa.i);
+    int64_t ob = (fb.i < 0) ? -static_cast<int64_t>(fb.i & 0x7fffffff) : static_cast<int64_t>(fb.i);
+    int64_t d = (oa > ob) ? (oa - ob) : (ob - oa);
+    if (d > FLOAT_ULP) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+/*static void sgemm_cpu(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      TYPE acc = 0;
+      for (int k = 0; k < K; ++k) {
+          acc += A[k * M + m] * B[n * K + k];
+      }
+      C[n * M + m] = acc;
+    }
+  }
+}*/
+
 static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || 0 == size)
    return -1;
@ -54,32 +116,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
  return 0;
 }

-/*static void matmul(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      TYPE acc = 0;
-      for (int k = 0; k < K; ++k) {
-          acc += A[k * M + m] * B[n * K + k];
-      }
-      C[n * M + m] = acc;
-    }
-  }
-}*/
-
-#ifdef USE_FLOAT
-static bool compare_equal(float a, float b, int ulp = 21) {
-  union fi_t { int i; float f; };
-  fi_t fa, fb;
-  fa.f = a;
-  fb.f = b;
-  return std::abs(fa.i - fb.i) <= ulp;
-}
-#else
-static bool compare_equal(int a, int b, int ulp = 21) {
-  return (a == b);
-}
-#endif
-
 cl_device_id device_id = NULL;
 cl_context context = NULL;
 cl_command_queue commandQueue = NULL;
@ -145,6 +181,8 @@ int main (int argc, char **argv) {
  // parse command arguments
  parse_args(argc, argv);

+  uint32_t num_points = size * size;
+
  cl_platform_id platform_id;
  size_t kernel_size;
  cl_int binary_status;
@ -163,7 +201,7 @@ int main (int argc, char **argv) {
  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));

  // Allocate device buffers
-  size_t nbytes = size * size * sizeof(TYPE);
+  size_t nbytes = num_points * sizeof(TYPE);
  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
@ -194,23 +232,17 @@ int main (int argc, char **argv) {
  h_b = (TYPE*)malloc(nbytes);
  h_c = (TYPE*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
-  for (int i = 0; i < (size * size); ++i) {
-  #ifdef USE_FLOAT
-    h_a[i] = (float)rand() / (float)RAND_MAX;
-    h_b[i] = (float)rand() / (float)RAND_MAX;
-  #else
-    h_a[i] = rand();
-    h_b[i] = rand();
-  #endif
-    h_c[i] = 0xdeadbeef;
+  // Generate input values 
+  for (uint32_t i = 0; i < num_points; ++i) {
+    h_a[i] = Comparator<TYPE>::generate();
+    h_b[i] = Comparator<TYPE>::generate();
  }

  size_t global_offset[2] = {0, 0};
  size_t global_work_size[2] = {size, size};
  size_t local_work_size[2] = {1, 1};

-  std::vector<float> ref_vec(size * size);
+  std::vector<float> ref_vec(num_points);

  // reference generation
  size_t num_groups_y = global_work_size[1] / local_work_size[1];
@ -228,12 +260,7 @@ int main (int argc, char **argv) {
          TYPE acc = 0;
          for (int k = 0; k < width; k++) {
            acc += h_a[k * width + r] * h_b[c * width + k];
-          }
-        /*#ifdef USE_FLOAT
-          printf("*** r=%d, c=%d, v=%f\n", r, c, acc);
-        #else
-          printf("*** r=%d, c=%d, v=%d\n", r, c, acc);
-        #endif*/                   
+          }                  
          ref_vec[c * width + r] = acc;         
        }
      }
@ -260,14 +287,8 @@ int main (int argc, char **argv) {

  printf("Verify result\n");
  int errors = 0;
-  for (int i = 0; i < (size * size); i++) {
-    if (!compare_equal(h_c[i], ref_vec[i])) {
-      if (errors < 100) 
-      #ifdef USE_FLOAT
-        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
-      #else
-        printf("*** error: [%d] expected=%d, actual=%d\n", i, ref_vec[i], h_c[i]);
-      #endif
+  for (uint32_t i = 0; i < num_points; ++i) {
+    if (!Comparator<TYPE>::compare(h_c[i], ref_vec[i], i, errors)) {
      ++errors;
    }
  }
--- a/tests/opencl/vecadd/main.cc
+++ b/tests/opencl/vecadd/main.cc
@ -166,12 +166,10 @@ int main (int argc, char **argv) {
  h_b = (float*)malloc(nbytes);
  h_c = (float*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
+  // Generate input values
  for (int i = 0; i < size; ++i) {
    h_a[i] = sinf(i)*sinf(i);
    h_b[i] = cosf(i)*cosf(i);
-    h_c[i] = 0xdeadbeef;
-    //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
  }

  // Creating command queue
--- a/tests/regression/demo/main.cpp
+++ b/tests/regression/demo/main.cpp
@ -19,16 +19,6 @@

 ///////////////////////////////////////////////////////////////////////////////

-union Float_t {    
-    float f;
-    int   i;
-    struct {
-        uint32_t man  : 23;
-        uint32_t exp  : 8;
-        uint32_t sign : 1;
-    } parts;
-};
-
 template <typename Type>
 class Comparator {};

@ -38,22 +28,41 @@ public:
  static const char* type_str() {
    return "integer";
  }
-  static bool compare(int a, int b) { 
-    return a == b; 
+  // Returns the next value from rand().
+  // NOTE(review): values span the full rand() range, so the sum of two of them can
+  // exceed INT_MAX — confirm the reference computation tolerates that.
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
  }  
 };

 template <>
 class Comparator<float> {
+private:
+  union Float_t { float f; int i; };
 public:
  static const char* type_str() {
    return "float";
  }
-  static bool compare(float a, float b) { 
-    Float_t fa{a}, fb{b};
+  // Returns a pseudo-random value in [0, 1]. The return type must be float:
+  // returning int would truncate every value to 0 or 1.
+  static float generate() {
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) {     
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
    auto d = std::abs(fa.i - fb.i);
    if (d > FLOAT_ULP) {
-      std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
      return false;
    }
    return true;
@ -127,9 +136,7 @@ int run_test(const kernel_arg_t& kernel_arg,
    for (uint32_t i = 0; i < num_points; ++i) {
      auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
      auto cur = buf_ptr[i];
-      if (!Comparator<TYPE>::compare(cur, ref)) {
-        std::cout << "error at result #" << std::dec << i
-                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
        ++errors;
      }
    }
@ -196,8 +203,7 @@ int main(int argc, char *argv[]) {
  // generate source data
  source_data.resize(2 * num_points);
  for (uint32_t i = 0; i < source_data.size(); ++i) {
-    auto r = static_cast<float>(std::rand()) / RAND_MAX;
-    source_data[i] = static_cast<TYPE>(r * 2 * num_points);
+    source_data[i] = Comparator<TYPE>::generate();
  }

  // upload source buffer0
--- a/tests/regression/tensor/Makefile
+++ b/tests/regression/tensor/Makefile
@ -4,6 +4,6 @@ SRCS = main.cpp

 VX_SRCS = kernel.cpp

-OPTS ?= -s16
+OPTS ?= -n32

 include ../common.mk
--- a/tests/regression/tensor/kernel.cpp
+++ b/tests/regression/tensor/kernel.cpp
@ -12,10 +12,10 @@ inline uint32_t log2_fast(uint32_t x) {
 }

 void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
-	auto size  = arg->size;
-    auto A = reinterpret_cast<TYPE*>(arg->A_addr);
+	auto A = reinterpret_cast<TYPE*>(arg->A_addr);
 	auto B = reinterpret_cast<TYPE*>(arg->B_addr);
 	auto C = reinterpret_cast<TYPE*>(arg->C_addr);
+    auto size  = arg->size;

    uint32_t row, col;
    if (is_log2(size)) {
--- a/tests/regression/tensor/main.cpp
+++ b/tests/regression/tensor/main.cpp
@ -19,16 +19,6 @@

 ///////////////////////////////////////////////////////////////////////////////

-union Float_t {    
-    float f;
-    int   i;
-    struct {
-        uint32_t man  : 23;
-        uint32_t exp  : 8;
-        uint32_t sign : 1;
-    } parts;
-};
-
 template <typename Type>
 class Comparator {};

@ -38,8 +28,17 @@ public:
  static const char* type_str() {
    return "integer";
  }
-  static bool compare(int a, int b) { 
-    return a == b; 
+  // Returns the next value from rand().
+  // NOTE(review): values span the full rand() range; arithmetic on them can exceed
+  // INT_MAX — confirm any caller tolerates that.
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
  }  
 };

@ -49,18 +48,26 @@ public:
  static const char* type_str() {
    return "float";
  }
-  static bool compare(float a, float b) { 
-    Float_t fa{a}, fb{b};
+  // Returns a pseudo-random value in [0, 1]. The return type must be float:
+  // returning int would truncate every value to 0 or 1.
+  static float generate() {
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) { 
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
    auto d = std::abs(fa.i - fb.i);
    if (d > FLOAT_ULP) {
-      std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
      return false;
    }
    return true;
  }  
 };

-static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
+static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
  for (uint32_t row = 0; row < height; ++row) {
    for (uint32_t col = 0; col < width; ++col) {
      TYPE sum(0);
@ -73,7 +80,7 @@ static void cpuMatrixMultiply(TYPE* out, const TYPE* A, const TYPE* B, uint32_t
 }

 const char* kernel_file = "kernel.bin";
-uint32_t size = 16;
+uint32_t size = 32;

 vx_device_h device = nullptr;
 std::vector<uint8_t> staging_buf;
@ -81,14 +88,14 @@ kernel_arg_t kernel_arg = {};

 static void show_usage() {
   std::cout << "Vortex Test." << std::endl;
-   std::cout << "Usage: [-k: kernel] [-s size] [-h: help]" << std::endl;
+   std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl;
 }

 static void parse_args(int argc, char **argv) {
  int c;
-  while ((c = getopt(argc, argv, "s:k:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
    switch (c) {
-    case 's':
+    case 'n':
      size = atoi(optarg);
      break;
    case 'k':
@ -138,9 +145,7 @@ int run_test(const kernel_arg_t& kernel_arg,
    for (uint32_t i = 0; i < refs.size(); ++i) {
      auto ref = refs[i];
      auto cur = buf_ptr[i];
-      if (!Comparator<TYPE>::compare(cur, ref)) {
-        std::cout << "error at result #" << std::dec << i
-                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
        ++errors;
      }
    }
@ -208,7 +213,7 @@ int main(int argc, char *argv[]) {
    src_A[i] = static_cast<TYPE>(a * size);
    src_B[i] = static_cast<TYPE>(b * size);
  }
-  cpuMatrixMultiply(refs.data(), src_A.data(), src_B.data(), size, size);
+  matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size);

  // upload source buffer0
  {