new conv3x regression test

2025-04-23 21:39:10 -04:00 · 2024-03-26 16:30:47 -07:00 · 2024-03-26 16:30:47 -07:00 · 8ab4c53e27
commit 8ab4c53e27
parent c8dd0aafb0
5 changed files with 387 additions and 0 deletions
--- a/tests/regression/Makefile
+++ b/tests/regression/Makefile
@ -11,6 +11,7 @@ all:
 	$(MAKE) -C no_mf_ext
 	$(MAKE) -C vecaddx
 	$(MAKE) -C sgemmx
+	$(MAKE) -C conv3x

 run-simx:
 	$(MAKE) -C basic run-simx
@ -25,6 +26,7 @@ run-simx:
 	$(MAKE) -C no_mf_ext run-simx
 	$(MAKE) -C vecaddx run-simx
 	$(MAKE) -C sgemmx run-simx
+	$(MAKE) -C conv3x run-simx

 run-rtlsim:
 	$(MAKE) -C basic run-rtlsim
@ -39,6 +41,7 @@ run-rtlsim:
 	$(MAKE) -C no_mf_ext run-rtlsim
 	$(MAKE) -C vecaddx run-rtlsim
 	$(MAKE) -C sgemmx run-rtlsim
+	$(MAKE) -C conv3x run-rtlsim

 run-opae:
 	$(MAKE) -C basic run-opae
@ -53,6 +56,7 @@ run-opae:
 	$(MAKE) -C no_mf_ext run-opae
 	$(MAKE) -C vecaddx run-opae
 	$(MAKE) -C sgemmx run-opae
+	$(MAKE) -C conv3x run-opae

 clean:
 	$(MAKE) -C basic clean
@ -67,6 +71,7 @@ clean:
 	$(MAKE) -C no_mf_ext clean
 	$(MAKE) -C vecaddx clean
 	$(MAKE) -C sgemmx clean
+	$(MAKE) -C conv3x clean

 clean-all:
 	$(MAKE) -C basic clean-all
@ -81,3 +86,4 @@ clean-all:
 	$(MAKE) -C no_mf_ext clean-all
 	$(MAKE) -C vecaddx clean-all
 	$(MAKE) -C sgemmx clean-all
+	$(MAKE) -C conv3x clean-all
--- a/tests/regression/conv3x/Makefile
+++ b/tests/regression/conv3x/Makefile
@ -0,0 +1,9 @@
+# Build settings for the conv3x regression test. The variables below are
+# presumably consumed by ../common.mk (not visible here -- confirm there).
+PROJECT = conv3x
+
+# Host-side program source.
+SRCS = main.cpp
+
+# Device-side kernel source.
+VX_SRCS = kernel.cpp
+
+# Default test options; ?= keeps a value already supplied by the caller.
+# main.cpp parses -n as the image size.
+OPTS ?= -n64
+
+include ../common.mk
--- a/tests/regression/conv3x/common.h
+++ b/tests/regression/conv3x/common.h
@ -0,0 +1,20 @@
+// Definitions shared by the conv3x host program (main.cpp) and the device
+// kernel (kernel.cpp).
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+// Fixed-width integer types used by kernel_arg_t. Included here so the header
+// does not rely on the including file having pulled them in first.
+#include <stdint.h>
+
+// Device address where the host writes kernel_arg_t and the kernel reads it.
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
+
+// Element type of the input, weight and output buffers. Can be overridden by
+// defining TYPE before this header is included.
+#ifndef TYPE
+#define TYPE float
+#endif
+
+// Arguments passed from the host to the kernel.
+typedef struct {
+  // number of tasks the kernel spawns (the host sets it to width * width)
+  uint32_t num_tasks;
+  // output width in elements
+  uint32_t width;
+  // log2(width); only used by the kernel when width is a power of two
+  uint32_t log2_width;
+  // local-memory address for the weights, or 0 to read them from W_addr
+  uint64_t lmem_addr;
+  // device address of the padded input, (width + 2) elements per row
+  uint64_t I_addr;
+  // device address of the 3x3 weights
+  uint64_t W_addr;
+  // device address of the output
+  uint64_t O_addr;
+} kernel_arg_t;
+
+#endif
--- a/tests/regression/conv3x/kernel.cpp
+++ b/tests/regression/conv3x/kernel.cpp
@ -0,0 +1,59 @@
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_spawn.h>
+#include "common.h"
+
+// Returns non-zero when x has at most one bit set, i.e. x is zero or a power
+// of two.
+inline char is_log2(uint32_t x) {
+    const uint32_t below = x - 1;
+    return (x & below) == 0;
+}
+
+// Computes one output element of a 3x3 convolution.
+//
+// task_id: linear index of the output element (row * width + col).
+// arg:     kernel arguments written by the host.
+//
+// The input has (width + 2) elements per row, so output (row, col) is centred
+// on padded input (row + 1, col + 1). Weights are read from arg->lmem_addr
+// when it is non-zero, otherwise from arg->W_addr.
+void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
+    auto I = reinterpret_cast<TYPE*>(arg->I_addr);
+    auto W = reinterpret_cast<TYPE*>((arg->lmem_addr != 0) ? arg->lmem_addr : arg->W_addr);
+    auto O = reinterpret_cast<TYPE*>(arg->O_addr);
+    auto width = arg->width;
+
+    // Split the linear task id into output coordinates. A shift and a mask
+    // replace the division when the width is a power of two.
+    uint32_t row, col;
+    if (is_log2(width)) {
+        row = task_id >> arg->log2_width;
+        col = task_id & (width-1);
+    } else {
+        row = task_id / width;
+        // This assignment was missing, leaving col uninitialized for any
+        // width that is not a power of two.
+        col = task_id % width;
+    }
+
+    // Adjust for padded borders
+    int paddedWidth = width + 2;
+    int paddedX = col + 1;
+    int paddedY = row + 1;
+
+    // Compute 3x3 convolution sum. The accumulator uses TYPE (not a fixed
+    // float) so it matches the buffers and the CPU reference in main.cpp.
+    TYPE sum = 0;
+
+    sum += I[(paddedY - 1) * paddedWidth + (paddedX - 1)] * W[0]; // Top-left
+    sum += I[(paddedY - 1) * paddedWidth + paddedX] * W[1];       // Top-center
+    sum += I[(paddedY - 1) * paddedWidth + (paddedX + 1)] * W[2]; // Top-right
+
+    sum += I[paddedY * paddedWidth + (paddedX - 1)] * W[3];       // Middle-left
+    sum += I[paddedY * paddedWidth + paddedX] * W[4];             // Center
+    sum += I[paddedY * paddedWidth + (paddedX + 1)] * W[5];       // Middle-right
+
+    sum += I[(paddedY + 1) * paddedWidth + (paddedX - 1)] * W[6]; // Bottom-left
+    sum += I[(paddedY + 1) * paddedWidth + paddedX] * W[7];       // Bottom-center
+    sum += I[(paddedY + 1) * paddedWidth + (paddedX + 1)] * W[8]; // Bottom-right
+
+    O[row * width + col] = sum;
+}
+
+// Kernel entry point: when a local-memory address is supplied, copies the
+// nine weights there first, then spawns arg->num_tasks kernel_body tasks.
+int main() {
+    auto kargs = reinterpret_cast<kernel_arg_t*>(KERNEL_ARG_DEV_MEM_ADDR);
+    if (kargs->lmem_addr != 0) {
+        // stage the 3x3 weights from W_addr into local memory
+        auto src = reinterpret_cast<TYPE*>(kargs->W_addr);
+        auto dst = reinterpret_cast<TYPE*>(kargs->lmem_addr);
+        for (int k = 0; k < 9; ++k) {
+            dst[k] = src[k];
+        }
+    }
+    vx_spawn_tasks(kargs->num_tasks, (vx_spawn_tasks_cb)kernel_body, kargs);
+    return 0;
+}
--- a/tests/regression/conv3x/main.cpp
+++ b/tests/regression/conv3x/main.cpp
@ -0,0 +1,293 @@
+#include <iostream>
+#include <unistd.h>
+#include <string.h>
+#include <vector>
+#include <chrono>
+#include <vortex.h>
+#include <cmath>
+#include "common.h"
+
+#define FLOAT_ULP 6
+
+// Evaluates _expr once. If the result is non-zero, prints the failing
+// expression text and its return value, calls cleanup() and exits with
+// status -1. cleanup() must be declared before the macro is used.
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = _expr;                                          \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+	 cleanup();			                                              \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Per-type helper for naming, generating and comparing test values. The
+// primary template is empty; members come from explicit specializations.
+template <typename Type>
+class Comparator {};
+
+// Comparator for integer data: values must be exactly equal.
+template <>
+class Comparator<int> {
+public:
+  // Name of the data type, as printed by the host program.
+  static const char* type_str() {
+    return "integer";
+  }
+  // Next pseudo-random value from rand().
+  static int generate() {
+    return rand();
+  }
+  // Returns true when a == b. On a mismatch, reports it while fewer than 100
+  // errors have been counted so far, then returns false.
+  static bool compare(int a, int b, int index, int errors) {
+    if (a == b) {
+      return true;
+    }
+    if (errors < 100) {
+      printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+    }
+    return false;
+  }
+};
+
+// Comparator for float data: two values match when they are equal or their
+// bit patterns are at most FLOAT_ULP apart.
+template <>
+class Comparator<float> {
+public:
+  // Name of the data type, as printed by the host program.
+  static const char* type_str() {
+    return "float";
+  }
+  // Pseudo-random value in [0, 1]. The return type is float: the previous
+  // int return type truncated the result to 0 or 1.
+  static float generate() {
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  // Returns true when a and b match (see class comment). On a mismatch,
+  // reports it while fewer than 100 errors have been counted, then returns
+  // false.
+  static bool compare(float a, float b, int index, int errors) {
+    // Equal values (including +0.0 vs -0.0) match regardless of bit pattern.
+    if (a == b) {
+      return true;
+    }
+    // Read the bit patterns with memcpy rather than through a union.
+    int32_t ia, ib;
+    memcpy(&ia, &a, sizeof(ia));
+    memcpy(&ib, &b, sizeof(ib));
+    // Widen before subtracting: the 32-bit difference overflows when the two
+    // operands have opposite signs.
+    int64_t d = static_cast<int64_t>(ia) - static_cast<int64_t>(ib);
+    if (d < 0) {
+      d = -d;
+    }
+    if (d > FLOAT_ULP) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+// CPU reference: 3x3 convolution over an input that carries a one-element
+// border.
+//   O: output, width * height elements
+//   I: input, (width + 2) elements per row
+//   W: 3x3 weights in row-major order
+static void convolution_cpu(TYPE *O, TYPE *I, TYPE *W, int32_t width, int32_t height) {
+  const int stride = width + 2;
+  for (int32_t row = 0; row < height; ++row) {
+    for (int32_t col = 0; col < width; ++col) {
+      // Top-left tap of the window centred on padded (row + 1, col + 1).
+      TYPE *window = I + row * stride + col;
+      TYPE acc(0);
+      // Taps are visited row by row, left to right, matching W's layout.
+      for (int32_t tap = 0; tap < 9; ++tap) {
+        const int32_t wy = tap / 3;
+        const int32_t wx = tap % 3;
+        acc += window[wy * stride + wx] * W[tap];
+      }
+      O[row * width + col] = acc;
+    }
+  }
+}
+
+// Settings configurable from the command line (see parse_args):
+// kernel binary to upload (-k)
+const char* kernel_file = "kernel.bin";
+// image width and height in elements (-n)
+int size = 32;
+// when true, the kernel is given a local-memory address for the weights (-l)
+bool use_lmem = false;
+
+// State shared between main() and cleanup().
+vx_device_h device = nullptr;
+std::vector<uint8_t> staging_buf;
+kernel_arg_t kernel_arg = {};
+
+// Prints the program banner followed by the supported command-line options.
+static void show_usage() {
+   std::cout << "Vortex Test." << std::endl
+             << "Usage: [-k kernel] [-l: local memory] [-n size] [-h|?: help]" << std::endl;
+}
+
+// Parses the command line into the global settings:
+//   -n size    image width/height, must be a positive integer
+//   -l         use local memory for the weights
+//   -k kernel  kernel binary to upload
+//   -h, -?     print usage and exit
+// Exits with status 0 after printing help and -1 on an invalid option value.
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:lh?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      // atoi() yields 0 for non-numeric text. A zero or negative size would
+      // flow into the buffer-size computations in main(), so reject it here.
+      if (size <= 0) {
+        std::cout << "Error: invalid size '" << optarg << "'" << std::endl;
+        show_usage();
+        exit(-1);
+      }
+      break;
+    case 'l':
+      use_lmem = true;
+      break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+// Releases the device buffers and closes the device. Does nothing when no
+// device is open, so it is safe to call more than once.
+void cleanup() {
+  if (device) {
+    vx_mem_free(device, kernel_arg.I_addr);
+    // W_addr is allocated in main() whether or not local memory is used (the
+    // kernel copies the weights from it), so it is always released here.
+    vx_mem_free(device, kernel_arg.W_addr);
+    vx_mem_free(device, kernel_arg.O_addr);
+    vx_dev_close(device);
+    // forget the closed handle so a repeated call becomes a no-op
+    device = nullptr;
+  }
+}
+
+// Host program: builds a random size x size image with a one-element zero
+// border and random 3x3 weights, computes the reference result on the CPU,
+// runs the kernel on the device and compares the two outputs.
+// Returns 0 when every element matches and 1 otherwise. Runtime API failures
+// exit through RT_CHECK.
+int main(int argc, char *argv[]) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  // fixed seed so every run uses the same data
+  std::srand(50);
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  uint32_t num_points = size * size;
+
+  std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
+  std::cout << "matrix size: " << size << "x" << size << std::endl;
+
+  // element counts: output, padded input (one-element border), weights
+  uint32_t o_points = size * size;
+  uint32_t i_points = (size+2) * (size+2);
+  uint32_t w_points = 3 * 3;
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+  size_t i_nbytes = i_points * sizeof(TYPE);
+  size_t w_nbytes = w_points * sizeof(TYPE);
+  size_t o_nbytes = o_points * sizeof(TYPE);
+  RT_CHECK(vx_mem_alloc(device, i_nbytes, &kernel_arg.I_addr));
+  RT_CHECK(vx_mem_alloc(device, o_nbytes, &kernel_arg.O_addr));
+  RT_CHECK(vx_mem_alloc(device, w_nbytes, &kernel_arg.W_addr));
+
+  if (use_lmem) {
+    uint64_t dev_local_mem_size;
+    RT_CHECK(vx_dev_caps(device, VX_CAPS_LOCAL_MEM_SIZE, &dev_local_mem_size));
+    if (w_nbytes > dev_local_mem_size) {
+      std::cout << "Error: Not enough local memory: needed=" << w_nbytes << ", available=" << dev_local_mem_size << std::endl;
+      cleanup();
+      exit(1);
+    }
+    RT_CHECK(vx_dev_caps(device, VX_CAPS_LOCAL_MEM_ADDR, &kernel_arg.lmem_addr));
+    std::cout << "using local memory: base_addr=" << std::hex << kernel_arg.lmem_addr << std::dec << std::endl;
+  } else {
+    // lmem_addr == 0 tells the kernel to read the weights from W_addr
+    kernel_arg.lmem_addr = 0;
+  }
+
+  kernel_arg.num_tasks = num_points;
+  kernel_arg.width = size;
+  kernel_arg.log2_width = log2(size);
+
+  std::cout << "dev_argI=0x" << std::hex << kernel_arg.I_addr << std::endl;
+  std::cout << "dev_argW=0x" << std::hex << kernel_arg.W_addr << std::endl;
+  // restore decimal output so later integer prints are not affected
+  std::cout << "dev_argO=0x" << std::hex << kernel_arg.O_addr << std::dec << std::endl;
+
+  // allocate staging buffer
+  // the padded input is the largest buffer that passes through it
+  std::cout << "allocate staging buffer" << std::endl;
+  uint32_t alloc_size = std::max<uint32_t>(i_nbytes, sizeof(kernel_arg_t));
+  staging_buf.resize(alloc_size);
+
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+
+  // Generate input values
+  // interior elements are random, the one-element border is zero
+  std::vector<TYPE> h_I(i_points);
+  std::vector<TYPE> h_W(w_points);
+  std::vector<TYPE> h_O(o_points);
+  for (int32_t y = -1; y < size+1; ++y) {
+    for (int32_t x = -1; x < size+1; ++x) {
+      if (x >= 0 && x < size && y >= 0 && y < size) {
+        h_I[(y+1) * (size+2) + (x+1)] = static_cast<TYPE>(rand()) / RAND_MAX;
+      } else {
+        h_I[(y+1) * (size+2) + (x+1)] = 0;
+      }
+    }
+  }
+  for (uint32_t i = 0; i < w_points; ++i) {
+    h_W[i] = static_cast<TYPE>(rand()) / RAND_MAX;
+  }
+  // reference result, compared against the device output below
+  convolution_cpu(h_O.data(), h_I.data(), h_W.data(), size, size);
+
+  // upload input buffer
+  {
+    std::cout << "upload source buffer" << std::endl;
+    auto buf_ptr = (TYPE*)staging_buf.data();
+    for (uint32_t i = 0; i < i_points; ++i) {
+      buf_ptr[i] = h_I[i];
+    }
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.I_addr, staging_buf.data(), i_nbytes));
+  }
+
+  // upload weight buffer
+  {
+    std::cout << "upload weight buffer" << std::endl;
+    auto buf_ptr = (TYPE*)staging_buf.data();
+    for (uint32_t i = 0; i < w_points; ++i) {
+      buf_ptr[i] = h_W[i];
+    }
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.W_addr, staging_buf.data(), w_nbytes));
+  }
+
+  // clear destination buffer
+  std::cout << "clear destination buffer" << std::endl;
+  memset(staging_buf.data(), 0, o_nbytes);
+  RT_CHECK(vx_copy_to_dev(device, kernel_arg.O_addr, staging_buf.data(), o_nbytes));
+
+  auto time_start = std::chrono::high_resolution_clock::now();
+
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.O_addr, o_nbytes));
+
+  // verify result
+  std::cout << "verify result" << std::endl;
+  {
+    int errors = 0;
+    auto buf_ptr = (TYPE*)staging_buf.data();
+    for (uint32_t i = 0; i < h_O.size(); ++i) {
+      auto ref = h_O[i];
+      auto cur = buf_ptr[i];
+      // compare() labels its first argument "expected" and its second
+      // "actual", so the CPU reference goes first
+      if (!Comparator<TYPE>::compare(ref, cur, i, errors)) {
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      // release device buffers and the device on the failure path as well
+      cleanup();
+      return 1;
+    }
+  }
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;
+  cleanup();
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}