Fixes for PR

2025-04-23 21:39:10 -04:00 · 2024-06-25 03:18:50 -04:00 · 2024-06-25 03:18:50 -04:00 · 5b0fc8cbd4
commit 5b0fc8cbd4
parent a378aed67c
18 changed files with 77 additions and 91 deletions
--- a/ci/blackbox.sh
+++ b/ci/blackbox.sh
@ -48,8 +48,6 @@ PERF_CLASS=0
 REBUILD=2
 TEMPBUILD=0
 LOGFILE=run.log
-TC_SIZE=567
-TC_NUM=123

 for i in "$@"
 do
@ -182,7 +180,6 @@ then
 fi

 CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
-# CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
 echo "CONFIGS=$CONFIGS"

 if [ $REBUILD -ne 0 ]
--- a/ci/regression.sh.in
+++ b/ci/regression.sh.in
@ -124,7 +124,9 @@ regression()
    # test local barrier
    ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
    ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar"
-    
+
+    # test for matmul
+    CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1" 

    echo "regression tests done!"
 }
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -111,20 +111,20 @@
 `endif
 `define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)

+// Size of Tensor Core
 `ifndef TC_SIZE
-`define TC_SIZE 4
+`define TC_SIZE 8
 `endif

+// Number of TCs per Warp
 `ifndef TC_NUM
-`define TC_NUM 1
+`define TC_NUM 4
 `endif

-// Number of TCU units
 `ifndef NUM_TCU_LANES
 `define NUM_TCU_LANES   `TC_NUM
 `endif

-// Number of TCU units
 `ifndef NUM_TCU_BLOCKS
 `define NUM_TCU_BLOCKS  `ISSUE_WIDTH
 `endif
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@ -196,7 +196,7 @@
 `define VX_CSR_NUM_CORES                12'hFC2
 `define VX_CSR_LOCAL_MEM_BASE           12'hFC3

-`define VX_MAT_MUL_SIZE                 12'hFC4
+`define VX_MAT_MUL_SIZE                 12'hFC4     // VX_MAT_MUL_SIZE = Matrix Size / TC Size
 `define VX_TC_NUM                       12'hFC5
 `define VX_TC_SIZE                      12'hFC6

--- a/kernel/include/vx_intrinsics.h
+++ b/kernel/include/vx_intrinsics.h
@ -222,21 +222,19 @@ inline void vx_fence() {
 }

 //Matrix load
-//Converted instruction type cause destination registers were not getiing blocked otherwise
-inline void mload(unsigned dest, unsigned  addr) 
+inline void vx_matrix_load(unsigned dest, unsigned  addr) 
 {
    asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
 }

-//mat store
-inline void ms(unsigned  addr) 
+//Matrix Store
+inline void vx_matrix_store(unsigned  addr) 
 {
    asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
 }

-//mat mul
-//num tiles along reduced K dimension of matmul as imm value (can use rd,rs field to expand range of n_tiles from 12 bits)
-inline void mm() 
+//Matrix Mul
+inline void vx_matrix_mul() 
 {
    asm volatile (".insn i 0x7b, 2, x0, 0(x0)");
 }
--- a/run_final.sh
+++ b/run_final.sh
@ -1,22 +0,0 @@
-# Define arrays for threads, warps, and matrix sizes
-matrix_sizes=(16 32 64 128 256 512)
-tcsizes=(8 16 32)
-tcnums=(4 8 16 32)
-#lsulanes=(4 16)
-#cores=(32)
-
-
-# Loop through each combination of threads and warps
-for size in "${matrix_sizes[@]}"; do
-    sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" ../tests/regression/matmul/Makefile
-    sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" tests/regression/matmul/Makefile
-    echo "Matrix size changed to ${size} in Makefile"
-    for tcsize in "${tcsizes[@]}"; do
-        for tcnum in "${tcnums[@]}"; do
-            log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
-            command="./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --tc_size=${tcsize} --tc_num=${tcnum} --rebuild=1 --perf=1  > ${log_name} 2>&1"
-            echo "$command"
-            eval "$command"
-        done
-    done    
-done
--- a/runtime/simx/vortex.cpp
+++ b/runtime/simx/vortex.cpp
@ -69,12 +69,12 @@ public:
    case VX_CAPS_NUM_CORES:
      _value = NUM_CORES * NUM_CLUSTERS;
      break;
-    // case VX_CAPS_TC_SIZE:
-    //   _value = TC_SIZE;
-    //   break;
-    // case VX_CAPS_TC_NUM:
-    //   _value = TC_NUM;
-    //   break;
+    case VX_CAPS_TC_SIZE:
+      _value = TC_SIZE;
+      break;
+    case VX_CAPS_TC_NUM:
+      _value = TC_NUM;
+      break;
    case VX_CAPS_CACHE_LINE_SIZE:
      _value = CACHE_BLOCK_SIZE;
      break;
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@ -410,9 +410,9 @@ static const char* op_string(const Instr &instr) {
  case Opcode::TCU:
    switch(func3)
    {
-      case 0: return "ML"; //
-      case 1: return "MS"; //
-      case 2: return "MATMUL";
+      case 0: return "ML";     // Matrix Load
+      case 1: return "MS";     // Matrix Store
+      case 2: return "MATMUL"; // Matrix Multiply
      default:
        std::abort();
    }
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@ -74,7 +74,10 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
    , core_(core)
    , warps_(arch.num_warps(), arch)
    , barriers_(arch.num_barriers(), 0)
-    , scratchpad(std::vector<Word>(32 * 32 * 32768)) //Fix this : Max TC_SIZE = 32
+    // The tradeoff between scratchpad size and performance has not been evaluated yet, so the
+    // scratchpad is assumed to be big enough to hold the input tiles for one output tile.
+    // TODO: set the scratchpad size to an appropriate fixed value in a future version.
+    , scratchpad(std::vector<Word>(32 * 32 * 32768)) 
 {
  this->clear();
 }
@ -360,6 +363,11 @@ Word Emulator::get_tc_size()
  return tc_size;
 }

+Word Emulator::get_tc_num()
+{
+  return tc_num;
+}
+
 Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
  auto core_perf = core_->perf_stats();
  switch (addr) {
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@ -56,7 +56,8 @@ public:
  
  Word get_tiles();
  Word get_tc_size();
-
+  Word get_tc_num();
+  
 private:

  struct ipdom_entry_t {
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@ -1429,8 +1429,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
    uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid);  //CSR instruction before MLOAD will ensure that this csr has value
    int num_data_per_thread;
    int num_data_per_thread_st;
-    int num_threads_actv;
-    int num_threads_actv_st;
+    uint32_t num_threads_actv;
+    uint32_t num_threads_actv_st;
    uint32_t data_bytes_load;
    uint32_t data_bytes_store;
    uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp);
@ -1506,7 +1506,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {

        auto trace_data = std::make_shared<LsuTraceData>(num_threads);
        trace->data = trace_data;
-        uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;

        for (uint32_t t = thread_start; t < num_threads_actv_st; ++t) 
        {
@ -1521,12 +1520,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
          //Store C
          for (int n=0; n<num_data_per_thread_st; n++)
          {
-            uint64_t mem_addr = (base_addr+(n*mem_bytes));
-            uint32_t csr_index = (2*num_data_per_thread_st) + n;
-            uint32_t scratchpad_index = (tc_size*tc_size*2) + (t*num_data_per_thread) + n;
-            
-            //scratchpad -> csr (TODO :: removed intermediate CSR stage ; incorporate limited scratchmad implementation)
-            //core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_);
            Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
            *temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n];

@ -1534,7 +1527,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
          }
        }
        //Clear the scratchpad
-        for(int i =0 ; i < scratchpad.size(); i++)
+        for(long unsigned int i=0 ; i < scratchpad.size(); i++)
        {
          scratchpad[i] = 0;
        }
@ -1545,7 +1538,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
        DP(4, "TCU MULTIPLY MAT");
        trace->fu_type = FUType::TCU;
        trace->tcu_type = TCUType::TCU_MUL;
-        uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
        uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp);
        for (uint32_t t = thread_start; t < num_threads_actv; ++t) 
        {
@ -1556,12 +1548,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
          //TC operation [only 1 thread in 1 warp needs to do this]
          if (t%threads_per_tc == 0)
          {
-            //TODO : change to systolic array implementation
-            uint32_t thread_offset = t*(tc_size*tc_size);
-            int loop_offset = 0;
-            int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
            /*
            // TODO : Fix needed for functional correctness
+            // TODO : change to systolic array implementation
+            uint32_t thread_offset = t*(tc_size*tc_size);
+
+            int loop_offset = 0;
+            int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
+            uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
            for(int tiles = 0 ; tiles < n_tiles ; tiles++)  //What's the HW implication of this?? A counter implementation?
            { 
              for (int i = 0; i < tc_size; i++) { //ROW-1
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@ -255,7 +255,6 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {

 TcuUnit::TcuUnit(const SimContext& ctx, Core* core) 
    : FuncUnit(ctx, core, "TCU")
-    // , tc_size (core_->arch().tc_size())
    {}

 void TcuUnit::tick() {
--- a/sim/simx/func_unit.h
+++ b/sim/simx/func_unit.h
@ -103,7 +103,6 @@ private:
 class TcuUnit : public FuncUnit {
 public:
    TcuUnit(const SimContext& ctx, Core*);
-    // uint64_t tc_size;
    void tick();
 };

--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@ -35,8 +35,6 @@ static void show_usage() {
 uint32_t num_threads = NUM_THREADS;
 uint32_t num_warps = NUM_WARPS;
 uint32_t num_cores = NUM_CORES;
-uint32_t tc_size = TC_SIZE;
-uint32_t  tc_num = TC_NUM;
 bool showStats = false;
 const char* program = nullptr;

--- a/tests/regression/matmul/Makefile
+++ b/tests/regression/matmul/Makefile
@ -9,6 +9,6 @@ SRCS := $(SRC_DIR)/main.cpp

 VX_SRCS := $(SRC_DIR)/kernel.cpp

-OPTS ?= -n512 -d1 -s4 -t4
+OPTS ?= -n128 -d1

 include ../common.mk
--- a/tests/regression/matmul/kernel.cpp
+++ b/tests/regression/matmul/kernel.cpp
@ -107,15 +107,15 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
 		csr_write(VX_TC_NUM,TC_per_warp);
 		csr_write(VX_TC_SIZE,tc_size);

-		mload (0, a_addr_base);
-		mload (1, b_addr_base);
+		vx_matrix_load (0, a_addr_base);
+		vx_matrix_load (1, b_addr_base);
 		//In case of multiple threads - sync load
 		vx_fence();

-		mm();   //Assuming padding to ensure matrix size is a multiple of tc_size
+		vx_matrix_mul();   //Assuming padding to ensure matrix size is a multiple of tc_size
 		vx_fence();
 		if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c)
-			ms(c_addr_base);
+			vx_matrix_store(c_addr_base);
 		//In case of multiple threads - sync store
 		vx_fence();
 	}	
--- a/tests/regression/matmul/main.cpp
+++ b/tests/regression/matmul/main.cpp
@ -21,8 +21,6 @@

 const char* kernel_file = "kernel.vxbin";
 uint32_t matrix_size = 0;
-uint32_t tc_num = 4;
-uint32_t TC_size = 8;

 vx_device_h device = nullptr;
 vx_buffer_h A_buffer = nullptr;
@ -41,7 +39,7 @@ static void show_usage() {

 static void parse_args(int argc, char **argv, uint32_t &data_size) {
  int c;
-  while ((c = getopt(argc, argv, "n:k:d:t:s:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
    switch (c) {
    case 'n':
      matrix_size = atoi(optarg);
@ -52,12 +50,6 @@ static void parse_args(int argc, char **argv, uint32_t &data_size) {
    case 'd':
      data_size = atoi(optarg);
      break; 
-    case 't':
-      tc_num = atoi(optarg);
-      break;
-    case 's':
-      TC_size = atoi(optarg);
-      break; 
    case 'h':
    case '?': {
      show_usage();
@ -151,21 +143,15 @@ int main(int argc, char *argv[]) {
  RT_CHECK(vx_dev_open(&device));

  uint64_t num_cores, num_warps, num_threads;
-  uint32_t tc_size, TC_per_warp;
+  uint64_t tc_size, TC_per_warp;

  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
  
-  std::cout << "Debug :: tc_size (optarg) = " << TC_size << std::endl;
-  std::cout << "Debug :: tc_num (optarg) = " << tc_num << std::endl;
-
  //Add assert/knob
-  tc_size = TC_size;
-  TC_per_warp = tc_num;
-
-  // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
-  // RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));

  std::cout << "Debug :: tc_size = " << tc_size << std::endl;
  std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;
--- a/tests/regression/matmul/matmul_regression.sh
+++ b/tests/regression/matmul/matmul_regression.sh
@ -0,0 +1,26 @@
+#!/bin/bash
+
+# README:
+# Sweeps TC_SIZE, TC_NUM and the matrix size for the matmul test on the simx driver.
+# Fixed settings for every run: NUM_WARPS=32, NUM_THREADS=32, NUM_CORES=4, DATA_SIZE=1.
+# Edit the matrix_sizes, tcsizes and tcnums arrays below to change the sweep limits.
+
+# Sweep values; one log per combination is written under sim_final/mat<size>/
+matrix_sizes=(16 32 64 128 256 512)
+tcsizes=(8 16 32)
+tcnums=(4 8 16 32)
+
+cd ../../../build/ || exit 1  # relative path: run from this script's directory; abort rather than sweep in the wrong place
+
+# Run blackbox.sh once for each combination of the above values
+for size in "${matrix_sizes[@]}"; do
+    for tcsize in "${tcsizes[@]}"; do
+        for tcnum in "${tcnums[@]}"; do
+            mkdir -p "sim_final/mat${size}"
+            log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
+            cmd="CONFIGS=\"-DTC_NUM=${tcnum} -DTC_SIZE=${tcsize}\" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args=\"-n${size} -d1\" --rebuild=1 --perf=1  > ${log_name} 2>&1"
+            echo "$cmd"
+            eval "$cmd"
+        done
+    done
+done