code refactoring to cleanup existing TPU and VPU code in SimX

2025-04-24 05:47:35 -04:00 · 2025-02-02 23:44:31 -08:00 · 2025-02-02 23:44:31 -08:00 · a920025582
commit a920025582
parent 4785736e4d
27 changed files with 1434 additions and 3277 deletions
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -31,6 +31,7 @@
 `endif

 ///////////////////////////////////////////////////////////////////////////////
+
 `ifndef EXT_M_DISABLE
 `define EXT_M_ENABLE
 `endif
@ -113,24 +114,6 @@
 `define SOCKET_SIZE `MIN(4, `NUM_CORES)
 `endif

-// Size of Tensor Core
-`ifndef TC_SIZE
-`define TC_SIZE 8
-`endif
-
-// Number of TCs per Warp
-`ifndef TC_NUM
-`define TC_NUM 4
-`endif
-
-`ifndef NUM_TCU_LANES
-`define NUM_TCU_LANES   `TC_NUM
-`endif
-
-`ifndef NUM_TCU_BLOCKS
-`define NUM_TCU_BLOCKS  `ISSUE_WIDTH
-`endif
-
 `ifdef L2_ENABLE
    `define L2_ENABLED   1
 `else
@ -317,7 +300,8 @@
 `define MEM_PAGE_LOG2_SIZE (12)
 `endif

-// Virtual Memory Configuration ///////////////////////////////////////////////////////
+// Virtual Memory Configuration ///////////////////////////////////////////////
+
 `ifdef VM_ENABLE
    `ifdef XLEN_32
        `ifndef VM_ADDR_MODE
@ -545,6 +529,12 @@
 `define FNCP_PE_RATIO 2
 `endif

+// Tensore Units //////////////////////////////////////////////////////////////
+
+`ifndef NUM_TENSOR_CORES
+`define NUM_TENSOR_CORES `ISSUE_WIDTH
+`endif
+
 // Icache Configurable Knobs //////////////////////////////////////////////////

 // Cache Enable
@ -864,6 +854,12 @@
    `define EXT_ZICOND_ENABLED 0
 `endif

+`ifdef EXT_TPU_ENABLE
+    `define EXT_TPU_ENABLED 1
+`else
+    `define EXT_TPU_ENABLED 0
+`endif
+
 `define ISA_STD_A           0
 `define ISA_STD_C           2
 `define ISA_STD_D           3
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@ -215,10 +215,4 @@
 `define VX_CSR_NUM_CORES                12'hFC2
 `define VX_CSR_LOCAL_MEM_BASE           12'hFC3

-`define VX_MAT_MUL_SIZE                 12'hFC4     // VX_MAT_MUL_SIZE = Matrix Size / TC Size
-`define VX_TC_NUM                       12'hFC5
-`define VX_TC_SIZE                      12'hFC6
-
-
-
 `endif // VX_TYPES_VH
--- a/kernel/include/vx_intrinsics.h
+++ b/kernel/include/vx_intrinsics.h
@ -221,22 +221,28 @@ inline void vx_fence() {
    __asm__ volatile ("fence iorw, iorw");
 }

-//Matrix load
-inline void vx_matrix_load(unsigned dest, unsigned  addr) 
-{
-    asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
+inline unsigned vx_u4_mmadd(unsigned a, unsigned b, unsigned c) {
+    unsigned ret;
+    asm volatile (".insn r4 %1, 0, %2, %0, %3, %4, %5" : "=r"(ret) : "i"(RISCV_CUSTOM1), "i"(0), "r"(a), "r"(b), "r"(c));
+    return ret;
 }

-//Matrix Store
-inline void vx_matrix_store(unsigned  addr) 
-{
-    asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
+inline unsigned vx_u8_mmadd(unsigned a, unsigned b, unsigned c) {
+    unsigned ret;
+    asm volatile (".insn r4 %1, 0, %2, %0, %3, %4, %5" : "=r"(ret) : "i"(RISCV_CUSTOM1), "i"(1), "r"(a), "r"(b), "r"(c));
+    return ret;
 }

-//Matrix Mul
-inline void vx_matrix_mul() 
-{
-    asm volatile (".insn i 0x7b, 2, x0, 0(x0)");
+inline unsigned vx_f16_mmadd(unsigned a, unsigned b, unsigned c) {
+    unsigned ret;
+    asm volatile (".insn r4 %1, 0, %2, %0, %3, %4, %5" : "=r"(ret) : "i"(RISCV_CUSTOM1), "i"(2), "r"(a), "r"(b), "r"(c));
+    return ret;
+}
+
+inline unsigned vx_bf16_mmadd(unsigned a, unsigned b, unsigned c) {
+    unsigned ret;
+    asm volatile (".insn r4 %1, 0, %2, %0, %3, %4, %5" : "=r"(ret) : "i"(RISCV_CUSTOM1), "i"(3), "r"(a), "r"(b), "r"(c));
+    return ret;
 }

 #ifdef __cplusplus
--- a/runtime/include/vortex.h
+++ b/runtime/include/vortex.h
@ -36,8 +36,6 @@ typedef void* vx_buffer_h;
 #define VX_CAPS_ISA_FLAGS           0x7
 #define VX_CAPS_NUM_MEM_BANKS       0x8
 #define VX_CAPS_MEM_BANK_SIZE       0x9
-#define VX_CAPS_TC_SIZE             0xA
-#define VX_CAPS_TC_NUM              0xB

 // device isa flags
 #define VX_ISA_STD_A                (1ull << ISA_STD_A)
--- a/runtime/simx/vortex.cpp
+++ b/runtime/simx/vortex.cpp
@ -94,12 +94,6 @@ public:
    case VX_CAPS_NUM_CORES:
      _value = NUM_CORES * NUM_CLUSTERS;
      break;
-    case VX_CAPS_TC_SIZE:
-      _value = TC_SIZE;
-      break;
-    case VX_CAPS_TC_NUM:
-      _value = TC_NUM;
-      break;
    case VX_CAPS_CACHE_LINE_SIZE:
      _value = CACHE_BLOCK_SIZE;
      break;
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@ -25,6 +25,11 @@ ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
  SRCS += $(SRC_DIR)/vpu.cpp
 endif

+# Add TPU extension sources
+ifneq ($(findstring -DEXT_TPU_ENABLE, $(CONFIGS)),)
+  SRCS += $(SRC_DIR)/tensor_unit.cpp
+endif
+
 # Debugging
 ifdef DEBUG
 	CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
--- a/sim/simx/constants.h
+++ b/sim/simx/constants.h
@ -23,6 +23,8 @@
 #define MEM_CLOCK_RATIO   1
 #endif

+inline constexpr int VLENB            = (VLEN / 8);
+
 inline constexpr int LSU_WORD_SIZE    = (XLEN / 8);
 inline constexpr int LSU_CHANNELS     = NUM_LSU_LANES;
 inline constexpr int LSU_NUM_REQS	    = (NUM_LSU_BLOCKS * LSU_CHANNELS);
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@ -51,8 +51,15 @@ Core::Core(const SimContext& ctx,
 {
  char sname[100];

+#ifdef EXT_TPU_ENABLE
+  {
+    snprintf(sname, 100, "%s-tpu", this->name().c_str());
+    tensor_unit_ = TensorUnit::Create(sname);
+  }
+#endif
+
  for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
-    operands_.at(i) = SimPlatform::instance().create_object<Operand>();
+    operands_.at(i) = Operand::Create();
  }

  // create the memory coalescer
@ -131,14 +138,12 @@ Core::Core(const SimContext& ctx,
  dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
  dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES);
  dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES);
-  dispatchers_.at((int)FUType::TCU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_TCU_BLOCKS, NUM_TCU_LANES);

  // initialize execute units
  func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
  func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
  func_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
  func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
-  func_units_.at((int)FUType::TCU) = SimPlatform::instance().create_object<TcuUnit>(this);

  // bind commit arbiters
  for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@ -150,6 +150,10 @@ private:
  Socket* socket_;
  const Arch& arch_;

+#ifdef EXT_TPU_ENABLE
+  TensorUnit::Ptr tensor_unit_;
+#endif
+
  Emulator emulator_;

  std::vector<IBuffer> ibuffers_;
@ -180,7 +184,6 @@ private:
  friend class AluUnit;
  friend class FpuUnit;
  friend class SfuUnit;
-  friend class TcuUnit;
 };

 } // namespace vortex
--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@ -51,8 +51,7 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
  {Opcode::EXT1,    InstType::R},
  {Opcode::EXT2,    InstType::R4},
  {Opcode::R_W,     InstType::R},
-  {Opcode::I_W,     InstType::I},
-  {Opcode::TCU,     InstType::I},
+  {Opcode::I_W,     InstType::I}
 };

 static const char* op_string(const Instr &instr) {
@ -390,15 +389,22 @@ static const char* op_string(const Instr &instr) {
    default:
      std::abort();
    }
-
-  case Opcode::TCU:
-    switch(func3)
-    {
-      case 0: return "ML";     // Matrix Load
-      case 1: return "MS";     // Matrix Store
-      case 2: return "MATMUL"; // Matrix Multiply
+  case Opcode::EXT2:
+    switch(func3) {
+    case 0: // reserved
+    case 1: // reserved
+      std::abort();
+    case 2:
+      switch (func2) {
+      case 0: return "MMADD.u4_i32";
+      case 1: return "MMADD.u8_i32";
+      case 2: return "MMADD.f16_f32";
+      case 3: return "MMADD.bf16_f32";
      default:
        std::abort();
+      }
+    default:
+      std::abort();
    }
  default:
    std::abort();
@ -455,12 +461,12 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
    if (sep++ != 0) { os << ", "; } else { os << " "; }
    os << "0x" << std::hex << instr.getImm() << std::dec;
  }
-#ifdef EXT_V_ENABLE
  if (instr.getOpcode() == Opcode::SYS && instr.getFunc3() >= 5) {
    // CSRs with immediate values
    if (sep++ != 0) { os << ", "; } else { os << " "; }
    os << "0x" << std::hex << instr.getRSrc(0);
  }
+#ifdef EXT_V_ENABLE
  // Log vector-specific attributes
  if (instr.getVattrMask() != 0) {
    print_vec_attr(os, instr);
@ -592,14 +598,6 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {

  case InstType::I: {
    switch (op) {
-    case Opcode::TCU: {
-      instr->setDestReg(rs1, RegType::Integer);
-      instr->addSrcReg(rs1, RegType::Integer);
-      instr->setFunc3(func3);
-      instr->setFunc7(func7);
-      auto imm = code >> shift_rs2;
-      instr->setImm(sext(imm, width_i_imm));
-    } break;
    case Opcode::I:
    case Opcode::I_W:
    case Opcode::JALR:
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@ -30,16 +30,17 @@

 using namespace vortex;

-Emulator::warp_t::warp_t(const Arch& arch)
-  : ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
-  , freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
+warp_t::warp_t(uint32_t num_threads)
+  : ireg_file(MAX_NUM_REGS, std::vector<Word>(num_threads))
+  , freg_file(MAX_NUM_REGS, std::vector<uint64_t>(num_threads))
 #ifdef EXT_V_ENABLE
-  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(MAX_NUM_REGS))
+  , vreg_file(num_threads, std::vector(MAX_NUM_REGS, std::vector<Byte>(VLEN)))
+  , vcsrs(num_threads)
 #endif
  , uuid(0)
 {}

-void Emulator::warp_t::clear(uint64_t startup_addr) {
+void warp_t::clear(uint64_t startup_addr) {
  this->PC = startup_addr;
  this->tmask.reset();
  this->uuid = 0;
@ -53,7 +54,11 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
      reg = std::rand();
    #endif
    }
-    reg_file.at(0) = 0; // r0 = 0
+  }
+
+  // set x0 to zero
+  for (auto& reg : this->ireg_file.at(0)) {
+    reg = 0;
  }

  for (auto& reg_file : this->freg_file) {
@ -69,13 +74,24 @@ void Emulator::warp_t::clear(uint64_t startup_addr) {
 #ifdef EXT_V_ENABLE
  for (auto& reg_file : this->vreg_file) {
    for (auto& reg : reg_file) {
-    #ifndef NDEBUG
-      reg = 0;
-    #else
-      reg = std::rand();
-    #endif
+      for (auto& elm : reg) {
+      #ifndef NDEBUG
+        elm = 0;
+      #else
+        elm = std::rand();
+      #endif
+      }
    }
  }
+  for (auto& vcsrs : this->vcsrs) {
+    vcsrs.vstart = 0;
+    vcsrs.vxsat = 0;
+    vcsrs.vxrm = 0;
+    vcsrs.vcsr = 0;
+    vcsrs.vlenb = 0;
+    vcsrs.vtype = 0;
+    vcsrs.vl = 0;
+  }
  this->vtype = {0, 0, 0, 0, 0};
  this->vl = 0;
  this->vlmax = 0;
@ -88,25 +104,11 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
    : arch_(arch)
    , dcrs_(dcrs)
    , core_(core)
-    , warps_(arch.num_warps(), arch)
+    , warps_(arch.num_warps(), arch.num_threads())
    , barriers_(arch.num_barriers(), 0)
    , ipdom_size_(arch.num_threads()-1)
-    // [TBC] Currently, tradeoff between scratchpad size & performance has not been evaluated. Scratchpad is
-    // considered to be big enough to hold input tiles for one output tile.
-    // In future versions, scratchpad size should be fixed to an appropriate value.
-    , scratchpad(std::vector<Word>(32 * 32 * 32768))
-  #ifdef EXT_V_ENABLE
-    , csrs_(arch.num_warps())
-  #endif
 {
  std::srand(50);
-
-#ifdef EXT_V_ENABLE
-  for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
-    csrs_.at(i).resize(arch.num_threads());
-  }
-#endif
-
  this->clear();
 }

@ -142,10 +144,6 @@ void Emulator::clear() {
  active_warps_.set(0);
  warps_[0].tmask.set(0);
  wspawn_.valid = false;
-
-  for (auto& reg : scratchpad) {
-    reg = 0;
-  }
 }

 void Emulator::attach_ram(RAM* ram) {
@ -449,18 +447,6 @@ void Emulator::cout_flush() {
    case (addr + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((value >> 32) & 0xFFFFFFFF)
 #endif

-Word Emulator::get_tiles() {
-  return mat_size;
-}
-
-Word Emulator::get_tc_size() {
-  return tc_size;
-}
-
-Word Emulator::get_tc_num() {
-  return tc_num;
-}
-
 Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
  auto core_perf = core_->perf_stats();
  switch (addr) {
@ -482,35 +468,26 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
  case VX_CSR_MCAUSE:
    return 0;

-  case VX_CSR_FFLAGS:     return warps_.at(wid).fcsr & 0x1F;
-  case VX_CSR_FRM:        return (warps_.at(wid).fcsr >> 5);
-  case VX_CSR_FCSR:       return warps_.at(wid).fcsr;
+  case VX_CSR_FFLAGS: return warps_.at(wid).fcsr & 0x1F;
+  case VX_CSR_FRM:    return (warps_.at(wid).fcsr >> 5);
+  case VX_CSR_FCSR:   return warps_.at(wid).fcsr;

 #ifdef EXT_V_ENABLE
  // Vector CRSs
  case VX_CSR_VSTART:
-    return csrs_.at(wid).at(tid)[VX_CSR_VSTART];
+    return  warps_.at(wid).vcsrs.at(tid).vstart;
  case VX_CSR_VXSAT:
-    return csrs_.at(wid).at(tid)[VX_CSR_VXSAT];
+    return  warps_.at(wid).vcsrs.at(tid).vxsat;
  case VX_CSR_VXRM:
-    return csrs_.at(wid).at(tid)[VX_CSR_VXRM];
-  case VX_CSR_VCSR: {
-    Word vxsat = csrs_.at(wid).at(tid)[VX_CSR_VXSAT];
-    Word vxrm = csrs_.at(wid).at(tid)[VX_CSR_VXRM];
-    return (vxrm << 1) | vxsat;
-  }
+    return  warps_.at(wid).vcsrs.at(tid).vxrm;
+  case VX_CSR_VCSR:
+    return ( warps_.at(wid).vcsrs.at(tid).vxrm << 1) |  warps_.at(wid).vcsrs.at(tid).vxsat;
  case VX_CSR_VL:
-    return csrs_.at(wid).at(tid)[VX_CSR_VL];
+    return  warps_.at(wid).vcsrs.at(tid).vl;
  case VX_CSR_VTYPE:
-    return csrs_.at(wid).at(tid)[VX_CSR_VTYPE];
+    return  warps_.at(wid).vcsrs.at(tid).vtype;
  case VX_CSR_VLENB:
-    return VLEN / 8;
-  case VX_CSR_VCYCLE:
-    return csrs_.at(wid).at(tid)[VX_CSR_VCYCLE];
-  case VX_CSR_VTIME:
-    return csrs_.at(wid).at(tid)[VX_CSR_VTIME];
-  case VX_CSR_VINSTRET:
-    return csrs_.at(wid).at(tid)[VX_CSR_VINSTRET];
+    return VLENB;
 #endif

  case VX_CSR_MHARTID:    return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
@ -524,9 +501,6 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
  case VX_CSR_NUM_CORES:  return uint32_t(arch_.num_cores()) * arch_.num_clusters();
  case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base();
  case VX_CSR_MSCRATCH:   return csr_mscratch_;
-  case VX_MAT_MUL_SIZE:   return mat_size;
-  case VX_TC_NUM:         return tc_num;
-  case VX_TC_SIZE:        return tc_size;

  CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
  CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);
@ -639,34 +613,34 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
 #ifdef EXT_V_ENABLE
  // Vector CRSs
  case VX_CSR_VSTART:
-    csrs_.at(wid).at(tid)[VX_CSR_VSTART] = value;
+     warps_.at(wid).vcsrs.at(tid).vstart = value;
    break;
  case VX_CSR_VXSAT:
-    csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1;
+     warps_.at(wid).vcsrs.at(tid).vxsat = value & 0b1;
    break;
  case VX_CSR_VXRM:
-    csrs_.at(wid).at(tid)[VX_CSR_VXRM] = value & 0b11;
+     warps_.at(wid).vcsrs.at(tid).vxrm = value & 0b11;
    break;
  case VX_CSR_VCSR:
-    csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1;
-    csrs_.at(wid).at(tid)[VX_CSR_VXRM] = (value >> 1) & 0b11;
+     warps_.at(wid).vcsrs.at(tid).vxsat = value & 0b1;
+     warps_.at(wid).vcsrs.at(tid).vxrm = (value >> 1) & 0b11;
    break;
-  case VX_CSR_VL: // read only, written by vset(i)vl(i)
-    csrs_.at(wid).at(tid)[VX_CSR_VL] = value;
+  case VX_CSR_VL:
+     warps_.at(wid).vcsrs.at(tid).vl = value;
    break;
-  case VX_CSR_VTYPE: // read only, written by vset(i)vl(i)
-    csrs_.at(wid).at(tid)[VX_CSR_VTYPE] = value;
+  case VX_CSR_VTYPE:
+     warps_.at(wid).vcsrs.at(tid).vtype = value;
+    break;
+  case VX_CSR_VLENB: // read only
+    std::abort();
    break;
-  case VX_CSR_VLENB: // read only, set to VLEN / 8
 #endif

  case VX_CSR_SATP:
  #ifdef VM_ENABLE
-    // warps_.at(wid).fcsr = (warps_.at(wid).fcsr & ~0x1F) | (value & 0x1F);
-    // csrs_.at(wid).at(tid)[addr] = value; //what is wid and tid?
    mmu_.set_satp(value);
-    break;
  #endif
+    break;
  case VX_CSR_MSTATUS:
  case VX_CSR_MEDELEG:
  case VX_CSR_MIDELEG:
@ -678,18 +652,10 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
  case VX_CSR_MNSTATUS:
  case VX_CSR_MCAUSE:
    break;
-  case VX_MAT_MUL_SIZE:
-    mat_size = value;
-    break;
-  case VX_TC_NUM:
-    tc_num = value;
-    break;
-  case VX_TC_SIZE:
-    tc_size = value;
-    break;

  default: {
      std::cout << "Error: invalid CSR write addr=0x" << std::hex << addr << ", value=0x" << value << std::dec << std::endl;
+      std::flush(std::cout);
      std::abort();
    }
  }
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@ -19,6 +19,7 @@
 #include <stack>
 #include <mem.h>
 #include "types.h"
+#include "tensor_unit.h"

 namespace vortex {

@ -28,11 +29,78 @@ class Core;
 class Instr;
 class instr_trace_t;

+struct ipdom_entry_t {
+  ipdom_entry_t(const ThreadMask &orig_tmask, const ThreadMask &else_tmask, Word PC)
+    : orig_tmask (orig_tmask)
+    , else_tmask (else_tmask)
+    , PC         (PC)
+    , fallthrough(false)
+  {}
+
+  ThreadMask  orig_tmask;
+  ThreadMask  else_tmask;
+  Word        PC;
+  bool        fallthrough;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct vtype_t {
+  uint32_t vill;
+  uint32_t vma;
+  uint32_t vta;
+  uint32_t vsew;
+  uint32_t vlmul;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct vcsrs_t {
+  uint32_t vstart;
+  uint32_t vxsat;
+  uint32_t vxrm;
+  uint32_t vcsr;
+  uint32_t vl;
+  uint32_t vtype;
+  uint32_t vlenb;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct warp_t {
+  warp_t(uint32_t num_threads);
+  void clear(uint64_t startup_addr);
+
+  Word                              PC;
+  ThreadMask                        tmask;
+  std::vector<std::vector<Word>>    ireg_file;
+  std::vector<std::vector<uint64_t>>freg_file;
+  std::stack<ipdom_entry_t>         ipdom_stack;
+  Byte                              fcsr;
+#ifdef EXT_V_ENABLE
+  std::vector<std::vector<std::vector<Byte>>> vreg_file;
+  std::vector<vcsrs_t>              vcsrs;
+  vtype_t                           vtype;
+  uint32_t                          vl;
+  uint32_t                          vlmax;
+
+#endif
+  uint32_t                          uuid;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+struct wspawn_t {
+  bool      valid;
+  uint32_t  num_warps;
+  Word      nextPC;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
 class Emulator {
 public:
-  Emulator(const Arch &arch,
-           const DCRS &dcrs,
-           Core* core);
+  Emulator(const Arch &arch, const DCRS &dcrs, Core* core);

  ~Emulator();

@ -57,83 +125,20 @@ public:

  int get_exitcode() const;

-  Word get_tiles();
-  Word get_tc_size();
-  Word get_tc_num();
-
  void dcache_read(void* data, uint64_t addr, uint32_t size);

  void dcache_write(const void* data, uint64_t addr, uint32_t size);

 private:

-  struct ipdom_entry_t {
-    ipdom_entry_t(const ThreadMask &orig_tmask, const ThreadMask &else_tmask, Word PC)
-      : orig_tmask (orig_tmask)
-      , else_tmask (else_tmask)
-      , PC         (PC)
-      , fallthrough(false)
-    {}
-
-    ThreadMask  orig_tmask;
-    ThreadMask  else_tmask;
-    Word        PC;
-    bool        fallthrough;
-  };
-
-  struct vtype_t {
-    uint32_t vill;
-    uint32_t vma;
-    uint32_t vta;
-    uint32_t vsew;
-    uint32_t vlmul;
-  };
-
-  union reg_data_t {
-    Word     u;
-    WordI    i;
-    WordF    f;
-    float    f32;
-    double   f64;
-    uint32_t u32;
-    uint64_t u64;
-    int32_t  i32;
-    int64_t  i64;
-  };
-
-  struct warp_t {
-    warp_t(const Arch& arch);
-    void clear(uint64_t startup_addr);
-
-    Word                              PC;
-    ThreadMask                        tmask;
-    std::vector<std::vector<Word>>    ireg_file;
-    std::vector<std::vector<uint64_t>>freg_file;
-    std::stack<ipdom_entry_t>         ipdom_stack;
-    Byte                              fcsr;
-#ifdef EXT_V_ENABLE
-    std::vector<std::vector<Byte>>    vreg_file;
-    vtype_t                           vtype;
-    uint32_t                          vl;
-    Word                              vlmax;
-#endif
-    uint32_t                          uuid;
-  };
-
-  struct wspawn_t {
-    bool valid;
-    uint32_t num_warps;
-    Word nextPC;
-  };
-
  std::shared_ptr<Instr> decode(uint32_t code) const;

  void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace);

 #ifdef EXT_V_ENABLE
-  void loadVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
-  void storeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
-  void executeVector(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);
+  void loadVector(const Instr &instr, uint32_t wid, uint32_t tid, WordI rs1_data, WordI rs2_data);
+  void storeVector(const Instr &instr, uint32_t wid, uint32_t tid, WordI rs1_data, WordI rs2_data);
+  bool executeVector(const Instr &instr, uint32_t wid, uint32_t tid, WordI rs1_data, WordI rs2_data, WordI* rd_data);
 #endif

  void icache_read(void* data, uint64_t addr, uint32_t size);
@ -171,11 +176,6 @@ private:
  uint32_t    ipdom_size_;
  Word        csr_mscratch_;
  wspawn_t    wspawn_;
-  std::vector<Word> scratchpad;
-  uint32_t mat_size;
-  uint32_t tc_size;
-  uint32_t tc_num;
-  std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
 };

 }
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@ -166,7 +166,7 @@ void LsuUnit::tick() {
 			continue;
 		}

-		bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE));
+		bool is_write = (trace->lsu_type == LsuType::STORE);

 		// check pending queue capacity
 		if (!is_write && state.pending_rd_reqs.full()) {
@ -222,96 +222,6 @@ void LsuUnit::tick() {
 		input.pop();
 	}
 }
-/*  TO BE FIXED:Tensor_core code
-    send_request is not used anymore. Need to be modified number of load
-*/
-/*
-int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
-	int count = 0;
-
-	auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
-	bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE));
-
-	uint16_t req_per_thread = 1;
-	if ((trace->lsu_type == LsuType::TCU_LOAD) || (trace->lsu_type == LsuType::TCU_STORE))
-	{
- 		req_per_thread= (1>(trace_data->mem_addrs.at(0).size)/4)? 1: ((trace_data->mem_addrs.at(0).size)/4);
-	}
-
-	auto t0 = trace->pid * NUM_LSU_LANES;
-
-	for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
-		uint32_t t = t0 + i;
-		if (!trace->tmask.test(t))
-			continue;
-
-		int req_idx = block_idx * LSU_CHANNELS + (i % LSU_CHANNELS);
-		auto& dcache_req_port = core_->lmem_switch_.at(req_idx)->ReqIn;
-
-		auto mem_addr = trace_data->mem_addrs.at(t);
-		auto type = get_addr_type(mem_addr.addr);
-		// DT(3, "addr_type = " << type << ", " << *trace);
-		uint32_t mem_bytes = 1;
-		for (int i = 0; i < req_per_thread; i++)
-		{
-			MemReq mem_req;
-			mem_req.addr  = mem_addr.addr + (i*mem_bytes);
-			mem_req.write = is_write;
-			mem_req.type  = type;
-			mem_req.tag   = tag;
-			mem_req.cid   = trace->cid;
-			mem_req.uuid  = trace->uuid;
-
-			dcache_req_port.push(mem_req, 1);
-			DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
-				<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
-
-			if (is_write) {
-				++core_->perf_stats_.stores;
-			} else {
-				++core_->perf_stats_.loads;
-				++pending_loads_;
-			}
-
-			++count;
-		}
-	}
-	return count;
-}
-*/
-
-///////////////////////////////////////////////////////////////////////////////
-
-TcuUnit::TcuUnit(const SimContext& ctx, Core* core)
-    : FuncUnit(ctx, core, "TCU")
-    {}
-
-void TcuUnit::tick() {
-
-	for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
-        auto& input = Inputs.at(i);
-        if (input.empty())
-            continue;
-        auto& output = Outputs.at(i);
-        auto trace = input.front();
-        uint32_t n_tiles = core_->emulator_.get_tiles();
-		uint32_t tc_size = core_->emulator_.get_tc_size();
-
-        switch (trace->tcu_type) {
-            case TCUType::TCU_MUL:
-            {    //mat size = n_tiles * tc_size
-                int matmul_latency = (n_tiles * tc_size) + tc_size + tc_size;
-                output.push(trace, matmul_latency);
-				DT(3, "matmul_latency = " << matmul_latency << ", " << *trace);
-                break;
-            }
-            default:
-                std::abort();
-        }
-        DT(3, "pipeline-execute: op=" << trace->tcu_type << ", " << *trace);
-        input.pop();
-    }
-}

 ///////////////////////////////////////////////////////////////////////////////

@ -354,6 +264,17 @@ void SfuUnit::tick() {
 				release_warp = core_->barrier(trace_data->arg1, trace_data->arg2, trace->wid);
 			}
 		} break;
+	#ifdef EXT_TPU_ENABLE
+		case SfuType::MMADD_U4:
+		case SfuType::MMADD_U8:
+		case SfuType::MMADD_F16:
+		case SfuType::MMADD_BF16:	{
+			if (trace->eop) {
+				auto trace_data = std::dynamic_pointer_cast<TensorUnit::TraceData>(trace->data);
+				output.push(trace, trace_data->latency + delay);
+			}
+		} break;
+	#endif
 		default:
 			std::abort();
 		}
--- a/sim/simx/func_unit.h
+++ b/sim/simx/func_unit.h
@ -98,14 +98,6 @@ private:

 ///////////////////////////////////////////////////////////////////////////////

-class TcuUnit : public FuncUnit {
-public:
-    TcuUnit(const SimContext& ctx, Core*);
-    void tick();
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
 class SfuUnit : public FuncUnit {
 public:
 	SfuUnit(const SimContext& ctx, Core*);
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@ -48,7 +48,7 @@ enum class Opcode {
  EXT1      = 0x0b,
  EXT2      = 0x2b,
  EXT3      = 0x5b,
-  TCU       = 0x7b
+  EXT4      = 0x7b
 };

 enum class InstType {
@ -73,11 +73,11 @@ enum DecodeConstants {
  width_vmask = 1,
  width_i_imm = 12,
  width_j_imm = 20,
-  width_v_zimm = 11,
-  width_v_ma = 1,
-  width_v_ta = 1,
+  width_v_zimm= 11,
+  width_v_ma  = 1,
+  width_v_ta  = 1,
  width_v_sew = 3,
-  width_v_lmul = 3,
+  width_v_lmul= 3,
  width_aq    = 1,
  width_rl    = 1,

@ -142,9 +142,9 @@ public:
    , func7_(0)
    , vmask_(0)
    , vlsWidth_(0)
-    , vMop_(0)
-    , vUmop_(0)
-    , vNf_(0)
+    , vmop_(0)
+    , vumop_(0)
+    , vnf_(0)
    , vs3_(0)
    , has_zimm_(false)
    , vlmul_(0)
@ -189,10 +189,10 @@ public:

  // Attributes for Vector instructions
  void setVlsWidth(uint32_t width) { vlsWidth_ = width; vattr_mask_ |= vattr_vlswidth; }
-  void setVmop(uint32_t mop) { vMop_ = mop; vattr_mask_ |= vattr_vmop; }
-  void setVumop(uint32_t umop) { vUmop_ = umop; vattr_mask_ |= vattr_vumop; }
-  void setVnf(uint32_t nf) { vNf_ = nf; vattr_mask_ |= vattr_vnf; }
-  void setVmask(uint32_t mask) { vmask_ = mask; vattr_mask_ |= vattr_vmask; }
+  void setVmop(uint32_t mop) { vmop_ = mop; vattr_mask_ |= vattr_vmop; }
+  void setVumop(uint32_t umop) { vumop_ = umop; vattr_mask_ |= vattr_vumop; }
+  void setVnf(uint32_t nf) { vnf_ = nf; vattr_mask_ |= vattr_vnf; }
+  void setVmask(uint32_t vmask) { vmask_ = vmask; vattr_mask_ |= vattr_vmask; }
  void setVs3(uint32_t vs) { vs3_ = vs; vattr_mask_ |= vattr_vs3; }
  void setZimm(bool has_zimm) { has_zimm_ = has_zimm; vattr_mask_ |= vattr_zimm; }
  void setVlmul(uint32_t lmul) { vlmul_ = lmul; vattr_mask_ |= vattr_vlmul; }
@ -218,10 +218,11 @@ public:
  uint32_t getFunc6() const { return func6_; }
  uint32_t getFunc7() const { return func7_; }

+  // Vector
  uint32_t getVlsWidth() const { return vlsWidth_; }
-  uint32_t getVmop() const { return vMop_; }
-  uint32_t getVumop() const { return vUmop_; }
-  uint32_t getVnf() const { return vNf_; }
+  uint32_t getVmop() const { return vmop_; }
+  uint32_t getVumop() const { return vumop_; }
+  uint32_t getVnf() const { return vnf_; }
  uint32_t getVmask() const { return vmask_; }
  uint32_t getVs3() const { return vs3_; }
  bool     hasZimm() const { return has_zimm_; }
@ -254,9 +255,9 @@ private:
  // Vector
  uint32_t vmask_;
  uint32_t vlsWidth_;
-  uint32_t vMop_;
-  uint32_t vUmop_;
-  uint32_t vNf_;
+  uint32_t vmop_;
+  uint32_t vumop_;
+  uint32_t vnf_;
  uint32_t vs3_;
  bool     has_zimm_;
  uint32_t vlmul_;
--- a/sim/simx/instr_trace.h
+++ b/sim/simx/instr_trace.h
@ -77,7 +77,6 @@ public:
    AluType  alu_type;
    FpuType  fpu_type;
    SfuType  sfu_type;
-    TCUType  tcu_type; 
  };

  ITraceData::Ptr data;
--- a/sim/simx/tensor_unit.cpp
+++ b/sim/simx/tensor_unit.cpp
@ -0,0 +1,137 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor_unit.h"
+#include "mem.h"
+#include <VX_config.h>
+#include <algorithm>
+
+using namespace vortex;
+
+class TensorCore : public SimObject<TensorCore> {
+public:
+  struct PerfStats {
+    uint64_t reads;
+    uint64_t writes;
+    uint64_t latency;
+    uint64_t stalls;
+
+    PerfStats()
+      : reads(0)
+      , writes(0)
+      , latency(0)
+      , stalls(0)
+    {}
+
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->reads   += rhs.reads;
+      this->writes  += rhs.writes;
+      this->latency += rhs.latency;
+      this->stalls  += rhs.stalls;
+      return *this;
+    }
+  };
+
+  SimPort<instr_trace_t*> Input;
+  SimPort<instr_trace_t*> Output;
+
+  TensorCore(const SimContext& ctx, const char* name);
+  ~TensorCore();
+
+  void reset();
+
+  void tick();
+
+  void attach_ram(RAM* mem);
+
+  void mmadd(TensorUnit::TraceData::Ptr trace_data);
+
+  const PerfStats& perf_stats() const;
+
+private:
+
+  class Impl;
+  Impl* impl_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class TensorUnit::Impl {
+public:
+
+  Impl(TensorUnit* simobject)
+    : simobject_(simobject)
+  {
+    this->reset();
+  }
+
+  ~Impl() {}
+
+  void reset() {
+    //--
+  }
+
+  void tick() {
+    //--
+  }
+
+  void mmadd(const std::vector<reg_data_t>& rs1_data,
+             const std::vector<reg_data_t>& rs2_data,
+             const std::vector<reg_data_t>& rs3_data,
+             std::vector<reg_data_t>& rd_data,
+             TensorUnit::TraceData::Ptr& trace_data) {
+    //--
+  }
+
+  const PerfStats& perf_stats() const {
+    return perf_stats_;
+  }
+
+private:
+
+  TensorUnit* simobject_;
+  PerfStats perf_stats_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+TensorUnit::TensorUnit(const SimContext& ctx, const char* name)
+  : SimObject<TensorUnit>(ctx, name)
+  , Input(this)
+  , Output(this)
+  , impl_(new Impl(this))
+{}
+
+TensorUnit::~TensorUnit() {
+  delete impl_;
+}
+
+void TensorUnit::reset() {
+  impl_->reset();
+}
+
+void TensorUnit::tick() {
+  impl_->tick();
+}
+
+void TensorUnit::mmadd(const std::vector<reg_data_t>& rs1_data,
+                        const std::vector<reg_data_t>& rs2_data,
+                        const std::vector<reg_data_t>& rs3_data,
+                        std::vector<reg_data_t>& rd_data,
+                        TensorUnit::TraceData::Ptr& trace_data) {
+  impl_->mmadd(rs1_data, rs2_data, rs3_data, rd_data, trace_data);
+}
+
+const TensorUnit::PerfStats& TensorUnit::perf_stats() const {
+  return impl_->perf_stats();
+}
--- a/sim/simx/tensor_unit.h
+++ b/sim/simx/tensor_unit.h
@ -0,0 +1,62 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "pipeline.h"
+
+namespace vortex {
+
+class TensorUnit : public SimObject<TensorUnit> {
+public:
+  struct TraceData : public ITraceData {
+    using Ptr = std::shared_ptr<TraceData>;
+    uint32_t latency;
+  };
+
+  struct PerfStats {
+    uint64_t latency;
+    uint64_t stalls;
+
+    PerfStats()
+      : latency(0)
+      , stalls(0)
+    {}
+  };
+
+  SimPort<instr_trace_t*> Input;
+  SimPort<instr_trace_t*> Output;
+
+  TensorUnit(const SimContext& ctx, const char* name);
+  ~TensorUnit();
+
+  void reset();
+
+  void tick();
+
+  void mmadd(const std::vector<reg_data_t>& rs1_data,
+             const std::vector<reg_data_t>& rs2_data,
+             const std::vector<reg_data_t>& rs3_data,
+             std::vector<reg_data_t>& rd_data,
+             TensorUnit::TraceData::Ptr& trace_data);
+
+  const PerfStats& perf_stats() const;
+
+private:
+
+  class Impl;
+  Impl* impl_;
+};
+
+}
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@ -24,8 +24,9 @@
 #include <VX_types.h>
 #include <simobject.h>
 #include <bitvector.h>
-#include "debug.h"
 #include <iostream>
+#include "debug.h"
+#include "constants.h"

 namespace vortex {

@ -35,13 +36,11 @@ typedef uint32_t Word;
 typedef int32_t  WordI;
 typedef uint64_t DWord;
 typedef int64_t  DWordI;
-typedef uint32_t WordF;
 #elif (XLEN == 64)
 typedef uint64_t Word;
 typedef int64_t  WordI;
 typedef __uint128_t DWord;
 typedef __int128_t DWordI;
-typedef uint64_t WordF;
 #else
 #error unsupported XLEN
 #endif
@ -59,6 +58,21 @@ typedef std::bitset<MAX_NUM_WARPS>   WarpMask;

 ///////////////////////////////////////////////////////////////////////////////

+union reg_data_t {
+  uint8_t  u8;
+  uint16_t u16;
+  Word     u;
+  WordI    i;
+  float    f32;
+  double   f64;
+  uint32_t u32;
+  uint64_t u64;
+  int32_t  i32;
+  int64_t  i64;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
 class ThreadMaskOS {
 public:
  ThreadMaskOS(const ThreadMask& mask, int size)
@ -106,7 +120,6 @@ enum class FUType {
  LSU,
  FPU,
  SFU,
-  TCU,
  Count
 };

@ -116,7 +129,6 @@ inline std::ostream &operator<<(std::ostream &os, const FUType& type) {
  case FUType::LSU: os << "LSU"; break;
  case FUType::FPU: os << "FPU"; break;
  case FUType::SFU: os << "SFU"; break;
-  case FUType::TCU: os << "TCU"; break;
  default: assert(false);
  }
  return os;
@ -148,30 +160,14 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {

 enum class LsuType {
  LOAD,
-  TCU_LOAD,
  STORE,
-  TCU_STORE,
  FENCE
 };

-enum class TCUType {
-  TCU_MUL
-};
-
-inline std::ostream &operator<<(std::ostream &os, const TCUType& type) {
-  switch (type) {
-  case TCUType::TCU_MUL: os << "TCU MUL"; break;
-  default: assert(false);
-  }
-  return os;
-}
-
 inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
  switch (type) {
  case LsuType::LOAD:  os << "LOAD"; break;
-  case LsuType::TCU_LOAD: os << "TCU_LOAD"; break;
  case LsuType::STORE: os << "STORE"; break;
-  case LsuType::TCU_STORE: os << "TCU_STORE"; break;
  case LsuType::FENCE: os << "FENCE"; break;
  default: assert(false);
  }
@ -248,7 +244,11 @@ enum class SfuType {
  PRED,
  CSRRW,
  CSRRS,
-  CSRRC
+  CSRRC,
+  MMADD_U4,
+  MMADD_U8,
+  MMADD_F16,
+  MMADD_BF16
 };

 inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
@ -262,6 +262,12 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
  case SfuType::CSRRW:  os << "CSRRW"; break;
  case SfuType::CSRRS:  os << "CSRRS"; break;
  case SfuType::CSRRC:  os << "CSRRC"; break;
+#ifdef EXT_TPU_ENABLE
+  case SfuType::MMADD_U4: os << "MMADD_U4"; break;
+  case SfuType::MMADD_U8: os << "MMADD_U8"; break;
+  case SfuType::MMADD_F16: os << "MMADD_F16"; break;
+  case SfuType::MMADD_BF16: os << "MMADD_BF16"; break;
+#endif
  default: assert(false);
  }
  return os;
--- a/sim/simx/vpu.cpp
+++ b/sim/simx/vpu.cpp
--- a/sim/simx/vpu.h
+++ b/sim/simx/vpu.h
@ -1,7 +1,18 @@
-#ifdef EXT_V_ENABLE
 #pragma once

-using namespace vortex;
+#include <stdlib.h>
+#include <cstdint>
+#include <string>
+#include <type_traits>
+#include <vector>
+#include <limits>
+#include <iostream>
+#include <algorithm>
+#include <bitmanip.h>
+#include <rvfloats.h>
+#include "types.h"
+
+namespace vortex {

 template <typename T, typename R>
 class Add {
@ -1120,224 +1131,21 @@ public:

 ///////////////////////////////////////////////////////////////////////////////

-bool isMasked(std::vector<std::vector<Byte>> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) {
+bool isMasked(std::vector<std::vector<Byte>> &vreg_file, uint32_t maskVreg, uint32_t byteI, uint32_t vmask) {
+  if (vmask == 1)
+    return false; // unmasked
+
  auto &mask = vreg_file.at(maskVreg);
  uint8_t emask = *(uint8_t *)(mask.data() + byteI / 8);
  uint8_t value = (emask >> (byteI % 8)) & 0x1;
-  DP(4, "Masking enabled: " << +!vmask << " mask element: " << +value);
-  return !vmask && value == 0;
+  DP(4, "Masking enabled: " << +value);
+  return (value == 0);
 }

 template <typename DT>
-uint32_t getVreg(uint32_t baseVreg, uint32_t byteI) {
-  uint32_t vsew = sizeof(DT) * 8;
-  return (baseVreg + (byteI / (VLEN / vsew))) % 32;
-}
-
-template <typename DT>
-DT &getVregData(std::vector<vortex::Byte> &baseVregVec, uint32_t byteI) {
-  uint32_t vsew = sizeof(DT) * 8;
-  return *(DT *)(baseVregVec.data() + (byteI % (VLEN / vsew)) * vsew / 8);
-}
-
-template <typename DT>
-DT &getVregData(std::vector<std::vector<vortex::Byte>> &vreg_file, uint32_t baseVreg, uint32_t byteI) {
-  auto &vr1 = vreg_file.at(getVreg<DT>(baseVreg, byteI));
-  return getVregData<DT>(vr1, byteI);
-}
-
-template <typename DT>
-void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  if (nfields * emul > 8) {
-    std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl;
-    std::abort();
-  }
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    uint32_t nfields_strided = strided ? nfields : 1;
-    Word mem_addr = (base_addr & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
-    Word mem_data = 0;
-    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
-    DP(4, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
-    DP(4, "Previous data: " << +result);
-    result = (DT)mem_data;
-  }
-}
-
-void vector_op_vix_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_load<uint8_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vix_load<uint16_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vix_load<uint32_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vix_load<uint64_t>(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VLE for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  if (nfields * emul > 8) {
-    std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl;
-    std::abort();
-  }
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    Word offset = 0;
-    switch (iSew) {
-    case 8:
-      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 16:
-      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 32:
-      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 64:
-      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    default:
-      std::cout << "Unsupported iSew: " << iSew << std::endl;
-      std::abort();
-    }
-
-    Word mem_addr = (base_addr & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT);
-    Word mem_data = 0;
-    emul_->dcache_read(&mem_data, mem_addr, vsew / 8);
-    DP(4, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg<DT>(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    DT &result = getVregData<DT>(vreg_file, rdest + (i % nfields) * emul, i / nfields);
-    DP(4, "Previous data: " << +result);
-    result = (DT)mem_data;
-  }
-}
-
-void vector_op_vv_load(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_load<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vv_load<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vv_load<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vv_load<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VLUX/VLOX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    uint32_t nfields_strided = strided ? nfields : 1;
-    Word mem_addr = base_addr + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT);
-    Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
-    DP(4, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
-  }
-}
-
-void vector_op_vix_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vix_store<uint8_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vix_store<uint16_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vix_store<uint32_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vix_store<uint64_t>(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VSE for vsew: " << vsew << std::endl;
-    std::abort();
-  }
-}
-
-template <typename DT>
-void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  uint32_t vsew = sizeof(DT) * 8;
-  uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11);
-  for (uint32_t i = 0; i < vl * nfields; i++) {
-    if (isMasked(vreg_file, 0, i / nfields, vmask))
-      continue;
-
-    Word offset = 0;
-    switch (iSew) {
-    case 8:
-      offset = getVregData<uint8_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 16:
-      offset = getVregData<uint16_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 32:
-      offset = getVregData<uint32_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    case 64:
-      offset = getVregData<uint64_t>(vreg_file, rsrc1, i / nfields);
-      break;
-    default:
-      std::cout << "Unsupported iSew: " << iSew << std::endl;
-      std::abort();
-    }
-
-    Word mem_addr = base_addr + offset + (i % nfields) * sizeof(DT);
-    Word mem_data = getVregData<DT>(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields);
-    DP(4, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg<DT>(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields);
-    emul_->dcache_write(&mem_data, mem_addr, vsew / 8);
-  }
-}
-
-void vector_op_vv_store(std::vector<std::vector<Byte>> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) {
-  switch (vsew) {
-  case 8:
-    vector_op_vv_store<uint8_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 16:
-    vector_op_vv_store<uint16_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 32:
-    vector_op_vv_store<uint32_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  case 64:
-    vector_op_vv_store<uint64_t>(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask);
-    break;
-  default:
-    std::cout << "Failed to execute VSUX/VSOX for vsew: " << vsew << std::endl;
-    std::abort();
-  }
+DT &getVregData(std::vector<Byte> &vreg_file, uint32_t elm) {
+  assert(elm < (VLENB / sizeof(DT)));
+  return *reinterpret_cast<DT*>(vreg_file.data() + elm * sizeof(DT));
 }

 template <template <typename DT1, typename DT2> class OP, typename DT>
@ -1345,12 +1153,11 @@ void vector_op_vix(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    DT third = getVregData<DT>(vreg_file, rdest, i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc0), i);
+    DT third = getVregData<DT>(vreg_file.at(rdest), i);
    DT result = OP<DT, DT>::apply(first, second, third);
    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
+    getVregData<DT>(vreg_file.at(rdest), i) = result;
  }
 }

@ -1378,11 +1185,11 @@ void vector_op_vix(Word src1, std::vector<std::vector<Byte>> &vreg_file, uint32_
 template <template <typename DT1, typename DT2> class OP, typename DT>
 void vector_op_vix_carry(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl) {
  for (uint32_t i = 0; i < vl; i++) {
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc0), i);
    bool third = !isMasked(vreg_file, 0, i, false);
    DT result = OP<DT, DT>::apply(first, second, third);
    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
+    getVregData<DT>(vreg_file.at(rdest), i) = result;
  }
 }

@ -1410,14 +1217,14 @@ void vector_op_vix_carry(Word src1, std::vector<std::vector<Byte>> &vreg_file, u
 template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
 void vector_op_vix_carry_out(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
  for (uint32_t i = 0; i < vl; i++) {
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc0), i);
    bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
    bool result = OP<DT, DTR>::apply(first, second, third);
    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) |= 1 << (i % 8);
    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) &= ~(1 << (i % 8));
    }
  }
 }
@ -1446,9 +1253,9 @@ void vector_op_vix_carry_out(Word src1, std::vector<std::vector<Byte>> &vreg_fil
 template <typename DT>
 void vector_op_vix_merge(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rdest, uint32_t vl, uint32_t vmask) {
  for (uint32_t i = 0; i < vl; i++) {
-    DT result = isMasked(vreg_file, 0, i, vmask) ? getVregData<DT>(vreg_file, rsrc0, i) : first;
+    DT result = isMasked(vreg_file, 0, i, vmask) ? getVregData<DT>(vreg_file.at(rsrc0), i) : first;
    DP(4, "Merge - Choosing result: " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
+    getVregData<DT>(vreg_file.at(rdest), i) = result;
  }
 }

@ -1481,16 +1288,16 @@ void vector_op_scalar(DT &dest, std::vector<std::vector<Byte>> &vreg_file, uint3
  }
  switch (vsew) {
  case 8:
-    dest = getVregData<uint8_t>(vreg_file, rsrc1, 0);
+    dest = getVregData<uint8_t>(vreg_file.at(rsrc1), 0);
    break;
  case 16:
-    dest = getVregData<uint16_t>(vreg_file, rsrc1, 0);
+    dest = getVregData<uint16_t>(vreg_file.at(rsrc1), 0);
    break;
  case 32:
-    dest = getVregData<uint32_t>(vreg_file, rsrc1, 0);
+    dest = getVregData<uint32_t>(vreg_file.at(rsrc1), 0);
    break;
  case 64:
-    dest = getVregData<uint64_t>(vreg_file, rsrc1, 0);
+    dest = getVregData<uint64_t>(vreg_file.at(rsrc1), 0);
    break;
  default:
    std::cout << "Failed to execute vmv.x.s/vfmv.f.s for vsew: " << vsew << std::endl;
@ -1503,12 +1310,11 @@ void vector_op_vix_w(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc0), i);
+    DTR third = getVregData<DTR>(vreg_file.at(rdest), i);
    DTR result = OP<DT, DTR>::apply(first, second, third);
    DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
+    getVregData<DTR>(vreg_file.at(rdest), i) = result;
  }
 }

@ -1553,11 +1359,10 @@ void vector_op_vix_n(DT first, std::vector<std::vector<Byte>> &vreg_file, uint32
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc0), i);
    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
    DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
+    getVregData<DTR>(vreg_file.at(rdest), i) = result;
  }
 }

@ -1584,11 +1389,10 @@ void vector_op_vix_sat(DTR first, std::vector<std::vector<Byte>> &vreg_file, uin
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT second = getVregData<DTR>(vreg_file, rsrc0, i);
+    DT second = getVregData<DTR>(vreg_file.at(rsrc0), i);
    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
    DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
+    getVregData<DTR>(vreg_file.at(rdest), i) = result;
  }
 }

@ -1701,14 +1505,13 @@ void vector_op_vix_mask(DT first, std::vector<std::vector<Byte>> &vreg_file, uin
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT second = getVregData<DT>(vreg_file, rsrc0, i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc0), i);
    bool result = OP<DT, bool>::apply(first, second, 0);
    DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) |= 1 << (i % 8);
    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) &= ~(1 << (i % 8));
    }
  }
 }
@ -1743,7 +1546,7 @@ void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file,
  // so first is our scalar value and we need to overwrite it with 1 for later computations
  if (scalar && vl && !isMasked(vreg_file, 0, scalarPos, vmask)) {
    DP(4, "Slide - Moving scalar value " << +first << " to position " << +scalarPos);
-    getVregData<DT>(vreg_file, rdest, scalarPos) = first;
+    getVregData<DT>(vreg_file.at(rdest), scalarPos) = first;
  }
  first = scalar ? 1 : first;

@ -1752,9 +1555,9 @@ void vector_op_vix_slide(Word first, std::vector<std::vector<Byte>> &vreg_file,
      continue;

    __uint128_t iSrc = slideDown ? (__uint128_t)i + (__uint128_t)first : (__uint128_t)i - (__uint128_t)first; // prevent overflows/underflows
-    DT value = (!slideDown || iSrc < vlmax) ? getVregData<DT>(vreg_file, rsrc0, iSrc) : 0;
+    DT value = (!slideDown || iSrc < vlmax) ? getVregData<DT>(vreg_file.at(rsrc0), iSrc) : 0;
    DP(4, "Slide - Moving value " << +value << " from position " << (uint64_t)iSrc << " to position " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = value;
+    getVregData<DT>(vreg_file.at(rdest), i) = value;
  }
 }

@ -1784,10 +1587,9 @@ void vector_op_vix_gather(Word first, std::vector<std::vector<Byte>> &vreg_file,
  for (Word i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc0, first) : 0;
+    DT value = first < vlmax ? getVregData<DT>(vreg_file.at(rsrc0), first) : 0;
    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = value;
+    getVregData<DT>(vreg_file.at(rdest), i) = value;
  }
 }

@ -1817,13 +1619,12 @@ void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uin
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DT third = getVregData<DT>(vreg_file, rdest, i);
+    DT first = getVregData<DT>(vreg_file.at(rsrc0), i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc1), i);
+    DT third = getVregData<DT>(vreg_file.at(rdest), i);
    DT result = OP<DT, DT>::apply(first, second, third);
    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
+    getVregData<DT>(vreg_file.at(rdest), i) = result;
  }
 }

@ -1851,12 +1652,12 @@ void vector_op_vv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uin
 template <template <typename DT1, typename DT2> class OP, typename DT>
 void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
  for (uint32_t i = 0; i < vl; i++) {
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DT first = getVregData<DT>(vreg_file.at(rsrc0), i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc1), i);
    bool third = !isMasked(vreg_file, 0, i, false);
    DT result = OP<DT, DT>::apply(first, second, third);
    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
+    getVregData<DT>(vreg_file.at(rdest), i) = result;
  }
 }

@ -1884,15 +1685,15 @@ void vector_op_vv_carry(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc
 template <template <typename DT1, typename DT2> class OP, typename DT, typename DTR>
 void vector_op_vv_carry_out(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
  for (uint32_t i = 0; i < vl; i++) {
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DT first = getVregData<DT>(vreg_file.at(rsrc0), i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc1), i);
    bool third = !vmask && !isMasked(vreg_file, 0, i, vmask);
    bool result = OP<DT, DTR>::apply(first, second, third);
    DP(4, (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) |= 1 << (i % 8);
    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) &= ~(1 << (i % 8));
    }
  }
 }
@ -1922,9 +1723,9 @@ template <typename DT>
 void vector_op_vv_merge(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl, uint32_t vmask) {
  for (uint32_t i = 0; i < vl; i++) {
    uint32_t rsrc = isMasked(vreg_file, 0, i, vmask) ? rsrc1 : rsrc0;
-    DT result = getVregData<DT>(vreg_file, rsrc, i);
+    DT result = getVregData<DT>(vreg_file.at(rsrc), i);
    DP(4, "Merge - Choosing result: " << +result);
-    getVregData<DT>(vreg_file, rdest, i) = result;
+    getVregData<DT>(vreg_file.at(rdest), i) = result;
  }
 }

@ -1954,11 +1755,10 @@ void vector_op_vv_gather(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsr
  for (Word i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    uint32_t first = ei16 ? getVregData<uint16_t>(vreg_file, rsrc0, i) : getVregData<DT>(vreg_file, rsrc0, i);
-    DT value = first < vlmax ? getVregData<DT>(vreg_file, rsrc1, first) : 0;
+    uint32_t first = ei16 ? getVregData<uint16_t>(vreg_file.at(rsrc0), i) : getVregData<DT>(vreg_file.at(rsrc0), i);
+    DT value = first < vlmax ? getVregData<DT>(vreg_file.at(rsrc1), first) : 0;
    DP(4, "Register gather - Moving value " << +value << " from position " << +first << " to position " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = value;
+    getVregData<DT>(vreg_file.at(rdest), i) = value;
  }
 }

@ -1988,13 +1788,12 @@ void vector_op_vv_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, u
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
+    DT first = getVregData<DT>(vreg_file.at(rsrc0), i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc1), i);
+    DTR third = getVregData<DTR>(vreg_file.at(rdest), i);
    DTR result = OP<DT, DTR>::apply(first, second, third);
    DP(4, "Widening " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
+    getVregData<DTR>(vreg_file.at(rdest), i) = result;
  }
 }

@ -2021,13 +1820,12 @@ void vector_op_vv_wv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
+    DT first = getVregData<DT>(vreg_file.at(rsrc0), i);
+    DTR second = getVregData<DTR>(vreg_file.at(rsrc1), i);
+    DTR third = getVregData<DTR>(vreg_file.at(rdest), i);
    DTR result = OP<DTR, DTR>::apply(first, second, third);
    DP(4, "Widening wv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
+    getVregData<DTR>(vreg_file.at(rdest), i) = result;
  }
 }

@ -2054,13 +1852,12 @@ void vector_op_vv_wfv(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DTR second = getVregData<DTR>(vreg_file, rsrc1, i);
-    DTR third = getVregData<DTR>(vreg_file, rdest, i);
+    DT first = getVregData<DT>(vreg_file.at(rsrc0), i);
+    DTR second = getVregData<DTR>(vreg_file.at(rsrc1), i);
+    DTR third = getVregData<DTR>(vreg_file.at(rdest), i);
    DTR result = OP<DTR, DTR>::apply(rv_ftod(first), second, third);
    DP(4, "Widening wfv " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ", " << +third << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
+    getVregData<DTR>(vreg_file.at(rdest), i) = result;
  }
 }

@ -2079,12 +1876,11 @@ void vector_op_vv_n(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, u
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DTR first = getVregData<DTR>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DTR first = getVregData<DTR>(vreg_file.at(rsrc0), i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc1), i);
    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
    DP(4, "Narrowing " << (OP<DT, DTR>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
+    getVregData<DTR>(vreg_file.at(rdest), i) = result;
  }
 }

@ -2111,12 +1907,11 @@ void vector_op_vv_sat(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT first = getVregData<DTR>(vreg_file, rsrc0, i);
-    DT second = getVregData<DTR>(vreg_file, rsrc1, i);
+    DT first = getVregData<DTR>(vreg_file.at(rsrc0), i);
+    DT second = getVregData<DTR>(vreg_file.at(rsrc1), i);
    DTR result = OP<DT, DTR>::apply(first, second, vxrm, vxsat);
    DP(4, "Saturating " << (OP<DT, DTR>::name()) << "(" << +(DTR)first << ", " << +(DTR)second << ")" << " = " << +(DTR)result);
-    getVregData<DTR>(vreg_file, rdest, i) = result;
+    getVregData<DTR>(vreg_file.at(rdest), i) = result;
  }
 }

@ -2167,16 +1962,15 @@ void vector_op_vv_red(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0,
  for (uint32_t i = 0; i < vl; i++) {
    // use rdest as accumulator
    if (i == 0) {
-      getVregData<DT>(vreg_file, rdest, 0) = getVregData<DT>(vreg_file, rsrc0, 0);
+      getVregData<DT>(vreg_file.at(rdest), 0) = getVregData<DT>(vreg_file.at(rsrc0), 0);
    }
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT first = getVregData<DT>(vreg_file, rdest, 0);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DT first = getVregData<DT>(vreg_file.at(rdest), 0);
+    DT second = getVregData<DT>(vreg_file.at(rsrc1), i);
    DT result = OP<DT, DT>::apply(first, second, 0);
    DP(4, "Reduction " << (OP<DT, DT>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
-    getVregData<DT>(vreg_file, rdest, 0) = result;
+    getVregData<DT>(vreg_file.at(rdest), 0) = result;
  }
 }

@ -2206,17 +2000,16 @@ void vector_op_vv_red_w(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc
  for (uint32_t i = 0; i < vl; i++) {
    // use rdest as accumulator
    if (i == 0) {
-      getVregData<DTR>(vreg_file, rdest, 0) = getVregData<DTR>(vreg_file, rsrc0, 0);
+      getVregData<DTR>(vreg_file.at(rdest), 0) = getVregData<DTR>(vreg_file.at(rsrc0), 0);
    }
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DTR first = getVregData<DTR>(vreg_file, rdest, 0);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DTR first = getVregData<DTR>(vreg_file.at(rdest), 0);
+    DT second = getVregData<DT>(vreg_file.at(rsrc1), i);
    DTR second_w = std::is_signed<DT>() ? sext((DTR)second, sizeof(DT) * 8) : zext((DTR)second, sizeof(DT) * 8);
    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
    DP(4, "Widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, 0) = result;
+    getVregData<DTR>(vreg_file.at(rdest), 0) = result;
  }
 }

@ -2243,17 +2036,16 @@ void vector_op_vv_red_wf(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsr
  for (uint32_t i = 0; i < vl; i++) {
    // use rdest as accumulator
    if (i == 0) {
-      getVregData<DTR>(vreg_file, rdest, 0) = getVregData<DTR>(vreg_file, rsrc0, 0);
+      getVregData<DTR>(vreg_file.at(rdest), 0) = getVregData<DTR>(vreg_file.at(rsrc0), 0);
    }
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DTR first = getVregData<DTR>(vreg_file, rdest, 0);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DTR first = getVregData<DTR>(vreg_file.at(rdest), 0);
+    DT second = getVregData<DT>(vreg_file.at(rsrc1), i);
    DTR second_w = rv_ftod(second);
    DTR result = OP<DTR, DTR>::apply(first, second_w, 0);
    DP(4, "Float widening reduction " << (OP<DTR, DTR>::name()) << "(" << +first << ", " << +second_w << ")" << " = " << +result);
-    getVregData<DTR>(vreg_file, rdest, 0) = result;
+    getVregData<DTR>(vreg_file.at(rdest), 0) = result;
  }
 }

@ -2272,9 +2064,8 @@ void vector_op_vid(std::vector<std::vector<Byte>> &vreg_file, uint32_t rdest, ui
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
    DP(4, "Element Index = " << +i);
-    getVregData<DT>(vreg_file, rdest, i) = i;
+    getVregData<DT>(vreg_file.at(rdest), i) = i;
  }
 }

@ -2303,15 +2094,14 @@ void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0
  for (uint32_t i = 0; i < vl; i++) {
    if (isMasked(vreg_file, 0, i, vmask))
      continue;
-
-    DT first = getVregData<DT>(vreg_file, rsrc0, i);
-    DT second = getVregData<DT>(vreg_file, rsrc1, i);
+    DT first = getVregData<DT>(vreg_file.at(rsrc0), i);
+    DT second = getVregData<DT>(vreg_file.at(rsrc1), i);
    bool result = OP<DT, bool>::apply(first, second, 0);
    DP(4, "Integer/float compare mask " << (OP<DT, bool>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) |= 1 << (i % 8);
    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) &= ~(1 << (i % 8));
    }
  }
 }
@ -2340,16 +2130,16 @@ void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0
 template <template <typename DT1, typename DT2> class OP>
 void vector_op_vv_mask(std::vector<std::vector<Byte>> &vreg_file, uint32_t rsrc0, uint32_t rsrc1, uint32_t rdest, uint32_t vl) {
  for (uint32_t i = 0; i < vl; i++) {
-    uint8_t firstMask = getVregData<uint8_t>(vreg_file, rsrc0, i / 8);
+    uint8_t firstMask = getVregData<uint8_t>(vreg_file.at(rsrc0), i / 8);
    bool first = (firstMask >> (i % 8)) & 0x1;
-    uint8_t secondMask = getVregData<uint8_t>(vreg_file, rsrc1, i / 8);
+    uint8_t secondMask = getVregData<uint8_t>(vreg_file.at(rsrc1), i / 8);
    bool second = (secondMask >> (i % 8)) & 0x1;
    bool result = OP<uint8_t, uint8_t>::apply(first, second, 0) & 0x1;
    DP(4, "Compare mask bits " << (OP<uint8_t, uint8_t>::name()) << "(" << +first << ", " << +second << ")" << " = " << +result);
    if (result) {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) |= 1 << (i % 8);
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) |= 1 << (i % 8);
    } else {
-      getVregData<uint8_t>(vreg_file, rdest, i / 8) &= ~(1 << (i % 8));
+      getVregData<uint8_t>(vreg_file.at(rdest), i / 8) &= ~(1 << (i % 8));
    }
  }
 }
@ -2362,10 +2152,9 @@ void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t r
    // This instruction is always masked (vmask == 0), but encoded as unmasked (vmask == 1)
    if (isMasked(vreg_file, rsrc0, i, 0))
      continue;
-
-    DT value = getVregData<DT>(vreg_file, rsrc1, i);
+    DT value = getVregData<DT>(vreg_file.at(rsrc1), i);
    DP(4, "Compression - Moving value " << +value << " from position " << i << " to position " << currPos);
-    getVregData<DT>(vreg_file, rdest, currPos) = value;
+    getVregData<DT>(vreg_file.at(rdest), currPos) = value;
    currPos++;
  }
 }
@ -2390,4 +2179,5 @@ void vector_op_vv_compress(std::vector<std::vector<Byte>> &vreg_file, uint32_t r
    std::abort();
  }
 }
-#endif
+
+}
--- a/tests/regression/matmul/Makefile
+++ b/tests/regression/matmul/Makefile
@ -1,14 +0,0 @@
-ROOT_DIR := $(realpath ../../..)
-include $(ROOT_DIR)/config.mk
-
-PROJECT := matmul
-
-SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)
-
-SRCS := $(SRC_DIR)/main.cpp
-
-VX_SRCS := $(SRC_DIR)/kernel.cpp
-
-OPTS ?= -n128 -d1
-
-include ../common.mk
--- a/tests/regression/matmul/common.h
+++ b/tests/regression/matmul/common.h
@ -1,17 +0,0 @@
-#ifndef _COMMON_H_
-#define _COMMON_H_
-
-typedef struct {
-  uint32_t num_tasks;
-  uint32_t num_warps;
-  uint32_t num_threads;
-  uint32_t TC_per_warp;
-  uint32_t matrix_size;
-  uint32_t data_size;
-  uint64_t tc_size;
-  uint64_t src0_addr;
-  uint64_t src1_addr;
-  uint64_t dst_addr;
-} kernel_arg_t;
-
-#endif
--- a/tests/regression/matmul/kernel.cpp
+++ b/tests/regression/matmul/kernel.cpp
@ -1,127 +0,0 @@
-#include <stdint.h>
-#include <vx_intrinsics.h>
-#include <vx_spawn.h>
-#include "common.h"
-
-void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
-	uint32_t task_id = blockIdx.x;
-	int32_t* src0_ptr = (int32_t*)arg->src0_addr;
-	int32_t* src1_ptr = (int32_t*)arg->src1_addr;
-	int32_t* dst_ptr  = (int32_t*)arg->dst_addr;
-	uint64_t a_addr = reinterpret_cast<uint64_t>(src0_ptr);
-	uint64_t b_addr = reinterpret_cast<uint64_t>(src1_ptr);
-	uint64_t c_addr = reinterpret_cast<uint64_t>(dst_ptr);
-
-	uint32_t tc_size = arg->tc_size;
-	uint32_t TC_per_warp = arg->TC_per_warp;
-	unsigned num_threads = arg->num_threads;
-	int num_warps = arg->num_warps;
-	uint32_t matrix_size = arg->matrix_size;
-	
-	int n_tiles = matrix_size/tc_size;
-	int num_output_tiles = (matrix_size*matrix_size)/(tc_size*tc_size);
-	
-	int num_tasks = arg->num_tasks;
-
-	//Assuming matrix size always > tensor core size
-	int warps_actual;
-	if (TC_per_warp > num_output_tiles)
-		warps_actual = 1;
-	else 
-		warps_actual = num_output_tiles/TC_per_warp;
-
-	int num_warps_actual = (warps_actual < num_warps)? warps_actual: num_warps;
-	int num_threads_per_tc = (1> num_threads/TC_per_warp)? 1: num_threads/TC_per_warp;
-	
-	int num_tasks_per_thread = (1> (num_tasks/(num_threads*num_warps_actual)))? 1: (num_tasks/(num_threads*num_warps_actual));
-	int num_tasks_per_warp = (1 > num_tasks/num_warps_actual)? 1:num_tasks/num_warps_actual;
-	int task_id_first_warp = task_id%num_tasks_per_warp;
-
-	//A&B
-	int num_data_per_op_tile = tc_size*tc_size*n_tiles;
-	int num_data_per_warp = num_data_per_op_tile*((1> (num_output_tiles/num_warps_actual))?1:(num_output_tiles/num_warps_actual));
-	
-	int addr_shift;
-	if (((tc_size*tc_size*n_tiles)/(num_threads)) > 1)
-		addr_shift = (tc_size*tc_size*n_tiles)/(num_threads);
-	else
-		addr_shift = 1;
-	//Offset for 1st warp
-	int offset = ((task_id_first_warp/num_tasks_per_thread)*addr_shift) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile);
-	offset = offset + (num_data_per_warp*(task_id/num_tasks_per_warp));
-
-	//C
-	int num_data_per_op_tile_c = tc_size*tc_size;
-	int num_data_per_warp_c = num_data_per_warp/n_tiles;
-	
-	int addr_shift_c;
-	if (((tc_size*tc_size)/(num_threads)) > 1)
-		addr_shift_c = tc_size;
-	else
-		addr_shift_c = 1;
-	//Offset for 1st warp
-	int offset_c = ((task_id_first_warp/num_tasks_per_thread)*addr_shift_c) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile_c);
-	offset_c = offset_c + (num_data_per_warp_c*(task_id/num_tasks_per_warp));
-	
-	int thread_limit = (num_threads < tc_size*tc_size*n_tiles*TC_per_warp)? num_threads : tc_size*tc_size*n_tiles*TC_per_warp;
-	int thread_limit_c = (num_threads<tc_size*tc_size)? num_threads:tc_size*tc_size;
-	
-	//OLD TASK DISTRIBUTION // For 8x8 matrix, 2x2 tc_size, 1 tc_num, 4threads, 2warps => 64 tasks => 32 tasks/warp => 8 tasks/thread
-	/*task0->thread0, warp0
-	task1->thread0 , warp0
-	task2->thread0 , warp0
-	.
-	task7->thread0 
-	task8->thread1
-	task9->thread1 
-	.
-	.
-	------
-	task32 -> thread0, warp1
-	task33 -> thread1, warp1
-	.
-	*/
-
-	//NEW TASK DISTRIBUTION // For 8x8 matrix, 2x2 tc_size, 1 tc_num, 4threads, 2warps => 64 tasks => 32 tasks/warp => 8 tasks/thread
-	/*task0->thread0, warp0
-	task1->thread1 , warp0
-	task2->thread2 , warp0
-	task3->thread3 ,...
-	task4->thread0
-	task5->thread1 
-	.
-	.
-	------
-	task32 -> thread0, warp1
-	task33 -> thread1, warp1
-	.
-	.*/
-	
-	//TODO :: change this for new task->thread distribution
-	if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit)
-	{	
-		uint64_t a_addr_base = a_addr + offset*arg->data_size;
-		uint64_t b_addr_base = b_addr + offset*arg->data_size;
-		uint64_t c_addr_base = c_addr + offset_c*arg->data_size;
-		csr_write(VX_MAT_MUL_SIZE,n_tiles);
-		csr_write(VX_TC_NUM,TC_per_warp);
-		csr_write(VX_TC_SIZE,tc_size);
-
-		vx_matrix_load (0, a_addr_base);
-		vx_matrix_load (1, b_addr_base);
-		//In case of multiple threads - sync load
-		vx_fence();
-
-		vx_matrix_mul();   //Assuming padding to ensure matrix size is a multiple of tc_size
-		vx_fence();
-		if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c)
-			vx_matrix_store(c_addr_base);
-		//In case of multiple threads - sync store
-		vx_fence();
-	}	
-}
-
-int main() {
-	kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
-	return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
-}
--- a/tests/regression/matmul/main.cpp
+++ b/tests/regression/matmul/main.cpp
@ -1,348 +0,0 @@
-#include <iostream>
-#include <unistd.h>
-#include <string.h>
-#include <vector>
-#include <chrono>
-#include <vortex.h>
-#include <cmath>
-#include "common.h"
-
-#define RT_CHECK(_expr)                                         \
-   do {                                                         \
-     int _ret = _expr;                                          \
-     if (0 == _ret)                                             \
-       break;                                                   \
-     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
-	 cleanup();			                                              \
-     exit(-1);                                                  \
-   } while (false)
-
-///////////////////////////////////////////////////////////////////////////////
-
-const char* kernel_file = "kernel.vxbin";
-uint32_t matrix_size = 0;
-
-vx_device_h device = nullptr;
-vx_buffer_h A_buffer = nullptr;
-vx_buffer_h B_buffer = nullptr;
-vx_buffer_h C_buffer = nullptr;
-vx_buffer_h krnl_buffer = nullptr;
-vx_buffer_h args_buffer = nullptr;
-
-std::vector<uint8_t> staging_buf;
-kernel_arg_t kernel_arg = {};
-
-static void show_usage() {
-   std::cout << "Vortex Test." << std::endl;
-   std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
-}
-
-static void parse_args(int argc, char **argv, uint32_t &data_size) {
-  int c;
-  while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
-    switch (c) {
-    case 'n':
-      matrix_size = atoi(optarg);
-      break;
-    case 'k':
-      kernel_file = optarg;
-      break;
-    case 'd':
-      data_size = atoi(optarg);
-      break; 
-    case 'h':
-    case '?': {
-      show_usage();
-      exit(0);
-    } break;
-    default:
-      show_usage();
-      exit(-1);
-    }
-  }
-}
-
-void cleanup() {
-  if (device) {
-    vx_mem_free(A_buffer);
-    vx_mem_free(B_buffer);
-    vx_mem_free(C_buffer);
-    vx_mem_free(krnl_buffer);
-    vx_mem_free(args_buffer);
-    vx_dev_close(device);
-  }
-}
-
-template<typename TYPE>
-class mainVariables
-{
-  public:
-    // Constructor
-    mainVariables(uint32_t bufSize, uint32_t dataSize, uint32_t matrixSize)
-        : buf_size(bufSize), data_size(dataSize), matrix_size(matrixSize)
-    {
-        // Resize vectors to specified sizes
-        src_A.resize(buf_size/data_size);
-        src_B.resize(buf_size/data_size);
-        refs.resize(buf_size/data_size);
-    }
-
-  void init_inputs ()
-  {
-    std::cout << "inside init" << std::endl;
-    for (uint32_t i = 0; i < matrix_size*matrix_size; ++i) 
-    {
-      auto a = static_cast<float>(std::rand()) / RAND_MAX;
-      auto b = static_cast<float>(std::rand()) / RAND_MAX;
-      src_A[i] = static_cast<TYPE>(a * matrix_size);
-      src_B[i] = static_cast<TYPE>(b * matrix_size);
-    }
-  }
-
-  void matmul_cpu() 
-  {
-    for (uint32_t row = 0; row < matrix_size; ++row) 
-    {
-      for (uint32_t col = 0; col < matrix_size; ++col) 
-      {
-        TYPE sum(0);
-        for (uint32_t e = 0; e < matrix_size; ++e) {
-            sum += src_A[row * matrix_size + e] * src_B[e * matrix_size + col];
-        }
-        refs[row * matrix_size + col] = sum;
-      }
-    }
-  }
-
-  //Public variables
-  std::vector<TYPE> src_A;
-  std::vector<TYPE> src_B;
-  std::vector<TYPE> refs;
-
-  std::vector<uint8_t> A_mat;
-  std::vector<uint8_t> B_mat;
-
-  private:
-    uint32_t buf_size;
-    uint32_t data_size;
-    uint32_t matrix_size;
-};
-
-
-
-int main(int argc, char *argv[]) {  
-  // parse command arguments
-  uint32_t data_size = 0;
-  parse_args(argc, argv, data_size);
-  if (matrix_size == 0) {
-    matrix_size = 2;
-  }
-
-  // open device connection
-  std::cout << "open device connection" << std::endl;  
-  RT_CHECK(vx_dev_open(&device));
-
-  uint64_t num_cores, num_warps, num_threads;
-  uint64_t tc_size, TC_per_warp;
-
-  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
-  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
-  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
-  
-  //Add assert/knob
-  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
-  RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
-
-  std::cout << "Debug :: tc_size = " << tc_size << std::endl;
-  std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;
-
-  int threads_per_tc;
-  //TODO - can be changed
-  //Number of output tiles * number of threads
-  if (TC_per_warp > num_threads)
-    threads_per_tc = 1;
-  else
-    threads_per_tc = num_threads/TC_per_warp;
-
-  uint32_t num_tasks  = ((matrix_size*matrix_size)/(tc_size*tc_size))*threads_per_tc;
-
-  //size of each operand
-  uint32_t buf_size   =  ((matrix_size*matrix_size)/(tc_size*tc_size))*(matrix_size/(tc_size))*(tc_size*tc_size)*data_size;
-
-  //256
-  std::cout << "Debug :: buf_size: " << buf_size << " bytes" << std::endl;
-
-  // allocate device memory
-  std::cout << "allocate device memory" << std::endl;
-
-  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer));
-  RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.src0_addr));
-  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &B_buffer));
-  RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.src1_addr));
-  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &C_buffer));
-  RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.dst_addr));
-
-  std::cout << "A_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
-  std::cout << "B_addr=0x" << std::hex << kernel_arg.src1_addr << std::endl;
-  std::cout << "C_addr=0x" << std::hex << kernel_arg.dst_addr << std::endl;
-
-  mainVariables<int> variables (buf_size, data_size, matrix_size);
-  variables.init_inputs();
-  
-  //////////////////////////////////////////////////
-  // generate source data
-  //////////////////////////////////////////////////
-  variables.matmul_cpu();
-
-  uint32_t tc_size_f = tc_size*tc_size;
-  uint32_t n_tiles = matrix_size/tc_size;
-  
-  variables.A_mat.resize(buf_size);
-  variables.B_mat.resize(buf_size);
-
-  //Demand matrix creation for A / traverse through the rows
-  for(uint32_t k=0; k<n_tiles; k++)
-  {
-    //traverse through output tiles in a row
-    for(uint32_t i=0; i<n_tiles; i++)
-    {
-      //traverse through tiles for one output tile
-        for(uint32_t j=0; j< n_tiles; j++)
-        {
-          for(int t=0; t < tc_size*tc_size; t++)
-          { 
-            variables.A_mat[n_tiles*n_tiles*tc_size_f*k + n_tiles*tc_size_f*i+tc_size_f*j + t]   = variables.src_A[k*tc_size*matrix_size+ tc_size*j +(t/tc_size)*matrix_size + t%tc_size];
-          }
-        }
-    }
-  }
-  
-  //Demand matrix creation for B / traverse through the rows
-  for(uint32_t k=0; k<n_tiles; k++)
-  {
-    //traverse through output tiles in a row
-    for(uint32_t i=0; i<n_tiles; i++)
-    {
-      //traverse through tiles for one output tile
-      for(uint32_t j=0; j< n_tiles; j++)
-      {
-        for(int t=0; t < tc_size*tc_size; t++)
-        {
-          variables.B_mat[n_tiles*n_tiles*tc_size_f*k + n_tiles*tc_size_f*i+tc_size_f*j + t]   = variables.src_B[i*tc_size+ tc_size*matrix_size*j +(t/tc_size)*matrix_size + t%tc_size];
-        }
-      }
-    }
-  }
-  
-  //////////////////////////////////////////////////
-  //////////////////////////////////////////////////
-
-  // upload matrix A buffer
-  {
-    std::cout << "upload matrix A buffer" << std::endl;
-    RT_CHECK(vx_copy_to_dev(A_buffer, (int8_t*)variables.A_mat.data(), 0, buf_size));
-  }
-
-  // upload matrix B buffer
-  {
-    std::cout << "upload matrix B buffer" << std::endl;
-    RT_CHECK(vx_copy_to_dev(B_buffer, (int8_t*)variables.B_mat.data(), 0, buf_size));
-  }
-
-  // upload program
-  std::cout << "upload program" << std::endl;
-  RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
-
-  //////////////////////////////////////////////////
-  //Prep kernel arguments
-  //////////////////////////////////////////////////  
-  //1
-  std::cout << "Debug :: num_tasks = " << num_tasks << std::endl;
-  kernel_arg.num_tasks = num_tasks;
-  kernel_arg.num_warps = num_warps;
-  kernel_arg.num_threads = num_threads;
-  kernel_arg.TC_per_warp = TC_per_warp;
-  //1
-  kernel_arg.matrix_size = matrix_size;
-  kernel_arg.data_size = data_size;
-  kernel_arg.tc_size = tc_size;
-
-  std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
-  std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
-  std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
-
-  //////////////////////////////////////////////////  
-  //////////////////////////////////////////////////
-  
-  // upload kernel argument
-  std::cout << "upload kernel argument" << std::endl;
-  RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
-
-  auto time_start = std::chrono::high_resolution_clock::now();
-
-    std::cout << "start device" << std::endl;
-    RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
-
-    // wait for completion
-    std::cout << "wait for completion" << std::endl;
-    RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
-
-  auto time_end = std::chrono::high_resolution_clock::now();
-  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
-  printf("Elapsed time: %lg ms\n", elapsed);
-
-  // download destination buffer
-  std::cout << "download destination buffer" << std::endl;
-  RT_CHECK(vx_copy_from_dev((int8_t*)variables.B_mat.data(), C_buffer, 0, buf_size));
-
-  // verify result (TODO : needs to be fixed for for functional correctness)
-  /*
-  std::cout << "verify result" << std::endl;  
-  {
-    int errors = 0;
-    auto buf_ptr = (int8_t*)staging_buf.data();
-    uint64_t tc_size = kernel_arg.tc_size;
-    std::cout << "tc_size = " << tc_size << std::endl;
-    int Result[matrix_size*matrix_size];
-    int n_tiles = (matrix_size/tc_size);
-    int tc_size_f = tc_size*tc_size;
-
-    //converting buf ptr (tile by tile) to CPU style linear (row by row)
-    for(int k = 0; k < matrix_size/tc_size; k+= 1)
-    {
-      for(int j = 0; j < matrix_size; j+= tc_size)
-      {
-        for(int i =0; i < tc_size*tc_size; i++)
-        {
-          Result[ tc_size*matrix_size*k +j+ (i/tc_size)*matrix_size +i%(tc_size)]  = buf_ptr[matrix_size*tc_size*k+tc_size*j+i];
-        }
-      }    
-    }
-
-    for (uint32_t i = 0; i < matrix_size*matrix_size; ++i) {
-      //int ref = i + i; 
-      int cur = Result[i];
-      if (cur != refs[i]) {
-        ++errors;
-      }
-    }
-    if (errors != 0) {
-      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
-      std::cout << "FAILED!" << std::endl;
-      return 1;  
-    }
-    else
-    {
-      std::cout << "CONDITIONALLY PASSED!" << std::endl;
-    }
-  }
-  */
-
-  // cleanup
-  std::cout << "cleanup" << std::endl;  
-  cleanup();
-
-  std::cout << "PASSED!" << std::endl;
-
-  return 0;
-}
--- a/tests/regression/matmul/matmul_regression.sh
+++ b/tests/regression/matmul/matmul_regression.sh
@ -1,26 +0,0 @@
-#!/bin/bash 
-
-# README:
-# This script launches a sweep of TC_SIZE, TC_NUM and MATRIX SIZES
-# default values of NUM_WARPS=32, NUM_THREADS=32, NUM_CORES=4, DATA_SIZE=1
-# Edit matrix_sizes, tcsizes & tcnums variables to vary the sweep limits
-
-# Define arrays for tc_size,tc_num and matrix sizes
-matrix_sizes=(16 32 64 128 256 512)
-tcsizes=(8 16 32)
-tcnums=(4 8 16 32)
-
-cd ../../../build/
-
-# Loop through each combination of above configs
-for size in "${matrix_sizes[@]}"; do
-    for tcsize in "${tcsizes[@]}"; do
-        for tcnum in "${tcnums[@]}"; do
-            mkdir -p sim_final/mat${size}
-            log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
-            cmd="CONFIGS=\"-DTC_NUM=${tcnum} -DTC_SIZE=${tcsize}\" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args=\"-n${size} -d1\" --rebuild=1 --perf=1  > ${log_name} 2>&1"
-            echo $cmd
-            eval $cmd
-        done
-    done    
-done