Added matrix multiplication simx updates

This commit is contained in:
Varun Saxena 2023-02-02 16:26:57 -05:00
parent 88ed687557
commit 86e4ba9e4f
20 changed files with 60306 additions and 32 deletions

View file

@ -198,6 +198,26 @@ inline void vx_fence() {
asm volatile ("fence iorw, iorw");
}
// Matrix-load (ML) tensor-core intrinsic: emits an S-type instruction on the
// custom opcode 0x7b with funct3=0.  `addr` is the base address of the source
// matrix in memory; `dest` is packed into the S-type immediate field and
// selects the destination tile register (callers pass 0 for matrix A and
// 1 for matrix B).
// NOTE(review): `dest` must fit the S-type immediate encoding — confirm range.
inline void ml(unsigned dest, unsigned addr) {
asm volatile (".insn s 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
}
// Matrix-store (MS) tensor-core intrinsic: emits an S-type instruction on the
// custom opcode 0x7b with funct3=1.  Writes the accumulator tile (matrix C)
// back to memory starting at `addr`; the immediate offset is fixed at 0.
inline void ms(unsigned addr) {
asm volatile (".insn s 0x7b, 1, x0, 0(%0)" :: "r"(addr));
}
// Matrix-multiply (MM) tensor-core intrinsic: emits an S-type instruction on
// the custom opcode 0x7b with funct3=2.  Takes no register operands (x0/0 are
// placeholders); multiplies the previously loaded A and B tiles into C.
inline void mm() {
asm volatile (".insn s 0x7b, 2, x0, 0(x0)");
}
//inline void vx_prefetch(unsigned addr) {
// asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
//}
//inline void vx_barrier(unsigned barrier_id, unsigned num_warps) {
// asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barrier_id), "r"(num_warps));
//}
#define __if(b) vx_split(b); \
if (b)
@ -211,4 +231,4 @@ inline void vx_fence() {
}
#endif
#endif
#endif

View file

@ -181,11 +181,17 @@ void Core::cout_flush() {
}
void Core::tick() {
//std::cout << "coreid=" << id_ << " Start tick. Before commit" << std::endl;
this->commit();
//std::cout << "coreid=" << id_ << " After Commit. Before execute" << std::endl;
this->execute();
//std::cout << "coreid=" << id_ << " After execute. Before decode" << std::endl;
this->decode();
//std::cout << "coreid=" << id_ << " After decode. Before fetch" << std::endl;
this->fetch();
//std::cout << "coreid=" << id_ << " After fetch. Before schedule" << std::endl;
this->schedule();
//std::cout << "coreid=" << id_ << " After schedule" << std::endl;
// update perf counter
perf_stats_.mem_latency += perf_mem_pending_reads_;

View file

@ -1,7 +1,7 @@
#pragma once
#ifndef DEBUG_LEVEL
#define DEBUG_LEVEL 3
#define DEBUG_LEVEL 4
#endif
#define DEBUG_HEADER << "DEBUG "
@ -10,7 +10,7 @@
#define TRACE_HEADER << "TRACE "
//#define TRACE_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": "
#ifndef NDEBUG
#ifdef NDEBUG
#include <iostream>
#include <iomanip>

View file

@ -42,6 +42,7 @@ static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable =
{Opcode::VSET, {false, InstType::V_TYPE}},
{Opcode::GPGPU, {false, InstType::R_TYPE}},
{Opcode::GPU, {false, InstType::R4_TYPE}},
{Opcode::TCU, {false, InstType::S_TYPE}},
{Opcode::R_INST_W, {false, InstType::R_TYPE}},
{Opcode::I_INST_W, {false, InstType::I_TYPE}},
};
@ -368,6 +369,14 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}
case Opcode::TCU:
switch(func3){
case 0: return "ML";
case 1: return "MS";
case 2: return "MM";
default:
std::abort();
}
default:
std::abort();
}
@ -430,7 +439,7 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
auto op_it = sc_instTable.find(op);
if (op_it == sc_instTable.end()) {
std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::endl;
std::cout << std::hex << "Error: asdada invalid opcode: 0x" << op << std::endl;
return nullptr;
}
@ -546,8 +555,10 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
instr->setFunc3(func3);
auto imm = (func7 << width_reg) | rd;
instr->setImm(sext(imm, width_i_imm));
} break;
instr->setImm(sext(imm, width_i_imm));
if (op == Opcode::TCU)
std::cout << "TCUDEBUG: immediate val: " << imm << ", address in reg# " << rs1 << ", zero: " << rs2 << std::endl;
} break;
case InstType::B_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);

View file

@ -124,6 +124,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
std::vector<reg_data_t[3]> rsdata(num_threads);
std::vector<reg_data_t> rddata(num_threads);
std::vector<reg_data_t[SIZE_SQ]> rddata_arr(num_threads);
auto num_rsrcs = instr.getNRSrc();
if (num_rsrcs) {
@ -1451,6 +1452,141 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
std::abort();
}
} break;
// Tensor-core (TCU) instruction emulation, dispatched on funct3:
//   0 = ML (matrix load), 1 = MS (matrix store), 2 = MM (matrix multiply).
// Tiles are SIZE x SIZE words, laid out row-major in memory with a 4-byte
// element stride.
case TCU: {
switch (func3) {
case 0: { //Matrix Load
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::LOAD;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
// (2 & 0x3) == 2, so mem_bytes is always 4 — one 32-bit word per element.
uint32_t mem_bytes = 1 << (2 & 0x3);
for (uint32_t t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
// Base address of the tile comes from the first source register.
uint64_t mem_addr = rsdata[t][0].i ;
uint64_t mem_addr_arr[SIZE][SIZE];
uint64_t base_addr = rsdata[t][0].i ;
//get the memory addresses
// Row-major: element (i,j) lives at base + (SIZE*i + j) * 4 bytes.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
mem_addr_arr[i][j] = base_addr + ((SIZE*i) + j)*4;
}
}
// mem_data stays 0: the scalar read below is commented out, so the
// scalar rddata[t] written later is always sext(0).
uint64_t mem_data = 0;
uint64_t mem_data_arr[SIZE][SIZE];
//core_->dcache_read(&mem_data, mem_addr, mem_bytes);
//load memory addresses
// Read every tile element through the data cache.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
uint64_t* temp_ref = &mem_data_arr[i][j];
//core_->dcache_read(mem_data_arr[i][j], mem_addr_arr[i][j], mem_bytes);
core_->dcache_read(temp_ref, mem_addr_arr[i][j], mem_bytes);
DP(4, "TCU LOAD MEM: ADDRESS=0x" << std::hex << mem_addr_arr[i][j] << ", DATA=0x" << mem_data_arr[i][j]);
}
}
// NOTE(review): only the base address is recorded in the trace, not the
// SIZE*SIZE individual element accesses — confirm this is intended for
// the timing model.
trace->mem_addrs.at(t).push_back({mem_addr, mem_bytes});
DP(4, "TCU LOAD MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << mem_data);
//load 32 bit data into rdata. Now what?
// RV32I: LW
rddata[t].i = sext((Word)mem_data, 32);
//put into rddata_arr[]
// Flatten the tile into rddata_arr; the writeback stage later copies it
// into tcore_ireg_a or tcore_ireg_b based on the immediate.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
rddata_arr[t][SIZE*i + j].i = sext((Word)mem_data_arr[i][j], 32);
}
}
}
rd_write = true;
} break;
case 1: { //Matrix Store
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::STORE;
trace->used_iregs.set(rsrc0);
// NOTE(review): stores only consume the base-address register — confirm
// marking rsrc1 as used is intentional.
trace->used_iregs.set(rsrc1);
DP(4, "TCU STORE MEM: ADDRESS=0x");// << std::hex << mem_addr << ", DATA=0x" << mem_data);
// Always 4 bytes per element (see load case).
uint32_t mem_bytes = 1 << (2 & 0x3);
uint64_t mask = ((uint64_t(1) << (8 * mem_bytes))-1);
for (uint32_t t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i;
uint64_t base_addr = rsdata[t][0].i;
uint64_t mem_data = tcore_ireg_c[t][0];
uint64_t mem_addr_arr[SIZE][SIZE];
uint64_t mem_data_arr[SIZE][SIZE];
//memory addr array
// Same row-major addressing as the load case.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
mem_addr_arr[i][j] = base_addr + ((SIZE*i) + j)*4;
}
}
//data array from tcore reg c
// Source data is the accumulator tile C produced by the MM case.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
mem_data_arr[i][j] = tcore_ireg_c.at(t)[i*SIZE + j];
//tcore_ireg_a.at(t)[i] = rddata_arr[t].i;
}
}
// Truncate the scalar sample to the store width (masks mem_data only;
// mem_data_arr elements are written unmasked below).
if (mem_bytes < 8) {
mem_data &= mask;
}
trace->mem_addrs.at(t).push_back({mem_addr, mem_bytes});
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << mem_data);
//core_->dcache_write(&mem_data, mem_addr, mem_bytes);
// Write every tile element through the data cache.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
uint64_t* temp_ref = &mem_data_arr[i][j];
core_->dcache_write(temp_ref, mem_addr_arr[i][j], mem_bytes);
DP(4, "TCU STORE MEM: ADDRESS=0x" << std::hex << mem_addr_arr[i][j] << ", DATA=0x" << mem_data_arr[i][j]);
}
}
}
} break;
case 2: { //Matrix Multiply
DP(4, "TCU MULTIPLY MAT");// << std::hex << mem_addr << ", DATA=0x" << mem_data);
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::ARITH;
trace->used_tcore_iregs_a.set(rsrc0);
// NOTE(review): B's mask is also set from rsrc0 — looks like it should
// be rsrc1; confirm against the scoreboard's expectations.
trace->used_tcore_iregs_b.set(rsrc0);
for (uint32_t t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
//tcore_ireg_c.at(t)[0] = tcore_ireg_a.at(t)[0] * tcore_ireg_b.at(t)[0] + tcore_ireg_a.at(t)[1] * tcore_ireg_b.at(t)[2];
//tcore_ireg_c.at(t)[1] = tcore_ireg_a.at(t)[0] * tcore_ireg_b.at(t)[1] + tcore_ireg_a.at(t)[1] * tcore_ireg_b.at(t)[3];
//tcore_ireg_c.at(t)[2] = tcore_ireg_a.at(t)[2] * tcore_ireg_b.at(t)[0] + tcore_ireg_a.at(t)[3] * tcore_ireg_b.at(t)[2];
//tcore_ireg_c.at(t)[3] = tcore_ireg_a.at(t)[2] * tcore_ireg_b.at(t)[1] + tcore_ireg_a.at(t)[3] * tcore_ireg_b.at(t)[3];
// Textbook O(SIZE^3) matmul: C = A x B over the flattened tiles.
// NOTE(review): `int sum` can overflow for large 32-bit operands
// (signed overflow is UB) — consider a wider accumulator.
for (int i = 0; i < SIZE; i++) { //ROW-1
for (int j = 0; j < SIZE; j++) { //COL-2
int sum = 0;
for (int k = 0; k < SIZE; k++){ //COL-1
sum = sum + tcore_ireg_a.at(t)[i * SIZE + k] * tcore_ireg_b.at(t)[k * SIZE + j]; //sum = [i * col1 + k] * [k * col2 + j]
}
tcore_ireg_c.at(t)[i * SIZE + j] = sum; //[i * col2 + j] = sum
}
}
// Mark every element of the C tile as written for the scoreboard.
for (int i = 0; i < SIZE_SQ; i++){
trace->used_tcore_iregs_c[i] = 1;
std::cout << "TCU MM: Multiplication result: " << std::hex << tcore_ireg_c.at(t)[i] << std::endl;
}
}
rd_write = true;
}break;
default:
std::abort();
}
} break;
case VSET: {
uint32_t VLEN = core_->arch().vsize() * 8;
uint32_t VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
@ -2307,40 +2443,81 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->wb = true;
DPH(2, "Dest Reg: ");
auto type = instr.getRDType();
switch (type) {
case RegType::Integer:
if (rdest) {
DPN(2, type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
ireg_file_.at(t)[rdest] = rddata[t].i;
DPN(2, "0x" << std::hex << rddata[t].i);
}
DPN(2, "}" << std::endl);
trace->used_iregs[rdest] = 1;
}
break;
case RegType::Float:
DPN(2, type << std::dec << rdest << "={");
if(opcode == Opcode::TCU){ //tensor core
//iterate over threads
//put in tensor core reg.
std::cout << "TCU if condition" << std::endl;
DPN(2, type << std::dec << immsrc << "={"); //FIX
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
freg_file_.at(t)[rdest] = rddata[t].f;
DPN(2, "0x" << std::hex << rddata[t].f);
//check immediate value
if(immsrc == 0){ // 0 => A; Load A
//iterate over all regs in A
for (int i = 0; i < SIZE_SQ; i++){
tcore_ireg_a.at(t)[i] = rddata_arr[t][i].i;
trace->used_tcore_iregs_a[i] = 1;
}
}
else if(immsrc == 1){ // 0 => B; Load B
//iterate over all regs in B
for (int i = 0; i < SIZE_SQ; i++){
tcore_ireg_b.at(t)[i] = rddata_arr[t][i].i;
trace->used_tcore_iregs_b[i] = 1;
}
}
/*
else if(){ // Mul A x B
for (int i = 0; i < SIZE_SQ; i++){
tcore_ireg_c.at(t)[i] = rddata[t].i;
trace->used_tcore_iregs_b[i] = 1;
}
}
*/
DPN(2, "0x" << std::hex << rddata[t].i);
}
DPN(2, "}" << std::endl);
trace->used_fregs[rdest] = 1;
break;
default:
std::abort();
break;
}
else{
switch (type) {
case RegType::Integer:
if (rdest) {
DPN(2, type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
ireg_file_.at(t)[rdest] = rddata[t].i;
DPN(2, "0x" << std::hex << rddata[t].i);
}
DPN(2, "}" << std::endl);
trace->used_iregs[rdest] = 1;
}
break;
case RegType::Float:
DPN(2, type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
freg_file_.at(t)[rdest] = rddata[t].f;
DPN(2, "0x" << std::hex << rddata[t].f);
}
DPN(2, "}" << std::endl);
trace->used_fregs[rdest] = 1;
break;
default:
std::abort();
break;
}
}
}

View file

@ -32,6 +32,8 @@ enum Opcode {
// GPGPU Extension
GPGPU = 0x6b,
GPU = 0x5b,
// tensorcore Extension
TCU = 0x7b,
// RV64 Standard Extensions
R_INST_W = 0x3b,
I_INST_W = 0x1b,

View file

@ -76,6 +76,7 @@ int main(int argc, char **argv) {
// attach memory module
processor.attach_ram(&ram);
//std::cout << "Test msg" << " Num of cores: " << num_cores << ". Num of warps: " << num_warps << ". Num of threads: " << num_threads << std::endl;
// run simulation
exitcode = processor.run();

View file

@ -32,6 +32,9 @@ struct pipeline_trace_t {
RegMask used_iregs;
RegMask used_fregs;
RegMask used_vregs;
RegMask used_tcore_iregs_a;
RegMask used_tcore_iregs_b;
RegMask used_tcore_iregs_c;
//-
ExeType exe_type;

View file

@ -16,6 +16,9 @@ private:
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::vector<RegMask> in_use_tcore_iregs_a;
std::vector<RegMask> in_use_tcore_iregs_b;
std::vector<RegMask> in_use_tcore_iregs_c;
std::unordered_map<uint32_t, uint64_t> owners_;
public:
@ -23,6 +26,9 @@ public:
: in_use_iregs_(arch.num_warps())
, in_use_fregs_(arch.num_warps())
, in_use_vregs_(arch.num_warps())
, in_use_tcore_iregs_a(arch.num_warps())
, in_use_tcore_iregs_b(arch.num_warps())
, in_use_tcore_iregs_c(arch.num_warps())
{
this->clear();
}
@ -32,6 +38,9 @@ public:
in_use_iregs_.at(i).reset();
in_use_fregs_.at(i).reset();
in_use_vregs_.at(i).reset();
in_use_tcore_iregs_a.at(i).reset();
in_use_tcore_iregs_b.at(i).reset();
in_use_tcore_iregs_c.at(i).reset();
}
owners_.clear();
}

View file

@ -16,6 +16,9 @@ Warp::Warp(Core *core, uint32_t id)
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<FWord>(core->arch().num_regs()))
, vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
, tcore_ireg_a(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, tcore_ireg_b(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, tcore_ireg_c(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
{
this->clear();
}
@ -34,6 +37,30 @@ void Warp::clear() {
for (auto& reg : vreg_file_.at(i)) {
reg = 0;
}
/*
for (auto& reg : treg_file_.at(i)) {
reg = 0;
}
*/
for (auto& reg : tcore_ireg_a.at(i)) {
reg = 0;
}
for (auto& reg : tcore_ireg_b.at(i)) {
reg = 0;
}
for (auto& reg : tcore_ireg_c.at(i)) {
reg = 0;
}
//clear the tensorcore regs
//for (int j = 0; j < SIZE_SQ; j++){
// //for (int j = 0; j < 2; j++){
// tcore_ireg_a[j] = 0;
// tcore_ireg_b[j] = 0;
// tcore_ireg_c[j] = 0;
// //}
//}
}
}

View file

@ -5,6 +5,10 @@
#include <stack>
#include "types.h"
#define SIZE 2
#define SIZE_SQ SIZE*SIZE
namespace vortex {
class Core;
@ -105,6 +109,12 @@ private:
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<FWord>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
//tensorcore registers
std::vector<std::vector<Word>> tcore_ireg_a;
std::vector<std::vector<Word>> tcore_ireg_b;
std::vector<std::vector<Word>> tcore_ireg_c; //accumulator
std::stack<DomStackEntry> dom_stack_;
struct vtype vtype_;

View file

@ -0,0 +1,55 @@
# Build script for the tensor-core matmul kernel (runs on the Vortex RISC-V GPGPU).
# Target word size; 32-bit RISC-V by default.
XLEN ?= 32
# Select the matching GNU toolchain install for the chosen XLEN.
ifeq ($(XLEN),32)
RISCV_TOOLCHAIN_PATH = /opt/riscv-gnu-toolchain
else
RISCV_TOOLCHAIN_PATH = /opt/riscv64-gnu-toolchain
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
# Location of the Vortex runtime (headers, linker script, libvortexrt.a).
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
# Cross tools: compiler, archiver, disassembler, object copier.
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objcopy
SIM_DIR=../../../sim
# ISA/ABI selection per word size (F extension on rv32, FD on rv64).
ifeq ($(XLEN),32)
CFLAGS += -march=rv32imf -mabi=ilp32f
else
CFLAGS += -march=rv64imfd -mabi=lp64d
endif
CFLAGS += -O3 -Wstack-usage=1024 -mcmodel=medany -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I./
LDFLAGS += -lm -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
PROJECT = matmul
SRCS = vx_mat_mulint32.s main.cpp
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
# Disassembly listing for debugging the custom TCU instructions.
$(PROJECT).dump: $(PROJECT).elf
	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
# Raw binary image loaded by the simulators.
$(PROJECT).bin: $(PROJECT).elf
	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
$(PROJECT).elf: $(SRCS)
	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
# Run under the RTL simulator.
run-rtlsim: $(PROJECT).bin
	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
# Run under the cycle-level simulator (1 core).
run-simx: $(PROJECT).bin
	$(SIM_DIR)/simx/simx -c 1 -i $(PROJECT).bin
# NOTE(review): .depend is generated but never read — a `-include .depend`
# line is missing, so header-dependency tracking has no effect.
.depend: $(SRCS)
	$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
	rm -rf *.elf *.bin *.dump .depend

View file

@ -0,0 +1,89 @@
#include <stdio.h>
#include <vx_print.h>
#include <vx_intrinsics.h>
#include "vx_mat_mulint32.h"
// SIZE x SIZE (2x2) input matrix A for the tensor-core matmul test.
int A[SIZE][SIZE] =
{
{3,5},
{7,9}
};
// Input matrix B.
int B[SIZE][SIZE] =
{
{11,13},
{15,17}
};
// Expected result of A x B, computed by hand:
//   [3*11+5*15, 3*13+5*17]   [108, 124]
//   [7*11+9*15, 7*13+9*17] = [212, 244]
int Ans[SIZE][SIZE] =
{
{108,124},
{212,244}
};
// Tensor-core matmul smoke test: loads A and B into the TCU tile registers
// via the ml() intrinsic, multiplies with mm(), stores the result to C with
// ms(), then compares C against the precomputed Ans matrix.
// Returns 0 on match, 1 on mismatch.
int main() {
int errors = 0;
vx_printf("KDEBUG Initializing output matrix\n");
// Output matrix, zero-initialized; filled by the ms() store below.
int C[SIZE][SIZE] =
{
{0,0},
{0,0}
};
// NOTE(review): casting a pointer to uint32_t truncates on a 64-bit build
// (XLEN=64) — confirm this test is rv32-only or widen the intrinsic args.
uint32_t a_addr = (uint32_t)A ;
uint32_t b_addr = (uint32_t)B ;
uint32_t c_addr = (uint32_t)C;
vx_printf("KDEBUG Done Initializing output matrix\n");
vx_printf("KDEBUG Starting Matmul\n");
//matmul on vortex
// Reference software matmul, kept for comparison/debugging:
// for(int i = 0; i < SIZE; i++){
// for(int j = 0; j < SIZE; j++){
// for(int k = 0; k < SIZE; k++)
// {
// vx_printf("KDEBUG Just before multiply add\n");
// C[i][j] += A[i][k] * B[k][j];
// vx_printf("KDEBUG Just after multiply add\n");
// }
// }
// }
// vx_printf("KDEBUG TEST matrix address A = %u, B = %u, C = %u\n", a_addr, b_addr, c_addr);
// Hardware path: immediate 0 loads tile A, 1 loads tile B (see ml()).
ml(0,a_addr);
ml(1,b_addr);
mm();
ms(c_addr);
// vx_printf("KDEBUG Finished Matmul\n");
// Note: prints only the [0][0] elements, not the full matrices.
vx_printf("KDEBUG Result of mul(%dx%d) = %d\n", A[0][0], B[0][0], C[0][0]);
//comparison
vx_printf("KDEBUG Starting Comparison\n");
bool flag = true;
// Element-wise compare of C against the expected answer.
// NOTE(review): `break` exits only the inner loop; the outer loop keeps
// scanning after a mismatch. Harmless (flag stays false) but wasteful.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
if(C[i][j] != Ans[i][j]){
flag = false;
break;
}
}
}
vx_printf("KDEBUG Finished Comparison\n");
if (flag) {
vx_printf("Passed!\n");
} else {
// NOTE(review): missing trailing '\n' — inconsistent with "Passed!\n".
vx_printf("Failed!");
errors = 1;
}
return errors;
}

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -0,0 +1,278 @@
ramulator.active_cycles_0 1372 # Total active cycles for level _0
ramulator.busy_cycles_0 1372 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0
ramulator.serving_requests_0 1372 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0
ramulator.average_serving_requests_0 0.006794 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0
ramulator.active_cycles_0_0 1372 # Total active cycles for level _0_0
ramulator.busy_cycles_0_0 7924 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0
ramulator.serving_requests_0_0 1372 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0
ramulator.average_serving_requests_0_0 0.006794 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0
ramulator.active_cycles_0_0_0 952 # Total active cycles for level _0_0_0
ramulator.busy_cycles_0_0_0 952 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0
ramulator.serving_requests_0_0_0 952 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0
ramulator.average_serving_requests_0_0_0 0.004714 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0
ramulator.active_cycles_0_0_0_0 952 # Total active cycles for level _0_0_0_0
ramulator.busy_cycles_0_0_0_0 952 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_0
ramulator.serving_requests_0_0_0_0 952 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_0
ramulator.average_serving_requests_0_0_0_0 0.004714 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_0
ramulator.active_cycles_0_0_0_1 0 # Total active cycles for level _0_0_0_1
ramulator.busy_cycles_0_0_0_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_1
ramulator.serving_requests_0_0_0_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_1
ramulator.average_serving_requests_0_0_0_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_1
ramulator.active_cycles_0_0_0_2 0 # Total active cycles for level _0_0_0_2
ramulator.busy_cycles_0_0_0_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_2
ramulator.serving_requests_0_0_0_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_2
ramulator.average_serving_requests_0_0_0_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_2
ramulator.active_cycles_0_0_0_3 0 # Total active cycles for level _0_0_0_3
ramulator.busy_cycles_0_0_0_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_3
ramulator.serving_requests_0_0_0_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_3
ramulator.average_serving_requests_0_0_0_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_3
ramulator.active_cycles_0_0_1 420 # Total active cycles for level _0_0_1
ramulator.busy_cycles_0_0_1 420 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1
ramulator.serving_requests_0_0_1 420 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1
ramulator.average_serving_requests_0_0_1 0.002080 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1
ramulator.active_cycles_0_0_1_0 420 # Total active cycles for level _0_0_1_0
ramulator.busy_cycles_0_0_1_0 420 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_0
ramulator.serving_requests_0_0_1_0 420 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_0
ramulator.average_serving_requests_0_0_1_0 0.002080 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_0
ramulator.active_cycles_0_0_1_1 0 # Total active cycles for level _0_0_1_1
ramulator.busy_cycles_0_0_1_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_1
ramulator.serving_requests_0_0_1_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_1
ramulator.average_serving_requests_0_0_1_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_1
ramulator.active_cycles_0_0_1_2 0 # Total active cycles for level _0_0_1_2
ramulator.busy_cycles_0_0_1_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_2
ramulator.serving_requests_0_0_1_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_2
ramulator.average_serving_requests_0_0_1_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_2
ramulator.active_cycles_0_0_1_3 0 # Total active cycles for level _0_0_1_3
ramulator.busy_cycles_0_0_1_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_3
ramulator.serving_requests_0_0_1_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_3
ramulator.average_serving_requests_0_0_1_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_3
ramulator.active_cycles_0_0_2 0 # Total active cycles for level _0_0_2
ramulator.busy_cycles_0_0_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2
ramulator.serving_requests_0_0_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2
ramulator.average_serving_requests_0_0_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2
ramulator.active_cycles_0_0_2_0 0 # Total active cycles for level _0_0_2_0
ramulator.busy_cycles_0_0_2_0 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_0
ramulator.serving_requests_0_0_2_0 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_0
ramulator.average_serving_requests_0_0_2_0 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_0
ramulator.active_cycles_0_0_2_1 0 # Total active cycles for level _0_0_2_1
ramulator.busy_cycles_0_0_2_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_1
ramulator.serving_requests_0_0_2_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_1
ramulator.average_serving_requests_0_0_2_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_1
ramulator.active_cycles_0_0_2_2 0 # Total active cycles for level _0_0_2_2
ramulator.busy_cycles_0_0_2_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_2
ramulator.serving_requests_0_0_2_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_2
ramulator.average_serving_requests_0_0_2_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_2
ramulator.active_cycles_0_0_2_3 0 # Total active cycles for level _0_0_2_3
ramulator.busy_cycles_0_0_2_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_3
ramulator.serving_requests_0_0_2_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_3
ramulator.average_serving_requests_0_0_2_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_3
ramulator.active_cycles_0_0_3 0 # Total active cycles for level _0_0_3
ramulator.busy_cycles_0_0_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3
ramulator.serving_requests_0_0_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3
ramulator.average_serving_requests_0_0_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3
ramulator.active_cycles_0_0_3_0 0 # Total active cycles for level _0_0_3_0
ramulator.busy_cycles_0_0_3_0 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_0
ramulator.serving_requests_0_0_3_0 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_0
ramulator.average_serving_requests_0_0_3_0 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_0
ramulator.active_cycles_0_0_3_1 0 # Total active cycles for level _0_0_3_1
ramulator.busy_cycles_0_0_3_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_1
ramulator.serving_requests_0_0_3_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_1
ramulator.average_serving_requests_0_0_3_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_1
ramulator.active_cycles_0_0_3_2 0 # Total active cycles for level _0_0_3_2
ramulator.busy_cycles_0_0_3_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_2
ramulator.serving_requests_0_0_3_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_2
ramulator.average_serving_requests_0_0_3_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_2
ramulator.active_cycles_0_0_3_3 0 # Total active cycles for level _0_0_3_3
ramulator.busy_cycles_0_0_3_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_3
ramulator.serving_requests_0_0_3_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_3
ramulator.average_serving_requests_0_0_3_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_3
ramulator.read_transaction_bytes_0 3008 # The total byte of read transaction per channel
ramulator.write_transaction_bytes_0 2304 # The total byte of write transaction per channel
ramulator.row_hits_channel_0_core 64 # Number of row hits per channel per core
ramulator.row_misses_channel_0_core 11 # Number of row misses per channel per core
ramulator.row_conflicts_channel_0_core 8 # Number of row conflicts per channel per core
ramulator.read_row_hits_channel_0_core 32 # Number of row hits for read requests per channel per core
[0] 32.0 #
ramulator.read_row_misses_channel_0_core 11 # Number of row misses for read requests per channel per core
[0] 11.0 #
ramulator.read_row_conflicts_channel_0_core 4 # Number of row conflicts for read requests per channel per core
[0] 4.0 #
ramulator.write_row_hits_channel_0_core 32 # Number of row hits for write requests per channel per core
[0] 32.0 #
ramulator.write_row_misses_channel_0_core 0 # Number of row misses for write requests per channel per core
[0] 0.0 #
ramulator.write_row_conflicts_channel_0_core 4 # Number of row conflicts for write requests per channel per core
[0] 4.0 #
ramulator.useless_activates_0_core 0 # Number of useless activations. E.g, ACT -> PRE w/o RD or WR
ramulator.read_latency_avg_0 28.234043 # The average memory latency cycles (in memory time domain) per request for all read requests in this channel
ramulator.read_latency_sum_0 1327 # The memory latency cycles (in memory time domain) sum for all read requests in this channel
ramulator.req_queue_length_avg_0 0.006571 # Average of read and write queue length per memory cycle per channel.
ramulator.req_queue_length_sum_0 1327 # Sum of read and write queue length per memory cycle per channel.
ramulator.read_req_queue_length_avg_0 0.005382 # Read queue length average per memory cycle per channel.
ramulator.read_req_queue_length_sum_0 1087 # Read queue length sum per memory cycle per channel.
ramulator.write_req_queue_length_avg_0 0.001188 # Write queue length average per memory cycle per channel.
ramulator.write_req_queue_length_sum_0 240 # Write queue length sum per memory cycle per channel.
ramulator.record_read_hits 0.0 # record read hit count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_read_misses 0.0 # record_read_miss count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_read_conflicts 0.0 # record read conflict count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_hits 0.0 # record write hit count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_misses 0.0 # record write miss count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_conflicts 0.0 # record write conflict for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.active_cycles_1 1442 # Total active cycles for level _1
ramulator.busy_cycles_1 1442 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1
ramulator.serving_requests_1 1464 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1
ramulator.average_serving_requests_1 0.007249 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1
ramulator.active_cycles_1_0 1442 # Total active cycles for level _1_0
ramulator.busy_cycles_1_0 7994 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0
ramulator.serving_requests_1_0 1464 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0
ramulator.average_serving_requests_1_0 0.007249 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0
ramulator.active_cycles_1_0_0 790 # Total active cycles for level _1_0_0
ramulator.busy_cycles_1_0_0 790 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0
ramulator.serving_requests_1_0_0 812 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0
ramulator.average_serving_requests_1_0_0 0.004021 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0
ramulator.active_cycles_1_0_0_0 790 # Total active cycles for level _1_0_0_0
ramulator.busy_cycles_1_0_0_0 790 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_0
ramulator.serving_requests_1_0_0_0 812 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_0
ramulator.average_serving_requests_1_0_0_0 0.004021 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_0
ramulator.active_cycles_1_0_0_1 0 # Total active cycles for level _1_0_0_1
ramulator.busy_cycles_1_0_0_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_1
ramulator.serving_requests_1_0_0_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_1
ramulator.average_serving_requests_1_0_0_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_1
ramulator.active_cycles_1_0_0_2 0 # Total active cycles for level _1_0_0_2
ramulator.busy_cycles_1_0_0_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_2
ramulator.serving_requests_1_0_0_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_2
ramulator.average_serving_requests_1_0_0_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_2
ramulator.active_cycles_1_0_0_3 0 # Total active cycles for level _1_0_0_3
ramulator.busy_cycles_1_0_0_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_3
ramulator.serving_requests_1_0_0_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_3
ramulator.average_serving_requests_1_0_0_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_3
ramulator.active_cycles_1_0_1 300 # Total active cycles for level _1_0_1
ramulator.busy_cycles_1_0_1 300 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1
ramulator.serving_requests_1_0_1 300 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1
ramulator.average_serving_requests_1_0_1 0.001486 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1
ramulator.active_cycles_1_0_1_0 300 # Total active cycles for level _1_0_1_0
ramulator.busy_cycles_1_0_1_0 300 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_0
ramulator.serving_requests_1_0_1_0 300 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_0
ramulator.average_serving_requests_1_0_1_0 0.001486 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_0
ramulator.active_cycles_1_0_1_1 0 # Total active cycles for level _1_0_1_1
ramulator.busy_cycles_1_0_1_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_1
ramulator.serving_requests_1_0_1_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_1
ramulator.average_serving_requests_1_0_1_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_1
ramulator.active_cycles_1_0_1_2 0 # Total active cycles for level _1_0_1_2
ramulator.busy_cycles_1_0_1_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_2
ramulator.serving_requests_1_0_1_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_2
ramulator.average_serving_requests_1_0_1_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_2
ramulator.active_cycles_1_0_1_3 0 # Total active cycles for level _1_0_1_3
ramulator.busy_cycles_1_0_1_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_3
ramulator.serving_requests_1_0_1_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_3
ramulator.average_serving_requests_1_0_1_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_3
ramulator.active_cycles_1_0_2 0 # Total active cycles for level _1_0_2
ramulator.busy_cycles_1_0_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2
ramulator.serving_requests_1_0_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2
ramulator.average_serving_requests_1_0_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2
ramulator.active_cycles_1_0_2_0 0 # Total active cycles for level _1_0_2_0
ramulator.busy_cycles_1_0_2_0 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_0
ramulator.serving_requests_1_0_2_0 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_0
ramulator.average_serving_requests_1_0_2_0 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_0
ramulator.active_cycles_1_0_2_1 0 # Total active cycles for level _1_0_2_1
ramulator.busy_cycles_1_0_2_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_1
ramulator.serving_requests_1_0_2_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_1
ramulator.average_serving_requests_1_0_2_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_1
ramulator.active_cycles_1_0_2_2 0 # Total active cycles for level _1_0_2_2
ramulator.busy_cycles_1_0_2_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_2
ramulator.serving_requests_1_0_2_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_2
ramulator.average_serving_requests_1_0_2_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_2
ramulator.active_cycles_1_0_2_3 0 # Total active cycles for level _1_0_2_3
ramulator.busy_cycles_1_0_2_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_3
ramulator.serving_requests_1_0_2_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_3
ramulator.average_serving_requests_1_0_2_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_3
ramulator.active_cycles_1_0_3 352 # Total active cycles for level _1_0_3
ramulator.busy_cycles_1_0_3 352 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3
ramulator.serving_requests_1_0_3 352 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3
ramulator.average_serving_requests_1_0_3 0.001743 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3
ramulator.active_cycles_1_0_3_0 0 # Total active cycles for level _1_0_3_0
ramulator.busy_cycles_1_0_3_0 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_0
ramulator.serving_requests_1_0_3_0 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_0
ramulator.average_serving_requests_1_0_3_0 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_0
ramulator.active_cycles_1_0_3_1 0 # Total active cycles for level _1_0_3_1
ramulator.busy_cycles_1_0_3_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_1
ramulator.serving_requests_1_0_3_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_1
ramulator.average_serving_requests_1_0_3_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_1
ramulator.active_cycles_1_0_3_2 0 # Total active cycles for level _1_0_3_2
ramulator.busy_cycles_1_0_3_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_2
ramulator.serving_requests_1_0_3_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_2
ramulator.average_serving_requests_1_0_3_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_2
ramulator.active_cycles_1_0_3_3 352 # Total active cycles for level _1_0_3_3
ramulator.busy_cycles_1_0_3_3 352 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_3
ramulator.serving_requests_1_0_3_3 352 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_3
ramulator.average_serving_requests_1_0_3_3 0.001743 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_3
ramulator.read_transaction_bytes_1 2688 # The total byte of read transaction per channel
ramulator.write_transaction_bytes_1 46528 # The total byte of write transaction per channel
ramulator.row_hits_channel_1_core 733 # Number of row hits per channel per core
ramulator.row_misses_channel_1_core 33 # Number of row misses per channel per core
ramulator.row_conflicts_channel_1_core 3 # Number of row conflicts per channel per core
ramulator.read_row_hits_channel_1_core 30 # Number of row hits for read requests per channel per core
[0] 30.0 #
ramulator.read_row_misses_channel_1_core 11 # Number of row misses for read requests per channel per core
[0] 11.0 #
ramulator.read_row_conflicts_channel_1_core 1 # Number of row conflicts for read requests per channel per core
[0] 1.0 #
ramulator.write_row_hits_channel_1_core 703 # Number of row hits for write requests per channel per core
[0] 703.0 #
ramulator.write_row_misses_channel_1_core 22 # Number of row misses for write requests per channel per core
[0] 22.0 #
ramulator.write_row_conflicts_channel_1_core 2 # Number of row conflicts for write requests per channel per core
[0] 2.0 #
ramulator.useless_activates_1_core 0 # Number of useless activations. E.g, ACT -> PRE w/o RD or WR
ramulator.read_latency_avg_1 32.261905 # The average memory latency cycles (in memory time domain) per request for all read requests in this channel
ramulator.read_latency_sum_1 1355 # The memory latency cycles (in memory time domain) sum for all read requests in this channel
ramulator.req_queue_length_avg_1 0.029056 # Average of read and write queue length per memory cycle per channel.
ramulator.req_queue_length_sum_1 5868 # Sum of read and write queue length per memory cycle per channel.
ramulator.read_req_queue_length_avg_1 0.005759 # Read queue length average per memory cycle per channel.
ramulator.read_req_queue_length_sum_1 1163 # Read queue length sum per memory cycle per channel.
ramulator.write_req_queue_length_avg_1 0.023298 # Write queue length average per memory cycle per channel.
ramulator.write_req_queue_length_sum_1 4705 # Write queue length sum per memory cycle per channel.
ramulator.record_read_hits 0.0 # record read hit count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_read_misses 0.0 # record_read_miss count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_read_conflicts 0.0 # record read conflict count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_hits 0.0 # record write hit count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_misses 0.0 # record write miss count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_conflicts 0.0 # record write conflict for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.dram_capacity 8589934592 # Number of bytes in simulated DRAM
ramulator.dram_cycles 201952 # Number of DRAM cycles simulated
ramulator.incoming_requests 852 # Number of incoming requests to DRAM
ramulator.read_requests 89 # Number of incoming read requests to DRAM per core
[0] 89.0 #
ramulator.write_requests 763 # Number of incoming write requests to DRAM per core
[0] 763.0 #
ramulator.ramulator_active_cycles 2780 # The total number of cycles that the DRAM part is active (serving R/W)
ramulator.incoming_requests_per_channel 852.0 # Number of incoming requests to each DRAM channel
[0] 83.0 #
[1] 769.0 #
ramulator.incoming_read_reqs_per_channel 89.0 # Number of incoming read requests to each DRAM channel
[0] 47.0 #
[1] 42.0 #
ramulator.physical_page_replacement 0 # The number of times that physical page replacement happens.
ramulator.maximum_bandwidth 38400000000 # The theoretical maximum bandwidth (Bps)
ramulator.in_queue_req_num_sum 7195 # Sum of read/write queue length
ramulator.in_queue_read_req_num_sum 2250 # Sum of read queue length
ramulator.in_queue_write_req_num_sum 4945 # Sum of write queue length
ramulator.in_queue_req_num_avg 0.035627 # Average of read/write queue length per memory cycle
ramulator.in_queue_read_req_num_avg 0.011141 # Average of read queue length per memory cycle
ramulator.in_queue_write_req_num_avg 0.024486 # Average of write queue length per memory cycle
ramulator.record_read_requests 0.0 # record read requests for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_requests 0.0 # record write requests for this core when it reaches request limit or to the end
[0] 0.0 #

View file

@ -0,0 +1,16 @@
// vx_matmul.h — shared configuration for the matrix-multiplication demo.
// Defines the compile-time matrix dimension used by host and kernel code.
#ifndef VX_MATMUL_H
#define VX_MATMUL_H
#include <stdarg.h> // NOTE(review): nothing in this header uses stdarg — confirm before removing
#ifdef __cplusplus
extern "C" {
#endif
// Matrices are SIZE x SIZE (currently 2x2).
#define SIZE 2
#ifdef __cplusplus
}
#endif
#endif // VX_MATMUL_H

View file

@ -0,0 +1,53 @@
.type vx_mat_mulint32, @function
.global vx_mat_mulint32
# 32-bit integer matrix-multiply kernel (work in progress).
# void vx_mat_mulint32(int *C, const int *A, const int *B)
#
# a0 = C (destination), a1 = A, a2 = B
# The scalar/ML-instruction body below is still commented out; the routine
# currently returns immediately without touching memory.
# Non-vector instructions are indented.
vx_mat_mulint32:
#load from a1 to r1
#mla
#   #load from a1+1 to r2
#   mla s2, (a1)
#   #load from a1+2 to r3
#   mla s3, (a1)
#   #load from a1+3 to r4
#   lw s4, (a1)
#   #load from a2 to r5
#   lw s5, (a2)
#   #load from a2+1 to r6
#   lw s6, (a2)
#   #load from a2+2 to r7
#   lw s7, (a2)
#   #load from a2+3 to r8
#   lw s8, (a2)
#   #multiply and store in regs t1, t2, t3, t4
#   #store r9 in a0
#   sw t1, (a0)
#   #store r10 in a0+1
#   sw t2, (a0)
#   #store r11 in a0+2
#   sw t3, (a0)
#   #store r12 in a0+3
#   sw t4, (a0)
#return
	ret
#loop:
#  vlw.v v0, (a1)       # Get first vector
#    sub a0, a0, t0     # Decrement number done
#    slli t0, t0, 2     # Multiply number done by 4 bytes
#    add a1, a1, t0     # Bump pointer
#  vlw.v v1, (a2)       # Get second vector
#    add a2, a2, t0     # Bump pointer
#  vadd.vv v2, v0, v1   # Sum vectors
#  vsw.v v2, (a3)       # Store result
#    add a3, a3, t0     # Bump pointer
#    bnez a0, loop      # Loop back
#    ret                # Finished

View file

@ -0,0 +1,22 @@
.type vx_vec_vvaddint32, @function
.global vx_vec_vvaddint32
# vector-vector add routine of 32-bit integers
# void vvaddint32(size_t n, const int*x, const int*y, int*z)
# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
#
# a0 = n, a1 = x, a2 = y, a3 = z
# Non-vector instructions are indented.
#
# Strip-mined: vsetvli is re-executed every pass so t0 is reset to the
# element count for the current slice. (It must be inside the loop: the
# slli below turns t0 into a byte offset, so reusing it next iteration
# would corrupt both the remaining count and the pointer bumps.)
vx_vec_vvaddint32:
loop:
  vsetvli t0, a0, e32  # t0 = elements handled this pass (<= remaining n)
  vlw.v v0, (a1)       # Get first vector
    sub a0, a0, t0     # Decrement number remaining
    slli t0, t0, 2     # Element count -> byte offset (4 bytes each)
    add a1, a1, t0     # Bump x pointer
  vlw.v v1, (a2)       # Get second vector
    add a2, a2, t0     # Bump y pointer
  vadd.vv v2, v0, v1   # Sum vectors
  vsw.v v2, (a3)       # Store result
    add a3, a3, t0     # Bump z pointer
    bnez a0, loop      # Loop back while elements remain
    ret                # Finished