Added a vpu and vec units

2025-04-24 05:47:35 -04:00 · 2025-02-09 17:09:38 -08:00 · 2025-02-09 17:09:38 -08:00 · b2ad2e5281
commit b2ad2e5281
parent e80ee2c819
14 changed files with 436 additions and 7 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -106,7 +106,7 @@ jobs:
          make tests -s > /dev/null

      - name: Upload Build Artifact
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        with:
          name: build-${{ matrix.xlen }}
          path: build${{ matrix.xlen }}
@ -147,7 +147,7 @@ jobs:
            ${{ runner.os }}-thirdparty-

      - name: Download Build Artifact
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
        with:
          name: build-${{ matrix.xlen }}
          path: build${{ matrix.xlen }}
--- a/hw/rtl/VX_types.vh
+++ b/hw/rtl/VX_types.vh
@ -35,6 +35,7 @@
 `define VX_DCR_MPM_CLASS_NONE           0
 `define VX_DCR_MPM_CLASS_CORE           1
 `define VX_DCR_MPM_CLASS_MEM            2
+`define VX_DCR_MPM_CLASS_VEC            3

 // User Floating-Point CSRs ///////////////////////////////////////////////////

@ -99,6 +100,8 @@
 `define VX_CSR_MPM_SCRB_CSRS_H          12'hB8C
 `define VX_CSR_MPM_SCRB_WCTL            12'hB0D
 `define VX_CSR_MPM_SCRB_WCTL_H          12'hB8D
+`define VX_CSR_MPM_SCRB_VEC             12'hB13 // Vector scoreboard
+`define VX_CSR_MPM_SCRB_VEC_H           12'hB93
 // PERF: memory
 `define VX_CSR_MPM_IFETCHES             12'hB0E
 `define VX_CSR_MPM_IFETCHES_H           12'hB8E
@ -182,6 +185,17 @@
 // Machine Performance-monitoring memory counters (class 3) ///////////////////
 // <Add your own counters: use addresses hB03..B1F, hB83..hB9F>

+// Machine Performance-monitoring vector counters (class 3: VX_DCR_MPM_CLASS_VEC)
+// PERF: vector unit
+`define VX_CSR_MPM_VEC_READS            12'hB03     // vector reads
+`define VX_CSR_MPM_VEC_READS_H          12'hB83
+`define VX_CSR_MPM_VEC_WRITES           12'hB04     // vector writes
+`define VX_CSR_MPM_VEC_WRITES_H         12'hB84
+`define VX_CSR_MPM_VEC_LAT              12'hB05     // vector latency
+`define VX_CSR_MPM_VEC_LAT_H            12'hB85
+`define VX_CSR_MPM_VEC_ST               12'hB06     // vector stalls
+`define VX_CSR_MPM_VEC_ST_H             12'hB86
+
 // Machine Information Registers //////////////////////////////////////////////

 `define VX_CSR_MVENDORID                12'hF11
--- a/runtime/stub/utils.cpp
+++ b/runtime/stub/utils.cpp
@ -188,6 +188,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
  uint64_t scrb_lsu = 0;
  uint64_t scrb_csrs = 0;
  uint64_t scrb_wctl = 0;
+#ifdef EXT_V_ENABLE
+  uint64_t scrb_vpu = 0;
+#endif
  uint64_t ifetches = 0;
  uint64_t loads = 0;
  uint64_t stores = 0;
@ -212,6 +215,13 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
  uint64_t mem_writes = 0;
  uint64_t mem_lat = 0;
  uint64_t mem_bank_stalls = 0;
+#ifdef EXT_V_ENABLE
+  // PERF: vecunit
+  uint64_t vec_mem_reads = 0;
+  uint64_t vec_mem_writes = 0;
+  uint64_t vec_mem_lat = 0;
+  uint64_t vec_stall_cycles = 0;
+#endif

  uint64_t num_cores;
  CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
@ -312,13 +322,25 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
          return err;
        });
+      #ifdef EXT_V_ENABLE
+        // Vector scoreboard stalls for this core. The CSR is declared as
+        // VX_CSR_MPM_SCRB_VEC in hw/rtl/VX_types.vh; no *_SCRB_VPU macro is defined.
+        uint64_t scrb_vpu_per_core;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_VEC, core_id, &scrb_vpu_per_core), {
+          return err;
+        });
+      #endif
        scrb_alu += scrb_alu_per_core;
        scrb_fpu += scrb_fpu_per_core;
        scrb_lsu += scrb_lsu_per_core;
        scrb_csrs += scrb_csrs_per_core;
        scrb_wctl += scrb_wctl_per_core;
+      #ifdef EXT_V_ENABLE
+        scrb_vpu += scrb_vpu_per_core;
+      #endif
        if (num_cores > 1) {
          uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core;
+        #ifdef EXT_V_ENABLE
+          scrb_total += scrb_vpu_per_core;
+        #endif
          int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
          fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n"
          , core_id
@ -329,6 +351,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
          , calcAvgPercent(scrb_lsu_per_core, scrb_total)
          , calcAvgPercent(scrb_csrs_per_core, scrb_total)
          , calcAvgPercent(scrb_wctl_per_core, scrb_total)
+        #ifdef EXT_V_ENABLE
+          , calcAvgPercent(scrb_vpu_per_core, scrb_total)
+        #endif
          );
        }
        scrb_stalls += scrb_stalls_per_core;
@ -555,6 +580,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
        });
      }
    } break;
+  #ifdef EXT_V_ENABLE
+    // PERF: vector unit - add the counters read for `core_id` to the running totals.
+    case VX_DCR_MPM_CLASS_VEC: {
+      uint64_t vec_counter;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_READS, core_id, &vec_counter), {
+        return err;
+      });
+      vec_mem_reads += vec_counter;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_WRITES, core_id, &vec_counter), {
+        return err;
+      });
+      vec_mem_writes += vec_counter;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_LAT, core_id, &vec_counter), {
+        return err;
+      });
+      vec_mem_lat += vec_counter;
+      CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_ST, core_id, &vec_counter), {
+        return err;
+      });
+      vec_stall_cycles += vec_counter;
+    } break;
+  #endif
    default:
      break;
    }
@ -576,6 +614,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
    int ifetch_avg_lat = caclAverage(ifetch_lat, ifetches);
    int load_avg_lat = caclAverage(load_lat, loads);
    uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_csrs + scrb_wctl;
+  #ifdef EXT_V_ENABLE
+    scrb_total += scrb_vpu;
+  #endif
    fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
    fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
    fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
@ -587,6 +628,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      , calcAvgPercent(scrb_lsu, scrb_total)
      , calcAvgPercent(scrb_csrs, scrb_total)
      , calcAvgPercent(scrb_wctl, scrb_total)
+    #ifdef EXT_V_ENABLE
+      , calcAvgPercent(scrb_vpu, scrb_total)
+    #endif
    );
    fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent);
    fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
@ -637,6 +681,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
    }
  } break;
+#ifdef EXT_V_ENABLE
+  case VX_DCR_MPM_CLASS_VEC: {
+    vec_mem_reads /= num_cores;
+    vec_mem_writes /= num_cores;
+    vec_mem_lat /= num_cores;
+    vec_stall_cycles /= num_cores;
+    int vec_avg_lat = caclAverage(vec_mem_lat, vec_mem_reads);
+    int vec_stall_cycles_ratio = calcRatio(vec_stall_cycles, total_cycles);
+    fprintf(stream, "PERF: vec memory reads=%ld\n", vec_mem_reads);
+    fprintf(stream, "PERF: vec memory writes=%ld\n", vec_mem_writes);
+    fprintf(stream, "PERF: vec memory latency=%d cycles\n", vec_avg_lat);
+    fprintf(stream, "PERF: vec stalls=%ld (%d%%)\n", vec_stall_cycles, vec_stall_cycles_ratio);
+  } break;
+#endif
  default:
    break;
  }
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@ -22,7 +22,7 @@ SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(

 # Add V extension sources
 ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
-  SRCS += $(SRC_DIR)/vpu.cpp
+  SRCS += $(SRC_DIR)/vec_unit.cpp $(SRC_DIR)/vpu.cpp
 endif

 # Debugging
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@ -29,7 +29,8 @@ Core::Core(const SimContext& ctx,
           uint32_t core_id,
           Socket* socket,
           const Arch &arch,
-           const DCRS &dcrs)
+           const DCRS &dcrs
+           )
  : SimObject(ctx, StrFormat("core%d", core_id))
  , icache_req_ports(1, this)
  , icache_rsp_ports(1, this)
@ -38,6 +39,9 @@ Core::Core(const SimContext& ctx,
  , core_id_(core_id)
  , socket_(socket)
  , arch_(arch)
+#ifdef EXT_V_ENABLE
+  , vec_unit_(VecUnit::Create("vpu", arch))
+#endif
  , emulator_(arch, dcrs, this)
  , ibuffers_(arch.num_warps(), IBUF_SIZE)
  , scoreboard_(arch_)
--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@ -27,6 +27,9 @@
 #include "func_unit.h"
 #include "mem_coalescer.h"
 #include "VX_config.h"
+#ifdef EXT_V_ENABLE
+#include "vec_unit.h"
+#endif

 namespace vortex {

@ -52,6 +55,9 @@ public:
    uint64_t scrb_sfu;
    uint64_t scrb_csrs;
    uint64_t scrb_wctl;
+  #ifdef EXT_V_ENABLE
+    uint64_t scrb_vpu;
+  #endif
    uint64_t ifetches;
    uint64_t loads;
    uint64_t stores;
@ -72,6 +78,9 @@ public:
      , scrb_sfu(0)
      , scrb_csrs(0)
      , scrb_wctl(0)
+    #ifdef EXT_V_ENABLE
+      , scrb_vpu(0)
+    #endif
      , ifetches(0)
      , loads(0)
      , stores(0)
@ -90,7 +99,8 @@ public:
       uint32_t core_id,
       Socket* socket,
       const Arch &arch,
-       const DCRS &dcrs);
+       const DCRS &dcrs
+       );

  ~Core();

@ -131,6 +141,12 @@ public:
    return mem_coalescers_.at(idx);
  }

+#ifdef EXT_V_ENABLE
+  // Returns a mutable reference to the vector unit handle created in the Core
+  // constructor; the Emulator constructor reads it through this accessor.
+  VecUnit::Ptr& vec_unit() {
+    return vec_unit_;
+  }
+#endif
+
  const PerfStats& perf_stats() const {
    return perf_stats_;
  }
@ -150,6 +166,10 @@ private:
  Socket* socket_;
  const Arch& arch_;

+#ifdef EXT_V_ENABLE
+  VecUnit::Ptr vec_unit_;
+#endif
+
  Emulator emulator_;

  std::vector<IBuffer> ibuffers_;
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@ -34,7 +34,7 @@ Emulator::warp_t::warp_t(const Arch& arch)
  : ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
  , freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
 #ifdef EXT_V_ENABLE
-  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(MAX_NUM_REGS))
+  , vreg_file(MAX_NUM_REGS, std::vector<Byte>(VLEN / 8))
 #endif
  , uuid(0)
 {}
@ -96,6 +96,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
    // In future versions, scratchpad size should be fixed to an appropriate value.
    , scratchpad(std::vector<Word>(32 * 32 * 32768))
  #ifdef EXT_V_ENABLE
+    , vec_unit_(core->vec_unit())
    , csrs_(arch.num_warps())
  #endif
 {
@ -133,6 +134,10 @@ void Emulator::clear() {
    barrier.reset();
  }

+#ifdef EXT_V_ENABLE
+  vec_unit_->reset();
+#endif
+
  csr_mscratch_ = startup_arg;

  stalled_warps_.reset();
@ -607,6 +612,18 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
        CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
        }
      } break;
+    #ifdef EXT_V_ENABLE
+      case VX_DCR_MPM_CLASS_VEC: {
+        VecUnit::PerfStats vec_perf_stats;
+        vec_perf_stats += vec_unit_->perf_stats();
+        switch (addr) {
+        CSR_READ_64(VX_CSR_MPM_VEC_READS, vec_perf_stats.reads);
+        CSR_READ_64(VX_CSR_MPM_VEC_WRITES, vec_perf_stats.writes);
+        CSR_READ_64(VX_CSR_MPM_VEC_LAT, vec_perf_stats.latency);
+        CSR_READ_64(VX_CSR_MPM_VEC_ST, vec_perf_stats.stalls);
+        }
+      } break;
+    #endif
      default: {
        std::cout << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
        std::abort();
--- a/sim/simx/emulator.h
+++ b/sim/simx/emulator.h
@ -19,6 +19,9 @@
 #include <stack>
 #include <mem.h>
 #include "types.h"
+#ifdef EXT_V_ENABLE
+#include "vec_unit.h"
+#endif

 namespace vortex {

@ -175,7 +178,10 @@ private:
  uint32_t mat_size;
  uint32_t tc_size;
  uint32_t tc_num;
+#ifdef EXT_V_ENABLE
+  VecUnit::Ptr vec_unit_;
  std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
+#endif
 };

 }
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@ -366,3 +366,52 @@ void SfuUnit::tick() {
 		input.pop();
 	}
 }
+
+///////////////////////////////////////////////////////////////////////////////
+
+#ifdef EXT_V_ENABLE
+// Vector functional unit; passes the name "vpu-unit" to the FuncUnit base.
+VpuUnit::VpuUnit(const SimContext& ctx, Core* core)
+	: FuncUnit(ctx, core, "vpu-unit")
+{}
+
+// Looks at the head trace of each issue slot's input queue (ISSUE_WIDTH slots).
+// NOTE(review): every VpuType label below is empty and falls through to
+// `default`, so any queued trace currently ends in std::abort(). The DT log,
+// the warp resume and input.pop() after the switch are unreachable until the
+// cases get real handling terminated by `break`.
+void VpuUnit::tick() {
+  for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
+		auto& input = Inputs.at(iw);
+		if (input.empty())
+			continue;
+		//auto& output = Outputs.at(iw);
+		auto trace = input.front();
+		//int delay = 2;
+		switch (trace->vpu_type) {
+		case VpuType::VSET:
+		case VpuType::VL:
+		case VpuType::VS:
+		case VpuType::ARITHVV:
+		case VpuType::MULVV:
+		case VpuType::DIVVV:
+		case VpuType::ARITHVX:
+		case VpuType::MULVX:
+		case VpuType::DIVVX:
+		case VpuType::ARITHVI:
+		case VpuType::MULVI:
+		case VpuType::DIVVI:
+		case VpuType::ARITHFVV:
+		case VpuType::MULFVV:
+		case VpuType::DIVFVV:
+		case VpuType::ARITHFVX:
+		case VpuType::MULFVX:
+		case VpuType::DIVFVX:
+		case VpuType::ARITHFVI:
+		case VpuType::MULFVI:
+		case VpuType::DIVFVI:
+		default:
+			std::abort();
+		}
+		DT(3, this->name() << ": op=" << trace->vpu_type << ", " << *trace);
+		// Resume the warp only on the last packet of an instruction that stalled fetch.
+		if (trace->eop && trace->fetch_stall) {
+			core_->resume(trace->wid);
+		}
+		input.pop();
+	}
+}
+#endif
--- a/sim/simx/func_unit.h
+++ b/sim/simx/func_unit.h
@ -113,4 +113,39 @@ public:
 	void tick();
 };

+///////////////////////////////////////////////////////////////////////////////
+
+#ifdef EXT_V_ENABLE
+// Vector (VPU) functional unit. Declared only when the V extension is enabled,
+// matching func_unit.cpp where its constructor and tick() are compiled under
+// the same EXT_V_ENABLE guard.
+class VpuUnit : public FuncUnit {
+public:
+	VpuUnit(const SimContext& ctx, Core*);
+
+	// Processes the traces queued on the unit's inputs.
+	void tick();
+};
+#endif
+
+// Simulate clock cycles depending on instruction type and element width and #lanes
+// VSET = 1 cycle
+// Vector instructions take the same amount of time as ALU instructions.
+// In general there should be fewer instructions overall (hence the SIMD vector speedup).
+// But each vector instruction is bigger, and the # of lanes greatly affects execution speed.
+
+// Whenever we change VL using imm/VSET, we need to keep track of the new VL and SEW.
+// By default, VL is set to MAXVL.
+// After determining VL, we use VL and #lanes in order to determine overall cycle time.
+// For example, for a vector add with VL=4 and #lanes=2, we will probably take 2 cycles,
+// since we can only operate on two elements of the vector each cycle (limited by #lanes).
+// SEW (element width) likely affects the cycle time, we can probably observe
+// ALU operation cycle time in relation to element width to determine this though.
+
+// The RTL implementation has an unroll and accumulate stage.
+// The unroll stage sends vector elements to the appropriate functional unit up to VL,
+// limited by the # lanes available.
+// The accumulate stage deals with combining the results from the functional units,
+// into the destination vector register.
+// Which exact pipeline stage does the VPU unroll the vector (decode or execute)?
+// Which exact pipeline stage does the VPU accumulate results?
+
+// How do vector loads and stores interact with the cache?
+// How about loading and storing scalars in vector registers?
+// How does striding affect loads and stores?
+
 }
--- a/sim/simx/instr_trace.h
+++ b/sim/simx/instr_trace.h
@ -43,6 +43,13 @@ struct SFUTraceData : public ITraceData {
  SFUTraceData(Word arg1, Word arg2) : arg1(arg1), arg2(arg2) {}
 };

+// Trace payload holding two Word arguments (same constructor shape as SFUTraceData).
+struct VPUTraceData : public ITraceData {
+  using Ptr = std::shared_ptr<VPUTraceData>;
+  Word arg1;
+  Word arg2;
+  VPUTraceData(Word first, Word second)
+    : arg1(first)
+    , arg2(second)
+  {}
+};
+
 struct instr_trace_t {
 public:
  struct reg_t {
@ -77,7 +84,10 @@ public:
    AluType  alu_type;
    FpuType  fpu_type;
    SfuType  sfu_type;
-    TCUType  tcu_type; 
+  #ifdef EXT_V_ENABLE
+    VpuType  vpu_type;
+  #endif
+    TCUType  tcu_type;
  };

  ITraceData::Ptr data;
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@ -269,6 +269,48 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {

 ///////////////////////////////////////////////////////////////////////////////

+// Operation category of a vector instruction. Stored in instr_trace_t::vpu_type
+// (when EXT_V_ENABLE is defined) and switched on by VpuUnit::tick().
+enum class VpuType {
+  VSET,     // Set vector length
+  VL,       // Vector load
+  VS,       // Vector store
+
+  // ALU OPERATIONS
+  ARITHVV,  // Vector-vector
+  MULVV,
+  DIVVV,
+
+  ARITHVX,  // Vector-scalar
+  MULVX,
+  DIVVX,
+
+  ARITHVI,  // Vector-immediate
+  MULVI,
+  DIVVI,
+
+  // FPU OPERATIONS
+  ARITHFVV,  // Vector-vector
+  MULFVV,
+  DIVFVV,
+
+  ARITHFVX,  // Vector-scalar
+  MULFVX,
+  DIVFVX,
+
+  ARITHFVI,  // Vector-immediate
+  MULFVI,
+  DIVFVI
+};
+
+// Streams the enumerator name of a VpuType (used e.g. by the DT trace line in
+// VpuUnit::tick). Every enumerator is covered so that printing a non-VSET
+// operation no longer trips the assert (or prints nothing under NDEBUG).
+inline std::ostream &operator<<(std::ostream &os, const VpuType& type) {
+  switch (type) {
+  case VpuType::VSET:     os << "VSET"; break;
+  case VpuType::VL:       os << "VL"; break;
+  case VpuType::VS:       os << "VS"; break;
+  case VpuType::ARITHVV:  os << "ARITHVV"; break;
+  case VpuType::MULVV:    os << "MULVV"; break;
+  case VpuType::DIVVV:    os << "DIVVV"; break;
+  case VpuType::ARITHVX:  os << "ARITHVX"; break;
+  case VpuType::MULVX:    os << "MULVX"; break;
+  case VpuType::DIVVX:    os << "DIVVX"; break;
+  case VpuType::ARITHVI:  os << "ARITHVI"; break;
+  case VpuType::MULVI:    os << "MULVI"; break;
+  case VpuType::DIVVI:    os << "DIVVI"; break;
+  case VpuType::ARITHFVV: os << "ARITHFVV"; break;
+  case VpuType::MULFVV:   os << "MULFVV"; break;
+  case VpuType::DIVFVV:   os << "DIVFVV"; break;
+  case VpuType::ARITHFVX: os << "ARITHFVX"; break;
+  case VpuType::MULFVX:   os << "MULFVX"; break;
+  case VpuType::DIVFVX:   os << "DIVFVX"; break;
+  case VpuType::ARITHFVI: os << "ARITHFVI"; break;
+  case VpuType::MULFVI:   os << "MULFVI"; break;
+  case VpuType::DIVFVI:   os << "DIVFVI"; break;
+  // Only reachable for a value outside the enumeration.
+  default: assert(false);
+  }
+  return os;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
 enum class ArbiterType {
  Priority,
  RoundRobin
--- a/sim/simx/vec_unit.cpp
+++ b/sim/simx/vec_unit.cpp
@ -0,0 +1,87 @@
+#ifdef EXT_V_ENABLE
+
+#include "vec_unit.h"
+
+using namespace vortex;
+
+// Private implementation object behind VecUnit: owns the vector state members
+// and the performance counters returned by VecUnit::perf_stats().
+class VecUnit::Impl {
+public:
+    // `arch` is accepted but not used yet. The scalar vector state is
+    // value-initialized so it never holds indeterminate values.
+    Impl(VecUnit* simobject, const Arch& /*arch*/)
+        : simobject_(simobject)
+        , vtype_{}
+        , vl_(0)
+        , vlmax_(0)
+    {
+        this->clear();
+    }

+    ~Impl() {}

+    // Resets the performance counters to zero (called from VecUnit::reset()).
+    void clear() {
+        perf_stats_ = PerfStats();
+    }

+    // No per-cycle behaviour is modelled yet.
+    void tick() {
+    }

+/*
+    void load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+    }

+    void store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+    }

+    void execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata) {
+    }
+*/

+    // Read-only view of the accumulated counters.
+    const PerfStats& perf_stats() const {
+        return perf_stats_;
+    }

+private:

+    VecUnit* simobject_;                          // owning VecUnit (non-owning back-pointer)
+    std::vector<std::vector<Byte>>  vreg_file_;   // default-constructed (empty); not sized here
+    vtype_t                         vtype_;
+    uint32_t                        vl_;
+    Word                            vlmax_;
+    PerfStats perf_stats_;
+};
+
+// Constructs the unit: forwards ctx/name to SimObject, passes `this` to the
+// Input/Output ports and allocates the Impl (released in the destructor).
+VecUnit::VecUnit(const SimContext& ctx,
+                 const char* name,
+                 const Arch &arch)
+    : SimObject<VecUnit>(ctx, name)
+    , Input(this)
+    , Output(this)
+    , impl_(new Impl(this, arch))
+{}

+// Frees the Impl allocated by the constructor.
+VecUnit::~VecUnit() {
+    delete impl_;
+}

+// Forwards to Impl::clear(), which resets the performance counters.
+void VecUnit::reset() {
+    impl_->clear();
+}

+// Forwards to Impl::tick(), which is currently empty.
+void VecUnit::tick() {
+    impl_->tick();
+}
+
+/*
+void VecUnit::load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+    return impl_->load(instr, wid, rsdata);
+}
+
+void VecUnit::store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
+    return impl_->store(instr, wid, rsdata);
+}
+
+void VecUnit::execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata) {
+    return impl_->execute(instr, wid, rsdata, rddata);
+}
+*/
+
+// Returns a reference to the counters held by the Impl; valid while this VecUnit lives.
+const VecUnit::PerfStats& VecUnit::perf_stats() const {
+    return impl_->perf_stats();
+}
+#endif
--- a/sim/simx/vec_unit.h
+++ b/sim/simx/vec_unit.h
@ -0,0 +1,87 @@
+#ifdef EXT_V_ENABLE
+#pragma once
+
+#include "arch.h"
+#include "instr.h"
+#include "instr_trace.h"
+#include <simobject.h>
+#include "types.h"
+
+namespace vortex {
+
+// Plain aggregate of five 32-bit fields.
+// NOTE(review): field names presumably mirror the RISC-V V `vtype` CSR
+// (vill, vma, vta, vsew, vlmul) - confirm encoding/width against the spec.
+struct vtype_t {
+  uint32_t vill;
+  uint32_t vma;
+  uint32_t vta;
+  uint32_t vsew;
+  uint32_t vlmul;
+};
+
+// Union giving overlapping views of one register value: the simulator's
+// Word/WordI/WordF types plus fixed-width integer and floating-point views.
+union reg_data_t {
+  Word     u;
+  WordI    i;
+  WordF    f;
+  float    f32;
+  double   f64;
+  uint32_t u32;
+  uint64_t u64;
+  int32_t  i32;
+  int64_t  i64;
+};
+
+// Simulation object for the vector unit. State and behaviour live in a private
+// Impl object (defined in vec_unit.cpp) reached through `impl_`.
+class VecUnit : public SimObject<VecUnit> {
+public:
+  // Counters exposed through perf_stats(); all start at zero.
+  struct PerfStats {
+    uint64_t reads;
+    uint64_t writes;
+    uint64_t latency;
+    uint64_t stalls;

+    PerfStats()
+      : reads(0)
+      , writes(0)
+      , latency(0)
+      , stalls(0)
+    {}

+    // Field-wise accumulation of another set of counters.
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->reads   += rhs.reads;
+      this->writes  += rhs.writes;
+      this->latency += rhs.latency;
+      this->stalls  += rhs.stalls;
+      return *this;
+    }
+  };

+  // NOTE(review): the constructor in vec_unit.cpp does not size these vectors,
+  // so they start empty.
+  std::vector<SimPort<MemReq>> MemReqs;
+  std::vector<SimPort<MemRsp>> MemRsps;

+  SimPort<instr_trace_t*> Input;
+  SimPort<instr_trace_t*> Output;

+  VecUnit(const SimContext& ctx,
+          const char* name,
+          const Arch &arch);

+  ~VecUnit();

+  // Clears the performance counters.
+  void reset();

+  void tick();

+  // NOTE(review): load/store/execute are only declared; their definitions are
+  // commented out in vec_unit.cpp, so calling them will fail at link time.
+  void load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);

+  void store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);

+  void execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);

+  const PerfStats& perf_stats() const;

+private:

+  class Impl;
+  Impl* impl_;  // allocated in the constructor, deleted in the destructor
+};
+
+}
+#endif