Added a vpu and vec units

This commit is contained in:
MichaelJSr 2025-02-09 17:09:38 -08:00
parent e80ee2c819
commit b2ad2e5281
14 changed files with 436 additions and 7 deletions

View file

@ -106,7 +106,7 @@ jobs:
make tests -s > /dev/null
- name: Upload Build Artifact
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}
@ -147,7 +147,7 @@ jobs:
${{ runner.os }}-thirdparty-
- name: Download Build Artifact
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}

View file

@ -35,6 +35,7 @@
`define VX_DCR_MPM_CLASS_NONE 0
`define VX_DCR_MPM_CLASS_CORE 1
`define VX_DCR_MPM_CLASS_MEM 2
`define VX_DCR_MPM_CLASS_VEC 3
// User Floating-Point CSRs ///////////////////////////////////////////////////
@ -99,6 +100,8 @@
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
`define VX_CSR_MPM_SCRB_WCTL 12'hB0D
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D
`define VX_CSR_MPM_SCRB_VEC 12'hB13 // Vector scoreboard
`define VX_CSR_MPM_SCRB_VEC_H 12'hB93
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0E
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
@ -182,6 +185,17 @@
// Machine Performance-monitoring memory counters (class 3) ///////////////////
// <Add your own counters: use addresses hB03..B1F, hB83..hB9F>
// Machine Performance-monitoring vector counters
// PERF: vector unit
`define VX_CSR_MPM_VEC_READS 12'hB03 // vector reads
`define VX_CSR_MPM_VEC_READS_H 12'hB83
`define VX_CSR_MPM_VEC_WRITES 12'hB04 // vector writes
`define VX_CSR_MPM_VEC_WRITES_H 12'hB84
`define VX_CSR_MPM_VEC_LAT 12'hB05 // vector latency
`define VX_CSR_MPM_VEC_LAT_H 12'hB85
`define VX_CSR_MPM_VEC_ST 12'hB06 // vector stalls
`define VX_CSR_MPM_VEC_ST_H 12'hB86
// Machine Information Registers //////////////////////////////////////////////
`define VX_CSR_MVENDORID 12'hF11

View file

@ -188,6 +188,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t scrb_lsu = 0;
uint64_t scrb_csrs = 0;
uint64_t scrb_wctl = 0;
#ifdef EXT_V_ENABLE
uint64_t scrb_vpu = 0;
#endif
uint64_t ifetches = 0;
uint64_t loads = 0;
uint64_t stores = 0;
@ -212,6 +215,13 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t mem_writes = 0;
uint64_t mem_lat = 0;
uint64_t mem_bank_stalls = 0;
#ifdef EXT_V_ENABLE
// PERF: vecunit
uint64_t vec_mem_reads = 0;
uint64_t vec_mem_writes = 0;
uint64_t vec_mem_lat = 0;
uint64_t vec_stall_cycles = 0;
#endif
uint64_t num_cores;
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
@ -312,13 +322,25 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
return err;
});
#ifdef EXT_V_ENABLE
// Per-core vector scoreboard counter. The CSR address list declares this
// counter as VX_CSR_MPM_SCRB_VEC ("Vector scoreboard"); no
// VX_CSR_MPM_SCRB_VPU symbol is defined, so query the declared name.
uint64_t scrb_vpu_per_core;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_VEC, core_id, &scrb_vpu_per_core), {
return err;
});
#endif
scrb_alu += scrb_alu_per_core;
scrb_fpu += scrb_fpu_per_core;
scrb_lsu += scrb_lsu_per_core;
scrb_csrs += scrb_csrs_per_core;
scrb_wctl += scrb_wctl_per_core;
#ifdef EXT_V_ENABLE
scrb_vpu += scrb_vpu_per_core;
#endif
if (num_cores > 1) {
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core;
#ifdef EXT_V_ENABLE
scrb_total += scrb_vpu_per_core;
#endif
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n"
, core_id
@ -329,6 +351,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
, calcAvgPercent(scrb_lsu_per_core, scrb_total)
, calcAvgPercent(scrb_csrs_per_core, scrb_total)
, calcAvgPercent(scrb_wctl_per_core, scrb_total)
#ifdef EXT_V_ENABLE
, calcAvgPercent(scrb_vpu_per_core, scrb_total)
#endif
);
}
scrb_stalls += scrb_stalls_per_core;
@ -555,6 +580,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
});
}
} break;
#ifdef EXT_V_ENABLE
case VX_DCR_MPM_CLASS_VEC: {
uint64_t tmp;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_READS, core_id, &tmp), { return err; });
vec_mem_reads += tmp;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_WRITES, core_id, &tmp), { return err; });
vec_mem_writes += tmp;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_LAT, core_id, &tmp), { return err; });
vec_mem_lat += tmp;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_ST, core_id, &tmp), { return err; });
vec_stall_cycles += tmp;
} break;
#endif
default:
break;
}
@ -576,6 +614,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int ifetch_avg_lat = caclAverage(ifetch_lat, ifetches);
int load_avg_lat = caclAverage(load_lat, loads);
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_csrs + scrb_wctl;
#ifdef EXT_V_ENABLE
scrb_total += scrb_vpu;
#endif
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
@ -587,6 +628,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
, calcAvgPercent(scrb_lsu, scrb_total)
, calcAvgPercent(scrb_csrs, scrb_total)
, calcAvgPercent(scrb_wctl, scrb_total)
#ifdef EXT_V_ENABLE
, calcAvgPercent(scrb_vpu, scrb_total)
#endif
);
fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
@ -637,6 +681,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
}
} break;
#ifdef EXT_V_ENABLE
case VX_DCR_MPM_CLASS_VEC: {
vec_mem_reads /= num_cores;
vec_mem_writes /= num_cores;
vec_mem_lat /= num_cores;
vec_stall_cycles /= num_cores;
int vec_avg_lat = caclAverage(vec_mem_lat, vec_mem_reads);
int vec_stall_cycles_ratio = calcRatio(vec_stall_cycles, total_cycles);
fprintf(stream, "PERF: vec memory reads=%ld\n", vec_mem_reads);
fprintf(stream, "PERF: vec memory writes=%ld\n", vec_mem_writes);
fprintf(stream, "PERF: vec memory latency=%d cycles\n", vec_avg_lat);
fprintf(stream, "PERF: vec stalls=%ld (%d%%)\n", vec_stall_cycles, vec_stall_cycles_ratio);
} break;
#endif
default:
break;
}

View file

@ -22,7 +22,7 @@ SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(
# Add V extension sources
ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
SRCS += $(SRC_DIR)/vpu.cpp
SRCS += $(SRC_DIR)/vec_unit.cpp $(SRC_DIR)/vpu.cpp
endif
# Debugging

View file

@ -29,7 +29,8 @@ Core::Core(const SimContext& ctx,
uint32_t core_id,
Socket* socket,
const Arch &arch,
const DCRS &dcrs)
const DCRS &dcrs
)
: SimObject(ctx, StrFormat("core%d", core_id))
, icache_req_ports(1, this)
, icache_rsp_ports(1, this)
@ -38,6 +39,9 @@ Core::Core(const SimContext& ctx,
, core_id_(core_id)
, socket_(socket)
, arch_(arch)
#ifdef EXT_V_ENABLE
, vec_unit_(VecUnit::Create("vpu", arch))
#endif
, emulator_(arch, dcrs, this)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)

View file

@ -27,6 +27,9 @@
#include "func_unit.h"
#include "mem_coalescer.h"
#include "VX_config.h"
#ifdef EXT_V_ENABLE
#include "vec_unit.h"
#endif
namespace vortex {
@ -52,6 +55,9 @@ public:
uint64_t scrb_sfu;
uint64_t scrb_csrs;
uint64_t scrb_wctl;
#ifdef EXT_V_ENABLE
uint64_t scrb_vpu;
#endif
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
@ -72,6 +78,9 @@ public:
, scrb_sfu(0)
, scrb_csrs(0)
, scrb_wctl(0)
#ifdef EXT_V_ENABLE
, scrb_vpu(0)
#endif
, ifetches(0)
, loads(0)
, stores(0)
@ -90,7 +99,8 @@ public:
uint32_t core_id,
Socket* socket,
const Arch &arch,
const DCRS &dcrs);
const DCRS &dcrs
);
~Core();
@ -131,6 +141,12 @@ public:
return mem_coalescers_.at(idx);
}
#ifdef EXT_V_ENABLE
VecUnit::Ptr& vec_unit() {
return vec_unit_;
}
#endif
const PerfStats& perf_stats() const {
return perf_stats_;
}
@ -150,6 +166,10 @@ private:
Socket* socket_;
const Arch& arch_;
#ifdef EXT_V_ENABLE
VecUnit::Ptr vec_unit_;
#endif
Emulator emulator_;
std::vector<IBuffer> ibuffers_;

View file

@ -34,7 +34,7 @@ Emulator::warp_t::warp_t(const Arch& arch)
: ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
, freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
#ifdef EXT_V_ENABLE
, vreg_file(MAX_NUM_REGS, std::vector<Byte>(MAX_NUM_REGS))
, vreg_file(MAX_NUM_REGS, std::vector<Byte>(VLEN / 8))
#endif
, uuid(0)
{}
@ -96,6 +96,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
// In future versions, scratchpad size should be fixed to an appropriate value.
, scratchpad(std::vector<Word>(32 * 32 * 32768))
#ifdef EXT_V_ENABLE
, vec_unit_(core->vec_unit())
, csrs_(arch.num_warps())
#endif
{
@ -133,6 +134,10 @@ void Emulator::clear() {
barrier.reset();
}
#ifdef EXT_V_ENABLE
vec_unit_->reset();
#endif
csr_mscratch_ = startup_arg;
stalled_warps_.reset();
@ -607,6 +612,18 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
}
} break;
#ifdef EXT_V_ENABLE
case VX_DCR_MPM_CLASS_VEC: {
VecUnit::PerfStats vec_perf_stats;
vec_perf_stats += vec_unit_->perf_stats();
switch (addr) {
CSR_READ_64(VX_CSR_MPM_VEC_READS, vec_perf_stats.reads);
CSR_READ_64(VX_CSR_MPM_VEC_WRITES, vec_perf_stats.writes);
CSR_READ_64(VX_CSR_MPM_VEC_LAT, vec_perf_stats.latency);
CSR_READ_64(VX_CSR_MPM_VEC_ST, vec_perf_stats.stalls);
}
} break;
#endif
default: {
std::cout << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
std::abort();

View file

@ -19,6 +19,9 @@
#include <stack>
#include <mem.h>
#include "types.h"
#ifdef EXT_V_ENABLE
#include "vec_unit.h"
#endif
namespace vortex {
@ -175,7 +178,10 @@ private:
uint32_t mat_size;
uint32_t tc_size;
uint32_t tc_num;
#ifdef EXT_V_ENABLE
VecUnit::Ptr vec_unit_;
std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
#endif
};
}

View file

@ -366,3 +366,52 @@ void SfuUnit::tick() {
input.pop();
}
}
///////////////////////////////////////////////////////////////////////////////
#ifdef EXT_V_ENABLE
// Constructs the vector functional unit; all setup is delegated to the
// FuncUnit base, which receives the fixed instance name "vpu-unit".
VpuUnit::VpuUnit(const SimContext& ctx, Core* core)
: FuncUnit(ctx, core, "vpu-unit")
{}
void VpuUnit::tick() {
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
auto& input = Inputs.at(iw);
if (input.empty())
continue;
//auto& output = Outputs.at(iw);
auto trace = input.front();
//int delay = 2;
switch (trace->vpu_type) {
case VpuType::VSET:
case VpuType::VL:
case VpuType::VS:
case VpuType::ARITHVV:
case VpuType::MULVV:
case VpuType::DIVVV:
case VpuType::ARITHVX:
case VpuType::MULVX:
case VpuType::DIVVX:
case VpuType::ARITHVI:
case VpuType::MULVI:
case VpuType::DIVVI:
case VpuType::ARITHFVV:
case VpuType::MULFVV:
case VpuType::DIVFVV:
case VpuType::ARITHFVX:
case VpuType::MULFVX:
case VpuType::DIVFVX:
case VpuType::ARITHFVI:
case VpuType::MULFVI:
case VpuType::DIVFVI:
default:
std::abort();
}
DT(3, this->name() << ": op=" << trace->vpu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
core_->resume(trace->wid);
}
input.pop();
}
}
#endif

View file

@ -113,4 +113,39 @@ public:
void tick();
};
///////////////////////////////////////////////////////////////////////////////
// Functional unit for vector (VPU) instruction traces.
// Derives from FuncUnit and provides its own per-cycle tick().
class VpuUnit : public FuncUnit {
public:
// The unnamed Core* is the owning core, forwarded to FuncUnit.
VpuUnit(const SimContext& ctx, Core*);
// Called once per simulated cycle.
void tick();
};
// Simulate clock cycles depending on instruction type and element width and #lanes
// VSET = 1 cycle
// Vector instructions take the same amount of time as ALU instructions.
// In general there should be fewer instructions overall (hence the SIMD vector speedup).
// But each vector instruction is bigger, and the # of lanes greatly affects execution speed.
// Whenever we change VL using imm/VSET, we need to keep track of the new VL and SEW.
// By default, VL is set to MAXVL.
// After determining VL, we use VL and #lanes in order to determine overall cycle time.
// For example, for a vector add with VL=4 and #lanes=2, we will probably take 2 cycles,
// since we can only operate on two elements of the vector each cycle (limited by #lanes).
// SEW (element width) likely affects the cycle time, we can probably observe
// ALU operation cycle time in relation to element width to determine this though.
// The RTL implementation has an unroll and accumulate stage.
// The unroll stage sends vector elements to the appropriate functional unit up to VL,
// limited by the # lanes available.
// The accumulate stage deals with combining the results from the functional units,
// into the destination vector register.
// Which exact pipeline stage does the VPU unroll the vector (decode or execute)?
// Which exact pipeline stage does the VPU accumulate results?
// How do vector loads and stores interact with the cache?
// How about loading and storing scalars in vector registers?
// How does striding affect loads and stores?
}

View file

@ -43,6 +43,13 @@ struct SFUTraceData : public ITraceData {
SFUTraceData(Word arg1, Word arg2) : arg1(arg1), arg2(arg2) {}
};
// Trace payload attached to vector (VPU) instructions.
// Same layout as SFUTraceData: two Word-sized arguments.
// NOTE(review): the meaning of arg1/arg2 is not established by this code — confirm with the producer.
struct VPUTraceData : public ITraceData {
// Shared-ownership handle type for this payload.
using Ptr = std::shared_ptr<VPUTraceData>;
Word arg1;
Word arg2;
// Stores both arguments verbatim.
VPUTraceData(Word arg1, Word arg2) : arg1(arg1), arg2(arg2) {}
};
struct instr_trace_t {
public:
struct reg_t {
@ -77,7 +84,10 @@ public:
AluType alu_type;
FpuType fpu_type;
SfuType sfu_type;
TCUType tcu_type;
#ifdef EXT_V_ENABLE
VpuType vpu_type;
#endif
TCUType tcu_type;
};
ITraceData::Ptr data;

View file

@ -269,6 +269,48 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
///////////////////////////////////////////////////////////////////////////////
// Operation classes handled by the vector unit. The enumerator suffix
// names the operand form: VV = vector-vector, VX = vector-scalar,
// VI = vector-immediate; an 'F' before the suffix marks the FPU variant.
enum class VpuType {
  VSET,     // Set vector length
  VL,       // Vector load
  VS,       // Vector store
  // ALU OPERATIONS
  ARITHVV,  // Vector-vector
  MULVV,
  DIVVV,
  ARITHVX,  // Vector-scalar
  MULVX,
  DIVVX,
  ARITHVI,  // Vector-immediate
  MULVI,
  DIVVI,
  // FPU OPERATIONS
  ARITHFVV, // Vector-vector
  MULFVV,
  DIVFVV,
  ARITHFVX, // Vector-scalar
  MULFVX,
  DIVFVX,
  ARITHFVI, // Vector-immediate
  MULFVI,
  DIVFVI
};

// Writes the enumerator's name to `os` and returns `os`.
// Every VpuType value is covered, so tracing a non-VSET vector op no longer
// trips the assertion (or silently prints nothing in NDEBUG builds).
// A value outside the enumeration still asserts.
inline std::ostream &operator<<(std::ostream &os, const VpuType& type) {
  switch (type) {
  case VpuType::VSET:     os << "VSET"; break;
  case VpuType::VL:       os << "VL"; break;
  case VpuType::VS:       os << "VS"; break;
  case VpuType::ARITHVV:  os << "ARITHVV"; break;
  case VpuType::MULVV:    os << "MULVV"; break;
  case VpuType::DIVVV:    os << "DIVVV"; break;
  case VpuType::ARITHVX:  os << "ARITHVX"; break;
  case VpuType::MULVX:    os << "MULVX"; break;
  case VpuType::DIVVX:    os << "DIVVX"; break;
  case VpuType::ARITHVI:  os << "ARITHVI"; break;
  case VpuType::MULVI:    os << "MULVI"; break;
  case VpuType::DIVVI:    os << "DIVVI"; break;
  case VpuType::ARITHFVV: os << "ARITHFVV"; break;
  case VpuType::MULFVV:   os << "MULFVV"; break;
  case VpuType::DIVFVV:   os << "DIVFVV"; break;
  case VpuType::ARITHFVX: os << "ARITHFVX"; break;
  case VpuType::MULFVX:   os << "MULFVX"; break;
  case VpuType::DIVFVX:   os << "DIVFVX"; break;
  case VpuType::ARITHFVI: os << "ARITHFVI"; break;
  case VpuType::MULFVI:   os << "MULFVI"; break;
  case VpuType::DIVFVI:   os << "DIVFVI"; break;
  default: assert(false);
  }
  return os;
}
///////////////////////////////////////////////////////////////////////////////
enum class ArbiterType {
Priority,
RoundRobin

87
sim/simx/vec_unit.cpp Normal file
View file

@ -0,0 +1,87 @@
#ifdef EXT_V_ENABLE
#include "vec_unit.h"
using namespace vortex;
// Private implementation behind VecUnit (pimpl).
// Holds the vector unit's configuration state and its performance counters.
// The execution entry points (load/store/execute) are still stubbed out below.
class VecUnit::Impl {
public:
  // `simobject` is the owning VecUnit; the Arch argument is currently unused.
  Impl(VecUnit* simobject, const Arch& /*arch*/)
    : simobject_(simobject)
  {
    this->clear();
  }

  ~Impl() {}

  // Resets the unit. The configuration registers are zeroed here so that no
  // member is ever left indeterminate after construction or reset.
  // NOTE(review): zero is a placeholder reset value — revisit once vector
  // configuration (VSET) is actually modelled.
  void clear() {
    vtype_ = vtype_t{};
    vl_ = 0;
    vlmax_ = 0;
    perf_stats_ = PerfStats();
  }

  // Per-cycle hook; intentionally empty for now.
  void tick() {
  }

  /*
  void load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
  }
  void store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
  }
  void execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata) {
  }
  */

  // Read-only view of the counters accumulated since the last clear().
  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

private:
  VecUnit* simobject_;                       // back-pointer to the facade object
  std::vector<std::vector<Byte>> vreg_file_; // not populated anywhere yet
  vtype_t vtype_;
  uint32_t vl_;
  Word vlmax_;
  PerfStats perf_stats_;
};
// Builds the vector unit: registers with the SimObject base under `name`,
// binds the Input/Output ports to this object, and heap-allocates the
// private implementation. MemReqs/MemRsps are left default-constructed.
VecUnit::VecUnit(const SimContext& ctx,
const char* name,
const Arch &arch)
: SimObject<VecUnit>(ctx, name)
, Input(this)
, Output(this)
, impl_(new Impl(this, arch))
{}
// Releases the implementation object allocated in the constructor.
VecUnit::~VecUnit() {
delete impl_;
}
// Resets the unit by forwarding to Impl::clear().
void VecUnit::reset() {
impl_->clear();
}
// Per-cycle hook; forwards to Impl::tick().
void VecUnit::tick() {
impl_->tick();
}
/*
void VecUnit::load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
return impl_->load(instr, wid, rsdata);
}
void VecUnit::store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
return impl_->store(instr, wid, rsdata);
}
void VecUnit::execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata) {
return impl_->execute(instr, wid, rsdata, rddata);
}
*/
// Returns a reference to the counters held by the implementation object;
// the reference refers to storage inside *impl_.
const VecUnit::PerfStats& VecUnit::perf_stats() const {
return impl_->perf_stats();
}
#endif

87
sim/simx/vec_unit.h Normal file
View file

@ -0,0 +1,87 @@
#ifdef EXT_V_ENABLE
#pragma once
#include "arch.h"
#include "instr.h"
#include "instr_trace.h"
#include <simobject.h>
#include "types.h"
namespace vortex {
// Vector type configuration, one 32-bit field per setting.
// NOTE(review): the field names match the RISC-V V extension `vtype` CSR
// (vill, vma, vta, vsew, vlmul) — presumably they carry the decoded CSR
// fields; confirm encoding against the code that fills them in.
struct vtype_t {
uint32_t vill;
uint32_t vma;
uint32_t vta;
uint32_t vsew;
uint32_t vlmul;
};
// One register operand value, viewable through several types.
// All members share the same storage; which member is meaningful depends
// on how the value was written.
union reg_data_t {
// Project word-sized views (Word / WordI / WordF).
Word u;
WordI i;
WordF f;
// Fixed-width floating-point views.
float f32;
double f64;
// Fixed-width integer views.
uint32_t u32;
uint64_t u64;
int32_t i32;
int64_t i64;
};
// Simulation object for the vector unit. The public class is a thin facade;
// state and behaviour live in the private Impl class (pimpl).
class VecUnit : public SimObject<VecUnit> {
public:
// Performance counters exposed through the vector MPM CSRs.
struct PerfStats {
uint64_t reads;   // vector reads
uint64_t writes;  // vector writes
uint64_t latency; // vector latency
uint64_t stalls;  // vector stalls
// All counters start at zero.
PerfStats()
: reads(0)
, writes(0)
, latency(0)
, stalls(0)
{}
// Field-wise accumulation of another set of counters; returns *this.
PerfStats& operator+=(const PerfStats& rhs) {
this->reads += rhs.reads;
this->writes += rhs.writes;
this->latency += rhs.latency;
this->stalls += rhs.stalls;
return *this;
}
};
// Memory request/response ports. The constructor does not size these,
// so they start out empty.
std::vector<SimPort<MemReq>> MemReqs;
std::vector<SimPort<MemRsp>> MemRsps;
// Instruction trace ports into and out of the unit.
SimPort<instr_trace_t*> Input;
SimPort<instr_trace_t*> Output;
VecUnit(const SimContext& ctx,
const char* name,
const Arch &arch);
~VecUnit();
// Resets the unit's state (forwards to the implementation).
void reset();
// Per-cycle hook (forwards to the implementation).
void tick();
// NOTE(review): load/store/execute are declared here, but their definitions
// in vec_unit.cpp are commented out — calling them will fail at link time.
// NOTE(review): std::vector<reg_data_t[3]> uses an array as the element type,
// which std::vector cannot store; std::array<reg_data_t, 3> is likely intended.
void load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
void store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
void execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);
// Counters accumulated by the implementation.
const PerfStats& perf_stats() const;
private:
class Impl;
// Allocated with new in the constructor and deleted in the destructor.
Impl* impl_;
};
}
#endif