mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
Added a vpu and vec units
This commit is contained in:
parent
e80ee2c819
commit
b2ad2e5281
14 changed files with 436 additions and 7 deletions
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
|
@ -106,7 +106,7 @@ jobs:
|
|||
make tests -s > /dev/null
|
||||
|
||||
- name: Upload Build Artifact
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: build-${{ matrix.xlen }}
|
||||
path: build${{ matrix.xlen }}
|
||||
|
@ -147,7 +147,7 @@ jobs:
|
|||
${{ runner.os }}-thirdparty-
|
||||
|
||||
- name: Download Build Artifact
|
||||
uses: actions/download-artifact@v3
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: build-${{ matrix.xlen }}
|
||||
path: build${{ matrix.xlen }}
|
||||
|
|
|
@ -35,6 +35,7 @@
|
|||
`define VX_DCR_MPM_CLASS_NONE 0
|
||||
`define VX_DCR_MPM_CLASS_CORE 1
|
||||
`define VX_DCR_MPM_CLASS_MEM 2
|
||||
`define VX_DCR_MPM_CLASS_VEC 3
|
||||
|
||||
// User Floating-Point CSRs ///////////////////////////////////////////////////
|
||||
|
||||
|
@ -99,6 +100,8 @@
|
|||
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
|
||||
`define VX_CSR_MPM_SCRB_WCTL 12'hB0D
|
||||
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D
|
||||
// Vector scoreboard stall counter (low word / high word).
// The runtime perf dump queries this counter as VX_CSR_MPM_SCRB_VPU, so both
// spellings are defined with identical addresses to keep the two sides in sync.
`define VX_CSR_MPM_SCRB_VEC     12'hB13 // Vector scoreboard
`define VX_CSR_MPM_SCRB_VEC_H   12'hB93
`define VX_CSR_MPM_SCRB_VPU     12'hB13 // same counter as VX_CSR_MPM_SCRB_VEC
`define VX_CSR_MPM_SCRB_VPU_H   12'hB93
|
||||
// PERF: memory
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0E
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
|
||||
|
@ -182,6 +185,17 @@
|
|||
// Machine Performance-monitoring memory counters (class 3) ///////////////////
|
||||
// <Add your own counters: use addresses hB03..B1F, hB83..hB9F>
|
||||
|
||||
// Machine Performance-monitoring vector counters
|
||||
// PERF: vector unit
|
||||
`define VX_CSR_MPM_VEC_READS 12'hB03 // vector reads
|
||||
`define VX_CSR_MPM_VEC_READS_H 12'hB83
|
||||
`define VX_CSR_MPM_VEC_WRITES 12'hB04 // vector writes
|
||||
`define VX_CSR_MPM_VEC_WRITES_H 12'hB84
|
||||
`define VX_CSR_MPM_VEC_LAT 12'hB05 // vector latency
|
||||
`define VX_CSR_MPM_VEC_LAT_H 12'hB85
|
||||
`define VX_CSR_MPM_VEC_ST 12'hB06 // vector stalls
|
||||
`define VX_CSR_MPM_VEC_ST_H 12'hB86
|
||||
|
||||
// Machine Information Registers //////////////////////////////////////////////
|
||||
|
||||
`define VX_CSR_MVENDORID 12'hF11
|
||||
|
|
|
@ -188,6 +188,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t scrb_lsu = 0;
|
||||
uint64_t scrb_csrs = 0;
|
||||
uint64_t scrb_wctl = 0;
|
||||
#ifdef EXT_V_ENABLE
|
||||
uint64_t scrb_vpu = 0;
|
||||
#endif
|
||||
uint64_t ifetches = 0;
|
||||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
|
@ -212,6 +215,13 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t mem_writes = 0;
|
||||
uint64_t mem_lat = 0;
|
||||
uint64_t mem_bank_stalls = 0;
|
||||
#ifdef EXT_V_ENABLE
|
||||
// PERF: vecunit
|
||||
uint64_t vec_mem_reads = 0;
|
||||
uint64_t vec_mem_writes = 0;
|
||||
uint64_t vec_mem_lat = 0;
|
||||
uint64_t vec_stall_cycles = 0;
|
||||
#endif
|
||||
|
||||
uint64_t num_cores;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
|
||||
|
@ -312,13 +322,25 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
|
||||
return err;
|
||||
});
|
||||
#ifdef EXT_V_ENABLE
|
||||
uint64_t scrb_vpu_per_core;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_VPU, core_id, &scrb_vpu_per_core), {
|
||||
return err;
|
||||
});
|
||||
#endif
|
||||
scrb_alu += scrb_alu_per_core;
|
||||
scrb_fpu += scrb_fpu_per_core;
|
||||
scrb_lsu += scrb_lsu_per_core;
|
||||
scrb_csrs += scrb_csrs_per_core;
|
||||
scrb_wctl += scrb_wctl_per_core;
|
||||
#ifdef EXT_V_ENABLE
|
||||
scrb_vpu += scrb_vpu_per_core;
|
||||
#endif
|
||||
if (num_cores > 1) {
|
||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core;
|
||||
#ifdef EXT_V_ENABLE
|
||||
scrb_total += scrb_vpu_per_core;
|
||||
#endif
|
||||
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n"
|
||||
, core_id
|
||||
|
@ -329,6 +351,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
, calcAvgPercent(scrb_lsu_per_core, scrb_total)
|
||||
, calcAvgPercent(scrb_csrs_per_core, scrb_total)
|
||||
, calcAvgPercent(scrb_wctl_per_core, scrb_total)
|
||||
#ifdef EXT_V_ENABLE
|
||||
, calcAvgPercent(scrb_vpu_per_core, scrb_total)
|
||||
#endif
|
||||
);
|
||||
}
|
||||
scrb_stalls += scrb_stalls_per_core;
|
||||
|
@ -555,6 +580,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
});
|
||||
}
|
||||
} break;
|
||||
#ifdef EXT_V_ENABLE
|
||||
case VX_DCR_MPM_CLASS_VEC: {
|
||||
uint64_t tmp;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_READS, core_id, &tmp), { return err; });
|
||||
vec_mem_reads += tmp;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_WRITES, core_id, &tmp), { return err; });
|
||||
vec_mem_writes += tmp;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_LAT, core_id, &tmp), { return err; });
|
||||
vec_mem_lat += tmp;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_VEC_ST, core_id, &tmp), { return err; });
|
||||
vec_stall_cycles += tmp;
|
||||
} break;
|
||||
#endif
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -576,6 +614,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
int ifetch_avg_lat = caclAverage(ifetch_lat, ifetches);
|
||||
int load_avg_lat = caclAverage(load_lat, loads);
|
||||
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_csrs + scrb_wctl;
|
||||
#ifdef EXT_V_ENABLE
|
||||
scrb_total += scrb_vpu;
|
||||
#endif
|
||||
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
|
||||
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
||||
|
@ -587,6 +628,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
, calcAvgPercent(scrb_lsu, scrb_total)
|
||||
, calcAvgPercent(scrb_csrs, scrb_total)
|
||||
, calcAvgPercent(scrb_wctl, scrb_total)
|
||||
#ifdef EXT_V_ENABLE
|
||||
, calcAvgPercent(scrb_vpu, scrb_total)
|
||||
#endif
|
||||
);
|
||||
fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent);
|
||||
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
||||
|
@ -637,6 +681,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
|
||||
}
|
||||
} break;
|
||||
#ifdef EXT_V_ENABLE
|
||||
case VX_DCR_MPM_CLASS_VEC: {
|
||||
vec_mem_reads /= num_cores;
|
||||
vec_mem_writes /= num_cores;
|
||||
vec_mem_lat /= num_cores;
|
||||
vec_stall_cycles /= num_cores;
|
||||
int vec_avg_lat = caclAverage(vec_mem_lat, vec_mem_reads);
|
||||
int vec_stall_cycles_ratio = calcRatio(vec_stall_cycles, total_cycles);
|
||||
fprintf(stream, "PERF: vec memory reads=%ld\n", vec_mem_reads);
|
||||
fprintf(stream, "PERF: vec memory writes=%ld\n", vec_mem_writes);
|
||||
fprintf(stream, "PERF: vec memory latency=%d cycles\n", vec_avg_lat);
|
||||
fprintf(stream, "PERF: vec stalls=%ld (%d%%)\n", vec_stall_cycles, vec_stall_cycles_ratio);
|
||||
} break;
|
||||
#endif
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -22,7 +22,7 @@ SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(
|
|||
|
||||
# Add V extension sources
|
||||
ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),)
|
||||
SRCS += $(SRC_DIR)/vpu.cpp
|
||||
SRCS += $(SRC_DIR)/vec_unit.cpp $(SRC_DIR)/vpu.cpp
|
||||
endif
|
||||
|
||||
# Debugging
|
||||
|
|
|
@ -29,7 +29,8 @@ Core::Core(const SimContext& ctx,
|
|||
uint32_t core_id,
|
||||
Socket* socket,
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs)
|
||||
const DCRS &dcrs
|
||||
)
|
||||
: SimObject(ctx, StrFormat("core%d", core_id))
|
||||
, icache_req_ports(1, this)
|
||||
, icache_rsp_ports(1, this)
|
||||
|
@ -38,6 +39,9 @@ Core::Core(const SimContext& ctx,
|
|||
, core_id_(core_id)
|
||||
, socket_(socket)
|
||||
, arch_(arch)
|
||||
#ifdef EXT_V_ENABLE
|
||||
, vec_unit_(VecUnit::Create("vpu", arch))
|
||||
#endif
|
||||
, emulator_(arch, dcrs, this)
|
||||
, ibuffers_(arch.num_warps(), IBUF_SIZE)
|
||||
, scoreboard_(arch_)
|
||||
|
|
|
@ -27,6 +27,9 @@
|
|||
#include "func_unit.h"
|
||||
#include "mem_coalescer.h"
|
||||
#include "VX_config.h"
|
||||
#ifdef EXT_V_ENABLE
|
||||
#include "vec_unit.h"
|
||||
#endif
|
||||
|
||||
namespace vortex {
|
||||
|
||||
|
@ -52,6 +55,9 @@ public:
|
|||
uint64_t scrb_sfu;
|
||||
uint64_t scrb_csrs;
|
||||
uint64_t scrb_wctl;
|
||||
#ifdef EXT_V_ENABLE
|
||||
uint64_t scrb_vpu;
|
||||
#endif
|
||||
uint64_t ifetches;
|
||||
uint64_t loads;
|
||||
uint64_t stores;
|
||||
|
@ -72,6 +78,9 @@ public:
|
|||
, scrb_sfu(0)
|
||||
, scrb_csrs(0)
|
||||
, scrb_wctl(0)
|
||||
#ifdef EXT_V_ENABLE
|
||||
, scrb_vpu(0)
|
||||
#endif
|
||||
, ifetches(0)
|
||||
, loads(0)
|
||||
, stores(0)
|
||||
|
@ -90,7 +99,8 @@ public:
|
|||
uint32_t core_id,
|
||||
Socket* socket,
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs);
|
||||
const DCRS &dcrs
|
||||
);
|
||||
|
||||
~Core();
|
||||
|
||||
|
@ -131,6 +141,12 @@ public:
|
|||
return mem_coalescers_.at(idx);
|
||||
}
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
VecUnit::Ptr& vec_unit() {
|
||||
return vec_unit_;
|
||||
}
|
||||
#endif
|
||||
|
||||
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
@ -150,6 +166,10 @@ private:
|
|||
Socket* socket_;
|
||||
const Arch& arch_;
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
VecUnit::Ptr vec_unit_;
|
||||
#endif
|
||||
|
||||
Emulator emulator_;
|
||||
|
||||
std::vector<IBuffer> ibuffers_;
|
||||
|
|
|
@ -34,7 +34,7 @@ Emulator::warp_t::warp_t(const Arch& arch)
|
|||
: ireg_file(arch.num_threads(), std::vector<Word>(MAX_NUM_REGS))
|
||||
, freg_file(arch.num_threads(), std::vector<uint64_t>(MAX_NUM_REGS))
|
||||
#ifdef EXT_V_ENABLE
|
||||
, vreg_file(MAX_NUM_REGS, std::vector<Byte>(MAX_NUM_REGS))
|
||||
, vreg_file(MAX_NUM_REGS, std::vector<Byte>(VLEN / 8))
|
||||
#endif
|
||||
, uuid(0)
|
||||
{}
|
||||
|
@ -96,6 +96,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
|
|||
// In future versions, scratchpad size should be fixed to an appropriate value.
|
||||
, scratchpad(std::vector<Word>(32 * 32 * 32768))
|
||||
#ifdef EXT_V_ENABLE
|
||||
, vec_unit_(core->vec_unit())
|
||||
, csrs_(arch.num_warps())
|
||||
#endif
|
||||
{
|
||||
|
@ -133,6 +134,10 @@ void Emulator::clear() {
|
|||
barrier.reset();
|
||||
}
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
vec_unit_->reset();
|
||||
#endif
|
||||
|
||||
csr_mscratch_ = startup_arg;
|
||||
|
||||
stalled_warps_.reset();
|
||||
|
@ -607,6 +612,18 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
|
||||
}
|
||||
} break;
|
||||
#ifdef EXT_V_ENABLE
|
||||
case VX_DCR_MPM_CLASS_VEC: {
|
||||
VecUnit::PerfStats vec_perf_stats;
|
||||
vec_perf_stats += vec_unit_->perf_stats();
|
||||
switch (addr) {
|
||||
CSR_READ_64(VX_CSR_MPM_VEC_READS, vec_perf_stats.reads);
|
||||
CSR_READ_64(VX_CSR_MPM_VEC_WRITES, vec_perf_stats.writes);
|
||||
CSR_READ_64(VX_CSR_MPM_VEC_LAT, vec_perf_stats.latency);
|
||||
CSR_READ_64(VX_CSR_MPM_VEC_ST, vec_perf_stats.stalls);
|
||||
}
|
||||
} break;
|
||||
#endif
|
||||
default: {
|
||||
std::cout << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
|
||||
std::abort();
|
||||
|
|
|
@ -19,6 +19,9 @@
|
|||
#include <stack>
|
||||
#include <mem.h>
|
||||
#include "types.h"
|
||||
#ifdef EXT_V_ENABLE
|
||||
#include "vec_unit.h"
|
||||
#endif
|
||||
|
||||
namespace vortex {
|
||||
|
||||
|
@ -175,7 +178,10 @@ private:
|
|||
uint32_t mat_size;
|
||||
uint32_t tc_size;
|
||||
uint32_t tc_num;
|
||||
#ifdef EXT_V_ENABLE
|
||||
VecUnit::Ptr vec_unit_;
|
||||
std::vector<std::vector<std::unordered_map<uint32_t, uint32_t>>> csrs_;
|
||||
#endif
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -366,3 +366,52 @@ void SfuUnit::tick() {
|
|||
input.pop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef EXT_V_ENABLE
|
||||
// Constructs the vector functional unit. Everything is forwarded to the
// FuncUnit base, which receives the fixed instance name "vpu-unit".
VpuUnit::VpuUnit(const SimContext& ctx, Core* core)
  : FuncUnit(ctx, core, "vpu-unit") {
}
|
||||
|
||||
void VpuUnit::tick() {
|
||||
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
|
||||
auto& input = Inputs.at(iw);
|
||||
if (input.empty())
|
||||
continue;
|
||||
//auto& output = Outputs.at(iw);
|
||||
auto trace = input.front();
|
||||
//int delay = 2;
|
||||
switch (trace->vpu_type) {
|
||||
case VpuType::VSET:
|
||||
case VpuType::VL:
|
||||
case VpuType::VS:
|
||||
case VpuType::ARITHVV:
|
||||
case VpuType::MULVV:
|
||||
case VpuType::DIVVV:
|
||||
case VpuType::ARITHVX:
|
||||
case VpuType::MULVX:
|
||||
case VpuType::DIVVX:
|
||||
case VpuType::ARITHVI:
|
||||
case VpuType::MULVI:
|
||||
case VpuType::DIVVI:
|
||||
case VpuType::ARITHFVV:
|
||||
case VpuType::MULFVV:
|
||||
case VpuType::DIVFVV:
|
||||
case VpuType::ARITHFVX:
|
||||
case VpuType::MULFVX:
|
||||
case VpuType::DIVFVX:
|
||||
case VpuType::ARITHFVI:
|
||||
case VpuType::MULFVI:
|
||||
case VpuType::DIVFVI:
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, this->name() << ": op=" << trace->vpu_type << ", " << *trace);
|
||||
if (trace->eop && trace->fetch_stall) {
|
||||
core_->resume(trace->wid);
|
||||
}
|
||||
input.pop();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -113,4 +113,39 @@ public:
|
|||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class VpuUnit : public FuncUnit {
|
||||
public:
|
||||
VpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void tick();
|
||||
};
|
||||
|
||||
// Simulate clock cycles depending on instruction type and element width and #lanes
|
||||
// VSET = 1 cycle
|
||||
// Vector instructions take the same amount of time as ALU instructions.
|
||||
// In general there should be fewer instructions overall (hence the SIMD vector speedup).
|
||||
// But each vector instruction is bigger, and the number of lanes greatly affects execution speed.
|
||||
|
||||
// Whenever we change VL using imm/VSET, we need to keep track of the new VL and SEW.
|
||||
// By default, VL is set to MAXVL.
|
||||
// After determining VL, we use VL and #lanes in order to determine overall cycle time.
|
||||
// For example, for a vector add with VL=4 and #lanes=2, we will probably take 2 cycles,
|
||||
// since we can only operate on two elements of the vector each cycle (limited by #lanes).
|
||||
// SEW (element width) likely affects the cycle time, we can probably observe
|
||||
// ALU operation cycle time in relation to element width to determine this though.
|
||||
|
||||
// The RTL implementation has an unroll and accumulate stage.
|
||||
// The unroll stage sends vector elements to the appropriate functional unit up to VL,
|
||||
// limited by the # lanes available.
|
||||
// The accumulate stage deals with combining the results from the functional units,
|
||||
// into the destination vector register.
|
||||
// Which exact pipeline stage does the VPU unroll the vector (decode or execute)?
|
||||
// Which exact pipeline stage does the VPU accumulate results?
|
||||
|
||||
// How do vector loads and stores interact with the cache?
|
||||
// How about loading and storing scalars in vector registers?
|
||||
// How does striding affect loads and stores?
|
||||
|
||||
}
|
|
@ -43,6 +43,13 @@ struct SFUTraceData : public ITraceData {
|
|||
SFUTraceData(Word arg1, Word arg2) : arg1(arg1), arg2(arg2) {}
|
||||
};
|
||||
|
||||
struct VPUTraceData : public ITraceData {
|
||||
using Ptr = std::shared_ptr<VPUTraceData>;
|
||||
Word arg1;
|
||||
Word arg2;
|
||||
VPUTraceData(Word arg1, Word arg2) : arg1(arg1), arg2(arg2) {}
|
||||
};
|
||||
|
||||
struct instr_trace_t {
|
||||
public:
|
||||
struct reg_t {
|
||||
|
@ -77,7 +84,10 @@ public:
|
|||
AluType alu_type;
|
||||
FpuType fpu_type;
|
||||
SfuType sfu_type;
|
||||
TCUType tcu_type;
|
||||
#ifdef EXT_V_ENABLE
|
||||
VpuType vpu_type;
|
||||
#endif
|
||||
TCUType tcu_type;
|
||||
};
|
||||
|
||||
ITraceData::Ptr data;
|
||||
|
|
|
@ -269,6 +269,48 @@ inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Operation classes dispatched to the vector unit.
enum class VpuType {
  VSET,     // Set vector length
  VL,       // Vector load
  VS,       // Vector store

  // ALU OPERATIONS
  ARITHVV,  // Vector-vector
  MULVV,
  DIVVV,

  ARITHVX,  // Vector-scalar
  MULVX,
  DIVVX,

  ARITHVI,  // Vector-immediate
  MULVI,
  DIVVI,

  // FPU OPERATIONS
  ARITHFVV, // Vector-vector
  MULFVV,
  DIVFVV,

  ARITHFVX, // Vector-scalar
  MULFVX,
  DIVFVX,

  ARITHFVI, // Vector-immediate
  MULFVI,
  DIVFVI
};

// Writes the enumerator's name to `os` and returns the stream.
// Every enumerator is named so that trace logging of any vector operation
// produces readable output; only a value outside the enum trips the assert.
inline std::ostream &operator<<(std::ostream &os, const VpuType& type) {
  switch (type) {
  case VpuType::VSET:     os << "VSET"; break;
  case VpuType::VL:       os << "VL"; break;
  case VpuType::VS:       os << "VS"; break;
  case VpuType::ARITHVV:  os << "ARITHVV"; break;
  case VpuType::MULVV:    os << "MULVV"; break;
  case VpuType::DIVVV:    os << "DIVVV"; break;
  case VpuType::ARITHVX:  os << "ARITHVX"; break;
  case VpuType::MULVX:    os << "MULVX"; break;
  case VpuType::DIVVX:    os << "DIVVX"; break;
  case VpuType::ARITHVI:  os << "ARITHVI"; break;
  case VpuType::MULVI:    os << "MULVI"; break;
  case VpuType::DIVVI:    os << "DIVVI"; break;
  case VpuType::ARITHFVV: os << "ARITHFVV"; break;
  case VpuType::MULFVV:   os << "MULFVV"; break;
  case VpuType::DIVFVV:   os << "DIVFVV"; break;
  case VpuType::ARITHFVX: os << "ARITHFVX"; break;
  case VpuType::MULFVX:   os << "MULFVX"; break;
  case VpuType::DIVFVX:   os << "DIVFVX"; break;
  case VpuType::ARITHFVI: os << "ARITHFVI"; break;
  case VpuType::MULFVI:   os << "MULFVI"; break;
  case VpuType::DIVFVI:   os << "DIVFVI"; break;
  default: assert(false);
  }
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class ArbiterType {
|
||||
Priority,
|
||||
RoundRobin
|
||||
|
|
87
sim/simx/vec_unit.cpp
Normal file
87
sim/simx/vec_unit.cpp
Normal file
|
@ -0,0 +1,87 @@
|
|||
#ifdef EXT_V_ENABLE
|
||||
|
||||
#include "vec_unit.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
// Private implementation behind VecUnit. At present it only owns the
// performance counters; the vector register file and the vtype/vl/vlmax state
// are declared but not yet used by any method.
class VecUnit::Impl {
public:
  // `simobject` is the owning VecUnit; the architecture argument is accepted
  // but currently unused.
  Impl(VecUnit* simobject, const Arch& /*arch*/)
    : simobject_(simobject)
  {
    this->clear();
  }

  ~Impl() {}

  // Resets the performance counters to zero.
  void clear() {
    perf_stats_ = PerfStats();
  }

  // Per-cycle hook; intentionally empty for now.
  void tick() {
  }

/*
  void load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
  }

  void store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
  }

  void execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata) {
  }
*/

  // Read-only view of the accumulated counters.
  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

private:

  VecUnit* simobject_;
  std::vector<std::vector<Byte>> vreg_file_;
  // Zero-initialised so a freshly constructed unit never holds indeterminate
  // values in these fields.
  vtype_t vtype_{};
  uint32_t vl_ = 0;
  Word vlmax_ = 0;
  PerfStats perf_stats_;
};
|
||||
|
||||
// Registers the simulation object under `name`, binds the Input/Output ports
// to this object and allocates the private implementation, which the
// destructor releases.
VecUnit::VecUnit(const SimContext& ctx, const char* name, const Arch &arch)
  : SimObject<VecUnit>(ctx, name)
  , Input(this)
  , Output(this)
  , impl_(new Impl(this, arch)) {
}
|
||||
|
||||
// Releases the implementation object allocated by the constructor.
VecUnit::~VecUnit()
{
  delete impl_;
}
|
||||
|
||||
// Forwards to Impl::clear(), which zeroes the performance counters.
void VecUnit::reset()
{
  impl_->clear();
}
|
||||
|
||||
// Forwards the tick to the implementation (currently a no-op there).
void VecUnit::tick()
{
  impl_->tick();
}
|
||||
|
||||
/*
|
||||
void VecUnit::load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
|
||||
return impl_->load(instr, wid, rsdata);
|
||||
}
|
||||
|
||||
void VecUnit::store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata) {
|
||||
return impl_->store(instr, wid, rsdata);
|
||||
}
|
||||
|
||||
void VecUnit::execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata) {
|
||||
return impl_->execute(instr, wid, rsdata, rddata);
|
||||
}
|
||||
*/
|
||||
|
||||
// Returns a reference to the counters held by the implementation object.
const VecUnit::PerfStats& VecUnit::perf_stats() const
{
  return impl_->perf_stats();
}
|
||||
#endif
|
87
sim/simx/vec_unit.h
Normal file
87
sim/simx/vec_unit.h
Normal file
|
@ -0,0 +1,87 @@
|
|||
#ifdef EXT_V_ENABLE
|
||||
#pragma once
|
||||
|
||||
#include "arch.h"
|
||||
#include "instr.h"
|
||||
#include "instr_trace.h"
|
||||
#include <simobject.h>
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
// Plain aggregate of five 32-bit fields describing the vector type state.
// NOTE(review): the names match the fields of the RISC-V `vtype` CSR
// (vill, vma, vta, vsew, vlmul) -- presumably decoded one field per member;
// confirm against the code that fills this struct.
struct vtype_t {
  uint32_t vill;
  uint32_t vma;
  uint32_t vta;
  uint32_t vsew;
  uint32_t vlmul;
};
|
||||
|
||||
union reg_data_t {
|
||||
Word u;
|
||||
WordI i;
|
||||
WordF f;
|
||||
float f32;
|
||||
double f64;
|
||||
uint32_t u32;
|
||||
uint64_t u64;
|
||||
int32_t i32;
|
||||
int64_t i64;
|
||||
};
|
||||
|
||||
// Simulation object for the vector unit. Its behaviour lives in the private
// Impl class; this class exposes the ports, the lifecycle hooks and the
// performance counters.
class VecUnit : public SimObject<VecUnit> {
public:
  // Counters reported through perf_stats(); all start at zero.
  struct PerfStats {
    uint64_t reads;
    uint64_t writes;
    uint64_t latency;
    uint64_t stalls;

    PerfStats() : reads(0), writes(0), latency(0), stalls(0) {}

    // Adds each counter of `rhs` to the matching counter of this object.
    PerfStats& operator+=(const PerfStats& rhs) {
      reads   += rhs.reads;
      writes  += rhs.writes;
      latency += rhs.latency;
      stalls  += rhs.stalls;
      return *this;
    }
  };

  // Memory ports; the constructor leaves both vectors empty.
  std::vector<SimPort<MemReq>> MemReqs;
  std::vector<SimPort<MemRsp>> MemRsps;

  // Instruction-trace ports, bound to this object by the constructor.
  SimPort<instr_trace_t*> Input;
  SimPort<instr_trace_t*> Output;

  VecUnit(const SimContext& ctx, const char* name, const Arch &arch);
  ~VecUnit();

  // Clears the performance counters.
  void reset();

  // Per-cycle hook, forwarded to the implementation.
  void tick();

  // NOTE(review): these three are declared, but their definitions in
  // vec_unit.cpp are commented out, so calling them would fail at link time.
  // `std::vector<reg_data_t[3]>` also uses an array element type, which
  // cannot be instantiated as a usable vector -- revisit when implementing.
  void load(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
  void store(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata);
  void execute(const Instr &instr, uint32_t wid, std::vector<reg_data_t[3]> &rsdata, std::vector<reg_data_t> &rddata);

  const PerfStats& perf_stats() const;

private:
  class Impl;
  Impl* impl_;  // allocated in the constructor, deleted in the destructor
};
|
||||
|
||||
}
|
||||
#endif
|
Loading…
Add table
Add a link
Reference in a new issue