mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
simx refactoring - emulation vs simulation discrete separation
This commit is contained in:
parent
ff6f33acff
commit
840ced22a9
33 changed files with 873 additions and 897 deletions
|
@ -39,7 +39,7 @@ using namespace vortex;
|
|||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: ram_(RAM_PAGE_SIZE)
|
||||
: ram_(0, RAM_PAGE_SIZE)
|
||||
, global_mem_(
|
||||
ALLOC_BASE_ADDR,
|
||||
ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
|
||||
|
|
|
@ -88,7 +88,7 @@ class vx_device {
|
|||
public:
|
||||
vx_device()
|
||||
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
|
||||
, ram_(RAM_PAGE_SIZE)
|
||||
, ram_(0, RAM_PAGE_SIZE)
|
||||
, processor_(arch_)
|
||||
, global_mem_(
|
||||
ALLOC_BASE_ADDR,
|
||||
|
@ -183,7 +183,7 @@ public:
|
|||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run(false);
|
||||
processor_.run();
|
||||
});
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -190,14 +190,17 @@ void MemoryUnit::tlbRm(uint64_t va) {
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
RAM::RAM(uint32_t page_size, uint64_t capacity)
|
||||
RAM::RAM(uint64_t capacity, uint32_t page_size)
|
||||
: capacity_(capacity)
|
||||
, page_bits_(log2ceil(page_size))
|
||||
, last_page_(nullptr)
|
||||
, last_page_index_(0) {
|
||||
assert(ispow2(page_size));
|
||||
assert(0 == capacity || ispow2(capacity));
|
||||
assert(0 == (capacity % page_size));
|
||||
assert(ispow2(page_size));
|
||||
if (capacity != 0) {
|
||||
assert(ispow2(capacity));
|
||||
assert(page_size <= capacity);
|
||||
assert(0 == (capacity % page_size));
|
||||
}
|
||||
}
|
||||
|
||||
RAM::~RAM() {
|
||||
|
|
|
@ -158,7 +158,8 @@ private:
|
|||
class RAM : public MemDevice {
|
||||
public:
|
||||
|
||||
RAM(uint32_t page_size, uint64_t capacity = 0);
|
||||
RAM(uint64_t capacity, uint32_t page_size);
|
||||
RAM(uint64_t capacity) : RAM(capacity, capacity) {}
|
||||
~RAM();
|
||||
|
||||
void clear();
|
||||
|
|
|
@ -125,7 +125,7 @@ public:
|
|||
trace_->open("trace.vcd");
|
||||
#endif
|
||||
|
||||
ram_ = new RAM(RAM_PAGE_SIZE);
|
||||
ram_ = new RAM(0, RAM_PAGE_SIZE);
|
||||
|
||||
// initialize dram simulator
|
||||
ramulator::Config ram_config;
|
||||
|
|
|
@ -65,7 +65,7 @@ int main(int argc, char **argv) {
|
|||
parse_args(argc, argv);
|
||||
|
||||
// create memory module
|
||||
vortex::RAM ram(RAM_PAGE_SIZE);
|
||||
vortex::RAM ram(0, RAM_PAGE_SIZE);
|
||||
|
||||
// create processor
|
||||
vortex::Processor processor;
|
||||
|
|
|
@ -16,7 +16,7 @@ LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
|
|||
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
|
||||
|
||||
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
|
||||
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp local_mem.cpp dcrs.cpp
|
||||
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp emulator.cpp decode.cpp execute.cpp func_unit.cpp cache_sim.cpp mem_sim.cpp local_mem.cpp dcrs.cpp
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
|
|
|
@ -114,19 +114,12 @@ bool Cluster::running() const {
|
|||
return false;
|
||||
}
|
||||
|
||||
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
|
||||
bool done = true;
|
||||
Word exitcode_ = 0;
|
||||
int Cluster::get_exitcode() const {
|
||||
int exitcode = 0;
|
||||
for (auto& socket : sockets_) {
|
||||
Word ec;
|
||||
if (socket->check_exit(&ec, riscv_test)) {
|
||||
exitcode_ |= ec;
|
||||
} else {
|
||||
done = false;
|
||||
}
|
||||
exitcode |= socket->get_exitcode();
|
||||
}
|
||||
*exitcode = exitcode_;
|
||||
return done;
|
||||
return exitcode;
|
||||
}
|
||||
|
||||
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
|
||||
|
|
|
@ -59,7 +59,7 @@ public:
|
|||
|
||||
bool running() const;
|
||||
|
||||
bool check_exit(Word* exitcode, bool riscv_test) const;
|
||||
int get_exitcode() const;
|
||||
|
||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
|
||||
|
||||
|
|
|
@ -19,12 +19,9 @@
|
|||
#include "types.h"
|
||||
#include "arch.h"
|
||||
#include "mem.h"
|
||||
#include "decode.h"
|
||||
#include "core.h"
|
||||
#include "socket.h"
|
||||
#include "debug.h"
|
||||
#include "constants.h"
|
||||
#include "processor_impl.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
|
@ -41,31 +38,18 @@ Core::Core(const SimContext& ctx,
|
|||
, core_id_(core_id)
|
||||
, socket_(socket)
|
||||
, arch_(arch)
|
||||
, dcrs_(dcrs)
|
||||
, decoder_(arch)
|
||||
, warps_(arch.num_warps())
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, fcsrs_(arch.num_warps(), 0)
|
||||
, emulator_(arch, dcrs, this)
|
||||
, ibuffers_(arch.num_warps(), IBUF_SIZE)
|
||||
, scoreboard_(arch_)
|
||||
, operands_(ISSUE_WIDTH)
|
||||
, dispatchers_((uint32_t)FUType::Count)
|
||||
, exe_units_((uint32_t)FUType::Count)
|
||||
, func_units_((uint32_t)FUType::Count)
|
||||
, lmem_demuxs_(NUM_LSU_LANES)
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, csrs_(arch.num_warps())
|
||||
, commit_arbs_(ISSUE_WIDTH)
|
||||
{
|
||||
char sname[100];
|
||||
|
||||
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
|
||||
csrs_.at(i).resize(arch.num_threads());
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
|
||||
warps_.at(i) = std::make_shared<Warp>(this, i);
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
operands_.at(i) = SimPlatform::instance().create_object<Operand>();
|
||||
}
|
||||
|
@ -99,17 +83,17 @@ Core::Core(const SimContext& ctx,
|
|||
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, 1, NUM_SFU_LANES);
|
||||
|
||||
// initialize execute units
|
||||
exe_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
|
||||
exe_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
|
||||
exe_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
|
||||
exe_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
|
||||
func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
|
||||
func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
|
||||
func_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
|
||||
func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
|
||||
|
||||
// bind commit arbiters
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
|
||||
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1);
|
||||
for (uint32_t j = 0; j < (uint32_t)FUType::Count; ++j) {
|
||||
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
|
||||
func_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
|
||||
}
|
||||
commit_arbs_.at(i) = arbiter;
|
||||
}
|
||||
|
@ -118,17 +102,14 @@ Core::Core(const SimContext& ctx,
|
|||
}
|
||||
|
||||
Core::~Core() {
|
||||
this->cout_flush();
|
||||
//--
|
||||
}
|
||||
|
||||
void Core::reset() {
|
||||
for (auto& warp : warps_) {
|
||||
warp->reset();
|
||||
}
|
||||
warps_.at(0)->setTmask(0, true);
|
||||
active_warps_ = 1;
|
||||
|
||||
for (auto& exe_unit : exe_units_) {
|
||||
emulator_.clear();
|
||||
|
||||
for (auto& exe_unit : func_units_) {
|
||||
exe_unit->reset();
|
||||
}
|
||||
|
||||
|
@ -136,29 +117,20 @@ void Core::reset() {
|
|||
commit_arb->reset();
|
||||
}
|
||||
|
||||
for (auto& barrier : barriers_) {
|
||||
barrier.reset();
|
||||
}
|
||||
|
||||
for (auto& fcsr : fcsrs_) {
|
||||
fcsr = 0;
|
||||
}
|
||||
|
||||
for (auto& ibuf : ibuffers_) {
|
||||
ibuf.clear();
|
||||
}
|
||||
|
||||
ibuffer_idx_ = 0;
|
||||
|
||||
scoreboard_.clear();
|
||||
fetch_latch_.clear();
|
||||
decode_latch_.clear();
|
||||
pending_icache_.clear();
|
||||
stalled_warps_.reset();
|
||||
pending_instrs_ = 0;
|
||||
exited_ = false;
|
||||
perf_stats_ = PerfStats();
|
||||
|
||||
ibuffer_idx_ = 0;
|
||||
pending_instrs_ = 0;
|
||||
pending_ifetches_ = 0;
|
||||
|
||||
perf_stats_ = PerfStats();
|
||||
}
|
||||
|
||||
void Core::tick() {
|
||||
|
@ -174,28 +146,14 @@ void Core::tick() {
|
|||
}
|
||||
|
||||
void Core::schedule() {
|
||||
int scheduled_warp = -1;
|
||||
|
||||
// find next ready warp
|
||||
for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) {
|
||||
bool warp_active = active_warps_.test(wid);
|
||||
bool warp_stalled = stalled_warps_.test(wid);
|
||||
if (warp_active && !warp_stalled) {
|
||||
scheduled_warp = wid;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (scheduled_warp == -1) {
|
||||
auto trace = emulator_.step();
|
||||
if (trace == nullptr) {
|
||||
++perf_stats_.sched_idle;
|
||||
return;
|
||||
}
|
||||
|
||||
// suspend warp until decode
|
||||
stalled_warps_.set(scheduled_warp);
|
||||
|
||||
// evaluate scheduled warp
|
||||
auto& warp = warps_.at(scheduled_warp);
|
||||
auto trace = warp->eval();
|
||||
emulator_.suspend(trace->wid);
|
||||
|
||||
DT(3, "pipeline-schedule: " << *trace);
|
||||
|
||||
|
@ -255,9 +213,8 @@ void Core::decode() {
|
|||
}
|
||||
|
||||
// release warp
|
||||
if (!trace->fetch_stall) {
|
||||
assert(stalled_warps_.test(trace->wid));
|
||||
stalled_warps_.reset(trace->wid);
|
||||
if (!trace->fetch_stall) {
|
||||
emulator_.resume(trace->wid);
|
||||
}
|
||||
|
||||
DT(3, "pipeline-decode: " << *trace);
|
||||
|
@ -355,7 +312,7 @@ void Core::issue() {
|
|||
void Core::execute() {
|
||||
for (uint32_t i = 0; i < (uint32_t)FUType::Count; ++i) {
|
||||
auto& dispatch = dispatchers_.at(i);
|
||||
auto& exe_unit = exe_units_.at(i);
|
||||
auto& exe_unit = func_units_.at(i);
|
||||
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
|
||||
if (dispatch->Outputs.at(j).empty())
|
||||
continue;
|
||||
|
@ -396,361 +353,22 @@ void Core::commit() {
|
|||
}
|
||||
}
|
||||
|
||||
void Core::wspawn(uint32_t num_warps, Word nextPC) {
|
||||
uint32_t active_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
|
||||
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC);
|
||||
for (uint32_t i = 1; i < active_warps; ++i) {
|
||||
auto warp = warps_.at(i);
|
||||
warp->setPC(nextPC);
|
||||
warp->setTmask(0, true);
|
||||
active_warps_.set(i);
|
||||
}
|
||||
}
|
||||
|
||||
void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
|
||||
uint32_t bar_idx = bar_id & 0x7fffffff;
|
||||
bool is_global = (bar_id >> 31);
|
||||
|
||||
auto& barrier = barriers_.at(bar_idx);
|
||||
barrier.set(warp_id);
|
||||
DP(3, "*** Suspend core #" << core_id_ << ", warp #" << warp_id << " at barrier #" << bar_idx);
|
||||
|
||||
if (is_global) {
|
||||
// global barrier handling
|
||||
if (barrier.count() == active_warps_.count()) {
|
||||
socket_->barrier(bar_idx, count, core_id_);
|
||||
barrier.reset();
|
||||
}
|
||||
} else {
|
||||
// local barrier handling
|
||||
if (barrier.count() == (size_t)count) {
|
||||
// resume suspended warps
|
||||
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
|
||||
if (barrier.test(i)) {
|
||||
DP(3, "*** Resume core #" << core_id_ << ", warp #" << i << " at barrier #" << bar_idx);
|
||||
stalled_warps_.reset(i);
|
||||
}
|
||||
}
|
||||
barrier.reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Core::icache_read(void *data, uint64_t addr, uint32_t size) {
|
||||
mmu_.read(data, addr, size, 0);
|
||||
}
|
||||
|
||||
AddrType Core::get_addr_type(uint64_t addr) {
|
||||
if (LMEM_ENABLED) {
|
||||
if (addr >= LMEM_BASE_ADDR && addr < (LMEM_BASE_ADDR + (1 << LMEM_LOG_SIZE))) {
|
||||
return AddrType::Shared;
|
||||
}
|
||||
}
|
||||
if (addr >= IO_BASE_ADDR) {
|
||||
return AddrType::IO;
|
||||
}
|
||||
return AddrType::Global;
|
||||
}
|
||||
|
||||
void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {
|
||||
auto type = this->get_addr_type(addr);
|
||||
if (type == AddrType::Shared) {
|
||||
local_mem_->read(data, addr, size);
|
||||
} else {
|
||||
mmu_.read(data, addr, size, 0);
|
||||
}
|
||||
|
||||
DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
|
||||
}
|
||||
|
||||
void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
|
||||
auto type = this->get_addr_type(addr);
|
||||
if (addr >= uint64_t(IO_COUT_ADDR)
|
||||
&& addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
this->writeToStdOut(data, addr, size);
|
||||
} else {
|
||||
if (type == AddrType::Shared) {
|
||||
local_mem_->write(data, addr, size);
|
||||
} else {
|
||||
mmu_.write(data, addr, size, 0);
|
||||
}
|
||||
}
|
||||
DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
|
||||
}
|
||||
|
||||
void Core::dcache_amo_reserve(uint64_t addr) {
|
||||
auto type = this->get_addr_type(addr);
|
||||
if (type == AddrType::Global) {
|
||||
mmu_.amo_reserve(addr);
|
||||
}
|
||||
}
|
||||
|
||||
bool Core::dcache_amo_check(uint64_t addr) {
|
||||
auto type = this->get_addr_type(addr);
|
||||
if (type == AddrType::Global) {
|
||||
return mmu_.amo_check(addr);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void Core::writeToStdOut(const void* data, uint64_t addr, uint32_t size) {
|
||||
if (size != 1)
|
||||
std::abort();
|
||||
uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
|
||||
auto& ss_buf = print_bufs_[tid];
|
||||
char c = *(char*)data;
|
||||
ss_buf << c;
|
||||
if (c == '\n') {
|
||||
std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush;
|
||||
ss_buf.str("");
|
||||
}
|
||||
}
|
||||
|
||||
void Core::cout_flush() {
|
||||
for (auto& buf : print_bufs_) {
|
||||
auto str = buf.second.str();
|
||||
if (!str.empty()) {
|
||||
std::cout << "#" << buf.first << ": " << str << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
switch (addr) {
|
||||
case VX_CSR_SATP:
|
||||
case VX_CSR_PMPCFG0:
|
||||
case VX_CSR_PMPADDR0:
|
||||
case VX_CSR_MSTATUS:
|
||||
case VX_CSR_MISA:
|
||||
case VX_CSR_MEDELEG:
|
||||
case VX_CSR_MIDELEG:
|
||||
case VX_CSR_MIE:
|
||||
case VX_CSR_MTVEC:
|
||||
case VX_CSR_MEPC:
|
||||
case VX_CSR_MNSTATUS:
|
||||
return 0;
|
||||
|
||||
case VX_CSR_FFLAGS:
|
||||
return fcsrs_.at(wid) & 0x1F;
|
||||
case VX_CSR_FRM:
|
||||
return (fcsrs_.at(wid) >> 5);
|
||||
case VX_CSR_FCSR:
|
||||
return fcsrs_.at(wid);
|
||||
case VX_CSR_MHARTID: // global thread ID
|
||||
return (core_id_ * arch_.num_warps() + wid) * arch_.num_threads() + tid;
|
||||
case VX_CSR_THREAD_ID: // thread ID
|
||||
return tid;
|
||||
case VX_CSR_WARP_ID: // warp ID
|
||||
return wid;
|
||||
case VX_CSR_CORE_ID: // core ID
|
||||
return core_id_;
|
||||
case VX_CSR_THREAD_MASK: // thread mask
|
||||
return warps_.at(wid)->getTmask();
|
||||
case VX_CSR_WARP_MASK: // active warps
|
||||
return active_warps_.to_ulong();
|
||||
case VX_CSR_NUM_THREADS: // Number of threads per warp
|
||||
return arch_.num_threads();
|
||||
case VX_CSR_NUM_WARPS: // Number of warps per core
|
||||
return arch_.num_warps();
|
||||
case VX_CSR_NUM_CORES: // Number of cores per cluster
|
||||
return uint32_t(arch_.num_cores()) * arch_.num_clusters();
|
||||
case VX_CSR_MCYCLE: // NumCycles
|
||||
return perf_stats_.cycles & 0xffffffff;
|
||||
case VX_CSR_MCYCLE_H: // NumCycles
|
||||
return (uint32_t)(perf_stats_.cycles >> 32);
|
||||
case VX_CSR_MINSTRET: // NumInsts
|
||||
return perf_stats_.instrs & 0xffffffff;
|
||||
case VX_CSR_MINSTRET_H: // NumInsts
|
||||
return (uint32_t)(perf_stats_.instrs >> 32);
|
||||
default:
|
||||
if ((addr >= VX_CSR_MPM_BASE && addr < (VX_CSR_MPM_BASE + 32))
|
||||
|| (addr >= VX_CSR_MPM_BASE_H && addr < (VX_CSR_MPM_BASE_H + 32))) {
|
||||
// user-defined MPM CSRs
|
||||
auto perf_class = dcrs_.base_dcrs.read(VX_DCR_BASE_MPM_CLASS);
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_NONE:
|
||||
break;
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
switch (addr) {
|
||||
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
|
||||
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
||||
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
|
||||
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
|
||||
case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
|
||||
case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32;
|
||||
case VX_CSR_MPM_SCRB_LSU: return perf_stats_.scrb_lsu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
|
||||
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
|
||||
case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
|
||||
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
|
||||
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
||||
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
||||
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
||||
case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
|
||||
case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
|
||||
case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
|
||||
case VX_CSR_MPM_IFETCH_LT: return perf_stats_.ifetch_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_IFETCH_LT_H: return perf_stats_.ifetch_latency >> 32;
|
||||
case VX_CSR_MPM_LOAD_LT: return perf_stats_.load_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_LOAD_LT_H: return perf_stats_.load_latency >> 32;
|
||||
}
|
||||
} break;
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
auto proc_perf = socket_->cluster()->processor()->perf_stats();
|
||||
auto cluster_perf = socket_->cluster()->perf_stats();
|
||||
auto socket_perf = socket_->perf_stats();
|
||||
auto lmem_perf = local_mem_->perf_stats();
|
||||
switch (addr) {
|
||||
case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
|
||||
case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_WRITES: return proc_perf.l3cache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_WRITES_H: return proc_perf.l3cache.writes >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_MISS_R: return proc_perf.l3cache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_MISS_R_H: return proc_perf.l3cache.read_misses >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_MISS_W: return proc_perf.l3cache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_MISS_W_H: return proc_perf.l3cache.write_misses >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_BANK_ST: return proc_perf.l3cache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_BANK_ST_H:return proc_perf.l3cache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
||||
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
|
||||
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
|
||||
|
||||
case VX_CSR_MPM_LMEM_READS: return lmem_perf.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_LMEM_READS_H: return lmem_perf.reads >> 32;
|
||||
case VX_CSR_MPM_LMEM_WRITES: return lmem_perf.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_LMEM_WRITES_H: return lmem_perf.writes >> 32;
|
||||
case VX_CSR_MPM_LMEM_BANK_ST: return lmem_perf.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_LMEM_BANK_ST_H: return lmem_perf.bank_stalls >> 32;
|
||||
}
|
||||
} break;
|
||||
default: {
|
||||
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
|
||||
std::abort();
|
||||
} break;
|
||||
}
|
||||
} else {
|
||||
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void Core::set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid) {
|
||||
__unused (tid);
|
||||
switch (addr) {
|
||||
case VX_CSR_FFLAGS:
|
||||
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F);
|
||||
break;
|
||||
case VX_CSR_FRM:
|
||||
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5);
|
||||
break;
|
||||
case VX_CSR_FCSR:
|
||||
fcsrs_.at(wid) = value & 0xff;
|
||||
break;
|
||||
case VX_CSR_SATP:
|
||||
case VX_CSR_MSTATUS:
|
||||
case VX_CSR_MEDELEG:
|
||||
case VX_CSR_MIDELEG:
|
||||
case VX_CSR_MIE:
|
||||
case VX_CSR_MTVEC:
|
||||
case VX_CSR_MEPC:
|
||||
case VX_CSR_PMPCFG0:
|
||||
case VX_CSR_PMPADDR0:
|
||||
case VX_CSR_MNSTATUS:
|
||||
break;
|
||||
default:
|
||||
{
|
||||
std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Core::trigger_ecall() {
|
||||
active_warps_.reset();
|
||||
exited_ = true;
|
||||
}
|
||||
|
||||
void Core::trigger_ebreak() {
|
||||
active_warps_.reset();
|
||||
exited_ = true;
|
||||
}
|
||||
|
||||
bool Core::check_exit(Word* exitcode, bool riscv_test) const {
|
||||
if (exited_) {
|
||||
Word ec = warps_.at(0)->getIRegValue(3);
|
||||
if (riscv_test) {
|
||||
*exitcode = (1 - ec);
|
||||
} else {
|
||||
*exitcode = ec;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
int Core::get_exitcode() const {
|
||||
return emulator_.get_exitcode();
|
||||
}
|
||||
|
||||
bool Core::running() const {
|
||||
return (pending_instrs_ != 0);
|
||||
return emulator_.running() || (pending_instrs_ != 0);
|
||||
}
|
||||
|
||||
void Core::resume() {
|
||||
stalled_warps_.reset();
|
||||
void Core::resume(uint32_t wid) {
|
||||
emulator_.resume(wid);
|
||||
}
|
||||
|
||||
void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
|
||||
emulator_.barrier(bar_id, count, wid);
|
||||
}
|
||||
|
||||
void Core::attach_ram(RAM* ram) {
|
||||
// bind RAM to memory unit
|
||||
#if (XLEN == 64)
|
||||
mmu_.attach(*ram, 0, 0xFFFFFFFFFFFFFFFF);
|
||||
#else
|
||||
mmu_.attach(*ram, 0, 0xFFFFFFFF);
|
||||
#endif
|
||||
emulator_.attach_ram(ram);
|
||||
}
|
||||
|
|
|
@ -13,21 +13,10 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <list>
|
||||
#include <stack>
|
||||
#include <queue>
|
||||
#include <unordered_map>
|
||||
#include <memory>
|
||||
#include <set>
|
||||
#include <simobject.h>
|
||||
#include <mem.h>
|
||||
#include "debug.h"
|
||||
#include "types.h"
|
||||
#include "arch.h"
|
||||
#include "decode.h"
|
||||
#include "warp.h"
|
||||
#include "emulator.h"
|
||||
#include "pipeline.h"
|
||||
#include "cache_sim.h"
|
||||
#include "local_mem.h"
|
||||
|
@ -35,12 +24,13 @@
|
|||
#include "scoreboard.h"
|
||||
#include "operand.h"
|
||||
#include "dispatcher.h"
|
||||
#include "exe_unit.h"
|
||||
#include "dcrs.h"
|
||||
#include "func_unit.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Socket;
|
||||
class Arch;
|
||||
class DCRS;
|
||||
|
||||
using TraceSwitch = Mux<instr_trace_t*>;
|
||||
|
||||
|
@ -108,49 +98,31 @@ public:
|
|||
|
||||
bool running() const;
|
||||
|
||||
void resume();
|
||||
void resume(uint32_t wid);
|
||||
|
||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
|
||||
|
||||
uint32_t id() const {
|
||||
return core_id_;
|
||||
}
|
||||
|
||||
Socket* socket() const {
|
||||
return socket_;
|
||||
}
|
||||
|
||||
const Arch& arch() const {
|
||||
return arch_;
|
||||
}
|
||||
|
||||
const DCRS& dcrs() const {
|
||||
return dcrs_;
|
||||
Socket* socket() const {
|
||||
return socket_;
|
||||
}
|
||||
|
||||
uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
|
||||
|
||||
void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);
|
||||
const LocalMem::Ptr& local_mem() const {
|
||||
return local_mem_;
|
||||
}
|
||||
|
||||
void wspawn(uint32_t num_warps, Word nextPC);
|
||||
|
||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
|
||||
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
||||
AddrType get_addr_type(uint64_t addr);
|
||||
|
||||
void icache_read(void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
void dcache_read(void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
void dcache_write(const void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
void dcache_amo_reserve(uint64_t addr);
|
||||
|
||||
bool dcache_amo_check(uint64_t addr);
|
||||
|
||||
void trigger_ecall();
|
||||
|
||||
void trigger_ebreak();
|
||||
|
||||
bool check_exit(Word* exitcode, bool riscv_test) const;
|
||||
int get_exitcode() const;
|
||||
|
||||
private:
|
||||
|
||||
|
@ -160,27 +132,18 @@ private:
|
|||
void issue();
|
||||
void execute();
|
||||
void commit();
|
||||
|
||||
void writeToStdOut(const void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
void cout_flush();
|
||||
|
||||
uint32_t core_id_;
|
||||
Socket* socket_;
|
||||
const Arch& arch_;
|
||||
const DCRS &dcrs_;
|
||||
|
||||
const Decoder decoder_;
|
||||
MemoryUnit mmu_;
|
||||
|
||||
std::vector<std::shared_ptr<Warp>> warps_;
|
||||
std::vector<WarpMask> barriers_;
|
||||
std::vector<Byte> fcsrs_;
|
||||
Emulator emulator_;
|
||||
|
||||
std::vector<IBuffer> ibuffers_;
|
||||
Scoreboard scoreboard_;
|
||||
std::vector<Operand::Ptr> operands_;
|
||||
std::vector<Dispatcher::Ptr> dispatchers_;
|
||||
std::vector<ExeUnit::Ptr> exe_units_;
|
||||
std::vector<FuncUnit::Ptr> func_units_;
|
||||
LocalMem::Ptr local_mem_;
|
||||
std::vector<LocalMemDemux::Ptr> lmem_demuxs_;
|
||||
|
||||
|
@ -188,16 +151,9 @@ private:
|
|||
PipelineLatch decode_latch_;
|
||||
|
||||
HashTable<instr_trace_t*> pending_icache_;
|
||||
WarpMask active_warps_;
|
||||
WarpMask stalled_warps_;
|
||||
uint64_t pending_instrs_;
|
||||
bool exited_;
|
||||
|
||||
uint64_t pending_ifetches_;
|
||||
|
||||
std::unordered_map<int, std::stringstream> print_bufs_;
|
||||
|
||||
std::vector<std::vector<CSRs>> csrs_;
|
||||
|
||||
PerfStats perf_stats_;
|
||||
|
||||
|
@ -206,7 +162,6 @@ private:
|
|||
uint32_t commit_exe_;
|
||||
uint32_t ibuffer_idx_;
|
||||
|
||||
friend class Warp;
|
||||
friend class LsuUnit;
|
||||
friend class AluUnit;
|
||||
friend class FpuUnit;
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
#include <util.h>
|
||||
#include "debug.h"
|
||||
#include "types.h"
|
||||
#include "decode.h"
|
||||
#include "emulator.h"
|
||||
#include "arch.h"
|
||||
#include "instr.h"
|
||||
|
||||
|
@ -435,9 +435,7 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
|
|||
}
|
||||
}
|
||||
|
||||
Decoder::Decoder(const Arch&) {}
|
||||
|
||||
std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
|
||||
std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
|
||||
auto instr = std::make_shared<Instr>();
|
||||
auto op = Opcode((code >> shift_opcode) & mask_opcode);
|
||||
instr->setOpcode(op);
|
||||
|
|
|
@ -1,31 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Arch;
|
||||
class Instr;
|
||||
|
||||
class Decoder {
|
||||
public:
|
||||
Decoder(const Arch &);
|
||||
|
||||
std::shared_ptr<Instr> decode(uint32_t code) const;
|
||||
};
|
||||
|
||||
}
|
|
@ -15,6 +15,7 @@
|
|||
|
||||
#include "instr_trace.h"
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
|
|
527
sim/simx/emulator.cpp
Normal file
527
sim/simx/emulator.cpp
Normal file
|
@ -0,0 +1,527 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <util.h>
|
||||
|
||||
#include "emulator.h"
|
||||
#include "instr_trace.h"
|
||||
#include "instr.h"
|
||||
#include "dcrs.h"
|
||||
#include "core.h"
|
||||
#include "socket.h"
|
||||
#include "cluster.h"
|
||||
#include "processor_impl.h"
|
||||
#include "local_mem.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
Emulator::ipdom_entry_t::ipdom_entry_t(const ThreadMask &tmask, Word PC)
|
||||
: tmask(tmask)
|
||||
, PC(PC)
|
||||
, fallthrough(false)
|
||||
{}
|
||||
|
||||
// Builds a stack entry that records only a thread mask; such entries are
// flagged as `fallthrough`.
//
// PC carries no meaning for this kind of entry, but it is still given a
// defined value: the entry is stored by value in a std::stack, and copying an
// object with an indeterminate scalar member is undefined behaviour.
Emulator::ipdom_entry_t::ipdom_entry_t(const ThreadMask &tmask)
  : tmask(tmask)
  , PC(0)
  , fallthrough(true)
{}
|
||||
|
||||
// Allocates the per-thread integer and floating-point register files
// (num_threads rows of num_regs entries each, value-initialised to zero).
//
// PC and fcsr are explicitly zeroed as well. Emulator's constructor builds its
// warp vector with `warps_(arch.num_warps(), arch)`, which converts `arch` to
// a prototype warp_t and copies it; without these initialisers that copy would
// read indeterminate values before clear() assigns the real ones.
//
// The constructor is intentionally left non-explicit because that vector
// initialisation relies on the implicit Arch -> warp_t conversion.
Emulator::warp_t::warp_t(const Arch& arch)
  : PC(0)
  , ireg_file(arch.num_threads(), std::vector<Word>(arch.num_regs()))
  , freg_file(arch.num_threads(), std::vector<uint64_t>(arch.num_regs()))
  , fcsr(0)
{}
|
||||
|
||||
void Emulator::warp_t::clear(const Arch& arch, const DCRS &dcrs) {
|
||||
this->PC = dcrs.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
|
||||
#if (XLEN == 64)
|
||||
this->PC = (uint64_t(dcrs.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_;
|
||||
#endif
|
||||
this->tmask.reset();
|
||||
for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) {
|
||||
for (auto& reg : this->ireg_file.at(i)) {
|
||||
reg = 0;
|
||||
}
|
||||
for (auto& reg : this->freg_file.at(i)) {
|
||||
reg = 0;
|
||||
}
|
||||
}
|
||||
this->fcsr = 0;
|
||||
this->uui_gen.reset();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Binds the emulator to its architecture description, DCR block and owning
// core, allocates one warp_t per hardware warp and one (empty) warp mask per
// barrier, then puts everything into the reset state via clear().
Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
  : arch_(arch)
  , dcrs_(dcrs)
  , core_(core)
  , warps_(arch.num_warps(), warp_t(arch))
  , barriers_(arch.num_barriers(), WarpMask())
{
  clear();
}
|
||||
|
||||
// Emits any console text that is still buffered (lines that never received a
// terminating newline) before the emulator goes away.
Emulator::~Emulator() {
  cout_flush();
}
|
||||
|
||||
void Emulator::clear() {
|
||||
for (auto& warp : warps_) {
|
||||
warp.clear(arch_, dcrs_);
|
||||
}
|
||||
|
||||
for (auto& barrier : barriers_) {
|
||||
barrier.reset();
|
||||
}
|
||||
|
||||
stalled_warps_.reset();
|
||||
active_warps_.reset();
|
||||
|
||||
// activate first warp and thread
|
||||
active_warps_.set(0);
|
||||
warps_[0].tmask.set(0);
|
||||
}
|
||||
|
||||
// Maps `ram` into the emulator's memory unit over the whole address range
// of the configured XLEN (0 .. 2^XLEN - 1). `ram` is dereferenced
// unconditionally, so it must not be null.
void Emulator::attach_ram(RAM* ram) {
#if (XLEN == 64)
  const uint64_t last_addr = 0xFFFFFFFFFFFFFFFF;
#else
  const uint64_t last_addr = 0xFFFFFFFF;
#endif
  mmu_.attach(*ram, 0, last_addr);
}
|
||||
|
||||
// Emulates one instruction.
//
// Selects the lowest-numbered warp that is active and not stalled, fetches the
// 32-bit instruction word at its PC, decodes it, and executes it into a newly
// allocated instr_trace_t.
//
// Returns the heap-allocated trace (created with `new`; this function does not
// free it), or nullptr when no warp is ready. Aborts the process if the fetched
// word cannot be decoded.
instr_trace_t* Emulator::step() {
  // warp selection: first active, non-stalled warp wins
  int sel_wid = -1;
  const size_t total_warps = arch_.num_warps();
  for (size_t w = 0; w < total_warps; ++w) {
    if (!active_warps_.test(w) || stalled_warps_.test(w))
      continue;
    sel_wid = w;
    break;
  }
  if (sel_wid < 0)
    return nullptr;

  auto& warp = warps_.at(sel_wid);
  assert(warp.tmask.any());

#ifndef NDEBUG
  // Debug builds tag each trace with an id assembled from the generator's
  // value: its upper 16 bits go above bit 32, the global warp id sits at
  // bit 16, and its lower 16 bits fill the bottom.
  uint32_t raw_uuid = warp.uui_gen.get_uuid(warp.PC);
  uint32_t global_wid = core_->id() * arch_.num_warps() + sel_wid;
  uint32_t uuid_lo = raw_uuid & 0xffff;
  uint32_t uuid_hi = raw_uuid >> 16;
  uint64_t uuid = (uint64_t(uuid_hi) << 32) | (global_wid << 16) | uuid_lo;
#else
  uint64_t uuid = 0;
#endif

  DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << sel_wid << ", tmask=");
  for (uint32_t t = 0, nt = arch_.num_threads(); t < nt; ++t) {
    DPN(1, warp.tmask.test(t));
  }
  DPN(1, ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << uuid << ")" << std::endl);

  // fetch
  uint32_t instr_code = 0;
  this->icache_read(&instr_code, warp.PC, sizeof(uint32_t));

  // decode
  auto instr = this->decode(instr_code);
  if (!instr) {
    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << warp.PC << " (#" << std::dec << uuid << ")" << std::endl;
    std::abort();
  }

  DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);

  // execute into a fresh trace record
  auto trace = new instr_trace_t(uuid, arch_);
  this->execute(*instr, sel_wid, trace);

  // verbose dump of both register files, one row per register
  DP(5, "Register state:");
  for (uint32_t reg = 0; reg < arch_.num_regs(); ++reg) {
    DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << reg << ':');
    // integer registers, one column per thread
    for (uint32_t tid = 0; tid < arch_.num_threads(); ++tid) {
      DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << warp.ireg_file.at(tid).at(reg) << std::setfill(' ') << ' ');
    }
    DPN(5, '|');
    // floating-point registers, one column per thread
    for (uint32_t tid = 0; tid < arch_.num_threads(); ++tid) {
      DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << warp.freg_file.at(tid).at(reg) << std::setfill(' ') << ' ');
    }
    DPN(5, std::endl);
  }

  return trace;
}
|
||||
|
||||
bool Emulator::running() const {
|
||||
return active_warps_.any();
|
||||
}
|
||||
|
||||
int Emulator::get_exitcode() const {
|
||||
return warps_.at(0).ireg_file.at(0).at(3);
|
||||
}
|
||||
|
||||
// Marks warp `wid` as stalled so step() will not schedule it.
// The warp is expected not to be stalled already (checked in debug builds).
void Emulator::suspend(uint32_t wid) {
  assert(!stalled_warps_.test(wid));
  stalled_warps_.set(wid, true);
}
|
||||
|
||||
// Clears the stalled flag of warp `wid`, making it schedulable again.
// The special value 0xffffffff un-stalls every warp at once; for any other
// value the warp is expected to be currently stalled (checked in debug builds).
void Emulator::resume(uint32_t wid) {
  if (wid == 0xffffffff) {
    stalled_warps_.reset();
    return;
  }
  assert(stalled_warps_.test(wid));
  stalled_warps_.reset(wid);
}
|
||||
|
||||
void Emulator::wspawn(uint32_t num_warps, Word nextPC) {
|
||||
uint32_t active_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
|
||||
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC);
|
||||
for (uint32_t i = 1; i < active_warps; ++i) {
|
||||
auto& warp = warps_.at(i);
|
||||
warp.PC = nextPC;
|
||||
warp.tmask.set(0);
|
||||
active_warps_.set(i);
|
||||
}
|
||||
}
|
||||
|
||||
void Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
|
||||
uint32_t bar_idx = bar_id & 0x7fffffff;
|
||||
bool is_global = (bar_id >> 31);
|
||||
|
||||
auto& barrier = barriers_.at(bar_idx);
|
||||
barrier.set(wid);
|
||||
DP(3, "*** Suspend core #" << core_->id() << ", warp #" << wid << " at barrier #" << bar_idx);
|
||||
|
||||
if (is_global) {
|
||||
// global barrier handling
|
||||
if (barrier.count() == active_warps_.count()) {
|
||||
core_->socket()->barrier(bar_idx, count, core_->id());
|
||||
barrier.reset();
|
||||
}
|
||||
} else {
|
||||
// local barrier handling
|
||||
if (barrier.count() == (size_t)count) {
|
||||
// resume suspended warps
|
||||
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
|
||||
if (barrier.test(i)) {
|
||||
DP(3, "*** Resume core #" << core_->id() << ", warp #" << i << " at barrier #" << bar_idx);
|
||||
stalled_warps_.reset(i);
|
||||
}
|
||||
}
|
||||
barrier.reset();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Emulator::icache_read(void *data, uint64_t addr, uint32_t size) {
|
||||
mmu_.read(data, addr, size, 0);
|
||||
}
|
||||
|
||||
void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) {
|
||||
auto type = get_addr_type(addr);
|
||||
if (type == AddrType::Shared) {
|
||||
core_->local_mem()->read(data, addr, size);
|
||||
} else {
|
||||
mmu_.read(data, addr, size, 0);
|
||||
}
|
||||
|
||||
DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
|
||||
}
|
||||
|
||||
void Emulator::dcache_write(const void* data, uint64_t addr, uint32_t size) {
|
||||
auto type = get_addr_type(addr);
|
||||
if (addr >= uint64_t(IO_COUT_ADDR)
|
||||
&& addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
this->writeToStdOut(data, addr, size);
|
||||
} else {
|
||||
if (type == AddrType::Shared) {
|
||||
core_->local_mem()->write(data, addr, size);
|
||||
} else {
|
||||
mmu_.write(data, addr, size, 0);
|
||||
}
|
||||
}
|
||||
DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
|
||||
}
|
||||
|
||||
void Emulator::dcache_amo_reserve(uint64_t addr) {
|
||||
auto type = get_addr_type(addr);
|
||||
if (type == AddrType::Global) {
|
||||
mmu_.amo_reserve(addr);
|
||||
}
|
||||
}
|
||||
|
||||
bool Emulator::dcache_amo_check(uint64_t addr) {
|
||||
auto type = get_addr_type(addr);
|
||||
if (type == AddrType::Global) {
|
||||
return mmu_.amo_check(addr);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void Emulator::writeToStdOut(const void* data, uint64_t addr, uint32_t size) {
|
||||
if (size != 1)
|
||||
std::abort();
|
||||
uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
|
||||
auto& ss_buf = print_bufs_[tid];
|
||||
char c = *(char*)data;
|
||||
ss_buf << c;
|
||||
if (c == '\n') {
|
||||
std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush;
|
||||
ss_buf.str("");
|
||||
}
|
||||
}
|
||||
|
||||
void Emulator::cout_flush() {
|
||||
for (auto& buf : print_bufs_) {
|
||||
auto str = buf.second.str();
|
||||
if (!str.empty()) {
|
||||
std::cout << "#" << buf.first << ": " << str << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
auto core_perf = core_->perf_stats();
|
||||
switch (addr) {
|
||||
case VX_CSR_SATP:
|
||||
case VX_CSR_PMPCFG0:
|
||||
case VX_CSR_PMPADDR0:
|
||||
case VX_CSR_MSTATUS:
|
||||
case VX_CSR_MISA:
|
||||
case VX_CSR_MEDELEG:
|
||||
case VX_CSR_MIDELEG:
|
||||
case VX_CSR_MIE:
|
||||
case VX_CSR_MTVEC:
|
||||
case VX_CSR_MEPC:
|
||||
case VX_CSR_MNSTATUS:
|
||||
return 0;
|
||||
|
||||
case VX_CSR_FFLAGS:
|
||||
return warps_.at(wid).fcsr & 0x1F;
|
||||
case VX_CSR_FRM:
|
||||
return (warps_.at(wid).fcsr >> 5);
|
||||
case VX_CSR_FCSR:
|
||||
return warps_.at(wid).fcsr;
|
||||
case VX_CSR_MHARTID: // global thread ID
|
||||
return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
|
||||
case VX_CSR_THREAD_ID: // thread ID
|
||||
return tid;
|
||||
case VX_CSR_WARP_ID: // warp ID
|
||||
return wid;
|
||||
case VX_CSR_CORE_ID: // core ID
|
||||
return core_->id();
|
||||
case VX_CSR_THREAD_MASK: // thread mask
|
||||
return warps_.at(wid).tmask.to_ulong();
|
||||
case VX_CSR_WARP_MASK: // active warps
|
||||
return active_warps_.to_ulong();
|
||||
case VX_CSR_NUM_THREADS: // Number of threads per warp
|
||||
return arch_.num_threads();
|
||||
case VX_CSR_NUM_WARPS: // Number of warps per core
|
||||
return arch_.num_warps();
|
||||
case VX_CSR_NUM_CORES: // Number of cores per cluster
|
||||
return uint32_t(arch_.num_cores()) * arch_.num_clusters();
|
||||
case VX_CSR_MCYCLE: // NumCycles
|
||||
return core_perf.cycles & 0xffffffff;
|
||||
case VX_CSR_MCYCLE_H: // NumCycles
|
||||
return (uint32_t)(core_perf.cycles >> 32);
|
||||
case VX_CSR_MINSTRET: // NumInsts
|
||||
return core_perf.instrs & 0xffffffff;
|
||||
case VX_CSR_MINSTRET_H: // NumInsts
|
||||
return (uint32_t)(core_perf.instrs >> 32);
|
||||
default:
|
||||
if ((addr >= VX_CSR_MPM_BASE && addr < (VX_CSR_MPM_BASE + 32))
|
||||
|| (addr >= VX_CSR_MPM_BASE_H && addr < (VX_CSR_MPM_BASE_H + 32))) {
|
||||
// user-defined MPM CSRs
|
||||
auto perf_class = dcrs_.base_dcrs.read(VX_DCR_BASE_MPM_CLASS);
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_NONE:
|
||||
break;
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
switch (addr) {
|
||||
case VX_CSR_MPM_SCHED_ID: return core_perf.sched_idle & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ID_H:return core_perf.sched_idle >> 32;
|
||||
case VX_CSR_MPM_SCHED_ST: return core_perf.sched_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ST_H:return core_perf.sched_stalls >> 32;
|
||||
case VX_CSR_MPM_IBUF_ST: return core_perf.ibuf_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_IBUF_ST_H: return core_perf.ibuf_stalls >> 32;
|
||||
case VX_CSR_MPM_SCRB_ST: return core_perf.scrb_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_ST_H: return core_perf.scrb_stalls >> 32;
|
||||
case VX_CSR_MPM_SCRB_ALU: return core_perf.scrb_alu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_ALU_H:return core_perf.scrb_alu >> 32;
|
||||
case VX_CSR_MPM_SCRB_FPU: return core_perf.scrb_fpu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_FPU_H:return core_perf.scrb_fpu >> 32;
|
||||
case VX_CSR_MPM_SCRB_LSU: return core_perf.scrb_lsu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_LSU_H:return core_perf.scrb_lsu >> 32;
|
||||
case VX_CSR_MPM_SCRB_SFU: return core_perf.scrb_sfu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_SFU_H:return core_perf.scrb_sfu >> 32;
|
||||
case VX_CSR_MPM_SCRB_WCTL: return core_perf.scrb_wctl & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_WCTL_H: return core_perf.scrb_wctl >> 32;
|
||||
case VX_CSR_MPM_SCRB_CSRS: return core_perf.scrb_csrs & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_CSRS_H: return core_perf.scrb_csrs >> 32;
|
||||
case VX_CSR_MPM_IFETCHES: return core_perf.ifetches & 0xffffffff;
|
||||
case VX_CSR_MPM_IFETCHES_H: return core_perf.ifetches >> 32;
|
||||
case VX_CSR_MPM_LOADS: return core_perf.loads & 0xffffffff;
|
||||
case VX_CSR_MPM_LOADS_H: return core_perf.loads >> 32;
|
||||
case VX_CSR_MPM_STORES: return core_perf.stores & 0xffffffff;
|
||||
case VX_CSR_MPM_STORES_H: return core_perf.stores >> 32;
|
||||
case VX_CSR_MPM_IFETCH_LT: return core_perf.ifetch_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_IFETCH_LT_H: return core_perf.ifetch_latency >> 32;
|
||||
case VX_CSR_MPM_LOAD_LT: return core_perf.load_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_LOAD_LT_H: return core_perf.load_latency >> 32;
|
||||
}
|
||||
} break;
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
auto proc_perf = core_->socket()->cluster()->processor()->perf_stats();
|
||||
auto cluster_perf = core_->socket()->cluster()->perf_stats();
|
||||
auto socket_perf = core_->socket()->perf_stats();
|
||||
auto lmem_perf = core_->local_mem()->perf_stats();
|
||||
switch (addr) {
|
||||
case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
|
||||
case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_WRITES: return proc_perf.l3cache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_WRITES_H: return proc_perf.l3cache.writes >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_MISS_R: return proc_perf.l3cache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_MISS_R_H: return proc_perf.l3cache.read_misses >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_MISS_W: return proc_perf.l3cache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_MISS_W_H: return proc_perf.l3cache.write_misses >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_BANK_ST: return proc_perf.l3cache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_BANK_ST_H:return proc_perf.l3cache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
||||
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
|
||||
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
|
||||
|
||||
case VX_CSR_MPM_LMEM_READS: return lmem_perf.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_LMEM_READS_H: return lmem_perf.reads >> 32;
|
||||
case VX_CSR_MPM_LMEM_WRITES: return lmem_perf.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_LMEM_WRITES_H: return lmem_perf.writes >> 32;
|
||||
case VX_CSR_MPM_LMEM_BANK_ST: return lmem_perf.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_LMEM_BANK_ST_H: return lmem_perf.bank_stalls >> 32;
|
||||
}
|
||||
} break;
|
||||
default: {
|
||||
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
|
||||
std::abort();
|
||||
} break;
|
||||
}
|
||||
} else {
|
||||
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
void Emulator::set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid) {
|
||||
__unused (tid);
|
||||
switch (addr) {
|
||||
case VX_CSR_FFLAGS:
|
||||
warps_.at(wid).fcsr = (warps_.at(wid).fcsr & ~0x1F) | (value & 0x1F);
|
||||
break;
|
||||
case VX_CSR_FRM:
|
||||
warps_.at(wid).fcsr = (warps_.at(wid).fcsr & ~0xE0) | (value << 5);
|
||||
break;
|
||||
case VX_CSR_FCSR:
|
||||
warps_.at(wid).fcsr = value & 0xff;
|
||||
break;
|
||||
case VX_CSR_SATP:
|
||||
case VX_CSR_MSTATUS:
|
||||
case VX_CSR_MEDELEG:
|
||||
case VX_CSR_MIDELEG:
|
||||
case VX_CSR_MIE:
|
||||
case VX_CSR_MTVEC:
|
||||
case VX_CSR_MEPC:
|
||||
case VX_CSR_PMPCFG0:
|
||||
case VX_CSR_PMPADDR0:
|
||||
case VX_CSR_MNSTATUS:
|
||||
break;
|
||||
default:
|
||||
{
|
||||
std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t Emulator::get_fpu_rm(uint32_t func3, uint32_t tid, uint32_t wid) {
|
||||
return (func3 == 0x7) ? this->get_csr(VX_CSR_FRM, tid, wid) : func3;
|
||||
}
|
||||
|
||||
void Emulator::update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid) {
|
||||
if (fflags) {
|
||||
this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid);
|
||||
this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid);
|
||||
}
|
||||
}
|
||||
|
||||
void Emulator::trigger_ecall() {
|
||||
active_warps_.reset();
|
||||
}
|
||||
|
||||
void Emulator::trigger_ebreak() {
|
||||
active_warps_.reset();
|
||||
}
|
126
sim/simx/emulator.h
Normal file
126
sim/simx/emulator.h
Normal file
|
@ -0,0 +1,126 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef __WARP_H
|
||||
#define __WARP_H
|
||||
|
||||
#include <vector>
|
||||
#include <stack>
|
||||
#include <mem.h>
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Arch;
|
||||
class DCRS;
|
||||
class Core;
|
||||
class Instr;
|
||||
class instr_trace_t;
|
||||
|
||||
class Emulator {
|
||||
public:
|
||||
Emulator(const Arch &arch,
|
||||
const DCRS &dcrs,
|
||||
Core* core);
|
||||
|
||||
~Emulator();
|
||||
|
||||
void clear();
|
||||
|
||||
void attach_ram(RAM* ram);
|
||||
|
||||
instr_trace_t* step();
|
||||
|
||||
bool running() const;
|
||||
|
||||
void suspend(uint32_t wid);
|
||||
|
||||
void resume(uint32_t wid);
|
||||
|
||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
|
||||
|
||||
int get_exitcode() const;
|
||||
|
||||
private:
|
||||
|
||||
struct ipdom_entry_t {
|
||||
ipdom_entry_t(const ThreadMask &tmask, Word PC);
|
||||
ipdom_entry_t(const ThreadMask &tmask);
|
||||
|
||||
ThreadMask tmask;
|
||||
Word PC;
|
||||
bool fallthrough;
|
||||
};
|
||||
|
||||
struct warp_t {
|
||||
warp_t(const Arch& arch);
|
||||
|
||||
void clear(const Arch& arch, const DCRS &dcrs);
|
||||
|
||||
Word PC;
|
||||
ThreadMask tmask;
|
||||
|
||||
std::vector<std::vector<Word>> ireg_file;
|
||||
std::vector<std::vector<uint64_t>> freg_file;
|
||||
std::stack<ipdom_entry_t> ipdom_stack;
|
||||
Byte fcsr;
|
||||
|
||||
UUIDGenerator uui_gen;
|
||||
};
|
||||
|
||||
std::shared_ptr<Instr> decode(uint32_t code) const;
|
||||
|
||||
void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace);
|
||||
|
||||
void wspawn(uint32_t num_warps, Word nextPC);
|
||||
|
||||
void icache_read(void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
void dcache_read(void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
void dcache_write(const void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
void dcache_amo_reserve(uint64_t addr);
|
||||
|
||||
bool dcache_amo_check(uint64_t addr);
|
||||
|
||||
void writeToStdOut(const void* data, uint64_t addr, uint32_t size);
|
||||
|
||||
void cout_flush();
|
||||
|
||||
uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
|
||||
|
||||
void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);
|
||||
|
||||
uint32_t get_fpu_rm(uint32_t func3, uint32_t tid, uint32_t wid);
|
||||
|
||||
void update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid);
|
||||
|
||||
void trigger_ecall();
|
||||
|
||||
void trigger_ebreak();
|
||||
|
||||
const Arch& arch_;
|
||||
const DCRS& dcrs_;
|
||||
Core* core_;
|
||||
std::vector<warp_t> warps_;
|
||||
WarpMask active_warps_;
|
||||
WarpMask stalled_warps_;
|
||||
std::vector<WarpMask> barriers_;
|
||||
std::unordered_map<int, std::stringstream> print_bufs_;
|
||||
MemoryUnit mmu_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
|
@ -22,7 +22,7 @@
|
|||
#include <assert.h>
|
||||
#include <util.h>
|
||||
#include <rvfloats.h>
|
||||
#include "warp.h"
|
||||
#include "emulator.h"
|
||||
#include "instr.h"
|
||||
#include "core.h"
|
||||
|
||||
|
@ -40,17 +40,6 @@ union reg_data_t {
|
|||
int64_t i64;
|
||||
};
|
||||
|
||||
inline uint32_t get_fpu_rm(uint32_t func3, Core* core, uint32_t tid, uint32_t wid) {
|
||||
return (func3 == 0x7) ? core->get_csr(VX_CSR_FRM, tid, wid) : func3;
|
||||
}
|
||||
|
||||
inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid) {
|
||||
if (fflags) {
|
||||
core->set_csr(VX_CSR_FCSR, core->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid);
|
||||
core->set_csr(VX_CSR_FFLAGS, core->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid);
|
||||
}
|
||||
}
|
||||
|
||||
inline uint64_t nan_box(uint32_t value) {
|
||||
uint64_t mask = 0xffffffff00000000;
|
||||
return value | mask;
|
||||
|
@ -66,11 +55,20 @@ inline int64_t check_boxing(int64_t a) {
|
|||
return nan_box(0x7fc00000); // NaN
|
||||
}
|
||||
|
||||
void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
||||
assert(tmask_.any());
|
||||
void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
||||
auto& warp = warps_.at(wid);
|
||||
assert(warp.tmask.any());
|
||||
|
||||
auto next_pc = PC_ + 4;
|
||||
auto next_tmask = tmask_;
|
||||
// initialize instruction trace
|
||||
trace->cid = core_->id();
|
||||
trace->wid = wid;
|
||||
trace->PC = warp.PC;
|
||||
trace->tmask = warp.tmask;
|
||||
trace->rdest = instr.getRDest();
|
||||
trace->rdest_type = instr.getRDType();
|
||||
|
||||
auto next_pc = warp.PC + 4;
|
||||
auto next_tmask = warp.tmask;
|
||||
|
||||
auto opcode = instr.getOpcode();
|
||||
auto func2 = instr.getFunc2();
|
||||
|
@ -86,7 +84,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
|
||||
uint32_t thread_start = 0;
|
||||
for (; thread_start < num_threads; ++thread_start) {
|
||||
if (tmask_.test(thread_start))
|
||||
if (warp.tmask.test(thread_start))
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -103,11 +101,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
if (t) DPN(2, ", ");
|
||||
if (!tmask_.test(t)) {
|
||||
if (!warp.tmask.test(t)) {
|
||||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
rsdata[t][i].u = ireg_file_.at(t)[reg];
|
||||
rsdata[t][i].u = warp.ireg_file.at(t)[reg];
|
||||
DPN(2, "0x" << std::hex << rsdata[t][i].i);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
|
@ -116,11 +114,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
if (t) DPN(2, ", ");
|
||||
if (!tmask_.test(t)) {
|
||||
if (!warp.tmask.test(t)) {
|
||||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
rsdata[t][i].u64 = freg_file_.at(t)[reg];
|
||||
rsdata[t][i].u64 = warp.freg_file.at(t)[reg];
|
||||
DPN(2, "0x" << std::hex << rsdata[t][i].f);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
|
@ -139,7 +137,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->fu_type = FUType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
rddata[t].i = immsrc;
|
||||
}
|
||||
|
@ -151,9 +149,9 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->fu_type = FUType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
rddata[t].i = immsrc + PC_;
|
||||
rddata[t].i = immsrc + warp.PC;
|
||||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
|
@ -164,7 +162,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
if (func7 & 0x1) {
|
||||
switch (func3) {
|
||||
|
@ -324,7 +322,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
|
@ -385,7 +383,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
if (func7 & 0x1) {
|
||||
switch (func3) {
|
||||
|
@ -511,7 +509,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
|
@ -555,48 +553,48 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
// RV32I: BEQ
|
||||
if (rsdata[t][0].i == rsdata[t][1].i) {
|
||||
next_pc = PC_ + immsrc;
|
||||
next_pc = warp.PC + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
// RV32I: BNE
|
||||
if (rsdata[t][0].i != rsdata[t][1].i) {
|
||||
next_pc = PC_ + immsrc;
|
||||
next_pc = warp.PC + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
// RV32I: BLT
|
||||
if (rsdata[t][0].i < rsdata[t][1].i) {
|
||||
next_pc = PC_ + immsrc;
|
||||
next_pc = warp.PC + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 5: {
|
||||
// RV32I: BGE
|
||||
if (rsdata[t][0].i >= rsdata[t][1].i) {
|
||||
next_pc = PC_ + immsrc;
|
||||
next_pc = warp.PC + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 6: {
|
||||
// RV32I: BLTU
|
||||
if (rsdata[t][0].u < rsdata[t][1].u) {
|
||||
next_pc = PC_ + immsrc;
|
||||
next_pc = warp.PC + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 7: {
|
||||
// RV32I: BGEU
|
||||
if (rsdata[t][0].u >= rsdata[t][1].u) {
|
||||
next_pc = PC_ + immsrc;
|
||||
next_pc = warp.PC + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -613,11 +611,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->fu_type = FUType::ALU;
|
||||
trace->alu_type = AluType::BRANCH;
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
rddata[t].i = next_pc;
|
||||
}
|
||||
next_pc = PC_ + immsrc;
|
||||
next_pc = warp.PC + immsrc;
|
||||
trace->fetch_stall = true;
|
||||
rd_write = true;
|
||||
break;
|
||||
|
@ -628,7 +626,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->alu_type = AluType::BRANCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
rddata[t].i = next_pc;
|
||||
}
|
||||
|
@ -647,11 +645,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
uint32_t data_bytes = 1 << (func3 & 0x3);
|
||||
uint32_t data_width = 8 * data_bytes;
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
uint64_t mem_addr = rsdata[t][0].i + immsrc;
|
||||
uint64_t read_data = 0;
|
||||
core_->dcache_read(&read_data, mem_addr, data_bytes);
|
||||
this->dcache_read(&read_data, mem_addr, data_bytes);
|
||||
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
|
||||
switch (func3) {
|
||||
case 0: // RV32I: LB
|
||||
|
@ -691,7 +689,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->data = trace_data;
|
||||
uint32_t data_bytes = 1 << (func3 & 0x3);
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
uint64_t mem_addr = rsdata[t][0].i + immsrc;
|
||||
uint64_t write_data = rsdata[t][1].u64;
|
||||
|
@ -701,7 +699,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
case 1:
|
||||
case 2:
|
||||
case 3:
|
||||
core_->dcache_write(&write_data, mem_addr, data_bytes);
|
||||
this->dcache_write(&write_data, mem_addr, data_bytes);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
|
@ -720,26 +718,26 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
uint32_t data_bytes = 1 << (func3 & 0x3);
|
||||
uint32_t data_width = 8 * data_bytes;
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
uint64_t mem_addr = rsdata[t][0].u;
|
||||
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
|
||||
if (amo_type == 0x02) { // LR
|
||||
uint64_t read_data = 0;
|
||||
core_->dcache_read(&read_data, mem_addr, data_bytes);
|
||||
core_->dcache_amo_reserve(mem_addr);
|
||||
this->dcache_read(&read_data, mem_addr, data_bytes);
|
||||
this->dcache_amo_reserve(mem_addr);
|
||||
rddata[t].i = sext((Word)read_data, data_width);
|
||||
} else
|
||||
if (amo_type == 0x03) { // SC
|
||||
if (core_->dcache_amo_check(mem_addr)) {
|
||||
core_->dcache_write(&rsdata[t][1].u64, mem_addr, data_bytes);
|
||||
if (this->dcache_amo_check(mem_addr)) {
|
||||
this->dcache_write(&rsdata[t][1].u64, mem_addr, data_bytes);
|
||||
rddata[t].i = 0;
|
||||
} else {
|
||||
rddata[t].i = 1;
|
||||
}
|
||||
} else {
|
||||
uint64_t read_data = 0;
|
||||
core_->dcache_read(&read_data, mem_addr, data_bytes);
|
||||
this->dcache_read(&read_data, mem_addr, data_bytes);
|
||||
auto read_data_i = sext((WordI)read_data, data_width);
|
||||
auto rs1_data_i = sext((WordI)rsdata[t][1].u64, data_width);
|
||||
auto read_data_u = zext((Word)read_data, data_width);
|
||||
|
@ -776,7 +774,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
default:
|
||||
std::abort();
|
||||
}
|
||||
core_->dcache_write(&result, mem_addr, data_bytes);
|
||||
this->dcache_write(&result, mem_addr, data_bytes);
|
||||
rddata[t].i = read_data_i;
|
||||
}
|
||||
}
|
||||
|
@ -785,7 +783,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
}
|
||||
case Opcode::SYS: {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
uint32_t csr_addr = immsrc;
|
||||
uint32_t csr_value;
|
||||
|
@ -796,11 +794,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
switch (csr_addr) {
|
||||
case 0:
|
||||
// RV32I: ECALL
|
||||
core_->trigger_ecall();
|
||||
this->trigger_ecall();
|
||||
break;
|
||||
case 1:
|
||||
// RV32I: EBREAK
|
||||
core_->trigger_ebreak();
|
||||
this->trigger_ebreak();
|
||||
break;
|
||||
case 0x002: // URET
|
||||
case 0x102: // SRET
|
||||
|
@ -812,12 +810,12 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
} else {
|
||||
trace->fu_type = FUType::SFU;
|
||||
trace->fetch_stall = true;
|
||||
csr_value = core_->get_csr(csr_addr, t, warp_id_);
|
||||
csr_value = this->get_csr(csr_addr, t, wid);
|
||||
switch (func3) {
|
||||
case 1: {
|
||||
// RV32I: CSRRW
|
||||
rddata[t].i = csr_value;
|
||||
core_->set_csr(csr_addr, rsdata[t][0].i, t, warp_id_);
|
||||
this->set_csr(csr_addr, rsdata[t][0].i, t, wid);
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->sfu_type = SfuType::CSRRW;
|
||||
rd_write = true;
|
||||
|
@ -827,7 +825,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
// RV32I: CSRRS
|
||||
rddata[t].i = csr_value;
|
||||
if (rsdata[t][0].i != 0) {
|
||||
core_->set_csr(csr_addr, csr_value | rsdata[t][0].i, t, warp_id_);
|
||||
this->set_csr(csr_addr, csr_value | rsdata[t][0].i, t, wid);
|
||||
}
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->sfu_type = SfuType::CSRRS;
|
||||
|
@ -838,7 +836,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
// RV32I: CSRRC
|
||||
rddata[t].i = csr_value;
|
||||
if (rsdata[t][0].i != 0) {
|
||||
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0].i, t, warp_id_);
|
||||
this->set_csr(csr_addr, csr_value & ~rsdata[t][0].i, t, wid);
|
||||
}
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->sfu_type = SfuType::CSRRC;
|
||||
|
@ -848,7 +846,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
case 5: {
|
||||
// RV32I: CSRRWI
|
||||
rddata[t].i = csr_value;
|
||||
core_->set_csr(csr_addr, rsrc0, t, warp_id_);
|
||||
this->set_csr(csr_addr, rsrc0, t, wid);
|
||||
trace->sfu_type = SfuType::CSRRW;
|
||||
rd_write = true;
|
||||
break;
|
||||
|
@ -857,7 +855,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
// RV32I: CSRRSI;
|
||||
rddata[t].i = csr_value;
|
||||
if (rsrc0 != 0) {
|
||||
core_->set_csr(csr_addr, csr_value | rsrc0, t, warp_id_);
|
||||
this->set_csr(csr_addr, csr_value | rsrc0, t, wid);
|
||||
}
|
||||
trace->sfu_type = SfuType::CSRRS;
|
||||
rd_write = true;
|
||||
|
@ -867,7 +865,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
// RV32I: CSRRCI
|
||||
rddata[t].i = csr_value;
|
||||
if (rsrc0 != 0) {
|
||||
core_->set_csr(csr_addr, csr_value & ~rsrc0, t, warp_id_);
|
||||
this->set_csr(csr_addr, csr_value & ~rsrc0, t, wid);
|
||||
}
|
||||
trace->sfu_type = SfuType::CSRRC;
|
||||
rd_write = true;
|
||||
|
@ -889,9 +887,9 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
case Opcode::FCI: {
|
||||
trace->fu_type = FUType::FPU;
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
|
||||
uint32_t frm = this->get_fpu_rm(func3, t, wid);
|
||||
uint32_t fflags = 0;
|
||||
switch (func7) {
|
||||
case 0x00: { // RV32F: FADD.S
|
||||
|
@ -1206,7 +1204,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
}
|
||||
update_fcrs(fflags, core_, t, warp_id_);
|
||||
this->update_fcrs(fflags, t, wid);
|
||||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
|
@ -1220,9 +1218,9 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->used_fregs.set(rsrc1);
|
||||
trace->used_fregs.set(rsrc2);
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
|
||||
uint32_t frm = this->get_fpu_rm(func3, t, wid);
|
||||
uint32_t fflags = 0;
|
||||
switch (opcode) {
|
||||
case Opcode::FMADD:
|
||||
|
@ -1260,7 +1258,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
default:
|
||||
break;
|
||||
}
|
||||
update_fcrs(fflags, core_, t, warp_id_);
|
||||
this->update_fcrs(fflags, t, wid);
|
||||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
|
@ -1287,7 +1285,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
core_->wspawn(rsdata.at(thread_start)[0].i, rsdata.at(thread_start)[1].i);
|
||||
this->wspawn(rsdata.at(thread_start)[0].i, rsdata.at(thread_start)[1].i);
|
||||
} break;
|
||||
case 2: {
|
||||
// SPLIT
|
||||
|
@ -1298,23 +1296,23 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
|
||||
ThreadMask then_tmask, else_tmask;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
auto cond = ireg_file_.at(t).at(rsrc0);
|
||||
then_tmask[t] = tmask_.test(t) && cond;
|
||||
else_tmask[t] = tmask_.test(t) && !cond;
|
||||
auto cond = warp.ireg_file.at(t).at(rsrc0);
|
||||
then_tmask[t] = warp.tmask.test(t) && cond;
|
||||
else_tmask[t] = warp.tmask.test(t) && !cond;
|
||||
}
|
||||
|
||||
bool is_divergent = then_tmask.any() && else_tmask.any();
|
||||
if (is_divergent) {
|
||||
if (ipdom_stack_.size() == arch_.ipdom_size()) {
|
||||
std::cout << "IPDOM stack is full! size=" << std::dec << ipdom_stack_.size() << ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")\n" << std::dec << std::flush;
|
||||
if (warp.ipdom_stack.size() == arch_.ipdom_size()) {
|
||||
std::cout << "IPDOM stack is full! size=" << std::dec << warp.ipdom_stack.size() << ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::dec << std::flush;
|
||||
std::abort();
|
||||
}
|
||||
// set new thread mask
|
||||
next_tmask = then_tmask;
|
||||
// push reconvergence thread mask onto the stack
|
||||
ipdom_stack_.emplace(tmask_);
|
||||
warp.ipdom_stack.emplace(warp.tmask);
|
||||
// push else's thread mask onto the stack
|
||||
ipdom_stack_.emplace(else_tmask, next_pc);
|
||||
warp.ipdom_stack.emplace(else_tmask, next_pc);
|
||||
}
|
||||
// return divergent state
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
|
@ -1329,17 +1327,17 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
|
||||
int is_divergent = ireg_file_.at(thread_start).at(rsrc0);
|
||||
int is_divergent = warp.ireg_file.at(thread_start).at(rsrc0);
|
||||
if (is_divergent != 0) {
|
||||
if (ipdom_stack_.empty()) {
|
||||
if (warp.ipdom_stack.empty()) {
|
||||
std::cout << "IPDOM stack is empty!\n" << std::flush;
|
||||
std::abort();
|
||||
}
|
||||
next_tmask = ipdom_stack_.top().tmask;
|
||||
if (!ipdom_stack_.top().fallthrough) {
|
||||
next_pc = ipdom_stack_.top().PC;
|
||||
next_tmask = warp.ipdom_stack.top().tmask;
|
||||
if (!warp.ipdom_stack.top().fallthrough) {
|
||||
next_pc = warp.ipdom_stack.top().PC;
|
||||
}
|
||||
ipdom_stack_.pop();
|
||||
warp.ipdom_stack.pop();
|
||||
}
|
||||
} break;
|
||||
case 4: {
|
||||
|
@ -1360,12 +1358,12 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->fetch_stall = true;
|
||||
ThreadMask pred;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
pred[t] = tmask_.test(t) && (ireg_file_.at(t).at(rsrc0) & 0x1);
|
||||
pred[t] = warp.tmask.test(t) && (warp.ireg_file.at(t).at(rsrc0) & 0x1);
|
||||
}
|
||||
if (pred.any()) {
|
||||
next_tmask &= pred;
|
||||
} else {
|
||||
next_tmask = ireg_file_.at(thread_start).at(rsrc1);
|
||||
next_tmask = warp.ireg_file.at(thread_start).at(rsrc1);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
|
@ -1387,7 +1385,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc1);
|
||||
trace->used_iregs.set(rsrc2);
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
rddata[t].i = rsdata[t][0].i ? rsdata[t][1].i : rsdata[t][2].i;
|
||||
}
|
||||
|
@ -1414,11 +1412,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
DPH(2, "Dest Reg: " << type << std::dec << rdest << "={");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
if (t) DPN(2, ", ");
|
||||
if (!tmask_.test(t)) {
|
||||
if (!warp.tmask.test(t)) {
|
||||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
ireg_file_.at(t)[rdest] = rddata[t].i;
|
||||
warp.ireg_file.at(t)[rdest] = rddata[t].i;
|
||||
DPN(2, "0x" << std::hex << rddata[t].i);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
|
@ -1433,11 +1431,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
DPH(2, "Dest Reg: " << type << std::dec << rdest << "={");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
if (t) DPN(2, ", ");
|
||||
if (!tmask_.test(t)) {
|
||||
if (!warp.tmask.test(t)) {
|
||||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
freg_file_.at(t)[rdest] = rddata[t].u64;
|
||||
warp.freg_file.at(t)[rdest] = rddata[t].u64;
|
||||
DPN(2, "0x" << std::hex << rddata[t].f);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
|
@ -1449,19 +1447,21 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
|
|||
}
|
||||
}
|
||||
|
||||
PC_ += 4;
|
||||
if (PC_ != next_pc) {
|
||||
warp.PC += 4;
|
||||
|
||||
if (warp.PC != next_pc) {
|
||||
DP(3, "*** Next PC=0x" << std::hex << next_pc << std::dec);
|
||||
PC_ = next_pc;
|
||||
warp.PC = next_pc;
|
||||
}
|
||||
if (tmask_ != next_tmask) {
|
||||
|
||||
if (warp.tmask != next_tmask) {
|
||||
DPH(3, "*** New Tmask=");
|
||||
for (uint32_t i = 0; i < num_threads; ++i)
|
||||
DPN(3, next_tmask.test(i));
|
||||
DPN(3, std::endl);
|
||||
tmask_ = next_tmask;
|
||||
warp.tmask = next_tmask;
|
||||
if (!next_tmask.any()) {
|
||||
core_->active_warps_.reset(warp_id_);
|
||||
active_warps_.reset(wid);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -11,7 +11,7 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "exe_unit.h"
|
||||
#include "func_unit.h"
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
#include <string.h>
|
||||
|
@ -24,7 +24,7 @@
|
|||
|
||||
using namespace vortex;
|
||||
|
||||
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
|
||||
AluUnit::AluUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "ALU") {}
|
||||
|
||||
void AluUnit::tick() {
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
|
@ -48,8 +48,7 @@ void AluUnit::tick() {
|
|||
}
|
||||
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
|
||||
if (trace->eop && trace->fetch_stall) {
|
||||
assert(core_->stalled_warps_.test(trace->wid));
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
core_->resume(trace->wid);
|
||||
}
|
||||
input.pop();
|
||||
}
|
||||
|
@ -57,7 +56,7 @@ void AluUnit::tick() {
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
|
||||
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "FPU") {}
|
||||
|
||||
void FpuUnit::tick() {
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
|
@ -93,7 +92,7 @@ void FpuUnit::tick() {
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
|
||||
: ExeUnit(ctx, core, "LSU")
|
||||
: FuncUnit(ctx, core, "LSU")
|
||||
, pending_rd_reqs_(LSUQ_IN_SIZE)
|
||||
, num_lanes_(NUM_LSU_LANES)
|
||||
, pending_loads_(0)
|
||||
|
@ -230,7 +229,7 @@ void LsuUnit::tick() {
|
|||
|
||||
auto& dcache_req_port = core_->lmem_demuxs_.at(t)->ReqIn;
|
||||
auto mem_addr = trace_data->mem_addrs.at(t + t0);
|
||||
auto type = core_->get_addr_type(mem_addr.addr);
|
||||
auto type = get_addr_type(mem_addr.addr);
|
||||
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr;
|
||||
|
@ -271,7 +270,7 @@ void LsuUnit::tick() {
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
|
||||
: ExeUnit(ctx, core, "SFU")
|
||||
: FuncUnit(ctx, core, "SFU")
|
||||
, input_idx_(0)
|
||||
{}
|
||||
|
||||
|
@ -315,8 +314,7 @@ void SfuUnit::tick() {
|
|||
|
||||
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
|
||||
if (trace->eop && release_warp) {
|
||||
assert(core_->stalled_warps_.test(trace->wid));
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
core_->resume(trace->wid);
|
||||
}
|
||||
|
||||
input.pop();
|
|
@ -20,19 +20,19 @@ namespace vortex {
|
|||
|
||||
class Core;
|
||||
|
||||
class ExeUnit : public SimObject<ExeUnit> {
|
||||
class FuncUnit : public SimObject<FuncUnit> {
|
||||
public:
|
||||
std::vector<SimPort<instr_trace_t*>> Inputs;
|
||||
std::vector<SimPort<instr_trace_t*>> Outputs;
|
||||
|
||||
ExeUnit(const SimContext& ctx, Core* core, const char* name)
|
||||
: SimObject<ExeUnit>(ctx, name)
|
||||
FuncUnit(const SimContext& ctx, Core* core, const char* name)
|
||||
: SimObject<FuncUnit>(ctx, name)
|
||||
, Inputs(ISSUE_WIDTH, this)
|
||||
, Outputs(ISSUE_WIDTH, this)
|
||||
, core_(core)
|
||||
{}
|
||||
|
||||
virtual ~ExeUnit() {}
|
||||
virtual ~FuncUnit() {}
|
||||
|
||||
virtual void reset() {}
|
||||
|
||||
|
@ -44,7 +44,7 @@ protected:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class AluUnit : public ExeUnit {
|
||||
class AluUnit : public FuncUnit {
|
||||
public:
|
||||
AluUnit(const SimContext& ctx, Core*);
|
||||
|
||||
|
@ -53,7 +53,7 @@ public:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class FpuUnit : public ExeUnit {
|
||||
class FpuUnit : public FuncUnit {
|
||||
public:
|
||||
FpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
|
@ -62,7 +62,7 @@ public:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class LsuUnit : public ExeUnit {
|
||||
class LsuUnit : public FuncUnit {
|
||||
public:
|
||||
LsuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
|
@ -85,7 +85,7 @@ private:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class SfuUnit : public ExeUnit {
|
||||
class SfuUnit : public FuncUnit {
|
||||
public:
|
||||
SfuUnit(const SimContext& ctx, Core*);
|
||||
|
|
@ -39,7 +39,7 @@ public:
|
|||
Impl(LocalMem* simobject, const Config& config)
|
||||
: simobject_(simobject)
|
||||
, config_(config)
|
||||
, ram_(config.capacity, config.capacity)
|
||||
, ram_(config.capacity)
|
||||
, bank_sel_addr_start_(0)
|
||||
, bank_sel_addr_end_(0 + log2ceil(config.num_banks)-1)
|
||||
{}
|
||||
|
|
|
@ -24,6 +24,7 @@
|
|||
#include "constants.h"
|
||||
#include <util.h>
|
||||
#include "core.h"
|
||||
#include "VX_types.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
|
@ -87,7 +88,7 @@ int main(int argc, char **argv) {
|
|||
Arch arch(num_threads, num_warps, num_cores);
|
||||
|
||||
// create memory module
|
||||
RAM ram(RAM_PAGE_SIZE);
|
||||
RAM ram(0, RAM_PAGE_SIZE);
|
||||
|
||||
// create processor
|
||||
Processor processor(arch);
|
||||
|
@ -117,7 +118,10 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
|
||||
// run simulation
|
||||
exitcode = processor.run(riscv_test);
|
||||
exitcode = processor.run();
|
||||
if (riscv_test) {
|
||||
exitcode = (1 - exitcode);
|
||||
}
|
||||
}
|
||||
|
||||
if (exitcode != 0) {
|
||||
|
|
|
@ -14,7 +14,6 @@
|
|||
#pragma once
|
||||
|
||||
#include "instr_trace.h"
|
||||
#include <queue>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "instr_trace.h"
|
||||
#include <queue>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
|
|
|
@ -83,24 +83,21 @@ void ProcessorImpl::attach_ram(RAM* ram) {
|
|||
}
|
||||
}
|
||||
|
||||
int ProcessorImpl::run(bool riscv_test) {
|
||||
int ProcessorImpl::run() {
|
||||
SimPlatform::instance().reset();
|
||||
this->reset();
|
||||
|
||||
bool done;
|
||||
Word exitcode = 0;
|
||||
int exitcode = 0;
|
||||
do {
|
||||
SimPlatform::instance().tick();
|
||||
done = true;
|
||||
for (auto cluster : clusters_) {
|
||||
if (cluster->running()) {
|
||||
Word ec;
|
||||
if (cluster->check_exit(&ec, riscv_test)) {
|
||||
exitcode |= ec;
|
||||
} else {
|
||||
done = false;
|
||||
}
|
||||
done = false;
|
||||
continue;
|
||||
}
|
||||
exitcode |= cluster->get_exitcode();
|
||||
}
|
||||
perf_mem_latency_ += perf_mem_pending_reads_;
|
||||
} while (!done);
|
||||
|
@ -143,8 +140,8 @@ void Processor::attach_ram(RAM* mem) {
|
|||
impl_->attach_ram(mem);
|
||||
}
|
||||
|
||||
int Processor::run(bool riscv_test) {
|
||||
return impl_->run(riscv_test);
|
||||
int Processor::run() {
|
||||
return impl_->run();
|
||||
}
|
||||
|
||||
void Processor::write_dcr(uint32_t addr, uint32_t value) {
|
||||
|
|
|
@ -28,7 +28,7 @@ public:
|
|||
|
||||
void attach_ram(RAM* mem);
|
||||
|
||||
int run(bool riscv_test);
|
||||
int run();
|
||||
|
||||
void write_dcr(uint32_t addr, uint32_t value);
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ public:
|
|||
|
||||
void attach_ram(RAM* mem);
|
||||
|
||||
int run(bool riscv_test);
|
||||
int run();
|
||||
|
||||
void write_dcr(uint32_t addr, uint32_t value);
|
||||
|
||||
|
|
|
@ -14,7 +14,8 @@
|
|||
#pragma once
|
||||
|
||||
#include "instr_trace.h"
|
||||
#include <queue>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
|
|
BIN
sim/simx/simx
BIN
sim/simx/simx
Binary file not shown.
|
@ -118,19 +118,12 @@ bool Socket::running() const {
|
|||
return false;
|
||||
}
|
||||
|
||||
bool Socket::check_exit(Word* exitcode, bool riscv_test) const {
|
||||
bool done = true;
|
||||
Word exitcode_ = 0;
|
||||
int Socket::get_exitcode() const {
|
||||
int exitcode = 0;
|
||||
for (auto& core : cores_) {
|
||||
Word ec;
|
||||
if (core->check_exit(&ec, riscv_test)) {
|
||||
exitcode_ |= ec;
|
||||
} else {
|
||||
done = false;
|
||||
}
|
||||
exitcode |= core->get_exitcode();
|
||||
}
|
||||
*exitcode = exitcode_;
|
||||
return done;
|
||||
return exitcode;
|
||||
}
|
||||
|
||||
void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
|
||||
|
@ -138,7 +131,7 @@ void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
|
|||
}
|
||||
|
||||
void Socket::resume(uint32_t core_index) {
|
||||
cores_.at(core_index)->resume();
|
||||
cores_.at(core_index)->resume(-1);
|
||||
}
|
||||
|
||||
Socket::PerfStats Socket::perf_stats() const {
|
||||
|
|
|
@ -62,7 +62,7 @@ public:
|
|||
|
||||
bool running() const;
|
||||
|
||||
bool check_exit(Word* exitcode, bool riscv_test) const;
|
||||
int get_exitcode() const;
|
||||
|
||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <stdint.h>
|
||||
#include <bitset>
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
#include <util.h>
|
||||
#include <stringutil.h>
|
||||
|
@ -53,8 +54,6 @@ typedef std::bitset<MAX_NUM_REGS> RegMask;
|
|||
typedef std::bitset<MAX_NUM_THREADS> ThreadMask;
|
||||
typedef std::bitset<MAX_NUM_WARPS> WarpMask;
|
||||
|
||||
typedef std::unordered_map<uint32_t, uint32_t> CSRs;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class RegType {
|
||||
|
@ -142,6 +141,18 @@ enum class AddrType {
|
|||
IO
|
||||
};
|
||||
|
||||
inline AddrType get_addr_type(uint64_t addr) {
|
||||
if (LMEM_ENABLED) {
|
||||
if (addr >= LMEM_BASE_ADDR && addr < (LMEM_BASE_ADDR + (1 << LMEM_LOG_SIZE))) {
|
||||
return AddrType::Shared;
|
||||
}
|
||||
}
|
||||
if (addr >= IO_BASE_ADDR) {
|
||||
return AddrType::IO;
|
||||
}
|
||||
return AddrType::Global;
|
||||
}
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
|
||||
switch (type) {
|
||||
case AddrType::Global: os << "Global"; break;
|
||||
|
|
|
@ -1,112 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <iostream>
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <util.h>
|
||||
|
||||
#include "instr.h"
|
||||
#include "core.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
Warp::Warp(Core *core, uint32_t warp_id)
|
||||
: warp_id_(warp_id)
|
||||
, arch_(core->arch())
|
||||
, core_(core)
|
||||
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
|
||||
, freg_file_(core->arch().num_threads(), std::vector<uint64_t>(core->arch().num_regs()))
|
||||
{
|
||||
this->reset();
|
||||
}
|
||||
|
||||
void Warp::reset() {
|
||||
PC_ = core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
|
||||
#if (XLEN == 64)
|
||||
PC_ = (uint64_t(core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_;
|
||||
#endif
|
||||
tmask_.reset();
|
||||
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) {
|
||||
for (auto& reg : ireg_file_.at(i)) {
|
||||
reg = 0;
|
||||
}
|
||||
for (auto& reg : freg_file_.at(i)) {
|
||||
reg = 0;
|
||||
}
|
||||
}
|
||||
uui_gen_.reset();
|
||||
}
|
||||
|
||||
instr_trace_t* Warp::eval() {
|
||||
assert(tmask_.any());
|
||||
|
||||
#ifndef NDEBUG
|
||||
uint32_t instr_uuid = uui_gen_.get_uuid(PC_);
|
||||
uint32_t g_wid = core_->id() * arch_.num_warps() + warp_id_;
|
||||
uint32_t instr_id = instr_uuid & 0xffff;
|
||||
uint32_t instr_ref = instr_uuid >> 16;
|
||||
uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id;
|
||||
#else
|
||||
uint64_t uuid = 0;
|
||||
#endif
|
||||
|
||||
DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
|
||||
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
|
||||
DPN(1, tmask_.test(i));
|
||||
DPN(1, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << uuid << ")" << std::endl);
|
||||
|
||||
// Fetch
|
||||
uint32_t instr_code = 0;
|
||||
core_->icache_read(&instr_code, PC_, sizeof(uint32_t));
|
||||
|
||||
// Decode
|
||||
auto instr = core_->decoder_.decode(instr_code);
|
||||
if (!instr) {
|
||||
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << PC_ << " (#" << std::dec << uuid << ")" << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
|
||||
DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);
|
||||
|
||||
// Create trace
|
||||
auto trace = new instr_trace_t(uuid, arch_);
|
||||
trace->cid = core_->id();
|
||||
trace->wid = warp_id_;
|
||||
trace->PC = PC_;
|
||||
trace->tmask = tmask_;
|
||||
trace->rdest = instr->getRDest();
|
||||
trace->rdest_type = instr->getRDType();
|
||||
|
||||
// Execute
|
||||
this->execute(*instr, trace);
|
||||
|
||||
DP(5, "Register state:");
|
||||
for (uint32_t i = 0; i < arch_.num_regs(); ++i) {
|
||||
DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
|
||||
// Integer register file
|
||||
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
|
||||
DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
|
||||
}
|
||||
DPN(5, '|');
|
||||
// Floating point register file
|
||||
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
|
||||
DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
|
||||
}
|
||||
DPN(5, std::endl);
|
||||
}
|
||||
|
||||
return trace;
|
||||
}
|
107
sim/simx/warp.h
107
sim/simx/warp.h
|
@ -1,107 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#ifndef __WARP_H
|
||||
#define __WARP_H
|
||||
|
||||
#include <vector>
|
||||
#include <stack>
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Arch;
|
||||
class Core;
|
||||
class Instr;
|
||||
class instr_trace_t;
|
||||
|
||||
struct DomStackEntry {
|
||||
DomStackEntry(const ThreadMask &tmask, Word PC)
|
||||
: tmask(tmask)
|
||||
, PC(PC)
|
||||
, fallthrough(false)
|
||||
{}
|
||||
|
||||
DomStackEntry(const ThreadMask &tmask)
|
||||
: tmask(tmask)
|
||||
, fallthrough(true)
|
||||
{}
|
||||
|
||||
ThreadMask tmask;
|
||||
Word PC;
|
||||
bool fallthrough;
|
||||
};
|
||||
|
||||
struct vtype {
|
||||
uint32_t vill;
|
||||
uint32_t vediv;
|
||||
uint32_t vsew;
|
||||
uint32_t vlmul;
|
||||
};
|
||||
|
||||
class Warp {
|
||||
public:
|
||||
Warp(Core *core, uint32_t warp_id);
|
||||
|
||||
void reset();
|
||||
|
||||
uint32_t id() const {
|
||||
return warp_id_;
|
||||
}
|
||||
|
||||
Word getPC() const {
|
||||
return PC_;
|
||||
}
|
||||
|
||||
void setPC(Word PC) {
|
||||
PC_ = PC;
|
||||
}
|
||||
|
||||
void setTmask(size_t index, bool value) {
|
||||
tmask_.set(index, value);
|
||||
}
|
||||
|
||||
uint64_t getTmask() const {
|
||||
return tmask_.to_ulong();
|
||||
}
|
||||
|
||||
Word getIRegValue(uint32_t reg) const {
|
||||
return ireg_file_.at(0).at(reg);
|
||||
}
|
||||
|
||||
instr_trace_t* eval();
|
||||
|
||||
private:
|
||||
|
||||
void execute(const Instr &instr, instr_trace_t *trace);
|
||||
|
||||
UUIDGenerator uui_gen_;
|
||||
|
||||
uint32_t warp_id_;
|
||||
const Arch& arch_;
|
||||
Core *core_;
|
||||
|
||||
Word PC_;
|
||||
ThreadMask tmask_;
|
||||
|
||||
std::vector<std::vector<Word>> ireg_file_;
|
||||
std::vector<std::vector<uint64_t>> freg_file_;
|
||||
std::stack<DomStackEntry> ipdom_stack_;
|
||||
|
||||
struct vtype vtype_;
|
||||
uint32_t vl_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
Loading…
Add table
Add a link
Reference in a new issue