simx refactoring - emulation vs simulation discrete separation

This commit is contained in:
Blaise Tine 2024-03-12 00:23:42 -07:00
parent ff6f33acff
commit 840ced22a9
33 changed files with 873 additions and 897 deletions

View file

@ -39,7 +39,7 @@ using namespace vortex;
class vx_device {
public:
vx_device()
: ram_(RAM_PAGE_SIZE)
: ram_(0, RAM_PAGE_SIZE)
, global_mem_(
ALLOC_BASE_ADDR,
ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,

View file

@ -88,7 +88,7 @@ class vx_device {
public:
vx_device()
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
, ram_(RAM_PAGE_SIZE)
, ram_(0, RAM_PAGE_SIZE)
, processor_(arch_)
, global_mem_(
ALLOC_BASE_ADDR,
@ -183,7 +183,7 @@ public:
// start new run
future_ = std::async(std::launch::async, [&]{
processor_.run(false);
processor_.run();
});
return 0;

View file

@ -190,14 +190,17 @@ void MemoryUnit::tlbRm(uint64_t va) {
///////////////////////////////////////////////////////////////////////////////
RAM::RAM(uint32_t page_size, uint64_t capacity)
RAM::RAM(uint64_t capacity, uint32_t page_size)
: capacity_(capacity)
, page_bits_(log2ceil(page_size))
, last_page_(nullptr)
, last_page_index_(0) {
assert(ispow2(page_size));
assert(0 == capacity || ispow2(capacity));
assert(0 == (capacity % page_size));
assert(ispow2(page_size));
if (capacity != 0) {
assert(ispow2(capacity));
assert(page_size <= capacity);
assert(0 == (capacity % page_size));
}
}
RAM::~RAM() {

View file

@ -158,7 +158,8 @@ private:
class RAM : public MemDevice {
public:
RAM(uint32_t page_size, uint64_t capacity = 0);
RAM(uint64_t capacity, uint32_t page_size);
RAM(uint64_t capacity) : RAM(capacity, capacity) {}
~RAM();
void clear();

View file

@ -125,7 +125,7 @@ public:
trace_->open("trace.vcd");
#endif
ram_ = new RAM(RAM_PAGE_SIZE);
ram_ = new RAM(0, RAM_PAGE_SIZE);
// initialize dram simulator
ramulator::Config ram_config;

View file

@ -65,7 +65,7 @@ int main(int argc, char **argv) {
parse_args(argc, argv);
// create memory module
vortex::RAM ram(RAM_PAGE_SIZE);
vortex::RAM ram(0, RAM_PAGE_SIZE);
// create processor
vortex::Processor processor;

View file

@ -16,7 +16,7 @@ LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp warp.cpp decode.cpp execute.cpp exe_unit.cpp cache_sim.cpp mem_sim.cpp local_mem.cpp dcrs.cpp
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp emulator.cpp decode.cpp execute.cpp func_unit.cpp cache_sim.cpp mem_sim.cpp local_mem.cpp dcrs.cpp
# Debugging
ifdef DEBUG

View file

@ -114,19 +114,12 @@ bool Cluster::running() const {
return false;
}
bool Cluster::check_exit(Word* exitcode, bool riscv_test) const {
bool done = true;
Word exitcode_ = 0;
int Cluster::get_exitcode() const {
int exitcode = 0;
for (auto& socket : sockets_) {
Word ec;
if (socket->check_exit(&ec, riscv_test)) {
exitcode_ |= ec;
} else {
done = false;
}
exitcode |= socket->get_exitcode();
}
*exitcode = exitcode_;
return done;
return exitcode;
}
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {

View file

@ -59,7 +59,7 @@ public:
bool running() const;
bool check_exit(Word* exitcode, bool riscv_test) const;
int get_exitcode() const;
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);

View file

@ -19,12 +19,9 @@
#include "types.h"
#include "arch.h"
#include "mem.h"
#include "decode.h"
#include "core.h"
#include "socket.h"
#include "debug.h"
#include "constants.h"
#include "processor_impl.h"
using namespace vortex;
@ -41,31 +38,18 @@ Core::Core(const SimContext& ctx,
, core_id_(core_id)
, socket_(socket)
, arch_(arch)
, dcrs_(dcrs)
, decoder_(arch)
, warps_(arch.num_warps())
, barriers_(arch.num_barriers(), 0)
, fcsrs_(arch.num_warps(), 0)
, emulator_(arch, dcrs, this)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)FUType::Count)
, exe_units_((uint32_t)FUType::Count)
, func_units_((uint32_t)FUType::Count)
, lmem_demuxs_(NUM_LSU_LANES)
, pending_icache_(arch_.num_warps())
, csrs_(arch.num_warps())
, commit_arbs_(ISSUE_WIDTH)
{
char sname[100];
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
csrs_.at(i).resize(arch.num_threads());
}
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
warps_.at(i) = std::make_shared<Warp>(this, i);
}
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
operands_.at(i) = SimPlatform::instance().create_object<Operand>();
}
@ -99,17 +83,17 @@ Core::Core(const SimContext& ctx,
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, 1, NUM_SFU_LANES);
// initialize execute units
exe_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
exe_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
exe_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
exe_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
func_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
// bind commit arbiters
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1);
for (uint32_t j = 0; j < (uint32_t)FUType::Count; ++j) {
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
func_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
}
commit_arbs_.at(i) = arbiter;
}
@ -118,17 +102,14 @@ Core::Core(const SimContext& ctx,
}
Core::~Core() {
this->cout_flush();
//--
}
void Core::reset() {
for (auto& warp : warps_) {
warp->reset();
}
warps_.at(0)->setTmask(0, true);
active_warps_ = 1;
for (auto& exe_unit : exe_units_) {
emulator_.clear();
for (auto& exe_unit : func_units_) {
exe_unit->reset();
}
@ -136,29 +117,20 @@ void Core::reset() {
commit_arb->reset();
}
for (auto& barrier : barriers_) {
barrier.reset();
}
for (auto& fcsr : fcsrs_) {
fcsr = 0;
}
for (auto& ibuf : ibuffers_) {
ibuf.clear();
}
ibuffer_idx_ = 0;
scoreboard_.clear();
fetch_latch_.clear();
decode_latch_.clear();
pending_icache_.clear();
stalled_warps_.reset();
pending_instrs_ = 0;
exited_ = false;
perf_stats_ = PerfStats();
ibuffer_idx_ = 0;
pending_instrs_ = 0;
pending_ifetches_ = 0;
perf_stats_ = PerfStats();
}
void Core::tick() {
@ -174,28 +146,14 @@ void Core::tick() {
}
void Core::schedule() {
int scheduled_warp = -1;
// find next ready warp
for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) {
bool warp_active = active_warps_.test(wid);
bool warp_stalled = stalled_warps_.test(wid);
if (warp_active && !warp_stalled) {
scheduled_warp = wid;
break;
}
}
if (scheduled_warp == -1) {
auto trace = emulator_.step();
if (trace == nullptr) {
++perf_stats_.sched_idle;
return;
}
// suspend warp until decode
stalled_warps_.set(scheduled_warp);
// evaluate scheduled warp
auto& warp = warps_.at(scheduled_warp);
auto trace = warp->eval();
emulator_.suspend(trace->wid);
DT(3, "pipeline-schedule: " << *trace);
@ -255,9 +213,8 @@ void Core::decode() {
}
// release warp
if (!trace->fetch_stall) {
assert(stalled_warps_.test(trace->wid));
stalled_warps_.reset(trace->wid);
if (!trace->fetch_stall) {
emulator_.resume(trace->wid);
}
DT(3, "pipeline-decode: " << *trace);
@ -355,7 +312,7 @@ void Core::issue() {
void Core::execute() {
for (uint32_t i = 0; i < (uint32_t)FUType::Count; ++i) {
auto& dispatch = dispatchers_.at(i);
auto& exe_unit = exe_units_.at(i);
auto& exe_unit = func_units_.at(i);
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
if (dispatch->Outputs.at(j).empty())
continue;
@ -396,361 +353,22 @@ void Core::commit() {
}
}
void Core::wspawn(uint32_t num_warps, Word nextPC) {
uint32_t active_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC);
for (uint32_t i = 1; i < active_warps; ++i) {
auto warp = warps_.at(i);
warp->setPC(nextPC);
warp->setTmask(0, true);
active_warps_.set(i);
}
}
void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
uint32_t bar_idx = bar_id & 0x7fffffff;
bool is_global = (bar_id >> 31);
auto& barrier = barriers_.at(bar_idx);
barrier.set(warp_id);
DP(3, "*** Suspend core #" << core_id_ << ", warp #" << warp_id << " at barrier #" << bar_idx);
if (is_global) {
// global barrier handling
if (barrier.count() == active_warps_.count()) {
socket_->barrier(bar_idx, count, core_id_);
barrier.reset();
}
} else {
// local barrier handling
if (barrier.count() == (size_t)count) {
// resume suspended warps
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
if (barrier.test(i)) {
DP(3, "*** Resume core #" << core_id_ << ", warp #" << i << " at barrier #" << bar_idx);
stalled_warps_.reset(i);
}
}
barrier.reset();
}
}
}
void Core::icache_read(void *data, uint64_t addr, uint32_t size) {
mmu_.read(data, addr, size, 0);
}
AddrType Core::get_addr_type(uint64_t addr) {
if (LMEM_ENABLED) {
if (addr >= LMEM_BASE_ADDR && addr < (LMEM_BASE_ADDR + (1 << LMEM_LOG_SIZE))) {
return AddrType::Shared;
}
}
if (addr >= IO_BASE_ADDR) {
return AddrType::IO;
}
return AddrType::Global;
}
void Core::dcache_read(void *data, uint64_t addr, uint32_t size) {
auto type = this->get_addr_type(addr);
if (type == AddrType::Shared) {
local_mem_->read(data, addr, size);
} else {
mmu_.read(data, addr, size, 0);
}
DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
}
void Core::dcache_write(const void* data, uint64_t addr, uint32_t size) {
auto type = this->get_addr_type(addr);
if (addr >= uint64_t(IO_COUT_ADDR)
&& addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
this->writeToStdOut(data, addr, size);
} else {
if (type == AddrType::Shared) {
local_mem_->write(data, addr, size);
} else {
mmu_.write(data, addr, size, 0);
}
}
DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
}
void Core::dcache_amo_reserve(uint64_t addr) {
auto type = this->get_addr_type(addr);
if (type == AddrType::Global) {
mmu_.amo_reserve(addr);
}
}
bool Core::dcache_amo_check(uint64_t addr) {
auto type = this->get_addr_type(addr);
if (type == AddrType::Global) {
return mmu_.amo_check(addr);
}
return false;
}
void Core::writeToStdOut(const void* data, uint64_t addr, uint32_t size) {
if (size != 1)
std::abort();
uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
auto& ss_buf = print_bufs_[tid];
char c = *(char*)data;
ss_buf << c;
if (c == '\n') {
std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush;
ss_buf.str("");
}
}
void Core::cout_flush() {
for (auto& buf : print_bufs_) {
auto str = buf.second.str();
if (!str.empty()) {
std::cout << "#" << buf.first << ": " << str << std::endl;
}
}
}
uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
switch (addr) {
case VX_CSR_SATP:
case VX_CSR_PMPCFG0:
case VX_CSR_PMPADDR0:
case VX_CSR_MSTATUS:
case VX_CSR_MISA:
case VX_CSR_MEDELEG:
case VX_CSR_MIDELEG:
case VX_CSR_MIE:
case VX_CSR_MTVEC:
case VX_CSR_MEPC:
case VX_CSR_MNSTATUS:
return 0;
case VX_CSR_FFLAGS:
return fcsrs_.at(wid) & 0x1F;
case VX_CSR_FRM:
return (fcsrs_.at(wid) >> 5);
case VX_CSR_FCSR:
return fcsrs_.at(wid);
case VX_CSR_MHARTID: // global thread ID
return (core_id_ * arch_.num_warps() + wid) * arch_.num_threads() + tid;
case VX_CSR_THREAD_ID: // thread ID
return tid;
case VX_CSR_WARP_ID: // warp ID
return wid;
case VX_CSR_CORE_ID: // core ID
return core_id_;
case VX_CSR_THREAD_MASK: // thread mask
return warps_.at(wid)->getTmask();
case VX_CSR_WARP_MASK: // active warps
return active_warps_.to_ulong();
case VX_CSR_NUM_THREADS: // Number of threads per warp
return arch_.num_threads();
case VX_CSR_NUM_WARPS: // Number of warps per core
return arch_.num_warps();
case VX_CSR_NUM_CORES: // Number of cores per cluster
return uint32_t(arch_.num_cores()) * arch_.num_clusters();
case VX_CSR_MCYCLE: // NumCycles
return perf_stats_.cycles & 0xffffffff;
case VX_CSR_MCYCLE_H: // NumCycles
return (uint32_t)(perf_stats_.cycles >> 32);
case VX_CSR_MINSTRET: // NumInsts
return perf_stats_.instrs & 0xffffffff;
case VX_CSR_MINSTRET_H: // NumInsts
return (uint32_t)(perf_stats_.instrs >> 32);
default:
if ((addr >= VX_CSR_MPM_BASE && addr < (VX_CSR_MPM_BASE + 32))
|| (addr >= VX_CSR_MPM_BASE_H && addr < (VX_CSR_MPM_BASE_H + 32))) {
// user-defined MPM CSRs
auto perf_class = dcrs_.base_dcrs.read(VX_DCR_BASE_MPM_CLASS);
switch (perf_class) {
case VX_DCR_MPM_CLASS_NONE:
break;
case VX_DCR_MPM_CLASS_CORE: {
switch (addr) {
case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idle & 0xffffffff;
case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idle >> 32;
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
case VX_CSR_MPM_SCRB_ST_H: return perf_stats_.scrb_stalls >> 32;
case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32;
case VX_CSR_MPM_SCRB_LSU: return perf_stats_.scrb_lsu & 0xffffffff;
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
case VX_CSR_MPM_SCRB_WCTL: return perf_stats_.scrb_wctl & 0xffffffff;
case VX_CSR_MPM_SCRB_WCTL_H: return perf_stats_.scrb_wctl >> 32;
case VX_CSR_MPM_SCRB_CSRS: return perf_stats_.scrb_csrs & 0xffffffff;
case VX_CSR_MPM_SCRB_CSRS_H: return perf_stats_.scrb_csrs >> 32;
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
case VX_CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
case VX_CSR_MPM_IFETCH_LT: return perf_stats_.ifetch_latency & 0xffffffff;
case VX_CSR_MPM_IFETCH_LT_H: return perf_stats_.ifetch_latency >> 32;
case VX_CSR_MPM_LOAD_LT: return perf_stats_.load_latency & 0xffffffff;
case VX_CSR_MPM_LOAD_LT_H: return perf_stats_.load_latency >> 32;
}
} break;
case VX_DCR_MPM_CLASS_MEM: {
auto proc_perf = socket_->cluster()->processor()->perf_stats();
auto cluster_perf = socket_->cluster()->perf_stats();
auto socket_perf = socket_->perf_stats();
auto lmem_perf = local_mem_->perf_stats();
switch (addr) {
case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
case VX_CSR_MPM_L3CACHE_WRITES: return proc_perf.l3cache.writes & 0xffffffff;
case VX_CSR_MPM_L3CACHE_WRITES_H: return proc_perf.l3cache.writes >> 32;
case VX_CSR_MPM_L3CACHE_MISS_R: return proc_perf.l3cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MISS_R_H: return proc_perf.l3cache.read_misses >> 32;
case VX_CSR_MPM_L3CACHE_MISS_W: return proc_perf.l3cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MISS_W_H: return proc_perf.l3cache.write_misses >> 32;
case VX_CSR_MPM_L3CACHE_BANK_ST: return proc_perf.l3cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L3CACHE_BANK_ST_H:return proc_perf.l3cache.bank_stalls >> 32;
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
case VX_CSR_MPM_LMEM_READS: return lmem_perf.reads & 0xffffffff;
case VX_CSR_MPM_LMEM_READS_H: return lmem_perf.reads >> 32;
case VX_CSR_MPM_LMEM_WRITES: return lmem_perf.writes & 0xffffffff;
case VX_CSR_MPM_LMEM_WRITES_H: return lmem_perf.writes >> 32;
case VX_CSR_MPM_LMEM_BANK_ST: return lmem_perf.bank_stalls & 0xffffffff;
case VX_CSR_MPM_LMEM_BANK_ST_H: return lmem_perf.bank_stalls >> 32;
}
} break;
default: {
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
std::abort();
} break;
}
} else {
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
std::abort();
}
}
return 0;
}
void Core::set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid) {
__unused (tid);
switch (addr) {
case VX_CSR_FFLAGS:
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F);
break;
case VX_CSR_FRM:
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5);
break;
case VX_CSR_FCSR:
fcsrs_.at(wid) = value & 0xff;
break;
case VX_CSR_SATP:
case VX_CSR_MSTATUS:
case VX_CSR_MEDELEG:
case VX_CSR_MIDELEG:
case VX_CSR_MIE:
case VX_CSR_MTVEC:
case VX_CSR_MEPC:
case VX_CSR_PMPCFG0:
case VX_CSR_PMPADDR0:
case VX_CSR_MNSTATUS:
break;
default:
{
std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
std::abort();
}
}
}
void Core::trigger_ecall() {
active_warps_.reset();
exited_ = true;
}
void Core::trigger_ebreak() {
active_warps_.reset();
exited_ = true;
}
bool Core::check_exit(Word* exitcode, bool riscv_test) const {
if (exited_) {
Word ec = warps_.at(0)->getIRegValue(3);
if (riscv_test) {
*exitcode = (1 - ec);
} else {
*exitcode = ec;
}
return true;
}
return false;
int Core::get_exitcode() const {
return emulator_.get_exitcode();
}
bool Core::running() const {
return (pending_instrs_ != 0);
return emulator_.running() || (pending_instrs_ != 0);
}
void Core::resume() {
stalled_warps_.reset();
void Core::resume(uint32_t wid) {
emulator_.resume(wid);
}
void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
emulator_.barrier(bar_id, count, wid);
}
void Core::attach_ram(RAM* ram) {
// bind RAM to memory unit
#if (XLEN == 64)
mmu_.attach(*ram, 0, 0xFFFFFFFFFFFFFFFF);
#else
mmu_.attach(*ram, 0, 0xFFFFFFFF);
#endif
emulator_.attach_ram(ram);
}

View file

@ -13,21 +13,10 @@
#pragma once
#include <string>
#include <vector>
#include <list>
#include <stack>
#include <queue>
#include <unordered_map>
#include <memory>
#include <set>
#include <simobject.h>
#include <mem.h>
#include "debug.h"
#include "types.h"
#include "arch.h"
#include "decode.h"
#include "warp.h"
#include "emulator.h"
#include "pipeline.h"
#include "cache_sim.h"
#include "local_mem.h"
@ -35,12 +24,13 @@
#include "scoreboard.h"
#include "operand.h"
#include "dispatcher.h"
#include "exe_unit.h"
#include "dcrs.h"
#include "func_unit.h"
namespace vortex {
class Socket;
class Arch;
class DCRS;
using TraceSwitch = Mux<instr_trace_t*>;
@ -108,49 +98,31 @@ public:
bool running() const;
void resume();
void resume(uint32_t wid);
void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
uint32_t id() const {
return core_id_;
}
Socket* socket() const {
return socket_;
}
const Arch& arch() const {
return arch_;
}
const DCRS& dcrs() const {
return dcrs_;
Socket* socket() const {
return socket_;
}
uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);
void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);
const LocalMem::Ptr& local_mem() const {
return local_mem_;
}
void wspawn(uint32_t num_warps, Word nextPC);
void barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id);
const PerfStats& perf_stats() const {
return perf_stats_;
}
AddrType get_addr_type(uint64_t addr);
void icache_read(void* data, uint64_t addr, uint32_t size);
void dcache_read(void* data, uint64_t addr, uint32_t size);
void dcache_write(const void* data, uint64_t addr, uint32_t size);
void dcache_amo_reserve(uint64_t addr);
bool dcache_amo_check(uint64_t addr);
void trigger_ecall();
void trigger_ebreak();
bool check_exit(Word* exitcode, bool riscv_test) const;
int get_exitcode() const;
private:
@ -160,27 +132,18 @@ private:
void issue();
void execute();
void commit();
void writeToStdOut(const void* data, uint64_t addr, uint32_t size);
void cout_flush();
uint32_t core_id_;
Socket* socket_;
const Arch& arch_;
const DCRS &dcrs_;
const Decoder decoder_;
MemoryUnit mmu_;
std::vector<std::shared_ptr<Warp>> warps_;
std::vector<WarpMask> barriers_;
std::vector<Byte> fcsrs_;
Emulator emulator_;
std::vector<IBuffer> ibuffers_;
Scoreboard scoreboard_;
std::vector<Operand::Ptr> operands_;
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<ExeUnit::Ptr> exe_units_;
std::vector<FuncUnit::Ptr> func_units_;
LocalMem::Ptr local_mem_;
std::vector<LocalMemDemux::Ptr> lmem_demuxs_;
@ -188,16 +151,9 @@ private:
PipelineLatch decode_latch_;
HashTable<instr_trace_t*> pending_icache_;
WarpMask active_warps_;
WarpMask stalled_warps_;
uint64_t pending_instrs_;
bool exited_;
uint64_t pending_ifetches_;
std::unordered_map<int, std::stringstream> print_bufs_;
std::vector<std::vector<CSRs>> csrs_;
PerfStats perf_stats_;
@ -206,7 +162,6 @@ private:
uint32_t commit_exe_;
uint32_t ibuffer_idx_;
friend class Warp;
friend class LsuUnit;
friend class AluUnit;
friend class FpuUnit;

View file

@ -21,7 +21,7 @@
#include <util.h>
#include "debug.h"
#include "types.h"
#include "decode.h"
#include "emulator.h"
#include "arch.h"
#include "instr.h"
@ -435,9 +435,7 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
}
}
Decoder::Decoder(const Arch&) {}
std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
auto instr = std::make_shared<Instr>();
auto op = Opcode((code >> shift_opcode) & mask_opcode);
instr->setOpcode(op);

View file

@ -1,31 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include <memory>
namespace vortex {
class Arch;
class Instr;
class Decoder {
public:
Decoder(const Arch &);
std::shared_ptr<Instr> decode(uint32_t code) const;
};
}

View file

@ -15,6 +15,7 @@
#include "instr_trace.h"
#include <queue>
#include <vector>
namespace vortex {

527
sim/simx/emulator.cpp Normal file
View file

@ -0,0 +1,527 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <assert.h>
#include <util.h>
#include "emulator.h"
#include "instr_trace.h"
#include "instr.h"
#include "dcrs.h"
#include "core.h"
#include "socket.h"
#include "cluster.h"
#include "processor_impl.h"
#include "local_mem.h"
using namespace vortex;
// Builds an ipdom entry that carries both a thread mask and a PC.
// Entries created through this overload have `fallthrough` cleared.
Emulator::ipdom_entry_t::ipdom_entry_t(const ThreadMask &tmask, Word PC)
  : tmask(tmask)
  , PC(PC)
  , fallthrough(false)
{}
// Builds an ipdom entry that carries only a thread mask.
// Entries created through this overload have `fallthrough` set.
// PC is not supplied by the caller here, so it is zero-initialized rather
// than left indeterminate; copying or printing the entry is then well defined.
Emulator::ipdom_entry_t::ipdom_entry_t(const ThreadMask &tmask)
  : tmask(tmask)
  , PC(0)
  , fallthrough(true)
{}
// Sizes the two register files of a warp: one row per thread
// (arch.num_threads()), each row holding arch.num_regs() value-initialized
// entries. The remaining warp fields are assigned by warp_t::clear().
Emulator::warp_t::warp_t(const Arch& arch)
  : ireg_file(arch.num_threads(), std::vector<Word>(arch.num_regs()))
  , freg_file(arch.num_threads(), std::vector<uint64_t>(arch.num_regs()))
{}
// Returns this warp to its reset state.
//   arch - supplies the number of threads whose register rows are zeroed.
//   dcrs - supplies the startup address loaded into PC.
// After the call: PC holds the startup address, the thread mask is empty,
// every integer/float register of every thread is 0, fcsr is 0 and the
// uuid generator has been reset.
void Emulator::warp_t::clear(const Arch& arch, const DCRS &dcrs) {
  // Startup address word 0.
  this->PC = dcrs.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
#if (XLEN == 64)
  // Startup address word 1 goes into bits 63..32 and is OR-ed onto the word
  // already stored in PC. This must name the warp's own PC field: `PC_` is
  // not a name visible in this struct.
  this->PC = (uint64_t(dcrs.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | this->PC;
#endif

  this->tmask.reset();

  // Zero both register files, thread by thread.
  for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) {
    for (auto& reg : this->ireg_file.at(i)) {
      reg = 0;
    }
    for (auto& reg : this->freg_file.at(i)) {
      reg = 0;
    }
  }

  this->fcsr = 0;
  this->uui_gen.reset();
}
///////////////////////////////////////////////////////////////////////////////
// Creates an emulator bound to the given architecture description, DCR set
// and owning core. One warp_t is created per arch.num_warps() and one
// barrier mask per arch.num_barriers(); the emulator is then put into its
// reset state via clear().
Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
  : arch_(arch)
  , dcrs_(dcrs)
  , core_(core)
  , warps_(arch.num_warps(), arch)
  , barriers_(arch.num_barriers(), 0)
{
  clear();
}
// Emits any console text still held in the per-thread print buffers before
// the emulator goes away.
Emulator::~Emulator() {
  cout_flush();
}
void Emulator::clear() {
for (auto& warp : warps_) {
warp.clear(arch_, dcrs_);
}
for (auto& barrier : barriers_) {
barrier.reset();
}
stalled_warps_.reset();
active_warps_.reset();
// activate first warp and thread
active_warps_.set(0);
warps_[0].tmask.set(0);
}
// Binds `ram` to the emulator's memory unit over the range starting at 0 and
// ending at the all-ones address for the configured XLEN.
void Emulator::attach_ram(RAM* ram) {
#if (XLEN == 64)
  const uint64_t last_addr = 0xFFFFFFFFFFFFFFFF;
#else
  const uint64_t last_addr = 0xFFFFFFFF;
#endif
  mmu_.attach(*ram, 0, last_addr);
}
// Emulates one instruction of the first warp that is active and not stalled.
// Returns nullptr when no such warp exists; otherwise returns a newly
// allocated instr_trace_t that was filled in by execute().
// NOTE(review): the trace is created with `new` and never deleted in this
// function - presumably the caller takes ownership; confirm.
instr_trace_t* Emulator::step() {
int scheduled_warp = -1;
// find next ready warp (lowest warp id that is active and not stalled)
for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) {
bool warp_active = active_warps_.test(wid);
bool warp_stalled = stalled_warps_.test(wid);
if (warp_active && !warp_stalled) {
scheduled_warp = wid;
break;
}
}
// nothing to run this step
if (scheduled_warp == -1)
return nullptr;
// look up the selected warp; an active warp must have at least one thread enabled.
// (this function does not mark the warp as stalled itself)
auto& warp = warps_.at(scheduled_warp);
assert(warp.tmask.any());
#ifndef NDEBUG
// debug builds: compose a 64-bit trace id from the per-warp uuid generator
// output and the warp id computed across cores:
//   bits 63..32 = upper 16 bits of instr_uuid
//   bits 31..16 = g_wid (shifted in 32-bit arithmetic)
//   bits 15..0  = lower 16 bits of instr_uuid
uint32_t instr_uuid = warp.uui_gen.get_uuid(warp.PC);
uint32_t g_wid = core_->id() * arch_.num_warps() + scheduled_warp;
uint32_t instr_id = instr_uuid & 0xffff;
uint32_t instr_ref = instr_uuid >> 16;
uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id;
#else
// release builds: traces are not uniquely tagged
uint64_t uuid = 0;
#endif
DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << scheduled_warp << ", tmask=");
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
DPN(1, warp.tmask.test(i));
DPN(1, ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << uuid << ")" << std::endl);
// Fetch: read one 32-bit instruction word at the warp's PC
uint32_t instr_code = 0;
this->icache_read(&instr_code, warp.PC, sizeof(uint32_t));
// Decode: a null result means the word could not be decoded, which is fatal
auto instr = this->decode(instr_code);
if (!instr) {
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << warp.PC << " (#" << std::dec << uuid << ")" << std::endl;
std::abort();
}
DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);
// Create trace
auto trace = new instr_trace_t(uuid, arch_);
// Execute the decoded instruction for the selected warp, recording into the trace
this->execute(*instr, scheduled_warp, trace);
// Debug dump: one line per register index, integer values for every thread,
// then '|', then floating-point values for every thread
DP(5, "Register state:");
for (uint32_t i = 0; i < arch_.num_regs(); ++i) {
DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
// Integer register file
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << warp.ireg_file.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(5, '|');
// Floating point register file
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << warp.freg_file.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(5, std::endl);
}
return trace;
}
// True while at least one warp is still marked active.
bool Emulator::running() const {
  const bool has_active_warp = active_warps_.any();
  return has_active_warp;
}
// Reports the exit code, read from integer register 3 of thread 0 in warp 0.
int Emulator::get_exitcode() const {
  const auto& first_warp = warps_.at(0);
  const auto& thread0_regs = first_warp.ireg_file.at(0);
  return thread0_regs.at(3);
}
// Marks warp `wid` as stalled so that step() will not pick it.
// The warp must not already be stalled (checked by assert).
void Emulator::suspend(uint32_t wid) {
  const bool already_stalled = stalled_warps_.test(wid);
  assert(!already_stalled);
  (void)already_stalled; // only consumed by the assert above
  stalled_warps_.set(wid);
}
// Clears the stalled flag of warp `wid`. The value 0xffffffff is a wildcard
// that clears the stalled flag of every warp. For a specific warp id, the
// warp is expected to be stalled (checked by assert).
void Emulator::resume(uint32_t wid) {
  if (wid == 0xffffffff) {
    stalled_warps_.reset();
    return;
  }
  assert(stalled_warps_.test(wid));
  stalled_warps_.reset(wid);
}
void Emulator::wspawn(uint32_t num_warps, Word nextPC) {
uint32_t active_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC);
for (uint32_t i = 1; i < active_warps; ++i) {
auto& warp = warps_.at(i);
warp.PC = nextPC;
warp.tmask.set(0);
active_warps_.set(i);
}
}
// Registers the arrival of warp `wid` at a barrier.
//   bar_id - bit 31 selects a global barrier, the low 31 bits are the index.
//   count  - for a local barrier, the number of arrivals that releases it;
//            for a global barrier, forwarded to the socket.
// Global: once as many warps have arrived as are currently active, the
// socket is notified and the arrival mask is cleared.
// Local: once `count` warps have arrived, their stalled flags are cleared
// and the arrival mask is cleared.
void Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
  const bool is_global = (bar_id >> 31);
  const uint32_t bar_idx = bar_id & 0x7fffffff;

  auto& arrivals = barriers_.at(bar_idx);
  arrivals.set(wid);
  DP(3, "*** Suspend core #" << core_->id() << ", warp #" << wid << " at barrier #" << bar_idx);

  if (is_global) {
    // global barrier handling
    if (arrivals.count() == active_warps_.count()) {
      core_->socket()->barrier(bar_idx, count, core_->id());
      arrivals.reset();
    }
    return;
  }

  // local barrier handling: wait until the expected number of warps arrived
  if (arrivals.count() != (size_t)count) {
    return;
  }

  // resume suspended warps
  for (uint32_t w = 0; w < arch_.num_warps(); ++w) {
    if (!arrivals.test(w)) {
      continue;
    }
    DP(3, "*** Resume core #" << core_->id() << ", warp #" << w << " at barrier #" << bar_idx);
    stalled_warps_.reset(w);
  }
  arrivals.reset();
}
void Emulator::icache_read(void *data, uint64_t addr, uint32_t size) {
mmu_.read(data, addr, size, 0);
}
// Reads `size` bytes at `addr` into `data`. Addresses classified as
// AddrType::Shared are served by the core's local memory; everything else
// goes through the memory unit. The access is logged at debug level 2.
void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) {
  const auto type = get_addr_type(addr);
  if (type != AddrType::Shared) {
    mmu_.read(data, addr, size, 0);
  } else {
    core_->local_mem()->read(data, addr, size);
  }
  DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
}
// Writes `size` bytes from `data` to `addr`.
//
// Addresses inside [IO_COUT_ADDR, IO_COUT_ADDR + IO_COUT_SIZE) are routed to
// writeToStdOut(). Otherwise AddrType::Shared addresses go to the core's
// local memory and all others go through the memory unit. The access is
// logged at debug level 2.
void Emulator::dcache_write(const void* data, uint64_t addr, uint32_t size) {
  const auto type = get_addr_type(addr);
  const uint64_t cout_begin = uint64_t(IO_COUT_ADDR);
  const uint64_t cout_end = uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE;

  if (addr >= cout_begin && addr < cout_end) {
    this->writeToStdOut(data, addr, size);
  } else if (type == AddrType::Shared) {
    core_->local_mem()->write(data, addr, size);
  } else {
    mmu_.write(data, addr, size, 0);
  }
  DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << " (size=" << size << ", type=" << type << ")" << std::endl);
}
// Forwards an atomic reservation for `addr` to the memory unit; only
// addresses classified as AddrType::Global are reserved, others are ignored.
void Emulator::dcache_amo_reserve(uint64_t addr) {
  if (get_addr_type(addr) != AddrType::Global) {
    return;
  }
  mmu_.amo_reserve(addr);
}
// Returns the memory unit's reservation check for `addr` when the address is
// classified as AddrType::Global; returns false for every other address type.
bool Emulator::dcache_amo_check(uint64_t addr) {
  if (get_addr_type(addr) != AddrType::Global) {
    return false;
  }
  return mmu_.amo_check(addr);
}
// Handles a one-byte store into the console output window.
//
// The byte is appended to a per-stream buffer selected by the offset of
// `addr` inside the window. When the byte is a newline, the buffered line is
// printed to std::cout prefixed with "#<id>: " and the buffer is emptied.
// Any access whose size is not exactly 1 aborts the process.
void Emulator::writeToStdOut(const void* data, uint64_t addr, uint32_t size) {
  if (size != 1) {
    std::abort();
  }

  const uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
  const char ch = *static_cast<const char*>(data);

  auto& line_buf = print_bufs_[tid];
  line_buf << ch;
  if (ch != '\n') {
    return;
  }

  std::cout << std::dec << "#" << tid << ": " << line_buf.str() << std::flush;
  line_buf.str("");
}
// Prints every non-empty console buffer to std::cout as "#<id>: <text>"
// followed by a newline and flush. The buffers themselves are left as-is.
void Emulator::cout_flush() {
  for (auto& entry : print_bufs_) {
    auto pending = entry.second.str();
    if (pending.empty()) {
      continue;
    }
    std::cout << "#" << entry.first << ": " << pending << std::endl;
  }
}
// Reads the 32-bit value of CSR `addr` as seen by thread `tid` of warp `wid`.
//
// - The machine-mode/supervisor CSRs listed first always read as 0.
// - FFLAGS / FRM / FCSR are views onto the warp's fcsr byte.
// - Identification CSRs are derived from `tid`, `wid`, the core id and arch_.
// - 64-bit counters are exposed as a low word (`X`) and a high word (`X_H`).
// - Addresses in the 32-entry windows at VX_CSR_MPM_BASE / VX_CSR_MPM_BASE_H
//   are performance counters; which counter set is visible is selected by the
//   VX_DCR_BASE_MPM_CLASS device register. An address inside the window that
//   the selected class does not define reads as 0.
// - Any other address, or an unknown MPM class, prints an error and aborts.
uint32_t Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
auto core_perf = core_->perf_stats();
switch (addr) {
// CSRs that are accepted but not modeled: always read as zero
case VX_CSR_SATP:
case VX_CSR_PMPCFG0:
case VX_CSR_PMPADDR0:
case VX_CSR_MSTATUS:
case VX_CSR_MISA:
case VX_CSR_MEDELEG:
case VX_CSR_MIDELEG:
case VX_CSR_MIE:
case VX_CSR_MTVEC:
case VX_CSR_MEPC:
case VX_CSR_MNSTATUS:
return 0;
// floating-point status: low 5 bits of fcsr
case VX_CSR_FFLAGS:
return warps_.at(wid).fcsr & 0x1F;
// floating-point rounding mode: fcsr bits above the 5 flag bits
case VX_CSR_FRM:
return (warps_.at(wid).fcsr >> 5);
case VX_CSR_FCSR:
return warps_.at(wid).fcsr;
case VX_CSR_MHARTID: // global thread ID
return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
case VX_CSR_THREAD_ID: // thread ID
return tid;
case VX_CSR_WARP_ID: // warp ID
return wid;
case VX_CSR_CORE_ID: // core ID
return core_->id();
case VX_CSR_THREAD_MASK: // thread mask
return warps_.at(wid).tmask.to_ulong();
case VX_CSR_WARP_MASK: // active warps
return active_warps_.to_ulong();
case VX_CSR_NUM_THREADS: // Number of threads per warp
return arch_.num_threads();
case VX_CSR_NUM_WARPS: // Number of warps per core
return arch_.num_warps();
case VX_CSR_NUM_CORES: // Number of cores per cluster
// NOTE(review): the value returned is num_cores * num_clusters, which does not
// obviously match the "per cluster" wording above - confirm intended meaning.
return uint32_t(arch_.num_cores()) * arch_.num_clusters();
case VX_CSR_MCYCLE: // NumCycles
return core_perf.cycles & 0xffffffff;
case VX_CSR_MCYCLE_H: // NumCycles
return (uint32_t)(core_perf.cycles >> 32);
case VX_CSR_MINSTRET: // NumInsts
return core_perf.instrs & 0xffffffff;
case VX_CSR_MINSTRET_H: // NumInsts
return (uint32_t)(core_perf.instrs >> 32);
default:
if ((addr >= VX_CSR_MPM_BASE && addr < (VX_CSR_MPM_BASE + 32))
|| (addr >= VX_CSR_MPM_BASE_H && addr < (VX_CSR_MPM_BASE_H + 32))) {
// user-defined MPM CSRs
auto perf_class = dcrs_.base_dcrs.read(VX_DCR_BASE_MPM_CLASS);
switch (perf_class) {
case VX_DCR_MPM_CLASS_NONE:
break;
// core pipeline counters, all taken from this core's perf stats
case VX_DCR_MPM_CLASS_CORE: {
switch (addr) {
case VX_CSR_MPM_SCHED_ID: return core_perf.sched_idle & 0xffffffff;
case VX_CSR_MPM_SCHED_ID_H:return core_perf.sched_idle >> 32;
case VX_CSR_MPM_SCHED_ST: return core_perf.sched_stalls & 0xffffffff;
case VX_CSR_MPM_SCHED_ST_H:return core_perf.sched_stalls >> 32;
case VX_CSR_MPM_IBUF_ST: return core_perf.ibuf_stalls & 0xffffffff;
case VX_CSR_MPM_IBUF_ST_H: return core_perf.ibuf_stalls >> 32;
case VX_CSR_MPM_SCRB_ST: return core_perf.scrb_stalls & 0xffffffff;
case VX_CSR_MPM_SCRB_ST_H: return core_perf.scrb_stalls >> 32;
case VX_CSR_MPM_SCRB_ALU: return core_perf.scrb_alu & 0xffffffff;
case VX_CSR_MPM_SCRB_ALU_H:return core_perf.scrb_alu >> 32;
case VX_CSR_MPM_SCRB_FPU: return core_perf.scrb_fpu & 0xffffffff;
case VX_CSR_MPM_SCRB_FPU_H:return core_perf.scrb_fpu >> 32;
case VX_CSR_MPM_SCRB_LSU: return core_perf.scrb_lsu & 0xffffffff;
case VX_CSR_MPM_SCRB_LSU_H:return core_perf.scrb_lsu >> 32;
case VX_CSR_MPM_SCRB_SFU: return core_perf.scrb_sfu & 0xffffffff;
case VX_CSR_MPM_SCRB_SFU_H:return core_perf.scrb_sfu >> 32;
case VX_CSR_MPM_SCRB_WCTL: return core_perf.scrb_wctl & 0xffffffff;
case VX_CSR_MPM_SCRB_WCTL_H: return core_perf.scrb_wctl >> 32;
case VX_CSR_MPM_SCRB_CSRS: return core_perf.scrb_csrs & 0xffffffff;
case VX_CSR_MPM_SCRB_CSRS_H: return core_perf.scrb_csrs >> 32;
case VX_CSR_MPM_IFETCHES: return core_perf.ifetches & 0xffffffff;
case VX_CSR_MPM_IFETCHES_H: return core_perf.ifetches >> 32;
case VX_CSR_MPM_LOADS: return core_perf.loads & 0xffffffff;
case VX_CSR_MPM_LOADS_H: return core_perf.loads >> 32;
case VX_CSR_MPM_STORES: return core_perf.stores & 0xffffffff;
case VX_CSR_MPM_STORES_H: return core_perf.stores >> 32;
case VX_CSR_MPM_IFETCH_LT: return core_perf.ifetch_latency & 0xffffffff;
case VX_CSR_MPM_IFETCH_LT_H: return core_perf.ifetch_latency >> 32;
case VX_CSR_MPM_LOAD_LT: return core_perf.load_latency & 0xffffffff;
case VX_CSR_MPM_LOAD_LT_H: return core_perf.load_latency >> 32;
}
} break;
// memory hierarchy counters: icache/dcache from the socket, L2 from the
// cluster, L3 and memory from the processor, local memory from this core
case VX_DCR_MPM_CLASS_MEM: {
auto proc_perf = core_->socket()->cluster()->processor()->perf_stats();
auto cluster_perf = core_->socket()->cluster()->perf_stats();
auto socket_perf = core_->socket()->perf_stats();
auto lmem_perf = core_->local_mem()->perf_stats();
switch (addr) {
case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_BANK_ST_H:return cluster_perf.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MSHR_ST_H:return cluster_perf.l2cache.mshr_stalls >> 32;
case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
case VX_CSR_MPM_L3CACHE_WRITES: return proc_perf.l3cache.writes & 0xffffffff;
case VX_CSR_MPM_L3CACHE_WRITES_H: return proc_perf.l3cache.writes >> 32;
case VX_CSR_MPM_L3CACHE_MISS_R: return proc_perf.l3cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MISS_R_H: return proc_perf.l3cache.read_misses >> 32;
case VX_CSR_MPM_L3CACHE_MISS_W: return proc_perf.l3cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MISS_W_H: return proc_perf.l3cache.write_misses >> 32;
case VX_CSR_MPM_L3CACHE_BANK_ST: return proc_perf.l3cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L3CACHE_BANK_ST_H:return proc_perf.l3cache.bank_stalls >> 32;
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
case VX_CSR_MPM_MEM_LT_H : return proc_perf.mem_latency >> 32;
case VX_CSR_MPM_LMEM_READS: return lmem_perf.reads & 0xffffffff;
case VX_CSR_MPM_LMEM_READS_H: return lmem_perf.reads >> 32;
case VX_CSR_MPM_LMEM_WRITES: return lmem_perf.writes & 0xffffffff;
case VX_CSR_MPM_LMEM_WRITES_H: return lmem_perf.writes >> 32;
case VX_CSR_MPM_LMEM_BANK_ST: return lmem_perf.bank_stalls & 0xffffffff;
case VX_CSR_MPM_LMEM_BANK_ST_H: return lmem_perf.bank_stalls >> 32;
}
} break;
default: {
std::cout << std::dec << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
std::abort();
} break;
}
} else {
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
std::abort();
}
}
// reached for MPM addresses that the selected class does not define
return 0;
}
// Writes `value` to CSR `addr` on behalf of warp `wid` (`tid` is unused).
//
// Only the floating-point CSRs have an effect; they update the warp's fcsr
// byte (flags in the low 5 bits, rounding mode in the bits above them).
// Writes to the listed machine/supervisor CSRs are silently ignored.
// Any other address prints an error and aborts the process.
void Emulator::set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid) {
  __unused (tid);
  switch (addr) {
  case VX_CSR_FFLAGS: {
    // replace the flag bits, keep the rounding mode
    auto& fcsr = warps_.at(wid).fcsr;
    fcsr = (fcsr & ~0x1F) | (value & 0x1F);
  } break;
  case VX_CSR_FRM: {
    // replace the rounding mode, keep the flag bits
    auto& fcsr = warps_.at(wid).fcsr;
    fcsr = (fcsr & ~0xE0) | (value << 5);
  } break;
  case VX_CSR_FCSR: {
    auto& fcsr = warps_.at(wid).fcsr;
    fcsr = value & 0xff;
  } break;
  // accepted but not modeled: the write is dropped
  case VX_CSR_SATP:
  case VX_CSR_MSTATUS:
  case VX_CSR_MEDELEG:
  case VX_CSR_MIDELEG:
  case VX_CSR_MIE:
  case VX_CSR_MTVEC:
  case VX_CSR_MEPC:
  case VX_CSR_PMPCFG0:
  case VX_CSR_PMPADDR0:
  case VX_CSR_MNSTATUS:
    break;
  default: {
    std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
    std::abort();
  }
  }
}
// Resolves the rounding mode for a floating-point instruction: a func3 of
// 0x7 selects the mode stored in the FRM CSR, any other value is used as-is.
uint32_t Emulator::get_fpu_rm(uint32_t func3, uint32_t tid, uint32_t wid) {
  if (func3 != 0x7) {
    return func3;
  }
  return this->get_csr(VX_CSR_FRM, tid, wid);
}
// ORs `fflags` into the FCSR and then into the FFLAGS CSR of warp `wid`.
// Does nothing when `fflags` is zero.
void Emulator::update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid) {
  if (fflags == 0) {
    return;
  }

  const uint32_t cur_fcsr = this->get_csr(VX_CSR_FCSR, tid, wid);
  this->set_csr(VX_CSR_FCSR, cur_fcsr | fflags, tid, wid);

  // read FFLAGS only after FCSR has been updated, as both map to the same byte
  const uint32_t cur_fflags = this->get_csr(VX_CSR_FFLAGS, tid, wid);
  this->set_csr(VX_CSR_FFLAGS, cur_fflags | fflags, tid, wid);
}
void Emulator::trigger_ecall() {
active_warps_.reset();
}
void Emulator::trigger_ebreak() {
active_warps_.reset();
}

126
sim/simx/emulator.h Normal file
View file

@ -0,0 +1,126 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Include guard renamed to match this file: the previous `__WARP_H` was
// inherited from warp.h and used a reserved (double-underscore) identifier.
#ifndef VORTEX_SIMX_EMULATOR_H
#define VORTEX_SIMX_EMULATOR_H

#include <cstdint>
#include <memory>
#include <sstream>
#include <stack>
#include <unordered_map>
#include <vector>
#include <mem.h>
#include "types.h"

namespace vortex {

class Arch;
class DCRS;
class Core;
class Instr;
class instr_trace_t;

// Functional (instruction-level) emulation state attached to a Core:
// per-warp registers, PC and thread masks, warp scheduling masks, barriers,
// buffered console output and the memory unit used for loads/stores.
class Emulator {
public:
  // Keeps references to `arch` and `dcrs` and a pointer to `core`; all three
  // must outlive the emulator.
  Emulator(const Arch &arch,
           const DCRS &dcrs,
           Core* core);

  ~Emulator();

  // NOTE(review): presumably resets the emulator to its initial state -
  // the definition is not visible from this header.
  void clear();

  // NOTE(review): presumably binds `ram` to the internal memory unit -
  // the definition is not visible from this header.
  void attach_ram(RAM* ram);

  // NOTE(review): presumably executes one instruction and returns its trace;
  // ownership of the returned pointer is not visible from this header.
  instr_trace_t* step();

  // True while at least one warp is marked active.
  bool running() const;

  // Marks warp `wid` as stalled; it must not already be stalled.
  void suspend(uint32_t wid);

  // Clears the stalled flag of warp `wid`; 0xffffffff clears all warps.
  void resume(uint32_t wid);

  // Records warp `wid` arriving at barrier `bar_id` (bit 31 = global barrier).
  void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);

  // Integer register 3 of thread 0 in warp 0.
  int get_exitcode() const;

private:

  // One entry of a warp's IPDOM (divergence) stack.
  struct ipdom_entry_t {
    ipdom_entry_t(const ThreadMask &tmask, Word PC);
    ipdom_entry_t(const ThreadMask &tmask);

    ThreadMask tmask;  // thread mask to restore when the entry is popped
    Word PC;           // PC to jump to when `fallthrough` is false
    bool fallthrough;  // when true, execution continues at the next PC
  };

  // Architectural state of a single warp.
  struct warp_t {
    warp_t(const Arch& arch);
    void clear(const Arch& arch, const DCRS &dcrs);

    Word PC;
    ThreadMask tmask;
    std::vector<std::vector<Word>> ireg_file;      // indexed [thread][register]
    std::vector<std::vector<uint64_t>> freg_file;  // indexed [thread][register]
    std::stack<ipdom_entry_t> ipdom_stack;
    Byte fcsr;                                     // flags in bits 4:0, rounding mode above
    UUIDGenerator uui_gen;
  };

  std::shared_ptr<Instr> decode(uint32_t code) const;

  void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace);

  void wspawn(uint32_t num_warps, Word nextPC);

  void icache_read(void* data, uint64_t addr, uint32_t size);

  void dcache_read(void* data, uint64_t addr, uint32_t size);

  void dcache_write(const void* data, uint64_t addr, uint32_t size);

  void dcache_amo_reserve(uint64_t addr);

  bool dcache_amo_check(uint64_t addr);

  void writeToStdOut(const void* data, uint64_t addr, uint32_t size);

  void cout_flush();

  uint32_t get_csr(uint32_t addr, uint32_t tid, uint32_t wid);

  void set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid);

  uint32_t get_fpu_rm(uint32_t func3, uint32_t tid, uint32_t wid);

  void update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid);

  void trigger_ecall();

  void trigger_ebreak();

  const Arch& arch_;
  const DCRS& dcrs_;
  Core* core_;
  std::vector<warp_t> warps_;
  WarpMask active_warps_;            // warps that still have work to do
  WarpMask stalled_warps_;           // warps currently suspended
  std::vector<WarpMask> barriers_;   // per-barrier mask of warps that have arrived
  std::unordered_map<int, std::stringstream> print_bufs_;  // pending console output per stream id
  MemoryUnit mmu_;
};

}

#endif

View file

@ -22,7 +22,7 @@
#include <assert.h>
#include <util.h>
#include <rvfloats.h>
#include "warp.h"
#include "emulator.h"
#include "instr.h"
#include "core.h"
@ -40,17 +40,6 @@ union reg_data_t {
int64_t i64;
};
inline uint32_t get_fpu_rm(uint32_t func3, Core* core, uint32_t tid, uint32_t wid) {
return (func3 == 0x7) ? core->get_csr(VX_CSR_FRM, tid, wid) : func3;
}
inline void update_fcrs(uint32_t fflags, Core* core, uint32_t tid, uint32_t wid) {
if (fflags) {
core->set_csr(VX_CSR_FCSR, core->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid);
core->set_csr(VX_CSR_FFLAGS, core->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid);
}
}
inline uint64_t nan_box(uint32_t value) {
uint64_t mask = 0xffffffff00000000;
return value | mask;
@ -66,11 +55,20 @@ inline int64_t check_boxing(int64_t a) {
return nan_box(0x7fc00000); // NaN
}
void Warp::execute(const Instr &instr, instr_trace_t *trace) {
assert(tmask_.any());
void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
auto& warp = warps_.at(wid);
assert(warp.tmask.any());
auto next_pc = PC_ + 4;
auto next_tmask = tmask_;
// initialize instruction trace
trace->cid = core_->id();
trace->wid = wid;
trace->PC = warp.PC;
trace->tmask = warp.tmask;
trace->rdest = instr.getRDest();
trace->rdest_type = instr.getRDType();
auto next_pc = warp.PC + 4;
auto next_tmask = warp.tmask;
auto opcode = instr.getOpcode();
auto func2 = instr.getFunc2();
@ -86,7 +84,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
uint32_t thread_start = 0;
for (; thread_start < num_threads; ++thread_start) {
if (tmask_.test(thread_start))
if (warp.tmask.test(thread_start))
break;
}
@ -103,11 +101,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
if (!warp.tmask.test(t)) {
DPN(2, "-");
continue;
}
rsdata[t][i].u = ireg_file_.at(t)[reg];
rsdata[t][i].u = warp.ireg_file.at(t)[reg];
DPN(2, "0x" << std::hex << rsdata[t][i].i);
}
DPN(2, "}" << std::endl);
@ -116,11 +114,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
if (!warp.tmask.test(t)) {
DPN(2, "-");
continue;
}
rsdata[t][i].u64 = freg_file_.at(t)[reg];
rsdata[t][i].u64 = warp.freg_file.at(t)[reg];
DPN(2, "0x" << std::hex << rsdata[t][i].f);
}
DPN(2, "}" << std::endl);
@ -139,7 +137,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
rddata[t].i = immsrc;
}
@ -151,9 +149,9 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
rddata[t].i = immsrc + PC_;
rddata[t].i = immsrc + warp.PC;
}
rd_write = true;
break;
@ -164,7 +162,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
if (func7 & 0x1) {
switch (func3) {
@ -324,7 +322,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
switch (func3) {
case 0: {
@ -385,7 +383,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
if (func7 & 0x1) {
switch (func3) {
@ -511,7 +509,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
switch (func3) {
case 0: {
@ -555,48 +553,48 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
switch (func3) {
case 0: {
// RV32I: BEQ
if (rsdata[t][0].i == rsdata[t][1].i) {
next_pc = PC_ + immsrc;
next_pc = warp.PC + immsrc;
}
break;
}
case 1: {
// RV32I: BNE
if (rsdata[t][0].i != rsdata[t][1].i) {
next_pc = PC_ + immsrc;
next_pc = warp.PC + immsrc;
}
break;
}
case 4: {
// RV32I: BLT
if (rsdata[t][0].i < rsdata[t][1].i) {
next_pc = PC_ + immsrc;
next_pc = warp.PC + immsrc;
}
break;
}
case 5: {
// RV32I: BGE
if (rsdata[t][0].i >= rsdata[t][1].i) {
next_pc = PC_ + immsrc;
next_pc = warp.PC + immsrc;
}
break;
}
case 6: {
// RV32I: BLTU
if (rsdata[t][0].u < rsdata[t][1].u) {
next_pc = PC_ + immsrc;
next_pc = warp.PC + immsrc;
}
break;
}
case 7: {
// RV32I: BGEU
if (rsdata[t][0].u >= rsdata[t][1].u) {
next_pc = PC_ + immsrc;
next_pc = warp.PC + immsrc;
}
break;
}
@ -613,11 +611,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::BRANCH;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
rddata[t].i = next_pc;
}
next_pc = PC_ + immsrc;
next_pc = warp.PC + immsrc;
trace->fetch_stall = true;
rd_write = true;
break;
@ -628,7 +626,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->alu_type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
rddata[t].i = next_pc;
}
@ -647,11 +645,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t read_data = 0;
core_->dcache_read(&read_data, mem_addr, data_bytes);
this->dcache_read(&read_data, mem_addr, data_bytes);
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
switch (func3) {
case 0: // RV32I: LB
@ -691,7 +689,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->data = trace_data;
uint32_t data_bytes = 1 << (func3 & 0x3);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t write_data = rsdata[t][1].u64;
@ -701,7 +699,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
case 1:
case 2:
case 3:
core_->dcache_write(&write_data, mem_addr, data_bytes);
this->dcache_write(&write_data, mem_addr, data_bytes);
break;
default:
std::abort();
@ -720,26 +718,26 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].u;
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
if (amo_type == 0x02) { // LR
uint64_t read_data = 0;
core_->dcache_read(&read_data, mem_addr, data_bytes);
core_->dcache_amo_reserve(mem_addr);
this->dcache_read(&read_data, mem_addr, data_bytes);
this->dcache_amo_reserve(mem_addr);
rddata[t].i = sext((Word)read_data, data_width);
} else
if (amo_type == 0x03) { // SC
if (core_->dcache_amo_check(mem_addr)) {
core_->dcache_write(&rsdata[t][1].u64, mem_addr, data_bytes);
if (this->dcache_amo_check(mem_addr)) {
this->dcache_write(&rsdata[t][1].u64, mem_addr, data_bytes);
rddata[t].i = 0;
} else {
rddata[t].i = 1;
}
} else {
uint64_t read_data = 0;
core_->dcache_read(&read_data, mem_addr, data_bytes);
this->dcache_read(&read_data, mem_addr, data_bytes);
auto read_data_i = sext((WordI)read_data, data_width);
auto rs1_data_i = sext((WordI)rsdata[t][1].u64, data_width);
auto read_data_u = zext((Word)read_data, data_width);
@ -776,7 +774,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
default:
std::abort();
}
core_->dcache_write(&result, mem_addr, data_bytes);
this->dcache_write(&result, mem_addr, data_bytes);
rddata[t].i = read_data_i;
}
}
@ -785,7 +783,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
}
case Opcode::SYS: {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
uint32_t csr_addr = immsrc;
uint32_t csr_value;
@ -796,11 +794,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
switch (csr_addr) {
case 0:
// RV32I: ECALL
core_->trigger_ecall();
this->trigger_ecall();
break;
case 1:
// RV32I: EBREAK
core_->trigger_ebreak();
this->trigger_ebreak();
break;
case 0x002: // URET
case 0x102: // SRET
@ -812,12 +810,12 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
} else {
trace->fu_type = FUType::SFU;
trace->fetch_stall = true;
csr_value = core_->get_csr(csr_addr, t, warp_id_);
csr_value = this->get_csr(csr_addr, t, wid);
switch (func3) {
case 1: {
// RV32I: CSRRW
rddata[t].i = csr_value;
core_->set_csr(csr_addr, rsdata[t][0].i, t, warp_id_);
this->set_csr(csr_addr, rsdata[t][0].i, t, wid);
trace->used_iregs.set(rsrc0);
trace->sfu_type = SfuType::CSRRW;
rd_write = true;
@ -827,7 +825,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
// RV32I: CSRRS
rddata[t].i = csr_value;
if (rsdata[t][0].i != 0) {
core_->set_csr(csr_addr, csr_value | rsdata[t][0].i, t, warp_id_);
this->set_csr(csr_addr, csr_value | rsdata[t][0].i, t, wid);
}
trace->used_iregs.set(rsrc0);
trace->sfu_type = SfuType::CSRRS;
@ -838,7 +836,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
// RV32I: CSRRC
rddata[t].i = csr_value;
if (rsdata[t][0].i != 0) {
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0].i, t, warp_id_);
this->set_csr(csr_addr, csr_value & ~rsdata[t][0].i, t, wid);
}
trace->used_iregs.set(rsrc0);
trace->sfu_type = SfuType::CSRRC;
@ -848,7 +846,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
case 5: {
// RV32I: CSRRWI
rddata[t].i = csr_value;
core_->set_csr(csr_addr, rsrc0, t, warp_id_);
this->set_csr(csr_addr, rsrc0, t, wid);
trace->sfu_type = SfuType::CSRRW;
rd_write = true;
break;
@ -857,7 +855,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
// RV32I: CSRRSI;
rddata[t].i = csr_value;
if (rsrc0 != 0) {
core_->set_csr(csr_addr, csr_value | rsrc0, t, warp_id_);
this->set_csr(csr_addr, csr_value | rsrc0, t, wid);
}
trace->sfu_type = SfuType::CSRRS;
rd_write = true;
@ -867,7 +865,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
// RV32I: CSRRCI
rddata[t].i = csr_value;
if (rsrc0 != 0) {
core_->set_csr(csr_addr, csr_value & ~rsrc0, t, warp_id_);
this->set_csr(csr_addr, csr_value & ~rsrc0, t, wid);
}
trace->sfu_type = SfuType::CSRRC;
rd_write = true;
@ -889,9 +887,9 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
case Opcode::FCI: {
trace->fu_type = FUType::FPU;
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
uint32_t frm = this->get_fpu_rm(func3, t, wid);
uint32_t fflags = 0;
switch (func7) {
case 0x00: { // RV32F: FADD.S
@ -1206,7 +1204,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
break;
}
}
update_fcrs(fflags, core_, t, warp_id_);
this->update_fcrs(fflags, t, wid);
}
rd_write = true;
break;
@ -1220,9 +1218,9 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->used_fregs.set(rsrc1);
trace->used_fregs.set(rsrc2);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
uint32_t frm = this->get_fpu_rm(func3, t, wid);
uint32_t fflags = 0;
switch (opcode) {
case Opcode::FMADD:
@ -1260,7 +1258,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
default:
break;
}
update_fcrs(fflags, core_, t, warp_id_);
this->update_fcrs(fflags, t, wid);
}
rd_write = true;
break;
@ -1287,7 +1285,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
core_->wspawn(rsdata.at(thread_start)[0].i, rsdata.at(thread_start)[1].i);
this->wspawn(rsdata.at(thread_start)[0].i, rsdata.at(thread_start)[1].i);
} break;
case 2: {
// SPLIT
@ -1298,23 +1296,23 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
ThreadMask then_tmask, else_tmask;
for (uint32_t t = 0; t < num_threads; ++t) {
auto cond = ireg_file_.at(t).at(rsrc0);
then_tmask[t] = tmask_.test(t) && cond;
else_tmask[t] = tmask_.test(t) && !cond;
auto cond = warp.ireg_file.at(t).at(rsrc0);
then_tmask[t] = warp.tmask.test(t) && cond;
else_tmask[t] = warp.tmask.test(t) && !cond;
}
bool is_divergent = then_tmask.any() && else_tmask.any();
if (is_divergent) {
if (ipdom_stack_.size() == arch_.ipdom_size()) {
std::cout << "IPDOM stack is full! size=" << std::dec << ipdom_stack_.size() << ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")\n" << std::dec << std::flush;
if (warp.ipdom_stack.size() == arch_.ipdom_size()) {
std::cout << "IPDOM stack is full! size=" << std::dec << warp.ipdom_stack.size() << ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::dec << std::flush;
std::abort();
}
// set new thread mask
next_tmask = then_tmask;
// push reconvergence thread mask onto the stack
ipdom_stack_.emplace(tmask_);
warp.ipdom_stack.emplace(warp.tmask);
// push else's thread mask onto the stack
ipdom_stack_.emplace(else_tmask, next_pc);
warp.ipdom_stack.emplace(else_tmask, next_pc);
}
// return divergent state
for (uint32_t t = thread_start; t < num_threads; ++t) {
@ -1329,17 +1327,17 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
int is_divergent = ireg_file_.at(thread_start).at(rsrc0);
int is_divergent = warp.ireg_file.at(thread_start).at(rsrc0);
if (is_divergent != 0) {
if (ipdom_stack_.empty()) {
if (warp.ipdom_stack.empty()) {
std::cout << "IPDOM stack is empty!\n" << std::flush;
std::abort();
}
next_tmask = ipdom_stack_.top().tmask;
if (!ipdom_stack_.top().fallthrough) {
next_pc = ipdom_stack_.top().PC;
next_tmask = warp.ipdom_stack.top().tmask;
if (!warp.ipdom_stack.top().fallthrough) {
next_pc = warp.ipdom_stack.top().PC;
}
ipdom_stack_.pop();
warp.ipdom_stack.pop();
}
} break;
case 4: {
@ -1360,12 +1358,12 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->fetch_stall = true;
ThreadMask pred;
for (uint32_t t = 0; t < num_threads; ++t) {
pred[t] = tmask_.test(t) && (ireg_file_.at(t).at(rsrc0) & 0x1);
pred[t] = warp.tmask.test(t) && (warp.ireg_file.at(t).at(rsrc0) & 0x1);
}
if (pred.any()) {
next_tmask &= pred;
} else {
next_tmask = ireg_file_.at(thread_start).at(rsrc1);
next_tmask = warp.ireg_file.at(thread_start).at(rsrc1);
}
} break;
default:
@ -1387,7 +1385,7 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
if (!warp.tmask.test(t))
continue;
rddata[t].i = rsdata[t][0].i ? rsdata[t][1].i : rsdata[t][2].i;
}
@ -1414,11 +1412,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
DPH(2, "Dest Reg: " << type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
if (!warp.tmask.test(t)) {
DPN(2, "-");
continue;
}
ireg_file_.at(t)[rdest] = rddata[t].i;
warp.ireg_file.at(t)[rdest] = rddata[t].i;
DPN(2, "0x" << std::hex << rddata[t].i);
}
DPN(2, "}" << std::endl);
@ -1433,11 +1431,11 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
DPH(2, "Dest Reg: " << type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
if (!warp.tmask.test(t)) {
DPN(2, "-");
continue;
}
freg_file_.at(t)[rdest] = rddata[t].u64;
warp.freg_file.at(t)[rdest] = rddata[t].u64;
DPN(2, "0x" << std::hex << rddata[t].f);
}
DPN(2, "}" << std::endl);
@ -1449,19 +1447,21 @@ void Warp::execute(const Instr &instr, instr_trace_t *trace) {
}
}
PC_ += 4;
if (PC_ != next_pc) {
warp.PC += 4;
if (warp.PC != next_pc) {
DP(3, "*** Next PC=0x" << std::hex << next_pc << std::dec);
PC_ = next_pc;
warp.PC = next_pc;
}
if (tmask_ != next_tmask) {
if (warp.tmask != next_tmask) {
DPH(3, "*** New Tmask=");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, next_tmask.test(i));
DPN(3, std::endl);
tmask_ = next_tmask;
warp.tmask = next_tmask;
if (!next_tmask.any()) {
core_->active_warps_.reset(warp_id_);
active_warps_.reset(wid);
}
}
}

View file

@ -11,7 +11,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "exe_unit.h"
#include "func_unit.h"
#include <iostream>
#include <iomanip>
#include <string.h>
@ -24,7 +24,7 @@
using namespace vortex;
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
AluUnit::AluUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "ALU") {}
void AluUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
@ -48,8 +48,7 @@ void AluUnit::tick() {
}
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
core_->resume(trace->wid);
}
input.pop();
}
@ -57,7 +56,7 @@ void AluUnit::tick() {
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "FPU") {}
void FpuUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
@ -93,7 +92,7 @@ void FpuUnit::tick() {
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
: FuncUnit(ctx, core, "LSU")
, pending_rd_reqs_(LSUQ_IN_SIZE)
, num_lanes_(NUM_LSU_LANES)
, pending_loads_(0)
@ -230,7 +229,7 @@ void LsuUnit::tick() {
auto& dcache_req_port = core_->lmem_demuxs_.at(t)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t + t0);
auto type = core_->get_addr_type(mem_addr.addr);
auto type = get_addr_type(mem_addr.addr);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
@ -271,7 +270,7 @@ void LsuUnit::tick() {
///////////////////////////////////////////////////////////////////////////////
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "SFU")
: FuncUnit(ctx, core, "SFU")
, input_idx_(0)
{}
@ -315,8 +314,7 @@ void SfuUnit::tick() {
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
assert(core_->stalled_warps_.test(trace->wid));
core_->stalled_warps_.reset(trace->wid);
core_->resume(trace->wid);
}
input.pop();

View file

@ -20,19 +20,19 @@ namespace vortex {
class Core;
class ExeUnit : public SimObject<ExeUnit> {
class FuncUnit : public SimObject<FuncUnit> {
public:
std::vector<SimPort<instr_trace_t*>> Inputs;
std::vector<SimPort<instr_trace_t*>> Outputs;
ExeUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<ExeUnit>(ctx, name)
FuncUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<FuncUnit>(ctx, name)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
virtual ~ExeUnit() {}
virtual ~FuncUnit() {}
virtual void reset() {}
@ -44,7 +44,7 @@ protected:
///////////////////////////////////////////////////////////////////////////////
class AluUnit : public ExeUnit {
class AluUnit : public FuncUnit {
public:
AluUnit(const SimContext& ctx, Core*);
@ -53,7 +53,7 @@ public:
///////////////////////////////////////////////////////////////////////////////
class FpuUnit : public ExeUnit {
class FpuUnit : public FuncUnit {
public:
FpuUnit(const SimContext& ctx, Core*);
@ -62,7 +62,7 @@ public:
///////////////////////////////////////////////////////////////////////////////
class LsuUnit : public ExeUnit {
class LsuUnit : public FuncUnit {
public:
LsuUnit(const SimContext& ctx, Core*);
@ -85,7 +85,7 @@ private:
///////////////////////////////////////////////////////////////////////////////
class SfuUnit : public ExeUnit {
class SfuUnit : public FuncUnit {
public:
SfuUnit(const SimContext& ctx, Core*);

View file

@ -39,7 +39,7 @@ public:
Impl(LocalMem* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, ram_(config.capacity, config.capacity)
, ram_(config.capacity)
, bank_sel_addr_start_(0)
, bank_sel_addr_end_(0 + log2ceil(config.num_banks)-1)
{}

View file

@ -24,6 +24,7 @@
#include "constants.h"
#include <util.h>
#include "core.h"
#include "VX_types.h"
using namespace vortex;
@ -87,7 +88,7 @@ int main(int argc, char **argv) {
Arch arch(num_threads, num_warps, num_cores);
// create memory module
RAM ram(RAM_PAGE_SIZE);
RAM ram(0, RAM_PAGE_SIZE);
// create processor
Processor processor(arch);
@ -117,7 +118,10 @@ int main(int argc, char **argv) {
}
// run simulation
exitcode = processor.run(riscv_test);
exitcode = processor.run();
if (riscv_test) {
exitcode = (1 - exitcode);
}
}
if (exitcode != 0) {

View file

@ -14,7 +14,6 @@
#pragma once
#include "instr_trace.h"
#include <queue>
namespace vortex {

View file

@ -15,6 +15,7 @@
#pragma once
#include "instr_trace.h"
#include <queue>
namespace vortex {

View file

@ -83,24 +83,21 @@ void ProcessorImpl::attach_ram(RAM* ram) {
}
}
int ProcessorImpl::run(bool riscv_test) {
int ProcessorImpl::run() {
SimPlatform::instance().reset();
this->reset();
bool done;
Word exitcode = 0;
int exitcode = 0;
do {
SimPlatform::instance().tick();
done = true;
for (auto cluster : clusters_) {
if (cluster->running()) {
Word ec;
if (cluster->check_exit(&ec, riscv_test)) {
exitcode |= ec;
} else {
done = false;
}
done = false;
continue;
}
exitcode |= cluster->get_exitcode();
}
perf_mem_latency_ += perf_mem_pending_reads_;
} while (!done);
@ -143,8 +140,8 @@ void Processor::attach_ram(RAM* mem) {
impl_->attach_ram(mem);
}
int Processor::run(bool riscv_test) {
return impl_->run(riscv_test);
int Processor::run() {
return impl_->run();
}
void Processor::write_dcr(uint32_t addr, uint32_t value) {

View file

@ -28,7 +28,7 @@ public:
void attach_ram(RAM* mem);
int run(bool riscv_test);
int run();
void write_dcr(uint32_t addr, uint32_t value);

View file

@ -35,7 +35,7 @@ public:
void attach_ram(RAM* mem);
int run(bool riscv_test);
int run();
void write_dcr(uint32_t addr, uint32_t value);

View file

@ -14,7 +14,8 @@
#pragma once
#include "instr_trace.h"
#include <queue>
#include <unordered_map>
#include <vector>
namespace vortex {

Binary file not shown.

View file

@ -118,19 +118,12 @@ bool Socket::running() const {
return false;
}
bool Socket::check_exit(Word* exitcode, bool riscv_test) const {
bool done = true;
Word exitcode_ = 0;
int Socket::get_exitcode() const {
int exitcode = 0;
for (auto& core : cores_) {
Word ec;
if (core->check_exit(&ec, riscv_test)) {
exitcode_ |= ec;
} else {
done = false;
}
exitcode |= core->get_exitcode();
}
*exitcode = exitcode_;
return done;
return exitcode;
}
void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
@ -138,7 +131,7 @@ void Socket::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
}
void Socket::resume(uint32_t core_index) {
cores_.at(core_index)->resume();
cores_.at(core_index)->resume(-1);
}
Socket::PerfStats Socket::perf_stats() const {

View file

@ -62,7 +62,7 @@ public:
bool running() const;
bool check_exit(Word* exitcode, bool riscv_test) const;
int get_exitcode() const;
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);

View file

@ -16,6 +16,7 @@
#include <stdint.h>
#include <bitset>
#include <queue>
#include <vector>
#include <unordered_map>
#include <util.h>
#include <stringutil.h>
@ -53,8 +54,6 @@ typedef std::bitset<MAX_NUM_REGS> RegMask;
typedef std::bitset<MAX_NUM_THREADS> ThreadMask;
typedef std::bitset<MAX_NUM_WARPS> WarpMask;
typedef std::unordered_map<uint32_t, uint32_t> CSRs;
///////////////////////////////////////////////////////////////////////////////
enum class RegType {
@ -142,6 +141,18 @@ enum class AddrType {
IO
};
/// Classifies a memory address by the region it falls into.
///
/// When LMEM_ENABLED is non-zero, addresses inside the window
/// [LMEM_BASE_ADDR, LMEM_BASE_ADDR + 2^LMEM_LOG_SIZE) are reported as
/// AddrType::Shared. Otherwise any address at or above IO_BASE_ADDR is
/// AddrType::IO, and everything else is AddrType::Global.
///
/// @param addr address to classify.
/// @return the AddrType of the region containing `addr`.
inline AddrType get_addr_type(uint64_t addr) {
  if (LMEM_ENABLED) {
    // Compute the window size in 64 bits: a plain `1 << LMEM_LOG_SIZE` is an
    // int-width shift and overflows once LMEM_LOG_SIZE reaches 31.
    const uint64_t lmem_size = uint64_t(1) << LMEM_LOG_SIZE;
    if (addr >= LMEM_BASE_ADDR && addr < (LMEM_BASE_ADDR + lmem_size)) {
      return AddrType::Shared;
    }
  }
  if (addr >= IO_BASE_ADDR) {
    return AddrType::IO;
  }
  return AddrType::Global;
}
inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
switch (type) {
case AddrType::Global: os << "Global"; break;

View file

@ -1,112 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <assert.h>
#include <util.h>
#include "instr.h"
#include "core.h"
using namespace vortex;
Warp::Warp(Core *core, uint32_t warp_id)
: warp_id_(warp_id)
, arch_(core->arch())
, core_(core)
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<uint64_t>(core->arch().num_regs()))
{
this->reset();
}
void Warp::reset() {
PC_ = core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
#if (XLEN == 64)
PC_ = (uint64_t(core_->dcrs().base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | PC_;
#endif
tmask_.reset();
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) {
for (auto& reg : ireg_file_.at(i)) {
reg = 0;
}
for (auto& reg : freg_file_.at(i)) {
reg = 0;
}
}
uui_gen_.reset();
}
// Fetches the 32-bit instruction word at the warp's current PC, decodes it,
// executes it through execute(), and returns a trace record describing it.
//
// Preconditions: at least one thread must be active (asserted on tmask_).
// Failure mode: if the fetched word does not decode, an error is printed to
// std::cout and the process is terminated with std::abort().
// Returns: a pointer to an instr_trace_t allocated with `new`.
// NOTE(review): the caller presumably takes ownership of the returned trace
// and is responsible for deleting it -- confirm against the call sites.
instr_trace_t* Warp::eval() {
assert(tmask_.any());
// Debug builds only: build a 64-bit trace id from the generator's 32-bit
// value for this PC. The low 16 bits become instr_id, the high 16 bits
// become instr_ref; g_wid is core_->id() * arch_.num_warps() + warp_id_.
// Layout: instr_ref from bit 32 up, g_wid shifted left by 16, instr_id low.
#ifndef NDEBUG
uint32_t instr_uuid = uui_gen_.get_uuid(PC_);
uint32_t g_wid = core_->id() * arch_.num_warps() + warp_id_;
uint32_t instr_id = instr_uuid & 0xffff;
uint32_t instr_ref = instr_uuid >> 16;
uint64_t uuid = (uint64_t(instr_ref) << 32) | (g_wid << 16) | instr_id;
#else
// With NDEBUG defined every trace carries uuid 0.
uint64_t uuid = 0;
#endif
// Log the fetch: core id, warp id, one digit per thread of the mask, then PC.
// NOTE(review): the first argument of DP/DPH/DPN looks like a verbosity
// level -- confirm against the macro definitions.
DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
DPN(1, tmask_.test(i));
DPN(1, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << uuid << ")" << std::endl);
// Fetch: read sizeof(uint32_t) bytes at PC_ through the core's icache_read.
uint32_t instr_code = 0;
core_->icache_read(&instr_code, PC_, sizeof(uint32_t));
// Decode: a result that tests false means the word is not a valid
// instruction, which is treated as fatal.
auto instr = core_->decoder_.decode(instr_code);
if (!instr) {
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=0x" << PC_ << " (#" << std::dec << uuid << ")" << std::endl;
std::abort();
}
DP(1, "Instr 0x" << std::hex << instr_code << ": " << *instr);
// Create trace: snapshot the warp's identity, PC and thread mask as they are
// before execute() runs, plus the destination register and its type.
auto trace = new instr_trace_t(uuid, arch_);
trace->cid = core_->id();
trace->wid = warp_id_;
trace->PC = PC_;
trace->tmask = tmask_;
trace->rdest = instr->getRDest();
trace->rdest_type = instr->getRDType();
// Execute the decoded instruction, passing the trace along.
this->execute(*instr, trace);
// Dump the register files after execution: one output line per register
// index, integer values for every thread, a '|' separator, then the
// floating-point values for every thread.
DP(5, "Register state:");
for (uint32_t i = 0; i < arch_.num_regs(); ++i) {
DPN(5, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
// Integer register file (printed as XLEN/4 zero-padded hex digits)
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(XLEN/4) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(5, '|');
// Floating point register file (printed as 16 zero-padded hex digits)
for (uint32_t j = 0; j < arch_.num_threads(); ++j) {
DPN(5, ' ' << std::setfill('0') << std::setw(16) << std::hex << freg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(5, std::endl);
}
return trace;
}

View file

@ -1,107 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef __WARP_H
#define __WARP_H
#include <vector>
#include <stack>
#include "types.h"
namespace vortex {
class Arch;
class Core;
class Instr;
class instr_trace_t;
/// Element type of a warp's `ipdom_stack_`: a saved thread mask together with
/// either an explicit PC or a flag marking the entry as a fallthrough.
struct DomStackEntry {
  /// Creates an entry that carries an explicit PC (`fallthrough` is false).
  DomStackEntry(const ThreadMask &tmask, Word PC)
    : tmask(tmask)
    , PC(PC)
    , fallthrough(false)
  {}

  /// Creates a fallthrough entry (`fallthrough` is true). No PC is supplied,
  /// so it is set to 0: the entry is copied into a std::stack, and copying an
  /// indeterminate value must be avoided.
  DomStackEntry(const ThreadMask &tmask)
    : tmask(tmask)
    , PC(0)
    , fallthrough(true)
  {}

  ThreadMask tmask;   ///< thread mask saved with this entry
  Word PC;            ///< PC saved with this entry; 0 for fallthrough entries
  bool fallthrough;   ///< true when the entry was built without a PC
};
/// Plain aggregate of four unsigned 32-bit fields.
/// NOTE(review): the field names match those of the RISC-V vector `vtype`
/// CSR; presumably each member holds the corresponding decoded field --
/// confirm against the code that writes them.
struct vtype {
  uint32_t vill;   // presumably the "illegal configuration" flag -- confirm
  uint32_t vediv;  // presumably the element-divider setting -- confirm
  uint32_t vsew;   // presumably the selected element width -- confirm
  uint32_t vlmul;  // presumably the register-group multiplier -- confirm
};
/// State of one warp of a core: program counter, active-thread mask,
/// per-thread integer and floating-point register files, and the IPDOM stack.
class Warp {
public:
  /// Builds warp `warp_id` of `core` and resets it.
  Warp(Core *core, uint32_t warp_id);

  /// Reloads the startup PC, clears the thread mask and zeroes all registers.
  void reset();

  /// Index of this warp, as passed to the constructor.
  uint32_t id() const {
    return warp_id_;
  }

  /// Current program counter.
  Word getPC() const {
    return PC_;
  }

  /// Overwrites the program counter.
  void setPC(Word PC) {
    PC_ = PC;
  }

  /// Sets or clears the mask bit of thread `index`.
  void setTmask(size_t index, bool value) {
    tmask_.set(index, value);
  }

  /// Thread mask packed into an integer, bit i corresponding to thread i.
  /// Uses to_ullong() so the full 64-bit return type is honoured even where
  /// `unsigned long` is only 32 bits wide.
  uint64_t getTmask() const {
    return tmask_.to_ullong();
  }

  /// Value of integer register `reg` as seen by thread 0.
  Word getIRegValue(uint32_t reg) const {
    return ireg_file_.at(0).at(reg);
  }

  /// Fetches, decodes and executes the instruction at the current PC and
  /// returns the trace record created for it.
  instr_trace_t* eval();

private:

  /// Executes a decoded instruction, recording its effects in `trace`.
  void execute(const Instr &instr, instr_trace_t *trace);

  // Member order matters: the constructor's initializer list relies on it.
  UUIDGenerator uui_gen_;
  uint32_t warp_id_;
  const Arch& arch_;
  Core *core_;

  Word PC_;
  ThreadMask tmask_;

  // Register files indexed as [thread][register].
  std::vector<std::vector<Word>>     ireg_file_;
  std::vector<std::vector<uint64_t>> freg_file_;
  std::stack<DomStackEntry>          ipdom_stack_;

  struct vtype vtype_;
  uint32_t vl_;
};
}
#endif