simx timing simulation refactoring

This commit is contained in:
Blaise Tine 2021-11-14 08:52:34 -05:00
parent 9656779d48
commit 808bddb586
22 changed files with 1123 additions and 903 deletions

View file

@ -11,6 +11,128 @@ namespace vortex {
class SimObjectBase;

///////////////////////////////////////////////////////////////////////////////

// Untyped base class shared by every simulation port.
//
// A port records the module pointer it was constructed with and an optional
// peer port. Linking and unlinking are protected: only derived classes and
// the befriended SlavePort/MasterPort templates may change the peer.
class SimPortBase {
public:
  virtual ~SimPortBase() {}

  // Returns the module pointer passed to the constructor.
  SimObjectBase* module() const {
    return module_;
  }

  // Returns the currently linked peer, or nullptr when the port is unlinked.
  SimPortBase* peer() const {
    return peer_;
  }

  // True while a peer is linked.
  bool connected() const {
    return (peer_ != nullptr);
  }

protected:
  // Creates an unlinked port associated with `module`.
  SimPortBase(SimObjectBase* module)
    : module_(module)
    , peer_(nullptr)
  {}

  // Links this port to `peer`. The port must not already be linked.
  void connect(SimPortBase* peer) {
    assert(peer_ == nullptr);
    peer_ = peer;
  }

  // Unlinks this port from its peer. The port must currently be linked.
  // (The precondition used to be checked as `peer_ == nullptr`, which
  // rejected every legitimate unbind of a bound port.)
  void disconnect() {
    assert(peer_ != nullptr);
    peer_ = nullptr;
  }

  // Ports are not assignable.
  SimPortBase& operator=(const SimPortBase&) = delete;

  SimObjectBase* module_;
  SimPortBase*   peer_;

  // The concrete port templates call connect()/disconnect() on their peers.
  template <typename U> friend class SlavePort;
  template <typename U> friend class MasterPort;
};
///////////////////////////////////////////////////////////////////////////////
// Typed port holding at most one pending packet of type Pkt.
//
// Pkt has to be default-constructible and copy-assignable: the slot `data_`
// is default-initialised and packets are copied in by write() and out by
// read().
template <class Pkt>
class SimPort : public SimPortBase {
public:
  // Sends `pkt` after `delay`; declared here, defined out-of-line.
  void send(const Pkt& pkt, uint64_t delay) const;

  // Copies the pending packet into `*out` and empties the slot.
  // Returns false, leaving `*out` untouched, when no packet is pending.
  bool read(Pkt* out) {
    if (valid_) {
      *out = data_;
      valid_ = false;
      return true;
    }
    return false;
  }

protected:
  // Creates a port for `module` with an empty slot.
  SimPort(SimObjectBase* module) : SimPortBase(module), valid_(false) {}

  // Stores `data` in the slot; the slot must be empty.
  void write(const Pkt& data) {
    assert(!valid_);
    data_ = data;
    valid_ = true;
  }

  // Ports are not assignable.
  SimPort& operator=(const SimPort&) = delete;

  Pkt  data_;   // pending packet, meaningful only while valid_ is true
  bool valid_;  // whether data_ holds an unread packet

  // SimPortEvent needs access to the protected write().
  template <typename U> friend class SimPortEvent;
};
///////////////////////////////////////////////////////////////////////////////
// Slave-side port for packets of type Pkt.
template <class Pkt>
class SlavePort : public SimPort<Pkt> {
public:
  // Deliberately not `explicit`: containers of ports are built elsewhere
  // from a (count, module pointer) pair and rely on this conversion.
  SlavePort(SimObjectBase* module)
    : SimPort<Pkt>(module)
  {}

  // Links this port to another slave port; the port must be unlinked.
  void bind(SlavePort<Pkt>* peer) { this->connect(peer); }

  // Removes the link set up by bind().
  void unbind() { this->disconnect(); }

protected:
  // Ports are not assignable.
  SlavePort& operator=(const SlavePort&) = delete;
};
///////////////////////////////////////////////////////////////////////////////
// Master-side port for packets of type Pkt. It can be linked to either a
// slave port or another master port.
template <class Pkt>
class MasterPort : public SimPort<Pkt> {
public:
  // Deliberately not `explicit`: containers of ports are built elsewhere
  // from a (count, module pointer) pair and rely on this conversion.
  MasterPort(SimObjectBase* module)
    : SimPort<Pkt>(module)
  {}

  // Links this port to a slave port; the port must be unlinked.
  void bind(SlavePort<Pkt>* peer) { this->connect(peer); }

  // Links this port to another master port; the port must be unlinked.
  void bind(MasterPort<Pkt>* peer) { this->connect(peer); }

  // Removes the link set up by bind(). Only this side is changed.
  void unbind() { this->disconnect(); }

protected:
  // Ports are not assignable.
  MasterPort& operator=(const MasterPort&) = delete;
};
///////////////////////////////////////////////////////////////////////////////
class SimEventBase {
public:
typedef std::shared_ptr<SimEventBase> Ptr;
@ -32,16 +154,16 @@ protected:
///////////////////////////////////////////////////////////////////////////////
template <typename Pkt>
class SimSimpleEvent : public SimEventBase {
class SimCallEvent : public SimEventBase {
public:
typedef std::function<void (const Pkt&)> Func;
template <typename... Args>
static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) {
return std::make_shared<SimSimpleEvent>(func, pkt, delay);
return std::make_shared<SimCallEvent>(func, pkt, delay);
}
SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay)
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay)
: SimEventBase(delay)
, func_(func)
, pkt_(pkt)
@ -61,167 +183,23 @@ protected:
template <typename Pkt>
class SimPortEvent : public SimEventBase {
public:
typedef std::function<void (const Pkt&, uint32_t)> Func;
template <typename... Args>
static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) {
return std::make_shared<SimPortEvent>(func, pkt, port_id, delay);
static Ptr Create(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) {
return std::make_shared<SimPortEvent>(port, pkt, delay);
}
SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay)
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay)
: SimEventBase(delay)
, func_(func)
, port_(port)
, pkt_(pkt)
, port_id_(port_id)
{}
void fire() const override {
func_(pkt_, port_id_);
const_cast<SimPort<Pkt>*>(port_)->write(pkt_);
}
private:
Func func_;
Pkt pkt_;
uint32_t port_id_;
};
///////////////////////////////////////////////////////////////////////////////
class SimPortBase {
public:
typedef std::shared_ptr<SimPortBase> Ptr;
virtual ~SimPortBase() {}
SimObjectBase* module() const {
return module_;
}
uint32_t port_id() const {
return port_id_;
}
SimPortBase* peer() const {
return peer_;
}
bool connected() const {
return (peer_ != nullptr);
}
bool is_slave() const {
return is_slave_;
}
protected:
SimPortBase(SimObjectBase* module, bool is_slave);
void connect(SimPortBase* peer) {
assert(peer_ == nullptr);
peer_ = peer;
}
void disconnect() {
assert(peer_ == nullptr);
peer_ = nullptr;
}
SimObjectBase* module_;
uint32_t port_id_;
bool is_slave_;
SimPortBase* peer_;
template <typename Pkt> friend class MasterPort;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Pkt>
class SlavePort : public SimPortBase {
public:
typedef std::shared_ptr<SlavePort<Ptr>> Ptr;
typedef std::function<void (const Pkt&, uint32_t)> Func;
static Ptr Create(SimObjectBase* module, const Func& func) {
return std::make_shared<SlavePort<Pkt>>(module, func);
}
template <typename T>
static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) {
return std::make_shared<SlavePort<Pkt>>(module, obj, entry);
}
SlavePort(SimObjectBase* module, const Func& func)
: SimPortBase(module, true)
, func_(func)
{}
template <typename T>
SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t))
: SimPortBase(module, true)
, func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2))
{}
SlavePort(SimObjectBase* module, SlavePort* peer)
: SimPortBase(module, false)
{
this->connect(peer);
}
void send(const Pkt& pkt, uint64_t delay) const;
const Func& func() const {
return func_;
}
protected:
SlavePort& operator=(const SlavePort&);
Func func_;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Pkt>
class MasterPort : public SimPortBase {
public:
typedef std::shared_ptr<MasterPort<Ptr>> Ptr;
typedef std::function<void (const Pkt&, uint32_t)> Func;
static Ptr Create() {
return std::make_shared<MasterPort<Ptr>>(module);
}
MasterPort(SimObjectBase* module) : SimPortBase(module, false) {}
MasterPort(SimObjectBase* module, MasterPort* peer)
: SimPortBase(module, false)
{
peer->connect(this);
}
void bind(SlavePort<Pkt>* peer) {
this->connect(peer);
}
void unbind() {
peer_->disconnect();
this->disconnect();
}
void send(const Pkt& pkt, uint64_t delay) const {
assert(peer_ != nullptr);
if (peer_->is_slave()) {
auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
slave->send(pkt, delay);
} else {
auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
master->send(pkt, delay);
}
}
private:
MasterPort& operator=(const MasterPort&);
const SimPort<Pkt>* port_;
Pkt pkt_;
};
///////////////////////////////////////////////////////////////////////////////
@ -237,25 +215,18 @@ public:
template <typename T, typename Pkt>
void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay);
virtual void step(uint64_t cycle) = 0;
const std::string& name() const {
return name_;
}
protected:
SimObjectBase(const SimContext& ctx, const char* name);
virtual void step(uint64_t cycle) = 0;
uint32_t allocate_port(SimPortBase* port) {
uint32_t id = ports_.size();
ports_.push_back(port);
return id;
}
SimObjectBase(const SimContext& ctx, const char* name);
private:
std::string name_;
std::vector<SimPortBase*> ports_;
friend class SimPlatform;
friend class SimPortBase;
@ -320,20 +291,19 @@ public:
}
template <typename Pkt>
void schedule(const typename SimSimpleEvent<Pkt>::Func& callback,
void schedule(const typename SimCallEvent<Pkt>::Func& callback,
const Pkt& pkt,
uint64_t delay) {
auto evt = SimSimpleEvent<Pkt>::Create(callback, pkt, delay);
auto evt = SimCallEvent<Pkt>::Create(callback, pkt, delay);
assert(delay != 0);
events_.emplace_back(evt);
}
template <typename Pkt>
void schedule(const typename SimPortEvent<Pkt>::Func& callback,
void schedule(const SimPort<Pkt>* port,
const Pkt& pkt,
uint32_t port_id,
uint64_t delay) {
auto evt = SimPortEvent<Pkt>::Create(callback, pkt, port_id, delay);
auto evt = SimPortEvent<Pkt>::Create(port, pkt, delay);
assert(delay != 0);
events_.emplace_back(evt);
}
@ -383,13 +353,6 @@ private:
///////////////////////////////////////////////////////////////////////////////
inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave)
: module_(module)
, port_id_(module->allocate_port(this))
, is_slave_(is_slave)
, peer_(nullptr)
{}
// Stores the object's name. The simulation context argument is accepted but
// not used by this constructor.
inline SimObjectBase::SimObjectBase(const SimContext& /*ctx*/, const char* name)
  : name_(name) {
}
@ -403,18 +366,11 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
}
template <typename Pkt>
void SlavePort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
if (func_) {
SimPlatform::instance().schedule(func_, pkt, port_id_, delay);
void SimPort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
if (peer_) {
reinterpret_cast<const SimPort<Pkt>*>(peer_)->send(pkt, delay);
} else {
assert(peer_ != nullptr);
if (peer_->is_slave()) {
auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
slave->send(pkt, delay);
} else {
auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
master->send(pkt, delay);
}
SimPlatform::instance().schedule(this, pkt, delay);
}
}

View file

@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
TOP = vx_cache_sim
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
VPATH := $(sort $(dir $(SRCS)))

View file

@ -1,5 +1,6 @@
#include "cache.h"
#include "debug.h"
#include "types.h"
#include <util.h>
#include <unordered_map>
#include <vector>
@ -30,8 +31,7 @@ struct params_t {
uint32_t offset_bits = config.B - config.W;
uint32_t log2_bank_size = config.C - bank_bits;
uint32_t index_bits = log2_bank_size - (config.B << config.A);
assert(log2_bank_size >= config.B);
assert(log2_bank_size >= config.B);
this->words_per_block = 1 << offset_bits;
this->blocks_per_set = 1 << config.A;
@ -229,9 +229,10 @@ private:
CacheConfig config_;
params_t params_;
std::vector<bank_t> banks_;
std::vector<std::pair<bool, MemReq>> core_reqs_;
std::pair<bool, MemRsp> mem_rsp_;
std::vector<std::queue<uint32_t>> core_rsps_;
Switch<MemReq, MemRsp>::Ptr mem_switch_;
std::vector<MasterPort<MemReq>> mem_req_ports_;
std::vector<SlavePort<MemRsp>> mem_rsp_ports_;
public:
Impl(Cache* simobject, const CacheConfig& config)
@ -239,16 +240,22 @@ public:
, config_(config)
, params_(config)
, banks_(config.num_banks, {config, params_})
, core_reqs_(config.num_inputs)
, core_rsps_(config.num_inputs)
{}
void handleMemResponse(const MemRsp& response, uint32_t) {
mem_rsp_ = {true, response};
}
void handleCoreRequest(const MemReq& request, uint32_t port_id) {
core_reqs_.at(port_id) = {true, request};
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
{
if (config.num_banks > 1) {
mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
}
mem_switch_->ReqOut.bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&mem_switch_->RspIn);
} else {
mem_req_ports_.at(0).bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&mem_rsp_ports_.at(0));
}
}
void step(uint64_t /*cycle*/) {
@ -269,31 +276,29 @@ public:
bank.mshr.try_pop(&active_req);
}
// try schedule stall replay
// try schedule stall queue if MSHR has space
if (!active_req.valid
&& !bank.stall_buffer.empty()) {
&& !bank.stall_buffer.empty()
&& !bank.mshr.full()) {
active_req = bank.stall_buffer.front();
bank.stall_buffer.pop();
}
}
// handle memory fills
if (mem_rsp_.first) {
mem_rsp_.first = false;
auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15);
auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31);
this->processMemoryFill(bank_id, mshr_id);
for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) {
MemRsp mem_rsp;
if (mem_rsp_ports_.at(i).read(&mem_rsp)) {
this->processMemoryFill(i, mem_rsp.tag);
}
}
// handle incoming core requests
for (uint32_t i = 0, n = core_reqs_.size(); i < n; ++i) {
auto& entry = core_reqs_.at(i);
if (!entry.first)
for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) {
MemReq core_req;
if (!simobject_->CoreReqPorts.at(i).read(&core_req))
continue;
entry.first = false;
auto& core_req = entry.second;
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
@ -417,7 +422,7 @@ public:
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag);
mem_req.write = true;
mem_req.tag = 0;
simobject_->MemReqPort.send(mem_req, 1);
mem_req_ports_.at(bank_id).send(mem_req, 1);
} else {
// mark block as dirty
hit_block.dirty = true;
@ -438,7 +443,8 @@ public:
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag);
mem_req.write = true;
simobject_->MemReqPort.send(mem_req, 1);
mem_req.tag = 0;
mem_req_ports_.at(bank_id).send(mem_req, 1);
}
}
@ -449,7 +455,7 @@ public:
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
mem_req.write = true;
mem_req.tag = 0;
simobject_->MemReqPort.send(mem_req, 1);
mem_req_ports_.at(bank_id).send(mem_req, 1);
}
// send core response
for (auto& info : active_req.infos) {
@ -467,9 +473,8 @@ public:
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
mem_req.write = active_req.write;
mem_req.tag = bit_setw(0, 0, 15, bank_id);
mem_req.tag = bit_setw(mem_req.tag, 16, 31, mshr_id);
simobject_->MemReqPort.send(mem_req, 1);
mem_req.tag = mshr_id;
mem_req_ports_.at(bank_id).send(mem_req, 1);
}
}
}
@ -480,12 +485,12 @@ public:
///////////////////////////////////////////////////////////////////////////////
Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config)
: SimObject<Cache>(ctx, name)
, impl_(new Impl(this, config))
, CoreReqPorts(config.num_inputs, {this, impl_, &Cache::Impl::handleCoreRequest})
: SimObject<Cache>(ctx, name)
, CoreReqPorts(config.num_inputs, this)
, CoreRspPorts(config.num_inputs, this)
, MemReqPort(this)
, MemRspPort(this, impl_, &Impl::handleMemResponse)
, MemRspPort(this)
, impl_(new Impl(this, config))
{}
Cache::~Cache() {

View file

@ -20,11 +20,7 @@ struct CacheConfig {
uint8_t latency; // pipeline latency
};
class Cache : public SimObject<Cache> {
private:
class Impl;
Impl* impl_;
class Cache : public SimObject<Cache> {
public:
Cache(const SimContext& ctx, const char* name, const CacheConfig& config);
~Cache();
@ -35,6 +31,10 @@ public:
std::vector<MasterPort<MemRsp>> CoreRspPorts;
MasterPort<MemReq> MemReqPort;
SlavePort<MemRsp> MemRspPort;
private:
class Impl;
Impl* impl_;
};
}

View file

@ -12,13 +12,13 @@
using namespace vortex;
Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
: SimObject(ctx, "Core")
, id_(id)
, arch_(arch)
, decoder_(decoder)
, mem_(mem)
, shared_mem_(1, SMEM_SIZE)
, decoder_(arch)
, mmu_(0, arch.wsize(), true)
, shared_mem_(4096)
, warps_(arch.num_warps())
, barriers_(arch.num_barriers(), 0)
, csrs_(arch.num_csrs(), 0)
@ -54,9 +54,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU
DCACHE_MSHR_SIZE, // mshr
2, // pipeline latency
}))
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
, icache_rsp_port_(this, this, &Core::icache_handleCacheReponse)
, dcache_rsp_port_(arch.num_threads(), {this, reinterpret_cast<LsuUnit*>(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse})
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
, fetch_stage_("fetch")
, decode_stage_("decode")
, issue_stage_("issue")
@ -65,36 +63,34 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU
, pending_icache_(arch_.num_warps())
, stalled_warps_(0)
, last_schedule_wid_(0)
, pending_instrs_(0)
, issued_instrs_(0)
, committed_instrs_(0)
, ebreak_(false)
, stats_insts_(0)
, stats_loads_(0)
, stats_stores_(0)
, MemRspPort(this, &l1_mem_switch_->RspIn)
, MemReqPort(this, &l1_mem_switch_->ReqOut)
, MemRspPort(this)
, MemReqPort(this)
{
for (int i = 0; i < arch_.num_warps(); ++i) {
warps_.at(i) = std::make_shared<Warp>(this, i);
}
// register execute units
exe_units_.at((int)ExeType::NOP) = std::make_shared<NopUnit>(this);
exe_units_.at((int)ExeType::ALU) = std::make_shared<AluUnit>(this);
exe_units_.at((int)ExeType::LSU) = std::make_shared<LsuUnit>(this);
exe_units_.at((int)ExeType::CSR) = std::make_shared<CsrUnit>(this);
exe_units_.at((int)ExeType::FPU) = std::make_shared<FpuUnit>(this);
exe_units_.at((int)ExeType::GPU) = std::make_shared<GpuUnit>(this);
// connect l1 caches
icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_);
for (int i = 0; i < arch_.num_threads(); ++i) {
dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i));
}
// connect l1 switch
icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]);
l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort);
l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort);
this->MemRspPort.bind(&l1_mem_switch_->RspIn);
l1_mem_switch_->ReqOut.bind(&this->MemReqPort);
// activate warp0
warps_.at(0)->setTmask(0, true);
@ -109,31 +105,24 @@ Core::~Core() {
}
}
void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) {
// advance to decode stage
uint32_t wid = response.tag;
pipeline_state_t state;
pending_icache_.remove(wid, &state);
auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
state.icache_latency = latency;
decode_stage_.push(state);
void Core::attach_ram(RAM* ram) {
// bind RAM to memory unit
mmu_.attach(*ram, 0, 0xFFFFFFFF);
}
void Core::step(uint64_t cycle) {
__unused (cycle);
D(2, "###########################################################");
D(2, std::dec << "Core" << id_ << ": cycle: " << cycle);
this->commit();
this->execute();
this->issue();
this->decode();
this->fetch();
this->commit(cycle);
this->execute(cycle);
this->issue(cycle);
this->decode(cycle);
this->fetch(cycle);
DPN(2, std::flush);
}
void Core::warp_scheduler() {
void Core::warp_scheduler(uint64_t cycle) {
__unused (cycle);
bool foundSchedule = false;
int scheduled_warp = last_schedule_wid_;
@ -159,53 +148,77 @@ void Core::warp_scheduler() {
stats_insts_ += warp->getActiveThreads();
pipeline_state_t state;
state.clear();
state.id = (issued_instrs_++ * arch_.num_cores()) + id_;
warp->eval(&state);
D(4, state);
DT(3, cycle, "pipeline-schedule: " << state);
// advance to fetch stage
++pending_instrs_;
// advance to fetch stage
fetch_stage_.push(state);
}
void Core::fetch() {
// schedule icache request
pipeline_state_t state;
if (fetch_stage_.try_pop(&state)) {
state.icache_latency = SimPlatform::instance().cycles();
MemReq mem_req;
mem_req.addr = state.PC;
mem_req.write = false;
mem_req.tag = pending_icache_.allocate(state);
icache_->CoreReqPorts.at(0).send(mem_req, 1);
void Core::fetch(uint64_t cycle) {
// handle icache response
{
MemRsp mem_rsp;
if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){
pipeline_state_t state;
pending_icache_.remove(mem_rsp.tag, &state);
auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
state.icache_latency = latency;
decode_stage_.push(state);
DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state);
}
}
// send icache request
{
pipeline_state_t state;
if (fetch_stage_.try_pop(&state)) {
state.icache_latency = SimPlatform::instance().cycles();
MemReq mem_req;
mem_req.addr = state.PC;
mem_req.write = false;
mem_req.tag = pending_icache_.allocate(state);
icache_->CoreReqPorts.at(0).send(mem_req, 1);
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state);
}
}
// schedule next warp
this->warp_scheduler();
this->warp_scheduler(cycle);
}
void Core::decode() {
void Core::decode(uint64_t cycle) {
__unused (cycle);
pipeline_state_t state;
if (!decode_stage_.try_pop(&state))
return;
if (state.stall_warp) {
D(3, "*** warp#" << state.wid << " fetch stalled");
} else {
// release warp
// release warp
if (!state.stall_warp) {
stalled_warps_.reset(state.wid);
}
DT(3, cycle, "pipeline-decode: " << state);
// advance to issue stage
issue_stage_.push(state);
}
void Core::issue() {
void Core::issue(uint64_t cycle) {
__unused (cycle);
if (!issue_stage_.empty()) {
// insert to ibuffer
auto& state = issue_stage_.top();
auto& ibuffer = ibuffers_.at(state.wid);
if (!ibuffer.full()) {
if (ibuffer.full()) {
DT(3, cycle, "*** ibuffer-stall: " << state);
} else {
ibuffer.push(state);
issue_stage_.pop();
}
@ -219,8 +232,18 @@ void Core::issue() {
auto& state = ibuffer.top();
// check scoreboard
if (scoreboard_.in_use(state))
if (scoreboard_.in_use(state)) {
DTH(3, cycle, "*** scoreboard-stall: dependents={");
auto owners = scoreboard_.owners(state);
for (uint32_t i = 0, n = owners.size(); i < n; ++i) {
if (i) DTN(3, ", ");
DTN(3, "#" << owners.at(i));
}
DTN(3, "}, " << state << std::endl);
continue;
}
DT(3, cycle, "pipeline-issue: " << state);
// update scoreboard
scoreboard_.reserve(state);
@ -233,18 +256,19 @@ void Core::issue() {
}
}
void Core::execute() {
void Core::execute(uint64_t cycle) {
// process stage inputs
if (!execute_stage_.empty()) {
auto& state = execute_stage_.top();
auto& exe_unit = exe_units_.at((int)state.exe_type);
exe_unit->push_input(state);
execute_stage_.pop();
DT(3, cycle, "pipeline-execute: " << state);
}
// advance execute units
for (auto& exe_unit : exe_units_) {
exe_unit->step();
exe_unit->step(cycle);
}
// commit completed instructions
@ -255,18 +279,29 @@ void Core::execute() {
stalled_warps_.reset(state.wid);
}
// advance to commit stage
commit_stage_.push(state);
commit_stage_.push(state);
}
}
}
void Core::commit() {
void Core::commit(uint64_t cycle) {
__unused (cycle);
pipeline_state_t state;
if (!commit_stage_.try_pop(&state))
return;
DT(3, cycle, "pipeline-commit: " << state);
// update scoreboard
scoreboard_.release(state);
assert(committed_instrs_ <= issued_instrs_);
++committed_instrs_;
}
// Reports whether the core still has work in flight: true while the number
// of issued instructions differs from the number of committed ones.
bool Core::running() const {
  const bool drained = (issued_instrs_ == committed_instrs_);
  return !drained;
}
Word Core::get_csr(Addr addr, int tid, int wid) {
@ -349,9 +384,9 @@ void Core::barrier(int bar_id, int count, int warp_id) {
barrier.reset();
}
Word Core::icache_fetch(Addr addr) {
Word Core::icache_read(Addr addr, Size size) {
Word data;
mem_.read(&data, addr, sizeof(Word), 0);
mmu_.read(&data, addr, size, 0);
return data;
}
@ -365,7 +400,7 @@ Word Core::dcache_read(Addr addr, Size size) {
return data;
}
#endif
mem_.read(&data, addr, size, 0);
mmu_.read(&data, addr, size, 0);
return data;
}
@ -383,11 +418,7 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
this->writeToStdOut(addr, data);
return;
}
mem_.write(&data, addr, size, 0);
}
bool Core::running() const {
return pending_instrs_;
mmu_.write(&data, addr, size, 0);
}
void Core::printStats() const {
@ -399,7 +430,7 @@ void Core::printStats() const {
void Core::writeToStdOut(Addr addr, Word data) {
uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
auto& ss_buf = print_bufs_.at(tid);
auto& ss_buf = print_bufs_[tid];
char c = (char)data;
ss_buf << c;
if (c == '\n') {

View file

@ -25,9 +25,11 @@ namespace vortex {
class Core : public SimObject<Core> {
public:
Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id);
Core(const SimContext& ctx, const ArchDef &arch, Word id);
~Core();
void attach_ram(RAM* ram);
bool running() const;
void step(uint64_t cycle);
@ -64,7 +66,7 @@ public:
void barrier(int bar_id, int count, int warp_id);
Word icache_fetch(Addr);
Word icache_read(Addr, Size);
Word dcache_read(Addr, Size);
@ -76,22 +78,21 @@ public:
private:
void fetch();
void decode();
void issue();
void execute();
void commit();
void fetch(uint64_t cycle);
void decode(uint64_t cycle);
void issue(uint64_t cycle);
void execute(uint64_t cycle);
void commit(uint64_t cycle);
void warp_scheduler();
void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id);
void warp_scheduler(uint64_t cycle);
void writeToStdOut(Addr addr, Word data);
Word id_;
const ArchDef& arch_;
const Decoder& decoder_;
MemoryUnit& mem_;
const ArchDef arch_;
const Decoder decoder_;
MemoryUnit mmu_;
#ifdef SM_ENABLE
RAM shared_mem_;
#endif
@ -106,8 +107,6 @@ private:
Cache::Ptr icache_;
Cache::Ptr dcache_;
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
SlavePort<MemRsp> icache_rsp_port_;
std::vector<SlavePort<MemRsp>> dcache_rsp_port_;
PipelineStage fetch_stage_;
PipelineStage decode_stage_;
@ -118,10 +117,12 @@ private:
HashTable<pipeline_state_t> pending_icache_;
WarpMask stalled_warps_;
uint32_t last_schedule_wid_;
uint32_t pending_instrs_;
uint32_t issued_instrs_;
uint32_t committed_instrs_;
bool ebreak_;
std::unordered_map<int, std::stringstream> print_bufs_;
uint64_t stats_insts_;
uint64_t stats_loads_;
uint64_t stats_stores_;

View file

@ -7,14 +7,15 @@
#define DEBUG_HEADER << "DEBUG "
//#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": "
#define TRACE_HEADER << "TRACE "
//#define TRACE_HEADER << "TRACE " << __FILE__ << ':' << std::dec << __LINE__ << ": "
#ifndef NDEBUG
#include <iostream>
#include <iomanip>
#define DX(x) x
#define D(lvl, x) do { \
#define DP(lvl, x) do { \
if ((lvl) <= DEBUG_LEVEL) { \
std::cout DEBUG_HEADER << x << std::endl; \
} \
@ -32,12 +33,33 @@
} \
} while(0)
// Trace macros (debug builds only; the #else branch defines them as no-ops).
// All three print to std::cout only when (lvl) <= DEBUG_LEVEL.
//
// DT(lvl, t, x): full trace line. Emits TRACE_HEADER, then `t` in decimal
// padded to a field width of 10, then ": ", then the stream expression `x`,
// and finishes with std::endl. `x` is spliced after `<<` unparenthesised, so
// it may itself be a chain such as  "a=" << a << ", b=" << b.
#define DT(lvl, t, x) do { \
if ((lvl) <= DEBUG_LEVEL) { \
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl; \
} \
} while(0)
// DTH(lvl, t, x): trace line head. Same output as DT but without the
// trailing std::endl, so the line can be continued with DTN.
#define DTH(lvl, t, x) do { \
if ((lvl) <= DEBUG_LEVEL) { \
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x; \
} \
} while(0)
// DTN(lvl, x): trace continuation. Emits only `x`: no header, no timestamp,
// no newline (the caller appends std::endl inside `x` when the line ends).
#define DTN(lvl, x) do { \
if ((lvl) <= DEBUG_LEVEL) { \
std::cout << x; \
} \
} while(0)
#else
#define DX(x)
#define D(lvl, x) do {} while(0)
#define DP(lvl, x) do {} while(0)
#define DPH(lvl, x) do {} while(0)
#define DPN(lvl, x) do {} while(0)
#define D_RAW(x) do {} while(0)
#define DT(lvl, t, x) do {} while(0)
#define DTH(lvl, t, x) do {} while(0)
#define DTN(lvl, x) do {} while(0)
#endif

View file

@ -194,47 +194,26 @@ static const char* op_string(const Instr &instr) {
namespace vortex {
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
os << op_string(instr) << ": ";
auto opcode = instr.getOpcode();
auto rd_to_string = [&]() {
int rdt = instr.getRDType();
int rd = instr.getRDest();
switch (rdt) {
case 1: os << "r" << std::dec << rd << " <- "; break;
case 2: os << "fr" << std::dec << rd << " <- "; break;
case 3: os << "vr" << std::dec << rd << " <- "; break;
default: break;
}
};
auto rs_to_string = [&](int i) {
int rst = instr.getRSType(i);
int rs = instr.getRSrc(i);
switch (rst) {
case 1: os << "r" << std::dec << rs; break;
case 2: os << "fr" << std::dec << rs; break;
case 3: os << "vr" << std::dec << rs; break;
default: break;
}
};
auto opcode = instr.getOpcode();
if (opcode == S_INST
|| opcode == FS
|| opcode == VS) {
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
rs_to_string(1);
os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
} else
if (opcode == L_INST
|| opcode == FL
|| opcode == VL) {
rd_to_string();
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
} else {
rd_to_string();
if (instr.getRDType() != RegType::None) {
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
}
int i = 0;
for (; i < instr.getNRSrc(); ++i) {
if (i) os << ", ";
rs_to_string(i);
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
}
if (instr.hasImm()) {
if (i) os << ", ";
@ -281,7 +260,7 @@ Decoder::Decoder(const ArchDef &arch) {
v_imm_mask_ = 0x7ff;
}
std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
std::shared_ptr<Instr> Decoder::decode(Word code) const {
auto instr = std::make_shared<Instr>();
Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
instr->setOpcode(op);
@ -297,8 +276,8 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
auto op_it = sc_instTable.find(op);
if (op_it == sc_instTable.end()) {
std::cout << std::hex << "invalid opcode: 0x" << op << ", instruction=0x" << code << ", PC=" << PC << std::endl;
std::abort();
std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::endl;
return nullptr;
}
auto iType = op_it->second.iType;
@ -459,7 +438,5 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
std::abort();
}
D(2, "Instr 0x" << std::hex << code << ": " << *instr << std::flush);
return instr;
}

View file

@ -13,7 +13,7 @@ class Decoder {
public:
Decoder(const ArchDef &);
std::shared_ptr<Instr> decode(Word code, Word PC) const;
std::shared_ptr<Instr> decode(Word code) const;
private:

View file

@ -75,11 +75,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
if (num_rsrcs) {
for (int i = 0; i < num_rsrcs; ++i) {
DPH(2, "Src Reg [" << std::dec << i << "]: ");
int type = instr.getRSType(i);
auto type = instr.getRSType(i);
int reg = instr.getRSrc(i);
switch (type) {
case 1:
DPH(2, "r" << std::dec << reg << "={");
case RegType::Integer:
DPN(2, "r" << std::dec << reg << "={");
for (int t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
@ -91,8 +91,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
DPN(2, "}" << std::endl);
break;
case 2:
DPH(2, "fr" << std::dec << reg << "={");
case RegType::Float:
DPN(2, "fr" << std::dec << reg << "={");
for (int t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
@ -105,6 +105,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
DPN(2, "}" << std::endl);
break;
default:
std::abort();
break;
}
}
@ -415,7 +416,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
break;
case L_INST:
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.load = 0;
pipeline_state->lsu.type = LsuType::LOAD;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->mem_addrs.resize(num_threads);
for (int t = 0; t < num_threads; ++t) {
@ -425,7 +426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
Word data_read = core_->dcache_read(memAddr, 4);
pipeline_state->mem_addrs.at(t) = memAddr;
D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
switch (func3) {
case 0:
// LBI
@ -455,7 +456,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
break;
case S_INST:
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.store = 1;
pipeline_state->lsu.type = LsuType::STORE;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
pipeline_state->mem_addrs.resize(num_threads);
@ -464,7 +465,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
continue;
Word memAddr = rsdata[t][0] + immsrc;
pipeline_state->mem_addrs.at(t) = memAddr;
D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (func3) {
case 0:
// SB
@ -543,12 +544,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
break;
case FENCE:
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.fence = 1;
pipeline_state->lsu.type = LsuType::FENCE;
pipeline_state->stall_warp = true;
break;
case (FL | VL):
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.load = 1;
pipeline_state->lsu.type = LsuType::LOAD;
pipeline_state->used_iregs[rsrc0] = 1;
if (func3 == 0x2) {
pipeline_state->mem_addrs.resize(num_threads);
@ -558,14 +559,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
Word memAddr = rsdata[t][0] + immsrc;
pipeline_state->mem_addrs.at(t) = memAddr;
Word data_read = core_->dcache_read(memAddr, 4);
D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
rddata[t] = data_read;
}
} else {
D(3, "Executing vector load");
D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
D(3, "dest: v" << rdest);
D(3, "width" << instr.getVlsWidth());
DP(3, "Executing vector load");
DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
DP(3, "dest: v" << rdest);
DP(3, "width" << instr.getVlsWidth());
pipeline_state->mem_addrs.resize(vl_);
auto &vd = vRegFile_.at(rdest);
switch (instr.getVlsWidth()) {
@ -574,9 +575,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
for (int i = 0; i < vl_; i++) {
Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
pipeline_state->mem_addrs.at(i) = memAddr;
D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
Word data_read = core_->dcache_read(memAddr, 4);
D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
int *result_ptr = (int *)(vd.data() + i);
*result_ptr = data_read;
}
@ -590,7 +591,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
break;
case (FS | VS):
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.store = 1;
pipeline_state->lsu.type = LsuType::STORE;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
if (func3 == 0x2) {
@ -601,20 +602,20 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
Word memAddr = rsdata[t][0] + immsrc;
pipeline_state->mem_addrs.at(t) = memAddr;
core_->dcache_write(memAddr, rsdata[t][1], 4);
D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
}
} else {
pipeline_state->mem_addrs.resize(vl_);
for (int i = 0; i < vl_; i++) {
Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
pipeline_state->mem_addrs.at(i) = memAddr;
D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
switch (instr.getVlsWidth()) {
case 6: {
//store word and unit strided (not checking for unit stride)
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
core_->dcache_write(memAddr, value, 4);
D(3, "store: " << memAddr << " value:" << value);
DP(3, "store: " << memAddr << " value:" << value);
} break;
default:
std::abort();
@ -705,9 +706,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
} else {
// FMV.X.W
rddata[t] = rsdata[t][0];
pipeline_state->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1;
}
}
pipeline_state->fpu.type = FpuType::FNCP;
pipeline_state->used_fregs[rsrc0] = 1;
break;
case 0x50:
switch(func3) {
@ -783,132 +784,138 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
rd_write = true;
break;
case GPGPU:
pipeline_state->exe_type = ExeType::GPU;
case GPGPU: {
pipeline_state->exe_type = ExeType::GPU;
int ts = 0;
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
case 0: {
// TMC
pipeline_state->gpu.type = GpuType::TMC;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->stall_warp = true;
if (rsrc1) {
// predicate mode
ThreadMask pred;
for (int i = 0; i < num_threads; ++i) {
pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
}
if (pred.any()) {
tmask_ &= pred;
}
} else {
tmask_.reset();
for (int i = 0; i < num_threads; ++i) {
tmask_.set(i, rsdata.at(t)[0] & (1 << i));
}
}
D(3, "*** TMC " << tmask_);
active_ = tmask_.any();
break; // runOnce
} break;
case 1: {
// WSPAWN
pipeline_state->gpu.type = GpuType::WSPAWN;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
pipeline_state->stall_warp = true;
int active_warps = std::min<int>(rsdata.at(t)[0], core_->arch().num_warps());
D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(t)[1]);
for (int i = 1; i < active_warps; ++i) {
Warp &newWarp = core_->warp(i);
newWarp.setPC(rsdata[t][1]);
newWarp.setTmask(0, true);
}
break; // runOnce
} break;
case 2: {
// SPLIT
pipeline_state->gpu.type = GpuType::SPLIT;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->stall_warp = true;
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
ThreadMask tmask;
for (int i = 0; i < num_threads; ++i) {
tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
}
DomStackEntry e(tmask, nextPC);
domStack_.push(tmask_);
domStack_.push(e);
for (size_t i = 0; i < e.tmask.size(); ++i) {
tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
}
active_ = tmask_.any();
DPH(3, "*** Split: New TM=");
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
DPN(3, ", Pushed TM=");
for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1));
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
} else {
D(3, "*** Unanimous pred");
DomStackEntry e(tmask_);
e.unanimous = true;
domStack_.push(e);
}
break; // runOnce
} break;
case 3: {
// JOIN
pipeline_state->gpu.type = GpuType::JOIN;
pipeline_state->stall_warp = true;
if (!domStack_.empty() && domStack_.top().unanimous) {
D(3, "*** Uninimous branch at join");
tmask_ = domStack_.top().tmask;
active_ = tmask_.any();
domStack_.pop();
} else {
if (!domStack_.top().fallThrough) {
nextPC = domStack_.top().PC;
D(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
}
tmask_ = domStack_.top().tmask;
active_ = tmask_.any();
DPH(3, "*** Join: New TM=");
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
DPN(3, "\n");
domStack_.pop();
}
break; // runOnce
} break;
case 4: {
// BAR
pipeline_state->gpu.type = GpuType::BAR;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
pipeline_state->stall_warp = true;
active_ = false;
core_->barrier(rsdata[t][0], rsdata[t][1], id_);
break; // runOnce
} break;
case 6: {
// PREFETCH
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.prefetch = 1;
pipeline_state->used_iregs[rsrc0] = 1;
int addr = rsdata[t][0];
printf("*** PREFETCHED %d ***\n", addr);
} break;
default:
std::abort();
if (tmask_.test(t)) {
ts = t;
break;
}
}
break;
switch (func3) {
case 0: {
// TMC
pipeline_state->gpu.type = GpuType::TMC;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->stall_warp = true;
if (rsrc1) {
// predicate mode
ThreadMask pred;
for (int i = 0; i < num_threads; ++i) {
pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
}
if (pred.any()) {
tmask_ &= pred;
}
} else {
tmask_.reset();
for (int i = 0; i < num_threads; ++i) {
tmask_.set(i, rsdata.at(ts)[0] & (1 << i));
}
}
DPH(3, "*** New TMC: ");
for (int i = 0; i < num_threads; ++i)
DPN(3, tmask_.test(num_threads-i-1));
DPN(3, std::endl);
active_ = tmask_.any();
} break;
case 1: {
// WSPAWN
pipeline_state->gpu.type = GpuType::WSPAWN;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
pipeline_state->stall_warp = true;
int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
for (int i = 1; i < active_warps; ++i) {
Warp &newWarp = core_->warp(i);
newWarp.setPC(rsdata[ts][1]);
newWarp.setTmask(0, true);
}
} break;
case 2: {
// SPLIT
pipeline_state->gpu.type = GpuType::SPLIT;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->stall_warp = true;
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
ThreadMask tmask;
for (int i = 0; i < num_threads; ++i) {
tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
}
DomStackEntry e(tmask, nextPC);
domStack_.push(tmask_);
domStack_.push(e);
for (size_t i = 0; i < e.tmask.size(); ++i) {
tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
}
active_ = tmask_.any();
DPH(3, "*** Split: New TM=");
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
DPN(3, ", Pushed TM=");
for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1));
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
} else {
DP(3, "*** Unanimous pred");
DomStackEntry e(tmask_);
e.unanimous = true;
domStack_.push(e);
}
} break;
case 3: {
// JOIN
pipeline_state->gpu.type = GpuType::JOIN;
pipeline_state->stall_warp = true;
if (!domStack_.empty() && domStack_.top().unanimous) {
DP(3, "*** Uninimous branch at join");
tmask_ = domStack_.top().tmask;
active_ = tmask_.any();
domStack_.pop();
} else {
if (!domStack_.top().fallThrough) {
nextPC = domStack_.top().PC;
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
}
tmask_ = domStack_.top().tmask;
active_ = tmask_.any();
DPH(3, "*** Join: New TM=");
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
DPN(3, "\n");
domStack_.pop();
}
} break;
case 4: {
// BAR
pipeline_state->gpu.type = GpuType::BAR;
pipeline_state->used_iregs[rsrc0] = 1;
pipeline_state->used_iregs[rsrc1] = 1;
pipeline_state->stall_warp = true;
active_ = false;
core_->barrier(rsdata[ts][0], rsdata[ts][1], id_);
} break;
case 6: {
// PREFETCH
pipeline_state->exe_type = ExeType::LSU;
pipeline_state->lsu.type = LsuType::PREFETCH;
pipeline_state->used_iregs[rsrc0] = 1;
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
int addr = rsdata[t][0];
printf("*** PREFETCHED %d ***\n", addr);
}
} break;
default:
std::abort();
}
} break;
case VSET: {
int VLEN = core_->arch().vsize() * 8;
int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
@ -928,7 +935,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = first + second;
D(3, "Adding " << first << " + " << second << " = " << result);
DP(3, "Adding " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
}
@ -940,7 +947,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = first + second;
D(3, "Adding " << first << " + " << second << " = " << result);
DP(3, "Adding " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
}
@ -952,7 +959,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = first + second;
D(3, "Adding " << first << " + " << second << " = " << result);
DP(3, "Adding " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
@ -968,7 +975,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first == second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
@ -976,7 +983,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first == second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
@ -984,7 +991,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first == second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
@ -999,7 +1006,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first != second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
@ -1007,7 +1014,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first != second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
@ -1015,7 +1022,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first != second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
@ -1030,7 +1037,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first < second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
@ -1038,7 +1045,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first < second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
@ -1046,7 +1053,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first < second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
@ -1061,7 +1068,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
int8_t first = *(int8_t *)(vr1.data() + i);
int8_t second = *(int8_t *)(vr2.data() + i);
int8_t result = (first < second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
@ -1069,7 +1076,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
int16_t first = *(int16_t *)(vr1.data() + i);
int16_t second = *(int16_t *)(vr2.data() + i);
int16_t result = (first < second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
@ -1077,7 +1084,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
int32_t first = *(int32_t *)(vr1.data() + i);
int32_t second = *(int32_t *)(vr2.data() + i);
int32_t result = (first < second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int32_t *)(vd.data() + i) = result;
}
}
@ -1092,7 +1099,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first <= second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
@ -1100,7 +1107,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first <= second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
@ -1108,7 +1115,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first <= second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
@ -1123,7 +1130,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
int8_t first = *(int8_t *)(vr1.data() + i);
int8_t second = *(int8_t *)(vr2.data() + i);
int8_t result = (first <= second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
@ -1131,7 +1138,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
int16_t first = *(int16_t *)(vr1.data() + i);
int16_t second = *(int16_t *)(vr2.data() + i);
int16_t result = (first <= second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
@ -1139,7 +1146,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
int32_t first = *(int32_t *)(vr1.data() + i);
int32_t second = *(int32_t *)(vr2.data() + i);
int32_t result = (first <= second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int32_t *)(vd.data() + i) = result;
}
}
@ -1154,7 +1161,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first > second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
@ -1162,7 +1169,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first > second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
@ -1170,7 +1177,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first > second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
}
@ -1185,7 +1192,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
int8_t first = *(int8_t *)(vr1.data() + i);
int8_t second = *(int8_t *)(vr2.data() + i);
int8_t result = (first > second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 16) {
@ -1193,7 +1200,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
int16_t first = *(int16_t *)(vr1.data() + i);
int16_t second = *(int16_t *)(vr2.data() + i);
int16_t result = (first > second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int16_t *)(vd.data() + i) = result;
}
} else if (vtype_.vsew == 32) {
@ -1201,7 +1208,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
int32_t first = *(int32_t *)(vr1.data() + i);
int32_t second = *(int32_t *)(vr2.data() + i);
int32_t result = (first > second) ? 1 : 0;
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(int32_t *)(vd.data() + i) = result;
}
}
@ -1222,7 +1229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value & !second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1235,7 +1242,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value & !second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1248,7 +1255,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value & !second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1268,7 +1275,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value & second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1281,7 +1288,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value & second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1294,7 +1301,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value & second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1314,7 +1321,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value | second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1327,7 +1334,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value | second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1340,7 +1347,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value | second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1360,7 +1367,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value ^ second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1373,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value ^ second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1386,7 +1393,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value ^ second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1406,7 +1413,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = (first_value | !second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1419,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = (first_value | !second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1432,7 +1439,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = (first_value | !second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1452,7 +1459,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = !(first_value & second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1465,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = !(first_value & second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1478,7 +1485,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = !(first_value & second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1498,7 +1505,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = !(first_value | second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1511,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = !(first_value | second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1524,7 +1531,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = !(first_value | second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1544,7 +1551,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first_value = (first & 0x1);
uint8_t second_value = (second & 0x1);
uint8_t result = !(first_value ^ second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1557,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first_value = (first & 0x1);
uint16_t second_value = (second & 0x1);
uint16_t result = !(first_value ^ second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1570,7 +1577,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first_value = (first & 0x1);
uint32_t second_value = (second & 0x1);
uint32_t result = !(first_value ^ second_value);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1588,7 +1595,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first * second);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1599,7 +1606,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first * second);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1610,7 +1617,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first * second);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1628,7 +1635,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (first * second);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) += result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1639,7 +1646,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint16_t first = *(uint16_t *)(vr1.data() + i);
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (first * second);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) += result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1650,7 +1657,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
uint32_t first = *(uint32_t *)(vr1.data() + i);
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (first * second);
D(3, "Comparing " << first << " + " << second << " = " << result);
DP(3, "Comparing " << first << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) += result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1669,7 +1676,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
for (int i = 0; i < vl_; i++) {
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (rsdata[i][0] + second);
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1679,7 +1686,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
for (int i = 0; i < vl_; i++) {
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (rsdata[i][0] + second);
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1689,7 +1696,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
for (int i = 0; i < vl_; i++) {
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (rsdata[i][0] + second);
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1705,7 +1712,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
for (int i = 0; i < vl_; i++) {
uint8_t second = *(uint8_t *)(vr2.data() + i);
uint8_t result = (rsdata[i][0] * second);
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
*(uint8_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1715,7 +1722,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
for (int i = 0; i < vl_; i++) {
uint16_t second = *(uint16_t *)(vr2.data() + i);
uint16_t result = (rsdata[i][0] * second);
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
*(uint16_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1725,7 +1732,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
for (int i = 0; i < vl_; i++) {
uint32_t second = *(uint32_t *)(vr2.data() + i);
uint32_t result = (rsdata[i][0] * second);
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
*(uint32_t *)(vd.data() + i) = result;
}
for (int i = vl_; i < VLMAX; i++) {
@ -1741,7 +1748,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
vtype_.vsew = instr.getVsew();
vtype_.vlmul = instr.getVlmul();
D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX);
DP(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX);
int s0 = rsdata[0][0];
if (s0 <= VLMAX) {
@ -1762,46 +1769,49 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
}
if (rd_write) {
pipeline_state->wb = true;
DPH(2, "Dest Reg: ");
int rdt = instr.getRDType();
auto rdt = instr.getRDType();
switch (rdt) {
case 1:
case RegType::Integer:
if (rdest) {
DPH(2, "r" << std::dec << rdest << "={");
DPN(2, "r" << std::dec << rdest << "={");
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
iRegFile_.at(t)[rdest] = rddata[t];
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
iRegFile_.at(t)[rdest] = rddata[t];
DPN(2, "0x" << std::hex << rddata[t]);
}
DPN(2, "}" << std::endl);
pipeline_state->used_iregs[rdest] = 1;
}
break;
case 2:
DPH(2, "fr" << std::dec << rdest << "={");
case RegType::Float:
DPN(2, "fr" << std::dec << rdest << "={");
for (int t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
fRegFile_.at(t)[rdest] = rddata[t];
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
fRegFile_.at(t)[rdest] = rddata[t];
DPN(2, "0x" << std::hex << rddata[t]);
}
DPN(2, "}" << std::endl);
pipeline_state->used_fregs[rdest] = 1;
break;
case 3:
pipeline_state->used_vregs[rdest] = 1;
break;
default:
std::abort();
break;
}
}
PC_ += core_->arch().wsize();
if (PC_ != nextPC) {
D(3, "*** Next PC: " << std::hex << nextPC << std::dec);
DP(3, "*** Next PC: " << std::hex << nextPC << std::dec);
PC_ = nextPC;
}
}

View file

@ -9,6 +9,17 @@
using namespace vortex;
// Constructs the execution unit registered under the name "NOP".
// The Core pointer is accepted for signature parity but is unnamed and unused here.
NopUnit::NopUnit(Core*) : ExeUnit("NOP") {}
void NopUnit::step(uint64_t /*cycle*/) {
pipeline_state_t state;
if (!inputs_.try_pop(&state))
return;
this->schedule_output(state, 1);
}
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(Core* core)
: ExeUnit("LSU")
, core_(core)
@ -17,61 +28,77 @@ LsuUnit::LsuUnit(Core* core)
, fence_lock_(false)
{}
// Handles one data-cache response arriving on port `port_id`.
//
// response: the memory response; its tag selects the pending LSU entry.
// port_id:  index of the port (thread lane) the response arrived on.
//
// The pending entry pairs the instruction state (first) with a mask of lanes
// still waiting for a response (second). The entry is taken BY REFERENCE so
// that clearing the lane bit and recording the latency update the stored
// entry; operating on a copy would leave the stored mask untouched and the
// request would never be seen as complete.
void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) {
  auto& entry = pending_dcache_.at(response.tag);
  entry.second.reset(port_id); // track remaining blocks
  if (entry.second.any())
    return; // other lanes are still outstanding

  // dcache_latency held the issue cycle; replace it with the elapsed cycles.
  auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
  entry.first.dcache_latency = latency;
  this->schedule_output(entry.first, 1);
  pending_dcache_.release(response.tag);
}
void LsuUnit::step(uint64_t cycle) {
__unused (cycle);
// handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) {
MemRsp mem_rsp;
if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp))
continue;
auto& entry = pending_dcache_.at(mem_rsp.tag);
DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first);
assert(entry.second.test(t));
entry.second.reset(t); // track remaining blocks
if (!entry.second.any()) {
auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
entry.first.dcache_latency = latency;
this->schedule_output(entry.first, 1);
pending_dcache_.release(mem_rsp.tag);
}
}
void LsuUnit::step() {
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_dcache_.empty())
return;
this->schedule_output(fence_state_, 1);
fence_lock_ = false;
DT(3, cycle, "fence-unlock: " << fence_state_);
}
// check input queue
if (inputs_.empty())
return;
auto state = inputs_.top();
if (state.lsu.fence) {
if (state.lsu.type == LsuType::FENCE) {
// schedule fence lock
fence_state_ = state;
fence_lock_ = true;
inputs_.pop();
DT(3, cycle, "fence-lock: " << state);
return;
}
// send dcache requests
if (!pending_dcache_.full()) {
state.dcache_latency = SimPlatform::instance().cycles();
auto tag = pending_dcache_.allocate({state, state.tmask});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!state.tmask.test(t))
continue;
MemReq mem_req;
mem_req.addr = state.mem_addrs.at(t);
mem_req.write = state.lsu.store;
mem_req.tag = tag;
core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
}
inputs_.pop();
// check pending queue capacity
if (pending_dcache_.full()) {
DT(3, cycle, "*** lsu-queue-stall: " << state);
return;
}
// send dcache request
state.dcache_latency = SimPlatform::instance().cycles();
auto tag = pending_dcache_.allocate({state, state.tmask});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!state.tmask.test(t))
continue;
MemReq mem_req;
mem_req.addr = state.mem_addrs.at(t);
mem_req.write = (state.lsu.type == LsuType::STORE);
mem_req.tag = tag;
core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state);
}
inputs_.pop();
}
///////////////////////////////////////////////////////////////////////////////
AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
void AluUnit::step() {
void AluUnit::step(uint64_t /*cycle*/) {
pipeline_state_t state;
if (!inputs_.try_pop(&state))
return;
@ -95,7 +122,7 @@ void AluUnit::step() {
CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
void CsrUnit::step() {
void CsrUnit::step(uint64_t /*cycle*/) {
pipeline_state_t state;
if (!inputs_.try_pop(&state))
return;
@ -106,7 +133,7 @@ void CsrUnit::step() {
FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
void FpuUnit::step() {
void FpuUnit::step(uint64_t /*cycle*/) {
pipeline_state_t state;
if (!inputs_.try_pop(&state))
return;
@ -133,7 +160,7 @@ void FpuUnit::step() {
GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {}
void GpuUnit::step() {
void GpuUnit::step(uint64_t /*cycle*/) {
pipeline_state_t state;
if (!inputs_.try_pop(&state))
return;

View file

@ -43,7 +43,16 @@ public:
return outputs_.try_pop(state);
}
virtual void step() = 0;
virtual void step(uint64_t cycle) = 0;
};
///////////////////////////////////////////////////////////////////////////////
// Execution unit derived from ExeUnit; declares the constructor and the
// per-cycle step() required by the base class's pure virtual step(uint64_t).
class NopUnit : public ExeUnit {
public:
// The Core pointer parameter is unnamed in this declaration.
NopUnit(Core*);
// Advances the unit by one simulation step; `cycle` is the current cycle number.
void step(uint64_t cycle);
};
///////////////////////////////////////////////////////////////////////////////
@ -59,9 +68,7 @@ private:
public:
LsuUnit(Core*);
void handleCacheReponse(const MemRsp& response, uint32_t port_id);
void step();
void step(uint64_t cycle);
};
///////////////////////////////////////////////////////////////////////////////
@ -70,7 +77,7 @@ class AluUnit : public ExeUnit {
public:
AluUnit(Core*);
void step();
void step(uint64_t cycle);
};
///////////////////////////////////////////////////////////////////////////////
@ -79,7 +86,7 @@ class CsrUnit : public ExeUnit {
public:
CsrUnit(Core*);
void step();
void step(uint64_t cycle);
};
///////////////////////////////////////////////////////////////////////////////
@ -88,7 +95,7 @@ class FpuUnit : public ExeUnit {
public:
FpuUnit(Core*);
void step();
void step(uint64_t cycle);
};
///////////////////////////////////////////////////////////////////////////////
@ -97,7 +104,7 @@ class GpuUnit : public ExeUnit {
public:
GpuUnit(Core*);
void step();
void step(uint64_t cycle);
};
}

View file

@ -53,22 +53,23 @@ public:
: opcode_(Opcode::NOP)
, num_rsrcs_(0)
, has_imm_(false)
, rdest_type_(RegType::None)
, rdest_(0)
, func3_(0)
, func7_(0) {
for (int i = 0; i < MAX_REG_SOURCES; ++i) {
rsrc_type_[i] = 0;
rsrc_type_[i] = RegType::None;
}
}
/* Setters used to "craft" the instruction. */
void setOpcode(Opcode opcode) { opcode_ = opcode; }
void setDestReg(int destReg) { rdest_type_ = 1; rdest_ = destReg; }
void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = 1; rsrc_[num_rsrcs_++] = srcReg; }
void setDestFReg(int destReg) { rdest_type_ = 2; rdest_ = destReg; }
void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = 2; rsrc_[num_rsrcs_++] = srcReg; }
void setDestVReg(int destReg) { rdest_type_ = 3; rdest_ = destReg; }
void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = 3; rsrc_[num_rsrcs_++] = srcReg; }
void setDestReg(int destReg) { rdest_type_ = RegType::Integer; rdest_ = destReg; }
void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Integer; rsrc_[num_rsrcs_++] = srcReg; }
void setDestFReg(int destReg) { rdest_type_ = RegType::Float; rdest_ = destReg; }
void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; }
void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; }
void setFunc3(Word func3) { func3_ = func3; }
void setFunc7(Word func7) { func7_ = func7; }
void setImm(Word imm) { has_imm_ = true; imm_ = imm; }
@ -89,9 +90,9 @@ public:
Word getFunc7() const { return func7_; }
int getNRSrc() const { return num_rsrcs_; }
int getRSrc(int i) const { return rsrc_[i]; }
int getRSType(int i) const { return rsrc_type_[i]; }
RegType getRSType(int i) const { return rsrc_type_[i]; }
int getRDest() const { return rdest_; }
int getRDType() const { return rdest_type_; }
RegType getRDType() const { return rdest_type_; }
bool hasImm() const { return has_imm_; }
Word getImm() const { return imm_; }
Word getVlsWidth() const { return vlsWidth_; }
@ -112,15 +113,15 @@ private:
Opcode opcode_;
int num_rsrcs_;
bool has_imm_;
int rdest_type_;
RegType rdest_type_;
Word imm_;
int rsrc_type_[MAX_REG_SOURCES];
RegType rsrc_type_[MAX_REG_SOURCES];
int rsrc_[MAX_REG_SOURCES];
int rdest_;
Word func3_;
Word func6_;
//Vector
// Vector
Word vmask_;
Word vlsWidth_;
Word vMop_;

View file

@ -6,12 +6,15 @@
#include <stdlib.h>
#include <sys/stat.h>
#include "processor.h"
#include <util.h>
#include "args.h"
#define RAM_PAGE_SIZE 4096
using namespace vortex;
int main(int argc, char **argv) {
int ret;
int exitcode;
std::string archStr("rv32imf");
std::string imgFileName;
@ -53,11 +56,42 @@ int main(int argc, char **argv) {
{
ArchDef arch(archStr, num_cores, num_warps, num_threads);
Processor processor(arch);
ret = processor.run(imgFileName, riscv_test, showStats);
RAM ram(RAM_PAGE_SIZE);
{
std::string program_ext(fileExtension(imgFileName.c_str()));
if (program_ext == "bin") {
ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
} else if (program_ext == "hex") {
ram.loadHexImage(imgFileName.c_str());
} else {
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
return -1;
}
}
processor.attach_ram(&ram);
exitcode = processor.run();
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed." << std::endl;
exitcode = 0;
} else {
std::cout << "Failed." << std::endl;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
}
SimPlatform::instance().finalize();
return ret;
return exitcode;
}

View file

@ -8,32 +8,26 @@ using namespace vortex;
class MemSim::Impl {
private:
MemSim* simobject_;
std::vector<std::queue<MemReq>> inputs_;
uint32_t num_banks_;
uint32_t latency_;
public:
Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency)
: simobject_(simobject)
, inputs_(num_banks)
, num_banks_(num_banks)
, latency_(latency)
{}
void handleMemRequest(const MemReq& mem_req, uint32_t port_id) {
inputs_.at(port_id).push(mem_req);
}
void step(uint64_t /*cycle*/) {
for (uint32_t i = 0, n = inputs_.size(); i < n; ++i) {
auto& queue = inputs_.at(i);
if (queue.empty())
for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
MemReq mem_req;
if (!simobject_->MemReqPorts.at(i).read(&mem_req))
continue;
auto& entry = queue.front();
if (!entry.write) {
if (!mem_req.write) {
MemRsp mem_rsp;
mem_rsp.tag = entry.tag;
mem_rsp.tag = mem_req.tag;
simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
}
queue.pop();
}
}
};
@ -45,7 +39,7 @@ MemSim::MemSim(const SimContext& ctx,
uint32_t latency)
: SimObject<MemSim>(ctx, "MemSim")
, impl_(new Impl(this, num_banks, latency))
, MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest})
, MemReqPorts(num_banks, this)
, MemRspPorts(num_banks, this)
{}

View file

@ -10,14 +10,19 @@
namespace vortex {
struct pipeline_state_t {
//--
//--
uint64_t id;
//--
int cid;
int wid;
ThreadMask tmask;
Word PC;
//--
bool stall_warp;
int rdest_type;
bool wb;
RegType rdest_type;
int rdest;
RegMask used_iregs;
RegMask used_fregs;
@ -30,10 +35,7 @@ struct pipeline_state_t {
//--
union {
struct {
uint8_t load : 1;
uint8_t store: 1;
uint8_t fence : 1;
uint8_t prefetch: 1;
LsuType type;
} lsu;
struct {
AluType type;
@ -49,8 +51,37 @@ struct pipeline_state_t {
// stats
uint64_t icache_latency;
uint64_t dcache_latency;
// Resets the per-instruction fields of this state to their idle values.
// Note: `id` and the per-unit type union are not modified by this method.
void clear() {
  // identity / location
  cid = 0;
  wid = 0;
  PC = 0;
  tmask.reset();

  // control flags
  stall_warp = false;
  wb = false;

  // destination register
  rdest_type = RegType::None;
  rdest = 0;

  // source register usage masks
  used_iregs.reset();
  used_fregs.reset();
  used_vregs.reset();

  // execution routing and memory addresses
  exe_type = ExeType::NOP;
  mem_addrs.clear();

  // stats
  icache_latency = 0;
  dcache_latency = 0;
}
};
// Writes a one-line trace summary of a pipeline state:
//   coreid=<cid>, wid=<wid>, PC=<hex PC>, wb=<wb>[, rd=<type><rdest>], ex=<unit> (#<id>)
// The destination register is printed only when the instruction writes back.
// The stream is left in decimal mode on return.
inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
  os << "coreid=" << state.cid;
  os << ", wid=" << state.wid;
  os << ", PC=" << std::hex << state.PC;
  os << ", wb=" << state.wb;
  if (state.wb) {
    os << ", rd=" << state.rdest_type;
    os << std::dec << state.rdest;
  }
  os << ", ex=" << state.exe_type << " (#" << std::dec << state.id << ")";
  return os;
}
class PipelineStage : public Queue<pipeline_state_t> {
protected:
const char* name_;
@ -62,15 +93,4 @@ public:
{}
};
inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
os << "stall_warp=" << state.stall_warp;
os << ", wid=" << state.wid;
os << ", PC=" << std::hex << state.PC;
os << ", used_iregs=" << state.used_iregs;
os << ", used_fregs=" << state.used_fregs;
os << ", used_vregs=" << state.used_vregs;
os << std::endl;
return os;
}
}

141
sim/simX/processor.cpp Normal file
View file

@ -0,0 +1,141 @@
#include "processor.h"
#include "constants.h"
using namespace vortex;
// Builds the simulated processor: creates all cores and wires the memory
// hierarchy from the memory simulator down to each core.
//
//   MemSim <- [L3 cache | L3 arbiter] <- per cluster [L2 cache | L2 arbiter] <- cores
//
// arch: architecture description; arch.num_cores() is the total core count,
//       split evenly across NUM_CLUSTERS clusters.
//
// Wiring invariants:
//  - `mem_req_ports` / `mem_rsp_ports` hold exactly one port pair per cluster
//    once the L3 level has been set up, and are never modified afterwards.
//  - Each cluster builds its own per-core port vectors, so wiring cluster i
//    cannot clobber the cluster-level ports still needed by cluster i+1.
//  - Core (i, j) lives at index i * cores_per_cluster + j and is bound to the
//    j-th request AND j-th response port of its cluster.
Processor::Processor(const ArchDef& arch)
  : cores_(arch.num_cores())
  , l2caches_(NUM_CLUSTERS)
  , l2_mem_switches_(NUM_CLUSTERS)
{
  const uint32_t num_cores = arch.num_cores();
  const uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;

  // create cores
  for (uint32_t i = 0; i < num_cores; ++i) {
    cores_.at(i) = Core::Create(arch, i);
  }

  // connect memory sub-system
  memsim_ = MemSim::Create(1, MEM_LATENCY);

  std::vector<SlavePort<MemReq>*>  mem_req_ports(1);
  std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);

  mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
  mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);

  if (L3_ENABLE) {
    l3cache_ = Cache::Create("l3cache", CacheConfig{
      log2ceil(L3_CACHE_SIZE),  // C
      log2ceil(MEM_BLOCK_SIZE), // B
      2,                        // W
      0,                        // A
      32,                       // address bits
      L3_NUM_BANKS,             // number of banks
      L3_NUM_PORTS,             // number of ports
      NUM_CLUSTERS,             // request size
      true,                     // write-through
      0,                        // victim size
      L3_MSHR_SIZE,             // mshr
      2,                        // pipeline latency
      }
    );

    mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
    l3cache_->MemReqPort.bind(mem_req_ports.at(0));

    // from here on, one port pair per cluster
    mem_req_ports.resize(NUM_CLUSTERS);
    mem_rsp_ports.resize(NUM_CLUSTERS);

    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
      mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
      mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
    }
  } else if (NUM_CLUSTERS > 1) {
    l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);

    mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
    l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));

    // from here on, one port pair per cluster
    mem_req_ports.resize(NUM_CLUSTERS);
    mem_rsp_ports.resize(NUM_CLUSTERS);

    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
      mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
      mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
    }
  }

  for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
    // per-core ports of this cluster; kept separate from the cluster-level
    // vectors above so later clusters still see their own upstream ports
    std::vector<SlavePort<MemReq>*>  cluster_req_ports(cores_per_cluster);
    std::vector<MasterPort<MemRsp>*> cluster_rsp_ports(cores_per_cluster);

    if (L2_ENABLE) {
      auto& l2cache = l2caches_.at(i);
      // NOTE(review): "request size" uses NUM_CORES as in the original code;
      // confirm whether cores_per_cluster is what is intended here.
      l2cache = Cache::Create("l2cache", CacheConfig{
        log2ceil(L2_CACHE_SIZE),  // C
        log2ceil(MEM_BLOCK_SIZE), // B
        2,                        // W
        0,                        // A
        32,                       // address bits
        L2_NUM_BANKS,             // number of banks
        L2_NUM_PORTS,             // number of ports
        NUM_CORES,                // request size
        true,                     // write-through
        0,                        // victim size
        L2_MSHR_SIZE,             // mshr
        2,                        // pipeline latency
      });

      mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
      l2cache->MemReqPort.bind(mem_req_ports.at(i));

      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
        cluster_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
        cluster_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
      }
    } else if (cores_per_cluster > 1) {
      auto& l2_mem_switch = l2_mem_switches_.at(i);
      // NOTE(review): input count uses NUM_CORES as in the original code;
      // confirm whether cores_per_cluster is what is intended here.
      l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);

      mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
      l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));

      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
        cluster_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
        cluster_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
      }
    } else if (cores_per_cluster == 1) {
      // single core, no L2 level: the core talks to the cluster port directly
      cluster_req_ports.at(0) = mem_req_ports.at(i);
      cluster_rsp_ports.at(0) = mem_rsp_ports.at(i);
    }

    for (uint32_t j = 0; j < cores_per_cluster; ++j) {
      auto& core = cores_.at((i * cores_per_cluster) + j);
      cluster_rsp_ports.at(j)->bind(&core->MemRspPort);
      core->MemReqPort.bind(cluster_req_ports.at(j));
    }
  }
}
// Attaches the given RAM to every core of the processor.
// ram: non-owning pointer that is forwarded unchanged to each core.
void Processor::attach_ram(RAM* ram) {
  // iterate by reference (as run() does) to avoid copying each core handle
  for (auto& core : cores_) {
    core->attach_ram(ram);
  }
}
// Empty destructor body: no explicit teardown is performed here.
Processor::~Processor() {}
// Runs the simulation until no core is running or a core reports an ebreak.
//
// Each iteration advances the simulation platform by one step and then polls
// the cores in order. The first core found with a pending ebreak ends the run
// and integer register 3 of that core becomes the exit code.
//
// Returns 0 when all cores stopped without an ebreak, otherwise the value of
// integer register 3 of the core that hit the ebreak.
int Processor::run() {
  for (;;) {
    SimPlatform::instance().step();

    bool any_active = false;
    for (auto& core : cores_) {
      if (core->running()) {
        any_active = true;
      }
      if (core->check_ebreak()) {
        return core->getIRegValue(3);
      }
    }

    if (!any_active) {
      return 0;
    }
  }
}

View file

@ -1,189 +1,27 @@
#pragma once
#include "constants.h"
#include "debug.h"
#include "types.h"
#include "core.h"
namespace vortex {
class Processor {
public:
typedef std::shared_ptr<Processor> Ptr;
Processor(const ArchDef& arch);
~Processor();
void attach_ram(RAM* mem);
int run();
private:
ArchDef arch_;
Decoder decoder_;
MemoryUnit mu_;
RAM ram_;
std::vector<Core::Ptr> cores_;
std::vector<Cache::Ptr> l2caches_;
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
Cache::Ptr l3cache_;
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
MemSim::Ptr memsim_;
public:
Processor(const ArchDef& arch)
: arch_(arch)
, decoder_(arch)
, mu_(0, arch.wsize(), true)
, ram_((1<<12), (1<<20))
, cores_(arch.num_cores())
, l2caches_(NUM_CLUSTERS)
, l2_mem_switches_(NUM_CLUSTERS)
{
uint32_t num_cores = arch.num_cores();
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
// bind RAM to memory unit
mu_.attach(ram_, 0, 0xFFFFFFFF);
// create cores
for (uint32_t i = 0; i < num_cores; ++i) {
cores_.at(i) = Core::Create(arch, decoder_, mu_, i);
}
// connect memory sub-systen
memsim_ = MemSim::Create(1, MEM_LATENCY);
std::vector<SlavePort<MemReq>*> mem_req_ports(1);
std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
if (L3_ENABLE) {
l3cache_ = Cache::Create("l3cache", CacheConfig{
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L3_NUM_BANKS, // number of banks
L3_NUM_PORTS, // number of ports
NUM_CLUSTERS, // request size
true, // write-throught
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
});
mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
l3cache_->MemReqPort.bind(mem_req_ports.at(0));
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
}
} else if (NUM_CLUSTERS > 1) {
l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
}
}
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
if (L2_ENABLE) {
auto& l2cache = l2caches_.at(i);
l2cache = Cache::Create("l2cache", CacheConfig{
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
L2_NUM_BANKS, // number of banks
L2_NUM_PORTS, // number of ports
NUM_CORES, // request size
true, // write-throught
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
l2cache->MemReqPort.bind(mem_req_ports.at(i));
mem_req_ports.resize(cores_per_cluster);
mem_rsp_ports.resize(cores_per_cluster);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
}
} else if (cores_per_cluster > 1) {
auto& l2_mem_switch = l2_mem_switches_.at(i);
l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
mem_req_ports.resize(cores_per_cluster);
mem_rsp_ports.resize(cores_per_cluster);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
}
}
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
auto& core = cores_.at((i * NUM_CLUSTERS) + j);
mem_rsp_ports.at(i)->bind(&core->MemRspPort);
core->MemReqPort.bind(mem_req_ports.at(j));
}
}
}
~Processor() {}
int run(const std::string& program, bool riscv_test, bool /*showStats*/) {
{
std::string program_ext(fileExtension(program.c_str()));
if (program_ext == "bin") {
ram_.loadBinImage(program.c_str(), STARTUP_ADDR);
} else if (program_ext == "hex") {
ram_.loadHexImage(program.c_str());
} else {
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
return -1;
}
}
bool running;
int exitcode = 0;
do {
SimPlatform::instance().step();
running = false;
for (auto& core : cores_) {
if (core->running()) {
running = true;
}
if (core->check_ebreak()) {
exitcode = core->getIRegValue(3);
running = false;
break;
}
}
} while (running);
// get error status
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed." << std::endl;
exitcode = 0;
} else {
std::cout << "Failed." << std::endl;
}
} else {
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
return exitcode;
}
};
}

View file

@ -10,6 +10,7 @@ private:
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
public:
Scoreboard(const ArchDef &arch)
@ -29,42 +30,87 @@ public:
|| (state.used_fregs & in_use_fregs_.at(state.wid)) != 0
|| (state.used_vregs & in_use_vregs_.at(state.wid)) != 0;
}
std::vector<uint64_t> owners(const pipeline_state_t& state) const {
std::vector<uint64_t> out;
{
uint32_t r = 0;
auto used_iregs = state.used_iregs & in_use_iregs_.at(state.wid);
while (used_iregs.any()) {
if (used_iregs.test(0)) {
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Integer;
out.push_back(owners_.at(tag));
}
used_iregs >>= 1;
++r;
}
}
{
uint32_t r = 0;
auto used_fregs = state.used_fregs & in_use_fregs_.at(state.wid);
while (used_fregs.any()) {
if (used_fregs.test(0)) {
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Float;
out.push_back(owners_.at(tag));
}
used_fregs >>= 1;
++r;
}
}
{
uint32_t r = 0;
auto used_vregs = state.used_vregs & in_use_vregs_.at(state.wid);
while (used_vregs.any()) {
if (used_vregs.test(0)) {
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Vector;
out.push_back(owners_.at(tag));
}
used_vregs >>= 1;
++r;
}
}
return std::move(out);
}
void reserve(const pipeline_state_t& state) {
if (!state.rdest)
return;
if (!state.wb)
return;
switch (state.rdest_type) {
case 1:
case RegType::Integer:
in_use_iregs_.at(state.wid).set(state.rdest);
break;
case 2:
case RegType::Float:
in_use_fregs_.at(state.wid).set(state.rdest);
break;
case 3:
case RegType::Vector:
in_use_vregs_.at(state.wid).set(state.rdest);
break;
default:
break;
}
}
uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
assert(owners_.count(tag) == 0);
owners_[tag] = state.id;
}
void release(const pipeline_state_t& state) {
if (!state.rdest)
return;
if (!state.wb)
return;
switch (state.rdest_type) {
case 1:
case RegType::Integer:
in_use_iregs_.at(state.wid).reset(state.rdest);
break;
case 2:
case RegType::Float:
in_use_fregs_.at(state.wid).reset(state.rdest);
break;
case 3:
case RegType::Vector:
in_use_vregs_.at(state.wid).reset(state.rdest);
break;
default:
break;
}
uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
owners_.erase(tag);
}
};

View file

@ -4,6 +4,7 @@
#include <bitset>
#include <queue>
#include <unordered_map>
#include <util.h>
#include <VX_config.h>
#include <simobject.h>
@ -20,7 +21,25 @@ typedef std::bitset<32> RegMask;
typedef std::bitset<32> ThreadMask;
typedef std::bitset<32> WarpMask;
// Register file an operand belongs to; None means "no register".
enum class RegType {
  None,
  Integer,
  Float,
  Vector
};

// Prints the register-name prefix for a register type:
// "r" (integer), "fr" (float), "vr" (vector); nothing is written for None.
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
  switch (type) {
  case RegType::Integer: return os << "r";
  case RegType::Float:   return os << "fr";
  case RegType::Vector:  return os << "vr";
  case RegType::None:    return os;
  }
  return os;
}
enum class ExeType {
NOP,
ALU,
LSU,
CSR,
@ -29,6 +48,19 @@ enum class ExeType {
MAX,
};
// Prints the upper-case name of an execution unit type.
// ExeType::MAX produces no output.
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
  switch (type) {
  case ExeType::NOP: return os << "NOP";
  case ExeType::ALU: return os << "ALU";
  case ExeType::LSU: return os << "LSU";
  case ExeType::CSR: return os << "CSR";
  case ExeType::FPU: return os << "FPU";
  case ExeType::GPU: return os << "GPU";
  case ExeType::MAX: return os;
  }
  return os;
}
enum class AluType {
ARITH,
BRANCH,
@ -36,6 +68,33 @@ enum class AluType {
IDIV,
};
// Prints the upper-case name of an ALU operation class.
inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
  switch (type) {
  case AluType::ARITH:  return os << "ARITH";
  case AluType::BRANCH: return os << "BRANCH";
  case AluType::IMUL:   return os << "IMUL";
  case AluType::IDIV:   return os << "IDIV";
  }
  return os;
}
// Operation kinds for LsuType (presumably the load/store unit named by
// ExeType::LSU — confirm). Values are spelled out and match the implicit
// numbering.
enum class LsuType {
  LOAD     = 0,
  STORE    = 1,
  FENCE    = 2,
  PREFETCH = 3,
};
// Streams the name of an LsuType enumerator; any other value writes nothing.
inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
  const char* label = nullptr;
  switch (type) {
  case LsuType::LOAD:     label = "LOAD";     break;
  case LsuType::STORE:    label = "STORE";    break;
  case LsuType::FENCE:    label = "FENCE";    break;
  case LsuType::PREFETCH: label = "PREFETCH"; break;
  }
  if (label != nullptr) {
    os << label;
  }
  return os;
}
enum class FpuType {
FNCP,
FMA,
@ -44,6 +103,17 @@ enum class FpuType {
FCVT,
};
// Streams the name of an FpuType enumerator; any other value writes nothing.
inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
  const char* label = nullptr;
  switch (type) {
  case FpuType::FNCP:  label = "FNCP";  break;
  case FpuType::FMA:   label = "FMA";   break;
  case FpuType::FDIV:  label = "FDIV";  break;
  case FpuType::FSQRT: label = "FSQRT"; break;
  case FpuType::FCVT:  label = "FCVT";  break;
  }
  if (label != nullptr) {
    os << label;
  }
  return os;
}
enum class GpuType {
TMC,
WSPAWN,
@ -53,11 +123,31 @@ enum class GpuType {
TEX,
};
// Streams the name of a GpuType enumerator; any other value writes nothing.
inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
  const char* label = nullptr;
  switch (type) {
  case GpuType::TMC:    label = "TMC";    break;
  case GpuType::WSPAWN: label = "WSPAWN"; break;
  case GpuType::SPLIT:  label = "SPLIT";  break;
  case GpuType::JOIN:   label = "JOIN";   break;
  case GpuType::BAR:    label = "BAR";    break;
  case GpuType::TEX:    label = "TEX";    break;
  }
  if (label != nullptr) {
    os << label;
  }
  return os;
}
// Arbitration policy selector. Values are spelled out and match the implicit
// numbering.
enum class ArbiterType {
  Priority   = 0,
  RoundRobin = 1
};
// Streams the name of an ArbiterType enumerator; any other value writes nothing.
inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
  const char* label = nullptr;
  switch (type) {
  case ArbiterType::Priority:   label = "Priority";   break;
  case ArbiterType::RoundRobin: label = "RoundRobin"; break;
  }
  if (label != nullptr) {
    os << label;
  }
  return os;
}
///////////////////////////////////////////////////////////////////////////////
template <typename T>
@ -65,6 +155,8 @@ class Queue {
protected:
std::queue<T> queue_;
uint32_t count;
public:
Queue() {}
@ -77,6 +169,7 @@ public:
}
// Bumps the element counter, then appends a copy of `value` to the underlying
// std::queue.
void push(const T& value) {
  count += 1;
  queue_.push(value);
}
@ -141,6 +234,7 @@ public:
return i;
}
}
assert(false);
return -1;
}
@ -148,6 +242,7 @@ public:
auto& entry = entries_.at(index);
assert(entry.first);
entry.first = false;
--capacity_;
}
void remove(uint32_t index, T* value) {
@ -155,6 +250,7 @@ public:
assert(entry.first);
*value = entry.second;
entry.first = false;
--capacity_;
}
};
@ -163,29 +259,21 @@ public:
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
class Switch : public SimObject<Switch<Req, Rsp>> {
private:
struct req_t {
struct req_batch_t {
std::vector<Req> data;
std::bitset<MaxInputs> valid;
req_t() {}
req_t(uint32_t size) : data(size) {}
req_batch_t() {}
req_batch_t(uint32_t size)
: data(size)
, valid(0)
{}
};
// Stores `req` in the slot of the current request batch that belongs to input
// `port_id`, then marks that slot valid. `at()` range-checks the port index.
void handleIncomingRequest(const Req& req, uint32_t port_id) {
  auto& slot = cur_req_.data.at(port_id);
  slot = req;
  cur_req_.valid.set(port_id);
}
// Appends a copy of `rsp` to the pending-response queue. The second argument
// is ignored.
void handleIncomingResponse(const Rsp& rsp, uint32_t /*unused*/) {
  rsps_.emplace(rsp);
}
ArbiterType type_;
std::queue<req_t> reqs_;
std::queue<Rsp> rsps_;
req_t cur_req_;
std::queue<req_batch_t> reqq_;
uint32_t delay_;
uint32_t cursor_;
std::unordered_map<uint32_t, uint32_t> addr_table_;
uint32_t tag_shift_;
public:
Switch(
@ -197,12 +285,12 @@ public:
)
: SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)
, type_(type)
, cur_req_(num_inputs)
, delay_(delay)
, cursor_(0)
, ReqIn(num_inputs, {this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingRequest})
, tag_shift_(log2ceil(num_inputs))
, ReqIn(num_inputs, this)
, ReqOut(this)
, RspIn(this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingResponse)
, RspIn(this)
, RspOut(num_inputs, this)
{
assert(delay_ != 0);
@ -210,36 +298,52 @@ public:
}
void step(uint64_t /*cycle*/) {
if (cur_req_.valid.any()) {
reqs_.push(cur_req_);
cur_req_.valid.reset();
}
while (!reqs_.empty()) {
auto& entry = reqs_.front();
bool found = false;
for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) {
auto j = (cursor_ + i) % n;
if (entry.valid.test(j)) {
auto& req = entry.data.at(j);
addr_table_[req.tag] = j;
ReqOut.send(req, delay_);
entry.valid.reset(j);
this->update_cursor(j);
found = true;
break;
// process incoming requests
{
req_batch_t req_batch(ReqIn.size());
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
Req req;
if (ReqIn.at(i).read(&req)) {
req_batch.data.at(i) = req;
req_batch.valid.set(i);
}
}
if (found)
break;
reqs_.pop();
if (req_batch.valid.any()) {
reqq_.push(req_batch);
}
}
// apply arbitration
if (!reqq_.empty()) {
auto& req_batch = reqq_.front();
for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) {
auto j = (cursor_ + i) % n;
if (req_batch.valid.test(j)) {
auto& req = req_batch.data.at(j);
if (tag_shift_) {
req.tag = (req.tag << tag_shift_) | j;
}
ReqOut.send(req, delay_);
req_batch.valid.reset(j);
this->update_cursor(j);
if (!req_batch.valid.any())
reqq_.pop(); // pop when empty
break;
}
}
}
if (!rsps_.empty()) {
auto& rsp = rsps_.front();
auto port_id = addr_table_.at(rsp.tag);
RspOut.at(port_id).send(rsp, 1);
rsps_.pop();
// process incoming responses
{
Rsp rsp;
if (RspIn.read(&rsp)) {
uint32_t port_id = 0;
if (tag_shift_) {
port_id = rsp.tag & ((1 << tag_shift_)-1);
rsp.tag >>= tag_shift_;
}
RspOut.at(port_id).send(rsp, 1);
}
}
}

View file

@ -24,30 +24,34 @@ Warp::Warp(Core *core, Word id)
void Warp::eval(pipeline_state_t *pipeline_state) {
assert(tmask_.any());
DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask=");
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
for (int i = 0, n = core_->arch().num_threads(); i < n; ++i)
DPN(2, tmask_.test(n-i-1));
DPN(2, "\n");
DPN(2, ", PC=0x" << std::hex << PC_ << std::endl);
/* Fetch and decode. */
Word fetched = core_->icache_fetch(PC_);
auto instr = core_->decoder().decode(fetched, PC_);
Word instr_code = core_->icache_read(PC_, sizeof(Word));
auto instr = core_->decoder().decode(instr_code);
if (!instr) {
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
std::abort();
}
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
// Update state
pipeline_state->cid = core_->id();
pipeline_state->wid = id_;
pipeline_state->PC = PC_;
pipeline_state->tmask = tmask_;
pipeline_state->rdest = instr->getRDest();
pipeline_state->rdest_type = instr->getRDType();
pipeline_state->used_iregs.reset();
pipeline_state->used_fregs.reset();
pipeline_state->used_vregs.reset();
// Execute
this->execute(*instr, pipeline_state);
D(4, "Register state:");
DP(4, "Register state:");
for (int i = 0; i < core_->arch().num_regs(); ++i) {
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
for (int j = 0; j < core_->arch().num_threads(); ++j) {

View file

@ -44,6 +44,8 @@
#define VERILATOR_RESET_VALUE 2
#endif
#define RAM_PAGE_SIZE 4096
using namespace vortex;
static uint64_t timestamp = 0;
@ -136,7 +138,7 @@ opae_sim::opae_sim()
: stop_(false)
, host_buffer_ids_(0) {
vl_obj_ = new VL_OBJ();
ram_ = new RAM((1<<12), (1<<20));
ram_ = new RAM(RAM_PAGE_SIZE);
// reset the device
this->reset();