DRAM simulator fix

Blaise Tine 2021-12-07 22:44:06 -05:00
parent a9ec1c08a7
commit 5825b7c15a
30 changed files with 702 additions and 499 deletions

View file

@ -124,8 +124,7 @@ public:
future_.wait();
}
// start new run
future_ = std::async(std::launch::async, [&]{
processor_.reset();
future_ = std::async(std::launch::async, [&]{
processor_.run();
});
return 0;

View file

@ -8,11 +8,17 @@
#include <vortex.h>
#include <vx_utils.h>
#include <processor.h>
#include <constants.h>
#include <VX_config.h>
#include <util.h>
#include <processor.h>
#include <archdef.h>
#include <mem.h>
#include <constants.h>
using namespace vortex;
///////////////////////////////////////////////////////////////////////////////
@ -59,13 +65,11 @@ public:
vx_device()
: arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS)
, ram_(RAM_PAGE_SIZE)
, processor_(arch_)
, mem_allocation_(ALLOC_BASE_ADDR)
{
// setup memory simulator
memsim_ = MemSim::Create(MemSim::Config{
DRAM_CHANNELS,
arch_.num_cores()
});
// attach memory module
processor_.attach_ram(&ram_);
}
~vx_device() {
@ -122,28 +126,7 @@ public:
// start new run
future_ = std::async(std::launch::async, [&]{
if (processor_) {
// release current processor instance
processor_->MemReqPort.unbind();
memsim_->MemRspPort.unbind();
SimPlatform::instance().release_object(processor_);
}
// create new processor instance
processor_ = Processor::Create(arch_);
processor_->MemReqPort.bind(&memsim_->MemReqPort);
memsim_->MemRspPort.bind(&processor_->MemRspPort);
// attach memory object
processor_->attach_ram(&ram_);
// run simulation
int exitcode;
for (;;) {
SimPlatform::instance().step();
if (processor_->check_exit(&exitcode))
break;
};
processor_.run();
});
return 0;
@ -167,8 +150,7 @@ public:
private:
ArchDef arch_;
RAM ram_;
MemSim::Ptr memsim_;
Processor::Ptr processor_;
Processor processor_;
uint64_t mem_allocation_;
std::future<void> future_;
};
@ -207,9 +189,6 @@ extern int vx_dev_open(vx_device_h* hdevice) {
if (nullptr == hdevice)
return -1;
if (!SimPlatform::instance().initialize())
return -1;
*hdevice = new vx_device();
#ifdef DUMP_PERF_STATS
@ -232,8 +211,6 @@ extern int vx_dev_close(vx_device_h hdevice) {
delete device;
SimPlatform::instance().finalize();
return 0;
}
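For reference, here is a standalone sketch (not vortex.cpp itself) of the start/wait pattern used above, with a made-up FakeProcessor and Device standing in for vortex::Processor and vx_device: any previous asynchronous run is waited on before processor_.run() is launched on a worker thread.

#include <future>

struct FakeProcessor { void run() {} };   // stand-in for vortex::Processor

class Device {
public:
  int start() {
    // wait for the previous run to complete, if any
    if (future_.valid())
      future_.wait();
    // start a new run on a worker thread
    future_ = std::async(std::launch::async, [&]{ processor_.run(); });
    return 0;
  }
  void wait_done() {
    if (future_.valid())
      future_.wait();
  }
private:
  FakeProcessor processor_;
  std::future<void> future_;
};

int main() {
  Device d;
  d.start();
  d.wait_done();
  return 0;
}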

View file

@ -127,7 +127,7 @@ public:
virtual ~SimEventBase() {}
virtual void fire() const = 0;
virtual void fire() const = 0;
uint64_t time() const {
return time_;
@ -219,15 +219,21 @@ public:
const std::string& name() const {
return name_;
}
virtual void step(uint64_t cycle) = 0;
}
protected:
SimObjectBase(const SimContext& ctx, const char* name);
private:
virtual void do_reset() = 0;
virtual void do_tick() = 0;
std::string name_;
friend class SimPlatform;
};
///////////////////////////////////////////////////////////////////////////////
@ -246,18 +252,22 @@ protected:
: SimObjectBase(ctx, name)
{}
void step(uint64_t cycle) override {
this->impl().step(cycle);
}
private:
const Impl& impl() const {
return static_cast<const Impl&>(*this);
const Impl* impl() const {
return static_cast<const Impl*>(this);
}
Impl& impl() {
return static_cast<Impl&>(*this);
Impl* impl() {
return static_cast<Impl*>(this);
}
void do_reset() override {
this->impl()->reset();
}
void do_tick() override {
this->impl()->tick();
}
};
@ -282,10 +292,6 @@ public:
return true;
}
void flush() {
instance().clear();
}
void finalize() {
instance().clear();
}
@ -310,7 +316,15 @@ public:
events_.emplace_back(evt);
}
void step() {
void reset() {
events_.clear();
for (auto& object : objects_) {
object->do_reset();
}
cycles_ = 0;
}
void tick() {
// evaluate events
auto evt_it = events_.begin();
auto evt_it_end = events_.end();
@ -325,7 +339,7 @@ public:
}
// evaluate components
for (auto& object : objects_) {
object->step(cycles_);
object->do_tick();
}
// advance clock
++cycles_;
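As a side note, a minimal standalone sketch of the CRTP dispatch introduced in SimObject above (MiniPlatform and Counter are invented for illustration and are not part of the Vortex sources): the platform drives components only through the private do_reset()/do_tick() virtuals, while each component simply provides reset() and tick().

#include <cstdint>
#include <iostream>

class SimObjectBase {
public:
  virtual ~SimObjectBase() {}
private:
  virtual void do_reset() = 0;
  virtual void do_tick() = 0;
  friend class MiniPlatform;
};

template <typename Impl>
class SimObject : public SimObjectBase {
private:
  Impl* impl() { return static_cast<Impl*>(this); }
  void do_reset() override { this->impl()->reset(); } // forwards to Impl::reset()
  void do_tick() override { this->impl()->tick(); }   // forwards to Impl::tick()
};

class Counter : public SimObject<Counter> {
public:
  void reset() { value = 0; }
  void tick() { ++value; }
  uint64_t value = 0;
};

class MiniPlatform {
public:
  void run(SimObjectBase* obj, int cycles) {
    obj->do_reset();
    for (int i = 0; i < cycles; ++i) {
      obj->do_tick();
    }
  }
};

int main() {
  Counter counter;
  MiniPlatform().run(&counter, 5);
  std::cout << counter.value << std::endl; // prints 5
}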

View file

@ -49,12 +49,12 @@ int main(int argc, char **argv) {
parse_args(argc, argv);
for (auto program : programs) {
std::cout << "Running " << program << "..." << std::endl;
vortex::RAM ram(RAM_PAGE_SIZE);
vortex::Processor processor;
processor.attach_ram(&ram);
vortex::RAM ram(RAM_PAGE_SIZE);
vortex::Processor processor;
processor.attach_ram(&ram);
for (auto program : programs) {
std::cout << "Running " << program << "..." << std::endl;
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {

View file

@ -22,6 +22,7 @@
#include <VX_config.h>
#include <ostream>
#include <list>
#include <queue>
#include <vector>
#include <sstream>
#include <unordered_map>
@ -39,7 +40,9 @@
#endif
#endif
#define ENABLE_MEM_STALLS
#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1
#endif
#ifndef TRACE_START_TIME
#define TRACE_START_TIME 0ull
@ -126,12 +129,7 @@ public:
}
~Impl() {
for (auto& buf : print_bufs_) {
auto str = buf.second.str();
if (!str.empty()) {
std::cout << "#" << buf.first << ": " << str << std::endl;
}
}
this->cout_flush();
#ifdef VCD_OUTPUT
trace_->close();
@ -147,10 +145,46 @@ public:
}
}
void cout_flush() {
for (auto& buf : print_bufs_) {
auto str = buf.second.str();
if (!str.empty()) {
std::cout << "#" << buf.first << ": " << str << std::endl;
}
}
}
void attach_ram(RAM* ram) {
ram_ = ram;
}
int run() {
int exitcode = 0;
#ifndef NDEBUG
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
#endif
// reset device
this->reset();
// execute program
while (device_->busy) {
if (get_ebreak()) {
exitcode = get_last_wb_value(3);
break;
}
this->tick();
}
// wait 5 cycles to flush the pipeline
this->wait(5);
return exitcode;
}
private:
void reset() {
print_bufs_.clear();
@ -178,33 +212,11 @@ public:
// Turn on assertion after reset
Verilated::assertOn(true);
this->cout_flush();
}
int run() {
int exitcode = 0;
#ifndef NDEBUG
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
#endif
// execute program
while (device_->busy) {
if (get_ebreak()) {
exitcode = get_last_wb_value(3);
break;
}
this->step();
}
// wait 5 cycles to flush the pipeline
this->wait(5);
return exitcode;
}
private:
void step() {
void tick() {
device_->clk = 0;
this->eval();
@ -224,7 +236,19 @@ private:
this->eval_avs_bus(1);
#endif
dram_->tick();
if (MEM_CYCLE_RATIO > 0) {
auto cycle = timestamp / 2;
if ((cycle % MEM_CYCLE_RATIO) == 0)
dram_->tick();
} else {
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
dram_->tick();
}
if (!dram_queue_.empty()) {
if (dram_->send(dram_queue_.front()))
dram_queue_.pop();
}
#ifndef NDEBUG
fflush(stdout);
@ -372,7 +396,7 @@ private:
ramulator::Request::Type::WRITE,
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
} else {
// process reads
@ -393,7 +417,7 @@ private:
}, placeholders::_1, mem_req),
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
}
@ -490,7 +514,7 @@ private:
ramulator::Request::Type::WRITE,
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
} else {
// process reads
@ -511,7 +535,7 @@ private:
}, placeholders::_1, mem_req),
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
}
@ -522,7 +546,7 @@ private:
void wait(uint32_t cycles) {
for (int i = 0; i < cycles; ++i) {
this->step();
this->tick();
}
}
@ -574,6 +598,8 @@ private:
RAM *ram_;
ramulator::Gem5Wrapper* dram_;
std::queue<ramulator::Request> dram_queue_;
};
///////////////////////////////////////////////////////////////////////////////
@ -590,10 +616,6 @@ void Processor::attach_ram(RAM* mem) {
impl_->attach_ram(mem);
}
void Processor::reset() {
impl_->reset();
}
int Processor::run() {
return impl_->run();
}
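To summarize the two mechanisms added above: a positive MEM_CYCLE_RATIO ticks the DRAM model once every that-many core cycles, a zero or negative value ticks it (1 - MEM_CYCLE_RATIO) times per core cycle, and requests are parked in dram_queue_ and re-offered every tick until send() accepts them. A standalone sketch of the same logic, with DramModel and Request as stand-ins for the ramulator types:

#include <cstdint>
#include <queue>

struct Request { uint64_t addr; };   // stand-in for ramulator::Request

struct DramModel {                   // stand-in for ramulator::Gem5Wrapper
  bool send(const Request&) { return true; }  // may return false while busy
  void tick() {}
};

#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1   // <= 0: DRAM ticks (1 - ratio) times per core cycle
#endif                       //  > 0: DRAM ticks once every 'ratio' core cycles

void tick_memory(uint64_t cycle, DramModel& dram, std::queue<Request>& dram_queue) {
  if (MEM_CYCLE_RATIO > 0) {
    if ((cycle % MEM_CYCLE_RATIO) == 0)
      dram.tick();
  } else {
    for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
      dram.tick();
  }
  // retry the oldest pending request until the DRAM model accepts it
  if (!dram_queue.empty()) {
    if (dram.send(dram_queue.front()))
      dram_queue.pop();
  }
}

int main() {
  DramModel dram;
  std::queue<Request> pending;
  pending.push({0x1000});
  for (uint64_t cycle = 0; cycle < 4; ++cycle) {
    tick_memory(cycle, dram, pending);
  }
  return 0;
}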

View file

@ -8,12 +8,10 @@ class Processor {
public:
Processor();
virtual ~Processor();
~Processor();
void attach_ram(RAM* ram);
void reset();
int run();
private:

View file

@ -102,6 +102,12 @@ struct block_t {
struct set_t {
std::vector<block_t> blocks;
set_t(uint32_t size) : blocks(size) {}
void clear() {
for (auto& block : blocks) {
block.valid = false;
}
}
};
struct bank_req_info_t {
@ -117,6 +123,7 @@ struct bank_req_t {
uint64_t tag;
uint32_t set_id;
uint32_t core_id;
uint64_t uuid;
std::vector<bank_req_info_t> infos;
bank_req_t(uint32_t size)
@ -126,6 +133,7 @@ struct bank_req_t {
, tag(0)
, set_id(0)
, core_id(0)
, uuid(0)
, infos(size)
{}
};
@ -142,20 +150,20 @@ struct mshr_entry_t : public bank_req_t {
class MSHR {
private:
std::vector<mshr_entry_t> entries_;
uint32_t capacity_;
uint32_t size_;
public:
MSHR(uint32_t size)
: entries_(size)
, capacity_(0)
, size_(0)
{}
bool empty() const {
return (0 == capacity_);
return (0 == size_);
}
bool full() const {
return (capacity_ == entries_.size());
return (size_ == entries_.size());
}
int lookup(const bank_req_t& bank_req) {
@ -178,7 +186,7 @@ public:
entry.valid = true;
entry.mshr_replay = false;
entry.block_id = block_id;
++capacity_;
++size_;
return i;
}
}
@ -204,12 +212,21 @@ public:
if (entry.valid && entry.mshr_replay) {
*out = entry;
entry.valid = false;
--capacity_;
--size_;
return true;
}
}
return false;
}
void clear() {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
entry.valid = false;
}
}
size_ = 0;
}
};
struct bank_t {
@ -221,6 +238,13 @@ struct bank_t {
: sets(params.sets_per_bank, params.blocks_per_set)
, mshr(config.mshr_size)
{}
void clear() {
mshr.clear();
for (auto& set : sets) {
set.clear();
}
}
};
///////////////////////////////////////////////////////////////////////////////
@ -235,11 +259,11 @@ private:
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
uint32_t flush_cycles_;
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
uint32_t flush_cycles_;
uint64_t pending_fill_reqs_;
public:
Impl(Cache* simobject, const Config& config)
@ -249,9 +273,6 @@ public:
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
, pending_read_reqs_(0)
, pending_write_reqs_(0)
, pending_fill_reqs_(0)
{
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
@ -272,19 +293,28 @@ public:
// calculate tag flush cycles
flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
void step(uint64_t cycle) {
void reset() {
for (auto& bank : banks_) {
bank.clear();
}
perf_stats_ = PerfStats();
pending_read_reqs_ = 0;
pending_write_reqs_ = 0;
pending_fill_reqs_ = 0;
}
void tick() {
// wait on flush cycles
if (flush_cycles_ != 0) {
--flush_cycles_;
return;
}
// per-bank pipeline request
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
// calculate memory latency
perf_stats_.mem_latency += pending_fill_reqs_;
@ -294,12 +324,11 @@ public:
auto& mem_rsp = bypass_port.front();
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
MemRsp core_rsp{tag, mem_rsp.core_id};
MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
bypass_port.pop();
}
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
}
// handle MSHR replay
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
@ -351,6 +380,7 @@ public:
bank_req.tag = tag;
bank_req.set_id = set_id;
bank_req.core_id = core_req.core_id;
bank_req.uuid = core_req.uuid;
bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
auto& bank = banks_.at(bank_id);
@ -400,22 +430,31 @@ public:
// remove request
auto time = core_req_port.pop();
perf_stats_.pipeline_stalls += (cycle - time);
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
}
// process active request
this->processBankRequest(pipeline_reqs);
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
private:
void processIORequest(const MemReq& core_req, uint32_t req_id) {
{
MemReq mem_req(core_req);
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
if (core_req.write && config_.write_reponse) {
simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1);
MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
@ -442,8 +481,9 @@ public:
if (pipeline_req.mshr_replay) {
// send core response
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
} else {
bool hit = false;
@ -485,7 +525,9 @@ public:
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
} else {
// mark block as dirty
hit_block.dirty = true;
@ -494,8 +536,9 @@ public:
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
@ -516,6 +559,7 @@ public:
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++perf_stats_.evictions;
}
}
@ -527,13 +571,16 @@ public:
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
// send core response
if (config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
@ -550,7 +597,9 @@ public:
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++pending_fill_reqs_;
}
}
@ -575,8 +624,12 @@ Cache::~Cache() {
delete impl_;
}
void Cache::step(uint64_t cycle) {
impl_->step(cycle);
void Cache::reset() {
impl_->reset();
}
void Cache::tick() {
impl_->tick();
}
const Cache::PerfStats& Cache::perf_stats() const {

View file

@ -22,6 +22,7 @@ public:
uint16_t mshr_size; // MSHR buffer size
uint8_t latency; // pipeline latency
};
struct PerfStats {
uint64_t reads;
uint64_t writes;
@ -54,7 +55,9 @@ public:
Cache(const SimContext& ctx, const char* name, const Config& config);
~Cache();
void step(uint64_t cycle);
void reset();
void tick();
const PerfStats& perf_stats() const;

View file

@ -1,10 +1,16 @@
#pragma once
#include "types.h"
#ifndef RAM_PAGE_SIZE
#define RAM_PAGE_SIZE 4096
#endif
#define DRAM_CHANNELS 2
#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1
#endif
#ifndef MEMORY_BANKS
#define MEMORY_BANKS 2
#endif
namespace vortex {

View file

@ -30,7 +30,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, exe_units_((int)ExeType::MAX)
, icache_(Cache::Create("Icache", Cache::Config{
, icache_(Cache::Create("icache", Cache::Config{
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_BLOCK_SIZE),// B
2, // W
@ -45,7 +45,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
NUM_WARPS, // mshr
2, // pipeline latency
}))
, dcache_(Cache::Create("Dcache", Cache::Config{
, dcache_(Cache::Create("dcache", Cache::Config{
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_BLOCK_SIZE),// B
2, // W
@ -72,15 +72,6 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
, fetch_latch_("fetch")
, decode_latch_("decode")
, pending_icache_(arch_.num_warps())
, active_warps_(1)
, stalled_warps_(0)
, last_schedule_wid_(0)
, issued_instrs_(0)
, committed_instrs_(0)
, csr_tex_unit_(0)
, ecall_(false)
, ebreak_(false)
, perf_mem_pending_reads_(0)
{
for (int i = 0; i < arch_.num_warps(); ++i) {
warps_.at(i) = std::make_shared<Warp>(this, i);
@ -112,10 +103,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
#endif
sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i));
dcache_->CoreRspPorts.at(i).bind(&sw->RspIn);
}
// activate warp0
warps_.at(0)->setTmask(0, true);
}
// memory perf callbacks
MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
@ -128,9 +116,62 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
__unused (cycle);
--perf_mem_pending_reads_;
});
this->reset();
}
Core::~Core() {
this->cout_flush();
}
void Core::reset() {
for (auto& warp : warps_) {
warp->clear();
}
warps_.at(0)->setTmask(0, true);
active_warps_ = 1;
for (auto& tex_unit : tex_units_) {
tex_unit.clear();
}
for ( auto& barrier : barriers_) {
barrier.reset();
}
for (auto& csr : csrs_) {
csr = 0;
}
for (auto& fcsr : fcsrs_) {
fcsr = 0;
}
for (auto& ibuf : ibuffers_) {
ibuf.clear();
}
scoreboard_.clear();
fetch_latch_.clear();
decode_latch_.clear();
pending_icache_.clear();
stalled_warps_.reset();
last_schedule_wid_ = 0;
issued_instrs_ = 0;
committed_instrs_ = 0;
csr_tex_unit_ = 0;
ecall_ = false;
ebreak_ = false;
perf_mem_pending_reads_ = 0;
perf_stats_ = PerfStats();
}
void Core::attach_ram(RAM* ram) {
// bind RAM to memory unit
mmu_.attach(*ram, 0, 0xFFFFFFFF);
}
void Core::cout_flush() {
for (auto& buf : print_bufs_) {
auto str = buf.second.str();
if (!str.empty()) {
@ -139,17 +180,12 @@ Core::~Core() {
}
}
void Core::attach_ram(RAM* ram) {
// bind RAM to memory unit
mmu_.attach(*ram, 0, 0xFFFFFFFF);
}
void Core::step(uint64_t cycle) {
this->commit(cycle);
this->execute(cycle);
this->decode(cycle);
this->fetch(cycle);
this->schedule(cycle);
void Core::tick() {
this->commit();
this->execute();
this->decode();
this->fetch();
this->schedule();
// update perf counter
perf_stats_.mem_latency += perf_mem_pending_reads_;
@ -157,9 +193,7 @@ void Core::step(uint64_t cycle) {
DPN(2, std::flush);
}
void Core::schedule(uint64_t cycle) {
__unused (cycle);
void Core::schedule() {
bool foundSchedule = false;
int scheduled_warp = last_schedule_wid_;
@ -181,30 +215,27 @@ void Core::schedule(uint64_t cycle) {
// suspend warp until decode
stalled_warps_.set(scheduled_warp);
auto& warp = warps_.at(scheduled_warp);
uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_;
auto trace = new pipeline_trace_t(uuid, arch_);
auto& warp = warps_.at(scheduled_warp);
warp->eval(trace);
DT(3, cycle, "pipeline-schedule: " << *trace);
DT(3, "pipeline-schedule: " << *trace);
// advance to fetch stage
fetch_latch_.push(trace);
}
void Core::fetch(uint64_t cycle) {
__unused (cycle);
void Core::fetch() {
// handle icache reponse
auto& icache_rsp_port = icache_->CoreRspPorts.at(0);
if (!icache_rsp_port.empty()){
auto& mem_rsp = icache_rsp_port.front();
auto trace = pending_icache_.at(mem_rsp.tag);
decode_latch_.push(trace);
DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
DT(3, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
pending_icache_.release(mem_rsp.tag);
icache_rsp_port.pop();
}
@ -216,16 +247,15 @@ void Core::fetch(uint64_t cycle) {
mem_req.addr = trace->PC;
mem_req.write = false;
mem_req.tag = pending_icache_.allocate(trace);
mem_req.core_id = id_;
icache_->CoreReqPorts.at(0).send(mem_req, 1);
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
mem_req.core_id = trace->cid;
mem_req.uuid = trace->uuid;
icache_->CoreReqPorts.at(0).send(mem_req, 1);
DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop();
}
}
void Core::decode(uint64_t cycle) {
__unused (cycle);
void Core::decode() {
if (decode_latch_.empty())
return;
@ -235,7 +265,7 @@ void Core::decode(uint64_t cycle) {
auto& ibuffer = ibuffers_.at(trace->wid);
if (ibuffer.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** ibuffer-stall: " << *trace);
DT(3, "*** ibuffer-stall: " << *trace);
}
++perf_stats_.ibuf_stalls;
return;
@ -257,7 +287,7 @@ void Core::decode(uint64_t cycle) {
if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH)
perf_stats_.branches += active_threads;
DT(3, cycle, "pipeline-decode: " << *trace);
DT(3, "pipeline-decode: " << *trace);
// insert to ibuffer
ibuffer.push(trace);
@ -265,9 +295,7 @@ void Core::decode(uint64_t cycle) {
decode_latch_.pop();
}
void Core::execute(uint64_t cycle) {
__unused (cycle);
void Core::execute() {
// issue ibuffer instructions
for (auto& ibuffer : ibuffers_) {
if (ibuffer.empty())
@ -278,7 +306,7 @@ void Core::execute(uint64_t cycle) {
// check scoreboard
if (scoreboard_.in_use(trace)) {
if (!trace->suspend()) {
DTH(3, cycle, "*** scoreboard-stall: dependents={");
DTH(3, "*** scoreboard-stall: dependents={");
auto uses = scoreboard_.get_uses(trace);
for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
auto& use = uses.at(i);
@ -297,7 +325,7 @@ void Core::execute(uint64_t cycle) {
// update scoreboard
scoreboard_.reserve(trace);
DT(3, cycle, "pipeline-issue: " << *trace);
DT(3, "pipeline-issue: " << *trace);
// push to execute units
auto& exe_unit = exe_units_.at((int)trace->exe_type);
@ -308,9 +336,7 @@ void Core::execute(uint64_t cycle) {
}
}
void Core::commit(uint64_t cycle) {
__unused (cycle);
void Core::commit() {
// commit completed instructions
bool wb = false;
for (auto& exe_unit : exe_units_) {
@ -323,7 +349,7 @@ void Core::commit(uint64_t cycle) {
wb |= trace->wb;
// advance to commit stage
DT(3, cycle, "pipeline-commit: " << *trace);
DT(3, "pipeline-commit: " << *trace);
// update scoreboard
scoreboard_.release(trace);

View file

@ -75,16 +75,14 @@ public:
bool running() const;
void step(uint64_t cycle);
void reset();
void tick();
Word id() const {
return id_;
}
Warp& warp(int i) {
return *warps_.at(i);
}
const Decoder& decoder() {
return decoder_;
}
@ -125,14 +123,16 @@ public:
private:
void schedule(uint64_t cycle);
void fetch(uint64_t cycle);
void decode(uint64_t cycle);
void execute(uint64_t cycle);
void commit(uint64_t cycle);
void schedule();
void fetch();
void decode();
void execute();
void commit();
void writeToStdOut(Addr addr, Word data);
void cout_flush();
Word id_;
const ArchDef arch_;
const Decoder decoder_;

View file

@ -33,15 +33,15 @@
} \
} while(0)
#define DT(lvl, t, x) do { \
#define DT(lvl, x) do { \
if ((lvl) <= DEBUG_LEVEL) { \
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl; \
std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x << std::endl; \
} \
} while(0)
#define DTH(lvl, t, x) do { \
#define DTH(lvl, x) do { \
if ((lvl) <= DEBUG_LEVEL) { \
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x; \
std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x; \
} \
} while(0)
@ -58,8 +58,8 @@
#define DPH(lvl, x) do {} while(0)
#define DPN(lvl, x) do {} while(0)
#define DT(lvl, t, x) do {} while(0)
#define DTH(lvl, t, x) do {} while(0)
#define DT(lvl, x) do {} while(0)
#define DTH(lvl, x) do {} while(0)
#define DTN(lvl, x) do {} while(0)
#endif
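Call sites throughout the pipeline (core.cpp above, the execute units below) accordingly drop the explicit cycle argument. A standalone sketch of the same pattern, with a made-up MiniSim singleton and TRACE macro standing in for SimPlatform and DT():

#include <cstdint>
#include <iostream>

class MiniSim {                       // stand-in for vortex::SimPlatform
public:
  static MiniSim& instance() { static MiniSim sim; return sim; }
  uint64_t cycles() const { return cycles_; }
  void tick() { ++cycles_; }
private:
  uint64_t cycles_ = 0;
};

// the macro reads the current cycle from the singleton, so call sites
// only pass a verbosity level and a message expression
#define TRACE(lvl, x) do { \
    if ((lvl) <= 3) { \
      std::cout << std::dec << MiniSim::instance().cycles() \
                << ": " << x << std::endl; \
    } \
  } while (0)

int main() {
  MiniSim::instance().tick();
  TRACE(3, "icache-req: addr=0x" << std::hex << 0x80000000u); // "1: icache-req: addr=0x80000000"
  return 0;
}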

View file

@ -87,7 +87,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DPN(2, "-");
continue;
}
rsdata[t][i] = iRegFile_.at(t)[reg];
rsdata[t][i] = ireg_file_.at(t)[reg];
DPN(2, std::hex << rsdata[t][i]);
}
DPN(2, "}" << std::endl);
@ -100,7 +100,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DPN(2, "-");
continue;
}
rsdata[t][i] = fRegFile_.at(t)[reg];
rsdata[t][i] = freg_file_.at(t)[reg];
DPN(2, std::hex << rsdata[t][i]);
}
DPN(2, "}" << std::endl);
@ -460,7 +460,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
DP(4, "dest: v" << rdest);
DP(4, "width" << instr.getVlsWidth());
auto &vd = vRegFile_.at(rdest);
auto &vd = vreg_file_.at(rdest);
switch (instr.getVlsWidth()) {
case 6: {
// load word and unit strided (not checking for unit stride)
@ -517,7 +517,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
switch (instr.getVlsWidth()) {
case 6: {
// store word and unit strided (not checking for unit stride)
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i);
core_->dcache_write(memAddr, value, 4);
DP(4, "store: " << memAddr << " value:" << value);
} break;
@ -784,7 +784,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
// predicate mode
ThreadMask pred;
for (int i = 0; i < num_threads; ++i) {
pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
pred[i] = tmask_.test(i) ? (ireg_file_.at(i).at(rsrc0) != 0) : 0;
}
if (pred.any()) {
tmask_ &= pred;
@ -819,15 +819,15 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->gpu.type = GpuType::SPLIT;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
ThreadMask tmask;
for (int i = 0; i < num_threads; ++i) {
tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
tmask[i] = tmask_.test(i) && !ireg_file_.at(i).at(rsrc0);
}
DomStackEntry e(tmask, nextPC);
domStack_.push(tmask_);
domStack_.push(e);
dom_stack_.push(tmask_);
dom_stack_.push(e);
for (size_t i = 0; i < e.tmask.size(); ++i) {
tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
}
@ -842,7 +842,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DP(3, "*** Unanimous pred");
DomStackEntry e(tmask_);
e.unanimous = true;
domStack_.push(e);
dom_stack_.push(e);
}
} break;
case 3: {
@ -850,25 +850,25 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->exe_type = ExeType::GPU;
trace->gpu.type = GpuType::JOIN;
trace->fetch_stall = true;
if (!domStack_.empty() && domStack_.top().unanimous) {
if (!dom_stack_.empty() && dom_stack_.top().unanimous) {
DP(3, "*** Uninimous branch at join");
tmask_ = domStack_.top().tmask;
tmask_ = dom_stack_.top().tmask;
active_ = tmask_.any();
domStack_.pop();
dom_stack_.pop();
} else {
if (!domStack_.top().fallThrough) {
nextPC = domStack_.top().PC;
if (!dom_stack_.top().fallThrough) {
nextPC = dom_stack_.top().PC;
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
}
tmask_ = domStack_.top().tmask;
tmask_ = dom_stack_.top().tmask;
active_ = tmask_.any();
DPH(3, "*** Join: New TM=");
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
DPN(3, "\n");
domStack_.pop();
dom_stack_.pop();
}
} break;
case 4: {
@ -946,10 +946,10 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
case 0: // vector-vector
switch (func6) {
case 0: {
auto& vr1 = vRegFile_.at(rsrc0);
auto& vr2 = vRegFile_.at(rsrc1);
auto& vd = vRegFile_.at(rdest);
auto& mask = vRegFile_.at(0);
auto& vr1 = vreg_file_.at(rsrc0);
auto& vr2 = vreg_file_.at(rsrc1);
auto& vd = vreg_file_.at(rdest);
auto& mask = vreg_file_.at(0);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t emask = *(uint8_t *)(mask.data() + i);
@ -990,9 +990,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 24: {
// vmseq
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1021,9 +1021,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 25: {
// vmsne
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1052,9 +1052,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 26: {
// vmsltu
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1083,9 +1083,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 27: {
// vmslt
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
int8_t first = *(int8_t *)(vr1.data() + i);
@ -1114,9 +1114,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 28: {
// vmsleu
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1145,9 +1145,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 29: {
// vmsle
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
int8_t first = *(int8_t *)(vr1.data() + i);
@ -1176,9 +1176,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 30: {
// vmsgtu
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1207,9 +1207,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 31: {
// vmsgt
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
int8_t first = *(int8_t *)(vr1.data() + i);
@ -1242,9 +1242,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
switch (func6) {
case 24: {
// vmandnot
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1288,9 +1288,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 25: {
// vmand
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1334,9 +1334,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 26: {
// vmor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1380,9 +1380,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 27: {
// vmxor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1426,9 +1426,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 28: {
// vmornot
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1472,9 +1472,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 29: {
// vmnand
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1518,9 +1518,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 30: {
// vmnor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1564,9 +1564,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 31: {
// vmxnor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1610,9 +1610,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 37: {
// vmul
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1650,9 +1650,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 45: {
// vmacc
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1693,8 +1693,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
case 6: {
switch (func6) {
case 0: {
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t second = *(uint8_t *)(vr2.data() + i);
@ -1729,8 +1729,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 37: {
// vmul.vx
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t second = *(uint8_t *)(vr2.data() + i);
@ -1805,7 +1805,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DPN(2, "-");
continue;
}
iRegFile_.at(t)[rdest] = rddata[t];
ireg_file_.at(t)[rdest] = rddata[t];
DPN(2, "0x" << std::hex << rddata[t]);
}
DPN(2, "}" << std::endl);
@ -1820,7 +1820,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DPN(2, "-");
continue;
}
fRegFile_.at(t)[rdest] = rddata[t];
freg_file_.at(t)[rdest] = rddata[t];
DPN(2, "0x" << std::hex << rddata[t]);
}
DPN(2, "}" << std::endl);

View file

@ -12,7 +12,7 @@ using namespace vortex;
NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
void NopUnit::step(uint64_t /*cycle*/) {
void NopUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
@ -25,26 +25,31 @@ void NopUnit::step(uint64_t /*cycle*/) {
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, num_threads_(core->arch().num_threads())
, pending_dcache_(LSUQ_SIZE)
, pending_rd_reqs_(LSUQ_SIZE)
, fence_lock_(false)
{}
void LsuUnit::step(uint64_t cycle) {
void LsuUnit::reset() {
pending_rd_reqs_.clear();
fence_lock_ = false;
}
void LsuUnit::tick() {
// handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_dcache_.at(mem_rsp.tag);
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_dcache_.release(mem_rsp.tag);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
@ -55,26 +60,26 @@ void LsuUnit::step(uint64_t cycle) {
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_dcache_.at(mem_rsp.tag);
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_dcache_.release(mem_rsp.tag);
pending_rd_reqs_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_dcache_.empty())
if (!pending_rd_reqs_.empty())
return;
Output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, cycle, "fence-unlock: " << fence_state_);
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
@ -87,17 +92,17 @@ void LsuUnit::step(uint64_t cycle) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, cycle, "fence-lock: " << *trace);
DT(3, "fence-lock: " << *trace);
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (cycle - time);
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
return;
}
// check pending queue capacity
if (pending_dcache_.full()) {
if (pending_rd_reqs_.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
DT(3, "*** lsu-queue-stall: " << *trace);
}
return;
} else {
@ -130,7 +135,7 @@ void LsuUnit::step(uint64_t cycle) {
}
}
auto tag = pending_dcache_.allocate({trace, valid_addrs});
auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
@ -145,15 +150,16 @@ void LsuUnit::step(uint64_t cycle) {
mem_req.write = is_write;
mem_req.non_cacheable = (type == AddrType::IO);
mem_req.tag = tag;
mem_req.core_id = core_->id();
mem_req.core_id = trace->cid;
mem_req.uuid = trace->uuid;
if (type == AddrType::Shared) {
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
} else {
dcache_req_port.send(mem_req, 2);
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
}
@ -163,20 +169,20 @@ void LsuUnit::step(uint64_t cycle) {
// do not wait on writes
if (is_write) {
pending_dcache_.release(tag);
pending_rd_reqs_.release(tag);
Output.send(trace, 1);
}
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (cycle - time);
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
void AluUnit::step(uint64_t cycle) {
void AluUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
@ -196,33 +202,33 @@ void AluUnit::step(uint64_t cycle) {
default:
std::abort();
}
DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.alu_stalls += (cycle - time);
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
void CsrUnit::step(uint64_t cycle) {
void CsrUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
Output.send(trace, 1);
auto time = Input.pop();
core_->perf_stats_.csr_stalls += (cycle - time);
DT(3, cycle, "pipeline-execute: op=CSR, " << *trace);
core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time);
DT(3, "pipeline-execute: op=CSR, " << *trace);
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
void FpuUnit::step(uint64_t cycle) {
void FpuUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
@ -245,9 +251,9 @@ void FpuUnit::step(uint64_t cycle) {
default:
std::abort();
}
DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (cycle - time);
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
@ -257,8 +263,12 @@ GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
, num_threads_(core->arch().num_threads())
, pending_tex_reqs_(TEXQ_SIZE)
{}
void GpuUnit::reset() {
pending_tex_reqs_.clear();
}
void GpuUnit::step(uint64_t cycle) {
void GpuUnit::tick() {
#ifdef EXT_TEX_ENABLE
// handle memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
@ -268,7 +278,7 @@ void GpuUnit::step(uint64_t cycle) {
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
@ -312,7 +322,7 @@ void GpuUnit::step(uint64_t cycle) {
issued = true;
break;
case GpuType::TEX:
if (this->processTexRequest(cycle, trace))
if (this->processTexRequest(trace))
issued = true;
break;
default:
@ -320,22 +330,20 @@ void GpuUnit::step(uint64_t cycle) {
}
if (issued) {
DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (cycle - time);
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
}
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
__unused (cycle);
bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {
// check pending queue capacity
if (pending_tex_reqs_.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** tex-queue-stall: " << *trace);
DT(3, "*** tex-queue-stall: " << *trace);
}
return false;
} else {
@ -356,14 +364,15 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
for (auto mem_addr : trace->mem_addrs.at(t)) {
for (auto& mem_addr : trace->mem_addrs.at(t)) {
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = (trace->lsu.type == LsuType::STORE);
mem_req.tag = tag;
mem_req.core_id = core_->id();
mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 3);
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", tid=" << t << ", "<< trace);
++ core_->perf_stats_.tex_reads;
++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();

View file

@ -18,10 +18,14 @@ public:
, Input(this)
, Output(this)
, core_(core)
{}
{}
virtual ~ExeUnit() {}
virtual void reset() {}
virtual void tick() = 0;
protected:
Core* core_;
};
@ -32,7 +36,7 @@ class NopUnit : public ExeUnit {
public:
NopUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -40,14 +44,16 @@ public:
class LsuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_dcache_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
pipeline_trace_t* fence_state_;
bool fence_lock_;
public:
LsuUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void reset();
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -56,7 +62,7 @@ class AluUnit : public ExeUnit {
public:
AluUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -65,7 +71,7 @@ class CsrUnit : public ExeUnit {
public:
CsrUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -74,7 +80,7 @@ class FpuUnit : public ExeUnit {
public:
FpuUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -84,12 +90,14 @@ private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace);
bool processTexRequest(pipeline_trace_t* trace);
public:
GpuUnit(const SimContext& ctx, Core*);
void reset();
void step(uint64_t cycle);
void tick();
};
}

View file

@ -34,6 +34,11 @@ public:
void pop() {
return entries_.pop();
}
void clear() {
std::queue<pipeline_trace_t*> empty;
std::swap(entries_, empty );
}
};
}
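A brief aside on the clear() added above (the same idiom reappears in PipelineLatch further down): std::queue has no clear() member, so the conventional approach is to swap with a freshly constructed empty queue, which also discards the old underlying storage. A self-contained sketch:

#include <queue>
#include <cassert>

template <typename T>
void clear_queue(std::queue<T>& q) {
  std::queue<T> empty;
  std::swap(q, empty);   // q is now empty; old contents are destroyed with 'empty'
}

int main() {
  std::queue<int> q;
  q.push(1);
  q.push(2);
  clear_queue(q);
  assert(q.empty());
  return 0;
}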

View file

@ -6,6 +6,8 @@
#include <stdlib.h>
#include <sys/stat.h>
#include "processor.h"
#include "archdef.h"
#include "mem.h"
#include "constants.h"
#include <util.h>
#include "args.h"
@ -50,11 +52,14 @@ int main(int argc, char **argv) {
std::cout << "Running " << imgFileName << "..." << std::endl;
if (!SimPlatform::instance().initialize())
return -1;
{
// create processor configuation
ArchDef arch(archStr, num_cores, num_warps, num_threads);
// create memory module
RAM ram(RAM_PAGE_SIZE);
// load program
{
std::string program_ext(fileExtension(imgFileName.c_str()));
if (program_ext == "bin") {
@ -67,27 +72,15 @@ int main(int argc, char **argv) {
}
}
ArchDef arch(archStr, num_cores, num_warps, num_threads);
auto processor = Processor::Create(arch);
processor->attach_ram(&ram);
// setup memory simulator
auto memsim = MemSim::Create(MemSim::Config{
DRAM_CHANNELS,
arch.num_cores()
});
processor->MemReqPort.bind(&memsim->MemReqPort);
memsim->MemRspPort.bind(&processor->MemRspPort);
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// run simulation
for (;;) {
SimPlatform::instance().step();
if (processor->check_exit(&exitcode))
break;
};
}
SimPlatform::instance().finalize();
processor.run();
}
if (riscv_test) {
if (1 == exitcode) {

View file

@ -13,6 +13,7 @@ DISABLE_WARNING_POP
#include "constants.h"
#include "types.h"
#include "debug.h"
using namespace vortex;
@ -51,37 +52,50 @@ public:
return perf_stats_;
}
void dram_callback(ramulator::Request& req, uint32_t tag) {
MemRsp mem_rsp{tag, (uint32_t)req.coreid};
void dram_callback(ramulator::Request& req, uint32_t tag, uint64_t uuid) {
if (req.type == ramulator::Request::Type::WRITE)
return;
MemRsp mem_rsp{tag, (uint32_t)req.coreid, uuid};
simobject_->MemRspPort.send(mem_rsp, 1);
DT(3, simobject_->name() << "-" << mem_rsp);
}
void step(uint64_t /*cycle*/) {
dram_->tick();
void reset() {
perf_stats_ = PerfStats();
}
void tick() {
if (MEM_CYCLE_RATIO > 0) {
auto cycle = SimPlatform::instance().cycles();
if ((cycle % MEM_CYCLE_RATIO) == 0)
dram_->tick();
} else {
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
dram_->tick();
}
if (simobject_->MemReqPort.empty())
return;
auto& mem_req = simobject_->MemReqPort.front();
if (mem_req.write) {
ramulator::Request dram_req(
mem_req.addr,
ramulator::Request::Type::WRITE,
mem_req.core_id
);
dram_->send(dram_req);
ramulator::Request dram_req(
mem_req.addr,
mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
mem_req.core_id
);
if (!dram_->send(dram_req))
return;
if (mem_req.write) {
++perf_stats_.writes;
} else {
ramulator::Request dram_req(
mem_req.addr,
ramulator::Request::Type::READ,
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag),
mem_req.core_id
);
dram_->send(dram_req);
++perf_stats_.reads;
}
DT(3, simobject_->name() << "-" << mem_req);
simobject_->MemReqPort.pop();
}
@ -89,8 +103,8 @@ public:
///////////////////////////////////////////////////////////////////////////////
MemSim::MemSim(const SimContext& ctx, const Config& config)
: SimObject<MemSim>(ctx, "MemSim")
MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config)
: SimObject<MemSim>(ctx, name)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
@ -100,6 +114,10 @@ MemSim::~MemSim() {
delete impl_;
}
void MemSim::step(uint64_t cycle) {
impl_->step(cycle);
void MemSim::reset() {
impl_->reset();
}
void MemSim::tick() {
impl_->tick();
}

View file

@ -26,10 +26,12 @@ public:
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
MemSim(const SimContext& ctx, const Config& config);
MemSim(const SimContext& ctx, const char* name, const Config& config);
~MemSim();
void step(uint64_t cycle);
void reset();
void tick();
const PerfStats& perf_stats() const;

View file

@ -98,14 +98,40 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state)
return os;
}
class PipelineLatch : public Queue<pipeline_trace_t*> {
class PipelineLatch {
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
public:
PipelineLatch(const char* name = nullptr)
: name_(name)
{}
bool empty() const {
return queue_.empty();
}
pipeline_trace_t* front() {
return queue_.front();
}
pipeline_trace_t* back() {
return queue_.back();
}
void push(pipeline_trace_t* value) {
queue_.push(value);
}
void pop() {
queue_.pop();
}
void clear() {
std::queue<pipeline_trace_t*> empty;
std::swap(queue_, empty );
}
};
}

View file

@ -1,11 +1,11 @@
#include "processor.h"
#include "core.h"
#include "constants.h"
using namespace vortex;
class Processor::Impl {
private:
Processor* simobject_;
std::vector<Core::Ptr> cores_;
std::vector<Cache::Ptr> l2caches_;
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
@ -13,12 +13,13 @@ private:
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
public:
Impl(Processor* simobject, const ArchDef& arch)
: simobject_(simobject)
, cores_(arch.num_cores())
Impl(const ArchDef& arch)
: cores_(arch.num_cores())
, l2caches_(NUM_CLUSTERS)
, l2_mem_switches_(NUM_CLUSTERS)
{
SimPlatform::instance().initialize();
uint32_t num_cores = arch.num_cores();
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
@ -26,12 +27,15 @@ public:
for (uint32_t i = 0; i < num_cores; ++i) {
cores_.at(i) = Core::Create(arch, i);
}
std::vector<SimPort<MemReq>*> mem_req_ports(1);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1);
mem_req_ports.at(0) = &simobject_->MemReqPort;
mem_rsp_ports.at(0) = &simobject_->MemRspPort;
// setup memory simulator
auto memsim = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
arch.num_cores()
});
std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
if (L3_ENABLE) {
l3cache_ = Cache::Create("l3cache", Cache::Config{
@ -39,7 +43,7 @@ public:
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
32, // address bits
L3_NUM_BANKS, // number of banks
L3_NUM_PORTS, // number of ports
NUM_CLUSTERS, // request size
@ -122,10 +126,8 @@ public:
}
}
~Impl() {}
void step(uint64_t cycle) {
__unused (cycle);
~Impl() {
SimPlatform::instance().finalize();
}
void attach_ram(RAM* ram) {
@ -134,28 +136,33 @@ public:
}
}
bool check_exit(int* exitcode) {
bool running = false;
for (auto& core : cores_) {
if (core->running()) {
running = true;
int run() {
SimPlatform::instance().reset();
bool running;
int exitcode = 0;
do {
SimPlatform::instance().tick();
running = false;
for (auto& core : cores_) {
if (core->running()) {
running = true;
}
if (core->check_exit()) {
exitcode = core->getIRegValue(3);
running = false;
break;
}
}
if (core->check_exit()) {
*exitcode = core->getIRegValue(3);
return true;
}
}
return !running;
} while (running);
return exitcode;
}
};
///////////////////////////////////////////////////////////////////////////////
Processor::Processor(const SimContext& ctx, const ArchDef& arch)
: SimObject<Processor>(ctx, "Vortex")
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, arch))
Processor::Processor(const ArchDef& arch)
: impl_(new Impl(arch))
{}
Processor::~Processor() {
@ -166,10 +173,6 @@ void Processor::attach_ram(RAM* mem) {
impl_->attach_ram(mem);
}
bool Processor::check_exit(int* exitcode) {
return impl_->check_exit(exitcode);
}
void Processor::step(uint64_t cycle) {
impl_->step(cycle);
int Processor::run() {
return impl_->run();
}
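The net effect on the host side is a much smaller driver: the Processor's Impl now initializes the SimPlatform in its constructor and finalizes it in its destructor, and run() resets the platform, ticks it until a core signals exit, and returns the exit code. A hedged usage sketch, mirroring the simx main.cpp above (program loading into RAM is elided):

#include "processor.h"   // vortex::Processor
#include "archdef.h"     // vortex::ArchDef
#include "mem.h"         // vortex::RAM
#include "constants.h"   // RAM_PAGE_SIZE

int simulate() {
  // create processor configuration
  vortex::ArchDef arch("rv32i", 1 /*cores*/, 4 /*warps*/, 4 /*threads*/);
  // create memory module (program loading into 'ram' elided)
  vortex::RAM ram(RAM_PAGE_SIZE);
  // create processor and attach memory
  vortex::Processor processor(arch);
  processor.attach_ram(&ram);
  // run simulation: resets the platform, ticks until exit, returns exit code
  return processor.run();
}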

View file

@ -1,22 +1,18 @@
#pragma once
#include "core.h"
namespace vortex {
class Processor : public SimObject<Processor> {
class ArchDef;
class RAM;
class Processor {
public:
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
Processor(const SimContext& ctx, const ArchDef& arch);
Processor(const ArchDef& arch);
~Processor();
void attach_ram(RAM* mem);
bool check_exit(int* exitcode);
void step(uint64_t cycle);
int run();
private:
class Impl;

View file

@ -24,11 +24,16 @@ public:
, in_use_fregs_(arch.num_warps())
, in_use_vregs_(arch.num_warps())
{
for (int w = 0; w < arch.num_warps(); ++w) {
in_use_iregs_.at(w).reset();
in_use_fregs_.at(w).reset();
in_use_vregs_.at(w).reset();
this->clear();
}
void clear() {
for (int i = 0, n = in_use_iregs_.size(); i < n; ++i) {
in_use_iregs_.at(i).reset();
in_use_fregs_.at(i).reset();
in_use_vregs_.at(i).reset();
}
owners_.clear();
}
bool in_use(pipeline_trace_t* state) const {

View file

@ -45,7 +45,11 @@ public:
virtual ~SharedMem() {}
void step(uint64_t /*cycle*/) {
void reset() {
perf_stats_ = PerfStats();
}
void tick() {
std::vector<bool> in_used_banks(config_.num_banks);
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
auto& core_req_port = this->Inputs.at(req_id);

View file

@ -16,6 +16,12 @@ TexUnit::TexUnit(Core* core) : core_(core) {}
TexUnit::~TexUnit() {}
void TexUnit::clear() {
for (auto& state : states_) {
state = 0;
}
}
uint32_t TexUnit::get_state(uint32_t state) {
return states_.at(state);
}

View file

@ -11,6 +11,8 @@ public:
TexUnit(Core* core);
~TexUnit();
void clear();
uint32_t get_state(uint32_t state);
void set_state(uint32_t state, uint32_t value);

View file

@ -213,67 +213,48 @@ struct MemReq {
bool non_cacheable;
uint32_t tag;
uint32_t core_id;
uint64_t uuid;
MemReq(uint64_t _addr = 0,
bool _write = false,
bool _non_cacheable = false,
uint64_t _tag = 0,
uint32_t _core_id = 0
uint32_t _core_id = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, non_cacheable(_non_cacheable)
, tag(_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
os << "mem-" << (req.write ? "wr" : "rd") << ": ";
os << "addr=" << req.addr << ", tag=" << req.tag << ", core_id=" << req.core_id;
os << " (#" << std::dec << req.uuid << ")";
return os;
}
///////////////////////////////////////////////////////////////////////////////
struct MemRsp {
uint64_t tag;
uint32_t core_id;
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0)
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
: tag (_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
};
///////////////////////////////////////////////////////////////////////////////
template <typename T>
class Queue {
protected:
std::queue<T> queue_;
public:
Queue() {}
bool empty() const {
return queue_.empty();
}
const T& front() const {
return queue_.front();
}
T& front() {
return queue_.front();
}
const T& back() const {
return queue_.back();
}
T& back() {
return queue_.back();
}
void push(const T& value) {
queue_.push(value);
}
void pop() {
queue_.pop();
}
};
inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
os << " (#" << std::dec << rsp.uuid << ")";
return os;
}
///////////////////////////////////////////////////////////////////////////////
@ -337,6 +318,14 @@ public:
entry.first = false;
--size_;
}
void clear() {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
entry.first = false;
}
size_ = 0;
}
};
///////////////////////////////////////////////////////////////////////////////
@ -376,7 +365,11 @@ public:
}
}
void step(uint64_t /*cycle*/) {
void reset() {
cursor_ = 0;
}
void tick() {
if (ReqIn.size() == 1)
return;
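A hedged usage example of the extended MemReq/MemRsp declared above: the uuid of the originating instruction travels with the request and is copied into the response, so the DT() traces printed by operator<< on both sides can be correlated (the numeric values here are invented):

#include <iostream>
#include "types.h"   // vortex::MemReq, vortex::MemRsp

void trace_example() {
  vortex::MemReq req(0x80000000 /*addr*/, false /*write*/, false /*non_cacheable*/,
                     7 /*tag*/, 0 /*core_id*/, 1234 /*uuid*/);
  std::cout << req << std::endl;   // mem-rd: addr=..., tag=7, core_id=0 (#1234)
  vortex::MemRsp rsp(req.tag, req.core_id, req.uuid);   // echo tag, core_id and uuid back
  std::cout << rsp << std::endl;   // mem-rsp: tag=7, core_id=0 (#1234)
}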

View file

@ -13,12 +13,28 @@ using namespace vortex;
Warp::Warp(Core *core, Word id)
: id_(id)
, core_(core)
, active_(false)
, PC_(STARTUP_ADDR)
, tmask_(0) {
iRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
fRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
{
this->clear();
}
void Warp::clear() {
active_ = false;
PC_ = STARTUP_ADDR;
tmask_.reset();
for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) {
for (auto& reg : ireg_file_.at(i)) {
reg = 0;
}
for (auto& reg : freg_file_.at(i)) {
reg = 0;
}
for (auto& reg : vreg_file_.at(i)) {
reg = 0;
}
}
}
void Warp::eval(pipeline_trace_t *trace) {
@ -55,7 +71,7 @@ void Warp::eval(pipeline_trace_t *trace) {
for (int i = 0; i < core_->arch().num_regs(); ++i) {
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
for (int j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' ');
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, std::endl);
}

View file

@ -41,6 +41,8 @@ struct vtype {
class Warp {
public:
Warp(Core *core, Word id);
void clear();
bool active() const {
return active_;
@ -84,7 +86,7 @@ public:
}
Word getIRegValue(int reg) const {
return iRegFile_.at(0).at(reg);
return ireg_file_.at(0).at(reg);
}
void eval(pipeline_trace_t *);
@ -100,10 +102,10 @@ private:
Word PC_;
ThreadMask tmask_;
std::vector<std::vector<Word>> iRegFile_;
std::vector<std::vector<Word>> fRegFile_;
std::vector<std::vector<Byte>> vRegFile_;
std::stack<DomStackEntry> domStack_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<Word>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> dom_stack_;
struct vtype vtype_;
int vl_;

View file

@ -23,6 +23,7 @@
#include <future>
#include <list>
#include <queue>
#include <unordered_map>
#ifndef MEMORY_BANKS
@ -33,8 +34,12 @@
#endif
#endif
#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1
#endif
#undef MEM_BLOCK_SIZE
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
#define CACHE_BLOCK_SIZE 64
@ -43,8 +48,6 @@
#define CCI_RQ_SIZE 16
#define CCI_WQ_SIZE 16
#define ENABLE_MEM_STALLS
#ifndef TRACE_START_TIME
#define TRACE_START_TIME 0ull
#endif
@ -144,7 +147,7 @@ public:
future_ = std::async(std::launch::async, [&]{
while (!stop_) {
std::lock_guard<std::mutex> guard(mutex_);
this->step();
this->tick();
}
});
}
@ -206,7 +209,7 @@ public:
device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
this->step();
this->tick();
device_->vcp2af_sRxPort_c0_mmioRdValid = 0;
assert(device_->af2cp_sTxPort_c2_mmioRdValid);
*value = device_->af2cp_sTxPort_c2_data;
@ -220,7 +223,7 @@ public:
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8);
this->step();
this->tick();
device_->vcp2af_sRxPort_c0_mmioWrValid = 0;
}
@ -257,17 +260,29 @@ private:
Verilated::assertOn(true);
}
void step() {
void tick() {
this->sRxPort_bus();
this->sTxPort_bus();
this->avs_bus();
if (!dram_queue_.empty()) {
if (dram_->send(dram_queue_.front()))
dram_queue_.pop();
}
device_->clk = 0;
this->eval();
device_->clk = 1;
this->eval();
dram_->tick();
if (MEM_CYCLE_RATIO > 0) {
auto cycle = timestamp / 2;
if ((cycle % MEM_CYCLE_RATIO) == 0)
dram_->tick();
} else {
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
dram_->tick();
}
#ifndef NDEBUG
fflush(stdout);
@ -403,7 +418,7 @@ private:
ramulator::Request::Type::WRITE,
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
if (device_->avs_read[b]) {
@ -431,7 +446,7 @@ private:
}, placeholders::_1, mem_req),
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
device_->avs_waitrequest[b] = false;
@ -480,6 +495,8 @@ private:
ramulator::Gem5Wrapper* dram_;
std::queue<ramulator::Request> dram_queue_;
Vvortex_afu_shim *device_;
#ifdef VCD_OUTPUT
VerilatedVcdC *trace_;