mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
dram simulator fix
This commit is contained in:
parent
a9ec1c08a7
commit
5825b7c15a
30 changed files with 702 additions and 499 deletions
|
@ -124,8 +124,7 @@ public:
|
|||
future_.wait();
|
||||
}
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.reset();
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
return 0;
|
||||
|
|
|
@ -8,11 +8,17 @@
|
|||
|
||||
#include <vortex.h>
|
||||
#include <vx_utils.h>
|
||||
#include <processor.h>
|
||||
#include <constants.h>
|
||||
|
||||
#include <VX_config.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include <processor.h>
|
||||
#include <archdef.h>
|
||||
#include <mem.h>
|
||||
#include <constants.h>
|
||||
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -59,13 +65,11 @@ public:
|
|||
vx_device()
|
||||
: arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS)
|
||||
, ram_(RAM_PAGE_SIZE)
|
||||
, processor_(arch_)
|
||||
, mem_allocation_(ALLOC_BASE_ADDR)
|
||||
{
|
||||
// setup memory simulator
|
||||
memsim_ = MemSim::Create(MemSim::Config{
|
||||
DRAM_CHANNELS,
|
||||
arch_.num_cores()
|
||||
});
|
||||
// attach memory module
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
|
@ -122,28 +126,7 @@ public:
|
|||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
if (processor_) {
|
||||
// release current processor instance
|
||||
processor_->MemReqPort.unbind();
|
||||
memsim_->MemRspPort.unbind();
|
||||
SimPlatform::instance().release_object(processor_);
|
||||
}
|
||||
|
||||
// create new processor instance
|
||||
processor_ = Processor::Create(arch_);
|
||||
processor_->MemReqPort.bind(&memsim_->MemReqPort);
|
||||
memsim_->MemRspPort.bind(&processor_->MemRspPort);
|
||||
|
||||
// attach memory object
|
||||
processor_->attach_ram(&ram_);
|
||||
|
||||
// run simulation
|
||||
int exitcode;
|
||||
for (;;) {
|
||||
SimPlatform::instance().step();
|
||||
if (processor_->check_exit(&exitcode))
|
||||
break;
|
||||
};
|
||||
processor_.run();
|
||||
});
|
||||
|
||||
return 0;
|
||||
|
@ -167,8 +150,7 @@ public:
|
|||
private:
|
||||
ArchDef arch_;
|
||||
RAM ram_;
|
||||
MemSim::Ptr memsim_;
|
||||
Processor::Ptr processor_;
|
||||
Processor processor_;
|
||||
uint64_t mem_allocation_;
|
||||
std::future<void> future_;
|
||||
};
|
||||
|
@ -207,9 +189,6 @@ extern int vx_dev_open(vx_device_h* hdevice) {
|
|||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
if (!SimPlatform::instance().initialize())
|
||||
return -1;
|
||||
|
||||
*hdevice = new vx_device();
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
|
@ -232,8 +211,6 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
|
||||
delete device;
|
||||
|
||||
SimPlatform::instance().finalize();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -127,7 +127,7 @@ public:
|
|||
|
||||
virtual ~SimEventBase() {}
|
||||
|
||||
virtual void fire() const = 0;
|
||||
virtual void fire() const = 0;
|
||||
|
||||
uint64_t time() const {
|
||||
return time_;
|
||||
|
@ -219,15 +219,21 @@ public:
|
|||
|
||||
const std::string& name() const {
|
||||
return name_;
|
||||
}
|
||||
|
||||
virtual void step(uint64_t cycle) = 0;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
SimObjectBase(const SimContext& ctx, const char* name);
|
||||
|
||||
private:
|
||||
|
||||
virtual void do_reset() = 0;
|
||||
|
||||
virtual void do_tick() = 0;
|
||||
|
||||
std::string name_;
|
||||
|
||||
friend class SimPlatform;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -246,18 +252,22 @@ protected:
|
|||
: SimObjectBase(ctx, name)
|
||||
{}
|
||||
|
||||
void step(uint64_t cycle) override {
|
||||
this->impl().step(cycle);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
const Impl& impl() const {
|
||||
return static_cast<const Impl&>(*this);
|
||||
const Impl* impl() const {
|
||||
return static_cast<const Impl*>(this);
|
||||
}
|
||||
|
||||
Impl& impl() {
|
||||
return static_cast<Impl&>(*this);
|
||||
Impl* impl() {
|
||||
return static_cast<Impl*>(this);
|
||||
}
|
||||
|
||||
// Reset hook invoked through the base interface: forwards to the
// concrete implementation's reset().
void do_reset() override {
  auto* self = this->impl();
  self->reset();
}
|
||||
|
||||
// Tick hook invoked through the base interface: forwards to the
// concrete implementation's tick().
void do_tick() override {
  auto* self = this->impl();
  self->tick();
}
|
||||
};
|
||||
|
||||
|
@ -282,10 +292,6 @@ public:
|
|||
return true;
|
||||
}
|
||||
|
||||
void flush() {
|
||||
instance().clear();
|
||||
}
|
||||
|
||||
// Shuts the platform down by clearing the singleton instance.
void finalize() {
  auto& platform = instance();
  platform.clear();
}
|
||||
|
@ -310,7 +316,15 @@ public:
|
|||
events_.emplace_back(evt);
|
||||
}
|
||||
|
||||
void step() {
|
||||
void reset() {
|
||||
events_.clear();
|
||||
for (auto& object : objects_) {
|
||||
object->do_reset();
|
||||
}
|
||||
cycles_ = 0;
|
||||
}
|
||||
|
||||
void tick() {
|
||||
// evaluate events
|
||||
auto evt_it = events_.begin();
|
||||
auto evt_it_end = events_.end();
|
||||
|
@ -325,7 +339,7 @@ public:
|
|||
}
|
||||
// evaluate components
|
||||
for (auto& object : objects_) {
|
||||
object->step(cycles_);
|
||||
object->do_tick();
|
||||
}
|
||||
// advance clock
|
||||
++cycles_;
|
||||
|
|
|
@ -49,12 +49,12 @@ int main(int argc, char **argv) {
|
|||
|
||||
parse_args(argc, argv);
|
||||
|
||||
for (auto program : programs) {
|
||||
std::cout << "Running " << program << "..." << std::endl;
|
||||
vortex::RAM ram(RAM_PAGE_SIZE);
|
||||
vortex::Processor processor;
|
||||
processor.attach_ram(&ram);
|
||||
|
||||
vortex::RAM ram(RAM_PAGE_SIZE);
|
||||
vortex::Processor processor;
|
||||
processor.attach_ram(&ram);
|
||||
for (auto program : programs) {
|
||||
std::cout << "Running " << program << "..." << std::endl;
|
||||
|
||||
std::string program_ext(fileExtension(program));
|
||||
if (program_ext == "bin") {
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <VX_config.h>
|
||||
#include <ostream>
|
||||
#include <list>
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
|
@ -39,7 +40,9 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#define ENABLE_MEM_STALLS
|
||||
#ifndef MEM_CYCLE_RATIO
|
||||
#define MEM_CYCLE_RATIO -1
|
||||
#endif
|
||||
|
||||
#ifndef TRACE_START_TIME
|
||||
#define TRACE_START_TIME 0ull
|
||||
|
@ -126,12 +129,7 @@ public:
|
|||
}
|
||||
|
||||
~Impl() {
|
||||
for (auto& buf : print_bufs_) {
|
||||
auto str = buf.second.str();
|
||||
if (!str.empty()) {
|
||||
std::cout << "#" << buf.first << ": " << str << std::endl;
|
||||
}
|
||||
}
|
||||
this->cout_flush();
|
||||
|
||||
#ifdef VCD_OUTPUT
|
||||
trace_->close();
|
||||
|
@ -147,10 +145,46 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
void cout_flush() {
|
||||
for (auto& buf : print_bufs_) {
|
||||
auto str = buf.second.str();
|
||||
if (!str.empty()) {
|
||||
std::cout << "#" << buf.first << ": " << str << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Records the RAM instance this simulator should use. Only the pointer is
// stored; no copy of the RAM is made.
void attach_ram(RAM* ram) {
  this->ram_ = ram;
}
|
||||
|
||||
int run() {
|
||||
int exitcode = 0;
|
||||
|
||||
#ifndef NDEBUG
|
||||
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
|
||||
#endif
|
||||
|
||||
// reset device
|
||||
this->reset();
|
||||
|
||||
// execute program
|
||||
while (device_->busy) {
|
||||
if (get_ebreak()) {
|
||||
exitcode = get_last_wb_value(3);
|
||||
break;
|
||||
}
|
||||
this->tick();
|
||||
}
|
||||
|
||||
// wait 5 cycles to flush the pipeline
|
||||
this->wait(5);
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void reset() {
|
||||
print_bufs_.clear();
|
||||
|
||||
|
@ -178,33 +212,11 @@ public:
|
|||
|
||||
// Turn on assertion after reset
|
||||
Verilated::assertOn(true);
|
||||
|
||||
this->cout_flush();
|
||||
}
|
||||
|
||||
int run() {
|
||||
int exitcode = 0;
|
||||
|
||||
#ifndef NDEBUG
|
||||
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
|
||||
#endif
|
||||
|
||||
// execute program
|
||||
while (device_->busy) {
|
||||
if (get_ebreak()) {
|
||||
exitcode = get_last_wb_value(3);
|
||||
break;
|
||||
}
|
||||
this->step();
|
||||
}
|
||||
|
||||
// wait 5 cycles to flush the pipeline
|
||||
this->wait(5);
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void step() {
|
||||
void tick() {
|
||||
|
||||
device_->clk = 0;
|
||||
this->eval();
|
||||
|
@ -224,7 +236,19 @@ private:
|
|||
this->eval_avs_bus(1);
|
||||
#endif
|
||||
|
||||
dram_->tick();
|
||||
if (MEM_CYCLE_RATIO > 0) {
|
||||
auto cycle = timestamp / 2;
|
||||
if ((cycle % MEM_CYCLE_RATIO) == 0)
|
||||
dram_->tick();
|
||||
} else {
|
||||
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
|
||||
dram_->tick();
|
||||
}
|
||||
|
||||
if (!dram_queue_.empty()) {
|
||||
if (dram_->send(dram_queue_.front()))
|
||||
dram_queue_.pop();
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
fflush(stdout);
|
||||
|
@ -372,7 +396,7 @@ private:
|
|||
ramulator::Request::Type::WRITE,
|
||||
0
|
||||
);
|
||||
dram_->send(dram_req);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
} else {
|
||||
// process reads
|
||||
|
@ -393,7 +417,7 @@ private:
|
|||
}, placeholders::_1, mem_req),
|
||||
0
|
||||
);
|
||||
dram_->send(dram_req);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -490,7 +514,7 @@ private:
|
|||
ramulator::Request::Type::WRITE,
|
||||
0
|
||||
);
|
||||
dram_->send(dram_req);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
} else {
|
||||
// process reads
|
||||
|
@ -511,7 +535,7 @@ private:
|
|||
}, placeholders::_1, mem_req),
|
||||
0
|
||||
);
|
||||
dram_->send(dram_req);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -522,7 +546,7 @@ private:
|
|||
|
||||
void wait(uint32_t cycles) {
|
||||
for (int i = 0; i < cycles; ++i) {
|
||||
this->step();
|
||||
this->tick();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -574,6 +598,8 @@ private:
|
|||
RAM *ram_;
|
||||
|
||||
ramulator::Gem5Wrapper* dram_;
|
||||
|
||||
std::queue<ramulator::Request> dram_queue_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -590,10 +616,6 @@ void Processor::attach_ram(RAM* mem) {
|
|||
impl_->attach_ram(mem);
|
||||
}
|
||||
|
||||
void Processor::reset() {
|
||||
impl_->reset();
|
||||
}
|
||||
|
||||
int Processor::run() {
|
||||
return impl_->run();
|
||||
}
|
|
@ -8,12 +8,10 @@ class Processor {
|
|||
public:
|
||||
|
||||
Processor();
|
||||
virtual ~Processor();
|
||||
~Processor();
|
||||
|
||||
void attach_ram(RAM* ram);
|
||||
|
||||
void reset();
|
||||
|
||||
int run();
|
||||
|
||||
private:
|
||||
|
|
|
@ -102,6 +102,12 @@ struct block_t {
|
|||
struct set_t {
|
||||
std::vector<block_t> blocks;
|
||||
set_t(uint32_t size) : blocks(size) {}
|
||||
|
||||
void clear() {
|
||||
for (auto& block : blocks) {
|
||||
block.valid = false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct bank_req_info_t {
|
||||
|
@ -117,6 +123,7 @@ struct bank_req_t {
|
|||
uint64_t tag;
|
||||
uint32_t set_id;
|
||||
uint32_t core_id;
|
||||
uint64_t uuid;
|
||||
std::vector<bank_req_info_t> infos;
|
||||
|
||||
bank_req_t(uint32_t size)
|
||||
|
@ -126,6 +133,7 @@ struct bank_req_t {
|
|||
, tag(0)
|
||||
, set_id(0)
|
||||
, core_id(0)
|
||||
, uuid(0)
|
||||
, infos(size)
|
||||
{}
|
||||
};
|
||||
|
@ -142,20 +150,20 @@ struct mshr_entry_t : public bank_req_t {
|
|||
class MSHR {
|
||||
private:
|
||||
std::vector<mshr_entry_t> entries_;
|
||||
uint32_t capacity_;
|
||||
uint32_t size_;
|
||||
|
||||
public:
|
||||
MSHR(uint32_t size)
|
||||
: entries_(size)
|
||||
, capacity_(0)
|
||||
, size_(0)
|
||||
{}
|
||||
|
||||
bool empty() const {
|
||||
return (0 == capacity_);
|
||||
return (0 == size_);
|
||||
}
|
||||
|
||||
bool full() const {
|
||||
return (capacity_ == entries_.size());
|
||||
return (size_ == entries_.size());
|
||||
}
|
||||
|
||||
int lookup(const bank_req_t& bank_req) {
|
||||
|
@ -178,7 +186,7 @@ public:
|
|||
entry.valid = true;
|
||||
entry.mshr_replay = false;
|
||||
entry.block_id = block_id;
|
||||
++capacity_;
|
||||
++size_;
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
@ -204,12 +212,21 @@ public:
|
|||
if (entry.valid && entry.mshr_replay) {
|
||||
*out = entry;
|
||||
entry.valid = false;
|
||||
--capacity_;
|
||||
--size_;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void clear() {
|
||||
for (auto& entry : entries_) {
|
||||
if (entry.valid && entry.mshr_replay) {
|
||||
entry.valid = false;
|
||||
}
|
||||
}
|
||||
size_ = 0;
|
||||
}
|
||||
};
|
||||
|
||||
struct bank_t {
|
||||
|
@ -221,6 +238,13 @@ struct bank_t {
|
|||
: sets(params.sets_per_bank, params.blocks_per_set)
|
||||
, mshr(config.mshr_size)
|
||||
{}
|
||||
|
||||
void clear() {
|
||||
mshr.clear();
|
||||
for (auto& set : sets) {
|
||||
set.clear();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -235,11 +259,11 @@ private:
|
|||
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
|
||||
std::vector<SimPort<MemReq>> mem_req_ports_;
|
||||
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
|
||||
uint32_t flush_cycles_;
|
||||
PerfStats perf_stats_;
|
||||
uint64_t pending_read_reqs_;
|
||||
uint64_t pending_write_reqs_;
|
||||
uint64_t pending_fill_reqs_;
|
||||
uint32_t flush_cycles_;
|
||||
uint64_t pending_fill_reqs_;
|
||||
|
||||
public:
|
||||
Impl(Cache* simobject, const Config& config)
|
||||
|
@ -249,9 +273,6 @@ public:
|
|||
, banks_(config.num_banks, {config, params_})
|
||||
, mem_req_ports_(config.num_banks, simobject)
|
||||
, mem_rsp_ports_(config.num_banks, simobject)
|
||||
, pending_read_reqs_(0)
|
||||
, pending_write_reqs_(0)
|
||||
, pending_fill_reqs_(0)
|
||||
{
|
||||
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
|
||||
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
|
||||
|
@ -272,19 +293,28 @@ public:
|
|||
|
||||
// calculate tag flush cycles
|
||||
flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
|
||||
}
|
||||
|
||||
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
||||
void step(uint64_t cycle) {
|
||||
void reset() {
|
||||
for (auto& bank : banks_) {
|
||||
bank.clear();
|
||||
}
|
||||
perf_stats_ = PerfStats();
|
||||
pending_read_reqs_ = 0;
|
||||
pending_write_reqs_ = 0;
|
||||
pending_fill_reqs_ = 0;
|
||||
}
|
||||
|
||||
void tick() {
|
||||
// wait on flush cycles
|
||||
if (flush_cycles_ != 0) {
|
||||
--flush_cycles_;
|
||||
return;
|
||||
}
|
||||
|
||||
// per-bank pipeline request
|
||||
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
|
||||
|
||||
// calculate memory latency
|
||||
perf_stats_.mem_latency += pending_fill_reqs_;
|
||||
|
||||
|
@ -294,12 +324,11 @@ public:
|
|||
auto& mem_rsp = bypass_port.front();
|
||||
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
|
||||
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
|
||||
MemRsp core_rsp{tag, mem_rsp.core_id};
|
||||
MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
|
||||
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
|
||||
DT(3, simobject_->name() << "-" << core_rsp);
|
||||
bypass_port.pop();
|
||||
}
|
||||
|
||||
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
|
||||
}
|
||||
|
||||
// handle MSHR replay
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
|
@ -351,6 +380,7 @@ public:
|
|||
bank_req.tag = tag;
|
||||
bank_req.set_id = set_id;
|
||||
bank_req.core_id = core_req.core_id;
|
||||
bank_req.uuid = core_req.uuid;
|
||||
bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
|
||||
|
||||
auto& bank = banks_.at(bank_id);
|
||||
|
@ -400,22 +430,31 @@ public:
|
|||
|
||||
// remove request
|
||||
auto time = core_req_port.pop();
|
||||
perf_stats_.pipeline_stalls += (cycle - time);
|
||||
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
|
||||
}
|
||||
|
||||
// process active request
|
||||
this->processBankRequest(pipeline_reqs);
|
||||
}
|
||||
|
||||
// Read-only access to the accumulated performance counters.
const PerfStats& perf_stats() const {
  return this->perf_stats_;
}
|
||||
|
||||
private:
|
||||
|
||||
void processIORequest(const MemReq& core_req, uint32_t req_id) {
|
||||
{
|
||||
MemReq mem_req(core_req);
|
||||
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
|
||||
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-" << mem_req);
|
||||
}
|
||||
|
||||
if (core_req.write && config_.write_reponse) {
|
||||
simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1);
|
||||
MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
|
||||
simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
|
||||
DT(3, simobject_->name() << "-" << core_rsp);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -442,8 +481,9 @@ public:
|
|||
if (pipeline_req.mshr_replay) {
|
||||
// send core response
|
||||
for (auto& info : pipeline_req.infos) {
|
||||
MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
|
||||
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
|
||||
DT(3, simobject_->name() << "-" << core_rsp);
|
||||
}
|
||||
} else {
|
||||
bool hit = false;
|
||||
|
@ -485,7 +525,9 @@ public:
|
|||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.core_id = pipeline_req.core_id;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-" << mem_req);
|
||||
} else {
|
||||
// mark block as dirty
|
||||
hit_block.dirty = true;
|
||||
|
@ -494,8 +536,9 @@ public:
|
|||
// send core response
|
||||
if (!pipeline_req.write || config_.write_reponse) {
|
||||
for (auto& info : pipeline_req.infos) {
|
||||
MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
|
||||
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
|
||||
DT(3, simobject_->name() << "-" << core_rsp);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -516,6 +559,7 @@ public:
|
|||
mem_req.write = true;
|
||||
mem_req.core_id = pipeline_req.core_id;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-" << mem_req);
|
||||
++perf_stats_.evictions;
|
||||
}
|
||||
}
|
||||
|
@ -527,13 +571,16 @@ public:
|
|||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.core_id = pipeline_req.core_id;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-" << mem_req);
|
||||
}
|
||||
// send core response
|
||||
if (config_.write_reponse) {
|
||||
for (auto& info : pipeline_req.infos) {
|
||||
MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
|
||||
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
|
||||
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
|
||||
DT(3, simobject_->name() << "-" << core_rsp);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
@ -550,7 +597,9 @@ public:
|
|||
mem_req.write = false;
|
||||
mem_req.tag = mshr_id;
|
||||
mem_req.core_id = pipeline_req.core_id;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-" << mem_req);
|
||||
++pending_fill_reqs_;
|
||||
}
|
||||
}
|
||||
|
@ -575,8 +624,12 @@ Cache::~Cache() {
|
|||
delete impl_;
|
||||
}
|
||||
|
||||
void Cache::step(uint64_t cycle) {
|
||||
impl_->step(cycle);
|
||||
void Cache::reset() {
|
||||
impl_->reset();
|
||||
}
|
||||
|
||||
void Cache::tick() {
|
||||
impl_->tick();
|
||||
}
|
||||
|
||||
const Cache::PerfStats& Cache::perf_stats() const {
|
||||
|
|
|
@ -22,6 +22,7 @@ public:
|
|||
uint16_t mshr_size; // MSHR buffer size
|
||||
uint8_t latency; // pipeline latency
|
||||
};
|
||||
|
||||
struct PerfStats {
|
||||
uint64_t reads;
|
||||
uint64_t writes;
|
||||
|
@ -54,7 +55,9 @@ public:
|
|||
Cache(const SimContext& ctx, const char* name, const Config& config);
|
||||
~Cache();
|
||||
|
||||
void step(uint64_t cycle);
|
||||
void reset();
|
||||
|
||||
void tick();
|
||||
|
||||
const PerfStats& perf_stats() const;
|
||||
|
||||
|
|
|
@ -1,10 +1,16 @@
|
|||
#pragma once
|
||||
|
||||
#include "types.h"
|
||||
|
||||
#ifndef RAM_PAGE_SIZE
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
#endif
|
||||
|
||||
#define DRAM_CHANNELS 2
|
||||
#ifndef MEM_CYCLE_RATIO
|
||||
#define MEM_CYCLE_RATIO -1
|
||||
#endif
|
||||
|
||||
#ifndef MEMORY_BANKS
|
||||
#define MEMORY_BANKS 2
|
||||
#endif
|
||||
|
||||
namespace vortex {
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
|||
, ibuffers_(arch.num_warps(), IBUF_SIZE)
|
||||
, scoreboard_(arch_)
|
||||
, exe_units_((int)ExeType::MAX)
|
||||
, icache_(Cache::Create("Icache", Cache::Config{
|
||||
, icache_(Cache::Create("icache", Cache::Config{
|
||||
log2ceil(ICACHE_SIZE), // C
|
||||
log2ceil(L1_BLOCK_SIZE),// B
|
||||
2, // W
|
||||
|
@ -45,7 +45,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
|||
NUM_WARPS, // mshr
|
||||
2, // pipeline latency
|
||||
}))
|
||||
, dcache_(Cache::Create("Dcache", Cache::Config{
|
||||
, dcache_(Cache::Create("dcache", Cache::Config{
|
||||
log2ceil(DCACHE_SIZE), // C
|
||||
log2ceil(L1_BLOCK_SIZE),// B
|
||||
2, // W
|
||||
|
@ -72,15 +72,6 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
|||
, fetch_latch_("fetch")
|
||||
, decode_latch_("decode")
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, active_warps_(1)
|
||||
, stalled_warps_(0)
|
||||
, last_schedule_wid_(0)
|
||||
, issued_instrs_(0)
|
||||
, committed_instrs_(0)
|
||||
, csr_tex_unit_(0)
|
||||
, ecall_(false)
|
||||
, ebreak_(false)
|
||||
, perf_mem_pending_reads_(0)
|
||||
{
|
||||
for (int i = 0; i < arch_.num_warps(); ++i) {
|
||||
warps_.at(i) = std::make_shared<Warp>(this, i);
|
||||
|
@ -112,10 +103,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
|||
#endif
|
||||
sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i));
|
||||
dcache_->CoreRspPorts.at(i).bind(&sw->RspIn);
|
||||
}
|
||||
|
||||
// activate warp0
|
||||
warps_.at(0)->setTmask(0, true);
|
||||
}
|
||||
|
||||
// memory perf callbacks
|
||||
MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
|
||||
|
@ -128,9 +116,62 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
|||
__unused (cycle);
|
||||
--perf_mem_pending_reads_;
|
||||
});
|
||||
|
||||
this->reset();
|
||||
}
|
||||
|
||||
// Flushes any text still held in the print buffers before the core is
// destroyed.
Core::~Core() {
  cout_flush();
}
|
||||
|
||||
void Core::reset() {
|
||||
for (auto& warp : warps_) {
|
||||
warp->clear();
|
||||
}
|
||||
warps_.at(0)->setTmask(0, true);
|
||||
active_warps_ = 1;
|
||||
|
||||
for (auto& tex_unit : tex_units_) {
|
||||
tex_unit.clear();
|
||||
}
|
||||
|
||||
for ( auto& barrier : barriers_) {
|
||||
barrier.reset();
|
||||
}
|
||||
|
||||
for (auto& csr : csrs_) {
|
||||
csr = 0;
|
||||
}
|
||||
|
||||
for (auto& fcsr : fcsrs_) {
|
||||
fcsr = 0;
|
||||
}
|
||||
|
||||
for (auto& ibuf : ibuffers_) {
|
||||
ibuf.clear();
|
||||
}
|
||||
|
||||
scoreboard_.clear();
|
||||
fetch_latch_.clear();
|
||||
decode_latch_.clear();
|
||||
pending_icache_.clear();
|
||||
stalled_warps_.reset();
|
||||
last_schedule_wid_ = 0;
|
||||
issued_instrs_ = 0;
|
||||
committed_instrs_ = 0;
|
||||
csr_tex_unit_ = 0;
|
||||
ecall_ = false;
|
||||
ebreak_ = false;
|
||||
perf_mem_pending_reads_ = 0;
|
||||
perf_stats_ = PerfStats();
|
||||
}
|
||||
|
||||
// Binds the given RAM to the core's memory unit over the address range
// passed to attach(): 0 through 0xFFFFFFFF. `ram` is dereferenced, so it
// must not be null.
void Core::attach_ram(RAM* ram) {
  RAM& target = *ram;
  mmu_.attach(target, 0, 0xFFFFFFFF);
}
|
||||
|
||||
void Core::cout_flush() {
|
||||
for (auto& buf : print_bufs_) {
|
||||
auto str = buf.second.str();
|
||||
if (!str.empty()) {
|
||||
|
@ -139,17 +180,12 @@ Core::~Core() {
|
|||
}
|
||||
}
|
||||
|
||||
void Core::attach_ram(RAM* ram) {
|
||||
// bind RAM to memory unit
|
||||
mmu_.attach(*ram, 0, 0xFFFFFFFF);
|
||||
}
|
||||
|
||||
void Core::step(uint64_t cycle) {
|
||||
this->commit(cycle);
|
||||
this->execute(cycle);
|
||||
this->decode(cycle);
|
||||
this->fetch(cycle);
|
||||
this->schedule(cycle);
|
||||
void Core::tick() {
|
||||
this->commit();
|
||||
this->execute();
|
||||
this->decode();
|
||||
this->fetch();
|
||||
this->schedule();
|
||||
|
||||
// update perf counter
|
||||
perf_stats_.mem_latency += perf_mem_pending_reads_;
|
||||
|
@ -157,9 +193,7 @@ void Core::step(uint64_t cycle) {
|
|||
DPN(2, std::flush);
|
||||
}
|
||||
|
||||
void Core::schedule(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
void Core::schedule() {
|
||||
bool foundSchedule = false;
|
||||
int scheduled_warp = last_schedule_wid_;
|
||||
|
||||
|
@ -181,30 +215,27 @@ void Core::schedule(uint64_t cycle) {
|
|||
// suspend warp until decode
|
||||
stalled_warps_.set(scheduled_warp);
|
||||
|
||||
auto& warp = warps_.at(scheduled_warp);
|
||||
|
||||
uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_;
|
||||
|
||||
auto trace = new pipeline_trace_t(uuid, arch_);
|
||||
|
||||
auto& warp = warps_.at(scheduled_warp);
|
||||
warp->eval(trace);
|
||||
|
||||
DT(3, cycle, "pipeline-schedule: " << *trace);
|
||||
DT(3, "pipeline-schedule: " << *trace);
|
||||
|
||||
// advance to fetch stage
|
||||
fetch_latch_.push(trace);
|
||||
}
|
||||
|
||||
void Core::fetch(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
void Core::fetch() {
|
||||
// handle icache reponse
|
||||
auto& icache_rsp_port = icache_->CoreRspPorts.at(0);
|
||||
if (!icache_rsp_port.empty()){
|
||||
auto& mem_rsp = icache_rsp_port.front();
|
||||
auto trace = pending_icache_.at(mem_rsp.tag);
|
||||
decode_latch_.push(trace);
|
||||
DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
|
||||
DT(3, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
|
||||
pending_icache_.release(mem_rsp.tag);
|
||||
icache_rsp_port.pop();
|
||||
}
|
||||
|
@ -216,16 +247,15 @@ void Core::fetch(uint64_t cycle) {
|
|||
mem_req.addr = trace->PC;
|
||||
mem_req.write = false;
|
||||
mem_req.tag = pending_icache_.allocate(trace);
|
||||
mem_req.core_id = id_;
|
||||
icache_->CoreReqPorts.at(0).send(mem_req, 1);
|
||||
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||
mem_req.core_id = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
icache_->CoreReqPorts.at(0).send(mem_req, 1);
|
||||
DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||
fetch_latch_.pop();
|
||||
}
|
||||
}
|
||||
|
||||
void Core::decode(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
void Core::decode() {
|
||||
if (decode_latch_.empty())
|
||||
return;
|
||||
|
||||
|
@ -235,7 +265,7 @@ void Core::decode(uint64_t cycle) {
|
|||
auto& ibuffer = ibuffers_.at(trace->wid);
|
||||
if (ibuffer.full()) {
|
||||
if (!trace->suspend()) {
|
||||
DT(3, cycle, "*** ibuffer-stall: " << *trace);
|
||||
DT(3, "*** ibuffer-stall: " << *trace);
|
||||
}
|
||||
++perf_stats_.ibuf_stalls;
|
||||
return;
|
||||
|
@ -257,7 +287,7 @@ void Core::decode(uint64_t cycle) {
|
|||
if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH)
|
||||
perf_stats_.branches += active_threads;
|
||||
|
||||
DT(3, cycle, "pipeline-decode: " << *trace);
|
||||
DT(3, "pipeline-decode: " << *trace);
|
||||
|
||||
// insert to ibuffer
|
||||
ibuffer.push(trace);
|
||||
|
@ -265,9 +295,7 @@ void Core::decode(uint64_t cycle) {
|
|||
decode_latch_.pop();
|
||||
}
|
||||
|
||||
void Core::execute(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
void Core::execute() {
|
||||
// issue ibuffer instructions
|
||||
for (auto& ibuffer : ibuffers_) {
|
||||
if (ibuffer.empty())
|
||||
|
@ -278,7 +306,7 @@ void Core::execute(uint64_t cycle) {
|
|||
// check scoreboard
|
||||
if (scoreboard_.in_use(trace)) {
|
||||
if (!trace->suspend()) {
|
||||
DTH(3, cycle, "*** scoreboard-stall: dependents={");
|
||||
DTH(3, "*** scoreboard-stall: dependents={");
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
|
||||
auto& use = uses.at(i);
|
||||
|
@ -297,7 +325,7 @@ void Core::execute(uint64_t cycle) {
|
|||
// update scoreboard
|
||||
scoreboard_.reserve(trace);
|
||||
|
||||
DT(3, cycle, "pipeline-issue: " << *trace);
|
||||
DT(3, "pipeline-issue: " << *trace);
|
||||
|
||||
// push to execute units
|
||||
auto& exe_unit = exe_units_.at((int)trace->exe_type);
|
||||
|
@ -308,9 +336,7 @@ void Core::execute(uint64_t cycle) {
|
|||
}
|
||||
}
|
||||
|
||||
void Core::commit(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
void Core::commit() {
|
||||
// commit completed instructions
|
||||
bool wb = false;
|
||||
for (auto& exe_unit : exe_units_) {
|
||||
|
@ -323,7 +349,7 @@ void Core::commit(uint64_t cycle) {
|
|||
wb |= trace->wb;
|
||||
|
||||
// advance to commit stage
|
||||
DT(3, cycle, "pipeline-commit: " << *trace);
|
||||
DT(3, "pipeline-commit: " << *trace);
|
||||
|
||||
// update scoreboard
|
||||
scoreboard_.release(trace);
|
||||
|
|
|
@ -75,16 +75,14 @@ public:
|
|||
|
||||
bool running() const;
|
||||
|
||||
void step(uint64_t cycle);
|
||||
void reset();
|
||||
|
||||
void tick();
|
||||
|
||||
// Returns this core's identifier.
Word id() const {
  return this->id_;
}
|
||||
|
||||
// Returns a reference to warp `i`. The lookup uses at(), so the index is
// bounds-checked by the container.
Warp& warp(int i) {
  auto& slot = warps_.at(i);
  return *slot;
}
|
||||
|
||||
// Read-only access to the core's instruction decoder.
const Decoder& decoder() {
  return this->decoder_;
}
|
||||
|
@ -125,14 +123,16 @@ public:
|
|||
|
||||
private:
|
||||
|
||||
void schedule(uint64_t cycle);
|
||||
void fetch(uint64_t cycle);
|
||||
void decode(uint64_t cycle);
|
||||
void execute(uint64_t cycle);
|
||||
void commit(uint64_t cycle);
|
||||
void schedule();
|
||||
void fetch();
|
||||
void decode();
|
||||
void execute();
|
||||
void commit();
|
||||
|
||||
void writeToStdOut(Addr addr, Word data);
|
||||
|
||||
void cout_flush();
|
||||
|
||||
Word id_;
|
||||
const ArchDef arch_;
|
||||
const Decoder decoder_;
|
||||
|
|
|
@ -33,15 +33,15 @@
|
|||
} \
|
||||
} while(0)
|
||||
|
||||
#define DT(lvl, t, x) do { \
|
||||
#define DT(lvl, x) do { \
|
||||
if ((lvl) <= DEBUG_LEVEL) { \
|
||||
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl; \
|
||||
std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x << std::endl; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define DTH(lvl, t, x) do { \
|
||||
#define DTH(lvl, x) do { \
|
||||
if ((lvl) <= DEBUG_LEVEL) { \
|
||||
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x; \
|
||||
std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
@ -58,8 +58,8 @@
|
|||
#define DPH(lvl, x) do {} while(0)
|
||||
#define DPN(lvl, x) do {} while(0)
|
||||
|
||||
#define DT(lvl, t, x) do {} while(0)
|
||||
#define DTH(lvl, t, x) do {} while(0)
|
||||
#define DT(lvl, x) do {} while(0)
|
||||
#define DTH(lvl, x) do {} while(0)
|
||||
#define DTN(lvl, x) do {} while(0)
|
||||
|
||||
#endif
|
|
@ -87,7 +87,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
rsdata[t][i] = iRegFile_.at(t)[reg];
|
||||
rsdata[t][i] = ireg_file_.at(t)[reg];
|
||||
DPN(2, std::hex << rsdata[t][i]);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
|
@ -100,7 +100,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
rsdata[t][i] = fRegFile_.at(t)[reg];
|
||||
rsdata[t][i] = freg_file_.at(t)[reg];
|
||||
DPN(2, std::hex << rsdata[t][i]);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
|
@ -460,7 +460,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
|
||||
DP(4, "dest: v" << rdest);
|
||||
DP(4, "width" << instr.getVlsWidth());
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
switch (instr.getVlsWidth()) {
|
||||
case 6: {
|
||||
// load word and unit strided (not checking for unit stride)
|
||||
|
@ -517,7 +517,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
switch (instr.getVlsWidth()) {
|
||||
case 6: {
|
||||
// store word and unit strided (not checking for unit stride)
|
||||
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
|
||||
uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i);
|
||||
core_->dcache_write(memAddr, value, 4);
|
||||
DP(4, "store: " << memAddr << " value:" << value);
|
||||
} break;
|
||||
|
@ -784,7 +784,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
// predicate mode
|
||||
ThreadMask pred;
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
|
||||
pred[i] = tmask_.test(i) ? (ireg_file_.at(i).at(rsrc0) != 0) : 0;
|
||||
}
|
||||
if (pred.any()) {
|
||||
tmask_ &= pred;
|
||||
|
@ -819,15 +819,15 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->gpu.type = GpuType::SPLIT;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
|
||||
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
|
||||
ThreadMask tmask;
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
|
||||
tmask[i] = tmask_.test(i) && !ireg_file_.at(i).at(rsrc0);
|
||||
}
|
||||
|
||||
DomStackEntry e(tmask, nextPC);
|
||||
domStack_.push(tmask_);
|
||||
domStack_.push(e);
|
||||
dom_stack_.push(tmask_);
|
||||
dom_stack_.push(e);
|
||||
for (size_t i = 0; i < e.tmask.size(); ++i) {
|
||||
tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
|
||||
}
|
||||
|
@ -842,7 +842,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
DP(3, "*** Unanimous pred");
|
||||
DomStackEntry e(tmask_);
|
||||
e.unanimous = true;
|
||||
domStack_.push(e);
|
||||
dom_stack_.push(e);
|
||||
}
|
||||
} break;
|
||||
case 3: {
|
||||
|
@ -850,25 +850,25 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu.type = GpuType::JOIN;
|
||||
trace->fetch_stall = true;
|
||||
if (!domStack_.empty() && domStack_.top().unanimous) {
|
||||
if (!dom_stack_.empty() && dom_stack_.top().unanimous) {
|
||||
DP(3, "*** Uninimous branch at join");
|
||||
tmask_ = domStack_.top().tmask;
|
||||
tmask_ = dom_stack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
domStack_.pop();
|
||||
dom_stack_.pop();
|
||||
} else {
|
||||
if (!domStack_.top().fallThrough) {
|
||||
nextPC = domStack_.top().PC;
|
||||
if (!dom_stack_.top().fallThrough) {
|
||||
nextPC = dom_stack_.top().PC;
|
||||
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
|
||||
}
|
||||
|
||||
tmask_ = domStack_.top().tmask;
|
||||
tmask_ = dom_stack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
|
||||
DPH(3, "*** Join: New TM=");
|
||||
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
|
||||
DPN(3, "\n");
|
||||
|
||||
domStack_.pop();
|
||||
dom_stack_.pop();
|
||||
}
|
||||
} break;
|
||||
case 4: {
|
||||
|
@ -946,10 +946,10 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
case 0: // vector-vector
|
||||
switch (func6) {
|
||||
case 0: {
|
||||
auto& vr1 = vRegFile_.at(rsrc0);
|
||||
auto& vr2 = vRegFile_.at(rsrc1);
|
||||
auto& vd = vRegFile_.at(rdest);
|
||||
auto& mask = vRegFile_.at(0);
|
||||
auto& vr1 = vreg_file_.at(rsrc0);
|
||||
auto& vr2 = vreg_file_.at(rsrc1);
|
||||
auto& vd = vreg_file_.at(rdest);
|
||||
auto& mask = vreg_file_.at(0);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t emask = *(uint8_t *)(mask.data() + i);
|
||||
|
@ -990,9 +990,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 24: {
|
||||
// vmseq
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1021,9 +1021,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 25: {
|
||||
// vmsne
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1052,9 +1052,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 26: {
|
||||
// vmsltu
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1083,9 +1083,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 27: {
|
||||
// vmslt
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
int8_t first = *(int8_t *)(vr1.data() + i);
|
||||
|
@ -1114,9 +1114,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 28: {
|
||||
// vmsleu
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1145,9 +1145,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 29: {
|
||||
// vmsle
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
int8_t first = *(int8_t *)(vr1.data() + i);
|
||||
|
@ -1176,9 +1176,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 30: {
|
||||
// vmsgtu
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1207,9 +1207,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 31: {
|
||||
// vmsgt
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
int8_t first = *(int8_t *)(vr1.data() + i);
|
||||
|
@ -1242,9 +1242,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
switch (func6) {
|
||||
case 24: {
|
||||
// vmandnot
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1288,9 +1288,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 25: {
|
||||
// vmand
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1334,9 +1334,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 26: {
|
||||
// vmor
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1380,9 +1380,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 27: {
|
||||
// vmxor
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1426,9 +1426,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 28: {
|
||||
// vmornot
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1472,9 +1472,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 29: {
|
||||
// vmnand
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1518,9 +1518,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 30: {
|
||||
// vmnor
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1564,9 +1564,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 31: {
|
||||
// vmxnor
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1610,9 +1610,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 37: {
|
||||
// vmul
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1650,9 +1650,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 45: {
|
||||
// vmacc
|
||||
auto &vr1 = vRegFile_.at(rsrc0);
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr1 = vreg_file_.at(rsrc0);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
|
@ -1693,8 +1693,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
case 6: {
|
||||
switch (func6) {
|
||||
case 0: {
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
|
@ -1729,8 +1729,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
} break;
|
||||
case 37: {
|
||||
// vmul.vx
|
||||
auto &vr2 = vRegFile_.at(rsrc1);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
auto &vr2 = vreg_file_.at(rsrc1);
|
||||
auto &vd = vreg_file_.at(rdest);
|
||||
if (vtype_.vsew == 8) {
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
|
@ -1805,7 +1805,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
iRegFile_.at(t)[rdest] = rddata[t];
|
||||
ireg_file_.at(t)[rdest] = rddata[t];
|
||||
DPN(2, "0x" << std::hex << rddata[t]);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
|
@ -1820,7 +1820,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
fRegFile_.at(t)[rdest] = rddata[t];
|
||||
freg_file_.at(t)[rdest] = rddata[t];
|
||||
DPN(2, "0x" << std::hex << rddata[t]);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
|
|
|
@ -12,7 +12,7 @@ using namespace vortex;
|
|||
|
||||
NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
|
||||
|
||||
void NopUnit::step(uint64_t /*cycle*/) {
|
||||
void NopUnit::tick() {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = Input.front();
|
||||
|
@ -25,26 +25,31 @@ void NopUnit::step(uint64_t /*cycle*/) {
|
|||
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
|
||||
: ExeUnit(ctx, core, "LSU")
|
||||
, num_threads_(core->arch().num_threads())
|
||||
, pending_dcache_(LSUQ_SIZE)
|
||||
, pending_rd_reqs_(LSUQ_SIZE)
|
||||
, fence_lock_(false)
|
||||
{}
|
||||
|
||||
void LsuUnit::step(uint64_t cycle) {
|
||||
void LsuUnit::reset() {
|
||||
pending_rd_reqs_.clear();
|
||||
fence_lock_ = false;
|
||||
}
|
||||
|
||||
void LsuUnit::tick() {
|
||||
// handle dcache response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = dcache_rsp_port.front();
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
<< ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
Output.send(trace, 1);
|
||||
pending_dcache_.release(mem_rsp.tag);
|
||||
pending_rd_reqs_.release(mem_rsp.tag);
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
}
|
||||
|
@ -55,26 +60,26 @@ void LsuUnit::step(uint64_t cycle) {
|
|||
if (smem_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = smem_rsp_port.front();
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
<< ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
Output.send(trace, 1);
|
||||
pending_dcache_.release(mem_rsp.tag);
|
||||
pending_rd_reqs_.release(mem_rsp.tag);
|
||||
}
|
||||
smem_rsp_port.pop();
|
||||
}
|
||||
|
||||
if (fence_lock_) {
|
||||
// wait for all pending memory operations to complete
|
||||
if (!pending_dcache_.empty())
|
||||
if (!pending_rd_reqs_.empty())
|
||||
return;
|
||||
Output.send(fence_state_, 1);
|
||||
fence_lock_ = false;
|
||||
DT(3, cycle, "fence-unlock: " << fence_state_);
|
||||
DT(3, "fence-unlock: " << fence_state_);
|
||||
}
|
||||
|
||||
// check input queue
|
||||
|
@ -87,17 +92,17 @@ void LsuUnit::step(uint64_t cycle) {
|
|||
// schedule fence lock
|
||||
fence_state_ = trace;
|
||||
fence_lock_ = true;
|
||||
DT(3, cycle, "fence-lock: " << *trace);
|
||||
DT(3, "fence-lock: " << *trace);
|
||||
// remove input
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.lsu_stalls += (cycle - time);
|
||||
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
|
||||
return;
|
||||
}
|
||||
|
||||
// check pending queue capacity
|
||||
if (pending_dcache_.full()) {
|
||||
if (pending_rd_reqs_.full()) {
|
||||
if (!trace->suspend()) {
|
||||
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
|
||||
DT(3, "*** lsu-queue-stall: " << *trace);
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
|
@ -130,7 +135,7 @@ void LsuUnit::step(uint64_t cycle) {
|
|||
}
|
||||
}
|
||||
|
||||
auto tag = pending_dcache_.allocate({trace, valid_addrs});
|
||||
auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});
|
||||
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
|
@ -145,15 +150,16 @@ void LsuUnit::step(uint64_t cycle) {
|
|||
mem_req.write = is_write;
|
||||
mem_req.non_cacheable = (type == AddrType::IO);
|
||||
mem_req.tag = tag;
|
||||
mem_req.core_id = core_->id();
|
||||
mem_req.core_id = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
|
||||
if (type == AddrType::Shared) {
|
||||
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
|
||||
DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
|
||||
} else {
|
||||
dcache_req_port.send(mem_req, 2);
|
||||
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
|
||||
}
|
||||
|
||||
|
@ -163,20 +169,20 @@ void LsuUnit::step(uint64_t cycle) {
|
|||
|
||||
// do not wait on writes
|
||||
if (is_write) {
|
||||
pending_dcache_.release(tag);
|
||||
pending_rd_reqs_.release(tag);
|
||||
Output.send(trace, 1);
|
||||
}
|
||||
|
||||
// remove input
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.lsu_stalls += (cycle - time);
|
||||
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
|
||||
|
||||
void AluUnit::step(uint64_t cycle) {
|
||||
void AluUnit::tick() {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = Input.front();
|
||||
|
@ -196,33 +202,33 @@ void AluUnit::step(uint64_t cycle) {
|
|||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
|
||||
DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
|
||||
if (trace->fetch_stall) {
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.alu_stalls += (cycle - time);
|
||||
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
|
||||
|
||||
void CsrUnit::step(uint64_t cycle) {
|
||||
void CsrUnit::tick() {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = Input.front();
|
||||
Output.send(trace, 1);
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.csr_stalls += (cycle - time);
|
||||
DT(3, cycle, "pipeline-execute: op=CSR, " << *trace);
|
||||
core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time);
|
||||
DT(3, "pipeline-execute: op=CSR, " << *trace);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
|
||||
|
||||
void FpuUnit::step(uint64_t cycle) {
|
||||
void FpuUnit::tick() {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = Input.front();
|
||||
|
@ -245,9 +251,9 @@ void FpuUnit::step(uint64_t cycle) {
|
|||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
|
||||
DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.fpu_stalls += (cycle - time);
|
||||
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -257,8 +263,12 @@ GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
|
|||
, num_threads_(core->arch().num_threads())
|
||||
, pending_tex_reqs_(TEXQ_SIZE)
|
||||
{}
|
||||
|
||||
void GpuUnit::reset() {
|
||||
pending_tex_reqs_.clear();
|
||||
}
|
||||
|
||||
void GpuUnit::step(uint64_t cycle) {
|
||||
void GpuUnit::tick() {
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// handle memory response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
|
@ -268,7 +278,7 @@ void GpuUnit::step(uint64_t cycle) {
|
|||
auto& mem_rsp = dcache_rsp_port.front();
|
||||
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
|
||||
DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
|
@ -312,7 +322,7 @@ void GpuUnit::step(uint64_t cycle) {
|
|||
issued = true;
|
||||
break;
|
||||
case GpuType::TEX:
|
||||
if (this->processTexRequest(cycle, trace))
|
||||
if (this->processTexRequest(trace))
|
||||
issued = true;
|
||||
break;
|
||||
default:
|
||||
|
@ -320,22 +330,20 @@ void GpuUnit::step(uint64_t cycle) {
|
|||
}
|
||||
|
||||
if (issued) {
|
||||
DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
|
||||
DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
|
||||
if (trace->fetch_stall) {
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.fpu_stalls += (cycle - time);
|
||||
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
|
||||
}
|
||||
}
|
||||
|
||||
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
|
||||
__unused (cycle);
|
||||
|
||||
bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {
|
||||
// check pending queue capacity
|
||||
if (pending_tex_reqs_.full()) {
|
||||
if (!trace->suspend()) {
|
||||
DT(3, cycle, "*** tex-queue-stall: " << *trace);
|
||||
DT(3, "*** tex-queue-stall: " << *trace);
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
|
@ -356,14 +364,15 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
|
|||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
|
||||
for (auto mem_addr : trace->mem_addrs.at(t)) {
|
||||
for (auto& mem_addr : trace->mem_addrs.at(t)) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr;
|
||||
mem_req.write = (trace->lsu.type == LsuType::STORE);
|
||||
mem_req.tag = tag;
|
||||
mem_req.core_id = core_->id();
|
||||
mem_req.uuid = trace->uuid;
|
||||
dcache_req_port.send(mem_req, 3);
|
||||
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
<< ", tid=" << t << ", "<< trace);
|
||||
++ core_->perf_stats_.tex_reads;
|
||||
++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
|
||||
|
|
|
@ -18,10 +18,14 @@ public:
|
|||
, Input(this)
|
||||
, Output(this)
|
||||
, core_(core)
|
||||
{}
|
||||
{}
|
||||
|
||||
virtual ~ExeUnit() {}
|
||||
|
||||
virtual void reset() {}
|
||||
|
||||
virtual void tick() = 0;
|
||||
|
||||
protected:
|
||||
Core* core_;
|
||||
};
|
||||
|
@ -32,7 +36,7 @@ class NopUnit : public ExeUnit {
|
|||
public:
|
||||
NopUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -40,14 +44,16 @@ public:
|
|||
class LsuUnit : public ExeUnit {
|
||||
private:
|
||||
uint32_t num_threads_;
|
||||
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_dcache_;
|
||||
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
|
||||
pipeline_trace_t* fence_state_;
|
||||
bool fence_lock_;
|
||||
|
||||
public:
|
||||
LsuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
void reset();
|
||||
|
||||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -56,7 +62,7 @@ class AluUnit : public ExeUnit {
|
|||
public:
|
||||
AluUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -65,7 +71,7 @@ class CsrUnit : public ExeUnit {
|
|||
public:
|
||||
CsrUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -74,7 +80,7 @@ class FpuUnit : public ExeUnit {
|
|||
public:
|
||||
FpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -84,12 +90,14 @@ private:
|
|||
uint32_t num_threads_;
|
||||
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
|
||||
|
||||
bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace);
|
||||
bool processTexRequest(pipeline_trace_t* trace);
|
||||
|
||||
public:
|
||||
GpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void reset();
|
||||
|
||||
void step(uint64_t cycle);
|
||||
void tick();
|
||||
};
|
||||
|
||||
}
|
|
@ -34,6 +34,11 @@ public:
|
|||
void pop() {
|
||||
return entries_.pop();
|
||||
}
|
||||
|
||||
void clear() {
|
||||
std::queue<pipeline_trace_t*> empty;
|
||||
std::swap(entries_, empty );
|
||||
}
|
||||
};
|
||||
|
||||
}
|
|
@ -6,6 +6,8 @@
|
|||
#include <stdlib.h>
|
||||
#include <sys/stat.h>
|
||||
#include "processor.h"
|
||||
#include "archdef.h"
|
||||
#include "mem.h"
|
||||
#include "constants.h"
|
||||
#include <util.h>
|
||||
#include "args.h"
|
||||
|
@ -50,11 +52,14 @@ int main(int argc, char **argv) {
|
|||
|
||||
std::cout << "Running " << imgFileName << "..." << std::endl;
|
||||
|
||||
if (!SimPlatform::instance().initialize())
|
||||
return -1;
|
||||
|
||||
{
|
||||
// create processor configuation
|
||||
ArchDef arch(archStr, num_cores, num_warps, num_threads);
|
||||
|
||||
// create memory module
|
||||
RAM ram(RAM_PAGE_SIZE);
|
||||
|
||||
// load program
|
||||
{
|
||||
std::string program_ext(fileExtension(imgFileName.c_str()));
|
||||
if (program_ext == "bin") {
|
||||
|
@ -67,27 +72,15 @@ int main(int argc, char **argv) {
|
|||
}
|
||||
}
|
||||
|
||||
ArchDef arch(archStr, num_cores, num_warps, num_threads);
|
||||
auto processor = Processor::Create(arch);
|
||||
processor->attach_ram(&ram);
|
||||
|
||||
// setup memory simulator
|
||||
auto memsim = MemSim::Create(MemSim::Config{
|
||||
DRAM_CHANNELS,
|
||||
arch.num_cores()
|
||||
});
|
||||
processor->MemReqPort.bind(&memsim->MemReqPort);
|
||||
memsim->MemRspPort.bind(&processor->MemRspPort);
|
||||
// create processor
|
||||
Processor processor(arch);
|
||||
|
||||
// attach memory module
|
||||
processor.attach_ram(&ram);
|
||||
|
||||
// run simulation
|
||||
for (;;) {
|
||||
SimPlatform::instance().step();
|
||||
if (processor->check_exit(&exitcode))
|
||||
break;
|
||||
};
|
||||
}
|
||||
|
||||
SimPlatform::instance().finalize();
|
||||
processor.run();
|
||||
}
|
||||
|
||||
if (riscv_test) {
|
||||
if (1 == exitcode) {
|
||||
|
|
|
@ -13,6 +13,7 @@ DISABLE_WARNING_POP
|
|||
|
||||
#include "constants.h"
|
||||
#include "types.h"
|
||||
#include "debug.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
|
@ -51,37 +52,50 @@ public:
|
|||
return perf_stats_;
|
||||
}
|
||||
|
||||
void dram_callback(ramulator::Request& req, uint32_t tag) {
|
||||
MemRsp mem_rsp{tag, (uint32_t)req.coreid};
|
||||
void dram_callback(ramulator::Request& req, uint32_t tag, uint64_t uuid) {
|
||||
if (req.type == ramulator::Request::Type::WRITE)
|
||||
return;
|
||||
MemRsp mem_rsp{tag, (uint32_t)req.coreid, uuid};
|
||||
simobject_->MemRspPort.send(mem_rsp, 1);
|
||||
DT(3, simobject_->name() << "-" << mem_rsp);
|
||||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
dram_->tick();
|
||||
void reset() {
|
||||
perf_stats_ = PerfStats();
|
||||
}
|
||||
|
||||
void tick() {
|
||||
if (MEM_CYCLE_RATIO > 0) {
|
||||
auto cycle = SimPlatform::instance().cycles();
|
||||
if ((cycle % MEM_CYCLE_RATIO) == 0)
|
||||
dram_->tick();
|
||||
} else {
|
||||
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
|
||||
dram_->tick();
|
||||
}
|
||||
|
||||
if (simobject_->MemReqPort.empty())
|
||||
return;
|
||||
|
||||
auto& mem_req = simobject_->MemReqPort.front();
|
||||
|
||||
if (mem_req.write) {
|
||||
ramulator::Request dram_req(
|
||||
mem_req.addr,
|
||||
ramulator::Request::Type::WRITE,
|
||||
mem_req.core_id
|
||||
);
|
||||
dram_->send(dram_req);
|
||||
ramulator::Request dram_req(
|
||||
mem_req.addr,
|
||||
mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
|
||||
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
|
||||
mem_req.core_id
|
||||
);
|
||||
|
||||
if (!dram_->send(dram_req))
|
||||
return;
|
||||
|
||||
if (mem_req.write) {
|
||||
++perf_stats_.writes;
|
||||
} else {
|
||||
ramulator::Request dram_req(
|
||||
mem_req.addr,
|
||||
ramulator::Request::Type::READ,
|
||||
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag),
|
||||
mem_req.core_id
|
||||
);
|
||||
dram_->send(dram_req);
|
||||
++perf_stats_.reads;
|
||||
}
|
||||
|
||||
DT(3, simobject_->name() << "-" << mem_req);
|
||||
|
||||
simobject_->MemReqPort.pop();
|
||||
}
|
||||
|
@ -89,8 +103,8 @@ public:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
MemSim::MemSim(const SimContext& ctx, const Config& config)
|
||||
: SimObject<MemSim>(ctx, "MemSim")
|
||||
MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config)
|
||||
: SimObject<MemSim>(ctx, name)
|
||||
, MemReqPort(this)
|
||||
, MemRspPort(this)
|
||||
, impl_(new Impl(this, config))
|
||||
|
@ -100,6 +114,10 @@ MemSim::~MemSim() {
|
|||
delete impl_;
|
||||
}
|
||||
|
||||
void MemSim::step(uint64_t cycle) {
|
||||
impl_->step(cycle);
|
||||
void MemSim::reset() {
|
||||
impl_->reset();
|
||||
}
|
||||
|
||||
void MemSim::tick() {
|
||||
impl_->tick();
|
||||
}
|
|
@ -26,10 +26,12 @@ public:
|
|||
SimPort<MemReq> MemReqPort;
|
||||
SimPort<MemRsp> MemRspPort;
|
||||
|
||||
MemSim(const SimContext& ctx, const Config& config);
|
||||
MemSim(const SimContext& ctx, const char* name, const Config& config);
|
||||
~MemSim();
|
||||
|
||||
void step(uint64_t cycle);
|
||||
void reset();
|
||||
|
||||
void tick();
|
||||
|
||||
const PerfStats& perf_stats() const;
|
||||
|
||||
|
|
|
@ -98,14 +98,40 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state)
|
|||
return os;
|
||||
}
|
||||
|
||||
class PipelineLatch : public Queue<pipeline_trace_t*> {
|
||||
class PipelineLatch {
|
||||
protected:
|
||||
const char* name_;
|
||||
std::queue<pipeline_trace_t*> queue_;
|
||||
|
||||
public:
|
||||
PipelineLatch(const char* name = nullptr)
|
||||
: name_(name)
|
||||
{}
|
||||
|
||||
bool empty() const {
|
||||
return queue_.empty();
|
||||
}
|
||||
|
||||
pipeline_trace_t* front() {
|
||||
return queue_.front();
|
||||
}
|
||||
|
||||
pipeline_trace_t* back() {
|
||||
return queue_.back();
|
||||
}
|
||||
|
||||
void push(pipeline_trace_t* value) {
|
||||
queue_.push(value);
|
||||
}
|
||||
|
||||
void pop() {
|
||||
queue_.pop();
|
||||
}
|
||||
|
||||
void clear() {
|
||||
std::queue<pipeline_trace_t*> empty;
|
||||
std::swap(queue_, empty );
|
||||
}
|
||||
};
|
||||
|
||||
}
|
|
@ -1,11 +1,11 @@
|
|||
#include "processor.h"
|
||||
#include "core.h"
|
||||
#include "constants.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
class Processor::Impl {
|
||||
private:
|
||||
Processor* simobject_;
|
||||
std::vector<Core::Ptr> cores_;
|
||||
std::vector<Cache::Ptr> l2caches_;
|
||||
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
|
||||
|
@ -13,12 +13,13 @@ private:
|
|||
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
|
||||
|
||||
public:
|
||||
Impl(Processor* simobject, const ArchDef& arch)
|
||||
: simobject_(simobject)
|
||||
, cores_(arch.num_cores())
|
||||
Impl(const ArchDef& arch)
|
||||
: cores_(arch.num_cores())
|
||||
, l2caches_(NUM_CLUSTERS)
|
||||
, l2_mem_switches_(NUM_CLUSTERS)
|
||||
{
|
||||
SimPlatform::instance().initialize();
|
||||
|
||||
uint32_t num_cores = arch.num_cores();
|
||||
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
|
||||
|
||||
|
@ -26,12 +27,15 @@ public:
|
|||
for (uint32_t i = 0; i < num_cores; ++i) {
|
||||
cores_.at(i) = Core::Create(arch, i);
|
||||
}
|
||||
|
||||
std::vector<SimPort<MemReq>*> mem_req_ports(1);
|
||||
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1);
|
||||
|
||||
mem_req_ports.at(0) = &simobject_->MemReqPort;
|
||||
mem_rsp_ports.at(0) = &simobject_->MemRspPort;
|
||||
// setup memory simulator
|
||||
auto memsim = MemSim::Create("dram", MemSim::Config{
|
||||
MEMORY_BANKS,
|
||||
arch.num_cores()
|
||||
});
|
||||
|
||||
std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
|
||||
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
|
||||
|
||||
if (L3_ENABLE) {
|
||||
l3cache_ = Cache::Create("l3cache", Cache::Config{
|
||||
|
@ -39,7 +43,7 @@ public:
|
|||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
2, // W
|
||||
0, // A
|
||||
32, // address bits
|
||||
32, // address bits
|
||||
L3_NUM_BANKS, // number of banks
|
||||
L3_NUM_PORTS, // number of ports
|
||||
NUM_CLUSTERS, // request size
|
||||
|
@ -122,10 +126,8 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
~Impl() {}
|
||||
|
||||
void step(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
~Impl() {
|
||||
SimPlatform::instance().finalize();
|
||||
}
|
||||
|
||||
void attach_ram(RAM* ram) {
|
||||
|
@ -134,28 +136,33 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
bool check_exit(int* exitcode) {
|
||||
bool running = false;
|
||||
for (auto& core : cores_) {
|
||||
if (core->running()) {
|
||||
running = true;
|
||||
int run() {
|
||||
SimPlatform::instance().reset();
|
||||
bool running;
|
||||
int exitcode = 0;
|
||||
do {
|
||||
SimPlatform::instance().tick();
|
||||
running = false;
|
||||
for (auto& core : cores_) {
|
||||
if (core->running()) {
|
||||
running = true;
|
||||
}
|
||||
if (core->check_exit()) {
|
||||
exitcode = core->getIRegValue(3);
|
||||
running = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (core->check_exit()) {
|
||||
*exitcode = core->getIRegValue(3);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return !running;
|
||||
} while (running);
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
Processor::Processor(const SimContext& ctx, const ArchDef& arch)
|
||||
: SimObject<Processor>(ctx, "Vortex")
|
||||
, MemReqPort(this)
|
||||
, MemRspPort(this)
|
||||
, impl_(new Impl(this, arch))
|
||||
Processor::Processor(const ArchDef& arch)
|
||||
: impl_(new Impl(arch))
|
||||
{}
|
||||
|
||||
Processor::~Processor() {
|
||||
|
@ -166,10 +173,6 @@ void Processor::attach_ram(RAM* mem) {
|
|||
impl_->attach_ram(mem);
|
||||
}
|
||||
|
||||
bool Processor::check_exit(int* exitcode) {
|
||||
return impl_->check_exit(exitcode);
|
||||
}
|
||||
|
||||
void Processor::step(uint64_t cycle) {
|
||||
impl_->step(cycle);
|
||||
int Processor::run() {
|
||||
return impl_->run();
|
||||
}
|
|
@ -1,22 +1,18 @@
|
|||
#pragma once
|
||||
|
||||
#include "core.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Processor : public SimObject<Processor> {
|
||||
class ArchDef;
|
||||
class RAM;
|
||||
|
||||
class Processor {
|
||||
public:
|
||||
SimPort<MemReq> MemReqPort;
|
||||
SimPort<MemRsp> MemRspPort;
|
||||
|
||||
Processor(const SimContext& ctx, const ArchDef& arch);
|
||||
Processor(const ArchDef& arch);
|
||||
~Processor();
|
||||
|
||||
void attach_ram(RAM* mem);
|
||||
|
||||
bool check_exit(int* exitcode);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
int run();
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
|
|
|
@ -24,11 +24,16 @@ public:
|
|||
, in_use_fregs_(arch.num_warps())
|
||||
, in_use_vregs_(arch.num_warps())
|
||||
{
|
||||
for (int w = 0; w < arch.num_warps(); ++w) {
|
||||
in_use_iregs_.at(w).reset();
|
||||
in_use_fregs_.at(w).reset();
|
||||
in_use_vregs_.at(w).reset();
|
||||
this->clear();
|
||||
}
|
||||
|
||||
void clear() {
|
||||
for (int i = 0, n = in_use_iregs_.size(); i < n; ++i) {
|
||||
in_use_iregs_.at(i).reset();
|
||||
in_use_fregs_.at(i).reset();
|
||||
in_use_vregs_.at(i).reset();
|
||||
}
|
||||
owners_.clear();
|
||||
}
|
||||
|
||||
bool in_use(pipeline_trace_t* state) const {
|
||||
|
|
|
@ -45,7 +45,11 @@ public:
|
|||
|
||||
virtual ~SharedMem() {}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
void reset() {
|
||||
perf_stats_ = PerfStats();
|
||||
}
|
||||
|
||||
void tick() {
|
||||
std::vector<bool> in_used_banks(config_.num_banks);
|
||||
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
|
||||
auto& core_req_port = this->Inputs.at(req_id);
|
||||
|
|
|
@ -16,6 +16,12 @@ TexUnit::TexUnit(Core* core) : core_(core) {}
|
|||
|
||||
TexUnit::~TexUnit() {}
|
||||
|
||||
void TexUnit::clear() {
|
||||
for (auto& state : states_) {
|
||||
state = 0;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t TexUnit::get_state(uint32_t state) {
|
||||
return states_.at(state);
|
||||
}
|
||||
|
|
|
@ -11,6 +11,8 @@ public:
|
|||
TexUnit(Core* core);
|
||||
~TexUnit();
|
||||
|
||||
void clear();
|
||||
|
||||
uint32_t get_state(uint32_t state);
|
||||
|
||||
void set_state(uint32_t state, uint32_t value);
|
||||
|
|
|
@ -213,67 +213,48 @@ struct MemReq {
|
|||
bool non_cacheable;
|
||||
uint32_t tag;
|
||||
uint32_t core_id;
|
||||
uint64_t uuid;
|
||||
|
||||
MemReq(uint64_t _addr = 0,
|
||||
bool _write = false,
|
||||
bool _non_cacheable = false,
|
||||
uint64_t _tag = 0,
|
||||
uint32_t _core_id = 0
|
||||
uint32_t _core_id = 0,
|
||||
uint64_t _uuid = 0
|
||||
) : addr(_addr)
|
||||
, write(_write)
|
||||
, non_cacheable(_non_cacheable)
|
||||
, tag(_tag)
|
||||
, core_id(_core_id)
|
||||
, uuid(_uuid)
|
||||
{}
|
||||
};
|
||||
|
||||
// Streams a one-line description of a memory request:
//   mem-<wr|rd>: addr=<addr>, tag=<tag>, core_id=<core_id> (#<uuid>)
// addr, tag and core_id are written in the stream's current numeric base;
// the uuid is forced to decimal. std::dec is sticky, so the stream is left
// in decimal mode afterwards.
inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
  const char* const direction = req.write ? "wr" : "rd";
  os << "mem-" << direction << ": "
     << "addr=" << req.addr
     << ", tag=" << req.tag
     << ", core_id=" << req.core_id
     << " (#" << std::dec << req.uuid << ")";
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct MemRsp {
|
||||
uint64_t tag;
|
||||
uint32_t core_id;
|
||||
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0)
|
||||
uint64_t uuid;
|
||||
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
|
||||
: tag (_tag)
|
||||
, core_id(_core_id)
|
||||
, uuid(_uuid)
|
||||
{}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
class Queue {
|
||||
protected:
|
||||
std::queue<T> queue_;
|
||||
|
||||
public:
|
||||
Queue() {}
|
||||
|
||||
bool empty() const {
|
||||
return queue_.empty();
|
||||
}
|
||||
|
||||
const T& front() const {
|
||||
return queue_.front();
|
||||
}
|
||||
|
||||
T& front() {
|
||||
return queue_.front();
|
||||
}
|
||||
|
||||
const T& back() const {
|
||||
return queue_.back();
|
||||
}
|
||||
|
||||
T& back() {
|
||||
return queue_.back();
|
||||
}
|
||||
|
||||
void push(const T& value) {
|
||||
queue_.push(value);
|
||||
}
|
||||
|
||||
void pop() {
|
||||
queue_.pop();
|
||||
}
|
||||
};
|
||||
// Streams a one-line description of a memory response:
//   mem-rsp: tag=<tag>, core_id=<core_id> (#<uuid>)
// tag and core_id are written in the stream's current numeric base; the
// uuid is forced to decimal. std::dec is sticky, so the stream is left in
// decimal mode afterwards.
inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
  os << "mem-rsp: tag=" << rsp.tag
     << ", core_id=" << rsp.core_id
     << " (#" << std::dec << rsp.uuid << ")";
  return os;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -337,6 +318,14 @@ public:
|
|||
entry.first = false;
|
||||
--size_;
|
||||
}
|
||||
|
||||
void clear() {
|
||||
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
|
||||
auto& entry = entries_.at(i);
|
||||
entry.first = false;
|
||||
}
|
||||
size_ = 0;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -376,7 +365,11 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
void reset() {
|
||||
cursor_ = 0;
|
||||
}
|
||||
|
||||
void tick() {
|
||||
if (ReqIn.size() == 1)
|
||||
return;
|
||||
|
||||
|
|
|
@ -13,12 +13,28 @@ using namespace vortex;
|
|||
Warp::Warp(Core *core, Word id)
|
||||
: id_(id)
|
||||
, core_(core)
|
||||
, active_(false)
|
||||
, PC_(STARTUP_ADDR)
|
||||
, tmask_(0) {
|
||||
iRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
|
||||
fRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
|
||||
vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
|
||||
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
|
||||
, freg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
|
||||
, vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
|
||||
{
|
||||
this->clear();
|
||||
}
|
||||
|
||||
void Warp::clear() {
|
||||
active_ = false;
|
||||
PC_ = STARTUP_ADDR;
|
||||
tmask_.reset();
|
||||
for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) {
|
||||
for (auto& reg : ireg_file_.at(i)) {
|
||||
reg = 0;
|
||||
}
|
||||
for (auto& reg : freg_file_.at(i)) {
|
||||
reg = 0;
|
||||
}
|
||||
for (auto& reg : vreg_file_.at(i)) {
|
||||
reg = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Warp::eval(pipeline_trace_t *trace) {
|
||||
|
@ -55,7 +71,7 @@ void Warp::eval(pipeline_trace_t *trace) {
|
|||
for (int i = 0; i < core_->arch().num_regs(); ++i) {
|
||||
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
|
||||
for (int j = 0; j < core_->arch().num_threads(); ++j) {
|
||||
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' ');
|
||||
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
|
||||
}
|
||||
DPN(4, std::endl);
|
||||
}
|
||||
|
|
|
@ -41,6 +41,8 @@ struct vtype {
|
|||
class Warp {
|
||||
public:
|
||||
Warp(Core *core, Word id);
|
||||
|
||||
void clear();
|
||||
|
||||
bool active() const {
|
||||
return active_;
|
||||
|
@ -84,7 +86,7 @@ public:
|
|||
}
|
||||
|
||||
Word getIRegValue(int reg) const {
|
||||
return iRegFile_.at(0).at(reg);
|
||||
return ireg_file_.at(0).at(reg);
|
||||
}
|
||||
|
||||
void eval(pipeline_trace_t *);
|
||||
|
@ -100,10 +102,10 @@ private:
|
|||
Word PC_;
|
||||
ThreadMask tmask_;
|
||||
|
||||
std::vector<std::vector<Word>> iRegFile_;
|
||||
std::vector<std::vector<Word>> fRegFile_;
|
||||
std::vector<std::vector<Byte>> vRegFile_;
|
||||
std::stack<DomStackEntry> domStack_;
|
||||
std::vector<std::vector<Word>> ireg_file_;
|
||||
std::vector<std::vector<Word>> freg_file_;
|
||||
std::vector<std::vector<Byte>> vreg_file_;
|
||||
std::stack<DomStackEntry> dom_stack_;
|
||||
|
||||
struct vtype vtype_;
|
||||
int vl_;
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
|
||||
#include <future>
|
||||
#include <list>
|
||||
#include <queue>
|
||||
#include <unordered_map>
|
||||
|
||||
#ifndef MEMORY_BANKS
|
||||
|
@ -33,8 +34,12 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef MEM_CYCLE_RATIO
|
||||
#define MEM_CYCLE_RATIO -1
|
||||
#endif
|
||||
|
||||
#undef MEM_BLOCK_SIZE
|
||||
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
|
||||
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
|
||||
|
@ -43,8 +48,6 @@
|
|||
#define CCI_RQ_SIZE 16
|
||||
#define CCI_WQ_SIZE 16
|
||||
|
||||
#define ENABLE_MEM_STALLS
|
||||
|
||||
#ifndef TRACE_START_TIME
|
||||
#define TRACE_START_TIME 0ull
|
||||
#endif
|
||||
|
@ -144,7 +147,7 @@ public:
|
|||
future_ = std::async(std::launch::async, [&]{
|
||||
while (!stop_) {
|
||||
std::lock_guard<std::mutex> guard(mutex_);
|
||||
this->step();
|
||||
this->tick();
|
||||
}
|
||||
});
|
||||
}
|
||||
|
@ -206,7 +209,7 @@ public:
|
|||
device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
|
||||
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
|
||||
device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
|
||||
this->step();
|
||||
this->tick();
|
||||
device_->vcp2af_sRxPort_c0_mmioRdValid = 0;
|
||||
assert(device_->af2cp_sTxPort_c2_mmioRdValid);
|
||||
*value = device_->af2cp_sTxPort_c2_data;
|
||||
|
@ -220,7 +223,7 @@ public:
|
|||
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
|
||||
device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
|
||||
memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8);
|
||||
this->step();
|
||||
this->tick();
|
||||
device_->vcp2af_sRxPort_c0_mmioWrValid = 0;
|
||||
}
|
||||
|
||||
|
@ -257,17 +260,29 @@ private:
|
|||
Verilated::assertOn(true);
|
||||
}
|
||||
|
||||
void step() {
|
||||
void tick() {
|
||||
this->sRxPort_bus();
|
||||
this->sTxPort_bus();
|
||||
this->avs_bus();
|
||||
|
||||
if (!dram_queue_.empty()) {
|
||||
if (dram_->send(dram_queue_.front()))
|
||||
dram_queue_.pop();
|
||||
}
|
||||
|
||||
device_->clk = 0;
|
||||
this->eval();
|
||||
device_->clk = 1;
|
||||
this->eval();
|
||||
|
||||
dram_->tick();
|
||||
if (MEM_CYCLE_RATIO > 0) {
|
||||
auto cycle = timestamp / 2;
|
||||
if ((cycle % MEM_CYCLE_RATIO) == 0)
|
||||
dram_->tick();
|
||||
} else {
|
||||
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
|
||||
dram_->tick();
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
fflush(stdout);
|
||||
|
@ -403,7 +418,7 @@ private:
|
|||
ramulator::Request::Type::WRITE,
|
||||
0
|
||||
);
|
||||
dram_->send(dram_req);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
|
||||
if (device_->avs_read[b]) {
|
||||
|
@ -431,7 +446,7 @@ private:
|
|||
}, placeholders::_1, mem_req),
|
||||
0
|
||||
);
|
||||
dram_->send(dram_req);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
|
||||
device_->avs_waitrequest[b] = false;
|
||||
|
@ -480,6 +495,8 @@ private:
|
|||
|
||||
ramulator::Gem5Wrapper* dram_;
|
||||
|
||||
std::queue<ramulator::Request> dram_queue_;
|
||||
|
||||
Vvortex_afu_shim *device_;
|
||||
#ifdef VCD_OUTPUT
|
||||
VerilatedVcdC *trace_;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue