SimX multi-ports memory fixes

This commit is contained in:
tinebp 2024-12-04 21:11:51 -08:00
parent 3ace9bbeda
commit 86f20b27dd
6 changed files with 60 additions and 75 deletions

View file

@ -44,7 +44,7 @@ Core::Core(const SimContext& ctx,
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)FUType::Count)
, func_units_((uint32_t)FUType::Count)
, lsu_demux_(NUM_LSU_BLOCKS)
, lmem_switch_(NUM_LSU_BLOCKS)
, mem_coalescers_(NUM_LSU_BLOCKS)
, lsu_dcache_adapter_(NUM_LSU_BLOCKS)
, lsu_lmem_adapter_(NUM_LSU_BLOCKS)
@ -73,10 +73,10 @@ Core::Core(const SimContext& ctx,
false
});
// create lsu demux
// create lmem switch
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "%s-lsu_demux%d", this->name().c_str(), i);
lsu_demux_.at(i) = LocalMemSwitch::Create(sname, 1);
snprintf(sname, 100, "%s-lmem_switch%d", this->name().c_str(), i);
lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1);
}
// create lsu dcache adapter
@ -93,11 +93,11 @@ Core::Core(const SimContext& ctx,
// connect lmem switch
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
lsu_demux_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
mem_coalescers_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspDC);
lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC);
lsu_demux_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
lsu_lmem_adapter_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspLmem);
lmem_switch_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
lsu_lmem_adapter_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspLmem);
}
// connect coalescer-adapter

View file

@ -154,7 +154,7 @@ private:
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<FuncUnit::Ptr> func_units_;
LocalMem::Ptr local_mem_;
std::vector<LocalMemSwitch::Ptr> lsu_demux_;
std::vector<LocalMemSwitch::Ptr> lmem_switch_;
std::vector<MemCoalescer::Ptr> mem_coalescers_;
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter_;
std::vector<LsuMemAdapter::Ptr> lsu_lmem_adapter_;

View file

@ -116,7 +116,7 @@ void LsuUnit::tick() {
// handle memory responses
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
auto& lsu_rsp_port = core_->lsu_demux_.at(b)->RspIn;
auto& lsu_rsp_port = core_->lmem_switch_.at(b)->RspIn;
if (lsu_rsp_port.empty())
continue;
auto& state = states_.at(b);
@ -201,7 +201,7 @@ void LsuUnit::tick() {
lsu_req.uuid = trace->uuid;
// send memory request
core_->lsu_demux_.at(block_idx)->ReqIn.push(lsu_req);
core_->lmem_switch_.at(block_idx)->ReqIn.push(lsu_req);
DT(3, this->name() << "-mem-req: " << lsu_req);
// update stats
@ -246,7 +246,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
continue;
int req_idx = block_idx * LSU_CHANNELS + (i % LSU_CHANNELS);
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
auto& dcache_req_port = core_->lmem_switch_.at(req_idx)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t);
auto type = get_addr_type(mem_addr.addr);

View file

@ -25,7 +25,7 @@ protected:
Config config_;
RAM ram_;
MemCrossBar::Ptr mem_xbar_;
PerfStats perf_stats_;
mutable PerfStats perf_stats_;
uint64_t to_local_addr(uint64_t addr) {
uint32_t total_lines = config_.capacity / config_.line_size;
@ -68,45 +68,33 @@ public:
}
void tick() {
std::vector<bool> in_used_banks(1 << config_.B);
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
auto& core_req_port = simobject_->Inputs.at(req_id);
if (core_req_port.empty())
// process bank requests from xbar
uint32_t num_banks = (1 << config_.B);
for (uint32_t i = 0; i < num_banks; ++i) {
auto& xbar_req_out = mem_xbar_->ReqOut.at(i);
if (xbar_req_out.empty())
continue;
auto& core_req = core_req_port.front();
auto& bank_req = xbar_req_out.front();
DT(4, simobject_->name() << "-bank" << i << "-req : " << bank_req);
uint32_t bank_id = 0;
if (bank_sel_addr_end_ >= bank_sel_addr_start_) {
bank_id = (uint32_t)bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
}
// bank conflict check
if (in_used_banks.at(bank_id)) {
++perf_stats_.bank_stalls;
continue;
}
DT(4, simobject_->name() << "-mem-req" << req_id << ": "<< core_req);
in_used_banks.at(bank_id) = true;
if (!core_req.write || config_.write_reponse) {
// send response
MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
simobject_->Outputs.at(req_id).push(core_rsp, 1);
if (!bank_req.write || config_.write_reponse) {
// send xbar response
MemRsp bank_rsp{bank_req.tag, bank_req.cid, bank_req.uuid};
mem_xbar_->RspOut.at(i).push(bank_rsp, 1);
}
// update perf counters
perf_stats_.reads += !core_req.write;
perf_stats_.writes += core_req.write;
perf_stats_.reads += !bank_req.write;
perf_stats_.writes += bank_req.write;
// remove input
core_req_port.pop();
xbar_req_out.pop();
}
}
// Returns a const reference to the performance counters.
// Before returning, bank_stalls is overwritten with the value reported by
// mem_xbar_->collisions(), so the caller sees the crossbar's current count.
// Assigning to perf_stats_ inside this const method requires the member to be
// declared mutable.
const PerfStats& perf_stats() const {
perf_stats_.bank_stalls = mem_xbar_->collisions();
return perf_stats_;
}
};

View file

@ -141,7 +141,6 @@ void LsuMemAdapter::tick() {
if (!ReqIn.empty()) {
auto& in_req = ReqIn.front();
assert(in_req.mask.size() == input_size);
for (uint32_t i = 0; i < input_size; ++i) {
if (in_req.mask.test(i)) {
// build memory request
@ -152,7 +151,6 @@ void LsuMemAdapter::tick() {
out_req.tag = in_req.tag;
out_req.cid = in_req.cid;
out_req.uuid = in_req.uuid;
// send memory request
ReqOut.at(i).push(out_req, delay_);
DT(4, this->name() << "-req" << i << ": " << out_req);

View file

@ -484,7 +484,7 @@ public:
, type_(type)
, delay_(delay)
, grants_(num_outputs, 0)
, num_reqs_(log2ceil(num_inputs / num_outputs))
, lg2_num_reqs_(log2ceil(num_inputs / num_outputs))
{
assert(delay != 0);
assert(num_inputs <= 64);
@ -508,7 +508,7 @@ public:
void tick() {
uint32_t I = Inputs.size();
uint32_t O = Outputs.size();
uint32_t R = 1 << num_reqs_;
uint32_t R = 1 << lg2_num_reqs_;
// skip bypass mode
if (I == O)
@ -545,7 +545,7 @@ protected:
ArbiterType type_;
uint32_t delay_;
std::vector<uint32_t> grants_;
uint32_t num_reqs_;
uint32_t lg2_num_reqs_;
};
///////////////////////////////////////////////////////////////////////////////
@ -571,9 +571,9 @@ public:
, type_(type)
, delay_(delay)
, grants_(num_outputs, 0)
, lg_num_reqs_(log2ceil(num_inputs))
, lg2_inputs_(log2ceil(num_inputs))
, lg2_outputs_(log2ceil(num_outputs))
, addr_start_(addr_start)
, addr_end_(num_outputs-1)
, collisions_(0) {
assert(delay != 0);
assert(num_inputs <= 64);
@ -590,7 +590,7 @@ public:
void tick() {
uint32_t I = Inputs.size();
uint32_t O = Outputs.size();
uint32_t R = 1 << lg_num_reqs_;
uint32_t R = 1 << lg2_inputs_;
// process incoming requests
for (uint32_t o = 0; o < O; ++o) {
@ -602,10 +602,10 @@ public:
auto& req_in = Inputs.at(i);
if (!req_in.empty()) {
auto& req = req_in.front();
// skip if input is not going to this output
// skip if input is not going to current output
uint32_t output_idx = 0;
if (O != 1) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_end_);
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
}
if (output_idx != o)
continue;
@ -619,8 +619,8 @@ public:
if (input_idx != -1) {
auto& req_in = Inputs.at(input_idx);
auto& req = req_in.front();
if (lg_num_reqs_ != 0) {
req.tag = (req.tag << lg_num_reqs_) | input_idx;
if (lg2_inputs_ != 0) {
req.tag = (req.tag << lg2_inputs_) | input_idx;
}
DT(4, this->name() << "-req" << input_idx << ": " << req);
Outputs.at(o).push(req, delay_);
@ -645,9 +645,9 @@ protected:
ArbiterType type_;
uint32_t delay_;
std::vector<uint32_t> grants_;
uint32_t lg_num_reqs_;
uint32_t lg2_inputs_;
uint32_t lg2_outputs_;
uint32_t addr_start_;
uint32_t addr_end_;
uint64_t collisions_;
};
@ -678,7 +678,7 @@ public:
, type_(type)
, delay_(delay)
, grants_(num_outputs, 0)
, lg_num_reqs_(log2ceil(num_inputs / num_outputs))
, lg2_num_reqs_(log2ceil(num_inputs / num_outputs))
{
assert(delay != 0);
assert(num_inputs <= 64);
@ -703,7 +703,7 @@ public:
void tick() {
uint32_t I = ReqIn.size();
uint32_t O = ReqOut.size();
uint32_t R = 1 << lg_num_reqs_;
uint32_t R = 1 << lg2_num_reqs_;
// skip bypass mode
if (I == O)
@ -715,9 +715,9 @@ public:
if (!rsp_out.empty()) {
auto& rsp = rsp_out.front();
uint32_t g = 0;
if (lg_num_reqs_ != 0) {
if (lg2_num_reqs_ != 0) {
g = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
rsp.tag >>= lg2_num_reqs_;
}
DT(4, this->name() << "-rsp" << o << ": " << rsp);
uint32_t j = o * R + g;
@ -737,8 +737,8 @@ public:
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
if (lg_num_reqs_ != 0) {
req.tag = (req.tag << lg_num_reqs_) | g;
if (lg2_num_reqs_ != 0) {
req.tag = (req.tag << lg2_num_reqs_) | g;
}
DT(4, this->name() << "-req" << j << ": " << req);
ReqOut.at(o).push(req, delay_);
@ -761,7 +761,7 @@ protected:
ArbiterType type_;
uint32_t delay_;
std::vector<uint32_t> grants_;
uint32_t lg_num_reqs_;
uint32_t lg2_num_reqs_;
};
///////////////////////////////////////////////////////////////////////////////
@ -793,10 +793,9 @@ public:
, delay_(delay)
, req_grants_(num_outputs, 0)
, rsp_grants_(num_inputs, 0)
, lg_num_reqs_(log2ceil(num_inputs))
, lg_num_rsps_(log2ceil(num_outputs))
, lg2_inputs_(log2ceil(num_inputs))
, lg2_outputs_(log2ceil(num_outputs))
, addr_start_(addr_start)
, addr_end_(num_outputs-1)
, collisions_(0) {
assert(delay != 0);
assert(num_inputs <= 64);
@ -817,8 +816,8 @@ public:
void tick() {
uint32_t I = ReqIn.size();
uint32_t O = ReqOut.size();
uint32_t R = 1 << lg_num_reqs_;
uint32_t T = 1 << lg_num_rsps_;
uint32_t R = 1 << lg2_inputs_;
uint32_t T = 1 << lg2_outputs_;
// process outgoing responses
for (uint32_t i = 0; i < I; ++i) {
@ -832,7 +831,7 @@ public:
auto& rsp = rsp_out.front();
// skip if response is not going to current input
uint32_t input_idx = 0;
if (lg_num_reqs_ != 0) {
if (lg2_inputs_ != 0) {
input_idx = rsp.tag & (R-1);
}
if (input_idx != i)
@ -848,9 +847,9 @@ public:
auto& rsp_out = RspOut.at(output_idx);
auto& rsp = rsp_out.front();
uint32_t input_idx = 0;
if (lg_num_reqs_ != 0) {
if (lg2_inputs_ != 0) {
input_idx = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
rsp.tag >>= lg2_inputs_;
}
DT(4, this->name() << "-rsp" << output_idx << ": " << rsp);
RspIn.at(input_idx).push(rsp, 1);
@ -872,7 +871,7 @@ public:
// skip if request is not going to current output
uint32_t output_idx = 0;
if (O != 1) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_end_);
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
}
if (output_idx != o)
continue;
@ -886,8 +885,8 @@ public:
if (input_idx != -1) {
auto& req_in = ReqIn.at(input_idx);
auto& req = req_in.front();
if (lg_num_reqs_ != 0) {
req.tag = (req.tag << lg_num_reqs_) | input_idx;
if (lg2_inputs_ != 0) {
req.tag = (req.tag << lg2_inputs_) | input_idx;
}
DT(4, this->name() << "-req" << input_idx << ": " << req);
ReqOut.at(o).push(req, delay_);
@ -919,10 +918,9 @@ protected:
uint32_t delay_;
std::vector<uint32_t> req_grants_;
std::vector<uint32_t> rsp_grants_;
uint32_t lg_num_reqs_;
uint32_t lg_num_rsps_;
uint32_t lg2_inputs_;
uint32_t lg2_outputs_;
uint32_t addr_start_;
uint32_t addr_end_;
uint64_t collisions_;
};
@ -980,4 +978,5 @@ private:
using MemArbiter = TxArbiter<MemReq, MemRsp>;
using MemCrossBar = TxCrossBar<MemReq, MemRsp>;
}