mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
SimX multi-ports memory fixes
This commit is contained in:
parent
3ace9bbeda
commit
86f20b27dd
6 changed files with 60 additions and 75 deletions
|
@ -44,7 +44,7 @@ Core::Core(const SimContext& ctx,
|
|||
, operands_(ISSUE_WIDTH)
|
||||
, dispatchers_((uint32_t)FUType::Count)
|
||||
, func_units_((uint32_t)FUType::Count)
|
||||
, lsu_demux_(NUM_LSU_BLOCKS)
|
||||
, lmem_switch_(NUM_LSU_BLOCKS)
|
||||
, mem_coalescers_(NUM_LSU_BLOCKS)
|
||||
, lsu_dcache_adapter_(NUM_LSU_BLOCKS)
|
||||
, lsu_lmem_adapter_(NUM_LSU_BLOCKS)
|
||||
|
@ -73,10 +73,10 @@ Core::Core(const SimContext& ctx,
|
|||
false
|
||||
});
|
||||
|
||||
// create lsu demux
|
||||
// create lmem switch
|
||||
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
|
||||
snprintf(sname, 100, "%s-lsu_demux%d", this->name().c_str(), i);
|
||||
lsu_demux_.at(i) = LocalMemSwitch::Create(sname, 1);
|
||||
snprintf(sname, 100, "%s-lmem_switch%d", this->name().c_str(), i);
|
||||
lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1);
|
||||
}
|
||||
|
||||
// create lsu dcache adapter
|
||||
|
@ -93,11 +93,11 @@ Core::Core(const SimContext& ctx,
|
|||
|
||||
// connect lsu demux
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
lsu_demux_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
|
||||
mem_coalescers_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspDC);
|
||||
lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
|
||||
mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC);
|
||||
|
||||
lsu_demux_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
|
||||
lsu_lmem_adapter_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspLmem);
|
||||
lmem_switch_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
|
||||
lsu_lmem_adapter_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspLmem);
|
||||
}
|
||||
|
||||
// connect coalescer-adapter
|
||||
|
|
|
@ -154,7 +154,7 @@ private:
|
|||
std::vector<Dispatcher::Ptr> dispatchers_;
|
||||
std::vector<FuncUnit::Ptr> func_units_;
|
||||
LocalMem::Ptr local_mem_;
|
||||
std::vector<LocalMemSwitch::Ptr> lsu_demux_;
|
||||
std::vector<LocalMemSwitch::Ptr> lmem_switch_;
|
||||
std::vector<MemCoalescer::Ptr> mem_coalescers_;
|
||||
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter_;
|
||||
std::vector<LsuMemAdapter::Ptr> lsu_lmem_adapter_;
|
||||
|
|
|
@ -116,7 +116,7 @@ void LsuUnit::tick() {
|
|||
|
||||
// handle memory responses
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
auto& lsu_rsp_port = core_->lsu_demux_.at(b)->RspIn;
|
||||
auto& lsu_rsp_port = core_->lmem_switch_.at(b)->RspIn;
|
||||
if (lsu_rsp_port.empty())
|
||||
continue;
|
||||
auto& state = states_.at(b);
|
||||
|
@ -201,7 +201,7 @@ void LsuUnit::tick() {
|
|||
lsu_req.uuid = trace->uuid;
|
||||
|
||||
// send memory request
|
||||
core_->lsu_demux_.at(block_idx)->ReqIn.push(lsu_req);
|
||||
core_->lmem_switch_.at(block_idx)->ReqIn.push(lsu_req);
|
||||
DT(3, this->name() << "-mem-req: " << lsu_req);
|
||||
|
||||
// update stats
|
||||
|
@ -246,7 +246,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
|
|||
continue;
|
||||
|
||||
int req_idx = block_idx * LSU_CHANNELS + (i % LSU_CHANNELS);
|
||||
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
|
||||
auto& dcache_req_port = core_->lmem_switch_.at(req_idx)->ReqIn;
|
||||
|
||||
auto mem_addr = trace_data->mem_addrs.at(t);
|
||||
auto type = get_addr_type(mem_addr.addr);
|
||||
|
|
|
@ -25,7 +25,7 @@ protected:
|
|||
Config config_;
|
||||
RAM ram_;
|
||||
MemCrossBar::Ptr mem_xbar_;
|
||||
PerfStats perf_stats_;
|
||||
mutable PerfStats perf_stats_;
|
||||
|
||||
uint64_t to_local_addr(uint64_t addr) {
|
||||
uint32_t total_lines = config_.capacity / config_.line_size;
|
||||
|
@ -68,45 +68,33 @@ public:
|
|||
}
|
||||
|
||||
void tick() {
|
||||
std::vector<bool> in_used_banks(1 << config_.B);
|
||||
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
|
||||
auto& core_req_port = simobject_->Inputs.at(req_id);
|
||||
if (core_req_port.empty())
|
||||
// process bank requests from xbar
|
||||
uint32_t num_banks = (1 << config_.B);
|
||||
for (uint32_t i = 0; i < num_banks; ++i) {
|
||||
auto& xbar_req_out = mem_xbar_->ReqOut.at(i);
|
||||
if (xbar_req_out.empty())
|
||||
continue;
|
||||
|
||||
auto& core_req = core_req_port.front();
|
||||
auto& bank_req = xbar_req_out.front();
|
||||
DT(4, simobject_->name() << "-bank" << i << "-req : " << bank_req);
|
||||
|
||||
uint32_t bank_id = 0;
|
||||
if (bank_sel_addr_end_ >= bank_sel_addr_start_) {
|
||||
bank_id = (uint32_t)bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_);
|
||||
}
|
||||
|
||||
// bank conflict check
|
||||
if (in_used_banks.at(bank_id)) {
|
||||
++perf_stats_.bank_stalls;
|
||||
continue;
|
||||
}
|
||||
|
||||
DT(4, simobject_->name() << "-mem-req" << req_id << ": "<< core_req);
|
||||
|
||||
in_used_banks.at(bank_id) = true;
|
||||
|
||||
if (!core_req.write || config_.write_reponse) {
|
||||
// send response
|
||||
MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
|
||||
simobject_->Outputs.at(req_id).push(core_rsp, 1);
|
||||
if (!bank_req.write || config_.write_reponse) {
|
||||
// send xbar response
|
||||
MemRsp bank_rsp{bank_req.tag, bank_req.cid, bank_req.uuid};
|
||||
mem_xbar_->RspOut.at(i).push(bank_rsp, 1);
|
||||
}
|
||||
|
||||
// update perf counters
|
||||
perf_stats_.reads += !core_req.write;
|
||||
perf_stats_.writes += core_req.write;
|
||||
perf_stats_.reads += !bank_req.write;
|
||||
perf_stats_.writes += bank_req.write;
|
||||
|
||||
// remove input
|
||||
core_req_port.pop();
|
||||
xbar_req_out.pop();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the performance counters. Before returning, bank_stalls is
// overwritten with the crossbar's current collision count; this write is
// permitted inside a const method because perf_stats_ is declared mutable.
const PerfStats& perf_stats() const {
|
||||
perf_stats_.bank_stalls = mem_xbar_->collisions();
|
||||
return perf_stats_;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -141,7 +141,6 @@ void LsuMemAdapter::tick() {
|
|||
if (!ReqIn.empty()) {
|
||||
auto& in_req = ReqIn.front();
|
||||
assert(in_req.mask.size() == input_size);
|
||||
|
||||
for (uint32_t i = 0; i < input_size; ++i) {
|
||||
if (in_req.mask.test(i)) {
|
||||
// build memory request
|
||||
|
@ -152,7 +151,6 @@ void LsuMemAdapter::tick() {
|
|||
out_req.tag = in_req.tag;
|
||||
out_req.cid = in_req.cid;
|
||||
out_req.uuid = in_req.uuid;
|
||||
|
||||
// send memory request
|
||||
ReqOut.at(i).push(out_req, delay_);
|
||||
DT(4, this->name() << "-req" << i << ": " << out_req);
|
||||
|
|
|
@ -484,7 +484,7 @@ public:
|
|||
, type_(type)
|
||||
, delay_(delay)
|
||||
, grants_(num_outputs, 0)
|
||||
, num_reqs_(log2ceil(num_inputs / num_outputs))
|
||||
, lg2_num_reqs_(log2ceil(num_inputs / num_outputs))
|
||||
{
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 64);
|
||||
|
@ -508,7 +508,7 @@ public:
|
|||
void tick() {
|
||||
uint32_t I = Inputs.size();
|
||||
uint32_t O = Outputs.size();
|
||||
uint32_t R = 1 << num_reqs_;
|
||||
uint32_t R = 1 << lg2_num_reqs_;
|
||||
|
||||
// skip bypass mode
|
||||
if (I == O)
|
||||
|
@ -545,7 +545,7 @@ protected:
|
|||
ArbiterType type_;
|
||||
uint32_t delay_;
|
||||
std::vector<uint32_t> grants_;
|
||||
uint32_t num_reqs_;
|
||||
uint32_t lg2_num_reqs_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -571,9 +571,9 @@ public:
|
|||
, type_(type)
|
||||
, delay_(delay)
|
||||
, grants_(num_outputs, 0)
|
||||
, lg_num_reqs_(log2ceil(num_inputs))
|
||||
, lg2_inputs_(log2ceil(num_inputs))
|
||||
, lg2_outputs_(log2ceil(num_outputs))
|
||||
, addr_start_(addr_start)
|
||||
, addr_end_(num_outputs-1)
|
||||
, collisions_(0) {
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 64);
|
||||
|
@ -590,7 +590,7 @@ public:
|
|||
void tick() {
|
||||
uint32_t I = Inputs.size();
|
||||
uint32_t O = Outputs.size();
|
||||
uint32_t R = 1 << lg_num_reqs_;
|
||||
uint32_t R = 1 << lg2_inputs_;
|
||||
|
||||
// process incoming requests
|
||||
for (uint32_t o = 0; o < O; ++o) {
|
||||
|
@ -602,10 +602,10 @@ public:
|
|||
auto& req_in = Inputs.at(i);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
// skip if input is not going to this output
|
||||
// skip if input is not going to current output
|
||||
uint32_t output_idx = 0;
|
||||
if (O != 1) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_end_);
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
|
||||
}
|
||||
if (output_idx != o)
|
||||
continue;
|
||||
|
@ -619,8 +619,8 @@ public:
|
|||
if (input_idx != -1) {
|
||||
auto& req_in = Inputs.at(input_idx);
|
||||
auto& req = req_in.front();
|
||||
if (lg_num_reqs_ != 0) {
|
||||
req.tag = (req.tag << lg_num_reqs_) | input_idx;
|
||||
if (lg2_inputs_ != 0) {
|
||||
req.tag = (req.tag << lg2_inputs_) | input_idx;
|
||||
}
|
||||
DT(4, this->name() << "-req" << input_idx << ": " << req);
|
||||
Outputs.at(o).push(req, delay_);
|
||||
|
@ -645,9 +645,9 @@ protected:
|
|||
ArbiterType type_;
|
||||
uint32_t delay_;
|
||||
std::vector<uint32_t> grants_;
|
||||
uint32_t lg_num_reqs_;
|
||||
uint32_t lg2_inputs_;
|
||||
uint32_t lg2_outputs_;
|
||||
uint32_t addr_start_;
|
||||
uint32_t addr_end_;
|
||||
uint64_t collisions_;
|
||||
};
|
||||
|
||||
|
@ -678,7 +678,7 @@ public:
|
|||
, type_(type)
|
||||
, delay_(delay)
|
||||
, grants_(num_outputs, 0)
|
||||
, lg_num_reqs_(log2ceil(num_inputs / num_outputs))
|
||||
, lg2_num_reqs_(log2ceil(num_inputs / num_outputs))
|
||||
{
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 64);
|
||||
|
@ -703,7 +703,7 @@ public:
|
|||
void tick() {
|
||||
uint32_t I = ReqIn.size();
|
||||
uint32_t O = ReqOut.size();
|
||||
uint32_t R = 1 << lg_num_reqs_;
|
||||
uint32_t R = 1 << lg2_num_reqs_;
|
||||
|
||||
// skip bypass mode
|
||||
if (I == O)
|
||||
|
@ -715,9 +715,9 @@ public:
|
|||
if (!rsp_out.empty()) {
|
||||
auto& rsp = rsp_out.front();
|
||||
uint32_t g = 0;
|
||||
if (lg_num_reqs_ != 0) {
|
||||
if (lg2_num_reqs_ != 0) {
|
||||
g = rsp.tag & (R-1);
|
||||
rsp.tag >>= lg_num_reqs_;
|
||||
rsp.tag >>= lg2_num_reqs_;
|
||||
}
|
||||
DT(4, this->name() << "-rsp" << o << ": " << rsp);
|
||||
uint32_t j = o * R + g;
|
||||
|
@ -737,8 +737,8 @@ public:
|
|||
auto& req_in = ReqIn.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
if (lg_num_reqs_ != 0) {
|
||||
req.tag = (req.tag << lg_num_reqs_) | g;
|
||||
if (lg2_num_reqs_ != 0) {
|
||||
req.tag = (req.tag << lg2_num_reqs_) | g;
|
||||
}
|
||||
DT(4, this->name() << "-req" << j << ": " << req);
|
||||
ReqOut.at(o).push(req, delay_);
|
||||
|
@ -761,7 +761,7 @@ protected:
|
|||
ArbiterType type_;
|
||||
uint32_t delay_;
|
||||
std::vector<uint32_t> grants_;
|
||||
uint32_t lg_num_reqs_;
|
||||
uint32_t lg2_num_reqs_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -793,10 +793,9 @@ public:
|
|||
, delay_(delay)
|
||||
, req_grants_(num_outputs, 0)
|
||||
, rsp_grants_(num_inputs, 0)
|
||||
, lg_num_reqs_(log2ceil(num_inputs))
|
||||
, lg_num_rsps_(log2ceil(num_outputs))
|
||||
, lg2_inputs_(log2ceil(num_inputs))
|
||||
, lg2_outputs_(log2ceil(num_outputs))
|
||||
, addr_start_(addr_start)
|
||||
, addr_end_(num_outputs-1)
|
||||
, collisions_(0) {
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 64);
|
||||
|
@ -817,8 +816,8 @@ public:
|
|||
void tick() {
|
||||
uint32_t I = ReqIn.size();
|
||||
uint32_t O = ReqOut.size();
|
||||
uint32_t R = 1 << lg_num_reqs_;
|
||||
uint32_t T = 1 << lg_num_rsps_;
|
||||
uint32_t R = 1 << lg2_inputs_;
|
||||
uint32_t T = 1 << lg2_outputs_;
|
||||
|
||||
// process outgoing responses
|
||||
for (uint32_t i = 0; i < I; ++i) {
|
||||
|
@ -832,7 +831,7 @@ public:
|
|||
auto& rsp = rsp_out.front();
|
||||
// skip if response is not going to current input
|
||||
uint32_t input_idx = 0;
|
||||
if (lg_num_reqs_ != 0) {
|
||||
if (lg2_inputs_ != 0) {
|
||||
input_idx = rsp.tag & (R-1);
|
||||
}
|
||||
if (input_idx != i)
|
||||
|
@ -848,9 +847,9 @@ public:
|
|||
auto& rsp_out = RspOut.at(output_idx);
|
||||
auto& rsp = rsp_out.front();
|
||||
uint32_t input_idx = 0;
|
||||
if (lg_num_reqs_ != 0) {
|
||||
if (lg2_inputs_ != 0) {
|
||||
input_idx = rsp.tag & (R-1);
|
||||
rsp.tag >>= lg_num_reqs_;
|
||||
rsp.tag >>= lg2_inputs_;
|
||||
}
|
||||
DT(4, this->name() << "-rsp" << output_idx << ": " << rsp);
|
||||
RspIn.at(input_idx).push(rsp, 1);
|
||||
|
@ -872,7 +871,7 @@ public:
|
|||
// skip if request is not going to current output
|
||||
uint32_t output_idx = 0;
|
||||
if (O != 1) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_end_);
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
|
||||
}
|
||||
if (output_idx != o)
|
||||
continue;
|
||||
|
@ -886,8 +885,8 @@ public:
|
|||
if (input_idx != -1) {
|
||||
auto& req_in = ReqIn.at(input_idx);
|
||||
auto& req = req_in.front();
|
||||
if (lg_num_reqs_ != 0) {
|
||||
req.tag = (req.tag << lg_num_reqs_) | input_idx;
|
||||
if (lg2_inputs_ != 0) {
|
||||
req.tag = (req.tag << lg2_inputs_) | input_idx;
|
||||
}
|
||||
DT(4, this->name() << "-req" << input_idx << ": " << req);
|
||||
ReqOut.at(o).push(req, delay_);
|
||||
|
@ -919,10 +918,9 @@ protected:
|
|||
uint32_t delay_;
|
||||
std::vector<uint32_t> req_grants_;
|
||||
std::vector<uint32_t> rsp_grants_;
|
||||
uint32_t lg_num_reqs_;
|
||||
uint32_t lg_num_rsps_;
|
||||
uint32_t lg2_inputs_;
|
||||
uint32_t lg2_outputs_;
|
||||
uint32_t addr_start_;
|
||||
uint32_t addr_end_;
|
||||
uint64_t collisions_;
|
||||
};
|
||||
|
||||
|
@ -980,4 +978,5 @@ private:
|
|||
|
||||
using MemArbiter = TxArbiter<MemReq, MemRsp>;
|
||||
using MemCrossBar = TxCrossBar<MemReq, MemRsp>;
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue