simx memory coalescer bug fix

This commit is contained in:
Blaise Tine 2024-07-23 00:02:43 -07:00
parent e7b2bb81b4
commit 95f59d23a8
10 changed files with 759 additions and 216 deletions

313
sim/common/bitvector.h Normal file
View file

@ -0,0 +1,313 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <bitset>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <ostream>
#include <stdexcept>
#include <string>
#include <vector>
namespace vortex {
/// Runtime-sized bit set packed into storage words of type T.
///
/// Bit `pos` lives in word `pos / (sizeof(T) * 8)` at bit offset
/// `pos % (sizeof(T) * 8)`; index 0 is the first character produced by
/// to_string() and the least significant bit of to_ulong()/to_ullong().
///
/// Invariant: storage bits at positions >= size() are always zero, so that
/// count(), any()/none() and operator== only ever observe addressable bits.
///
/// set/reset/test throw std::out_of_range when `pos >= size()`; the compound
/// bitwise operators throw std::invalid_argument when the sizes differ.
template <typename T = uint32_t>
class BitVector {
private:
  // Number of bits held by one storage word.
  static constexpr size_t kWordBits = sizeof(T) * 8;

  std::vector<T> bits_;  // packed storage, ceil(size_ / kWordBits) words
  size_t size_;          // number of addressable bits
  bool all_zero_;        // cached "no bit is set" flag returned by none()/any()

  // Number of storage words needed to hold `size` bits.
  static size_t wordCount(size_t size) {
    return (size + kWordBits - 1) / kWordBits;
  }

  // Index of the storage word holding bit `pos` (historical name kept).
  size_t byteIndex(size_t pos) const {
    return pos / kWordBits;
  }

  // Single-bit mask for `pos` inside its storage word. The mask has the full
  // word type: a narrower type would silently drop the upper bit positions.
  T bitMask(size_t pos) const {
    return static_cast<T>(T(1) << (pos % kWordBits));
  }

  // Re-establishes the invariant that padding bits of the last word are zero.
  void clearUnusedBits() {
    const size_t used = size_ % kWordBits;
    if (used != 0) {
      bits_.back() &= static_cast<T>((T(1) << used) - 1);
    }
  }

  // Recomputes the cached all-zero flag from the storage words.
  void updateAllZero() {
    all_zero_ = std::all_of(bits_.begin(), bits_.end(), [](T word) { return word == 0; });
  }

public:
  /// Creates a vector of `size` bits, all cleared.
  explicit BitVector(size_t size = 0)
    : bits_(wordCount(size))
    , size_(size)
    , all_zero_(true)
  {}

  /// Sets bit `pos` to one.
  void set(size_t pos) {
    if (pos >= size_) throw std::out_of_range("Index out of range");
    bits_[this->byteIndex(pos)] |= this->bitMask(pos);
    all_zero_ = false;
  }

  /// Sets bit `pos` to `value`.
  void set(size_t pos, bool value) {
    if (value) {
      this->set(pos);
    } else {
      this->reset(pos);
    }
  }

  /// Clears every bit.
  void reset() {
    std::fill(bits_.begin(), bits_.end(), T(0));
    all_zero_ = true;
  }

  /// Clears bit `pos`.
  void reset(size_t pos) {
    if (pos >= size_) throw std::out_of_range("Index out of range");
    bits_[this->byteIndex(pos)] &= static_cast<T>(~this->bitMask(pos));
    this->updateAllZero();
  }

  /// Returns whether bit `pos` is set.
  bool test(size_t pos) const {
    if (pos >= size_) throw std::out_of_range("Index out of range");
    return (bits_[this->byteIndex(pos)] & this->bitMask(pos)) != 0;
  }

  /// Number of addressable bits.
  size_t size() const {
    return size_;
  }

  /// Changes the number of addressable bits. Surviving bits keep their value;
  /// bits added by growing read as zero (stale bits are dropped on shrink).
  void resize(size_t new_size) {
    size_ = new_size;
    bits_.resize(wordCount(new_size));
    this->clearUnusedBits();
    this->updateAllZero();
  }

  bool operator==(const BitVector& other) const {
    return size_ == other.size_ && bits_ == other.bits_;
  }

  bool operator!=(const BitVector& other) const {
    return !(*this == other);
  }

  /// Read-only element access; same checks as test().
  bool operator[](size_t pos) const {
    return test(pos);
  }

  BitVector& operator&=(const BitVector& other) {
    if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
    for (size_t i = 0; i < bits_.size(); ++i) {
      bits_[i] &= other.bits_[i];
    }
    this->updateAllZero();
    return *this;
  }

  BitVector& operator|=(const BitVector& other) {
    if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
    for (size_t i = 0; i < bits_.size(); ++i) {
      bits_[i] |= other.bits_[i];
    }
    this->updateAllZero();
    return *this;
  }

  BitVector& operator^=(const BitVector& other) {
    if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
    for (size_t i = 0; i < bits_.size(); ++i) {
      bits_[i] ^= other.bits_[i];
    }
    this->updateAllZero();
    return *this;
  }

  /// Returns a copy with every addressable bit inverted.
  BitVector operator~() const {
    BitVector result(*this);
    result.flip();
    return result;
  }

  /// Inverts every addressable bit in place.
  void flip() {
    for (auto &word : bits_) {
      word = static_cast<T>(~word);
    }
    // inverting also turned the padding bits on; drop them again
    this->clearUnusedBits();
    this->updateAllZero();
  }

  /// Number of bits that are set.
  size_t count() const {
    size_t total = 0;
    for (const auto &word : bits_) {
      total += std::bitset<kWordBits>(word).count();
    }
    return total;
  }

  bool none() const {
    return all_zero_;
  }

  bool any() const {
    return !all_zero_;
  }

  /// Returns whether every addressable bit is set (true for an empty vector).
  bool all() const {
    const size_t full_words = size_ / kWordBits;
    const size_t remaining_bits = size_ % kWordBits;
    const T full_mask = static_cast<T>(~T(0));
    for (size_t i = 0; i < full_words; ++i) {
      if (bits_[i] != full_mask)
        return false;
    }
    if (remaining_bits > 0) {
      const T partial_mask = static_cast<T>((T(1) << remaining_bits) - 1);
      if ((bits_[full_words] & partial_mask) != partial_mask)
        return false;
    }
    return true;
  }

  /// Moves every bit `pos` positions toward higher indices; bits pushed past
  /// size() are discarded and the vacated low positions become zero.
  BitVector& operator<<=(size_t pos) {
    if (pos >= size_) {
      reset();
      return *this;
    }
    // pos < size_ guarantees at least one word and word_shift < bits_.size()
    const size_t word_shift = pos / kWordBits;
    const size_t bit_shift = pos % kWordBits;
    if (word_shift > 0) {
      for (size_t i = bits_.size(); i-- > word_shift;) {
        bits_[i] = bits_[i - word_shift];
      }
      std::fill(bits_.begin(), bits_.begin() + word_shift, T(0));
    }
    if (bit_shift > 0) {
      for (size_t i = bits_.size() - 1; i > 0; --i) {
        bits_[i] = static_cast<T>((bits_[i] << bit_shift) | (bits_[i - 1] >> (kWordBits - bit_shift)));
      }
      bits_[0] = static_cast<T>(bits_[0] << bit_shift);
    }
    // bits shifted past size() landed in the padding of the last word
    this->clearUnusedBits();
    this->updateAllZero();
    return *this;
  }

  /// Moves every bit `pos` positions toward index 0; bits shifted below
  /// index 0 are discarded and the vacated high positions become zero.
  BitVector& operator>>=(size_t pos) {
    if (pos >= size_) {
      reset();
      return *this;
    }
    const size_t word_shift = pos / kWordBits;
    const size_t bit_shift = pos % kWordBits;
    if (word_shift > 0) {
      for (size_t i = 0; i + word_shift < bits_.size(); ++i) {
        bits_[i] = bits_[i + word_shift];
      }
      std::fill(bits_.end() - word_shift, bits_.end(), T(0));
    }
    if (bit_shift > 0) {
      for (size_t i = 0; i + 1 < bits_.size(); ++i) {
        bits_[i] = static_cast<T>((bits_[i] >> bit_shift) | (bits_[i + 1] << (kWordBits - bit_shift)));
      }
      bits_.back() = static_cast<T>(bits_.back() >> bit_shift);
    }
    this->updateAllZero();
    return *this;
  }

  /// Text form with one '0'/'1' per bit, index 0 first.
  std::string to_string() const {
    std::string result;
    result.reserve(size_);
    for (size_t i = 0; i < size_; ++i) {
      result.push_back(test(i) ? '1' : '0');
    }
    return result;
  }

  /// Packs the bits into an unsigned long (bit i of the result is bit i of
  /// the vector). Throws std::overflow_error when size() does not fit.
  unsigned long to_ulong() const {
    if (size_ > sizeof(unsigned long) * 8) {
      throw std::overflow_error("BitVector size exceeds unsigned long capacity");
    }
    unsigned long result = 0;
    for (size_t i = 0; i < size_; ++i) {
      if (test(i)) {
        result |= (1UL << i);
      }
    }
    return result;
  }

  /// Same as to_ulong() for unsigned long long.
  unsigned long long to_ullong() const {
    if (size_ > sizeof(unsigned long long) * 8) {
      throw std::overflow_error("BitVector size exceeds unsigned long long capacity");
    }
    unsigned long long result = 0;
    for (size_t i = 0; i < size_; ++i) {
      if (test(i)) {
        result |= (1ULL << i);
      }
    }
    return result;
  }

  /// Streams one value per bit, index 0 first.
  friend std::ostream& operator<<(std::ostream& os, const BitVector& bv) {
    for (size_t i = 0; i < bv.size_; ++i) {
      os << bv.test(i);
    }
    return os;
  }

  friend BitVector operator&(const BitVector& lhs, const BitVector& rhs) {
    BitVector result(lhs);
    result &= rhs;
    return result;
  }

  friend BitVector operator|(const BitVector& lhs, const BitVector& rhs) {
    BitVector result(lhs);
    result |= rhs;
    return result;
  }

  friend BitVector operator^(const BitVector& lhs, const BitVector& rhs) {
    BitVector result(lhs);
    result ^= rhs;
    return result;
  }

  friend BitVector operator<<(const BitVector& lhs, size_t pos) {
    BitVector result(lhs);
    result <<= pos;
    return result;
  }

  friend BitVector operator>>(const BitVector& lhs, size_t pos) {
    BitVector result(lhs);
    result >>= pos;
    return result;
  }
};
}
// Lets BitVector act as a key in std::unordered_* containers: the hash is the
// std::string hash of the vector's to_string() rendering.
namespace std {

template <typename T>
struct hash<vortex::BitVector<T>> {
  size_t operator()(const vortex::BitVector<T>& bv) const {
    const std::string rendered = bv.to_string();
    return hash<std::string>{}(rendered);
  }
};

}

View file

@ -44,8 +44,10 @@ Core::Core(const SimContext& ctx,
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)FUType::Count)
, func_units_((uint32_t)FUType::Count)
, lsu_demux_(LSU_NUM_REQS)
, lsu_demux_(NUM_LSU_BLOCKS)
, mem_coalescers_(NUM_LSU_BLOCKS)
, lsu_dcache_adapter_(NUM_LSU_BLOCKS)
, lsu_lmem_adapter_(NUM_LSU_BLOCKS)
, pending_icache_(arch_.num_warps())
, commit_arbs_(ISSUE_WIDTH)
{
@ -72,31 +74,53 @@ Core::Core(const SimContext& ctx,
});
// create lsu demux
for (uint32_t i = 0; i < LSU_NUM_REQS; ++i) {
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "core%d-lsu_demux%d", core_id, i);
lsu_demux_.at(i) = LocalMemDemux::Create(sname, 1);
}
// connect dcache-coalescer
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
uint32_t i = b * DCACHE_CHANNELS + c;
mem_coalescers_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&mem_coalescers_.at(b)->RspOut.at(c));
}
// create lsu dcache adapter
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "core%d-lsu_dcache_adapter%d", core_id, i);
lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
}
// create lsu lmem adapter
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "core%d-lsu_lmem_adapter%d", core_id, i);
lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
}
// connect lsu demux
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
lsu_demux_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
mem_coalescers_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspDC);
lsu_demux_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
lsu_lmem_adapter_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspLmem);
}
// connect coalescer-adapter
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter_.at(b)->ReqIn);
lsu_dcache_adapter_.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
}
// connect adapter-dcache
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
uint32_t i = b * DCACHE_CHANNELS + c;
lsu_dcache_adapter_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter_.at(b)->RspOut.at(c));
}
}
// connect adapter-lmem
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
uint32_t i = b * LSU_CHANNELS + c;
auto lmem_demux = lsu_demux_.at(i);
lmem_demux->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn.at(c));
mem_coalescers_.at(b)->RspIn.at(c).bind(&lmem_demux->RspDC);
lmem_demux->ReqSM.bind(&local_mem_->Inputs.at(i));
local_mem_->Outputs.at(i).bind(&lmem_demux->RspSM);
lsu_lmem_adapter_.at(b)->ReqOut.at(c).bind(&local_mem_->Inputs.at(i));
local_mem_->Outputs.at(i).bind(&lsu_lmem_adapter_.at(b)->RspOut.at(c));
}
}

View file

@ -152,6 +152,8 @@ private:
LocalMem::Ptr local_mem_;
std::vector<LocalMemDemux::Ptr> lsu_demux_;
std::vector<MemCoalescer::Ptr> mem_coalescers_;
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter_;
std::vector<LsuMemAdapter::Ptr> lsu_lmem_adapter_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;

View file

@ -24,7 +24,7 @@
using namespace vortex;
AluUnit::AluUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "ALU") {}
AluUnit::AluUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "alu-unit") {}
void AluUnit::tick() {
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
@ -49,7 +49,7 @@ void AluUnit::tick() {
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
DT(3, this->name() << ": op" << trace->alu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
core_->resume(trace->wid);
}
@ -59,7 +59,7 @@ void AluUnit::tick() {
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "FPU") {}
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "fpu-unit") {}
void FpuUnit::tick() {
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
@ -88,7 +88,7 @@ void FpuUnit::tick() {
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
DT(3,this->name() << ": op=" << trace->fpu_type << ", " << *trace);
input.pop();
}
}
@ -96,7 +96,7 @@ void FpuUnit::tick() {
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: FuncUnit(ctx, core, "LSU")
: FuncUnit(ctx, core, "lsu-unit")
, pending_loads_(0)
{}
@ -114,24 +114,24 @@ void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
// handle memory responses
for (uint32_t r = 0; r < LSU_NUM_REQS; ++r) {
auto& dcache_rsp_port = core_->lsu_demux_.at(r)->RspIn;
if (dcache_rsp_port.empty())
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
auto& lsu_rsp_port = core_->lsu_demux_.at(b)->RspIn;
if (lsu_rsp_port.empty())
continue;
uint32_t block_idx = r / LSU_CHANNELS;
auto& state = states_.at(block_idx);
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = state.pending_rd_reqs.at(mem_rsp.tag);
auto& state = states_.at(b);
auto& lsu_rsp = lsu_rsp_port.front();
DT(3, this->name() << "-" << lsu_rsp);
auto& entry = state.pending_rd_reqs.at(lsu_rsp.tag);
auto trace = entry.trace;
DT(3, "mem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", rid=" << r << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
assert(!entry.mask.none());
entry.mask &= ~lsu_rsp.mask; // track remaining
if (entry.mask.none()) {
// whole response received, release trace
int iw = trace->wid % ISSUE_WIDTH;
Outputs.at(iw).push(trace, 1);
state.pending_rd_reqs.release(mem_rsp.tag);
state.pending_rd_reqs.release(lsu_rsp.tag);
}
dcache_rsp_port.pop();
lsu_rsp_port.pop();
--pending_loads_;
}
@ -145,7 +145,7 @@ void LsuUnit::tick() {
continue;
Outputs.at(iw).push(state.fence_trace, 1);
state.fence_lock = false;
DT(3, "fence-unlock: " << state.fence_trace);
DT(3, this->name() << "-fence-unlock: " << state.fence_trace);
}
// check input queue
@ -160,7 +160,7 @@ void LsuUnit::tick() {
// schedule fence lock
state.fence_trace = trace;
state.fence_lock = true;
DT(3, "fence-lock: " << *trace);
DT(3, this->name() << "-fence-lock: " << *trace);
// remove input
input.pop();
continue;
@ -178,16 +178,36 @@ void LsuUnit::tick() {
trace->log_once(false);
}
// build memory request
LsuReq lsu_req(NUM_ALU_LANES);
lsu_req.write = is_write;
{
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
auto t0 = trace->pid * NUM_LSU_LANES;
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
if (trace->tmask.test(t0 + i)) {
lsu_req.mask.set(i);
lsu_req.addrs.at(i) = trace_data->mem_addrs.at(t0 + i).addr;
}
}
}
uint32_t tag = 0;
if (!is_write) {
tag = state.pending_rd_reqs.allocate({trace, 0});
tag = state.pending_rd_reqs.allocate({trace, lsu_req.mask});
}
lsu_req.tag = tag;
lsu_req.cid = trace->cid;
lsu_req.uuid = trace->uuid;
// send memory request
auto num_reqs = this->send_requests(trace, block_idx, tag);
core_->lsu_demux_.at(block_idx)->ReqIn.push(lsu_req);
DT(3, this->name() << "-" << lsu_req);
if (!is_write) {
state.pending_rd_reqs.at(tag).count = num_reqs;
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
// do not wait on writes
@ -200,52 +220,10 @@ void LsuUnit::tick() {
}
}
// Issues one MemReq per active thread of the trace's current LSU packet to the
// demux port selected by (block_idx, lane % LSU_CHANNELS), updates the load /
// store statistics, and returns how many requests were pushed.
int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
  auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
  bool is_write = (trace->lsu_type == LsuType::STORE);
  auto first_thread = trace->pid * NUM_LSU_LANES;

  int num_sent = 0;
  for (uint32_t lane = 0; lane < NUM_LSU_LANES; ++lane) {
    uint32_t t = first_thread + lane;
    if (trace->tmask.test(t)) {
      // lanes share the block's channels round-robin
      int req_idx = block_idx * LSU_CHANNELS + (lane % LSU_CHANNELS);
      auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;

      auto mem_addr = trace_data->mem_addrs.at(t);
      auto type = get_addr_type(mem_addr.addr);

      MemReq mem_req;
      mem_req.uuid  = trace->uuid;
      mem_req.cid   = trace->cid;
      mem_req.tag   = tag;
      mem_req.type  = type;
      mem_req.write = is_write;
      mem_req.addr  = mem_addr.addr;

      dcache_req_port.push(mem_req, 1);
      DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
        << ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);

      if (!is_write) {
        ++core_->perf_stats_.loads;
        ++pending_loads_;
      } else {
        ++core_->perf_stats_.stores;
      }
      ++num_sent;
    }
  }
  return num_sent;
}
///////////////////////////////////////////////////////////////////////////////
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
: FuncUnit(ctx, core, "SFU")
: FuncUnit(ctx, core, "sfu-unit")
{}
void SfuUnit::tick() {
@ -287,7 +265,7 @@ void SfuUnit::tick() {
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
DT(3, this->name() << ": op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
core_->resume(trace->wid);
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -26,13 +26,13 @@ public:
std::vector<SimPort<instr_trace_t*>> Inputs;
std::vector<SimPort<instr_trace_t*>> Outputs;
FuncUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<FuncUnit>(ctx, name)
FuncUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<FuncUnit>(ctx, name)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
virtual ~FuncUnit() {}
virtual void reset() {}
@ -73,28 +73,26 @@ public:
private:
int send_requests(instr_trace_t* trace, int block_idx, int tag);
struct pending_req_t {
struct pending_req_t {
instr_trace_t* trace;
uint32_t count;
BitVector<> mask;
};
struct lsu_state_t {
struct lsu_state_t {
HashTable<pending_req_t> pending_rd_reqs;
instr_trace_t* fence_trace;
instr_trace_t* fence_trace;
bool fence_lock;
lsu_state_t() : pending_rd_reqs(LSUQ_IN_SIZE) {}
void clear() {
this->pending_rd_reqs.clear();
this->fence_trace = nullptr;
this->fence_lock = false;
}
};
std::array<lsu_state_t, NUM_LSU_BLOCKS> states_;
std::array<lsu_state_t, NUM_LSU_BLOCKS> states_;
uint64_t pending_loads_;
};
@ -103,7 +101,7 @@ private:
class SfuUnit : public FuncUnit {
public:
SfuUnit(const SimContext& ctx, Core*);
void tick();
};

View file

@ -82,11 +82,13 @@ public:
continue;
}
DT(4, simobject_->name() << "-" << core_req);
in_used_banks.at(bank_id) = true;
if (!core_req.write || config_.write_reponse) {
// send response
MemRsp core_rsp{core_req.tag, core_req.cid};
MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
simobject_->Outputs.at(req_id).push(core_rsp, 1);
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,100 +16,138 @@
using namespace vortex;
MemCoalescer::MemCoalescer(
const SimContext& ctx,
const char* name,
const SimContext& ctx,
const char* name,
uint32_t input_size,
uint32_t output_size,
uint32_t line_size,
uint32_t queue_size,
uint32_t delay
) : SimObject<MemCoalescer>(ctx, name)
, ReqIn(input_size, this)
, RspIn(input_size, this)
, ReqOut(output_size, this)
, RspOut(output_size, this)
) : SimObject<MemCoalescer>(ctx, name)
, ReqIn(this)
, RspIn(this)
, ReqOut(this)
, RspOut(this)
, input_size_(input_size)
, output_size_(output_size)
, output_ratio_(input_size / output_size)
, pending_rd_reqs_(queue_size)
, sent_mask_(input_size)
, line_size_(line_size)
, delay_(delay)
{}
void MemCoalescer::reset() {
last_index_ = 0;
sent_mask_.reset();
}
void MemCoalescer::tick() {
uint32_t I = ReqIn.size();
uint32_t O = ReqOut.size();
void MemCoalescer::tick() {
// process incoming responses
for (uint32_t o = 0; o < O; ++o) {
if (RspOut.at(o).empty())
continue;
auto& mem_rsp = RspOut.at(o).front();
DT(3, this->name() << "-" << mem_rsp);
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
for (uint32_t i = 0; i < I; ++i) {
if (entry.mask.test(i)) {
MemRsp rsp(mem_rsp);
rsp.tag = entry.tag;
RspIn.at(i).push(rsp, 1);
if (!RspOut.empty()) {
auto& out_rsp = RspOut.front();
DT(4, this->name() << "-" << out_rsp);
auto& entry = pending_rd_reqs_.at(out_rsp.tag);
BitVector<> rsp_mask(input_size_);
for (uint32_t o = 0; o < output_size_; ++o) {
if (!out_rsp.mask.test(o))
continue;
for (uint32_t r = 0; r < output_ratio_; ++r) {
uint32_t i = o * output_ratio_ + r;
if (entry.mask.test(i))
rsp_mask.set(i);
}
}
pending_rd_reqs_.release(mem_rsp.tag);
RspOut.at(o).pop();
// build memory response
LsuRsp in_rsp(input_size_);
in_rsp.mask = rsp_mask;
in_rsp.tag = entry.tag;
in_rsp.cid = out_rsp.cid;
in_rsp.uuid = out_rsp.uuid;
// send memory response
RspIn.push(in_rsp, 1);
// track remaining responses
assert(!entry.mask.none());
entry.mask &= ~rsp_mask;
if (entry.mask.none()) {
// whole response received, release tag
pending_rd_reqs_.release(out_rsp.tag);
}
RspOut.pop();
}
// process incoming requests
uint64_t addr_mask = ~uint64_t(line_size_-1);
bool completed = true;
for (uint32_t i = last_index_; i < I; ++i) {
if (sent_mask_.test(i) || ReqIn.at(i).empty())
continue;
if (ReqIn.empty())
return;
auto& seed = ReqIn.at(i).front();
auto& in_req = ReqIn.front();
// ensure we can allocate a response tag
if (!seed.write && pending_rd_reqs_.full()) {
DT(4, "*** " << this->name() << "-queue-full: " << seed);
last_index_ = i;
completed = false;
break;
}
std::bitset<64> mask(0);
mask.set(i);
// coalesce matching requests
uint64_t seed_addr = seed.addr & addr_mask;
for (uint32_t j = i + 1; j < I; ++j) {
if (sent_mask_.test(j) || ReqIn.at(j).empty())
continue;
auto& match = ReqIn.at(j).front();
uint64_t match_addr = match.addr & addr_mask;
if (match_addr == seed_addr) {
mask.set(j);
ReqIn.at(j).pop();
}
}
uint32_t tag = 0;
if (!seed.write) {
tag = pending_rd_reqs_.allocate(pending_req_t{seed.tag, mask});
}
MemReq mem_req{seed};
mem_req.tag = tag;
DT(3, this->name() << "-" << mem_req << ", coalesced=" << mask.count());
uint32_t c = i % O;
ReqOut.at(c).push(mem_req, delay_);
ReqIn.at(i).pop();
sent_mask_ |= mask;
// ensure we can allocate a response tag
if (pending_rd_reqs_.full()) {
DT(4, "*** " << this->name() << "-queue-full: " << in_req);
return;
}
if (completed) {
last_index_ = 0;
uint64_t addr_mask = ~uint64_t(line_size_-1);
BitVector<> out_mask(output_size_);
std::vector<uint64_t> out_addrs(output_size_);
BitVector<> cur_mask(input_size_);
for (uint32_t o = 0; o < output_size_; ++o) {
for (uint32_t r = 0; r < output_ratio_; ++r) {
uint32_t i = o * output_ratio_ + r;
if (sent_mask_.test(i) || !in_req.mask.test(i))
continue;
uint64_t seed_addr = in_req.addrs.at(i) & addr_mask;
cur_mask.set(i);
// coalesce matching requests
for (uint32_t s = r + 1; s < output_ratio_; ++s) {
uint32_t j = o * output_ratio_ + s;
if (sent_mask_.test(j) || !in_req.mask.test(j))
continue;
uint64_t match_addr = in_req.addrs.at(j) & addr_mask;
if (match_addr == seed_addr) {
cur_mask.set(j);
}
}
out_mask.set(o);
out_addrs.at(o) = seed_addr;
}
}
assert(!out_mask.none());
uint32_t tag = 0;
if (!in_req.write) {
// allocate a response tag for read requests
tag = pending_rd_reqs_.allocate(pending_req_t{in_req.tag, cur_mask});
}
// build memory request
LsuReq out_req{output_size_};
out_req.mask = out_mask;
out_req.tag = tag;
out_req.write = in_req.write;
out_req.addrs = out_addrs;
out_req.cid = in_req.cid;
out_req.uuid = in_req.uuid;
// send memory request
ReqOut.push(out_req, delay_);
DT(4, this->name() << "-" << out_req << ", coalesced=" << cur_mask.count());
// update sent mask
sent_mask_ |= cur_mask;
if (sent_mask_ == in_req.mask) {
ReqIn.pop();
sent_mask_.reset();
}
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,15 +17,15 @@ namespace vortex {
class MemCoalescer : public SimObject<MemCoalescer> {
public:
std::vector<SimPort<MemReq>> ReqIn;
std::vector<SimPort<MemRsp>> RspIn;
SimPort<LsuReq> ReqIn;
SimPort<LsuRsp> RspIn;
std::vector<SimPort<MemReq>> ReqOut;
std::vector<SimPort<MemRsp>> RspOut;
SimPort<LsuReq> ReqOut;
SimPort<LsuRsp> RspOut;
MemCoalescer(
const SimContext& ctx,
const char* name,
const SimContext& ctx,
const char* name,
uint32_t input_size,
uint32_t output_size,
uint32_t line_size,
@ -41,14 +41,17 @@ private:
struct pending_req_t {
uint32_t tag;
std::bitset<64> mask;
BitVector<> mask;
};
uint32_t input_size_;
uint32_t output_size_;
uint32_t output_ratio_;
HashTable<pending_req_t> pending_rd_reqs_;
BitVector<> sent_mask_;
uint32_t line_size_;
uint32_t delay_;
uint32_t last_index_;
std::bitset<64> sent_mask_;
};
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,14 +16,14 @@
using namespace vortex;
LocalMemDemux::LocalMemDemux(
const SimContext& ctx,
const char* name,
const SimContext& ctx,
const char* name,
uint32_t delay
) : SimObject<LocalMemDemux>(ctx, name)
) : SimObject<LocalMemDemux>(ctx, name)
, ReqIn(this)
, RspIn(this)
, ReqSM(this)
, RspSM(this)
, ReqLmem(this)
, RspLmem(this)
, ReqDC(this)
, RspDC(this)
, delay_(delay)
@ -31,30 +31,133 @@ LocalMemDemux::LocalMemDemux(
void LocalMemDemux::reset() {}
void LocalMemDemux::tick() {
void LocalMemDemux::tick() {
// process incoming responses
if (!RspSM.empty()) {
auto& rsp = RspSM.front();
DT(4, this->name() << "-" << rsp);
RspIn.push(rsp, 1);
RspSM.pop();
if (!RspLmem.empty()) {
auto& out_rsp = RspLmem.front();
DT(4, this->name() << "-" << out_rsp);
RspIn.push(out_rsp, 1);
RspLmem.pop();
}
if (!RspDC.empty()) {
auto& rsp = RspDC.front();
DT(4, this->name() << "-" << rsp);
RspIn.push(rsp, 1);
RspDC
.pop();
auto& out_rsp = RspDC.front();
DT(4, this->name() << "-" << out_rsp);
RspIn.push(out_rsp, 1);
RspDC.pop();
}
// process incoming requests
// process incoming requests
if (!ReqIn.empty()) {
auto& req = ReqIn.front();
DT(4, this->name() << "-" << req);
if (req.type == AddrType::Shared) {
ReqSM.push(req, delay_);
} else {
ReqDC.push(req, delay_);
auto& in_req = ReqIn.front();
LsuReq out_dc_req(in_req.mask.size());
out_dc_req.write = in_req.write;
out_dc_req.tag = in_req.tag;
out_dc_req.cid = in_req.cid;
out_dc_req.uuid = in_req.uuid;
LsuReq out_lmem_req(out_dc_req);
for (uint32_t i = 0; i < in_req.mask.size(); ++i) {
if (in_req.mask.test(i)) {
auto type = get_addr_type(in_req.addrs.at(i));
if (type == AddrType::Shared) {
out_lmem_req.mask.set(i);
out_lmem_req.addrs.at(i) = in_req.addrs.at(i);
} else {
out_dc_req.mask.set(i);
out_dc_req.addrs.at(i) = in_req.addrs.at(i);
}
}
}
if (!out_dc_req.mask.none()) {
ReqDC.push(out_dc_req, delay_);
DT(4, this->name() << "-" << out_dc_req);
}
if (!out_lmem_req.mask.none()) {
ReqLmem.push(out_lmem_req, delay_);
DT(4, this->name() << "-" << out_lmem_req);
}
ReqIn.pop();
}
}
}
///////////////////////////////////////////////////////////////////////////////
// Creates an adapter with a single LsuReq/LsuRsp port pair (ReqIn/RspIn) and
// `num_inputs` MemReq/MemRsp port pairs (ReqOut/RspOut). `delay` is the
// latency applied to every request pushed onto ReqOut.
LsuMemAdapter::LsuMemAdapter(const SimContext& ctx,
                             const char* name,
                             uint32_t num_inputs,
                             uint32_t delay)
  : SimObject<LsuMemAdapter>(ctx, name)
  , ReqIn(this)
  , RspIn(this)
  , ReqOut(num_inputs, this)
  , RspOut(num_inputs, this)
  , delay_(delay) {
}
// Intentionally a no-op: the adapter has no member state to reinitialize here.
void LsuMemAdapter::reset() {
}
// One simulation step of the adapter:
//  1. responses - take the head response of the lowest-numbered channel that
//     has one, merge in the head responses of higher channels carrying the
//     same tag, and forward the result as a single masked LsuRsp;
//  2. requests  - split the head LsuReq (if any) into one MemReq per lane set
//     in its mask and push each onto the channel with the same index.
void LsuMemAdapter::tick() {
  uint32_t num_channels = ReqOut.size();

  // ---- responses: at most one merged LsuRsp per tick ----
  for (uint32_t ch = 0; ch < num_channels; ++ch) {
    auto& rsp_port = RspOut.at(ch);
    if (rsp_port.empty())
      continue;

    auto& out_rsp = rsp_port.front();
    DT(4, this->name() << "-" << out_rsp);

    // start the merged response from this channel
    LsuRsp in_rsp(num_channels);
    in_rsp.mask.set(ch);
    in_rsp.tag  = out_rsp.tag;
    in_rsp.cid  = out_rsp.cid;
    in_rsp.uuid = out_rsp.uuid;

    // absorb matching responses waiting at the head of the later channels
    for (uint32_t other = ch + 1; other < num_channels; ++other) {
      auto& other_port = RspOut.at(other);
      if (other_port.empty())
        continue;
      auto& other_rsp = other_port.front();
      if (out_rsp.tag == other_rsp.tag) {
        in_rsp.mask.set(other);
        other_port.pop();
      }
    }

    RspIn.push(in_rsp, 1);

    // the seed response is consumed last: out_rsp refers to the queue head
    rsp_port.pop();
    break;
  }

  // ---- requests: fan the head LsuReq out across the channels ----
  if (ReqIn.empty())
    return;

  auto& in_req = ReqIn.front();
  assert(in_req.mask.size() == num_channels);

  for (uint32_t ch = 0; ch < num_channels; ++ch) {
    if (!in_req.mask.test(ch))
      continue;

    auto lane_addr = in_req.addrs.at(ch);

    MemReq out_req;
    out_req.write = in_req.write;
    out_req.addr  = lane_addr;
    out_req.type  = get_addr_type(lane_addr);
    out_req.tag   = in_req.tag;
    out_req.cid   = in_req.cid;
    out_req.uuid  = in_req.uuid;

    ReqOut.at(ch).push(out_req, delay_);
    DT(4, this->name() << "-" << out_req);
  }

  ReqIn.pop();
}

View file

@ -23,6 +23,7 @@
#include <VX_config.h>
#include <VX_types.h>
#include <simobject.h>
#include <bitvector.h>
#include "debug.h"
namespace vortex {
@ -238,6 +239,62 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
default: assert(false);
}
return os;
}///////////////////////////////////////////////////////////////////////////////
// Multi-lane memory request: one address slot per lane, with `mask` marking
// which slots are in use.
struct LsuReq {
  BitVector<> mask;             // lanes that carry a request
  std::vector<uint64_t> addrs;  // per-lane address, zero-initialized
  bool     write = false;       // true for a write request, false for a read
  uint32_t tag   = 0;           // request tag, copied into the response
  uint32_t cid   = 0;           // copied from the issuing trace's cid
  uint64_t uuid  = 0;           // copied from the issuing trace's uuid

  // Creates an empty request with `size` lanes.
  LsuReq(uint32_t size)
    : mask(size)
    , addrs(size, 0)
  {}
};
// Writes a one-line trace of the request: kind (rd/wr), lane mask, one
// "addrN=" entry per lane ("-" for lanes not in the mask), then tag, cid and
// uuid. The stream is left in decimal mode.
inline std::ostream &operator<<(std::ostream &os, const LsuReq& req) {
  os << "lsu-req-" << (req.write ? "wr" : "rd") << ": mask=" << req.mask << ", ";
  for (size_t i = 0; i < req.addrs.size(); ++i) {
    // std::hex set for the previous address is sticky; switch back to decimal
    // first, otherwise lane indices >= 10 would be printed in hexadecimal.
    os << "addr" << std::dec << i << "=";
    if (req.mask.test(i)) {
      os << "0x" << std::hex << req.addrs.at(i);
    } else {
      os << "-";
    }
    os << ", ";
  }
  os << std::dec << "tag=" << req.tag << ", cid=" << req.cid;
  os << " (#" << std::dec << req.uuid << ")";
  return os;
}
///////////////////////////////////////////////////////////////////////////////
// Multi-lane memory response: `mask` marks the lanes this response covers.
struct LsuRsp {
  BitVector<> mask;   // lanes answered by this response
  uint64_t tag  = 0;  // tag of the request being answered
  uint32_t cid  = 0;  // cid carried over from the memory response
  uint64_t uuid = 0;  // uuid carried over from the memory response

  // Creates an empty response with `size` lanes.
  LsuRsp(uint32_t size)
    : mask(size)
  {}
};
// Writes a one-line trace of the response: lane mask, tag, cid and uuid.
inline std::ostream &operator<<(std::ostream &os, const LsuRsp& rsp) {
  os << "lsu-rsp: mask=" << rsp.mask
     << ", tag=" << rsp.tag
     << ", cid=" << rsp.cid
     << " (#" << std::dec << rsp.uuid << ")";
  return os;
}
///////////////////////////////////////////////////////////////////////////////
@ -266,7 +323,7 @@ struct MemReq {
};
inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
os << "mem-" << (req.write ? "wr" : "rd") << ": ";
os << "mem-req-" << (req.write ? "wr" : "rd") << ": ";
os << "addr=0x" << std::hex << req.addr << ", type=" << req.type;
os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid;
os << " (#" << std::dec << req.uuid << ")";
@ -566,14 +623,14 @@ using MemSwitch = Switch<MemReq, MemRsp>;
class LocalMemDemux : public SimObject<LocalMemDemux> {
public:
SimPort<MemReq> ReqIn;
SimPort<MemRsp> RspIn;
SimPort<LsuReq> ReqIn;
SimPort<LsuRsp> RspIn;
SimPort<MemReq> ReqSM;
SimPort<MemRsp> RspSM;
SimPort<LsuReq> ReqLmem;
SimPort<LsuRsp> RspLmem;
SimPort<MemReq> ReqDC;
SimPort<MemRsp> RspDC;
SimPort<LsuReq> ReqDC;
SimPort<LsuRsp> RspDC;
LocalMemDemux(
const SimContext& ctx,
@ -589,4 +646,29 @@ private:
uint32_t delay_;
};
///////////////////////////////////////////////////////////////////////////////
// Adapter between the masked multi-lane LSU message types and per-channel
// scalar memory ports. LsuMemAdapter::tick() splits the LsuReq at the head of
// ReqIn into one MemReq per masked lane on ReqOut, and turns MemRsp messages
// arriving on RspOut into masked LsuRsp messages pushed to RspIn.
class LsuMemAdapter : public SimObject<LsuMemAdapter> {
public:
// Multi-lane side: a single request / response port pair.
SimPort<LsuReq> ReqIn;
SimPort<LsuRsp> RspIn;
// Per-channel side: `num_inputs` request / response port pairs; lane i of a
// request maps to ReqOut[i], and a response on RspOut[i] sets mask bit i.
std::vector<SimPort<MemReq>> ReqOut;
std::vector<SimPort<MemRsp>> RspOut;
// ctx/name are forwarded to SimObject; num_inputs sizes ReqOut and RspOut;
// delay is the latency used when pushing requests onto ReqOut.
LsuMemAdapter(
const SimContext& ctx,
const char* name,
uint32_t num_inputs,
uint32_t delay
);
// Empty in the implementation.
void reset();
// Processes at most one merged response and one request per call.
void tick();
private:
// Latency applied to requests forwarded on ReqOut.
uint32_t delay_;
};
}