simx mem_coalescer

This commit is contained in:
Blaise Tine 2024-03-24 20:31:36 -07:00
parent 86055335ee
commit 402c911991
11 changed files with 328 additions and 176 deletions

View file

@ -16,7 +16,7 @@ LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp emulator.cpp decode.cpp execute.cpp func_unit.cpp cache_sim.cpp mem_sim.cpp local_mem.cpp dcrs.cpp
SRCS += processor.cpp cluster.cpp socket.cpp core.cpp emulator.cpp decode.cpp execute.cpp func_unit.cpp cache_sim.cpp mem_sim.cpp local_mem.cpp mem_coalescer.cpp dcrs.cpp types.cpp
# Debugging
ifdef DEBUG

View file

@ -25,6 +25,10 @@
#define MEMORY_BANKS 2
#endif
#define LSU_WORD_SIZE (XLEN / 8)
#define LSU_CHANNELS NUM_LSU_LANES
#define LSU_NUM_REQS (NUM_LSU_BLOCKS * LSU_CHANNELS)
#define DCACHE_WORD_SIZE LSU_LINE_SIZE
#define DCACHE_CHANNELS UP((NUM_LSU_LANES * (XLEN / 8)) / DCACHE_WORD_SIZE)
#define DCACHE_NUM_REQS (NUM_LSU_BLOCKS * DCACHE_CHANNELS)

View file

@ -44,7 +44,8 @@ Core::Core(const SimContext& ctx,
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)FUType::Count)
, func_units_((uint32_t)FUType::Count)
, lsu_demux_(DCACHE_NUM_REQS)
, lsu_demux_(LSU_NUM_REQS)
, mem_coalescers_(NUM_LSU_BLOCKS)
, pending_icache_(arch_.num_warps())
, commit_arbs_(ISSUE_WIDTH)
{
@ -54,26 +55,49 @@ Core::Core(const SimContext& ctx,
operands_.at(i) = SimPlatform::instance().create_object<Operand>();
}
// initialize local memory
// create the memory coalescer
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "core%d-coalescer%d", core_id, i);
mem_coalescers_.at(i) = MemCoalescer::Create(sname, LSU_CHANNELS, DCACHE_CHANNELS, DCACHE_WORD_SIZE, LSUQ_OUT_SIZE, 1);
}
// create local memory
snprintf(sname, 100, "core%d-local_mem", core_id);
local_mem_ = LocalMem::Create(sname, LocalMem::Config{
(1 << LMEM_LOG_SIZE),
DCACHE_WORD_SIZE,
DCACHE_NUM_REQS,
LSU_WORD_SIZE,
LSU_NUM_REQS,
LMEM_NUM_BANKS,
false
});
for (uint32_t i = 0; i < DCACHE_NUM_REQS; ++i) {
// create lsu demux
for (uint32_t i = 0; i < LSU_NUM_REQS; ++i) {
snprintf(sname, 100, "core%d-lsu_demux%d", core_id, i);
auto lmem_demux = LocalMemDemux::Create(sname);
lsu_demux_.at(i) = LocalMemDemux::Create(sname, 1);
}
lmem_demux->ReqDC.bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&lmem_demux->RspDC);
// connect dcache-coalescer
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
uint32_t i = b * DCACHE_CHANNELS + c;
mem_coalescers_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&mem_coalescers_.at(b)->RspOut.at(c));
}
}
// connect lsu demux
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
uint32_t i = b * LSU_CHANNELS + c;
auto lmem_demux = lsu_demux_.at(i);
lmem_demux->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn.at(c));
mem_coalescers_.at(b)->RspIn.at(c).bind(&lmem_demux->RspDC);
lmem_demux->ReqSM.bind(&local_mem_->Inputs.at(i));
local_mem_->Outputs.at(i).bind(&lmem_demux->RspSM);
lsu_demux_.at(i) = lmem_demux;
lmem_demux->ReqSM.bind(&local_mem_->Inputs.at(i));
local_mem_->Outputs.at(i).bind(&lmem_demux->RspSM);
}
}
// initialize dispatchers
@ -204,7 +228,7 @@ void Core::decode() {
auto& ibuffer = ibuffers_.at(trace->wid);
if (ibuffer.full()) {
if (!trace->log_once(true)) {
DT(3, "*** ibuffer-stall: " << *trace);
DT(4, "*** ibuffer-stall: " << *trace);
}
++perf_stats_.ibuf_stalls;
return;
@ -237,7 +261,7 @@ void Core::issue() {
trace->log_once(false);
} else {
if (!trace->log_once(true)) {
DT(3, "*** dispatch-stall: " << *trace);
DT(4, "*** dispatch-stall: " << *trace);
}
}
}
@ -255,7 +279,7 @@ void Core::issue() {
if (scoreboard_.in_use(trace)) {
auto uses = scoreboard_.get_uses(trace);
if (!trace->log_once(true)) {
DTH(3, "*** scoreboard-stall: dependents={");
DTH(4, "*** scoreboard-stall: dependents={");
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
__unused (use);

View file

@ -25,6 +25,7 @@
#include "operand.h"
#include "dispatcher.h"
#include "func_unit.h"
#include "mem_coalescer.h"
namespace vortex {
@ -146,6 +147,7 @@ private:
std::vector<FuncUnit::Ptr> func_units_;
LocalMem::Ptr local_mem_;
std::vector<LocalMemDemux::Ptr> lsu_demux_;
std::vector<MemCoalescer::Ptr> mem_coalescers_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;

View file

@ -109,17 +109,17 @@ void LsuUnit::reset() {
void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
// handle dcache responses
for (uint32_t r = 0; r < DCACHE_NUM_REQS; ++r) {
// handle memory responses
for (uint32_t r = 0; r < LSU_NUM_REQS; ++r) {
auto& dcache_rsp_port = core_->lsu_demux_.at(r)->RspIn;
if (dcache_rsp_port.empty())
continue;
uint32_t block_idx = r / DCACHE_CHANNELS;
uint32_t block_idx = r / LSU_CHANNELS;
auto& state = states_.at(block_idx);
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = state.pending_rd_reqs.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", rid=" << r << ", " << *trace);
DT(3, "mem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", rid=" << r << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
@ -162,29 +162,32 @@ void LsuUnit::tick() {
continue;
}
bool is_write = (trace->lsu_type == LsuType::STORE);
// check pending queue capacity
if (state.pending_rd_reqs.full()) {
if (!is_write && state.pending_rd_reqs.full()) {
if (!trace->log_once(true)) {
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
DT(4, "*** " << this->name() << "-queue-full: " << *trace);
}
continue;
} else {
trace->log_once(false);
}
uint32_t num_reqs;
auto tag = state.pending_rd_reqs.allocate({trace, 0});
if (DCACHE_WORD_SIZE != (XLEN/8)) {
num_reqs = this->send_coalesced_requests(trace, block_idx, tag);
} else {
num_reqs = this->send_requests(trace, block_idx, tag);
}
state.pending_rd_reqs.at(tag).count = num_reqs;
uint32_t tag = 0;
if (!is_write) {
tag = state.pending_rd_reqs.allocate({trace, 0});
}
// send memory request
auto num_reqs = this->send_requests(trace, block_idx, tag);
if (!is_write) {
state.pending_rd_reqs.at(tag).count = num_reqs;
}
// do not wait on writes
bool is_write = (trace->lsu_type == LsuType::STORE);
// do not wait on writes
if (is_write) {
state.pending_rd_reqs.release(tag);
output.push(trace, 1);
}
@ -205,7 +208,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
if (!trace->tmask.test(t))
continue;
int req_idx = block_idx * DCACHE_CHANNELS + (i % DCACHE_CHANNELS);
int req_idx = block_idx * LSU_CHANNELS + (i % LSU_CHANNELS);
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t);
@ -220,7 +223,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
mem_req.uuid = trace->uuid;
dcache_req_port.push(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
if (is_write) {
@ -235,79 +238,6 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
return count;
}
int LsuUnit::send_coalesced_requests(instr_trace_t* trace, int block_idx, int tag) {
int count = 0;
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
bool is_write = (trace->lsu_type == LsuType::STORE);
auto t0 = trace->pid * NUM_LSU_LANES;
uint64_t addr_mask = ~uint64_t(LSU_LINE_SIZE-1);
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
std::bitset<NUM_LSU_LANES / DCACHE_CHANNELS> mask(0);
for (uint32_t i = 0; i < mask.size(); ++i) {
mask.set(i, trace->tmask.test(t0 + i));
}
int req_idx = block_idx * DCACHE_CHANNELS + c;
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
while (mask.any()) {
// calculate seed index
int seed_idx = 0;
for (uint32_t i = 0; i < mask.size(); ++i) {
if (mask.test(i)) {
seed_idx = i;
break;
}
}
uint64_t seed_addr = trace_data->mem_addrs.at(t0 + seed_idx).addr & addr_mask;
auto type = get_addr_type(seed_addr);
// coalesce addresses matching the seed
uint32_t coelescing_size = 0;
for (uint32_t i = seed_idx; i < mask.size(); ++i) {
uint64_t mem_addr = trace_data->mem_addrs.at(t0 + i).addr & addr_mask;
if (mem_addr == seed_addr) {
mask.set(i, 0);
++coelescing_size;
}
}
MemReq mem_req;
mem_req.addr = seed_addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.push(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
if (coelescing_size > 1) {
DT(3, "*** coalescing: size=" << coelescing_size << ", " << *trace);
}
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
++count;
}
t0 += mask.size();
}
return count;
}
///////////////////////////////////////////////////////////////////////////////
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)

View file

@ -74,7 +74,6 @@ public:
private:
int send_requests(instr_trace_t* trace, int block_idx, int tag);
int send_coalesced_requests(instr_trace_t* trace, int block_idx, int tag);
struct pending_req_t {
instr_trace_t* trace;

115
sim/simx/mem_coalescer.cpp Normal file
View file

@ -0,0 +1,115 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "mem_coalescer.h"
#include <cassert>
using namespace vortex;
// Builds a coalescer with `input_size` request/response port pairs on the
// requester side (ReqIn/RspIn) and `output_size` port pairs on the memory
// side (ReqOut/RspOut).
//
//   line_size  : granularity, in bytes, used by tick() to decide that two
//                addresses can be merged; tick() derives its address mask as
//                ~(line_size - 1), which is only meaningful for a power of two.
//   queue_size : capacity of the table tracking outstanding read requests.
//   delay      : delay applied to requests pushed onto ReqOut.
MemCoalescer::MemCoalescer(
  const SimContext& ctx,
  const char* name,
  uint32_t input_size,
  uint32_t output_size,
  uint32_t line_size,
  uint32_t queue_size,
  uint32_t delay
) : SimObject<MemCoalescer>(ctx, name)
  , ReqIn(input_size, this)
  , RspIn(input_size, this)
  , ReqOut(output_size, this)
  , RspOut(output_size, this)
  , pending_rd_reqs_(queue_size)
  , line_size_(line_size)
  , delay_(delay)
  , last_index_(0) // do not rely on reset() running before the first tick()
{
  // one bit per input lane is tracked in fixed-width bitsets
  assert(input_size <= sent_mask_.size());
  // tick() selects the output channel with `i % output_size`
  assert(output_size != 0);
  // tick() masks addresses with ~(line_size - 1)
  assert(line_size != 0 && (line_size & (line_size - 1)) == 0);
}
// Returns the request scheduler to its initial position: no input lane is
// marked as already sent and scanning restarts from lane 0.
// NOTE(review): the table of pending read requests is left untouched here;
// confirm that no reads can be outstanding when reset() is invoked.
void MemCoalescer::reset() {
  sent_mask_.reset();
  last_index_ = 0;
}
// Advances the coalescer by one step.
//
// Responses: for every memory-side port holding a response, the pending entry
// selected by the response tag is looked up, a copy of the response carrying
// the requester's original tag is pushed to every input lane recorded in the
// entry's mask, and the entry is released.
//
// Requests: input lanes are scanned starting at last_index_. The first
// unsent, non-empty lane becomes the "seed"; later lanes whose front request
// falls in the same line_size_-aligned region are merged into it and popped.
// One request is then emitted on output channel (seed lane % #outputs).
// Reads allocate a pending entry so the response can be fanned back out;
// writes are forwarded with tag 0 and are not tracked.
// If a read cannot get a pending entry the scan stops and resumes from the
// same lane on the next tick, skipping lanes already recorded in sent_mask_.
void MemCoalescer::tick() {
  uint32_t I = ReqIn.size();
  uint32_t O = ReqOut.size();

  // process incoming responses
  for (uint32_t o = 0; o < O; ++o) {
    if (RspOut.at(o).empty())
      continue;
    auto& mem_rsp = RspOut.at(o).front();
    DT(3, this->name() << "-" << mem_rsp);
    auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
    // every merged lane receives the same payload with the requester's tag
    MemRsp rsp(mem_rsp);
    rsp.tag = entry.tag;
    for (uint32_t i = 0; i < I; ++i) {
      if (entry.mask.test(i)) {
        RspIn.at(i).push(rsp, 1);
      }
    }
    pending_rd_reqs_.release(mem_rsp.tag);
    RspOut.at(o).pop();
  }

  // process incoming requests
  uint64_t addr_mask = ~uint64_t(line_size_-1);
  bool completed = true;
  for (uint32_t i = last_index_; i < I; ++i) {
    if (sent_mask_.test(i) || ReqIn.at(i).empty())
      continue;

    auto& seed = ReqIn.at(i).front();

    // ensure we can allocate a response tag
    if (!seed.write && pending_rd_reqs_.full()) {
      DT(4, "*** " << this->name() << "-queue-full: " << seed);
      last_index_ = i; // resume from this lane on the next tick
      completed = false;
      break;
    }

    std::bitset<64> mask(0);
    mask.set(i);

    // coalesce matching requests
    uint64_t seed_addr = seed.addr & addr_mask;
    for (uint32_t j = i + 1; j < I; ++j) {
      if (sent_mask_.test(j) || ReqIn.at(j).empty())
        continue;
      auto& match = ReqIn.at(j).front();
      // Only merge requests of the same kind that carry the same requester
      // tag: a store merged into a load would be silently dropped, and a load
      // carrying a different tag would be answered with the seed's tag.
      // Lanes rejected here are picked up as their own seed later in this scan.
      if (match.write != seed.write || match.tag != seed.tag)
        continue;
      uint64_t match_addr = match.addr & addr_mask;
      if (match_addr == seed_addr) {
        mask.set(j);
        ReqIn.at(j).pop();
      }
    }

    // only reads are tracked; writes keep tag 0
    uint32_t tag = 0;
    if (!seed.write) {
      tag = pending_rd_reqs_.allocate(pending_req_t{seed.tag, mask});
    }

    MemReq mem_req{seed};
    mem_req.tag = tag;
    DT(3, this->name() << "-" << mem_req << ", coalesced=" << mask.count());
    uint32_t c = i % O;
    ReqOut.at(c).push(mem_req, delay_);
    ReqIn.at(i).pop();

    sent_mask_ |= mask;
  }

  // the whole batch went out: start afresh on the next tick
  if (completed) {
    last_index_ = 0;
    sent_mask_.reset();
  }
}

54
sim/simx/mem_coalescer.h Normal file
View file

@ -0,0 +1,54 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include "types.h"

namespace vortex {

// Merges memory requests arriving on several input lanes into fewer requests
// on the memory-side ports when their addresses fall within the same line,
// and fans the matching responses back out to the originating lanes.
class MemCoalescer : public SimObject<MemCoalescer> {
public:
  // requester side: one request/response port pair per input lane
  std::vector<SimPort<MemReq>> ReqIn;
  std::vector<SimPort<MemRsp>> RspIn;

  // memory side: one request/response port pair per output channel
  std::vector<SimPort<MemReq>> ReqOut;
  std::vector<SimPort<MemRsp>> RspOut;

  // input_size  : number of input lanes (ReqIn/RspIn)
  // output_size : number of output channels (ReqOut/RspOut)
  // line_size   : coalescing granularity in bytes
  // queue_size  : capacity of the pending read request table
  // delay       : delay applied to requests pushed onto ReqOut
  MemCoalescer(
    const SimContext& ctx,
    const char* name,
    uint32_t input_size,
    uint32_t output_size,
    uint32_t line_size,
    uint32_t queue_size,
    uint32_t delay
  );

  void reset();

  void tick();

private:

  // bookkeeping for one outstanding coalesced read
  struct pending_req_t {
    uint32_t tag;         // tag of the seed request, restored on responses
    std::bitset<64> mask; // input lanes that were merged into the request
  };

  HashTable<pending_req_t> pending_rd_reqs_;
  uint32_t line_size_;
  uint32_t delay_;
  uint32_t last_index_ = 0;   // lane at which request scanning resumes
  std::bitset<64> sent_mask_; // lanes already handled in the current batch
};

}

60
sim/simx/types.cpp Normal file
View file

@ -0,0 +1,60 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "types.h"
using namespace vortex;
// Constructs the demux and registers its six ports with this object:
// ReqIn/RspIn, ReqSM/RspSM and ReqDC/RspDC.
// `delay` is stored and later used by tick() as the delay of forwarded requests.
LocalMemDemux::LocalMemDemux(
const SimContext& ctx,
const char* name,
uint32_t delay
) : SimObject<LocalMemDemux>(ctx, name)
, ReqIn(this)
, RspIn(this)
, ReqSM(this)
, RspSM(this)
, ReqDC(this)
, RspDC(this)
, delay_(delay)
{}
// Intentionally empty: this method resets nothing.
void LocalMemDemux::reset() {}
void LocalMemDemux::tick() {
// process incoming responses
if (!RspSM.empty()) {
auto& rsp = RspSM.front();
DT(4, this->name() << "-" << rsp);
RspIn.push(rsp, 1);
RspSM.pop();
}
if (!RspDC.empty()) {
auto& rsp = RspDC.front();
DT(4, this->name() << "-" << rsp);
RspIn.push(rsp, 1);
RspDC
.pop();
}
// process incoming requests
if (!ReqIn.empty()) {
auto& req = ReqIn.front();
DT(4, this->name() << "-" << req);
if (req.type == AddrType::Shared) {
ReqSM.push(req, delay_);
} else {
ReqDC.push(req, delay_);
}
ReqIn.pop();
}
}

View file

@ -509,8 +509,22 @@ public:
if (I == O)
return;
// process incomming requests
for (uint32_t o = 0; o < O; ++o) {
// process incoming responses
if (!RspOut.at(o).empty()) {
auto& rsp = RspOut.at(o).front();
uint32_t i = 0;
if (lg_num_reqs_ != 0) {
i = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
}
DT(4, this->name() << "-" << rsp);
uint32_t j = o * R + i;
RspIn.at(j).push(rsp, 1);
RspOut.at(o).pop();
}
// process incoming requests
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (cursors_.at(o) + r) & (R-1);
uint32_t j = o * R + i;
@ -530,20 +544,6 @@ public:
break;
}
}
// process incoming reponses
if (!RspOut.at(o).empty()) {
auto& rsp = RspOut.at(o).front();
uint32_t i = 0;
if (lg_num_reqs_ != 0) {
i = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
}
DT(4, this->name() << "-" << rsp);
uint32_t j = o * R + i;
RspIn.at(j).push(rsp, 1);
RspOut.at(o).pop();
}
}
}
@ -560,6 +560,8 @@ private:
uint32_t lg_num_reqs_;
};
using MemSwitch = Switch<MemReq, MemRsp>;
///////////////////////////////////////////////////////////////////////////////
class LocalMemDemux : public SimObject<LocalMemDemux> {
@ -576,53 +578,15 @@ public:
LocalMemDemux(
const SimContext& ctx,
const char* name,
uint32_t delay = 1
) : SimObject<LocalMemDemux>(ctx, name)
, ReqIn(this)
, RspIn(this)
, ReqSM(this)
, RspSM(this)
, ReqDC(this)
, RspDC(this)
, delay_(delay)
{}
uint32_t delay
);
void reset() {}
void reset();
void tick() {
// process incoming reponses
if (!RspSM.empty()) {
auto& rsp = RspSM.front();
DT(4, this->name() << "-" << rsp);
RspIn.push(rsp, 1);
RspSM.pop();
}
if (!RspDC.empty()) {
auto& rsp = RspDC.front();
DT(4, this->name() << "-" << rsp);
RspIn.push(rsp, 1);
RspDC
.pop();
}
// process incomming requests
if (!ReqIn.empty()) {
auto& req = ReqIn.front();
DT(4, this->name() << "-" << req);
if (req.type == AddrType::Shared) {
ReqSM.push(req, delay_);
} else {
ReqDC.push(req, delay_);
}
ReqIn.pop();
}
}
void tick();
private:
uint32_t delay_;
};
///////////////////////////////////////////////////////////////////////////////
using MemSwitch = Switch<MemReq, MemRsp>;
}

View file

@ -12,8 +12,8 @@
TestSuite* testSuite = nullptr;
const char* kernel_file = "kernel.bin";
int count = 0;
std::unordered_set<int> included;
int count = 1;
std::unordered_set<std::string> selected;
std::unordered_set<std::string> excluded;
int testid_s = 0;
int testid_e = 0;
@ -28,7 +28,7 @@ kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-t<testid>: selected test] [-s<testid>: start test] [-e<testid>: end test] [-x<name>: excluded tests]" << std::endl;
std::cout << "Usage: [-t<name>: select test [-x<name>: exclude test]] [-s<testid>: start test] [-e<testid>: end test]" << std::endl;
std::cout << " [-k<kernel>] [-n<words>] [-c] [-h: help]" << std::endl;
}
@ -40,7 +40,7 @@ static void parse_args(int argc, char **argv) {
count = atoi(optarg);
break;
case 't':
included.insert(atoi(optarg));
selected.insert(optarg);
break;
case 'x':
excluded.insert(optarg);
@ -143,14 +143,14 @@ int main(int argc, char *argv[]) {
testid_e = (testSuite->size() - 1);
}
// execute tests
for (int t = testid_s; t <= testid_e; ++t) {
if (!included.empty()) {
if (included.count(t) == 0)
continue;
}
for (int t = testid_s; t <= testid_e; ++t) {
auto test = testSuite->get_test(t);
auto name = test->name();
if (!selected.empty()) {
if (selected.count(name) == 0)
continue;
}
if (!excluded.empty()) {
if (excluded.count(name) != 0)