simx memory coalescing support

Blaise Tine 2024-03-14 12:20:39 -07:00
parent 07c063031f
commit f1522e68f8
11 changed files with 433 additions and 383 deletions

View file

@@ -252,9 +252,9 @@ module VX_core import VX_gpu_pkg::*; #(
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
assign perf_dcache_rd_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && ~dcache_lmem_bus_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && dcache_lmem_bus_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_lmem_bus_if[i].rsp_valid && dcache_lmem_bus_if[i].rsp_ready;
end
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);

View file

@@ -26,42 +26,42 @@ public:
CacheCluster(const SimContext& ctx,
const char* name,
uint32_t num_units,
uint32_t num_inputs,
uint32_t num_caches,
uint32_t num_requests,
const CacheSim::Config& config)
const CacheSim::Config& cache_config)
: SimObject(ctx, name)
, CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
, CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
, CoreReqPorts(num_inputs, std::vector<SimPort<MemReq>>(num_requests, this))
, CoreRspPorts(num_inputs, std::vector<SimPort<MemRsp>>(num_requests, this))
, MemReqPort(this)
, MemRspPort(this)
, caches_(MAX(num_caches, 0x1)) {
CacheSim::Config config2(config);
CacheSim::Config cache_config2(cache_config);
if (0 == num_caches) {
num_caches = 1;
config2.bypass = true;
cache_config2.bypass = true;
}
char sname[100];
std::vector<MemSwitch::Ptr> unit_arbs(num_units);
for (uint32_t u = 0; u < num_units; ++u) {
snprintf(sname, 100, "%s-unit-arb-%d", name, u);
unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
std::vector<MemSwitch::Ptr> input_arbs(num_inputs);
for (uint32_t j = 0; j < num_inputs; ++j) {
snprintf(sname, 100, "%s-input-arb%d", name, j);
input_arbs.at(j) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, cache_config.num_inputs);
for (uint32_t i = 0; i < num_requests; ++i) {
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
this->CoreReqPorts.at(j).at(i).bind(&input_arbs.at(j)->ReqIn.at(i));
input_arbs.at(j)->RspIn.at(i).bind(&this->CoreRspPorts.at(j).at(i));
}
}
std::vector<MemSwitch::Ptr> mem_arbs(config.num_inputs);
for (uint32_t i = 0; i < config.num_inputs; ++i) {
snprintf(sname, 100, "%s-mem-arb-%d", name, i);
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
for (uint32_t u = 0; u < num_units; ++u) {
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
std::vector<MemSwitch::Ptr> mem_arbs(cache_config.num_inputs);
for (uint32_t i = 0; i < cache_config.num_inputs; ++i) {
snprintf(sname, 100, "%s-mem-arb%d", name, i);
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_inputs, num_caches);
for (uint32_t j = 0; j < num_inputs; ++j) {
input_arbs.at(j)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(j));
mem_arbs.at(i)->RspIn.at(j).bind(&input_arbs.at(j)->RspOut.at(i));
}
}
@@ -70,9 +70,9 @@ public:
for (uint32_t i = 0; i < num_caches; ++i) {
snprintf(sname, 100, "%s-cache%d", name, i);
caches_.at(i) = CacheSim::Create(sname, config2);
caches_.at(i) = CacheSim::Create(sname, cache_config2);
for (uint32_t j = 0; j < config.num_inputs; ++j) {
for (uint32_t j = 0; j < cache_config.num_inputs; ++j) {
mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
}
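For orientation, here is a minimal usage sketch of the renamed interface (the core count, cache count, and the dcache_config variable are illustrative assumptions, not values taken from this commit): a socket feeding 4 cores, each with DCACHE_NUM_REQS request ports, into 2 shared data caches.

// hypothetical instantiation mirroring the socket.cpp change further below
auto dcaches = CacheCluster::Create("socket0-dcaches",
                                    4,                // num_inputs: cores feeding the cluster
                                    2,                // num_caches
                                    DCACHE_NUM_REQS,  // num_requests: request ports per core
                                    dcache_config);   // CacheSim::Config assumed built elsewhere

Each input gets its own round-robin input arbiter that funnels its num_requests ports down to cache_config.num_inputs lanes; a second layer of memory arbiters (one per cache input port) then selects among the inputs before reaching the caches.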

View file

@@ -72,7 +72,7 @@ Cluster::Cluster(const SimContext& ctx,
2, // request size
true, // write-through
false, // write response
L2_MSHR_SIZE, // mshr
L2_MSHR_SIZE, // mshr size
2, // pipeline latency
});

View file

@@ -23,4 +23,8 @@
#ifndef MEMORY_BANKS
#define MEMORY_BANKS 2
#endif
#endif
#define DCACHE_WORD_SIZE LSU_LINE_SIZE
#define DCACHE_CHANNELS UP((NUM_LSU_LANES * (XLEN / 8)) / DCACHE_WORD_SIZE)
#define DCACHE_NUM_REQS (NUM_LSU_BLOCKS * DCACHE_CHANNELS)
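A worked example of the arithmetic above, assuming NUM_LSU_BLOCKS = 1, NUM_LSU_LANES = 4, and XLEN = 32 (illustrative values only): each LSU block produces 4 lanes x 4 bytes = 16 bytes per cycle, so a 16-byte LSU_LINE_SIZE yields a single coalesced channel, while a 4-byte line size falls back to one channel per lane.

// illustrative sanity checks of the macro arithmetic (not part of the commit)
static_assert((4 * (32 / 8)) / 16 == 1, "LSU_LINE_SIZE=16 -> 1 coalesced channel, DCACHE_NUM_REQS = 1");
static_assert((4 * (32 / 8)) / 4  == 4, "LSU_LINE_SIZE=4  -> 4 channels, DCACHE_NUM_REQS = 4");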

View file

@@ -30,23 +30,23 @@ Core::Core(const SimContext& ctx,
Socket* socket,
const Arch &arch,
const DCRS &dcrs)
: SimObject(ctx, "core")
, icache_req_ports(1, this)
, icache_rsp_ports(1, this)
, dcache_req_ports(NUM_LSU_LANES, this)
, dcache_rsp_ports(NUM_LSU_LANES, this)
, core_id_(core_id)
, socket_(socket)
, arch_(arch)
, emulator_(arch, dcrs, this)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)FUType::Count)
, func_units_((uint32_t)FUType::Count)
, lmem_demuxs_(NUM_LSU_LANES)
, pending_icache_(arch_.num_warps())
, commit_arbs_(ISSUE_WIDTH)
: SimObject(ctx, "core")
, icache_req_ports(1, this)
, icache_rsp_ports(1, this)
, dcache_req_ports(DCACHE_NUM_REQS, this)
, dcache_rsp_ports(DCACHE_NUM_REQS, this)
, core_id_(core_id)
, socket_(socket)
, arch_(arch)
, emulator_(arch, dcrs, this)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)FUType::Count)
, func_units_((uint32_t)FUType::Count)
, lsu_demux_(DCACHE_NUM_REQS)
, pending_icache_(arch_.num_warps())
, commit_arbs_(ISSUE_WIDTH)
{
char sname[100];
@@ -58,30 +58,30 @@ Core::Core(const SimContext& ctx,
snprintf(sname, 100, "core%d-local_mem", core_id);
local_mem_ = LocalMem::Create(sname, LocalMem::Config{
(1 << LMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
DCACHE_WORD_SIZE,
DCACHE_NUM_REQS,
LMEM_NUM_BANKS,
false
});
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
snprintf(sname, 100, "core%d-lmem_demux%d", core_id, i);
for (uint32_t i = 0; i < DCACHE_NUM_REQS; ++i) {
snprintf(sname, 100, "core%d-lsu_demux%d", core_id, i);
auto lmem_demux = LocalMemDemux::Create(sname);
lmem_demux->ReqDC.bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&lmem_demux->RspDC);
lmem_demux->ReqSM.bind(&local_mem_->Inputs.at(i));
local_mem_->Outputs.at(i).bind(&lmem_demux->RspSM);
lmem_demuxs_.at(i) = lmem_demux;
lsu_demux_.at(i) = lmem_demux;
}
// initialize dispatchers
dispatchers_.at((int)FUType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, 1, NUM_LSU_LANES);
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, 1, NUM_SFU_LANES);
dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES);
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES);
// initialize execute units
func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
@@ -89,7 +89,7 @@ Core::Core(const SimContext& ctx,
func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
// bind commit arbiters
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1);
for (uint32_t j = 0; j < (uint32_t)FUType::Count; ++j) {
@@ -116,7 +116,7 @@ void Core::reset() {
for (auto& commit_arb : commit_arbs_) {
commit_arb->reset();
}
for (auto& ibuf : ibuffers_) {
ibuf.clear();
}
@@ -125,11 +125,11 @@ void Core::reset() {
fetch_latch_.clear();
decode_latch_.clear();
pending_icache_.clear();
ibuffer_idx_ = 0;
pending_instrs_ = 0;
pending_instrs_ = 0;
pending_ifetches_ = 0;
perf_stats_ = PerfStats();
}
@@ -142,7 +142,7 @@ void Core::tick() {
this->schedule();
++perf_stats_.cycles;
DPN(2, std::flush);
DPN(2, std::flush);
}
void Core::schedule() {
@@ -184,11 +184,11 @@ void Core::fetch() {
MemReq mem_req;
mem_req.addr = trace->PC;
mem_req.write = false;
mem_req.tag = pending_icache_.allocate(trace);
mem_req.tag = pending_icache_.allocate(trace);
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
icache_req_ports.at(0).push(mem_req, 2);
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
icache_req_ports.at(0).push(mem_req, 2);
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop();
++perf_stats_.ifetches;
++pending_ifetches_;
@@ -211,9 +211,9 @@ void Core::decode() {
} else {
trace->log_once(false);
}
// release warp
if (!trace->fetch_stall) {
if (!trace->fetch_stall) {
emulator_.resume(trace->wid);
}
@@ -225,10 +225,10 @@ void Core::decode() {
decode_latch_.pop();
}
void Core::issue() {
void Core::issue() {
// operands to dispatchers
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& operand = operands_.at(i);
auto& operand = operands_.at(i);
if (operand->Output.empty())
continue;
auto trace = operand->Output.front();
@@ -255,7 +255,7 @@ void Core::issue() {
if (scoreboard_.in_use(trace)) {
auto uses = scoreboard_.get_uses(trace);
if (!trace->log_once(true)) {
DTH(3, "*** scoreboard-stall: dependents={");
DTH(3, "*** scoreboard-stall: dependents={");
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
__unused (use);
@@ -266,10 +266,10 @@ void Core::issue() {
}
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
switch (use.fu_type) {
switch (use.fu_type) {
case FUType::ALU: ++perf_stats_.scrb_alu; break;
case FUType::FPU: ++perf_stats_.scrb_fpu; break;
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
case FUType::SFU: {
++perf_stats_.scrb_sfu;
switch (use.sfu_type) {
@@ -286,7 +286,7 @@ void Core::issue() {
}
} break;
default: assert(false);
}
}
}
++perf_stats_.scrb_stalls;
continue;

View file

@@ -145,7 +145,7 @@ private:
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<FuncUnit::Ptr> func_units_;
LocalMem::Ptr local_mem_;
std::vector<LocalMemDemux::Ptr> lmem_demuxs_;
std::vector<LocalMemDemux::Ptr> lsu_demux_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;

View file

@@ -25,301 +25,337 @@
using namespace vortex;
AluUnit::AluUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "ALU") {}
void AluUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->alu_type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
case AluType::IMUL:
output.push(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
output.push(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
core_->resume(trace->wid);
}
input.pop();
}
void AluUnit::tick() {
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
switch (trace->alu_type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
case AluType::IMUL:
output.push(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
output.push(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
core_->resume(trace->wid);
}
input.pop();
}
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "FPU") {}
void FpuUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->fpu_type) {
case FpuType::FNCP:
output.push(trace, 2);
break;
case FpuType::FMA:
output.push(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
output.push(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
output.push(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
output.push(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
input.pop();
}
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
switch (trace->fpu_type) {
case FpuType::FNCP:
output.push(trace, 2);
break;
case FpuType::FMA:
output.push(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
output.push(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
output.push(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
output.push(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
input.pop();
}
}
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: FuncUnit(ctx, core, "LSU")
, pending_rd_reqs_(LSUQ_IN_SIZE)
, num_lanes_(NUM_LSU_LANES)
, pending_loads_(0)
, fence_lock_(false)
, input_idx_(0)
: FuncUnit(ctx, core, "LSU")
, pending_loads_(0)
{}
LsuUnit::~LsuUnit()
{}
void LsuUnit::reset() {
pending_rd_reqs_.clear();
pending_loads_ = 0;
fence_lock_ = false;
for (auto& state : states_) {
state.clear();
}
pending_loads_ = 0;
}
void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
// handle dcache response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& dcache_rsp_port = core_->lmem_demuxs_.at(t)->RspIn;
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type
<< ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.push(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
--pending_loads_;
}
// handle dcache responses
for (uint32_t r = 0; r < DCACHE_NUM_REQS; ++r) {
auto& dcache_rsp_port = core_->lsu_demux_.at(r)->RspIn;
if (dcache_rsp_port.empty())
continue;
uint32_t block_idx = r / DCACHE_CHANNELS;
auto& state = states_.at(block_idx);
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = state.pending_rd_reqs.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", rid=" << r << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
Outputs.at(iw).push(trace, 1);
state.pending_rd_reqs.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
--pending_loads_;
}
// handle local memory response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& lmem_rsp_port = core_->local_mem_->Outputs.at(t);
if (lmem_rsp_port.empty())
continue;
auto& mem_rsp = lmem_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "lmem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.push(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
lmem_rsp_port.pop();
--pending_loads_;
}
// handle LSU requests
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
uint32_t block_idx = iw % NUM_LSU_BLOCKS;
auto& state = states_.at(block_idx);
if (state.fence_lock) {
// wait for all pending memory operations to complete
if (!state.pending_rd_reqs.empty())
continue;
Outputs.at(iw).push(state.fence_trace, 1);
state.fence_lock = false;
DT(3, "fence-unlock: " << state.fence_trace);
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_rd_reqs_.empty())
return;
int iw = fence_state_->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.push(fence_state_, 1);
fence_lock_ = false;
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
auto& input = Inputs.at(iw);
if (input.empty())
continue;
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
auto& output = Outputs.at(iw);
auto trace = input.front();
auto t0 = trace->pid * num_lanes_;
if (trace->lsu_type == LsuType::FENCE) {
// schedule fence lock
state.fence_trace = trace;
state.fence_lock = true;
DT(3, "fence-lock: " << *trace);
// remove input
input.pop();
continue;
}
if (trace->lsu_type == LsuType::FENCE) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
input.pop();
break;
}
// check pending queue capacity
if (state.pending_rd_reqs.full()) {
if (!trace->log_once(true)) {
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
}
continue;
} else {
trace->log_once(false);
}
// check pending queue capacity
if (pending_rd_reqs_.full()) {
if (!trace->log_once(true)) {
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
}
break;
} else {
trace->log_once(false);
}
bool is_write = (trace->lsu_type == LsuType::STORE);
uint32_t num_reqs;
auto tag = state.pending_rd_reqs.allocate({trace, 0});
if (DCACHE_WORD_SIZE != (XLEN/8)) {
num_reqs = this->send_coalesced_requests(trace, block_idx, tag);
} else {
num_reqs = this->send_requests(trace, block_idx, tag);
}
state.pending_rd_reqs.at(tag).count = num_reqs;
// duplicates detection
bool is_dup = false;
if (trace->tmask.test(t0)) {
uint64_t addr_mask = sizeof(uint32_t)-1;
uint32_t addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto mem_addr = trace_data->mem_addrs.at(t + t0).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
#ifdef LSU_DUP_ENABLE
is_dup = (matches == trace->tmask.count());
#endif
}
// do not wait on writes
bool is_write = (trace->lsu_type == LsuType::STORE);
if (is_write) {
state.pending_rd_reqs.release(tag);
output.push(trace, 1);
}
uint32_t addr_count;
if (is_dup) {
addr_count = 1;
} else {
addr_count = trace->tmask.count();
}
// remove input
input.pop();
}
}
auto tag = pending_rd_reqs_.allocate({trace, addr_count});
int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
int count = 0;
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
bool is_write = (trace->lsu_type == LsuType::STORE);
auto t0 = trace->pid * NUM_LSU_LANES;
for (uint32_t t = 0; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto& dcache_req_port = core_->lmem_demuxs_.at(t)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t + t0);
auto type = get_addr_type(mem_addr.addr);
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
uint32_t t = t0 + i;
if (!trace->tmask.test(t))
continue;
int req_idx = block_idx * DCACHE_CHANNELS + (i % DCACHE_CHANNELS);
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t);
auto type = get_addr_type(mem_addr.addr);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.push(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.push(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
if (is_dup)
break;
}
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
output.push(trace, 1);
}
++count;
}
return count;
}
// remove input
input.pop();
int LsuUnit::send_coalesced_requests(instr_trace_t* trace, int block_idx, int tag) {
int count = 0;
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
bool is_write = (trace->lsu_type == LsuType::STORE);
auto t0 = trace->pid * NUM_LSU_LANES;
break; // single block
}
++input_idx_;
auto addr_mask = ~uint64_t(LSU_LINE_SIZE-1);
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
std::bitset<NUM_LSU_LANES / DCACHE_CHANNELS> mask(0);
for (uint32_t i = 0; i < mask.size(); ++i) {
mask.set(i, trace->tmask.test(t0 + i));
}
int req_idx = block_idx * DCACHE_CHANNELS + c;
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
while (mask.any()) {
// calculate seed index
int seed_idx = 0;
for (uint32_t i = 0; i < mask.size(); ++i) {
if (mask.test(i)) {
seed_idx = i;
break;
}
}
uint32_t seed_addr = trace_data->mem_addrs.at(t0 + seed_idx).addr & addr_mask;
auto type = get_addr_type(seed_addr);
// coalesce addresses matching the seed
uint32_t coelescing_size = 0;
for (uint32_t i = seed_idx; i < mask.size(); ++i) {
auto mem_addr = trace_data->mem_addrs.at(t0 + i).addr & addr_mask;
if (mem_addr == seed_addr) {
mask.set(i, 0);
++coelescing_size;
}
}
MemReq mem_req;
mem_req.addr = seed_addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.push(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
if (coelescing_size > 1) {
DT(3, "*** coalescing: size=" << coelescing_size << ", " << *trace);
}
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
++count;
}
t0 += mask.size();
}
return count;
}
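The loop above is the heart of the new coalescer: for each channel it picks the first still-pending lane as the seed, retires every lane whose line-aligned address matches the seed, and issues one memory request per distinct cache line. Below is a self-contained sketch of that inner loop, assuming a 4-lane channel and a 16-byte line (the function name and fixed sizes are assumptions for illustration, not part of the commit):

#include <array>
#include <bitset>
#include <cstdint>
#include <vector>

// Returns the line-aligned addresses a single channel would request for the given
// per-lane byte addresses; `active` marks which lanes are enabled by the thread mask.
static std::vector<uint64_t> coalesce_channel(const std::array<uint64_t, 4>& lane_addrs,
                                              std::bitset<4> active) {
  constexpr uint64_t line_size = 16;
  const uint64_t addr_mask = ~uint64_t(line_size - 1);
  std::vector<uint64_t> requests;
  while (active.any()) {
    // seed = first lane that still needs a request
    uint32_t seed = 0;
    while (!active.test(seed)) ++seed;
    const uint64_t seed_addr = lane_addrs[seed] & addr_mask;
    // retire every active lane that falls into the seed's cache line
    for (uint32_t i = seed; i < active.size(); ++i) {
      if (active.test(i) && ((lane_addrs[i] & addr_mask) == seed_addr))
        active.reset(i);
    }
    requests.push_back(seed_addr);  // one memory request per distinct line
  }
  return requests;
}

// e.g. lane addresses {0x1000, 0x1004, 0x1008, 0x1040} with all lanes active coalesce
// into two requests: 0x1000 (lanes 0-2 share a line) and 0x1040.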
///////////////////////////////////////////////////////////////////////////////
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
: FuncUnit(ctx, core, "SFU")
, input_idx_(0)
: FuncUnit(ctx, core, "SFU")
{}
void SfuUnit::tick() {
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto sfu_type = trace->sfu_type;
bool release_warp = trace->fetch_stall;
// check input queue
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto sfu_type = trace->sfu_type;
bool release_warp = trace->fetch_stall;
switch (sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::PRED:
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC:
output.push(trace, 1);
break;
case SfuType::BAR: {
output.push(trace, 1);
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (trace->eop) {
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
}
release_warp = false;
} break;
case SfuType::CMOV:
output.push(trace, 3);
break;
default:
std::abort();
}
switch (sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::PRED:
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC:
output.push(trace, 1);
break;
case SfuType::BAR: {
output.push(trace, 1);
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (trace->eop) {
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
}
release_warp = false;
} break;
case SfuType::CMOV:
output.push(trace, 3);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
core_->resume(trace->wid);
}
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
core_->resume(trace->wid);
}
input.pop();
break; // single block
}
++input_idx_;
input.pop();
}
}

View file

@@ -14,6 +14,7 @@
#pragma once
#include <simobject.h>
#include <array>
#include "instr_trace.h"
namespace vortex {
@@ -22,77 +23,89 @@ class Core;
class FuncUnit : public SimObject<FuncUnit> {
public:
std::vector<SimPort<instr_trace_t*>> Inputs;
std::vector<SimPort<instr_trace_t*>> Outputs;
std::vector<SimPort<instr_trace_t*>> Inputs;
std::vector<SimPort<instr_trace_t*>> Outputs;
FuncUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<FuncUnit>(ctx, name)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
virtual ~FuncUnit() {}
FuncUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<FuncUnit>(ctx, name)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
virtual ~FuncUnit() {}
virtual void reset() {}
virtual void reset() {}
virtual void tick() = 0;
virtual void tick() = 0;
protected:
Core* core_;
Core* core_;
};
///////////////////////////////////////////////////////////////////////////////
class AluUnit : public FuncUnit {
public:
AluUnit(const SimContext& ctx, Core*);
void tick();
AluUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class FpuUnit : public FuncUnit {
public:
FpuUnit(const SimContext& ctx, Core*);
void tick();
FpuUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class LsuUnit : public FuncUnit {
public:
LsuUnit(const SimContext& ctx, Core*);
LsuUnit(const SimContext& ctx, Core*);
~LsuUnit();
void reset();
void reset();
void tick();
void tick();
private:
private:
struct pending_req_t {
instr_trace_t* trace;
uint32_t count;
};
HashTable<pending_req_t> pending_rd_reqs_;
uint32_t num_lanes_;
instr_trace_t* fence_state_;
uint64_t pending_loads_;
bool fence_lock_;
uint32_t input_idx_;
int send_requests(instr_trace_t* trace, int block_idx, int tag);
int send_coalesced_requests(instr_trace_t* trace, int block_idx, int tag);
struct pending_req_t {
instr_trace_t* trace;
uint32_t count;
};
struct lsu_state_t {
HashTable<pending_req_t> pending_rd_reqs;
instr_trace_t* fence_trace;
bool fence_lock;
lsu_state_t() : pending_rd_reqs(LSUQ_IN_SIZE) {}
void clear() {
this->pending_rd_reqs.clear();
this->fence_trace = nullptr;
this->fence_lock = false;
}
};
std::array<lsu_state_t, NUM_LSU_BLOCKS> states_;
uint64_t pending_loads_;
};
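To make the new indexing explicit, here is a sketch of how issue slots, LSU blocks, and data-cache request ports relate in this commit (the helper functions are hypothetical; the formulas mirror the func_unit.cpp changes above):

// issue slot -> LSU block holding its pending-request state
inline uint32_t lsu_block_index(uint32_t issue_slot) {
  return issue_slot % NUM_LSU_BLOCKS;
}
// (block, lane) -> data-cache request port driven through lsu_demux_
inline uint32_t dcache_request_index(uint32_t block_idx, uint32_t lane) {
  return block_idx * DCACHE_CHANNELS + (lane % DCACHE_CHANNELS);
}
// on the response side the block is recovered as: block_idx = rsp_port_index / DCACHE_CHANNELS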
///////////////////////////////////////////////////////////////////////////////
class SfuUnit : public FuncUnit {
public:
SfuUnit(const SimContext& ctx, Core*);
void tick();
private:
uint32_t input_idx_;
SfuUnit(const SimContext& ctx, Core*);
void tick();
};
}

View file

@@ -41,7 +41,7 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
uint8_t(arch.num_clusters()), // request size
true, // write-through
false, // write response
L3_MSHR_SIZE, // mshr
L3_MSHR_SIZE, // mshr size
2, // pipeline latency
}
);

View file

@@ -28,10 +28,10 @@ Socket::Socket(const SimContext& ctx,
, dcache_mem_rsp_port(this)
, socket_id_(socket_id)
, cluster_(cluster)
, cores_(arch.socket_size())
, cores_(arch.socket_size())
{
auto cores_per_socket = cores_.size();
char sname[100];
snprintf(sname, 100, "socket%d-icaches", socket_id);
icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{
@@ -46,7 +46,7 @@ Socket::Socket(const SimContext& ctx,
1, // number of inputs
false, // write-through
false, // write response
(uint8_t)arch.num_warps(), // mshr
(uint8_t)arch.num_warps(), // mshr size
2, // pipeline latency
});
@@ -54,19 +54,19 @@ Socket::Socket(const SimContext& ctx,
icache_mem_rsp_port.bind(&icaches_->MemRspPort);
snprintf(sname, 100, "socket%d-dcaches", socket_id);
dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, DCACHE_NUM_REQS, CacheSim::Config{
!DCACHE_ENABLED,
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // L
log2ceil(sizeof(Word)), // W
log2ceil(DCACHE_WORD_SIZE), // W
log2ceil(DCACHE_NUM_WAYS),// A
log2ceil(DCACHE_NUM_BANKS), // B
XLEN, // address bits
1, // number of ports
DCACHE_NUM_BANKS, // number of inputs
DCACHE_NUM_REQS, // number of inputs
true, // write-through
false, // write response
DCACHE_MSHR_SIZE, // mshr
DCACHE_MSHR_SIZE, // mshr size
2, // pipeline latency
});
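For reference, a worked example of the geometry encoding above (values are illustrative assumptions, not taken from the commit):

// DCACHE_SIZE = 16 KB, L1_LINE_SIZE = 64, DCACHE_WORD_SIZE = 16,
// DCACHE_NUM_WAYS = 2, DCACHE_NUM_BANKS = 2 would give
//   C = 14, L = 6, W = 4, A = 1, B = 1
// with DCACHE_NUM_REQS core-side input ports replacing the old one-port-per-bank wiring.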
@@ -75,17 +75,14 @@ Socket::Socket(const SimContext& ctx,
// create cores
for (uint32_t i = 0; i < cores_per_socket; ++i) {
for (uint32_t i = 0; i < cores_per_socket; ++i) {
uint32_t core_id = socket_id * cores_per_socket + i;
cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs);
cores_.at(i) = Core::Create(core_id, this, arch, dcrs);
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
for (uint32_t j = 0; j < DCACHE_NUM_REQS; ++j) {
cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j));
}
@@ -96,7 +93,7 @@ Socket::~Socket() {
//--
}
void Socket::reset() {
void Socket::reset() {
//--
}
@@ -137,6 +134,6 @@ void Socket::resume(uint32_t core_index) {
Socket::PerfStats Socket::perf_stats() const {
PerfStats perf_stats;
perf_stats.icache = icaches_->perf_stats();
perf_stats.dcache = dcaches_->perf_stats();
perf_stats.dcache = dcaches_->perf_stats();
return perf_stats;
}

View file

@@ -244,7 +244,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
struct MemReq {
uint64_t addr;
bool write;
bool write;
AddrType type;
uint32_t tag;
uint32_t cid;