mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
simx memory coalescing support
This commit is contained in:
parent
07c063031f
commit
f1522e68f8
11 changed files with 433 additions and 383 deletions
|
@ -252,9 +252,9 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
|
||||
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
|
||||
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
|
||||
assign perf_dcache_rd_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && ~dcache_lmem_bus_if[i].req_data.rw;
|
||||
assign perf_dcache_wr_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && dcache_lmem_bus_if[i].req_data.rw;
|
||||
assign perf_dcache_rsp_fire[i] = dcache_lmem_bus_if[i].rsp_valid && dcache_lmem_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
|
||||
|
|
|
@ -26,42 +26,42 @@ public:
|
|||
|
||||
CacheCluster(const SimContext& ctx,
|
||||
const char* name,
|
||||
uint32_t num_units,
|
||||
uint32_t num_inputs,
|
||||
uint32_t num_caches,
|
||||
uint32_t num_requests,
|
||||
const CacheSim::Config& config)
|
||||
const CacheSim::Config& cache_config)
|
||||
: SimObject(ctx, name)
|
||||
, CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
|
||||
, CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
|
||||
, CoreReqPorts(num_inputs, std::vector<SimPort<MemReq>>(num_requests, this))
|
||||
, CoreRspPorts(num_inputs, std::vector<SimPort<MemRsp>>(num_requests, this))
|
||||
, MemReqPort(this)
|
||||
, MemRspPort(this)
|
||||
, caches_(MAX(num_caches, 0x1)) {
|
||||
|
||||
CacheSim::Config config2(config);
|
||||
CacheSim::Config cache_config2(cache_config);
|
||||
if (0 == num_caches) {
|
||||
num_caches = 1;
|
||||
config2.bypass = true;
|
||||
cache_config2.bypass = true;
|
||||
}
|
||||
|
||||
char sname[100];
|
||||
|
||||
std::vector<MemSwitch::Ptr> unit_arbs(num_units);
|
||||
for (uint32_t u = 0; u < num_units; ++u) {
|
||||
snprintf(sname, 100, "%s-unit-arb-%d", name, u);
|
||||
unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
|
||||
std::vector<MemSwitch::Ptr> input_arbs(num_inputs);
|
||||
for (uint32_t j = 0; j < num_inputs; ++j) {
|
||||
snprintf(sname, 100, "%s-input-arb%d", name, j);
|
||||
input_arbs.at(j) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, cache_config.num_inputs);
|
||||
for (uint32_t i = 0; i < num_requests; ++i) {
|
||||
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
|
||||
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
|
||||
this->CoreReqPorts.at(j).at(i).bind(&input_arbs.at(j)->ReqIn.at(i));
|
||||
input_arbs.at(j)->RspIn.at(i).bind(&this->CoreRspPorts.at(j).at(i));
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<MemSwitch::Ptr> mem_arbs(config.num_inputs);
|
||||
for (uint32_t i = 0; i < config.num_inputs; ++i) {
|
||||
snprintf(sname, 100, "%s-mem-arb-%d", name, i);
|
||||
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
|
||||
for (uint32_t u = 0; u < num_units; ++u) {
|
||||
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
|
||||
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
|
||||
std::vector<MemSwitch::Ptr> mem_arbs(cache_config.num_inputs);
|
||||
for (uint32_t i = 0; i < cache_config.num_inputs; ++i) {
|
||||
snprintf(sname, 100, "%s-mem-arb%d", name, i);
|
||||
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_inputs, num_caches);
|
||||
for (uint32_t j = 0; j < num_inputs; ++j) {
|
||||
input_arbs.at(j)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(j));
|
||||
mem_arbs.at(i)->RspIn.at(j).bind(&input_arbs.at(j)->RspOut.at(i));
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -70,9 +70,9 @@ public:
|
|||
|
||||
for (uint32_t i = 0; i < num_caches; ++i) {
|
||||
snprintf(sname, 100, "%s-cache%d", name, i);
|
||||
caches_.at(i) = CacheSim::Create(sname, config2);
|
||||
caches_.at(i) = CacheSim::Create(sname, cache_config2);
|
||||
|
||||
for (uint32_t j = 0; j < config.num_inputs; ++j) {
|
||||
for (uint32_t j = 0; j < cache_config.num_inputs; ++j) {
|
||||
mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
|
||||
caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
|
||||
}
|
||||
|
|
|
@ -72,7 +72,7 @@ Cluster::Cluster(const SimContext& ctx,
|
|||
2, // request size
|
||||
true, // write-through
|
||||
false, // write response
|
||||
L2_MSHR_SIZE, // mshr
|
||||
L2_MSHR_SIZE, // mshr size
|
||||
2, // pipeline latency
|
||||
});
|
||||
|
||||
|
|
|
@ -23,4 +23,8 @@
|
|||
|
||||
#ifndef MEMORY_BANKS
|
||||
#define MEMORY_BANKS 2
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define DCACHE_WORD_SIZE LSU_LINE_SIZE
|
||||
#define DCACHE_CHANNELS UP((NUM_LSU_LANES * (XLEN / 8)) / DCACHE_WORD_SIZE)
|
||||
#define DCACHE_NUM_REQS (NUM_LSU_BLOCKS * DCACHE_CHANNELS)
|
|
@ -30,23 +30,23 @@ Core::Core(const SimContext& ctx,
|
|||
Socket* socket,
|
||||
const Arch &arch,
|
||||
const DCRS &dcrs)
|
||||
: SimObject(ctx, "core")
|
||||
, icache_req_ports(1, this)
|
||||
, icache_rsp_ports(1, this)
|
||||
, dcache_req_ports(NUM_LSU_LANES, this)
|
||||
, dcache_rsp_ports(NUM_LSU_LANES, this)
|
||||
, core_id_(core_id)
|
||||
, socket_(socket)
|
||||
, arch_(arch)
|
||||
, emulator_(arch, dcrs, this)
|
||||
, ibuffers_(arch.num_warps(), IBUF_SIZE)
|
||||
, scoreboard_(arch_)
|
||||
, operands_(ISSUE_WIDTH)
|
||||
, dispatchers_((uint32_t)FUType::Count)
|
||||
, func_units_((uint32_t)FUType::Count)
|
||||
, lmem_demuxs_(NUM_LSU_LANES)
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, commit_arbs_(ISSUE_WIDTH)
|
||||
: SimObject(ctx, "core")
|
||||
, icache_req_ports(1, this)
|
||||
, icache_rsp_ports(1, this)
|
||||
, dcache_req_ports(DCACHE_NUM_REQS, this)
|
||||
, dcache_rsp_ports(DCACHE_NUM_REQS, this)
|
||||
, core_id_(core_id)
|
||||
, socket_(socket)
|
||||
, arch_(arch)
|
||||
, emulator_(arch, dcrs, this)
|
||||
, ibuffers_(arch.num_warps(), IBUF_SIZE)
|
||||
, scoreboard_(arch_)
|
||||
, operands_(ISSUE_WIDTH)
|
||||
, dispatchers_((uint32_t)FUType::Count)
|
||||
, func_units_((uint32_t)FUType::Count)
|
||||
, lsu_demux_(DCACHE_NUM_REQS)
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, commit_arbs_(ISSUE_WIDTH)
|
||||
{
|
||||
char sname[100];
|
||||
|
||||
|
@ -58,30 +58,30 @@ Core::Core(const SimContext& ctx,
|
|||
snprintf(sname, 100, "core%d-local_mem", core_id);
|
||||
local_mem_ = LocalMem::Create(sname, LocalMem::Config{
|
||||
(1 << LMEM_LOG_SIZE),
|
||||
sizeof(Word),
|
||||
NUM_LSU_LANES,
|
||||
NUM_LSU_LANES,
|
||||
DCACHE_WORD_SIZE,
|
||||
DCACHE_NUM_REQS,
|
||||
LMEM_NUM_BANKS,
|
||||
false
|
||||
});
|
||||
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
|
||||
snprintf(sname, 100, "core%d-lmem_demux%d", core_id, i);
|
||||
for (uint32_t i = 0; i < DCACHE_NUM_REQS; ++i) {
|
||||
snprintf(sname, 100, "core%d-lsu_demux%d", core_id, i);
|
||||
auto lmem_demux = LocalMemDemux::Create(sname);
|
||||
|
||||
|
||||
lmem_demux->ReqDC.bind(&dcache_req_ports.at(i));
|
||||
dcache_rsp_ports.at(i).bind(&lmem_demux->RspDC);
|
||||
|
||||
lmem_demux->ReqSM.bind(&local_mem_->Inputs.at(i));
|
||||
local_mem_->Outputs.at(i).bind(&lmem_demux->RspSM);
|
||||
|
||||
lmem_demuxs_.at(i) = lmem_demux;
|
||||
lsu_demux_.at(i) = lmem_demux;
|
||||
}
|
||||
|
||||
// initialize dispatchers
|
||||
dispatchers_.at((int)FUType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
|
||||
dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
|
||||
dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, 1, NUM_LSU_LANES);
|
||||
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, 1, NUM_SFU_LANES);
|
||||
|
||||
dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES);
|
||||
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES);
|
||||
|
||||
// initialize execute units
|
||||
func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
|
||||
func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
|
||||
|
@ -89,7 +89,7 @@ Core::Core(const SimContext& ctx,
|
|||
func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
|
||||
|
||||
// bind commit arbiters
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
|
||||
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1);
|
||||
for (uint32_t j = 0; j < (uint32_t)FUType::Count; ++j) {
|
||||
|
@ -116,7 +116,7 @@ void Core::reset() {
|
|||
for (auto& commit_arb : commit_arbs_) {
|
||||
commit_arb->reset();
|
||||
}
|
||||
|
||||
|
||||
for (auto& ibuf : ibuffers_) {
|
||||
ibuf.clear();
|
||||
}
|
||||
|
@ -125,11 +125,11 @@ void Core::reset() {
|
|||
fetch_latch_.clear();
|
||||
decode_latch_.clear();
|
||||
pending_icache_.clear();
|
||||
|
||||
|
||||
ibuffer_idx_ = 0;
|
||||
pending_instrs_ = 0;
|
||||
pending_instrs_ = 0;
|
||||
pending_ifetches_ = 0;
|
||||
|
||||
|
||||
perf_stats_ = PerfStats();
|
||||
}
|
||||
|
||||
|
@ -142,7 +142,7 @@ void Core::tick() {
|
|||
this->schedule();
|
||||
|
||||
++perf_stats_.cycles;
|
||||
DPN(2, std::flush);
|
||||
DPN(2, std::flush);
|
||||
}
|
||||
|
||||
void Core::schedule() {
|
||||
|
@ -184,11 +184,11 @@ void Core::fetch() {
|
|||
MemReq mem_req;
|
||||
mem_req.addr = trace->PC;
|
||||
mem_req.write = false;
|
||||
mem_req.tag = pending_icache_.allocate(trace);
|
||||
mem_req.tag = pending_icache_.allocate(trace);
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
icache_req_ports.at(0).push(mem_req, 2);
|
||||
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||
icache_req_ports.at(0).push(mem_req, 2);
|
||||
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||
fetch_latch_.pop();
|
||||
++perf_stats_.ifetches;
|
||||
++pending_ifetches_;
|
||||
|
@ -211,9 +211,9 @@ void Core::decode() {
|
|||
} else {
|
||||
trace->log_once(false);
|
||||
}
|
||||
|
||||
|
||||
// release warp
|
||||
if (!trace->fetch_stall) {
|
||||
if (!trace->fetch_stall) {
|
||||
emulator_.resume(trace->wid);
|
||||
}
|
||||
|
||||
|
@ -225,10 +225,10 @@ void Core::decode() {
|
|||
decode_latch_.pop();
|
||||
}
|
||||
|
||||
void Core::issue() {
|
||||
void Core::issue() {
|
||||
// operands to dispatchers
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto& operand = operands_.at(i);
|
||||
auto& operand = operands_.at(i);
|
||||
if (operand->Output.empty())
|
||||
continue;
|
||||
auto trace = operand->Output.front();
|
||||
|
@ -255,7 +255,7 @@ void Core::issue() {
|
|||
if (scoreboard_.in_use(trace)) {
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
if (!trace->log_once(true)) {
|
||||
DTH(3, "*** scoreboard-stall: dependents={");
|
||||
DTH(3, "*** scoreboard-stall: dependents={");
|
||||
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
||||
auto& use = uses.at(j);
|
||||
__unused (use);
|
||||
|
@ -266,10 +266,10 @@ void Core::issue() {
|
|||
}
|
||||
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
||||
auto& use = uses.at(j);
|
||||
switch (use.fu_type) {
|
||||
switch (use.fu_type) {
|
||||
case FUType::ALU: ++perf_stats_.scrb_alu; break;
|
||||
case FUType::FPU: ++perf_stats_.scrb_fpu; break;
|
||||
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
|
||||
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
|
||||
case FUType::SFU: {
|
||||
++perf_stats_.scrb_sfu;
|
||||
switch (use.sfu_type) {
|
||||
|
@ -286,7 +286,7 @@ void Core::issue() {
|
|||
}
|
||||
} break;
|
||||
default: assert(false);
|
||||
}
|
||||
}
|
||||
}
|
||||
++perf_stats_.scrb_stalls;
|
||||
continue;
|
||||
|
|
|
@ -145,7 +145,7 @@ private:
|
|||
std::vector<Dispatcher::Ptr> dispatchers_;
|
||||
std::vector<FuncUnit::Ptr> func_units_;
|
||||
LocalMem::Ptr local_mem_;
|
||||
std::vector<LocalMemDemux::Ptr> lmem_demuxs_;
|
||||
std::vector<LocalMemDemux::Ptr> lsu_demux_;
|
||||
|
||||
PipelineLatch fetch_latch_;
|
||||
PipelineLatch decode_latch_;
|
||||
|
|
|
@ -25,301 +25,337 @@
|
|||
using namespace vortex;
|
||||
|
||||
AluUnit::AluUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "ALU") {}
|
||||
|
||||
void AluUnit::tick() {
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto& input = Inputs.at(i);
|
||||
if (input.empty())
|
||||
continue;
|
||||
auto& output = Outputs.at(i);
|
||||
auto trace = input.front();
|
||||
switch (trace->alu_type) {
|
||||
case AluType::ARITH:
|
||||
case AluType::BRANCH:
|
||||
case AluType::SYSCALL:
|
||||
case AluType::IMUL:
|
||||
output.push(trace, LATENCY_IMUL+1);
|
||||
break;
|
||||
case AluType::IDIV:
|
||||
output.push(trace, XLEN+1);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
|
||||
if (trace->eop && trace->fetch_stall) {
|
||||
core_->resume(trace->wid);
|
||||
}
|
||||
input.pop();
|
||||
}
|
||||
|
||||
void AluUnit::tick() {
|
||||
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
|
||||
auto& input = Inputs.at(iw);
|
||||
if (input.empty())
|
||||
continue;
|
||||
auto& output = Outputs.at(iw);
|
||||
auto trace = input.front();
|
||||
switch (trace->alu_type) {
|
||||
case AluType::ARITH:
|
||||
case AluType::BRANCH:
|
||||
case AluType::SYSCALL:
|
||||
case AluType::IMUL:
|
||||
output.push(trace, LATENCY_IMUL+1);
|
||||
break;
|
||||
case AluType::IDIV:
|
||||
output.push(trace, XLEN+1);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
|
||||
if (trace->eop && trace->fetch_stall) {
|
||||
core_->resume(trace->wid);
|
||||
}
|
||||
input.pop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "FPU") {}
|
||||
|
||||
|
||||
void FpuUnit::tick() {
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto& input = Inputs.at(i);
|
||||
if (input.empty())
|
||||
continue;
|
||||
auto& output = Outputs.at(i);
|
||||
auto trace = input.front();
|
||||
switch (trace->fpu_type) {
|
||||
case FpuType::FNCP:
|
||||
output.push(trace, 2);
|
||||
break;
|
||||
case FpuType::FMA:
|
||||
output.push(trace, LATENCY_FMA+1);
|
||||
break;
|
||||
case FpuType::FDIV:
|
||||
output.push(trace, LATENCY_FDIV+1);
|
||||
break;
|
||||
case FpuType::FSQRT:
|
||||
output.push(trace, LATENCY_FSQRT+1);
|
||||
break;
|
||||
case FpuType::FCVT:
|
||||
output.push(trace, LATENCY_FCVT+1);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
|
||||
input.pop();
|
||||
}
|
||||
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
|
||||
auto& input = Inputs.at(iw);
|
||||
if (input.empty())
|
||||
continue;
|
||||
auto& output = Outputs.at(iw);
|
||||
auto trace = input.front();
|
||||
switch (trace->fpu_type) {
|
||||
case FpuType::FNCP:
|
||||
output.push(trace, 2);
|
||||
break;
|
||||
case FpuType::FMA:
|
||||
output.push(trace, LATENCY_FMA+1);
|
||||
break;
|
||||
case FpuType::FDIV:
|
||||
output.push(trace, LATENCY_FDIV+1);
|
||||
break;
|
||||
case FpuType::FSQRT:
|
||||
output.push(trace, LATENCY_FSQRT+1);
|
||||
break;
|
||||
case FpuType::FCVT:
|
||||
output.push(trace, LATENCY_FCVT+1);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
|
||||
input.pop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
|
||||
: FuncUnit(ctx, core, "LSU")
|
||||
, pending_rd_reqs_(LSUQ_IN_SIZE)
|
||||
, num_lanes_(NUM_LSU_LANES)
|
||||
, pending_loads_(0)
|
||||
, fence_lock_(false)
|
||||
, input_idx_(0)
|
||||
: FuncUnit(ctx, core, "LSU")
|
||||
, pending_loads_(0)
|
||||
{}
|
||||
|
||||
LsuUnit::~LsuUnit()
|
||||
{}
|
||||
|
||||
void LsuUnit::reset() {
|
||||
pending_rd_reqs_.clear();
|
||||
pending_loads_ = 0;
|
||||
fence_lock_ = false;
|
||||
for (auto& state : states_) {
|
||||
state.clear();
|
||||
}
|
||||
pending_loads_ = 0;
|
||||
}
|
||||
|
||||
void LsuUnit::tick() {
|
||||
core_->perf_stats_.load_latency += pending_loads_;
|
||||
void LsuUnit::tick() {
|
||||
core_->perf_stats_.load_latency += pending_loads_;
|
||||
|
||||
// handle dcache response
|
||||
for (uint32_t t = 0; t < num_lanes_; ++t) {
|
||||
auto& dcache_rsp_port = core_->lmem_demuxs_.at(t)->RspIn;
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = dcache_rsp_port.front();
|
||||
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
|
||||
auto trace = entry.trace;
|
||||
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type
|
||||
<< ", tid=" << t << ", " << *trace);
|
||||
assert(entry.count);
|
||||
--entry.count; // track remaining addresses
|
||||
if (0 == entry.count) {
|
||||
int iw = trace->wid % ISSUE_WIDTH;
|
||||
auto& output = Outputs.at(iw);
|
||||
output.push(trace, 1);
|
||||
pending_rd_reqs_.release(mem_rsp.tag);
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
--pending_loads_;
|
||||
}
|
||||
// handle dcache responses
|
||||
for (uint32_t r = 0; r < DCACHE_NUM_REQS; ++r) {
|
||||
auto& dcache_rsp_port = core_->lsu_demux_.at(r)->RspIn;
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
uint32_t block_idx = r / DCACHE_CHANNELS;
|
||||
auto& state = states_.at(block_idx);
|
||||
auto& mem_rsp = dcache_rsp_port.front();
|
||||
auto& entry = state.pending_rd_reqs.at(mem_rsp.tag);
|
||||
auto trace = entry.trace;
|
||||
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", rid=" << r << ", " << *trace);
|
||||
assert(entry.count);
|
||||
--entry.count; // track remaining addresses
|
||||
if (0 == entry.count) {
|
||||
int iw = trace->wid % ISSUE_WIDTH;
|
||||
Outputs.at(iw).push(trace, 1);
|
||||
state.pending_rd_reqs.release(mem_rsp.tag);
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
--pending_loads_;
|
||||
}
|
||||
|
||||
// handle local memory response
|
||||
for (uint32_t t = 0; t < num_lanes_; ++t) {
|
||||
auto& lmem_rsp_port = core_->local_mem_->Outputs.at(t);
|
||||
if (lmem_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = lmem_rsp_port.front();
|
||||
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
|
||||
auto trace = entry.trace;
|
||||
DT(3, "lmem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
|
||||
assert(entry.count);
|
||||
--entry.count; // track remaining addresses
|
||||
if (0 == entry.count) {
|
||||
int iw = trace->wid % ISSUE_WIDTH;
|
||||
auto& output = Outputs.at(iw);
|
||||
output.push(trace, 1);
|
||||
pending_rd_reqs_.release(mem_rsp.tag);
|
||||
}
|
||||
lmem_rsp_port.pop();
|
||||
--pending_loads_;
|
||||
}
|
||||
// handle LSU requests
|
||||
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
|
||||
uint32_t block_idx = iw % NUM_LSU_BLOCKS;
|
||||
auto& state = states_.at(block_idx);
|
||||
if (state.fence_lock) {
|
||||
// wait for all pending memory operations to complete
|
||||
if (!state.pending_rd_reqs.empty())
|
||||
continue;
|
||||
Outputs.at(iw).push(state.fence_trace, 1);
|
||||
state.fence_lock = false;
|
||||
DT(3, "fence-unlock: " << state.fence_trace);
|
||||
}
|
||||
|
||||
if (fence_lock_) {
|
||||
// wait for all pending memory operations to complete
|
||||
if (!pending_rd_reqs_.empty())
|
||||
return;
|
||||
int iw = fence_state_->wid % ISSUE_WIDTH;
|
||||
auto& output = Outputs.at(iw);
|
||||
output.push(fence_state_, 1);
|
||||
fence_lock_ = false;
|
||||
DT(3, "fence-unlock: " << fence_state_);
|
||||
}
|
||||
// check input queue
|
||||
auto& input = Inputs.at(iw);
|
||||
if (input.empty())
|
||||
continue;
|
||||
|
||||
// check input queue
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
int iw = (input_idx_ + i) % ISSUE_WIDTH;
|
||||
auto& input = Inputs.at(iw);
|
||||
if (input.empty())
|
||||
continue;
|
||||
auto& output = Outputs.at(iw);
|
||||
auto trace = input.front();
|
||||
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
|
||||
auto& output = Outputs.at(iw);
|
||||
auto trace = input.front();
|
||||
|
||||
auto t0 = trace->pid * num_lanes_;
|
||||
if (trace->lsu_type == LsuType::FENCE) {
|
||||
// schedule fence lock
|
||||
state.fence_trace = trace;
|
||||
state.fence_lock = true;
|
||||
DT(3, "fence-lock: " << *trace);
|
||||
// remove input
|
||||
input.pop();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (trace->lsu_type == LsuType::FENCE) {
|
||||
// schedule fence lock
|
||||
fence_state_ = trace;
|
||||
fence_lock_ = true;
|
||||
DT(3, "fence-lock: " << *trace);
|
||||
// remove input
|
||||
input.pop();
|
||||
break;
|
||||
}
|
||||
// check pending queue capacity
|
||||
if (state.pending_rd_reqs.full()) {
|
||||
if (!trace->log_once(true)) {
|
||||
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
|
||||
}
|
||||
continue;
|
||||
} else {
|
||||
trace->log_once(false);
|
||||
}
|
||||
|
||||
// check pending queue capacity
|
||||
if (pending_rd_reqs_.full()) {
|
||||
if (!trace->log_once(true)) {
|
||||
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
|
||||
}
|
||||
break;
|
||||
} else {
|
||||
trace->log_once(false);
|
||||
}
|
||||
|
||||
bool is_write = (trace->lsu_type == LsuType::STORE);
|
||||
uint32_t num_reqs;
|
||||
auto tag = state.pending_rd_reqs.allocate({trace, 0});
|
||||
if (DCACHE_WORD_SIZE != (XLEN/8)) {
|
||||
num_reqs = this->send_coalesced_requests(trace, block_idx, tag);
|
||||
} else {
|
||||
num_reqs = this->send_requests(trace, block_idx, tag);
|
||||
}
|
||||
state.pending_rd_reqs.at(tag).count = num_reqs;
|
||||
|
||||
// duplicates detection
|
||||
bool is_dup = false;
|
||||
if (trace->tmask.test(t0)) {
|
||||
uint64_t addr_mask = sizeof(uint32_t)-1;
|
||||
uint32_t addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask;
|
||||
uint32_t matches = 1;
|
||||
for (uint32_t t = 1; t < num_lanes_; ++t) {
|
||||
if (!trace->tmask.test(t0 + t))
|
||||
continue;
|
||||
auto mem_addr = trace_data->mem_addrs.at(t + t0).addr & ~addr_mask;
|
||||
matches += (addr0 == mem_addr);
|
||||
}
|
||||
#ifdef LSU_DUP_ENABLE
|
||||
is_dup = (matches == trace->tmask.count());
|
||||
#endif
|
||||
}
|
||||
// do not wait on writes
|
||||
bool is_write = (trace->lsu_type == LsuType::STORE);
|
||||
if (is_write) {
|
||||
state.pending_rd_reqs.release(tag);
|
||||
output.push(trace, 1);
|
||||
}
|
||||
|
||||
uint32_t addr_count;
|
||||
if (is_dup) {
|
||||
addr_count = 1;
|
||||
} else {
|
||||
addr_count = trace->tmask.count();
|
||||
}
|
||||
// remove input
|
||||
input.pop();
|
||||
}
|
||||
}
|
||||
|
||||
auto tag = pending_rd_reqs_.allocate({trace, addr_count});
|
||||
int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
|
||||
int count = 0;
|
||||
|
||||
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
|
||||
bool is_write = (trace->lsu_type == LsuType::STORE);
|
||||
auto t0 = trace->pid * NUM_LSU_LANES;
|
||||
|
||||
for (uint32_t t = 0; t < num_lanes_; ++t) {
|
||||
if (!trace->tmask.test(t0 + t))
|
||||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->lmem_demuxs_.at(t)->ReqIn;
|
||||
auto mem_addr = trace_data->mem_addrs.at(t + t0);
|
||||
auto type = get_addr_type(mem_addr.addr);
|
||||
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
|
||||
uint32_t t = t0 + i;
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
|
||||
int req_idx = block_idx * DCACHE_CHANNELS + (i % DCACHE_CHANNELS);
|
||||
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
|
||||
|
||||
auto mem_addr = trace_data->mem_addrs.at(t);
|
||||
auto type = get_addr_type(mem_addr.addr);
|
||||
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr;
|
||||
mem_req.write = is_write;
|
||||
mem_req.type = type;
|
||||
mem_req.tag = tag;
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
|
||||
dcache_req_port.push(mem_req, 1);
|
||||
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
|
||||
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr;
|
||||
mem_req.write = is_write;
|
||||
mem_req.type = type;
|
||||
mem_req.tag = tag;
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
|
||||
dcache_req_port.push(mem_req, 1);
|
||||
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
|
||||
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
|
||||
|
||||
if (is_write) {
|
||||
++core_->perf_stats_.stores;
|
||||
} else {
|
||||
++core_->perf_stats_.loads;
|
||||
++pending_loads_;
|
||||
}
|
||||
if (is_dup)
|
||||
break;
|
||||
}
|
||||
if (is_write) {
|
||||
++core_->perf_stats_.stores;
|
||||
} else {
|
||||
++core_->perf_stats_.loads;
|
||||
++pending_loads_;
|
||||
}
|
||||
|
||||
// do not wait on writes
|
||||
if (is_write) {
|
||||
pending_rd_reqs_.release(tag);
|
||||
output.push(trace, 1);
|
||||
}
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
// remove input
|
||||
input.pop();
|
||||
int LsuUnit::send_coalesced_requests(instr_trace_t* trace, int block_idx, int tag) {
|
||||
int count = 0;
|
||||
|
||||
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
|
||||
bool is_write = (trace->lsu_type == LsuType::STORE);
|
||||
auto t0 = trace->pid * NUM_LSU_LANES;
|
||||
|
||||
break; // single block
|
||||
}
|
||||
++input_idx_;
|
||||
auto addr_mask = ~uint64_t(LSU_LINE_SIZE-1);
|
||||
|
||||
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
|
||||
|
||||
std::bitset<NUM_LSU_LANES / DCACHE_CHANNELS> mask(0);
|
||||
for (uint32_t i = 0; i < mask.size(); ++i) {
|
||||
mask.set(i, trace->tmask.test(t0 + i));
|
||||
}
|
||||
|
||||
int req_idx = block_idx * DCACHE_CHANNELS + c;
|
||||
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
|
||||
|
||||
while (mask.any()) {
|
||||
// calculate seed index
|
||||
int seed_idx = 0;
|
||||
for (uint32_t i = 0; i < mask.size(); ++i) {
|
||||
if (mask.test(i)) {
|
||||
seed_idx = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t seed_addr = trace_data->mem_addrs.at(t0 + seed_idx).addr & addr_mask;
|
||||
auto type = get_addr_type(seed_addr);
|
||||
|
||||
// coalesce addresses matching the seed
|
||||
uint32_t coelescing_size = 0;
|
||||
for (uint32_t i = seed_idx; i < mask.size(); ++i) {
|
||||
auto mem_addr = trace_data->mem_addrs.at(t0 + i).addr & addr_mask;
|
||||
if (mem_addr == seed_addr) {
|
||||
mask.set(i, 0);
|
||||
++coelescing_size;
|
||||
}
|
||||
}
|
||||
|
||||
MemReq mem_req;
|
||||
mem_req.addr = seed_addr;
|
||||
mem_req.write = is_write;
|
||||
mem_req.type = type;
|
||||
mem_req.tag = tag;
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
|
||||
dcache_req_port.push(mem_req, 1);
|
||||
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
|
||||
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
|
||||
if (coelescing_size > 1) {
|
||||
DT(3, "*** coalescing: size=" << coelescing_size << ", " << *trace);
|
||||
}
|
||||
|
||||
if (is_write) {
|
||||
++core_->perf_stats_.stores;
|
||||
} else {
|
||||
++core_->perf_stats_.loads;
|
||||
++pending_loads_;
|
||||
}
|
||||
|
||||
++count;
|
||||
}
|
||||
|
||||
t0 += mask.size();
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
|
||||
: FuncUnit(ctx, core, "SFU")
|
||||
, input_idx_(0)
|
||||
: FuncUnit(ctx, core, "SFU")
|
||||
{}
|
||||
|
||||
|
||||
void SfuUnit::tick() {
|
||||
// check input queue
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
int iw = (input_idx_ + i) % ISSUE_WIDTH;
|
||||
auto& input = Inputs.at(iw);
|
||||
if (input.empty())
|
||||
continue;
|
||||
auto& output = Outputs.at(iw);
|
||||
auto trace = input.front();
|
||||
auto sfu_type = trace->sfu_type;
|
||||
bool release_warp = trace->fetch_stall;
|
||||
// check input queue
|
||||
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
|
||||
auto& input = Inputs.at(iw);
|
||||
if (input.empty())
|
||||
continue;
|
||||
auto& output = Outputs.at(iw);
|
||||
auto trace = input.front();
|
||||
auto sfu_type = trace->sfu_type;
|
||||
bool release_warp = trace->fetch_stall;
|
||||
|
||||
switch (sfu_type) {
|
||||
case SfuType::TMC:
|
||||
case SfuType::WSPAWN:
|
||||
case SfuType::SPLIT:
|
||||
case SfuType::JOIN:
|
||||
case SfuType::PRED:
|
||||
case SfuType::CSRRW:
|
||||
case SfuType::CSRRS:
|
||||
case SfuType::CSRRC:
|
||||
output.push(trace, 1);
|
||||
break;
|
||||
case SfuType::BAR: {
|
||||
output.push(trace, 1);
|
||||
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
|
||||
if (trace->eop) {
|
||||
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
|
||||
}
|
||||
release_warp = false;
|
||||
} break;
|
||||
case SfuType::CMOV:
|
||||
output.push(trace, 3);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
switch (sfu_type) {
|
||||
case SfuType::TMC:
|
||||
case SfuType::WSPAWN:
|
||||
case SfuType::SPLIT:
|
||||
case SfuType::JOIN:
|
||||
case SfuType::PRED:
|
||||
case SfuType::CSRRW:
|
||||
case SfuType::CSRRS:
|
||||
case SfuType::CSRRC:
|
||||
output.push(trace, 1);
|
||||
break;
|
||||
case SfuType::BAR: {
|
||||
output.push(trace, 1);
|
||||
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
|
||||
if (trace->eop) {
|
||||
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
|
||||
}
|
||||
release_warp = false;
|
||||
} break;
|
||||
case SfuType::CMOV:
|
||||
output.push(trace, 3);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
||||
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
|
||||
if (trace->eop && release_warp) {
|
||||
core_->resume(trace->wid);
|
||||
}
|
||||
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
|
||||
if (trace->eop && release_warp) {
|
||||
core_->resume(trace->wid);
|
||||
}
|
||||
|
||||
input.pop();
|
||||
|
||||
break; // single block
|
||||
}
|
||||
++input_idx_;
|
||||
input.pop();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#pragma once
|
||||
|
||||
#include <simobject.h>
|
||||
#include <array>
|
||||
#include "instr_trace.h"
|
||||
|
||||
namespace vortex {
|
||||
|
@ -22,77 +23,89 @@ class Core;
|
|||
|
||||
class FuncUnit : public SimObject<FuncUnit> {
|
||||
public:
|
||||
std::vector<SimPort<instr_trace_t*>> Inputs;
|
||||
std::vector<SimPort<instr_trace_t*>> Outputs;
|
||||
std::vector<SimPort<instr_trace_t*>> Inputs;
|
||||
std::vector<SimPort<instr_trace_t*>> Outputs;
|
||||
|
||||
FuncUnit(const SimContext& ctx, Core* core, const char* name)
|
||||
: SimObject<FuncUnit>(ctx, name)
|
||||
, Inputs(ISSUE_WIDTH, this)
|
||||
, Outputs(ISSUE_WIDTH, this)
|
||||
, core_(core)
|
||||
{}
|
||||
|
||||
virtual ~FuncUnit() {}
|
||||
FuncUnit(const SimContext& ctx, Core* core, const char* name)
|
||||
: SimObject<FuncUnit>(ctx, name)
|
||||
, Inputs(ISSUE_WIDTH, this)
|
||||
, Outputs(ISSUE_WIDTH, this)
|
||||
, core_(core)
|
||||
{}
|
||||
|
||||
virtual ~FuncUnit() {}
|
||||
|
||||
virtual void reset() {}
|
||||
virtual void reset() {}
|
||||
|
||||
virtual void tick() = 0;
|
||||
virtual void tick() = 0;
|
||||
|
||||
protected:
|
||||
Core* core_;
|
||||
Core* core_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class AluUnit : public FuncUnit {
|
||||
public:
|
||||
AluUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void tick();
|
||||
AluUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class FpuUnit : public FuncUnit {
|
||||
public:
|
||||
FpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void tick();
|
||||
FpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class LsuUnit : public FuncUnit {
|
||||
public:
|
||||
LsuUnit(const SimContext& ctx, Core*);
|
||||
LsuUnit(const SimContext& ctx, Core*);
|
||||
~LsuUnit();
|
||||
|
||||
void reset();
|
||||
void reset();
|
||||
void tick();
|
||||
|
||||
void tick();
|
||||
private:
|
||||
|
||||
private:
|
||||
struct pending_req_t {
|
||||
instr_trace_t* trace;
|
||||
uint32_t count;
|
||||
};
|
||||
HashTable<pending_req_t> pending_rd_reqs_;
|
||||
uint32_t num_lanes_;
|
||||
instr_trace_t* fence_state_;
|
||||
uint64_t pending_loads_;
|
||||
bool fence_lock_;
|
||||
uint32_t input_idx_;
|
||||
int send_requests(instr_trace_t* trace, int block_idx, int tag);
|
||||
int send_coalesced_requests(instr_trace_t* trace, int block_idx, int tag);
|
||||
|
||||
struct pending_req_t {
|
||||
instr_trace_t* trace;
|
||||
uint32_t count;
|
||||
};
|
||||
|
||||
struct lsu_state_t {
|
||||
HashTable<pending_req_t> pending_rd_reqs;
|
||||
instr_trace_t* fence_trace;
|
||||
bool fence_lock;
|
||||
|
||||
lsu_state_t() : pending_rd_reqs(LSUQ_IN_SIZE) {}
|
||||
|
||||
void clear() {
|
||||
this->pending_rd_reqs.clear();
|
||||
this->fence_trace = nullptr;
|
||||
this->fence_lock = false;
|
||||
}
|
||||
};
|
||||
|
||||
std::array<lsu_state_t, NUM_LSU_BLOCKS> states_;
|
||||
uint64_t pending_loads_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class SfuUnit : public FuncUnit {
|
||||
public:
|
||||
SfuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void tick();
|
||||
|
||||
private:
|
||||
uint32_t input_idx_;
|
||||
SfuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void tick();
|
||||
};
|
||||
|
||||
}
|
|
@ -41,7 +41,7 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
|
|||
uint8_t(arch.num_clusters()), // request size
|
||||
true, // write-through
|
||||
false, // write response
|
||||
L3_MSHR_SIZE, // mshr
|
||||
L3_MSHR_SIZE, // mshr size
|
||||
2, // pipeline latency
|
||||
}
|
||||
);
|
||||
|
|
|
@ -28,10 +28,10 @@ Socket::Socket(const SimContext& ctx,
|
|||
, dcache_mem_rsp_port(this)
|
||||
, socket_id_(socket_id)
|
||||
, cluster_(cluster)
|
||||
, cores_(arch.socket_size())
|
||||
, cores_(arch.socket_size())
|
||||
{
|
||||
auto cores_per_socket = cores_.size();
|
||||
|
||||
|
||||
char sname[100];
|
||||
snprintf(sname, 100, "socket%d-icaches", socket_id);
|
||||
icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{
|
||||
|
@ -46,7 +46,7 @@ Socket::Socket(const SimContext& ctx,
|
|||
1, // number of inputs
|
||||
false, // write-through
|
||||
false, // write response
|
||||
(uint8_t)arch.num_warps(), // mshr
|
||||
(uint8_t)arch.num_warps(), // mshr size
|
||||
2, // pipeline latency
|
||||
});
|
||||
|
||||
|
@ -54,19 +54,19 @@ Socket::Socket(const SimContext& ctx,
|
|||
icache_mem_rsp_port.bind(&icaches_->MemRspPort);
|
||||
|
||||
snprintf(sname, 100, "socket%d-dcaches", socket_id);
|
||||
dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
|
||||
dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, DCACHE_NUM_REQS, CacheSim::Config{
|
||||
!DCACHE_ENABLED,
|
||||
log2ceil(DCACHE_SIZE), // C
|
||||
log2ceil(L1_LINE_SIZE), // L
|
||||
log2ceil(sizeof(Word)), // W
|
||||
log2ceil(DCACHE_WORD_SIZE), // W
|
||||
log2ceil(DCACHE_NUM_WAYS),// A
|
||||
log2ceil(DCACHE_NUM_BANKS), // B
|
||||
XLEN, // address bits
|
||||
1, // number of ports
|
||||
DCACHE_NUM_BANKS, // number of inputs
|
||||
DCACHE_NUM_REQS, // number of inputs
|
||||
true, // write-through
|
||||
false, // write response
|
||||
DCACHE_MSHR_SIZE, // mshr
|
||||
DCACHE_MSHR_SIZE, // mshr size
|
||||
2, // pipeline latency
|
||||
});
|
||||
|
||||
|
@ -75,17 +75,14 @@ Socket::Socket(const SimContext& ctx,
|
|||
|
||||
// create cores
|
||||
|
||||
for (uint32_t i = 0; i < cores_per_socket; ++i) {
|
||||
for (uint32_t i = 0; i < cores_per_socket; ++i) {
|
||||
uint32_t core_id = socket_id * cores_per_socket + i;
|
||||
cores_.at(i) = Core::Create(core_id,
|
||||
this,
|
||||
arch,
|
||||
dcrs);
|
||||
cores_.at(i) = Core::Create(core_id, this, arch, dcrs);
|
||||
|
||||
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
|
||||
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
|
||||
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
|
||||
|
||||
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
|
||||
for (uint32_t j = 0; j < DCACHE_NUM_REQS; ++j) {
|
||||
cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j));
|
||||
dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j));
|
||||
}
|
||||
|
@ -96,7 +93,7 @@ Socket::~Socket() {
|
|||
//--
|
||||
}
|
||||
|
||||
void Socket::reset() {
|
||||
void Socket::reset() {
|
||||
//--
|
||||
}
|
||||
|
||||
|
@ -137,6 +134,6 @@ void Socket::resume(uint32_t core_index) {
|
|||
Socket::PerfStats Socket::perf_stats() const {
|
||||
PerfStats perf_stats;
|
||||
perf_stats.icache = icaches_->perf_stats();
|
||||
perf_stats.dcache = dcaches_->perf_stats();
|
||||
perf_stats.dcache = dcaches_->perf_stats();
|
||||
return perf_stats;
|
||||
}
|
|
@ -244,7 +244,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
|
|||
|
||||
struct MemReq {
|
||||
uint64_t addr;
|
||||
bool write;
|
||||
bool write;
|
||||
AddrType type;
|
||||
uint32_t tag;
|
||||
uint32_t cid;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue