perf counters update

This commit is contained in:
Blaise Tine 2024-07-12 19:02:43 -07:00
parent 59ed24dc0b
commit a2307a28dc
10 changed files with 78 additions and 85 deletions

View file

@ -308,7 +308,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
if (num_cores > 1) {
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d, fpu=%d, lsu=%d, sfu=%d)\n", core_id, scrb_stalls_per_core, scrb_percent_per_core,
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, scrb_percent_per_core,
calcAvgPercent(scrb_alu_per_core, scrb_total),
calcAvgPercent(scrb_fpu_per_core, scrb_total),
calcAvgPercent(scrb_lsu_per_core, scrb_total),
@ -548,7 +548,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d, fpu=%d, lsu=%d, sfu=%d)\n", scrb_stalls, scrb_percent,
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, scrb_percent,
calcAvgPercent(scrb_alu, scrb_total),
calcAvgPercent(scrb_fpu, scrb_total),
calcAvgPercent(scrb_lsu, scrb_total),

View file

@ -33,4 +33,6 @@
#define DCACHE_CHANNELS UP((NUM_LSU_LANES * (XLEN / 8)) / DCACHE_WORD_SIZE)
#define DCACHE_NUM_REQS (NUM_LSU_BLOCKS * DCACHE_CHANNELS)
#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
// NUM_WARPS split evenly across the ISSUE_WIDTH issue slots.
// The expansion is parenthesized so it behaves as a single operand when used
// inside a larger expression such as `x % PER_ISSUE_WARPS`; without the
// parentheses that would parse as `(x % NUM_WARPS) / ISSUE_WIDTH`.
#define PER_ISSUE_WARPS (NUM_WARPS / ISSUE_WIDTH)

View file

@ -264,69 +264,58 @@ void Core::issue() {
// issue ibuffer instructions
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size();
auto& ibuffer = ibuffers_.at(ii);
if (ibuffer.empty())
continue;
auto trace = ibuffer.top();
// check scoreboard
if (scoreboard_.in_use(trace)) {
auto uses = scoreboard_.get_uses(trace);
if (!trace->log_once(true)) {
DTH(4, "*** scoreboard-stall: dependents={");
bool has_instrs = false;
bool found_match = false;
for (uint32_t k = 0; k < PER_ISSUE_WARPS; ++k) {
uint32_t kk = (ibuffer_idx_ + k) % PER_ISSUE_WARPS;
uint32_t ii = kk * ISSUE_WIDTH + i;
auto& ibuffer = ibuffers_.at(ii);
if (ibuffer.empty())
continue;
// check scoreboard
has_instrs = true;
auto trace = ibuffer.top();
if (scoreboard_.in_use(trace)) {
auto uses = scoreboard_.get_uses(trace);
if (!trace->log_once(true)) {
DTH(4, "*** scoreboard-stall: dependents={");
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
__unused (use);
if (j) DTN(4, ", ");
DTN(4, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
}
DTN(4, "}, " << *trace << std::endl);
}
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
__unused (use);
if (j) DTN(4, ", ");
DTN(4, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
}
DTN(4, "}, " << *trace << std::endl);
}
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
switch (use.fu_type) {
case FUType::ALU: ++perf_stats_.scrb_alu; break;
case FUType::FPU: ++perf_stats_.scrb_fpu; break;
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
case FUType::SFU: {
++perf_stats_.scrb_sfu;
switch (use.sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::BAR:
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
switch (use.fu_type) {
case FUType::ALU: ++perf_stats_.scrb_alu; break;
case FUType::FPU: ++perf_stats_.scrb_fpu; break;
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
case FUType::SFU: ++perf_stats_.scrb_sfu; break;
default: assert(false);
}
} break;
default: assert(false);
}
} else {
trace->log_once(false);
// update scoreboard
DT(3, "pipeline-scoreboard: " << *trace);
if (trace->wb) {
scoreboard_.reserve(trace);
}
// to operand stage
operands_.at(i)->Input.push(trace, 2);
ibuffer.pop();
found_match = true;
break;
}
}
if (has_instrs && !found_match) {
++perf_stats_.scrb_stalls;
continue;
} else {
trace->log_once(false);
}
// update scoreboard
if (trace->wb) {
scoreboard_.reserve(trace);
}
DT(3, "pipeline-scoreboard: " << *trace);
// to operand stage
operands_.at(i)->Input.push(trace, 1);
ibuffer.pop();
}
ibuffer_idx_ += ISSUE_WIDTH;
++ibuffer_idx_;
}
void Core::execute() {
@ -337,7 +326,7 @@ void Core::execute() {
if (dispatch->Outputs.at(j).empty())
continue;
auto trace = dispatch->Outputs.at(j).front();
func_unit->Inputs.at(j).push(trace, 1);
func_unit->Inputs.at(j).push(trace, 2);
dispatch->Outputs.at(j).pop();
}
}

View file

@ -49,8 +49,6 @@ public:
uint64_t scrb_fpu;
uint64_t scrb_lsu;
uint64_t scrb_sfu;
uint64_t scrb_wctl;
uint64_t scrb_csrs;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
@ -69,8 +67,6 @@ public:
, scrb_fpu(0)
, scrb_lsu(0)
, scrb_sfu(0)
, scrb_wctl(0)
, scrb_csrs(0)
, ifetches(0)
, loads(0)
, stores(0)

View file

@ -397,8 +397,6 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
CSR_READ_64(VX_CSR_MPM_SCRB_FPU, core_perf.scrb_fpu);
CSR_READ_64(VX_CSR_MPM_SCRB_LSU, core_perf.scrb_lsu);
CSR_READ_64(VX_CSR_MPM_SCRB_SFU, core_perf.scrb_sfu);
CSR_READ_64(VX_CSR_MPM_SCRB_WCTL, core_perf.scrb_wctl);
CSR_READ_64(VX_CSR_MPM_SCRB_CSRS, core_perf.scrb_csrs);
CSR_READ_64(VX_CSR_MPM_IFETCHES, core_perf.ifetches);
CSR_READ_64(VX_CSR_MPM_LOADS, core_perf.loads);
CSR_READ_64(VX_CSR_MPM_STORES, core_perf.stores);

View file

@ -718,8 +718,9 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
case Opcode::FS: {
trace->fu_type = FUType::LSU;
trace->lsu_type = LsuType::STORE;
auto data_type = (opcode == Opcode::FS) ? RegType::Float : RegType::Integer;
trace->src_regs[0] = {RegType::Integer, rsrc0};
trace->src_regs[1] = {RegType::Integer, rsrc1};
trace->src_regs[1] = {data_type, rsrc1};
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
uint32_t data_bytes = 1 << (func3 & 0x3);
@ -838,7 +839,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
}
} else {
trace->fu_type = FUType::SFU;
trace->fetch_stall = true;
// stall the fetch stage for FPU CSRs
trace->fetch_stall = (csr_addr <= VX_CSR_FCSR);
csr_value = this->get_csr(csr_addr, t, wid);
switch (func3) {
case 1: {

View file

@ -33,15 +33,18 @@ void AluUnit::tick() {
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
int delay = 2;
switch (trace->alu_type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
output.push(trace, 2+delay);
break;
case AluType::IMUL:
output.push(trace, LATENCY_IMUL+1);
output.push(trace, LATENCY_IMUL+delay);
break;
case AluType::IDIV:
output.push(trace, XLEN+1);
output.push(trace, XLEN+delay);
break;
default:
std::abort();
@ -65,21 +68,22 @@ void FpuUnit::tick() {
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
int delay = 2;
switch (trace->fpu_type) {
case FpuType::FNCP:
output.push(trace, 2);
output.push(trace, 2+delay);
break;
case FpuType::FMA:
output.push(trace, LATENCY_FMA+1);
output.push(trace, LATENCY_FMA+delay);
break;
case FpuType::FDIV:
output.push(trace, LATENCY_FDIV+1);
output.push(trace, LATENCY_FDIV+delay);
break;
case FpuType::FSQRT:
output.push(trace, LATENCY_FSQRT+1);
output.push(trace, LATENCY_FSQRT+delay);
break;
case FpuType::FCVT:
output.push(trace, LATENCY_FCVT+1);
output.push(trace, LATENCY_FCVT+delay);
break;
default:
std::abort();
@ -254,10 +258,10 @@ void SfuUnit::tick() {
auto trace = input.front();
auto sfu_type = trace->sfu_type;
bool release_warp = trace->fetch_stall;
int delay = 2;
switch (sfu_type) {
case SfuType::WSPAWN:
output.push(trace, 1);
output.push(trace, 2+delay);
if (trace->eop) {
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
release_warp = core_->wspawn(trace_data->arg1, trace_data->arg2);
@ -270,10 +274,10 @@ void SfuUnit::tick() {
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC:
output.push(trace, 1);
output.push(trace, 2+delay);
break;
case SfuType::BAR: {
output.push(trace, 1);
output.push(trace, 2+delay);
if (trace->eop) {
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
release_warp = core_->barrier(trace_data->arg1, trace_data->arg2, trace->wid);

View file

@ -21,7 +21,7 @@ class Operand : public SimObject<Operand> {
private:
static constexpr uint32_t NUM_BANKS = 4;
uint32_t total_stalls_ = 0;
public:
SimPort<instr_trace_t*> Input;
SimPort<instr_trace_t*> Output;
@ -63,7 +63,7 @@ public:
total_stalls_ += stalls;
Output.push(trace, 1 + stalls);
Output.push(trace, 2 + stalls);
DT(3, "pipeline-operands: " << *trace);

View file

@ -26,7 +26,6 @@ public:
RegType reg_type;
uint32_t reg_id;
FUType fu_type;
SfuType sfu_type;
uint64_t uuid;
};
@ -48,7 +47,8 @@ public:
}
bool in_use(instr_trace_t* trace) const {
if (trace->dst_reg.type != RegType::None) {
if (trace->wb) {
assert(trace->dst_reg.type != RegType::None);
if (in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).test(trace->dst_reg.idx)) {
return true;
}
@ -65,11 +65,12 @@ public:
std::vector<reg_use_t> get_uses(instr_trace_t* trace) const {
std::vector<reg_use_t> out;
if (trace->dst_reg.type != RegType::None) {
if (trace->wb) {
assert(trace->dst_reg.type != RegType::None);
if (in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).test(trace->dst_reg.idx)) {
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
auto owner = owners_.at(tag);
out.push_back({trace->dst_reg.type, trace->dst_reg.idx, owner->fu_type, owner->sfu_type, owner->uuid});
out.push_back({trace->dst_reg.type, trace->dst_reg.idx, owner->fu_type, owner->uuid});
}
}
for (uint32_t i = 0; i < trace->src_regs.size(); ++i) {
@ -77,7 +78,7 @@ public:
if (in_use_regs_.at(trace->wid).at((int)trace->src_regs[i].type).test(trace->src_regs[i].idx)) {
uint32_t tag = (trace->src_regs[i].idx << 16) | (trace->wid << 4) | (int)trace->src_regs[i].type;
auto owner = owners_.at(tag);
out.push_back({trace->src_regs[i].type, trace->src_regs[i].idx, owner->fu_type, owner->sfu_type, owner->uuid});
out.push_back({trace->src_regs[i].type, trace->src_regs[i].idx, owner->fu_type, owner->uuid});
}
}
}
@ -90,13 +91,13 @@ public:
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
assert(owners_.count(tag) == 0);
owners_[tag] = trace;
assert((int)trace->fu_type < 5);
}
// Frees the destination register held by `trace`: clears its in-use bit for
// the trace's warp and removes the owner entry stored under the register tag.
void release(instr_trace_t* trace) {
  // Callers must only release traces that write back a result.
  assert(trace->wb);

  const auto& dst = trace->dst_reg;
  const int type_index = (int)dst.type;

  // Mark the register as free again for this warp and register type.
  in_use_regs_.at(trace->wid).at(type_index).reset(dst.idx);

  // Tag layout: register index << 16 | warp id << 4 | register type.
  const uint32_t key = (dst.idx << 16) | (trace->wid << 4) | type_index;
  assert(owners_.count(key) != 0);
  owners_.erase(key);
}

View file

@ -21,6 +21,7 @@
#include <util.h>
#include <stringutil.h>
#include <VX_config.h>
#include <VX_types.h>
#include <simobject.h>
#include "debug.h"