mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
perf counters update
This commit is contained in:
parent
59ed24dc0b
commit
a2307a28dc
10 changed files with 78 additions and 85 deletions
|
@ -308,7 +308,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
if (num_cores > 1) {
|
||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
|
||||
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d, fpu=%d, lsu=%d, sfu=%d)\n", core_id, scrb_stalls_per_core, scrb_percent_per_core,
|
||||
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, scrb_percent_per_core,
|
||||
calcAvgPercent(scrb_alu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_fpu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_lsu_per_core, scrb_total),
|
||||
|
@ -548,7 +548,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
|
||||
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d, fpu=%d, lsu=%d, sfu=%d)\n", scrb_stalls, scrb_percent,
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, scrb_percent,
|
||||
calcAvgPercent(scrb_alu, scrb_total),
|
||||
calcAvgPercent(scrb_fpu, scrb_total),
|
||||
calcAvgPercent(scrb_lsu, scrb_total),
|
||||
|
|
|
@ -33,4 +33,6 @@
|
|||
#define DCACHE_CHANNELS UP((NUM_LSU_LANES * (XLEN / 8)) / DCACHE_WORD_SIZE)
|
||||
#define DCACHE_NUM_REQS (NUM_LSU_BLOCKS * DCACHE_CHANNELS)
|
||||
|
||||
#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
|
||||
#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE)
|
||||
|
||||
// NUM_WARPS divided by ISSUE_WIDTH.
// The replacement list is parenthesized so the macro behaves as a single
// operand inside larger expressions: without the parentheses,
// `x % PER_ISSUE_WARPS` would expand to `(x % NUM_WARPS) / ISSUE_WIDTH`.
#define PER_ISSUE_WARPS (NUM_WARPS / ISSUE_WIDTH)
|
|
@ -264,69 +264,58 @@ void Core::issue() {
|
|||
|
||||
// issue ibuffer instructions
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size();
|
||||
auto& ibuffer = ibuffers_.at(ii);
|
||||
if (ibuffer.empty())
|
||||
continue;
|
||||
|
||||
auto trace = ibuffer.top();
|
||||
|
||||
// check scoreboard
|
||||
if (scoreboard_.in_use(trace)) {
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
if (!trace->log_once(true)) {
|
||||
DTH(4, "*** scoreboard-stall: dependents={");
|
||||
bool has_instrs = false;
|
||||
bool found_match = false;
|
||||
for (uint32_t k = 0; k < PER_ISSUE_WARPS; ++k) {
|
||||
uint32_t kk = (ibuffer_idx_ + k) % PER_ISSUE_WARPS;
|
||||
uint32_t ii = kk * ISSUE_WIDTH + i;
|
||||
auto& ibuffer = ibuffers_.at(ii);
|
||||
if (ibuffer.empty())
|
||||
continue;
|
||||
// check scoreboard
|
||||
has_instrs = true;
|
||||
auto trace = ibuffer.top();
|
||||
if (scoreboard_.in_use(trace)) {
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
if (!trace->log_once(true)) {
|
||||
DTH(4, "*** scoreboard-stall: dependents={");
|
||||
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
||||
auto& use = uses.at(j);
|
||||
__unused (use);
|
||||
if (j) DTN(4, ", ");
|
||||
DTN(4, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
|
||||
}
|
||||
DTN(4, "}, " << *trace << std::endl);
|
||||
}
|
||||
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
||||
auto& use = uses.at(j);
|
||||
__unused (use);
|
||||
if (j) DTN(4, ", ");
|
||||
DTN(4, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
|
||||
}
|
||||
DTN(4, "}, " << *trace << std::endl);
|
||||
}
|
||||
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
||||
auto& use = uses.at(j);
|
||||
switch (use.fu_type) {
|
||||
case FUType::ALU: ++perf_stats_.scrb_alu; break;
|
||||
case FUType::FPU: ++perf_stats_.scrb_fpu; break;
|
||||
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
|
||||
case FUType::SFU: {
|
||||
++perf_stats_.scrb_sfu;
|
||||
switch (use.sfu_type) {
|
||||
case SfuType::TMC:
|
||||
case SfuType::WSPAWN:
|
||||
case SfuType::SPLIT:
|
||||
case SfuType::JOIN:
|
||||
case SfuType::BAR:
|
||||
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
|
||||
case SfuType::CSRRW:
|
||||
case SfuType::CSRRS:
|
||||
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
|
||||
switch (use.fu_type) {
|
||||
case FUType::ALU: ++perf_stats_.scrb_alu; break;
|
||||
case FUType::FPU: ++perf_stats_.scrb_fpu; break;
|
||||
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
|
||||
case FUType::SFU: ++perf_stats_.scrb_sfu; break;
|
||||
default: assert(false);
|
||||
}
|
||||
} break;
|
||||
default: assert(false);
|
||||
}
|
||||
} else {
|
||||
trace->log_once(false);
|
||||
// update scoreboard
|
||||
DT(3, "pipeline-scoreboard: " << *trace);
|
||||
if (trace->wb) {
|
||||
scoreboard_.reserve(trace);
|
||||
}
|
||||
// to operand stage
|
||||
operands_.at(i)->Input.push(trace, 2);
|
||||
ibuffer.pop();
|
||||
found_match = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (has_instrs && !found_match) {
|
||||
++perf_stats_.scrb_stalls;
|
||||
continue;
|
||||
} else {
|
||||
trace->log_once(false);
|
||||
}
|
||||
|
||||
// update scoreboard
|
||||
if (trace->wb) {
|
||||
scoreboard_.reserve(trace);
|
||||
}
|
||||
|
||||
DT(3, "pipeline-scoreboard: " << *trace);
|
||||
|
||||
// to operand stage
|
||||
operands_.at(i)->Input.push(trace, 1);
|
||||
|
||||
ibuffer.pop();
|
||||
}
|
||||
ibuffer_idx_ += ISSUE_WIDTH;
|
||||
++ibuffer_idx_;
|
||||
}
|
||||
|
||||
void Core::execute() {
|
||||
|
@ -337,7 +326,7 @@ void Core::execute() {
|
|||
if (dispatch->Outputs.at(j).empty())
|
||||
continue;
|
||||
auto trace = dispatch->Outputs.at(j).front();
|
||||
func_unit->Inputs.at(j).push(trace, 1);
|
||||
func_unit->Inputs.at(j).push(trace, 2);
|
||||
dispatch->Outputs.at(j).pop();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,8 +49,6 @@ public:
|
|||
uint64_t scrb_fpu;
|
||||
uint64_t scrb_lsu;
|
||||
uint64_t scrb_sfu;
|
||||
uint64_t scrb_wctl;
|
||||
uint64_t scrb_csrs;
|
||||
uint64_t ifetches;
|
||||
uint64_t loads;
|
||||
uint64_t stores;
|
||||
|
@ -69,8 +67,6 @@ public:
|
|||
, scrb_fpu(0)
|
||||
, scrb_lsu(0)
|
||||
, scrb_sfu(0)
|
||||
, scrb_wctl(0)
|
||||
, scrb_csrs(0)
|
||||
, ifetches(0)
|
||||
, loads(0)
|
||||
, stores(0)
|
||||
|
|
|
@ -397,8 +397,6 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
CSR_READ_64(VX_CSR_MPM_SCRB_FPU, core_perf.scrb_fpu);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_LSU, core_perf.scrb_lsu);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_SFU, core_perf.scrb_sfu);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_WCTL, core_perf.scrb_wctl);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_CSRS, core_perf.scrb_csrs);
|
||||
CSR_READ_64(VX_CSR_MPM_IFETCHES, core_perf.ifetches);
|
||||
CSR_READ_64(VX_CSR_MPM_LOADS, core_perf.loads);
|
||||
CSR_READ_64(VX_CSR_MPM_STORES, core_perf.stores);
|
||||
|
|
|
@ -718,8 +718,9 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::FS: {
|
||||
trace->fu_type = FUType::LSU;
|
||||
trace->lsu_type = LsuType::STORE;
|
||||
auto data_type = (opcode == Opcode::FS) ? RegType::Float : RegType::Integer;
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Integer, rsrc1};
|
||||
trace->src_regs[1] = {data_type, rsrc1};
|
||||
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
|
||||
trace->data = trace_data;
|
||||
uint32_t data_bytes = 1 << (func3 & 0x3);
|
||||
|
@ -838,7 +839,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
}
|
||||
} else {
|
||||
trace->fu_type = FUType::SFU;
|
||||
trace->fetch_stall = true;
|
||||
// stall the fetch stage for FPU CSRs
|
||||
trace->fetch_stall = (csr_addr <= VX_CSR_FCSR);
|
||||
csr_value = this->get_csr(csr_addr, t, wid);
|
||||
switch (func3) {
|
||||
case 1: {
|
||||
|
|
|
@ -33,15 +33,18 @@ void AluUnit::tick() {
|
|||
continue;
|
||||
auto& output = Outputs.at(iw);
|
||||
auto trace = input.front();
|
||||
int delay = 2;
|
||||
switch (trace->alu_type) {
|
||||
case AluType::ARITH:
|
||||
case AluType::BRANCH:
|
||||
case AluType::SYSCALL:
|
||||
output.push(trace, 2+delay);
|
||||
break;
|
||||
case AluType::IMUL:
|
||||
output.push(trace, LATENCY_IMUL+1);
|
||||
output.push(trace, LATENCY_IMUL+delay);
|
||||
break;
|
||||
case AluType::IDIV:
|
||||
output.push(trace, XLEN+1);
|
||||
output.push(trace, XLEN+delay);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
|
@ -65,21 +68,22 @@ void FpuUnit::tick() {
|
|||
continue;
|
||||
auto& output = Outputs.at(iw);
|
||||
auto trace = input.front();
|
||||
int delay = 2;
|
||||
switch (trace->fpu_type) {
|
||||
case FpuType::FNCP:
|
||||
output.push(trace, 2);
|
||||
output.push(trace, 2+delay);
|
||||
break;
|
||||
case FpuType::FMA:
|
||||
output.push(trace, LATENCY_FMA+1);
|
||||
output.push(trace, LATENCY_FMA+delay);
|
||||
break;
|
||||
case FpuType::FDIV:
|
||||
output.push(trace, LATENCY_FDIV+1);
|
||||
output.push(trace, LATENCY_FDIV+delay);
|
||||
break;
|
||||
case FpuType::FSQRT:
|
||||
output.push(trace, LATENCY_FSQRT+1);
|
||||
output.push(trace, LATENCY_FSQRT+delay);
|
||||
break;
|
||||
case FpuType::FCVT:
|
||||
output.push(trace, LATENCY_FCVT+1);
|
||||
output.push(trace, LATENCY_FCVT+delay);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
|
@ -254,10 +258,10 @@ void SfuUnit::tick() {
|
|||
auto trace = input.front();
|
||||
auto sfu_type = trace->sfu_type;
|
||||
bool release_warp = trace->fetch_stall;
|
||||
|
||||
int delay = 2;
|
||||
switch (sfu_type) {
|
||||
case SfuType::WSPAWN:
|
||||
output.push(trace, 1);
|
||||
output.push(trace, 2+delay);
|
||||
if (trace->eop) {
|
||||
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
|
||||
release_warp = core_->wspawn(trace_data->arg1, trace_data->arg2);
|
||||
|
@ -270,10 +274,10 @@ void SfuUnit::tick() {
|
|||
case SfuType::CSRRW:
|
||||
case SfuType::CSRRS:
|
||||
case SfuType::CSRRC:
|
||||
output.push(trace, 1);
|
||||
output.push(trace, 2+delay);
|
||||
break;
|
||||
case SfuType::BAR: {
|
||||
output.push(trace, 1);
|
||||
output.push(trace, 2+delay);
|
||||
if (trace->eop) {
|
||||
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
|
||||
release_warp = core_->barrier(trace_data->arg1, trace_data->arg2, trace->wid);
|
||||
|
|
|
@ -21,7 +21,7 @@ class Operand : public SimObject<Operand> {
|
|||
private:
|
||||
static constexpr uint32_t NUM_BANKS = 4;
|
||||
uint32_t total_stalls_ = 0;
|
||||
|
||||
|
||||
public:
|
||||
SimPort<instr_trace_t*> Input;
|
||||
SimPort<instr_trace_t*> Output;
|
||||
|
@ -63,7 +63,7 @@ public:
|
|||
|
||||
total_stalls_ += stalls;
|
||||
|
||||
Output.push(trace, 1 + stalls);
|
||||
Output.push(trace, 2 + stalls);
|
||||
|
||||
DT(3, "pipeline-operands: " << *trace);
|
||||
|
||||
|
|
|
@ -26,7 +26,6 @@ public:
|
|||
RegType reg_type;
|
||||
uint32_t reg_id;
|
||||
FUType fu_type;
|
||||
SfuType sfu_type;
|
||||
uint64_t uuid;
|
||||
};
|
||||
|
||||
|
@ -48,7 +47,8 @@ public:
|
|||
}
|
||||
|
||||
bool in_use(instr_trace_t* trace) const {
|
||||
if (trace->dst_reg.type != RegType::None) {
|
||||
if (trace->wb) {
|
||||
assert(trace->dst_reg.type != RegType::None);
|
||||
if (in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).test(trace->dst_reg.idx)) {
|
||||
return true;
|
||||
}
|
||||
|
@ -65,11 +65,12 @@ public:
|
|||
|
||||
std::vector<reg_use_t> get_uses(instr_trace_t* trace) const {
|
||||
std::vector<reg_use_t> out;
|
||||
if (trace->dst_reg.type != RegType::None) {
|
||||
if (trace->wb) {
|
||||
assert(trace->dst_reg.type != RegType::None);
|
||||
if (in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).test(trace->dst_reg.idx)) {
|
||||
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({trace->dst_reg.type, trace->dst_reg.idx, owner->fu_type, owner->sfu_type, owner->uuid});
|
||||
out.push_back({trace->dst_reg.type, trace->dst_reg.idx, owner->fu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
for (uint32_t i = 0; i < trace->src_regs.size(); ++i) {
|
||||
|
@ -77,7 +78,7 @@ public:
|
|||
if (in_use_regs_.at(trace->wid).at((int)trace->src_regs[i].type).test(trace->src_regs[i].idx)) {
|
||||
uint32_t tag = (trace->src_regs[i].idx << 16) | (trace->wid << 4) | (int)trace->src_regs[i].type;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({trace->src_regs[i].type, trace->src_regs[i].idx, owner->fu_type, owner->sfu_type, owner->uuid});
|
||||
out.push_back({trace->src_regs[i].type, trace->src_regs[i].idx, owner->fu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -90,13 +91,13 @@ public:
|
|||
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
|
||||
assert(owners_.count(tag) == 0);
|
||||
owners_[tag] = trace;
|
||||
assert((int)trace->fu_type < 5);
|
||||
}
|
||||
|
||||
void release(instr_trace_t* trace) {
|
||||
assert(trace->wb);
|
||||
in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).reset(trace->dst_reg.idx);
|
||||
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
|
||||
assert(owners_.count(tag) != 0);
|
||||
owners_.erase(tag);
|
||||
}
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include <util.h>
|
||||
#include <stringutil.h>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
#include <simobject.h>
|
||||
#include "debug.h"
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue