simx perf counter updates

Blaise Tine 2022-08-02 18:53:15 -07:00
parent a711f0b6cd
commit 9143c19be3
14 changed files with 117 additions and 27 deletions

View file

@@ -147,18 +147,20 @@
`define CSR_MPM_TEX_READS_H 12'hB83
`define CSR_MPM_TEX_LAT 12'hB04 // texture latency
`define CSR_MPM_TEX_LAT_H 12'hB84
`define CSR_MPM_TEX_STALL 12'hB05 // texture stalls
`define CSR_MPM_TEX_STALL_H 12'hB85
// PERF: texture cache
`define CSR_MPM_TCACHE_READS 12'hB05 // total reads
`define CSR_MPM_TCACHE_READS_H 12'hB85
`define CSR_MPM_TCACHE_MISS_R 12'hB06 // read misses
`define CSR_MPM_TCACHE_MISS_R_H 12'hB86
`define CSR_MPM_TCACHE_BANK_ST 12'hB07 // bank stalls
`define CSR_MPM_TCACHE_BANK_ST_H 12'hB87
`define CSR_MPM_TCACHE_MSHR_ST 12'hB08 // MSHR stalls
`define CSR_MPM_TCACHE_MSHR_ST_H 12'hB88
`define CSR_MPM_TCACHE_READS 12'hB06 // total reads
`define CSR_MPM_TCACHE_READS_H 12'hB86
`define CSR_MPM_TCACHE_MISS_R 12'hB07 // read misses
`define CSR_MPM_TCACHE_MISS_R_H 12'hB87
`define CSR_MPM_TCACHE_BANK_ST 12'hB08 // bank stalls
`define CSR_MPM_TCACHE_BANK_ST_H 12'hB88
`define CSR_MPM_TCACHE_MSHR_ST 12'hB09 // MSHR stalls
`define CSR_MPM_TCACHE_MSHR_ST_H 12'hB89
// PERF: pipeline
`define CSR_MPM_TEX_ISSUE_ST 12'hB09 // issue stalls
`define CSR_MPM_TEX_ISSUE_ST_H 12'hB89
`define CSR_MPM_TEX_ISSUE_ST 12'hB0A // issue stalls
`define CSR_MPM_TEX_ISSUE_ST_H 12'hB8A
// Machine Performance-monitoring raster counters
// PERF: raster unit
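
Note on the renumbering above: the new TEX_STALL pair takes over 12'hB05/12'hB85, so the texture-cache counters and TEX_ISSUE_ST each shift up by one slot. Every MPM counter is exposed as a low/high CSR pair (0x80 apart in the defines shown here) and read 32 bits at a time. A minimal consumer-side sketch of reassembling the 64-bit value; read_csr32 is a placeholder for whatever 32-bit CSR read path is used (a csrr in kernel code, or Core::get_csr in simx), not an API from this repo:

#include <cstdint>

// Placeholder for the 32-bit CSR read path (hypothetical, not part of this repo).
extern uint32_t read_csr32(uint32_t addr);

// Reassemble a 64-bit MPM counter from its low/high CSR pair.
uint64_t read_mpm64(uint32_t addr_lo, uint32_t addr_hi) {
    uint64_t lo = read_csr32(addr_lo);   // bits [31:0]
    uint64_t hi = read_csr32(addr_hi);   // bits [63:32]
    return (hi << 32) | lo;
}

// e.g. read_mpm64(CSR_MPM_TEX_STALL, CSR_MPM_TEX_STALL_H)
// On real hardware the high half would typically be re-read to guard against a
// low-word rollover between the two reads.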

View file

@@ -1,4 +1,5 @@
RTL_DIR = ../../../../../rtl
AFU_DIR = ../../../../../afu/opae
THIRD_PARTY_DIR = ../../../../../../third_party
ifeq ($(DEVICE_FAMILY), stratix10)

View file

@@ -32,6 +32,7 @@ set_global_assignment -name DEVICE $opts(device)
set_global_assignment -name TOP_LEVEL_ENTITY $opts(top)
set_global_assignment -name PROJECT_OUTPUT_DIRECTORY bin
#set_global_assignment -name OPTIMIZATION_TECHNIQUE AREA
set_global_assignment -name NUM_PARALLEL_PROCESSORS ALL
set_global_assignment -name VERILOG_INPUT_VERSION SYSTEMVERILOG_2009
set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON

View file

@@ -541,7 +541,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
int tex_stall_cycles_ratio = (int)(100 * double(tex_stall_cycles) / cycles);
fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads);
fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat);
fprintf(stream, "PERF: raster stall cycles=%ld cycles (%d%%)\n", tex_stall_cycles, tex_stall_cycles_ratio);
fprintf(stream, "PERF: tex stall cycles=%ld cycles (%d%%)\n", tex_stall_cycles, tex_stall_cycles_ratio);
fprintf(stream, "PERF: tex issue stalls=%ld\n", tex_issue_stalls);
int tcache_read_hit_ratio = (int)((1.0 - (double(tcache_read_misses) / double(tcache_reads))) * 100);
int tcache_bank_utilization = (int)((double(tcache_reads) / double(tcache_reads + tcache_bank_stalls)) * 100);
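
The driver derives its percentages directly from the raw counters, as in tcache_read_hit_ratio above. A hedged sketch of the same arithmetic with divide-by-zero guards; the local names are illustrative, not necessarily the driver's:

// Sketch: derived texture metrics; guard the denominators that can be zero
// when a workload never touches the texture unit.
int tex_avg_lat    = tex_mem_reads ? (int)(double(tex_mem_lat) / double(tex_mem_reads)) : 0;
int tex_stall_pct  = cycles ? (int)(100.0 * double(tex_stall_cycles) / double(cycles)) : 0;
int tcache_hit_pct = tcache_reads
    ? (int)(100.0 * (1.0 - double(tcache_read_misses) / double(tcache_reads)))
    : 0;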

View file

@@ -265,7 +265,7 @@ private:
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
uint64_t pending_fill_reqs_;
public:
Impl(CacheSim* simobject, const Config& config)

View file

@@ -280,6 +280,7 @@ Cluster::PerfStats Cluster::perf_stats() const {
perf.tcache = tcaches_->perf_stats();
perf.ocache = ocaches_->perf_stats();
perf.rcache = rcaches_->perf_stats();
perf.l2cache = l2cache_->perf_stats();
for (auto sharedmem : sharedmems_) {
perf.sharedmem += sharedmem->perf_stats();

View file

@@ -39,6 +39,7 @@ public:
CacheSim::PerfStats icache;
CacheSim::PerfStats dcache;
SharedMem::PerfStats sharedmem;
CacheSim::PerfStats l2cache;
CacheSim::PerfStats tcache;
CacheSim::PerfStats ocache;
CacheSim::PerfStats rcache;
@@ -50,6 +51,7 @@ public:
this->icache += rhs.icache;
this->dcache += rhs.dcache;
this->sharedmem += rhs.sharedmem;
this->l2cache += rhs.l2cache;
this->tcache += rhs.tcache;
this->ocache += rhs.ocache;
this->rcache += rhs.rcache;

View file

@@ -102,6 +102,7 @@ void Core::reset() {
ecall_ = false;
ebreak_ = false;
perf_stats_ = PerfStats();
pending_ifetches_ = 0;
}
void Core::attach_ram(RAM* ram) {
@@ -159,6 +160,8 @@ void Core::schedule() {
}
void Core::fetch() {
perf_stats_.ifetch_latency += pending_ifetches_;
// handle icache response
auto& icache_rsp_port = icache_rsp_ports.at(0);
if (!icache_rsp_port.empty()){
@@ -168,6 +171,7 @@ void Core::fetch() {
DT(3, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
pending_icache_.release(mem_rsp.tag);
icache_rsp_port.pop();
--pending_ifetches_;
}
// send icache request
@@ -180,9 +184,11 @@ void Core::fetch() {
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
icache_req_ports.at(0).send(mem_req, 1);
DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop();
}
DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop();
++pending_ifetches_;
++perf_stats_.ifetches;
}
}
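
The new pending_ifetches_ counter turns ifetch_latency into an occupancy sum: fetch() adds the number of in-flight i-cache requests every cycle, increments on each request and decrements on each response, so the average fetch latency is ifetch_latency / ifetches. A stand-alone sketch of the pattern (not the Core class itself; pending_loads_ applies the same idea to load_latency in the LSU further down):

#include <cstdint>

// Occupancy-based average-latency counter.
struct LatencyCounter {
    uint64_t pending   = 0;  // requests currently in flight
    uint64_t completed = 0;  // responses received so far
    uint64_t occupancy = 0;  // sum of `pending` over all cycles
    void tick()   { occupancy += pending; }    // once per simulated cycle
    void issue()  { ++pending; }               // when a request is sent
    void retire() { ++completed; --pending; }  // when its response arrives
    double avg_latency() const {
        return completed ? double(occupancy) / double(completed) : 0.0;
    }
};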
void Core::decode() {
@@ -214,8 +220,6 @@ void Core::decode() {
perf_stats_.loads += active_threads;
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE)
perf_stats_.stores += active_threads;
if (trace->exe_type == ExeType::ALU && trace->alu_type == AluType::BRANCH)
perf_stats_.branches += active_threads;
DT(3, "pipeline-decode: " << *trace);
@@ -483,13 +487,20 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case CSR_MPM_GPU_ST: return perf_stats_.gpu_stalls & 0xffffffff;
case CSR_MPM_GPU_ST_H: return perf_stats_.gpu_stalls >> 32;
case CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
case CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
case CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
case CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
case CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
case CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
case CSR_MPM_BRANCHES: return perf_stats_.branches & 0xffffffff;
case CSR_MPM_BRANCHES_H:return perf_stats_.branches >> 32;
case CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff;
case CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32;
case CSR_MPM_LOAD_LAT: return perf_stats_.load_latency & 0xffffffff;
case CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32;
}
} break;
case DCR_MPM_CLASS_MEM: {
switch (addr) {
case CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
case CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
case CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
@@ -515,6 +526,32 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
case CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32;
case CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
case CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
case CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff;
case CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32;
case CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff;
case CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32;
case CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff;
case CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32;
case CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff;
case CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
case CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff;
case CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
case CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
case CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
case CSR_MPM_L3CACHE_WRITES: return proc_perf.l3cache.writes & 0xffffffff;
case CSR_MPM_L3CACHE_WRITES_H: return proc_perf.l3cache.writes >> 32;
case CSR_MPM_L3CACHE_MISS_R: return proc_perf.l3cache.read_misses & 0xffffffff;
case CSR_MPM_L3CACHE_MISS_R_H: return proc_perf.l3cache.read_misses >> 32;
case CSR_MPM_L3CACHE_MISS_W: return proc_perf.l3cache.write_misses & 0xffffffff;
case CSR_MPM_L3CACHE_MISS_W_H: return proc_perf.l3cache.write_misses >> 32;
case CSR_MPM_L3CACHE_BANK_ST: return proc_perf.l3cache.bank_stalls & 0xffffffff;
case CSR_MPM_L3CACHE_BANK_ST_H:return proc_perf.l3cache.bank_stalls >> 32;
case CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
case CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
case CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
@@ -529,6 +566,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case CSR_MPM_TEX_READS_H: return proc_perf.clusters.tex_unit.reads >> 32;
case CSR_MPM_TEX_LAT: return proc_perf.clusters.tex_unit.latency & 0xffffffff;
case CSR_MPM_TEX_LAT_H: return proc_perf.clusters.tex_unit.latency >> 32;
case CSR_MPM_TEX_STALL: return proc_perf.clusters.tex_unit.stalls & 0xffffffff;
case CSR_MPM_TEX_STALL_H: return proc_perf.clusters.tex_unit.stalls >> 32;
case CSR_MPM_TCACHE_READS: return proc_perf.clusters.tcache.reads & 0xffffffff;
case CSR_MPM_TCACHE_READS_H: return proc_perf.clusters.tcache.reads >> 32;
@@ -538,6 +577,9 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case CSR_MPM_TCACHE_BANK_ST_H:return proc_perf.clusters.tcache.bank_stalls >> 32;
case CSR_MPM_TCACHE_MSHR_ST: return proc_perf.clusters.tcache.mshr_stalls & 0xffffffff;
case CSR_MPM_TCACHE_MSHR_ST_H:return proc_perf.clusters.tcache.mshr_stalls >> 32;
case CSR_MPM_TEX_ISSUE_ST: return perf_stats_.tex_issue_stalls & 0xffffffff;
case CSR_MPM_TEX_ISSUE_ST_H: return perf_stats_.tex_issue_stalls >> 32;
}
} break;
case DCR_MPM_CLASS_RASTER: {
@@ -557,6 +599,9 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case CSR_MPM_RCACHE_BANK_ST_H:return proc_perf.clusters.rcache.bank_stalls >> 32;
case CSR_MPM_RCACHE_MSHR_ST: return proc_perf.clusters.rcache.mshr_stalls & 0xffffffff;
case CSR_MPM_RCACHE_MSHR_ST_H:return proc_perf.clusters.rcache.mshr_stalls >> 32;
case CSR_MPM_RASTER_ISSUE_ST: return perf_stats_.raster_issue_stalls & 0xffffffff;
case CSR_MPM_RASTER_ISSUE_ST_H: return perf_stats_.raster_issue_stalls >> 32;
default:
return 0;
}
@@ -584,6 +629,9 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case CSR_MPM_OCACHE_BANK_ST_H:return proc_perf.clusters.ocache.bank_stalls >> 32;
case CSR_MPM_OCACHE_MSHR_ST: return proc_perf.clusters.ocache.mshr_stalls & 0xffffffff;
case CSR_MPM_OCACHE_MSHR_ST_H:return proc_perf.clusters.ocache.mshr_stalls >> 32;
case CSR_MPM_ROP_ISSUE_ST: return perf_stats_.rop_issue_stalls & 0xffffffff;
case CSR_MPM_ROP_ISSUE_ST_H: return perf_stats_.rop_issue_stalls >> 32;
default:
return 0;
}
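
All of the cases added above follow the same two-entry pattern: the base CSR address returns the low 32 bits of a counter and the matching _H address returns the high 32 bits. A hypothetical helper, shown only to make that pattern explicit (it does not exist in the code):

#include <cstdint>

// Hypothetical helper capturing the low/high split used throughout get_csr().
static inline uint32_t csr_mpm_part(uint32_t addr, uint32_t addr_lo, uint64_t value) {
    return (addr == addr_lo) ? uint32_t(value & 0xffffffff)  // CSR_MPM_xxx
                             : uint32_t(value >> 32);        // CSR_MPM_xxx_H
}

// e.g.
//   case CSR_MPM_L2CACHE_READS:
//   case CSR_MPM_L2CACHE_READS_H:
//     return csr_mpm_part(addr, CSR_MPM_L2CACHE_READS, proc_perf.clusters.l2cache.reads);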

View file

@@ -41,9 +41,14 @@ public:
uint64_t csr_stalls;
uint64_t fpu_stalls;
uint64_t gpu_stalls;
uint64_t tex_issue_stalls;
uint64_t rop_issue_stalls;
uint64_t raster_issue_stalls;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
uint64_t branches;
uint64_t ifetch_latency;
uint64_t load_latency;
PerfStats()
: instrs(0)
@@ -54,9 +59,14 @@ public:
, csr_stalls(0)
, fpu_stalls(0)
, gpu_stalls(0)
, tex_issue_stalls(0)
, rop_issue_stalls(0)
, raster_issue_stalls(0)
, ifetches(0)
, loads(0)
, stores(0)
, branches(0)
, ifetch_latency(0)
, load_latency(0)
{}
};
@@ -165,6 +175,8 @@ private:
bool ecall_;
bool ebreak_;
uint64_t pending_ifetches_;
std::unordered_map<int, std::stringstream> print_bufs_;
PerfStats perf_stats_;

View file

@@ -27,15 +27,19 @@ LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, pending_rd_reqs_(LSUQ_SIZE)
, num_threads_(core->arch().num_threads())
, pending_loads_(0)
, fence_lock_(false)
{}
void LsuUnit::reset() {
pending_rd_reqs_.clear();
pending_loads_ = 0;
fence_lock_ = false;
}
void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
// handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
@@ -52,7 +56,8 @@ void LsuUnit::tick() {
Output.send(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
dcache_rsp_port.pop();
--pending_loads_;
}
// handle shared memory response
@@ -72,6 +77,7 @@ void LsuUnit::tick() {
pending_rd_reqs_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
--pending_loads_;
}
if (fence_lock_) {
@@ -81,7 +87,7 @@ void LsuUnit::tick() {
Output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, "fence-unlock: " << fence_state_);
}
}
// check input queue
if (Input.empty())
@@ -156,6 +162,9 @@ void LsuUnit::tick() {
dcache_req_port.send(mem_req, 2);
DT(3, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.addr_type << ", " << *trace);
++pending_loads_;
++core_->perf_stats_.loads;
if (is_dup)
break;
@@ -165,6 +174,8 @@ void LsuUnit::tick() {
if (is_write) {
pending_rd_reqs_.release(tag);
Output.send(trace, 1);
++core_->perf_stats_.stores;
}
// remove input
@@ -279,7 +290,9 @@ void GpuUnit::tick() {
auto trace = Input.front();
switch (trace->gpu_type) {
auto gpu_type = trace->gpu_type;
switch (gpu_type) {
case GpuType::TMC: {
Output.send(trace, 1);
auto trace_data = std::dynamic_pointer_cast<GPUTraceData>(trace->data);
@@ -325,6 +338,12 @@ void GpuUnit::tick() {
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.gpu_stalls += (SimPlatform::instance().cycles() - time);
auto stalls = (SimPlatform::instance().cycles() - time);
if (gpu_type == GpuType::TEX) core_->perf_stats_.tex_issue_stalls += stalls;
if (gpu_type == GpuType::ROP) core_->perf_stats_.rop_issue_stalls += stalls;
if (gpu_type == GpuType::RASTER) core_->perf_stats_.raster_issue_stalls += stalls;
core_->perf_stats_.gpu_stalls += stalls;
}
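
With gpu_stalls now also bucketed by op type, a host-side report can break issue stalls down per unit using the same ratio math vx_dump_perf applies to the texture unit. A sketch under those assumptions; the names are illustrative and the raw values would come from the new CSR_MPM_*_ISSUE_ST counter pairs:

#include <cinttypes>
#include <cstdio>

// Sketch: per-unit issue-stall breakdown, mirroring the percentage math in vx_dump_perf.
void dump_issue_stalls(FILE* stream, uint64_t cycles,
                       uint64_t tex_issue_stalls,
                       uint64_t rop_issue_stalls,
                       uint64_t raster_issue_stalls) {
    auto pct = [&](uint64_t v) { return cycles ? (int)(100.0 * double(v) / double(cycles)) : 0; };
    fprintf(stream, "PERF: tex issue stalls=%" PRIu64 " (%d%%)\n",    tex_issue_stalls,    pct(tex_issue_stalls));
    fprintf(stream, "PERF: rop issue stalls=%" PRIu64 " (%d%%)\n",    rop_issue_stalls,    pct(rop_issue_stalls));
    fprintf(stream, "PERF: raster issue stalls=%" PRIu64 " (%d%%)\n", raster_issue_stalls, pct(raster_issue_stalls));
}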

View file

@@ -53,6 +53,7 @@ private:
HashTable<pending_req_t> pending_rd_reqs_;
uint32_t num_threads_;
pipeline_trace_t* fence_state_;
uint64_t pending_loads_;
bool fence_lock_;
public:

View file

@@ -100,6 +100,7 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
perf.mem_reads = perf_mem_reads_;
perf.mem_writes = perf_mem_writes_;
perf.mem_latency = perf_mem_pending_reads_;
perf.l3cache = l3cache_->perf_stats();
for (auto cluster : clusters_) {
perf.clusters += cluster->perf_stats();
}

View file

@@ -29,6 +29,7 @@ public:
uint64_t mem_reads;
uint64_t mem_writes;
uint64_t mem_latency;
CacheSim::PerfStats l3cache;
Cluster::PerfStats clusters;
PerfStats()

View file

@@ -566,7 +566,8 @@ public:
// check input trace
if (simobject_->Input.empty())
return;
perf_stats_.stalls += simobject_->Input.stalled();
perf_stats_.stalls += simobject_->Input.stalled();
auto trace = simobject_->Input.front();
auto data = std::dynamic_pointer_cast<RopUnit::TraceData>(trace->data);
data->cid = trace->cid;