mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 22:07:41 -04:00
simx perf counters updates
This commit is contained in:
parent
a711f0b6cd
commit
9143c19be3
14 changed files with 117 additions and 27 deletions
|
@ -147,18 +147,20 @@
|
||||||
`define CSR_MPM_TEX_READS_H 12'hB83
|
`define CSR_MPM_TEX_READS_H 12'hB83
|
||||||
`define CSR_MPM_TEX_LAT 12'hB04 // texture latency
|
`define CSR_MPM_TEX_LAT 12'hB04 // texture latency
|
||||||
`define CSR_MPM_TEX_LAT_H 12'hB84
|
`define CSR_MPM_TEX_LAT_H 12'hB84
|
||||||
|
`define CSR_MPM_TEX_STALL 12'hB05 // texture stalls
|
||||||
|
`define CSR_MPM_TEX_STALL_H 12'hB85
|
||||||
// PERF: texture cache
|
// PERF: texture cache
|
||||||
`define CSR_MPM_TCACHE_READS 12'hB05 // total reads
|
`define CSR_MPM_TCACHE_READS 12'hB06 // total reads
|
||||||
`define CSR_MPM_TCACHE_READS_H 12'hB85
|
`define CSR_MPM_TCACHE_READS_H 12'hB86
|
||||||
`define CSR_MPM_TCACHE_MISS_R 12'hB06 // read misses
|
`define CSR_MPM_TCACHE_MISS_R 12'hB07 // read misses
|
||||||
`define CSR_MPM_TCACHE_MISS_R_H 12'hB86
|
`define CSR_MPM_TCACHE_MISS_R_H 12'hB87
|
||||||
`define CSR_MPM_TCACHE_BANK_ST 12'hB07 // bank stalls
|
`define CSR_MPM_TCACHE_BANK_ST 12'hB08 // bank stalls
|
||||||
`define CSR_MPM_TCACHE_BANK_ST_H 12'hB87
|
`define CSR_MPM_TCACHE_BANK_ST_H 12'hB88
|
||||||
`define CSR_MPM_TCACHE_MSHR_ST 12'hB08 // MSHR stalls
|
`define CSR_MPM_TCACHE_MSHR_ST 12'hB09 // MSHR stalls
|
||||||
`define CSR_MPM_TCACHE_MSHR_ST_H 12'hB88
|
`define CSR_MPM_TCACHE_MSHR_ST_H 12'hB89
|
||||||
// PERF: pipeline
|
// PERF: pipeline
|
||||||
`define CSR_MPM_TEX_ISSUE_ST 12'hB09 // issue stalls
|
`define CSR_MPM_TEX_ISSUE_ST 12'hB0A // issue stalls
|
||||||
`define CSR_MPM_TEX_ISSUE_ST_H 12'hB89
|
`define CSR_MPM_TEX_ISSUE_ST_H 12'hB8A
|
||||||
|
|
||||||
// Machine Performance-monitoring raster counters
|
// Machine Performance-monitoring raster counters
|
||||||
// PERF: raster unit
|
// PERF: raster unit
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
RTL_DIR = ../../../../../rtl
|
RTL_DIR = ../../../../../rtl
|
||||||
|
AFU_DIR = ../../../../../afu/opae
|
||||||
THIRD_PARTY_DIR = ../../../../../../third_party
|
THIRD_PARTY_DIR = ../../../../../../third_party
|
||||||
|
|
||||||
ifeq ($(DEVICE_FAMILY), stratix10)
|
ifeq ($(DEVICE_FAMILY), stratix10)
|
||||||
|
|
|
@ -32,6 +32,7 @@ set_global_assignment -name DEVICE $opts(device)
|
||||||
set_global_assignment -name TOP_LEVEL_ENTITY $opts(top)
|
set_global_assignment -name TOP_LEVEL_ENTITY $opts(top)
|
||||||
set_global_assignment -name PROJECT_OUTPUT_DIRECTORY bin
|
set_global_assignment -name PROJECT_OUTPUT_DIRECTORY bin
|
||||||
|
|
||||||
|
#set_global_assignment -name OPTIMIZATION_TECHNIQUE AREA
|
||||||
set_global_assignment -name NUM_PARALLEL_PROCESSORS ALL
|
set_global_assignment -name NUM_PARALLEL_PROCESSORS ALL
|
||||||
set_global_assignment -name VERILOG_INPUT_VERSION SYSTEMVERILOG_2009
|
set_global_assignment -name VERILOG_INPUT_VERSION SYSTEMVERILOG_2009
|
||||||
set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON
|
set_global_assignment -name ADD_PASS_THROUGH_LOGIC_TO_INFERRED_RAMS ON
|
||||||
|
|
|
@ -541,7 +541,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
||||||
int tex_stall_cycles_ratio = (int)(100 * double(tex_stall_cycles) / cycles);
|
int tex_stall_cycles_ratio = (int)(100 * double(tex_stall_cycles) / cycles);
|
||||||
fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads);
|
fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads);
|
||||||
fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat);
|
fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat);
|
||||||
fprintf(stream, "PERF: raster stall cycles=%ld cycles (%d%%)\n", tex_stall_cycles, tex_stall_cycles_ratio);
|
fprintf(stream, "PERF: tex stall cycles=%ld cycles (%d%%)\n", tex_stall_cycles, tex_stall_cycles_ratio);
|
||||||
fprintf(stream, "PERF: tex issue stalls=%ld\n", tex_issue_stalls);
|
fprintf(stream, "PERF: tex issue stalls=%ld\n", tex_issue_stalls);
|
||||||
int tcache_read_hit_ratio = (int)((1.0 - (double(tcache_read_misses) / double(tcache_reads))) * 100);
|
int tcache_read_hit_ratio = (int)((1.0 - (double(tcache_read_misses) / double(tcache_reads))) * 100);
|
||||||
int tcache_bank_utilization = (int)((double(tcache_reads) / double(tcache_reads + tcache_bank_stalls)) * 100);
|
int tcache_bank_utilization = (int)((double(tcache_reads) / double(tcache_reads + tcache_bank_stalls)) * 100);
|
||||||
|
|
|
@ -265,7 +265,7 @@ private:
|
||||||
PerfStats perf_stats_;
|
PerfStats perf_stats_;
|
||||||
uint64_t pending_read_reqs_;
|
uint64_t pending_read_reqs_;
|
||||||
uint64_t pending_write_reqs_;
|
uint64_t pending_write_reqs_;
|
||||||
uint64_t pending_fill_reqs_;
|
uint64_t pending_fill_reqs_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
Impl(CacheSim* simobject, const Config& config)
|
Impl(CacheSim* simobject, const Config& config)
|
||||||
|
|
|
@ -280,6 +280,7 @@ Cluster::PerfStats Cluster::perf_stats() const {
|
||||||
perf.tcache = tcaches_->perf_stats();
|
perf.tcache = tcaches_->perf_stats();
|
||||||
perf.ocache = ocaches_->perf_stats();
|
perf.ocache = ocaches_->perf_stats();
|
||||||
perf.rcache = rcaches_->perf_stats();
|
perf.rcache = rcaches_->perf_stats();
|
||||||
|
perf.l2cache = l2cache_->perf_stats();
|
||||||
|
|
||||||
for (auto sharedmem : sharedmems_) {
|
for (auto sharedmem : sharedmems_) {
|
||||||
perf.sharedmem += sharedmem->perf_stats();
|
perf.sharedmem += sharedmem->perf_stats();
|
||||||
|
|
|
@ -39,6 +39,7 @@ public:
|
||||||
CacheSim::PerfStats icache;
|
CacheSim::PerfStats icache;
|
||||||
CacheSim::PerfStats dcache;
|
CacheSim::PerfStats dcache;
|
||||||
SharedMem::PerfStats sharedmem;
|
SharedMem::PerfStats sharedmem;
|
||||||
|
CacheSim::PerfStats l2cache;
|
||||||
CacheSim::PerfStats tcache;
|
CacheSim::PerfStats tcache;
|
||||||
CacheSim::PerfStats ocache;
|
CacheSim::PerfStats ocache;
|
||||||
CacheSim::PerfStats rcache;
|
CacheSim::PerfStats rcache;
|
||||||
|
@ -50,6 +51,7 @@ public:
|
||||||
this->icache += rhs.icache;
|
this->icache += rhs.icache;
|
||||||
this->dcache += rhs.dcache;
|
this->dcache += rhs.dcache;
|
||||||
this->sharedmem += rhs.sharedmem;
|
this->sharedmem += rhs.sharedmem;
|
||||||
|
this->l2cache += rhs.l2cache;
|
||||||
this->tcache += rhs.tcache;
|
this->tcache += rhs.tcache;
|
||||||
this->ocache += rhs.ocache;
|
this->ocache += rhs.ocache;
|
||||||
this->rcache += rhs.rcache;
|
this->rcache += rhs.rcache;
|
||||||
|
|
|
@ -102,6 +102,7 @@ void Core::reset() {
|
||||||
ecall_ = false;
|
ecall_ = false;
|
||||||
ebreak_ = false;
|
ebreak_ = false;
|
||||||
perf_stats_ = PerfStats();
|
perf_stats_ = PerfStats();
|
||||||
|
pending_ifetches_ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void Core::attach_ram(RAM* ram) {
|
void Core::attach_ram(RAM* ram) {
|
||||||
|
@ -159,6 +160,8 @@ void Core::schedule() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void Core::fetch() {
|
void Core::fetch() {
|
||||||
|
perf_stats_.ifetch_latency += pending_ifetches_;
|
||||||
|
|
||||||
// handle icache response
|
// handle icache response
|
||||||
auto& icache_rsp_port = icache_rsp_ports.at(0);
|
auto& icache_rsp_port = icache_rsp_ports.at(0);
|
||||||
if (!icache_rsp_port.empty()){
|
if (!icache_rsp_port.empty()){
|
||||||
|
@ -168,6 +171,7 @@ void Core::fetch() {
|
||||||
DT(3, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
|
DT(3, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
|
||||||
pending_icache_.release(mem_rsp.tag);
|
pending_icache_.release(mem_rsp.tag);
|
||||||
icache_rsp_port.pop();
|
icache_rsp_port.pop();
|
||||||
|
--pending_ifetches_;
|
||||||
}
|
}
|
||||||
|
|
||||||
// send icache request
|
// send icache request
|
||||||
|
@ -180,9 +184,11 @@ void Core::fetch() {
|
||||||
mem_req.cid = trace->cid;
|
mem_req.cid = trace->cid;
|
||||||
mem_req.uuid = trace->uuid;
|
mem_req.uuid = trace->uuid;
|
||||||
icache_req_ports.at(0).send(mem_req, 1);
|
icache_req_ports.at(0).send(mem_req, 1);
|
||||||
DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||||
fetch_latch_.pop();
|
fetch_latch_.pop();
|
||||||
}
|
++pending_ifetches_;
|
||||||
|
++perf_stats_.ifetches;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Core::decode() {
|
void Core::decode() {
|
||||||
|
@ -214,8 +220,6 @@ void Core::decode() {
|
||||||
perf_stats_.loads += active_threads;
|
perf_stats_.loads += active_threads;
|
||||||
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE)
|
if (trace->exe_type == ExeType::LSU && trace->lsu_type == LsuType::STORE)
|
||||||
perf_stats_.stores += active_threads;
|
perf_stats_.stores += active_threads;
|
||||||
if (trace->exe_type == ExeType::ALU && trace->alu_type == AluType::BRANCH)
|
|
||||||
perf_stats_.branches += active_threads;
|
|
||||||
|
|
||||||
DT(3, "pipeline-decode: " << *trace);
|
DT(3, "pipeline-decode: " << *trace);
|
||||||
|
|
||||||
|
@ -483,13 +487,20 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||||
case CSR_MPM_GPU_ST: return perf_stats_.gpu_stalls & 0xffffffff;
|
case CSR_MPM_GPU_ST: return perf_stats_.gpu_stalls & 0xffffffff;
|
||||||
case CSR_MPM_GPU_ST_H: return perf_stats_.gpu_stalls >> 32;
|
case CSR_MPM_GPU_ST_H: return perf_stats_.gpu_stalls >> 32;
|
||||||
|
|
||||||
|
case CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
||||||
|
case CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
||||||
case CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
case CSR_MPM_LOADS: return perf_stats_.loads & 0xffffffff;
|
||||||
case CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
|
case CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
|
||||||
case CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
|
case CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
|
||||||
case CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
|
case CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
|
||||||
case CSR_MPM_BRANCHES: return perf_stats_.branches & 0xffffffff;
|
case CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff;
|
||||||
case CSR_MPM_BRANCHES_H:return perf_stats_.branches >> 32;
|
case CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32;
|
||||||
|
case CSR_MPM_LOAD_LAT: return perf_stats_.load_latency & 0xffffffff;
|
||||||
|
case CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32;
|
||||||
|
}
|
||||||
|
} break;
|
||||||
|
case DCR_MPM_CLASS_MEM: {
|
||||||
|
switch (addr) {
|
||||||
case CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
|
case CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
|
||||||
case CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
|
case CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
|
||||||
case CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
|
case CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
|
||||||
|
@ -515,6 +526,32 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||||
case CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
|
case CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
|
||||||
case CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32;
|
case CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32;
|
||||||
|
|
||||||
|
case CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
|
||||||
|
case CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
|
||||||
|
case CSR_MPM_L2CACHE_WRITES: return proc_perf.clusters.l2cache.writes & 0xffffffff;
|
||||||
|
case CSR_MPM_L2CACHE_WRITES_H: return proc_perf.clusters.l2cache.writes >> 32;
|
||||||
|
case CSR_MPM_L2CACHE_MISS_R: return proc_perf.clusters.l2cache.read_misses & 0xffffffff;
|
||||||
|
case CSR_MPM_L2CACHE_MISS_R_H: return proc_perf.clusters.l2cache.read_misses >> 32;
|
||||||
|
case CSR_MPM_L2CACHE_MISS_W: return proc_perf.clusters.l2cache.write_misses & 0xffffffff;
|
||||||
|
case CSR_MPM_L2CACHE_MISS_W_H: return proc_perf.clusters.l2cache.write_misses >> 32;
|
||||||
|
case CSR_MPM_L2CACHE_BANK_ST: return proc_perf.clusters.l2cache.bank_stalls & 0xffffffff;
|
||||||
|
case CSR_MPM_L2CACHE_BANK_ST_H:return proc_perf.clusters.l2cache.bank_stalls >> 32;
|
||||||
|
case CSR_MPM_L2CACHE_MSHR_ST: return proc_perf.clusters.l2cache.mshr_stalls & 0xffffffff;
|
||||||
|
case CSR_MPM_L2CACHE_MSHR_ST_H:return proc_perf.clusters.l2cache.mshr_stalls >> 32;
|
||||||
|
|
||||||
|
case CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
|
||||||
|
case CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
|
||||||
|
case CSR_MPM_L3CACHE_WRITES: return proc_perf.l3cache.writes & 0xffffffff;
|
||||||
|
case CSR_MPM_L3CACHE_WRITES_H: return proc_perf.l3cache.writes >> 32;
|
||||||
|
case CSR_MPM_L3CACHE_MISS_R: return proc_perf.l3cache.read_misses & 0xffffffff;
|
||||||
|
case CSR_MPM_L3CACHE_MISS_R_H: return proc_perf.l3cache.read_misses >> 32;
|
||||||
|
case CSR_MPM_L3CACHE_MISS_W: return proc_perf.l3cache.write_misses & 0xffffffff;
|
||||||
|
case CSR_MPM_L3CACHE_MISS_W_H: return proc_perf.l3cache.write_misses >> 32;
|
||||||
|
case CSR_MPM_L3CACHE_BANK_ST: return proc_perf.l3cache.bank_stalls & 0xffffffff;
|
||||||
|
case CSR_MPM_L3CACHE_BANK_ST_H:return proc_perf.l3cache.bank_stalls >> 32;
|
||||||
|
case CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
|
||||||
|
case CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
|
||||||
|
|
||||||
case CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
|
case CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
|
||||||
case CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
case CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
||||||
case CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
case CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
||||||
|
@ -529,6 +566,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||||
case CSR_MPM_TEX_READS_H: return proc_perf.clusters.tex_unit.reads >> 32;
|
case CSR_MPM_TEX_READS_H: return proc_perf.clusters.tex_unit.reads >> 32;
|
||||||
case CSR_MPM_TEX_LAT: return proc_perf.clusters.tex_unit.latency & 0xffffffff;
|
case CSR_MPM_TEX_LAT: return proc_perf.clusters.tex_unit.latency & 0xffffffff;
|
||||||
case CSR_MPM_TEX_LAT_H: return proc_perf.clusters.tex_unit.latency >> 32;
|
case CSR_MPM_TEX_LAT_H: return proc_perf.clusters.tex_unit.latency >> 32;
|
||||||
|
case CSR_MPM_TEX_STALL: return proc_perf.clusters.tex_unit.stalls & 0xffffffff;
|
||||||
|
case CSR_MPM_TEX_STALL_H: return proc_perf.clusters.tex_unit.stalls >> 32;
|
||||||
|
|
||||||
case CSR_MPM_TCACHE_READS: return proc_perf.clusters.tcache.reads & 0xffffffff;
|
case CSR_MPM_TCACHE_READS: return proc_perf.clusters.tcache.reads & 0xffffffff;
|
||||||
case CSR_MPM_TCACHE_READS_H: return proc_perf.clusters.tcache.reads >> 32;
|
case CSR_MPM_TCACHE_READS_H: return proc_perf.clusters.tcache.reads >> 32;
|
||||||
|
@ -538,6 +577,9 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||||
case CSR_MPM_TCACHE_BANK_ST_H:return proc_perf.clusters.tcache.bank_stalls >> 32;
|
case CSR_MPM_TCACHE_BANK_ST_H:return proc_perf.clusters.tcache.bank_stalls >> 32;
|
||||||
case CSR_MPM_TCACHE_MSHR_ST: return proc_perf.clusters.tcache.mshr_stalls & 0xffffffff;
|
case CSR_MPM_TCACHE_MSHR_ST: return proc_perf.clusters.tcache.mshr_stalls & 0xffffffff;
|
||||||
case CSR_MPM_TCACHE_MSHR_ST_H:return proc_perf.clusters.tcache.mshr_stalls >> 32;
|
case CSR_MPM_TCACHE_MSHR_ST_H:return proc_perf.clusters.tcache.mshr_stalls >> 32;
|
||||||
|
|
||||||
|
case CSR_MPM_TEX_ISSUE_ST: return perf_stats_.tex_issue_stalls & 0xffffffff;
|
||||||
|
case CSR_MPM_TEX_ISSUE_ST_H: return perf_stats_.tex_issue_stalls >> 32;
|
||||||
}
|
}
|
||||||
} break;
|
} break;
|
||||||
case DCR_MPM_CLASS_RASTER: {
|
case DCR_MPM_CLASS_RASTER: {
|
||||||
|
@ -557,6 +599,9 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||||
case CSR_MPM_RCACHE_BANK_ST_H:return proc_perf.clusters.rcache.bank_stalls >> 32;
|
case CSR_MPM_RCACHE_BANK_ST_H:return proc_perf.clusters.rcache.bank_stalls >> 32;
|
||||||
case CSR_MPM_RCACHE_MSHR_ST: return proc_perf.clusters.rcache.mshr_stalls & 0xffffffff;
|
case CSR_MPM_RCACHE_MSHR_ST: return proc_perf.clusters.rcache.mshr_stalls & 0xffffffff;
|
||||||
case CSR_MPM_RCACHE_MSHR_ST_H:return proc_perf.clusters.rcache.mshr_stalls >> 32;
|
case CSR_MPM_RCACHE_MSHR_ST_H:return proc_perf.clusters.rcache.mshr_stalls >> 32;
|
||||||
|
|
||||||
|
case CSR_MPM_RASTER_ISSUE_ST: return perf_stats_.raster_issue_stalls & 0xffffffff;
|
||||||
|
case CSR_MPM_RASTER_ISSUE_ST_H: return perf_stats_.raster_issue_stalls >> 32;
|
||||||
default:
|
default:
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -584,6 +629,9 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||||
case CSR_MPM_OCACHE_BANK_ST_H:return proc_perf.clusters.ocache.bank_stalls >> 32;
|
case CSR_MPM_OCACHE_BANK_ST_H:return proc_perf.clusters.ocache.bank_stalls >> 32;
|
||||||
case CSR_MPM_OCACHE_MSHR_ST: return proc_perf.clusters.ocache.mshr_stalls & 0xffffffff;
|
case CSR_MPM_OCACHE_MSHR_ST: return proc_perf.clusters.ocache.mshr_stalls & 0xffffffff;
|
||||||
case CSR_MPM_OCACHE_MSHR_ST_H:return proc_perf.clusters.ocache.mshr_stalls >> 32;
|
case CSR_MPM_OCACHE_MSHR_ST_H:return proc_perf.clusters.ocache.mshr_stalls >> 32;
|
||||||
|
|
||||||
|
case CSR_MPM_ROP_ISSUE_ST: return perf_stats_.rop_issue_stalls & 0xffffffff;
|
||||||
|
case CSR_MPM_ROP_ISSUE_ST_H: return perf_stats_.rop_issue_stalls >> 32;
|
||||||
default:
|
default:
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,9 +41,14 @@ public:
|
||||||
uint64_t csr_stalls;
|
uint64_t csr_stalls;
|
||||||
uint64_t fpu_stalls;
|
uint64_t fpu_stalls;
|
||||||
uint64_t gpu_stalls;
|
uint64_t gpu_stalls;
|
||||||
|
uint64_t tex_issue_stalls;
|
||||||
|
uint64_t rop_issue_stalls;
|
||||||
|
uint64_t raster_issue_stalls;
|
||||||
|
uint64_t ifetches;
|
||||||
uint64_t loads;
|
uint64_t loads;
|
||||||
uint64_t stores;
|
uint64_t stores;
|
||||||
uint64_t branches;
|
uint64_t ifetch_latency;
|
||||||
|
uint64_t load_latency;
|
||||||
|
|
||||||
PerfStats()
|
PerfStats()
|
||||||
: instrs(0)
|
: instrs(0)
|
||||||
|
@ -54,9 +59,14 @@ public:
|
||||||
, csr_stalls(0)
|
, csr_stalls(0)
|
||||||
, fpu_stalls(0)
|
, fpu_stalls(0)
|
||||||
, gpu_stalls(0)
|
, gpu_stalls(0)
|
||||||
|
, tex_issue_stalls(0)
|
||||||
|
, rop_issue_stalls(0)
|
||||||
|
, raster_issue_stalls(0)
|
||||||
|
, ifetches(0)
|
||||||
, loads(0)
|
, loads(0)
|
||||||
, stores(0)
|
, stores(0)
|
||||||
, branches(0)
|
, ifetch_latency(0)
|
||||||
|
, load_latency(0)
|
||||||
{}
|
{}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -165,6 +175,8 @@ private:
|
||||||
bool ecall_;
|
bool ecall_;
|
||||||
bool ebreak_;
|
bool ebreak_;
|
||||||
|
|
||||||
|
uint64_t pending_ifetches_;
|
||||||
|
|
||||||
std::unordered_map<int, std::stringstream> print_bufs_;
|
std::unordered_map<int, std::stringstream> print_bufs_;
|
||||||
|
|
||||||
PerfStats perf_stats_;
|
PerfStats perf_stats_;
|
||||||
|
|
|
@ -27,15 +27,19 @@ LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
|
||||||
: ExeUnit(ctx, core, "LSU")
|
: ExeUnit(ctx, core, "LSU")
|
||||||
, pending_rd_reqs_(LSUQ_SIZE)
|
, pending_rd_reqs_(LSUQ_SIZE)
|
||||||
, num_threads_(core->arch().num_threads())
|
, num_threads_(core->arch().num_threads())
|
||||||
|
, pending_loads_(0)
|
||||||
, fence_lock_(false)
|
, fence_lock_(false)
|
||||||
{}
|
{}
|
||||||
|
|
||||||
void LsuUnit::reset() {
|
void LsuUnit::reset() {
|
||||||
pending_rd_reqs_.clear();
|
pending_rd_reqs_.clear();
|
||||||
|
pending_loads_ = 0;
|
||||||
fence_lock_ = false;
|
fence_lock_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void LsuUnit::tick() {
|
void LsuUnit::tick() {
|
||||||
|
core_->perf_stats_.load_latency += pending_loads_;
|
||||||
|
|
||||||
// handle dcache response
|
// handle dcache response
|
||||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||||
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
|
auto& dcache_rsp_port = core_->dcache_rsp_ports.at(t);
|
||||||
|
@ -52,7 +56,8 @@ void LsuUnit::tick() {
|
||||||
Output.send(trace, 1);
|
Output.send(trace, 1);
|
||||||
pending_rd_reqs_.release(mem_rsp.tag);
|
pending_rd_reqs_.release(mem_rsp.tag);
|
||||||
}
|
}
|
||||||
dcache_rsp_port.pop();
|
dcache_rsp_port.pop();
|
||||||
|
--pending_loads_;
|
||||||
}
|
}
|
||||||
|
|
||||||
// handle shared memory response
|
// handle shared memory response
|
||||||
|
@ -72,6 +77,7 @@ void LsuUnit::tick() {
|
||||||
pending_rd_reqs_.release(mem_rsp.tag);
|
pending_rd_reqs_.release(mem_rsp.tag);
|
||||||
}
|
}
|
||||||
smem_rsp_port.pop();
|
smem_rsp_port.pop();
|
||||||
|
--pending_loads_;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (fence_lock_) {
|
if (fence_lock_) {
|
||||||
|
@ -81,7 +87,7 @@ void LsuUnit::tick() {
|
||||||
Output.send(fence_state_, 1);
|
Output.send(fence_state_, 1);
|
||||||
fence_lock_ = false;
|
fence_lock_ = false;
|
||||||
DT(3, "fence-unlock: " << fence_state_);
|
DT(3, "fence-unlock: " << fence_state_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// check input queue
|
// check input queue
|
||||||
if (Input.empty())
|
if (Input.empty())
|
||||||
|
@ -156,6 +162,9 @@ void LsuUnit::tick() {
|
||||||
dcache_req_port.send(mem_req, 2);
|
dcache_req_port.send(mem_req, 2);
|
||||||
DT(3, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << tag
|
DT(3, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << tag
|
||||||
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.addr_type << ", " << *trace);
|
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.addr_type << ", " << *trace);
|
||||||
|
|
||||||
|
++pending_loads_;
|
||||||
|
++core_->perf_stats_.loads;
|
||||||
|
|
||||||
if (is_dup)
|
if (is_dup)
|
||||||
break;
|
break;
|
||||||
|
@ -165,6 +174,8 @@ void LsuUnit::tick() {
|
||||||
if (is_write) {
|
if (is_write) {
|
||||||
pending_rd_reqs_.release(tag);
|
pending_rd_reqs_.release(tag);
|
||||||
Output.send(trace, 1);
|
Output.send(trace, 1);
|
||||||
|
|
||||||
|
++core_->perf_stats_.stores;
|
||||||
}
|
}
|
||||||
|
|
||||||
// remove input
|
// remove input
|
||||||
|
@ -279,7 +290,9 @@ void GpuUnit::tick() {
|
||||||
|
|
||||||
auto trace = Input.front();
|
auto trace = Input.front();
|
||||||
|
|
||||||
switch (trace->gpu_type) {
|
auto gpu_type = trace->gpu_type;
|
||||||
|
|
||||||
|
switch (gpu_type) {
|
||||||
case GpuType::TMC: {
|
case GpuType::TMC: {
|
||||||
Output.send(trace, 1);
|
Output.send(trace, 1);
|
||||||
auto trace_data = std::dynamic_pointer_cast<GPUTraceData>(trace->data);
|
auto trace_data = std::dynamic_pointer_cast<GPUTraceData>(trace->data);
|
||||||
|
@ -325,6 +338,12 @@ void GpuUnit::tick() {
|
||||||
if (trace->fetch_stall) {
|
if (trace->fetch_stall) {
|
||||||
core_->stalled_warps_.reset(trace->wid);
|
core_->stalled_warps_.reset(trace->wid);
|
||||||
}
|
}
|
||||||
|
|
||||||
auto time = Input.pop();
|
auto time = Input.pop();
|
||||||
core_->perf_stats_.gpu_stalls += (SimPlatform::instance().cycles() - time);
|
auto stalls = (SimPlatform::instance().cycles() - time);
|
||||||
|
|
||||||
|
if (gpu_type == GpuType::TEX) core_->perf_stats_.tex_issue_stalls += stalls;
|
||||||
|
if (gpu_type == GpuType::ROP) core_->perf_stats_.rop_issue_stalls += stalls;
|
||||||
|
if (gpu_type == GpuType::RASTER) core_->perf_stats_.raster_issue_stalls += stalls;
|
||||||
|
core_->perf_stats_.gpu_stalls += stalls;
|
||||||
}
|
}
|
|
@ -53,6 +53,7 @@ private:
|
||||||
HashTable<pending_req_t> pending_rd_reqs_;
|
HashTable<pending_req_t> pending_rd_reqs_;
|
||||||
uint32_t num_threads_;
|
uint32_t num_threads_;
|
||||||
pipeline_trace_t* fence_state_;
|
pipeline_trace_t* fence_state_;
|
||||||
|
uint64_t pending_loads_;
|
||||||
bool fence_lock_;
|
bool fence_lock_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
|
|
@ -100,6 +100,7 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
|
||||||
perf.mem_reads = perf_mem_reads_;
|
perf.mem_reads = perf_mem_reads_;
|
||||||
perf.mem_writes = perf_mem_writes_;
|
perf.mem_writes = perf_mem_writes_;
|
||||||
perf.mem_latency = perf_mem_pending_reads_;
|
perf.mem_latency = perf_mem_pending_reads_;
|
||||||
|
perf.l3cache = l3cache_->perf_stats();
|
||||||
for (auto cluster : clusters_) {
|
for (auto cluster : clusters_) {
|
||||||
perf.clusters += cluster->perf_stats();
|
perf.clusters += cluster->perf_stats();
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,6 +29,7 @@ public:
|
||||||
uint64_t mem_reads;
|
uint64_t mem_reads;
|
||||||
uint64_t mem_writes;
|
uint64_t mem_writes;
|
||||||
uint64_t mem_latency;
|
uint64_t mem_latency;
|
||||||
|
CacheSim::PerfStats l3cache;
|
||||||
Cluster::PerfStats clusters;
|
Cluster::PerfStats clusters;
|
||||||
|
|
||||||
PerfStats()
|
PerfStats()
|
||||||
|
|
|
@ -566,7 +566,8 @@ public:
|
||||||
// check input trace
|
// check input trace
|
||||||
if (simobject_->Input.empty())
|
if (simobject_->Input.empty())
|
||||||
return;
|
return;
|
||||||
perf_stats_.stalls += simobject_->Input.stalled();
|
|
||||||
|
perf_stats_.stalls += simobject_->Input.stalled();
|
||||||
auto trace = simobject_->Input.front();
|
auto trace = simobject_->Input.front();
|
||||||
auto data = std::dynamic_pointer_cast<RopUnit::TraceData>(trace->data);
|
auto data = std::dynamic_pointer_cast<RopUnit::TraceData>(trace->data);
|
||||||
data->cid = trace->cid;
|
data->cid = trace->cid;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue