mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
PERF pipeline stalls and cache
This commit is contained in:
parent
0d0706411d
commit
1595ff08e2
17 changed files with 843 additions and 5 deletions
5
Makefile
5
Makefile
|
@ -6,6 +6,11 @@ all:
|
|||
$(MAKE) -C simX
|
||||
$(MAKE) -C benchmarks/opencl
|
||||
|
||||
perf-demo:
|
||||
$(MAKE) -C hw
|
||||
$(MAKE) -C driver rtlsim
|
||||
$(MAKE) -C driver/tests/demo/ run-rtlsim
|
||||
|
||||
clean:
|
||||
$(MAKE) -C hw clean
|
||||
$(MAKE) -C driver clean
|
||||
|
|
|
@ -62,7 +62,8 @@ int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value);
|
|||
|
||||
// get device constant registers
|
||||
int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value);
|
||||
|
||||
// get device constant registers (64 bit long int)
|
||||
int vx_csr_get_l(vx_device_h hdevice, int core_id, int addr, int addr_h, uint64_t* value);
|
||||
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
|
||||
|
||||
// upload kernel bytes to device
|
||||
|
|
|
@ -243,6 +243,32 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
vx_csr_get(hdevice, 0, CSR_NC, &num_cores);
|
||||
if (num_cores > 1) {
|
||||
uint64_t total_instrs = 0, total_cycles = 0;
|
||||
// -------------------------
|
||||
#ifdef PERF_ENABLE
|
||||
// PERF: cache
|
||||
uint64_t total_r = 0;
|
||||
uint64_t total_w = 0;
|
||||
uint64_t dram_st = 0;
|
||||
uint64_t dram_lat = 0;
|
||||
uint64_t dram_rsp = 0;
|
||||
uint64_t msrq_st = 0;
|
||||
uint64_t total_st = 0;
|
||||
uint64_t r_miss = 0;
|
||||
uint64_t w_miss = 0;
|
||||
uint64_t core_rsp_st = 0;
|
||||
uint64_t total_evict = 0;
|
||||
// PERF: pipeline stalls
|
||||
uint64_t lsu_stall = 0;
|
||||
uint64_t fpu_stall = 0;
|
||||
uint64_t mul_stall = 0;
|
||||
uint64_t csr_stall = 0;
|
||||
uint64_t alu_stall = 0;
|
||||
uint64_t gpu_stall = 0;
|
||||
uint64_t ibuffer_stall = 0;
|
||||
uint64_t scoreboard_stall = 0;
|
||||
uint64_t icache_stall = 0;
|
||||
#endif
|
||||
// -------------------------
|
||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||
uint64_t instrs, cycles;
|
||||
vx_get_perf(hdevice, core_id, &instrs, &cycles);
|
||||
|
@ -250,14 +276,235 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
|
||||
total_instrs += instrs;
|
||||
total_cycles = std::max<uint64_t>(total_cycles, cycles);
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
// PERF: cache
|
||||
// total_read
|
||||
uint64_t total_r_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_TOTAL_R, CSR_TOTAL_R_H, &total_r_per_core);
|
||||
fprintf(stdout, "PERF: \t\ttotal_reads_per_core=%ld\n", total_r_per_core);
|
||||
total_r += total_r_per_core;
|
||||
// total_write
|
||||
uint64_t total_w_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_TOTAL_W, CSR_TOTAL_W_H, &total_w_per_core);
|
||||
fprintf(stdout, "PERF: \t\ttotal_writes_per_core=%ld\n", total_w_per_core);
|
||||
total_w += total_w_per_core;
|
||||
// dram_stall
|
||||
uint64_t dram_st_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_DRAM_ST, CSR_DRAM_ST_H, &dram_st_per_core);
|
||||
fprintf(stdout, "PERF: \t\tdram_stalls_per_core=%ld\n", dram_st_per_core);
|
||||
dram_st += dram_st_per_core;
|
||||
// dram_latency
|
||||
uint64_t dram_lat_per_core, dram_rsp_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_DRAM_LAT, CSR_DRAM_LAT_H, &dram_lat_per_core);
|
||||
vx_csr_get_l(hdevice, core_id, CSR_DRAM_RSP, CSR_DRAM_RSP_H, &dram_rsp_per_core);
|
||||
fprintf(stdout, "PERF: \t\tdram_latency_per_core=%ld\n", dram_lat_per_core);
|
||||
fprintf(stdout, "PERF: \t\tdram_response_per_core=%ld\n", dram_rsp_per_core);
|
||||
dram_lat += dram_lat_per_core;
|
||||
dram_rsp += dram_rsp_per_core;
|
||||
float dram_lat_per_rsp_per_core = (float)(double(dram_lat_per_core) / double(dram_rsp_per_core));
|
||||
fprintf(stdout, "PERF: \t\tdram_latency_per_response_per_core=%f\n", dram_lat_per_rsp_per_core);
|
||||
// miss_reserve_queue_stall
|
||||
uint64_t msrq_st_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_MSRQ_ST, CSR_MSRQ_ST_H, &msrq_st_per_core);
|
||||
fprintf(stdout, "PERF: \t\tmsrq_stalls_per_core=%ld\n", msrq_st_per_core);
|
||||
msrq_st += msrq_st_per_core;
|
||||
// total_stall
|
||||
uint64_t total_st_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_TOTAL_ST, CSR_TOTAL_ST_H, &total_st_per_core);
|
||||
fprintf(stdout, "PERF: \t\ttotal_stalls_per_core=%ld\n", total_st_per_core);
|
||||
total_st += total_st_per_core;
|
||||
// read_miss
|
||||
uint64_t r_miss_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_R_MISS, CSR_R_MISS_H, &r_miss_per_core);
|
||||
fprintf(stdout, "PERF: \t\tread_misses_per_core=%ld\n", r_miss_per_core);
|
||||
r_miss += r_miss_per_core;
|
||||
// write_miss
|
||||
uint64_t w_miss_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_W_MISS, CSR_W_MISS_H, &w_miss_per_core);
|
||||
fprintf(stdout, "PERF: \t\twrite_misses_per_core=%ld\n", w_miss_per_core);
|
||||
w_miss += w_miss_per_core;
|
||||
// core_rsp_stalls
|
||||
uint64_t core_rsp_st_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_CORE_RSP_ST, CSR_CORE_RSP_ST_H, &core_rsp_st_per_core);
|
||||
fprintf(stdout, "PERF: \t\tcore_rsp_stalls_per_core=%ld\n", core_rsp_st_per_core);
|
||||
core_rsp_st += core_rsp_st_per_core;
|
||||
// total_evictions
|
||||
uint64_t total_evict_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_TOTAL_EV, CSR_TOTAL_EV_H, &total_evict_per_core);
|
||||
fprintf(stdout, "PERF: \t\ttotal_evictions_per_core=%ld\n", total_evict_per_core);
|
||||
total_evict += total_evict_per_core;
|
||||
// PERF: pipeline stall
|
||||
// lsu_stall
|
||||
uint64_t lsu_stall_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_LSU_ST, CSR_LSU_ST_H, &lsu_stall_per_core);
|
||||
fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall_per_core);
|
||||
lsu_stall += lsu_stall_per_core;
|
||||
// fpu_stall
|
||||
uint64_t fpu_stall_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_FPU_ST, CSR_FPU_ST_H, &fpu_stall_per_core);
|
||||
fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall_per_core);
|
||||
fpu_stall += fpu_stall_per_core;
|
||||
// mul_stall
|
||||
uint64_t mul_stall_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_MUL_ST, CSR_MUL_ST_H, &mul_stall_per_core);
|
||||
fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall_per_core);
|
||||
mul_stall += mul_stall_per_core;
|
||||
// csr_stall
|
||||
uint64_t csr_stall_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_CSR_ST, CSR_CSR_ST_H, &csr_stall_per_core);
|
||||
fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall_per_core);
|
||||
csr_stall += csr_stall_per_core;
|
||||
// alu_stall
|
||||
uint64_t alu_stall_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_ALU_ST, CSR_ALU_ST_H, &alu_stall_per_core);
|
||||
fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall_per_core);
|
||||
alu_stall += alu_stall_per_core;
|
||||
// gpu_stall
|
||||
uint64_t gpu_stall_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_GPU_ST, CSR_GPU_ST_H, &gpu_stall_per_core);
|
||||
fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall_per_core);
|
||||
gpu_stall += gpu_stall_per_core;
|
||||
// ibuffer_stall
|
||||
uint64_t ibuffer_stall_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_IBUF_ST, CSR_IBUF_ST_H, &ibuffer_stall_per_core);
|
||||
fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall_per_core);
|
||||
ibuffer_stall += ibuffer_stall_per_core;
|
||||
// scoreboard_stall
|
||||
uint64_t scoreboard_stall_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_SCRBRD_ST, CSR_SCRBRD_ST_H, &scoreboard_stall_per_core);
|
||||
fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall_per_core);
|
||||
scoreboard_stall += scoreboard_stall_per_core;
|
||||
// icache_stall
|
||||
uint64_t icache_stall_per_core;
|
||||
vx_csr_get_l(hdevice, core_id, CSR_ICACHE_ST, CSR_ICACHE_ST_H, &icache_stall_per_core);
|
||||
fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall_per_core);
|
||||
icache_stall += icache_stall_per_core;
|
||||
#endif
|
||||
// -------------------------
|
||||
}
|
||||
float IPC = (float)(double(total_instrs) / double(total_cycles));
|
||||
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
// PERF: cache
|
||||
fprintf(stdout, "PERF: \t\ttotal_reads=%ld\n", total_r);
|
||||
fprintf(stdout, "PERF: \t\ttotal_writes=%ld\n", total_w);
|
||||
fprintf(stdout, "PERF: \t\tdram_stalls=%ld\n", dram_st);
|
||||
fprintf(stdout, "PERF: \t\tdram_latency=%ld\n", dram_lat);
|
||||
fprintf(stdout, "PERF: \t\tdram_response=%ld\n", dram_rsp);
|
||||
float dram_lat_per_rsp = (float)(double(dram_lat) / double(dram_rsp));
|
||||
fprintf(stdout, "PERF: \t\tdram_latency_per_response=%f\n", dram_lat_per_rsp);
|
||||
fprintf(stdout, "PERF: \t\tmsrq_stalls=%ld\n", msrq_st);
|
||||
fprintf(stdout, "PERF: \t\ttotal_stalls=%ld\n", total_st);
|
||||
fprintf(stdout, "PERF: \t\tread_misses=%ld\n", r_miss);
|
||||
fprintf(stdout, "PERF: \t\twrite_misses=%ld\n", w_miss);
|
||||
fprintf(stdout, "PERF: \t\tcore_rsp_stalls=%ld\n", core_rsp_st);
|
||||
fprintf(stdout, "PERF: \t\ttotal_evictions=%ld\n", total_evict);
|
||||
// PERF: pipeline stall
|
||||
fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall);
|
||||
fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall);
|
||||
fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall);
|
||||
fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall);
|
||||
fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall);
|
||||
fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall);
|
||||
fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall);
|
||||
fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall);
|
||||
fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall);
|
||||
#endif
|
||||
// -------------------------
|
||||
} else {
|
||||
uint64_t instrs, cycles;
|
||||
vx_get_perf(hdevice, 0, &instrs, &cycles);
|
||||
float IPC = (float)(double(instrs) / double(cycles));
|
||||
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
// PERF: cache
|
||||
// total_read
|
||||
uint64_t total_r;
|
||||
vx_csr_get_l(hdevice, 0, CSR_TOTAL_R, CSR_TOTAL_R_H, &total_r);
|
||||
fprintf(stdout, "PERF: \t\ttotal_reads=%ld\n", total_r);
|
||||
// total_write
|
||||
uint64_t total_w;
|
||||
vx_csr_get_l(hdevice, 0, CSR_TOTAL_W, CSR_TOTAL_W_H, &total_w);
|
||||
fprintf(stdout, "PERF: \t\ttotal_writes=%ld\n", total_w);
|
||||
// dram_stall
|
||||
uint64_t dram_st;
|
||||
vx_csr_get_l(hdevice, 0, CSR_DRAM_ST, CSR_DRAM_ST_H, &dram_st);
|
||||
fprintf(stdout, "PERF: \t\tdram_stalls=%ld\n", dram_st);
|
||||
// dram_latency
|
||||
uint64_t dram_lat, dram_rsp;
|
||||
vx_csr_get_l(hdevice, 0, CSR_DRAM_LAT, CSR_DRAM_LAT_H, &dram_lat);
|
||||
vx_csr_get_l(hdevice, 0, CSR_DRAM_RSP, CSR_DRAM_RSP_H, &dram_rsp);
|
||||
float dram_lat_per_rsp = (float)(double(dram_lat) / double(dram_rsp));
|
||||
fprintf(stdout, "PERF: \t\tdram_latency=%ld\n", dram_lat);
|
||||
fprintf(stdout, "PERF: \t\tdram_response=%ld\n", dram_rsp);
|
||||
fprintf(stdout, "PERF: \t\tdram_latency_per_response=%f\n", dram_lat_per_rsp);
|
||||
// miss_reserve_queue_stall
|
||||
uint64_t msrq_st;
|
||||
vx_csr_get_l(hdevice, 0, CSR_MSRQ_ST, CSR_MSRQ_ST_H, &msrq_st);
|
||||
fprintf(stdout, "PERF: \t\tmsrq_stalls=%ld\n", msrq_st);
|
||||
// total_stall
|
||||
uint64_t total_st;
|
||||
vx_csr_get_l(hdevice, 0, CSR_TOTAL_ST, CSR_TOTAL_ST_H, &total_st);
|
||||
fprintf(stdout, "PERF: \t\ttotal_stalls=%ld\n", total_st);
|
||||
// read_miss
|
||||
uint64_t r_miss;
|
||||
vx_csr_get_l(hdevice, 0, CSR_R_MISS, CSR_R_MISS_H, &r_miss);
|
||||
fprintf(stdout, "PERF: \t\tread_misses=%ld\n", r_miss);
|
||||
// write_miss
|
||||
uint64_t w_miss;
|
||||
vx_csr_get_l(hdevice, 0, CSR_W_MISS, CSR_W_MISS_H, &w_miss);
|
||||
fprintf(stdout, "PERF: \t\twrite_misses=%ld\n", w_miss);
|
||||
// core_rsp_stalls
|
||||
uint64_t core_rsp_st;
|
||||
vx_csr_get_l(hdevice, 0, CSR_CORE_RSP_ST, CSR_CORE_RSP_ST_H, &core_rsp_st);
|
||||
// Label names the counter actually printed (core response stalls), not total_stalls.
fprintf(stdout, "PERF: \t\tcore_rsp_stalls=%ld\n", core_rsp_st);
|
||||
// total_evictions
|
||||
uint64_t total_evict;
|
||||
vx_csr_get_l(hdevice, 0, CSR_TOTAL_EV, CSR_TOTAL_EV_H, &total_evict);
|
||||
fprintf(stdout, "PERF: \t\ttotal_evictions=%ld\n", total_evict);
|
||||
// PERF: pipeline stalls
|
||||
// TODO:
|
||||
// lsu_stall
|
||||
uint64_t lsu_stall;
|
||||
vx_csr_get_l(hdevice, 0, CSR_LSU_ST, CSR_LSU_ST_H, &lsu_stall);
|
||||
fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall);
|
||||
// fpu_stall
|
||||
uint64_t fpu_stall;
|
||||
vx_csr_get_l(hdevice, 0, CSR_FPU_ST, CSR_FPU_ST_H, &fpu_stall);
|
||||
fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall);
|
||||
// mul_stall
|
||||
uint64_t mul_stall;
|
||||
vx_csr_get_l(hdevice, 0, CSR_MUL_ST, CSR_MUL_ST_H, &mul_stall);
|
||||
fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall);
|
||||
// csr_stall
|
||||
uint64_t csr_stall;
|
||||
vx_csr_get_l(hdevice, 0, CSR_CSR_ST, CSR_CSR_ST_H, &csr_stall);
|
||||
fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall);
|
||||
// alu_stall
|
||||
uint64_t alu_stall;
|
||||
vx_csr_get_l(hdevice, 0, CSR_ALU_ST, CSR_ALU_ST_H, &alu_stall);
|
||||
fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall);
|
||||
// gpu_stall
|
||||
uint64_t gpu_stall;
|
||||
vx_csr_get_l(hdevice, 0, CSR_GPU_ST, CSR_GPU_ST_H, &gpu_stall);
|
||||
fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall);
|
||||
// ibuffer_stall
|
||||
uint64_t ibuffer_stall;
|
||||
vx_csr_get_l(hdevice, 0, CSR_IBUF_ST, CSR_IBUF_ST_H, &ibuffer_stall);
|
||||
fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall);
|
||||
// scoreboard_stall
|
||||
uint64_t scoreboard_stall;
|
||||
vx_csr_get_l(hdevice, 0, CSR_SCRBRD_ST, CSR_SCRBRD_ST_H, &scoreboard_stall);
|
||||
fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall);
|
||||
// icache_stall
|
||||
uint64_t icache_stall;
|
||||
vx_csr_get_l(hdevice, 0, CSR_ICACHE_ST, CSR_ICACHE_ST_H, &icache_stall);
|
||||
fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall);
|
||||
#endif
|
||||
// -------------------------
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -386,4 +633,16 @@ extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* valu
|
|||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
return device->get_csr(core_id, addr, value);
|
||||
}
|
||||
|
||||
// Reads a 64-bit device value that is exposed as two 32-bit CSRs.
//
//   hdevice : device handle; must not be null.
//   core_id : core whose CSRs are read.
//   addr    : CSR address holding the low 32 bits.
//   addr_h  : CSR address holding the high 32 bits.
//   value   : receives (high << 32) | low; written only when both reads succeed.
//
// Returns 0 on success, -1 when hdevice or value is null, otherwise the
// non-zero status returned by vx_csr_get.
// NOTE(review): assumes vx_csr_get returns 0 on success, matching this
// function's own convention - confirm against the device back-ends.
// NOTE(review): the two halves are fetched with separate CSR reads, so a
// counter that is still advancing could carry between them - confirm callers
// only sample once the device is idle.
extern int vx_csr_get_l(vx_device_h hdevice, int core_id, int addr, int addr_h, uint64_t* value) {
  if (nullptr == hdevice || nullptr == value)
    return -1;

  unsigned csr_hi = 0;
  unsigned csr_lo = 0;

  // High word first, same access order as before.
  int ret = vx_csr_get(hdevice, core_id, addr_h, &csr_hi);
  if (ret != 0)
    return ret;

  ret = vx_csr_get(hdevice, core_id, addr, &csr_lo);
  if (ret != 0)
    return ret;

  // Widen before shifting so the high word is not shifted out of a 32-bit type.
  *value = (static_cast<uint64_t>(csr_hi) << 32) | csr_lo;
  return 0;
}
|
|
@ -369,6 +369,11 @@ module VX_cluster #(
|
|||
.core_rsp_tag (core_dram_rsp_tag),
|
||||
.core_rsp_ready (core_dram_rsp_ready),
|
||||
|
||||
// PERF: total read
|
||||
`ifdef PERF_ENABLE
|
||||
`UNUSED_PIN (perf_cache_if),
|
||||
`endif
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (dram_req_valid),
|
||||
.dram_req_rw (dram_req_rw),
|
||||
|
|
|
@ -124,6 +124,12 @@
|
|||
`define LATENCY_FCONV 3
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////
|
||||
`ifndef PERF_ENABLE
|
||||
`define PERF_ENABLE
|
||||
`endif
|
||||
///////////////////////////////////////
|
||||
|
||||
// CSR Addresses //////////////////////////////////////////////////////////////
|
||||
|
||||
`define CSR_FFLAGS 12'h001
|
||||
|
@ -139,6 +145,52 @@
|
|||
`define CSR_NW 12'h026
|
||||
`define CSR_NC 12'h027
|
||||
|
||||
// PERF: cache
|
||||
`define CSR_R_MISS 12'h030 // read misses
|
||||
`define CSR_R_MISS_H 12'h031
|
||||
`define CSR_W_MISS 12'h032 // write misses
|
||||
`define CSR_W_MISS_H 12'h033
|
||||
`define CSR_DRAM_ST 12'h034 // dram stalls
|
||||
`define CSR_DRAM_ST_H 12'h035
|
||||
`define CSR_CORE_RSP_ST 12'h036 // core_rsp stalls
|
||||
`define CSR_CORE_RSP_ST_H 12'h037
|
||||
`define CSR_MSRQ_ST 12'h038 // miss reserve queue stalls
|
||||
`define CSR_MSRQ_ST_H 12'h039
|
||||
`define CSR_TOTAL_ST 12'h03A // total stalls
|
||||
`define CSR_TOTAL_ST_H 12'h03B
|
||||
`define CSR_TOTAL_R 12'h03C // total reads
|
||||
`define CSR_TOTAL_R_H 12'h03D
|
||||
`define CSR_TOTAL_W 12'h03E // total writes
|
||||
`define CSR_TOTAL_W_H 12'h03F
|
||||
`define CSR_TOTAL_EV 12'h040 // total evictions
|
||||
`define CSR_TOTAL_EV_H 12'h041
|
||||
`define CSR_DRAM_LAT 12'h042 // dram latency (total)
|
||||
`define CSR_DRAM_LAT_H 12'h043
|
||||
`define CSR_DRAM_RSP 12'h044 // dram responses
|
||||
`define CSR_DRAM_RSP_H 12'h045
|
||||
// PERF: pipeline stalls
|
||||
`define CSR_FPU_ST 12'h046
|
||||
`define CSR_FPU_ST_H 12'h047
|
||||
`define CSR_MUL_ST 12'h048
|
||||
`define CSR_MUL_ST_H 12'h049
|
||||
`define CSR_CSR_ST 12'h04A
|
||||
`define CSR_CSR_ST_H 12'h04B
|
||||
`define CSR_ALU_ST 12'h04C
|
||||
`define CSR_ALU_ST_H 12'h04D
|
||||
`define CSR_GPU_ST 12'h04E
|
||||
`define CSR_GPU_ST_H 12'h04F
|
||||
`define CSR_LSU_ST 12'h050
|
||||
`define CSR_LSU_ST_H 12'h051
|
||||
`define CSR_IBUF_ST 12'h052
|
||||
`define CSR_IBUF_ST_H 12'h053
|
||||
`define CSR_SCRBRD_ST 12'h054
|
||||
`define CSR_SCRBRD_ST_H 12'h055
|
||||
`define CSR_ICACHE_ST 12'h056
|
||||
`define CSR_ICACHE_ST_H 12'h057
|
||||
|
||||
|
||||
//////////////////////////////////////////////////////////////
|
||||
|
||||
`define CSR_SATP 12'h180
|
||||
|
||||
`define CSR_PMPCFG0 12'h3A0
|
||||
|
|
|
@ -66,6 +66,10 @@ module VX_core #(
|
|||
output wire busy,
|
||||
output wire ebreak
|
||||
);
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_cache_if();
|
||||
`endif
|
||||
|
||||
VX_cache_dram_req_if #(
|
||||
.DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH),
|
||||
.DRAM_ADDR_WIDTH(`DDRAM_ADDR_WIDTH),
|
||||
|
@ -220,6 +224,11 @@ module VX_core #(
|
|||
.csr_io_rsp_data (csr_io_rsp_data),
|
||||
.csr_io_rsp_ready (csr_io_rsp_ready),
|
||||
|
||||
// PERF: total reads
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_cache_if),
|
||||
`endif
|
||||
|
||||
// Status
|
||||
.busy(busy),
|
||||
.ebreak(ebreak)
|
||||
|
@ -238,6 +247,11 @@ module VX_core #(
|
|||
// Core <-> Dcache
|
||||
.core_dcache_req_if (core_dcache_req_if),
|
||||
.core_dcache_rsp_if (core_dcache_rsp_if),
|
||||
|
||||
// PERF: total reads
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_cache_if),
|
||||
`endif
|
||||
|
||||
// Core <-> Icache
|
||||
.core_icache_req_if (core_icache_req_if),
|
||||
|
|
|
@ -6,6 +6,12 @@ module VX_csr_data #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// PERF: total reads
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_cache_if,
|
||||
VX_perf_pipeline_stall_if perf_pipeline_stall_if,
|
||||
`endif
|
||||
|
||||
VX_cmt_to_csr_if cmt_to_csr_if,
|
||||
VX_fpu_to_csr_if fpu_to_csr_if,
|
||||
|
||||
|
@ -114,6 +120,51 @@ module VX_csr_data #(
|
|||
`CSR_NW : read_data_r = `NUM_WARPS;
|
||||
`CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
// PERF: cache
|
||||
`CSR_R_MISS : read_data_r = perf_cache_if.read_miss[31:0];
|
||||
`CSR_R_MISS_H : read_data_r = perf_cache_if.read_miss[63:32];
|
||||
`CSR_W_MISS : read_data_r = perf_cache_if.write_miss[31:0];
|
||||
`CSR_W_MISS_H : read_data_r = perf_cache_if.write_miss[63:32];
|
||||
`CSR_DRAM_ST : read_data_r = perf_cache_if.dram_stall[31:0];
|
||||
`CSR_DRAM_ST_H : read_data_r = perf_cache_if.dram_stall[63:32];
|
||||
`CSR_CORE_RSP_ST : read_data_r = perf_cache_if.core_rsp_stall[31:0];
|
||||
`CSR_CORE_RSP_ST_H: read_data_r = perf_cache_if.core_rsp_stall[63:32];
|
||||
`CSR_MSRQ_ST : read_data_r = perf_cache_if.msrq_stall[31:0];
|
||||
`CSR_MSRQ_ST_H : read_data_r = perf_cache_if.msrq_stall[63:32];
|
||||
`CSR_TOTAL_ST : read_data_r = perf_cache_if.total_stall[31:0];
|
||||
`CSR_TOTAL_ST_H : read_data_r = perf_cache_if.total_stall[63:32];
|
||||
`CSR_TOTAL_R : read_data_r = perf_cache_if.total_read[31:0];
|
||||
`CSR_TOTAL_R_H : read_data_r = perf_cache_if.total_read[63:32];
|
||||
`CSR_TOTAL_W : read_data_r = perf_cache_if.total_write[31:0];
|
||||
`CSR_TOTAL_W_H : read_data_r = perf_cache_if.total_write[63:32];
|
||||
`CSR_TOTAL_EV : read_data_r = perf_cache_if.total_eviction[31:0];
|
||||
`CSR_TOTAL_EV_H : read_data_r = perf_cache_if.total_eviction[63:32];
|
||||
`CSR_DRAM_LAT : read_data_r = perf_cache_if.dram_latency[31:0];
|
||||
`CSR_DRAM_LAT_H : read_data_r = perf_cache_if.dram_latency[63:32];
|
||||
`CSR_DRAM_RSP : read_data_r = perf_cache_if.dram_rsp[31:0];
|
||||
`CSR_DRAM_RSP_H : read_data_r = perf_cache_if.dram_rsp[63:32];
|
||||
// PERF: pipeline stalls
|
||||
`CSR_LSU_ST : read_data_r = perf_pipeline_stall_if.lsu_stall[31:0];
|
||||
`CSR_LSU_ST_H : read_data_r = perf_pipeline_stall_if.lsu_stall[63:32];
|
||||
`CSR_FPU_ST : read_data_r = perf_pipeline_stall_if.fpu_stall[31:0];
|
||||
`CSR_FPU_ST_H : read_data_r = perf_pipeline_stall_if.fpu_stall[63:32];
|
||||
`CSR_MUL_ST : read_data_r = perf_pipeline_stall_if.mul_stall[31:0];
|
||||
`CSR_MUL_ST_H : read_data_r = perf_pipeline_stall_if.mul_stall[63:32];
|
||||
`CSR_CSR_ST : read_data_r = perf_pipeline_stall_if.csr_stall[31:0];
|
||||
`CSR_CSR_ST_H : read_data_r = perf_pipeline_stall_if.csr_stall[63:32];
|
||||
`CSR_ALU_ST : read_data_r = perf_pipeline_stall_if.alu_stall[31:0];
|
||||
`CSR_ALU_ST_H : read_data_r = perf_pipeline_stall_if.alu_stall[63:32];
|
||||
`CSR_GPU_ST : read_data_r = perf_pipeline_stall_if.gpu_stall[31:0];
|
||||
`CSR_GPU_ST_H : read_data_r = perf_pipeline_stall_if.gpu_stall[63:32];
|
||||
`CSR_IBUF_ST : read_data_r = perf_pipeline_stall_if.ibuffer_stall[31:0];
|
||||
`CSR_IBUF_ST_H : read_data_r = perf_pipeline_stall_if.ibuffer_stall[63:32];
|
||||
`CSR_SCRBRD_ST : read_data_r = perf_pipeline_stall_if.scoreboard_stall[31:0];
|
||||
`CSR_SCRBRD_ST_H : read_data_r = perf_pipeline_stall_if.scoreboard_stall[63:32];
|
||||
`CSR_ICACHE_ST : read_data_r = perf_pipeline_stall_if.icache_stall[31:0];
|
||||
`CSR_ICACHE_ST_H : read_data_r = perf_pipeline_stall_if.icache_stall[63:32];
|
||||
`endif
|
||||
|
||||
`CSR_SATP : read_data_r = 32'(csr_satp);
|
||||
|
||||
`CSR_MSTATUS : read_data_r = 32'(csr_mstatus);
|
||||
|
|
|
@ -6,6 +6,12 @@ module VX_csr_unit #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// PERF: total reads
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_cache_if,
|
||||
VX_perf_pipeline_stall_if perf_pipeline_stall_if,
|
||||
`endif
|
||||
|
||||
VX_cmt_to_csr_if cmt_to_csr_if,
|
||||
VX_fpu_to_csr_if fpu_to_csr_if,
|
||||
|
||||
|
@ -51,6 +57,11 @@ module VX_csr_unit #(
|
|||
) csr_data (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
// PERF: total reads
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_cache_if),
|
||||
.perf_pipeline_stall_if (perf_pipeline_stall_if),
|
||||
`endif
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
.read_enable (csr_pipe_req_if.valid),
|
||||
|
|
|
@ -18,6 +18,12 @@ module VX_execute #(
|
|||
|
||||
// perf
|
||||
VX_cmt_to_csr_if cmt_to_csr_if,
|
||||
|
||||
// PERF: total reads
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_cache_if,
|
||||
VX_perf_pipeline_stall_if perf_pipeline_stall_if,
|
||||
`endif
|
||||
|
||||
// inputs
|
||||
VX_alu_req_if alu_req_if,
|
||||
|
@ -72,7 +78,12 @@ module VX_execute #(
|
|||
.CORE_ID(CORE_ID)
|
||||
) csr_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (reset),
|
||||
// PERF: total reads
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_cache_if),
|
||||
.perf_pipeline_stall_if (perf_pipeline_stall_if),
|
||||
`endif
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
.csr_io_req_if (csr_io_req_if),
|
||||
|
@ -150,4 +161,72 @@ module VX_execute #(
|
|||
&& (`BR_OP(alu_req_if.op_type) == `BR_EBREAK
|
||||
|| `BR_OP(alu_req_if.op_type) == `BR_ECALL);
|
||||
|
||||
`ifdef PERF_ENABLE
    // PERF: per-unit stall counters.
    // Each counter is cleared while reset is asserted and otherwise advances by
    // one on every rising clock edge where the unit's request is valid but not
    // ready.
    reg [63:0] perf_alu_stall;
    reg [63:0] perf_lsu_stall;
    reg [63:0] perf_csr_stall;
    reg [63:0] perf_gpu_stall;
`ifdef EXT_M_ENABLE
    reg [63:0] perf_mul_stall;
`endif
`ifdef EXT_F_ENABLE
    reg [63:0] perf_fpu_stall;
`endif

    always @(posedge clk) begin
        if (reset) begin
            perf_alu_stall <= 0;
            perf_lsu_stall <= 0;
            perf_csr_stall <= 0;
            perf_gpu_stall <= 0;
        `ifdef EXT_M_ENABLE
            perf_mul_stall <= 0;
        `endif
        `ifdef EXT_F_ENABLE
            perf_fpu_stall <= 0;
        `endif
        end else begin
            // alu_stall
            if (alu_req_if.valid & !alu_req_if.ready) begin
                perf_alu_stall <= perf_alu_stall + 64'd1;
            end
            // lsu_stall
            if (lsu_req_if.valid & !lsu_req_if.ready) begin
                perf_lsu_stall <= perf_lsu_stall + 64'd1;
            end
            // csr_stall
            if (csr_req_if.valid & !csr_req_if.ready) begin
                perf_csr_stall <= perf_csr_stall + 64'd1;
            end
            // gpu_stall
            if (gpu_req_if.valid & !gpu_req_if.ready) begin
                perf_gpu_stall <= perf_gpu_stall + 64'd1;
            end
            // mul_stall
        `ifdef EXT_M_ENABLE
            if (mul_req_if.valid & !mul_req_if.ready) begin
                perf_mul_stall <= perf_mul_stall + 64'd1;
            end
        `endif
            // fpu_stall
        `ifdef EXT_F_ENABLE
            if (fpu_req_if.valid & !fpu_req_if.ready) begin
                perf_fpu_stall <= perf_fpu_stall + 64'd1;
            end
        `endif
        end
    end

    assign perf_pipeline_stall_if.alu_stall = perf_alu_stall;
    assign perf_pipeline_stall_if.lsu_stall = perf_lsu_stall;
    assign perf_pipeline_stall_if.csr_stall = perf_csr_stall;
    assign perf_pipeline_stall_if.gpu_stall = perf_gpu_stall;
`ifdef EXT_M_ENABLE
    assign perf_pipeline_stall_if.mul_stall = perf_mul_stall;
`else
    // Extension disabled: tie the counter to zero so the CSR_MUL_ST read in
    // VX_csr_data never samples an undriven signal.
    assign perf_pipeline_stall_if.mul_stall = 64'd0;
`endif
`ifdef EXT_F_ENABLE
    assign perf_pipeline_stall_if.fpu_stall = perf_fpu_stall;
`else
    // Extension disabled: tie the counter to zero so the CSR_FPU_ST read in
    // VX_csr_data never samples an undriven signal.
    assign perf_pipeline_stall_if.fpu_stall = 64'd0;
`endif
    // scoreboard_stall is driven by VX_issue; icache_stall and ibuffer_stall
    // are driven by VX_pipeline.
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -8,6 +8,10 @@ module VX_issue #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_stall_if perf_pipeline_stall_if,
|
||||
`endif
|
||||
|
||||
VX_decode_if decode_if,
|
||||
VX_writeback_if writeback_if,
|
||||
|
||||
|
@ -120,6 +124,21 @@ module VX_issue #(
|
|||
`SCOPE_ASSIGN (writeback_rd, writeback_if.rd);
|
||||
`SCOPE_ASSIGN (writeback_data, writeback_if.data);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [63:0] perf_scoreboard_stall;
|
||||
always @ (posedge clk) begin
|
||||
if(reset) begin
|
||||
perf_scoreboard_stall <= 0;
|
||||
end else begin
|
||||
// scoreboard_stall
|
||||
if (ibuf_deq_if.valid & scoreboard_delay) begin
|
||||
perf_scoreboard_stall <= perf_scoreboard_stall + 64'd1;
|
||||
end
|
||||
end
|
||||
end
|
||||
assign perf_pipeline_stall_if.scoreboard_stall = perf_scoreboard_stall;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (alu_req_if.valid && alu_req_if.ready) begin
|
||||
|
|
|
@ -12,6 +12,10 @@ module VX_mem_unit # (
|
|||
VX_cache_core_req_if core_dcache_req_if,
|
||||
VX_cache_core_rsp_if core_dcache_rsp_if,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_cache_if,
|
||||
`endif
|
||||
|
||||
// Core <-> Icache
|
||||
VX_cache_core_req_if core_icache_req_if,
|
||||
VX_cache_core_rsp_if core_icache_rsp_if,
|
||||
|
@ -28,6 +32,11 @@ module VX_mem_unit # (
|
|||
VX_cache_core_req_if io_req_if,
|
||||
VX_cache_core_rsp_if io_rsp_if
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_cache_smem_if(), perf_cache_icache_if(), perf_cache_dcache_if();
|
||||
`endif
|
||||
|
||||
VX_cache_dram_req_if #(
|
||||
.DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH),
|
||||
.DRAM_ADDR_WIDTH (`DDRAM_ADDR_WIDTH),
|
||||
|
@ -124,6 +133,10 @@ module VX_mem_unit # (
|
|||
.core_rsp_tag (dcache_rsp_if.tag),
|
||||
.core_rsp_ready (dcache_rsp_if.ready),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_cache_dcache_if),
|
||||
`endif
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (dcache_dram_req_if.valid),
|
||||
.dram_req_rw (dcache_dram_req_if.rw),
|
||||
|
@ -196,6 +209,11 @@ module VX_mem_unit # (
|
|||
.core_rsp_tag (core_icache_rsp_if.tag),
|
||||
.core_rsp_ready (core_icache_rsp_if.ready),
|
||||
|
||||
// PERF: cache read
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_cache_icache_if),
|
||||
`endif
|
||||
|
||||
// DRAM Req
|
||||
.dram_req_valid (icache_dram_req_if.valid),
|
||||
.dram_req_rw (icache_dram_req_if.rw),
|
||||
|
@ -268,6 +286,11 @@ module VX_mem_unit # (
|
|||
.core_rsp_tag (smem_rsp_if.tag),
|
||||
.core_rsp_ready (smem_rsp_if.ready),
|
||||
|
||||
// PERF: cache read
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_cache_smem_if),
|
||||
`endif
|
||||
|
||||
// DRAM request
|
||||
`UNUSED_PIN (dram_req_valid),
|
||||
`UNUSED_PIN (dram_req_rw),
|
||||
|
@ -340,4 +363,42 @@ module VX_mem_unit # (
|
|||
.rsp_ready_in (dram_rsp_if.ready)
|
||||
);
|
||||
|
||||
// PERF: cache
|
||||
// TODO: some caches have dram and write disabled, hence some stats can be removed.
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_cache_if.read_miss = perf_cache_smem_if.read_miss +
|
||||
perf_cache_icache_if.read_miss +
|
||||
perf_cache_dcache_if.read_miss;
|
||||
assign perf_cache_if.write_miss = perf_cache_smem_if.write_miss +
|
||||
perf_cache_icache_if.write_miss +
|
||||
perf_cache_dcache_if.write_miss;
|
||||
assign perf_cache_if.dram_stall = perf_cache_smem_if.dram_stall +
|
||||
perf_cache_icache_if.dram_stall +
|
||||
perf_cache_dcache_if.dram_stall;
|
||||
assign perf_cache_if.core_rsp_stall = perf_cache_smem_if.core_rsp_stall +
|
||||
perf_cache_icache_if.core_rsp_stall +
|
||||
perf_cache_dcache_if.core_rsp_stall;
|
||||
assign perf_cache_if.msrq_stall = perf_cache_smem_if.msrq_stall +
|
||||
perf_cache_icache_if.msrq_stall +
|
||||
perf_cache_dcache_if.msrq_stall;
|
||||
assign perf_cache_if.total_stall = perf_cache_smem_if.total_stall +
|
||||
perf_cache_icache_if.total_stall +
|
||||
perf_cache_dcache_if.total_stall;
|
||||
assign perf_cache_if.total_read = perf_cache_smem_if.total_read +
|
||||
perf_cache_icache_if.total_read +
|
||||
perf_cache_dcache_if.total_read;
|
||||
assign perf_cache_if.total_write = perf_cache_smem_if.total_write +
|
||||
perf_cache_icache_if.total_write +
|
||||
perf_cache_dcache_if.total_write;
|
||||
assign perf_cache_if.total_eviction = perf_cache_smem_if.total_eviction +
|
||||
perf_cache_icache_if.total_eviction +
|
||||
perf_cache_dcache_if.total_eviction;
|
||||
assign perf_cache_if.dram_latency = perf_cache_smem_if.dram_latency +
|
||||
perf_cache_icache_if.dram_latency +
|
||||
perf_cache_dcache_if.dram_latency;
|
||||
assign perf_cache_if.dram_rsp = perf_cache_smem_if.dram_rsp +
|
||||
perf_cache_icache_if.dram_rsp +
|
||||
perf_cache_dcache_if.dram_rsp;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -51,6 +51,10 @@ module VX_pipeline #(
|
|||
output wire[31:0] csr_io_rsp_data,
|
||||
input wire csr_io_rsp_ready,
|
||||
|
||||
// PERF: total reads
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_cache_if,
|
||||
`endif
|
||||
// Status
|
||||
output wire busy,
|
||||
output wire ebreak
|
||||
|
@ -171,6 +175,10 @@ module VX_pipeline #(
|
|||
VX_commit_if fpu_commit_if();
|
||||
VX_commit_if gpu_commit_if();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_stall_if perf_pipeline_stall_if();
|
||||
`endif
|
||||
|
||||
VX_fetch #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) fetch (
|
||||
|
@ -206,6 +214,10 @@ module VX_pipeline #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_pipeline_stall_if (perf_pipeline_stall_if),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
|
@ -224,7 +236,13 @@ module VX_pipeline #(
|
|||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
|
||||
// PERF: cache and pipeline-stall performance counters
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_cache_if),
|
||||
.perf_pipeline_stall_if (perf_pipeline_stall_if),
|
||||
`endif
|
||||
|
||||
.dcache_req_if (core_dcache_req_if),
|
||||
.dcache_rsp_if (core_dcache_rsp_if),
|
||||
|
||||
|
@ -272,4 +290,27 @@ module VX_pipeline #(
|
|||
.cmt_to_csr_if (cmt_to_csr_if)
|
||||
);
|
||||
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [63:0] perf_icache_stall; // cycles the icache request was valid but not ready
|
||||
reg [63:0] perf_ibuffer_stall; // cycles decode_if was valid but not ready
|
||||
always @ (posedge clk) begin
|
||||
if(reset) begin
|
||||
perf_icache_stall <= 0;
|
||||
perf_ibuffer_stall <= 0;
|
||||
end else begin
|
||||
// icache_stall: one count per clock in which core_icache_req_if holds valid without ready
|
||||
if (core_icache_req_if.valid & !core_icache_req_if.ready) begin
|
||||
perf_icache_stall <= perf_icache_stall + 64'd1;
|
||||
end
|
||||
// ibuffer_stall: decode_if == issue->ibuffer->ibuf_enq_if; one count per clock in which it holds valid without ready
|
||||
if(decode_if.valid & !decode_if.ready) begin
|
||||
perf_ibuffer_stall <= perf_ibuffer_stall + 64'd1;
|
||||
end
|
||||
end
|
||||
end
|
||||
assign perf_pipeline_stall_if.icache_stall = perf_icache_stall; // export running totals on the stall interface
|
||||
assign perf_pipeline_stall_if.ibuffer_stall = perf_ibuffer_stall;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -372,6 +372,11 @@ module Vortex (
|
|||
.core_rsp_tag (cluster_dram_rsp_tag),
|
||||
.core_rsp_ready (cluster_dram_rsp_ready),
|
||||
|
||||
// PERF: cache performance counters (marked unused at this level)
|
||||
`ifdef PERF_ENABLE
|
||||
`UNUSED_PIN (perf_cache_if),
|
||||
`endif
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (dram_req_valid),
|
||||
.dram_req_rw (dram_req_rw),
|
||||
|
|
21
hw/rtl/cache/VX_bank.v
vendored
21
hw/rtl/cache/VX_bank.v
vendored
|
@ -96,6 +96,15 @@ module VX_bank #(
|
|||
output wire [SNP_TAG_WIDTH-1:0] snp_rsp_tag,
|
||||
input wire snp_rsp_ready,
|
||||
|
||||
// PERF: per-bank event strobes (msrq stall, total stall, eviction, read miss, write miss)
|
||||
`ifdef PERF_ENABLE
|
||||
output wire perf_msrq_stall,
|
||||
output wire perf_total_stall,
|
||||
output wire perf_evict,
|
||||
output wire perf_read_miss,
|
||||
output wire perf_write_miss,
|
||||
`endif
|
||||
|
||||
// Misses
|
||||
output wire misses
|
||||
);
|
||||
|
@ -948,6 +957,18 @@ end
|
|||
`SCOPE_ASSIGN (addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID));
|
||||
`SCOPE_ASSIGN (addr_st3, `LINE_TO_BYTE_ADDR(addr_st3, BANK_ID));
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_total_stall = pipeline_stall; // high on every cycle this bank's pipeline_stall is asserted
|
||||
assign perf_msrq_stall = mshr_push_stall; // high on every cycle mshr_push_stall is asserted
|
||||
assign perf_read_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & !mem_rw_st1; // stage-1 miss, not stalled, not flagged is_mshr_st1, mem_rw_st1 low
|
||||
assign perf_write_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & mem_rw_st1; // same qualification with mem_rw_st1 high
|
||||
if (DRAM_ENABLE) begin
|
||||
assign perf_evict = dwbq_push & do_writeback_st3 & !is_snp_st3; // dwbq push for a stage-3 writeback that is not flagged as a snoop
|
||||
end else begin
|
||||
assign perf_evict = 0; // eviction strobe is tied low when DRAM_ENABLE is 0
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef DBG_PRINT_CACHE_BANK
|
||||
wire incoming_fill_dfp_st3 = dram_rsp_fire && (addr_st3 == dram_rsp_addr);
|
||||
always @(posedge clk) begin
|
||||
|
|
171
hw/rtl/cache/VX_cache.v
vendored
171
hw/rtl/cache/VX_cache.v
vendored
|
@ -70,7 +70,12 @@ module VX_cache #(
|
|||
output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
|
||||
output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
|
||||
input wire core_rsp_ready,
|
||||
|
||||
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_cache_if,
|
||||
`endif
|
||||
|
||||
// DRAM request
|
||||
output wire dram_req_valid,
|
||||
output wire dram_req_rw,
|
||||
|
@ -130,7 +135,16 @@ module VX_cache #(
|
|||
|
||||
wire [NUM_BANKS-1:0] per_bank_miss;
|
||||
assign miss_vec = per_bank_miss;
|
||||
|
||||
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [NUM_BANKS-1:0] perf_msrq_stall_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_total_stall_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_evict_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
|
||||
`endif
|
||||
|
||||
if (NUM_BANKS == 1) begin
|
||||
assign snp_req_ready = per_bank_snp_req_ready;
|
||||
end else begin
|
||||
|
@ -311,6 +325,15 @@ module VX_cache #(
|
|||
.dram_rsp_addr (curr_bank_dram_rsp_addr),
|
||||
.dram_rsp_ready (curr_bank_dram_rsp_ready),
|
||||
|
||||
// PERF: per-bank event strobes (msrq stall, total stall, eviction, read miss, write miss)
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_msrq_stall (perf_msrq_stall_per_bank[i]),
|
||||
.perf_total_stall (perf_total_stall_per_bank[i]),
|
||||
.perf_evict (perf_evict_per_bank[i]),
|
||||
.perf_read_miss (perf_read_miss_per_bank[i]),
|
||||
.perf_write_miss (perf_write_miss_per_bank[i]),
|
||||
`endif
|
||||
|
||||
// Snoop request
|
||||
.snp_req_valid (curr_bank_snp_req_valid),
|
||||
.snp_req_addr (curr_bank_snp_req_addr),
|
||||
|
@ -407,4 +430,148 @@ module VX_cache #(
|
|||
`UNUSED_VAR (snp_rsp_ready)
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
// per cycle: core_req_r, core_req_w
|
||||
reg[($clog2(NUM_REQS+1)-1):0] perf_core_req_r_per_cycle, perf_core_req_w_per_cycle;
|
||||
if (CORE_TAG_ID_BITS != 0) begin // core_req_rw is 1-bit wide
|
||||
VX_countones #( // core_req_r
|
||||
.N(NUM_REQS)
|
||||
) perf_countones_core_req_r_count (
|
||||
.valids (core_req_valid & {NUM_REQS{(~core_req_rw) & core_req_ready}}),
|
||||
.count (perf_core_req_r_per_cycle)
|
||||
);
|
||||
VX_countones #( // core_req_w
|
||||
.N(NUM_REQS)
|
||||
) perf_countones_core_req_w_count (
|
||||
.valids (core_req_valid & {NUM_REQS{(core_req_rw) & core_req_ready}}),
|
||||
.count (perf_core_req_w_per_cycle)
|
||||
);
|
||||
end else begin // core_req_rw is NUM_REQS-bit wide
|
||||
VX_countones #( // core_req_r
|
||||
.N(NUM_REQS)
|
||||
) perf_countones_core_req_r_count (
|
||||
.valids (core_req_valid & (~core_req_rw) & {NUM_REQS{core_req_ready}}),
|
||||
.count (perf_core_req_r_per_cycle)
|
||||
);
|
||||
VX_countones #( // core_req_w
|
||||
.N(NUM_REQS)
|
||||
) perf_countones_core_req_w_count (
|
||||
.valids (core_req_valid & (core_req_rw) & {NUM_REQS{core_req_ready}}),
|
||||
.count (perf_core_req_w_per_cycle)
|
||||
);
|
||||
end
|
||||
// per cycle: dram_latency -- up/down count of accepted DRAM read requests minus accepted DRAM responses;
// the accumulator below adds this value to perf_dram_lat on every clock
|
||||
reg[63:0] perf_dram_lat_per_cycle;
|
||||
always@(posedge clk) begin
|
||||
if(reset) begin
|
||||
perf_dram_lat_per_cycle <= 0;
|
||||
end else begin
|
||||
if(dram_req_valid & (~dram_req_rw) & dram_req_ready & dram_rsp_valid & dram_rsp_ready) begin // read request and response accepted in the same cycle
|
||||
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle; // +1 and -1 cancel, count is held
|
||||
end else if(dram_req_valid & (~dram_req_rw) & dram_req_ready) begin // read request accepted (dram_req_rw low)
|
||||
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + 64'd1;
|
||||
end else if(dram_rsp_valid & dram_rsp_ready) begin // response accepted
|
||||
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle - 64'd1;
|
||||
end
|
||||
end
|
||||
end
|
||||
// per cycle: msrq stalls, total stalls, total eviction, read miss, write miss
|
||||
reg [($clog2(NUM_BANKS+1)-1):0] perf_msrq_stall_per_cycle;
|
||||
reg [($clog2(NUM_BANKS+1)-1):0] perf_total_stall_per_cycle;
|
||||
reg [($clog2(NUM_BANKS+1)-1):0] perf_total_eviction_per_cycle;
|
||||
reg [($clog2(NUM_BANKS+1)-1):0] perf_read_miss_per_cycle;
|
||||
reg [($clog2(NUM_BANKS+1)-1):0] perf_write_miss_per_cycle;
|
||||
VX_countones #(
|
||||
.N(NUM_BANKS)
|
||||
) perf_countones_msrq_stall_count (
|
||||
.valids (perf_msrq_stall_per_bank),
|
||||
.count (perf_msrq_stall_per_cycle)
|
||||
);
|
||||
VX_countones #(
|
||||
.N(NUM_BANKS)
|
||||
) perf_countones_total_stall_count (
|
||||
.valids (perf_total_stall_per_bank),
|
||||
.count (perf_total_stall_per_cycle)
|
||||
);
|
||||
VX_countones #(
|
||||
.N(NUM_BANKS)
|
||||
) perf_countones_total_evict_count (
|
||||
.valids (perf_evict_per_bank),
|
||||
.count (perf_total_eviction_per_cycle)
|
||||
);
|
||||
VX_countones #(
|
||||
.N(NUM_BANKS)
|
||||
) perf_countones_read_miss_count (
|
||||
.valids (perf_read_miss_per_bank),
|
||||
.count (perf_read_miss_per_cycle)
|
||||
);
|
||||
VX_countones #(
|
||||
.N(NUM_BANKS)
|
||||
) perf_countones_write_miss_count (
|
||||
.valids (perf_write_miss_per_bank),
|
||||
.count (perf_write_miss_per_cycle)
|
||||
);
|
||||
reg [63:0] perf_core_req_r, perf_core_req_w;
|
||||
reg [63:0] perf_dram_lat, perf_dram_rsp;
|
||||
reg [63:0] perf_msrq_stall;
|
||||
reg [63:0] perf_total_stall;
|
||||
reg [63:0] perf_total_eviction;
|
||||
reg [63:0] perf_read_miss, perf_write_miss;
|
||||
reg [63:0] perf_core_rsp_stall, perf_dram_stall;
|
||||
always @ (posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_core_req_r <= 0;
|
||||
perf_core_req_w <= 0;
|
||||
perf_dram_lat <= 0;
|
||||
perf_dram_rsp <= 0;
|
||||
perf_msrq_stall <= 0;
|
||||
perf_total_stall <= 0;
|
||||
perf_total_eviction <= 0;
|
||||
perf_read_miss <= 0;
|
||||
perf_write_miss <= 0;
|
||||
perf_core_rsp_stall <= 0;
|
||||
perf_dram_stall <= 0;
|
||||
end else begin
|
||||
// core_req_r, core_req_w
|
||||
perf_core_req_r <= perf_core_req_r + $bits(perf_core_req_r)'(perf_core_req_r_per_cycle);
|
||||
perf_core_req_w <= perf_core_req_w + $bits(perf_core_req_w)'(perf_core_req_w_per_cycle);
|
||||
// dram_latency
|
||||
perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle;
|
||||
if (dram_rsp_valid & dram_rsp_ready) begin
|
||||
perf_dram_rsp <= perf_dram_rsp + 64'd1;
|
||||
end
|
||||
// miss reserve queue stalls: from bank-> mshr_push_stall
|
||||
perf_msrq_stall <= perf_msrq_stall + $bits(perf_msrq_stall)'(perf_msrq_stall_per_cycle);
|
||||
// total stalls: from bank->pipeline_stall
|
||||
perf_total_stall <= perf_total_stall + $bits(perf_total_stall)'(perf_total_stall_per_cycle);
|
||||
// total eviction: from bank-> dwbq_push & do_writeback_st3 & !is_snp_st3
|
||||
perf_total_eviction <= perf_total_eviction + $bits(perf_total_eviction)'(perf_total_eviction_per_cycle);
|
||||
// read miss: from bank-> !pipeline_stall & miss_st1 & !is_mshr_st1 & !mem_rw_st1
|
||||
perf_read_miss <= perf_read_miss + $bits(perf_read_miss)'(perf_read_miss_per_cycle);
|
||||
// write miss: from bank-> !pipeline_stall & miss_st1 & !is_mshr_st1 & mem_rw_st1
|
||||
perf_write_miss <= perf_write_miss + $bits(perf_write_miss)'(perf_write_miss_per_cycle);
|
||||
// core_rsp_stall
|
||||
if ((| core_rsp_valid) & !core_rsp_ready) begin
|
||||
perf_core_rsp_stall <= perf_core_rsp_stall + 64'd1;
|
||||
end
|
||||
// dram_stall
|
||||
if (dram_req_valid & !dram_req_ready) begin
|
||||
perf_dram_stall <= perf_dram_stall + 64'd1;
|
||||
end
|
||||
end
|
||||
end
|
||||
assign perf_cache_if.total_read = perf_core_req_r;
|
||||
assign perf_cache_if.total_write = perf_core_req_w;
|
||||
assign perf_cache_if.dram_latency = perf_dram_lat;
|
||||
assign perf_cache_if.dram_rsp = perf_dram_rsp;
|
||||
assign perf_cache_if.msrq_stall = perf_msrq_stall;
|
||||
assign perf_cache_if.total_stall = perf_total_stall;
|
||||
assign perf_cache_if.total_eviction = perf_total_eviction;
|
||||
assign perf_cache_if.read_miss = perf_read_miss;
|
||||
assign perf_cache_if.write_miss = perf_write_miss;
|
||||
assign perf_cache_if.core_rsp_stall = perf_core_rsp_stall;
|
||||
assign perf_cache_if.dram_stall = perf_dram_stall;
|
||||
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
22
hw/rtl/interfaces/VX_perf_cache_if.v
Normal file
22
hw/rtl/interfaces/VX_perf_cache_if.v
Normal file
|
@ -0,0 +1,22 @@
|
|||
`ifndef VX_PERF_CACHE_IF
|
||||
`define VX_PERF_CACHE_IF
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_perf_cache_if (); // bundle of 64-bit cache performance counters; descriptions below follow how VX_cache drives them
|
||||
|
||||
wire [63:0] read_miss; // per-bank read-miss strobes, summed over all banks each cycle
|
||||
wire [63:0] write_miss; // per-bank write-miss strobes, summed over all banks each cycle
|
||||
wire [63:0] dram_stall; // cycles with dram_req_valid high while dram_req_ready is low
|
||||
wire [63:0] dram_rsp; // DRAM responses accepted (dram_rsp_valid & dram_rsp_ready)
|
||||
wire [63:0] core_rsp_stall; // cycles with any core_rsp_valid bit set while core_rsp_ready is low
|
||||
wire [63:0] msrq_stall; // per-bank mshr_push_stall cycles, summed over all banks
|
||||
wire [63:0] total_stall; // per-bank pipeline_stall cycles, summed over all banks
|
||||
wire [63:0] total_read; // core read requests counted while core_req_ready is high
|
||||
wire [63:0] total_write; // core write requests counted while core_req_ready is high
|
||||
wire [63:0] total_eviction; // per-bank eviction strobes, summed over all banks
|
||||
wire [63:0] dram_latency; // running sum, added every cycle, of DRAM reads accepted but not yet answered
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
25
hw/rtl/interfaces/VX_perf_pipeline_stall_if.v
Normal file
25
hw/rtl/interfaces/VX_perf_pipeline_stall_if.v
Normal file
|
@ -0,0 +1,25 @@
|
|||
`ifndef VX_PERF_PIPELINE_STALL_IF
|
||||
`define VX_PERF_PIPELINE_STALL_IF
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_perf_pipeline_stall_if (); // bundle of 64-bit pipeline stall counters
|
||||
// from pipeline (both are driven in VX_pipeline)
|
||||
wire [63:0] icache_stall; // cycles core_icache_req_if was valid but not ready
|
||||
wire [63:0] ibuffer_stall; // cycles decode_if was valid but not ready
|
||||
// from issue (driver not visible here -- TODO confirm exact stall condition)
|
||||
wire [63:0] scoreboard_stall;
|
||||
// from execute (drivers not visible here -- presumably one counter per execution unit; TODO confirm)
|
||||
wire [63:0] lsu_stall;
|
||||
wire [63:0] csr_stall;
|
||||
wire [63:0] alu_stall;
|
||||
wire [63:0] gpu_stall;
|
||||
`ifdef EXT_M_ENABLE
|
||||
wire [63:0] mul_stall; // only declared when EXT_M_ENABLE is defined
|
||||
`endif
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [63:0] fpu_stall; // only declared when EXT_F_ENABLE is defined
|
||||
`endif
|
||||
endinterface
|
||||
|
||||
`endif
|
Loading…
Add table
Add a link
Reference in a new issue