PERF pipeline stalls and cache

This commit is contained in:
Xandy Liu 2020-12-08 01:14:41 -05:00
parent 0d0706411d
commit 1595ff08e2
17 changed files with 843 additions and 5 deletions

View file

@ -6,6 +6,11 @@ all:
$(MAKE) -C simX
$(MAKE) -C benchmarks/opencl
perf-demo:
$(MAKE) -C hw
$(MAKE) -C driver rtlsim
$(MAKE) -C driver/tests/demo/ run-rtlsim
clean:
$(MAKE) -C hw clean
$(MAKE) -C driver clean

View file

@ -62,7 +62,8 @@ int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value);
// get device constant registers
int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value);
// get device constant registers (64 bit long int)
int vx_csr_get_l(vx_device_h hdevice, int core_id, int addr, int addr_h, uint64_t* value);
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
// upload kernel bytes to device

View file

@ -243,6 +243,32 @@ extern int vx_dev_close(vx_device_h hdevice) {
vx_csr_get(hdevice, 0, CSR_NC, &num_cores);
if (num_cores > 1) {
uint64_t total_instrs = 0, total_cycles = 0;
// -------------------------
#ifdef PERF_ENABLE
// PERF: cache
uint64_t total_r = 0;
uint64_t total_w = 0;
uint64_t dram_st = 0;
uint64_t dram_lat = 0;
uint64_t dram_rsp = 0;
uint64_t msrq_st = 0;
uint64_t total_st = 0;
uint64_t r_miss = 0;
uint64_t w_miss = 0;
uint64_t core_rsp_st = 0;
uint64_t total_evict = 0;
// PERF: pipeline stalls
uint64_t lsu_stall = 0;
uint64_t fpu_stall = 0;
uint64_t mul_stall = 0;
uint64_t csr_stall = 0;
uint64_t alu_stall = 0;
uint64_t gpu_stall = 0;
uint64_t ibuffer_stall = 0;
uint64_t scoreboard_stall = 0;
uint64_t icache_stall = 0;
#endif
// -------------------------
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t instrs, cycles;
vx_get_perf(hdevice, core_id, &instrs, &cycles);
@ -250,14 +276,235 @@ extern int vx_dev_close(vx_device_h hdevice) {
fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
total_instrs += instrs;
total_cycles = std::max<uint64_t>(total_cycles, cycles);
#ifdef PERF_ENABLE
// PERF: cache
// total_read
uint64_t total_r_per_core;
vx_csr_get_l(hdevice, core_id, CSR_TOTAL_R, CSR_TOTAL_R_H, &total_r_per_core);
fprintf(stdout, "PERF: \t\ttotal_reads_per_core=%ld\n", total_r_per_core);
total_r += total_r_per_core;
// total_write
uint64_t total_w_per_core;
vx_csr_get_l(hdevice, core_id, CSR_TOTAL_W, CSR_TOTAL_W_H, &total_w_per_core);
fprintf(stdout, "PERF: \t\ttotal_writes_per_core=%ld\n", total_w_per_core);
total_w += total_w_per_core;
// dram_stall
uint64_t dram_st_per_core;
vx_csr_get_l(hdevice, core_id, CSR_DRAM_ST, CSR_DRAM_ST_H, &dram_st_per_core);
fprintf(stdout, "PERF: \t\tdram_stalls_per_core=%ld\n", dram_st_per_core);
dram_st += dram_st_per_core;
// dram_latency
uint64_t dram_lat_per_core, dram_rsp_per_core;
vx_csr_get_l(hdevice, core_id, CSR_DRAM_LAT, CSR_DRAM_LAT_H, &dram_lat_per_core);
vx_csr_get_l(hdevice, core_id, CSR_DRAM_RSP, CSR_DRAM_RSP_H, &dram_rsp_per_core);
fprintf(stdout, "PERF: \t\tdram_latency_per_core=%ld\n", dram_lat_per_core);
fprintf(stdout, "PERF: \t\tdram_response_per_core=%ld\n", dram_rsp_per_core);
dram_lat += dram_lat_per_core;
dram_rsp += dram_rsp_per_core;
float dram_lat_per_rsp_per_core = (float)(double(dram_lat_per_core) / double(dram_rsp_per_core));
fprintf(stdout, "PERF: \t\tdram_latency_per_response_per_core=%f\n", dram_lat_per_rsp_per_core);
// miss_reserve_queue_stall
uint64_t msrq_st_per_core;
vx_csr_get_l(hdevice, core_id, CSR_MSRQ_ST, CSR_MSRQ_ST_H, &msrq_st_per_core);
fprintf(stdout, "PERF: \t\tmsrq_stalls_per_core=%ld\n", msrq_st_per_core);
msrq_st += msrq_st_per_core;
// total_stall
uint64_t total_st_per_core;
vx_csr_get_l(hdevice, core_id, CSR_TOTAL_ST, CSR_TOTAL_ST_H, &total_st_per_core);
fprintf(stdout, "PERF: \t\ttotal_stalls_per_core=%ld\n", total_st_per_core);
total_st += total_st_per_core;
// read_miss
uint64_t r_miss_per_core;
vx_csr_get_l(hdevice, core_id, CSR_R_MISS, CSR_R_MISS_H, &r_miss_per_core);
fprintf(stdout, "PERF: \t\tread_misses_per_core=%ld\n", r_miss_per_core);
r_miss += r_miss_per_core;
// write_miss
uint64_t w_miss_per_core;
vx_csr_get_l(hdevice, core_id, CSR_W_MISS, CSR_W_MISS_H, &w_miss_per_core);
fprintf(stdout, "PERF: \t\twrite_misses_per_core=%ld\n", w_miss_per_core);
w_miss += w_miss_per_core;
// core_rsp_stalls
uint64_t core_rsp_st_per_core;
vx_csr_get_l(hdevice, core_id, CSR_CORE_RSP_ST, CSR_CORE_RSP_ST_H, &core_rsp_st_per_core);
fprintf(stdout, "PERF: \t\tcore_rsp_stalls_per_core=%ld\n", core_rsp_st_per_core);
core_rsp_st += core_rsp_st_per_core;
// total_evictions
uint64_t total_evict_per_core;
vx_csr_get_l(hdevice, core_id, CSR_TOTAL_EV, CSR_TOTAL_EV_H, &total_evict_per_core);
fprintf(stdout, "PERF: \t\ttotal_evictions_per_core=%ld\n", total_evict_per_core);
total_evict += total_evict_per_core;
// PERF: pipeline stall
// lsu_stall
uint64_t lsu_stall_per_core;
vx_csr_get_l(hdevice, core_id, CSR_LSU_ST, CSR_LSU_ST_H, &lsu_stall_per_core);
fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall_per_core);
lsu_stall += lsu_stall_per_core;
// fpu_stall
uint64_t fpu_stall_per_core;
vx_csr_get_l(hdevice, core_id, CSR_FPU_ST, CSR_FPU_ST_H, &fpu_stall_per_core);
fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall_per_core);
fpu_stall += fpu_stall_per_core;
// mul_stall
uint64_t mul_stall_per_core;
vx_csr_get_l(hdevice, core_id, CSR_MUL_ST, CSR_MUL_ST_H, &mul_stall_per_core);
fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall_per_core);
mul_stall += mul_stall_per_core;
// csr_stall
uint64_t csr_stall_per_core;
vx_csr_get_l(hdevice, core_id, CSR_CSR_ST, CSR_CSR_ST_H, &csr_stall_per_core);
fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall_per_core);
csr_stall += csr_stall_per_core;
// alu_stall
uint64_t alu_stall_per_core;
vx_csr_get_l(hdevice, core_id, CSR_ALU_ST, CSR_ALU_ST_H, &alu_stall_per_core);
fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall_per_core);
alu_stall += alu_stall_per_core;
// gpu_stall
uint64_t gpu_stall_per_core;
vx_csr_get_l(hdevice, core_id, CSR_GPU_ST, CSR_GPU_ST_H, &gpu_stall_per_core);
fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall_per_core);
gpu_stall += gpu_stall_per_core;
// ibuffer_stall
uint64_t ibuffer_stall_per_core;
vx_csr_get_l(hdevice, core_id, CSR_IBUF_ST, CSR_IBUF_ST_H, &ibuffer_stall_per_core);
fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall_per_core);
ibuffer_stall += ibuffer_stall_per_core;
// scoreboard_stall
uint64_t scoreboard_stall_per_core;
vx_csr_get_l(hdevice, core_id, CSR_SCRBRD_ST, CSR_SCRBRD_ST_H, &scoreboard_stall_per_core);
fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall_per_core);
scoreboard_stall += scoreboard_stall_per_core;
// icache_stall
uint64_t icache_stall_per_core;
vx_csr_get_l(hdevice, core_id, CSR_ICACHE_ST, CSR_ICACHE_ST_H, &icache_stall_per_core);
fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall_per_core);
icache_stall += icache_stall_per_core;
#endif
// -------------------------
}
float IPC = (float)(double(total_instrs) / double(total_cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
#ifdef PERF_ENABLE
// PERF: cache
fprintf(stdout, "PERF: \t\ttotal_reads=%ld\n", total_r);
fprintf(stdout, "PERF: \t\ttotal_writes=%ld\n", total_w);
fprintf(stdout, "PERF: \t\tdram_stalls=%ld\n", dram_st);
fprintf(stdout, "PERF: \t\tdram_latency=%ld\n", dram_lat);
fprintf(stdout, "PERF: \t\tdram_response=%ld\n", dram_rsp);
float dram_lat_per_rsp = (float)(double(dram_lat) / double(dram_rsp));
fprintf(stdout, "PERF: \t\tdram_latency_per_response=%f\n", dram_lat_per_rsp);
fprintf(stdout, "PERF: \t\tmsrq_stalls=%ld\n", msrq_st);
fprintf(stdout, "PERF: \t\ttotal_stalls=%ld\n", total_st);
fprintf(stdout, "PERF: \t\tread_misses=%ld\n", r_miss);
fprintf(stdout, "PERF: \t\twrite_misses=%ld\n", w_miss);
fprintf(stdout, "PERF: \t\tcore_rsp_stalls=%ld\n", core_rsp_st);
fprintf(stdout, "PERF: \t\ttotal_evictions=%ld\n", total_evict);
// PERF: pipeline stall
fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall);
fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall);
fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall);
fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall);
fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall);
fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall);
fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall);
fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall);
fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall);
#endif
// -------------------------
} else {
uint64_t instrs, cycles;
vx_get_perf(hdevice, 0, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
#ifdef PERF_ENABLE
// PERF: cache
// total_read
uint64_t total_r;
vx_csr_get_l(hdevice, 0, CSR_TOTAL_R, CSR_TOTAL_R_H, &total_r);
fprintf(stdout, "PERF: \t\ttotal_reads=%ld\n", total_r);
// total_write
uint64_t total_w;
vx_csr_get_l(hdevice, 0, CSR_TOTAL_W, CSR_TOTAL_W_H, &total_w);
fprintf(stdout, "PERF: \t\ttotal_writes=%ld\n", total_w);
// dram_stall
uint64_t dram_st;
vx_csr_get_l(hdevice, 0, CSR_DRAM_ST, CSR_DRAM_ST_H, &dram_st);
fprintf(stdout, "PERF: \t\tdram_stalls=%ld\n", dram_st);
// dram_latency
uint64_t dram_lat, dram_rsp;
vx_csr_get_l(hdevice, 0, CSR_DRAM_LAT, CSR_DRAM_LAT_H, &dram_lat);
vx_csr_get_l(hdevice, 0, CSR_DRAM_RSP, CSR_DRAM_RSP_H, &dram_rsp);
float dram_lat_per_rsp = (float)(double(dram_lat) / double(dram_rsp));
fprintf(stdout, "PERF: \t\tdram_latency=%ld\n", dram_lat);
fprintf(stdout, "PERF: \t\tdram_response=%ld\n", dram_rsp);
fprintf(stdout, "PERF: \t\tdram_latency_per_response=%f\n", dram_lat_per_rsp);
// miss_reserve_queue_stall
uint64_t msrq_st;
vx_csr_get_l(hdevice, 0, CSR_MSRQ_ST, CSR_MSRQ_ST_H, &msrq_st);
fprintf(stdout, "PERF: \t\tmsrq_stalls=%ld\n", msrq_st);
// total_stall
uint64_t total_st;
vx_csr_get_l(hdevice, 0, CSR_TOTAL_ST, CSR_TOTAL_ST_H, &total_st);
fprintf(stdout, "PERF: \t\ttotal_stalls=%ld\n", total_st);
// read_miss
uint64_t r_miss;
vx_csr_get_l(hdevice, 0, CSR_R_MISS, CSR_R_MISS_H, &r_miss);
fprintf(stdout, "PERF: \t\tread_misses=%ld\n", r_miss);
// write_miss
uint64_t w_miss;
vx_csr_get_l(hdevice, 0, CSR_W_MISS, CSR_W_MISS_H, &w_miss);
fprintf(stdout, "PERF: \t\twrite_misses=%ld\n", w_miss);
// core_rsp_stalls
uint64_t core_rsp_st;
vx_csr_get_l(hdevice, 0, CSR_CORE_RSP_ST, CSR_CORE_RSP_ST_H, &core_rsp_st);
fprintf(stdout, "PERF: \t\tcore_rsp_stalls=%ld\n", core_rsp_st);
// total_evictions
uint64_t total_evict;
vx_csr_get_l(hdevice, 0, CSR_TOTAL_EV, CSR_TOTAL_EV_H, &total_evict);
fprintf(stdout, "PERF: \t\ttotal_evictions=%ld\n", total_evict);
// PERF: pipeline stalls
// TODO:
// lsu_stall
uint64_t lsu_stall;
vx_csr_get_l(hdevice, 0, CSR_LSU_ST, CSR_LSU_ST_H, &lsu_stall);
fprintf(stdout, "PERF: \t\tlsu_stall=%ld\n", lsu_stall);
// fpu_stall
uint64_t fpu_stall;
vx_csr_get_l(hdevice, 0, CSR_FPU_ST, CSR_FPU_ST_H, &fpu_stall);
fprintf(stdout, "PERF: \t\tfpu_stall=%ld\n", fpu_stall);
// mul_stall
uint64_t mul_stall;
vx_csr_get_l(hdevice, 0, CSR_MUL_ST, CSR_MUL_ST_H, &mul_stall);
fprintf(stdout, "PERF: \t\tmul_stall=%ld\n", mul_stall);
// csr_stall
uint64_t csr_stall;
vx_csr_get_l(hdevice, 0, CSR_CSR_ST, CSR_CSR_ST_H, &csr_stall);
fprintf(stdout, "PERF: \t\tcsr_stall=%ld\n", csr_stall);
// alu_stall
uint64_t alu_stall;
vx_csr_get_l(hdevice, 0, CSR_ALU_ST, CSR_ALU_ST_H, &alu_stall);
fprintf(stdout, "PERF: \t\talu_stall=%ld\n", alu_stall);
// gpu_stall
uint64_t gpu_stall;
vx_csr_get_l(hdevice, 0, CSR_GPU_ST, CSR_GPU_ST_H, &gpu_stall);
fprintf(stdout, "PERF: \t\tgpu_stall=%ld\n", gpu_stall);
// ibuffer_stall
uint64_t ibuffer_stall;
vx_csr_get_l(hdevice, 0, CSR_IBUF_ST, CSR_IBUF_ST_H, &ibuffer_stall);
fprintf(stdout, "PERF: \t\tibuffer_stall=%ld\n", ibuffer_stall);
// scoreboard_stall
uint64_t scoreboard_stall;
vx_csr_get_l(hdevice, 0, CSR_SCRBRD_ST, CSR_SCRBRD_ST_H, &scoreboard_stall);
fprintf(stdout, "PERF: \t\tscoreboard_stall=%ld\n", scoreboard_stall);
// icache_stall
uint64_t icache_stall;
vx_csr_get_l(hdevice, 0, CSR_ICACHE_ST, CSR_ICACHE_ST_H, &icache_stall);
fprintf(stdout, "PERF: \t\ticache_stall=%ld\n", icache_stall);
#endif
// -------------------------
}
#endif
@ -386,4 +633,16 @@ extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* valu
vx_device *device = ((vx_device*)hdevice);
return device->get_csr(core_id, addr, value);
}
// Read a 64-bit performance counter exposed as two 32-bit CSRs.
//   addr   - CSR address of the low 32 bits
//   addr_h - CSR address of the high 32 bits
// Returns 0 on success, -1 on invalid arguments, or the first non-zero
// status returned by vx_csr_get. On failure *value is left untouched.
// NOTE(review): the high half is read before the low half; if the counter
// can tick between the two reads, a low-half rollover would tear the
// value — confirm counters are frozen during readout or add a re-read loop.
extern int vx_csr_get_l(vx_device_h hdevice, int core_id, int addr, int addr_h, uint64_t* value) {
    if (nullptr == hdevice || nullptr == value)
        return -1;
    unsigned value_hi, value_lo;
    // Propagate CSR read failures instead of silently returning garbage.
    int ret = vx_csr_get(hdevice, core_id, addr_h, &value_hi);
    if (ret != 0)
        return ret;
    ret = vx_csr_get(hdevice, core_id, addr, &value_lo);
    if (ret != 0)
        return ret;
    *value = (uint64_t(value_hi) << 32) | value_lo;
    return 0;
}

View file

@ -369,6 +369,11 @@ module VX_cluster #(
.core_rsp_tag (core_dram_rsp_tag),
.core_rsp_ready (core_dram_rsp_ready),
// PERF: total read
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_cache_if),
`endif
// DRAM request
.dram_req_valid (dram_req_valid),
.dram_req_rw (dram_req_rw),

View file

@ -124,6 +124,12 @@
`define LATENCY_FCONV 3
`endif
///////////////////////////////////////
`ifndef PERF_ENABLE
`define PERF_ENABLE
`endif
///////////////////////////////////////
// CSR Addresses //////////////////////////////////////////////////////////////
`define CSR_FFLAGS 12'h001
@ -139,6 +145,52 @@
`define CSR_NW 12'h026
`define CSR_NC 12'h027
// PERF: cache
`define CSR_R_MISS 12'h030 // read misses
`define CSR_R_MISS_H 12'h031
`define CSR_W_MISS 12'h032 // write misses
`define CSR_W_MISS_H 12'h033
`define CSR_DRAM_ST 12'h034 // dram stalls
`define CSR_DRAM_ST_H 12'h035
`define CSR_CORE_RSP_ST 12'h036 // core_rsp stalls
`define CSR_CORE_RSP_ST_H 12'h037
`define CSR_MSRQ_ST 12'h038 // miss reserve queue stalls
`define CSR_MSRQ_ST_H 12'h039
`define CSR_TOTAL_ST 12'h03A // total stalls
`define CSR_TOTAL_ST_H 12'h03B
`define CSR_TOTAL_R 12'h03C // total reads
`define CSR_TOTAL_R_H 12'h03D
`define CSR_TOTAL_W 12'h03E // total writes
`define CSR_TOTAL_W_H 12'h03F
`define CSR_TOTAL_EV 12'h040 // total evictions
`define CSR_TOTAL_EV_H 12'h041
`define CSR_DRAM_LAT 12'h042 // dram latency (total)
`define CSR_DRAM_LAT_H 12'h043
`define CSR_DRAM_RSP 12'h044 // dram responses
`define CSR_DRAM_RSP_H 12'h045
// PERF: pipeline stalls
`define CSR_FPU_ST 12'h046
`define CSR_FPU_ST_H 12'h047
`define CSR_MUL_ST 12'h048
`define CSR_MUL_ST_H 12'h049
`define CSR_CSR_ST 12'h04A
`define CSR_CSR_ST_H 12'h04B
`define CSR_ALU_ST 12'h04C
`define CSR_ALU_ST_H 12'h04D
`define CSR_GPU_ST 12'h04E
`define CSR_GPU_ST_H 12'h04F
`define CSR_LSU_ST 12'h050
`define CSR_LSU_ST_H 12'h051
`define CSR_IBUF_ST 12'h052
`define CSR_IBUF_ST_H 12'h053
`define CSR_SCRBRD_ST 12'h054
`define CSR_SCRBRD_ST_H 12'h055
`define CSR_ICACHE_ST 12'h056
`define CSR_ICACHE_ST_H 12'h057
//////////////////////////////////////////////////////////////
`define CSR_SATP 12'h180
`define CSR_PMPCFG0 12'h3A0

View file

@ -66,6 +66,10 @@ module VX_core #(
output wire busy,
output wire ebreak
);
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if();
`endif
VX_cache_dram_req_if #(
.DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH),
.DRAM_ADDR_WIDTH(`DDRAM_ADDR_WIDTH),
@ -220,6 +224,11 @@ module VX_core #(
.csr_io_rsp_data (csr_io_rsp_data),
.csr_io_rsp_ready (csr_io_rsp_ready),
// PERF: total reads
`ifdef PERF_ENABLE
.perf_cache_if (perf_cache_if),
`endif
// Status
.busy(busy),
.ebreak(ebreak)
@ -238,6 +247,11 @@ module VX_core #(
// Core <-> Dcache
.core_dcache_req_if (core_dcache_req_if),
.core_dcache_rsp_if (core_dcache_rsp_if),
// PERF: total reads
`ifdef PERF_ENABLE
.perf_cache_if (perf_cache_if),
`endif
// Core <-> Icache
.core_icache_req_if (core_icache_req_if),

View file

@ -6,6 +6,12 @@ module VX_csr_data #(
input wire clk,
input wire reset,
// PERF: total reads
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
VX_perf_pipeline_stall_if perf_pipeline_stall_if,
`endif
VX_cmt_to_csr_if cmt_to_csr_if,
VX_fpu_to_csr_if fpu_to_csr_if,
@ -114,6 +120,51 @@ module VX_csr_data #(
`CSR_NW : read_data_r = `NUM_WARPS;
`CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS;
`ifdef PERF_ENABLE
// PERF: cache
`CSR_R_MISS : read_data_r = perf_cache_if.read_miss[31:0];
`CSR_R_MISS_H : read_data_r = perf_cache_if.read_miss[63:32];
`CSR_W_MISS : read_data_r = perf_cache_if.write_miss[31:0];
`CSR_W_MISS_H : read_data_r = perf_cache_if.write_miss[63:32];
`CSR_DRAM_ST : read_data_r = perf_cache_if.dram_stall[31:0];
`CSR_DRAM_ST_H : read_data_r = perf_cache_if.dram_stall[63:32];
`CSR_CORE_RSP_ST : read_data_r = perf_cache_if.core_rsp_stall[31:0];
`CSR_CORE_RSP_ST_H: read_data_r = perf_cache_if.core_rsp_stall[63:32];
`CSR_MSRQ_ST : read_data_r = perf_cache_if.msrq_stall[31:0];
`CSR_MSRQ_ST_H : read_data_r = perf_cache_if.msrq_stall[63:32];
`CSR_TOTAL_ST : read_data_r = perf_cache_if.total_stall[31:0];
`CSR_TOTAL_ST_H : read_data_r = perf_cache_if.total_stall[63:32];
`CSR_TOTAL_R : read_data_r = perf_cache_if.total_read[31:0];
`CSR_TOTAL_R_H : read_data_r = perf_cache_if.total_read[63:32];
`CSR_TOTAL_W : read_data_r = perf_cache_if.total_write[31:0];
`CSR_TOTAL_W_H : read_data_r = perf_cache_if.total_write[63:32];
`CSR_TOTAL_EV : read_data_r = perf_cache_if.total_eviction[31:0];
`CSR_TOTAL_EV_H : read_data_r = perf_cache_if.total_eviction[63:32];
`CSR_DRAM_LAT : read_data_r = perf_cache_if.dram_latency[31:0];
`CSR_DRAM_LAT_H : read_data_r = perf_cache_if.dram_latency[63:32];
`CSR_DRAM_RSP : read_data_r = perf_cache_if.dram_rsp[31:0];
`CSR_DRAM_RSP_H : read_data_r = perf_cache_if.dram_rsp[63:32];
// PERF: pipeline stalls
`CSR_LSU_ST : read_data_r = perf_pipeline_stall_if.lsu_stall[31:0];
`CSR_LSU_ST_H : read_data_r = perf_pipeline_stall_if.lsu_stall[63:32];
`CSR_FPU_ST : read_data_r = perf_pipeline_stall_if.fpu_stall[31:0];
`CSR_FPU_ST_H : read_data_r = perf_pipeline_stall_if.fpu_stall[63:32];
`CSR_MUL_ST : read_data_r = perf_pipeline_stall_if.mul_stall[31:0];
`CSR_MUL_ST_H : read_data_r = perf_pipeline_stall_if.mul_stall[63:32];
`CSR_CSR_ST : read_data_r = perf_pipeline_stall_if.csr_stall[31:0];
`CSR_CSR_ST_H : read_data_r = perf_pipeline_stall_if.csr_stall[63:32];
`CSR_ALU_ST : read_data_r = perf_pipeline_stall_if.alu_stall[31:0];
`CSR_ALU_ST_H : read_data_r = perf_pipeline_stall_if.alu_stall[63:32];
`CSR_GPU_ST : read_data_r = perf_pipeline_stall_if.gpu_stall[31:0];
`CSR_GPU_ST_H : read_data_r = perf_pipeline_stall_if.gpu_stall[63:32];
`CSR_IBUF_ST : read_data_r = perf_pipeline_stall_if.ibuffer_stall[31:0];
`CSR_IBUF_ST_H : read_data_r = perf_pipeline_stall_if.ibuffer_stall[63:32];
`CSR_SCRBRD_ST : read_data_r = perf_pipeline_stall_if.scoreboard_stall[31:0];
`CSR_SCRBRD_ST_H : read_data_r = perf_pipeline_stall_if.scoreboard_stall[63:32];
`CSR_ICACHE_ST : read_data_r = perf_pipeline_stall_if.icache_stall[31:0];
`CSR_ICACHE_ST_H : read_data_r = perf_pipeline_stall_if.icache_stall[63:32];
`endif
`CSR_SATP : read_data_r = 32'(csr_satp);
`CSR_MSTATUS : read_data_r = 32'(csr_mstatus);

View file

@ -6,6 +6,12 @@ module VX_csr_unit #(
input wire clk,
input wire reset,
// PERF: total reads
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
VX_perf_pipeline_stall_if perf_pipeline_stall_if,
`endif
VX_cmt_to_csr_if cmt_to_csr_if,
VX_fpu_to_csr_if fpu_to_csr_if,
@ -51,6 +57,11 @@ module VX_csr_unit #(
) csr_data (
.clk (clk),
.reset (reset),
// PERF: total reads
`ifdef PERF_ENABLE
.perf_cache_if (perf_cache_if),
.perf_pipeline_stall_if (perf_pipeline_stall_if),
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fpu_to_csr_if (fpu_to_csr_if),
.read_enable (csr_pipe_req_if.valid),

View file

@ -18,6 +18,12 @@ module VX_execute #(
// perf
VX_cmt_to_csr_if cmt_to_csr_if,
// PERF: total reads
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
VX_perf_pipeline_stall_if perf_pipeline_stall_if,
`endif
// inputs
VX_alu_req_if alu_req_if,
@ -72,7 +78,12 @@ module VX_execute #(
.CORE_ID(CORE_ID)
) csr_unit (
.clk (clk),
.reset (reset),
.reset (reset),
// PERF: total reads
`ifdef PERF_ENABLE
.perf_cache_if (perf_cache_if),
.perf_pipeline_stall_if (perf_pipeline_stall_if),
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fpu_to_csr_if (fpu_to_csr_if),
.csr_io_req_if (csr_io_req_if),
@ -150,4 +161,72 @@ module VX_execute #(
&& (`BR_OP(alu_req_if.op_type) == `BR_EBREAK
|| `BR_OP(alu_req_if.op_type) == `BR_ECALL);
`ifdef PERF_ENABLE
reg [63:0] perf_alu_stall;
reg [63:0] perf_lsu_stall;
reg [63:0] perf_csr_stall;
reg [63:0] perf_gpu_stall;
`ifdef EXT_M_ENABLE
reg [63:0] perf_mul_stall;
`endif
`ifdef EXT_F_ENABLE
reg [63:0] perf_fpu_stall;
`endif
always@(posedge clk) begin
if(reset) begin
perf_alu_stall <= 0;
perf_lsu_stall <= 0;
perf_csr_stall <= 0;
perf_gpu_stall <= 0;
`ifdef EXT_M_ENABLE
perf_mul_stall <= 0;
`endif
`ifdef EXT_F_ENABLE
perf_fpu_stall <= 0;
`endif
end else begin
// alu_stall
if (alu_req_if.valid & !alu_req_if.ready) begin
perf_alu_stall <= perf_alu_stall + 64'd1;
end
// lsu_stall
if (lsu_req_if.valid & !lsu_req_if.ready) begin
perf_lsu_stall <= perf_lsu_stall + 64'd1;
end
// csr_stall
if (csr_req_if.valid & !csr_req_if.ready) begin
perf_csr_stall <= perf_csr_stall + 64'd1;
end
// gpu_stall
if (gpu_req_if.valid & !gpu_req_if.ready) begin
perf_gpu_stall <= perf_gpu_stall + 64'd1;
end
// mul_stall
`ifdef EXT_M_ENABLE
if (mul_req_if.valid & !mul_req_if.ready) begin
perf_mul_stall <= perf_mul_stall + 64'd1;
end
`endif
// fpu_stall
`ifdef EXT_F_ENABLE
if (fpu_req_if.valid & !fpu_req_if.ready) begin
perf_fpu_stall <= perf_fpu_stall + 64'd1;
end
`endif
end
end
assign perf_pipeline_stall_if.alu_stall = perf_alu_stall;
assign perf_pipeline_stall_if.lsu_stall = perf_lsu_stall;
assign perf_pipeline_stall_if.csr_stall = perf_csr_stall;
assign perf_pipeline_stall_if.gpu_stall = perf_gpu_stall;
`ifdef EXT_M_ENABLE
assign perf_pipeline_stall_if.mul_stall = perf_mul_stall;
`endif
`ifdef EXT_F_ENABLE
assign perf_pipeline_stall_if.fpu_stall = perf_fpu_stall;
`endif
// gpr_stall, ibuffer_stall, scoreboard_stall, icache_stall come from other stages
`endif
endmodule

View file

@ -8,6 +8,10 @@ module VX_issue #(
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_perf_pipeline_stall_if perf_pipeline_stall_if,
`endif
VX_decode_if decode_if,
VX_writeback_if writeback_if,
@ -120,6 +124,21 @@ module VX_issue #(
`SCOPE_ASSIGN (writeback_rd, writeback_if.rd);
`SCOPE_ASSIGN (writeback_data, writeback_if.data);
`ifdef PERF_ENABLE
reg [63:0] perf_scoreboard_stall;
always @ (posedge clk) begin
if(reset) begin
perf_scoreboard_stall <= 0;
end else begin
// scoreboard_stall
if (ibuf_deq_if.valid & scoreboard_delay) begin
perf_scoreboard_stall <= perf_scoreboard_stall + 64'd1;
end
end
end
assign perf_pipeline_stall_if.scoreboard_stall = perf_scoreboard_stall;
`endif
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (alu_req_if.valid && alu_req_if.ready) begin

View file

@ -12,6 +12,10 @@ module VX_mem_unit # (
VX_cache_core_req_if core_dcache_req_if,
VX_cache_core_rsp_if core_dcache_rsp_if,
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
`endif
// Core <-> Icache
VX_cache_core_req_if core_icache_req_if,
VX_cache_core_rsp_if core_icache_rsp_if,
@ -28,6 +32,11 @@ module VX_mem_unit # (
VX_cache_core_req_if io_req_if,
VX_cache_core_rsp_if io_rsp_if
);
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_smem_if(), perf_cache_icache_if(), perf_cache_dcache_if();
`endif
VX_cache_dram_req_if #(
.DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH),
.DRAM_ADDR_WIDTH (`DDRAM_ADDR_WIDTH),
@ -124,6 +133,10 @@ module VX_mem_unit # (
.core_rsp_tag (dcache_rsp_if.tag),
.core_rsp_ready (dcache_rsp_if.ready),
`ifdef PERF_ENABLE
.perf_cache_if (perf_cache_dcache_if),
`endif
// DRAM request
.dram_req_valid (dcache_dram_req_if.valid),
.dram_req_rw (dcache_dram_req_if.rw),
@ -196,6 +209,11 @@ module VX_mem_unit # (
.core_rsp_tag (core_icache_rsp_if.tag),
.core_rsp_ready (core_icache_rsp_if.ready),
// PERF: cache read
`ifdef PERF_ENABLE
.perf_cache_if (perf_cache_icache_if),
`endif
// DRAM Req
.dram_req_valid (icache_dram_req_if.valid),
.dram_req_rw (icache_dram_req_if.rw),
@ -268,6 +286,11 @@ module VX_mem_unit # (
.core_rsp_tag (smem_rsp_if.tag),
.core_rsp_ready (smem_rsp_if.ready),
// PERF: cache read
`ifdef PERF_ENABLE
.perf_cache_if (perf_cache_smem_if),
`endif
// DRAM request
`UNUSED_PIN (dram_req_valid),
`UNUSED_PIN (dram_req_rw),
@ -340,4 +363,42 @@ module VX_mem_unit # (
.rsp_ready_in (dram_rsp_if.ready)
);
// PERF: cache
// TODO: some caches have DRAM and write disabled, hence some stats can be removed.
`ifdef PERF_ENABLE
assign perf_cache_if.read_miss = perf_cache_smem_if.read_miss +
perf_cache_icache_if.read_miss +
perf_cache_dcache_if.read_miss;
assign perf_cache_if.write_miss = perf_cache_smem_if.write_miss +
perf_cache_icache_if.write_miss +
perf_cache_dcache_if.write_miss;
assign perf_cache_if.dram_stall = perf_cache_smem_if.dram_stall +
perf_cache_icache_if.dram_stall +
perf_cache_dcache_if.dram_stall;
assign perf_cache_if.core_rsp_stall = perf_cache_smem_if.core_rsp_stall +
perf_cache_icache_if.core_rsp_stall +
perf_cache_dcache_if.core_rsp_stall;
assign perf_cache_if.msrq_stall = perf_cache_smem_if.msrq_stall +
perf_cache_icache_if.msrq_stall +
perf_cache_dcache_if.msrq_stall;
assign perf_cache_if.total_stall = perf_cache_smem_if.total_stall +
perf_cache_icache_if.total_stall +
perf_cache_dcache_if.total_stall;
assign perf_cache_if.total_read = perf_cache_smem_if.total_read +
perf_cache_icache_if.total_read +
perf_cache_dcache_if.total_read;
assign perf_cache_if.total_write = perf_cache_smem_if.total_write +
perf_cache_icache_if.total_write +
perf_cache_dcache_if.total_write;
assign perf_cache_if.total_eviction = perf_cache_smem_if.total_eviction +
perf_cache_icache_if.total_eviction +
perf_cache_dcache_if.total_eviction;
assign perf_cache_if.dram_latency = perf_cache_smem_if.dram_latency +
perf_cache_icache_if.dram_latency +
perf_cache_dcache_if.dram_latency;
assign perf_cache_if.dram_rsp = perf_cache_smem_if.dram_rsp +
perf_cache_icache_if.dram_rsp +
perf_cache_dcache_if.dram_rsp;
`endif
endmodule

View file

@ -51,6 +51,10 @@ module VX_pipeline #(
output wire[31:0] csr_io_rsp_data,
input wire csr_io_rsp_ready,
// PERF: total reads
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
`endif
// Status
output wire busy,
output wire ebreak
@ -171,6 +175,10 @@ module VX_pipeline #(
VX_commit_if fpu_commit_if();
VX_commit_if gpu_commit_if();
`ifdef PERF_ENABLE
VX_perf_pipeline_stall_if perf_pipeline_stall_if();
`endif
VX_fetch #(
.CORE_ID(CORE_ID)
) fetch (
@ -206,6 +214,10 @@ module VX_pipeline #(
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_pipeline_stall_if (perf_pipeline_stall_if),
`endif
.decode_if (decode_if),
.writeback_if (writeback_if),
@ -224,7 +236,13 @@ module VX_pipeline #(
.clk (clk),
.reset (reset),
// PERF: total reads
`ifdef PERF_ENABLE
.perf_cache_if (perf_cache_if),
.perf_pipeline_stall_if (perf_pipeline_stall_if),
`endif
.dcache_req_if (core_dcache_req_if),
.dcache_rsp_if (core_dcache_rsp_if),
@ -272,4 +290,27 @@ module VX_pipeline #(
.cmt_to_csr_if (cmt_to_csr_if)
);
`ifdef PERF_ENABLE
reg [63:0] perf_icache_stall;
reg [63:0] perf_ibuffer_stall;
always @ (posedge clk) begin
if(reset) begin
perf_icache_stall <= 0;
perf_ibuffer_stall <= 0;
end else begin
// icache_stall
if (core_icache_req_if.valid & !core_icache_req_if.ready) begin
perf_icache_stall <= perf_icache_stall + 64'd1;
end
// ibuffer_stall: decode_if == issue->ibuffer->ibuf_enq_if
if(decode_if.valid & !decode_if.ready) begin
perf_ibuffer_stall <= perf_ibuffer_stall + 64'd1;
end
end
end
assign perf_pipeline_stall_if.icache_stall = perf_icache_stall;
assign perf_pipeline_stall_if.ibuffer_stall = perf_ibuffer_stall;
`endif
endmodule

View file

@ -372,6 +372,11 @@ module Vortex (
.core_rsp_tag (cluster_dram_rsp_tag),
.core_rsp_ready (cluster_dram_rsp_ready),
// PERF: total read
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_cache_if),
`endif
// DRAM request
.dram_req_valid (dram_req_valid),
.dram_req_rw (dram_req_rw),

View file

@ -96,6 +96,15 @@ module VX_bank #(
output wire [SNP_TAG_WIDTH-1:0] snp_rsp_tag,
input wire snp_rsp_ready,
// PERF: perf_msrq_stall
`ifdef PERF_ENABLE
output wire perf_msrq_stall,
output wire perf_total_stall,
output wire perf_evict,
output wire perf_read_miss,
output wire perf_write_miss,
`endif
// Misses
output wire misses
);
@ -948,6 +957,18 @@ end
`SCOPE_ASSIGN (addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID));
`SCOPE_ASSIGN (addr_st3, `LINE_TO_BYTE_ADDR(addr_st3, BANK_ID));
`ifdef PERF_ENABLE
assign perf_total_stall = pipeline_stall;
assign perf_msrq_stall = mshr_push_stall;
assign perf_read_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & !mem_rw_st1;
assign perf_write_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & mem_rw_st1;
if (DRAM_ENABLE) begin
assign perf_evict = dwbq_push & do_writeback_st3 & !is_snp_st3;
end else begin
assign perf_evict = 0;
end
`endif
`ifdef DBG_PRINT_CACHE_BANK
wire incoming_fill_dfp_st3 = dram_rsp_fire && (addr_st3 == dram_rsp_addr);
always @(posedge clk) begin

View file

@ -70,7 +70,12 @@ module VX_cache #(
output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
input wire core_rsp_ready,
// PERF
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
`endif
// DRAM request
output wire dram_req_valid,
output wire dram_req_rw,
@ -130,7 +135,16 @@ module VX_cache #(
wire [NUM_BANKS-1:0] per_bank_miss;
assign miss_vec = per_bank_miss;
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_msrq_stall_per_bank;
wire [NUM_BANKS-1:0] perf_total_stall_per_bank;
wire [NUM_BANKS-1:0] perf_evict_per_bank;
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
`endif
if (NUM_BANKS == 1) begin
assign snp_req_ready = per_bank_snp_req_ready;
end else begin
@ -311,6 +325,15 @@ module VX_cache #(
.dram_rsp_addr (curr_bank_dram_rsp_addr),
.dram_rsp_ready (curr_bank_dram_rsp_ready),
// PERF: perf_msrq_stall
`ifdef PERF_ENABLE
.perf_msrq_stall (perf_msrq_stall_per_bank[i]),
.perf_total_stall (perf_total_stall_per_bank[i]),
.perf_evict (perf_evict_per_bank[i]),
.perf_read_miss (perf_read_miss_per_bank[i]),
.perf_write_miss (perf_write_miss_per_bank[i]),
`endif
// Snoop request
.snp_req_valid (curr_bank_snp_req_valid),
.snp_req_addr (curr_bank_snp_req_addr),
@ -407,4 +430,148 @@ module VX_cache #(
`UNUSED_VAR (snp_rsp_ready)
end
`ifdef PERF_ENABLE
// ---------------------------------------------------------------------------
// Cache performance counters.
// Strategy: first compute small "per cycle" event counts (popcounts of
// per-request / per-bank strobes), then accumulate them into 64-bit running
// totals that are exported through perf_cache_if (read out by the host via
// CSRs; see the driver code that sums these in vx_dev_close).
// ---------------------------------------------------------------------------
// per cycle: core_req_r, core_req_w
// Number of core read/write requests accepted this cycle (valid & ready).
reg[($clog2(NUM_REQS+1)-1):0] perf_core_req_r_per_cycle, perf_core_req_w_per_cycle;
if (CORE_TAG_ID_BITS != 0) begin // core_req_rw is 1-bit wide
// With a shared tag id, one rw bit applies to all request lanes, so it is
// replicated across all NUM_REQS lanes before masking the valid vector.
VX_countones #( // core_req_r
.N(NUM_REQS)
) perf_countones_core_req_r_count (
.valids (core_req_valid & {NUM_REQS{(~core_req_rw) & core_req_ready}}),
.count (perf_core_req_r_per_cycle)
);
VX_countones #( // core_req_w
.N(NUM_REQS)
) perf_countones_core_req_w_count (
.valids (core_req_valid & {NUM_REQS{(core_req_rw) & core_req_ready}}),
.count (perf_core_req_w_per_cycle)
);
end else begin // core_req_rw is NUM_REQS-bit wide
// Per-lane rw bits: mask each valid lane with its own rw polarity.
VX_countones #( // core_req_r
.N(NUM_REQS)
) perf_countones_core_req_r_count (
.valids (core_req_valid & (~core_req_rw) & {NUM_REQS{core_req_ready}}),
.count (perf_core_req_r_per_cycle)
);
VX_countones #( // core_req_w
.N(NUM_REQS)
) perf_countones_core_req_w_count (
.valids (core_req_valid & (core_req_rw) & {NUM_REQS{core_req_ready}}),
.count (perf_core_req_w_per_cycle)
);
end
// per cycle: dram_latency
// perf_dram_lat_per_cycle tracks the number of DRAM reads currently in
// flight (+1 on an accepted read request, -1 on an accepted response,
// unchanged when both happen in the same cycle). Adding this count into
// perf_dram_lat every cycle yields the summed latency of all reads, since
// each outstanding request contributes one unit per cycle it waits.
// NOTE(review): assumes every dram response corresponds to a read request
// counted here — confirm writes never produce dram_rsp_valid.
reg[63:0] perf_dram_lat_per_cycle;
always@(posedge clk) begin
if(reset) begin
perf_dram_lat_per_cycle <= 0;
end else begin
if(dram_req_valid & (~dram_req_rw) & dram_req_ready & dram_rsp_valid & dram_rsp_ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle;
end else if(dram_req_valid & (~dram_req_rw) & dram_req_ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + 64'd1;
end else if(dram_rsp_valid & dram_rsp_ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle - 64'd1;
end
end
end
// per cycle: msrq stalls, total stalls, total eviction, read miss, write miss
// Each *_per_bank input is a NUM_BANKS-wide strobe vector driven by the
// banks (see the perf_* port hookup on the bank instance above); popcount
// gives the number of banks signaling that event this cycle.
reg [($clog2(NUM_BANKS+1)-1):0] perf_msrq_stall_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_total_stall_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_total_eviction_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_read_miss_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_write_miss_per_cycle;
VX_countones #(
.N(NUM_BANKS)
) perf_countones_msrq_stall_count (
.valids (perf_msrq_stall_per_bank),
.count (perf_msrq_stall_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_total_stall_count (
.valids (perf_total_stall_per_bank),
.count (perf_total_stall_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_total_evict_count (
.valids (perf_evict_per_bank),
.count (perf_total_eviction_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_read_miss_count (
.valids (perf_read_miss_per_bank),
.count (perf_read_miss_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_write_miss_count (
.valids (perf_write_miss_per_bank),
.count (perf_write_miss_per_cycle)
);
// 64-bit running totals, reset to zero and exported via perf_cache_if below.
reg [63:0] perf_core_req_r, perf_core_req_w;
reg [63:0] perf_dram_lat, perf_dram_rsp;
reg [63:0] perf_msrq_stall;
reg [63:0] perf_total_stall;
reg [63:0] perf_total_eviction;
reg [63:0] perf_read_miss, perf_write_miss;
reg [63:0] perf_core_rsp_stall, perf_dram_stall;
always @ (posedge clk) begin
if (reset) begin
perf_core_req_r <= 0;
perf_core_req_w <= 0;
perf_dram_lat <= 0;
perf_dram_rsp <= 0;
perf_msrq_stall <= 0;
perf_total_stall <= 0;
perf_total_eviction <= 0;
perf_read_miss <= 0;
perf_write_miss <= 0;
perf_core_rsp_stall <= 0;
perf_dram_stall <= 0;
end else begin
// core_req_r, core_req_w
// $bits()' casts zero-extend the narrow per-cycle counts to 64 bits.
perf_core_req_r <= perf_core_req_r + $bits(perf_core_req_r)'(perf_core_req_r_per_cycle);
perf_core_req_w <= perf_core_req_w + $bits(perf_core_req_w)'(perf_core_req_w_per_cycle);
// dram_latency
perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle;
if (dram_rsp_valid & dram_rsp_ready) begin
perf_dram_rsp <= perf_dram_rsp + 64'd1;
end
// miss reserve queue stalls: bank->msrq_push_stall
perf_msrq_stall <= perf_msrq_stall + $bits(perf_msrq_stall)'(perf_msrq_stall_per_cycle);
// total stalls: from bank->pipeline_stall
perf_total_stall <= perf_total_stall + $bits(perf_total_stall)'(perf_total_stall_per_cycle);
// total eviction: from bank-> dwbq_push & do_writeback_st3 & !is_snp_st3
perf_total_eviction <= perf_total_eviction + $bits(perf_total_eviction)'(perf_total_eviction_per_cycle);
// read miss: from bank-> !pipeline_stall & miss_st1 & !is_msrq_st1 & !mem_rw_st1
perf_read_miss <= perf_read_miss + $bits(perf_read_miss)'(perf_read_miss_per_cycle);
// write miss: from bank-> !pipeline_stall & miss_st1 & !is_msrq_st1 & mem_rw_st1
perf_write_miss <= perf_write_miss + $bits(perf_write_miss)'(perf_write_miss_per_cycle);
// core_rsp_stall
// cycles where a response to the core was valid but the core back-pressured
if ((| core_rsp_valid) & !core_rsp_ready) begin
perf_core_rsp_stall <= perf_core_rsp_stall + 64'd1;
end
// dram_stall
// cycles where a DRAM request was valid but DRAM back-pressured
if (dram_req_valid & !dram_req_ready) begin
perf_dram_stall <= perf_dram_stall + 64'd1;
end
end
end
// Export the accumulated totals on the perf interface.
assign perf_cache_if.total_read = perf_core_req_r;
assign perf_cache_if.total_write = perf_core_req_w;
assign perf_cache_if.dram_latency = perf_dram_lat;
assign perf_cache_if.dram_rsp = perf_dram_rsp;
assign perf_cache_if.msrq_stall = perf_msrq_stall;
assign perf_cache_if.total_stall = perf_total_stall;
assign perf_cache_if.total_eviction = perf_total_eviction;
assign perf_cache_if.read_miss = perf_read_miss;
assign perf_cache_if.write_miss = perf_write_miss;
assign perf_cache_if.core_rsp_stall = perf_core_rsp_stall;
assign perf_cache_if.dram_stall = perf_dram_stall;
`endif
endmodule

View file

@ -0,0 +1,22 @@
`ifndef VX_PERF_CACHE_IF
`define VX_PERF_CACHE_IF
`include "VX_define.vh"
// Bundle of 64-bit cache performance counters, driven by VX_cache's
// PERF_ENABLE accumulators and read out through the CSR path.
interface VX_perf_cache_if ();
wire [63:0] read_miss;          // read requests that missed in the cache
wire [63:0] write_miss;         // write requests that missed in the cache
wire [63:0] dram_stall;         // cycles DRAM back-pressured an outgoing request
wire [63:0] dram_rsp;           // DRAM responses accepted
wire [63:0] core_rsp_stall;     // cycles the core back-pressured a valid response
wire [63:0] msrq_stall;         // miss-reserve-queue push stalls (summed over banks)
wire [63:0] total_stall;        // bank pipeline stalls (summed over banks)
wire [63:0] total_read;         // core read requests accepted
wire [63:0] total_write;        // core write requests accepted
wire [63:0] total_eviction;     // dirty-line writebacks (evictions)
wire [63:0] dram_latency;       // summed outstanding-read latency (cycles)
endinterface
`endif

View file

@ -0,0 +1,25 @@
`ifndef VX_PERF_PIPELINE_STALL_IF
`define VX_PERF_PIPELINE_STALL_IF
`include "VX_define.vh"
// Bundle of 64-bit pipeline-stall counters, grouped by the pipeline stage
// that produces them; read out through the CSR path (see driver perf dump).
interface VX_perf_pipeline_stall_if ();
// from pipeline
wire [63:0] icache_stall;       // fetch stalls on instruction cache
wire [63:0] ibuffer_stall;      // instruction buffer full
// from issue
wire [63:0] scoreboard_stall;   // issue blocked on register dependencies
// from execute
wire [63:0] lsu_stall;          // load/store unit busy
wire [63:0] csr_stall;          // CSR unit busy
wire [63:0] alu_stall;          // ALU busy
wire [63:0] gpu_stall;          // GPU (SIMT control) unit busy
`ifdef EXT_M_ENABLE
wire [63:0] mul_stall;          // multiply/divide unit busy (M extension only)
`endif
`ifdef EXT_F_ENABLE
wire [63:0] fpu_stall;          // floating-point unit busy (F extension only)
`endif
endinterface
`endif