Add new performance counters (bank utilization and DRAM bus utilization)

This commit is contained in:
Blaise Tine 2020-12-22 12:33:45 -08:00
parent 4b7d871d62
commit d956e268b9
14 changed files with 426 additions and 439 deletions

View file

@ -108,37 +108,39 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
uint64_t instrs = 0;
uint64_t cycles = 0;
#ifdef PERF_ENABLE
// PERF: pipeline stalls
uint64_t ibuffer_stalls = 0;
uint64_t scoreboard_stalls = 0;
uint64_t lsu_stalls = 0;
uint64_t fpu_stalls = 0;
uint64_t mul_stalls = 0;
uint64_t csr_stalls = 0;
uint64_t alu_stalls = 0;
uint64_t gpu_stalls = 0;
uint64_t ibuffer_stalls = 0;
uint64_t scoreboard_stalls = 0;
uint64_t icache_stalls = 0;
// PERF: Icache
uint64_t icache_reads = 0;
uint64_t icache_read_misses = 0;
uint64_t icache_pipe_stalls = 0;
uint64_t icache_dram_stalls = 0;
uint64_t icache_mshr_stalls = 0;
uint64_t icache_rsp_stalls = 0;
// PERF: Dcache
uint64_t dcache_reads = 0;
uint64_t dcache_writes = 0;
uint64_t dcache_read_misses = 0;
uint64_t dcache_write_misses = 0;
uint64_t dcache_pipe_stalls = 0;
uint64_t dcache_dram_stalls = 0;
uint64_t dcache_bank_stalls = 0;
uint64_t dcache_mshr_stalls = 0;
uint64_t dcache_rsp_stalls = 0;
uint64_t dcache_evictions = 0;
uint64_t dcache_pipe_stalls = 0;
uint64_t dcache_rsp_stalls = 0;
// PERF: SMEM
uint64_t smem_reads = 0;
uint64_t smem_writes = 0;
uint64_t smem_bank_stalls = 0;
// PERF: memory
uint64_t dram_req = 0;
uint64_t dram_rsp = 0;
uint64_t dram_reads = 0;
uint64_t dram_writes = 0;
uint64_t dram_stalls = 0;
uint64_t dram_lat = 0;
#endif
@ -154,11 +156,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
#ifdef PERF_ENABLE
// PERF: pipeline
// icache_stall
uint64_t icache_stalls_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_ST, CSR_MPM_ICACHE_ST_H, &icache_stalls_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache stalls=%ld\n", core_id, icache_stalls_per_core);
icache_stalls += icache_stalls_per_core;
// ibuffer_stall
uint64_t ibuffer_stalls_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_IBUF_ST, CSR_MPM_IBUF_ST_H, &ibuffer_stalls_per_core);
@ -209,7 +206,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
// read misses
uint64_t icache_miss_r_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_MISS_R, CSR_MPM_ICACHE_MISS_R_H, &icache_miss_r_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld\n", core_id, icache_miss_r_per_core);
int icache_read_hit_ratio = (int)((1.0 - (double(icache_miss_r_per_core) / double(icache_reads_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio);
icache_read_misses += icache_miss_r_per_core;
// pipeline stalls
uint64_t icache_pipe_st_per_core;
@ -221,16 +219,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_CRSP_ST, CSR_MPM_ICACHE_CRSP_ST_H, &icache_crsp_st_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reponse stalls=%ld\n", core_id, icache_crsp_st_per_core);
icache_rsp_stalls += icache_crsp_st_per_core;
// dram_stalls
uint64_t icache_dram_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_DREQ_ST, CSR_MPM_ICACHE_DREQ_ST_H, &icache_dram_st_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache dram stalls=%ld\n", core_id, icache_dram_st_per_core);
icache_dram_stalls += icache_dram_st_per_core;
// mshr_stalls
uint64_t icache_mshr_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_MSHR_ST, CSR_MPM_ICACHE_MSHR_ST_H, &icache_mshr_st_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache mshr stalls=%ld\n", core_id, icache_mshr_st_per_core);
icache_mshr_stalls += icache_mshr_st_per_core;
// PERF: Dcache
// total reads
@ -246,50 +234,70 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
// read misses
uint64_t dcache_miss_r_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_R, CSR_MPM_DCACHE_MISS_R_H, &dcache_miss_r_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache read misses=%ld\n", core_id, dcache_miss_r_per_core);
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_miss_r_per_core) / double(dcache_reads_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_r_per_core, dcache_read_hit_ratio);
dcache_read_misses += dcache_miss_r_per_core;
// write misses
uint64_t dcache_miss_w_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_W, CSR_MPM_DCACHE_MISS_W_H, &dcache_miss_w_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache wrire misses=%ld\n", core_id, dcache_miss_w_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_W, CSR_MPM_DCACHE_MISS_W_H, &dcache_miss_w_per_core);
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_miss_w_per_core) / double(dcache_writes_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache wrire misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_w_per_core, dcache_write_hit_ratio);
dcache_write_misses += dcache_miss_w_per_core;
// total_evictions
uint64_t dcache_evictions_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_EVICTS, CSR_MPM_DCACHE_EVICTS_H, &dcache_evictions_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache evictions_per_core=%ld\n", core_id, dcache_evictions_per_core);
dcache_evictions += dcache_evictions_per_core;
// pipeline stalls
uint64_t dcache_pipe_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_PIPE_ST, CSR_MPM_DCACHE_PIPE_ST_H, &dcache_pipe_st_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core);
dcache_pipe_stalls += dcache_pipe_st_per_core;
// response stalls
uint64_t dcache_crsp_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_CRSP_ST, CSR_MPM_DCACHE_CRSP_ST_H, &dcache_crsp_st_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core);
dcache_rsp_stalls += dcache_crsp_st_per_core;
// dram_stalls
uint64_t dcache_dram_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_DREQ_ST, CSR_MPM_DCACHE_DREQ_ST_H, &dcache_dram_st_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache dram stalls=%ld\n", core_id, dcache_dram_st_per_core);
dcache_dram_stalls += dcache_dram_st_per_core;
// bank_stalls
uint64_t dcache_bank_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_BANK_ST, CSR_MPM_DCACHE_BANK_ST_H, &dcache_bank_st_per_core);
int dcache_bank_utilization = (int)((1.0 - (double(dcache_reads_per_core + dcache_writes_per_core) / double(dcache_reads_per_core + dcache_writes_per_core + dcache_bank_st_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_st_per_core, dcache_bank_utilization);
dcache_bank_stalls += dcache_bank_st_per_core;
// mshr_stalls
uint64_t dcache_mshr_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MSHR_ST, CSR_MPM_DCACHE_MSHR_ST_H, &dcache_mshr_st_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core);
dcache_mshr_stalls += dcache_mshr_st_per_core;
// pipeline stalls
uint64_t dcache_pipe_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_PIPE_ST, CSR_MPM_DCACHE_PIPE_ST_H, &dcache_pipe_st_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core);
dcache_pipe_stalls += dcache_pipe_st_per_core;
// response stalls
uint64_t dcache_crsp_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_CRSP_ST, CSR_MPM_DCACHE_CRSP_ST_H, &dcache_crsp_st_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core);
dcache_rsp_stalls += dcache_crsp_st_per_core;
// PERF: dram_latency
uint64_t dram_req_per_core, dram_rsp_per_core, dram_lat_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_REQ, CSR_MPM_DRAM_REQ_H, &dram_req_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_RSP, CSR_MPM_DRAM_RSP_H, &dram_rsp_per_core);
// PERF: SMEM
// total reads
uint64_t smem_reads_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_READS, CSR_MPM_SMEM_READS_H, &smem_reads_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads_per_core);
smem_reads += smem_reads_per_core;
// total writes
uint64_t smem_writes_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_WRITES, CSR_MPM_SMEM_WRITES_H, &smem_writes_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes_per_core);
smem_writes += smem_writes_per_core;
// bank_stalls
uint64_t smem_bank_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_BANK_ST, CSR_MPM_SMEM_BANK_ST_H, &smem_bank_st_per_core);
int smem_bank_utilization = (int)((1.0 - (double(smem_reads_per_core + smem_writes_per_core) / double(smem_reads_per_core + smem_writes_per_core + smem_bank_st_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization);
smem_bank_stalls += smem_bank_st_per_core;
// PERF: DRAM
uint64_t dram_reads_per_core, dram_writes_per_core, dram_stalls_per_core, dram_lat_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_READS, CSR_MPM_DRAM_READS_H, &dram_reads_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_WRITES, CSR_MPM_DRAM_WRITES_H, &dram_writes_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_ST, CSR_MPM_DRAM_ST_H, &dram_stalls_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core);
int avg_dram_lat_per_core = (int)(double(dram_lat_per_core) / double(dram_rsp_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, dram_req_per_core, dram_rsp_per_core, dram_req_per_core - dram_rsp_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: average dram latency=%d cycles\n", core_id, avg_dram_lat_per_core);
dram_req += dram_req_per_core;
dram_rsp += dram_rsp_per_core;
dram_lat += dram_lat_per_core;
int avg_dram_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core));
int dram_utilization = (int)((1.0 - (double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, (dram_reads_per_core + dram_writes_per_core), dram_reads_per_core, dram_writes_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram stalls=%d (utilization=%d%%)\n", core_id, dram_stalls_per_core, dram_utilization);
if (num_cores > 1) fprintf(stream, "PERF: core%d: average dram latency=%d cycles\n", core_id, avg_dram_lat);
dram_reads += dram_reads_per_core;
dram_writes += dram_writes_per_core;
dram_stalls += dram_stalls_per_core;
dram_lat += dram_lat_per_core;
#endif
}
@ -297,7 +305,13 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
#ifdef PERF_ENABLE
fprintf(stream, "PERF: icache stalls=%ld\n", icache_stalls);
int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
int dcache_bank_utilization = (int)((1.0 - (double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls))) * 100);
int smem_bank_utilization = (int)((1.0 - (double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls))) * 100);
int dram_utilization = (int)((1.0 - (double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls))) * 100);
int avg_dram_lat = (int)(double(dram_lat) / double(dram_reads));
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
@ -307,22 +321,22 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls);
fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
fprintf(stream, "PERF: icache read misses=%ld\n", icache_read_misses);
fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls);
fprintf(stream, "PERF: icache reponse stalls=%ld\n", icache_rsp_stalls);
fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls);
fprintf(stream, "PERF: icache dram stalls=%ld\n", icache_dram_stalls);
fprintf(stream, "PERF: icache mshr stalls=%ld\n", icache_mshr_stalls);
fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
fprintf(stream, "PERF: dcache read misses=%ld\n", dcache_read_misses);
fprintf(stream, "PERF: dcache wrire misses=%ld\n", dcache_write_misses);
fprintf(stream, "PERF: dcache evictions=%ld\n", dcache_evictions);
fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);
fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
fprintf(stream, "PERF: dcache pipeline stalls=%ld\n", dcache_pipe_stalls);
fprintf(stream, "PERF: dcache reponse stalls=%ld\n", dcache_rsp_stalls);
fprintf(stream, "PERF: dcache dram stalls=%ld\n", dcache_dram_stalls);
fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", dram_req, dram_rsp, dram_req - dram_rsp);
int avg_dram_lat = (int)(double(dram_lat) / double(dram_rsp));
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", (dram_reads + dram_writes), dram_reads, dram_writes);
fprintf(stream, "PERF: dram stalls=%ld (utilization=%d%%)\n", dram_stalls, dram_utilization);
fprintf(stream, "PERF: average dram latency=%d cycles\n", avg_dram_lat);
#endif

View file

@ -167,63 +167,64 @@
// Machine Performance-monitoring counters
// PERF: pipeline
`define CSR_MPM_ICACHE_ST 12'hB03
`define CSR_MPM_ICACHE_ST_H 12'hB83
`define CSR_MPM_IBUF_ST 12'hB04
`define CSR_MPM_IBUF_ST_H 12'hB84
`define CSR_MPM_SCRB_ST 12'hB05
`define CSR_MPM_SCRB_ST_H 12'hB85
`define CSR_MPM_ALU_ST 12'hB06
`define CSR_MPM_ALU_ST_H 12'hB86
`define CSR_MPM_LSU_ST 12'hB07
`define CSR_MPM_LSU_ST_H 12'hB87
`define CSR_MPM_CSR_ST 12'hB08
`define CSR_MPM_CSR_ST_H 12'hB88
`define CSR_MPM_MUL_ST 12'hB09
`define CSR_MPM_MUL_ST_H 12'hB89
`define CSR_MPM_FPU_ST 12'hB0A
`define CSR_MPM_FPU_ST_H 12'hB8A
`define CSR_MPM_GPU_ST 12'hB0B
`define CSR_MPM_GPU_ST_H 12'hB8B
`define CSR_MPM_IBUF_ST 12'hB03
`define CSR_MPM_IBUF_ST_H 12'hB83
`define CSR_MPM_SCRB_ST 12'hB04
`define CSR_MPM_SCRB_ST_H 12'hB84
`define CSR_MPM_ALU_ST 12'hB05
`define CSR_MPM_ALU_ST_H 12'hB85
`define CSR_MPM_LSU_ST 12'hB06
`define CSR_MPM_LSU_ST_H 12'hB86
`define CSR_MPM_CSR_ST 12'hB07
`define CSR_MPM_CSR_ST_H 12'hB87
`define CSR_MPM_MUL_ST 12'hB08
`define CSR_MPM_MUL_ST_H 12'hB88
`define CSR_MPM_FPU_ST 12'hB09
`define CSR_MPM_FPU_ST_H 12'hB89
`define CSR_MPM_GPU_ST 12'hB0A
`define CSR_MPM_GPU_ST_H 12'hB8A
// PERF: icache
`define CSR_MPM_ICACHE_MISS_R 12'hB0C // read misses
`define CSR_MPM_ICACHE_READS 12'hB0B // total reads
`define CSR_MPM_ICACHE_READS_H 12'hB8B
`define CSR_MPM_ICACHE_MISS_R 12'hB0C // total misses
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8C
`define CSR_MPM_ICACHE_DREQ_ST 12'hB0D // dram request stalls
`define CSR_MPM_ICACHE_DREQ_ST_H 12'hB8D
`define CSR_MPM_ICACHE_PIPE_ST 12'hB0D // pipeline stalls
`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB8D
`define CSR_MPM_ICACHE_CRSP_ST 12'hB0E // core response stalls
`define CSR_MPM_ICACHE_CRSP_ST_H 12'hB8E
`define CSR_MPM_ICACHE_MSHR_ST 12'hB0F // MSHR stalls
`define CSR_MPM_ICACHE_MSHR_ST_H 12'hB8F
`define CSR_MPM_ICACHE_PIPE_ST 12'hB10 // pipeline stalls
`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB90
`define CSR_MPM_ICACHE_READS 12'hB11 // total reads
`define CSR_MPM_ICACHE_READS_H 12'hB91
// PERF: dcache
`define CSR_MPM_DCACHE_MISS_R 12'hB12 // read misses
`define CSR_MPM_DCACHE_MISS_R_H 12'hB92
`define CSR_MPM_DCACHE_MISS_W 12'hB13 // write misses
`define CSR_MPM_DCACHE_MISS_W_H 12'hB93
`define CSR_MPM_DCACHE_DREQ_ST 12'hB14 // dram request stalls
`define CSR_MPM_DCACHE_DREQ_ST_H 12'hB94
`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls
`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95
`define CSR_MPM_DCACHE_MSHR_ST 12'hB16 // MSHR stalls
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB96
`define CSR_MPM_DCACHE_PIPE_ST 12'hB17 // pipeline stalls
`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB97
`define CSR_MPM_DCACHE_READS 12'hB18 // total reads
`define CSR_MPM_DCACHE_READS_H 12'hB98
`define CSR_MPM_DCACHE_WRITES 12'hB19 // total writes
`define CSR_MPM_DCACHE_WRITES_H 12'hB99
`define CSR_MPM_DCACHE_EVICTS 12'hB1A // total evictions
`define CSR_MPM_DCACHE_EVICTS_H 12'hB9A
`define CSR_MPM_DCACHE_READS 12'hB0F // total reads
`define CSR_MPM_DCACHE_READS_H 12'hB8F
`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes
`define CSR_MPM_DCACHE_WRITES_H 12'hB90
`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses
`define CSR_MPM_DCACHE_MISS_R_H 12'hB91
`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses
`define CSR_MPM_DCACHE_MISS_W_H 12'hB92
`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts stalls
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93
`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94
`define CSR_MPM_DCACHE_PIPE_ST 12'hB15 // pipeline stalls
`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB95
`define CSR_MPM_DCACHE_CRSP_ST 12'hB16 // core response stalls
`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB96
// PERF: smem
`define CSR_MPM_SMEM_READS 12'hB17 // total reads
`define CSR_MPM_SMEM_READS_H 12'hB97
`define CSR_MPM_SMEM_WRITES 12'hB18 // total writes
`define CSR_MPM_SMEM_WRITES_H 12'hB98
`define CSR_MPM_SMEM_BANK_ST 12'hB19 // bank conflicts stalls
`define CSR_MPM_SMEM_BANK_ST_H 12'hB99
// PERF: memory
`define CSR_MPM_DRAM_LAT 12'hB1B // dram latency (total)
`define CSR_MPM_DRAM_LAT_H 12'hB9B
`define CSR_MPM_DRAM_REQ 12'hB1C // dram requests
`define CSR_MPM_DRAM_REQ_H 12'hB9C
`define CSR_MPM_DRAM_RSP 12'hB1D // dram responses
`define CSR_MPM_DRAM_RSP_H 12'hB9D
`define CSR_MPM_DRAM_READS 12'hB1A // dram reads
`define CSR_MPM_DRAM_READS_H 12'hB9A
`define CSR_MPM_DRAM_WRITES 12'hB1B // dram writes
`define CSR_MPM_DRAM_WRITES_H 12'hB9B
`define CSR_MPM_DRAM_ST 12'hB1C // dram request stalls
`define CSR_MPM_DRAM_ST_H 12'hB9C
`define CSR_MPM_DRAM_LAT 12'hB1D // dram latency (total)
`define CSR_MPM_DRAM_LAT_H 12'hB9D
// Machine Information Registers
`define CSR_MVENDORID 12'hF11

View file

@ -121,63 +121,64 @@ module VX_csr_data #(
`ifdef PERF_ENABLE
// PERF: pipeline
`CSR_MPM_ICACHE_ST : read_data_r = perf_pipeline_if.icache_stalls[31:0];
`CSR_MPM_ICACHE_ST_H : read_data_r = perf_pipeline_if.icache_stalls[63:32];
`CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibuffer_stalls[31:0];
`CSR_MPM_IBUF_ST_H : read_data_r = perf_pipeline_if.ibuffer_stalls[63:32];
`CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scoreboard_stalls[31:0];
`CSR_MPM_SCRB_ST_H : read_data_r = perf_pipeline_if.scoreboard_stalls[63:32];
`CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0];
`CSR_MPM_ALU_ST_H : read_data_r = perf_pipeline_if.alu_stalls[63:32];
`CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0];
`CSR_MPM_LSU_ST_H : read_data_r = perf_pipeline_if.lsu_stalls[63:32];
`CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0];
`CSR_MPM_CSR_ST_H : read_data_r = perf_pipeline_if.csr_stalls[63:32];
`CSR_MPM_MUL_ST : read_data_r = perf_pipeline_if.mul_stalls[31:0];
`CSR_MPM_MUL_ST_H : read_data_r = perf_pipeline_if.mul_stalls[63:32];
`CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
`CSR_MPM_FPU_ST_H : read_data_r = perf_pipeline_if.fpu_stalls[63:32];
`CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
`CSR_MPM_GPU_ST_H : read_data_r = perf_pipeline_if.gpu_stalls[63:32];
`CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibf_stalls[31:0];
`CSR_MPM_IBUF_ST_H : read_data_r = perf_pipeline_if.ibf_stalls[63:32];
`CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scb_stalls[31:0];
`CSR_MPM_SCRB_ST_H : read_data_r = perf_pipeline_if.scb_stalls[63:32];
`CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0];
`CSR_MPM_ALU_ST_H : read_data_r = perf_pipeline_if.alu_stalls[63:32];
`CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0];
`CSR_MPM_LSU_ST_H : read_data_r = perf_pipeline_if.lsu_stalls[63:32];
`CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0];
`CSR_MPM_CSR_ST_H : read_data_r = perf_pipeline_if.csr_stalls[63:32];
`CSR_MPM_MUL_ST : read_data_r = perf_pipeline_if.mul_stalls[31:0];
`CSR_MPM_MUL_ST_H : read_data_r = perf_pipeline_if.mul_stalls[63:32];
`CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
`CSR_MPM_FPU_ST_H : read_data_r = perf_pipeline_if.fpu_stalls[63:32];
`CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
`CSR_MPM_GPU_ST_H : read_data_r = perf_pipeline_if.gpu_stalls[63:32];
// PERF: icache
`CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0];
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = perf_memsys_if.icache_read_misses[63:32];
`CSR_MPM_ICACHE_DREQ_ST : read_data_r = perf_memsys_if.icache_dreq_stalls[31:0];
`CSR_MPM_ICACHE_DREQ_ST_H : read_data_r = perf_memsys_if.icache_dreq_stalls[63:32];
`CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0];
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = perf_memsys_if.icache_crsp_stalls[63:32];
`CSR_MPM_ICACHE_MSHR_ST : read_data_r = perf_memsys_if.icache_mshr_stalls[31:0];
`CSR_MPM_ICACHE_MSHR_ST_H : read_data_r = perf_memsys_if.icache_mshr_stalls[63:32];
`CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0];
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = perf_memsys_if.icache_pipe_stalls[63:32];
`CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0];
`CSR_MPM_ICACHE_READS_H : read_data_r = perf_memsys_if.icache_reads[63:32];
// PERF: dcache
`CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_read_misses[31:0];
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = perf_memsys_if.dcache_read_misses[63:32];
`CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_write_misses[31:0];
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = perf_memsys_if.dcache_write_misses[63:32];
`CSR_MPM_DCACHE_DREQ_ST : read_data_r = perf_memsys_if.dcache_dreq_stalls[31:0];
`CSR_MPM_DCACHE_DREQ_ST_H : read_data_r = perf_memsys_if.dcache_dreq_stalls[63:32];
`CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0];
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = perf_memsys_if.dcache_crsp_stalls[63:32];
`CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0];
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = perf_memsys_if.dcache_mshr_stalls[63:32];
`CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0];
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = perf_memsys_if.dcache_pipe_stalls[63:32];
`CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0];
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = perf_memsys_if.icache_read_misses[63:32];
`CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0];
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = perf_memsys_if.icache_pipe_stalls[63:32];
`CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0];
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = perf_memsys_if.icache_crsp_stalls[63:32];
// PERF: dcache
`CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0];
`CSR_MPM_DCACHE_READS_H : read_data_r = perf_memsys_if.dcache_reads[63:32];
`CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0];
`CSR_MPM_DCACHE_WRITES_H : read_data_r = perf_memsys_if.dcache_writes[63:32];
`CSR_MPM_DCACHE_EVICTS : read_data_r = perf_memsys_if.dcache_evictions[31:0];
`CSR_MPM_DCACHE_EVICTS_H : read_data_r = perf_memsys_if.dcache_evictions[63:32];
// PERF: memory
`CSR_MPM_DRAM_LAT : read_data_r = perf_memsys_if.dram_latency[31:0];
`CSR_MPM_DRAM_LAT_H : read_data_r = perf_memsys_if.dram_latency[63:32];
`CSR_MPM_DRAM_REQ : read_data_r = perf_memsys_if.dram_requests[31:0];
`CSR_MPM_DRAM_REQ_H : read_data_r = perf_memsys_if.dram_requests[63:32];
`CSR_MPM_DRAM_RSP : read_data_r = perf_memsys_if.dram_responses[31:0];
`CSR_MPM_DRAM_RSP_H : read_data_r = perf_memsys_if.dram_responses[63:32];
`CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_read_misses[31:0];
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = perf_memsys_if.dcache_read_misses[63:32];
`CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_write_misses[31:0];
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = perf_memsys_if.dcache_write_misses[63:32];
`CSR_MPM_DCACHE_BANK_ST : read_data_r = perf_memsys_if.dcache_bank_stalls[31:0];
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = perf_memsys_if.dcache_bank_stalls[63:32];
`CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0];
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = perf_memsys_if.dcache_mshr_stalls[63:32];
`CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0];
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = perf_memsys_if.dcache_pipe_stalls[63:32];
`CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0];
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = perf_memsys_if.dcache_crsp_stalls[63:32];
// PERF: smem
`CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0];
`CSR_MPM_SMEM_READS_H : read_data_r = perf_memsys_if.smem_reads[63:32];
`CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0];
`CSR_MPM_SMEM_WRITES_H : read_data_r = perf_memsys_if.smem_writes[63:32];
`CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = perf_memsys_if.smem_bank_stalls[63:32];
// PERF: DRAM
`CSR_MPM_DRAM_READS : read_data_r = perf_memsys_if.dram_reads[31:0];
`CSR_MPM_DRAM_READS_H : read_data_r = perf_memsys_if.dram_reads[63:32];
`CSR_MPM_DRAM_WRITES : read_data_r = perf_memsys_if.dram_writes[31:0];
`CSR_MPM_DRAM_WRITES_H : read_data_r = perf_memsys_if.dram_writes[63:32];
`CSR_MPM_DRAM_ST : read_data_r = perf_memsys_if.dram_stalls[31:0];
`CSR_MPM_DRAM_ST_H : read_data_r = perf_memsys_if.dram_stalls[63:32];
`CSR_MPM_DRAM_LAT : read_data_r = perf_memsys_if.dram_latency[31:0];
`CSR_MPM_DRAM_LAT_H : read_data_r = perf_memsys_if.dram_latency[63:32];
`endif
`CSR_SATP : read_data_r = 32'(csr_satp);

View file

@ -123,18 +123,77 @@ module VX_issue #(
`SCOPE_ASSIGN (writeback_data, writeback_if.data);
`ifdef PERF_ENABLE
reg [63:0] perf_scoreboard_stalls;
reg [63:0] perf_ibf_stalls ;
reg [63:0] perf_scb_stalls ;
reg [63:0] perf_alu_stalls;
reg [63:0] perf_lsu_stalls;
reg [63:0] perf_csr_stalls;
reg [63:0] perf_gpu_stalls;
`ifdef EXT_M_ENABLE
reg [63:0] perf_mul_stalls;
`endif
`ifdef EXT_F_ENABLE
reg [63:0] perf_fpu_stalls;
`endif
always @(posedge clk) begin
if (reset) begin
perf_scoreboard_stalls <= 0;
perf_ibf_stalls <= 0;
perf_scb_stalls <= 0;
perf_alu_stalls <= 0;
perf_lsu_stalls <= 0;
perf_csr_stalls <= 0;
perf_gpu_stalls <= 0;
`ifdef EXT_M_ENABLE
perf_mul_stalls <= 0;
`endif
`ifdef EXT_F_ENABLE
perf_fpu_stalls <= 0;
`endif
end else begin
// scoreboard_stall
if (ibuf_deq_if.valid & scoreboard_delay) begin
perf_scoreboard_stalls <= perf_scoreboard_stalls + 64'd1;
if (decode_if.valid & !decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + 64'd1;
end
if (ibuf_deq_if.valid & scoreboard_delay) begin
perf_scb_stalls <= perf_scb_stalls + 64'd1;
end
if (alu_req_if.valid & !alu_req_if.ready) begin
perf_alu_stalls <= perf_alu_stalls + 64'd1;
end
if (lsu_req_if.valid & !lsu_req_if.ready) begin
perf_lsu_stalls <= perf_lsu_stalls + 64'd1;
end
if (csr_req_if.valid & !csr_req_if.ready) begin
perf_csr_stalls <= perf_csr_stalls + 64'd1;
end
if (gpu_req_if.valid & !gpu_req_if.ready) begin
perf_gpu_stalls <= perf_gpu_stalls + 64'd1;
end
`ifdef EXT_M_ENABLE
if (mul_req_if.valid & !mul_req_if.ready) begin
perf_mul_stalls <= perf_mul_stalls + 64'd1;
end
`endif
`ifdef EXT_F_ENABLE
if (fpu_req_if.valid & !fpu_req_if.ready) begin
perf_fpu_stalls <= perf_fpu_stalls + 64'd1;
end
`endif
end
end
assign perf_pipeline_if.scoreboard_stalls = perf_scoreboard_stalls;
assign perf_pipeline_if.ibf_stalls = perf_ibf_stalls;
assign perf_pipeline_if.scb_stalls = perf_scb_stalls;
assign perf_pipeline_if.alu_stalls = perf_alu_stalls;
assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls;
assign perf_pipeline_if.csr_stalls = perf_csr_stalls;
assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls;
`ifdef EXT_M_ENABLE
assign perf_pipeline_if.mul_stalls = perf_mul_stalls;
`endif
`ifdef EXT_F_ENABLE
assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls;
`endif
`endif
`ifdef DBG_PRINT_PIPELINE

View file

@ -363,60 +363,72 @@ module VX_mem_unit # (
`ifdef PERF_ENABLE
assign perf_memsys_if.icache_reads = perf_icache_if.reads;
assign perf_memsys_if.icache_reads = perf_icache_if.reads;
assign perf_memsys_if.icache_read_misses = perf_icache_if.read_misses;
assign perf_memsys_if.icache_mshr_stalls = perf_icache_if.mshr_stalls;
assign perf_memsys_if.icache_crsp_stalls = perf_icache_if.crsp_stalls;
assign perf_memsys_if.icache_dreq_stalls = perf_icache_if.dreq_stalls;
assign perf_memsys_if.icache_pipe_stalls = perf_icache_if.pipe_stalls;
assign perf_memsys_if.icache_crsp_stalls = perf_icache_if.crsp_stalls;
assign perf_memsys_if.dcache_reads = perf_dcache_if.reads;
assign perf_memsys_if.dcache_writes = perf_dcache_if.writes;
assign perf_memsys_if.dcache_reads = perf_dcache_if.reads;
assign perf_memsys_if.dcache_writes = perf_dcache_if.writes;
assign perf_memsys_if.dcache_read_misses = perf_dcache_if.read_misses;
assign perf_memsys_if.dcache_write_misses = perf_dcache_if.write_misses;
assign perf_memsys_if.dcache_evictions = perf_dcache_if.evictions;
assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls;
assign perf_memsys_if.dcache_crsp_stalls = perf_dcache_if.crsp_stalls;
assign perf_memsys_if.dcache_dreq_stalls = perf_dcache_if.dreq_stalls;
assign perf_memsys_if.dcache_write_misses= perf_dcache_if.write_misses;
assign perf_memsys_if.dcache_bank_stalls = perf_dcache_if.bank_stalls;
assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls;
assign perf_memsys_if.dcache_pipe_stalls = perf_dcache_if.pipe_stalls;
assign perf_memsys_if.dcache_crsp_stalls = perf_dcache_if.crsp_stalls;
if (`SM_ENABLE) begin
assign perf_memsys_if.smem_reads = perf_smem_if.reads;
assign perf_memsys_if.smem_writes = perf_smem_if.writes;
assign perf_memsys_if.smem_bank_stalls = perf_smem_if.bank_stalls;
end else begin
assign perf_memsys_if.smem_reads = 0;
assign perf_memsys_if.smem_writes = 0;
assign perf_memsys_if.smem_bank_stalls = 0;
end
reg [63:0] perf_dram_lat_per_cycle;
always @(posedge clk) begin
if (reset) begin
perf_dram_lat_per_cycle <= 0;
end else begin
if (dram_req_if.valid & (~dram_req_if.rw) & dram_req_if.ready & dram_rsp_if.valid & dram_rsp_if.ready) begin
end else begin
if (dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready && dram_rsp_if.valid && dram_rsp_if.ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle;
end else if (dram_req_if.valid & (~dram_req_if.rw) & dram_req_if.ready) begin
end else if (dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + 64'd1;
end else if (dram_rsp_if.valid & dram_rsp_if.ready) begin
end else if (dram_rsp_if.valid && dram_rsp_if.ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle - 64'd1;
end
end
end
reg [63:0] perf_dram_req, perf_dram_rsp, perf_dram_lat;
reg [63:0] perf_dram_reads, perf_dram_writes, perf_dram_lat, perf_dram_stalls;
always @(posedge clk) begin
if (reset) begin
perf_dram_req <= 0;
perf_dram_rsp <= 0;
perf_dram_lat <= 0;
end else begin
if (dram_req_if.valid & dram_req_if.ready) begin
perf_dram_req <= perf_dram_req + 64'd1;
end
if (dram_rsp_if.valid & dram_rsp_if.ready) begin
perf_dram_rsp <= perf_dram_rsp + 64'd1;
perf_dram_reads <= 0;
perf_dram_writes <= 0;
perf_dram_lat <= 0;
perf_dram_stalls <= 0;
end else begin
if (dram_req_if.valid && dram_req_if.ready && !dram_req_if.rw) begin
perf_dram_reads <= perf_dram_reads + 64'd1;
end
if (dram_req_if.valid && dram_req_if.ready && dram_req_if.rw) begin
perf_dram_writes <= perf_dram_writes + 64'd1;
end
if (dram_req_if.valid && !dram_req_if.ready) begin
perf_dram_stalls <= perf_dram_stalls + 64'd1;
end
perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle;
end
end
assign perf_memsys_if.dram_requests = perf_dram_req;
assign perf_memsys_if.dram_responses = perf_dram_rsp;
assign perf_memsys_if.dram_latency = perf_dram_lat;
assign perf_memsys_if.dram_reads = perf_dram_reads;
assign perf_memsys_if.dram_writes = perf_dram_writes;
assign perf_memsys_if.dram_latency = perf_dram_lat;
assign perf_memsys_if.dram_stalls = perf_dram_stalls;
`endif
endmodule

View file

@ -288,79 +288,5 @@ module VX_pipeline #(
.writeback_if (writeback_if),
.cmt_to_csr_if (cmt_to_csr_if)
);
`ifdef PERF_ENABLE
reg [63:0] perf_icache_stalls;
reg [63:0] perf_ibuffer_stalls;
reg [63:0] perf_alu_stalls;
reg [63:0] perf_lsu_stalls;
reg [63:0] perf_csr_stalls;
reg [63:0] perf_gpu_stalls;
`ifdef EXT_M_ENABLE
reg [63:0] perf_mul_stalls;
`endif
`ifdef EXT_F_ENABLE
reg [63:0] perf_fpu_stalls;
`endif
always @(posedge clk) begin
if (reset) begin
perf_icache_stalls <= 0;
perf_ibuffer_stalls <= 0;
perf_alu_stalls <= 0;
perf_lsu_stalls <= 0;
perf_csr_stalls <= 0;
perf_gpu_stalls <= 0;
`ifdef EXT_M_ENABLE
perf_mul_stalls <= 0;
`endif
`ifdef EXT_F_ENABLE
perf_fpu_stalls <= 0;
`endif
end else begin
if (core_icache_req_if.valid & !core_icache_req_if.ready) begin
perf_icache_stalls <= perf_icache_stalls + 64'd1;
end
if (decode_if.valid & !decode_if.ready) begin
perf_ibuffer_stalls <= perf_ibuffer_stalls + 64'd1;
end
if (alu_req_if.valid & !alu_req_if.ready) begin
perf_alu_stalls <= perf_alu_stalls + 64'd1;
end
if (lsu_req_if.valid & !lsu_req_if.ready) begin
perf_lsu_stalls <= perf_lsu_stalls + 64'd1;
end
if (csr_req_if.valid & !csr_req_if.ready) begin
perf_csr_stalls <= perf_csr_stalls + 64'd1;
end
if (gpu_req_if.valid & !gpu_req_if.ready) begin
perf_gpu_stalls <= perf_gpu_stalls + 64'd1;
end
`ifdef EXT_M_ENABLE
if (mul_req_if.valid & !mul_req_if.ready) begin
perf_mul_stalls <= perf_mul_stalls + 64'd1;
end
`endif
`ifdef EXT_F_ENABLE
if (fpu_req_if.valid & !fpu_req_if.ready) begin
perf_fpu_stalls <= perf_fpu_stalls + 64'd1;
end
`endif
end
end
assign perf_pipeline_if.icache_stalls = perf_icache_stalls;
assign perf_pipeline_if.ibuffer_stalls = perf_ibuffer_stalls;
assign perf_pipeline_if.alu_stalls = perf_alu_stalls;
assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls;
assign perf_pipeline_if.csr_stalls = perf_csr_stalls;
assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls;
`ifdef EXT_M_ENABLE
assign perf_pipeline_if.mul_stalls = perf_mul_stalls;
`endif
`ifdef EXT_F_ENABLE
assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls;
`endif
`endif
endmodule

View file

@ -98,11 +98,10 @@ module VX_bank #(
input wire snp_rsp_ready,
`ifdef PERF_ENABLE
output wire perf_mshr_stall,
output wire perf_pipe_stall,
output wire perf_evict,
output wire perf_read_miss,
output wire perf_write_miss,
output wire perf_read_misses,
output wire perf_write_misses,
output wire perf_mshr_stalls,
output wire perf_pipe_stalls,
`endif
// Misses
@ -335,7 +334,7 @@ module VX_bank #(
wire dreq_push_stall;
wire srsq_push_stall;
wire pipeline_stall;
wire is_mshr_miss_st2 = valid_st2 && is_mshr_st2 && (miss_st2 || force_miss_st2);
wire is_mshr_miss_st3 = valid_st3 && is_mshr_st3 && (miss_st3 || force_miss_st3);
@ -938,15 +937,10 @@ end
`SCOPE_ASSIGN (addr_st3, `LINE_TO_BYTE_ADDR(addr_st3, BANK_ID));
`ifdef PERF_ENABLE
assign perf_pipe_stall = pipeline_stall;
assign perf_mshr_stall = mshr_going_full;
assign perf_read_miss = !pipeline_stall & miss_st2 & !is_mshr_st2 & !mem_rw_st2;
assign perf_write_miss = !pipeline_stall & miss_st2 & !is_mshr_st2 & mem_rw_st2;
if (DRAM_ENABLE) begin
assign perf_evict = dreq_push & do_writeback_st3 & !is_snp_st3;
end else begin
assign perf_evict = 0;
end
assign perf_read_misses = !pipeline_stall && miss_st2 && !is_mshr_st2 && !mem_rw_st2;
assign perf_write_misses = !pipeline_stall && miss_st2 && !is_mshr_st2 && mem_rw_st2;
assign perf_mshr_stalls = mshr_going_full;
assign perf_pipe_stalls = pipeline_stall || mshr_going_full;
`endif
`ifdef DBG_PRINT_CACHE_BANK

View file

@ -134,15 +134,13 @@ module VX_cache #(
wire [NUM_BANKS-1:0] per_bank_snp_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_miss;
assign miss_vec = per_bank_miss;
assign miss_vec = per_bank_miss;
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank;
wire [NUM_BANKS-1:0] perf_evict_per_bank;
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank;
`endif
if (NUM_BANKS == 1) begin
@ -156,13 +154,20 @@ module VX_cache #(
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS)
) cache_core_req_bank_sel (
) cache_core_req_bank_sel (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.bank_stalls (perf_cache_if.bank_stalls),
`else
`UNUSED_PIN (bank_stalls),
`endif
.core_req_valid (core_req_valid),
.core_req_addr (core_req_addr),
.core_req_ready (core_req_ready),
.per_bank_valid (per_bank_core_req_valid),
.per_bank_tid (per_bank_core_req_tid),
.per_bank_ready (per_bank_core_req_ready)
.per_bank_ready (per_bank_core_req_ready)
);
assign dram_req_tag = dram_req_addr;
@ -297,7 +302,8 @@ module VX_cache #(
`SCOPE_BIND_VX_cache_bank(i)
.clk (clk),
.reset (reset),
.reset (reset),
// Core request
.core_req_valid (curr_bank_core_req_valid),
.core_req_tid (curr_bank_core_req_tid),
@ -330,11 +336,10 @@ module VX_cache #(
.dram_rsp_ready (curr_bank_dram_rsp_ready),
`ifdef PERF_ENABLE
.perf_mshr_stall (perf_mshr_stall_per_bank[i]),
.perf_pipe_stall (perf_pipe_stall_per_bank[i]),
.perf_evict (perf_evict_per_bank[i]),
.perf_read_miss (perf_read_miss_per_bank[i]),
.perf_write_miss (perf_write_miss_per_bank[i]),
.perf_read_misses (perf_read_miss_per_bank[i]),
.perf_write_misses (perf_write_miss_per_bank[i]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[i]),
.perf_pipe_stalls (perf_pipe_stall_per_bank[i]),
`endif
// Snoop request
@ -434,47 +439,33 @@ module VX_cache #(
end
`ifdef PERF_ENABLE
// per cycle: core_req_r, core_req_w
reg [($clog2(NUM_REQS+1)-1):0] perf_core_req_r_per_cycle, perf_core_req_w_per_cycle;
// per cycle: core_reads, core_writes
reg [($clog2(NUM_REQS+1)-1):0] perf_core_reads_per_cycle, perf_core_writes_per_cycle;
reg [($clog2(NUM_REQS+1)-1):0] perf_crsp_stall_per_cycle;
// Per-cycle count of accepted core read requests: lanes that are valid, ready,
// and have the rw flag clear. VX_countones is defined elsewhere; presumably it
// returns the number of set bits in `valids` — confirm.
// NOTE(review): the bitwise AND assumes core_req_ready and core_req_rw are
// NUM_REQS bits wide. Another variant visible in this diff replicates them with
// {NUM_REQS{...}} when CORE_TAG_ID_BITS != 0 — confirm the widths in that
// configuration, otherwise only lane 0 would be counted.
VX_countones #(
.N(NUM_REQS)
) perf_countones_core_reads_count (
.valids (core_req_valid & core_req_ready & ~core_req_rw),
.count (perf_core_reads_per_cycle)
);
// Per-cycle count of accepted core write requests: same masking as above, but
// selecting lanes whose rw flag is set.
VX_countones #(
.N(NUM_REQS)
) perf_countones_core_writes_count (
.valids (core_req_valid & core_req_ready & core_req_rw),
.count (perf_core_writes_per_cycle)
);
if (CORE_TAG_ID_BITS != 0) begin
VX_countones #( // core_req_r
.N(NUM_REQS)
) perf_countones_core_req_r_count (
.valids (core_req_valid & {NUM_REQS{core_req_ready & ~core_req_rw}}),
.count (perf_core_req_r_per_cycle)
);
VX_countones #( // core_req_w
.N(NUM_REQS)
) perf_countones_core_req_w_count (
.valids (core_req_valid & {NUM_REQS{core_req_ready & core_req_rw}}),
.count (perf_core_req_w_per_cycle)
);
VX_countones #( // core_rsp
VX_countones #(
.N(NUM_REQS)
) perf_countones_core_rsp_count (
.valids (core_rsp_valid & {NUM_REQS{!core_rsp_ready}}),
.count (perf_crsp_stall_per_cycle)
);
end else begin
VX_countones #( // core_req_r
.N(NUM_REQS)
) perf_countones_core_req_r_count (
.valids (core_req_valid & core_req_ready & ~core_req_rw),
.count (perf_core_req_r_per_cycle)
);
VX_countones #( // core_req_w
.N(NUM_REQS)
) perf_countones_core_req_w_count (
.valids (core_req_valid & core_req_ready & core_req_rw),
.count (perf_core_req_w_per_cycle)
);
VX_countones #( // core_rsp
VX_countones #(
.N(NUM_REQS)
) perf_countones_core_rsp_count (
.valids (core_rsp_valid & ~core_rsp_ready),
@ -482,33 +473,11 @@ module VX_cache #(
);
end
// per cycle: msrq stalls, pipeline stalls, evictions, read misses, write misses
reg [($clog2(NUM_BANKS+1)-1):0] perf_mshr_stall_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_evictions_per_cycle;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
reg [($clog2(NUM_BANKS+1)-1):0] perf_read_miss_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_write_miss_per_cycle;
VX_countones #(
.N(NUM_BANKS)
) perf_countones_mshr_stall_count (
.valids (perf_mshr_stall_per_bank),
.count (perf_mshr_stall_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_total_stall_count (
.valids (perf_pipe_stall_per_bank),
.count (perf_pipe_stall_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_EVICTSict_count (
.valids (perf_evict_per_bank),
.count (perf_evictions_per_cycle)
);
reg [($clog2(NUM_BANKS+1)-1):0] perf_mshr_stall_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle;
VX_countones #(
.N(NUM_BANKS)
@ -524,59 +493,55 @@ module VX_cache #(
.count (perf_write_miss_per_cycle)
);
reg [63:0] perf_core_req_r;
reg [63:0] perf_core_req_w;
reg [63:0] perf_mshr_stall;
reg [63:0] perf_pipe_stall;
reg [63:0] perf_evictions;
reg [63:0] perf_read_miss;
reg [63:0] perf_write_miss;
reg [63:0] perf_crsp_stall;
reg [63:0] perf_dreq_stall;
VX_countones #(
.N(NUM_BANKS)
) perf_countones_mshr_stall_count (
.valids (perf_mshr_stall_per_bank),
.count (perf_mshr_stall_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_total_stall_count (
.valids (perf_pipe_stall_per_bank),
.count (perf_pipe_stall_per_cycle)
);
reg [63:0] perf_core_reads;
reg [63:0] perf_core_writes;
reg [63:0] perf_read_misses;
reg [63:0] perf_write_misses;
reg [63:0] perf_mshr_stalls;
reg [63:0] perf_pipe_stalls;
reg [63:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin
perf_core_req_r <= 0;
perf_core_req_w <= 0;
perf_crsp_stall <= 0;
perf_mshr_stall <= 0;
perf_pipe_stall <= 0;
perf_evictions <= 0;
perf_read_miss <= 0;
perf_write_miss <= 0;
perf_dreq_stall <= 0;
perf_core_reads <= 0;
perf_core_writes <= 0;
perf_read_misses <= 0;
perf_write_misses <= 0;
perf_mshr_stalls <= 0;
perf_pipe_stalls <= 0;
perf_crsp_stalls <= 0;
end else begin
// core requests
perf_core_req_r <= perf_core_req_r + $bits(perf_core_req_r)'(perf_core_req_r_per_cycle);
perf_core_req_w <= perf_core_req_w + $bits(perf_core_req_w)'(perf_core_req_w_per_cycle);
// core response stalls
perf_crsp_stall <= perf_crsp_stall + $bits(perf_crsp_stall)'(perf_crsp_stall_per_cycle);
// miss reserve queue stalls
perf_mshr_stall <= perf_mshr_stall + $bits(perf_mshr_stall)'(perf_mshr_stall_per_cycle);
// pipeline stalls
perf_pipe_stall <= perf_pipe_stall + $bits(perf_pipe_stall)'(perf_pipe_stall_per_cycle);
// total evictions
perf_evictions <= perf_evictions + $bits(perf_evictions)'(perf_evictions_per_cycle);
// read misses
perf_read_miss <= perf_read_miss + $bits(perf_read_miss)'(perf_read_miss_per_cycle);
// write misses
perf_write_miss <= perf_write_miss + $bits(perf_write_miss)'(perf_write_miss_per_cycle);
// dram request stalls
if (dram_req_valid & !dram_req_ready) begin
perf_dreq_stall <= perf_dreq_stall + 64'd1;
end
perf_core_reads <= perf_core_reads + 64'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + 64'(perf_core_writes_per_cycle);
perf_read_misses <= perf_read_misses + 64'(perf_read_miss_per_cycle);
perf_write_misses <= perf_write_misses + 64'(perf_write_miss_per_cycle);
perf_mshr_stalls <= perf_mshr_stalls + 64'(perf_mshr_stall_per_cycle);
perf_pipe_stalls <= perf_pipe_stalls + 64'(perf_pipe_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + 64'(perf_crsp_stall_per_cycle);
end
end
assign perf_cache_if.reads = perf_core_req_r;
assign perf_cache_if.writes = perf_core_req_w;
assign perf_cache_if.read_misses = perf_read_miss;
assign perf_cache_if.write_misses = perf_write_miss;
assign perf_cache_if.evictions = perf_evictions;
assign perf_cache_if.mshr_stalls = perf_mshr_stall;
assign perf_cache_if.pipe_stalls = perf_pipe_stall;
assign perf_cache_if.crsp_stalls = perf_crsp_stall;
assign perf_cache_if.dreq_stalls = perf_dreq_stall;
assign perf_cache_if.reads = perf_core_reads;
assign perf_cache_if.writes = perf_core_writes;
assign perf_cache_if.read_misses = perf_read_misses;
assign perf_cache_if.write_misses = perf_write_misses;
assign perf_cache_if.mshr_stalls = perf_mshr_stalls;
assign perf_cache_if.pipe_stalls = perf_pipe_stalls;
assign perf_cache_if.crsp_stalls = perf_crsp_stalls;
`endif
endmodule

View file

@ -10,17 +10,21 @@ module VX_cache_core_req_bank_sel #(
// Number of Word requests per cycle
parameter NUM_REQS = 1
) (
input wire clk,
input wire reset,
input wire [NUM_REQS-1:0] core_req_valid,
input wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr,
output wire [NUM_REQS-1:0] core_req_ready,
output wire [NUM_BANKS-1:0] per_bank_valid,
output wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_tid,
input wire [NUM_BANKS-1:0] per_bank_ready
input wire [NUM_BANKS-1:0] per_bank_ready,
output wire [63:0] bank_stalls
);
if (NUM_BANKS > 1) begin
if (NUM_BANKS > 1) begin
// Per-bank results of the request-to-bank mapping (one entry per bank); these
// drive the per_bank_valid / per_bank_tid outputs.
reg [NUM_BANKS-1:0] per_bank_valid_r;
reg [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_tid_r;
// Per-request ready flags (one bit per core request lane).
reg [NUM_REQS-1:0] core_req_ready_r;
// Per-request "selected by a bank" flags. This vector is indexed by the request
// index and masked against core_req_valid, so it must be NUM_REQS bits wide.
// Sizing it by NUM_BANKS drops or mis-extends bits when NUM_REQS != NUM_BANKS,
// which corrupts the bank-stall count.
reg [NUM_REQS-1:0] core_req_sel_r;
// Bank id targeted by each request lane; compared against the bank index below.
wire [NUM_REQS-1:0][`BANK_BITS-1:0] core_req_bid;
for (genvar i = 0; i < NUM_REQS; ++i) begin
@ -40,28 +44,41 @@ module VX_cache_core_req_bank_sel #(
// Combinational ready/select resolution for the incoming core requests.
// Both vectors are cleared first. Then, for each bank j, the request lanes are
// scanned in ascending index order and only the first valid lane whose bank id
// equals j is serviced. Any other valid lane aimed at the same bank keeps
// ready = 0 and sel = 0 for this evaluation.
// NOTE(review): core_req_bid is assigned outside this view; presumably it holds
// the bank-index bits of core_req_addr — confirm.
always @(*) begin
core_req_ready_r = 0;
core_req_sel_r = 0;
for (integer j = 0; j < NUM_BANKS; ++j) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
if (core_req_valid[i] && (core_req_bid[i] == `BANK_BITS'(j))) begin
// The selected lane is ready only when its target bank is ready.
core_req_ready_r[i] = per_bank_ready[j];
core_req_sel_r[i] = 1;
// Stop at the first match so at most one lane is selected per bank.
break;
end
end
end
end
// 64-bit running total of bank stalls, cleared while reset is asserted.
// On every other clock edge it grows by the number of request lanes that are
// valid but not flagged in core_req_sel_r during that cycle.
reg [63:0] bank_stalls_r;
always @(posedge clk) begin
if (reset) begin
bank_stalls_r <= 0;
end else begin
// $countones gives the per-cycle count; the cast widens it to the counter width.
bank_stalls_r <= bank_stalls_r + 64'($countones(core_req_valid & ~core_req_sel_r));
end
end
assign per_bank_valid = per_bank_valid_r;
assign per_bank_tid = per_bank_tid_r;
assign core_req_ready = core_req_ready_r;
assign bank_stalls = bank_stalls_r;
end else begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (core_req_valid)
`UNUSED_VAR (core_req_addr)
assign per_bank_valid = core_req_valid;
assign per_bank_tid = 0;
assign core_req_ready[0] = per_bank_ready;
assign bank_stalls = 0;
end
endmodule

View file

@ -48,6 +48,7 @@ module VX_tag_store #(
VX_dp_ram #(
.DATAW(`TAG_SELECT_BITS),
.SIZE(`BANK_LINE_COUNT),
.FASTRAM(1),
.RWCHECK(1)
) tags (
.clk(clk),

View file

@ -6,14 +6,13 @@
interface VX_perf_cache_if ();
wire [63:0] reads;
wire [63:0] writes;
wire [63:0] writes;
wire [63:0] read_misses;
wire [63:0] write_misses;
wire [63:0] evictions;
wire [63:0] bank_stalls;
wire [63:0] mshr_stalls;
wire [63:0] pipe_stalls;
wire [63:0] crsp_stalls;
wire [63:0] dreq_stalls;
wire [63:0] pipe_stalls;
endinterface

View file

@ -7,24 +7,26 @@ interface VX_perf_memsys_if ();
wire [63:0] icache_reads;
wire [63:0] icache_read_misses;
wire [63:0] icache_mshr_stalls;
wire [63:0] icache_crsp_stalls;
wire [63:0] icache_dreq_stalls;
wire [63:0] icache_pipe_stalls;
wire [63:0] icache_crsp_stalls;
wire [63:0] dcache_reads;
wire [63:0] dcache_writes;
wire [63:0] dcache_writes;
wire [63:0] dcache_read_misses;
wire [63:0] dcache_write_misses;
wire [63:0] dcache_evictions;
wire [63:0] dcache_bank_stalls;
wire [63:0] dcache_mshr_stalls;
wire [63:0] dcache_crsp_stalls;
wire [63:0] dcache_dreq_stalls;
wire [63:0] dcache_pipe_stalls;
wire [63:0] dcache_crsp_stalls;
wire [63:0] smem_reads;
wire [63:0] smem_writes;
wire [63:0] smem_bank_stalls;
wire [63:0] dram_reads;
wire [63:0] dram_writes;
wire [63:0] dram_stalls;
wire [63:0] dram_latency;
wire [63:0] dram_requests;
wire [63:0] dram_responses;
endinterface

View file

@ -4,12 +4,8 @@
`include "VX_define.vh"
interface VX_perf_pipeline_if ();
// from pipeline
wire [63:0] icache_stalls;
wire [63:0] ibuffer_stalls;
// from issue
wire [63:0] scoreboard_stalls;
// from execute
wire [63:0] ibf_stalls;
wire [63:0] scb_stalls;
wire [63:0] lsu_stalls;
wire [63:0] csr_stalls;
wire [63:0] alu_stalls;

View file

@ -71,8 +71,8 @@ void Simulator::reset() {
vortex_->dram_rsp_valid = 0;
vortex_->dram_req_ready = 0;
vortex_->io_req_ready = 0;
vortex_->io_rsp_valid = 0;
//vortex_->io_req_ready = 0;
//vortex_->io_rsp_valid = 0;
vortex_->snp_req_valid = 0;
vortex_->snp_rsp_ready = 0;
vortex_->csr_io_req_valid = 0;
@ -201,7 +201,7 @@ void Simulator::eval_dram_bus() {
}
void Simulator::eval_io_bus() {
for (int i = 0; i < NUM_THREADS; ++i) {
/*for (int i = 0; i < NUM_THREADS; ++i) {
if (((vortex_->io_req_valid >> i) & 0x1)
&& ((VL_WDATA_GETW(vortex_->io_req_addr, i, NUM_THREADS, 30) << 2) == IO_BUS_ADDR_COUT)) {
assert(vortex_->io_req_rw);
@ -217,7 +217,7 @@ void Simulator::eval_io_bus() {
}
}
vortex_->io_req_ready = 1;
vortex_->io_rsp_valid = 0;
vortex_->io_rsp_valid = 0;*/
}
void Simulator::eval_snp_bus() {