merging perf counters

This commit is contained in:
Blaise Tine 2020-12-08 21:02:39 -08:00
commit d5438fd591
27 changed files with 1047 additions and 230 deletions

View file

@ -6,6 +6,11 @@ all:
$(MAKE) -C simX
$(MAKE) -C benchmarks/opencl
perf-demo:
$(MAKE) -C hw
$(MAKE) -C driver rtlsim
$(MAKE) -C driver/tests/demo/ run-rtlsim
clean:
$(MAKE) -C hw clean
$(MAKE) -C driver clean

View file

@ -6,7 +6,7 @@ set -e
show_usage()
{
echo "Vortex BlackBox Test Driver v1.0"
echo "Usage: [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=rtlsim|vlsim] [--debug] [--scope] [--app=vecadd|sgemm|basic|demo|dogfood] [--args=<args>] [--help]]"
echo "Usage: [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=rtlsim|vlsim] [--debug] [--scope] [--perf] [--app=vecadd|sgemm|basic|demo|dogfood] [--args=<args>] [--help]]"
}
DRIVER=vlsim
@ -64,6 +64,10 @@ case $i in
SCOPE=1
shift
;;
--perf)
PERF=-DPERF_ENABLE
shift
;;
--args=*)
ARGS=${i#*=}
HAS_ARGS=1
@ -125,7 +129,7 @@ case $APP in
;;
esac
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3"
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3 $PERF"
echo "CONFIGS=$CONFIGS"

View file

@ -91,24 +91,240 @@ extern int vx_upload_kernel_file(vx_device_h device, const char* filename) {
return err;
}
extern int vx_get_perf(vx_device_h device, int core_id, size_t* instrs, size_t* cycles) {
int vx_csr_get_l(vx_device_h device, int core_id, int addr, int addr_h, uint64_t* value) {
int ret = 0;
unsigned value_lo, value_hi;
ret |= vx_csr_get(device, core_id, addr, &value_lo);
ret |= vx_csr_get(device, core_id, addr_h, &value_hi);
*value = (uint64_t(value_hi) << 32) | value_lo;
return ret;
}
// Read the performance counters of every core and print a human-readable
// summary to `stream`. Per-core lines are printed only when there is more
// than one core; totals are always printed. Pipeline/cache/DRAM counters
// are available only when the hardware was built with PERF_ENABLE.
//
// Returns the OR of all underlying vx_csr_get_l() status codes (0 on success).
extern int vx_dump_perf(vx_device_h device, FILE* stream) {
  int ret = 0;

  // The total core count is exposed through the CSR_NC register of core 0.
  unsigned num_cores;
  vx_csr_get(device, 0, CSR_NC, &num_cores);

  uint64_t instrs = 0;
  uint64_t cycles = 0;
#ifdef PERF_ENABLE
  // PERF: pipeline stalls
  uint64_t lsu_stalls = 0;
  uint64_t fpu_stalls = 0;
  uint64_t mul_stalls = 0;
  uint64_t csr_stalls = 0;
  uint64_t alu_stalls = 0;
  uint64_t gpu_stalls = 0;
  uint64_t ibuffer_stalls = 0;
  uint64_t scoreboard_stalls = 0;
  uint64_t icache_stalls = 0;
  // PERF: Icache
  uint64_t icache_reads = 0;
  uint64_t icache_read_misses = 0;
  uint64_t icache_pipe_stalls = 0;
  uint64_t icache_dram_stalls = 0;
  uint64_t icache_mshr_stalls = 0;
  uint64_t icache_rsp_stalls = 0;
  // PERF: Dcache
  uint64_t dcache_reads = 0;
  uint64_t dcache_writes = 0;
  uint64_t dcache_read_misses = 0;
  uint64_t dcache_write_misses = 0;
  uint64_t dcache_pipe_stalls = 0;
  uint64_t dcache_dram_stalls = 0;
  uint64_t dcache_mshr_stalls = 0;
  uint64_t dcache_rsp_stalls = 0;
  uint64_t dcache_evictions = 0;
  // PERF: memory
  uint64_t dram_req = 0;
  uint64_t dram_rsp = 0;
  uint64_t dram_lat = 0;
#endif

  for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
    uint64_t instrs_per_core, cycles_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MINSTRET, CSR_MINSTRET_H, &instrs_per_core);
    ret |= vx_csr_get_l(device, core_id, CSR_MCYCLE, CSR_MCYCLE_H, &cycles_per_core);
    float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
    if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
    instrs += instrs_per_core;
    // Cores run in parallel, so the elapsed time is the slowest core's cycles.
    cycles = std::max<uint64_t>(cycles_per_core, cycles);
#ifdef PERF_ENABLE
    // PERF: pipeline
    // icache_stall
    uint64_t icache_stalls_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_ST, CSR_MPM_ICACHE_ST_H, &icache_stalls_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: icache stalls=%ld\n", core_id, icache_stalls_per_core);
    icache_stalls += icache_stalls_per_core;
    // ibuffer_stall
    uint64_t ibuffer_stalls_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_IBUF_ST, CSR_MPM_IBUF_ST_H, &ibuffer_stalls_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core);
    ibuffer_stalls += ibuffer_stalls_per_core;
    // scoreboard_stall
    uint64_t scoreboard_stalls_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_SCRB_ST, CSR_MPM_SCRB_ST_H, &scoreboard_stalls_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core);
    scoreboard_stalls += scoreboard_stalls_per_core;
    // alu_stall
    uint64_t alu_stalls_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_ALU_ST, CSR_MPM_ALU_ST_H, &alu_stalls_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: alu stalls=%ld\n", core_id, alu_stalls_per_core);
    alu_stalls += alu_stalls_per_core;
    // lsu_stall
    uint64_t lsu_stalls_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_LSU_ST, CSR_MPM_LSU_ST_H, &lsu_stalls_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu stalls=%ld\n", core_id, lsu_stalls_per_core);
    lsu_stalls += lsu_stalls_per_core;
    // csr_stall
    uint64_t csr_stalls_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_CSR_ST, CSR_MPM_CSR_ST_H, &csr_stalls_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: csr stalls=%ld\n", core_id, csr_stalls_per_core);
    csr_stalls += csr_stalls_per_core;
    // mul_stall
    uint64_t mul_stalls_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_MUL_ST, CSR_MPM_MUL_ST_H, &mul_stalls_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: mul stalls=%ld\n", core_id, mul_stalls_per_core);
    mul_stalls += mul_stalls_per_core;
    // fpu_stall
    uint64_t fpu_stalls_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_FPU_ST, CSR_MPM_FPU_ST_H, &fpu_stalls_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu stalls=%ld\n", core_id, fpu_stalls_per_core);
    fpu_stalls += fpu_stalls_per_core;
    // gpu_stall
    uint64_t gpu_stalls_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_GPU_ST, CSR_MPM_GPU_ST_H, &gpu_stalls_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu stalls=%ld\n", core_id, gpu_stalls_per_core);
    gpu_stalls += gpu_stalls_per_core;
    // PERF: Icache
    // total reads
    uint64_t icache_reads_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_READS, CSR_MPM_ICACHE_READS_H, &icache_reads_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads_per_core);
    icache_reads += icache_reads_per_core;
    // read misses
    uint64_t icache_miss_r_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_MISS_R, CSR_MPM_ICACHE_MISS_R_H, &icache_miss_r_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld\n", core_id, icache_miss_r_per_core);
    icache_read_misses += icache_miss_r_per_core;
    // pipeline stalls
    uint64_t icache_pipe_st_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_PIPE_ST, CSR_MPM_ICACHE_PIPE_ST_H, &icache_pipe_st_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: icache pipeline stalls=%ld\n", core_id, icache_pipe_st_per_core);
    icache_pipe_stalls += icache_pipe_st_per_core;
    // response stalls
    uint64_t icache_crsp_st_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_CRSP_ST, CSR_MPM_ICACHE_CRSP_ST_H, &icache_crsp_st_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: icache response stalls=%ld\n", core_id, icache_crsp_st_per_core);
    icache_rsp_stalls += icache_crsp_st_per_core;
    // dram_stalls
    uint64_t icache_dram_st_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_DREQ_ST, CSR_MPM_ICACHE_DREQ_ST_H, &icache_dram_st_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: icache dram stalls=%ld\n", core_id, icache_dram_st_per_core);
    icache_dram_stalls += icache_dram_st_per_core;
    // mshr_stalls
    uint64_t icache_mshr_st_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_MSHR_ST, CSR_MPM_ICACHE_MSHR_ST_H, &icache_mshr_st_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: icache mshr stalls=%ld\n", core_id, icache_mshr_st_per_core);
    icache_mshr_stalls += icache_mshr_st_per_core;
    // PERF: Dcache
    // total reads
    uint64_t dcache_reads_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_READS, CSR_MPM_DCACHE_READS_H, &dcache_reads_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads_per_core);
    dcache_reads += dcache_reads_per_core;
    // total writes
    uint64_t dcache_writes_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_WRITES, CSR_MPM_DCACHE_WRITES_H, &dcache_writes_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes_per_core);
    dcache_writes += dcache_writes_per_core;
    // read misses
    uint64_t dcache_miss_r_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_R, CSR_MPM_DCACHE_MISS_R_H, &dcache_miss_r_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache read misses=%ld\n", core_id, dcache_miss_r_per_core);
    dcache_read_misses += dcache_miss_r_per_core;
    // write misses
    uint64_t dcache_miss_w_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_W, CSR_MPM_DCACHE_MISS_W_H, &dcache_miss_w_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache write misses=%ld\n", core_id, dcache_miss_w_per_core);
    dcache_write_misses += dcache_miss_w_per_core;
    // total evictions
    uint64_t dcache_evictions_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_EVICTS, CSR_MPM_DCACHE_EVICTS_H, &dcache_evictions_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache evictions=%ld\n", core_id, dcache_evictions_per_core);
    dcache_evictions += dcache_evictions_per_core;
    // pipeline stalls
    uint64_t dcache_pipe_st_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_PIPE_ST, CSR_MPM_DCACHE_PIPE_ST_H, &dcache_pipe_st_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core);
    dcache_pipe_stalls += dcache_pipe_st_per_core;
    // response stalls
    uint64_t dcache_crsp_st_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_CRSP_ST, CSR_MPM_DCACHE_CRSP_ST_H, &dcache_crsp_st_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache response stalls=%ld\n", core_id, dcache_crsp_st_per_core);
    dcache_rsp_stalls += dcache_crsp_st_per_core;
    // dram_stalls
    uint64_t dcache_dram_st_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_DREQ_ST, CSR_MPM_DCACHE_DREQ_ST_H, &dcache_dram_st_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache dram stalls=%ld\n", core_id, dcache_dram_st_per_core);
    dcache_dram_stalls += dcache_dram_st_per_core;
    // mshr_stalls
    uint64_t dcache_mshr_st_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MSHR_ST, CSR_MPM_DCACHE_MSHR_ST_H, &dcache_mshr_st_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core);
    dcache_mshr_stalls += dcache_mshr_st_per_core;
    // PERF: dram latency
    uint64_t dram_req_per_core, dram_rsp_per_core, dram_lat_per_core;
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_REQ, CSR_MPM_DRAM_REQ_H, &dram_req_per_core);
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_RSP, CSR_MPM_DRAM_RSP_H, &dram_rsp_per_core);
    ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core);
    // Guard the cast: double/0.0 is infinity, and converting that to int is UB.
    int avg_dram_lat_per_core = dram_rsp_per_core ? (int)(double(dram_lat_per_core) / double(dram_rsp_per_core)) : 0;
    if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, dram_req_per_core, dram_rsp_per_core, dram_req_per_core - dram_rsp_per_core);
    if (num_cores > 1) fprintf(stream, "PERF: core%d: average dram latency=%d cycles\n", core_id, avg_dram_lat_per_core);
    dram_req += dram_req_per_core;
    dram_rsp += dram_rsp_per_core;
    dram_lat += dram_lat_per_core;
#endif
  }

  float IPC = (float)(double(instrs) / double(cycles));
  fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
#ifdef PERF_ENABLE
  fprintf(stream, "PERF: icache stalls=%ld\n", icache_stalls);
  fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
  fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
  fprintf(stream, "PERF: alu stalls=%ld\n", alu_stalls);
  fprintf(stream, "PERF: lsu stalls=%ld\n", lsu_stalls);
  fprintf(stream, "PERF: csr stalls=%ld\n", csr_stalls);
  fprintf(stream, "PERF: mul stalls=%ld\n", mul_stalls);
  fprintf(stream, "PERF: fpu stalls=%ld\n", fpu_stalls);
  fprintf(stream, "PERF: gpu stalls=%ld\n", gpu_stalls);
  fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
  fprintf(stream, "PERF: icache read misses=%ld\n", icache_read_misses);
  fprintf(stream, "PERF: icache response stalls=%ld\n", icache_rsp_stalls);
  fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls);
  fprintf(stream, "PERF: icache dram stalls=%ld\n", icache_dram_stalls);
  fprintf(stream, "PERF: icache mshr stalls=%ld\n", icache_mshr_stalls);
  fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
  fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
  fprintf(stream, "PERF: dcache read misses=%ld\n", dcache_read_misses);
  fprintf(stream, "PERF: dcache write misses=%ld\n", dcache_write_misses);
  fprintf(stream, "PERF: dcache evictions=%ld\n", dcache_evictions);
  fprintf(stream, "PERF: dcache pipeline stalls=%ld\n", dcache_pipe_stalls);
  fprintf(stream, "PERF: dcache response stalls=%ld\n", dcache_rsp_stalls);
  fprintf(stream, "PERF: dcache dram stalls=%ld\n", dcache_dram_stalls);
  fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
  fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", dram_req, dram_rsp, dram_req - dram_rsp);
  // Guard the cast: double/0.0 is infinity, and converting that to int is UB.
  int avg_dram_lat = dram_rsp ? (int)(double(dram_lat) / double(dram_rsp)) : 0;
  fprintf(stream, "PERF: average dram latency=%d cycles\n", avg_dram_lat);
#endif
  // NOTE(review): %ld assumes 64-bit long (LP64); use PRIu64 from <cinttypes>
  // if 32-bit hosts must be supported — confirm target platforms.
  return ret;
}

View file

@ -2,6 +2,7 @@
#define __VX_DRIVER_H__
#include <stddef.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
@ -71,8 +72,8 @@ int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size)
// upload kernel file to device
int vx_upload_kernel_file(vx_device_h device, const char* filename);
// get performance counters
int vx_get_perf(vx_device_h device, int core_id, size_t* instrs, size_t* cycles);
// dump performance counters
int vx_dump_perf(vx_device_h device, FILE* stream);
#ifdef __cplusplus
}

View file

@ -58,6 +58,12 @@ ifdef SCOPE
SCOPE_H = scope-defs.h
endif
# Enable perf counters
ifdef PERF
CXXFLAGS += -DPERF_ENABLE
PERF_ENABLE = PERF=1
endif
all: vlsim
# AFU info from JSON file, including AFU UUID
@ -71,7 +77,7 @@ scope-defs.h: $(SCRIPT_DIR)/scope.json
scope: scope-defs.h
vlsim-hw: $(SCOPE_H)
$(SCOPE_ENABLE) $(MAKE) -C vlsim
$(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C vlsim
fpga: $(SRCS) $(SCOPE_H)
$(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) $(FPGA_LIBS) -o $(PROJECT)
@ -94,7 +100,6 @@ $(ASE_DIR):
clean:
rm -rf $(PROJECT) $(PROJECT_ASE) $(PROJECT_VLSIM) *.o .depend
$(MAKE) -C vlsim clean
$(MAKE) -C ase clean
ifneq ($(MAKECMDGOALS),clean)
-include .depend

View file

@ -43,8 +43,9 @@ RTL_DIR=../../../hw/rtl
SRCS = fpga.cpp opae_sim.cpp
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME
@ -70,6 +71,12 @@ ifdef SCOPE
CFLAGS += -DSCOPE
endif
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CFLAGS += -DPERF_ENABLE
endif
# use our OPAE shim
VL_FLAGS += -DNOPAE
CFLAGS += -DNOPAE
@ -77,8 +84,6 @@ CFLAGS += -DNOPAE
# use DPI FPU
VL_FLAGS += -DFPU_FAST
RTL_INCLUDE += -I../../../hw/opae -I../../../hw/opae/ccip
PROJECT = libopae-c-vlsim.so
all: $(PROJECT)

View file

@ -244,27 +244,7 @@ extern int vx_dev_close(vx_device_h hdevice) {
#endif
#ifdef DUMP_PERF_STATS
// Dump perf stats
if (device->num_cores > 1) {
uint64_t total_instrs = 0, total_cycles = 0;
for (unsigned core_id = 0; core_id < device->num_cores; ++core_id) {
uint64_t instrs, cycles;
int ret = vx_get_perf(hdevice, core_id, &instrs, &cycles);
assert(ret == 0);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "[VXDRV] PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
total_instrs += instrs;
total_cycles = std::max<uint64_t>(total_cycles, cycles);
}
float IPC = (float)(double(total_instrs) / double(total_cycles));
fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
} else {
uint64_t instrs, cycles;
int ret = vx_get_perf(hdevice, 0, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
assert(ret == 0);
fprintf(stdout, "[VXDRV] PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
}
vx_dump_perf(device, stdout);
#endif
fpgaClose(device->fpga);

View file

@ -64,6 +64,12 @@ else
CFLAGS += -DNDEBUG
endif
# Enable perf counters
ifdef PERF
VL_FLAGS += -DPERF_ENABLE
CFLAGS += -DPERF_ENABLE
endif
# use DPI FPU
VL_FLAGS += -DFPU_FAST

View file

@ -239,26 +239,7 @@ extern int vx_dev_close(vx_device_h hdevice) {
vx_device *device = ((vx_device*)hdevice);
#ifdef DUMP_PERF_STATS
unsigned num_cores;
vx_csr_get(hdevice, 0, CSR_NC, &num_cores);
if (num_cores > 1) {
uint64_t total_instrs = 0, total_cycles = 0;
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t instrs, cycles;
vx_get_perf(hdevice, core_id, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
total_instrs += instrs;
total_cycles = std::max<uint64_t>(total_cycles, cycles);
}
float IPC = (float)(double(total_instrs) / double(total_cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
} else {
uint64_t instrs, cycles;
vx_get_perf(hdevice, 0, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
}
vx_dump_perf(device, stdout);
#endif
delete device;

View file

@ -4,6 +4,7 @@
+define+QUARTUS
+define+FPU_FAST
#+define+SCOPE
#+define+PERF_ENABLE
#+define+DBG_PRINT_CORE_ICACHE
#+define+DBG_PRINT_CORE_DCACHE

View file

@ -297,6 +297,9 @@ module VX_cluster #(
);
if (`L2_ENABLE) begin
`ifdef PERF_ENABLE
VX_perf_cache_if perf_l2cache_if();
`endif
wire [`NUM_CORES-1:0] per_core_dram_req_valid_qual;
wire [`NUM_CORES-1:0] per_core_dram_req_rw_qual;
@ -345,10 +348,14 @@ module VX_cluster #(
.SNP_TAG_WIDTH (`L2SNP_TAG_WIDTH)
) l2cache (
`SCOPE_BIND_VX_cluster_l2cache
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_cache_if (perf_l2cache_if),
`endif
// Core request
.core_req_valid (per_core_dram_req_valid_qual),
.core_req_rw (per_core_dram_req_rw_qual),

View file

@ -126,10 +126,12 @@
// CSR Addresses //////////////////////////////////////////////////////////////
// User Floating-Point CSRs
`define CSR_FFLAGS 12'h001
`define CSR_FRM 12'h002
`define CSR_FCSR 12'h003
// SIMT CSRs
`define CSR_LTID 12'h020
`define CSR_LWID 12'h021
`define CSR_GTID 12'h022
@ -153,11 +155,73 @@
`define CSR_MEPC 12'h341
`define CSR_CYCLE 12'hC00
`define CSR_CYCLE_H 12'hC80
`define CSR_INSTRET 12'hC02
`define CSR_INSTRET_H 12'hC82
// Machine Counter/Timers
`define CSR_MCYCLE 12'hB00
`define CSR_MCYCLE_H 12'hB80
`define CSR_MINSTRET 12'hB02
`define CSR_MINSTRET_H 12'hB82
// Machine Performance-monitoring counters
// PERF: pipeline
`define CSR_MPM_ICACHE_ST 12'hB03
`define CSR_MPM_ICACHE_ST_H 12'hB83
`define CSR_MPM_IBUF_ST 12'hB04
`define CSR_MPM_IBUF_ST_H 12'hB84
`define CSR_MPM_SCRB_ST 12'hB05
`define CSR_MPM_SCRB_ST_H 12'hB85
`define CSR_MPM_ALU_ST 12'hB06
`define CSR_MPM_ALU_ST_H 12'hB86
`define CSR_MPM_LSU_ST 12'hB07
`define CSR_MPM_LSU_ST_H 12'hB87
`define CSR_MPM_CSR_ST 12'hB08
`define CSR_MPM_CSR_ST_H 12'hB88
`define CSR_MPM_MUL_ST 12'hB09
`define CSR_MPM_MUL_ST_H 12'hB89
`define CSR_MPM_FPU_ST 12'hB0A
`define CSR_MPM_FPU_ST_H 12'hB8A
`define CSR_MPM_GPU_ST 12'hB0B
`define CSR_MPM_GPU_ST_H 12'hB8B
// PERF: icache
`define CSR_MPM_ICACHE_MISS_R 12'hB0C // read misses
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8C
`define CSR_MPM_ICACHE_DREQ_ST 12'hB0D // dram request stalls
`define CSR_MPM_ICACHE_DREQ_ST_H 12'hB8D
`define CSR_MPM_ICACHE_CRSP_ST 12'hB0E // core response stalls
`define CSR_MPM_ICACHE_CRSP_ST_H 12'hB8E
`define CSR_MPM_ICACHE_MSHR_ST 12'hB0F // MSHR stalls
`define CSR_MPM_ICACHE_MSHR_ST_H 12'hB8F
`define CSR_MPM_ICACHE_PIPE_ST 12'hB10 // pipeline stalls
`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB90
`define CSR_MPM_ICACHE_READS 12'hB11 // total reads
`define CSR_MPM_ICACHE_READS_H 12'hB91
// PERF: dcache
`define CSR_MPM_DCACHE_MISS_R 12'hB12 // read misses
`define CSR_MPM_DCACHE_MISS_R_H 12'hB92
`define CSR_MPM_DCACHE_MISS_W 12'hB13 // write misses
`define CSR_MPM_DCACHE_MISS_W_H 12'hB93
`define CSR_MPM_DCACHE_DREQ_ST 12'hB14 // dram request stalls
`define CSR_MPM_DCACHE_DREQ_ST_H 12'hB94
`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls
`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95
`define CSR_MPM_DCACHE_MSHR_ST 12'hB16 // MSHR stalls
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB96
`define CSR_MPM_DCACHE_PIPE_ST 12'hB17 // pipeline stalls
`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB97
`define CSR_MPM_DCACHE_READS 12'hB18 // total reads
`define CSR_MPM_DCACHE_READS_H 12'hB98
`define CSR_MPM_DCACHE_WRITES 12'hB19 // total writes
`define CSR_MPM_DCACHE_WRITES_H 12'hB99
`define CSR_MPM_DCACHE_EVICTS 12'hB1A // total evictions
`define CSR_MPM_DCACHE_EVICTS_H 12'hB9A
// PERF: memory
`define CSR_MPM_DRAM_LAT 12'hB1B // dram latency (total)
`define CSR_MPM_DRAM_LAT_H 12'hB9B
`define CSR_MPM_DRAM_REQ 12'hB1C // dram requests
`define CSR_MPM_DRAM_REQ_H 12'hB9C
`define CSR_MPM_DRAM_RSP 12'hB1D // dram responses
`define CSR_MPM_DRAM_RSP_H 12'hB9D
// Machine Information Registers
`define CSR_MVENDORID 12'hF11
`define CSR_MARCHID 12'hF12
`define CSR_MIMPID 12'hF13
@ -185,6 +249,38 @@
`define FPUQ_SIZE 4
`endif
// Icache Configurable Knobs //////////////////////////////////////////////////
// Size of cache in bytes
`ifndef ICACHE_SIZE
`define ICACHE_SIZE 4096
`endif
// Core Request Queue Size
`ifndef ICREQ_SIZE
`define ICREQ_SIZE 4
`endif
// Core Response Queue Size
`ifndef ICRSQ_SIZE
`define ICRSQ_SIZE 4
`endif
// Miss Handling Register Size
`ifndef IMSHR_SIZE
`define IMSHR_SIZE `NUM_WARPS
`endif
// DRAM Request Queue Size
`ifndef IDREQ_SIZE
`define IDREQ_SIZE 4
`endif
// DRAM Response Queue Size
`ifndef IDRSQ_SIZE
`define IDRSQ_SIZE 4
`endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
// Size of cache in bytes
@ -232,38 +328,6 @@
`define DSRSQ_SIZE 4
`endif
// Icache Configurable Knobs //////////////////////////////////////////////////
// Size of cache in bytes
`ifndef ICACHE_SIZE
`define ICACHE_SIZE 4096
`endif
// Core Request Queue Size
`ifndef ICREQ_SIZE
`define ICREQ_SIZE 4
`endif
// Core Response Queue Size
`ifndef ICRSQ_SIZE
`define ICRSQ_SIZE 4
`endif
// Miss Handling Register Size
`ifndef IMSHR_SIZE
`define IMSHR_SIZE `NUM_WARPS
`endif
// DRAM Request Queue Size
`ifndef IDREQ_SIZE
`define IDREQ_SIZE 4
`endif
// DRAM Response Queue Size
`ifndef IDRSQ_SIZE
`define IDRSQ_SIZE 4
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
// Size of cache in bytes

View file

@ -66,6 +66,10 @@ module VX_core #(
output wire busy,
output wire ebreak
);
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if();
`endif
VX_cache_dram_req_if #(
.DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH),
.DRAM_ADDR_WIDTH(`DDRAM_ADDR_WIDTH),
@ -174,6 +178,9 @@ module VX_core #(
.CORE_ID(CORE_ID)
) pipeline (
`SCOPE_BIND_VX_core_pipeline
`ifdef PERF_ENABLE
.perf_memsys_if (perf_memsys_if),
`endif
.clk(clk),
.reset(reset),
@ -231,6 +238,9 @@ module VX_core #(
.CORE_ID(CORE_ID)
) mem_unit (
`SCOPE_BIND_VX_core_mem_unit
`ifdef PERF_ENABLE
.perf_memsys_if (perf_memsys_if),
`endif
.clk (clk),
.reset (reset),
@ -238,7 +248,7 @@ module VX_core #(
// Core <-> Dcache
.core_dcache_req_if (core_dcache_req_if),
.core_dcache_rsp_if (core_dcache_rsp_if),
// Core <-> Icache
.core_icache_req_if (core_icache_req_if),
.core_icache_rsp_if (core_icache_rsp_if),

View file

@ -6,6 +6,11 @@ module VX_csr_data #(
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if,
`endif
VX_cmt_to_csr_if cmt_to_csr_if,
VX_fpu_to_csr_if fpu_to_csr_if,
@ -114,6 +119,67 @@ module VX_csr_data #(
`CSR_NW : read_data_r = `NUM_WARPS;
`CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS;
`ifdef PERF_ENABLE
// PERF: pipeline
`CSR_MPM_ICACHE_ST : read_data_r = perf_pipeline_if.icache_stalls[31:0];
`CSR_MPM_ICACHE_ST_H : read_data_r = perf_pipeline_if.icache_stalls[63:32];
`CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibuffer_stalls[31:0];
`CSR_MPM_IBUF_ST_H : read_data_r = perf_pipeline_if.ibuffer_stalls[63:32];
`CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scoreboard_stalls[31:0];
`CSR_MPM_SCRB_ST_H : read_data_r = perf_pipeline_if.scoreboard_stalls[63:32];
`CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0];
`CSR_MPM_ALU_ST_H : read_data_r = perf_pipeline_if.alu_stalls[63:32];
`CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0];
`CSR_MPM_LSU_ST_H : read_data_r = perf_pipeline_if.lsu_stalls[63:32];
`CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0];
`CSR_MPM_CSR_ST_H : read_data_r = perf_pipeline_if.csr_stalls[63:32];
`CSR_MPM_MUL_ST : read_data_r = perf_pipeline_if.mul_stalls[31:0];
`CSR_MPM_MUL_ST_H : read_data_r = perf_pipeline_if.mul_stalls[63:32];
`CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
`CSR_MPM_FPU_ST_H : read_data_r = perf_pipeline_if.fpu_stalls[63:32];
`CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
`CSR_MPM_GPU_ST_H : read_data_r = perf_pipeline_if.gpu_stalls[63:32];
// PERF: icache
`CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_if.read_misses[31:0];
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = perf_memsys_if.icache_if.read_misses[63:32];
`CSR_MPM_ICACHE_DREQ_ST : read_data_r = perf_memsys_if.icache_if.dreq_stalls[31:0];
`CSR_MPM_ICACHE_DREQ_ST_H : read_data_r = perf_memsys_if.icache_if.dreq_stalls[63:32];
`CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_if.crsp_stalls[31:0];
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = perf_memsys_if.icache_if.crsp_stalls[63:32];
`CSR_MPM_ICACHE_MSHR_ST : read_data_r = perf_memsys_if.icache_if.mshr_stalls[31:0];
`CSR_MPM_ICACHE_MSHR_ST_H : read_data_r = perf_memsys_if.icache_if.mshr_stalls[63:32];
`CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_if.pipe_stalls[31:0];
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = perf_memsys_if.icache_if.pipe_stalls[63:32];
`CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_if.reads[31:0];
`CSR_MPM_ICACHE_READS_H : read_data_r = perf_memsys_if.icache_if.reads[63:32];
// PERF: dcache
`CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_if.read_misses[31:0];
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = perf_memsys_if.dcache_if.read_misses[63:32];
`CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_if.write_misses[31:0];
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = perf_memsys_if.dcache_if.write_misses[63:32];
`CSR_MPM_DCACHE_DREQ_ST : read_data_r = perf_memsys_if.dcache_if.dreq_stalls[31:0];
`CSR_MPM_DCACHE_DREQ_ST_H : read_data_r = perf_memsys_if.dcache_if.dreq_stalls[63:32];
`CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_if.crsp_stalls[31:0];
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = perf_memsys_if.dcache_if.crsp_stalls[63:32];
`CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_if.mshr_stalls[31:0];
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = perf_memsys_if.dcache_if.mshr_stalls[63:32];
`CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_if.pipe_stalls[31:0];
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = perf_memsys_if.dcache_if.pipe_stalls[63:32];
`CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_if.reads[31:0];
`CSR_MPM_DCACHE_READS_H : read_data_r = perf_memsys_if.dcache_if.reads[63:32];
`CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_if.writes[31:0];
`CSR_MPM_DCACHE_WRITES_H : read_data_r = perf_memsys_if.dcache_if.writes[63:32];
`CSR_MPM_DCACHE_EVICTS : read_data_r = perf_memsys_if.dcache_if.evictions[31:0];
`CSR_MPM_DCACHE_EVICTS_H : read_data_r = perf_memsys_if.dcache_if.evictions[63:32];
// PERF: memory
`CSR_MPM_DRAM_LAT : read_data_r = perf_memsys_if.dram_latency[31:0];
`CSR_MPM_DRAM_LAT_H : read_data_r = perf_memsys_if.dram_latency[63:32];
`CSR_MPM_DRAM_REQ : read_data_r = perf_memsys_if.dram_requests[31:0];
`CSR_MPM_DRAM_REQ_H : read_data_r = perf_memsys_if.dram_requests[63:32];
`CSR_MPM_DRAM_RSP : read_data_r = perf_memsys_if.dram_responses[31:0];
`CSR_MPM_DRAM_RSP_H : read_data_r = perf_memsys_if.dram_responses[63:32];
`endif
`CSR_SATP : read_data_r = 32'(csr_satp);
`CSR_MSTATUS : read_data_r = 32'(csr_mstatus);
@ -128,10 +194,10 @@ module VX_csr_data #(
`CSR_PMPCFG0 : read_data_r = 32'(csr_pmpcfg[0]);
`CSR_PMPADDR0 : read_data_r = 32'(csr_pmpaddr[0]);
`CSR_CYCLE : read_data_r = csr_cycle[31:0];
`CSR_CYCLE_H : read_data_r = csr_cycle[63:32];
`CSR_INSTRET : read_data_r = csr_instret[31:0];
`CSR_INSTRET_H : read_data_r = csr_instret[63:32];
`CSR_MCYCLE : read_data_r = csr_cycle[31:0];
`CSR_MCYCLE_H : read_data_r = csr_cycle[63:32];
`CSR_MINSTRET : read_data_r = csr_instret[31:0];
`CSR_MINSTRET_H: read_data_r = csr_instret[63:32];
`CSR_MVENDORID : read_data_r = `VENDOR_ID;
`CSR_MARCHID : read_data_r = `ARCHITECTURE_ID;

View file

@ -6,6 +6,11 @@ module VX_csr_unit #(
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if,
`endif
VX_cmt_to_csr_if cmt_to_csr_if,
VX_fpu_to_csr_if fpu_to_csr_if,
@ -51,6 +56,10 @@ module VX_csr_unit #(
) csr_data (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_memsys_if (perf_memsys_if),
.perf_pipeline_if (perf_pipeline_if),
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fpu_to_csr_if (fpu_to_csr_if),
.read_enable (csr_pipe_req_if.valid),

View file

@ -240,45 +240,10 @@
`define DBG_CACHE_REQ_MDATAW 0
`endif
////////////////////////// Dcache Configurable Knobs //////////////////////////
// Cache ID
`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0)
// Block size in bytes
`define DBANK_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE)
// Word size in bytes
`define DWORD_SIZE 4
// TAG sharing enable
`define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE)
// Core request tag bits
`define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS)
// DRAM request data bits
`define DDRAM_LINE_WIDTH (`DBANK_LINE_SIZE * 8)
// DRAM request address bits
`define DDRAM_ADDR_WIDTH (32 - `CLOG2(`DBANK_LINE_SIZE))
// DRAM byte enable bits
`define DDRAM_BYTEEN_WIDTH `DBANK_LINE_SIZE
// DRAM request tag bits
`define DDRAM_TAG_WIDTH `DDRAM_ADDR_WIDTH
// Core request size
`define DNUM_REQUESTS `NUM_THREADS
// Snoop request tag bits
`define DSNP_TAG_WIDTH ((`NUM_CORES > 1) ? `LOG2UP(`L2SREQ_SIZE) : `L2SNP_TAG_WIDTH)
////////////////////////// Icache Configurable Knobs //////////////////////////
// Cache ID
`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1)
`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0)
// Block size in bytes
`define IBANK_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE)
@ -316,6 +281,41 @@
// Core request size
`define INUM_REQUESTS 1
////////////////////////// Dcache Configurable Knobs //////////////////////////
// Cache ID
`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1)
// Block size in bytes
`define DBANK_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE)
// Word size in bytes
`define DWORD_SIZE 4
// TAG sharing enable
`define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE)
// Core request tag bits
`define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS)
// DRAM request data bits
`define DDRAM_LINE_WIDTH (`DBANK_LINE_SIZE * 8)
// DRAM request address bits
`define DDRAM_ADDR_WIDTH (32 - `CLOG2(`DBANK_LINE_SIZE))
// DRAM byte enable bits
`define DDRAM_BYTEEN_WIDTH `DBANK_LINE_SIZE
// DRAM request tag bits
`define DDRAM_TAG_WIDTH `DDRAM_ADDR_WIDTH
// Core request size
`define DNUM_REQUESTS `NUM_THREADS
// Snoop request tag bits
`define DSNP_TAG_WIDTH ((`NUM_CORES > 1) ? `LOG2UP(`L2SREQ_SIZE) : `L2SNP_TAG_WIDTH)
////////////////////////// SM Configurable Knobs //////////////////////////////
// Cache ID

View file

@ -16,8 +16,13 @@ module VX_execute #(
VX_cache_core_req_if dcache_req_if,
VX_cache_core_rsp_if dcache_rsp_if,
// perf
// commit status
VX_cmt_to_csr_if cmt_to_csr_if,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if,
`endif
// inputs
VX_alu_req_if alu_req_if,
@ -72,7 +77,11 @@ module VX_execute #(
.CORE_ID(CORE_ID)
) csr_unit (
.clk (clk),
.reset (reset),
.reset (reset),
`ifdef PERF_ENABLE
.perf_memsys_if (perf_memsys_if),
.perf_pipeline_if (perf_pipeline_if),
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fpu_to_csr_if (fpu_to_csr_if),
.csr_io_req_if (csr_io_req_if),

View file

@ -8,6 +8,10 @@ module VX_issue #(
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_perf_pipeline_if perf_pipeline_if,
`endif
VX_decode_if decode_if,
VX_writeback_if writeback_if,
@ -120,6 +124,21 @@ module VX_issue #(
`SCOPE_ASSIGN (writeback_rd, writeback_if.rd);
`SCOPE_ASSIGN (writeback_data, writeback_if.data);
`ifdef PERF_ENABLE
reg [63:0] perf_scoreboard_stalls;
always @(posedge clk) begin
if (reset) begin
perf_scoreboard_stalls <= 0;
end else begin
// scoreboard_stall
if (ibuf_deq_if.valid & scoreboard_delay) begin
perf_scoreboard_stalls <= perf_scoreboard_stalls + 64'd1;
end
end
end
assign perf_pipeline_if.scoreboard_stalls = perf_scoreboard_stalls;
`endif
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (alu_req_if.valid && alu_req_if.ready) begin

View file

@ -7,6 +7,10 @@ module VX_mem_unit # (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
`endif
// Core <-> Dcache
VX_cache_core_req_if core_dcache_req_if,
@ -28,6 +32,11 @@ module VX_mem_unit # (
VX_cache_core_req_if io_req_if,
VX_cache_core_rsp_if io_rsp_if
);
`ifdef PERF_ENABLE
VX_perf_cache_if perf_icache_if(), perf_dcache_if(), perf_smem_if();
`endif
VX_cache_dram_req_if #(
.DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH),
.DRAM_ADDR_WIDTH (`DDRAM_ADDR_WIDTH),
@ -80,6 +89,82 @@ module VX_mem_unit # (
.smem_rsp_if (smem_rsp_if),
.io_rsp_if (io_rsp_if),
.core_rsp_if (core_dcache_rsp_if)
);
VX_cache #(
.CACHE_ID (`ICACHE_ID),
.CACHE_SIZE (`ICACHE_SIZE),
.BANK_LINE_SIZE (`IBANK_LINE_SIZE),
.NUM_BANKS (`INUM_BANKS),
.WORD_SIZE (`IWORD_SIZE),
.NUM_REQS (`INUM_REQUESTS),
.CREQ_SIZE (`ICREQ_SIZE),
.MSHR_SIZE (`IMSHR_SIZE),
.DRSQ_SIZE (`IDRSQ_SIZE),
.SREQ_SIZE (1),
.CRSQ_SIZE (`ICRSQ_SIZE),
.DREQ_SIZE (`IDREQ_SIZE),
.SRSQ_SIZE (1),
.DRAM_ENABLE (1),
.FLUSH_ENABLE (0),
.WRITE_ENABLE (0),
.CORE_TAG_WIDTH (`ICORE_TAG_WIDTH),
.CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS),
.DRAM_TAG_WIDTH (`IDRAM_TAG_WIDTH)
) icache (
`SCOPE_BIND_VX_mem_unit_icache
.clk (clk),
.reset (reset),
// Core request
.core_req_valid (core_icache_req_if.valid),
.core_req_rw (core_icache_req_if.rw),
.core_req_byteen (core_icache_req_if.byteen),
.core_req_addr (core_icache_req_if.addr),
.core_req_data (core_icache_req_if.data),
.core_req_tag (core_icache_req_if.tag),
.core_req_ready (core_icache_req_if.ready),
// Core response
.core_rsp_valid (core_icache_rsp_if.valid),
.core_rsp_data (core_icache_rsp_if.data),
.core_rsp_tag (core_icache_rsp_if.tag),
.core_rsp_ready (core_icache_rsp_if.ready),
`ifdef PERF_ENABLE
.perf_cache_if (perf_icache_if),
`endif
// DRAM Req
.dram_req_valid (icache_dram_req_if.valid),
.dram_req_rw (icache_dram_req_if.rw),
.dram_req_byteen (icache_dram_req_if.byteen),
.dram_req_addr (icache_dram_req_if.addr),
.dram_req_data (icache_dram_req_if.data),
.dram_req_tag (icache_dram_req_if.tag),
.dram_req_ready (icache_dram_req_if.ready),
// DRAM response
.dram_rsp_valid (icache_dram_rsp_if.valid),
.dram_rsp_data (icache_dram_rsp_if.data),
.dram_rsp_tag (icache_dram_rsp_if.tag),
.dram_rsp_ready (icache_dram_rsp_if.ready),
// Snoop request
.snp_req_valid (1'b0),
.snp_req_addr (0),
.snp_req_inv (1'b0),
.snp_req_tag (0),
`UNUSED_PIN (snp_req_ready),
// Snoop response
`UNUSED_PIN (snp_rsp_valid),
`UNUSED_PIN (snp_rsp_tag),
.snp_rsp_ready (1'b0),
// Miss status
`UNUSED_PIN (miss_vec)
);
VX_cache #(
@ -124,6 +209,10 @@ module VX_mem_unit # (
.core_rsp_tag (dcache_rsp_if.tag),
.core_rsp_ready (dcache_rsp_if.ready),
`ifdef PERF_ENABLE
.perf_cache_if (perf_dcache_if),
`endif
// DRAM request
.dram_req_valid (dcache_dram_req_if.valid),
.dram_req_rw (dcache_dram_req_if.rw),
@ -151,78 +240,6 @@ module VX_mem_unit # (
.snp_rsp_tag (dcache_snp_rsp_if.tag),
.snp_rsp_ready (dcache_snp_rsp_if.ready),
// Miss status
`UNUSED_PIN (miss_vec)
);
VX_cache #(
.CACHE_ID (`ICACHE_ID),
.CACHE_SIZE (`ICACHE_SIZE),
.BANK_LINE_SIZE (`IBANK_LINE_SIZE),
.NUM_BANKS (`INUM_BANKS),
.WORD_SIZE (`IWORD_SIZE),
.NUM_REQS (`INUM_REQUESTS),
.CREQ_SIZE (`ICREQ_SIZE),
.MSHR_SIZE (`IMSHR_SIZE),
.DRSQ_SIZE (`IDRSQ_SIZE),
.SREQ_SIZE (1),
.CRSQ_SIZE (`ICRSQ_SIZE),
.DREQ_SIZE (`IDREQ_SIZE),
.SRSQ_SIZE (1),
.DRAM_ENABLE (1),
.FLUSH_ENABLE (0),
.WRITE_ENABLE (0),
.CORE_TAG_WIDTH (`ICORE_TAG_WIDTH),
.CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS),
.DRAM_TAG_WIDTH (`IDRAM_TAG_WIDTH)
) icache (
`SCOPE_BIND_VX_mem_unit_icache
.clk (clk),
.reset (reset),
// Core request
.core_req_valid (core_icache_req_if.valid),
.core_req_rw (core_icache_req_if.rw),
.core_req_byteen (core_icache_req_if.byteen),
.core_req_addr (core_icache_req_if.addr),
.core_req_data (core_icache_req_if.data),
.core_req_tag (core_icache_req_if.tag),
.core_req_ready (core_icache_req_if.ready),
// Core response
.core_rsp_valid (core_icache_rsp_if.valid),
.core_rsp_data (core_icache_rsp_if.data),
.core_rsp_tag (core_icache_rsp_if.tag),
.core_rsp_ready (core_icache_rsp_if.ready),
// DRAM Req
.dram_req_valid (icache_dram_req_if.valid),
.dram_req_rw (icache_dram_req_if.rw),
.dram_req_byteen (icache_dram_req_if.byteen),
.dram_req_addr (icache_dram_req_if.addr),
.dram_req_data (icache_dram_req_if.data),
.dram_req_tag (icache_dram_req_if.tag),
.dram_req_ready (icache_dram_req_if.ready),
// DRAM response
.dram_rsp_valid (icache_dram_rsp_if.valid),
.dram_rsp_data (icache_dram_rsp_if.data),
.dram_rsp_tag (icache_dram_rsp_if.tag),
.dram_rsp_ready (icache_dram_rsp_if.ready),
// Snoop request
.snp_req_valid (1'b0),
.snp_req_addr (0),
.snp_req_inv (1'b0),
.snp_req_tag (0),
`UNUSED_PIN (snp_req_ready),
// Snoop response
`UNUSED_PIN (snp_rsp_valid),
`UNUSED_PIN (snp_rsp_tag),
.snp_rsp_ready (1'b0),
// Miss status
`UNUSED_PIN (miss_vec)
);
@ -268,6 +285,10 @@ module VX_mem_unit # (
.core_rsp_tag (smem_rsp_if.tag),
.core_rsp_ready (smem_rsp_if.ready),
`ifdef PERF_ENABLE
.perf_cache_if (perf_smem_if),
`endif
// DRAM request
`UNUSED_PIN (dram_req_valid),
`UNUSED_PIN (dram_req_rw),
@ -340,4 +361,65 @@ module VX_mem_unit # (
.rsp_ready_in (dram_rsp_if.ready)
);
`ifdef PERF_ENABLE
assign perf_memsys_if.icache_if.read_misses = perf_icache_if.read_misses;
assign perf_memsys_if.icache_if.write_misses = perf_icache_if.write_misses;
assign perf_memsys_if.icache_if.mshr_stalls = perf_icache_if.mshr_stalls;
assign perf_memsys_if.icache_if.crsp_stalls = perf_icache_if.crsp_stalls;
assign perf_memsys_if.icache_if.dreq_stalls = perf_icache_if.dreq_stalls;
assign perf_memsys_if.icache_if.pipe_stalls = perf_icache_if.pipe_stalls;
assign perf_memsys_if.icache_if.reads = perf_icache_if.reads;
assign perf_memsys_if.icache_if.writes = perf_icache_if.writes;
assign perf_memsys_if.icache_if.evictions = perf_icache_if.evictions;
assign perf_memsys_if.dcache_if.read_misses = perf_dcache_if.read_misses;
assign perf_memsys_if.dcache_if.write_misses = perf_dcache_if.write_misses;
assign perf_memsys_if.dcache_if.mshr_stalls = perf_dcache_if.mshr_stalls;
assign perf_memsys_if.dcache_if.crsp_stalls = perf_dcache_if.crsp_stalls;
assign perf_memsys_if.dcache_if.dreq_stalls = perf_dcache_if.dreq_stalls;
assign perf_memsys_if.dcache_if.pipe_stalls = perf_dcache_if.pipe_stalls;
assign perf_memsys_if.dcache_if.reads = perf_dcache_if.reads;
assign perf_memsys_if.dcache_if.writes = perf_dcache_if.writes;
assign perf_memsys_if.dcache_if.evictions = perf_dcache_if.evictions;
reg [63:0] perf_dram_lat_per_cycle;
always @(posedge clk) begin
if (reset) begin
perf_dram_lat_per_cycle <= 0;
end else begin
if (dram_req_if.valid & (~dram_req_if.rw) & dram_req_if.ready & dram_rsp_if.valid & dram_rsp_if.ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle;
end else if (dram_req_if.valid & (~dram_req_if.rw) & dram_req_if.ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle + 64'd1;
end else if (dram_rsp_if.valid & dram_rsp_if.ready) begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle - 64'd1;
end
end
end
reg [63:0] perf_dram_req, perf_dram_rsp, perf_dram_lat;
always @(posedge clk) begin
if (reset) begin
perf_dram_req <= 0;
perf_dram_rsp <= 0;
perf_dram_lat <= 0;
end else begin
if (dram_req_if.valid & dram_req_if.ready) begin
perf_dram_req <= perf_dram_req + 64'd1;
end
if (dram_rsp_if.valid & dram_rsp_if.ready) begin
perf_dram_rsp <= perf_dram_rsp + 64'd1;
end
perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle;
end
end
assign perf_memsys_if.dram_requests = perf_dram_req;
assign perf_memsys_if.dram_responses = perf_dram_rsp;
assign perf_memsys_if.dram_latency = perf_dram_lat;
`endif
endmodule

View file

@ -51,6 +51,10 @@ module VX_pipeline #(
output wire[31:0] csr_io_rsp_data,
input wire csr_io_rsp_ready,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
`endif
// Status
output wire busy,
output wire ebreak
@ -171,6 +175,10 @@ module VX_pipeline #(
VX_commit_if fpu_commit_if();
VX_commit_if gpu_commit_if();
`ifdef PERF_ENABLE
VX_perf_pipeline_if perf_pipeline_if();
`endif
VX_fetch #(
.CORE_ID(CORE_ID)
) fetch (
@ -206,6 +214,10 @@ module VX_pipeline #(
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_pipeline_if (perf_pipeline_if),
`endif
.decode_if (decode_if),
.writeback_if (writeback_if),
@ -224,7 +236,12 @@ module VX_pipeline #(
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_memsys_if (perf_memsys_if),
.perf_pipeline_if (perf_pipeline_if),
`endif
.dcache_req_if (core_dcache_req_if),
.dcache_rsp_if (core_dcache_rsp_if),
@ -272,4 +289,78 @@ module VX_pipeline #(
.cmt_to_csr_if (cmt_to_csr_if)
);
`ifdef PERF_ENABLE
reg [63:0] perf_icache_stalls;
reg [63:0] perf_ibuffer_stalls;
reg [63:0] perf_alu_stalls;
reg [63:0] perf_lsu_stalls;
reg [63:0] perf_csr_stalls;
reg [63:0] perf_gpu_stalls;
`ifdef EXT_M_ENABLE
reg [63:0] perf_mul_stalls;
`endif
`ifdef EXT_F_ENABLE
reg [63:0] perf_fpu_stalls;
`endif
always @(posedge clk) begin
if (reset) begin
perf_icache_stalls <= 0;
perf_ibuffer_stalls <= 0;
perf_alu_stalls <= 0;
perf_lsu_stalls <= 0;
perf_csr_stalls <= 0;
perf_gpu_stalls <= 0;
`ifdef EXT_M_ENABLE
perf_mul_stalls <= 0;
`endif
`ifdef EXT_F_ENABLE
perf_fpu_stalls <= 0;
`endif
end else begin
if (core_icache_req_if.valid & !core_icache_req_if.ready) begin
perf_icache_stalls <= perf_icache_stalls + 64'd1;
end
if (decode_if.valid & !decode_if.ready) begin
perf_ibuffer_stalls <= perf_ibuffer_stalls + 64'd1;
end
if (alu_req_if.valid & !alu_req_if.ready) begin
perf_alu_stalls <= perf_alu_stalls + 64'd1;
end
if (lsu_req_if.valid & !lsu_req_if.ready) begin
perf_lsu_stalls <= perf_lsu_stalls + 64'd1;
end
if (csr_req_if.valid & !csr_req_if.ready) begin
perf_csr_stalls <= perf_csr_stalls + 64'd1;
end
if (gpu_req_if.valid & !gpu_req_if.ready) begin
perf_gpu_stalls <= perf_gpu_stalls + 64'd1;
end
`ifdef EXT_M_ENABLE
if (mul_req_if.valid & !mul_req_if.ready) begin
perf_mul_stalls <= perf_mul_stalls + 64'd1;
end
`endif
`ifdef EXT_F_ENABLE
if (fpu_req_if.valid & !fpu_req_if.ready) begin
perf_fpu_stalls <= perf_fpu_stalls + 64'd1;
end
`endif
end
end
assign perf_pipeline_if.icache_stalls = perf_icache_stalls;
assign perf_pipeline_if.ibuffer_stalls = perf_ibuffer_stalls;
assign perf_pipeline_if.alu_stalls = perf_alu_stalls;
assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls;
assign perf_pipeline_if.csr_stalls = perf_csr_stalls;
assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls;
`ifdef EXT_M_ENABLE
assign perf_pipeline_if.mul_stalls = perf_mul_stalls;
`endif
`ifdef EXT_F_ENABLE
assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls;
`endif
`endif
endmodule

View file

@ -299,6 +299,9 @@ module Vortex (
);
if (`L3_ENABLE) begin
`ifdef PERF_ENABLE
VX_perf_cache_if perf_l3cache_if();
`endif
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_valid_qual;
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_rw_qual;
@ -347,10 +350,14 @@ module Vortex (
.SNP_TAG_WIDTH (`L3SNP_TAG_WIDTH)
) l3cache (
`SCOPE_BIND_Vortex_l3cache
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_cache_if (perf_l3cache_if),
`endif
// Core request
.core_req_valid (per_cluster_dram_req_valid_qual),
.core_req_rw (per_cluster_dram_req_rw_qual),

View file

@ -96,6 +96,14 @@ module VX_bank #(
output wire [SNP_TAG_WIDTH-1:0] snp_rsp_tag,
input wire snp_rsp_ready,
`ifdef PERF_ENABLE
output wire perf_mshr_stall,
output wire perf_pipe_stall,
output wire perf_evict,
output wire perf_read_miss,
output wire perf_write_miss,
`endif
// Misses
output wire misses
);
@ -567,7 +575,6 @@ end else begin
assign incoming_fill_st2 = 0;
assign misses = 0;
end
`ifdef DBG_CACHE_REQ_INFO
@ -951,6 +958,18 @@ end
`SCOPE_ASSIGN (addr_st2, `LINE_TO_BYTE_ADDR(addr_st2, BANK_ID));
`SCOPE_ASSIGN (addr_st3, `LINE_TO_BYTE_ADDR(addr_st3, BANK_ID));
`ifdef PERF_ENABLE
assign perf_pipe_stall = pipeline_stall;
assign perf_mshr_stall = mshr_going_full;
assign perf_read_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & !mem_rw_st1;
assign perf_write_miss = !pipeline_stall & miss_st1 & !is_mshr_st1 & mem_rw_st1;
if (DRAM_ENABLE) begin
assign perf_evict = dreq_push & do_writeback_st3 & !is_snp_st3;
end else begin
assign perf_evict = 0;
end
`endif
`ifdef DBG_PRINT_CACHE_BANK
wire incoming_fill_dfp_st3 = drsq_push && (addr_st3 == dram_rsp_addr);
always @(posedge clk) begin

View file

@ -70,7 +70,12 @@ module VX_cache #(
output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready,
// PERF
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
`endif
// DRAM request
output wire dram_req_valid,
output wire dram_req_rw,
@ -130,7 +135,16 @@ module VX_cache #(
wire [NUM_BANKS-1:0] per_bank_miss;
assign miss_vec = per_bank_miss;
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank;
wire [NUM_BANKS-1:0] perf_evict_per_bank;
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
`endif
if (NUM_BANKS == 1) begin
assign snp_req_ready = per_bank_snp_req_ready;
end else begin
@ -139,9 +153,9 @@ module VX_cache #(
VX_cache_core_req_bank_sel #(
.BANK_LINE_SIZE (BANK_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.NUM_BANKS (NUM_BANKS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.CORE_TAG_ID_BITS (CORE_TAG_ID_BITS)
) cache_core_req_bank_sel (
.core_req_valid (core_req_valid),
@ -312,6 +326,14 @@ module VX_cache #(
.dram_rsp_addr (curr_bank_dram_rsp_addr),
.dram_rsp_ready (curr_bank_dram_rsp_ready),
`ifdef PERF_ENABLE
.perf_mshr_stall (perf_mshr_stall_per_bank[i]),
.perf_pipe_stall (perf_pipe_stall_per_bank[i]),
.perf_evict (perf_evict_per_bank[i]),
.perf_read_miss (perf_read_miss_per_bank[i]),
.perf_write_miss (perf_write_miss_per_bank[i]),
`endif
// Snoop request
.snp_req_valid (curr_bank_snp_req_valid),
.snp_req_addr (curr_bank_snp_req_addr),
@ -408,4 +430,150 @@ module VX_cache #(
`UNUSED_VAR (snp_rsp_ready)
end
`ifdef PERF_ENABLE
// per cycle: core_req_r, core_req_w
reg [($clog2(NUM_REQS+1)-1):0] perf_core_req_r_per_cycle, perf_core_req_w_per_cycle;
reg [($clog2(NUM_REQS+1)-1):0] perf_crsp_stall_per_cycle;
if (CORE_TAG_ID_BITS != 0) begin
VX_countones #( // core_req_r
.N(NUM_REQS)
) perf_countones_core_req_r_count (
.valids (core_req_valid & {NUM_REQS{core_req_ready & ~core_req_rw}}),
.count (perf_core_req_r_per_cycle)
);
VX_countones #( // core_req_w
.N(NUM_REQS)
) perf_countones_core_req_w_count (
.valids (core_req_valid & {NUM_REQS{core_req_ready & core_req_rw}}),
.count (perf_core_req_w_per_cycle)
);
VX_countones #( // core_rsp
.N(NUM_REQS)
) perf_countones_core_rsp_count (
.valids (core_rsp_valid & {NUM_REQS{!core_rsp_ready}}),
.count (perf_crsp_stall_per_cycle)
);
end else begin
VX_countones #( // core_req_r
.N(NUM_REQS)
) perf_countones_core_req_r_count (
.valids (core_req_valid & core_req_ready & ~core_req_rw),
.count (perf_core_req_r_per_cycle)
);
VX_countones #( // core_req_w
.N(NUM_REQS)
) perf_countones_core_req_w_count (
.valids (core_req_valid & core_req_ready & core_req_rw),
.count (perf_core_req_w_per_cycle)
);
VX_countones #( // core_rsp
.N(NUM_REQS)
) perf_countones_core_rsp_count (
.valids (core_rsp_valid & ~core_rsp_ready),
.count (perf_crsp_stall_per_cycle)
);
end
// per cycle: mshr stalls, pipeline stalls, evictions, read misses, write misses
reg [($clog2(NUM_BANKS+1)-1):0] perf_mshr_stall_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_pipe_stall_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_evictions_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_read_miss_per_cycle;
reg [($clog2(NUM_BANKS+1)-1):0] perf_write_miss_per_cycle;
VX_countones #(
.N(NUM_BANKS)
) perf_countones_mshr_stall_count (
.valids (perf_mshr_stall_per_bank),
.count (perf_mshr_stall_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_total_stall_count (
.valids (perf_pipe_stall_per_bank),
.count (perf_pipe_stall_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_EVICTSict_count (
.valids (perf_evict_per_bank),
.count (perf_evictions_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_read_miss_count (
.valids (perf_read_miss_per_bank),
.count (perf_read_miss_per_cycle)
);
VX_countones #(
.N(NUM_BANKS)
) perf_countones_write_miss_count (
.valids (perf_write_miss_per_bank),
.count (perf_write_miss_per_cycle)
);
reg [63:0] perf_core_req_r;
reg [63:0] perf_core_req_w;
reg [63:0] perf_mshr_stall;
reg [63:0] perf_pipe_stall;
reg [63:0] perf_evictions;
reg [63:0] perf_read_miss;
reg [63:0] perf_write_miss;
reg [63:0] perf_crsp_stall;
reg [63:0] perf_dreq_stall;
always @(posedge clk) begin
if (reset) begin
perf_core_req_r <= 0;
perf_core_req_w <= 0;
perf_crsp_stall <= 0;
perf_mshr_stall <= 0;
perf_pipe_stall <= 0;
perf_evictions <= 0;
perf_read_miss <= 0;
perf_write_miss <= 0;
perf_dreq_stall <= 0;
end else begin
// core requests
perf_core_req_r <= perf_core_req_r + $bits(perf_core_req_r)'(perf_core_req_r_per_cycle);
perf_core_req_w <= perf_core_req_w + $bits(perf_core_req_w)'(perf_core_req_w_per_cycle);
// core response stalls
perf_crsp_stall <= perf_crsp_stall + $bits(perf_crsp_stall)'(perf_crsp_stall_per_cycle);
// miss reserve queue stalls
perf_mshr_stall <= perf_mshr_stall + $bits(perf_mshr_stall)'(perf_mshr_stall_per_cycle);
// pipeline stalls
perf_pipe_stall <= perf_pipe_stall + $bits(perf_pipe_stall)'(perf_pipe_stall_per_cycle);
// total evictions
perf_evictions <= perf_evictions + $bits(perf_evictions)'(perf_evictions_per_cycle);
// read misses
perf_read_miss <= perf_read_miss + $bits(perf_read_miss)'(perf_read_miss_per_cycle);
// write misses
perf_write_miss <= perf_write_miss + $bits(perf_write_miss)'(perf_write_miss_per_cycle);
// dram request stalls
if (dram_req_valid & !dram_req_ready) begin
perf_dreq_stall <= perf_dreq_stall + 64'd1;
end
end
end
assign perf_cache_if.reads = perf_core_req_r;
assign perf_cache_if.writes = perf_core_req_w;
assign perf_cache_if.read_misses = perf_read_miss;
assign perf_cache_if.write_misses = perf_write_miss;
assign perf_cache_if.evictions = perf_evictions;
assign perf_cache_if.mshr_stalls = perf_mshr_stall;
assign perf_cache_if.pipe_stalls = perf_pipe_stall;
assign perf_cache_if.crsp_stalls = perf_crsp_stall;
assign perf_cache_if.dreq_stalls = perf_dreq_stall;
`endif
endmodule

View file

@ -159,7 +159,7 @@ module VX_fpnew
.tag_o ({fpu_tag_out, fpu_has_fflags_out}),
.out_valid_o (fpu_valid_out),
.out_ready_i (fpu_ready_out),
`UNUSED_PIN (busy_o)
`UNUSED_PIN (busy_o)
);
end else begin
fpnew_top #(
@ -179,14 +179,14 @@ module VX_fpnew
.vectorial_op_i (1'b0),
.tag_i (1'b0),
.in_valid_i (fpu_valid_in),
`UNUSED_PIN (in_ready_o),
`UNUSED_PIN (in_ready_o),
.flush_i (reset),
.result_o (fpu_result[i]),
.status_o (fpu_status[i]),
`UNUSED_PIN (tag_o),
`UNUSED_PIN (out_valid_o),
`UNUSED_PIN (tag_o),
`UNUSED_PIN (out_valid_o),
.out_ready_i (fpu_ready_out),
`UNUSED_PIN (busy_o)
`UNUSED_PIN (busy_o)
);
end
end

View file

@ -0,0 +1,20 @@
`ifndef VX_PERF_CACHE_IF
`define VX_PERF_CACHE_IF
`include "VX_define.vh"
// Per-cache performance counter bundle (64-bit free-running counters).
// Driven by a VX_cache instance when PERF_ENABLE is defined and forwarded
// to the CSR unit through VX_perf_memsys_if.
interface VX_perf_cache_if ();
    wire [63:0] reads;        // core read requests accepted (valid & ready & !rw)
    wire [63:0] writes;       // core write requests accepted (valid & ready & rw)
    wire [63:0] read_misses;  // read lookups that missed (excluding MSHR replays)
    wire [63:0] write_misses; // write lookups that missed (excluding MSHR replays)
    wire [63:0] evictions;    // dirty-line writebacks pushed to the DRAM request queue
    wire [63:0] mshr_stalls;  // cycles a bank stalled because its MSHR was filling up
    wire [63:0] crsp_stalls;  // cycles a core response was held up (valid & !ready)
    wire [63:0] dreq_stalls;  // cycles a DRAM request was held up (valid & !ready)
    wire [63:0] pipe_stalls;  // cycles a bank pipeline was stalled (any cause)
endinterface
`endif

View file

@ -0,0 +1,17 @@
`ifndef VX_PERF_MEMSYS_IF
`define VX_PERF_MEMSYS_IF
`include "VX_define.vh"
// Memory-subsystem performance counters, populated by VX_mem_unit when
// PERF_ENABLE is defined and delivered to the CSR unit.
interface VX_perf_memsys_if ();
    // Nested per-cache counter bundles. IEEE 1800 requires parentheses to
    // instantiate an interface inside another interface; a bare declaration
    // would be an (illegal, non-virtual) interface-typed variable. This also
    // matches the sibling instantiations (e.g. perf_icache_if()) elsewhere
    // in the design.
    VX_perf_cache_if dcache_if();
    VX_perf_cache_if icache_if();
    wire [63:0] dram_latency;   // running sum of outstanding DRAM read requests per cycle
    wire [63:0] dram_requests;  // DRAM request handshakes (valid & ready)
    wire [63:0] dram_responses; // DRAM response handshakes (valid & ready)
endinterface
`endif

View file

@ -0,0 +1,25 @@
`ifndef VX_PERF_PIPELINE_IF
`define VX_PERF_PIPELINE_IF
`include "VX_define.vh"
// Pipeline stall counters (64-bit free-running), gathered across the
// front-end, issue, and execute stages when PERF_ENABLE is defined, and
// exposed to the CSR unit.
interface VX_perf_pipeline_if ();
    // from pipeline: cycles the icache request was held up (valid & !ready)
    wire [63:0] icache_stalls;
    // cycles decode output was held up by a full instruction buffer
    wire [63:0] ibuffer_stalls;
    // from issue: cycles a ready instruction was delayed by the scoreboard
    wire [63:0] scoreboard_stalls;
    // from execute: cycles each unit's request was held up (valid & !ready)
    wire [63:0] lsu_stalls;
    wire [63:0] csr_stalls;
    wire [63:0] alu_stalls;
    wire [63:0] gpu_stalls;
`ifdef EXT_M_ENABLE
    wire [63:0] mul_stalls;  // only present with the integer mul/div extension
`endif
`ifdef EXT_F_ENABLE
    wire [63:0] fpu_stalls;  // only present with the floating-point extension
`endif
endinterface
`endif