Added taken branch performance counter and excluded jumps and branches from icache misses
This commit is contained in:
Andreas Traber 2015-12-15 18:07:32 +01:00
parent a6c4f6d4ef
commit 4ed498014b
5 changed files with 70 additions and 72 deletions

View file

@ -106,7 +106,6 @@ module riscv_controller
// Performance Counters
output logic perf_jump_o, // we are executing a jump instruction (j, jr, jal, jalr)
output logic perf_branch_o, // we are executing a branch instruction (bf, bnf)
output logic perf_jr_stall_o, // stall due to jump-register-hazard
output logic perf_ld_stall_o // stall due to load-use-hazard
);
@ -516,7 +515,6 @@ module riscv_controller
// Performance Counters
assign perf_jump_o = (jump_in_id_i == `BRANCH_JAL || jump_in_id_i == `BRANCH_JALR);
assign perf_branch_o = (jump_in_id_i == `BRANCH_COND);
assign perf_jr_stall_o = jr_stall_o;
assign perf_ld_stall_o = load_stall_o;

View file

@ -78,8 +78,10 @@ module riscv_cs_registers
input logic is_decoding_i, // controller is in DECODE state
input logic imiss_i, // instruction fetch
input logic pc_set_i, // pc was set to a new value
input logic jump_i, // jump instruction seen (j, jr, jal, jalr)
input logic branch_i, // branch instruction seen (bf, bnf)
input logic branch_taken_i, // branch was taken
input logic ld_stall_i, // load use hazard
input logic jr_stall_i, // jump register use hazard
@ -89,7 +91,7 @@ module riscv_cs_registers
input logic [N_EXT_CNT-1:0] ext_counters_i
);
localparam N_PERF_COUNTERS = 10 + N_EXT_CNT;
localparam N_PERF_COUNTERS = 11 + N_EXT_CNT;
`ifdef ASIC_SYNTHESIS
localparam N_PERF_REGS = 1;
@ -276,12 +278,13 @@ module riscv_cs_registers
assign PCCR_in[1] = id_valid_i & is_decoding_i; // instruction counter
assign PCCR_in[2] = ld_stall_i & id_valid_q; // nr of load use hazards
assign PCCR_in[3] = jr_stall_i & id_valid_q; // nr of jump register hazards
assign PCCR_in[4] = imiss_i; // cycles waiting for instruction fetches
assign PCCR_in[4] = imiss_i & (~pc_set_i); // cycles waiting for instruction fetches, excluding jumps and branches
assign PCCR_in[5] = mem_load_i; // nr of loads
assign PCCR_in[6] = mem_store_i; // nr of stores
assign PCCR_in[7] = jump_i & id_valid_q; // nr of jumps (unconditional)
assign PCCR_in[8] = branch_i & id_valid_q; // nr of branches (conditional)
assign PCCR_in[9] = id_valid_i & is_decoding_i & is_compressed_i; // compressed instruction counter
assign PCCR_in[7] = jump_i & id_valid_q; // nr of jumps (unconditional)
assign PCCR_in[8] = branch_i & id_valid_q; // nr of branches (conditional)
assign PCCR_in[9] = branch_taken_i & id_valid_q; // nr of taken branches (conditional)
assign PCCR_in[10] = id_valid_i & is_decoding_i & is_compressed_i; // compressed instruction counter
// assign external performance counters
generate

View file

@ -96,20 +96,19 @@ controlled by the \instr{saturation} bit in PCMR.
\textbf{PCCR0} & \textbf{CYCLES} & Count the number of cycles the core was running \\ \hline
\textbf{PCCR1} & \textbf{INSTR} & Count the number of instructions executed \\ \hline
\textbf{PCCR2} & \textbf{LD\_STALL} & Number of load data hazards \\ \hline
\textbf{PCCR3} & \textbf{JMP\_STALL} & Number of jump register data hazards \\ \hline
\textbf{PCCR3} & \textbf{JR\_STALL} & Number of jump register data hazards \\ \hline
\textbf{PCCR4} & \textbf{IMISS} & Cycles waiting for instruction fetches, excluding jumps and branches, i.e. the number of instructions wasted due to non-ideal caches \\ \hline
\textbf{PCCR5} & \textbf{WBRANCH} & Number of wrong predicted branches \\ \hline
\textbf{PCCR6} & \textbf{WBRANCH\_CYC} & Cycles wasted due to wrong predicted branches \\ \hline
\textbf{PCCR7} & \textbf{LD} & Number of memory loads executed. Misaligned accesses are counted twice \\ \hline
\textbf{PCCR8} & \textbf{ST} & Number of memory stores executed. Misaligned accesses are counted twice \\ \hline
\textbf{PCCR9} & \textbf{JUMP} & Number of jumps (j, jal, jr, jalr)\\ \hline
\textbf{PCCR10} & \textbf{BRANCH} & Number of branches (bf, bnf), counts taken and not taken branches\\ \hline
\textbf{PCCR11} & \textbf{DELAY\_NOP} & Number of empty (l.nop) delay slots \\ \hline
\textbf{PCCR12} & \textbf{LD\_EXT} & Number of memory loads to EXT executed. Misaligned accesses are counted twice. Every non-TCDM access is considered external \\ \hline
\textbf{PCCR13} & \textbf{ST\_EXT} & Number of memory stores to EXT executed. Misaligned accesses are counted twice. Every non-TCDM access is considered external \\ \hline
\textbf{PCCR14} & \textbf{LD\_EXT\_CYC} & Cycles used for memory loads to EXT. Every non-TCDM access is considered external \\ \hline
\textbf{PCCR15} & \textbf{ST\_EXT\_CYC} & Cycles used for memory stores to EXT. Every non-TCDM access is considered external \\ \hline
\textbf{PCCR16} & \textbf{TCDM\_CONT} & Cycles wasted due to TCDM/log-interconnect contention \\ \hline
\textbf{PCCR5} & \textbf{LD} & Number of memory loads executed. Misaligned accesses are counted twice \\ \hline
\textbf{PCCR6} & \textbf{ST} & Number of memory stores executed. Misaligned accesses are counted twice \\ \hline
\textbf{PCCR7} & \textbf{JUMP} & Number of jumps (j, jal, jr, jalr)\\ \hline
\textbf{PCCR8} & \textbf{BRANCH} & Number of branches, counts taken and not taken branches\\ \hline
\textbf{PCCR9} & \textbf{BTAKEN} & Number of taken branches \\ \hline
\textbf{PCCR10} & \textbf{RVC} & Number of compressed instructions executed \\ \hline
\textbf{PCCR11} & \textbf{LD\_EXT} & Number of memory loads to EXT executed. Misaligned accesses are counted twice. Every non-TCDM access is considered external \\ \hline
\textbf{PCCR12} & \textbf{ST\_EXT} & Number of memory stores to EXT executed. Misaligned accesses are counted twice. Every non-TCDM access is considered external \\ \hline
\textbf{PCCR13} & \textbf{LD\_EXT\_CYC} & Cycles used for memory loads to EXT. Every non-TCDM access is considered external \\ \hline
\textbf{PCCR14} & \textbf{ST\_EXT\_CYC} & Cycles used for memory stores to EXT. Every non-TCDM access is considered external \\ \hline
\textbf{PCCR15} & \textbf{TCDM\_CONT} & Cycles wasted due to TCDM/log-interconnect contention \\ \hline
\textbf{PCCR31} & \textbf{ALL} & Special Register, a write to this register will set all counters to the supplied value\\ \bottomrule
\end{tabularx}
\end{table}

View file

@ -166,7 +166,6 @@ module riscv_id_stage
// Performance Counters
output logic perf_jump_o, // we are executing a jump instruction
output logic perf_branch_o, // we are executing a branch instruction
output logic perf_jr_stall_o, // jump-register-hazard
output logic perf_ld_stall_o // load-use-hazard
);
@ -710,7 +709,6 @@ module riscv_id_stage
// Performance Counters
.perf_jump_o ( perf_jump_o ),
.perf_branch_o ( perf_branch_o ),
.perf_jr_stall_o ( perf_jr_stall_o ),
.perf_ld_stall_o ( perf_ld_stall_o )
);

View file

@ -225,7 +225,6 @@ module riscv_core
// Performance Counters
logic perf_imiss;
logic perf_jump;
logic perf_branch;
logic perf_jr_stall;
logic perf_ld_stall;
@ -248,21 +247,21 @@ module riscv_core
)
if_stage_i
(
.clk ( clk ),
.rst_n ( rst_n ),
.clk ( clk ),
.rst_n ( rst_n ),
// boot address (trap vector location)
.boot_addr_i ( boot_addr_i ),
.boot_addr_i ( boot_addr_i ),
// instruction request control
.req_i ( instr_req_int ),
.req_i ( instr_req_int ),
// instruction cache interface
.instr_req_o ( instr_req_o ),
.instr_addr_o ( instr_addr_o ),
.instr_gnt_i ( instr_gnt_i ),
.instr_rvalid_i ( instr_rvalid_i ),
.instr_rdata_i ( instr_rdata_i ),
.instr_req_o ( instr_req_o ),
.instr_addr_o ( instr_addr_o ),
.instr_gnt_i ( instr_gnt_i ),
.instr_rvalid_i ( instr_rvalid_i ),
.instr_rdata_i ( instr_rdata_i ),
// outputs to ID stage
.hwlp_dec_cnt_id_o ( hwlp_dec_cnt_id ),
@ -283,26 +282,26 @@ module riscv_core
.exc_vec_pc_mux_i ( exc_vec_pc_mux_id ),
// from hwloop registers
.hwlp_start_i ( hwlp_start ),
.hwlp_end_i ( hwlp_end ),
.hwlp_cnt_i ( hwlp_cnt ),
.hwlp_start_i ( hwlp_start ),
.hwlp_end_i ( hwlp_end ),
.hwlp_cnt_i ( hwlp_cnt ),
// from debug unit
.dbg_npc_i ( dbg_npc ),
.dbg_set_npc_i ( dbg_set_npc ),
.dbg_npc_i ( dbg_npc ),
.dbg_set_npc_i ( dbg_set_npc ),
// Jump targets
.jump_target_id_i ( jump_target_id ),
.jump_target_ex_i ( jump_target_ex ),
.jump_target_id_i ( jump_target_id ),
.jump_target_ex_i ( jump_target_ex ),
// pipeline stalls
.halt_if_i ( halt_if ),
.if_ready_o ( if_ready ),
.id_ready_i ( id_ready ),
.if_valid_o ( if_valid ),
.halt_if_i ( halt_if ),
.if_ready_o ( if_ready ),
.id_ready_i ( id_ready ),
.if_valid_o ( if_valid ),
.if_busy_o ( if_busy ),
.perf_imiss_o ( perf_imiss )
.if_busy_o ( if_busy ),
.perf_imiss_o ( perf_imiss )
);
@ -447,7 +446,6 @@ module riscv_core
// Performance Counters
.perf_jump_o ( perf_jump ),
.perf_branch_o ( perf_branch ),
.perf_jr_stall_o ( perf_jr_stall ),
.perf_ld_stall_o ( perf_ld_stall )
);
@ -587,49 +585,51 @@ module riscv_core
)
cs_registers_i
(
.clk ( clk ),
.rst_n ( rst_n ),
.clk ( clk ),
.rst_n ( rst_n ),
// Core and Cluster ID from outside
.core_id_i ( core_id_i ),
.cluster_id_i ( cluster_id_i ),
.core_id_i ( core_id_i ),
.cluster_id_i ( cluster_id_i ),
// Interface to CSRs (SRAM like)
.csr_access_i ( csr_access_ex ),
.csr_addr_i ( csr_addr ),
.csr_wdata_i ( csr_wdata ),
.csr_op_i ( csr_op ),
.csr_rdata_o ( csr_rdata ),
.csr_access_i ( csr_access_ex ),
.csr_addr_i ( csr_addr ),
.csr_wdata_i ( csr_wdata ),
.csr_op_i ( csr_op ),
.csr_rdata_o ( csr_rdata ),
// Interrupt related control signals
.irq_enable_o ( irq_enable ),
.epcr_o ( epcr ),
.irq_enable_o ( irq_enable ),
.epcr_o ( epcr ),
.curr_pc_id_i ( current_pc_id ), // from IF stage
.save_pc_id_i ( save_pc_id ),
.curr_pc_id_i ( current_pc_id ), // from IF stage
.save_pc_id_i ( save_pc_id ),
.exc_cause_i ( exc_cause ),
.save_exc_cause_i ( save_exc_cause ),
.exc_cause_i ( exc_cause ),
.save_exc_cause_i ( save_exc_cause ),
// from hwloop registers
.hwlp_start_i ( hwlp_start ),
.hwlp_end_i ( hwlp_end ),
.hwlp_cnt_i ( hwlp_cnt ),
.hwlp_start_i ( hwlp_start ),
.hwlp_end_i ( hwlp_end ),
.hwlp_cnt_i ( hwlp_cnt ),
.hwlp_regid_o ( csr_hwlp_regid ),
.hwlp_we_o ( csr_hwlp_we ),
.hwlp_data_o ( csr_hwlp_data ),
.hwlp_regid_o ( csr_hwlp_regid ),
.hwlp_we_o ( csr_hwlp_we ),
.hwlp_data_o ( csr_hwlp_data ),
// performance counter related signals
.id_valid_i ( id_valid ),
.is_compressed_i ( is_compressed_id ),
.is_decoding_i ( is_decoding ),
.imiss_i ( perf_imiss ),
.jump_i ( perf_jump ),
.branch_i ( perf_branch ),
.ld_stall_i ( perf_ld_stall ),
.jr_stall_i ( perf_jr_stall ),
.imiss_i ( perf_imiss ),
.pc_set_i ( pc_set ),
.jump_i ( perf_jump ),
.branch_i ( branch_in_ex ),
.branch_taken_i ( branch_decision ),
.ld_stall_i ( perf_ld_stall ),
.jr_stall_i ( perf_jr_stall ),
.mem_load_i ( data_req_o & data_gnt_i & (~data_we_o) ),
.mem_store_i ( data_req_o & data_gnt_i & data_we_o ),