diff --git a/controller.sv b/controller.sv index 4564e89f..5f014248 100644 --- a/controller.sv +++ b/controller.sv @@ -106,7 +106,6 @@ module riscv_controller // Performance Counters output logic perf_jump_o, // we are executing a jump instruction (j, jr, jal, jalr) - output logic perf_branch_o, // we are executing a branch instruction (bf, bnf) output logic perf_jr_stall_o, // stall due to jump-register-hazard output logic perf_ld_stall_o // stall due to load-use-hazard ); @@ -516,7 +515,6 @@ module riscv_controller // Performance Counters assign perf_jump_o = (jump_in_id_i == `BRANCH_JAL || jump_in_id_i == `BRANCH_JALR); - assign perf_branch_o = (jump_in_id_i == `BRANCH_COND); assign perf_jr_stall_o = jr_stall_o; assign perf_ld_stall_o = load_stall_o; diff --git a/cs_registers.sv b/cs_registers.sv index 2f0193bd..9b0379ab 100644 --- a/cs_registers.sv +++ b/cs_registers.sv @@ -78,8 +78,10 @@ module riscv_cs_registers input logic is_decoding_i, // controller is in DECODE state input logic imiss_i, // instruction fetch + input logic pc_set_i, // pc was set to a new value input logic jump_i, // jump instruction seen (j, jr, jal, jalr) input logic branch_i, // branch instruction seen (bf, bnf) + input logic branch_taken_i, // branch was taken input logic ld_stall_i, // load use hazard input logic jr_stall_i, // jump register use hazard @@ -89,7 +91,7 @@ module riscv_cs_registers input logic [N_EXT_CNT-1:0] ext_counters_i ); - localparam N_PERF_COUNTERS = 10 + N_EXT_CNT; + localparam N_PERF_COUNTERS = 11 + N_EXT_CNT; `ifdef ASIC_SYNTHESIS localparam N_PERF_REGS = 1; @@ -276,12 +278,13 @@ module riscv_cs_registers assign PCCR_in[1] = id_valid_i & is_decoding_i; // instruction counter assign PCCR_in[2] = ld_stall_i & id_valid_q; // nr of load use hazards assign PCCR_in[3] = jr_stall_i & id_valid_q; // nr of jump register hazards - assign PCCR_in[4] = imiss_i; // cycles waiting for instruction fetches + assign PCCR_in[4] = imiss_i & (~pc_set_i); // cycles waiting for 
instruction fetches, excluding jumps and branches assign PCCR_in[5] = mem_load_i; // nr of loads assign PCCR_in[6] = mem_store_i; // nr of stores - assign PCCR_in[7] = jump_i & id_valid_q; // nr of jumps (unconditional) - assign PCCR_in[8] = branch_i & id_valid_q; // nr of branches (conditional) - assign PCCR_in[9] = id_valid_i & is_decoding_i & is_compressed_i; // compressed instruction counter + assign PCCR_in[7] = jump_i & id_valid_q; // nr of jumps (unconditional) + assign PCCR_in[8] = branch_i & id_valid_q; // nr of branches (conditional) + assign PCCR_in[9] = branch_i & branch_taken_i & id_valid_q; // nr of taken branches (conditional); gated with branch_i so only cycles with a branch in flight are counted + assign PCCR_in[10] = id_valid_i & is_decoding_i & is_compressed_i; // compressed instruction counter // assign external performance counters generate diff --git a/docs/datasheet/content/perfcounters.tex b/docs/datasheet/content/perfcounters.tex index b490df53..a6b916d1 100644 --- a/docs/datasheet/content/perfcounters.tex +++ b/docs/datasheet/content/perfcounters.tex @@ -96,20 +96,19 @@ controlled by the \instr{saturation} bit in PCMR. \textbf{PCCR0} & \textbf{CYCLES} & Count the number of cycles the core was running \\ \hline \textbf{PCCR1} & \textbf{INSTR} & Count the number of instructions executed \\ \hline \textbf{PCCR2} & \textbf{LD\_STALL} & Number of load data hazards \\ \hline - \textbf{PCCR3} & \textbf{JMP\_STALL} & Number of jump register data hazards \\ \hline + \textbf{PCCR3} & \textbf{JR\_STALL} & Number of jump register data hazards \\ \hline \textbf{PCCR4} & \textbf{IMISS} & Cycles waiting for instruction fetches. i.e. the number of instructions wasted due to non-ideal caches \\ \hline - \textbf{PCCR5} & \textbf{WBRANCH} & Number of wrong predicted branches \\ \hline - \textbf{PCCR6} & \textbf{WBRANCH\_CYC} & Cycles wasted due to wrong predicted branches \\ \hline - \textbf{PCCR7} & \textbf{LD} & Number of memory loads executed. 
Misaligned accesses are counted twice \\ \hline - \textbf{PCCR8} & \textbf{ST} & Number of memory stores executed. Misaligned accesses are counted twice \\ \hline - \textbf{PCCR9} & \textbf{JUMP} & Number of jumps (j, jal, jr, jalr)\\ \hline - \textbf{PCCR10} & \textbf{BRANCH} & Number of branches (bf, bnf), counts taken and not taken branches\\ \hline - \textbf{PCCR11} & \textbf{DELAY\_NOP} & Number of empty (l.nop) delay slots \\ \hline - \textbf{PCCR12} & \textbf{LD\_EXT} & Number of memory loads to EXT executed. Misaligned accesses are counted twice. Every non-TCDM access is considered external \\ \hline - \textbf{PCCR13} & \textbf{ST\_EXT} & Number of memory stores to EXT executed. Misaligned accesses are counted twice. Every non-TCDM access is considered external \\ \hline - \textbf{PCCR14} & \textbf{LD\_EXT\_CYC} & Cycles used for memory loads to EXT. Every non-TCDM access is considered external \\ \hline - \textbf{PCCR15} & \textbf{ST\_EXT\_CYC} & Cycles used for memory stores to EXT. Every non-TCDM access is considered external \\ \hline - \textbf{PCCR16} & \textbf{TCDM\_CONT} & Cycles wasted due to TCDM/log-interconnect contention \\ \hline + \textbf{PCCR5} & \textbf{LD} & Number of memory loads executed. Misaligned accesses are counted twice \\ \hline + \textbf{PCCR6} & \textbf{ST} & Number of memory stores executed. Misaligned accesses are counted twice \\ \hline + \textbf{PCCR7} & \textbf{JUMP} & Number of jumps (j, jal, jr, jalr)\\ \hline + \textbf{PCCR8} & \textbf{BRANCH} & Number of branches, counts taken and not taken branches\\ \hline + \textbf{PCCR9} & \textbf{BTAKEN} & Number of taken branches \\ \hline + \textbf{PCCR10} & \textbf{RVC} & Number of compressed instructions executed \\ \hline + \textbf{PCCR11} & \textbf{LD\_EXT} & Number of memory loads to EXT executed. Misaligned accesses are counted twice. Every non-TCDM access is considered external \\ \hline + \textbf{PCCR12} & \textbf{ST\_EXT} & Number of memory stores to EXT executed. 
Misaligned accesses are counted twice. Every non-TCDM access is considered external \\ \hline + \textbf{PCCR13} & \textbf{LD\_EXT\_CYC} & Cycles used for memory loads to EXT. Every non-TCDM access is considered external \\ \hline + \textbf{PCCR14} & \textbf{ST\_EXT\_CYC} & Cycles used for memory stores to EXT. Every non-TCDM access is considered external \\ \hline + \textbf{PCCR15} & \textbf{TCDM\_CONT} & Cycles wasted due to TCDM/log-interconnect contention \\ \hline \textbf{PCCR31} & \textbf{ALL} & Special Register, a write to this register will set all counters to the supplied value\\ \bottomrule \end{tabularx} \end{table} diff --git a/id_stage.sv b/id_stage.sv index 12930819..7364da8b 100644 --- a/id_stage.sv +++ b/id_stage.sv @@ -166,7 +166,6 @@ module riscv_id_stage // Performance Counters output logic perf_jump_o, // we are executing a jump instruction - output logic perf_branch_o, // we are executing a branch instruction output logic perf_jr_stall_o, // jump-register-hazard output logic perf_ld_stall_o // load-use-hazard ); @@ -710,7 +709,6 @@ module riscv_id_stage // Performance Counters .perf_jump_o ( perf_jump_o ), - .perf_branch_o ( perf_branch_o ), .perf_jr_stall_o ( perf_jr_stall_o ), .perf_ld_stall_o ( perf_ld_stall_o ) ); diff --git a/riscv_core.sv b/riscv_core.sv index bee486fd..82d4d913 100644 --- a/riscv_core.sv +++ b/riscv_core.sv @@ -225,7 +225,6 @@ module riscv_core // Performance Counters logic perf_imiss; logic perf_jump; - logic perf_branch; logic perf_jr_stall; logic perf_ld_stall; @@ -248,21 +247,21 @@ module riscv_core ) if_stage_i ( - .clk ( clk ), - .rst_n ( rst_n ), + .clk ( clk ), + .rst_n ( rst_n ), // boot address (trap vector location) - .boot_addr_i ( boot_addr_i ), + .boot_addr_i ( boot_addr_i ), // instruction request control - .req_i ( instr_req_int ), + .req_i ( instr_req_int ), // instruction cache interface - .instr_req_o ( instr_req_o ), - .instr_addr_o ( instr_addr_o ), - .instr_gnt_i ( instr_gnt_i ), - .instr_rvalid_i ( 
instr_rvalid_i ), - .instr_rdata_i ( instr_rdata_i ), + .instr_req_o ( instr_req_o ), + .instr_addr_o ( instr_addr_o ), + .instr_gnt_i ( instr_gnt_i ), + .instr_rvalid_i ( instr_rvalid_i ), + .instr_rdata_i ( instr_rdata_i ), // outputs to ID stage .hwlp_dec_cnt_id_o ( hwlp_dec_cnt_id ), @@ -283,26 +282,26 @@ module riscv_core .exc_vec_pc_mux_i ( exc_vec_pc_mux_id ), // from hwloop registers - .hwlp_start_i ( hwlp_start ), - .hwlp_end_i ( hwlp_end ), - .hwlp_cnt_i ( hwlp_cnt ), + .hwlp_start_i ( hwlp_start ), + .hwlp_end_i ( hwlp_end ), + .hwlp_cnt_i ( hwlp_cnt ), // from debug unit - .dbg_npc_i ( dbg_npc ), - .dbg_set_npc_i ( dbg_set_npc ), + .dbg_npc_i ( dbg_npc ), + .dbg_set_npc_i ( dbg_set_npc ), // Jump targets - .jump_target_id_i ( jump_target_id ), - .jump_target_ex_i ( jump_target_ex ), + .jump_target_id_i ( jump_target_id ), + .jump_target_ex_i ( jump_target_ex ), // pipeline stalls - .halt_if_i ( halt_if ), - .if_ready_o ( if_ready ), - .id_ready_i ( id_ready ), - .if_valid_o ( if_valid ), + .halt_if_i ( halt_if ), + .if_ready_o ( if_ready ), + .id_ready_i ( id_ready ), + .if_valid_o ( if_valid ), - .if_busy_o ( if_busy ), - .perf_imiss_o ( perf_imiss ) + .if_busy_o ( if_busy ), + .perf_imiss_o ( perf_imiss ) ); @@ -447,7 +446,6 @@ module riscv_core // Performance Counters .perf_jump_o ( perf_jump ), - .perf_branch_o ( perf_branch ), .perf_jr_stall_o ( perf_jr_stall ), .perf_ld_stall_o ( perf_ld_stall ) ); @@ -587,49 +585,51 @@ module riscv_core ) cs_registers_i ( - .clk ( clk ), - .rst_n ( rst_n ), + .clk ( clk ), + .rst_n ( rst_n ), // Core and Cluster ID from outside - .core_id_i ( core_id_i ), - .cluster_id_i ( cluster_id_i ), + .core_id_i ( core_id_i ), + .cluster_id_i ( cluster_id_i ), // Interface to CSRs (SRAM like) - .csr_access_i ( csr_access_ex ), - .csr_addr_i ( csr_addr ), - .csr_wdata_i ( csr_wdata ), - .csr_op_i ( csr_op ), - .csr_rdata_o ( csr_rdata ), + .csr_access_i ( csr_access_ex ), + .csr_addr_i ( csr_addr ), + .csr_wdata_i ( 
csr_wdata ), + .csr_op_i ( csr_op ), + .csr_rdata_o ( csr_rdata ), // Interrupt related control signals - .irq_enable_o ( irq_enable ), - .epcr_o ( epcr ), + .irq_enable_o ( irq_enable ), + .epcr_o ( epcr ), - .curr_pc_id_i ( current_pc_id ), // from IF stage - .save_pc_id_i ( save_pc_id ), + .curr_pc_id_i ( current_pc_id ), // from IF stage + .save_pc_id_i ( save_pc_id ), - .exc_cause_i ( exc_cause ), - .save_exc_cause_i ( save_exc_cause ), + .exc_cause_i ( exc_cause ), + .save_exc_cause_i ( save_exc_cause ), // from hwloop registers - .hwlp_start_i ( hwlp_start ), - .hwlp_end_i ( hwlp_end ), - .hwlp_cnt_i ( hwlp_cnt ), + .hwlp_start_i ( hwlp_start ), + .hwlp_end_i ( hwlp_end ), + .hwlp_cnt_i ( hwlp_cnt ), - .hwlp_regid_o ( csr_hwlp_regid ), - .hwlp_we_o ( csr_hwlp_we ), - .hwlp_data_o ( csr_hwlp_data ), + .hwlp_regid_o ( csr_hwlp_regid ), + .hwlp_we_o ( csr_hwlp_we ), + .hwlp_data_o ( csr_hwlp_data ), // performance counter related signals .id_valid_i ( id_valid ), .is_compressed_i ( is_compressed_id ), .is_decoding_i ( is_decoding ), - .imiss_i ( perf_imiss ), - .jump_i ( perf_jump ), - .branch_i ( perf_branch ), - .ld_stall_i ( perf_ld_stall ), - .jr_stall_i ( perf_jr_stall ), + .imiss_i ( perf_imiss ), + .pc_set_i ( pc_set ), + .jump_i ( perf_jump ), + .branch_i ( branch_in_ex ), + .branch_taken_i ( branch_decision ), + .ld_stall_i ( perf_ld_stall ), + .jr_stall_i ( perf_jr_stall ), .mem_load_i ( data_req_o & data_gnt_i & (~data_we_o) ), .mem_store_i ( data_req_o & data_gnt_i & data_we_o ),