[rtl] Fix retired instruction counters

When the writeback stage is present the retired instruction counter
(minstret) and the retired compressed instruction counter could see an
off by one error when an instruction was in the writeback stage when
reading the counters. With this fix the ID stage observes the
incremented value of the counters when an instruction that would
increment them is in writeback.
This commit is contained in:
Greg Chadwick 2021-08-30 11:30:48 +00:00 committed by Greg Chadwick
parent 75c030b776
commit 31b2f6c863
4 changed files with 134 additions and 58 deletions

View file

@ -330,6 +330,8 @@ module ibex_core import ibex_pkg::*; #(
logic perf_instr_ret_wb;
logic perf_instr_ret_compressed_wb;
logic perf_instr_ret_wb_spec;
logic perf_instr_ret_compressed_wb_spec;
logic perf_iside_wait;
logic perf_dside_wait;
logic perf_mul_wait;
@ -759,13 +761,15 @@ module ibex_core import ibex_pkg::*; #(
.instr_is_compressed_id_i(instr_is_compressed_id),
.instr_perf_count_id_i (instr_perf_count_id),
.ready_wb_o (ready_wb),
.rf_write_wb_o (rf_write_wb),
.outstanding_load_wb_o (outstanding_load_wb),
.outstanding_store_wb_o (outstanding_store_wb),
.pc_wb_o (pc_wb),
.perf_instr_ret_wb_o (perf_instr_ret_wb),
.perf_instr_ret_compressed_wb_o(perf_instr_ret_compressed_wb),
.ready_wb_o (ready_wb),
.rf_write_wb_o (rf_write_wb),
.outstanding_load_wb_o (outstanding_load_wb),
.outstanding_store_wb_o (outstanding_store_wb),
.pc_wb_o (pc_wb),
.perf_instr_ret_wb_o (perf_instr_ret_wb),
.perf_instr_ret_compressed_wb_o (perf_instr_ret_compressed_wb),
.perf_instr_ret_wb_spec_o (perf_instr_ret_wb_spec),
.perf_instr_ret_compressed_wb_spec_o(perf_instr_ret_compressed_wb_spec),
.rf_waddr_id_i(rf_waddr_id),
.rf_wdata_id_i(rf_wdata_id),
@ -1010,17 +1014,19 @@ module ibex_core import ibex_pkg::*; #(
.illegal_csr_insn_o(illegal_csr_insn_id),
// performance counter related signals
.instr_ret_i (perf_instr_ret_wb),
.instr_ret_compressed_i(perf_instr_ret_compressed_wb),
.iside_wait_i (perf_iside_wait),
.jump_i (perf_jump),
.branch_i (perf_branch),
.branch_taken_i (perf_tbranch),
.mem_load_i (perf_load),
.mem_store_i (perf_store),
.dside_wait_i (perf_dside_wait),
.mul_wait_i (perf_mul_wait),
.div_wait_i (perf_div_wait)
.instr_ret_i (perf_instr_ret_wb),
.instr_ret_compressed_i (perf_instr_ret_compressed_wb),
.instr_ret_spec_i (perf_instr_ret_wb_spec),
.instr_ret_compressed_spec_i(perf_instr_ret_compressed_wb_spec),
.iside_wait_i (perf_iside_wait),
.jump_i (perf_jump),
.branch_i (perf_branch),
.branch_taken_i (perf_tbranch),
.mem_load_i (perf_load),
.mem_store_i (perf_store),
.dside_wait_i (perf_dside_wait),
.mul_wait_i (perf_mul_wait),
.div_wait_i (perf_div_wait)
);
// These assertions are in top-level as instr_valid_id required as the enable term

View file

@ -1,5 +1,9 @@
module ibex_counter #(
parameter int CounterWidth = 32
parameter int CounterWidth = 32,
// When set `counter_val_upd_o` provides an incremented version of the counter value, otherwise
// the output is hard-wired to 0. This is required to allow Xilinx DSP inference to work
// correctly. When `ProvideValUpd` is set no DSPs are inferred.
parameter bit ProvideValUpd = 0
) (
input logic clk_i,
input logic rst_ni,
@ -8,7 +12,8 @@ module ibex_counter #(
input logic counterh_we_i,
input logic counter_we_i,
input logic [31:0] counter_val_i,
output logic [63:0] counter_val_o
output logic [63:0] counter_val_o,
output logic [63:0] counter_val_upd_o
);
logic [63:0] counter;
@ -17,9 +22,11 @@ module ibex_counter #(
logic we;
logic [CounterWidth-1:0] counter_d;
// Increment
assign counter_upd = counter[CounterWidth-1:0] + {{CounterWidth - 1{1'b0}}, 1'b1};
// Update
always_comb begin
// Write
we = counter_we_i | counterh_we_i;
counter_load[63:32] = counter[63:32];
@ -29,9 +36,6 @@ module ibex_counter #(
counter_load[31:0] = counter[31:0];
end
// Increment
counter_upd = counter[CounterWidth-1:0] + {{CounterWidth - 1{1'b0}}, 1'b1};
// Next value logic
if (we) begin
counter_d = counter_load[CounterWidth-1:0];
@ -67,11 +71,24 @@ module ibex_counter #(
if (CounterWidth < 64) begin : g_counter_narrow
logic [63:CounterWidth] unused_counter_load;
assign counter[CounterWidth-1:0] = counter_q;
assign counter[63:CounterWidth] = '0;
assign unused_counter_load = counter_load[63:CounterWidth];
assign counter[CounterWidth-1:0] = counter_q;
assign counter[63:CounterWidth] = '0;
if (ProvideValUpd) begin : g_counter_val_upd_o
assign counter_val_upd_o[CounterWidth-1:0] = counter_upd;
end else begin : g_no_counter_val_upd_o
assign counter_val_upd_o[CounterWidth-1:0] = '0;
end
assign counter_val_upd_o[63:CounterWidth] = '0;
assign unused_counter_load = counter_load[63:CounterWidth];
end else begin : g_counter_full
assign counter = counter_q;
assign counter = counter_q;
if (ProvideValUpd) begin : g_counter_val_upd_o
assign counter_val_upd_o = counter_upd;
end else begin : g_no_counter_val_upd_o
assign counter_val_upd_o = '0;
end
end
assign counter_val_o = counter;

View file

@ -106,17 +106,19 @@ module ibex_cs_registers #(
// with wrong priviledge level, or
// missing write permissions
// Performance Counters
input logic instr_ret_i, // instr retired in ID/EX stage
input logic instr_ret_compressed_i, // compressed instr retired
input logic iside_wait_i, // core waiting for the iside
input logic jump_i, // jump instr seen (j, jr, jal, jalr)
input logic branch_i, // branch instr seen (bf, bnf)
input logic branch_taken_i, // branch was taken
input logic mem_load_i, // load from memory in this cycle
input logic mem_store_i, // store to memory in this cycle
input logic dside_wait_i, // core waiting for the dside
input logic mul_wait_i, // core waiting for multiply
input logic div_wait_i // core waiting for divide
input logic instr_ret_i, // instr retired in ID/EX stage
input logic instr_ret_compressed_i, // compressed instr retired
input logic instr_ret_spec_i, // speculative instr_ret_i
input logic instr_ret_compressed_spec_i, // speculative instr_ret_compressed_i
input logic iside_wait_i, // core waiting for the iside
input logic jump_i, // jump instr seen (j, jr, jal, jalr)
input logic branch_i, // branch instr seen (bf, bnf)
input logic branch_taken_i, // branch was taken
input logic mem_load_i, // load from memory in this cycle
input logic mem_store_i, // store to memory in this cycle
input logic dside_wait_i, // core waiting for the dside
input logic mul_wait_i, // core waiting for multiply
input logic div_wait_i // core waiting for divide
);
import ibex_pkg::*;
@ -242,6 +244,8 @@ module ibex_cs_registers #(
logic unused_mhpmcounterh_we_1;
logic unused_mhpmcounter_incr_1;
logic [63:0] minstret_next, minstret_raw;
// Debug / trigger registers
logic [31:0] tselect_rdata;
logic [31:0] tmatch_control_rdata;
@ -1228,12 +1232,15 @@ module ibex_cs_registers #(
.counterh_we_i(mhpmcounterh_we[0]),
.counter_we_i(mhpmcounter_we[0]),
.counter_val_i(csr_wdata_int),
.counter_val_o(mhpmcounter[0])
.counter_val_o(mhpmcounter[0]),
.counter_val_upd_o()
);
// minstret
ibex_counter #(
.CounterWidth(64)
.CounterWidth(64),
.ProvideValUpd(1)
) minstret_counter_i (
.clk_i(clk_i),
.rst_ni(rst_ni),
@ -1241,30 +1248,65 @@ module ibex_cs_registers #(
.counterh_we_i(mhpmcounterh_we[2]),
.counter_we_i(mhpmcounter_we[2]),
.counter_val_i(csr_wdata_int),
.counter_val_o(mhpmcounter[2])
.counter_val_o(minstret_raw),
.counter_val_upd_o(minstret_next)
);
// Where the writeback stage is present instruction in ID observing value of minstret must take
// into account any instruction in the writeback stage. If one is present the incremented value of
// minstret is used. A speculative version of the signal is used to aid timing. When the writeback
// stage sees an exception (so the speculative signal is incorrect) the ID stage will be flushed
// so the incorrect value doesn't matter. A similar behaviour is required for the compressed
// instruction retired counter below. When the writeback stage isn't present the speculative
// signals are always 0.
assign mhpmcounter[2] = instr_ret_spec_i & ~mcountinhibit[2] ? minstret_next : minstret_raw;
// reserved:
assign mhpmcounter[1] = '0;
assign unused_mhpmcounter_we_1 = mhpmcounter_we[1];
assign unused_mhpmcounterh_we_1 = mhpmcounterh_we[1];
assign unused_mhpmcounter_incr_1 = mhpmcounter_incr[1];
for (genvar cnt = 0; cnt < 29; cnt++) begin : gen_cntrs
if (cnt < MHPMCounterNum) begin : gen_imp
// Iterate through optionally included counters (MHPMCounterNum controls how many are included)
for (genvar i = 0; i < 29; i++) begin : gen_cntrs
localparam int Cnt = i + 3;
if (i < MHPMCounterNum) begin : gen_imp
logic [63:0] mhpmcounter_raw, mhpmcounter_next;
ibex_counter #(
.CounterWidth(MHPMCounterWidth)
.CounterWidth(MHPMCounterWidth),
.ProvideValUpd(Cnt == 10)
) mcounters_variable_i (
.clk_i(clk_i),
.rst_ni(rst_ni),
.counter_inc_i(mhpmcounter_incr[cnt+3] & ~mcountinhibit[cnt+3]),
.counterh_we_i(mhpmcounterh_we[cnt+3]),
.counter_we_i(mhpmcounter_we[cnt+3]),
.counter_inc_i(mhpmcounter_incr[Cnt] & ~mcountinhibit[Cnt]),
.counterh_we_i(mhpmcounterh_we[Cnt]),
.counter_we_i(mhpmcounter_we[Cnt]),
.counter_val_i(csr_wdata_int),
.counter_val_o(mhpmcounter[cnt+3])
.counter_val_o(mhpmcounter_raw),
.counter_val_upd_o(mhpmcounter_next)
);
if (Cnt == 10) begin : gen_compressed_instr_cnt
// Special behaviour for reading compressed instruction retired counter, see comment on
// `mhpmcounter[2]` above for further information.
assign mhpmcounter[Cnt] =
instr_ret_compressed_spec_i & ~mcountinhibit[Cnt] ? mhpmcounter_next:
mhpmcounter_raw;
end else begin : gen_other_cnts
logic [63:0] unused_mhpmcounter_next;
// All other counters just see the raw counter value directly.
assign mhpmcounter[Cnt] = mhpmcounter_raw;
assign unused_mhpmcounter_next = mhpmcounter_next;
end
end else begin : gen_unimp
assign mhpmcounter[cnt+3] = '0;
assign mhpmcounter[Cnt] = '0;
if (Cnt == 10) begin : gen_no_compressed_instr_cnt
logic unused_instr_ret_compressed_spec_i;
assign unused_instr_ret_compressed_spec_i = instr_ret_compressed_spec_i;
end
end
end

View file

@ -34,6 +34,8 @@ module ibex_wb_stage #(
output logic [31:0] pc_wb_o,
output logic perf_instr_ret_wb_o,
output logic perf_instr_ret_compressed_wb_o,
output logic perf_instr_ret_wb_spec_o,
output logic perf_instr_ret_compressed_wb_spec_o,
input logic [4:0] rf_waddr_id_i,
input logic [31:0] rf_wdata_id_i,
@ -144,10 +146,15 @@ module ibex_wb_stage #(
assign instr_done_wb_o = wb_valid_q & wb_done;
// Increment instruction retire counters for valid instructions which are not lsu errors
assign perf_instr_ret_wb_o = instr_done_wb_o & wb_count_q &
~(lsu_resp_valid_i & lsu_resp_err_i);
assign perf_instr_ret_compressed_wb_o = perf_instr_ret_wb_o & wb_compressed_q;
// Increment instruction retire counters for valid instructions which are not lsu errors.
// Speculative versions of the signals do not factor in exceptions and whether the instruction
// is done yet. These are used to get correct values for instructions reading the relevant
// performance counters in the ID stage.
assign perf_instr_ret_wb_spec_o = wb_count_q;
assign perf_instr_ret_compressed_wb_spec_o = perf_instr_ret_wb_spec_o & wb_compressed_q;
assign perf_instr_ret_wb_o = instr_done_wb_o & wb_count_q &
~(lsu_resp_valid_i & lsu_resp_err_i);
assign perf_instr_ret_compressed_wb_o = perf_instr_ret_wb_o & wb_compressed_q;
// Forward data that will be written to the RF back to ID to resolve data hazards. The flopped
// rf_wdata_wb_q is used rather than rf_wdata_wb_o as the latter includes read data from memory
@ -159,10 +166,14 @@ module ibex_wb_stage #(
assign rf_wdata_wb_mux[0] = rf_wdata_id_i;
assign rf_wdata_wb_mux_we[0] = rf_we_id_i;
// Increment instruction retire counters for valid instructions which are not lsu errors
assign perf_instr_ret_wb_o = instr_perf_count_id_i & en_wb_i &
~(lsu_resp_valid_i & lsu_resp_err_i);
assign perf_instr_ret_compressed_wb_o = perf_instr_ret_wb_o & instr_is_compressed_id_i;
// Increment instruction retire counters for valid instructions which are not lsu errors.
// The speculative signals are always 0 when no writeback stage is present as the raw counter
// values will be correct.
assign perf_instr_ret_wb_spec_o = 1'b0;
assign perf_instr_ret_compressed_wb_spec_o = 1'b0;
assign perf_instr_ret_wb_o = instr_perf_count_id_i & en_wb_i &
~(lsu_resp_valid_i & lsu_resp_err_i);
assign perf_instr_ret_compressed_wb_o = perf_instr_ret_wb_o & instr_is_compressed_id_i;
// ready needs to be constant 1 without writeback stage (otherwise ID/EX stage will stall)
assign ready_wb_o = 1'b1;