diff --git a/doc/performance_counters.rst b/doc/performance_counters.rst index e42b32b8..fb4c1fb3 100644 --- a/doc/performance_counters.rst +++ b/doc/performance_counters.rst @@ -122,3 +122,17 @@ The remaining event selector CSRs are tied to 0, i.e., no events are counted by +----------------------+-------------+-------------+--------------+ | ``mhpmevent10(h)`` | 0x32A | 0x0000_0400 | 10 | +----------------------+-------------+-------------+--------------+ + +FPGA Targets +------------ + +For FPGA targets the performance counters constitute a particularly large structure. +Implementing the maximum 29 event counters 32, 48 and 64 bit wide results in relative logic utilizations of the core of 100%, 111% and 129% respectively. +The relative numbers of flip-flops are 100%, 125% and 150%. +It is recommended to implement event counters of 32 bit width where possible. + +For Xilinx FPGA devices featuring the ``DSP48E1`` DSP slice or similar, counter logic can be absorbed into the DSP slice for widths up to 48 bits. +The resulting relative logic utilizations with respect to the non-DSP 32 bit counter implementation are 83% and 89% respectively for 32 and 48 bit DSP counters. +This comes at the expense of 1 DSP slice per counter. +For 32 bit counters only, the corresponding flip-flops can be incorporated into the DSP's output pipeline register, resulting in a reduction of the number of flip-flops to 50%. +In order to infer DSP slices for performance counters, define the preprocessor variable ``FPGA_XILINX``. 
diff --git a/dv/uvm/core_ibex/ibex_dv.f b/dv/uvm/core_ibex/ibex_dv.f index d2ee021d..d25ae42f 100644 --- a/dv/uvm/core_ibex/ibex_dv.f +++ b/dv/uvm/core_ibex/ibex_dv.f @@ -21,6 +21,7 @@ ${PRJ_DIR}/ibex/rtl/ibex_alu.sv ${PRJ_DIR}/ibex/rtl/ibex_compressed_decoder.sv ${PRJ_DIR}/ibex/rtl/ibex_controller.sv ${PRJ_DIR}/ibex/rtl/ibex_cs_registers.sv +${PRJ_DIR}/ibex/rtl/ibex_counters.sv ${PRJ_DIR}/ibex/rtl/ibex_decoder.sv ${PRJ_DIR}/ibex/rtl/ibex_ex_block.sv ${PRJ_DIR}/ibex/rtl/ibex_id_stage.sv diff --git a/examples/fpga/artya7/top_artya7.core b/examples/fpga/artya7/top_artya7.core index 632f068e..6b903b45 100644 --- a/examples/fpga/artya7/top_artya7.core +++ b/examples/fpga/artya7/top_artya7.core @@ -31,6 +31,12 @@ parameters: default: "../../../../../examples/sw/led/led.vmem" paramtype: vlogdefine + FPGA_XILINX: + datatype: int + description: Identifies Xilinx FPGA targets to set DSP pragmas for performance counters. + default: 1 + paramtype: vlogdefine + targets: synth: default_tool: vivado @@ -40,6 +46,7 @@ targets: toplevel: top_artya7 parameters: - SRAM_INIT_FILE + - FPGA_XILINX tools: vivado: part: "xc7a100tcsg324-1" # Default to Arty A7-100 diff --git a/ibex_core.core b/ibex_core.core index 67dc410d..09009ca9 100644 --- a/ibex_core.core +++ b/ibex_core.core @@ -15,6 +15,7 @@ filesets: - rtl/ibex_compressed_decoder.sv - rtl/ibex_controller.sv - rtl/ibex_cs_registers.sv + - rtl/ibex_counters.sv - rtl/ibex_decoder.sv - rtl/ibex_ex_block.sv - rtl/ibex_fetch_fifo.sv diff --git a/lint/verilator_waiver.vlt b/lint/verilator_waiver.vlt index 8e7c6a3d..bc57bb32 100644 --- a/lint/verilator_waiver.vlt +++ b/lint/verilator_waiver.vlt @@ -58,6 +58,52 @@ lint_off -rule UNUSED -file "*/rtl/sim/simulator_ctrl.sv" -match "*'wdata_i'[31: // entire 32-bit address around to make the code a bit cleaner. 
lint_off -rule UNUSED -file "*/rtl/timer.sv" -match "*'timer_addr_i'[31:10]*" +// Bits of signal are not used for MHPMCounterNum < 29: mhpmcounter_we[31:MHPMCounterNum+3] +// cleaner to write all bits even if not all are used; the reported lower bound depends on MHPMCounterNum, so it is wildcarded +lint_off -rule UNUSED -file "*/rtl/ibex_cs_registers.sv" -match "*'mhpmcounter_we'[31:*]*" + +// Bits of signal are not used: mhpmcounter_we[1] +// Bits of signal are not used: mhpmcounterh_we[1] +// Bits of signal are not used: mhpmcounter_incr[1] +// +// cleaner to write all bits even if not all are used +// +lint_off -rule UNUSED -file "*/rtl/ibex_cs_registers.sv" -match "*'mhpmcounter_we'[1]*" +lint_off -rule UNUSED -file "*/rtl/ibex_cs_registers.sv" -match "*'mhpmcounterh_we'[1]*" +lint_off -rule UNUSED -file "*/rtl/ibex_cs_registers.sv" -match "*'mhpmcounter_incr'[1]*" + +// Signals are unused if MHPMCounterNum == 0: clk_i, rst_ni +// Signal is unused if MHPMCounterNum == 0: counter_val_i[31:0] +// +// If no counters are implemented, no flops are elaborated. No clock, reset or +// next-state logic is used. 
+// +lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*'clk_i'" +lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*'rst_ni'" +lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*'counter_val_i'" + +// Bits of signal are not used for MHPMCounterNum < 29: counter_inc_i[28:MHPMCounterNum] +// Bits of signal are not used for MHPMCounterNum < 29: counterh_we_i[28:MHPMCounterNum] +// Bits of signal are not used for MHPMCounterNum < 29: counter_we_i[28:MHPMCounterNum] +// +// cleaner to write all bits even if not all are used +// +// lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*'counter_inc_i'[28:*]*" +// lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*'counterh_we_i'[28:*]*" +// lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*'counter_we_i'[28:*]*" + +lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*counter_inc_i*" +lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*counterh_we_i*" +lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*counter_we_i*" + +// Bits of signal are not used for MHPMCounterWidth < 64: counter_upd[63:MHPMCounterWidth] +// Bits of signal are not used for MHPMCounterWidth < 64: counter_load[63:MHPMCounterWidth] +// +// cleaner to write all bits even if not all are used +// +lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*'counter_upd'[63:*]*" +lint_off -rule UNUSED -file "*/rtl/ibex_counters.sv" -match "*'counter_load'[63:*]*" + // Signal is not used: test_en_i // testability signal lint_off -rule UNUSED -file "*/rtl/ibex_register_file_ff.sv" -match "*test_en_i*" diff --git a/rtl/ibex_core.f b/rtl/ibex_core.f index 68dc3494..83e8396b 100644 --- a/rtl/ibex_core.f +++ b/rtl/ibex_core.f @@ -2,6 +2,7 @@ ibex_pkg.sv ibex_alu.sv ibex_compressed_decoder.sv ibex_controller.sv +ibex_counters.sv ibex_cs_registers.sv ibex_decoder.sv ibex_ex_block.sv diff --git a/rtl/ibex_counters.sv b/rtl/ibex_counters.sv new file mode 100644 index 
00000000..9a28b9a9 --- /dev/null +++ b/rtl/ibex_counters.sv @@ -0,0 +1,86 @@ +module ibex_counters #( // Bank of performance counters, each read out as a 64-bit value + parameter int MaxNumCounters = 29, // number of counter slots on the ports + parameter int NumCounters = 0, // slots 0..NumCounters-1 get flops, the rest read as 0 + parameter int CounterWidth = 32 // implemented bits per counter, bits above read as 0 +) ( + input logic clk_i, + input logic rst_ni, + // Bit i of each control vector drives counter i; counter_val_i is the shared 32-bit write data + input logic [MaxNumCounters-1:0] counter_inc_i, + input logic [MaxNumCounters-1:0] counterh_we_i, + input logic [MaxNumCounters-1:0] counter_we_i, + input logic [31:0] counter_val_i, + output logic [63:0] counter_val_o [MaxNumCounters] +); + logic [63:0] counter [MaxNumCounters]; + + assign counter_val_o = counter; + + for (genvar i = 0; i < MaxNumCounters; i++) begin : g_counter + // Only elaborate flops for the first NumCounters slots, each CounterWidth bits wide. + if (i < NumCounters) begin : g_counter_exists + + logic [63:0] counter_upd; + logic [63:0] counter_load; + logic we; + logic [CounterWidth-1:0] counter_d; + + // Next-state computation: a write takes priority over an increment + always_comb begin + + // Write: counter_val_i replaces the low word, or the high word when counterh_we_i is set + we = counter_we_i[i] | counterh_we_i[i]; + counter_load[63:32] = counter[i][63:32]; + counter_load[31:0] = counter_val_i; + if (counterh_we_i[i]) begin + counter_load[63:32] = counter_val_i; + counter_load[31:0] = counter[i][31:0]; + end + + // Increment + counter_upd = counter[i] + 64'h1; + + // Next value: load on write, else increment if requested, else hold + if (we) begin + counter_d = counter_load[CounterWidth-1:0]; + end else if (counter_inc_i[i]) begin + counter_d = counter_upd[CounterWidth-1:0]; + end else begin + counter_d = counter[i][CounterWidth-1:0]; + end + end + +`ifdef FPGA_XILINX + // Set DSP pragma for supported Xilinx FPGAs; requested only for widths below 49 bits + localparam dsp_pragma = CounterWidth < 49 ? "yes" : "no"; + (* use_dsp = dsp_pragma *) logic [CounterWidth-1:0] counter_q; +`else + logic [CounterWidth-1:0] counter_q; +`endif + + // Counter flop +`ifdef FPGA_XILINX + // DSP output register requires synchronous reset. 
+ always @(posedge clk_i) begin +`else + always @(posedge clk_i or negedge rst_ni) begin +`endif + if (!rst_ni) begin + counter_q <= '0; + end else begin + counter_q <= counter_d; + end + end + // Zero-extend the implemented bits to the 64-bit read value + if (CounterWidth < 64) begin : g_counter_narrow + assign counter[i][CounterWidth-1:0] = counter_q; + assign counter[i][63:CounterWidth] = '0; + end else begin : g_counter_full + assign counter[i] = counter_q; + end + end else begin : g_no_counter + assign counter[i] = '0; + end + end + +endmodule diff --git a/rtl/ibex_cs_registers.sv b/rtl/ibex_cs_registers.sv index 3e8f13a7..e9fce05e 100644 --- a/rtl/ibex_cs_registers.sv +++ b/rtl/ibex_cs_registers.sv @@ -183,7 +183,6 @@ module ibex_cs_registers #( logic [MHPMCounterNum+3-1:0] mcountinhibit_d, mcountinhibit_q; logic mcountinhibit_we; - logic [63:0] mhpmcounter_d [32]; // mhpmcounter flops are elaborated below providing only the precise number that is required based // on MHPMCounterNum/MHPMCounterWidth. This signal connects to the Q output of these flops // where they exist and is otherwise 0. 
@@ -877,55 +876,51 @@ module ibex_cs_registers #( end end - // update - always_comb begin : mhpmcounter_update - mhpmcounter_d = mhpmcounter; + // mcycle (counter 0) and minstret (counter 2): one dedicated 64-bit counter instance each + ibex_counters #( + .MaxNumCounters(1), + .NumCounters(1), + .CounterWidth(64) + ) mcycle_counter_i ( + .clk_i(clk_i), + .rst_ni(rst_ni), + .counter_inc_i(mhpmcounter_incr[0] & ~mcountinhibit[0]), + .counterh_we_i(mhpmcounterh_we[0]), + .counter_we_i(mhpmcounter_we[0]), + .counter_val_i(csr_wdata_int), + .counter_val_o(mhpmcounter[0:0]) + ); - for (int i=0; i<32; i++) begin : gen_mhpmcounter_update + ibex_counters #( + .MaxNumCounters(1), + .NumCounters(1), + .CounterWidth(64) + ) minstret_counter_i ( + .clk_i(clk_i), + .rst_ni(rst_ni), + .counter_inc_i(mhpmcounter_incr[2] & ~mcountinhibit[2]), + .counterh_we_i(mhpmcounterh_we[2]), + .counter_we_i(mhpmcounter_we[2]), + .counter_val_i(csr_wdata_int), + .counter_val_o(mhpmcounter[2:2]) + ); - // increment - if (mhpmcounter_incr[i] & ~mcountinhibit[i]) begin - mhpmcounter_d[i] = mhpmcounter[i] + 64'h1; - end + // reserved: counter 1 has no flops and always reads as 0 + assign mhpmcounter[1] = '0; - // write - if (mhpmcounter_we[i]) begin - mhpmcounter_d[i][31: 0] = csr_wdata_int; - end else if (mhpmcounterh_we[i]) begin - mhpmcounter_d[i][63:32] = csr_wdata_int; - end - end - end - - // Performance monitor registers - // Only elaborate flops that are needed from the given MHPMCounterWidth and MHPMCounterNum - // parameters - for (genvar i = 0; i < 32; i++) begin : g_mhpmcounter - // First 3 counters (cycle, time, instret) must always be elaborated - if (i < 3 + MHPMCounterNum) begin : g_mhpmcounter_exists - // First 3 counters must be 64-bit the rest have parameterisable width - localparam int unsigned IMHPMCounterWidth = i < 3 ? 
64 : MHPMCounterWidth; - - logic [IMHPMCounterWidth-1:0] mhpmcounter_q; - - always @(posedge clk_i or negedge rst_ni) begin - if(~rst_ni) begin - mhpmcounter_q <= '0; - end else begin - mhpmcounter_q <= mhpmcounter_d[i][IMHPMCounterWidth-1:0]; - end - end - - if (IMHPMCounterWidth < 64) begin : g_mhpmcounter_narrow - assign mhpmcounter[i][IMHPMCounterWidth-1:0] = mhpmcounter_q; - assign mhpmcounter[i][63:IMHPMCounterWidth] = '0; - end else begin : g_mhpmcounter_full - assign mhpmcounter[i] = mhpmcounter_q; - end - end else begin : g_no_mhpmcounter - assign mhpmcounter[i] = '0; - end - end + ibex_counters #( // counters 3..31: MHPMCounterNum implemented, MHPMCounterWidth bits each + .MaxNumCounters(29), + .NumCounters(MHPMCounterNum), + .CounterWidth(MHPMCounterWidth) + ) mcounters_variable_i ( + .clk_i(clk_i), + .rst_ni(rst_ni), + .counter_inc_i(mhpmcounter_incr[31:3] & ~mcountinhibit[31:3]), + .counterh_we_i(mhpmcounterh_we[31:3]), + .counter_we_i(mhpmcounter_we[31:3]), + .counter_val_i(csr_wdata_int), + .counter_val_o(mhpmcounter[3:31]) + ); if(MHPMCounterNum < 29) begin : g_mcountinhibit_reduced assign mcountinhibit = {{29-MHPMCounterNum{1'b1}}, mcountinhibit_q};