[RTL] Added separate ALU for branch target

Branches now compute their target in the same cycle as the condition.  This
removes a stall cycle from all taken conditional branches.
This commit is contained in:
Greg Chadwick 2019-09-24 15:20:14 +01:00
parent 32a69899a9
commit 639964514c
17 changed files with 205 additions and 57 deletions

View file

@ -87,6 +87,9 @@ Parameters
+------------------------------+-------------+------------+-----------------------------------------------------------------+
| ``RV32M`` | bit | 1 | M(ultiply) extension enable |
+------------------------------+-------------+------------+-----------------------------------------------------------------+
| ``BranchTargetALU`` | bit | 0 | *EXPERIMENTAL* - Enables branch target ALU removing a stall |
| | | | cycle from taken branches |
+------------------------------+-------------+------------+-----------------------------------------------------------------+
| ``MultiplierImplementation`` | string | "fast" | Multiplicator type, "slow", or "fast" |
+------------------------------+-------------+------------+-----------------------------------------------------------------+
| ``DbgTriggerEn`` | bit | 0 | Enable debug trigger support (one trigger only) |
@ -96,6 +99,8 @@ Parameters
| ``DmExceptionAddr`` | int | 0x1A110808 | Address to jump to when an exception occurs while in Debug Mode |
+------------------------------+-------------+------------+-----------------------------------------------------------------+
Any parameter marked *EXPERIMENTAL* is, when enabled, not verified to the same standard as the rest of the Ibex core.
Interfaces
----------

View file

@ -67,13 +67,17 @@ Read the description for more information.
+-----------------------+-----------------------+-------------------------------------------------------------+
| Branch (Taken) | 2 - N | Any branch where the condition is met will stall for 2 |
| | | cycles as in the first cycle the branch is in ID/EX the ALU |
| | | is used to calculate the branch condition. The following |
| | | cycle the ALU is used again to calculate the branch target |
| | 1 - N (Branch Target | is used to calculate the branch condition. The following |
| | ALU enabled) | cycle the ALU is used again to calculate the branch target |
| | | where it proceeds as Jump does above (Flush IF stage and |
| | | prefetch buffer, new PC on instruction-side memory |
| | | interface the same cycle it is calculated). The longer the |
| | | instruction-side memory interface takes to receive data the |
| | | longer the branch will stall. |
| | | longer the branch will stall. With the parameter |
| | | ``BranchTargetALU`` set to ``1`` a separate ALU calculates |
| | | the branch target at the same time as the main ALU |
| | | calculates the branch condition, so one fewer stall cycle |
| | | is required. |
+-----------------------+-----------------------+-------------------------------------------------------------+
| Instruction Fence | 1 - N | The FENCE.I instruction as defined in 'Zifencei' of the |
| | | RISC-V specification. Internally it is implemented as a |

View file

@ -29,6 +29,11 @@ parameters:
paramtype: vlogparam
default: 0
description: Enable the E ISA extension (reduced register set) [0/1]
BranchTargetALU:
datatype: int
paramtype: vlogparam
default: 0
description: Enables separate branch target ALU (increasing branch performance EXPERIMENTAL) [0/1]
targets:
sim:
@ -38,6 +43,7 @@ targets:
parameters:
- RV32M
- RV32E
- BranchTargetALU
toplevel: ibex_riscv_compliance
tools:
verilator:

View file

@ -15,8 +15,9 @@ module ibex_riscv_compliance (
input IO_RST_N
);
parameter bit RV32E = 0;
parameter bit RV32M = 1;
parameter bit RV32E = 0;
parameter bit RV32M = 1;
parameter bit BranchTargetALU = 0;
logic clk_sys, rst_sys_n;
@ -104,7 +105,8 @@ module ibex_riscv_compliance (
.DmHaltAddr(32'h00000000),
.DmExceptionAddr(32'h00000000),
.RV32E(RV32E),
.RV32M(RV32M)
.RV32M(RV32M),
.BranchTargetALU(BranchTargetALU)
) u_core (
.clk_i (clk_sys),
.rst_ni (rst_sys_n),

View file

@ -32,6 +32,11 @@ parameters:
datatype: str
paramtype: vlogdefine
description: Path to a vmem file to initialize the RAM with
BranchTargetALU:
datatype: int
paramtype: vlogparam
default: 0
description: Enables separate branch target ALU (increasing branch performance EXPERIMENTAL) [0/1]
targets:
sim:
@ -41,6 +46,7 @@ targets:
parameters:
- RV32M
- RV32E
- BranchTargetALU
- SRAM_INIT_FILE
toplevel: ibex_simple_system
tools:

View file

@ -18,8 +18,9 @@ module ibex_simple_system (
input IO_RST_N
);
parameter bit RV32E = 0;
parameter bit RV32M = 1;
parameter bit RV32E = 0;
parameter bit RV32M = 1;
parameter bit BranchTargetALU = 0;
logic clk_sys = 1'b0, rst_sys_n;
@ -137,7 +138,8 @@ module ibex_simple_system (
.DmHaltAddr(32'h00100000),
.DmExceptionAddr(32'h00100000),
.RV32E(RV32E),
.RV32M(RV32M)
.RV32M(RV32M),
.BranchTargetALU(BranchTargetALU)
) u_core (
.clk_i (clk_sys),
.rst_ni (rst_sys_n),

View file

@ -63,6 +63,12 @@ parameters:
description: "Multiplier implementation. Valid values: fast, slow"
default: fast
BranchTargetALU:
datatype: int
paramtype: vlogparam
default: 0
description: "Enables separate branch target ALU (increasing branch performance EXPERIMENTAL) [0/1]"
targets:
default:
filesets:

View file

@ -46,6 +46,12 @@ parameters:
description: "Multiplier implementation. Valid values: fast, slow"
default: fast
BranchTargetALU:
datatype: int
paramtype: vlogparam
default: 0
description: "Enables separate branch target ALU (increasing branch performance EXPERIMENTAL) [0/1]"
targets:

View file

@ -47,12 +47,12 @@ lint_off -msg UNUSED -file "*/rtl/ibex_register_file_fpga.sv" -lines 22
// Signal is not used: clk_i
// leaving clk and reset connected in-case we want to add assertions
lint_off -msg UNUSED -file "*/rtl/ibex_compressed_decoder.sv" -lines 17
lint_off -msg UNUSED -file "*/rtl/ibex_decoder.sv" -lines 24
lint_off -msg UNUSED -file "*/rtl/ibex_decoder.sv" -lines 25
// Signal is not used: rst_ni
// leaving clk and reset connected in-case we want to add assertions
lint_off -msg UNUSED -file "*/rtl/ibex_compressed_decoder.sv" -lines 18
lint_off -msg UNUSED -file "*/rtl/ibex_decoder.sv" -lines 25
lint_off -msg UNUSED -file "*/rtl/ibex_decoder.sv" -lines 26
lint_off -msg UNUSED -file "*/rtl/ibex_register_file_fpga.sv" -lines 20
// Signal unoptimizable: Feedback to clock or circular logic:

View file

@ -18,6 +18,7 @@ module ibex_core #(
parameter int unsigned MHPMCounterWidth = 40,
parameter bit RV32E = 1'b0,
parameter bit RV32M = 1'b1,
parameter bit BranchTargetALU = 1'b0,
parameter MultiplierImplementation = "fast",
parameter bit DbgTriggerEn = 1'b0,
parameter int unsigned DmHaltAddr = 32'h1A110800,
@ -135,6 +136,9 @@ module ibex_core #(
logic [31:0] alu_operand_a_ex;
logic [31:0] alu_operand_b_ex;
jt_mux_sel_e jt_mux_sel_ex;
logic [11:0] bt_operand_imm_ex;
logic [31:0] alu_adder_result_ex; // Used to forward computed address to LSU
logic [31:0] regfile_wdata_ex;
@ -356,8 +360,9 @@ module ibex_core #(
//////////////
ibex_id_stage #(
.RV32E ( RV32E ),
.RV32M ( RV32M )
.RV32E ( RV32E ),
.RV32M ( RV32M ),
.BranchTargetALU ( BranchTargetALU )
) id_stage_i (
.clk_i ( clk ),
.rst_ni ( rst_ni ),
@ -401,6 +406,9 @@ module ibex_core #(
.alu_operand_a_ex_o ( alu_operand_a_ex ),
.alu_operand_b_ex_o ( alu_operand_b_ex ),
.jt_mux_sel_ex_o ( jt_mux_sel_ex ),
.bt_operand_imm_o ( bt_operand_imm_ex ),
.mult_en_ex_o ( mult_en_ex ),
.div_en_ex_o ( div_en_ex ),
.multdiv_operator_ex_o ( multdiv_operator_ex ),
@ -482,6 +490,7 @@ module ibex_core #(
ibex_ex_block #(
.RV32M ( RV32M ),
.BranchTargetALU ( BranchTargetALU ),
.MultiplierImplementation ( MultiplierImplementation )
) ex_block_i (
.clk_i ( clk ),
@ -492,6 +501,11 @@ module ibex_core #(
.alu_operand_a_i ( alu_operand_a_ex ),
.alu_operand_b_i ( alu_operand_b_ex ),
// Branch target ALU signal from ID stage
.jt_mux_sel_i ( jt_mux_sel_ex ),
.bt_operand_imm_i ( bt_operand_imm_ex ),
.pc_id_i ( pc_id ),
// Multiplier/Divider signal from ID stage
.multdiv_operator_i ( multdiv_operator_ex ),
.mult_en_i ( mult_en_ex ),

View file

@ -14,6 +14,7 @@ module ibex_core_tracing #(
parameter int unsigned MHPMCounterWidth = 40,
parameter bit RV32E = 1'b0,
parameter bit RV32M = 1'b1,
parameter bit BranchTargetALU = 1'b0,
parameter MultiplierImplementation = "fast",
parameter bit DbgTriggerEn = 1'b0,
parameter int unsigned DmHaltAddr = 32'h1A110800,
@ -99,6 +100,7 @@ module ibex_core_tracing #(
.MHPMCounterWidth ( MHPMCounterWidth ),
.RV32E ( RV32E ),
.RV32M ( RV32M ),
.BranchTargetALU ( BranchTargetALU ),
.DbgTriggerEn ( DbgTriggerEn ),
.MultiplierImplementation ( MultiplierImplementation ),
.DmHaltAddr ( DmHaltAddr ),

View file

@ -18,8 +18,9 @@
`include "prim_assert.sv"
module ibex_decoder #(
parameter bit RV32E = 0,
parameter bit RV32M = 1
parameter bit RV32E = 0,
parameter bit RV32M = 1,
parameter bit BranchTargetALU = 0
) (
input logic clk_i,
input logic rst_ni,
@ -40,14 +41,15 @@ module ibex_decoder #(
input logic illegal_c_insn_i, // compressed instruction decode failed
// immediates
output ibex_pkg::imm_a_sel_e imm_a_mux_sel_o, // immediate selection for operand a
output ibex_pkg::imm_b_sel_e imm_b_mux_sel_o, // immediate selection for operand b
output logic [31:0] imm_i_type_o,
output logic [31:0] imm_s_type_o,
output logic [31:0] imm_b_type_o,
output logic [31:0] imm_u_type_o,
output logic [31:0] imm_j_type_o,
output logic [31:0] zimm_rs1_type_o,
output ibex_pkg::imm_a_sel_e imm_a_mux_sel_o, // immediate selection for operand a
output ibex_pkg::imm_b_sel_e imm_b_mux_sel_o, // immediate selection for operand b
output ibex_pkg::jt_mux_sel_e jt_mux_sel_o, // jump target selection
output logic [31:0] imm_i_type_o,
output logic [31:0] imm_s_type_o,
output logic [31:0] imm_b_type_o,
output logic [31:0] imm_u_type_o,
output logic [31:0] imm_j_type_o,
output logic [31:0] zimm_rs1_type_o,
// register file
output ibex_pkg::rf_wd_sel_e regfile_wdata_sel_o, // RF write data selection
@ -214,6 +216,8 @@ module ibex_decoder #(
ecall_insn_o = 1'b0;
wfi_insn_o = 1'b0;
jt_mux_sel_o = JT_ALU;
opcode = opcode_e'(instr[6:0]);
unique case (opcode)
@ -224,6 +228,11 @@ module ibex_decoder #(
OPCODE_JAL: begin // Jump and Link
jump_in_dec_o = 1'b1;
if(BranchTargetALU) begin
jt_mux_sel_o = JT_ALU;
end
if (instr_new_i) begin
// Calculate jump target
alu_op_a_mux_sel_o = OP_A_CURRPC;
@ -244,6 +253,11 @@ module ibex_decoder #(
OPCODE_JALR: begin // Jump and Link Register
jump_in_dec_o = 1'b1;
if(BranchTargetALU) begin
jt_mux_sel_o = JT_ALU;
end
if (instr_new_i) begin
// Calculate jump target
alu_op_a_mux_sel_o = OP_A_REG_A;
@ -277,17 +291,28 @@ module ibex_decoder #(
3'b111: alu_operator_o = ALU_GEU;
default: illegal_insn = 1'b1;
endcase
if (instr_new_i) begin
// Evaluate branch condition
if (BranchTargetALU) begin
// With branch target ALU, main ALU evaluates branch condition and branch target ALU
// calculates target (which is controlled in a separate block below)
alu_op_a_mux_sel_o = OP_A_REG_A;
alu_op_b_mux_sel_o = OP_B_REG_B;
end else begin
// Calculate jump target in EX
alu_op_a_mux_sel_o = OP_A_CURRPC;
alu_op_b_mux_sel_o = OP_B_IMM;
imm_b_mux_sel_o = IMM_B_B;
alu_operator_o = ALU_ADD;
regfile_we = 1'b0;
jt_mux_sel_o = JT_BT_ALU;
end else begin
// Without branch target ALU branch is 2 stage operation using the Main ALU in both stages
if (instr_new_i) begin
// First evaluates branch condition
alu_op_a_mux_sel_o = OP_A_REG_A;
alu_op_b_mux_sel_o = OP_B_REG_B;
end else begin
// Then calculate jump target
alu_op_a_mux_sel_o = OP_A_CURRPC;
alu_op_b_mux_sel_o = OP_B_IMM;
imm_b_mux_sel_o = IMM_B_B;
alu_operator_o = ALU_ADD;
regfile_we = 1'b0;
end
end
end

View file

@ -10,31 +10,38 @@
*/
module ibex_ex_block #(
parameter bit RV32M = 1,
parameter bit BranchTargetALU = 0,
parameter MultiplierImplementation = "fast"
) (
input logic clk_i,
input logic rst_ni,
input logic clk_i,
input logic rst_ni,
// ALU
input ibex_pkg::alu_op_e alu_operator_i,
input logic [31:0] alu_operand_a_i,
input logic [31:0] alu_operand_b_i,
input ibex_pkg::alu_op_e alu_operator_i,
input logic [31:0] alu_operand_a_i,
input logic [31:0] alu_operand_b_i,
// Branch Target ALU
// All of these signals are unused when BranchTargetALU == 0
input ibex_pkg::jt_mux_sel_e jt_mux_sel_i,
input logic [11:0] bt_operand_imm_i,
input logic [31:0] pc_id_i,
// Multiplier/Divider
input ibex_pkg::md_op_e multdiv_operator_i,
input logic mult_en_i,
input logic div_en_i,
input logic [1:0] multdiv_signed_mode_i,
input logic [31:0] multdiv_operand_a_i,
input logic [31:0] multdiv_operand_b_i,
input ibex_pkg::md_op_e multdiv_operator_i,
input logic mult_en_i,
input logic div_en_i,
input logic [1:0] multdiv_signed_mode_i,
input logic [31:0] multdiv_operand_a_i,
input logic [31:0] multdiv_operand_b_i,
// Outputs
output logic [31:0] alu_adder_result_ex_o, // to LSU
output logic [31:0] regfile_wdata_ex_o,
output logic [31:0] jump_target_o, // to IF
output logic branch_decision_o, // to ID
output logic [31:0] alu_adder_result_ex_o, // to LSU
output logic [31:0] regfile_wdata_ex_o,
output logic [31:0] jump_target_o, // to IF
output logic branch_decision_o, // to ID
output logic ex_valid_o // EX has valid output
output logic ex_valid_o // EX has valid output
);
import ibex_pkg::*;
@ -64,7 +71,25 @@ module ibex_ex_block #(
// branch handling
assign branch_decision_o = alu_cmp_result;
assign jump_target_o = alu_adder_result_ex_o;
// Jump target selection: choose at elaboration time whether a dedicated branch target
// adder exists alongside the main ALU.
if (BranchTargetALU) begin : g_branch_target_alu
// Declared 33 bits wide; only bits [31:0] are forwarded as the jump target.
logic [32:0] bt_alu_result;
// Branch target = pc_id_i + immediate, where the 12-bit immediate is sign-extended from
// bit 11 and has a constant 0 appended as its least significant bit.
assign bt_alu_result = {{19{bt_operand_imm_i[11]}}, bt_operand_imm_i, 1'b0} + pc_id_i;
// JT_ALU selects the main ALU adder result, any other selection takes the branch target
// adder result.
assign jump_target_o = (jt_mux_sel_i == JT_ALU) ? alu_adder_result_ex_o : bt_alu_result[31:0];
end else begin : g_no_branch_target_alu
// Unused jt_mux_sel_i/bt_operand_imm_i/pc_id_i signals cause lint errors; assigning them
// to dummy "_unused" signals avoids this.
ibex_pkg::jt_mux_sel_e jt_mux_sel_unused;
logic [11:0] bt_operand_imm_unused;
logic [31:0] pc_id_unused;
assign jt_mux_sel_unused = jt_mux_sel_i;
assign bt_operand_imm_unused = bt_operand_imm_i;
assign pc_id_unused = pc_id_i;
// Without the branch target adder the jump target always comes from the main ALU adder.
assign jump_target_o = alu_adder_result_ex_o;
end
/////////
// ALU //

View file

@ -17,8 +17,9 @@
`include "prim_assert.sv"
module ibex_id_stage #(
parameter bit RV32E = 0,
parameter bit RV32M = 1
parameter bit RV32E = 0,
parameter bit RV32M = 1,
parameter bit BranchTargetALU = 0
) (
input logic clk_i,
input logic rst_ni,
@ -61,6 +62,10 @@ module ibex_id_stage #(
output logic [31:0] alu_operand_a_ex_o,
output logic [31:0] alu_operand_b_ex_o,
// Branch target ALU
output ibex_pkg::jt_mux_sel_e jt_mux_sel_ex_o,
output logic [11:0] bt_operand_imm_o,
// MUL, DIV
output logic mult_en_ex_o,
output logic div_en_ex_o,
@ -149,7 +154,7 @@ module ibex_id_stage #(
logic wfi_insn_dec;
logic branch_in_dec;
logic branch_set_n, branch_set_q;
logic branch_set, branch_set_n;
logic jump_in_dec;
logic jump_set;
@ -317,8 +322,9 @@ module ibex_id_stage #(
/////////////
ibex_decoder #(
.RV32E ( RV32E ),
.RV32M ( RV32M )
.RV32E ( RV32E ),
.RV32M ( RV32M ),
.BranchTargetALU ( BranchTargetALU )
) decoder_i (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
@ -340,6 +346,7 @@ module ibex_id_stage #(
// immediates
.imm_a_mux_sel_o ( imm_a_mux_sel ),
.imm_b_mux_sel_o ( imm_b_mux_sel_dec ),
.jt_mux_sel_o ( jt_mux_sel_ex_o ),
.imm_i_type_o ( imm_i_type ),
.imm_s_type_o ( imm_s_type ),
@ -430,7 +437,7 @@ module ibex_id_stage #(
.store_err_i ( lsu_store_err_i ),
// jump/branch control
.branch_set_i ( branch_set_q ),
.branch_set_i ( branch_set ),
.jump_set_i ( jump_set ),
// interrupt signals
@ -506,6 +513,14 @@ module ibex_id_stage #(
assign alu_operand_a_ex_o = alu_operand_a;
assign alu_operand_b_ex_o = alu_operand_b;
// Immediate operand driven out on bt_operand_imm_o (12 bits).
if (BranchTargetALU) begin : g_bt_operand_imm
// Branch target ALU sign-extends and inserts bottom 0 bit so only want the
// 'raw' B-type immediate bits.
assign bt_operand_imm_o = imm_b_type[12:1];
end else begin : g_no_bt_operand_imm
// No branch target ALU: drive the output to all zeros.
assign bt_operand_imm_o = '0;
end
assign mult_en_ex_o = mult_en_id;
assign div_en_ex_o = div_en_id;
@ -521,14 +536,32 @@ module ibex_id_stage #(
// ID-EX/WB Pipeline Register //
////////////////////////////////
// Generation of branch_set from branch_set_n: either passed through combinationally or
// delayed by one clock cycle, depending on BranchTargetALU.
if (BranchTargetALU) begin : g_branch_set_direct
// Branch set fed straight to controller with branch target ALU
// (condition pass/fail used same cycle as generated instruction request)
assign branch_set = branch_set_n;
end else begin : g_branch_set_flopped
// Branch set flopped without branch target ALU
// (condition pass/fail used next cycle where branch target is calculated)
logic branch_set_q;
// Register with asynchronous active-low reset; cleared to 0 while rst_ni is low.
always_ff @(posedge clk_i or negedge rst_ni) begin
if (!rst_ni) begin
branch_set_q <= 1'b0;
end else begin
branch_set_q <= branch_set_n;
end
end
assign branch_set = branch_set_q;
end
always_ff @(posedge clk_i or negedge rst_ni) begin : id_wb_pipeline_reg
if (!rst_ni) begin
id_wb_fsm_cs <= IDLE;
branch_set_q <= 1'b0;
instr_multicycle_done_q <= 1'b0;
end else begin
id_wb_fsm_cs <= id_wb_fsm_ns;
branch_set_q <= branch_set_n;
instr_multicycle_done_q <= instr_multicycle_done_n;
end
end

View file

@ -132,6 +132,12 @@ typedef enum logic [2:0] {
IMM_B_INCR_ADDR
} imm_b_sel_e;
// Jump target source selection (single-bit enum).
// Only used when BranchTargetALU == 1
typedef enum logic {
JT_ALU, // Jump target from main ALU
JT_BT_ALU // Jump target from specialised branch ALU
} jt_mux_sel_e;
// Regfile write data selection
typedef enum logic [1:0] {
RF_WD_LSU,

View file

@ -13,6 +13,7 @@ set_flow_var config_file "${lr_synth_top_module}_lr_synth_conf.tcl" "Synth confi
set_flow_var rpt_out "./${lr_synth_out_dir}/reports" "Report output directory"
set_flow_bool_var flatten 1 "flatten"
set_flow_bool_var timing_run 0 "timing run"
set_flow_bool_var ibex_branch_target_alu 0 "Enable branch target ALU in Ibex"
source $lr_synth_config_file

View file

@ -14,7 +14,12 @@ if { $lr_synth_timing_run } {
write_sdc_out $lr_synth_sdc_file_in $lr_synth_sdc_file_out
}
yosys "read -sv ./rtl/prim_clock_gating.v $lr_synth_out_dir/generated/*.v"
yosys "read_verilog -sv ./rtl/prim_clock_gating.v $lr_synth_out_dir/generated/*.v"
if { $lr_synth_ibex_branch_target_alu } {
yosys "chparam -set BranchTargetALU 1 ibex_core"
}
yosys "synth $flatten_opt -top $lr_synth_top_module"
yosys "opt -purge"