diff --git a/controller.sv b/controller.sv index 2444e283..b90fca5f 100644 --- a/controller.sv +++ b/controller.sv @@ -66,9 +66,6 @@ module riscv_controller input logic data_req_ex_i, // data memory access is currently performed in EX stage input logic data_misaligned_i, - // hwloop signals - input logic hwloop_jump_i, // modify pc_mux to select the hwloop addr - // jump/branch signals input logic branch_taken_ex_i, // branch taken signal from EX ALU input logic [1:0] jump_in_id_i, // jump is being calculated in ALU @@ -78,7 +75,6 @@ module riscv_controller input logic exc_req_i, output logic exc_ack_o, - // TODO input logic trap_hit_i, // a trap was hit, so we have to flush EX and WB output logic save_pc_if_o, @@ -125,7 +121,6 @@ module riscv_controller // FSM state encoding enum logic [3:0] { RESET, BOOT_SET, SLEEP, FIRST_FETCH, DECODE, - JUMP_EXC, FLUSH_EX, FLUSH_WB, DBG_WAIT_BRANCH, DBG_SIGNAL, DBG_WAIT } ctrl_fsm_cs, ctrl_fsm_ns; @@ -228,13 +223,6 @@ module riscv_controller ctrl_fsm_ns = DECODE; end - // hwloop detected, jump to start address! 
- // Attention: This has to be done in the DECODE and the FIRST_FETCH states - if (hwloop_jump_i == 1'b1) begin - pc_mux_o = `PC_HWLOOP; - pc_set_o = 1'b1; - end - // handle exceptions if (exc_req_i) begin pc_mux_o = `PC_EXCEPTION; @@ -251,93 +239,74 @@ module riscv_controller begin is_decoding_o = 1'b0; - // TODO: integrate this with the next loop, rename branch_decision - // into branch_taken and remove the jump_in_ex signal completely, - // there is no need to propagate it into the controller - if (instr_valid_i) begin + // decode and execute instructions only if the current conditional + // branch in the EX stage is either not taken, or there is no + // conditional branch in the EX stage + if (instr_valid_i && (~branch_taken_ex_i)) + begin // now analyze the current instruction in the ID stage + is_decoding_o = 1'b1; - // decode and execute instructions only if the current conditional - // branch in the EX stage is either not taken, or there is no - // conditional branch in the EX stage - if (~branch_taken_ex_i) - begin // now analyze the current instruction in the ID stage - is_decoding_o = 1'b1; + // handle unconditional jumps + // we can jump directly since we know the address already + // we don't need to worry about conditional branches here as they + // will be evaluated in the EX stage + if (jump_in_dec_i == `BRANCH_JALR || jump_in_dec_i == `BRANCH_JAL) begin + pc_mux_o = `PC_JUMP; - // handle unconditional jumps - // we can jump directly since we know the address already - // we don't need to worry about conditional branches here as they - // will be evaluated in the EX stage - if (jump_in_dec_i == `BRANCH_JALR || jump_in_dec_i == `BRANCH_JAL) begin - pc_mux_o = `PC_JUMP; + // if there is a jr stall, wait for it to be gone + if (~jr_stall_o) + pc_set_o = 1'b1; - // if there is a jr stall, wait for it to be gone - if (~jr_stall_o) - pc_set_o = 1'b1; + // we don't have to change our current state here as the prefetch + // buffer is automatically 
invalidated, thus the next instruction + // that is served to the ID stage is the one of the jump target + end else begin + // handle exceptions + if (exc_req_i) begin + pc_mux_o = `PC_EXCEPTION; + pc_set_o = 1'b1; + exc_ack_o = 1'b1; + + halt_id_o = 1'b1; // we don't want to propagate this instruction to EX + save_pc_id_o = 1'b1; // we don't have to change our current state here as the prefetch // buffer is automatically invalidated, thus the next instruction - // that is served to the ID stage is the one of the jump target + // that is served to the ID stage is the one of the jump to the + // exception handler end + end - // handle hwloops - if (hwloop_jump_i) begin - pc_mux_o = `PC_HWLOOP; - pc_set_o = 1'b1; - end + if (eret_insn_i) begin + pc_mux_o = `PC_ERET; + pc_set_o = 1'b1; + end - if (eret_insn_i) begin - pc_mux_o = `PC_ERET; - pc_set_o = 1'b1; - end + // handle WFI instruction, flush pipeline and (potentially) go to + // sleep + // also handles eret when the core should go back to sleep + if (pipe_flush_i || (eret_insn_i && (~fetch_enable_i))) + begin + halt_if_o = 1'b1; + halt_id_o = 1'b1; - // handle WFI instruction, flush pipeline and (potentially) go to - // sleep - // also handles eret when the core should go back to sleep - if (pipe_flush_i || (eret_insn_i && (~fetch_enable_i))) - begin - halt_if_o = 1'b1; - halt_id_o = 1'b1; + ctrl_fsm_ns = FLUSH_EX; + end - ctrl_fsm_ns = FLUSH_EX; - end + // take care of debug + // branch conditional will be handled in next state + if (trap_hit_i) + begin + // halt pipeline immediately + halt_if_o = 1'b1; - // handle exceptions - if (exc_req_i) begin - // to not loose the hwloop, we to into a special state where we - // save the new PC - if (hwloop_jump_i) - begin - ctrl_fsm_ns = JUMP_EXC; - end else begin - pc_mux_o = `PC_EXCEPTION; - pc_set_o = 1'b1; - exc_ack_o = 1'b1; - - halt_id_o = 1'b1; // we don't want to propagate this instruction to EX - save_pc_id_o = 1'b1; - - // we don't have to change our current 
state here as the prefetch - // buffer is automatically invalidated, thus the next instruction - // that is served to the ID stage is the one of the jump to the - // exception handler - end - end - - // take care of debug - // branch conditional will be handled in next state - if (trap_hit_i) - begin - // halt pipeline immediately - halt_if_o = 1'b1; - - // make sure the current instruction has been executed - // before changing state to non-decode - if (id_valid_i) begin - if (jump_in_id_i == `BRANCH_COND) - ctrl_fsm_ns = DBG_WAIT_BRANCH; - else - ctrl_fsm_ns = DBG_SIGNAL; - end + // make sure the current instruction has been executed + // before changing state to non-decode + if (id_valid_i) begin + if (jump_in_id_i == `BRANCH_COND) + ctrl_fsm_ns = DBG_WAIT_BRANCH; + else + ctrl_fsm_ns = DBG_SIGNAL; end end end @@ -430,19 +399,6 @@ module riscv_controller end end - // go to an exception handler after a jump - JUMP_EXC: - begin - // we can just save the IF PC, since it propagated through the - // prefetcher - save_pc_if_o = 1'b1; - pc_mux_o = `PC_EXCEPTION; - pc_set_o = 1'b1; - exc_ack_o = 1'b1; - - ctrl_fsm_ns = DECODE; - end - default: begin instr_req_o = 1'b0; ctrl_fsm_ns = RESET; @@ -572,6 +528,7 @@ module riscv_controller assign perf_jr_stall_o = jr_stall_o; assign perf_ld_stall_o = load_stall_o; + //---------------------------------------------------------------------------- // Assertions //---------------------------------------------------------------------------- @@ -579,6 +536,6 @@ module riscv_controller // make sure that taken branches do not happen back-to-back, as this is not // possible without branch prediction in the IF stage assert property ( - @(posedge clk) (branch_taken_ex_i) |=> (~branch_taken_ex_i) ); + @(posedge clk) (branch_taken_ex_i) |=> (~branch_taken_ex_i) ) else $warning("Two branches back-to-back are taken"); endmodule // controller diff --git a/decoder.sv b/decoder.sv index 1ffd221c..b2eb1b50 100644 --- a/decoder.sv +++ b/decoder.sv 
@@ -614,13 +614,12 @@ module riscv_decoder end 3'b101: begin - // lp.setupi: initialize counter from rs1, set start address to + // lp.setupi: initialize counter from immediate, set start address to // next instruction and end address to PC + I-type immediate hwloop_we = 3'b111; hwloop_target_mux_sel_o = 1'b1; hwloop_start_mux_sel_o = 1'b1; - hwloop_cnt_mux_sel_o = 1'b1; - rega_used_o = 1'b1; + hwloop_cnt_mux_sel_o = 1'b0; end default: begin diff --git a/hwloop_controller.sv b/hwloop_controller.sv index 6c295252..27de287a 100644 --- a/hwloop_controller.sv +++ b/hwloop_controller.sv @@ -44,6 +44,9 @@ module riscv_hwloop_controller // to hwloop_regs output logic [N_REGS-1:0] hwlp_dec_cnt_o, + // from pipeline stages + input logic [N_REGS-1:0] hwlp_dec_cnt_id_i, + // to id stage output logic hwlp_jump_o, output logic [31:0] hwlp_targ_addr_o @@ -58,14 +61,27 @@ module riscv_hwloop_controller // generate comparators. check for end address and the loop counter genvar i; - for (i = 0; i < N_REGS; i++) begin - assign pc_is_end_addr[i] = (current_pc_i == hwlp_end_addr_i[i]) && - (hwlp_counter_i[i] > 32'h1); - end - - // output signal for ID stage - assign hwlp_jump_o = (|pc_is_end_addr); + generate + for (i = 0; i < N_REGS; i++) begin + always @(*) + begin + pc_is_end_addr[i] = 1'b0; + if (current_pc_i == hwlp_end_addr_i[i]) begin + if (hwlp_counter_i[i][31:2] != 30'h0) begin + pc_is_end_addr[i] = 1'b1; + end else begin + // hwlp_counter_i[i][31:2] == 30'h0 + case (hwlp_counter_i[i][1:0]) + 2'b11: pc_is_end_addr[i] = 1'b1; + 2'b10: pc_is_end_addr[i] = ~hwlp_dec_cnt_id_i[i]; // only when there is nothing in flight + 2'b01, 2'b00: pc_is_end_addr[i] = 1'b0; + endcase + end + end + end + end + endgenerate // select corresponding start address and decrement counter always_comb @@ -82,4 +98,7 @@ module riscv_hwloop_controller end end + // output signal for ID stage + assign hwlp_jump_o = (|pc_is_end_addr); + endmodule diff --git a/hwloop_regs.sv b/hwloop_regs.sv index 
bba508b7..16a7cce9 100644 --- a/hwloop_regs.sv +++ b/hwloop_regs.sv @@ -117,12 +117,11 @@ module riscv_hwloop_regs end else begin - if (hwlp_we_i[2] == 1'b1) // potential contention problem here! + for (i = 0; i < N_REGS; i++) begin - hwlp_counter_q[hwlp_regid_i] <= hwlp_cnt_data_i; - end else begin - for (i = 0; i < N_REGS; i++) - begin + if ((hwlp_we_i[2] == 1'b1) && (i == hwlp_regid_i)) begin + hwlp_counter_q[i] <= hwlp_cnt_data_i; + end else begin if (hwlp_dec_cnt_i[i] && valid_i) hwlp_counter_q[i] <= hwlp_counter_n[i]; end diff --git a/id_stage.sv b/id_stage.sv index e8ff50da..08f18322 100644 --- a/id_stage.sv +++ b/id_stage.sv @@ -35,8 +35,8 @@ module riscv_id_stage #( - parameter N_HWLP_REGS = 2, - parameter N_HWLP_REG_BITS = $clog2(N_HWLP_REGS) + parameter N_HWLP = 2, + parameter N_HWLP_BITS = $clog2(N_HWLP) ) ( input logic clk, @@ -49,9 +49,11 @@ module riscv_id_stage output logic is_decoding_o, // Interface to IF stage - input logic instr_valid_i, - input logic [31:0] instr_rdata_i, // comes from pipeline of IF stage - output logic instr_req_o, + input logic [N_HWLP-1:0] hwlp_dec_cnt_i, + input logic is_hwlp_i, + input logic instr_valid_i, + input logic [31:0] instr_rdata_i, // comes from pipeline of IF stage + output logic instr_req_o, // Jumps and branches @@ -113,7 +115,9 @@ module riscv_id_stage output logic [1:0] csr_op_ex_o, // hwloop signals - output logic [31:0] hwloop_targ_addr_o, + output logic [N_HWLP-1:0] [31:0] hwlp_start_o, + output logic [N_HWLP-1:0] [31:0] hwlp_end_o, + output logic [N_HWLP-1:0] [31:0] hwlp_cnt_o, // Interface to load store unit output logic data_req_ex_o, @@ -196,6 +200,7 @@ module riscv_id_stage // Immediate decoding and sign extension logic [31:0] imm_i_type; + logic [31:0] imm_iz_type; logic [31:0] imm_s_type; logic [31:0] imm_sb_type; logic [31:0] imm_u_type; @@ -255,23 +260,16 @@ module riscv_id_stage logic data_req_id; // hwloop signals - logic [N_HWLP_REG_BITS-1:0] hwloop_regid; - logic [2:0] hwloop_we; - logic 
hwloop_jump; - logic hwloop_target_mux_sel; - logic hwloop_start_mux_sel; - logic hwloop_cnt_mux_sel; + logic [N_HWLP_BITS-1:0] hwloop_regid; + logic [2:0] hwloop_we; + logic hwloop_target_mux_sel; + logic hwloop_start_mux_sel; + logic hwloop_cnt_mux_sel; - logic [31:0] hwloop_target; - logic [31:0] hwloop_start; - logic [31:0] hwloop_end; - logic [31:0] hwloop_cnt; - - // hwloop reg signals - logic [N_HWLP_REGS-1:0] hwloop_dec_cnt; - logic [N_HWLP_REGS-1:0] [31:0] hwloop_start_addr; - logic [N_HWLP_REGS-1:0] [31:0] hwloop_end_addr; - logic [N_HWLP_REGS-1:0] [31:0] hwloop_counter; + logic [31:0] hwloop_target; + logic [31:0] hwloop_start; + logic [31:0] hwloop_end; + logic [31:0] hwloop_cnt; // CSR control logic csr_access; @@ -296,6 +294,7 @@ module riscv_id_stage // immediate extraction and sign extension assign imm_i_type = { {20 {instr[31]}}, instr[31:20] }; + assign imm_iz_type = { 20'b0, instr[31:20] }; assign imm_s_type = { {20 {instr[31]}}, instr[31:25], instr[11:7] }; assign imm_sb_type = { {19 {instr[31]}}, instr[31], instr[7], instr[30:25], instr[11:8], 1'b0 }; assign imm_u_type = { instr[31:12], 12'b0 }; @@ -320,7 +319,7 @@ module riscv_id_stage // kill instruction in the IF/ID stage by setting the instr_valid_id control // signal to 0 for instructions that are done - assign clear_instr_valid_o = id_ready_o; + assign clear_instr_valid_o = id_ready_o | halt_id; assign branch_taken_ex = branch_in_ex_o & branch_decision_i; @@ -341,7 +340,7 @@ module riscv_id_stage always_comb begin unique case (hwloop_target_mux_sel) - 1'b0: hwloop_target = current_pc_id_i + imm_i_type; + 1'b0: hwloop_target = current_pc_id_i + {imm_iz_type[30:0], 1'b0}; 1'b1: hwloop_target = current_pc_id_i + {imm_z_type[30:0], 1'b0}; endcase end @@ -362,7 +361,7 @@ module riscv_id_stage always_comb begin : hwloop_cnt_mux unique case (hwloop_cnt_mux_sel) - 1'b0: hwloop_cnt = imm_i_type; + 1'b0: hwloop_cnt = imm_iz_type; 1'b1: hwloop_cnt = operand_a_fw_id; endcase; end @@ -658,9 +657,6 @@ 
module riscv_id_stage .data_req_ex_i ( data_req_ex_o ), .data_misaligned_i ( data_misaligned_i ), - // hwloop signals - .hwloop_jump_i ( hwloop_jump ), - // jump/branch control .branch_taken_ex_i ( branch_taken_ex ), .jump_in_id_i ( jump_in_id ), @@ -769,54 +765,32 @@ module riscv_id_stage // // ////////////////////////////////////////////////////////////////////////// - riscv_hwloop_controller - #( - .N_REGS ( N_HWLP_REGS ) - ) - hwloop_controller_i - ( - // from ID stage - .current_pc_i ( current_pc_if_i ), - - // to IF stage/controller - .hwlp_jump_o ( hwloop_jump ), - .hwlp_targ_addr_o ( hwloop_targ_addr_o ), - - // from hwloop_regs - .hwlp_start_addr_i ( hwloop_start_addr ), - .hwlp_end_addr_i ( hwloop_end_addr ), - .hwlp_counter_i ( hwloop_counter ), - - // to hwloop_regs - .hwlp_dec_cnt_o ( hwloop_dec_cnt ) - ); - riscv_hwloop_regs #( - .N_REGS ( N_HWLP_REGS ) + .N_REGS ( N_HWLP ) ) hwloop_regs_i ( - .clk ( clk ), - .rst_n ( rst_n ), + .clk ( clk ), + .rst_n ( rst_n ), // from ID - .hwlp_start_data_i ( hwloop_start ), - .hwlp_end_data_i ( hwloop_end ), - .hwlp_cnt_data_i ( hwloop_cnt ), - .hwlp_we_i ( hwloop_we ), - .hwlp_regid_i ( hwloop_regid ), + .hwlp_start_data_i ( hwloop_start ), + .hwlp_end_data_i ( hwloop_end ), + .hwlp_cnt_data_i ( hwloop_cnt ), + .hwlp_we_i ( hwloop_we ), + .hwlp_regid_i ( hwloop_regid ), // from controller - .valid_i ( instr_valid_i ), + .valid_i ( instr_valid_i & is_hwlp_i ), // to hwloop controller - .hwlp_start_addr_o ( hwloop_start_addr ), - .hwlp_end_addr_o ( hwloop_end_addr ), - .hwlp_counter_o ( hwloop_counter ), + .hwlp_start_addr_o ( hwlp_start_o ), + .hwlp_end_addr_o ( hwlp_end_o ), + .hwlp_counter_o ( hwlp_cnt_o ), // from hwloop controller - .hwlp_dec_cnt_i ( hwloop_dec_cnt ) + .hwlp_dec_cnt_i ( hwlp_dec_cnt_i ) ); @@ -956,6 +930,7 @@ module riscv_id_stage assign id_ready_o = ((~misaligned_stall) & (~jr_stall) & (~load_stall) & ex_ready_i); assign id_valid_o = (~halt_id) & id_ready_o; + 
//---------------------------------------------------------------------------- // Assertions //---------------------------------------------------------------------------- @@ -964,4 +939,8 @@ module riscv_id_stage assert property ( @(posedge clk) (branch_in_ex_o) |-> (branch_decision_i !== 1'bx) ); + // a valid, legal instruction delivered to the ID stage must not contain unknown (X/Z) bits + assert property ( + @(posedge clk) (instr_valid_i & (~illegal_c_insn_i)) |-> (!$isunknown(instr_rdata_i)) ); + endmodule diff --git a/if_stage.sv b/if_stage.sv index 31bf7981..8c67420d 100644 --- a/if_stage.sv +++ b/if_stage.sv @@ -35,6 +35,7 @@ module riscv_if_stage #( + parameter N_HWLP = 2, parameter RDATA_WIDTH = 32 ) ( @@ -55,12 +56,14 @@ module riscv_if_stage input logic [RDATA_WIDTH-1:0] instr_rdata_i, // Output of IF Pipeline stage - output logic instr_valid_id_o, // instruction in IF/ID pipeline is valid - output logic [31:0] instr_rdata_id_o, // read instruction is sampled and sent to ID stage for decoding - output logic is_compressed_id_o, // compressed decoder thinks this is a compressed instruction - output logic illegal_c_insn_id_o, // compressed decoder thinks this is an invalid instruction - output logic [31:0] current_pc_if_o, - output logic [31:0] current_pc_id_o, + output logic [N_HWLP-1:0] hwlp_dec_cnt_id_o, // hwlp counter decrement flags registered for the currently served instruction + output logic is_hwlp_id_o, // currently served instruction was the target of a hwlp + output logic instr_valid_id_o, // instruction in IF/ID pipeline is valid + output logic [31:0] instr_rdata_id_o, // read instruction is sampled and sent to ID stage for decoding + output logic is_compressed_id_o, // compressed decoder thinks this is a compressed instruction + output logic illegal_c_insn_id_o, // compressed decoder thinks this is an invalid instruction + output logic [31:0] current_pc_if_o, + output logic [31:0] current_pc_id_o, // Forwarding ports - control signals input logic clear_instr_valid_i, // clear instruction 
valid bit in IF/ID pipe @@ -75,7 +78,9 @@ module riscv_if_stage input logic [31:0] jump_target_ex_i, // jump target address // from hwloop controller - input logic [31:0] hwloop_target_i, // pc from hwloop start addr + input logic [N_HWLP-1:0] [31:0] hwlp_start_i, // hardware loop start addresses + input logic [N_HWLP-1:0] [31:0] hwlp_end_i, // hardware loop end addresses + input logic [N_HWLP-1:0] [31:0] hwlp_cnt_i, // hardware loop counters // from debug unit input logic [31:0] dbg_npc_i, @@ -93,46 +98,27 @@ module riscv_if_stage ); // offset FSM - enum logic[1:0] {WAIT_ALIGNED, WAIT_UNALIGNED, IDLE } offset_fsm_cs, offset_fsm_ns; + enum logic[0:0] {WAIT, IDLE } offset_fsm_cs, offset_fsm_ns; - logic [1:0] is_compressed; - logic unaligned; - logic unaligned_jump; - - logic valid; + logic valid; // prefetch buffer related signals - logic prefetch_busy; - logic branch_req; - logic [31:0] fetch_addr_n; + logic prefetch_busy; + logic branch_req; + logic [31:0] fetch_addr_n; - logic fetch_valid; - logic fetch_ready; - logic [31:0] fetch_rdata; - logic [31:0] fetch_addr; + logic fetch_valid; + logic fetch_ready; + logic [31:0] fetch_rdata; + logic [31:0] fetch_addr; + logic is_hwlp_id_q, fetch_is_hwlp; + logic [31:0] exc_pc; - logic [31:0] instr_rdata_int; - - logic [31:0] exc_pc; - - - // output data and PC mux - always_comb - begin - // default values for regular aligned access - current_pc_if_o = {fetch_addr[31:2], 2'b00}; - instr_rdata_int = fetch_rdata; - - if (unaligned) begin - current_pc_if_o = {fetch_addr[31:2], 2'b10}; - end - end - - - // compressed instruction detection - assign is_compressed[0] = (fetch_rdata[1:0] != 2'b11); - assign is_compressed[1] = (fetch_rdata[17:16] != 2'b11); + // hardware loop related signals + logic hwlp_jump; + logic [31:0] hwlp_target; + logic [N_HWLP-1:0] hwlp_dec_cnt, hwlp_dec_cnt_if; // exception PC selection mux @@ -166,7 +152,6 @@ module riscv_if_stage `PC_BRANCH: fetch_addr_n = jump_target_ex_i; `PC_EXCEPTION: fetch_addr_n 
= exc_pc; // set PC to exception handler `PC_ERET: fetch_addr_n = exception_pc_reg_i; // PC is restored when returning from IRQ/exception - `PC_HWLOOP: fetch_addr_n = hwloop_target_i; // PC is taken from hwloop start addr `PC_DBG_NPC: fetch_addr_n = dbg_npc_i; // PC is taken from debug unit default: begin @@ -177,8 +162,6 @@ module riscv_if_stage endcase end - assign unaligned_jump = fetch_addr_n[1]; - generate if (RDATA_WIDTH == 32) begin : prefetch_32 // prefetch buffer, caches a fixed number of instructions @@ -188,14 +171,18 @@ module riscv_if_stage .rst_n ( rst_n ), .req_i ( 1'b1 ), - .branch_i ( branch_req ), - .addr_i ( {fetch_addr_n[31:2], 2'b00} ), - .unaligned_i ( unaligned ), // is the current address unaligned? + .branch_i ( branch_req ), + .addr_i ( fetch_addr_n ), + + .hwloop_i ( hwlp_jump ), + .hwloop_target_i ( hwlp_target ), + .ready_i ( fetch_ready ), .valid_o ( fetch_valid ), .rdata_o ( fetch_rdata ), .addr_o ( fetch_addr ), + .is_hwlp_o ( fetch_is_hwlp ), // goes to instruction memory / instruction cache .instr_req_o ( instr_req_o ), @@ -215,14 +202,18 @@ module riscv_if_stage .rst_n ( rst_n ), .req_i ( 1'b1 ), - .branch_i ( branch_req ), - .addr_i ( {fetch_addr_n[31:2], 2'b00} ), - .unaligned_i ( unaligned ), // is the current address unaligned? 
+ .branch_i ( branch_req ), + .addr_i ( fetch_addr_n ), + + .hwloop_i ( hwlp_jump ), + .hwloop_target_i ( hwlp_target ), + .ready_i ( fetch_ready ), .valid_o ( fetch_valid ), .rdata_o ( fetch_rdata ), .addr_o ( fetch_addr ), + .is_hwlp_o ( fetch_is_hwlp ), // goes to instruction memory / instruction cache .instr_req_o ( instr_req_o ), @@ -257,55 +248,24 @@ module riscv_if_stage branch_req = 1'b0; valid = 1'b0; - unaligned = 1'b0; - unique case (offset_fsm_cs) // no valid instruction data for ID stage // assume aligned IDLE: begin if (req_i) begin branch_req = 1'b1; - offset_fsm_ns = WAIT_ALIGNED; + offset_fsm_ns = WAIT; end end // serving aligned 32 bit or 16 bit instruction, we don't know yet - WAIT_ALIGNED: begin + WAIT: begin if (fetch_valid) begin valid = 1'b1; // an instruction is ready for ID stage if (req_i && if_valid_o) begin - - if (~is_compressed[0]) begin - // 32 bit aligned instruction found - fetch_ready = 1'b1; - offset_fsm_ns = WAIT_ALIGNED; - end else begin - // 16 bit aligned instruction found - // next instruction will be unaligned - offset_fsm_ns = WAIT_UNALIGNED; - end - end - end - end - - // serving unaligned 32 bit instruction - // next instruction might be 16 bit unaligned (no need to fetch) - // or 32 bit unaligned (need to fetch another word from cache) - WAIT_UNALIGNED: begin - unaligned = 1'b1; - - if (fetch_valid) begin - valid = 1'b1; // an instruction is ready for ID stage - - if (req_i && if_valid_o) begin - // next instruction will be aligned fetch_ready = 1'b1; - - if (is_compressed[0]) - offset_fsm_ns = WAIT_ALIGNED; - else - offset_fsm_ns = WAIT_UNALIGNED; + offset_fsm_ns = WAIT; end end end @@ -322,17 +282,38 @@ module riscv_if_stage // switch to new PC from ID stage branch_req = 1'b1; - if (unaligned_jump) - offset_fsm_ns = WAIT_UNALIGNED; - else - offset_fsm_ns = WAIT_ALIGNED; + offset_fsm_ns = WAIT; end end + // Hardware Loops + riscv_hwloop_controller + #( + .N_REGS ( N_HWLP ) + ) + hwloop_controller_i + ( + .current_pc_i ( 
fetch_addr ), - assign if_busy_o = prefetch_busy; + .hwlp_jump_o ( hwlp_jump ), + .hwlp_targ_addr_o ( hwlp_target ), - assign perf_imiss_o = (~fetch_valid) | branch_req; + // from hwloop_regs + .hwlp_start_addr_i ( hwlp_start_i ), + .hwlp_end_addr_i ( hwlp_end_i ), + .hwlp_counter_i ( hwlp_cnt_i ), + + // to hwloop_regs + .hwlp_dec_cnt_o ( hwlp_dec_cnt ), + .hwlp_dec_cnt_id_i ( hwlp_dec_cnt_id_o & {N_HWLP{is_hwlp_id_o}} ) + ); + + + assign current_pc_if_o = fetch_addr; + + assign if_busy_o = prefetch_busy; + + assign perf_imiss_o = (~fetch_valid) | branch_req; // compressed instruction decoding, or more precisely compressed instruction @@ -346,12 +327,25 @@ module riscv_if_stage riscv_compressed_decoder compressed_decoder_i ( - .instr_i ( instr_rdata_int ), + .instr_i ( fetch_rdata ), .instr_o ( instr_decompressed ), .is_compressed_o ( instr_compressed_int ), .illegal_instr_o ( illegal_c_insn ) ); + // prefetch -> IF registers + always_ff @(posedge clk, negedge rst_n) + begin + if (rst_n == 1'b0) + begin + hwlp_dec_cnt_if <= '0; + end + else + begin + if (hwlp_jump) + hwlp_dec_cnt_if <= hwlp_dec_cnt; + end + end // IF-ID pipeline registers, frozen when the ID stage is stalled always_ff @(posedge clk, negedge rst_n) @@ -363,6 +357,8 @@ module riscv_if_stage illegal_c_insn_id_o <= 1'b0; is_compressed_id_o <= 1'b0; current_pc_id_o <= '0; + is_hwlp_id_q <= 1'b0; + hwlp_dec_cnt_id_o <= '0; end else begin @@ -376,10 +372,16 @@ module riscv_if_stage illegal_c_insn_id_o <= illegal_c_insn; is_compressed_id_o <= instr_compressed_int; current_pc_id_o <= current_pc_if_o; + is_hwlp_id_q <= fetch_is_hwlp; + + if (fetch_is_hwlp) + hwlp_dec_cnt_id_o <= hwlp_dec_cnt_if; end end end + assign is_hwlp_id_o = is_hwlp_id_q & instr_valid_id_o; + assign if_ready_o = valid & id_ready_i; assign if_valid_o = (~halt_if_i) & if_ready_o; diff --git a/include/defines.sv b/include/defines.sv index b434dafb..91d2e169 100644 --- a/include/defines.sv +++ b/include/defines.sv @@ -99,6 +99,10 @@ 
`define INSTR_SRA { 7'b0100000, 10'b?, 3'b101, 5'b?, `OPCODE_OP } `define INSTR_OR { 7'b0000000, 10'b?, 3'b110, 5'b?, `OPCODE_OP } `define INSTR_AND { 7'b0000000, 10'b?, 3'b111, 5'b?, `OPCODE_OP } +`define INSTR_EXTHS { 7'b0001000, 10'b?, 3'b100, 5'b?, `OPCODE_OP } // pulp specific +`define INSTR_EXTHZ { 7'b0001000, 10'b?, 3'b101, 5'b?, `OPCODE_OP } // pulp specific +`define INSTR_EXTBS { 7'b0001000, 10'b?, 3'b110, 5'b?, `OPCODE_OP } // pulp specific +`define INSTR_EXTBZ { 7'b0001000, 10'b?, 3'b111, 5'b?, `OPCODE_OP } // pulp specific // FENCE `define INSTR_FENCE { 4'b0, 8'b?, 13'b0, `OPCODE_FENCE } `define INSTR_FENCEI { 17'b0, 3'b001, 5'b0, `OPCODE_FENCE } @@ -302,7 +306,6 @@ `define PC_BRANCH 3'b011 `define PC_EXCEPTION 3'b100 `define PC_ERET 3'b101 -`define PC_HWLOOP 3'b110 `define PC_DBG_NPC 3'b111 // Exception PC mux selector defines diff --git a/prefetch_L0_buffer.sv b/prefetch_L0_buffer.sv index 46dc0599..3e4d63ab 100644 --- a/prefetch_L0_buffer.sv +++ b/prefetch_L0_buffer.sv @@ -32,15 +32,18 @@ module riscv_prefetch_L0_buffer input logic rst_n, input logic req_i, + input logic branch_i, - input logic ready_i, input logic [31:0] addr_i, + input logic hwloop_i, + input logic [31:0] hwloop_target_i, + + input logic ready_i, output logic valid_o, output logic [31:0] rdata_o, output logic [31:0] addr_o, - - input logic unaligned_i, + output logic is_hwlp_o, // is set when the currently served data is from a hwloop // goes to instruction memory / instruction cache output logic instr_req_o, @@ -53,86 +56,425 @@ module riscv_prefetch_L0_buffer output logic busy_o ); - enum logic [2:0] {EMPTY, VALID_L0, WAIT_GNT, WAIT_RVALID, WAIT_ABORTED } CS, NS; - logic [31:0] current_address, last_address; - logic [1:0] pointer_cs, pointer_ns; - logic update_current_address; + logic busy_L0; - logic [3:0][31:0] L0_buffer; - logic [31:0] previous_chunk; - logic clear_buffer; + enum logic [2:0] { REGULAR, PREFETCH, LAST_BRANCH, LAST_BRANCH_WAIT, HWLP_WAIT_LAST, HWLP_FETCHING, 
HWLP_PREFETCH, HWLP_ABORT } prefetch_CS, prefetch_NS; + logic do_prefetch; + logic [31:0] addr_q, addr_n, addr_int, addr_aligned_next; - logic valid_L0; - logic ready_L0; - logic is_prefetch_q, is_prefetch_n; + logic [31:0] rdata_last_q; + + logic valid_L0; + logic [RDATA_IN_WIDTH/32-1:0][31:0] rdata_L0; + logic [31:0] addr_L0; // prepared data for output - logic [31:0] rdata, unaligned_rdata; - logic valid, unaligned_valid; + logic [31:0] rdata, rdata_unaligned; + logic valid, valid_unaligned; + + logic aligned_is_compressed, unaligned_is_compressed; + + logic fetching_hwlp; + logic hwlp_inhibit; + logic prefetch_important; - assign busy_o = (CS != EMPTY && CS != VALID_L0) || instr_req_o; + prefetch_L0_buffer_L0 + #( + .RDATA_IN_WIDTH ( RDATA_IN_WIDTH ) + ) + L0_buffer_i + ( + .clk ( clk ), + .rst_n ( rst_n ), + + .prefetch_i ( do_prefetch ), + .prefetch_important_i ( prefetch_important ), + .prefetch_addr_i ( addr_aligned_next ), + + .branch_i ( branch_i ), + .branch_addr_i ( addr_i ), + + .hwlp_i ( hwloop_i & (~hwlp_inhibit) ), + .hwlp_addr_i ( hwloop_target_i ), + + .hwlp_fetching_o ( fetching_hwlp ), + + .valid_o ( valid_L0 ), + .rdata_o ( rdata_L0 ), + .addr_o ( addr_L0 ), + + .instr_req_o ( instr_req_o ), + .instr_addr_o ( instr_addr_o ), + .instr_gnt_i ( instr_gnt_i ), + .instr_rvalid_i ( instr_rvalid_i ), + .instr_rdata_i ( instr_rdata_i ), + + .busy_o ( busy_L0 ) + ); - always_ff @(posedge clk or negedge rst_n) + assign rdata = ((prefetch_CS == PREFETCH) | (prefetch_CS == HWLP_WAIT_LAST) | (prefetch_CS == HWLP_PREFETCH) | (prefetch_CS == LAST_BRANCH_WAIT)) ? 
rdata_last_q : rdata_L0[addr_o[3:2]]; + assign valid = ( ((prefetch_CS == PREFETCH) | (prefetch_CS == HWLP_WAIT_LAST) | (prefetch_CS == HWLP_PREFETCH)) | valid_L0) & (prefetch_CS != HWLP_ABORT); + + // the lower part of rdata_unaligned is always the higher part of rdata + assign rdata_unaligned[15:0] = rdata[31:16]; + + always_comb begin - if (~rst_n) - begin - CS <= EMPTY; - current_address <= '0; - last_address <= '0; - pointer_cs <= '0; - is_prefetch_q <= 1'b0; - end - else - begin - CS <= NS; + valid_unaligned = 1'b0; - if (branch_i) - begin - current_address <= {addr_i[31:4],4'b0000}; - pointer_cs <= addr_i[3:2]; - is_prefetch_q <= 1'b0; - end - else - begin - if (update_current_address) begin - last_address <= current_address; - current_address <= current_address + 5'h10; // jump to the next cache line - end + if (valid_L0) begin + case(addr_o[3:2]) + 2'b00: begin rdata_unaligned[31:16] = rdata_L0[1][15:0]; valid_unaligned = 1'b1; end + 2'b01: begin rdata_unaligned[31:16] = rdata_L0[2][15:0]; valid_unaligned = 1'b1; end + 2'b10: begin rdata_unaligned[31:16] = rdata_L0[3][15:0]; valid_unaligned = 1'b1; end + // this state is only interesting if we have already done a prefetch + 2'b11: begin + rdata_unaligned[31:16] = rdata_L0[0][15:0]; - if (ready_i) - is_prefetch_q <= 1'b0; - else - is_prefetch_q <= is_prefetch_n; - - pointer_cs <= pointer_ns; - end + if ((prefetch_CS == PREFETCH) | (prefetch_CS == HWLP_PREFETCH)) begin + valid_unaligned = 1'b1; + end else begin + valid_unaligned = 1'b0; + end + end + endcase // addr_o end end + assign unaligned_is_compressed = rdata[17:16] != 2'b11; + assign aligned_is_compressed = rdata[1:0] != 2'b11; + + assign addr_aligned_next = { addr_o[31:2], 2'b00 } + 32'h4; always_comb begin - valid = 1'b0; - valid_L0 = 1'b0; - pointer_ns = pointer_cs; - instr_req_o = 1'b0; - instr_addr_o = (branch_i) ? 
addr_i : current_address + 5'h10; update_current_address = 1'b0; clear_buffer = 1'b0; is_prefetch_n = is_prefetch_q; + addr_int = addr_o; + + // advance address when pipeline is unstalled + if (ready_i) begin + + if (addr_o[1]) begin + // unaligned case + // always move to next entry in the FIFO + + if (unaligned_is_compressed) begin + addr_int = { addr_aligned_next[31:2], 2'b00}; + end else begin + addr_int = { addr_aligned_next[31:2], 2'b10}; + end + + end else begin + // aligned case + + if (aligned_is_compressed) begin + // just increase address, do not move to next entry in the FIFO + addr_int = { addr_o[31:2], 2'b10 }; + end else begin + // move to next entry in the FIFO + addr_int = { addr_aligned_next[31:2], 2'b00 }; + end + end + + end + end + + always_comb + begin + do_prefetch = 1'b0; + prefetch_NS = prefetch_CS; + addr_n = addr_int; + + case (prefetch_CS) + REGULAR: begin + if (fetching_hwlp) begin + if (ready_i) begin + addr_n = hwloop_target_i; + prefetch_NS = HWLP_FETCHING; + end + else + prefetch_NS = HWLP_WAIT_LAST; + end else if (addr_o[3:2] == 2'b11) begin + if ((~addr_o[1]) & aligned_is_compressed & valid) + // we are serving a compressed instruction + prefetch_NS = PREFETCH; + else begin + if (ready_i) + prefetch_NS = REGULAR; + else if (valid_L0) + prefetch_NS = PREFETCH; + end + end + + // actually only needed when ~branch_i and ~fetching_hwlp hold, but + // if we kept those as conditions, we would generate a combinational loop + if (addr_o[3:2] == 2'b11) + do_prefetch = 1'b1; + end + + // we are doing a prefetch + // we save the last word of the L0 buffer and already preload the L0 + // buffer with new stuff + PREFETCH: begin + if (fetching_hwlp) begin + if (ready_i) begin + addr_n = hwloop_target_i; + prefetch_NS = HWLP_FETCHING; + end + else + prefetch_NS = HWLP_WAIT_LAST; + end else if (ready_i) begin + if (hwloop_i) begin + addr_n = addr_q; + prefetch_NS = HWLP_ABORT; + end else begin + if ((~addr_o[1]) & aligned_is_compressed) + 
// we are serving a compressed instruction + prefetch_NS = PREFETCH; + else + prefetch_NS = REGULAR; + end + end + end + + // we have branched into the last word of the L0 buffer, so we have to + // prefetch the next cache line as soon as we got this one + LAST_BRANCH: begin + do_prefetch = 1'b1; + + if (valid_L0) begin + if (fetching_hwlp) begin + if (ready_i) begin + addr_n = hwloop_target_i; + prefetch_NS = HWLP_FETCHING; + end + else + prefetch_NS = HWLP_WAIT_LAST; + end + else if ( ((~addr_o[1]) & aligned_is_compressed) | (addr_o[1] & (~unaligned_is_compressed)) ) + // we are serving a compressed instruction or an instruction that + // spans two cache lines + prefetch_NS = PREFETCH; + else if (ready_i) + prefetch_NS = REGULAR; + else + prefetch_NS = LAST_BRANCH_WAIT; + end + end + + LAST_BRANCH_WAIT: begin + if (ready_i) + prefetch_NS = REGULAR; + end + + // wait for last instruction to be delivered before going to hwloop + HWLP_WAIT_LAST: begin + if (ready_i) begin + addr_n = addr_L0; // use address that was saved in L0 buffer + prefetch_NS = HWLP_FETCHING; + end + end + + HWLP_FETCHING: begin + if (valid_L0) begin + if (addr_o[3:2] == 2'b11) begin + do_prefetch = 1'b1; + + if ((~addr_o[1]) & aligned_is_compressed) + // we are serving a compressed instruction + prefetch_NS = HWLP_PREFETCH; + else begin + if (ready_i) + prefetch_NS = REGULAR; + else + prefetch_NS = HWLP_PREFETCH; + end + end else begin + if (ready_i) begin + prefetch_NS = REGULAR; + end + end + end + end + + HWLP_PREFETCH: begin + if (ready_i) begin + prefetch_NS = REGULAR; + end + end + + HWLP_ABORT: begin + if (fetching_hwlp) begin + prefetch_NS = HWLP_FETCHING; + addr_n = hwloop_target_i; + end + end + endcase + + // branches always have priority + if (branch_i) begin + addr_n = addr_i; + if (addr_i[3:2] == 2'b11) + prefetch_NS = LAST_BRANCH; + else + prefetch_NS = REGULAR; + end + end + + // do not abort an important prefetch for a hardware loop + //assign prefetch_important = 
(((addr_q[3:1] == 3'b111) & (~unaligned_is_compressed)) | (addr_q[3:2] == 2'b00)) & do_prefetch; + assign prefetch_important = 1'b0; + + assign hwlp_inhibit = (prefetch_CS == HWLP_WAIT_LAST) | (prefetch_CS == HWLP_FETCHING) | (prefetch_CS == HWLP_PREFETCH); + + + ////////////////////////////////////////////////////////////////////////////// + // registers + ////////////////////////////////////////////////////////////////////////////// + + always_ff @(posedge clk, negedge rst_n) + begin + if (~rst_n) + begin + addr_q <= '0; + prefetch_CS <= REGULAR; + rdata_last_q <= '0; + end + else + begin + addr_q <= addr_n; + + prefetch_CS <= prefetch_NS; + + if (fetching_hwlp) + rdata_last_q <= rdata_o; + else if (do_prefetch) + rdata_last_q <= rdata; + + end + end + + ////////////////////////////////////////////////////////////////////////////// + // output ports + ////////////////////////////////////////////////////////////////////////////// + + assign rdata_o = (~addr_o[1] | (prefetch_CS == HWLP_WAIT_LAST)) ? rdata: rdata_unaligned; + assign valid_o = (addr_o[1] & (~unaligned_is_compressed)) ? 
valid_unaligned : valid; + + assign addr_o = addr_q; + + assign is_hwlp_o = ((prefetch_CS == HWLP_FETCHING) | (prefetch_CS == HWLP_PREFETCH)) & valid_o; + + assign busy_o = busy_L0; + + + //---------------------------------------------------------------------------- + // Assertions + //---------------------------------------------------------------------------- + + // there should never be a ready_i without valid_o + assert property ( + @(posedge clk) (ready_i) |-> (valid_o) ) else $warning("IF Stage is ready without prefetcher having valid data"); + +endmodule // prefetch_L0_buffer + + +module prefetch_L0_buffer_L0 +#( + parameter RDATA_IN_WIDTH = 128 +) +( + input logic clk, + input logic rst_n, + + input logic prefetch_i, + input logic prefetch_important_i, + input logic [31:0] prefetch_addr_i, + + input logic branch_i, + input logic [31:0] branch_addr_i, + + input logic hwlp_i, + input logic [31:0] hwlp_addr_i, + + + output logic hwlp_fetching_o, + + output logic valid_o, + output logic [RDATA_IN_WIDTH/32-1:0][31:0] rdata_o, + output logic [31:0] addr_o, + + // goes to instruction memory / instruction cache + output logic instr_req_o, + output logic [31:0] instr_addr_o, + input logic instr_gnt_i, + input logic instr_rvalid_i, + input logic [RDATA_IN_WIDTH/32-1:0][31:0] instr_rdata_i, + + output logic busy_o +); + + enum logic [2:0] { EMPTY, VALID_L0, WAIT_GNT, WAIT_RVALID, ABORTED_BRANCH, WAIT_HWLOOP } CS, NS; + + logic [3:0][31:0] L0_buffer; + logic [31:0] addr_q, instr_addr_int; + logic valid; + + logic hwlp_pending_n; + + + // edge detector on hwlp pending + assign hwlp_fetching_o = (~hwlp_pending_n) & (hwlp_i); + + ////////////////////////////////////////////////////////////////////////////// + // FSM + ////////////////////////////////////////////////////////////////////////////// + + always_comb + begin + NS = CS; + valid = 1'b0; + instr_req_o = 1'b0; + instr_addr_int = 'x; + hwlp_pending_n = hwlp_i; case(CS) // wait for the first branch request before 
fetching any instructions EMPTY: begin - instr_req_o = branch_i; + if (branch_i) + instr_addr_int = branch_addr_i; + else if (hwlp_i & (~prefetch_important_i)) begin + instr_addr_int = hwlp_addr_i; + hwlp_pending_n = 1'b0; + end + else + instr_addr_int = prefetch_addr_i; - if (branch_i) // make the request to icache + if (branch_i | hwlp_i | prefetch_i) // make the request to icache begin + instr_req_o = 1'b1; + + if (instr_gnt_i) + NS = WAIT_RVALID; + else + NS = WAIT_GNT; + end + end //~EMPTY + + WAIT_GNT: + begin + if (branch_i) + instr_addr_int = branch_addr_i; + else + instr_addr_int = addr_q; + + if (branch_i) + begin + instr_req_o = 1'b1; if (instr_gnt_i) NS = WAIT_RVALID; @@ -141,49 +483,68 @@ module riscv_prefetch_L0_buffer end else begin - NS = EMPTY; + instr_req_o = 1'b1; + + if (instr_gnt_i) + NS = WAIT_RVALID; + else + NS = WAIT_GNT; end - end //~EMPTY + end //~WAIT_GNT + WAIT_RVALID: begin - if (branch_i) // there is a pending branch - begin - instr_addr_o = {addr_i[31:4],4'b0000}; + valid = instr_rvalid_i; + if (branch_i) + instr_addr_int = branch_addr_i; + else if (hwlp_i) + instr_addr_int = hwlp_addr_i; + else + instr_addr_int = prefetch_addr_i; + + if (branch_i) + begin if (instr_rvalid_i) begin - instr_req_o = 1'b1; + instr_req_o = 1'b1; if (instr_gnt_i) NS = WAIT_RVALID; else NS = WAIT_GNT; - end - else - begin - NS = WAIT_ABORTED; + end else begin + NS = ABORTED_BRANCH; // TODO: THIS STATE IS IDENTICAL WITH THIS ONE end end - else // else (branch_i) + else if (hwlp_i) begin - valid = instr_rvalid_i; - // prepare address even if we don't need it - // this removes the dependency for instr_addr_o on instr_rvalid_i - instr_addr_o = current_address + 5'h10; + if (instr_rvalid_i) + begin + instr_req_o = 1'b1; + hwlp_pending_n = 1'b0; + + if (instr_gnt_i) + NS = WAIT_RVALID; + else + NS = WAIT_GNT; + end else begin + NS = WAIT_HWLOOP; + end + + end + else + begin if (instr_rvalid_i) begin - if (&pointer_cs) // we are receiving the last packet, then 
prefetch the next one + if (prefetch_i) // we are receiving the last packet, then prefetch the next one begin - is_prefetch_n = 1'b1; - - instr_req_o = 1'b1; - pointer_ns = '0; - update_current_address = 1'b1; + instr_req_o = 1'b1; if (instr_gnt_i) NS = WAIT_RVALID; @@ -193,215 +554,125 @@ module riscv_prefetch_L0_buffer else // not the last chunk begin NS = VALID_L0; - - if (ready_L0) - pointer_ns = pointer_cs + 1'b1; - else - pointer_ns = pointer_cs; end end - else // still wait instr_rvalid_i - begin - NS = WAIT_RVALID; - end end end //~WAIT_RVALID VALID_L0: begin - valid = 1'b1; - valid_L0 = 1'b1; + valid = 1'b1; if (branch_i) + instr_addr_int = branch_addr_i; + else if (hwlp_i) begin + instr_addr_int = hwlp_addr_i; + hwlp_pending_n = 1'b0; + end + else + instr_addr_int = prefetch_addr_i; + + if (branch_i | hwlp_i | prefetch_i) begin - instr_req_o = 1'b1; - instr_addr_o = {addr_i[31:4],4'b0000}; + instr_req_o = 1'b1; if (instr_gnt_i) NS = WAIT_RVALID; else NS = WAIT_GNT; end - else - begin - if ( &pointer_cs ) // we are dispathing the last packet, therefore prefetch the next cache line - begin - is_prefetch_n = 1'b1; - instr_req_o = 1'b1; - instr_addr_o = current_address + 5'h10; - pointer_ns = '0; - update_current_address = 1'b1; - - if (instr_gnt_i) - NS = WAIT_RVALID; - else - NS = WAIT_GNT; - end - else - begin - if (ready_L0) - begin - pointer_ns = pointer_cs + 1'b1; - end - - NS = VALID_L0; - end - end end //~VALID_L0 - WAIT_GNT: + ABORTED_BRANCH: begin - if (branch_i) - begin - instr_req_o = 1'b1; - instr_addr_o = {addr_i[31:4],4'b0000}; - - if (instr_gnt_i) - NS = WAIT_RVALID; - else - NS = WAIT_GNT; - end - else - begin - instr_req_o = 1'b1; - instr_addr_o = current_address; // has been previously updated - - if (instr_gnt_i) - NS = WAIT_RVALID; - else - NS = WAIT_GNT; - end - end //~WAIT_GNT - - WAIT_ABORTED: - begin - clear_buffer = 1'b1; // prepare address even if we don't need it // this removes the dependency for instr_addr_o on instr_rvalid_i - 
instr_addr_o = current_address; + if (branch_i) + instr_addr_int = branch_addr_i; + else + instr_addr_int = addr_q; if (instr_rvalid_i) begin - instr_req_o = 1'b1; + instr_req_o = 1'b1; if (instr_gnt_i) NS = WAIT_RVALID; else NS = WAIT_GNT; end + end //~ABORTED_BRANCH + + WAIT_HWLOOP: + begin + valid = instr_rvalid_i; + + // prepare address even if we don't need it + // this removes the dependency for instr_addr_o on instr_rvalid_i + if (branch_i) + instr_addr_int = branch_addr_i; else + instr_addr_int = addr_q; + + if (instr_rvalid_i) begin - NS = WAIT_ABORTED; + hwlp_pending_n = 1'b0; + instr_req_o = 1'b1; + + if (instr_gnt_i) + NS = WAIT_RVALID; + else + NS = WAIT_GNT; end - end //~WAIT_ABORTED + end //~WAIT_HWLOOP default: begin NS = EMPTY; - clear_buffer = 1'b1; end endcase //~CS end - // rdata mux, either directly use the incoming data or the saved data in - // L0/previous_chunk - always_comb - begin - if (is_prefetch_q) - begin - rdata = previous_chunk; - addr_o = { last_address[31:4], 2'b11, 2'b00 }; - end - else - begin - if (valid_L0) begin - rdata = L0_buffer[pointer_cs]; - addr_o = { current_address[31:4], pointer_cs, 2'b00 }; - end - else - begin - rdata = instr_rdata_i[pointer_cs]; - addr_o = { current_address[31:4], pointer_cs, 2'b00 }; - end - end - end + ////////////////////////////////////////////////////////////////////////////// + // registers + ////////////////////////////////////////////////////////////////////////////// - // the lower part of unaligned_rdata is always the higher part of rdata - assign unaligned_rdata[15:0] = rdata[31:16]; - - always_comb - begin - if (valid_L0) begin - case(addr_o[3:2]) - 2'b00: begin unaligned_rdata[31:16] = L0_buffer[1][15:0]; unaligned_valid = 1'b1; end - 2'b01: begin unaligned_rdata[31:16] = L0_buffer[2][15:0]; unaligned_valid = 1'b1; end - 2'b10: begin unaligned_rdata[31:16] = L0_buffer[3][15:0]; unaligned_valid = 1'b1; end - // this state is only interesting if we have already done a prefetch -
2'b11: begin - unaligned_rdata[31:16] = L0_buffer[0][15:0]; - - if (is_prefetch_q) begin - unaligned_valid = 1'b1; - end else begin - unaligned_valid = 1'b0; - end - end - endcase // addr_o - end else begin - // L0 buffer is not valid, so we can take the data directly from the - // icache - - case(addr_o[3:2]) - 2'b00: begin unaligned_rdata[31:16] = instr_rdata_i[1][15:0]; unaligned_valid = instr_rvalid_i; end - 2'b01: begin unaligned_rdata[31:16] = instr_rdata_i[2][15:0]; unaligned_valid = instr_rvalid_i; end - 2'b10: begin unaligned_rdata[31:16] = instr_rdata_i[3][15:0]; unaligned_valid = instr_rvalid_i; end - - 2'b11: - begin - unaligned_rdata[31:16] = instr_rdata_i[0][15:0]; - - if (is_prefetch_q) - unaligned_valid = instr_rvalid_i; - else - unaligned_valid = 1'b0; - end - endcase // pointer_cs - end - end - - assign ready_L0 = (is_prefetch_q) ? 1'b0 : ready_i; - - - always_ff @(posedge clk or negedge rst_n) + always_ff @(posedge clk, negedge rst_n) begin if (~rst_n) begin - L0_buffer <= '0; - previous_chunk <= '0; + CS <= EMPTY; + L0_buffer <= '0; + addr_q <= '0; end else begin + CS <= NS; + if (instr_rvalid_i) begin L0_buffer <= instr_rdata_i; end - // update previous chunk only when we are doing a prefetch - // do this only once per prefetch - if (is_prefetch_n && (~is_prefetch_q)) - begin - previous_chunk <= (valid_L0) ? L0_buffer[3][31:0] : instr_rdata_i[3][31:0]; - end + if (branch_i | hwlp_i | prefetch_i) + addr_q <= instr_addr_int; end end + ////////////////////////////////////////////////////////////////////////////// - // instruction aligner (if unaligned) + // output ports ////////////////////////////////////////////////////////////////////////////// - assign rdata_o = unaligned_i ? unaligned_rdata : rdata; - assign valid_o = unaligned_i ? unaligned_valid : valid; + assign instr_addr_o = { instr_addr_int[31:4], 4'b0000 }; -endmodule // prefetch_L0_buffer + assign rdata_o = (instr_rvalid_i) ? 
instr_rdata_i : L0_buffer; + assign addr_o = addr_q; + + assign valid_o = valid & (~branch_i); + + assign busy_o = (CS != EMPTY) && (CS != VALID_L0) || instr_req_o; + +endmodule diff --git a/prefetch_buffer.sv b/prefetch_buffer.sv index b1bd96c4..b5d51a02 100644 --- a/prefetch_buffer.sv +++ b/prefetch_buffer.sv @@ -32,9 +32,8 @@ module riscv_fetch_fifo input logic rst_n, // control signals - input logic clear_i, // clears the contents of the fifo - - input logic unaligned_i, // is the current output rdata unaligned + input logic branch_i, // clears the contents of the fifo + input logic hwloop_i, // tries to insert an entry above the first one // input port input logic in_addr_valid_i, @@ -51,11 +50,7 @@ module riscv_fetch_fifo input logic out_ready_i, output logic [31:0] out_rdata_o, output logic [31:0] out_addr_o, - - output logic out_unaligned_valid_o, - output logic [31:0] out_unaligned_rdata_o, - - output logic out_is_unaligned_o + output logic out_is_hwlp_o ); localparam DEPTH = 3; // must be 2 or greater @@ -65,25 +60,57 @@ module riscv_fetch_fifo logic [0:DEPTH-1] addr_valid_n, addr_valid_int, addr_valid_Q; logic [0:DEPTH-1] [31:0] rdata_n, rdata_int, rdata_Q; logic [0:DEPTH-1] rdata_valid_n, rdata_valid_int, rdata_valid_Q; - logic is_unaligned_n, is_unaligned_Q; + logic [0:1 ] is_hwlp_n, is_hwlp_int, is_hwlp_Q; + + logic [31:0] rdata, rdata_unaligned; + logic valid, valid_unaligned; + + logic aligned_is_compressed, unaligned_is_compressed; + + logic hwlp_inbound; ////////////////////////////////////////////////////////////////////////////// // output port ////////////////////////////////////////////////////////////////////////////// - // output assignments - assign out_rdata_o = (rdata_valid_Q[0]) ? rdata_Q[0] : in_rdata_i; - assign out_addr_o = addr_Q[0]; // always output addr directly since we sent it one cycle earlier to the FIFO - assign out_valid_o = (rdata_valid_Q[0] || (addr_valid_Q[0] && in_rdata_valid_i)); + assign rdata = (rdata_valid_Q[0]) ? 
rdata_Q[0] : in_rdata_i; + assign valid = (rdata_valid_Q[0] || (addr_valid_Q[0] && in_rdata_valid_i)); - assign out_unaligned_rdata_o = (rdata_valid_Q[1]) ? {rdata_Q[1][15:0], out_rdata_o[31:16]} : {in_rdata_i[15:0], out_rdata_o[31:16]}; + assign rdata_unaligned = (rdata_valid_Q[1]) ? {rdata_Q[1][15:0], rdata[31:16]} : {in_rdata_i[15:0], rdata[31:16]}; // it is implied that rdata_valid_Q[0] is set - assign out_unaligned_valid_o = (rdata_valid_Q[1] || (addr_valid_Q[1] && in_rdata_valid_i)); + assign valid_unaligned = (rdata_valid_Q[1] || (addr_valid_Q[1] && in_rdata_valid_i)); - assign out_is_unaligned_o = is_unaligned_Q; + assign unaligned_is_compressed = rdata[17:16] != 2'b11; + assign aligned_is_compressed = rdata[1:0] != 2'b11; + ////////////////////////////////////////////////////////////////////////////// + // instruction aligner (if unaligned) + ////////////////////////////////////////////////////////////////////////////// + + always_comb + begin + // serve the aligned case even though the output address is unaligned when + // the next instruction will be from a hardware loop target + // in this case the current instruction is already prealigned in element 0 + if (out_addr_o[1] && (~is_hwlp_Q[1])) begin + // unaligned case + out_rdata_o = rdata_unaligned; + + if (unaligned_is_compressed) + out_valid_o = 1'b1; + else + out_valid_o = valid_unaligned; + end else begin + // aligned case + out_rdata_o = rdata; + out_valid_o = valid; + end + end + + assign out_addr_o = addr_Q[0]; // always output addr directly since we sent it one cycle earlier to the FIFO + assign out_is_hwlp_o = is_hwlp_Q[0]; ////////////////////////////////////////////////////////////////////////////// @@ -91,7 +118,7 @@ module riscv_fetch_fifo ////////////////////////////////////////////////////////////////////////////// // we accept addresses as long as our fifo is not full or we are cleared - assign in_addr_ready_o = clear_i || (~addr_valid_Q[DEPTH-1]); + assign in_addr_ready_o = branch_i || 
(~addr_valid_Q[DEPTH-1]); // we accept data as long as our fifo is not full // we don't care about clear here as the data will be received one cycle @@ -111,6 +138,9 @@ module riscv_fetch_fifo end end + // accept hwloop input as long as our second entry is not already one + assign hwlp_inbound = hwloop_i & (~is_hwlp_Q[1]); + ////////////////////////////////////////////////////////////////////////////// // FIFO management ////////////////////////////////////////////////////////////////////////////// @@ -120,6 +150,7 @@ module riscv_fetch_fifo begin addr_int = addr_Q; addr_valid_int = addr_valid_Q; + is_hwlp_int = is_hwlp_Q; if (in_addr_valid_i && in_addr_ready_o) begin for(j = 0; j < DEPTH; j++) begin @@ -131,6 +162,14 @@ module riscv_fetch_fifo end end end + + // on a hardware loop invalidate everything starting from the second entry + if (hwlp_inbound) begin + addr_int[1] = in_addr_i; + addr_valid_int[1] = 1'b1; + addr_valid_int[2:DEPTH-1] = '0; + is_hwlp_int[1] = 1'b1; + end end int k; @@ -149,28 +188,65 @@ module riscv_fetch_fifo end end end + + // on a hardware loop invalidate everything starting from the second entry + if (hwlp_inbound) begin + rdata_int[0] = out_rdata_o; // save current output in rdata_int[0], so that we have it available even though we override entry #1 + rdata_valid_int[1:DEPTH-1] = '0; + end end // move everything by one step always_comb begin - addr_n = addr_int; - addr_valid_n = addr_valid_int; - rdata_n = rdata_int; - rdata_valid_n = rdata_valid_int; - is_unaligned_n = is_unaligned_Q; + addr_n = addr_int; + addr_valid_n = addr_valid_int; + rdata_n = rdata_int; + rdata_valid_n = rdata_valid_int; + is_hwlp_n = is_hwlp_int; if (out_ready_i && out_valid_o) begin - addr_n = {addr_int[1:DEPTH-1], 32'b0}; - addr_valid_n = {addr_valid_int[1:DEPTH-1], 1'b0}; - rdata_n = {rdata_int[1:DEPTH-1], 32'b0}; - rdata_valid_n = {rdata_valid_int[1:DEPTH-1], 1'b0}; - is_unaligned_n = 1'b0; - end else begin - if (out_unaligned_valid_o && unaligned_i && 
(~is_unaligned_Q)) begin - // are we unaligned? then assemble the last word from the two halfes - rdata_n[0] = out_unaligned_rdata_o; - is_unaligned_n = 1'b1; + + // now take care of the addresses + if (is_hwlp_int[1]) begin + // hardware loop found in second entry + addr_n = {addr_int[1][31:0], addr_int[2:DEPTH-1], 32'b0}; + addr_valid_n = {addr_valid_int[1:DEPTH-1], 1'b0}; + rdata_n = {rdata_int[1:DEPTH-1], 32'b0}; + rdata_valid_n = {rdata_valid_int[1:DEPTH-1], 1'b0}; + is_hwlp_n = {is_hwlp_int[1], 1'b0}; + end else begin + if (addr_Q[0][1]) begin + // unaligned case + + if (unaligned_is_compressed) begin + addr_n = {{addr_int[1][31:2], 2'b00}, addr_int[2:DEPTH-1], 32'b0}; + end else begin + addr_n = {{addr_int[1][31:2], 2'b10}, addr_int[2:DEPTH-1], 32'b0}; + end + + addr_valid_n = {addr_valid_int[1:DEPTH-1], 1'b0}; + rdata_n = {rdata_int[1:DEPTH-1], 32'b0}; + rdata_valid_n = {rdata_valid_int[1:DEPTH-1], 1'b0}; + is_hwlp_n = {is_hwlp_int[1], 1'b0}; + + end else begin + // aligned case + + if (aligned_is_compressed) begin + // just increase address, do not move to next entry in FIFO + addr_n[0] = {addr_int[0][31:2], 2'b10}; + is_hwlp_n[0] = 1'b0; // invalidate hwlp bit for current address + end else begin + // move to next entry in FIFO + addr_n = {{addr_int[1][31:2], 2'b00}, addr_int[2:DEPTH-1], 32'b0}; + addr_valid_n = {addr_valid_int[1:DEPTH-1], 1'b0}; + rdata_n = {rdata_int[1:DEPTH-1], 32'b0}; + rdata_valid_n = {rdata_valid_int[1:DEPTH-1], 1'b0}; + is_hwlp_n = {is_hwlp_int[1], 1'b0}; + end + + end end end end @@ -183,27 +259,27 @@ module riscv_fetch_fifo begin if(rst_n == 1'b0) begin - addr_Q <= '{default: '0}; - addr_valid_Q <= '0; - rdata_Q <= '{default: '0}; - rdata_valid_Q <= '0; - is_unaligned_Q <= 1'b0; + addr_Q <= '{default: '0}; + addr_valid_Q <= '0; + rdata_Q <= '{default: '0}; + rdata_valid_Q <= '0; + is_hwlp_Q <= '0; end else begin // on a clear signal from outside we invalidate the content of the FIFO // completely and start from an empty state - 
if (clear_i) begin - addr_Q[0] <= in_addr_i; - addr_valid_Q <= {in_addr_valid_i, {DEPTH-1{1'b0}}}; - rdata_valid_Q <= '0; - is_unaligned_Q <= 1'b0; + if (branch_i) begin + addr_Q[0] <= in_addr_i; + addr_valid_Q <= {in_addr_valid_i, {DEPTH-1{1'b0}}}; + rdata_valid_Q <= '0; + is_hwlp_Q <= '0; end else begin - addr_Q <= addr_n; - addr_valid_Q <= addr_valid_n; - rdata_Q <= rdata_n; - rdata_valid_Q <= rdata_valid_n; - is_unaligned_Q <= is_unaligned_n; + addr_Q <= addr_n; + addr_valid_Q <= addr_valid_n; + rdata_Q <= rdata_n; + rdata_valid_Q <= rdata_valid_n; + is_hwlp_Q <= is_hwlp_n; end end end @@ -217,15 +293,19 @@ module riscv_prefetch_buffer input logic clk, input logic rst_n, - input logic unaligned_i, input logic req_i, + input logic branch_i, - input logic ready_i, input logic [31:0] addr_i, + input logic hwloop_i, + input logic [31:0] hwloop_target_i, + + input logic ready_i, output logic valid_o, output logic [31:0] rdata_o, output logic [31:0] addr_o, + output logic is_hwlp_o, // is set when the currently served data is from a hwloop // goes to instruction memory / instruction cache output logic instr_req_o, @@ -249,11 +329,6 @@ module riscv_prefetch_buffer logic fifo_rdata_valid; logic fifo_rdata_ready; - logic fifo_is_unaligned; - - logic [31:0] rdata, unaligned_rdata; - logic valid, unaligned_valid; - ////////////////////////////////////////////////////////////////////////////// // prefetch buffer status @@ -265,7 +340,17 @@ module riscv_prefetch_buffer // address selection and increase ////////////////////////////////////////////////////////////////////////////// - assign addr_next = (branch_i) ? 
addr_i : (fifo_last_addr + 32'd4); + always_comb + begin + addr_next = {fifo_last_addr[31:2], 2'b00} + 32'd4; + + if (branch_i) begin + addr_next = addr_i; + end else begin + if (hwloop_i) + addr_next = hwloop_target_i; + end + end ////////////////////////////////////////////////////////////////////////////// @@ -278,9 +363,8 @@ module riscv_prefetch_buffer .clk ( clk ), .rst_n ( rst_n ), - .clear_i ( branch_i ), - - .unaligned_i ( unaligned_i ), + .branch_i ( branch_i ), + .hwloop_i ( hwloop_i ), .in_addr_valid_i ( fifo_addr_valid ), .in_addr_ready_o ( fifo_addr_ready ), @@ -291,24 +375,13 @@ module riscv_prefetch_buffer .in_rdata_ready_o ( fifo_rdata_ready ), .in_rdata_i ( instr_rdata_i ), - .out_valid_o ( valid ), + .out_valid_o ( valid_o ), .out_ready_i ( ready_i ), - .out_rdata_o ( rdata ), + .out_rdata_o ( rdata_o ), .out_addr_o ( addr_o ), - - .out_unaligned_valid_o ( unaligned_valid ), - .out_unaligned_rdata_o ( unaligned_rdata ), - - .out_is_unaligned_o ( fifo_is_unaligned ) + .out_is_hwlp_o ( is_hwlp_o ) ); - ////////////////////////////////////////////////////////////////////////////// - // instruction aligner (if unaligned) - ////////////////////////////////////////////////////////////////////////////// - - assign rdata_o = (unaligned_i && (~fifo_is_unaligned)) ? unaligned_rdata : rdata; - assign valid_o = (unaligned_i && (~fifo_is_unaligned)) ? 
unaligned_valid : valid; - ////////////////////////////////////////////////////////////////////////////// // instruction fetch FSM diff --git a/riscv_core.sv b/riscv_core.sv index 2a9e851e..5e95fec3 100644 --- a/riscv_core.sv +++ b/riscv_core.sv @@ -83,23 +83,27 @@ module riscv_core input logic [N_EXT_PERF_COUNTERS-1:0] ext_perf_counters_i ); + localparam N_HWLP = 2; + // IF/ID signals - logic instr_valid_id; - logic [31:0] instr_rdata_id; // Instruction sampled inside IF stage - logic is_compressed_id; - logic illegal_c_insn_id; // Illegal compressed instruction sent to ID stage - logic [31:0] current_pc_if; // Current Program counter - logic [31:0] current_pc_id; // Current Program counter + logic is_hwlp_id; + logic [N_HWLP-1:0] hwlp_dec_cnt_id; + logic instr_valid_id; + logic [31:0] instr_rdata_id; // Instruction sampled inside IF stage + logic is_compressed_id; + logic illegal_c_insn_id; // Illegal compressed instruction sent to ID stage + logic [31:0] current_pc_if; // Current Program counter + logic [31:0] current_pc_id; // Current Program counter - logic clear_instr_valid; - logic pc_set; - logic [2:0] pc_mux_id; // Mux selector for next PC - logic [1:0] exc_pc_mux_id; // Mux selector for exception PC - logic [4:0] exc_vec_pc_mux_id; // Mux selector for vectorized IR lines + logic clear_instr_valid; + logic pc_set; + logic [2:0] pc_mux_id; // Mux selector for next PC + logic [1:0] exc_pc_mux_id; // Mux selector for exception PC + logic [4:0] exc_vec_pc_mux_id; // Mux selector for vectorized IR lines - logic lsu_load_err; - logic lsu_store_err; + logic lsu_load_err; + logic lsu_store_err; // ID performance counter signals logic is_decoding; @@ -191,7 +195,9 @@ module riscv_core // Hardware loop controller signals - logic [31:0] hwloop_target; // from hwloop controller to if stage + logic [N_HWLP-1:0] [31:0] hwlp_start; + logic [N_HWLP-1:0] [31:0] hwlp_end; + logic [N_HWLP-1:0] [31:0] hwlp_cnt; // Debug Unit @@ -233,6 +239,7 @@ module riscv_core 
////////////////////////////////////////////////// riscv_if_stage #( + .N_HWLP ( N_HWLP ), .RDATA_WIDTH ( INSTR_RDATA_WIDTH ) ) if_stage_i @@ -254,6 +261,8 @@ module riscv_core .instr_rdata_i ( instr_rdata_i ), // outputs to ID stage + .hwlp_dec_cnt_id_o ( hwlp_dec_cnt_id ), + .is_hwlp_id_o ( is_hwlp_id ), .instr_valid_id_o ( instr_valid_id ), .instr_rdata_id_o ( instr_rdata_id ), .is_compressed_id_o ( is_compressed_id ), @@ -270,7 +279,9 @@ module riscv_core .exc_vec_pc_mux_i ( exc_vec_pc_mux_id ), // from hwloop controller - .hwloop_target_i ( hwloop_target ), // pc from hwloop start address + .hwlp_start_i ( hwlp_start ), + .hwlp_end_i ( hwlp_end ), + .hwlp_cnt_i ( hwlp_cnt ), // from debug unit .dbg_npc_i ( dbg_npc ), @@ -299,7 +310,11 @@ module riscv_core // |___|____/ |____/ |_/_/ \_\____|_____| // // // ///////////////////////////////////////////////// - riscv_id_stage id_stage_i + riscv_id_stage + #( + .N_HWLP ( N_HWLP ) + ) + id_stage_i ( .clk ( clk ), .rst_n ( rst_n ), @@ -312,6 +327,8 @@ module riscv_core .is_decoding_o ( is_decoding ), // Interface to instruction memory + .hwlp_dec_cnt_i ( hwlp_dec_cnt_id ), + .is_hwlp_i ( is_hwlp_id ), .instr_valid_i ( instr_valid_id ), .instr_rdata_i ( instr_rdata_id ), .instr_req_o ( instr_req_int ), @@ -372,8 +389,10 @@ module riscv_core .csr_access_ex_o ( csr_access_ex ), .csr_op_ex_o ( csr_op_ex ), - // hwloop signals - .hwloop_targ_addr_o ( hwloop_target ), + // hardware loop signals to IF hwlp controller + .hwlp_start_o ( hwlp_start ), + .hwlp_end_o ( hwlp_end ), + .hwlp_cnt_o ( hwlp_cnt ), // LSU .data_req_ex_o ( data_req_ex ), // to load store unit @@ -773,6 +792,10 @@ module riscv_core `INSTR_SRA: printRInstr("SRA"); `INSTR_OR: printRInstr("OR"); `INSTR_AND: printRInstr("AND"); + `INSTR_EXTHS: printRInstr("EXTHS"); + `INSTR_EXTHZ: printRInstr("EXTHZ"); + `INSTR_EXTBS: printRInstr("EXTBS"); + `INSTR_EXTBZ: printRInstr("EXTBZ"); // FENCE `INSTR_FENCE: printMnemonic("FENCE"); `INSTR_FENCEI: 
printMnemonic("FENCEI"); @@ -986,6 +1009,7 @@ module riscv_core 3'b010: mnemonic = "LCOUNT"; 3'b011: mnemonic = "LCOUNTI"; 3'b100: mnemonic = "LSETUP"; + 3'b101: mnemonic = "LSETUPI"; 3'b111: begin printMnemonic("INVALID"); return; @@ -994,18 +1018,21 @@ module riscv_core riscv_core.mnemonic = mnemonic; // decode and print instruction - imm = id_stage_i.imm_i_type; + imm = id_stage_i.imm_iz_type; case (instr[14:12]) // lp.starti and lp.endi 3'b000, - 3'b001: $fdisplay(f, "%7s\tx%0d, 0x%h (-> 0x%h)", mnemonic, rd, imm, pc+imm); + 3'b001: $fdisplay(f, "%7s\tx%0d, 0x%h (-> 0x%h)", mnemonic, rd, imm, pc+(imm<<1)); // lp.count 3'b010: $fdisplay(f, "%7s\tx%0d, x%0d (0x%h)", mnemonic, rd, rs1, rs1_value); // lp.counti 3'b011: $fdisplay(f, "%7s\tx%0d, 0x%h", mnemonic, rd, imm); // lp.setup 3'b100: $fdisplay(f, "%7s\tx%0d, x%0d (0x%h), 0x%h (-> 0x%h)", mnemonic, - rd, rs1, rs1_value, imm, pc+imm); + rd, rs1, rs1_value, imm, pc+(imm<<1)); + // lp.setupi + 3'b101: $fdisplay(f, "%7s\tx%0d, x%0d (0x%h), 0x%h (-> 0x%h)", mnemonic, + rd, rs1, rs1_value, imm, pc+(id_stage_i.imm_z_type << 1)); endcase end endfunction