Fix hardware loops, reimplement prefetch buffer for pulp

Change the encoding of the hardware loop setup instructions: the displacement
is now shifted left by one bit and treated as unsigned
This commit is contained in:
Andreas Traber 2015-12-07 15:28:06 +01:00
parent 0cb4f2620c
commit af8343d366
10 changed files with 921 additions and 592 deletions

View file

@ -66,9 +66,6 @@ module riscv_controller
input logic data_req_ex_i, // data memory access is currently performed in EX stage
input logic data_misaligned_i,
// hwloop signals
input logic hwloop_jump_i, // modify pc_mux to select the hwloop addr
// jump/branch signals
input logic branch_taken_ex_i, // branch taken signal from EX ALU
input logic [1:0] jump_in_id_i, // jump is being calculated in ALU
@ -78,7 +75,6 @@ module riscv_controller
input logic exc_req_i,
output logic exc_ack_o,
// TODO
input logic trap_hit_i, // a trap was hit, so we have to flush EX and WB
output logic save_pc_if_o,
@ -125,7 +121,6 @@ module riscv_controller
// FSM state encoding
enum logic [3:0] { RESET, BOOT_SET, SLEEP, FIRST_FETCH,
DECODE,
JUMP_EXC,
FLUSH_EX, FLUSH_WB,
DBG_WAIT_BRANCH, DBG_SIGNAL, DBG_WAIT } ctrl_fsm_cs, ctrl_fsm_ns;
@ -228,13 +223,6 @@ module riscv_controller
ctrl_fsm_ns = DECODE;
end
// hwloop detected, jump to start address!
// Attention: This has to be done in the DECODE and the FIRST_FETCH states
if (hwloop_jump_i == 1'b1) begin
pc_mux_o = `PC_HWLOOP;
pc_set_o = 1'b1;
end
// handle exceptions
if (exc_req_i) begin
pc_mux_o = `PC_EXCEPTION;
@ -251,93 +239,74 @@ module riscv_controller
begin
is_decoding_o = 1'b0;
// TODO: integrate this with the next loop, rename branch_decision
// into branch_taken and remove the jump_in_ex signal completely,
// there is no need to propagate it into the controller
if (instr_valid_i) begin
// decode and execute instructions only if the current conditional
// branch in the EX stage is either not taken, or there is no
// conditional branch in the EX stage
if (instr_valid_i && (~branch_taken_ex_i))
begin // now analyze the current instruction in the ID stage
is_decoding_o = 1'b1;
// decode and execute instructions only if the current conditional
// branch in the EX stage is either not taken, or there is no
// conditional branch in the EX stage
if (~branch_taken_ex_i)
begin // now analyze the current instruction in the ID stage
is_decoding_o = 1'b1;
// handle unconditional jumps
// we can jump directly since we know the address already
// we don't need to worry about conditional branches here as they
// will be evaluated in the EX stage
if (jump_in_dec_i == `BRANCH_JALR || jump_in_dec_i == `BRANCH_JAL) begin
pc_mux_o = `PC_JUMP;
// handle unconditional jumps
// we can jump directly since we know the address already
// we don't need to worry about conditional branches here as they
// will be evaluated in the EX stage
if (jump_in_dec_i == `BRANCH_JALR || jump_in_dec_i == `BRANCH_JAL) begin
pc_mux_o = `PC_JUMP;
// if there is a jr stall, wait for it to be gone
if (~jr_stall_o)
pc_set_o = 1'b1;
// if there is a jr stall, wait for it to be gone
if (~jr_stall_o)
pc_set_o = 1'b1;
// we don't have to change our current state here as the prefetch
// buffer is automatically invalidated, thus the next instruction
// that is served to the ID stage is the one of the jump target
end else begin
// handle exceptions
if (exc_req_i) begin
pc_mux_o = `PC_EXCEPTION;
pc_set_o = 1'b1;
exc_ack_o = 1'b1;
halt_id_o = 1'b1; // we don't want to propagate this instruction to EX
save_pc_id_o = 1'b1;
// we don't have to change our current state here as the prefetch
// buffer is automatically invalidated, thus the next instruction
// that is served to the ID stage is the one of the jump target
// that is served to the ID stage is the one of the jump to the
// exception handler
end
end
// handle hwloops
if (hwloop_jump_i) begin
pc_mux_o = `PC_HWLOOP;
pc_set_o = 1'b1;
end
if (eret_insn_i) begin
pc_mux_o = `PC_ERET;
pc_set_o = 1'b1;
end
if (eret_insn_i) begin
pc_mux_o = `PC_ERET;
pc_set_o = 1'b1;
end
// handle WFI instruction, flush pipeline and (potentially) go to
// sleep
// also handles eret when the core should go back to sleep
if (pipe_flush_i || (eret_insn_i && (~fetch_enable_i)))
begin
halt_if_o = 1'b1;
halt_id_o = 1'b1;
// handle WFI instruction, flush pipeline and (potentially) go to
// sleep
// also handles eret when the core should go back to sleep
if (pipe_flush_i || (eret_insn_i && (~fetch_enable_i)))
begin
halt_if_o = 1'b1;
halt_id_o = 1'b1;
ctrl_fsm_ns = FLUSH_EX;
end
ctrl_fsm_ns = FLUSH_EX;
end
// take care of debug
// branch conditional will be handled in next state
if (trap_hit_i)
begin
// halt pipeline immediately
halt_if_o = 1'b1;
// handle exceptions
if (exc_req_i) begin
// to not lose the hwloop, we go into a special state where we
// save the new PC
if (hwloop_jump_i)
begin
ctrl_fsm_ns = JUMP_EXC;
end else begin
pc_mux_o = `PC_EXCEPTION;
pc_set_o = 1'b1;
exc_ack_o = 1'b1;
halt_id_o = 1'b1; // we don't want to propagate this instruction to EX
save_pc_id_o = 1'b1;
// we don't have to change our current state here as the prefetch
// buffer is automatically invalidated, thus the next instruction
// that is served to the ID stage is the one of the jump to the
// exception handler
end
end
// take care of debug
// branch conditional will be handled in next state
if (trap_hit_i)
begin
// halt pipeline immediately
halt_if_o = 1'b1;
// make sure the current instruction has been executed
// before changing state to non-decode
if (id_valid_i) begin
if (jump_in_id_i == `BRANCH_COND)
ctrl_fsm_ns = DBG_WAIT_BRANCH;
else
ctrl_fsm_ns = DBG_SIGNAL;
end
// make sure the current instruction has been executed
// before changing state to non-decode
if (id_valid_i) begin
if (jump_in_id_i == `BRANCH_COND)
ctrl_fsm_ns = DBG_WAIT_BRANCH;
else
ctrl_fsm_ns = DBG_SIGNAL;
end
end
end
@ -430,19 +399,6 @@ module riscv_controller
end
end
// go to an exception handler after a jump
JUMP_EXC:
begin
// we can just save the IF PC, since it propagated through the
// prefetcher
save_pc_if_o = 1'b1;
pc_mux_o = `PC_EXCEPTION;
pc_set_o = 1'b1;
exc_ack_o = 1'b1;
ctrl_fsm_ns = DECODE;
end
default: begin
instr_req_o = 1'b0;
ctrl_fsm_ns = RESET;
@ -572,6 +528,7 @@ module riscv_controller
assign perf_jr_stall_o = jr_stall_o;
assign perf_ld_stall_o = load_stall_o;
//----------------------------------------------------------------------------
// Assertions
//----------------------------------------------------------------------------
@ -579,6 +536,6 @@ module riscv_controller
// make sure that taken branches do not happen back-to-back, as this is not
// possible without branch prediction in the IF stage
assert property (
@(posedge clk) (branch_taken_ex_i) |=> (~branch_taken_ex_i) );
@(posedge clk) (branch_taken_ex_i) |=> (~branch_taken_ex_i) ) else $warning("Two branches back-to-back are taken");
endmodule // controller

View file

@ -614,13 +614,12 @@ module riscv_decoder
end
3'b101: begin
// lp.setupi: initialize counter from rs1, set start address to
// lp.setupi: initialize counter from immediate, set start address to
// next instruction and end address to PC + I-type immediate
hwloop_we = 3'b111;
hwloop_target_mux_sel_o = 1'b1;
hwloop_start_mux_sel_o = 1'b1;
hwloop_cnt_mux_sel_o = 1'b1;
rega_used_o = 1'b1;
hwloop_cnt_mux_sel_o = 1'b0;
end
default: begin

View file

@ -44,6 +44,9 @@ module riscv_hwloop_controller
// to hwloop_regs
output logic [N_REGS-1:0] hwlp_dec_cnt_o,
// from pipeline stages
input logic [N_REGS-1:0] hwlp_dec_cnt_id_i,
// to id stage
output logic hwlp_jump_o,
output logic [31:0] hwlp_targ_addr_o
@ -58,14 +61,27 @@ module riscv_hwloop_controller
// generate comparators. check for end address and the loop counter
genvar i;
for (i = 0; i < N_REGS; i++) begin
assign pc_is_end_addr[i] = (current_pc_i == hwlp_end_addr_i[i]) &&
(hwlp_counter_i[i] > 32'h1);
end
// output signal for ID stage
assign hwlp_jump_o = (|pc_is_end_addr);
// One end-address comparator per hardware loop register set (N_REGS of them).
// pc_is_end_addr[i] is set when current_pc_i equals hwlp_end_addr_i[i] and the
// counter of loop i passes the threshold checks below; otherwise it is 0.
generate
for (i = 0; i < N_REGS; i++) begin
always @(*)
begin
// default: no match for this loop
pc_is_end_addr[i] = 1'b0;
if (current_pc_i == hwlp_end_addr_i[i]) begin
// any of the upper 30 bits set means the counter is at least 4
if (hwlp_counter_i[i][31:2] != 30'h0) begin
pc_is_end_addr[i] = 1'b1;
end else begin
// hwlp_counter_i[i][31:2] == 30'h0, so the counter is in the range 0..3
// and the decision depends on the two least significant bits only
case (hwlp_counter_i[i][1:0])
// counter == 3: always a match
2'b11: pc_is_end_addr[i] = 1'b1;
// counter == 2: a match only if hwlp_dec_cnt_id_i[i] is not set
// NOTE(review): hwlp_dec_cnt_id_i presumably flags a counter decrement
// that is still pending in the ID stage - confirm against the IF stage
2'b10: pc_is_end_addr[i] = ~hwlp_dec_cnt_id_i[i]; // only when there is nothing in flight
// counter == 1 or 0: never a match
2'b01, 2'b00: pc_is_end_addr[i] = 1'b0;
endcase
end
end
end
end
endgenerate
// select corresponding start address and decrement counter
always_comb
@ -82,4 +98,7 @@ module riscv_hwloop_controller
end
end
// output signal for ID stage
assign hwlp_jump_o = (|pc_is_end_addr);
endmodule

View file

@ -117,12 +117,11 @@ module riscv_hwloop_regs
end
else
begin
if (hwlp_we_i[2] == 1'b1) // potential contention problem here!
for (i = 0; i < N_REGS; i++)
begin
hwlp_counter_q[hwlp_regid_i] <= hwlp_cnt_data_i;
end else begin
for (i = 0; i < N_REGS; i++)
begin
if ((hwlp_we_i[2] == 1'b1) && (i == hwlp_regid_i)) begin
hwlp_counter_q[i] <= hwlp_cnt_data_i;
end else begin
if (hwlp_dec_cnt_i[i] && valid_i)
hwlp_counter_q[i] <= hwlp_counter_n[i];
end

View file

@ -35,8 +35,8 @@
module riscv_id_stage
#(
parameter N_HWLP_REGS = 2,
parameter N_HWLP_REG_BITS = $clog2(N_HWLP_REGS)
parameter N_HWLP = 2,
parameter N_HWLP_BITS = $clog2(N_HWLP)
)
(
input logic clk,
@ -49,9 +49,11 @@ module riscv_id_stage
output logic is_decoding_o,
// Interface to IF stage
input logic instr_valid_i,
input logic [31:0] instr_rdata_i, // comes from pipeline of IF stage
output logic instr_req_o,
input logic [N_HWLP-1:0] hwlp_dec_cnt_i,
input logic is_hwlp_i,
input logic instr_valid_i,
input logic [31:0] instr_rdata_i, // comes from pipeline of IF stage
output logic instr_req_o,
// Jumps and branches
@ -113,7 +115,9 @@ module riscv_id_stage
output logic [1:0] csr_op_ex_o,
// hwloop signals
output logic [31:0] hwloop_targ_addr_o,
output logic [N_HWLP-1:0] [31:0] hwlp_start_o,
output logic [N_HWLP-1:0] [31:0] hwlp_end_o,
output logic [N_HWLP-1:0] [31:0] hwlp_cnt_o,
// Interface to load store unit
output logic data_req_ex_o,
@ -196,6 +200,7 @@ module riscv_id_stage
// Immediate decoding and sign extension
logic [31:0] imm_i_type;
logic [31:0] imm_iz_type;
logic [31:0] imm_s_type;
logic [31:0] imm_sb_type;
logic [31:0] imm_u_type;
@ -255,23 +260,16 @@ module riscv_id_stage
logic data_req_id;
// hwloop signals
logic [N_HWLP_REG_BITS-1:0] hwloop_regid;
logic [2:0] hwloop_we;
logic hwloop_jump;
logic hwloop_target_mux_sel;
logic hwloop_start_mux_sel;
logic hwloop_cnt_mux_sel;
logic [N_HWLP_BITS-1:0] hwloop_regid;
logic [2:0] hwloop_we;
logic hwloop_target_mux_sel;
logic hwloop_start_mux_sel;
logic hwloop_cnt_mux_sel;
logic [31:0] hwloop_target;
logic [31:0] hwloop_start;
logic [31:0] hwloop_end;
logic [31:0] hwloop_cnt;
// hwloop reg signals
logic [N_HWLP_REGS-1:0] hwloop_dec_cnt;
logic [N_HWLP_REGS-1:0] [31:0] hwloop_start_addr;
logic [N_HWLP_REGS-1:0] [31:0] hwloop_end_addr;
logic [N_HWLP_REGS-1:0] [31:0] hwloop_counter;
logic [31:0] hwloop_target;
logic [31:0] hwloop_start;
logic [31:0] hwloop_end;
logic [31:0] hwloop_cnt;
// CSR control
logic csr_access;
@ -296,6 +294,7 @@ module riscv_id_stage
// immediate extraction and sign extension
assign imm_i_type = { {20 {instr[31]}}, instr[31:20] };
assign imm_iz_type = { 20'b0, instr[31:20] };
assign imm_s_type = { {20 {instr[31]}}, instr[31:25], instr[11:7] };
assign imm_sb_type = { {19 {instr[31]}}, instr[31], instr[7], instr[30:25], instr[11:8], 1'b0 };
assign imm_u_type = { instr[31:12], 12'b0 };
@ -320,7 +319,7 @@ module riscv_id_stage
// kill instruction in the IF/ID stage by setting the instr_valid_id control
// signal to 0 for instructions that are done
assign clear_instr_valid_o = id_ready_o;
assign clear_instr_valid_o = id_ready_o | halt_id;
assign branch_taken_ex = branch_in_ex_o & branch_decision_i;
@ -341,7 +340,7 @@ module riscv_id_stage
always_comb
begin
unique case (hwloop_target_mux_sel)
1'b0: hwloop_target = current_pc_id_i + imm_i_type;
1'b0: hwloop_target = current_pc_id_i + {imm_iz_type[30:0], 1'b0};
1'b1: hwloop_target = current_pc_id_i + {imm_z_type[30:0], 1'b0};
endcase
end
@ -362,7 +361,7 @@ module riscv_id_stage
always_comb
begin : hwloop_cnt_mux
unique case (hwloop_cnt_mux_sel)
1'b0: hwloop_cnt = imm_i_type;
1'b0: hwloop_cnt = imm_iz_type;
1'b1: hwloop_cnt = operand_a_fw_id;
endcase;
end
@ -658,9 +657,6 @@ module riscv_id_stage
.data_req_ex_i ( data_req_ex_o ),
.data_misaligned_i ( data_misaligned_i ),
// hwloop signals
.hwloop_jump_i ( hwloop_jump ),
// jump/branch control
.branch_taken_ex_i ( branch_taken_ex ),
.jump_in_id_i ( jump_in_id ),
@ -769,54 +765,32 @@ module riscv_id_stage
// //
//////////////////////////////////////////////////////////////////////////
riscv_hwloop_controller
#(
.N_REGS ( N_HWLP_REGS )
)
hwloop_controller_i
(
// from ID stage
.current_pc_i ( current_pc_if_i ),
// to IF stage/controller
.hwlp_jump_o ( hwloop_jump ),
.hwlp_targ_addr_o ( hwloop_targ_addr_o ),
// from hwloop_regs
.hwlp_start_addr_i ( hwloop_start_addr ),
.hwlp_end_addr_i ( hwloop_end_addr ),
.hwlp_counter_i ( hwloop_counter ),
// to hwloop_regs
.hwlp_dec_cnt_o ( hwloop_dec_cnt )
);
riscv_hwloop_regs
#(
.N_REGS ( N_HWLP_REGS )
.N_REGS ( N_HWLP )
)
hwloop_regs_i
(
.clk ( clk ),
.rst_n ( rst_n ),
.clk ( clk ),
.rst_n ( rst_n ),
// from ID
.hwlp_start_data_i ( hwloop_start ),
.hwlp_end_data_i ( hwloop_end ),
.hwlp_cnt_data_i ( hwloop_cnt ),
.hwlp_we_i ( hwloop_we ),
.hwlp_regid_i ( hwloop_regid ),
.hwlp_start_data_i ( hwloop_start ),
.hwlp_end_data_i ( hwloop_end ),
.hwlp_cnt_data_i ( hwloop_cnt ),
.hwlp_we_i ( hwloop_we ),
.hwlp_regid_i ( hwloop_regid ),
// from controller
.valid_i ( instr_valid_i ),
.valid_i ( instr_valid_i & is_hwlp_i ),
// to hwloop controller
.hwlp_start_addr_o ( hwloop_start_addr ),
.hwlp_end_addr_o ( hwloop_end_addr ),
.hwlp_counter_o ( hwloop_counter ),
.hwlp_start_addr_o ( hwlp_start_o ),
.hwlp_end_addr_o ( hwlp_end_o ),
.hwlp_counter_o ( hwlp_cnt_o ),
// from hwloop controller
.hwlp_dec_cnt_i ( hwloop_dec_cnt )
.hwlp_dec_cnt_i ( hwlp_dec_cnt_i )
);
@ -956,6 +930,7 @@ module riscv_id_stage
assign id_ready_o = ((~misaligned_stall) & (~jr_stall) & (~load_stall) & ex_ready_i);
assign id_valid_o = (~halt_id) & id_ready_o;
//----------------------------------------------------------------------------
// Assertions
//----------------------------------------------------------------------------
@ -964,4 +939,8 @@ module riscv_id_stage
assert property (
@(posedge clk) (branch_in_ex_o) |-> (branch_decision_i !== 1'bx) );
// the instruction delivered to the ID stage should always be valid
assert property (
@(posedge clk) (instr_valid_i & (~illegal_c_insn_i)) |-> (!$isunknown(instr_rdata_i)) );
endmodule

View file

@ -35,6 +35,7 @@
module riscv_if_stage
#(
parameter N_HWLP = 2,
parameter RDATA_WIDTH = 32
)
(
@ -55,12 +56,14 @@ module riscv_if_stage
input logic [RDATA_WIDTH-1:0] instr_rdata_i,
// Output of IF Pipeline stage
output logic instr_valid_id_o, // instruction in IF/ID pipeline is valid
output logic [31:0] instr_rdata_id_o, // read instruction is sampled and sent to ID stage for decoding
output logic is_compressed_id_o, // compressed decoder thinks this is a compressed instruction
output logic illegal_c_insn_id_o, // compressed decoder thinks this is an invalid instruction
output logic [31:0] current_pc_if_o,
output logic [31:0] current_pc_id_o,
output logic [N_HWLP-1:0] hwlp_dec_cnt_id_o, // currently served instruction was the target of a hwlp
output logic is_hwlp_id_o, // currently served instruction was the target of a hwlp
output logic instr_valid_id_o, // instruction in IF/ID pipeline is valid
output logic [31:0] instr_rdata_id_o, // read instruction is sampled and sent to ID stage for decoding
output logic is_compressed_id_o, // compressed decoder thinks this is a compressed instruction
output logic illegal_c_insn_id_o, // compressed decoder thinks this is an invalid instruction
output logic [31:0] current_pc_if_o,
output logic [31:0] current_pc_id_o,
// Forwarding ports - control signals
input logic clear_instr_valid_i, // clear instruction valid bit in IF/ID pipe
@ -75,7 +78,9 @@ module riscv_if_stage
input logic [31:0] jump_target_ex_i, // jump target address
// from hwloop controller
input logic [31:0] hwloop_target_i, // pc from hwloop start addr
input logic [N_HWLP-1:0] [31:0] hwlp_start_i, // hardware loop start addresses
input logic [N_HWLP-1:0] [31:0] hwlp_end_i, // hardware loop end addresses
input logic [N_HWLP-1:0] [31:0] hwlp_cnt_i, // hardware loop counters
// from debug unit
input logic [31:0] dbg_npc_i,
@ -93,46 +98,27 @@ module riscv_if_stage
);
// offset FSM
enum logic[1:0] {WAIT_ALIGNED, WAIT_UNALIGNED, IDLE } offset_fsm_cs, offset_fsm_ns;
enum logic[0:0] {WAIT, IDLE } offset_fsm_cs, offset_fsm_ns;
logic [1:0] is_compressed;
logic unaligned;
logic unaligned_jump;
logic valid;
logic valid;
// prefetch buffer related signals
logic prefetch_busy;
logic branch_req;
logic [31:0] fetch_addr_n;
logic prefetch_busy;
logic branch_req;
logic [31:0] fetch_addr_n;
logic fetch_valid;
logic fetch_ready;
logic [31:0] fetch_rdata;
logic [31:0] fetch_addr;
logic fetch_valid;
logic fetch_ready;
logic [31:0] fetch_rdata;
logic [31:0] fetch_addr;
logic is_hwlp_id_q, fetch_is_hwlp;
logic [31:0] exc_pc;
logic [31:0] instr_rdata_int;
logic [31:0] exc_pc;
// output data and PC mux
always_comb
begin
// default values for regular aligned access
current_pc_if_o = {fetch_addr[31:2], 2'b00};
instr_rdata_int = fetch_rdata;
if (unaligned) begin
current_pc_if_o = {fetch_addr[31:2], 2'b10};
end
end
// compressed instruction detection
assign is_compressed[0] = (fetch_rdata[1:0] != 2'b11);
assign is_compressed[1] = (fetch_rdata[17:16] != 2'b11);
// hardware loop related signals
logic hwlp_jump;
logic [31:0] hwlp_target;
logic [N_HWLP-1:0] hwlp_dec_cnt, hwlp_dec_cnt_if;
// exception PC selection mux
@ -166,7 +152,6 @@ module riscv_if_stage
`PC_BRANCH: fetch_addr_n = jump_target_ex_i;
`PC_EXCEPTION: fetch_addr_n = exc_pc; // set PC to exception handler
`PC_ERET: fetch_addr_n = exception_pc_reg_i; // PC is restored when returning from IRQ/exception
`PC_HWLOOP: fetch_addr_n = hwloop_target_i; // PC is taken from hwloop start addr
`PC_DBG_NPC: fetch_addr_n = dbg_npc_i; // PC is taken from debug unit
default: begin
@ -177,8 +162,6 @@ module riscv_if_stage
endcase
end
assign unaligned_jump = fetch_addr_n[1];
generate
if (RDATA_WIDTH == 32) begin : prefetch_32
// prefetch buffer, caches a fixed number of instructions
@ -188,14 +171,18 @@ module riscv_if_stage
.rst_n ( rst_n ),
.req_i ( 1'b1 ),
.branch_i ( branch_req ),
.addr_i ( {fetch_addr_n[31:2], 2'b00} ),
.unaligned_i ( unaligned ), // is the current address unaligned?
.branch_i ( branch_req ),
.addr_i ( fetch_addr_n ),
.hwloop_i ( hwlp_jump ),
.hwloop_target_i ( hwlp_target ),
.ready_i ( fetch_ready ),
.valid_o ( fetch_valid ),
.rdata_o ( fetch_rdata ),
.addr_o ( fetch_addr ),
.is_hwlp_o ( fetch_is_hwlp ),
// goes to instruction memory / instruction cache
.instr_req_o ( instr_req_o ),
@ -215,14 +202,18 @@ module riscv_if_stage
.rst_n ( rst_n ),
.req_i ( 1'b1 ),
.branch_i ( branch_req ),
.addr_i ( {fetch_addr_n[31:2], 2'b00} ),
.unaligned_i ( unaligned ), // is the current address unaligned?
.branch_i ( branch_req ),
.addr_i ( fetch_addr_n ),
.hwloop_i ( hwlp_jump ),
.hwloop_target_i ( hwlp_target ),
.ready_i ( fetch_ready ),
.valid_o ( fetch_valid ),
.rdata_o ( fetch_rdata ),
.addr_o ( fetch_addr ),
.is_hwlp_o ( fetch_is_hwlp ),
// goes to instruction memory / instruction cache
.instr_req_o ( instr_req_o ),
@ -257,55 +248,24 @@ module riscv_if_stage
branch_req = 1'b0;
valid = 1'b0;
unaligned = 1'b0;
unique case (offset_fsm_cs)
// no valid instruction data for ID stage
// assume aligned
IDLE: begin
if (req_i) begin
branch_req = 1'b1;
offset_fsm_ns = WAIT_ALIGNED;
offset_fsm_ns = WAIT;
end
end
// serving aligned 32 bit or 16 bit instruction, we don't know yet
WAIT_ALIGNED: begin
WAIT: begin
if (fetch_valid) begin
valid = 1'b1; // an instruction is ready for ID stage
if (req_i && if_valid_o) begin
if (~is_compressed[0]) begin
// 32 bit aligned instruction found
fetch_ready = 1'b1;
offset_fsm_ns = WAIT_ALIGNED;
end else begin
// 16 bit aligned instruction found
// next instruction will be unaligned
offset_fsm_ns = WAIT_UNALIGNED;
end
end
end
end
// serving unaligned 32 bit instruction
// next instruction might be 16 bit unaligned (no need to fetch)
// or 32 bit unaligned (need to fetch another word from cache)
WAIT_UNALIGNED: begin
unaligned = 1'b1;
if (fetch_valid) begin
valid = 1'b1; // an instruction is ready for ID stage
if (req_i && if_valid_o) begin
// next instruction will be aligned
fetch_ready = 1'b1;
if (is_compressed[0])
offset_fsm_ns = WAIT_ALIGNED;
else
offset_fsm_ns = WAIT_UNALIGNED;
offset_fsm_ns = WAIT;
end
end
end
@ -322,17 +282,38 @@ module riscv_if_stage
// switch to new PC from ID stage
branch_req = 1'b1;
if (unaligned_jump)
offset_fsm_ns = WAIT_UNALIGNED;
else
offset_fsm_ns = WAIT_ALIGNED;
offset_fsm_ns = WAIT;
end
end
// Hardware Loops
riscv_hwloop_controller
#(
.N_REGS ( N_HWLP )
)
hwloop_controller_i
(
.current_pc_i ( fetch_addr ),
assign if_busy_o = prefetch_busy;
.hwlp_jump_o ( hwlp_jump ),
.hwlp_targ_addr_o ( hwlp_target ),
assign perf_imiss_o = (~fetch_valid) | branch_req;
// from hwloop_regs
.hwlp_start_addr_i ( hwlp_start_i ),
.hwlp_end_addr_i ( hwlp_end_i ),
.hwlp_counter_i ( hwlp_cnt_i ),
// to hwloop_regs
.hwlp_dec_cnt_o ( hwlp_dec_cnt ),
.hwlp_dec_cnt_id_i ( hwlp_dec_cnt_id_o & {N_HWLP{is_hwlp_id_o}} )
);
assign current_pc_if_o = fetch_addr;
assign if_busy_o = prefetch_busy;
assign perf_imiss_o = (~fetch_valid) | branch_req;
// compressed instruction decoding, or more precisely compressed instruction
@ -346,12 +327,25 @@ module riscv_if_stage
riscv_compressed_decoder compressed_decoder_i
(
.instr_i ( instr_rdata_int ),
.instr_i ( fetch_rdata ),
.instr_o ( instr_decompressed ),
.is_compressed_o ( instr_compressed_int ),
.illegal_instr_o ( illegal_c_insn )
);
// prefetch -> IF registers
// Holds the hwlp_dec_cnt vector produced by the hwloop controller.
// The register is cleared by the asynchronous active-low reset and is only
// updated in cycles where hwlp_jump is asserted; it keeps its value otherwise.
always_ff @(posedge clk, negedge rst_n)
begin
if (rst_n == 1'b0)
begin
hwlp_dec_cnt_if <= '0;
end
else
begin
// capture the decrement vector that belongs to the hwloop jump being taken
if (hwlp_jump)
hwlp_dec_cnt_if <= hwlp_dec_cnt;
end
end
// IF-ID pipeline registers, frozen when the ID stage is stalled
always_ff @(posedge clk, negedge rst_n)
@ -363,6 +357,8 @@ module riscv_if_stage
illegal_c_insn_id_o <= 1'b0;
is_compressed_id_o <= 1'b0;
current_pc_id_o <= '0;
is_hwlp_id_q <= 1'b0;
hwlp_dec_cnt_id_o <= '0;
end
else
begin
@ -376,10 +372,16 @@ module riscv_if_stage
illegal_c_insn_id_o <= illegal_c_insn;
is_compressed_id_o <= instr_compressed_int;
current_pc_id_o <= current_pc_if_o;
is_hwlp_id_q <= fetch_is_hwlp;
if (fetch_is_hwlp)
hwlp_dec_cnt_id_o <= hwlp_dec_cnt_if;
end
end
end
assign is_hwlp_id_o = is_hwlp_id_q & instr_valid_id_o;
assign if_ready_o = valid & id_ready_i;
assign if_valid_o = (~halt_if_i) & if_ready_o;

View file

@ -99,6 +99,10 @@
`define INSTR_SRA { 7'b0100000, 10'b?, 3'b101, 5'b?, `OPCODE_OP }
`define INSTR_OR { 7'b0000000, 10'b?, 3'b110, 5'b?, `OPCODE_OP }
`define INSTR_AND { 7'b0000000, 10'b?, 3'b111, 5'b?, `OPCODE_OP }
`define INSTR_EXTHS { 7'b0001000, 10'b?, 3'b100, 5'b?, `OPCODE_OP } // pulp specific
`define INSTR_EXTHZ { 7'b0001000, 10'b?, 3'b101, 5'b?, `OPCODE_OP } // pulp specific
`define INSTR_EXTBS { 7'b0001000, 10'b?, 3'b110, 5'b?, `OPCODE_OP } // pulp specific
`define INSTR_EXTBZ { 7'b0001000, 10'b?, 3'b111, 5'b?, `OPCODE_OP } // pulp specific
// FENCE
`define INSTR_FENCE { 4'b0, 8'b?, 13'b0, `OPCODE_FENCE }
`define INSTR_FENCEI { 17'b0, 3'b001, 5'b0, `OPCODE_FENCE }
@ -302,7 +306,6 @@
`define PC_BRANCH 3'b011
`define PC_EXCEPTION 3'b100
`define PC_ERET 3'b101
`define PC_HWLOOP 3'b110
`define PC_DBG_NPC 3'b111
// Exception PC mux selector defines

View file

@ -32,15 +32,18 @@ module riscv_prefetch_L0_buffer
input logic rst_n,
input logic req_i,
input logic branch_i,
input logic ready_i,
input logic [31:0] addr_i,
input logic hwloop_i,
input logic [31:0] hwloop_target_i,
input logic ready_i,
output logic valid_o,
output logic [31:0] rdata_o,
output logic [31:0] addr_o,
input logic unaligned_i,
output logic is_hwlp_o, // is set when the currently served data is from a hwloop
// goes to instruction memory / instruction cache
output logic instr_req_o,
@ -53,86 +56,425 @@ module riscv_prefetch_L0_buffer
output logic busy_o
);
enum logic [2:0] {EMPTY, VALID_L0, WAIT_GNT, WAIT_RVALID, WAIT_ABORTED } CS, NS;
logic [31:0] current_address, last_address;
logic [1:0] pointer_cs, pointer_ns;
logic update_current_address;
logic busy_L0;
logic [3:0][31:0] L0_buffer;
logic [31:0] previous_chunk;
logic clear_buffer;
enum logic [2:0] { REGULAR, PREFETCH, LAST_BRANCH, LAST_BRANCH_WAIT, HWLP_WAIT_LAST, HWLP_FETCHING, HWLP_PREFETCH, HWLP_ABORT } prefetch_CS, prefetch_NS;
logic do_prefetch;
logic [31:0] addr_q, addr_n, addr_int, addr_aligned_next;
logic valid_L0;
logic ready_L0;
logic is_prefetch_q, is_prefetch_n;
logic [31:0] rdata_last_q;
logic valid_L0;
logic [RDATA_IN_WIDTH/32-1:0][31:0] rdata_L0;
logic [31:0] addr_L0;
// prepared data for output
logic [31:0] rdata, unaligned_rdata;
logic valid, unaligned_valid;
logic [31:0] rdata, rdata_unaligned;
logic valid, valid_unaligned;
logic aligned_is_compressed, unaligned_is_compressed;
logic fetching_hwlp;
logic hwlp_inhibit;
logic prefetch_important;
assign busy_o = (CS != EMPTY && CS != VALID_L0) || instr_req_o;
prefetch_L0_buffer_L0
#(
.RDATA_IN_WIDTH ( RDATA_IN_WIDTH )
)
L0_buffer_i
(
.clk ( clk ),
.rst_n ( rst_n ),
.prefetch_i ( do_prefetch ),
.prefetch_important_i ( prefetch_important ),
.prefetch_addr_i ( addr_aligned_next ),
.branch_i ( branch_i ),
.branch_addr_i ( addr_i ),
.hwlp_i ( hwloop_i & (~hwlp_inhibit) ),
.hwlp_addr_i ( hwloop_target_i ),
.hwlp_fetching_o ( fetching_hwlp ),
.valid_o ( valid_L0 ),
.rdata_o ( rdata_L0 ),
.addr_o ( addr_L0 ),
.instr_req_o ( instr_req_o ),
.instr_addr_o ( instr_addr_o ),
.instr_gnt_i ( instr_gnt_i ),
.instr_rvalid_i ( instr_rvalid_i ),
.instr_rdata_i ( instr_rdata_i ),
.busy_o ( busy_L0 )
);
always_ff @(posedge clk or negedge rst_n)
assign rdata = ((prefetch_CS == PREFETCH) | (prefetch_CS == HWLP_WAIT_LAST) | (prefetch_CS == HWLP_PREFETCH) | (prefetch_CS == LAST_BRANCH_WAIT)) ? rdata_last_q : rdata_L0[addr_o[3:2]];
assign valid = ( ((prefetch_CS == PREFETCH) | (prefetch_CS == HWLP_WAIT_LAST) | (prefetch_CS == HWLP_PREFETCH)) | valid_L0) & (prefetch_CS != HWLP_ABORT);
// the lower part of rdata_unaligned is always the higher part of rdata
assign rdata_unaligned[15:0] = rdata[31:16];
always_comb
begin
if (~rst_n)
begin
CS <= EMPTY;
current_address <= '0;
last_address <= '0;
pointer_cs <= '0;
is_prefetch_q <= 1'b0;
end
else
begin
CS <= NS;
valid_unaligned = 1'b0;
if (branch_i)
begin
current_address <= {addr_i[31:4],4'b0000};
pointer_cs <= addr_i[3:2];
is_prefetch_q <= 1'b0;
end
else
begin
if (update_current_address) begin
last_address <= current_address;
current_address <= current_address + 5'h10; // jump to the next cache line
end
if (valid_L0) begin
case(addr_o[3:2])
2'b00: begin rdata_unaligned[31:16] = rdata_L0[1][15:0]; valid_unaligned = 1'b1; end
2'b01: begin rdata_unaligned[31:16] = rdata_L0[2][15:0]; valid_unaligned = 1'b1; end
2'b10: begin rdata_unaligned[31:16] = rdata_L0[3][15:0]; valid_unaligned = 1'b1; end
// this state is only interesting if we have already done a prefetch
2'b11: begin
rdata_unaligned[31:16] = rdata_L0[0][15:0];
if (ready_i)
is_prefetch_q <= 1'b0;
else
is_prefetch_q <= is_prefetch_n;
pointer_cs <= pointer_ns;
end
if ((prefetch_CS == PREFETCH) | (prefetch_CS == HWLP_PREFETCH)) begin
valid_unaligned = 1'b1;
end else begin
valid_unaligned = 1'b0;
end
end
endcase // addr_o
end
end
assign unaligned_is_compressed = rdata[17:16] != 2'b11;
assign aligned_is_compressed = rdata[1:0] != 2'b11;
assign addr_aligned_next = { addr_o[31:2], 2'b00 } + 32'h4;
always_comb
begin
valid = 1'b0;
valid_L0 = 1'b0;
pointer_ns = pointer_cs;
instr_req_o = 1'b0;
instr_addr_o = (branch_i) ? addr_i : current_address + 5'h10;
update_current_address = 1'b0;
clear_buffer = 1'b0;
is_prefetch_n = is_prefetch_q;
addr_int = addr_o;
// advance address when pipeline is unstalled
if (ready_i) begin
if (addr_o[1]) begin
// unaligned case
// always move to next entry in the FIFO
if (unaligned_is_compressed) begin
addr_int = { addr_aligned_next[31:2], 2'b00};
end else begin
addr_int = { addr_aligned_next[31:2], 2'b10};
end
end else begin
// aligned case
if (aligned_is_compressed) begin
// just increase address, do not move to next entry in the FIFO
addr_int = { addr_o[31:2], 2'b10 };
end else begin
// move to next entry in the FIFO
addr_int = { addr_aligned_next[31:2], 2'b00 };
end
end
end
end
always_comb
begin
do_prefetch = 1'b0;
prefetch_NS = prefetch_CS;
addr_n = addr_int;
case (prefetch_CS)
REGULAR: begin
if (fetching_hwlp) begin
if (ready_i) begin
addr_n = hwloop_target_i;
prefetch_NS = HWLP_FETCHING;
end
else
prefetch_NS = HWLP_WAIT_LAST;
end else if (addr_o[3:2] == 2'b11) begin
if ((~addr_o[1]) & aligned_is_compressed & valid)
// we are serving a compressed instruction
prefetch_NS = PREFETCH;
else begin
if (ready_i)
prefetch_NS = REGULAR;
else if (valid_L0)
prefetch_NS = PREFETCH;
end
end
// actually only needed when neither branch_i nor fetching_hwlp is set, but
// if we kept those as conditions, we would generate a combinational loop
if (addr_o[3:2] == 2'b11)
do_prefetch = 1'b1;
end
// we are doing a prefetch
// we save the last word of the L0 buffer and already preload the L0
// buffer with new stuff
PREFETCH: begin
if (fetching_hwlp) begin
if (ready_i) begin
addr_n = hwloop_target_i;
prefetch_NS = HWLP_FETCHING;
end
else
prefetch_NS = HWLP_WAIT_LAST;
end else if (ready_i) begin
if (hwloop_i) begin
addr_n = addr_q;
prefetch_NS = HWLP_ABORT;
end else begin
if ((~addr_o[1]) & aligned_is_compressed)
// we are serving a compressed instruction
prefetch_NS = PREFETCH;
else
prefetch_NS = REGULAR;
end
end
end
// we have branched into the last word of the L0 buffer, so we have to
// prefetch the next cache line as soon as we got this one
LAST_BRANCH: begin
do_prefetch = 1'b1;
if (valid_L0) begin
if (fetching_hwlp) begin
if (ready_i) begin
addr_n = hwloop_target_i;
prefetch_NS = HWLP_FETCHING;
end
else
prefetch_NS = HWLP_WAIT_LAST;
end
else if ( ((~addr_o[1]) & aligned_is_compressed) | (addr_o[1] & (~unaligned_is_compressed)) )
// we are serving a compressed instruction or an instruction that
// spans two cache lines
prefetch_NS = PREFETCH;
else if (ready_i)
prefetch_NS = REGULAR;
else
prefetch_NS = LAST_BRANCH_WAIT;
end
end
LAST_BRANCH_WAIT: begin
if (ready_i)
prefetch_NS = REGULAR;
end
// wait for last instruction to be delivered before going to hwloop
HWLP_WAIT_LAST: begin
if (ready_i) begin
addr_n = addr_L0; // use address that was saved in L0 buffer
prefetch_NS = HWLP_FETCHING;
end
end
HWLP_FETCHING: begin
if (valid_L0) begin
if (addr_o[3:2] == 2'b11) begin
do_prefetch = 1'b1;
if ((~addr_o[1]) & aligned_is_compressed)
// we are serving a compressed instruction
prefetch_NS = HWLP_PREFETCH;
else begin
if (ready_i)
prefetch_NS = REGULAR;
else
prefetch_NS = HWLP_PREFETCH;
end
end else begin
if (ready_i) begin
prefetch_NS = REGULAR;
end
end
end
end
HWLP_PREFETCH: begin
if (ready_i) begin
prefetch_NS = REGULAR;
end
end
HWLP_ABORT: begin
if (fetching_hwlp) begin
prefetch_NS = HWLP_FETCHING;
addr_n = hwloop_target_i;
end
end
endcase
// branches always have priority
if (branch_i) begin
addr_n = addr_i;
if (addr_i[3:2] == 2'b11)
prefetch_NS = LAST_BRANCH;
else
prefetch_NS = REGULAR;
end
end
// do not abort an important prefetch for a hardware loop
//assign prefetch_important = (((addr_q[3:1] == 3'b111) & (~unaligned_is_compressed)) | (addr_q[3:2] == 2'b00)) & do_prefetch;
assign prefetch_important = 1'b0;
assign hwlp_inhibit = (prefetch_CS == HWLP_WAIT_LAST) | (prefetch_CS == HWLP_FETCHING) | (prefetch_CS == HWLP_PREFETCH);
//////////////////////////////////////////////////////////////////////////////
// registers
//////////////////////////////////////////////////////////////////////////////
// State registers of the prefetcher: FSM state, current fetch address and the
// last word that was saved for later use. Asynchronous, active-low reset.
always_ff @(posedge clk, negedge rst_n)
begin
  if (!rst_n) begin
    prefetch_CS  <= REGULAR;
    addr_q       <= '0;
    rdata_last_q <= '0;
  end else begin
    prefetch_CS <= prefetch_NS;
    addr_q      <= addr_n;

    // Saved word: a hardware-loop fetch has priority and stores the word that
    // is currently driven on the output; otherwise a prefetch stores the
    // aligned word. With neither active the previous value is held.
    if (fetching_hwlp)
      rdata_last_q <= rdata_o;
    else if (do_prefetch)
      rdata_last_q <= rdata;
  end
end
//////////////////////////////////////////////////////////////////////////////
// output ports
//////////////////////////////////////////////////////////////////////////////
// The served address is simply the registered fetch address.
assign addr_o    = addr_q;

// The unaligned word is selected only for an address on an upper halfword and
// never while the FSM sits in HWLP_WAIT_LAST; in all other cases the aligned
// word is served.
assign rdata_o   = (addr_o[1] & (prefetch_CS != HWLP_WAIT_LAST)) ? rdata_unaligned : rdata;

// A 32-bit instruction starting on an upper halfword needs the unaligned
// valid; everything else uses the aligned valid.
assign valid_o   = (addr_o[1] & (~unaligned_is_compressed)) ? valid_unaligned : valid;

// Valid data that is served while the FSM is in HWLP_FETCHING or
// HWLP_PREFETCH is flagged as coming from a hardware loop.
assign is_hwlp_o = valid_o & ((prefetch_CS == HWLP_PREFETCH) | (prefetch_CS == HWLP_FETCHING));

// Busy status is forwarded unchanged from busy_L0.
assign busy_o    = busy_L0;
//----------------------------------------------------------------------------
// Assertions
//----------------------------------------------------------------------------
// Handshake sanity check, sampled on every rising clock edge: whenever
// ready_i is asserted, valid_o has to be asserted in the same cycle.
// A violation only produces a warning.
assert property ( @(posedge clk) ready_i |-> valid_o )
  else $warning("IF Stage is ready without prefetcher having valid data");
endmodule // prefetch_L0_buffer
// L0 line buffer: issues line requests to the instruction memory / cache and
// keeps the most recently returned line in a register.
//
// Three request sources exist; when several are active in the same cycle the
// requested address is chosen with the priority branch > hardware loop >
// prefetch.
module prefetch_L0_buffer_L0
#(
// Width in bits of one line returned by the instruction memory.
// NOTE(review): the internal buffer is declared with four 32-bit words and
// request addresses are aligned to 16 bytes, so values other than 128 look
// unsupported - confirm before overriding.
parameter RDATA_IN_WIDTH = 128
)
(
input logic clk,
// asynchronous reset, active low
input logic rst_n,
// prefetch request (lowest priority) and its address
input logic prefetch_i,
// while set, a hardware loop request seen in the EMPTY state does not get
// address priority over the prefetch address
input logic prefetch_important_i,
input logic [31:0] prefetch_addr_i,
// branch request (highest priority) and its target; also masks valid_o
input logic branch_i,
input logic [31:0] branch_addr_i,
// hardware loop request and its target address
input logic hwlp_i,
input logic [31:0] hwlp_addr_i,
// high in the cycles in which the hardware loop request is taken by the FSM
output logic hwlp_fetching_o,
// line data is available and no branch is requested in this cycle
output logic valid_o,
// incoming line while instr_rvalid_i is set, otherwise the stored line
output logic [RDATA_IN_WIDTH/32-1:0][31:0] rdata_o,
// address of the last request that was registered (not line aligned)
output logic [31:0] addr_o,
// goes to instruction memory / instruction cache
output logic instr_req_o,
// selected request address with the lowest four bits forced to zero
output logic [31:0] instr_addr_o,
input logic instr_gnt_i,
input logic instr_rvalid_i,
input logic [RDATA_IN_WIDTH/32-1:0][31:0] instr_rdata_i,
// set while a request is outstanding or is being issued in this cycle
output logic busy_o
);
// FSM states of the line buffer; CS is the registered state, NS the next state.
enum logic [2:0] { EMPTY, VALID_L0, WAIT_GNT, WAIT_RVALID, ABORTED_BRANCH, WAIT_HWLOOP } CS, NS;
// Storage for one fetched line.
// NOTE(review): hard-coded to four words while the data ports are sized by
// RDATA_IN_WIDTH/32 - the widths only match for RDATA_IN_WIDTH = 128; confirm
// that no other width is intended.
logic [3:0][31:0] L0_buffer;
// addr_q: registered request address; instr_addr_int: address selected by the
// FSM in the current cycle
logic [31:0] addr_q, instr_addr_int;
logic valid;
// Combinational flag: defaults to hwlp_i and is cleared by the FSM in the
// cycle in which it takes the hardware loop request.
logic hwlp_pending_n;
// High when a hardware loop request is present and the FSM takes it in this
// cycle (hwlp_pending_n cleared). This is a purely combinational comparison of
// the request against the pending flag; no registered previous value is used.
assign hwlp_fetching_o = (~hwlp_pending_n) & (hwlp_i);
//////////////////////////////////////////////////////////////////////////////
// FSM
//////////////////////////////////////////////////////////////////////////////
always_comb
begin
NS = CS;
valid = 1'b0;
instr_req_o = 1'b0;
instr_addr_int = 'x;
hwlp_pending_n = hwlp_i;
case(CS)
// wait for the first branch request before fetching any instructions
EMPTY:
begin
instr_req_o = branch_i;
if (branch_i)
instr_addr_int = branch_addr_i;
else if (hwlp_i & (~prefetch_important_i)) begin
instr_addr_int = hwlp_addr_i;
hwlp_pending_n = 1'b0;
end
else
instr_addr_int = prefetch_addr_i;
if (branch_i) // make the request to icache
if (branch_i | hwlp_i | prefetch_i) // make the request to icache
begin
instr_req_o = 1'b1;
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end
end //~EMPTY
WAIT_GNT:
begin
if (branch_i)
instr_addr_int = branch_addr_i;
else
instr_addr_int = addr_q;
if (branch_i)
begin
instr_req_o = 1'b1;
if (instr_gnt_i)
NS = WAIT_RVALID;
@ -141,49 +483,68 @@ module riscv_prefetch_L0_buffer
end
else
begin
NS = EMPTY;
instr_req_o = 1'b1;
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end
end //~EMPTY
end //~WAIT_GNT
WAIT_RVALID:
begin
if (branch_i) // there is a pending branch
begin
instr_addr_o = {addr_i[31:4],4'b0000};
valid = instr_rvalid_i;
if (branch_i)
instr_addr_int = branch_addr_i;
else if (hwlp_i)
instr_addr_int = hwlp_addr_i;
else
instr_addr_int = prefetch_addr_i;
if (branch_i)
begin
if (instr_rvalid_i)
begin
instr_req_o = 1'b1;
instr_req_o = 1'b1;
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end
else
begin
NS = WAIT_ABORTED;
end else begin
NS = ABORTED_BRANCH; // TODO: THIS STATE IS IDENTICAL WITH THIS ONE
end
end
else // else (branch_i)
else if (hwlp_i)
begin
valid = instr_rvalid_i;
// prepare address even if we don't need it
// this removes the dependency for instr_addr_o on instr_rvalid_i
instr_addr_o = current_address + 5'h10;
if (instr_rvalid_i)
begin
instr_req_o = 1'b1;
hwlp_pending_n = 1'b0;
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end else begin
NS = WAIT_HWLOOP;
end
end
else
begin
if (instr_rvalid_i)
begin
if (&pointer_cs) // we are receiving the last packet, then prefetch the next one
if (prefetch_i) // we are receiving the last packet, then prefetch the next one
begin
is_prefetch_n = 1'b1;
instr_req_o = 1'b1;
pointer_ns = '0;
update_current_address = 1'b1;
instr_req_o = 1'b1;
if (instr_gnt_i)
NS = WAIT_RVALID;
@ -193,215 +554,125 @@ module riscv_prefetch_L0_buffer
else // not the last chunk
begin
NS = VALID_L0;
if (ready_L0)
pointer_ns = pointer_cs + 1'b1;
else
pointer_ns = pointer_cs;
end
end
else // still wait instr_rvalid_i
begin
NS = WAIT_RVALID;
end
end
end //~WAIT_RVALID
VALID_L0:
begin
valid = 1'b1;
valid_L0 = 1'b1;
valid = 1'b1;
if (branch_i)
instr_addr_int = branch_addr_i;
else if (hwlp_i) begin
instr_addr_int = hwlp_addr_i;
hwlp_pending_n = 1'b0;
end
else
instr_addr_int = prefetch_addr_i;
if (branch_i | hwlp_i | prefetch_i)
begin
instr_req_o = 1'b1;
instr_addr_o = {addr_i[31:4],4'b0000};
instr_req_o = 1'b1;
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end
else
begin
if ( &pointer_cs ) // we are dispathing the last packet, therefore prefetch the next cache line
begin
is_prefetch_n = 1'b1;
instr_req_o = 1'b1;
instr_addr_o = current_address + 5'h10;
pointer_ns = '0;
update_current_address = 1'b1;
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end
else
begin
if (ready_L0)
begin
pointer_ns = pointer_cs + 1'b1;
end
NS = VALID_L0;
end
end
end //~VALID_L0
WAIT_GNT:
ABORTED_BRANCH:
begin
if (branch_i)
begin
instr_req_o = 1'b1;
instr_addr_o = {addr_i[31:4],4'b0000};
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end
else
begin
instr_req_o = 1'b1;
instr_addr_o = current_address; // has been previously updated
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end
end //~WAIT_GNT
WAIT_ABORTED:
begin
clear_buffer = 1'b1;
// prepare address even if we don't need it
// this removes the dependency for instr_addr_o on instr_rvalid_i
instr_addr_o = current_address;
if (branch_i)
instr_addr_int = branch_addr_i;
else
instr_addr_int = addr_q;
if (instr_rvalid_i)
begin
instr_req_o = 1'b1;
instr_req_o = 1'b1;
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end
end //~ABORTED_BRANCH
WAIT_HWLOOP:
begin
valid = instr_rvalid_i;
// prepare address even if we don't need it
// this removes the dependency for instr_addr_o on instr_rvalid_i
if (branch_i)
instr_addr_int = branch_addr_i;
else
instr_addr_int = addr_q;
if (instr_rvalid_i)
begin
NS = WAIT_ABORTED;
hwlp_pending_n = 1'b0;
instr_req_o = 1'b1;
if (instr_gnt_i)
NS = WAIT_RVALID;
else
NS = WAIT_GNT;
end
end //~WAIT_ABORTED
end //~ABORTED_HWLOOP
default:
begin
NS = EMPTY;
clear_buffer = 1'b1;
end
endcase //~CS
end
// rdata mux, either directly use the incoming data or the saved data in
// L0/previous_chunk
always_comb
begin
if (is_prefetch_q)
begin
rdata = previous_chunk;
addr_o = { last_address[31:4], 2'b11, 2'b00 };
end
else
begin
if (valid_L0) begin
rdata = L0_buffer[pointer_cs];
addr_o = { current_address[31:4], pointer_cs, 2'b00 };
end
else
begin
rdata = instr_rdata_i[pointer_cs];
addr_o = { current_address[31:4], pointer_cs, 2'b00 };
end
end
end
//////////////////////////////////////////////////////////////////////////////
// registers
//////////////////////////////////////////////////////////////////////////////
// the lower part of unaligned_rdata is always the higher part of rdata
assign unaligned_rdata[15:0] = rdata[31:16];
always_comb
begin
if (valid_L0) begin
case(addr_o[3:2])
2'b00: begin unaligned_rdata[31:16] = L0_buffer[1][15:0]; unaligned_valid = 1'b1; end
2'b01: begin unaligned_rdata[31:16] = L0_buffer[2][15:0]; unaligned_valid = 1'b1; end
2'b10: begin unaligned_rdata[31:16] = L0_buffer[3][15:0]; unaligned_valid = 1'b1; end
// this state is only interesting if we have already done a prefetch
2'b11: begin
unaligned_rdata[31:16] = L0_buffer[0][15:0];
if (is_prefetch_q) begin
unaligned_valid = 1'b1;
end else begin
unaligned_valid = 1'b0;
end
end
endcase // addr_o
end else begin
// L0 buffer is not valid, so we can take the data directly from the
// icache
case(addr_o[3:2])
2'b00: begin unaligned_rdata[31:16] = instr_rdata_i[1][15:0]; unaligned_valid = instr_rvalid_i; end
2'b01: begin unaligned_rdata[31:16] = instr_rdata_i[2][15:0]; unaligned_valid = instr_rvalid_i; end
2'b10: begin unaligned_rdata[31:16] = instr_rdata_i[3][15:0]; unaligned_valid = instr_rvalid_i; end
2'b11:
begin
unaligned_rdata[31:16] = instr_rdata_i[0][15:0];
if (is_prefetch_q)
unaligned_valid = instr_rvalid_i;
else
unaligned_valid = 1'b0;
end
endcase // pointer_cs
end
end
assign ready_L0 = (is_prefetch_q) ? 1'b0 : ready_i;
always_ff @(posedge clk or negedge rst_n)
always_ff @(posedge clk, negedge rst_n)
begin
if (~rst_n)
begin
L0_buffer <= '0;
previous_chunk <= '0;
CS <= EMPTY;
L0_buffer <= '0;
addr_q <= '0;
end
else
begin
CS <= NS;
if (instr_rvalid_i)
begin
L0_buffer <= instr_rdata_i;
end
// update previous chunk only when we are doing a prefetch
// do this only once per prefetch
if (is_prefetch_n && (~is_prefetch_q))
begin
previous_chunk <= (valid_L0) ? L0_buffer[3][31:0] : instr_rdata_i[3][31:0];
end
if (branch_i | hwlp_i | prefetch_i)
addr_q <= instr_addr_int;
end
end
//////////////////////////////////////////////////////////////////////////////
// instruction aligner (if unaligned)
// output ports
//////////////////////////////////////////////////////////////////////////////
assign rdata_o = unaligned_i ? unaligned_rdata : rdata;
assign valid_o = unaligned_i ? unaligned_valid : valid;
assign instr_addr_o = { instr_addr_int[31:4], 4'b0000 };
endmodule // prefetch_L0_buffer
// Address of the last registered request.
assign addr_o  = addr_q;

// Bypass: forward the incoming line in the cycle it arrives, otherwise serve
// the stored copy.
assign rdata_o = (instr_rvalid_i) ? instr_rdata_i : L0_buffer;

// A branch request in the same cycle masks the valid flag.
assign valid_o = (~branch_i) & valid;

// Busy in every state other than EMPTY and VALID_L0, and also whenever a
// request is being issued in this cycle.
assign busy_o  = ((CS != EMPTY) && (CS != VALID_L0)) || instr_req_o;
endmodule

View file

@ -32,9 +32,8 @@ module riscv_fetch_fifo
input logic rst_n,
// control signals
input logic clear_i, // clears the contents of the fifo
input logic unaligned_i, // is the current output rdata unaligned
input logic branch_i, // clears the contents of the fifo
input logic hwloop_i, // tries to insert an entry above the first one
// input port
input logic in_addr_valid_i,
@ -51,11 +50,7 @@ module riscv_fetch_fifo
input logic out_ready_i,
output logic [31:0] out_rdata_o,
output logic [31:0] out_addr_o,
output logic out_unaligned_valid_o,
output logic [31:0] out_unaligned_rdata_o,
output logic out_is_unaligned_o
output logic out_is_hwlp_o
);
localparam DEPTH = 3; // must be 2 or greater
@ -65,25 +60,57 @@ module riscv_fetch_fifo
logic [0:DEPTH-1] addr_valid_n, addr_valid_int, addr_valid_Q;
logic [0:DEPTH-1] [31:0] rdata_n, rdata_int, rdata_Q;
logic [0:DEPTH-1] rdata_valid_n, rdata_valid_int, rdata_valid_Q;
logic is_unaligned_n, is_unaligned_Q;
logic [0:1 ] is_hwlp_n, is_hwlp_int, is_hwlp_Q;
logic [31:0] rdata, rdata_unaligned;
logic valid, valid_unaligned;
logic aligned_is_compressed, unaligned_is_compressed;
logic hwlp_inbound;
//////////////////////////////////////////////////////////////////////////////
// output port
//////////////////////////////////////////////////////////////////////////////
// output assignments
assign out_rdata_o = (rdata_valid_Q[0]) ? rdata_Q[0] : in_rdata_i;
assign out_addr_o = addr_Q[0]; // always output addr directly since we sent it one cycle earlier to the FIFO
assign out_valid_o = (rdata_valid_Q[0] || (addr_valid_Q[0] && in_rdata_valid_i));
assign rdata = (rdata_valid_Q[0]) ? rdata_Q[0] : in_rdata_i;
assign valid = (rdata_valid_Q[0] || (addr_valid_Q[0] && in_rdata_valid_i));
assign out_unaligned_rdata_o = (rdata_valid_Q[1]) ? {rdata_Q[1][15:0], out_rdata_o[31:16]} : {in_rdata_i[15:0], out_rdata_o[31:16]};
assign rdata_unaligned = (rdata_valid_Q[1]) ? {rdata_Q[1][15:0], rdata[31:16]} : {in_rdata_i[15:0], rdata[31:16]};
// it is implied that rdata_valid_Q[0] is set
assign out_unaligned_valid_o = (rdata_valid_Q[1] || (addr_valid_Q[1] && in_rdata_valid_i));
assign valid_unaligned = (rdata_valid_Q[1] || (addr_valid_Q[1] && in_rdata_valid_i));
assign out_is_unaligned_o = is_unaligned_Q;
// Compressed-instruction detection on the word at the head of the FIFO: an
// instruction is compressed when its two lowest bits are not 2'b11.
// aligned:   instruction starts in the lower halfword (bits 1:0)
// unaligned: instruction starts in the upper halfword (bits 17:16)
assign aligned_is_compressed   = rdata[1:0]   != 2'b11;
assign unaligned_is_compressed = rdata[17:16] != 2'b11;
//////////////////////////////////////////////////////////////////////////////
// instruction aligner (if unaligned)
//////////////////////////////////////////////////////////////////////////////
// Instruction aligner: selects the word and the valid flag that are presented
// at the FIFO output.
//
//  - unaligned path: the output address is on an upper halfword
//    (out_addr_o[1] set) and the second entry is not a hardware loop target
//  - aligned path:   everything else
always_comb
begin
  // serve the aligned case even though the output address is unaligned when
  // the next instruction will be from a hardware loop target
  // in this case the current instruction is already prealigned in element 0
  if (out_addr_o[1] && (~is_hwlp_Q[1])) begin
    // unaligned case
    out_rdata_o = rdata_unaligned;

    if (unaligned_is_compressed)
      // A 16-bit instruction sits entirely in the upper half of entry 0, so
      // it is valid exactly when entry 0 holds valid data. Driving a constant
      // 1 here would signal valid while the FIFO is still waiting for that
      // data (rdata is then just the raw memory bus), e.g. right after a
      // branch to a halfword-aligned address.
      out_valid_o = valid;
    else
      // A 32-bit instruction additionally needs the lower half of entry 1
      out_valid_o = valid_unaligned;
  end else begin
    // aligned case
    out_rdata_o = rdata;
    out_valid_o = valid;
  end
end
assign out_addr_o = addr_Q[0]; // always output addr directly since we sent it one cycle earlier to the FIFO
assign out_is_hwlp_o = is_hwlp_Q[0];
//////////////////////////////////////////////////////////////////////////////
@ -91,7 +118,7 @@ module riscv_fetch_fifo
//////////////////////////////////////////////////////////////////////////////
// we accept addresses as long as our fifo is not full or we are cleared
assign in_addr_ready_o = clear_i || (~addr_valid_Q[DEPTH-1]);
assign in_addr_ready_o = branch_i || (~addr_valid_Q[DEPTH-1]);
// we accept data as long as our fifo is not full
// we don't care about clear here as the data will be received one cycle
@ -111,6 +138,9 @@ module riscv_fetch_fifo
end
end
// A hardware loop request is taken only while the second FIFO entry is not
// already marked as a hardware loop entry.
assign hwlp_inbound = (~is_hwlp_Q[1]) & hwloop_i;
//////////////////////////////////////////////////////////////////////////////
// FIFO management
//////////////////////////////////////////////////////////////////////////////
@ -120,6 +150,7 @@ module riscv_fetch_fifo
begin
addr_int = addr_Q;
addr_valid_int = addr_valid_Q;
is_hwlp_int = is_hwlp_Q;
if (in_addr_valid_i && in_addr_ready_o) begin
for(j = 0; j < DEPTH; j++) begin
@ -131,6 +162,14 @@ module riscv_fetch_fifo
end
end
end
// on a hardware loop invalidate everything starting from the second entry
if (hwlp_inbound) begin
addr_int[1] = in_addr_i;
addr_valid_int[1] = 1'b1;
addr_valid_int[2:DEPTH-1] = '0;
is_hwlp_int[1] = 1'b1;
end
end
int k;
@ -149,28 +188,65 @@ module riscv_fetch_fifo
end
end
end
// on a hardware loop invalidate everything starting from the second entry
if (hwlp_inbound) begin
rdata_int[0] = out_rdata_o; // save current output in rdata_int[0], so that we have it available even though we override entry #1
rdata_valid_int[1:DEPTH-1] = '0;
end
end
// move everything by one step
always_comb
begin
addr_n = addr_int;
addr_valid_n = addr_valid_int;
rdata_n = rdata_int;
rdata_valid_n = rdata_valid_int;
is_unaligned_n = is_unaligned_Q;
addr_n = addr_int;
addr_valid_n = addr_valid_int;
rdata_n = rdata_int;
rdata_valid_n = rdata_valid_int;
is_hwlp_n = is_hwlp_int;
if (out_ready_i && out_valid_o) begin
addr_n = {addr_int[1:DEPTH-1], 32'b0};
addr_valid_n = {addr_valid_int[1:DEPTH-1], 1'b0};
rdata_n = {rdata_int[1:DEPTH-1], 32'b0};
rdata_valid_n = {rdata_valid_int[1:DEPTH-1], 1'b0};
is_unaligned_n = 1'b0;
end else begin
if (out_unaligned_valid_o && unaligned_i && (~is_unaligned_Q)) begin
// are we unaligned? then assemble the last word from the two halves
rdata_n[0] = out_unaligned_rdata_o;
is_unaligned_n = 1'b1;
// now take care of the addresses
if (is_hwlp_int[1]) begin
// hardware loop found in second entry
addr_n = {addr_int[1][31:0], addr_int[2:DEPTH-1], 32'b0};
addr_valid_n = {addr_valid_int[1:DEPTH-1], 1'b0};
rdata_n = {rdata_int[1:DEPTH-1], 32'b0};
rdata_valid_n = {rdata_valid_int[1:DEPTH-1], 1'b0};
is_hwlp_n = {is_hwlp_int[1], 1'b0};
end else begin
if (addr_Q[0][1]) begin
// unaligned case
if (unaligned_is_compressed) begin
addr_n = {{addr_int[1][31:2], 2'b00}, addr_int[2:DEPTH-1], 32'b0};
end else begin
addr_n = {{addr_int[1][31:2], 2'b10}, addr_int[2:DEPTH-1], 32'b0};
end
addr_valid_n = {addr_valid_int[1:DEPTH-1], 1'b0};
rdata_n = {rdata_int[1:DEPTH-1], 32'b0};
rdata_valid_n = {rdata_valid_int[1:DEPTH-1], 1'b0};
is_hwlp_n = {is_hwlp_int[1], 1'b0};
end else begin
// aligned case
if (aligned_is_compressed) begin
// just increase address, do not move to next entry in FIFO
addr_n[0] = {addr_int[0][31:2], 2'b10};
is_hwlp_n[0] = 1'b0; // invalidate hwlp bit for current address
end else begin
// move to next entry in FIFO
addr_n = {{addr_int[1][31:2], 2'b00}, addr_int[2:DEPTH-1], 32'b0};
addr_valid_n = {addr_valid_int[1:DEPTH-1], 1'b0};
rdata_n = {rdata_int[1:DEPTH-1], 32'b0};
rdata_valid_n = {rdata_valid_int[1:DEPTH-1], 1'b0};
is_hwlp_n = {is_hwlp_int[1], 1'b0};
end
end
end
end
end
@ -183,27 +259,27 @@ module riscv_fetch_fifo
begin
if(rst_n == 1'b0)
begin
addr_Q <= '{default: '0};
addr_valid_Q <= '0;
rdata_Q <= '{default: '0};
rdata_valid_Q <= '0;
is_unaligned_Q <= 1'b0;
addr_Q <= '{default: '0};
addr_valid_Q <= '0;
rdata_Q <= '{default: '0};
rdata_valid_Q <= '0;
is_hwlp_Q <= '0;
end
else
begin
// on a clear signal from outside we invalidate the content of the FIFO
// completely and start from an empty state
if (clear_i) begin
addr_Q[0] <= in_addr_i;
addr_valid_Q <= {in_addr_valid_i, {DEPTH-1{1'b0}}};
rdata_valid_Q <= '0;
is_unaligned_Q <= 1'b0;
if (branch_i) begin
addr_Q[0] <= in_addr_i;
addr_valid_Q <= {in_addr_valid_i, {DEPTH-1{1'b0}}};
rdata_valid_Q <= '0;
is_hwlp_Q <= '0;
end else begin
addr_Q <= addr_n;
addr_valid_Q <= addr_valid_n;
rdata_Q <= rdata_n;
rdata_valid_Q <= rdata_valid_n;
is_unaligned_Q <= is_unaligned_n;
addr_Q <= addr_n;
addr_valid_Q <= addr_valid_n;
rdata_Q <= rdata_n;
rdata_valid_Q <= rdata_valid_n;
is_hwlp_Q <= is_hwlp_n;
end
end
end
@ -217,15 +293,19 @@ module riscv_prefetch_buffer
input logic clk,
input logic rst_n,
input logic unaligned_i,
input logic req_i,
input logic branch_i,
input logic ready_i,
input logic [31:0] addr_i,
input logic hwloop_i,
input logic [31:0] hwloop_target_i,
input logic ready_i,
output logic valid_o,
output logic [31:0] rdata_o,
output logic [31:0] addr_o,
output logic is_hwlp_o, // is set when the currently served data is from a hwloop
// goes to instruction memory / instruction cache
output logic instr_req_o,
@ -249,11 +329,6 @@ module riscv_prefetch_buffer
logic fifo_rdata_valid;
logic fifo_rdata_ready;
logic fifo_is_unaligned;
logic [31:0] rdata, unaligned_rdata;
logic valid, unaligned_valid;
//////////////////////////////////////////////////////////////////////////////
// prefetch buffer status
@ -265,7 +340,17 @@ module riscv_prefetch_buffer
// address selection and increase
//////////////////////////////////////////////////////////////////////////////
assign addr_next = (branch_i) ? addr_i : (fifo_last_addr + 32'd4);
// Next address handed to the FIFO, in priority order:
//   1. branch target
//   2. hardware loop target
//   3. fifo_last_addr rounded down to a word boundary, plus 4
always_comb
begin
  if (branch_i)
    addr_next = addr_i;
  else if (hwloop_i)
    addr_next = hwloop_target_i;
  else
    addr_next = {fifo_last_addr[31:2], 2'b00} + 32'd4;
end
//////////////////////////////////////////////////////////////////////////////
@ -278,9 +363,8 @@ module riscv_prefetch_buffer
.clk ( clk ),
.rst_n ( rst_n ),
.clear_i ( branch_i ),
.unaligned_i ( unaligned_i ),
.branch_i ( branch_i ),
.hwloop_i ( hwloop_i ),
.in_addr_valid_i ( fifo_addr_valid ),
.in_addr_ready_o ( fifo_addr_ready ),
@ -291,24 +375,13 @@ module riscv_prefetch_buffer
.in_rdata_ready_o ( fifo_rdata_ready ),
.in_rdata_i ( instr_rdata_i ),
.out_valid_o ( valid ),
.out_valid_o ( valid_o ),
.out_ready_i ( ready_i ),
.out_rdata_o ( rdata ),
.out_rdata_o ( rdata_o ),
.out_addr_o ( addr_o ),
.out_unaligned_valid_o ( unaligned_valid ),
.out_unaligned_rdata_o ( unaligned_rdata ),
.out_is_unaligned_o ( fifo_is_unaligned )
.out_is_hwlp_o ( is_hwlp_o )
);
//////////////////////////////////////////////////////////////////////////////
// instruction aligner (if unaligned)
//////////////////////////////////////////////////////////////////////////////
assign rdata_o = (unaligned_i && (~fifo_is_unaligned)) ? unaligned_rdata : rdata;
assign valid_o = (unaligned_i && (~fifo_is_unaligned)) ? unaligned_valid : valid;
//////////////////////////////////////////////////////////////////////////////
// instruction fetch FSM

View file

@ -83,23 +83,27 @@ module riscv_core
input logic [N_EXT_PERF_COUNTERS-1:0] ext_perf_counters_i
);
localparam N_HWLP = 2;
// IF/ID signals
logic instr_valid_id;
logic [31:0] instr_rdata_id; // Instruction sampled inside IF stage
logic is_compressed_id;
logic illegal_c_insn_id; // Illegal compressed instruction sent to ID stage
logic [31:0] current_pc_if; // Current Program counter
logic [31:0] current_pc_id; // Current Program counter
logic is_hwlp_id;
logic [N_HWLP-1:0] hwlp_dec_cnt_id;
logic instr_valid_id;
logic [31:0] instr_rdata_id; // Instruction sampled inside IF stage
logic is_compressed_id;
logic illegal_c_insn_id; // Illegal compressed instruction sent to ID stage
logic [31:0] current_pc_if; // Current Program counter
logic [31:0] current_pc_id; // Current Program counter
logic clear_instr_valid;
logic pc_set;
logic [2:0] pc_mux_id; // Mux selector for next PC
logic [1:0] exc_pc_mux_id; // Mux selector for exception PC
logic [4:0] exc_vec_pc_mux_id; // Mux selector for vectorized IR lines
logic clear_instr_valid;
logic pc_set;
logic [2:0] pc_mux_id; // Mux selector for next PC
logic [1:0] exc_pc_mux_id; // Mux selector for exception PC
logic [4:0] exc_vec_pc_mux_id; // Mux selector for vectorized IR lines
logic lsu_load_err;
logic lsu_store_err;
logic lsu_load_err;
logic lsu_store_err;
// ID performance counter signals
logic is_decoding;
@ -191,7 +195,9 @@ module riscv_core
// Hardware loop controller signals
logic [31:0] hwloop_target; // from hwloop controller to if stage
logic [N_HWLP-1:0] [31:0] hwlp_start;
logic [N_HWLP-1:0] [31:0] hwlp_end;
logic [N_HWLP-1:0] [31:0] hwlp_cnt;
// Debug Unit
@ -233,6 +239,7 @@ module riscv_core
//////////////////////////////////////////////////
riscv_if_stage
#(
.N_HWLP ( N_HWLP ),
.RDATA_WIDTH ( INSTR_RDATA_WIDTH )
)
if_stage_i
@ -254,6 +261,8 @@ module riscv_core
.instr_rdata_i ( instr_rdata_i ),
// outputs to ID stage
.hwlp_dec_cnt_id_o ( hwlp_dec_cnt_id ),
.is_hwlp_id_o ( is_hwlp_id ),
.instr_valid_id_o ( instr_valid_id ),
.instr_rdata_id_o ( instr_rdata_id ),
.is_compressed_id_o ( is_compressed_id ),
@ -270,7 +279,9 @@ module riscv_core
.exc_vec_pc_mux_i ( exc_vec_pc_mux_id ),
// from hwloop controller
.hwloop_target_i ( hwloop_target ), // pc from hwloop start address
.hwlp_start_i ( hwlp_start ),
.hwlp_end_i ( hwlp_end ),
.hwlp_cnt_i ( hwlp_cnt ),
// from debug unit
.dbg_npc_i ( dbg_npc ),
@ -299,7 +310,11 @@ module riscv_core
// |___|____/ |____/ |_/_/ \_\____|_____| //
// //
/////////////////////////////////////////////////
riscv_id_stage id_stage_i
riscv_id_stage
#(
.N_HWLP ( N_HWLP )
)
id_stage_i
(
.clk ( clk ),
.rst_n ( rst_n ),
@ -312,6 +327,8 @@ module riscv_core
.is_decoding_o ( is_decoding ),
// Interface to instruction memory
.hwlp_dec_cnt_i ( hwlp_dec_cnt_id ),
.is_hwlp_i ( is_hwlp_id ),
.instr_valid_i ( instr_valid_id ),
.instr_rdata_i ( instr_rdata_id ),
.instr_req_o ( instr_req_int ),
@ -372,8 +389,10 @@ module riscv_core
.csr_access_ex_o ( csr_access_ex ),
.csr_op_ex_o ( csr_op_ex ),
// hwloop signals
.hwloop_targ_addr_o ( hwloop_target ),
// hardware loop signals to IF hwlp controller
.hwlp_start_o ( hwlp_start ),
.hwlp_end_o ( hwlp_end ),
.hwlp_cnt_o ( hwlp_cnt ),
// LSU
.data_req_ex_o ( data_req_ex ), // to load store unit
@ -773,6 +792,10 @@ module riscv_core
`INSTR_SRA: printRInstr("SRA");
`INSTR_OR: printRInstr("OR");
`INSTR_AND: printRInstr("AND");
`INSTR_EXTHS: printRInstr("EXTHS");
`INSTR_EXTHZ: printRInstr("EXTHZ");
`INSTR_EXTBS: printRInstr("EXTBS");
`INSTR_EXTBZ: printRInstr("EXTBZ");
// FENCE
`INSTR_FENCE: printMnemonic("FENCE");
`INSTR_FENCEI: printMnemonic("FENCEI");
@ -986,6 +1009,7 @@ module riscv_core
3'b010: mnemonic = "LCOUNT";
3'b011: mnemonic = "LCOUNTI";
3'b100: mnemonic = "LSETUP";
3'b101: mnemonic = "LSETUPI";
3'b111: begin
printMnemonic("INVALID");
return;
@ -994,18 +1018,21 @@ module riscv_core
riscv_core.mnemonic = mnemonic;
// decode and print instruction
imm = id_stage_i.imm_i_type;
imm = id_stage_i.imm_iz_type;
case (instr[14:12])
// lp.starti and lp.endi
3'b000,
3'b001: $fdisplay(f, "%7s\tx%0d, 0x%h (-> 0x%h)", mnemonic, rd, imm, pc+imm);
3'b001: $fdisplay(f, "%7s\tx%0d, 0x%h (-> 0x%h)", mnemonic, rd, imm, pc+(imm<<1));
// lp.count
3'b010: $fdisplay(f, "%7s\tx%0d, x%0d (0x%h)", mnemonic, rd, rs1, rs1_value);
// lp.counti
3'b011: $fdisplay(f, "%7s\tx%0d, 0x%h", mnemonic, rd, imm);
// lp.setup
3'b100: $fdisplay(f, "%7s\tx%0d, x%0d (0x%h), 0x%h (-> 0x%h)", mnemonic,
rd, rs1, rs1_value, imm, pc+imm);
rd, rs1, rs1_value, imm, pc+(imm<<1));
// lp.setupi
3'b101: $fdisplay(f, "%7s\tx%0d, x%0d (0x%h), 0x%h (-> 0x%h)", mnemonic,
rd, rs1, rs1_value, imm, pc+(id_stage_i.imm_z_type << 1));
endcase
end
endfunction