frontend: Clean-up instruction frontend

The instruction frontend has become an increasingly messy part and needed
cleaning up. The current solution contains 2 x 32 bit instruction data
FIFOs and 1 x 64 bit address FIFO. Hence, it should be significantly
more area efficient than the previous one. The interface to `id_stage`
is a ready/valid handshake. The credit-based system has been replaced by
a replay mechanism, as the credit-based system was very brittle and overly
pessimistic.

Branch prediction has been cleaned up: the front-end was also partially
predicting on jumps, which could potentially have led to performance bugs
if the branch detection wasn't correct in the frontend.
This commit is contained in:
Florian Zaruba 2019-04-20 18:53:16 +02:00
parent 90b76d3e4f
commit 830540b757
14 changed files with 1297 additions and 896 deletions

View file

@ -146,6 +146,7 @@ src := $(filter-out src/ariane_regfile.sv, $(wildcard src/*.sv)) \
src/axi/src/axi_delayer.sv \
src/axi/src/axi_to_axi_lite.sv \
src/fpga-support/rtl/SyncSpRamBeNx64.sv \
src/common_cells/src/unread.sv \
src/common_cells/src/sync.sv \
src/common_cells/src/cdc_2phase.sv \
src/common_cells/src/spill_register.sv \
@ -157,6 +158,7 @@ src := $(filter-out src/ariane_regfile.sv, $(wildcard src/*.sv)) \
src/common_cells/src/deprecated/fifo_v2.sv \
src/common_cells/src/fifo_v3.sv \
src/common_cells/src/lzc.sv \
src/common_cells/src/popcount.sv \
src/common_cells/src/rr_arb_tree.sv \
src/common_cells/src/deprecated/rrarbiter.sv \
src/common_cells/src/stream_delay.sv \
@ -361,7 +363,6 @@ verilate_command := $(verilator)
-Wno-UNOPTFLAT \
-Wno-style \
$(if $(PROFILE),--stats --stats-vars --profile-cfuncs,) \
-Wno-lint \
$(if $(DEBUG),--trace --trace-structs,) \
-LDFLAGS "-L$(RISCV)/lib -Wl,-rpath,$(RISCV)/lib -lfesvr$(if $(PROFILE), -g -pg,)" \
-CFLAGS "$(CFLAGS)$(if $(PROFILE), -g -pg,)" -Wall --cc --vpi \

View file

@ -34,6 +34,9 @@ package ariane_pkg;
localparam NrMaxRules = 16;
typedef struct packed {
int RASDepth;
int BTBEntries;
int BHTEntries;
// PMAs
int NrNonIdempotentRules; // Number of non idempotent rules
logic [NrMaxRules-1:0][63:0] NonIdempotentAddrBase; // base which needs to match
@ -52,6 +55,9 @@ package ariane_pkg;
} ariane_cfg_t;
localparam ariane_cfg_t ArianeDefaultConfig = '{
RASDepth: 2,
BTBEntries: 32,
BHTEntries: 128,
// idempotent region
NrNonIdempotentRules: 2,
NonIdempotentAddrBase: {64'b0, 64'b0},
@ -75,6 +81,9 @@ package ariane_pkg;
function automatic void check_cfg (ariane_cfg_t Cfg);
// pragma translate_off
`ifndef VERILATOR
assert(Cfg.RASDepth > 0);
assert(2**$clog2(Cfg.BTBEntries) == Cfg.BTBEntries);
assert(2**$clog2(Cfg.BHTEntries) == Cfg.BHTEntries);
assert(Cfg.NrNonIdempotentRules <= NrMaxRules);
assert(Cfg.NrExecuteRegionRules <= NrMaxRules);
assert(Cfg.NrCachedRegionRules <= NrMaxRules);
@ -131,9 +140,6 @@ package ariane_pkg;
localparam TRANS_ID_BITS = $clog2(NR_SB_ENTRIES); // depending on the number of scoreboard entries we need that many bits
// to uniquely identify the entry in the scoreboard
localparam ASID_WIDTH = 1;
localparam BTB_ENTRIES = 64;
localparam BHT_ENTRIES = 128;
localparam RAS_DEPTH = 2;
localparam BITS_SATURATION_COUNTER = 2;
localparam NR_COMMIT_PORTS = 2;
@ -142,8 +148,8 @@ package ariane_pkg;
localparam ISSUE_WIDTH = 1;
// amount of pipeline registers inserted for load/store return path
// this can be tuned to trade-off IPC vs. cycle time
localparam NR_LOAD_PIPE_REGS = 1;
localparam NR_STORE_PIPE_REGS = 0;
localparam int unsigned NR_LOAD_PIPE_REGS = 1;
localparam int unsigned NR_STORE_PIPE_REGS = 0;
// depth of store-buffers, this needs to be a power of two
localparam int unsigned DEPTH_SPEC = 4;
@ -281,7 +287,7 @@ package ariane_pkg;
// ---------------
// leave as is (fails with >8 entries and wider fetch width)
localparam int unsigned FETCH_FIFO_DEPTH = 8;
localparam int unsigned FETCH_FIFO_DEPTH = 4;
localparam int unsigned FETCH_WIDTH = 32;
// maximum instructions we can fetch on one request (we support compressed instructions)
localparam int unsigned INSTR_PER_FETCH = FETCH_WIDTH / 16;
@ -295,18 +301,24 @@ package ariane_pkg;
logic valid;
} exception_t;
typedef enum logic [1:0] { BHT, BTB, RAS } cf_t;
typedef enum logic [2:0] {
NoCF, // No control flow prediction
Branch, // Branch
Jump, // Jump to address from immediate
JumpR, // Jump to address from registers
Return // Return Address Prediction
} cf_t;
// branch-predict
// this is the struct we get back from ex stage and we will use it to update
// all the necessary data structures
// bp_resolve_t
typedef struct packed {
logic valid; // prediction with all its values is valid
logic [63:0] pc; // pc of predict or mis-predict
logic [63:0] pc; // PC of predict or mis-predict
logic [63:0] target_address; // target address at which to jump, or not
logic is_mispredict; // set if this was a mis-predict
logic is_taken; // branch is taken
// in the lower 16 bit of the word
cf_t cf_type; // Type of control flow change
} bp_resolve_t;
@ -314,11 +326,8 @@ package ariane_pkg;
// this is the struct which we will inject into the pipeline to guide the various
// units towards the correct branch decision and resolve
typedef struct packed {
logic valid; // this is a valid hint
cf_t cf; // type of control flow prediction
logic [63:0] predict_address; // target address at which to jump, or not
logic predict_taken; // branch is taken
// in the lower 16 bit of the word
cf_t cf_type; // Type of control flow change
} branchpredict_sbe_t;
typedef struct packed {
@ -340,14 +349,12 @@ package ariane_pkg;
typedef struct packed {
logic valid;
logic [63:0] pc; // update at PC
logic mispredict;
logic taken;
} bht_update_t;
typedef struct packed {
logic valid;
logic taken;
logic strongly_taken;
} bht_prediction_t;
typedef enum logic[3:0] {
@ -444,7 +451,7 @@ package ariane_pkg;
// comparisons
LTS, LTU, GES, GEU, EQ, NE,
// jumps
JALR,
JALR, BRANCH,
// set lower than operations
SLTS, SLTU,
// CSR functions
@ -482,6 +489,13 @@ package ariane_pkg;
logic [TRANS_ID_BITS-1:0] trans_id;
} fu_data_t;
// Returns 1'b1 when `op` is one of the comparison operators
// EQ, NE, LTS, GES, LTU or GEU, and 1'b0 for every other fu_op value.
// NOTE(review): the BRANCH enumerator listed next to JALR in fu_op is not
// matched here and therefore yields 1'b0 - confirm this is intended.
function automatic logic is_branch (input fu_op op);
unique case (op) inside
EQ, NE, LTS, GES, LTU, GEU: return 1'b1;
default : return 1'b0; // all other ops
endcase
endfunction;
// -------------------------------
// Extract Src/Dst FP Reg from Op
// -------------------------------
@ -570,14 +584,6 @@ package ariane_pkg;
// ---------------
// IF/ID Stage
// ---------------
typedef struct packed {
logic [63:0] address; // the address of the instructions from below
logic [FETCH_WIDTH-1:0] instruction; // instruction word
branchpredict_sbe_t branch_predict; // this field contains branch prediction information regarding the forward branch path
logic [INSTR_PER_FETCH-1:0] bp_taken; // at which instruction is this branch taken?
logic page_fault; // an instruction page fault happened
} frontend_fetch_t;
// store the decompressed instruction
typedef struct packed {
logic [63:0] address; // the address of the instructions from below

View file

@ -1,4 +1,4 @@
// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright 2017-2019 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
@ -59,9 +59,9 @@ module ariane #(
// --------------
// IF <-> ID
// --------------
frontend_fetch_t fetch_entry_if_id;
fetch_entry_t fetch_entry_if_id;
logic fetch_valid_if_id;
logic decode_ack_id_if;
logic fetch_ready_id_if;
// --------------
// ID <-> ISSUE
@ -220,7 +220,7 @@ module ariane #(
// Frontend
// --------------
frontend #(
.DmBaseAddress ( ArianeCfg.DmBaseAddress )
.ArianeCfg ( ArianeCfg )
) i_frontend (
.flush_i ( flush_ctrl_if ), // not entirely correct
.flush_bp_i ( 1'b0 ),
@ -238,7 +238,7 @@ module ariane #(
.ex_valid_i ( ex_commit.valid ),
.fetch_entry_o ( fetch_entry_if_id ),
.fetch_entry_valid_o ( fetch_valid_if_id ),
.fetch_ack_i ( decode_ack_id_if ),
.fetch_entry_ready_i ( fetch_ready_id_if ),
.*
);
@ -246,11 +246,14 @@ module ariane #(
// ID
// ---------
id_stage id_stage_i (
.debug_req_i,
.clk_i,
.rst_ni,
.flush_i ( flush_ctrl_if ),
.debug_req_i,
.fetch_entry_i ( fetch_entry_if_id ),
.fetch_entry_valid_i ( fetch_valid_if_id ),
.decoded_instr_ack_o ( decode_ack_id_if ),
.fetch_entry_ready_o ( fetch_ready_id_if ),
.issue_entry_o ( issue_entry_id_issue ),
.issue_entry_valid_o ( issue_entry_valid_id_issue ),
@ -260,13 +263,12 @@ module ariane #(
.priv_lvl_i ( priv_lvl ),
.fs_i ( fs ),
.frm_i ( frm_csr_id_issue_ex ),
.irq_i ( irq_i ),
.irq_ctrl_i ( irq_ctrl_csr_id ),
.debug_mode_i ( debug_mode ),
.tvm_i ( tvm_csr_id ),
.tw_i ( tw_csr_id ),
.tsr_i ( tsr_csr_id ),
.irq_i ( irq_i ),
.irq_ctrl_i ( irq_ctrl_csr_id ),
.*
.tsr_i ( tsr_csr_id )
);
// ---------
@ -334,6 +336,7 @@ module ariane #(
) ex_stage_i (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.debug_mode_i ( debug_mode ),
.flush_i ( flush_ctrl_ex ),
.fu_data_i ( fu_data_id_ex ),
.pc_i ( pc_id_ex ),
@ -708,9 +711,9 @@ module ariane #(
assign tracer_if.flush_unissued = flush_unissued_instr_ctrl_id;
assign tracer_if.flush = flush_ctrl_ex;
// fetch
assign tracer_if.instruction = id_stage_i.instr_realigner_i.fetch_entry_o.instruction;
assign tracer_if.fetch_valid = id_stage_i.instr_realigner_i.fetch_entry_valid_o;
assign tracer_if.fetch_ack = id_stage_i.instr_realigner_i.fetch_ack_i;
assign tracer_if.instruction = id_stage_i.fetch_entry_i.instruction;
assign tracer_if.fetch_valid = id_stage_i.fetch_entry_valid_i;
assign tracer_if.fetch_ack = id_stage_i.fetch_entry_ready_o;
// Issue
assign tracer_if.issue_ack = issue_stage_i.i_scoreboard.issue_ack_i;
assign tracer_if.issue_sbe = issue_stage_i.i_scoreboard.issue_instr_o;

View file

@ -12,10 +12,11 @@
// Date: 09.05.2017
// Description: Branch target calculation and comparison
import ariane_pkg::*;
module branch_unit (
input fu_data_t fu_data_i,
input logic clk_i,
input logic rst_ni,
input logic debug_mode_i,
input ariane_pkg::fu_data_t fu_data_i,
input logic [63:0] pc_i, // PC of instruction
input logic is_compressed_instr_i,
input logic fu_valid_i, // any functional unit is valid, check that there is no accidental mis-predict
@ -23,83 +24,62 @@ module branch_unit (
input logic branch_comp_res_i, // branch comparison result from ALU
output logic [63:0] branch_result_o,
input branchpredict_sbe_t branch_predict_i, // this is the address we predicted
output bp_resolve_t resolved_branch_o, // this is the actual address we are targeting
input ariane_pkg::branchpredict_sbe_t branch_predict_i, // this is the address we predicted
output ariane_pkg::bp_resolve_t resolved_branch_o, // this is the actual address we are targeting
output logic resolve_branch_o, // to ID to clear that we resolved the branch and we can
// accept new entries to the scoreboard
output exception_t branch_exception_o // branch exception out
output ariane_pkg::exception_t branch_exception_o // branch exception out
);
logic [63:0] target_address;
logic [63:0] next_pc;
// here we handle the various possibilities of mis-predicts
// here we handle the various possibilities of mis-predicts
always_comb begin : mispredict_handler
// set the jump base, for JALR we need to look at the register, for all other control flow instructions we can take the current PC
automatic logic [63:0] jump_base;
jump_base = (fu_data_i.operator == JALR) ? fu_data_i.operand_a : pc_i;
// TODO(zarubaf): The ALU can be used to calculate the branch target
jump_base = (fu_data_i.operator == ariane_pkg::JALR) ? fu_data_i.operand_a : pc_i;
target_address = 64'b0;
resolve_branch_o = 1'b0;
resolved_branch_o.target_address = 64'b0;
resolved_branch_o.is_taken = 1'b0;
resolved_branch_o.valid = branch_valid_i;
resolved_branch_o.is_mispredict = 1'b0;
resolved_branch_o.cf_type = branch_predict_i.cf_type;
resolved_branch_o.cf_type = branch_predict_i.cf;
// calculate next PC, depending on whether the instruction is compressed or not this may be different
// TODO(zarubaf): We already calculate this a couple of times, maybe re-use?
next_pc = pc_i + ((is_compressed_instr_i) ? 64'h2 : 64'h4);
// calculate target address simple 64 bit addition
target_address = $unsigned($signed(jump_base) + $signed(fu_data_i.imm));
// on a JALR we are supposed to reset the LSB to 0 (according to the specification)
if (fu_data_i.operator == JALR)
target_address[0] = 1'b0;
// if we need to put the branch target address in a destination register, output it here to WB
if (fu_data_i.operator == ariane_pkg::JALR) target_address[0] = 1'b0;
// we need to put the branch target address into rd, this is the result of this unit
branch_result_o = next_pc;
// save PC - we need this to get the target row in the branch target buffer
// we play this trick with the branch instruction which wraps a word boundary:
// /---------- Place the prediction on this PC
// \/
// ____________________________________________________
// |branch [15:0] | branch[31:16] | compressed 1[15:0] |
// |____________________________________________________
// This will relief the pre-fetcher to re-fetch partially fetched unaligned branch instructions e.g.:
// we don't have a back arch between the pre-fetcher and decoder/instruction FIFO.
resolved_branch_o.pc = (is_compressed_instr_i || pc_i[1] == 1'b0) ? pc_i : ({pc_i[63:2], 2'b0} + 64'h4);
resolved_branch_o.pc = pc_i;
// There are only two sources of mispredicts:
// 1. Branches
// 2. Jumps to register addresses
if (branch_valid_i) begin
// write target address which goes to pc gen
// write target address which goes to PC Gen
resolved_branch_o.target_address = (branch_comp_res_i) ? target_address : next_pc;
resolved_branch_o.is_taken = branch_comp_res_i;
// we've detected a branch in ID with the following parameters
// we mis-predicted e.g.: the predicted address is unequal to the actual address
if (target_address[0] == 1'b0) begin
// we've got a valid branch prediction
if (branch_predict_i.valid) begin
// if the outcome doesn't match we've got a mis-predict
if (branch_predict_i.predict_taken != branch_comp_res_i) begin
resolved_branch_o.is_mispredict = 1'b1;
end
// check if the address of the predict taken branch is correct
if (branch_predict_i.predict_taken && target_address != branch_predict_i.predict_address) begin
resolved_branch_o.is_mispredict = 1'b1;
end
// branch-prediction didn't do anything (e.g.: it fetched PC + 2/4), so if this branch is taken
// we also have a mis-predict
end else begin
if (branch_comp_res_i) begin
resolved_branch_o.is_mispredict = 1'b1;
end
end
resolved_branch_o.is_taken = branch_comp_res_i;
// check the outcome of the branch speculation
if (ariane_pkg::is_branch(fu_data_i.operator) && branch_comp_res_i != (branch_predict_i.cf == ariane_pkg::Branch)) begin
// we mis-predicted the outcome
// if the outcome doesn't match we've got a mis-predict
resolved_branch_o.is_mispredict = 1'b1;
resolved_branch_o.cf_type = ariane_pkg::Branch;
end
if (fu_data_i.operator == ariane_pkg::JALR
// check if the address of the jump register is correct and that we actually predicted
&& (branch_predict_i.cf == ariane_pkg::NoCF || target_address != branch_predict_i.predict_address)) begin
resolved_branch_o.is_mispredict = 1'b1;
// update BTB only if this wasn't a return
if (branch_predict_i.cf != ariane_pkg::Return) resolved_branch_o.cf_type = ariane_pkg::JumpR;
end
// to resolve the branch in ID
resolve_branch_o = 1'b1;
// the other case would be that this instruction was no branch but branch prediction thought that it was one
// this is essentially also a mis-predict
end else if (fu_valid_i && branch_predict_i.valid && branch_predict_i.predict_taken) begin
// re-set the branch to the next PC
resolved_branch_o.is_mispredict = 1'b1;
resolved_branch_o.target_address = next_pc;
resolved_branch_o.valid = 1'b1;
resolve_branch_o = 1'b1;
end
end
// use ALU exception signal for storing instruction fetch exceptions if
@ -109,7 +89,6 @@ module branch_unit (
branch_exception_o.valid = 1'b0;
branch_exception_o.tval = pc_i;
// only throw exception if this is indeed a branch
if (branch_valid_i && target_address[0] != 1'b0)
branch_exception_o.valid = 1'b1;
if (branch_valid_i && target_address[0] != 1'b0) branch_exception_o.valid = 1'b1;
end
endmodule

View file

@ -21,6 +21,7 @@ module ex_stage #(
input logic clk_i, // Clock
input logic rst_ni, // Asynchronous reset active low
input logic flush_i,
input logic debug_mode_i,
input fu_data_t fu_data_i,
input logic [63:0] pc_i, // PC of current instruction
@ -143,6 +144,9 @@ module ex_stage #(
// we don't silence the branch unit as this is already critical and we do
// not want to add another layer of logic
branch_unit branch_unit_i (
.clk_i,
.rst_ni,
.debug_mode_i,
.fu_data_i,
.pc_i,
.is_compressed_instr_i,

View file

@ -1,4 +1,4 @@
//Copyright (C) 2018 to present,
// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 2.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
@ -6,7 +6,8 @@
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.//
// specific language governing permissions and limitations under the License.
//
// Author: Florian Zaruba, ETH Zurich
// Date: 08.02.2018
// Migrated: Luis Vitorio Cargnini, IEEE
@ -20,65 +21,81 @@ module bht #(
input logic rst_ni,
input logic flush_i,
input logic debug_mode_i,
input logic [63:0] vpc_i,
input ariane_pkg::bht_update_t bht_update_i,
output ariane_pkg::bht_prediction_t bht_prediction_o
// we potentially need INSTR_PER_FETCH predictions/cycle
output ariane_pkg::bht_prediction_t [ariane_pkg::INSTR_PER_FETCH-1:0] bht_prediction_o
);
localparam OFFSET = 2; // we are using compressed instructions so do not use the lower 2 bits for prediction
localparam ANTIALIAS_BITS = 8;
// the last bit is always zero, we don't need it for indexing
localparam OFFSET = 1;
// re-shape the branch history table
localparam NR_ROWS = NR_ENTRIES / ariane_pkg::INSTR_PER_FETCH;
// number of bits needed to index the row
localparam ROW_ADDR_BITS = $clog2(ariane_pkg::INSTR_PER_FETCH);
// number of bits we should use for prediction
localparam PREDICTION_BITS = $clog2(NR_ENTRIES) + OFFSET;
localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS;
// we are not interested in all bits of the address
unread i_unread (.d_i(|vpc_i));
struct packed {
logic valid;
logic [1:0] saturation_counter;
} bht_d[NR_ENTRIES-1:0], bht_q[NR_ENTRIES-1:0];
} bht_d[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0], bht_q[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0];
logic [$clog2(NR_ENTRIES)-1:0] index, update_pc;
logic [1:0] saturation_counter;
logic [$clog2(NR_ROWS)-1:0] index, update_pc;
logic [ROW_ADDR_BITS-1:0] update_row_index;
logic [1:0] saturation_counter;
assign index = vpc_i[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
assign update_pc = bht_update_i.pc[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
assign update_row_index = bht_update_i.pc[ROW_ADDR_BITS + OFFSET - 1:OFFSET];
assign index = vpc_i[PREDICTION_BITS - 1:OFFSET];
assign update_pc = bht_update_i.pc[PREDICTION_BITS - 1:OFFSET];
// prediction assignment
assign bht_prediction_o.valid = bht_q[index].valid;
assign bht_prediction_o.taken = bht_q[index].saturation_counter == 2'b10;
assign bht_prediction_o.strongly_taken = (bht_q[index].saturation_counter == 2'b11);
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_bht_output
assign bht_prediction_o[i].valid = bht_q[index][i].valid;
assign bht_prediction_o[i].taken = bht_q[index][i].saturation_counter[1] == 1'b1;
end
always_comb begin : update_bht
bht_d = bht_q;
saturation_counter = bht_q[update_pc].saturation_counter;
saturation_counter = bht_q[update_pc][update_row_index].saturation_counter;
if (bht_update_i.valid && !debug_mode_i) begin
bht_d[update_pc].valid = 1'b1;
bht_d[update_pc][update_row_index].valid = 1'b1;
if (saturation_counter == 2'b11) begin
// we can safely decrease it
if (~bht_update_i.taken)
bht_d[update_pc].saturation_counter = saturation_counter - 1;
if (!bht_update_i.taken)
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
// then check if it saturated in the negative regime e.g.: branch not taken
end else if (saturation_counter == 2'b00) begin
// we can safely increase it
if (bht_update_i.taken)
bht_d[update_pc].saturation_counter = saturation_counter + 1;
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
end else begin // otherwise we are not in any boundaries and can decrease or increase it
if (bht_update_i.taken)
bht_d[update_pc].saturation_counter = saturation_counter + 1;
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
else
bht_d[update_pc].saturation_counter = saturation_counter - 1;
bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
end
end
end
always_ff @(posedge clk_i or negedge rst_ni) begin
if (~rst_ni) begin
for (int unsigned i = 0; i < NR_ENTRIES; i++)
bht_q[i] <= '0;
if (!rst_ni) begin
for (int unsigned i = 0; i < NR_ENTRIES; i++) begin
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
bht_q[i][j] <= '0;
end
end
end else begin
// evict all entries
if (flush_i) begin
for (int i = 0; i < NR_ENTRIES; i++) begin
bht_q[i].valid <= 1'b0;
bht_q[i].saturation_counter <= 2'b10;
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
bht_q[i][j].valid <= 1'b0;
bht_q[i][j].saturation_counter <= 2'b10;
end
end
end else begin
bht_q <= bht_d;

View file

@ -1,4 +1,4 @@
//Copyright (C) 2018 to present,
// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 2.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
@ -13,10 +13,6 @@
// Migrated: Luis Vitorio Cargnini, IEEE
// Date: 09.06.2018
// ------------------------------
// Branch Prediction
// ------------------------------
// branch target buffer
module btb #(
parameter int NR_ENTRIES = 8
@ -28,23 +24,36 @@ module btb #(
input logic [63:0] vpc_i, // virtual PC from IF stage
input ariane_pkg::btb_update_t btb_update_i, // update btb with this information
output ariane_pkg::btb_prediction_t btb_prediction_o // prediction from btb
output ariane_pkg::btb_prediction_t [ariane_pkg::INSTR_PER_FETCH-1:0] btb_prediction_o // prediction from btb
);
// number of bits which are not used for indexing
localparam OFFSET = 1; // we are using compressed instructions so do use the lower 2 bits for prediction
localparam ANTIALIAS_BITS = 8;
// the last bit is always zero, we don't need it for indexing
localparam OFFSET = 1;
// re-shape the branch target buffer
localparam NR_ROWS = NR_ENTRIES / ariane_pkg::INSTR_PER_FETCH;
// number of bits needed to index the row
localparam ROW_ADDR_BITS = $clog2(ariane_pkg::INSTR_PER_FETCH);
// number of bits we should use for prediction
localparam PREDICTION_BITS = $clog2(NR_ENTRIES) + OFFSET;
localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS;
// prevent aliasing to degrade performance
localparam ANTIALIAS_BITS = 8;
// we are not interested in all bits of the address
unread i_unread (.d_i(|vpc_i));
// typedef for all branch target entries
// we may want to try to put a tag field that fills the rest of the PC in-order to mitigate aliasing effects
ariane_pkg::btb_prediction_t btb_d [NR_ENTRIES-1:0], btb_q [NR_ENTRIES-1:0];
logic [$clog2(NR_ENTRIES)-1:0] index, update_pc;
ariane_pkg::btb_prediction_t btb_d [NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0],
btb_q [NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0];
logic [$clog2(NR_ROWS)-1:0] index, update_pc;
logic [ROW_ADDR_BITS-1:0] update_row_index;
assign index = vpc_i[PREDICTION_BITS - 1:OFFSET];
assign update_pc = btb_update_i.pc[PREDICTION_BITS - 1:OFFSET];
assign index = vpc_i[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
assign update_pc = btb_update_i.pc[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
assign update_row_index = btb_update_i.pc[ROW_ADDR_BITS + OFFSET - 1:OFFSET];
// output matching prediction
assign btb_prediction_o = btb_q[index];
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_btb_output
assign btb_prediction_o[i] = btb_q[index][i]; // workaround
end
// -------------------------
// Update Branch Prediction
@ -54,23 +63,25 @@ module btb #(
btb_d = btb_q;
if (btb_update_i.valid && !debug_mode_i) begin
btb_d[update_pc].valid = 1'b1;
btb_d[update_pc][update_row_index].valid = 1'b1;
// the target address is simply updated
btb_d[update_pc].target_address = btb_update_i.target_address;
btb_d[update_pc][update_row_index].target_address = btb_update_i.target_address;
end
end
// sequential process
always_ff @(posedge clk_i or negedge rst_ni) begin
if (~rst_ni) begin
if (!rst_ni) begin
// Bias the branches to be taken upon first arrival
for (int i = 0; i < NR_ENTRIES; i++)
for (int i = 0; i < NR_ROWS; i++)
btb_q[i] <= '{default: 0};
end else begin
// evict all entries
if (flush_i) begin
for (int i = 0; i < NR_ENTRIES; i++) begin
btb_q[i].valid <= 1'b0;
for (int i = 0; i < NR_ROWS; i++) begin
for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
btb_q[i][j].valid <= 1'b0;
end
end
end else begin
btb_q <= btb_d;

View file

@ -11,61 +11,66 @@
// Author: Florian Zaruba, ETH Zurich
// Date: 08.02.2018
// Description: Ariane Instruction Fetch Frontend
//
// This module interfaces with the instruction cache, handles control
// change request from the back-end and does branch prediction.
import ariane_pkg::*;
module frontend #(
parameter logic [63:0] DmBaseAddress = 64'h0 // debug module base address
parameter ariane_pkg::ariane_cfg_t ArianeCfg = ariane_pkg::ArianeDefaultConfig
) (
input logic clk_i, // Clock
input logic rst_ni, // Asynchronous reset active low
input logic flush_i, // flush request for PCGEN
input logic flush_bp_i, // flush branch prediction
input logic debug_mode_i,
// global input
input logic [63:0] boot_addr_i,
// Set a new PC
// mispredict
input bp_resolve_t resolved_branch_i, // from controller signaling a branch_predict -> update BTB
// from commit, when flushing the whole pipeline
input logic set_pc_commit_i, // Take the PC from commit stage
input logic [63:0] pc_commit_i, // PC of instruction in commit stage
// CSR input
input logic [63:0] epc_i, // exception PC which we need to return to
input logic eret_i, // return from exception
input logic [63:0] trap_vector_base_i, // base of trap vector
input logic ex_valid_i, // exception is valid - from commit
input logic set_debug_pc_i, // jump to debug address
// Instruction Fetch
input icache_dreq_o_t icache_dreq_i,
output icache_dreq_i_t icache_dreq_o,
// instruction output port -> to processor back-end
output frontend_fetch_t fetch_entry_o, // fetch entry containing all relevant data for the ID stage
output logic fetch_entry_valid_o, // instruction in IF is valid
input logic fetch_ack_i // ID acknowledged this instruction
input logic clk_i, // Clock
input logic rst_ni, // Asynchronous reset active low
input logic flush_i, // flush request for PCGEN
input logic flush_bp_i, // flush branch prediction
input logic debug_mode_i,
// global input
input logic [63:0] boot_addr_i,
// Set a new PC
// mispredict
input bp_resolve_t resolved_branch_i, // from controller signaling a branch_predict -> update BTB
// from commit, when flushing the whole pipeline
input logic set_pc_commit_i, // Take the PC from commit stage
input logic [63:0] pc_commit_i, // PC of instruction in commit stage
// CSR input
input logic [63:0] epc_i, // exception PC which we need to return to
input logic eret_i, // return from exception
input logic [63:0] trap_vector_base_i, // base of trap vector
input logic ex_valid_i, // exception is valid - from commit
input logic set_debug_pc_i, // jump to debug address
// Instruction Fetch
output icache_dreq_i_t icache_dreq_o,
input icache_dreq_o_t icache_dreq_i,
// instruction output port -> to processor back-end
output fetch_entry_t fetch_entry_o, // fetch entry containing all relevant data for the ID stage
output logic fetch_entry_valid_o, // instruction in IF is valid
input logic fetch_entry_ready_i // ID acknowledged this instruction
);
// Registers
logic [31:0] icache_data_q;
logic icache_valid_q;
logic icache_ex_valid_q;
logic instruction_valid;
logic [INSTR_PER_FETCH-1:0] instr_is_compressed;
logic [63:0] icache_vaddr_q;
// BHT, BTB and RAS prediction
bht_prediction_t bht_prediction;
btb_prediction_t btb_prediction;
ras_t ras_predict;
bht_update_t bht_update;
btb_update_t btb_update;
logic ras_push, ras_pop;
logic [63:0] ras_update;
// Instruction Cache Registers, from I$
logic [FETCH_WIDTH-1:0] icache_data_q;
logic icache_valid_q;
logic icache_ex_valid_q;
logic [63:0] icache_vaddr_q;
logic instr_queue_ready;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_queue_consumed;
// upper-most branch-prediction from last cycle
btb_prediction_t btb_q;
bht_prediction_t bht_q;
// instruction fetch is ready
logic if_ready;
logic [63:0] npc_d, npc_q; // next PC
logic npc_rst_load_q; //indicates whether we come out of reset (then we need to load boot_addr_i)
// indicates whether we come out of reset (then we need to load boot_addr_i)
logic npc_rst_load_q;
logic replay;
logic [63:0] replay_addr;
// shift amount
logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] shamt;
// address will always be 16 bit aligned, make this explicit here
assign shamt = icache_dreq_i.vaddr[$clog2(ariane_pkg::INSTR_PER_FETCH):1];
// -----------------------
// Ctrl Flow Speculation
// -----------------------
@ -74,209 +79,185 @@ module frontend #(
rvi_jalr, rvi_jump;
logic [INSTR_PER_FETCH-1:0][63:0] rvi_imm;
// RVC branching
logic [INSTR_PER_FETCH-1:0] is_rvc;
logic [INSTR_PER_FETCH-1:0] rvc_branch, rvc_jump, rvc_jr, rvc_return,
rvc_jalr, rvc_call;
logic [INSTR_PER_FETCH-1:0][63:0] rvc_imm;
// re-aligned instruction and address (coming from cache - combinationally)
logic [INSTR_PER_FETCH-1:0][31:0] instr;
logic [INSTR_PER_FETCH-1:0][63:0] addr;
logic [INSTR_PER_FETCH-1:0] instruction_valid;
// BHT, BTB and RAS prediction
bht_prediction_t [INSTR_PER_FETCH-1:0] bht_prediction;
btb_prediction_t [INSTR_PER_FETCH-1:0] btb_prediction;
bht_prediction_t [INSTR_PER_FETCH-1:0] bht_prediction_shifted;
btb_prediction_t [INSTR_PER_FETCH-1:0] btb_prediction_shifted;
ras_t ras_predict;
logic [63:0] bp_vaddr;
logic bp_valid; // we have a valid branch-prediction
logic is_mispredict;
// branch-prediction which we inject into the pipeline
branchpredict_sbe_t bp_sbe;
// fetch fifo credit system
logic fifo_valid, fifo_ready, fifo_empty, fifo_pop;
logic s2_eff_kill, issue_req, s2_in_flight_d, s2_in_flight_q;
logic [$clog2(FETCH_FIFO_DEPTH):0] fifo_credits_d;
logic [$clog2(FETCH_FIFO_DEPTH):0] fifo_credits_q;
// branch-predict update
logic is_mispredict;
logic ras_push, ras_pop;
logic [63:0] ras_update;
// save the unaligned part of the instruction to this ff
logic [15:0] unaligned_instr_d, unaligned_instr_q;
// the last instruction was unaligned
logic unaligned_d, unaligned_q;
// register to save the unaligned address
logic [63:0] unaligned_address_d, unaligned_address_q;
// Instruction FIFO
logic [63:0] predict_address;
cf_t [ariane_pkg::INSTR_PER_FETCH-1:0] cf_type;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] taken_rvi_cf;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] taken_rvc_cf;
for (genvar i = 0; i < INSTR_PER_FETCH; i ++) begin
// LSB != 2'b11
assign instr_is_compressed[i] = ~&icache_data_q[i * 16 +: 2];
logic serving_unaligned;
// Re-align instructions
instr_realign i_instr_realign (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.flush_i ( icache_dreq_o.kill_s2 ),
.valid_i ( icache_valid_q ),
.serving_unaligned_o ( serving_unaligned ),
.address_i ( icache_vaddr_q ),
.data_i ( icache_data_q ),
.valid_o ( instruction_valid ),
.addr_o ( addr ),
.instr_o ( instr )
);
// --------------------
// Branch Prediction
// --------------------
// select the right branch prediction result
// in case we are serving an unaligned instruction in instr[0] we need to take
// the prediction we saved from the previous fetch
assign bht_prediction_shifted[0] = (serving_unaligned) ? bht_q : bht_prediction[0];
assign btb_prediction_shifted[0] = (serving_unaligned) ? btb_q : btb_prediction[0];
// for all other predictions we can use the generated address to index
// into the branch prediction data structures
// Slots 1 .. INSTR_PER_FETCH-1: pick the BHT/BTB prediction entry that is
// indexed by address bits [$clog2(INSTR_PER_FETCH):1] of the re-aligned
// instruction in that slot. Slot 0 is handled separately outside this loop.
for (genvar i = 1; i < INSTR_PER_FETCH; i++) begin : gen_prediction_address
assign bht_prediction_shifted[i] = bht_prediction[addr[i][$clog2(INSTR_PER_FETCH):1]];
assign btb_prediction_shifted[i] = btb_prediction[addr[i][$clog2(INSTR_PER_FETCH):1]];
end
// for the return address stack it doesn't matter as we have the
// address of the call/return already
logic bp_valid;
logic [INSTR_PER_FETCH-1:0] is_branch;
logic [INSTR_PER_FETCH-1:0] is_call;
logic [INSTR_PER_FETCH-1:0] is_jump;
logic [INSTR_PER_FETCH-1:0] is_return;
logic [INSTR_PER_FETCH-1:0] is_jalr;
// Classify every re-aligned instruction slot by its control-flow kind.
// Each flag is qualified with `instruction_valid` so that invalid slots
// never contribute to a prediction.
for (genvar i = 0; i < INSTR_PER_FETCH; i++) begin
  // conditional branch (RVI or RVC) -> handled by the BHT
  assign is_branch[i] = (rvc_branch[i] | rvi_branch[i]) & instruction_valid[i];
  // function call -> pushes onto the RAS
  assign is_call[i] = (rvc_call[i] | rvi_call[i]) & instruction_valid[i];
  // function return -> pops from the RAS
  assign is_return[i] = (rvc_return[i] | rvi_return[i]) & instruction_valid[i];
  // unconditional jump with an immediate target -> resolved immediately
  assign is_jump[i] = (rvc_jump[i] | rvi_jump[i]) & instruction_valid[i];
  // register-indirect jump that is neither a call nor a return -> handled by the BTB
  assign is_jalr[i] = (rvi_jalr[i] | rvc_jalr[i] | rvc_jr[i])
                    & instruction_valid[i]
                    & ~(is_return[i] | is_call[i]);
end
// Soft-realignment to do branch-prediction
always_comb begin : re_align
unaligned_d = unaligned_q;
unaligned_address_d = unaligned_address_q;
unaligned_instr_d = unaligned_instr_q;
instruction_valid = icache_valid_q;
// taken/not taken
always_comb begin
taken_rvi_cf = '0;
taken_rvc_cf = '0;
predict_address = '0;
// 32-bit can contain 2 instructions
instr[0] = icache_data_q;
addr[0] = icache_vaddr_q;
for (int i = 0; i < INSTR_PER_FETCH; i++) cf_type[i] = ariane_pkg::NoCF;
instr[1] = '0;
addr[1] = {icache_vaddr_q[63:2], 2'b10};
ras_push = 1'b0;
ras_pop = 1'b0;
ras_update = '0;
if (icache_valid_q) begin
// last instruction was unaligned
if (unaligned_q) begin
instr[0] = {icache_data_q[15:0], unaligned_instr_q};
addr[0] = unaligned_address_q;
unaligned_address_d = {icache_vaddr_q[63:2], 2'b10};
unaligned_instr_d = icache_data_q[31:16]; // save the upper bits for next cycle
// check if this is instruction is still unaligned e.g.: it is not compressed
// if its compressed re-set unaligned flag
// for 32 bit we can simply check the next instruction and whether it is compressed or not
// if it is compressed the next fetch will contain an aligned instruction
if (instr_is_compressed[1]) begin
unaligned_d = 1'b0;
instr[1] = {16'b0, icache_data_q[31:16]};
end
end else if (instr_is_compressed[0]) begin // instruction zero is RVC
// is instruction 1 also compressed
// yes? -> no problem, no -> we've got an unaligned instruction
if (instr_is_compressed[1]) begin
instr[1] = {16'b0, icache_data_q[31:16]};
end else begin
unaligned_instr_d = icache_data_q[31:16];
unaligned_address_d = {icache_vaddr_q[63:2], 2'b10};
unaligned_d = 1'b1;
end
end // else -> normal fetch
end
// we started to fetch on a unaligned boundary with a whole instruction -> wait until we've
// received the next instruction
if (icache_valid_q && icache_vaddr_q[1] && !instr_is_compressed[1]) begin
instruction_valid = 1'b0;
unaligned_d = 1'b1;
unaligned_address_d = {icache_vaddr_q[63:2], 2'b10};
unaligned_instr_d = icache_data_q[31:16];
end
// if we killed the consecutive fetch we are starting on a clean slate
if (icache_dreq_o.kill_s2) begin
unaligned_d = 1'b0;
end
end
logic [INSTR_PER_FETCH:0] taken;
// control front-end + branch-prediction
always_comb begin : frontend_ctrl
automatic logic take_rvi_cf; // take the control flow change (non-compressed)
automatic logic take_rvc_cf; // take the control flow change (compressed)
take_rvi_cf = 1'b0;
take_rvc_cf = 1'b0;
ras_pop = 1'b0;
ras_push = 1'b0;
ras_update = '0;
taken = '0;
take_rvi_cf = 1'b0;
bp_vaddr = '0; // predicted address
bp_valid = 1'b0; // prediction is valid
bp_sbe.cf_type = RAS;
// only predict if the response is valid
if (instruction_valid) begin
// look at instruction 0, 1, 2, ...
for (int unsigned i = 0; i < INSTR_PER_FETCH; i++) begin
// only speculate if the previous instruction was not taken
if (!taken[i]) begin
// function call
ras_push = rvi_call[i] | rvc_call[i];
ras_update = addr[i] + (rvc_call[i] ? 2 : 4);
// Branch Prediction - **speculative**
if (rvi_branch[i] || rvc_branch[i]) begin
bp_sbe.cf_type = BHT;
// dynamic prediction valid?
if (bht_prediction.valid) begin
take_rvi_cf = rvi_branch[i] & (bht_prediction.taken | bht_prediction.strongly_taken);
take_rvc_cf = rvc_branch[i] & (bht_prediction.taken | bht_prediction.strongly_taken);
// default to static prediction
end else begin
// set if immediate is negative - static prediction
take_rvi_cf = rvi_branch[i] & rvi_imm[i][63];
take_rvc_cf = rvc_branch[i] & rvc_imm[i][63];
end
end
// unconditional jumps
if (rvi_jump[i] || rvc_jump[i]) begin
take_rvi_cf = rvi_jump[i];
take_rvc_cf = rvc_jump[i];
end
// to take this jump we need a valid prediction target **speculative**
if ((rvi_jalr[i] || rvc_jalr[i]) && ~(rvi_call[i] || rvc_call[i])) begin
bp_sbe.cf_type = BTB;
if (btb_prediction.valid) begin
bp_vaddr = btb_prediction.target_address;
taken[i+1] = 1'b1;
end
end
// is it a return and the RAS contains a valid prediction? **speculative**
if ((rvi_return[i] || rvc_return[i]) && ras_predict.valid) begin
bp_vaddr = ras_predict.ra;
ras_pop = 1'b1;
taken[i+1] = 1'b1;
bp_sbe.cf_type = RAS;
end
if (take_rvi_cf) begin
taken[i+1] = 1'b1;
bp_vaddr = addr[i] + rvi_imm[i];
end
if (take_rvc_cf) begin
taken[i+1] = 1'b1;
bp_vaddr = addr[i] + rvc_imm[i];
end
// we are not interested in the lower instruction
if (icache_vaddr_q[1]) begin
taken[1] = 1'b0;
// TODO(zarubaf): that seems to be overly pessimistic
ras_pop = 1'b0;
ras_push = 1'b0;
end
end
// lower most prediction gets precedence
for (int i = INSTR_PER_FETCH - 1; i >= 0 ; i--) begin
unique case ({is_branch[i], is_return[i], is_jump[i], is_jalr[i]})
4'b0000:; // regular instruction e.g.: no branch
// unconditional jump to register, we need the BTB to resolve this
4'b0001: begin
ras_pop = 1'b0;
ras_push = 1'b0;
if (btb_prediction_shifted[i].valid) begin
predict_address = btb_prediction_shifted[i].target_address;
cf_type[i] = ariane_pkg::JumpR;
end
end
bp_valid = |taken;
// assemble scoreboard entry
bp_sbe.valid = bp_valid;
bp_sbe.predict_address = bp_vaddr;
bp_sbe.predict_taken = bp_valid;
end
// its an unconditional jump to an immediate
4'b0010: begin
ras_pop = 1'b0;
ras_push = 1'b0;
taken_rvi_cf[i] = rvi_jump[i];
taken_rvc_cf[i] = rvc_jump[i];
cf_type[i] = ariane_pkg::Jump;
end
// return
4'b0100: begin
// make sure to only alter the RAS if we actually consumed the instruction
ras_pop = ras_predict.valid & instr_queue_consumed[i];
ras_push = 1'b0;
predict_address = ras_predict.ra;
cf_type[i] = ariane_pkg::Return;
end
// branch prediction
4'b1000: begin
ras_pop = 1'b0;
ras_push = 1'b0;
// if we have a valid dynamic prediction use it
if (bht_prediction_shifted[i].valid) begin
taken_rvi_cf[i] = rvi_branch[i] & bht_prediction_shifted[i].taken;
taken_rvc_cf[i] = rvc_branch[i] & bht_prediction_shifted[i].taken;
// otherwise default to static prediction
end else begin
// set if immediate is negative - static prediction
taken_rvi_cf[i] = rvi_branch[i] & rvi_imm[i][63];
taken_rvc_cf[i] = rvc_branch[i] & rvc_imm[i][63];
end
if (taken_rvi_cf[i] || taken_rvc_cf[i]) cf_type[i] = ariane_pkg::Branch;
end
default:;
// default: $error("Decoded more than one control flow");
endcase
// if this instruction, in addition, is a call, save the resulting address
// but only if we actually consumed the address
if (is_call[i]) begin
ras_push = instr_queue_consumed[i];
ras_update = addr[i] + (rvc_call[i] ? 2 : 4);
end
// calculate the jump target address
if (taken_rvc_cf[i] || taken_rvi_cf[i]) begin
predict_address = addr[i] + (taken_rvc_cf[i] ? rvc_imm[i] : rvi_imm[i]);
end
end
end
// or reduce struct
// A branch prediction is valid as soon as any slot carries a control-flow
// type other than NoCF.
always_comb begin
  bp_valid = 1'b0;
  for (int slot = 0; slot < INSTR_PER_FETCH; slot++) begin
    bp_valid = bp_valid | (cf_type[slot] != NoCF);
  end
end
assign is_mispredict = resolved_branch_i.valid & resolved_branch_i.is_mispredict;
// we mis-predicted so kill the icache request and the fetch queue
assign icache_dreq_o.kill_s1 = is_mispredict | flush_i;
// if we have a valid branch-prediction we need to kill the last cache request
assign icache_dreq_o.kill_s2 = icache_dreq_o.kill_s1 | bp_valid;
assign fifo_valid = icache_valid_q;
// ----------------------------------------
// Cache interface
assign icache_dreq_o.req = instr_queue_ready;
assign if_ready = icache_dreq_i.ready & instr_queue_ready;
// We need to flush the cache pipeline if:
// 1. We mispredicted
// 2. Want to flush the whole processor front-end
// 3. Need to replay an instruction because the fetch-fifo was full
assign icache_dreq_o.kill_s1 = is_mispredict | flush_i | replay;
// if we have a valid branch-prediction we need to only kill the last cache request
// also if we killed the first stage we also need to kill the second stage (inclusive flush)
assign icache_dreq_o.kill_s2 = icache_dreq_o.kill_s1 | bp_valid;
// Update Control Flow Predictions
// ----------------------------------------
// BHT
assign bht_update.valid = resolved_branch_i.valid & (resolved_branch_i.cf_type == BHT);
bht_update_t bht_update;
btb_update_t btb_update;
assign bht_update.valid = resolved_branch_i.valid
& (resolved_branch_i.cf_type == ariane_pkg::Branch);
assign bht_update.pc = resolved_branch_i.pc;
assign bht_update.mispredict = resolved_branch_i.is_mispredict;
assign bht_update.taken = resolved_branch_i.is_taken;
// BTB
assign btb_update.valid = resolved_branch_i.valid & (resolved_branch_i.cf_type == BTB);
// only update mispredicted branches e.g. no returns from the RAS
assign btb_update.valid = resolved_branch_i.valid
& resolved_branch_i.is_mispredict
& (resolved_branch_i.cf_type == ariane_pkg::JumpR);
assign btb_update.pc = resolved_branch_i.pc;
assign btb_update.target_address = resolved_branch_i.target_address;
@ -284,7 +265,7 @@ module frontend #(
// Next PC
// -------------------
// next PC (NPC) can come from (in order of precedence):
// 0. Default assignment
// 0. Default assignment/replay instruction
// 1. Branch Predict taken
// 2. Control flow change request (misprediction)
// 3. Return from environment call
@ -293,211 +274,160 @@ module frontend #(
// Mis-predict handling is a little bit different
// select PC a.k.a PC Gen
always_comb begin : npc_select
automatic logic [63:0] fetch_address;
// check whether we come out of reset
// this is a workaround. some tools have issues
// having boot_addr_i in the asynchronous
// reset assignment to npc_q, even though
// boot_addr_i will be assigned a constant
// on the top-level.
if (npc_rst_load_q) begin
npc_d = boot_addr_i;
fetch_address = boot_addr_i;
end else begin
fetch_address = npc_q;
// keep stable by default
npc_d = npc_q;
end
// -------------------------------
// 1. Branch Prediction
// -------------------------------
if (bp_valid) begin
fetch_address = bp_vaddr;
npc_d = bp_vaddr;
end
// -------------------------------
// 0. Default assignment
// -------------------------------
if (if_ready) begin
npc_d = {fetch_address[63:2], 2'b0} + 'h4;
end
// -------------------------------
// 2. Control flow change request
// -------------------------------
if (is_mispredict) begin
npc_d = resolved_branch_i.target_address;
end
// -------------------------------
// 3. Return from environment call
// -------------------------------
if (eret_i) begin
npc_d = epc_i;
end
// -------------------------------
// 4. Exception/Interrupt
// -------------------------------
if (ex_valid_i) begin
npc_d = trap_vector_base_i;
end
// -----------------------------------------------
// 5. Pipeline Flush because of CSR side effects
// -----------------------------------------------
// On a pipeline flush start fetching from the next address
// of the instruction in the commit stage
if (set_pc_commit_i) begin
// we came here from a flush request of a CSR instruction or AMO,
// as CSR or AMO instructions do not exist in a compressed form
// we can unconditionally do PC + 4 here
// TODO(zarubaf) This adder can at least be merged with the one in the csr_regfile stage
npc_d = pc_commit_i + 64'h4;
end
// -------------------------------
// 6. Debug
// -------------------------------
// enter debug on a hard-coded base-address
if (set_debug_pc_i) begin
npc_d = DmBaseAddress + dm::HaltAddress;
end
icache_dreq_o.vaddr = fetch_address;
automatic logic [63:0] fetch_address;
// check whether we come out of reset
// this is a workaround. some tools have issues
// having boot_addr_i in the asynchronous
// reset assignment to npc_q, even though
// boot_addr_i will be assigned a constant
// on the top-level.
if (npc_rst_load_q) begin
npc_d = boot_addr_i;
fetch_address = boot_addr_i;
end else begin
fetch_address = npc_q;
// keep stable by default
npc_d = npc_q;
end
// 0. Branch Prediction
if (bp_valid) begin
fetch_address = predict_address;
npc_d = predict_address;
end
// 1. Default assignment
if (if_ready) npc_d = {fetch_address[63:2], 2'b0} + 'h4;
// 2. Replay instruction fetch
if (replay) npc_d = replay_addr;
// 3. Control flow change request
if (is_mispredict) npc_d = resolved_branch_i.target_address;
// 4. Return from environment call
if (eret_i) npc_d = epc_i;
// 5. Exception/Interrupt
if (ex_valid_i) npc_d = trap_vector_base_i;
// 6. Pipeline Flush because of CSR side effects
// On a pipeline flush start fetching from the next address
// of the instruction in the commit stage
// we came here from a flush request of a CSR instruction or AMO,
// as CSR or AMO instructions do not exist in a compressed form
// we can unconditionally do PC + 4 here
// TODO(zarubaf) This adder can at least be merged with the one in the csr_regfile stage
if (set_pc_commit_i) npc_d = pc_commit_i + 64'h4;
// 7. Debug
// enter debug on a hard-coded base-address
if (set_debug_pc_i) npc_d = ArianeCfg.DmBaseAddress + dm::HaltAddress;
icache_dreq_o.vaddr = fetch_address;
end
// -------------------
// Credit-based fetch FIFO flow ctrl
// -------------------
assign fifo_credits_d = (flush_i) ? FETCH_FIFO_DEPTH :
fifo_credits_q + fifo_pop + s2_eff_kill - issue_req;
// check whether there is a request in flight that is being killed now
// if this is the case, we need to increment the credit by 1
assign s2_eff_kill = s2_in_flight_q & icache_dreq_o.kill_s2;
assign s2_in_flight_d = (flush_i) ? 1'b0 :
(issue_req) ? 1'b1 :
(icache_dreq_i.valid) ? 1'b0 :
s2_in_flight_q;
// only enable counter if current request is not being killed
assign issue_req = if_ready & (~icache_dreq_o.kill_s1);
assign fifo_pop = fetch_ack_i & fetch_entry_valid_o;
assign fifo_ready = (|fifo_credits_q);
assign if_ready = icache_dreq_i.ready & fifo_ready;
assign icache_dreq_o.req = fifo_ready;
assign fetch_entry_valid_o = ~fifo_empty;
//pragma translate_off
`ifndef VERILATOR
fetch_fifo_credits0 : assert property (
@(posedge clk_i) disable iff (~rst_ni) (fifo_credits_q <= FETCH_FIFO_DEPTH))
else $fatal(1,"[frontend] fetch fifo credits must be <= FETCH_FIFO_DEPTH!");
initial begin
assert (FETCH_FIFO_DEPTH <= 8) else $fatal(1,"[frontend] fetch fifo deeper than 8 not supported");
assert (FETCH_WIDTH == 32) else $fatal(1,"[frontend] fetch width != not supported");
end
`endif
//pragma translate_on
logic [FETCH_WIDTH-1:0] icache_data;
// re-align the cache line
assign icache_data = icache_dreq_i.data >> {shamt, 4'b0};
always_ff @(posedge clk_i or negedge rst_ni) begin
if (~rst_ni) begin
npc_q <= '0;
npc_rst_load_q <= 1'b1;
icache_data_q <= '0;
icache_valid_q <= 1'b0;
icache_vaddr_q <= 'b0;
icache_ex_valid_q <= 1'b0;
unaligned_q <= 1'b0;
unaligned_address_q <= '0;
unaligned_instr_q <= '0;
fifo_credits_q <= FETCH_FIFO_DEPTH;
s2_in_flight_q <= 1'b0;
end else begin
npc_rst_load_q <= 1'b0;
npc_q <= npc_d;
icache_data_q <= icache_dreq_i.data;
icache_valid_q <= icache_dreq_i.valid;
icache_vaddr_q <= icache_dreq_i.vaddr;
icache_ex_valid_q <= icache_dreq_i.ex.valid;
unaligned_q <= unaligned_d;
unaligned_address_q <= unaligned_address_d;
unaligned_instr_q <= unaligned_instr_d;
fifo_credits_q <= fifo_credits_d;
s2_in_flight_q <= s2_in_flight_d;
if (!rst_ni) begin
npc_rst_load_q <= 1'b1;
npc_q <= '0;
icache_data_q <= '0;
icache_valid_q <= 1'b0;
icache_vaddr_q <= 'b0;
icache_ex_valid_q <= 1'b0;
btb_q <= '0;
bht_q <= '0;
end else begin
npc_rst_load_q <= 1'b0;
npc_q <= npc_d;
icache_valid_q <= icache_dreq_i.valid;
// Capture the I$ response only when it is valid, so the registered data,
// address, exception flag and saved predictions keep their previous value
// otherwise.
if (icache_dreq_i.valid) begin
  icache_data_q <= icache_data;
  icache_vaddr_q <= icache_dreq_i.vaddr;
  // register only the valid bit of the exception: the flag is a single bit,
  // so select the field explicitly instead of truncating the whole struct
  icache_ex_valid_q <= icache_dreq_i.ex.valid;
  // save the uppermost prediction
  btb_q <= btb_prediction[INSTR_PER_FETCH-1];
  bht_q <= bht_prediction[INSTR_PER_FETCH-1];
end
end
end
ras #(
.DEPTH ( RAS_DEPTH )
.DEPTH ( ArianeCfg.RASDepth )
) i_ras (
.clk_i,
.rst_ni,
.flush_i( flush_bp_i ),
.push_i ( ras_push ),
.pop_i ( ras_pop ),
.data_i ( ras_update ),
.data_o ( ras_predict )
.clk_i,
.rst_ni,
.flush_i( flush_bp_i ),
.push_i ( ras_push ),
.pop_i ( ras_pop ),
.data_i ( ras_update ),
.data_o ( ras_predict )
);
btb #(
.NR_ENTRIES ( BTB_ENTRIES )
.NR_ENTRIES ( ArianeCfg.BTBEntries )
) i_btb (
.clk_i,
.rst_ni,
.flush_i ( flush_bp_i ),
.debug_mode_i,
.vpc_i ( icache_vaddr_q ),
.btb_update_i ( btb_update ),
.btb_prediction_o ( btb_prediction )
.clk_i,
.rst_ni,
.flush_i ( flush_bp_i ),
.debug_mode_i,
.vpc_i ( icache_vaddr_q ),
.btb_update_i ( btb_update ),
.btb_prediction_o ( btb_prediction )
);
bht #(
.NR_ENTRIES ( BHT_ENTRIES )
.NR_ENTRIES ( ArianeCfg.BHTEntries )
) i_bht (
.clk_i,
.rst_ni,
.flush_i ( flush_bp_i ),
.debug_mode_i,
.vpc_i ( icache_vaddr_q ),
.bht_update_i ( bht_update ),
.bht_prediction_o ( bht_prediction )
.clk_i,
.rst_ni,
.flush_i ( flush_bp_i ),
.debug_mode_i,
.vpc_i ( icache_vaddr_q ),
.bht_update_i ( bht_update ),
.bht_prediction_o ( bht_prediction )
);
for (genvar i = 0; i < INSTR_PER_FETCH; i++) begin
instr_scan i_instr_scan (
.instr_i ( instr[i] ),
.is_rvc_o ( is_rvc[i] ),
.rvi_return_o ( rvi_return[i] ),
.rvi_call_o ( rvi_call[i] ),
.rvi_branch_o ( rvi_branch[i] ),
.rvi_jalr_o ( rvi_jalr[i] ),
.rvi_jump_o ( rvi_jump[i] ),
.rvi_imm_o ( rvi_imm[i] ),
.rvc_branch_o ( rvc_branch[i] ),
.rvc_jump_o ( rvc_jump[i] ),
.rvc_jr_o ( rvc_jr[i] ),
.rvc_return_o ( rvc_return[i] ),
.rvc_jalr_o ( rvc_jalr[i] ),
.rvc_call_o ( rvc_call[i] ),
.rvc_imm_o ( rvc_imm[i] )
);
// we need to inspect up to INSTR_PER_FETCH instructions for branches
// and jumps
// One scanner per fetch slot: each instance receives the re-aligned
// instruction word `instr[i]` and drives the per-slot RVI/RVC control-flow
// flags and immediates consumed by the prediction logic.
for (genvar i = 0; i < INSTR_PER_FETCH; i++) begin : gen_instr_scan
instr_scan i_instr_scan (
.instr_i ( instr[i] ),
.rvi_return_o ( rvi_return[i] ),
.rvi_call_o ( rvi_call[i] ),
.rvi_branch_o ( rvi_branch[i] ),
.rvi_jalr_o ( rvi_jalr[i] ),
.rvi_jump_o ( rvi_jump[i] ),
.rvi_imm_o ( rvi_imm[i] ),
.rvc_branch_o ( rvc_branch[i] ),
.rvc_jump_o ( rvc_jump[i] ),
.rvc_jr_o ( rvc_jr[i] ),
.rvc_return_o ( rvc_return[i] ),
.rvc_jalr_o ( rvc_jalr[i] ),
.rvc_call_o ( rvc_call[i] ),
.rvc_imm_o ( rvc_imm[i] )
);
end
fifo_v3 #(
.DEPTH ( 8 ),
.dtype ( frontend_fetch_t )
) i_fetch_fifo (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.flush_i ( flush_i ),
.testmode_i ( 1'b0 ),
.full_o ( ),
.empty_o ( fifo_empty ),
.usage_o ( ),
.data_i ( {icache_vaddr_q, icache_data_q, bp_sbe, taken[INSTR_PER_FETCH:1], icache_ex_valid_q} ),
.push_i ( fifo_valid ),
.data_o ( fetch_entry_o ),
.pop_i ( fifo_pop )
// Instruction queue between the fetch/prediction logic and the back-end.
// It reports which slots it accepted (`consumed_o`), whether it can take
// more (`ready_o`) and requests a re-fetch via `replay_o`/`replay_addr_o`.
instr_queue i_instr_queue (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.flush_i ( flush_i ),
.instr_i ( instr ), // from re-aligner
.addr_i ( addr ), // from re-aligner
.exception_i ( icache_ex_valid_q ), // from I$
.predict_address_i ( predict_address ),
.cf_type_i ( cf_type ),
.valid_i ( instruction_valid ), // from re-aligner
.consumed_o ( instr_queue_consumed ),
.ready_o ( instr_queue_ready ),
.replay_o ( replay ),
.replay_addr_o ( replay_addr ),
.fetch_entry_o ( fetch_entry_o ), // to back-end
.fetch_entry_valid_o ( fetch_entry_valid_o ), // to back-end
.fetch_entry_ready_i ( fetch_entry_ready_i ) // from back-end
);
// pragma translate_off
`ifndef VERILATOR
// Elaboration-time sanity check of the supported fetch widths.
initial begin
  // the first argument of $fatal is the finish number, the message follows it
  assert (FETCH_WIDTH == 32 || FETCH_WIDTH == 64)
    else $fatal(1, "[frontend] fetch width must be 32 or 64 bit, other widths are not supported");
end
`endif
// pragma translate_on
endmodule

353
src/frontend/instr_queue.sv Normal file
View file

@ -0,0 +1,353 @@
// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Florian Zaruba, ETH Zurich
// Date: 26.10.2018
// Description: Instruction Queue, separates instruction front-end from processor
// back-end.
//
// This is an optimized instruction queue which supports the handling of
// compressed instructions (16 bit instructions). Internally it is organized as
// FETCH_ENTRY x 32 bit queues which are filled in a consecutive manner. Two pointers
// point into (`idx_is_q` and `idx_ds_q`) the fill port and the read port. The read port
// is designed so that it will easily allow for multiple issue implementation.
// The input supports arbitrary power of two instruction fetch widths.
//
// The queue supports handling of branch prediction and will take care of
// only saving a valid instruction stream.
//
// Furthermore it contains a replay interface in case the instruction queue
// is already full. As instructions are in general easily replayed this should
// increase the efficiency as I$ misses are potentially hidden. This stands in
// contrast to pessimistic actions (early stalling) or credit based approaches.
// Credit based systems might be difficult to implement with the current system
// as we do not exactly know how much space we are going to need in the fifos
// as each instruction can take either one or two slots.
//
// So the consumed/valid interface degenerates to an `information` interface. If the
// upstream circuit keeps pushing, the queue will discard the information
// and start replaying from the point where it could last manage to accept instructions.
//
// The instruction front-end will stop issuing instructions as soon as the
// fifo is full. This will gate the logic if the processor is e.g.: halted
//
// TODO(zarubaf): The instruction queues can be reduced to 16 bit. Potentially
// the replay mechanism gets more complicated as it can be that a 32 bit instruction
// can not be pushed at once.
module instr_queue (
input logic clk_i,
input logic rst_ni,
input logic flush_i,
input logic [ariane_pkg::INSTR_PER_FETCH-1:0][31:0] instr_i,
input logic [ariane_pkg::INSTR_PER_FETCH-1:0][63:0] addr_i,
input logic [ariane_pkg::INSTR_PER_FETCH-1:0] valid_i,
output logic ready_o,
output logic [ariane_pkg::INSTR_PER_FETCH-1:0] consumed_o,
// we've encountered an exception, at this point the only possible exceptions are page-table faults
input logic exception_i,
// branch predict
input logic [63:0] predict_address_i,
input ariane_pkg::cf_t [ariane_pkg::INSTR_PER_FETCH-1:0] cf_type_i,
// replay instruction because one of the FIFO was already full
output logic replay_o,
output logic [63:0] replay_addr_o, // address at which to replay this instruction
// to processor backend
output ariane_pkg::fetch_entry_t fetch_entry_o,
output logic fetch_entry_valid_o,
input logic fetch_entry_ready_i
);
typedef struct packed {
logic [31:0] instr; // instruction word
ariane_pkg::cf_t cf; // branch was taken
logic ex; // exception happened
} instr_data_t;
logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] branch_index;
// instruction queues
logic [ariane_pkg::INSTR_PER_FETCH-1:0]
[$clog2(ariane_pkg::FETCH_FIFO_DEPTH)-1:0] instr_queue_usage;
instr_data_t [ariane_pkg::INSTR_PER_FETCH-1:0] instr_data_in, instr_data_out;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] push_instr, push_instr_fifo;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] pop_instr;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_queue_full;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_queue_empty;
logic instr_overflow;
// address queue
logic [$clog2(ariane_pkg::FETCH_FIFO_DEPTH)-1:0] address_queue_usage;
logic [63:0] address_out;
logic pop_address;
logic push_address;
logic full_address;
logic empty_address;
logic address_overflow;
// input stream counter
logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] idx_is_d, idx_is_q;
// Registers
// output FIFO select, one-hot
logic [ariane_pkg::INSTR_PER_FETCH-1:0] idx_ds_d, idx_ds_q;
logic [63:0] pc_d, pc_q; // current PC
logic reset_address_d, reset_address_q; // we need to re-set the address because of a flush
logic [ariane_pkg::INSTR_PER_FETCH*2-2:0] branch_mask_extended;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] branch_mask;
logic branch_empty;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] taken;
// shift amount, e.g.: instructions we want to retire
logic [$clog2(ariane_pkg::INSTR_PER_FETCH):0] popcount;
logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] shamt;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] valid;
logic [ariane_pkg::INSTR_PER_FETCH*2-1:0] consumed_extended;
// FIFO mask
logic [ariane_pkg::INSTR_PER_FETCH*2-1:0] fifo_pos_extended;
logic [ariane_pkg::INSTR_PER_FETCH-1:0] fifo_pos;
logic [ariane_pkg::INSTR_PER_FETCH*2-1:0][31:0] instr;
ariane_pkg::cf_t [ariane_pkg::INSTR_PER_FETCH*2-1:0] cf;
// replay interface
logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_overflow_fifo;
assign ready_o = ~(|instr_queue_full) & ~full_address;
// A slot is flagged in `taken` whenever its control-flow type is anything
// other than NoCF.
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_unpack_taken
  assign taken[i] = ~(cf_type_i[i] == ariane_pkg::NoCF);
end
// calculate a branch mask, e.g.: get the first taken branch
lzc #(
.WIDTH ( ariane_pkg::INSTR_PER_FETCH ),
.MODE ( 0 ) // count trailing zeros
) i_lzc_branch_index (
.in_i ( taken ), // we want to count trailing zeros
.cnt_o ( branch_index ), // first branch on branch_index
.empty_o ( branch_empty )
);
// the first index is for sure valid
// for example (64 bit fetch):
// taken mask: 0 1 1 0
// trailing zero count = 1
// 0 0 0 1, 1 1 1 << 1 = 0 0 1 1, 1 1 0
// take the upper 4 bits: 0 0 1 1
assign branch_mask_extended = {{{ariane_pkg::INSTR_PER_FETCH-1}{1'b0}}, {{ariane_pkg::INSTR_PER_FETCH}{1'b1}}} << branch_index;
assign branch_mask = branch_mask_extended[ariane_pkg::INSTR_PER_FETCH * 2 - 2:ariane_pkg::INSTR_PER_FETCH - 1];
// mask with taken branches to get the actual amount of instructions we want to push
assign valid = valid_i & branch_mask;
// rotate right again
assign consumed_extended = {push_instr_fifo, push_instr_fifo} >> idx_is_q;
assign consumed_o = consumed_extended[ariane_pkg::INSTR_PER_FETCH-1:0];
// count the numbers of valid instructions we've pushed from this package
popcount #(
.INPUT_WIDTH ( ariane_pkg::INSTR_PER_FETCH )
) i_popcount (
.data_i ( push_instr_fifo ),
.popcount_o ( popcount )
);
assign shamt = popcount[$bits(shamt)-1:0];
// save the shift amount for next cycle
assign idx_is_d = idx_is_q + shamt;
// ----------------------
// Input interface
// ----------------------
// rotate left by the current position
assign fifo_pos_extended = { valid, valid } << idx_is_q;
// we just care about the upper bits
assign fifo_pos = fifo_pos_extended[ariane_pkg::INSTR_PER_FETCH*2-1:ariane_pkg::INSTR_PER_FETCH];
// the fifo_position signal can directly be used to guide the push signal of each FIFO
// make sure it is not full
assign push_instr = fifo_pos & ~instr_queue_full;
// duplicate the entries for easier selection e.g.: 3 2 1 0 3 2 1 0
// Build arrays twice as long as the fetch packet in which the lower and the
// upper half both hold the incoming instructions / control-flow types, so
// that an index of `i + idx_is_q` can be used without an explicit wrap.
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_duplicate_instr_input
assign instr[i] = instr_i[i];
assign instr[i + ariane_pkg::INSTR_PER_FETCH] = instr_i[i];
assign cf[i] = cf_type_i[i];
assign cf[i + ariane_pkg::INSTR_PER_FETCH] = cf_type_i[i];
end
// shift the inputs
// Select the write data of FIFO `i` from the duplicated input arrays, offset
// by the current input-stream position `idx_is_q`.
// NOTE(review): `fifo_pos` is `valid` rotated left by `idx_is_q` (FIFO i is
// pushed for input slot i - idx_is_q), while the data below is read from
// slot i + idx_is_q. The two agree for INSTR_PER_FETCH == 2 -- confirm
// before using a wider fetch.
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_fifo_input_select
/* verilator lint_off WIDTH */
assign instr_data_in[i].instr = instr[i + idx_is_q];
assign instr_data_in[i].cf = cf[i + idx_is_q];
assign instr_data_in[i].ex = exception_i; // exceptions hold for the whole fetch packet
/* verilator lint_on WIDTH */
end
// ----------------------
// Replay Logic
// ----------------------
// We need to replay a instruction fetch iff:
// 1. One of the instruction data FIFOs was full and we needed it
// (e.g.: we pushed and it was full)
// 2. The address/branch predict FIFO was full
// if one of the FIFOs was full we need to replay the faulting instruction
assign instr_overflow_fifo = instr_queue_full & fifo_pos;
assign instr_overflow = |instr_overflow_fifo; // at least one instruction overflowed
assign address_overflow = full_address & push_address;
assign replay_o = instr_overflow | address_overflow;
// select the address, in the case of an address fifo overflow just
// use the base of this package
// if we successfully pushed some instructions we can output the next instruction
// which we didn't manage to push
assign replay_addr_o = (address_overflow) ? addr_i[0] : addr_i[shamt];
// ----------------------
// Downstream interface
// ----------------------
// as long as there is at least one queue which can take the value we have a valid instruction
assign fetch_entry_valid_o = ~(&instr_queue_empty);
// Downstream (read) port: present the head entry of the FIFO selected by the
// one-hot read pointer `idx_ds_q` and pop it when the ready/valid handshake
// with the back-end completes.
always_comb begin
  idx_ds_d = idx_ds_q;
  pop_instr = '0;
  // assemble fetch entry
  fetch_entry_o.instruction = '0;
  fetch_entry_o.address = pc_q;
  fetch_entry_o.ex.valid = 1'b0;
  // This is the only exception which can occur up to this point.
  fetch_entry_o.ex.cause = riscv::INSTR_PAGE_FAULT;
  fetch_entry_o.ex.tval = '0;
  fetch_entry_o.branch_predict.predict_address = address_out;
  // default assignment so the field is driven on every path of this
  // combinational block (it is overwritten by the selected FIFO below)
  fetch_entry_o.branch_predict.cf = ariane_pkg::NoCF;
  // output mux select
  for (int unsigned i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
    if (idx_ds_q[i]) begin
      fetch_entry_o.instruction = instr_data_out[i].instr;
      fetch_entry_o.ex.valid = instr_data_out[i].ex;
      fetch_entry_o.ex.tval = pc_q;
      fetch_entry_o.branch_predict.cf = instr_data_out[i].cf;
      pop_instr[i] = fetch_entry_valid_o & fetch_entry_ready_i;
    end
  end
  // rotate the pointer left, but only when an entry is actually handed over:
  // use the same valid & ready condition as the pop above so that the read
  // pointer can never run ahead of the FIFO contents
  if (fetch_entry_valid_o & fetch_entry_ready_i) begin
    idx_ds_d = {idx_ds_q[ariane_pkg::INSTR_PER_FETCH-2:0], idx_ds_q[ariane_pkg::INSTR_PER_FETCH-1]};
  end
end
// TODO(zarubaf): This needs to change for dual-issue
// if the handshaking is successful and we had a prediction pop one address entry
assign pop_address = ((fetch_entry_o.branch_predict.cf != ariane_pkg::NoCF) & |pop_instr);
// ----------------------
// Calculate (Next) PC
// ----------------------
// Track the PC of the instruction currently presented on `fetch_entry_o`.
// Priority (last assignment wins): sequential advance, then predicted
// target when an address entry is popped, then re-load after a flush.
always_comb begin
  pc_d = pc_q;
  reset_address_d = flush_i ? 1'b1 : reset_address_q;

  // advance only when an instruction is really handed to the back-end
  // (same valid & ready condition that pops the instruction FIFO), so the
  // PC cannot drift while the queue is empty
  if (fetch_entry_valid_o & fetch_entry_ready_i) begin
    // TODO(zarubaf): This needs to change for a dual issue implementation
    // advance the PC: +2 for a compressed instruction, +4 otherwise
    pc_d = pc_q + ((fetch_entry_o.instruction[1:0] != 2'b11) ? 'd2 : 'd4);
  end

  // the popped instruction carried a prediction: continue at its target
  if (pop_address) pc_d = address_out;

  // we previously flushed so we need to reset the address
  if (valid_i[0] && reset_address_q) begin
    // this is the base of the first instruction
    pc_d = addr_i[0];
    reset_address_d = 1'b0;
  end
end
// FIFOs
// One instruction data FIFO per fetch slot (`ariane_pkg::INSTR_PER_FETCH` instances).
// Each FIFO stores `instr_data_t` entries and is popped by the output mux via `pop_instr[i]`.
for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_instr_fifo
// Make sure we don't save any instructions if we couldn't save the address
assign push_instr_fifo[i] = push_instr[i] & ~address_overflow;
fifo_v3 #(
.DEPTH ( ariane_pkg::FETCH_FIFO_DEPTH ),
.dtype ( instr_data_t )
) i_fifo_instr_data (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.flush_i ( flush_i ),
.testmode_i ( 1'b0 ),
.full_o ( instr_queue_full[i] ),
.empty_o ( instr_queue_empty[i] ),
.usage_o ( instr_queue_usage[i] ),
.data_i ( instr_data_in[i] ),
.push_i ( push_instr_fifo[i] ),
.data_o ( instr_data_out[i] ),
.pop_i ( pop_instr[i] )
);
end
// OR-reduce over all fetch slots: request a push into the address FIFO whenever at least one
// of the instructions being pushed is a control flow change (`cf != NoCF`).
// The address FIFO itself might be full; this is not checked here.
always_comb begin
push_address = 1'b0;
// check if we are pushing a ctrl flow change, if so save the address
for (int i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
push_address |= push_instr[i] & (instr_data_in[i].cf != ariane_pkg::NoCF);
end
end
// Address FIFO: holds one 64 bit predicted address (`predict_address_i`) per pushed control
// flow change. An entry is only pushed while the FIFO is not full and is popped together with
// the corresponding instruction (`pop_address`).
fifo_v3 #(
.DEPTH ( ariane_pkg::FETCH_FIFO_DEPTH ), // TODO(zarubaf): Fork out to separate param
.DATA_WIDTH ( 64 )
) i_fifo_address (
.clk_i ( clk_i ),
.rst_ni ( rst_ni ),
.flush_i ( flush_i ),
.testmode_i ( 1'b0 ),
.full_o ( full_address ),
.empty_o ( empty_address ),
.usage_o ( address_queue_usage ),
.data_i ( predict_address_i ),
.push_i ( push_address & ~full_address ),
.data_o ( address_out ),
.pop_i ( pop_address )
);
unread i_unread_address_fifo (.d_i(|{empty_address, address_queue_usage}));
unread i_unread_branch_mask (.d_i(|branch_mask_extended));
unread i_unread_lzc (.d_i(|{branch_empty}));
unread i_unread_fifo_pos (.d_i(|fifo_pos_extended)); // we don't care about the lower signals
unread i_unread_instr_fifo (.d_i(|instr_queue_usage));
// State registers of the instruction queue: read pointer (`idx_ds_q`), write pointer
// (`idx_is_q`), current PC (`pc_q`) and the "re-initialise the PC" flag (`reset_address_q`).
// Asynchronous active-low reset; `flush_i` synchronously re-initialises both pointers and
// re-arms `reset_address_q`.
always_ff @(posedge clk_i or negedge rst_ni) begin
if (!rst_ni) begin
idx_ds_q <= 'b1;
idx_is_q <= '0;
pc_q <= '0;
reset_address_q <= 1'b1;
end else begin
pc_q <= pc_d;
reset_address_q <= reset_address_d;
if (flush_i) begin
// one-hot encoded
idx_ds_q <= 'b1;
// binary encoded
idx_is_q <= '0;
// overrides the assignment from `reset_address_d` above (last non-blocking assignment wins)
reset_address_q <= 1'b1;
end else begin
idx_ds_q <= idx_ds_d;
idx_is_q <= idx_is_d;
end
end
end
// Simulation-only checks (excluded from synthesis and from Verilator builds).
// pragma translate_off
`ifndef VERILATOR
// a replay must never coincide with a push into the address FIFO
replay_address_fifo: assert property (
@(posedge clk_i) disable iff (!rst_ni) replay_o |-> !i_fifo_address.push_i
) else $fatal(1,"[instr_queue] Pushing address although replay asserted");
// the read pointer must have at most one bit set
// NOTE(review): `$onehot0` also accepts the all-zero pattern although the message below talks
// about one-hot encoding -- confirm whether `$onehot` was intended.
output_select_onehot: assert property (
@(posedge clk_i) $onehot0(idx_ds_q)
) else begin $error("Output select should be one-hot encoded"); $stop(); end
`endif
// pragma translate_on
endmodule

View file

@ -1,4 +1,4 @@
//Copyright (C) 2018 to present,
// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 2.0 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
@ -17,7 +17,6 @@
// ------------------------------
module instr_scan (
input logic [31:0] instr_i, // expect aligned instruction, compressed or not
output logic is_rvc_o,
output logic rvi_return_o,
output logic rvi_call_o,
output logic rvi_branch_o,
@ -32,35 +31,39 @@ module instr_scan (
output logic rvc_call_o,
output logic [63:0] rvc_imm_o
);
assign is_rvc_o = (instr_i[1:0] != 2'b11);
// check that rs1 is either x1 or x5 and that rs1 is not x1 or x5, TODO: check the fact about bit 7
assign rvi_return_o = rvi_jalr_o & ~instr_i[7] & ~instr_i[19] & ~instr_i[18] & ~instr_i[16] & instr_i[15];
assign rvi_call_o = (rvi_jalr_o | rvi_jump_o) & instr_i[7]; // TODO: check that this captures calls
logic is_rvc;
assign is_rvc = (instr_i[1:0] != 2'b11);
// A return is a JALR whose source register rs1 (instr_i[19:15]) is a link register (x1 or x5)
// and whose destination register rd (instr_i[11:7]) is not the same register as rs1.
// Testing rd for x1/x5 instead would also flag ordinary calls (`jalr ra, rs`) as returns.
assign rvi_return_o = rvi_jalr_o & ((instr_i[19:15] == 5'd1) | (instr_i[19:15] == 5'd5))
& (instr_i[19:15] != instr_i[11:7]);
// Opcode is JAL[R] and destination register is either x1 or x5
assign rvi_call_o = (rvi_jalr_o | rvi_jump_o) & ((instr_i[11:7] == 5'd1) | instr_i[11:7] == 5'd5);
// differentiates between JAL and BRANCH opcode, JALR comes from BHT
assign rvi_imm_o = (instr_i[3]) ? ariane_pkg::uj_imm(instr_i) : ariane_pkg::sb_imm(instr_i);
assign rvi_branch_o = (instr_i[6:0] == riscv::OpcodeBranch) ? 1'b1 : 1'b0;
assign rvi_jalr_o = (instr_i[6:0] == riscv::OpcodeJalr) ? 1'b1 : 1'b0;
assign rvi_jump_o = (instr_i[6:0] == riscv::OpcodeJal) ? 1'b1 : 1'b0;
assign rvi_branch_o = (instr_i[6:0] == riscv::OpcodeBranch);
assign rvi_jalr_o = (instr_i[6:0] == riscv::OpcodeJalr);
assign rvi_jump_o = (instr_i[6:0] == riscv::OpcodeJal);
// opcode JAL
assign rvc_jump_o = (instr_i[15:13] == riscv::OpcodeC1J) & is_rvc_o & (instr_i[1:0] == riscv::OpcodeC1);
assign rvc_jump_o = (instr_i[15:13] == riscv::OpcodeC1J) & is_rvc & (instr_i[1:0] == riscv::OpcodeC1);
// always links to register 0
assign rvc_jr_o = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd)
& ~instr_i[12]
logic is_jal_r;
assign is_jal_r = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd) &
& (instr_i[6:2] == 5'b00000)
& (instr_i[1:0] == riscv::OpcodeC2)
& is_rvc_o;
assign rvc_branch_o = ((instr_i[15:13] == riscv::OpcodeC1Beqz) | (instr_i[15:13] == riscv::OpcodeC1Bnez))
& (instr_i[1:0] == riscv::OpcodeC1)
& is_rvc_o;
// check that rs1 is x1 or x5
assign rvc_return_o = ~instr_i[11] & ~instr_i[10] & ~instr_i[8] & instr_i[7] & rvc_jr_o ;
& is_rvc;
assign rvc_jr_o = is_jal_r & ~instr_i[12];
// always links to register 1 e.g.: it is a jump
assign rvc_jalr_o = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd)
& instr_i[12]
& (instr_i[6:2] == 5'b00000) & is_rvc_o;
assign rvc_jalr_o = is_jal_r & instr_i[12];
assign rvc_call_o = rvc_jalr_o;
// // differentiates between JAL and BRANCH opcode, JALR comes from BHT
assign rvc_branch_o = ((instr_i[15:13] == riscv::OpcodeC1Beqz) | (instr_i[15:13] == riscv::OpcodeC1Bnez))
& (instr_i[1:0] == riscv::OpcodeC1)
& is_rvc;
// check that rs1 is x1 or x5
assign rvc_return_o = ((instr_i[11:7] == 5'd1) | (instr_i[11:7] == 5'd5)) & rvc_jr_o ;
// differentiates between JAL and BRANCH opcode, JALR comes from BHT
assign rvc_imm_o = (instr_i[14]) ? {{56{instr_i[12]}}, instr_i[6:5], instr_i[2], instr_i[11:10], instr_i[4:3], 1'b0}
: {{53{instr_i[12]}}, instr_i[8], instr_i[10:9], instr_i[6], instr_i[7], instr_i[2], instr_i[11], instr_i[5:3], 1'b0};
endmodule

View file

@ -10,95 +10,81 @@
//
// Author: Florian Zaruba, ETH Zurich
// Date: 15.04.2017
// Description: Description: Instruction decode, contains the logic for decode,
// Description: Instruction decode, contains the logic for decode,
// issue and read operands.
import ariane_pkg::*;
module id_stage (
input logic clk_i, // Clock
input logic rst_ni, // Asynchronous reset active low
input logic clk_i,
input logic rst_ni,
input logic flush_i,
input logic debug_req_i,
input logic flush_i,
input logic debug_req_i,
// from IF
input frontend_fetch_t fetch_entry_i,
input logic fetch_entry_valid_i,
output logic decoded_instr_ack_o, // acknowledge the instruction (fetch entry)
input ariane_pkg::fetch_entry_t fetch_entry_i,
input logic fetch_entry_valid_i,
output logic fetch_entry_ready_o, // acknowledge the instruction (fetch entry)
// to ID
output scoreboard_entry_t issue_entry_o, // a decoded instruction
output logic issue_entry_valid_o, // issue entry is valid
output logic is_ctrl_flow_o, // the instruction we issue is a ctrl flow instructions
input logic issue_instr_ack_i, // issue stage acknowledged sampling of instructions
output ariane_pkg::scoreboard_entry_t issue_entry_o, // a decoded instruction
output logic issue_entry_valid_o, // issue entry is valid
output logic is_ctrl_flow_o, // the instruction we issue is a ctrl flow instructions
input logic issue_instr_ack_i, // issue stage acknowledged sampling of instructions
// from CSR file
input riscv::priv_lvl_t priv_lvl_i, // current privilege level
input riscv::xs_t fs_i, // floating point extension status
input logic [2:0] frm_i, // floating-point dynamic rounding mode
input logic [1:0] irq_i,
input irq_ctrl_t irq_ctrl_i,
input logic debug_mode_i, // we are in debug mode
input logic tvm_i,
input logic tw_i,
input logic tsr_i
input riscv::priv_lvl_t priv_lvl_i, // current privilege level
input riscv::xs_t fs_i, // floating point extension status
input logic [2:0] frm_i, // floating-point dynamic rounding mode
input logic [1:0] irq_i,
input ariane_pkg::irq_ctrl_t irq_ctrl_i,
input logic debug_mode_i, // we are in debug mode
input logic tvm_i,
input logic tw_i,
input logic tsr_i
);
// register stage
// ID/ISSUE register stage
struct packed {
logic valid;
scoreboard_entry_t sbe;
logic is_ctrl_flow;
logic valid;
ariane_pkg::scoreboard_entry_t sbe;
logic is_ctrl_flow;
} issue_n, issue_q;
logic is_control_flow_instr;
scoreboard_entry_t decoded_instruction;
logic is_control_flow_instr;
ariane_pkg::scoreboard_entry_t decoded_instruction;
fetch_entry_t fetch_entry;
logic is_illegal;
logic [31:0] instruction;
logic is_compressed;
logic fetch_ack_i;
logic fetch_entry_valid;
// ---------------------------------------------------------
// 1. Re-align instructions
// ---------------------------------------------------------
instr_realigner instr_realigner_i (
.fetch_entry_i ( fetch_entry_i ),
.fetch_entry_valid_i ( fetch_entry_valid_i ),
.fetch_ack_o ( decoded_instr_ack_o ),
.fetch_entry_o ( fetch_entry ),
.fetch_entry_valid_o ( fetch_entry_valid ),
.fetch_ack_i ( fetch_ack_i ),
.*
);
// ---------------------------------------------------------
// 2. Check if they are compressed and expand in case they are
// 1. Check if they are compressed and expand in case they are
// ---------------------------------------------------------
compressed_decoder compressed_decoder_i (
.instr_i ( fetch_entry.instruction ),
.instr_i ( fetch_entry_i.instruction ),
.instr_o ( instruction ),
.illegal_instr_o ( is_illegal ),
.is_compressed_o ( is_compressed )
);
// ---------------------------------------------------------
// 3. Decode and emit instruction to issue stage
// 2. Decode and emit instruction to issue stage
// ---------------------------------------------------------
decoder decoder_i (
.debug_req_i,
.pc_i ( fetch_entry.address ),
.is_compressed_i ( is_compressed ),
.compressed_instr_i ( fetch_entry.instruction[15:0] ),
.instruction_i ( instruction ),
.branch_predict_i ( fetch_entry.branch_predict ),
.is_illegal_i ( is_illegal ),
.ex_i ( fetch_entry.ex ),
.instruction_o ( decoded_instruction ),
.is_control_flow_instr_o ( is_control_flow_instr ),
.irq_ctrl_i,
.irq_i,
.pc_i ( fetch_entry_i.address ),
.is_compressed_i ( is_compressed ),
.is_illegal_i ( is_illegal ),
.instruction_i ( instruction ),
.compressed_instr_i ( fetch_entry_i.instruction[15:0] ),
.branch_predict_i ( fetch_entry_i.branch_predict ),
.ex_i ( fetch_entry_i.ex ),
.priv_lvl_i ( priv_lvl_i ),
.debug_mode_i ( debug_mode_i ),
.fs_i,
.frm_i,
.*
.tvm_i,
.tw_i,
.tsr_i,
.instruction_o ( decoded_instruction ),
.is_control_flow_instr_o ( is_control_flow_instr )
);
// ------------------
@ -110,7 +96,7 @@ module id_stage (
always_comb begin
issue_n = issue_q;
fetch_ack_i = 1'b0;
fetch_entry_ready_o = 1'b0;
// Clear the valid flag if issue has acknowledged the instruction
if (issue_instr_ack_i)
@ -119,9 +105,9 @@ module id_stage (
// if we have a space in the register and the fetch is valid, go get it
// or the issue stage is currently acknowledging an instruction, which means that we will have space
// for a new instruction
if ((!issue_q.valid || issue_instr_ack_i) && fetch_entry_valid) begin
fetch_ack_i = 1'b1;
issue_n = {1'b1, decoded_instruction, is_control_flow_instr};
if ((!issue_q.valid || issue_instr_ack_i) && fetch_entry_valid_i) begin
fetch_entry_ready_o = 1'b1;
issue_n = '{1'b1, decoded_instruction, is_control_flow_instr};
end
// invalidate the pipeline register on a flush
@ -138,5 +124,4 @@ module id_stage (
issue_q <= issue_n;
end
end
endmodule

358
src/instr_realign.sv Normal file
View file

@ -0,0 +1,358 @@
// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Florian Zaruba <zarubaf@iis.ee.ethz.ch>
// Description: Instruction Re-aligner
//
// This module takes 32-bit aligned cache blocks and extracts the instructions.
// As we are supporting the compressed instruction set extension in a 32 bit instruction word
// are up to 2 compressed instructions.
// Furthermore those instructions can be arbitrarily interleaved which makes it possible to fetch
// only the lower part of a 32 bit instruction.
// Furthermore we need to handle the case if we want to start fetching from an unaligned
// instruction e.g. a branch.
import ariane_pkg::*;
// Instruction re-aligner.
//
// Takes one fetched block (`data_i`, `FETCH_WIDTH` bits, located at `address_i`) and extracts
// up to `INSTR_PER_FETCH` instructions from it. A 32 bit instruction whose upper half lies in
// the next block is remembered (lower half + address) in the `unaligned_*` registers and is
// emitted in output slot 0 once the next block arrives.
// `FETCH_WIDTH` and `INSTR_PER_FETCH` are not declared in this module (presumably they come
// from the `ariane_pkg` import).
//
// Ports:
//   clk_i / rst_ni       clock and asynchronous active-low reset
//   flush_i              clears the "pending unaligned instruction" flag
//   valid_i              `address_i` / `data_i` are valid in this cycle
//   serving_unaligned_o  a saved lower half of an unaligned instruction is pending
//   address_i            address of the fetched block
//   data_i               fetched block
//   valid_o              per output slot: slot holds a valid instruction
//   addr_o               per output slot: address of the instruction
//   instr_o              per output slot: instruction (compressed ones are zero-extended)
module instr_realign (
    input  logic                              clk_i,
    input  logic                              rst_ni,
    input  logic                              flush_i,
    input  logic                              valid_i,
    output logic                              serving_unaligned_o, // we have an unaligned instruction in [0]
    input  logic [63:0]                       address_i,
    input  logic [FETCH_WIDTH-1:0]            data_i,
    output logic [INSTR_PER_FETCH-1:0]        valid_o,
    output logic [INSTR_PER_FETCH-1:0][63:0]  addr_o,
    output logic [INSTR_PER_FETCH-1:0][31:0]  instr_o
);
    // as a maximum we support a fetch width of 64-bit, hence there can be 4 compressed instructions
    logic [3:0] instr_is_compressed;

    for (genvar i = 0; i < INSTR_PER_FETCH; i ++) begin
        // LSB != 2'b11
        assign instr_is_compressed[i] = ~&data_i[i * 16 +: 2];
    end

    // save the unaligned part of the instruction to this ff
    logic [15:0] unaligned_instr_d,   unaligned_instr_q;
    // the last instruction was unaligned
    logic        unaligned_d,         unaligned_q;
    // register to save the unaligned address
    logic [63:0] unaligned_address_d, unaligned_address_q;
    // we have an unaligned instruction
    assign serving_unaligned_o = unaligned_q;

    // Instruction re-alignment
    if (FETCH_WIDTH == 32) begin : realign_bp_32
        always_comb begin : re_align
            unaligned_d = unaligned_q;
            unaligned_address_d = {address_i[63:2], 2'b10};
            unaligned_instr_d = data_i[31:16];

            // slot 0: either the completed unaligned instruction or the word as fetched
            valid_o[0] = valid_i;
            instr_o[0] = (unaligned_q) ? {data_i[15:0], unaligned_instr_q} : data_i[31:0];
            addr_o[0]  = (unaligned_q) ? unaligned_address_q : address_i;

            valid_o[1] = 1'b0;
            instr_o[1] = '0;
            addr_o[1]  = {address_i[63:2], 2'b10};

            // this instruction is compressed or the last instruction was unaligned
            if (instr_is_compressed[0] || unaligned_q) begin
                // check if this is instruction is still unaligned e.g.: it is not compressed
                // if its compressed re-set unaligned flag
                // for 32 bit we can simply check the next instruction and whether it is compressed or not
                // if it is compressed the next fetch will contain an aligned instruction
                // is instruction 1 also compressed
                // yes? -> no problem, no -> we've got an unaligned instruction
                if (instr_is_compressed[1]) begin
                    unaligned_d = 1'b0;
                    valid_o[1] = valid_i;
                    instr_o[1] = {16'b0, data_i[31:16]};
                end else begin
                    // save the upper bits for next cycle
                    unaligned_d = 1'b1;
                    unaligned_instr_d = data_i[31:16];
                    unaligned_address_d = {address_i[63:2], 2'b10};
                end
            end // else -> normal fetch

            // we started to fetch on a unaligned boundary with a whole instruction -> wait until we've
            // received the next instruction
            // NOTE(review): slot 0 / data_i[15:0] is examined here although address_i[1] is set --
            // presumably the caller has already shifted data_i by the fetch offset; confirm.
            if (valid_i && address_i[1]) begin
                // the instruction is not compressed so we can't do anything in this cycle
                if (!instr_is_compressed[0]) begin
                    valid_o = '0;
                    unaligned_d = 1'b1;
                    unaligned_address_d = {address_i[63:2], 2'b10};
                    unaligned_instr_d = data_i[15:0];
                // the instruction is compressed: only output slot 0 is flagged valid
                // (1'b1 is zero-extended to the width of valid_o)
                end else begin
                    valid_o = 1'b1;
                end
            end
        end
    // TODO(zarubaf): Fix 64 bit FETCH_WIDTH, maybe generalize to arbitrary fetch width
    end else if (FETCH_WIDTH == 64) begin : realign_bp_64
        initial begin
            $error("Not propperly implemented");
        end

        always_comb begin : re_align
            unaligned_d = unaligned_q;
            unaligned_address_d = unaligned_address_q;
            unaligned_instr_d = unaligned_instr_q;

            valid_o    = '0;
            valid_o[0] = valid_i;

            instr_o[0] = data_i[31:0];
            addr_o[0]  = address_i;

            instr_o[1] = '0;
            addr_o[1]  = {address_i[63:3], 3'b010};

            instr_o[2] = {16'b0, data_i[47:32]};
            addr_o[2]  = {address_i[63:3], 3'b100};

            instr_o[3] = {16'b0, data_i[63:48]};
            addr_o[3]  = {address_i[63:3], 3'b110};

            // last instruction was unaligned
            if (unaligned_q) begin
                instr_o[0] = {data_i[15:0], unaligned_instr_q};
                addr_o[0] = unaligned_address_q;
                // for 64 bit there exist the following options:
                //     64      32      0
                //     | 3 | 2 | 1 | 0 | <- instruction slot
                // |   I   |   I   | U | -> again unaligned
                // | * | C |   I   | U | -> aligned
                // | * |   I   | C | U | -> aligned
                // |   I   | C | C | U | -> again unaligned
                // | * | C | C | C | U | -> aligned
                // Legend: C = compressed, I = 32 bit instruction, U = unaligned upper half
                //         * = don't care
                if (instr_is_compressed[1]) begin
                    instr_o[1] = {16'b0, data_i[31:16]};
                    valid_o[1] = valid_i;

                    if (instr_is_compressed[2]) begin
                        if (instr_is_compressed[3]) begin
                            unaligned_d = 1'b0;
                            valid_o[3] = valid_i;
                        end else begin
                            // continues to be unaligned
                        end
                    end else begin
                        unaligned_d = 1'b0;
                        instr_o[2] = data_i[63:32];
                        valid_o[2] = valid_i;
                    end
                // instruction 1 is not compressed
                end else begin
                    instr_o[1] = data_i[47:16];
                    valid_o[1] = valid_i;
                    addr_o[2] = {address_i[63:3], 3'b110};
                    if (instr_is_compressed[2]) begin
                        unaligned_d = 1'b0;
                        instr_o[2] = {16'b0, data_i[63:48]};
                        valid_o[2] = valid_i;
                    end else begin
                        // continues to be unaligned
                    end
                end
            end else if (instr_is_compressed[0]) begin // instruction zero is RVC
                //     64     32       0
                //     | 3 | 2 | 1 | 0 | <- instruction slot
                // |   I   |   I   | C | -> again unaligned
                // | * | C |   I   | C | -> aligned
                // | * |   I   | C | C | -> aligned
                // |   I   | C | C | C | -> again unaligned
                // | * | C | C | C | C | -> aligned
                if (instr_is_compressed[1]) begin
                    instr_o[1] = {16'b0, data_i[31:16]};
                    valid_o[1] = valid_i;

                    if (instr_is_compressed[2]) begin
                        valid_o[2] = valid_i;
                        if (instr_is_compressed[3]) begin
                            valid_o[3] = valid_i;
                        end else begin
                            // this instruction is unaligned
                            unaligned_d = 1'b1;
                            unaligned_instr_d = data_i[63:48];
                            unaligned_address_d = addr_o[3];
                        end
                    end else begin
                        instr_o[2] = data_i[63:32];
                        valid_o[2] = valid_i;
                    end
                // instruction 1 is not compressed -> check slot 3
                end else begin
                    instr_o[1] = data_i[47:16];
                    valid_o[1] = valid_i;
                    addr_o[2] = {address_i[63:3], 3'b110};
                    if (instr_is_compressed[3]) begin
                        instr_o[2] = data_i[63:48];
                        valid_o[2] = valid_i;
                    end else begin
                        unaligned_d = 1'b1;
                        unaligned_instr_d = data_i[63:48];
                        unaligned_address_d = addr_o[2];
                    end
                end

            // Full instruction in slot zero
            //     64     32       0
            //     | 3 | 2 | 1 | 0 | <- instruction slot
            // |   I   | C |   I   |
            // | * | C | C |   I   |
            // | * |   I   |   I   |
            end else begin
                addr_o[1] = {address_i[63:3], 3'b100};

                if (instr_is_compressed[2]) begin
                    instr_o[1] = {16'b0, data_i[47:32]};
                    valid_o[1] = valid_i;
                    addr_o[2] = {address_i[63:3], 3'b110};
                    if (instr_is_compressed[3]) begin
                        // | * | C | C |   I   |
                        valid_o[2] = valid_i;
                        // the compressed instruction in slot 3 is emitted on output 2; its
                        // address (offset 3'b110) has already been assigned to addr_o[2] above
                        instr_o[2] = {16'b0, data_i[63:48]};
                    end else begin
                        // this instruction is unaligned
                        unaligned_d = 1'b1;
                        unaligned_instr_d = data_i[63:48];
                        unaligned_address_d = addr_o[2];
                    end
                end else begin
                    // two regular instructions back-to-back
                    instr_o[1] = data_i[63:32];
                    valid_o[1] = valid_i;
                end
            end

            // --------------------------
            // Unaligned fetch
            // --------------------------
            // Address was not 64 bit aligned
            case (address_i[2:1])
                // this means the previous instruction was either compressed or unaligned
                // in any case we don't care
                2'b01: begin
                    //     64     32       0
                    //     | 3 | 2 | 1 | 0 | <- instruction slot
                    // |   I   |   I   | x  -> again unaligned
                    // | * | C |   I   | x  -> aligned
                    // | * |   I   | C | x  -> aligned
                    // |   I   | C | C | x  -> again unaligned
                    // | * | C | C | C | x  -> aligned
                    addr_o[0] = {address_i[63:3], 3'b010};

                    if (instr_is_compressed[1]) begin
                        instr_o[0] = {16'b0, data_i[31:16]};
                        valid_o[0] = valid_i;

                        if (instr_is_compressed[2]) begin
                            valid_o[1] = valid_i;
                            instr_o[1] = {16'b0, data_i[47:32]};
                            addr_o[1] = {address_i[63:3], 3'b100};
                            if (instr_is_compressed[3]) begin
                                instr_o[2] = {16'b0, data_i[63:48]};
                                addr_o[2] = {address_i[63:3], 3'b110};
                                valid_o[2] = valid_i;
                            end else begin
                                // this instruction is unaligned
                                unaligned_d = 1'b1;
                                unaligned_instr_d = data_i[63:48];
                                unaligned_address_d = addr_o[3];
                            end
                        end else begin
                            instr_o[1] = data_i[63:32];
                            addr_o[1] = {address_i[63:3], 3'b100};
                            valid_o[1] = valid_i;
                        end
                    // instruction 1 is not compressed -> check slot 3
                    end else begin
                        instr_o[0] = data_i[47:16];
                        valid_o[0] = valid_i;
                        addr_o[1] = {address_i[63:3], 3'b110};
                        if (instr_is_compressed[3]) begin
                            instr_o[1] = data_i[63:48];
                            valid_o[1] = valid_i;
                        end else begin
                            unaligned_d = 1'b1;
                            unaligned_instr_d = data_i[63:48];
                            unaligned_address_d = addr_o[1];
                        end
                    end
                end
                2'b10: begin
                    valid_o = '0;
                    //     64     32       0
                    //     | 3 | 2 | 1 | 0 | <- instruction slot
                    // |   I   | C |   *   | <- unaligned
                    //    | C  | C |   *   | <- aligned
                    //    |    I   |   *   | <- aligned
                    if (instr_is_compressed[2]) begin
                        valid_o[0] = valid_i;
                        instr_o[0] = data_i[47:32];

                        // second instruction is also compressed
                        if (instr_is_compressed[3]) begin
                            valid_o[1] = valid_i;
                            instr_o[1] = data_i[63:48];
                        // regular instruction -> unaligned
                        end else begin
                            unaligned_d = 1'b1;
                            unaligned_address_d = {address_i[63:3], 3'b110};
                            unaligned_instr_d = data_i[63:48];
                        end
                    // instruction is a regular instruction
                    end else begin
                        valid_o[0] = valid_i;
                        instr_o[0] = data_i[63:32];
                        addr_o[0] = address_i;
                    end
                end
                // we started to fetch on a unaligned boundary with a whole instruction -> wait until we've
                // received the next instruction
                2'b11: begin
                    valid_o = '0;
                    if (!instr_is_compressed[3]) begin
                        unaligned_d = 1'b1;
                        unaligned_address_d = {address_i[63:3], 3'b110};
                        unaligned_instr_d = data_i[63:48];
                    end else begin
                        valid_o[3] = valid_i;
                    end
                end
                // 2'b00: 64 bit aligned fetch, nothing to adjust
                default: ;
            endcase
        end
    end

    // State of a pending unaligned instruction. The saved half word and its address are only
    // updated on a valid fetch; the flag is additionally cleared by a flush.
    always_ff @(posedge clk_i or negedge rst_ni) begin
        if (~rst_ni) begin
            unaligned_q         <= 1'b0;
            unaligned_address_q <= '0;
            unaligned_instr_q   <= '0;
        end else begin
            if (valid_i) begin
                unaligned_address_q <= unaligned_address_d;
                unaligned_instr_q   <= unaligned_instr_d;
            end

            if (flush_i) begin
                unaligned_q <= 1'b0;
            end else if (valid_i) begin
                unaligned_q <= unaligned_d;
            end
        end
    end
endmodule

View file

@ -1,252 +0,0 @@
// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License. You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Florian Zaruba, ETH Zurich
// Date: 14.05.2017
// Description: Emits and re-aligns compressed and unaligned instructions
import ariane_pkg::*;
// Emits and re-aligns compressed and unaligned instructions.
//
// Sits between a fetch FIFO delivering 32 bit words (`fetch_entry_i`) and the consumer of
// single instructions (`fetch_entry_o`). A word containing two compressed instructions is
// served in two consecutive handshakes (`compressed_q`); a 32 bit instruction that straddles
// two words is assembled from the saved lower half (`unaligned_instr_q`) and the lower half of
// the following word.
//
// Ports:
//   clk_i / rst_ni        clock and asynchronous active-low reset
//   flush_i               clears the unaligned/compressed state
//   fetch_entry_i         word from the fetch FIFO
//   fetch_entry_valid_i   `fetch_entry_i` is valid
//   fetch_ack_o           acknowledge towards the fetch FIFO (advance the queue)
//   fetch_entry_o         re-aligned instruction
//   fetch_entry_valid_o   `fetch_entry_o` is valid
//   fetch_ack_i           acknowledge from the consumer
module instr_realigner (
    input  logic            clk_i,               // Clock
    input  logic            rst_ni,              // Asynchronous reset active low
    // control signals
    input  logic            flush_i,

    input  frontend_fetch_t fetch_entry_i,
    input  logic            fetch_entry_valid_i,
    output logic            fetch_ack_o,

    output fetch_entry_t    fetch_entry_o,
    output logic            fetch_entry_valid_o,
    input  logic            fetch_ack_i
);
    // ----------
    // Registers
    // ----------
    // the last instruction was unaligned
    logic        unaligned_n,         unaligned_q;
    // save the unaligned part of the instruction to this ff
    logic [15:0] unaligned_instr_n,   unaligned_instr_q;
    // the previous instruction was compressed
    logic        compressed_n,        compressed_q;
    // register to save the unaligned address
    logic [63:0] unaligned_address_n, unaligned_address_q;
    // get the next instruction, needed on a unaligned access
    logic jump_unaligned_half_word;

    // check if the lower compressed instruction was no branch otherwise we will need to squash this instruction
    // but only if we predicted it to be taken, the predict was on the lower 16 bit compressed instruction
    logic kill_upper_16_bit;
    assign kill_upper_16_bit = fetch_entry_i.branch_predict.valid &
                               fetch_entry_i.branch_predict.predict_taken &
                               fetch_entry_i.bp_taken[0];
    // -------------------
    // Re-alignment logic
    // -------------------
    always_comb begin : realign_instr

        unaligned_n          = unaligned_q;
        unaligned_instr_n    = unaligned_instr_q;
        compressed_n         = compressed_q;
        unaligned_address_n  = unaligned_address_q;

        // directly output this instruction. adoptions are made throughout the always comb block
        fetch_entry_o.address        = fetch_entry_i.address;
        fetch_entry_o.instruction    = fetch_entry_i.instruction;
        fetch_entry_o.branch_predict = fetch_entry_i.branch_predict;
        fetch_entry_o.ex.valid       = fetch_entry_i.page_fault;
        // note: tval is overwritten with the final (re-aligned) address at the end of this block
        fetch_entry_o.ex.tval        = (fetch_entry_i.page_fault) ? fetch_entry_i.address : '0;
        fetch_entry_o.ex.cause       = (fetch_entry_i.page_fault) ? riscv::INSTR_PAGE_FAULT : '0;

        fetch_entry_valid_o = fetch_entry_valid_i;
        fetch_ack_o         = fetch_ack_i;
        // we just jumped to a half word and encountered an unaligned 32-bit instruction
        jump_unaligned_half_word = 1'b0;
        // ---------------------------------
        // Input port & Instruction Aligner
        // ---------------------------------
        // check if the entry if the fetch FIFO is valid and if we are currently not serving the second part
        // of a compressed instruction
        if (fetch_entry_valid_i && !compressed_q) begin
            // ------------------------
            // Access on Word Boundary
            // ------------------------
            if (fetch_entry_i.address[1] == 1'b0) begin
                // do we actually want the first instruction or was the address a half word access?
                if (!unaligned_q) begin
                    // we got a valid instruction so we can satisfy the unaligned instruction
                    unaligned_n = 1'b0;
                    // check if the instruction is compressed
                    if (fetch_entry_i.instruction[1:0] != 2'b11) begin
                        // it is compressed
                        // (16 bit zero padding, consistent with the second-compressed path below)
                        fetch_entry_o.instruction = {16'b0, fetch_entry_i.instruction[15:0]};
                        // we need to kill the lower prediction
                        if (fetch_entry_i.branch_predict.valid && !fetch_entry_i.bp_taken[0])
                            fetch_entry_o.branch_predict.valid = 1'b0;

                        // should we even look at the upper instruction bits?
                        if (!kill_upper_16_bit) begin
                            // Yes, so...
                            // 1. Is the second instruction also compressed, like:
                            // _____________________________________________
                            // | compressed 2 [31:16] | compressed 1[15:0] |
                            // |____________________________________________
                            if (fetch_entry_i.instruction[17:16] != 2'b11) begin
                                // yes, this was a compressed instruction
                                compressed_n = 1'b1;
                                // do not advance the queue pointer
                                fetch_ack_o = 1'b0;
                            // 2. or is it an unaligned 32 bit instruction like
                            // ____________________________________________________
                            // |instr [15:0] | instr [31:16] | compressed 1[15:0] |
                            // |____________________________________________________
                            end else begin
                                // save the lower 16 bit
                                unaligned_instr_n = fetch_entry_i.instruction[31:16];
                                // save the address
                                unaligned_address_n = {fetch_entry_i.address[63:2], 2'b10};
                                // and that it was unaligned
                                unaligned_n = 1'b1;
                                // this does not consume space in the FIFO
                            end
                        end
                    end
                end
                // this is a full 32 bit instruction like
                // _______________________
                // | instruction [31:0]  |
                // |______________________

                // we have an outstanding unaligned instruction
                else if (unaligned_q) begin

                    fetch_entry_o.address = unaligned_address_q;
                    fetch_entry_o.instruction = {fetch_entry_i.instruction[15:0], unaligned_instr_q};

                    // again should we look at the upper bits?
                    if (!kill_upper_16_bit) begin
                        // whats up with the other upper 16 bit of this instruction
                        // is the second instruction also compressed, like:
                        // _____________________________________________
                        // | compressed 2 [31:16] | unaligned[31:16]    |
                        // |____________________________________________
                        // check if the lower compressed instruction was no branch otherwise we will need to squash this instruction
                        // but only if we predicted it to be taken, the predict was on the lower 16 bit compressed instruction
                        if (fetch_entry_i.instruction[17:16] != 2'b11) begin
                            // this was a compressed instruction
                            compressed_n = 1'b1;
                            // do not advance the queue pointer
                            fetch_ack_o = 1'b0;
                            // unaligned access served
                            unaligned_n = 1'b0;
                            // we need to kill the lower prediction
                            if (fetch_entry_i.branch_predict.valid && !fetch_entry_i.bp_taken[0])
                                fetch_entry_o.branch_predict.valid = 1'b0;
                        // or is it an unaligned 32 bit instruction like
                        // ____________________________________________________
                        // |instr [15:0] | instr [31:16] | compressed 1[15:0] |
                        // |____________________________________________________
                        // (we are already inside the !kill_upper_16_bit branch, a plain else suffices)
                        end else begin
                            // save the lower 16 bit
                            unaligned_instr_n = fetch_entry_i.instruction[31:16];
                            // save the address
                            unaligned_address_n = {fetch_entry_i.address[63:2], 2'b10};
                            // and that it was unaligned
                            unaligned_n = 1'b1;
                        end
                    end
                    // we've got a predicted taken branch we need to clear the unaligned flag if it was decoded as a lower 16 instruction
                    else if (fetch_entry_i.branch_predict.valid) begin
                        // the next fetch will start from a 4 byte boundary again
                        unaligned_n = 1'b0;
                    end
                end
            end
            // ----------------------------
            // Access on half-Word Boundary
            // ----------------------------
            else if (fetch_entry_i.address[1] == 1'b1) begin // address was a half word access
                // reset the unaligned flag as this is a completely new fetch (because consecutive fetches only happen on a word basis)
                unaligned_n = 1'b0;
                // this is a compressed instruction
                if (fetch_entry_i.instruction[17:16] != 2'b11) begin
                    // it is compressed
                    fetch_entry_o.instruction = {16'b0, fetch_entry_i.instruction[31:16]};

                // this is the first part of a 32 bit unaligned instruction
                end else begin
                    // save the lower 16 bit
                    unaligned_instr_n = fetch_entry_i.instruction[31:16];
                    // and that it was unaligned
                    unaligned_n = 1'b1;
                    // save the address
                    unaligned_address_n = {fetch_entry_i.address[63:2], 2'b10};
                    // we need to wait for the second instruction
                    fetch_entry_valid_o = 1'b0;
                    // so get it by acknowledging this instruction
                    fetch_ack_o = 1'b1;
                    // we got to an unaligned instruction -> get the next entry to full-fill the need
                    jump_unaligned_half_word = 1'b1;
                end
                // there can never be a whole 32 bit instruction on a half word access
            end
        end
        // ----------------------------
        // Next compressed instruction
        // ----------------------------
        // we are serving the second part of an instruction which was also compressed
        if (compressed_q) begin
            fetch_ack_o = fetch_ack_i;
            compressed_n = 1'b0;
            fetch_entry_o.instruction = {16'b0, fetch_entry_i.instruction[31:16]};
            fetch_entry_o.address = {fetch_entry_i.address[63:2], 2'b10};
            fetch_entry_valid_o = 1'b1;
        end

        // if we didn't get an acknowledge keep the registers stable
        if (!fetch_ack_i && !jump_unaligned_half_word) begin
            unaligned_n         = unaligned_q;
            unaligned_instr_n   = unaligned_instr_q;
            compressed_n        = compressed_q;
            unaligned_address_n = unaligned_address_q;
        end

        if (flush_i) begin
            // clear the unaligned and compressed instruction
            unaligned_n  = 1'b0;
            compressed_n = 1'b0;
        end

        // assign the correct address for a potentially faulting unaligned instruction
        // we've already done the re-alignment for the instruction word so we
        // can just assign it here to tval
        fetch_entry_o.ex.tval = fetch_entry_o.address;
    end

    // ---------
    // Registers
    // ---------
    always_ff @(posedge clk_i or negedge rst_ni) begin
        if (~rst_ni) begin
            unaligned_q         <= 1'b0;
            unaligned_instr_q   <= 16'b0;
            unaligned_address_q <= 64'b0;
            compressed_q        <= 1'b0;
        end else begin
            unaligned_q         <= unaligned_n;
            unaligned_instr_q   <= unaligned_instr_n;
            unaligned_address_q <= unaligned_address_n;
            compressed_q        <= compressed_n;
        end
    end
endmodule

View file

@ -67,6 +67,9 @@ package ariane_soc;
localparam logic [NrRegion-1:0][NB_PERIPHERALS-1:0] ValidRule = {{NrRegion * NB_PERIPHERALS}{1'b1}};
localparam ariane_pkg::ariane_cfg_t ArianeSocCfg = '{
RASDepth: 2,
BTBEntries: 32,
BHTEntries: 128,
// idempotent region
NrNonIdempotentRules: 0,
NonIdempotentAddrBase: {64'b0},