frontend: Clean-up instruction frontend

The instruction frontend had become an increasingly messy part and needed cleaning up. The current solution contains 2 x 32 bit instruction data fifos and 1 x 64 bit address fifo. Hence, it should be significantly more area efficient than the previous one. The interface to `id_stage` is a ready/valid handshake. The credit based system has been replaced in favour of a replay mechanism as it was very brittle and overly pessimistic. Branch-prediction has been cleaned up: The front-end was also partially predicting on jumps, this could have potentially led to performance bugs if the branch detection wasn't correct in the frontend.
2025-04-20 04:07:36 -04:00 · 2019-04-20 18:53:16 +02:00 · 2019-04-20 18:53:16 +02:00 · 830540b757
commit 830540b757
parent 90b76d3e4f
14 changed files with 1297 additions and 896 deletions
--- a/3
+++ b/3
@ -146,6 +146,7 @@ src :=  $(filter-out src/ariane_regfile.sv, $(wildcard src/*.sv))              \
        src/axi/src/axi_delayer.sv                                             \
        src/axi/src/axi_to_axi_lite.sv                                         \
        src/fpga-support/rtl/SyncSpRamBeNx64.sv                                \
+        src/common_cells/src/unread.sv                                         \
        src/common_cells/src/sync.sv                                           \
        src/common_cells/src/cdc_2phase.sv                                     \
        src/common_cells/src/spill_register.sv                                 \
@ -157,6 +158,7 @@ src :=  $(filter-out src/ariane_regfile.sv, $(wildcard src/*.sv))              \
        src/common_cells/src/deprecated/fifo_v2.sv                             \
        src/common_cells/src/fifo_v3.sv                                        \
        src/common_cells/src/lzc.sv                                            \
+        src/common_cells/src/popcount.sv                                       \
        src/common_cells/src/rr_arb_tree.sv                                    \
        src/common_cells/src/deprecated/rrarbiter.sv                           \
        src/common_cells/src/stream_delay.sv                                   \
@ -361,7 +363,6 @@ verilate_command := $(verilator)
                    -Wno-UNOPTFLAT                                                                     \
                    -Wno-style                                                                         \
                    $(if $(PROFILE),--stats --stats-vars --profile-cfuncs,)                            \
-                    -Wno-lint                                                                          \
                    $(if $(DEBUG),--trace --trace-structs,)                                            \
                    -LDFLAGS "-L$(RISCV)/lib -Wl,-rpath,$(RISCV)/lib -lfesvr$(if $(PROFILE), -g -pg,)" \
                    -CFLAGS "$(CFLAGS)$(if $(PROFILE), -g -pg,)" -Wall --cc  --vpi                     \
--- a/include/ariane_pkg.sv
+++ b/include/ariane_pkg.sv
@ -34,6 +34,9 @@ package ariane_pkg;
    localparam NrMaxRules = 16;

    typedef struct packed {
+      int                               RASDepth;
+      int                               BTBEntries;
+      int                               BHTEntries;
      // PMAs
      int                               NrNonIdempotentRules;  // Number of non idempotent rules
      logic [NrMaxRules-1:0][63:0]      NonIdempotentAddrBase; // base which needs to match
@ -52,6 +55,9 @@ package ariane_pkg;
    } ariane_cfg_t;

    localparam ariane_cfg_t ArianeDefaultConfig = '{
+      RASDepth: 2,
+      BTBEntries: 32,
+      BHTEntries: 128,
      // idempotent region
      NrNonIdempotentRules: 2,
      NonIdempotentAddrBase: {64'b0, 64'b0},
@ -75,6 +81,9 @@ package ariane_pkg;
    function automatic void check_cfg (ariane_cfg_t Cfg);
      // pragma translate_off
      `ifndef VERILATOR
+        assert(Cfg.RASDepth > 0);
+        assert(2**$clog2(Cfg.BTBEntries)  == Cfg.BTBEntries);
+        assert(2**$clog2(Cfg.BHTEntries)  == Cfg.BHTEntries);
        assert(Cfg.NrNonIdempotentRules <= NrMaxRules);
        assert(Cfg.NrExecuteRegionRules <= NrMaxRules);
        assert(Cfg.NrCachedRegionRules  <= NrMaxRules);
@ -131,9 +140,6 @@ package ariane_pkg;
    localparam TRANS_ID_BITS = $clog2(NR_SB_ENTRIES); // depending on the number of scoreboard entries we need that many bits
                                                      // to uniquely identify the entry in the scoreboard
    localparam ASID_WIDTH    = 1;
-    localparam BTB_ENTRIES   = 64;
-    localparam BHT_ENTRIES   = 128;
-    localparam RAS_DEPTH     = 2;
    localparam BITS_SATURATION_COUNTER = 2;
    localparam NR_COMMIT_PORTS = 2;

@ -142,8 +148,8 @@ package ariane_pkg;
    localparam ISSUE_WIDTH = 1;
    // amount of pipeline registers inserted for load/store return path
    // this can be tuned to trade-off IPC vs. cycle time
-    localparam NR_LOAD_PIPE_REGS = 1;
-    localparam NR_STORE_PIPE_REGS = 0;
+    localparam int unsigned NR_LOAD_PIPE_REGS = 1;
+    localparam int unsigned NR_STORE_PIPE_REGS = 0;

    // depth of store-buffers, this needs to be a power of two
    localparam int unsigned DEPTH_SPEC   = 4;
@ -281,7 +287,7 @@ package ariane_pkg;
    // ---------------

    // leave as is (fails with >8 entries and wider fetch width)
-    localparam int unsigned FETCH_FIFO_DEPTH  = 8;
+    localparam int unsigned FETCH_FIFO_DEPTH  = 4;
    localparam int unsigned FETCH_WIDTH       = 32;
    // maximum instructions we can fetch on one request (we support compressed instructions)
    localparam int unsigned INSTR_PER_FETCH = FETCH_WIDTH / 16;
@ -295,18 +301,24 @@ package ariane_pkg;
         logic        valid;
    } exception_t;

-    typedef enum logic [1:0] { BHT, BTB, RAS } cf_t;
+    typedef enum logic [2:0] {
+      NoCF,   // No control flow prediction
+      Branch, // Branch
+      Jump,   // Jump to address from immediate
+      JumpR,  // Jump to address from registers
+      Return  // Return Address Prediction
+    } cf_t;

    // branch-predict
    // this is the struct we get back from ex stage and we will use it to update
    // all the necessary data structures
+    // bp_resolve_t
    typedef struct packed {
        logic        valid;           // prediction with all its values is valid
-        logic [63:0] pc;              // pc of predict or mis-predict
+        logic [63:0] pc;              // PC of predict or mis-predict
        logic [63:0] target_address;  // target address at which to jump, or not
        logic        is_mispredict;   // set if this was a mis-predict
        logic        is_taken;        // branch is taken
-                                      // in the lower 16 bit of the word
        cf_t         cf_type;         // Type of control flow change
    } bp_resolve_t;

@ -314,11 +326,8 @@ package ariane_pkg;
    // this is the struct which we will inject into the pipeline to guide the various
    // units towards the correct branch decision and resolve
    typedef struct packed {
-        logic        valid;           // this is a valid hint
+        cf_t         cf;              // type of control flow prediction
        logic [63:0] predict_address; // target address at which to jump, or not
-        logic        predict_taken;   // branch is taken
-                                      // in the lower 16 bit of the word
-        cf_t         cf_type;         // Type of control flow change
    } branchpredict_sbe_t;

    typedef struct packed {
@ -340,14 +349,12 @@ package ariane_pkg;
    typedef struct packed {
        logic        valid;
        logic [63:0] pc;          // update at PC
-        logic        mispredict;
        logic        taken;
    } bht_update_t;

    typedef struct packed {
        logic       valid;
        logic       taken;
-        logic       strongly_taken;
    } bht_prediction_t;

    typedef enum logic[3:0] {
@ -444,7 +451,7 @@ package ariane_pkg;
                               // comparisons
                               LTS, LTU, GES, GEU, EQ, NE,
                               // jumps
-                               JALR,
+                               JALR, BRANCH,
                               // set lower than operations
                               SLTS, SLTU,
                               // CSR functions
@ -482,6 +489,13 @@ package ariane_pkg;
        logic [TRANS_ID_BITS-1:0] trans_id;
    } fu_data_t;

+    function automatic logic is_branch (input fu_op op);
+        unique case (op) inside
+            EQ, NE, LTS, GES, LTU, GEU: return 1'b1;
+            default                   : return 1'b0; // all other ops
+        endcase
+    endfunction;
+
    // -------------------------------
    // Extract Src/Dst FP Reg from Op
    // -------------------------------
@ -570,14 +584,6 @@ package ariane_pkg;
    // ---------------
    // IF/ID Stage
    // ---------------
-   typedef struct packed {
-        logic [63:0]                address;        // the address of the instructions from below
-        logic [FETCH_WIDTH-1:0]     instruction;    // instruction word
-        branchpredict_sbe_t         branch_predict; // this field contains branch prediction information regarding the forward branch path
-        logic [INSTR_PER_FETCH-1:0] bp_taken;       // at which instruction is this branch taken?
-        logic                       page_fault;     // an instruction page fault happened
-    } frontend_fetch_t;
-
    // store the decompressed instruction
    typedef struct packed {
        logic [63:0]           address;        // the address of the instructions from below
--- a/src/ariane.sv
+++ b/src/ariane.sv
@ -1,4 +1,4 @@
-// Copyright 2018 ETH Zurich and University of Bologna.
+// Copyright 2017-2019 ETH Zurich and University of Bologna.
 // Copyright and related rights are licensed under the Solderpad Hardware
 // License, Version 0.51 (the "License"); you may not use this file except in
 // compliance with the License.  You may obtain a copy of the License at
@ -59,9 +59,9 @@ module ariane #(
  // --------------
  // IF <-> ID
  // --------------
-  frontend_fetch_t          fetch_entry_if_id;
+  fetch_entry_t             fetch_entry_if_id;
  logic                     fetch_valid_if_id;
-  logic                     decode_ack_id_if;
+  logic                     fetch_ready_id_if;

  // --------------
  // ID <-> ISSUE
@ -220,7 +220,7 @@ module ariane #(
  // Frontend
  // --------------
  frontend #(
-    .DmBaseAddress       ( ArianeCfg.DmBaseAddress )
+    .ArianeCfg ( ArianeCfg )
  ) i_frontend (
    .flush_i             ( flush_ctrl_if                 ), // not entirely correct
    .flush_bp_i          ( 1'b0                          ),
@ -238,7 +238,7 @@ module ariane #(
    .ex_valid_i          ( ex_commit.valid               ),
    .fetch_entry_o       ( fetch_entry_if_id             ),
    .fetch_entry_valid_o ( fetch_valid_if_id             ),
-    .fetch_ack_i         ( decode_ack_id_if              ),
+    .fetch_entry_ready_i ( fetch_ready_id_if             ),
    .*
  );

@ -246,11 +246,14 @@ module ariane #(
  // ID
  // ---------
  id_stage id_stage_i (
-    .debug_req_i,
+    .clk_i,
+    .rst_ni,
    .flush_i                    ( flush_ctrl_if              ),
+    .debug_req_i,
+
    .fetch_entry_i              ( fetch_entry_if_id          ),
    .fetch_entry_valid_i        ( fetch_valid_if_id          ),
-    .decoded_instr_ack_o        ( decode_ack_id_if           ),
+    .fetch_entry_ready_o        ( fetch_ready_id_if          ),

    .issue_entry_o              ( issue_entry_id_issue       ),
    .issue_entry_valid_o        ( issue_entry_valid_id_issue ),
@ -260,13 +263,12 @@ module ariane #(
    .priv_lvl_i                 ( priv_lvl                   ),
    .fs_i                       ( fs                         ),
    .frm_i                      ( frm_csr_id_issue_ex        ),
+    .irq_i                      ( irq_i                      ),
+    .irq_ctrl_i                 ( irq_ctrl_csr_id            ),
    .debug_mode_i               ( debug_mode                 ),
    .tvm_i                      ( tvm_csr_id                 ),
    .tw_i                       ( tw_csr_id                  ),
-    .tsr_i                      ( tsr_csr_id                 ),
-    .irq_i                      ( irq_i                      ),
-    .irq_ctrl_i                 ( irq_ctrl_csr_id            ),
-    .*
+    .tsr_i                      ( tsr_csr_id                 )
  );

  // ---------
@ -334,6 +336,7 @@ module ariane #(
  ) ex_stage_i (
    .clk_i                  ( clk_i                       ),
    .rst_ni                 ( rst_ni                      ),
+    .debug_mode_i           ( debug_mode                  ),
    .flush_i                ( flush_ctrl_ex               ),
    .fu_data_i              ( fu_data_id_ex               ),
    .pc_i                   ( pc_id_ex                    ),
@ -708,9 +711,9 @@ module ariane #(
  assign tracer_if.flush_unissued    = flush_unissued_instr_ctrl_id;
  assign tracer_if.flush             = flush_ctrl_ex;
  // fetch
-  assign tracer_if.instruction       = id_stage_i.instr_realigner_i.fetch_entry_o.instruction;
-  assign tracer_if.fetch_valid       = id_stage_i.instr_realigner_i.fetch_entry_valid_o;
-  assign tracer_if.fetch_ack         = id_stage_i.instr_realigner_i.fetch_ack_i;
+  assign tracer_if.instruction       = id_stage_i.fetch_entry_i.instruction;
+  assign tracer_if.fetch_valid       = id_stage_i.fetch_entry_valid_i;
+  assign tracer_if.fetch_ack         = id_stage_i.fetch_entry_ready_o;
  // Issue
  assign tracer_if.issue_ack         = issue_stage_i.i_scoreboard.issue_ack_i;
  assign tracer_if.issue_sbe         = issue_stage_i.i_scoreboard.issue_instr_o;
--- a/src/branch_unit.sv
+++ b/src/branch_unit.sv
@ -12,10 +12,11 @@
 // Date: 09.05.2017
 // Description: Branch target calculation and comparison

-import ariane_pkg::*;
-
 module branch_unit (
-    input  fu_data_t                  fu_data_i,
+    input  logic                      clk_i,
+    input  logic                      rst_ni,
+    input  logic                      debug_mode_i,
+    input  ariane_pkg::fu_data_t      fu_data_i,
    input  logic [63:0]               pc_i,                   // PC of instruction
    input  logic                      is_compressed_instr_i,
    input  logic                      fu_valid_i,             // any functional unit is valid, check that there is no accidental mis-predict
@ -23,83 +24,62 @@ module branch_unit (
    input  logic                      branch_comp_res_i,      // branch comparison result from ALU
    output logic [63:0]               branch_result_o,

-    input  branchpredict_sbe_t        branch_predict_i,       // this is the address we predicted
-    output bp_resolve_t               resolved_branch_o,      // this is the actual address we are targeting
+    input  ariane_pkg::branchpredict_sbe_t        branch_predict_i,       // this is the address we predicted
+    output ariane_pkg::bp_resolve_t               resolved_branch_o,      // this is the actual address we are targeting
    output logic                      resolve_branch_o,       // to ID to clear that we resolved the branch and we can
                                                              // accept new entries to the scoreboard
-    output exception_t                branch_exception_o      // branch exception out
+    output ariane_pkg::exception_t    branch_exception_o      // branch exception out
 );
    logic [63:0] target_address;
    logic [63:0] next_pc;

-    // here we handle the various possibilities of mis-predicts
+   // here we handle the various possibilities of mis-predicts
    always_comb begin : mispredict_handler
        // set the jump base, for JALR we need to look at the register, for all other control flow instructions we can take the current PC
        automatic logic [63:0] jump_base;
-        jump_base = (fu_data_i.operator == JALR) ? fu_data_i.operand_a : pc_i;
+        // TODO(zarubaf): The ALU can be used to calculate the branch target
+        jump_base = (fu_data_i.operator == ariane_pkg::JALR) ? fu_data_i.operand_a : pc_i;

+        target_address                   = 64'b0;
        resolve_branch_o                 = 1'b0;
        resolved_branch_o.target_address = 64'b0;
        resolved_branch_o.is_taken       = 1'b0;
        resolved_branch_o.valid          = branch_valid_i;
        resolved_branch_o.is_mispredict  = 1'b0;
-        resolved_branch_o.cf_type        = branch_predict_i.cf_type;
+        resolved_branch_o.cf_type        = branch_predict_i.cf;
        // calculate next PC, depending on whether the instruction is compressed or not this may be different
+        // TODO(zarubaf): We already calculate this a couple of times, maybe re-use?
        next_pc                          = pc_i + ((is_compressed_instr_i) ? 64'h2 : 64'h4);
        // calculate target address simple 64 bit addition
        target_address                   = $unsigned($signed(jump_base) + $signed(fu_data_i.imm));
        // on a JALR we are supposed to reset the LSB to 0 (according to the specification)
-        if (fu_data_i.operator == JALR)
-            target_address[0] = 1'b0;
-        // if we need to put the branch target address in a destination register, output it here to WB
+        if (fu_data_i.operator == ariane_pkg::JALR) target_address[0] = 1'b0;
+        // we need to put the branch target address into rd, this is the result of this unit
        branch_result_o = next_pc;
-
-        // save PC - we need this to get the target row in the branch target buffer
-        // we play this trick with the branch instruction which wraps a word boundary:
-        //  /---------- Place the prediction on this PC
-        // \/
-        // ____________________________________________________
-        // |branch [15:0] | branch[31:16] | compressed 1[15:0] |
-        // |____________________________________________________
-        // This will relief the pre-fetcher to re-fetch partially fetched unaligned branch instructions e.g.:
-        // we don't have a back arch between the pre-fetcher and decoder/instruction FIFO.
-        resolved_branch_o.pc = (is_compressed_instr_i || pc_i[1] == 1'b0) ? pc_i : ({pc_i[63:2], 2'b0} + 64'h4);
-
+        resolved_branch_o.pc = pc_i;
+        // There are only two sources of mispredicts:
+        // 1. Branches
+        // 2. Jumps to register addresses
        if (branch_valid_i) begin
-            // write target address which goes to pc gen
+            // write target address which goes to PC Gen
            resolved_branch_o.target_address = (branch_comp_res_i) ? target_address : next_pc;
-            resolved_branch_o.is_taken       = branch_comp_res_i;
-            // we've detected a branch in ID with the following parameters
-            // we mis-predicted e.g.: the predicted address is unequal to the actual address
-            if (target_address[0] == 1'b0) begin
-                // we've got a valid branch prediction
-                if (branch_predict_i.valid) begin
-                    // if the outcome doesn't match we've got a mis-predict
-                    if (branch_predict_i.predict_taken != branch_comp_res_i) begin
-                        resolved_branch_o.is_mispredict  = 1'b1;
-                    end
-                    // check if the address of the predict taken branch is correct
-                    if (branch_predict_i.predict_taken && target_address != branch_predict_i.predict_address) begin
-                        resolved_branch_o.is_mispredict  = 1'b1;
-                    end
-                // branch-prediction didn't do anything (e.g.: it fetched PC + 2/4), so if this branch is taken
-                // we also have a mis-predict
-                end else begin
-                    if (branch_comp_res_i) begin
-                        resolved_branch_o.is_mispredict = 1'b1;
-                    end
-                end
+            resolved_branch_o.is_taken = branch_comp_res_i;
+            // check the outcome of the branch speculation
+            if (ariane_pkg::is_branch(fu_data_i.operator) && branch_comp_res_i != (branch_predict_i.cf == ariane_pkg::Branch)) begin
+                // we mis-predicted the outcome
+                // if the outcome doesn't match we've got a mis-predict
+                resolved_branch_o.is_mispredict  = 1'b1;
+                resolved_branch_o.cf_type = ariane_pkg::Branch;
+            end
+            if (fu_data_i.operator == ariane_pkg::JALR
+                // check if the address of the jump register is correct and that we actually predicted
+                && (branch_predict_i.cf == ariane_pkg::NoCF || target_address != branch_predict_i.predict_address)) begin
+                resolved_branch_o.is_mispredict  = 1'b1;
+                // update BTB only if this wasn't a return
+                if (branch_predict_i.cf != ariane_pkg::Return) resolved_branch_o.cf_type = ariane_pkg::JumpR;
            end
            // to resolve the branch in ID
            resolve_branch_o = 1'b1;
-        // the other case would be that this instruction was no branch but branch prediction thought that it was one
-        // this is essentially also a mis-predict
-        end else if (fu_valid_i && branch_predict_i.valid && branch_predict_i.predict_taken) begin
-            // re-set the branch to the next PC
-            resolved_branch_o.is_mispredict  = 1'b1;
-            resolved_branch_o.target_address = next_pc;
-            resolved_branch_o.valid          = 1'b1;
-            resolve_branch_o                 = 1'b1;
        end
    end
    // use ALU exception signal for storing instruction fetch exceptions if
@ -109,7 +89,6 @@ module branch_unit (
        branch_exception_o.valid = 1'b0;
        branch_exception_o.tval  = pc_i;
        // only throw exception if this is indeed a branch
-        if (branch_valid_i && target_address[0] != 1'b0)
-            branch_exception_o.valid = 1'b1;
+        if (branch_valid_i && target_address[0] != 1'b0) branch_exception_o.valid = 1'b1;
    end
 endmodule
--- a/src/ex_stage.sv
+++ b/src/ex_stage.sv
@ -21,6 +21,7 @@ module ex_stage #(
    input  logic                                   clk_i,    // Clock
    input  logic                                   rst_ni,   // Asynchronous reset active low
    input  logic                                   flush_i,
+    input  logic                                   debug_mode_i,

    input  fu_data_t                               fu_data_i,
    input  logic [63:0]                            pc_i,                  // PC of current instruction
@ -143,6 +144,9 @@ module ex_stage #(
    // we don't silence the branch unit as this is already critical and we do
    // not want to add another layer of logic
    branch_unit branch_unit_i (
+        .clk_i,
+        .rst_ni,
+        .debug_mode_i,
        .fu_data_i,
        .pc_i,
        .is_compressed_instr_i,
--- a/src/frontend/bht.sv
+++ b/src/frontend/bht.sv
@ -1,4 +1,4 @@
-//Copyright (C) 2018 to present,
+// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
 // Copyright and related rights are licensed under the Solderpad Hardware
 // License, Version 2.0 (the "License"); you may not use this file except in
 // compliance with the License.  You may obtain a copy of the License at
@ -6,7 +6,8 @@
 // or agreed to in writing, software, hardware and materials distributed under
 // this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.//
+// specific language governing permissions and limitations under the License.
+//
 // Author: Florian Zaruba, ETH Zurich
 // Date: 08.02.2018
 // Migrated: Luis Vitorio Cargnini, IEEE
@ -20,65 +21,81 @@ module bht #(
    input  logic                        rst_ni,
    input  logic                        flush_i,
    input  logic                        debug_mode_i,
-
    input  logic [63:0]                 vpc_i,
    input  ariane_pkg::bht_update_t     bht_update_i,
-    output ariane_pkg::bht_prediction_t bht_prediction_o
+    // we potentially need INSTR_PER_FETCH predictions/cycle
+    output ariane_pkg::bht_prediction_t [ariane_pkg::INSTR_PER_FETCH-1:0] bht_prediction_o
 );
-    localparam OFFSET = 2; // we are using compressed instructions so do not use the lower 2 bits for prediction
-    localparam ANTIALIAS_BITS = 8;
+    // the last bit is always zero, we don't need it for indexing
+    localparam OFFSET = 1;
+    // re-shape the branch history table
+    localparam NR_ROWS = NR_ENTRIES / ariane_pkg::INSTR_PER_FETCH;
+    // number of bits needed to index the row
+    localparam ROW_ADDR_BITS = $clog2(ariane_pkg::INSTR_PER_FETCH);
    // number of bits we should use for prediction
-    localparam PREDICTION_BITS = $clog2(NR_ENTRIES) + OFFSET;
+    localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS;
+    // we are not interested in all bits of the address
+    unread i_unread (.d_i(|vpc_i));

    struct packed {
        logic       valid;
        logic [1:0] saturation_counter;
-    } bht_d[NR_ENTRIES-1:0], bht_q[NR_ENTRIES-1:0];
+    } bht_d[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0], bht_q[NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0];

-    logic [$clog2(NR_ENTRIES)-1:0]  index, update_pc;
-    logic [1:0]                     saturation_counter;
+    logic [$clog2(NR_ROWS)-1:0]  index, update_pc;
+    logic [ROW_ADDR_BITS-1:0]    update_row_index;
+    logic [1:0]                  saturation_counter;
+
+    assign index     = vpc_i[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
+    assign update_pc = bht_update_i.pc[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
+    assign update_row_index = bht_update_i.pc[ROW_ADDR_BITS + OFFSET - 1:OFFSET];

-    assign index     = vpc_i[PREDICTION_BITS - 1:OFFSET];
-    assign update_pc = bht_update_i.pc[PREDICTION_BITS - 1:OFFSET];
    // prediction assignment
-    assign bht_prediction_o.valid = bht_q[index].valid;
-    assign bht_prediction_o.taken = bht_q[index].saturation_counter == 2'b10;
-    assign bht_prediction_o.strongly_taken = (bht_q[index].saturation_counter == 2'b11);
+    for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_bht_output
+        assign bht_prediction_o[i].valid = bht_q[index][i].valid;
+        assign bht_prediction_o[i].taken = bht_q[index][i].saturation_counter[1] == 1'b1;
+    end
+
    always_comb begin : update_bht
        bht_d = bht_q;
-        saturation_counter = bht_q[update_pc].saturation_counter;
+        saturation_counter = bht_q[update_pc][update_row_index].saturation_counter;

        if (bht_update_i.valid && !debug_mode_i) begin
-            bht_d[update_pc].valid = 1'b1;
+            bht_d[update_pc][update_row_index].valid = 1'b1;

            if (saturation_counter == 2'b11) begin
                // we can safely decrease it
-                if (~bht_update_i.taken)
-                    bht_d[update_pc].saturation_counter = saturation_counter - 1;
+                if (!bht_update_i.taken)
+                    bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
            // then check if it saturated in the negative regime e.g.: branch not taken
            end else if (saturation_counter == 2'b00) begin
                // we can safely increase it
                if (bht_update_i.taken)
-                    bht_d[update_pc].saturation_counter = saturation_counter + 1;
+                    bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
            end else begin // otherwise we are not in any boundaries and can decrease or increase it
                if (bht_update_i.taken)
-                    bht_d[update_pc].saturation_counter = saturation_counter + 1;
+                    bht_d[update_pc][update_row_index].saturation_counter = saturation_counter + 1;
                else
-                    bht_d[update_pc].saturation_counter = saturation_counter - 1;
+                    bht_d[update_pc][update_row_index].saturation_counter = saturation_counter - 1;
            end
        end
    end

    always_ff @(posedge clk_i or negedge rst_ni) begin
-        if (~rst_ni) begin
-            for (int unsigned i = 0; i < NR_ENTRIES; i++)
-                bht_q[i] <= '0;
+        if (!rst_ni) begin
+            for (int unsigned i = 0; i < NR_ENTRIES; i++) begin
+                for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
+                    bht_q[i][j] <= '0;
+                end
+            end
        end else begin
            // evict all entries
            if (flush_i) begin
                for (int i = 0; i < NR_ENTRIES; i++) begin
-                    bht_q[i].valid <=  1'b0;
-                    bht_q[i].saturation_counter <= 2'b10;
+                    for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
+                        bht_q[i][j].valid <=  1'b0;
+                        bht_q[i][j].saturation_counter <= 2'b10;
+                    end
                end
            end else begin
                bht_q <= bht_d;
--- a/src/frontend/btb.sv
+++ b/src/frontend/btb.sv
@ -1,4 +1,4 @@
-//Copyright (C) 2018 to present,
+// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
 // Copyright and related rights are licensed under the Solderpad Hardware
 // License, Version 2.0 (the "License"); you may not use this file except in
 // compliance with the License.  You may obtain a copy of the License at
@ -13,10 +13,6 @@
 // Migrated: Luis Vitorio Cargnini, IEEE
 // Date: 09.06.2018

-// ------------------------------
-// Branch Prediction
-// ------------------------------
-
 // branch target buffer
 module btb #(
    parameter int NR_ENTRIES = 8
@ -28,23 +24,36 @@ module btb #(

    input  logic [63:0]                 vpc_i,           // virtual PC from IF stage
    input  ariane_pkg::btb_update_t     btb_update_i,    // update btb with this information
-    output ariane_pkg::btb_prediction_t btb_prediction_o // prediction from btb
+    output ariane_pkg::btb_prediction_t [ariane_pkg::INSTR_PER_FETCH-1:0] btb_prediction_o // prediction from btb
 );
-    // number of bits which are not used for indexing
-    localparam OFFSET = 1; // we are using compressed instructions so do use the lower 2 bits for prediction
-    localparam ANTIALIAS_BITS = 8;
+    // the last bit is always zero, we don't need it for indexing
+    localparam OFFSET = 1;
+    // re-shape the branch target buffer
+    localparam NR_ROWS = NR_ENTRIES / ariane_pkg::INSTR_PER_FETCH;
+    // number of bits needed to index the row
+    localparam ROW_ADDR_BITS = $clog2(ariane_pkg::INSTR_PER_FETCH);
    // number of bits we should use for prediction
-    localparam PREDICTION_BITS = $clog2(NR_ENTRIES) + OFFSET;
+    localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS;
+    // prevent aliasing to degrade performance
+    localparam ANTIALIAS_BITS = 8;
+    // we are not interested in all bits of the address
+    unread i_unread (.d_i(|vpc_i));
+
    // typedef for all branch target entries
    // we may want to try to put a tag field that fills the rest of the PC in-order to mitigate aliasing effects
-    ariane_pkg::btb_prediction_t btb_d [NR_ENTRIES-1:0], btb_q [NR_ENTRIES-1:0];
-    logic [$clog2(NR_ENTRIES)-1:0]          index, update_pc;
+    ariane_pkg::btb_prediction_t btb_d [NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0],
+                                 btb_q [NR_ROWS-1:0][ariane_pkg::INSTR_PER_FETCH-1:0];
+    logic [$clog2(NR_ROWS)-1:0]  index, update_pc;
+    logic [ROW_ADDR_BITS-1:0]    update_row_index;

-    assign index     = vpc_i[PREDICTION_BITS - 1:OFFSET];
-    assign update_pc = btb_update_i.pc[PREDICTION_BITS - 1:OFFSET];
+    assign index     = vpc_i[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
+    assign update_pc = btb_update_i.pc[PREDICTION_BITS - 1:ROW_ADDR_BITS + OFFSET];
+    assign update_row_index = btb_update_i.pc[ROW_ADDR_BITS + OFFSET - 1:OFFSET];

    // output matching prediction
-    assign btb_prediction_o = btb_q[index];
+    for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_btb_output
+        assign btb_prediction_o[i] = btb_q[index][i]; // workaround
+    end

    // -------------------------
    // Update Branch Prediction
@ -54,23 +63,25 @@ module btb #(
        btb_d = btb_q;

        if (btb_update_i.valid && !debug_mode_i) begin
-            btb_d[update_pc].valid = 1'b1;
+            btb_d[update_pc][update_row_index].valid = 1'b1;
            // the target address is simply updated
-            btb_d[update_pc].target_address = btb_update_i.target_address;
+            btb_d[update_pc][update_row_index].target_address = btb_update_i.target_address;
        end
    end

    // sequential process
    always_ff @(posedge clk_i or negedge rst_ni) begin
-        if (~rst_ni) begin
+        if (!rst_ni) begin
            // Bias the branches to be taken upon first arrival
-            for (int i = 0; i < NR_ENTRIES; i++)
+            for (int i = 0; i < NR_ROWS; i++)
                btb_q[i] <= '{default: 0};
        end else begin
            // evict all entries
            if (flush_i) begin
-                for (int i = 0; i < NR_ENTRIES; i++) begin
-                    btb_q[i].valid <=  1'b0;
+                for (int i = 0; i < NR_ROWS; i++) begin
+                    for (int j = 0; j < ariane_pkg::INSTR_PER_FETCH; j++) begin
+                        btb_q[i][j].valid <=  1'b0;
+                    end
                end
            end else begin
                btb_q <=  btb_d;
--- a/src/frontend/frontend.sv
+++ b/src/frontend/frontend.sv
@ -11,61 +11,66 @@
 // Author: Florian Zaruba, ETH Zurich
 // Date: 08.02.2018
 // Description: Ariane Instruction Fetch Frontend
-
-
+//
+// This module interfaces with the instruction cache, handles control flow
+// change requests from the back-end and does branch prediction.
 import ariane_pkg::*;

 module frontend #(
-    parameter logic [63:0]     DmBaseAddress = 64'h0 // debug module base address
+  parameter ariane_pkg::ariane_cfg_t ArianeCfg = ariane_pkg::ArianeDefaultConfig
 ) (
-    input  logic               clk_i,              // Clock
-    input  logic               rst_ni,             // Asynchronous reset active low
-    input  logic               flush_i,            // flush request for PCGEN
-    input  logic               flush_bp_i,         // flush branch prediction
-    input  logic               debug_mode_i,
-    // global input
-    input  logic [63:0]        boot_addr_i,
-    // Set a new PC
-    // mispredict
-    input  bp_resolve_t        resolved_branch_i,  // from controller signaling a branch_predict -> update BTB
-    // from commit, when flushing the whole pipeline
-    input  logic               set_pc_commit_i,    // Take the PC from commit stage
-    input  logic [63:0]        pc_commit_i,        // PC of instruction in commit stage
-    // CSR input
-    input  logic [63:0]        epc_i,              // exception PC which we need to return to
-    input  logic               eret_i,             // return from exception
-    input  logic [63:0]        trap_vector_base_i, // base of trap vector
-    input  logic               ex_valid_i,         // exception is valid - from commit
-    input  logic               set_debug_pc_i,     // jump to debug address
-    // Instruction Fetch
-    input  icache_dreq_o_t     icache_dreq_i,
-    output icache_dreq_i_t     icache_dreq_o,
-    // instruction output port -> to processor back-end
-    output frontend_fetch_t    fetch_entry_o,       // fetch entry containing all relevant data for the ID stage
-    output logic               fetch_entry_valid_o, // instruction in IF is valid
-    input  logic               fetch_ack_i          // ID acknowledged this instruction
+  input  logic               clk_i,              // Clock
+  input  logic               rst_ni,             // Asynchronous reset active low
+  input  logic               flush_i,            // flush request for PCGEN
+  input  logic               flush_bp_i,         // flush branch prediction
+  input  logic               debug_mode_i,
+  // global input
+  input  logic [63:0]        boot_addr_i,
+  // Set a new PC
+  // mispredict
+  input  bp_resolve_t        resolved_branch_i,  // from controller signaling a branch_predict -> update BTB
+  // from commit, when flushing the whole pipeline
+  input  logic               set_pc_commit_i,    // Take the PC from commit stage
+  input  logic [63:0]        pc_commit_i,        // PC of instruction in commit stage
+  // CSR input
+  input  logic [63:0]        epc_i,              // exception PC which we need to return to
+  input  logic               eret_i,             // return from exception
+  input  logic [63:0]        trap_vector_base_i, // base of trap vector
+  input  logic               ex_valid_i,         // exception is valid - from commit
+  input  logic               set_debug_pc_i,     // jump to debug address
+  // Instruction Fetch
+  output icache_dreq_i_t     icache_dreq_o,
+  input  icache_dreq_o_t     icache_dreq_i,
+  // instruction output port -> to processor back-end
+  output fetch_entry_t       fetch_entry_o,       // fetch entry containing all relevant data for the ID stage
+  output logic               fetch_entry_valid_o, // instruction in IF is valid
+  input  logic               fetch_entry_ready_i  // ID acknowledged this instruction
 );
-    // Registers
-    logic [31:0] icache_data_q;
-    logic        icache_valid_q;
-    logic        icache_ex_valid_q;
-    logic        instruction_valid;
-    logic [INSTR_PER_FETCH-1:0] instr_is_compressed;
-
-    logic [63:0] icache_vaddr_q;
-    // BHT, BTB and RAS prediction
-    bht_prediction_t bht_prediction;
-    btb_prediction_t btb_prediction;
-    ras_t            ras_predict;
-    bht_update_t     bht_update;
-    btb_update_t     btb_update;
-    logic            ras_push, ras_pop;
-    logic [63:0]     ras_update;
-
+    // Instruction Cache Registers, from I$
+    logic [FETCH_WIDTH-1:0] icache_data_q;
+    logic                   icache_valid_q;
+    logic                   icache_ex_valid_q;
+    logic [63:0]            icache_vaddr_q;
+    logic                   instr_queue_ready;
+    logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_queue_consumed;
+    // upper-most branch-prediction from last cycle
+    btb_prediction_t        btb_q;
+    bht_prediction_t        bht_q;
    // instruction fetch is ready
    logic          if_ready;
    logic [63:0]   npc_d, npc_q; // next PC
-    logic npc_rst_load_q; //indicates whether we come out of reset (then we need to load boot_addr_i)
+
+    // indicates whether we come out of reset (then we need to load boot_addr_i)
+    logic          npc_rst_load_q;
+
+    logic          replay;
+    logic [63:0]   replay_addr;
+
+    // shift amount
+    logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] shamt;
+    // address will always be 16 bit aligned, make this explicit here
+    assign shamt = icache_dreq_i.vaddr[$clog2(ariane_pkg::INSTR_PER_FETCH):1];
+
    // -----------------------
    // Ctrl Flow Speculation
    // -----------------------
@ -74,209 +79,185 @@ module frontend #(
                                      rvi_jalr, rvi_jump;
    logic [INSTR_PER_FETCH-1:0][63:0] rvi_imm;
    // RVC branching
-    logic [INSTR_PER_FETCH-1:0]       is_rvc;
    logic [INSTR_PER_FETCH-1:0]       rvc_branch, rvc_jump, rvc_jr, rvc_return,
                                      rvc_jalr, rvc_call;
    logic [INSTR_PER_FETCH-1:0][63:0] rvc_imm;
    // re-aligned instruction and address (coming from cache - combinationally)
    logic [INSTR_PER_FETCH-1:0][31:0] instr;
    logic [INSTR_PER_FETCH-1:0][63:0] addr;
+    logic [INSTR_PER_FETCH-1:0]       instruction_valid;
+    // BHT, BTB and RAS prediction
+    bht_prediction_t [INSTR_PER_FETCH-1:0] bht_prediction;
+    btb_prediction_t [INSTR_PER_FETCH-1:0] btb_prediction;
+    bht_prediction_t [INSTR_PER_FETCH-1:0] bht_prediction_shifted;
+    btb_prediction_t [INSTR_PER_FETCH-1:0] btb_prediction_shifted;
+    ras_t            ras_predict;

-    logic [63:0]   bp_vaddr;
-    logic          bp_valid; // we have a valid branch-prediction
-    logic          is_mispredict;
-    // branch-prediction which we inject into the pipeline
-    branchpredict_sbe_t  bp_sbe;
-    // fetch fifo credit system
-    logic fifo_valid, fifo_ready, fifo_empty, fifo_pop;
-    logic s2_eff_kill, issue_req, s2_in_flight_d, s2_in_flight_q;
-    logic [$clog2(FETCH_FIFO_DEPTH):0] fifo_credits_d;
-    logic [$clog2(FETCH_FIFO_DEPTH):0] fifo_credits_q;
+    // branch-predict update
+    logic            is_mispredict;
+    logic            ras_push, ras_pop;
+    logic [63:0]     ras_update;

-    // save the unaligned part of the instruction to this ff
-    logic [15:0] unaligned_instr_d,   unaligned_instr_q;
-    // the last instruction was unaligned
-    logic        unaligned_d,         unaligned_q;
-    // register to save the unaligned address
-    logic [63:0] unaligned_address_d, unaligned_address_q;
+    // Instruction FIFO
+    logic [63:0]                            predict_address;
+    cf_t  [ariane_pkg::INSTR_PER_FETCH-1:0] cf_type;
+    logic [ariane_pkg::INSTR_PER_FETCH-1:0] taken_rvi_cf;
+    logic [ariane_pkg::INSTR_PER_FETCH-1:0] taken_rvc_cf;

-    for (genvar i = 0; i < INSTR_PER_FETCH; i ++) begin
-        // LSB != 2'b11
-        assign instr_is_compressed[i] = ~&icache_data_q[i * 16 +: 2];
+    logic serving_unaligned;
+    // Re-align instructions
+    instr_realign i_instr_realign (
+      .clk_i               ( clk_i                 ),
+      .rst_ni              ( rst_ni                ),
+      .flush_i             ( icache_dreq_o.kill_s2 ),
+      .valid_i             ( icache_valid_q        ),
+      .serving_unaligned_o ( serving_unaligned     ),
+      .address_i           ( icache_vaddr_q        ),
+      .data_i              ( icache_data_q         ),
+      .valid_o             ( instruction_valid     ),
+      .addr_o              ( addr                  ),
+      .instr_o             ( instr                 )
+    );
+    // --------------------
+    // Branch Prediction
+    // --------------------
+    // select the right branch prediction result
+    // in case we are serving an unaligned instruction in instr[0] we need to take
+    // the prediction we saved from the previous fetch
+    assign bht_prediction_shifted[0] = (serving_unaligned) ? bht_q : bht_prediction[0];
+    assign btb_prediction_shifted[0] = (serving_unaligned) ? btb_q : btb_prediction[0];
+    // for all other predictions we can use the generated address to index
+    // into the branch prediction data structures
+    for (genvar i = 1; i < INSTR_PER_FETCH; i++) begin : gen_prediction_address
+      assign bht_prediction_shifted[i] = bht_prediction[addr[i][$clog2(INSTR_PER_FETCH):1]];
+      assign btb_prediction_shifted[i] = btb_prediction[addr[i][$clog2(INSTR_PER_FETCH):1]];
+    end
+    // for the return address stack it doesn't matter as we have the
+    // address of the call/return already
+    logic bp_valid;
+
+    logic [INSTR_PER_FETCH-1:0] is_branch;
+    logic [INSTR_PER_FETCH-1:0] is_call;
+    logic [INSTR_PER_FETCH-1:0] is_jump;
+    logic [INSTR_PER_FETCH-1:0] is_return;
+    logic [INSTR_PER_FETCH-1:0] is_jalr;
+
+    for (genvar i = 0; i < INSTR_PER_FETCH; i++) begin
+      // branch history table -> BHT
+      assign is_branch[i] =  instruction_valid[i] & (rvi_branch[i] | rvc_branch[i]);
+      // function calls -> RAS
+      assign is_call[i] = instruction_valid[i] & (rvi_call[i] | rvc_call[i]);
+      // function return -> RAS
+      assign is_return[i] = instruction_valid[i] & (rvi_return[i] | rvc_return[i]);
+      // unconditional jumps with known target -> immediately resolved
+      assign is_jump[i] = instruction_valid[i] & (rvi_jump[i] | rvc_jump[i]);
+      // unconditional jumps with unknown target -> BTB
+      assign is_jalr[i] = instruction_valid[i] & ~is_return[i] & ~is_call[i] & (rvi_jalr[i] | rvc_jalr[i] | rvc_jr[i]);
    end

-    // Soft-realignment to do branch-prediction
-    always_comb begin : re_align
-        unaligned_d = unaligned_q;
-        unaligned_address_d = unaligned_address_q;
-        unaligned_instr_d = unaligned_instr_q;
-        instruction_valid = icache_valid_q;
+    // taken/not taken
+    always_comb begin
+      taken_rvi_cf = '0;
+      taken_rvc_cf = '0;
+      predict_address = '0;

-        // 32-bit can contain 2 instructions
-        instr[0] = icache_data_q;
-        addr[0]  = icache_vaddr_q;
+      for (int i = 0; i < INSTR_PER_FETCH; i++)  cf_type[i] = ariane_pkg::NoCF;

-        instr[1] = '0;
-        addr[1]  = {icache_vaddr_q[63:2], 2'b10};
+      ras_push = 1'b0;
+      ras_pop = 1'b0;
+      ras_update = '0;

-        if (icache_valid_q) begin
-            // last instruction was unaligned
-            if (unaligned_q) begin
-                instr[0] = {icache_data_q[15:0], unaligned_instr_q};
-                addr[0] = unaligned_address_q;
-
-                unaligned_address_d = {icache_vaddr_q[63:2], 2'b10};
-                unaligned_instr_d = icache_data_q[31:16]; // save the upper bits for next cycle
-
-                // check if this is instruction is still unaligned e.g.: it is not compressed
-                // if its compressed re-set unaligned flag
-                // for 32 bit we can simply check the next instruction and whether it is compressed or not
-                // if it is compressed the next fetch will contain an aligned instruction
-                if (instr_is_compressed[1]) begin
-                    unaligned_d = 1'b0;
-                    instr[1] = {16'b0, icache_data_q[31:16]};
-                end
-            end else if (instr_is_compressed[0]) begin // instruction zero is RVC
-                // is instruction 1 also compressed
-                // yes? -> no problem, no -> we've got an unaligned instruction
-                if (instr_is_compressed[1]) begin
-                    instr[1] = {16'b0, icache_data_q[31:16]};
-                end else begin
-                    unaligned_instr_d = icache_data_q[31:16];
-                    unaligned_address_d = {icache_vaddr_q[63:2], 2'b10};
-                    unaligned_d = 1'b1;
-                end
-            end // else -> normal fetch
-        end
-
-        // we started to fetch on a unaligned boundary with a whole instruction -> wait until we've
-        // received the next instruction
-        if (icache_valid_q && icache_vaddr_q[1] && !instr_is_compressed[1]) begin
-            instruction_valid = 1'b0;
-            unaligned_d = 1'b1;
-            unaligned_address_d = {icache_vaddr_q[63:2], 2'b10};
-            unaligned_instr_d = icache_data_q[31:16];
-        end
-
-        // if we killed the consecutive fetch we are starting on a clean slate
-        if (icache_dreq_o.kill_s2) begin
-            unaligned_d = 1'b0;
-        end
-    end
-
-
-    logic [INSTR_PER_FETCH:0] taken;
-    // control front-end + branch-prediction
-    always_comb begin : frontend_ctrl
-        automatic logic take_rvi_cf; // take the control flow change (non-compressed)
-        automatic logic take_rvc_cf; // take the control flow change (compressed)
-
-        take_rvi_cf       = 1'b0;
-        take_rvc_cf       = 1'b0;
-        ras_pop           = 1'b0;
-        ras_push          = 1'b0;
-        ras_update        = '0;
-        taken             = '0;
-        take_rvi_cf       = 1'b0;
-
-        bp_vaddr          = '0;    // predicted address
-        bp_valid          = 1'b0;  // prediction is valid
-
-        bp_sbe.cf_type    = RAS;
-
-        // only predict if the response is valid
-        if (instruction_valid) begin
-            // look at instruction 0, 1, 2, ...
-            for (int unsigned i = 0; i < INSTR_PER_FETCH; i++) begin
-                // only speculate if the previous instruction was not taken
-                if (!taken[i]) begin
-                    // function call
-                    ras_push = rvi_call[i] | rvc_call[i];
-                    ras_update = addr[i] + (rvc_call[i] ? 2 : 4);
-
-                    // Branch Prediction - **speculative**
-                    if (rvi_branch[i] || rvc_branch[i]) begin
-                        bp_sbe.cf_type = BHT;
-                        // dynamic prediction valid?
-                        if (bht_prediction.valid) begin
-                            take_rvi_cf = rvi_branch[i] & (bht_prediction.taken | bht_prediction.strongly_taken);
-                            take_rvc_cf = rvc_branch[i] & (bht_prediction.taken | bht_prediction.strongly_taken);
-                        // default to static prediction
-                        end else begin
-                            // set if immediate is negative - static prediction
-                            take_rvi_cf = rvi_branch[i] & rvi_imm[i][63];
-                            take_rvc_cf = rvc_branch[i] & rvc_imm[i][63];
-                        end
-                    end
-
-                    // unconditional jumps
-                    if (rvi_jump[i] || rvc_jump[i]) begin
-                        take_rvi_cf = rvi_jump[i];
-                        take_rvc_cf = rvc_jump[i];
-                    end
-
-                    // to take this jump we need a valid prediction target **speculative**
-                    if ((rvi_jalr[i] || rvc_jalr[i]) && ~(rvi_call[i] || rvc_call[i])) begin
-                        bp_sbe.cf_type = BTB;
-                        if (btb_prediction.valid) begin
-                            bp_vaddr = btb_prediction.target_address;
-                            taken[i+1] = 1'b1;
-                        end
-                    end
-
-                    // is it a return and the RAS contains a valid prediction? **speculative**
-                    if ((rvi_return[i] || rvc_return[i]) && ras_predict.valid) begin
-                        bp_vaddr = ras_predict.ra;
-                        ras_pop = 1'b1;
-                        taken[i+1] = 1'b1;
-                        bp_sbe.cf_type = RAS;
-                    end
-
-                    if (take_rvi_cf) begin
-                        taken[i+1] = 1'b1;
-                        bp_vaddr = addr[i] + rvi_imm[i];
-                    end
-
-                    if (take_rvc_cf) begin
-                        taken[i+1] = 1'b1;
-                        bp_vaddr = addr[i] + rvc_imm[i];
-                    end
-
-                    // we are not interested in the lower instruction
-                    if (icache_vaddr_q[1]) begin
-                        taken[1] = 1'b0;
-                        // TODO(zarubaf): that seems to be overly pessimistic
-                        ras_pop = 1'b0;
-                        ras_push = 1'b0;
-                    end
-                end
+      // lowermost prediction gets precedence
+      for (int i = INSTR_PER_FETCH - 1; i >= 0 ; i--) begin
+        unique case ({is_branch[i], is_return[i], is_jump[i], is_jalr[i]})
+          4'b0000:; // regular instruction e.g.: no branch
+          // unconditional jump to register, we need the BTB to resolve this
+          4'b0001: begin
+            ras_pop = 1'b0;
+            ras_push = 1'b0;
+            if (btb_prediction_shifted[i].valid) begin
+              predict_address = btb_prediction_shifted[i].target_address;
+              cf_type[i] = ariane_pkg::JumpR;
            end
-        end
-
-        bp_valid = |taken;
-        // assemble scoreboard entry
-        bp_sbe.valid = bp_valid;
-        bp_sbe.predict_address = bp_vaddr;
-        bp_sbe.predict_taken = bp_valid;
+          end
+          // it's an unconditional jump to an immediate
+          4'b0010: begin
+            ras_pop = 1'b0;
+            ras_push = 1'b0;
+            taken_rvi_cf[i] = rvi_jump[i];
+            taken_rvc_cf[i] = rvc_jump[i];
+            cf_type[i] = ariane_pkg::Jump;
+          end
+          // return
+          4'b0100: begin
+            // make sure to only alter the RAS if we actually consumed the instruction
+            ras_pop = ras_predict.valid & instr_queue_consumed[i];
+            ras_push = 1'b0;
+            predict_address = ras_predict.ra;
+            cf_type[i] = ariane_pkg::Return;
+          end
+          // branch prediction
+          4'b1000: begin
+            ras_pop = 1'b0;
+            ras_push = 1'b0;
+            // if we have a valid dynamic prediction use it
+            if (bht_prediction_shifted[i].valid) begin
+              taken_rvi_cf[i] = rvi_branch[i] & bht_prediction_shifted[i].taken;
+              taken_rvc_cf[i] = rvc_branch[i] & bht_prediction_shifted[i].taken;
+            // otherwise default to static prediction
+            end else begin
+              // set if immediate is negative - static prediction
+              taken_rvi_cf[i] = rvi_branch[i] & rvi_imm[i][63];
+              taken_rvc_cf[i] = rvc_branch[i] & rvc_imm[i][63];
+            end
+            if (taken_rvi_cf[i] || taken_rvc_cf[i]) cf_type[i] = ariane_pkg::Branch;
+          end
+          default:;
+            // default: $error("Decoded more than one control flow");
+        endcase
+          // if this instruction, in addition, is a call, save the resulting address
+          // but only if we actually consumed the instruction
+          if (is_call[i]) begin
+            ras_push = instr_queue_consumed[i];
+            ras_update = addr[i] + (rvc_call[i] ? 2 : 4);
+          end
+          // calculate the jump target address
+          if (taken_rvc_cf[i] || taken_rvi_cf[i]) begin
+            predict_address = addr[i] + (taken_rvc_cf[i] ? rvc_imm[i] : rvi_imm[i]);
+          end
+      end
+    end
+    // OR-reduce: the prediction is valid if any instruction has a control flow type other than NoCF
+    always_comb begin
+      bp_valid = 1'b0;
+      for (int i = 0; i < INSTR_PER_FETCH; i++) bp_valid |= (cf_type[i] != NoCF);
    end
-
    assign is_mispredict = resolved_branch_i.valid & resolved_branch_i.is_mispredict;
-    // we mis-predicted so kill the icache request and the fetch queue
-    assign icache_dreq_o.kill_s1 = is_mispredict | flush_i;
-    // if we have a valid branch-prediction we need to kill the last cache request
-    assign icache_dreq_o.kill_s2 = icache_dreq_o.kill_s1 | bp_valid;
-    assign fifo_valid = icache_valid_q;

-    // ----------------------------------------
+    // Cache interface
+    assign icache_dreq_o.req = instr_queue_ready;
+    assign if_ready = icache_dreq_i.ready & instr_queue_ready;
+    // We need to flush the cache pipeline if:
+    // 1. We mispredicted
+    // 2. Want to flush the whole processor front-end
+    // 3. Need to replay an instruction because the fetch-fifo was full
+    assign icache_dreq_o.kill_s1 = is_mispredict | flush_i | replay;
+    // if we have a valid branch-prediction we need to only kill the last cache request
+    // also if we killed the first stage we also need to kill the second stage (inclusive flush)
+    assign icache_dreq_o.kill_s2 = icache_dreq_o.kill_s1 | bp_valid;
+
    // Update Control Flow Predictions
-    // ----------------------------------------
-    // BHT
-    assign bht_update.valid = resolved_branch_i.valid & (resolved_branch_i.cf_type == BHT);
+    bht_update_t bht_update;
+    btb_update_t btb_update;
+
+    assign bht_update.valid = resolved_branch_i.valid
+                                & (resolved_branch_i.cf_type == ariane_pkg::Branch);
    assign bht_update.pc    = resolved_branch_i.pc;
-    assign bht_update.mispredict = resolved_branch_i.is_mispredict;
    assign bht_update.taken = resolved_branch_i.is_taken;
-    // BTB
-    assign btb_update.valid = resolved_branch_i.valid & (resolved_branch_i.cf_type == BTB);
+    // only update mispredicted branches e.g. no returns from the RAS
+    assign btb_update.valid = resolved_branch_i.valid
+                                & resolved_branch_i.is_mispredict
+                                & (resolved_branch_i.cf_type == ariane_pkg::JumpR);
    assign btb_update.pc    = resolved_branch_i.pc;
    assign btb_update.target_address = resolved_branch_i.target_address;

@ -284,7 +265,7 @@ module frontend #(
    // Next PC
    // -------------------
    // next PC (NPC) can come from (in order of precedence):
-    // 0. Default assignment
+    // 0. Default assignment/replay instruction
    // 1. Branch Predict taken
    // 2. Control flow change request (misprediction)
    // 3. Return from environment call
@ -293,211 +274,160 @@ module frontend #(
    // Mis-predict handling is a little bit different
    // select PC a.k.a PC Gen
    always_comb begin : npc_select
-        automatic logic [63:0] fetch_address;
-
-        // check whether we come out of reset
-        // this is a workaround. some tools have issues
-        // having boot_addr_i in the asynchronous
-        // reset assignment to npc_q, even though
-        // boot_addr_i will be assigned a constant
-        // on the top-level.
-        if (npc_rst_load_q) begin
-            npc_d         = boot_addr_i;
-            fetch_address = boot_addr_i;
-        end else begin
-            fetch_address    = npc_q;
-            // keep stable by default
-            npc_d            = npc_q;
-        end
-
-        // -------------------------------
-        // 1. Branch Prediction
-        // -------------------------------
-        if (bp_valid) begin
-            fetch_address = bp_vaddr;
-            npc_d = bp_vaddr;
-        end
-        // -------------------------------
-        // 0. Default assignment
-        // -------------------------------
-        if (if_ready) begin
-            npc_d = {fetch_address[63:2], 2'b0}  + 'h4;
-        end
-        // -------------------------------
-        // 2. Control flow change request
-        // -------------------------------
-        if (is_mispredict) begin
-            npc_d = resolved_branch_i.target_address;
-        end
-        // -------------------------------
-        // 3. Return from environment call
-        // -------------------------------
-        if (eret_i) begin
-            npc_d = epc_i;
-        end
-        // -------------------------------
-        // 4. Exception/Interrupt
-        // -------------------------------
-        if (ex_valid_i) begin
-            npc_d    = trap_vector_base_i;
-        end
-        // -----------------------------------------------
-        // 5. Pipeline Flush because of CSR side effects
-        // -----------------------------------------------
-        // On a pipeline flush start fetching from the next address
-        // of the instruction in the commit stage
-        if (set_pc_commit_i) begin
-            // we came here from a flush request of a CSR instruction or AMO,
-            // as CSR or AMO instructions do not exist in a compressed form
-            // we can unconditionally do PC + 4 here
-            // TODO(zarubaf) This adder can at least be merged with the one in the csr_regfile stage
-            npc_d    = pc_commit_i + 64'h4;
-        end
-        // -------------------------------
-        // 6. Debug
-        // -------------------------------
-        // enter debug on a hard-coded base-address
-        if (set_debug_pc_i) begin
-            npc_d = DmBaseAddress + dm::HaltAddress;
-        end
-
-        icache_dreq_o.vaddr = fetch_address;
+      automatic logic [63:0] fetch_address;
+      // check whether we come out of reset
+      // this is a workaround. some tools have issues
+      // having boot_addr_i in the asynchronous
+      // reset assignment to npc_q, even though
+      // boot_addr_i will be assigned a constant
+      // on the top-level.
+      if (npc_rst_load_q) begin
+        npc_d         = boot_addr_i;
+        fetch_address = boot_addr_i;
+      end else begin
+        fetch_address    = npc_q;
+        // keep stable by default
+        npc_d            = npc_q;
+      end
+      // 0. Branch Prediction
+      if (bp_valid) begin
+        fetch_address = predict_address;
+        npc_d = predict_address;
+      end
+      // 1. Default assignment
+      if (if_ready) npc_d = {fetch_address[63:2], 2'b0}  + 'h4;
+      // 2. Replay instruction fetch
+      if (replay) npc_d = replay_addr;
+      // 3. Control flow change request
+      if (is_mispredict) npc_d = resolved_branch_i.target_address;
+      // 4. Return from environment call
+      if (eret_i) npc_d = epc_i;
+      // 5. Exception/Interrupt
+      if (ex_valid_i) npc_d = trap_vector_base_i;
+      // 6. Pipeline Flush because of CSR side effects
+      // On a pipeline flush start fetching from the next address
+      // of the instruction in the commit stage
+      // we came here from a flush request of a CSR instruction or AMO,
+      // as CSR or AMO instructions do not exist in a compressed form
+      // we can unconditionally do PC + 4 here
+      // TODO(zarubaf) This adder can at least be merged with the one in the csr_regfile stage
+      if (set_pc_commit_i) npc_d = pc_commit_i + 64'h4;
+      // 7. Debug
+      // enter debug on a hard-coded base-address
+      if (set_debug_pc_i) npc_d = ArianeCfg.DmBaseAddress + dm::HaltAddress;
+      icache_dreq_o.vaddr = fetch_address;
    end

-    // -------------------
-    // Credit-based fetch FIFO flow ctrl
-    // -------------------
-    assign fifo_credits_d       =  (flush_i) ? FETCH_FIFO_DEPTH :
-                                               fifo_credits_q + fifo_pop + s2_eff_kill - issue_req;
-
-    // check whether there is a request in flight that is being killed now
-    // if this is the case, we need to increment the credit by 1
-    assign s2_eff_kill         = s2_in_flight_q & icache_dreq_o.kill_s2;
-    assign s2_in_flight_d      = (flush_i)             ? 1'b0 :
-                                 (issue_req)           ? 1'b1 :
-                                 (icache_dreq_i.valid) ? 1'b0 :
-                                                         s2_in_flight_q;
-
-    // only enable counter if current request is not being killed
-    assign issue_req           = if_ready & (~icache_dreq_o.kill_s1);
-    assign fifo_pop            = fetch_ack_i & fetch_entry_valid_o;
-    assign fifo_ready          = (|fifo_credits_q);
-    assign if_ready            =  icache_dreq_i.ready & fifo_ready;
-    assign icache_dreq_o.req   =  fifo_ready;
-    assign fetch_entry_valid_o = ~fifo_empty;
-
-
-//pragma translate_off
-`ifndef VERILATOR
-  fetch_fifo_credits0 : assert property (
-      @(posedge clk_i) disable iff (~rst_ni) (fifo_credits_q <= FETCH_FIFO_DEPTH))
-         else $fatal(1,"[frontend] fetch fifo credits must be <= FETCH_FIFO_DEPTH!");
-    initial begin
-        assert (FETCH_FIFO_DEPTH <= 8) else $fatal(1,"[frontend] fetch fifo deeper than 8 not supported");
-        assert (FETCH_WIDTH == 32) else $fatal(1,"[frontend] fetch width != not supported");
-    end
-`endif
-//pragma translate_on
+    logic [FETCH_WIDTH-1:0] icache_data;
+    // re-align the cache line
+    assign icache_data = icache_dreq_i.data >> {shamt, 4'b0};

    always_ff @(posedge clk_i or negedge rst_ni) begin
-        if (~rst_ni) begin
-            npc_q                <= '0;
-            npc_rst_load_q       <= 1'b1;
-            icache_data_q        <= '0;
-            icache_valid_q       <= 1'b0;
-            icache_vaddr_q       <= 'b0;
-            icache_ex_valid_q    <= 1'b0;
-            unaligned_q          <= 1'b0;
-            unaligned_address_q  <= '0;
-            unaligned_instr_q    <= '0;
-            fifo_credits_q       <= FETCH_FIFO_DEPTH;
-            s2_in_flight_q       <= 1'b0;
-        end else begin
-            npc_rst_load_q       <= 1'b0;
-            npc_q                <= npc_d;
-            icache_data_q        <= icache_dreq_i.data;
-            icache_valid_q       <= icache_dreq_i.valid;
-            icache_vaddr_q       <= icache_dreq_i.vaddr;
-            icache_ex_valid_q    <= icache_dreq_i.ex.valid;
-            unaligned_q          <= unaligned_d;
-            unaligned_address_q  <= unaligned_address_d;
-            unaligned_instr_q    <= unaligned_instr_d;
-            fifo_credits_q       <= fifo_credits_d;
-            s2_in_flight_q       <= s2_in_flight_d;
+      if (!rst_ni) begin
+        npc_rst_load_q    <= 1'b1; // makes npc_select load boot_addr_i on the first cycle after reset
+        npc_q             <= '0;
+        icache_data_q     <= '0;
+        icache_valid_q    <= 1'b0;
+        icache_vaddr_q    <= 'b0;
+        icache_ex_valid_q <= 1'b0;
+        btb_q             <= '0;
+        bht_q             <= '0;
+      end else begin
+        npc_rst_load_q    <= 1'b0;
+        npc_q             <= npc_d;
+        icache_valid_q    <= icache_dreq_i.valid; // the valid flag is registered every cycle
+        if (icache_dreq_i.valid) begin // payload registers only capture on a valid cache response
+          icache_data_q        <= icache_data; // already re-aligned (shifted by shamt)
+          icache_vaddr_q       <= icache_dreq_i.vaddr;
+          icache_ex_valid_q    <= icache_dreq_i.ex.valid; // select the valid flag explicitly, do not rely on truncating the whole exception
+          // save the uppermost prediction
+          btb_q                <= btb_prediction[INSTR_PER_FETCH-1];
+          bht_q                <= bht_prediction[INSTR_PER_FETCH-1];
        end
+      end
    end

    ras #(
-        .DEPTH  ( RAS_DEPTH   )
+      .DEPTH  ( ArianeCfg.RASDepth  )
    ) i_ras (
-        .clk_i,
-        .rst_ni,
-        .flush_i( flush_bp_i  ),
-        .push_i ( ras_push    ),
-        .pop_i  ( ras_pop     ),
-        .data_i ( ras_update  ),
-        .data_o ( ras_predict )
+      .clk_i,
+      .rst_ni,
+      .flush_i( flush_bp_i  ),
+      .push_i ( ras_push    ),
+      .pop_i  ( ras_pop     ),
+      .data_i ( ras_update  ),
+      .data_o ( ras_predict )
    );

    btb #(
-        .NR_ENTRIES       ( BTB_ENTRIES      )
+      .NR_ENTRIES       ( ArianeCfg.BTBEntries   )
    ) i_btb (
-        .clk_i,
-        .rst_ni,
-        .flush_i          ( flush_bp_i       ),
-        .debug_mode_i,
-        .vpc_i            ( icache_vaddr_q   ),
-        .btb_update_i     ( btb_update       ),
-        .btb_prediction_o ( btb_prediction   )
+      .clk_i,
+      .rst_ni,
+      .flush_i          ( flush_bp_i       ),
+      .debug_mode_i,
+      .vpc_i            ( icache_vaddr_q   ),
+      .btb_update_i     ( btb_update       ),
+      .btb_prediction_o ( btb_prediction   )
    );

    bht #(
-        .NR_ENTRIES       ( BHT_ENTRIES      )
+      .NR_ENTRIES       ( ArianeCfg.BHTEntries   )
    ) i_bht (
-        .clk_i,
-        .rst_ni,
-        .flush_i          ( flush_bp_i       ),
-        .debug_mode_i,
-        .vpc_i            ( icache_vaddr_q   ),
-        .bht_update_i     ( bht_update       ),
-        .bht_prediction_o ( bht_prediction   )
+      .clk_i,
+      .rst_ni,
+      .flush_i          ( flush_bp_i       ),
+      .debug_mode_i,
+      .vpc_i            ( icache_vaddr_q   ),
+      .bht_update_i     ( bht_update       ),
+      .bht_prediction_o ( bht_prediction   )
    );

-    for (genvar i = 0; i < INSTR_PER_FETCH; i++) begin
-        instr_scan i_instr_scan (
-            .instr_i      ( instr[i]      ),
-            .is_rvc_o     ( is_rvc[i]     ),
-            .rvi_return_o ( rvi_return[i] ),
-            .rvi_call_o   ( rvi_call[i]   ),
-            .rvi_branch_o ( rvi_branch[i] ),
-            .rvi_jalr_o   ( rvi_jalr[i]   ),
-            .rvi_jump_o   ( rvi_jump[i]   ),
-            .rvi_imm_o    ( rvi_imm[i]    ),
-            .rvc_branch_o ( rvc_branch[i] ),
-            .rvc_jump_o   ( rvc_jump[i]   ),
-            .rvc_jr_o     ( rvc_jr[i]     ),
-            .rvc_return_o ( rvc_return[i] ),
-            .rvc_jalr_o   ( rvc_jalr[i]   ),
-            .rvc_call_o   ( rvc_call[i]   ),
-            .rvc_imm_o    ( rvc_imm[i]    )
-        );
+    // we need to inspect up to INSTR_PER_FETCH instructions for branches
+    // and jumps
+    for (genvar i = 0; i < INSTR_PER_FETCH; i++) begin : gen_instr_scan
+      instr_scan i_instr_scan (
+        .instr_i      ( instr[i]      ),
+        .rvi_return_o ( rvi_return[i] ),
+        .rvi_call_o   ( rvi_call[i]   ),
+        .rvi_branch_o ( rvi_branch[i] ),
+        .rvi_jalr_o   ( rvi_jalr[i]   ),
+        .rvi_jump_o   ( rvi_jump[i]   ),
+        .rvi_imm_o    ( rvi_imm[i]    ),
+        .rvc_branch_o ( rvc_branch[i] ),
+        .rvc_jump_o   ( rvc_jump[i]   ),
+        .rvc_jr_o     ( rvc_jr[i]     ),
+        .rvc_return_o ( rvc_return[i] ),
+        .rvc_jalr_o   ( rvc_jalr[i]   ),
+        .rvc_call_o   ( rvc_call[i]   ),
+        .rvc_imm_o    ( rvc_imm[i]    )
+      );
    end

-    fifo_v3 #(
-        .DEPTH        (  8                   ),
-        .dtype        ( frontend_fetch_t     )
-    ) i_fetch_fifo (
-        .clk_i       ( clk_i                 ),
-        .rst_ni      ( rst_ni                ),
-        .flush_i     ( flush_i               ),
-        .testmode_i  ( 1'b0                  ),
-        .full_o      (                       ),
-        .empty_o     ( fifo_empty            ),
-        .usage_o     (                       ),
-        .data_i      ( {icache_vaddr_q, icache_data_q, bp_sbe, taken[INSTR_PER_FETCH:1], icache_ex_valid_q} ),
-        .push_i      ( fifo_valid            ),
-        .data_o      ( fetch_entry_o         ),
-        .pop_i       ( fifo_pop              )
+    instr_queue i_instr_queue (
+      .clk_i               ( clk_i                ),
+      .rst_ni              ( rst_ni               ),
+      .flush_i             ( flush_i              ),
+      .instr_i             ( instr                ), // from re-aligner
+      .addr_i              ( addr                 ), // from re-aligner
+      .exception_i         ( icache_ex_valid_q    ), // from I$
+      .predict_address_i   ( predict_address      ),
+      .cf_type_i           ( cf_type              ),
+      .valid_i             ( instruction_valid    ), // from re-aligner
+      .consumed_o          ( instr_queue_consumed ),
+      .ready_o             ( instr_queue_ready    ),
+      .replay_o            ( replay               ),
+      .replay_addr_o       ( replay_addr          ),
+      .fetch_entry_o       ( fetch_entry_o        ), // to back-end
+      .fetch_entry_valid_o ( fetch_entry_valid_o  ), // to back-end
+      .fetch_entry_ready_i ( fetch_entry_ready_i  )  // to back-end
    );

+    // pragma translate_off
+    `ifndef VERILATOR
+      initial begin // simulation-only sanity check of the fetch width parameter
+        assert (FETCH_WIDTH == 32 || FETCH_WIDTH == 64) else $fatal(1, "[frontend] fetch width not supported (must be 32 or 64)");
+      end
+    `endif
+    // pragma translate_on
 endmodule
--- a/src/frontend/instr_queue.sv
+++ b/src/frontend/instr_queue.sv
@ -0,0 +1,353 @@
+// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License.  You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// Author: Florian Zaruba, ETH Zurich
+// Date: 26.10.2018
+
+// Description: Instruction Queue, separates instruction front-end from processor
+//              back-end.
+//
+// This is an optimized instruction queue which supports the handling of
+// compressed instructions (16 bit instructions). Internally it is organized as
+// INSTR_PER_FETCH x 32 bit queues which are filled in a consecutive manner. Two pointers
+// (`idx_is_q` and `idx_ds_q`) point to the fill port and the read port. The read port
+// is designed so that it will easily allow for multiple issue implementation.
+// The input supports arbitrary power of two instruction fetch widths.
+//
+// The queue supports handling of branch prediction and will take care of
+// only saving a valid instruction stream.
+//
+// Furthermore it contains a replay interface in case the instruction queue
+// is already full. As instructions are in general easily replayed this should
+// increase the efficiency as I$ misses are potentially hidden. This stands in
+// contrast to pessimistic actions (early stalling) or credit based approaches.
+// Credit based systems might be difficult to implement with the current system
+// as we do not exactly know how much space we are going to need in the fifos
+// as each instruction can take either one or two slots.
+//
+// So the consumed/valid interface degenerates to an `information` interface. If the
+// upstream circuit keeps pushing, the queue will discard the information
+// and start replaying from the point where it could last manage to accept instructions.
+//
+// The instruction front-end will stop issuing instructions as soon as the
+// fifo is full. This will gate the logic if the processor is e.g.: halted
+//
+// TODO(zarubaf): The instruction queues can be reduced to 16 bit. Potentially
+// the replay mechanism gets more complicated as it can be that a 32 bit instruction
+// can not be pushed at once.
+
+module instr_queue (
+  input  logic                                               clk_i,
+  input  logic                                               rst_ni,
+  input  logic                                               flush_i,
+  input  logic [ariane_pkg::INSTR_PER_FETCH-1:0][31:0]       instr_i,
+  input  logic [ariane_pkg::INSTR_PER_FETCH-1:0][63:0]       addr_i,
+  input  logic [ariane_pkg::INSTR_PER_FETCH-1:0]             valid_i,
+  output logic                                               ready_o,
+  output logic [ariane_pkg::INSTR_PER_FETCH-1:0]             consumed_o,
+  // we've encountered an exception, at this point the only possible exceptions are page-table faults
+  input  logic                                               exception_i,
+  // branch predict
+  input  logic [63:0]                                        predict_address_i,
+  input  ariane_pkg::cf_t  [ariane_pkg::INSTR_PER_FETCH-1:0] cf_type_i,
+  // replay instruction because one of the FIFO was already full
+  output logic                                               replay_o,
+  output logic [63:0]                                        replay_addr_o, // address at which to replay this instruction
+  // to processor backend
+  output ariane_pkg::fetch_entry_t                           fetch_entry_o,
+  output logic                                               fetch_entry_valid_o,
+  input  logic                                               fetch_entry_ready_i
+);
+
+  typedef struct packed { // payload of one instruction FIFO entry
+    logic [31:0]     instr; // instruction word
+    ariane_pkg::cf_t cf;    // control-flow type taken from `cf_type_i`; anything but NoCF is treated as taken
+    logic            ex;    // exception flag, copied from `exception_i` for the whole fetch packet
+  } instr_data_t;
+
+  logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] branch_index;
+  // instruction queues
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0]
+        [$clog2(ariane_pkg::FETCH_FIFO_DEPTH)-1:0] instr_queue_usage;
+  instr_data_t [ariane_pkg::INSTR_PER_FETCH-1:0]   instr_data_in, instr_data_out;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0]          push_instr, push_instr_fifo;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0]          pop_instr;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0]          instr_queue_full;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0]          instr_queue_empty;
+  logic instr_overflow;
+  // address queue
+  logic [$clog2(ariane_pkg::FETCH_FIFO_DEPTH)-1:0] address_queue_usage;
+  logic [63:0] address_out;
+  logic pop_address;
+  logic push_address;
+  logic full_address;
+  logic empty_address;
+  logic address_overflow;
+  // input stream counter
+  logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] idx_is_d, idx_is_q;
+  // Registers
+  // output FIFO select, one-hot
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0] idx_ds_d, idx_ds_q;
+  logic [63:0] pc_d, pc_q; // current PC
+  logic reset_address_d, reset_address_q; // we need to re-set the address because of a flush
+
+  logic [ariane_pkg::INSTR_PER_FETCH*2-2:0] branch_mask_extended;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0] branch_mask;
+  logic branch_empty;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0] taken;
+  // shift amount, e.g.: instructions we want to retire
+  logic [$clog2(ariane_pkg::INSTR_PER_FETCH):0] popcount;
+  logic [$clog2(ariane_pkg::INSTR_PER_FETCH)-1:0] shamt;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0] valid;
+  logic [ariane_pkg::INSTR_PER_FETCH*2-1:0] consumed_extended;
+  // FIFO mask
+  logic [ariane_pkg::INSTR_PER_FETCH*2-1:0] fifo_pos_extended;
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0] fifo_pos;
+  logic [ariane_pkg::INSTR_PER_FETCH*2-1:0][31:0] instr;
+  ariane_pkg::cf_t [ariane_pkg::INSTR_PER_FETCH*2-1:0] cf;
+  // replay interface
+  logic [ariane_pkg::INSTR_PER_FETCH-1:0] instr_overflow_fifo;
+
+  assign ready_o = ~(|instr_queue_full) & ~full_address;
+
+  for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_unpack_taken
+    assign taken[i] = cf_type_i[i] != ariane_pkg::NoCF;
+  end
+  // calculate a branch mask, e.g.: get the first taken branch
+  lzc #(
+    .WIDTH   ( ariane_pkg::INSTR_PER_FETCH ),
+    .MODE    ( 0                           ) // count trailing zeros
+  ) i_lzc_branch_index (
+    .in_i    ( taken          ), // we want to count trailing zeros
+    .cnt_o   ( branch_index   ), // first branch on branch_index
+    .empty_o ( branch_empty   )
+  );
+  // the first index is for sure valid
+  // for example (64 bit fetch):
+  // taken mask: 0 1 1 0
+  // trailing zero count = 1
+  // 0 0 0 1, 1 1 1 << 1 = 0 0 1 1, 1 1 0
+  // take the upper 4 bits: 0 0 1 1
+  assign branch_mask_extended = {{{ariane_pkg::INSTR_PER_FETCH-1}{1'b0}}, {{ariane_pkg::INSTR_PER_FETCH}{1'b1}}} << branch_index;
+  assign branch_mask = branch_mask_extended[ariane_pkg::INSTR_PER_FETCH * 2 - 2:ariane_pkg::INSTR_PER_FETCH - 1];
+
+  // mask with taken branches to get the actual amount of instructions we want to push
+  assign valid = valid_i & branch_mask;
+  // rotate right again
+  assign consumed_extended = {push_instr_fifo, push_instr_fifo} >> idx_is_q;
+  assign consumed_o = consumed_extended[ariane_pkg::INSTR_PER_FETCH-1:0];
+  // count the numbers of valid instructions we've pushed from this package
+  popcount #(
+    .INPUT_WIDTH   ( ariane_pkg::INSTR_PER_FETCH )
+  ) i_popcount (
+    .data_i     ( push_instr_fifo ),
+    .popcount_o ( popcount        )
+  );
+  assign shamt = popcount[$bits(shamt)-1:0];
+
+  // save the shift amount for next cycle
+  assign idx_is_d = idx_is_q + shamt;
+
+  // ----------------------
+  // Input interface
+  // ----------------------
+  // rotate left by the current position
+  assign fifo_pos_extended = { valid, valid } << idx_is_q;
+  // we just care about the upper bits
+  assign fifo_pos = fifo_pos_extended[ariane_pkg::INSTR_PER_FETCH*2-1:ariane_pkg::INSTR_PER_FETCH];
+  // the fifo_position signal can directly be used to guide the push signal of each FIFO
+  // make sure it is not full
+  assign push_instr = fifo_pos & ~instr_queue_full;
+
+  // duplicate the entries for easier selection e.g.: 3 2 1 0 3 2 1 0
+  for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_duplicate_instr_input
+    assign instr[i] = instr_i[i];
+    assign instr[i + ariane_pkg::INSTR_PER_FETCH] = instr_i[i];
+    assign cf[i] = cf_type_i[i];
+    assign cf[i + ariane_pkg::INSTR_PER_FETCH] = cf_type_i[i];
+  end
+
+  // shift the inputs
+  for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_fifo_input_select
+    /* verilator lint_off WIDTH */
+    assign instr_data_in[i].instr = instr[i + idx_is_q];
+    assign instr_data_in[i].cf = cf[i + idx_is_q];
+    assign instr_data_in[i].ex = exception_i; // exceptions hold for the whole fetch packet
+    /* verilator lint_on WIDTH */
+  end
+
+  // ----------------------
+  // Replay Logic
+  // ----------------------
+  // We need to replay an instruction fetch iff:
+  // 1. One of the instruction data FIFOs was full and we needed it
+  // (e.g.: we pushed and it was full)
+  // 2. The address/branch predict FIFO was full
+  // if one of the FIFOs was full we need to replay the faulting instruction
+  assign instr_overflow_fifo = instr_queue_full & fifo_pos;
+  assign instr_overflow = |instr_overflow_fifo; // at least one instruction overflowed
+  assign address_overflow = full_address & push_address;
+  assign replay_o = instr_overflow | address_overflow;
+
+  // select the address, in the case of an address fifo overflow just
+  // use the base of this package
+  // if we successfully pushed some instructions we can output the next instruction
+  // which we didn't manage to push
+  assign replay_addr_o = (address_overflow) ? addr_i[0] : addr_i[shamt];
+
+  // ----------------------
+  // Downstream interface
+  // ----------------------
+  // as long as there is at least one queue which can take the value we have a valid instruction
+  assign fetch_entry_valid_o = ~(&instr_queue_empty);
+
+  always_comb begin // output mux: present the FIFO entry selected by the one-hot idx_ds_q
+    idx_ds_d = idx_ds_q;
+    pop_instr = '0;
+    // assemble fetch entry
+    fetch_entry_o.instruction = '0;
+    fetch_entry_o.address = pc_q;
+    fetch_entry_o.ex.valid = 1'b0;
+    // This is the only exception which can occur up to this point.
+    fetch_entry_o.ex.cause = riscv::INSTR_PAGE_FAULT;
+    fetch_entry_o.ex.tval = '0;
+    fetch_entry_o.branch_predict.predict_address = address_out;
+    fetch_entry_o.branch_predict.cf = ariane_pkg::NoCF; // default, so `cf` is driven even if no select bit is set
+    // output mux select
+    for (int unsigned i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
+      if (idx_ds_q[i]) begin
+        fetch_entry_o.instruction = instr_data_out[i].instr;
+        fetch_entry_o.ex.valid = instr_data_out[i].ex;
+        fetch_entry_o.ex.tval  = pc_q;
+        fetch_entry_o.branch_predict.cf = instr_data_out[i].cf;
+        pop_instr[i] = fetch_entry_valid_o & fetch_entry_ready_i;
+      end
+    end
+    // rotate the pointer left, but only when an entry is actually handed over (same condition as the pop)
+    if (fetch_entry_valid_o && fetch_entry_ready_i) begin
+      idx_ds_d = {idx_ds_q[ariane_pkg::INSTR_PER_FETCH-2:0], idx_ds_q[ariane_pkg::INSTR_PER_FETCH-1]};
+    end
+  end
+
+  // TODO(zarubaf): This needs to change for dual-issue
+  // if the handshaking is successful and we had a prediction pop one address entry
+  assign pop_address = ((fetch_entry_o.branch_predict.cf != ariane_pkg::NoCF) & |pop_instr);
+
+  // ----------------------
+  // Calculate (Next) PC
+  // ----------------------
+  always_comb begin // next value of the PC that accompanies the entry at the read port
+    pc_d = pc_q;
+    reset_address_d = flush_i ? 1'b1 : reset_address_q;
+
+    if (fetch_entry_valid_o && fetch_entry_ready_i) begin
+      // TODO(zarubaf): This needs to change for a dual issue implementation
+      // advance the PC only on a completed handshake: +2 for a compressed instruction, +4 otherwise
+      pc_d =  pc_q + ((fetch_entry_o.instruction[1:0] != 2'b11) ? 'd2 : 'd4);
+    end
+
+    if (pop_address) pc_d = address_out; // a control-flow entry was popped: continue at its stored address
+
+    // we previously flushed so we need to reset the address (takes priority over the updates above)
+    if (valid_i[0] && reset_address_q) begin
+      // this is the base of the first instruction
+      pc_d = addr_i[0];
+      reset_address_d = 1'b0;
+    end
+  end
+
+  // FIFOs
+  for (genvar i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin : gen_instr_fifo
+    // Make sure we don't save any instructions if we couldn't save the address
+    assign push_instr_fifo[i] = push_instr[i] & ~address_overflow;
+    fifo_v3 #(
+      .DEPTH      ( ariane_pkg::FETCH_FIFO_DEPTH ),
+      .dtype      ( instr_data_t                 )
+    ) i_fifo_instr_data (
+      .clk_i      ( clk_i                ),
+      .rst_ni     ( rst_ni               ),
+      .flush_i    ( flush_i              ),
+      .testmode_i ( 1'b0                 ),
+      .full_o     ( instr_queue_full[i]  ),
+      .empty_o    ( instr_queue_empty[i] ),
+      .usage_o    ( instr_queue_usage[i] ),
+      .data_i     ( instr_data_in[i]     ),
+      .push_i     ( push_instr_fifo[i]   ),
+      .data_o     ( instr_data_out[i]    ),
+      .pop_i      ( pop_instr[i]         )
+    );
+  end
+  // or-reduce and check whether we are pushing a taken branch (it might be that the corresponding
+  // fifo is full).
+  always_comb begin
+    push_address = 1'b0;
+    // check if we are pushing a ctrl flow change, if so save the address
+    for (int i = 0; i < ariane_pkg::INSTR_PER_FETCH; i++) begin
+      push_address |= push_instr[i] & (instr_data_in[i].cf != ariane_pkg::NoCF);
+    end
+  end
+
+  fifo_v3 #(
+    .DEPTH      ( ariane_pkg::FETCH_FIFO_DEPTH ), // TODO(zarubaf): Fork out to separate param
+    .DATA_WIDTH ( 64                           )
+  ) i_fifo_address (
+    .clk_i      ( clk_i                        ),
+    .rst_ni     ( rst_ni                       ),
+    .flush_i    ( flush_i                      ),
+    .testmode_i ( 1'b0                         ),
+    .full_o     ( full_address                 ),
+    .empty_o    ( empty_address                ),
+    .usage_o    ( address_queue_usage          ),
+    .data_i     ( predict_address_i            ),
+    .push_i     ( push_address & ~full_address ),
+    .data_o     ( address_out                  ),
+    .pop_i      ( pop_address                  )
+  );
+
+  unread i_unread_address_fifo (.d_i(|{empty_address, address_queue_usage}));
+  unread i_unread_branch_mask (.d_i(|branch_mask_extended));
+  unread i_unread_lzc (.d_i(|{branch_empty}));
+  unread i_unread_fifo_pos (.d_i(|fifo_pos_extended)); // we don't care about the lower signals
+  unread i_unread_instr_fifo (.d_i(|instr_queue_usage));
+
+  always_ff @(posedge clk_i or negedge rst_ni) begin // state registers of the queue
+    if (!rst_ni) begin
+      idx_ds_q        <= 'b1; // one-hot read select starts at FIFO 0
+      idx_is_q        <= '0;  // binary fill index starts at FIFO 0
+      pc_q            <= '0;
+      reset_address_q <= 1'b1; // PC is re-loaded from addr_i[0] once valid_i[0] is seen
+    end else begin
+      pc_q            <= pc_d;
+      reset_address_q <= reset_address_d;
+      if (flush_i) begin
+          // one-hot encoded
+          idx_ds_q        <= 'b1;
+          // binary encoded
+          idx_is_q        <= '0;
+          reset_address_q <= 1'b1; // overrides the assignment above; reset_address_d is also 1 while flush_i is set
+      end else begin
+          idx_ds_q        <= idx_ds_d;
+          idx_is_q        <= idx_is_d;
+      end
+    end
+  end
+
+  // pragma translate_off
+  `ifndef VERILATOR
+      replay_address_fifo: assert property ( // a replay must never coincide with a push into the address FIFO
+        @(posedge clk_i) disable iff (!rst_ni) replay_o |-> !i_fifo_address.push_i
+      ) else $fatal(1,"[instr_queue] Pushing address although replay asserted");
+      // NOTE(review): $onehot0 below also accepts an all-zero select, while the message says one-hot
+      output_select_onehot: assert property (
+        @(posedge clk_i) $onehot0(idx_ds_q)
+      ) else begin $error("Output select should be one-hot encoded"); $stop(); end
+  `endif
+  // pragma translate_on
+endmodule
--- a/src/frontend/instr_scan.sv
+++ b/src/frontend/instr_scan.sv
@ -1,4 +1,4 @@
-//Copyright (C) 2018 to present,
+// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
 // Copyright and related rights are licensed under the Solderpad Hardware
 // License, Version 2.0 (the "License"); you may not use this file except in
 // compliance with the License.  You may obtain a copy of the License at
@ -17,7 +17,6 @@
 // ------------------------------
 module instr_scan (
    input  logic [31:0] instr_i,        // expect aligned instruction, compressed or not
-    output logic        is_rvc_o,
    output logic        rvi_return_o,
    output logic        rvi_call_o,
    output logic        rvi_branch_o,
@ -32,35 +31,39 @@ module instr_scan (
    output logic        rvc_call_o,
    output logic [63:0] rvc_imm_o
 );
-    assign is_rvc_o     = (instr_i[1:0] != 2'b11);
-    // check that rs1 is either x1 or x5 and that rs1 is not x1 or x5, TODO: check the fact about bit 7
-    assign rvi_return_o = rvi_jalr_o & ~instr_i[7] & ~instr_i[19] & ~instr_i[18] & ~instr_i[16] & instr_i[15];
-    assign rvi_call_o   = (rvi_jalr_o | rvi_jump_o) & instr_i[7]; // TODO: check that this captures calls
+    logic is_rvc;
+    assign is_rvc     = (instr_i[1:0] != 2'b11);
+    // JALR is a return when rs1 (instr_i[19:15]) is a link register (x1 or x5) and rd (instr_i[11:7]) differs from rs1
+    assign rvi_return_o = rvi_jalr_o & ((instr_i[19:15] == 5'd1) | (instr_i[19:15] == 5'd5))
+                                     & (instr_i[19:15] != instr_i[11:7]);
+    // Opcode is JAL[R] and destination register is either x1 or x5
+    assign rvi_call_o   = (rvi_jalr_o | rvi_jump_o) & ((instr_i[11:7] == 5'd1) | instr_i[11:7] == 5'd5);
    // differentiates between JAL and BRANCH opcode, JALR comes from BHT
    assign rvi_imm_o    = (instr_i[3]) ? ariane_pkg::uj_imm(instr_i) : ariane_pkg::sb_imm(instr_i);
-    assign rvi_branch_o = (instr_i[6:0] == riscv::OpcodeBranch) ? 1'b1 : 1'b0;
-    assign rvi_jalr_o   = (instr_i[6:0] == riscv::OpcodeJalr)   ? 1'b1 : 1'b0;
-    assign rvi_jump_o   = (instr_i[6:0] == riscv::OpcodeJal)    ? 1'b1 : 1'b0;
+    assign rvi_branch_o = (instr_i[6:0] == riscv::OpcodeBranch);
+    assign rvi_jalr_o   = (instr_i[6:0] == riscv::OpcodeJalr);
+    assign rvi_jump_o   = (instr_i[6:0] == riscv::OpcodeJal);
+
    // opcode JAL
-    assign rvc_jump_o   = (instr_i[15:13] == riscv::OpcodeC1J) & is_rvc_o & (instr_i[1:0] == riscv::OpcodeC1);
+    assign rvc_jump_o   = (instr_i[15:13] == riscv::OpcodeC1J) & is_rvc & (instr_i[1:0] == riscv::OpcodeC1);
    // always links to register 0
-    assign rvc_jr_o     = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd)
-                        & ~instr_i[12]
+    logic is_jal_r; // common decode of the compressed C.JR / C.JALR encodings (bit 12 tells them apart)
+    assign is_jal_r     = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd)
                        & (instr_i[6:2] == 5'b00000)
                        & (instr_i[1:0] == riscv::OpcodeC2)
+                        & is_rvc;
-    assign rvc_branch_o = ((instr_i[15:13] == riscv::OpcodeC1Beqz) | (instr_i[15:13] == riscv::OpcodeC1Bnez))
-                        & (instr_i[1:0] == riscv::OpcodeC1)
-                        & is_rvc_o;
-    // check that rs1 is x1 or x5
-    assign rvc_return_o = ~instr_i[11] & ~instr_i[10] & ~instr_i[8] & instr_i[7] & rvc_jr_o ;
+                        & is_rvc;
+    assign rvc_jr_o     = is_jal_r & ~instr_i[12];
    // always links to register 1 e.g.: it is a jump
-    assign rvc_jalr_o   = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd)
-                        & instr_i[12]
-                        & (instr_i[6:2] == 5'b00000) & is_rvc_o;
+    assign rvc_jalr_o   = is_jal_r & instr_i[12];
    assign rvc_call_o   = rvc_jalr_o;

-    // // differentiates between JAL and BRANCH opcode, JALR comes from BHT
+    assign rvc_branch_o = ((instr_i[15:13] == riscv::OpcodeC1Beqz) | (instr_i[15:13] == riscv::OpcodeC1Bnez))
+                        & (instr_i[1:0] == riscv::OpcodeC1)
+                        & is_rvc;
+    // check that rs1 is x1 or x5
+    assign rvc_return_o = ((instr_i[11:7] == 5'd1) | (instr_i[11:7] == 5'd5))  & rvc_jr_o ;
+
+    // differentiates between JAL and BRANCH opcode, JALR comes from BHT
    assign rvc_imm_o    = (instr_i[14]) ? {{56{instr_i[12]}}, instr_i[6:5], instr_i[2], instr_i[11:10], instr_i[4:3], 1'b0}
                                       : {{53{instr_i[12]}}, instr_i[8], instr_i[10:9], instr_i[6], instr_i[7], instr_i[2], instr_i[11], instr_i[5:3], 1'b0};
 endmodule
--- a/src/id_stage.sv
+++ b/src/id_stage.sv
@ -10,95 +10,81 @@
 //
 // Author: Florian Zaruba, ETH Zurich
 // Date: 15.04.2017
-// Description: Description: Instruction decode, contains the logic for decode,
+// Description: Instruction decode, contains the logic for decode,
 //              issue and read operands.

-import ariane_pkg::*;
-
 module id_stage (
-    input  logic                  clk_i,     // Clock
-    input  logic                  rst_ni,    // Asynchronous reset active low
+    input  logic                          clk_i,
+    input  logic                          rst_ni,

-    input  logic                  flush_i,
-    input  logic                  debug_req_i,
+    input  logic                          flush_i,
+    input  logic                          debug_req_i,
    // from IF
-    input  frontend_fetch_t       fetch_entry_i,
-    input  logic                  fetch_entry_valid_i,
-    output logic                  decoded_instr_ack_o, // acknowledge the instruction (fetch entry)
-
+    input  ariane_pkg::fetch_entry_t      fetch_entry_i,
+    input  logic                          fetch_entry_valid_i,
+    output logic                          fetch_entry_ready_o, // acknowledge the instruction (fetch entry)
    // to ID
-    output scoreboard_entry_t     issue_entry_o,       // a decoded instruction
-    output logic                  issue_entry_valid_o, // issue entry is valid
-    output logic                  is_ctrl_flow_o,      // the instruction we issue is a ctrl flow instructions
-    input  logic                  issue_instr_ack_i,   // issue stage acknowledged sampling of instructions
+    output ariane_pkg::scoreboard_entry_t issue_entry_o,       // a decoded instruction
+    output logic                          issue_entry_valid_o, // issue entry is valid
+    output logic                          is_ctrl_flow_o,      // the instruction we issue is a ctrl flow instructions
+    input  logic                          issue_instr_ack_i,   // issue stage acknowledged sampling of instructions
    // from CSR file
-    input  riscv::priv_lvl_t      priv_lvl_i,          // current privilege level
-    input  riscv::xs_t            fs_i,                // floating point extension status
-    input  logic [2:0]            frm_i,               // floating-point dynamic rounding mode
-    input  logic [1:0]            irq_i,
-    input  irq_ctrl_t             irq_ctrl_i,
-    input  logic                  debug_mode_i,        // we are in debug mode
-    input  logic                  tvm_i,
-    input  logic                  tw_i,
-    input  logic                  tsr_i
+    input  riscv::priv_lvl_t              priv_lvl_i,          // current privilege level
+    input  riscv::xs_t                    fs_i,                // floating point extension status
+    input  logic [2:0]                    frm_i,               // floating-point dynamic rounding mode
+    input  logic [1:0]                    irq_i,
+    input  ariane_pkg::irq_ctrl_t         irq_ctrl_i,
+    input  logic                          debug_mode_i,        // we are in debug mode
+    input  logic                          tvm_i,
+    input  logic                          tw_i,
+    input  logic                          tsr_i
 );
-    // register stage
+    // ID/ISSUE register stage
    struct packed {
-        logic              valid;
-        scoreboard_entry_t sbe;
-        logic              is_ctrl_flow;
+        logic                          valid;
+        ariane_pkg::scoreboard_entry_t sbe;
+        logic                          is_ctrl_flow;
    } issue_n, issue_q;

-    logic                is_control_flow_instr;
-    scoreboard_entry_t   decoded_instruction;
+    logic                            is_control_flow_instr;
+    ariane_pkg::scoreboard_entry_t   decoded_instruction;

-    fetch_entry_t        fetch_entry;
    logic                is_illegal;
    logic                [31:0] instruction;
    logic                is_compressed;
-    logic                fetch_ack_i;
-    logic                fetch_entry_valid;

    // ---------------------------------------------------------
-    // 1. Re-align instructions
-    // ---------------------------------------------------------
-    instr_realigner instr_realigner_i (
-        .fetch_entry_i           ( fetch_entry_i               ),
-        .fetch_entry_valid_i     ( fetch_entry_valid_i         ),
-        .fetch_ack_o             ( decoded_instr_ack_o         ),
-
-        .fetch_entry_o           ( fetch_entry                 ),
-        .fetch_entry_valid_o     ( fetch_entry_valid           ),
-        .fetch_ack_i             ( fetch_ack_i                 ),
-        .*
-    );
-    // ---------------------------------------------------------
-    // 2. Check if they are compressed and expand in case they are
+    // 1. Check if they are compressed and expand in case they are
    // ---------------------------------------------------------
    compressed_decoder compressed_decoder_i (
-        .instr_i                 ( fetch_entry.instruction     ),
+        .instr_i                 ( fetch_entry_i.instruction   ),
        .instr_o                 ( instruction                 ),
        .illegal_instr_o         ( is_illegal                  ),
        .is_compressed_o         ( is_compressed               )
-
    );
    // ---------------------------------------------------------
-    // 3. Decode and emit instruction to issue stage
+    // 2. Decode and emit instruction to issue stage
    // ---------------------------------------------------------
    decoder decoder_i (
        .debug_req_i,
-        .pc_i                    ( fetch_entry.address           ),
-        .is_compressed_i         ( is_compressed                 ),
-        .compressed_instr_i      ( fetch_entry.instruction[15:0] ),
-        .instruction_i           ( instruction                   ),
-        .branch_predict_i        ( fetch_entry.branch_predict    ),
-        .is_illegal_i            ( is_illegal                    ),
-        .ex_i                    ( fetch_entry.ex                ),
-        .instruction_o           ( decoded_instruction           ),
-        .is_control_flow_instr_o ( is_control_flow_instr         ),
+        .irq_ctrl_i,
+        .irq_i,
+        .pc_i                    ( fetch_entry_i.address           ),
+        .is_compressed_i         ( is_compressed                   ),
+        .is_illegal_i            ( is_illegal                      ),
+        .instruction_i           ( instruction                     ),
+        .compressed_instr_i      ( fetch_entry_i.instruction[15:0] ),
+        .branch_predict_i        ( fetch_entry_i.branch_predict    ),
+        .ex_i                    ( fetch_entry_i.ex                ),
+        .priv_lvl_i              ( priv_lvl_i                      ),
+        .debug_mode_i            ( debug_mode_i                    ),
        .fs_i,
        .frm_i,
-        .*
+        .tvm_i,
+        .tw_i,
+        .tsr_i,
+        .instruction_o           ( decoded_instruction          ),
+        .is_control_flow_instr_o ( is_control_flow_instr        )
    );

    // ------------------
@ -110,7 +96,7 @@ module id_stage (

    always_comb begin
        issue_n     = issue_q;
-        fetch_ack_i = 1'b0;
+        fetch_entry_ready_o = 1'b0;

        // Clear the valid flag if issue has acknowledged the instruction
        if (issue_instr_ack_i)
@ -119,9 +105,9 @@ module id_stage (
        // if we have a space in the register and the fetch is valid, go get it
        // or the issue stage is currently acknowledging an instruction, which means that we will have space
        // for a new instruction
-        if ((!issue_q.valid || issue_instr_ack_i) && fetch_entry_valid) begin
-            fetch_ack_i = 1'b1;
-            issue_n = {1'b1, decoded_instruction, is_control_flow_instr};
+        if ((!issue_q.valid || issue_instr_ack_i) && fetch_entry_valid_i) begin
+            fetch_entry_ready_o = 1'b1;
+            issue_n = '{1'b1, decoded_instruction, is_control_flow_instr};
        end

        // invalidate the pipeline register on a flush
@ -138,5 +124,4 @@ module id_stage (
            issue_q <= issue_n;
        end
    end
-
 endmodule
--- a/src/instr_realign.sv
+++ b/src/instr_realign.sv
@ -0,0 +1,358 @@
+// Copyright 2018 - 2019 ETH Zurich and University of Bologna.
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License.  You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// Author: Florian Zaruba <zarubaf@iis.ee.ethz.ch>
+// Description: Instruction Re-aligner
+//
+// This module takes 32-bit aligned cache blocks and extracts the instructions.
+// As we support the compressed instruction set extension, a 32-bit instruction word
+// can contain up to two compressed instructions.
+// Furthermore, those instructions can be arbitrarily interleaved, which makes it possible to fetch
+// only the lower part of a 32-bit instruction.
+// Furthermore, we need to handle the case where we start fetching from an unaligned
+// instruction, e.g. after a branch.
+
+import ariane_pkg::*;
+
+module instr_realign (
+    input  logic                              clk_i,               // clock
+    input  logic                              rst_ni,              // asynchronous reset, active low
+    input  logic                              flush_i,             // clears the unaligned flag
+    input  logic                              valid_i,             // data_i/address_i are valid, state only updates when set
+    output logic                              serving_unaligned_o, // we have an unaligned instruction in [0]
+    input  logic [63:0]                       address_i,           // fetch address belonging to data_i
+    input  logic [FETCH_WIDTH-1:0]            data_i,              // fetched instruction data
+    output logic [INSTR_PER_FETCH-1:0]        valid_o,             // per-instruction valid
+    output logic [INSTR_PER_FETCH-1:0][63:0]  addr_o,              // per-instruction address
+    output logic [INSTR_PER_FETCH-1:0][31:0]  instr_o              // per-instruction word (compressed ones are zero-extended)
+);
+    // as a maximum we support a fetch width of 64-bit, hence there can be 4 compressed instructions
+    logic [3:0] instr_is_compressed;
+
+    for (genvar i = 0; i < INSTR_PER_FETCH; i ++) begin
+        // a 16-bit slot starts a compressed instruction if its two LSBs != 2'b11
+        assign instr_is_compressed[i] = ~&data_i[i * 16 +: 2];
+    end
+
+    // save the unaligned part of the instruction to this ff
+    logic [15:0] unaligned_instr_d,   unaligned_instr_q;
+    // the last instruction was unaligned
+    logic        unaligned_d,         unaligned_q;
+    // register to save the unaligned address
+    logic [63:0] unaligned_address_d, unaligned_address_q;
+    // we have an unaligned instruction
+    assign serving_unaligned_o = unaligned_q;
+
+    // Instruction re-alignment
+    if (FETCH_WIDTH == 32) begin : realign_bp_32
+        always_comb begin : re_align
+            unaligned_d = unaligned_q;
+            unaligned_address_d = {address_i[63:2], 2'b10};
+            unaligned_instr_d = data_i[31:16];
+
+            valid_o[0] = valid_i;
+            instr_o[0] = (unaligned_q) ? {data_i[15:0], unaligned_instr_q} : data_i[31:0];
+            addr_o[0]  = (unaligned_q) ? unaligned_address_q : address_i;
+
+            valid_o[1] = 1'b0;
+            instr_o[1] = '0;
+            addr_o[1]  = {address_i[63:2], 2'b10};
+
+            // this instruction is compressed or the last instruction was unaligned
+            if (instr_is_compressed[0] || unaligned_q) begin
+                // check whether the upper half-word starts another unaligned instruction,
+                // i.e. whether it is not compressed; if it is compressed, reset the unaligned flag
+                // for 32 bit we can simply check the next instruction and whether it is compressed or not
+                // if it is compressed the next fetch will contain an aligned instruction
+                // is instruction 1 also compressed
+                // yes? -> no problem, no -> we've got an unaligned instruction
+                if (instr_is_compressed[1]) begin
+                    unaligned_d = 1'b0;
+                    valid_o[1] = valid_i;
+                    instr_o[1] = {16'b0, data_i[31:16]};
+                end else begin
+                    // save the upper bits for next cycle
+                    unaligned_d = 1'b1;
+                    unaligned_instr_d = data_i[31:16];
+                    unaligned_address_d = {address_i[63:2], 2'b10};
+                end
+            end // else -> normal fetch
+
+            // we started to fetch on an unaligned boundary with a whole instruction -> wait until we've
+            // received the next instruction
+            if (valid_i && address_i[1]) begin
+                // the instruction is not compressed so we can't do anything in this cycle
+                if (!instr_is_compressed[0]) begin
+                    valid_o = '0;
+                    unaligned_d = 1'b1;
+                    unaligned_address_d = {address_i[63:2], 2'b10};
+                    unaligned_instr_d = data_i[15:0];
+                // slot 0 holds a compressed instruction -> only instruction 0 is valid
+                end else begin
+                    valid_o = 1'b1;
+                end
+            end
+        end
+    // TODO(zarubaf): Fix 64 bit FETCH_WIDTH, maybe generalize to arbitrary fetch width
+    end else if (FETCH_WIDTH == 64) begin : realign_bp_64
+        initial begin
+          $error("Not propperly implemented");
+        end
+        always_comb begin : re_align
+            unaligned_d = unaligned_q;
+            unaligned_address_d = unaligned_address_q;
+            unaligned_instr_d = unaligned_instr_q;
+
+            valid_o    = '0;
+            valid_o[0] = valid_i;
+
+            instr_o[0] = data_i[31:0];
+            addr_o[0]  = address_i;
+
+            instr_o[1] = '0;
+            addr_o[1]  = {address_i[63:3], 3'b010};
+
+            instr_o[2] = {16'b0, data_i[47:32]};
+            addr_o[2]  = {address_i[63:3], 3'b100};
+
+            instr_o[3] = {16'b0, data_i[63:48]};
+            addr_o[3]  = {address_i[63:3], 3'b110};
+
+            // last instruction was unaligned
+            if (unaligned_q) begin
+                instr_o[0] = {data_i[15:0], unaligned_instr_q};
+                addr_o[0] = unaligned_address_q;
+                // for 64 bit there exist the following options:
+                //     64      32      0
+                //     | 3 | 2 | 1 | 0 | <- instruction slot
+                // |   I   |   I   |   U   | -> again unaligned
+                // | * | C |   I   |   U   | -> aligned
+                // | * |   I   | C |   U   | -> aligned
+                // |   I   | C | C |   U   | -> again unaligned
+                // | * | C | C | C |   U   | -> aligned
+                // Legend: C = compressed, I = 32 bit instruction, U = unaligned upper half
+                //         * = don't care
+                if (instr_is_compressed[1]) begin
+                    instr_o[1] = {16'b0, data_i[31:16]};
+                    valid_o[1] = valid_i;
+
+                    if (instr_is_compressed[2]) begin // NOTE(review): valid_o[2] is never raised on this path, unlike the RVC case below - confirm
+                        if (instr_is_compressed[3]) begin
+                            unaligned_d = 1'b0;
+                            valid_o[3] = valid_i;
+                        end else begin
+                            // continues to be unaligned; NOTE(review): unaligned_instr_d/unaligned_address_d are not updated here - confirm
+                        end
+                    end else begin
+                        unaligned_d = 1'b0;
+                        instr_o[2] = data_i[63:32];
+                        valid_o[2] = valid_i;
+                    end
+                // instruction 1 is not compressed -> it occupies slots 1 and 2, check slot 3
+                end else begin
+                    instr_o[1] = data_i[47:16];
+                    valid_o[1] = valid_i;
+                    addr_o[2] = {address_i[63:3], 3'b110};
+                    if (instr_is_compressed[3]) begin
+                        unaligned_d = 1'b0;
+                        instr_o[2] = {16'b0, data_i[63:48]};
+                        valid_o[2] = valid_i;
+                    end else begin
+                        // continues to be unaligned; NOTE(review): unaligned_instr_d/unaligned_address_d are not updated here - confirm
+                    end
+                end
+            end else if (instr_is_compressed[0]) begin // instruction zero is RVC
+                //     64     32       0
+                //     | 3 | 2 | 1 | 0 | <- instruction slot
+                // |   I   |   I   | C | -> again unaligned
+                // | * | C |   I   | C | -> aligned
+                // | * |   I   | C | C | -> aligned
+                // |   I   | C | C | C | -> again unaligned
+                // | * | C | C | C | C | -> aligned
+                if (instr_is_compressed[1]) begin
+                    instr_o[1] = {16'b0, data_i[31:16]};
+                    valid_o[1] = valid_i;
+
+                    if (instr_is_compressed[2]) begin
+                        valid_o[2] = valid_i;
+                        if (instr_is_compressed[3]) begin
+                            valid_o[3] = valid_i;
+                        end else begin
+                            // this instruction is unaligned
+                            unaligned_d = 1'b1;
+                            unaligned_instr_d = data_i[63:48];
+                            unaligned_address_d = addr_o[3];
+                        end
+                    end else begin
+                        instr_o[2] = data_i[63:32];
+                        valid_o[2] = valid_i;
+                    end
+                // instruction 1 is not compressed -> check slot 3
+                end else begin
+                    instr_o[1] = data_i[47:16];
+                    valid_o[1] = valid_i;
+                    addr_o[2] = {address_i[63:3], 3'b110};
+                    if (instr_is_compressed[3]) begin
+                        instr_o[2] = data_i[63:48];
+                        valid_o[2] = valid_i;
+                    end else begin
+                        unaligned_d = 1'b1;
+                        unaligned_instr_d = data_i[63:48];
+                        unaligned_address_d = addr_o[2];
+                    end
+                end
+
+            // Full instruction in slot zero
+            //     64     32       0
+            //     | 3 | 2 | 1 | 0 | <- instruction slot
+            // |   I   | C |   I   |
+            // | * | C | C |   I   |
+            // | * |   I   |   I   |
+            end else begin
+                addr_o[1] = {address_i[63:3], 3'b100};
+
+                if (instr_is_compressed[2]) begin
+                    instr_o[1] = {16'b0, data_i[47:32]};
+                    valid_o[1] = valid_i;
+                    addr_o[2] = {address_i[63:3], 3'b110};
+                    if (instr_is_compressed[3]) begin
+                        // | * | C | C |   I   |
+                        valid_o[2] = valid_i;
+                        instr_o[2] = {16'b0, data_i[63:48]}; // slot 3 data goes to the instruction, not the address
+                    end else begin
+                        // this instruction is unaligned
+                        unaligned_d = 1'b1;
+                        unaligned_instr_d = data_i[63:48];
+                        unaligned_address_d = addr_o[2];
+                    end
+                end else begin
+                    // two regular instructions back-to-back
+                    instr_o[1] = data_i[63:32];
+                    valid_o[1] = valid_i;
+                end
+            end
+
+            // --------------------------
+            // Unaligned fetch
+            // --------------------------
+            // Address was not 64 bit aligned
+            case (address_i[2:1])
+                // this means the previous instruction was either compressed or unaligned
+                // in any case we don't care
+                2'b01: begin
+                    //     64     32       0
+                    //     | 3 | 2 | 1 | 0 | <- instruction slot
+                    // |   I   |   I   | x  -> again unaligned
+                    // | * | C |   I   | x  -> aligned
+                    // | * |   I   | C | x  -> aligned
+                    // |   I   | C | C | x  -> again unaligned
+                    // | * | C | C | C | x  -> aligned
+                    addr_o[0] = {address_i[63:3], 3'b010};
+
+                    if (instr_is_compressed[1]) begin
+                        instr_o[0] = {16'b0, data_i[31:16]};
+                        valid_o[0] = valid_i;
+
+                        if (instr_is_compressed[2]) begin
+                            valid_o[1] = valid_i;
+                            instr_o[1] = {16'b0, data_i[47:32]};
+                            addr_o[1] = {address_i[63:3], 3'b100};
+                            if (instr_is_compressed[3]) begin
+                                instr_o[2] = {16'b0, data_i[63:48]};
+                                addr_o[2] = {address_i[63:3], 3'b110};
+                                valid_o[2] = valid_i;
+                            end else begin
+                                // this instruction is unaligned
+                                unaligned_d = 1'b1;
+                                unaligned_instr_d = data_i[63:48];
+                                unaligned_address_d = addr_o[3];
+                            end
+                        end else begin
+                            instr_o[1] = data_i[63:32];
+                            addr_o[1] = {address_i[63:3], 3'b100};
+                            valid_o[1] = valid_i;
+                        end
+                    // instruction 1 is not compressed -> check slot 3
+                    end else begin
+                        instr_o[0] = data_i[47:16];
+                        valid_o[0] = valid_i;
+                        addr_o[1] = {address_i[63:3], 3'b110};
+                        if (instr_is_compressed[3]) begin
+                            instr_o[1] = data_i[63:48];
+                            valid_o[1] = valid_i;
+                        end else begin
+                            unaligned_d = 1'b1;
+                            unaligned_instr_d = data_i[63:48];
+                            unaligned_address_d = addr_o[1];
+                        end
+                    end
+                end
+                2'b10: begin
+                    valid_o = '0;
+                    //     64     32       0
+                    //     | 3 | 2 | 1 | 0 | <- instruction slot
+                    // |   I   | C |   *   | <- unaligned
+                    //    | C  | C |   *   | <- aligned
+                    //    |    I   |   *   | <- aligned
+                    if (instr_is_compressed[2]) begin
+                        valid_o[0] = valid_i;
+                        instr_o[0] = data_i[47:32];
+                        // second instruction is also compressed
+                        if (instr_is_compressed[3]) begin
+                            valid_o[1] = valid_i;
+                            instr_o[1] = data_i[63:48];
+                        // regular instruction -> unaligned
+                        end else begin
+                            unaligned_d = 1'b1;
+                            unaligned_address_d = {address_i[63:3], 3'b110};
+                            unaligned_instr_d = data_i[63:48];
+                        end
+                    // instruction is a regular instruction
+                    end else begin
+                        valid_o[0] = valid_i;
+                        instr_o[0] = data_i[63:32];
+                        addr_o[0] = address_i;
+                    end
+                end
+                // we started to fetch on an unaligned boundary with a whole instruction -> wait until we've
+                // received the next instruction
+                2'b11: begin
+                    valid_o = '0;
+                    if (!instr_is_compressed[3]) begin
+                        unaligned_d = 1'b1;
+                        unaligned_address_d = {address_i[63:3], 3'b110};
+                        unaligned_instr_d = data_i[63:48];
+                    end else begin
+                        valid_o[3] = valid_i;
+                    end
+                end
+            endcase
+        end
+    end
+
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (~rst_ni) begin
+            unaligned_q         <= 1'b0;
+            unaligned_address_q <= '0;
+            unaligned_instr_q   <= '0;
+        end else begin
+            if (valid_i) begin // saved half-word and its address only change on a valid fetch
+                unaligned_address_q <= unaligned_address_d;
+                unaligned_instr_q   <= unaligned_instr_d;
+            end
+
+            if (flush_i) begin // flush takes priority over a valid fetch
+                unaligned_q <= 1'b0;
+            end else if (valid_i) begin
+                unaligned_q <= unaligned_d;
+            end
+        end
+    end
+endmodule
--- a/src/instr_realigner.sv
+++ b/src/instr_realigner.sv
@ -1,252 +0,0 @@
-// Copyright 2018 ETH Zurich and University of Bologna.
-// Copyright and related rights are licensed under the Solderpad Hardware
-// License, Version 0.51 (the "License"); you may not use this file except in
-// compliance with the License.  You may obtain a copy of the License at
-// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-// or agreed to in writing, software, hardware and materials distributed under
-// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-//
-// Author: Florian Zaruba, ETH Zurich
-// Date: 14.05.2017
-// Description: Emits and re-aligns compressed and unaligned instructions
-
-import ariane_pkg::*;
-
-module instr_realigner (
-    input  logic                   clk_i,               // Clock
-    input  logic                   rst_ni,              // Asynchronous reset active low
-    // control signals
-    input  logic                   flush_i,
-
-    input  frontend_fetch_t        fetch_entry_i,
-    input  logic                   fetch_entry_valid_i,
-    output logic                   fetch_ack_o,
-
-    output fetch_entry_t           fetch_entry_o,
-    output logic                   fetch_entry_valid_o,
-    input  logic                   fetch_ack_i
-);
-    // ----------
-    // Registers
-    // ----------
-    // the last instruction was unaligned
-    logic        unaligned_n,         unaligned_q;
-    // save the unaligned part of the instruction to this ff
-    logic [15:0] unaligned_instr_n,   unaligned_instr_q;
-    // the previous instruction was compressed
-    logic        compressed_n,        compressed_q;
-    // register to save the unaligned address
-    logic [63:0] unaligned_address_n, unaligned_address_q;
-    // get the next instruction, needed on a unaligned access
-    logic jump_unaligned_half_word;
-
-    // check if the lower compressed instruction was no branch otherwise we will need to squash this instruction
-    // but only if we predicted it to be taken, the predict was on the lower 16 bit compressed instruction
-    logic kill_upper_16_bit;
-    assign kill_upper_16_bit = fetch_entry_i.branch_predict.valid &
-                               fetch_entry_i.branch_predict.predict_taken &
-                               fetch_entry_i.bp_taken[0];
-    // ----------
-    // Registers
-    // ----------
-    always_comb begin : realign_instr
-
-        unaligned_n          = unaligned_q;
-        unaligned_instr_n    = unaligned_instr_q;
-        compressed_n         = compressed_q;
-        unaligned_address_n  = unaligned_address_q;
-
-        // directly output this instruction. adoptions are made throughout the always comb block
-        fetch_entry_o.address        = fetch_entry_i.address;
-        fetch_entry_o.instruction    = fetch_entry_i.instruction;
-        fetch_entry_o.branch_predict = fetch_entry_i.branch_predict;
-        fetch_entry_o.ex.valid       = fetch_entry_i.page_fault;
-        fetch_entry_o.ex.tval        = (fetch_entry_i.page_fault) ? fetch_entry_i.address : '0;
-        fetch_entry_o.ex.cause       = (fetch_entry_i.page_fault) ? riscv::INSTR_PAGE_FAULT : '0;
-
-        fetch_entry_valid_o  = fetch_entry_valid_i;
-        fetch_ack_o        = fetch_ack_i;
-        // we just jumped to a half word and encountered an unaligned 32-bit instruction
-        jump_unaligned_half_word = 1'b0;
-        // ---------------------------------
-        // Input port & Instruction Aligner
-        // ---------------------------------
-        // check if the entry if the fetch FIFO is valid and if we are currently not serving the second part
-        // of a compressed instruction
-        if (fetch_entry_valid_i && !compressed_q) begin
-            // ------------------------
-            // Access on Word Boundary
-            // ------------------------
-            if (fetch_entry_i.address[1] == 1'b0) begin
-                // do we actually want the first instruction or was the address a half word access?
-                if (!unaligned_q) begin
-                    // we got a valid instruction so we can satisfy the unaligned instruction
-                    unaligned_n = 1'b0;
-                    // check if the instruction is compressed
-                    if (fetch_entry_i.instruction[1:0] != 2'b11) begin
-                        // it is compressed
-                        fetch_entry_o.instruction = {15'b0, fetch_entry_i.instruction[15:0]};
-                        // we need to kill the lower prediction
-                        if (fetch_entry_i.branch_predict.valid && !fetch_entry_i.bp_taken[0])
-                            fetch_entry_o.branch_predict.valid = 1'b0;
-
-                        // should we even look at the upper instruction bits?
-                        if (!kill_upper_16_bit) begin
-                            // Yes, so...
-                            // 1. Is the second instruction also compressed, like:
-                            // _____________________________________________
-                            // | compressed 2 [31:16] | compressed 1[15:0] |
-                            // |____________________________________________
-                            if (fetch_entry_i.instruction[17:16] != 2'b11) begin
-                                // yes, this was a compressed instruction
-                                compressed_n = 1'b1;
-                                // do not advance the queue pointer
-                                fetch_ack_o = 1'b0;
-                            // 2. or is it an unaligned 32 bit instruction like
-                            // ____________________________________________________
-                            // |instr [15:0] | instr [31:16] | compressed 1[15:0] |
-                            // |____________________________________________________
-                            end else begin
-                                // save the lower 16 bit
-                                unaligned_instr_n = fetch_entry_i.instruction[31:16];
-                                // save the address
-                                unaligned_address_n = {fetch_entry_i.address[63:2], 2'b10};
-                                // and that it was unaligned
-                                unaligned_n = 1'b1;
-                                // this does not consume space in the FIFO
-                            end
-                        end
-                    end
-                end
-                // this is a full 32 bit instruction like
-                // _______________________
-                // | instruction [31:0]  |
-                // |______________________
-
-                // we have an outstanding unaligned instruction
-                else if (unaligned_q) begin
-
-
-                    fetch_entry_o.address = unaligned_address_q;
-                    fetch_entry_o.instruction = {fetch_entry_i.instruction[15:0], unaligned_instr_q};
-
-                    // again should we look at the upper bits?
-                    if (!kill_upper_16_bit) begin
-                        // whats up with the other upper 16 bit of this instruction
-                        // is the second instruction also compressed, like:
-                        // _____________________________________________
-                        // | compressed 2 [31:16] | unaligned[31:16]    |
-                        // |____________________________________________
-                        // check if the lower compressed instruction was no branch otherwise we will need to squash this instruction
-                        // but only if we predicted it to be taken, the predict was on the lower 16 bit compressed instruction
-                        if (fetch_entry_i.instruction[17:16] != 2'b11) begin
-                            // this was a compressed instruction
-                            compressed_n  = 1'b1;
-                            // do not advance the queue pointer
-                            fetch_ack_o = 1'b0;
-                            // unaligned access served
-                            unaligned_n = 1'b0;
-                            // we need to kill the lower prediction
-                            if (fetch_entry_i.branch_predict.valid && !fetch_entry_i.bp_taken[0])
-                                fetch_entry_o.branch_predict.valid = 1'b0;
-                            // or is it an unaligned 32 bit instruction like
-                        // ____________________________________________________
-                        // |instr [15:0] | instr [31:16] | compressed 1[15:0] |
-                        // |____________________________________________________
-                        end else if (!kill_upper_16_bit) begin
-                            // save the lower 16 bit
-                            unaligned_instr_n = fetch_entry_i.instruction[31:16];
-                            // save the address
-                            unaligned_address_n = {fetch_entry_i.address[63:2], 2'b10};
-                            // and that it was unaligned
-                            unaligned_n = 1'b1;
-                        end
-                    end
-                    // we've got a predicted taken branch we need to clear the unaligned flag if it was decoded as a lower 16 instruction
-                    else if (fetch_entry_i.branch_predict.valid) begin
-                        // the next fetch will start from a 4 byte boundary again
-                        unaligned_n = 1'b0;
-                    end
-                end
-            end
-            // ----------------------------
-            // Access on half-Word Boundary
-            // ----------------------------
-            else if (fetch_entry_i.address[1] == 1'b1) begin // address was a half word access
-                // reset the unaligned flag as this is a completely new fetch (because consecutive fetches only happen on a word basis)
-                unaligned_n = 1'b0;
-                // this is a compressed instruction
-                if (fetch_entry_i.instruction[17:16] != 2'b11) begin
-                    // it is compressed
-                    fetch_entry_o.instruction = {15'b0, fetch_entry_i.instruction[31:16]};
-
-                // this is the first part of a 32 bit unaligned instruction
-                end else begin
-                     // save the lower 16 bit
-                    unaligned_instr_n = fetch_entry_i.instruction[31:16];
-                    // and that it was unaligned
-                    unaligned_n = 1'b1;
-                    // save the address
-                    unaligned_address_n = {fetch_entry_i.address[63:2], 2'b10};
-                    // we need to wait for the second instruction
-                    fetch_entry_valid_o = 1'b0;
-                    // so get it by acknowledging this instruction
-                    fetch_ack_o = 1'b1;
-                    // we got to an unaligned instruction -> get the next entry to full-fill the need
-                    jump_unaligned_half_word = 1'b1;
-                end
-                // there can never be a whole 32 bit instruction on a half word access
-            end
-        end
-        // ----------------------------
-        // Next compressed instruction
-        // ----------------------------
-        // we are serving the second part of an instruction which was also compressed
-        if (compressed_q) begin
-            fetch_ack_o = fetch_ack_i;
-            compressed_n  = 1'b0;
-            fetch_entry_o.instruction = {16'b0, fetch_entry_i.instruction[31:16]};
-            fetch_entry_o.address = {fetch_entry_i.address[63:2], 2'b10};
-            fetch_entry_valid_o = 1'b1;
-        end
-
-        // if we didn't get an acknowledge keep the registers stable
-        if (!fetch_ack_i && !jump_unaligned_half_word) begin
-            unaligned_n         = unaligned_q;
-            unaligned_instr_n   = unaligned_instr_q;
-            compressed_n        = compressed_q;
-            unaligned_address_n = unaligned_address_q;
-        end
-
-        if (flush_i) begin
-            // clear the unaligned and compressed instruction
-            unaligned_n  = 1'b0;
-            compressed_n = 1'b0;
-        end
-
-        // assign the correct address for a potentially faulting unaligned instruction
-        // we've already done the re-alignment for the instruction word so we
-        // can just assign it here to tval
-        fetch_entry_o.ex.tval = fetch_entry_o.address;
-    end
-
-    // ---------
-    // Registers
-    // ---------
-    always_ff @(posedge clk_i or negedge rst_ni) begin
-        if (~rst_ni) begin
-            unaligned_q         <= 1'b0;
-            unaligned_instr_q   <= 16'b0;
-            unaligned_address_q <= 64'b0;
-            compressed_q        <= 1'b0;
-        end else begin
-            unaligned_q         <= unaligned_n;
-            unaligned_instr_q   <= unaligned_instr_n;
-            unaligned_address_q <= unaligned_address_n;
-            compressed_q        <= compressed_n;
-        end
-    end
-
-endmodule
--- a/tb/ariane_soc_pkg.sv
+++ b/tb/ariane_soc_pkg.sv
@ -67,6 +67,9 @@ package ariane_soc;
  localparam logic [NrRegion-1:0][NB_PERIPHERALS-1:0] ValidRule = {{NrRegion * NB_PERIPHERALS}{1'b1}};

  localparam ariane_pkg::ariane_cfg_t ArianeSocCfg = '{
+    RASDepth: 2,
+    BTBEntries: 32,
+    BHTEntries: 128,
    // idempotent region
    NrNonIdempotentRules:  0,
    NonIdempotentAddrBase: {64'b0},