// Copyright lowRISC contributors.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

/**
 * Instruction cache
 *
 * Provides an instruction cache along with cache management, instruction buffering and prefetching
 */

`include "prim_assert.sv"

module ibex_icache #(
  // Cache arrangement parameters
  parameter int unsigned BusWidth       = 32,
  parameter int unsigned CacheSizeBytes = 4*1024,
  parameter bit          ICacheECC      = 1'b0,
  parameter int unsigned LineSize       = 64,
  parameter int unsigned NumWays        = 2,
  // Always make speculative bus requests in parallel with lookups
  parameter bit          SpecRequest    = 1'b0,
  // Only cache branch targets
  parameter bit          BranchCache    = 1'b0
) (
  // Clock and reset
  input  logic                clk_i,
  input  logic                rst_ni,

  // Signal that the core would like instructions
  input  logic                req_i,

  // Set the cache's address counter
  input  logic                branch_i,
  input  logic                branch_spec_i,
  input  logic [31:0]         addr_i,

  // IF stage interface: Pass fetched instructions to the core
  input  logic                ready_i,
  output logic                valid_o,
  output logic [31:0]         rdata_o,
  output logic [31:0]         addr_o,
  output logic                err_o,
  output logic                err_plus2_o,

  // Instruction memory / interconnect interface: Fetch instruction data from memory
  output logic                instr_req_o,
  input  logic                instr_gnt_i,
  output logic [31:0]         instr_addr_o,
  input  logic [BusWidth-1:0] instr_rdata_i,
  input  logic                instr_err_i,
  input  logic                instr_pmp_err_i,
  input  logic                instr_rvalid_i,

  // Cache status
  input  logic                icache_enable_i,
  input  logic                icache_inval_i,
  output logic                busy_o
);

  // Local constants
  localparam int unsigned ADDR_W       = 32;
  // Number of fill buffers (must be >= 2)
  localparam int unsigned NUM_FB       = 4;
  // Request throttling threshold
  localparam int unsigned FB_THRESHOLD = NUM_FB - 2;

  // Derived parameters
  localparam int unsigned LINE_SIZE_ECC   = ICacheECC ? (LineSize + 8) : LineSize;
  localparam int unsigned LINE_SIZE_BYTES = LineSize/8;
  localparam int unsigned LINE_W          = $clog2(LINE_SIZE_BYTES);
  localparam int unsigned BUS_BYTES       = BusWidth/8;
  localparam int unsigned BUS_W           = $clog2(BUS_BYTES);
  localparam int unsigned LINE_BEATS      = LINE_SIZE_BYTES / BUS_BYTES;
  localparam int unsigned LINE_BEATS_W    = $clog2(LINE_BEATS);
  localparam int unsigned NUM_LINES       = CacheSizeBytes / NumWays / LINE_SIZE_BYTES;
  localparam int unsigned INDEX_W         = $clog2(NUM_LINES);
  localparam int unsigned INDEX_HI        = INDEX_W + LINE_W - 1;
  localparam int unsigned TAG_SIZE        = ADDR_W - INDEX_W - LINE_W + 1; // 1 valid bit
  localparam int unsigned TAG_SIZE_ECC    = ICacheECC ? (TAG_SIZE + 6) : TAG_SIZE;
  localparam int unsigned OUTPUT_BEATS    = (BUS_BYTES / 2); // number of halfwords

  // Prefetch signals
  logic [ADDR_W-1:0]                   lookup_addr_aligned;
  logic [ADDR_W-1:0]                   prefetch_addr_d, prefetch_addr_q;
  logic                                prefetch_addr_en;
  // Cache pipeline IC0 signals
  logic                                branch_suppress;
  logic                                lookup_throttle;
  logic                                lookup_req_ic0;
  logic [ADDR_W-1:0]                   lookup_addr_ic0;
  logic [INDEX_W-1:0]                  lookup_index_ic0;
  logic                                fill_req_ic0;
  logic [INDEX_W-1:0]                  fill_index_ic0;
  logic [TAG_SIZE-1:0]                 fill_tag_ic0;
  logic [LineSize-1:0]                 fill_wdata_ic0;
  logic                                lookup_grant_ic0;
  logic                                lookup_actual_ic0;
  logic                                fill_grant_ic0;
  logic                                tag_req_ic0;
  logic [INDEX_W-1:0]                  tag_index_ic0;
  logic [NumWays-1:0]                  tag_banks_ic0;
  logic                                tag_write_ic0;
  logic [TAG_SIZE_ECC-1:0]             tag_wdata_ic0;
  logic                                data_req_ic0;
  logic [INDEX_W-1:0]                  data_index_ic0;
  logic [NumWays-1:0]                  data_banks_ic0;
  logic                                data_write_ic0;
  logic [LINE_SIZE_ECC-1:0]            data_wdata_ic0;
  // Cache pipeline IC1 signals
  logic [TAG_SIZE_ECC-1:0]             tag_rdata_ic1  [NumWays];
  logic [LINE_SIZE_ECC-1:0]            data_rdata_ic1 [NumWays];
  logic [LINE_SIZE_ECC-1:0]            hit_data_ic1;
  logic                                lookup_valid_ic1;
  logic [ADDR_W-1:INDEX_HI+1]          lookup_addr_ic1;
  logic [NumWays-1:0]                  tag_match_ic1;
  logic                                tag_hit_ic1;
  logic [NumWays-1:0]                  tag_invalid_ic1;
  logic [NumWays-1:0]                  lowest_invalid_way_ic1;
  logic [NumWays-1:0]                  round_robin_way_ic1, round_robin_way_q;
  logic [NumWays-1:0]                  sel_way_ic1;
  logic                                ecc_err_ic1;
  logic                                ecc_write_req;
  logic [NumWays-1:0]                  ecc_write_ways;
  logic [INDEX_W-1:0]                  ecc_write_index;
  // Fill buffer signals
  logic                                gnt_or_pmp_err, gnt_not_pmp_err;
  logic [$clog2(NUM_FB)-1:0]           fb_fill_level;
  logic                                fill_cache_new;
  logic                                fill_new_alloc;
  logic                                fill_spec_req, fill_spec_done, fill_spec_hold;
  logic [NUM_FB-1:0][NUM_FB-1:0]       fill_older_d, fill_older_q;
  logic [NUM_FB-1:0]                   fill_alloc_sel, fill_alloc;
  logic [NUM_FB-1:0]                   fill_busy_d, fill_busy_q;
  logic [NUM_FB-1:0]                   fill_done;
  logic [NUM_FB-1:0]                   fill_in_ic1;
  logic [NUM_FB-1:0]                   fill_stale_d, fill_stale_q;
  logic [NUM_FB-1:0]                   fill_cache_d, fill_cache_q;
  logic [NUM_FB-1:0]                   fill_hit_ic1, fill_hit_d, fill_hit_q;
  logic [NUM_FB-1:0][LINE_BEATS_W:0]   fill_ext_cnt_d, fill_ext_cnt_q;
  logic [NUM_FB-1:0]                   fill_ext_hold_d, fill_ext_hold_q;
  logic [NUM_FB-1:0]                   fill_ext_done;
  logic [NUM_FB-1:0][LINE_BEATS_W:0]   fill_rvd_cnt_d, fill_rvd_cnt_q;
  logic [NUM_FB-1:0]                   fill_rvd_done;
  logic [NUM_FB-1:0]                   fill_ram_done_d, fill_ram_done_q;
  logic [NUM_FB-1:0]                   fill_out_grant;
  logic [NUM_FB-1:0][LINE_BEATS_W:0]   fill_out_cnt_d, fill_out_cnt_q;
  logic [NUM_FB-1:0]                   fill_out_done;
  logic [NUM_FB-1:0]                   fill_ext_req, fill_rvd_exp, fill_ram_req, fill_out_req;
  logic [NUM_FB-1:0]                   fill_data_sel, fill_data_reg, fill_data_hit, fill_data_rvd;
  logic [NUM_FB-1:0][LINE_BEATS_W-1:0] fill_ext_off, fill_rvd_off;
  logic [NUM_FB-1:0][LINE_BEATS_W:0]   fill_rvd_beat;
  logic [NUM_FB-1:0]                   fill_ext_arb, fill_ram_arb, fill_out_arb;
  logic [NUM_FB-1:0]                   fill_rvd_arb;
  logic [NUM_FB-1:0]                   fill_entry_en;
  logic [NUM_FB-1:0]                   fill_addr_en;
  logic [NUM_FB-1:0]                   fill_way_en;
  logic [NUM_FB-1:0][LINE_BEATS-1:0]   fill_data_en;
  logic [NUM_FB-1:0][LINE_BEATS-1:0]   fill_err_d, fill_err_q;
  logic [ADDR_W-1:0]                   fill_addr_q [NUM_FB];
  logic [NumWays-1:0]                  fill_way_q [NUM_FB];
  logic [LineSize-1:0]                 fill_data_d [NUM_FB];
  logic [LineSize-1:0]                 fill_data_q [NUM_FB];
  logic [ADDR_W-1:BUS_W]               fill_ext_req_addr;
  logic [ADDR_W-1:0]                   fill_ram_req_addr;
  logic [NumWays-1:0]                  fill_ram_req_way;
  logic [LineSize-1:0]                 fill_ram_req_data;
  logic [LineSize-1:0]                 fill_out_data;
  logic [LINE_BEATS-1:0]               fill_out_err;
  // External req signals
  logic                                instr_req;
  logic [ADDR_W-1:BUS_W]               instr_addr;
  // Data output signals
  logic                                skid_complete_instr;
  logic                                skid_ready;
  logic                                output_compressed;
  logic                                skid_valid_d, skid_valid_q, skid_en;
  logic [15:0]                         skid_data_d, skid_data_q;
  logic                                skid_err_q;
  logic                                output_valid;
  logic                                addr_incr_two;
  logic                                output_addr_en;
  logic [ADDR_W-1:1]                   output_addr_d, output_addr_q;
  logic [15:0]                         output_data_lo, output_data_hi;
  logic                                data_valid, output_ready;
  logic [LineSize-1:0]                 line_data;
  logic [LINE_BEATS-1:0]               line_err;
  logic [31:0]                         line_data_muxed;
  logic                                line_err_muxed;
  logic [31:0]                         output_data;
  logic                                output_err;
  // Invalidations
  logic                                start_inval, inval_done;
  logic                                reset_inval_q;
  logic                                inval_prog_d, inval_prog_q;
  logic [INDEX_W-1:0]                  inval_index_d, inval_index_q;

  //////////////////////////
  // Instruction prefetch //
  //////////////////////////

  assign lookup_addr_aligned = {lookup_addr_ic0[ADDR_W-1:LINE_W],{LINE_W{1'b0}}};

  // The prefetch address increments by one cache line for each granted request.
  // This address is also updated if there is a branch that is not granted, since the target
  // address (addr_i) is only valid for one cycle while branch_i is high.
  // The captured branch target address is not forced to be aligned since the offset in the cache
  // line must also be recorded for later use by the fill buffers.
  assign prefetch_addr_d     =
      lookup_grant_ic0 ? (lookup_addr_aligned +
                          {{ADDR_W-LINE_W-1{1'b0}},1'b1,{LINE_W{1'b0}}}) :
                         addr_i;

  assign prefetch_addr_en    = branch_i | lookup_grant_ic0;

  always_ff @(posedge clk_i) begin
    if (prefetch_addr_en) begin
      prefetch_addr_q <= prefetch_addr_d;
    end
  end

  ////////////////////////
  // Pipeline stage IC0 //
  ////////////////////////

  // Cache lookup
  // Throttle new lookups once the number of live fill buffers exceeds the threshold
  assign lookup_throttle  = (fb_fill_level > FB_THRESHOLD[$clog2(NUM_FB)-1:0]);

  assign lookup_req_ic0   = req_i & ~&fill_busy_q & (branch_i | ~lookup_throttle) &
                            ~ecc_write_req;
  assign lookup_addr_ic0  = branch_spec_i ? addr_i : prefetch_addr_q;
  assign lookup_index_ic0 = lookup_addr_ic0[INDEX_HI:LINE_W];

  // Cache write
  assign fill_req_ic0   = (|fill_ram_req);
  assign fill_index_ic0 = fill_ram_req_addr[INDEX_HI:LINE_W];
  // Tag is written with the valid bit cleared while an invalidation or ECC correction is pending
  assign fill_tag_ic0   = {(~inval_prog_q & ~ecc_write_req),
                           fill_ram_req_addr[ADDR_W-1:INDEX_HI+1]};
  assign fill_wdata_ic0 = fill_ram_req_data;

  // Suppress a new lookup on a not-taken branch (as the address will be incorrect)
  assign branch_suppress   = branch_spec_i & ~branch_i;

  // Arbitrated signals - lookups have highest priority
  assign lookup_grant_ic0  = lookup_req_ic0 & ~branch_suppress;
  assign fill_grant_ic0    = fill_req_ic0 & (~lookup_req_ic0 | branch_suppress) &
                             ~inval_prog_q & ~ecc_write_req;
  // Qualified lookup grant to mask ram signals in IC1 if access was not made
  assign lookup_actual_ic0 = lookup_grant_ic0 & icache_enable_i & ~inval_prog_q & ~start_inval;

  // Tagram
  assign tag_req_ic0   = lookup_req_ic0 | fill_req_ic0 | inval_prog_q | ecc_write_req;
  assign tag_index_ic0 = inval_prog_q   ? inval_index_q :
                         ecc_write_req  ? ecc_write_index :
                         fill_grant_ic0 ? fill_index_ic0 :
                                          lookup_index_ic0;
  assign tag_banks_ic0 = ecc_write_req  ? ecc_write_ways :
                         fill_grant_ic0 ? fill_ram_req_way :
                                          {NumWays{1'b1}};
  assign tag_write_ic0 = fill_grant_ic0 | inval_prog_q | ecc_write_req;

  // Dataram
  assign data_req_ic0   = lookup_req_ic0 | fill_req_ic0;
  assign data_index_ic0 = tag_index_ic0;
  assign data_banks_ic0 = tag_banks_ic0;
  assign data_write_ic0 = tag_write_ic0;

  // Append ECC checkbits to write data if required
  if (ICacheECC) begin : gen_ecc_wdata
    // Tagram ECC
    // Reuse the same ecc encoding module for larger cache sizes by padding with zeros
    logic [21:0]          tag_ecc_input_padded;
    logic [27:0]          tag_ecc_output_padded;
    logic [22-TAG_SIZE:0] tag_ecc_output_unused;

    assign tag_ecc_input_padded  = {{22-TAG_SIZE{1'b0}},fill_tag_ic0};
    assign tag_ecc_output_unused = tag_ecc_output_padded[21:TAG_SIZE-1];

    prim_secded_28_22_enc tag_ecc_enc (
      .in  (tag_ecc_input_padded),
      .out (tag_ecc_output_padded)
    );

    assign tag_wdata_ic0 = {tag_ecc_output_padded[27:22],tag_ecc_output_padded[TAG_SIZE-1:0]};

    // Dataram ECC
    prim_secded_72_64_enc data_ecc_enc (
      .in  (fill_wdata_ic0),
      .out (data_wdata_ic0)
    );
  end else begin : gen_noecc_wdata
    assign tag_wdata_ic0  = fill_tag_ic0;
    assign data_wdata_ic0 = fill_wdata_ic0;
  end

  ////////////////
  // IC0 -> IC1 //
  ////////////////

  for (genvar way = 0; way < NumWays; way++) begin : gen_rams
    // Tag RAM instantiation
    prim_ram_1p #(
      .Width           (TAG_SIZE_ECC),
      .Depth           (NUM_LINES),
      .DataBitsPerMask (TAG_SIZE_ECC)
    ) tag_bank (
      .clk_i   (clk_i),
      .req_i   (tag_req_ic0 & tag_banks_ic0[way]),
      .write_i (tag_write_ic0),
      .wmask_i ({TAG_SIZE_ECC{1'b1}}),
      .addr_i  (tag_index_ic0),
      .wdata_i (tag_wdata_ic0),
      .rdata_o (tag_rdata_ic1[way])
    );
    // Data RAM instantiation
    prim_ram_1p #(
      .Width           (LINE_SIZE_ECC),
      .Depth           (NUM_LINES),
      .DataBitsPerMask (LINE_SIZE_ECC)
    ) data_bank (
      .clk_i   (clk_i),
      .req_i   (data_req_ic0 & data_banks_ic0[way]),
      .write_i (data_write_ic0),
      .wmask_i ({LINE_SIZE_ECC{1'b1}}),
      .addr_i  (data_index_ic0),
      .wdata_i (data_wdata_ic0),
      .rdata_o (data_rdata_ic1[way])
    );
  end

  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      lookup_valid_ic1 <= 1'b0;
    end else begin
      lookup_valid_ic1 <= lookup_actual_ic0;
    end
  end

  always_ff @(posedge clk_i) begin
    if (lookup_grant_ic0) begin
      lookup_addr_ic1 <= lookup_addr_ic0[ADDR_W-1:INDEX_HI+1];
      fill_in_ic1     <= fill_alloc_sel;
    end
  end

  ////////////////////////
  // Pipeline stage IC1 //
  ////////////////////////

  // Tag matching
  for (genvar way = 0; way < NumWays; way++) begin : gen_tag_match
    assign tag_match_ic1[way]   = (tag_rdata_ic1[way][TAG_SIZE-1:0] ==
                                   {1'b1,lookup_addr_ic1[ADDR_W-1:INDEX_HI+1]});
    assign tag_invalid_ic1[way] = ~tag_rdata_ic1[way][TAG_SIZE-1];
  end

  assign tag_hit_ic1 = |tag_match_ic1;

  // Hit data mux (one-hot OR mux across the matching ways)
  always_comb begin
    hit_data_ic1 = 'b0;
    for (int way = 0; way < NumWays; way++) begin
      if (tag_match_ic1[way]) begin
        hit_data_ic1 |= data_rdata_ic1[way];
      end
    end
  end

  // Way selection for allocations to the cache (onehot signals)
  // 1 first invalid way
  // 2 global round-robin (pseudorandom) way
  assign lowest_invalid_way_ic1[0] = tag_invalid_ic1[0];
  assign round_robin_way_ic1[0]    = round_robin_way_q[NumWays-1];
  for (genvar way = 1; way < NumWays; way++) begin : gen_lowest_way
    assign lowest_invalid_way_ic1[way] = tag_invalid_ic1[way] & ~|tag_invalid_ic1[way-1:0];
    assign round_robin_way_ic1[way]    = round_robin_way_q[way-1];
  end

  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      round_robin_way_q <= {{NumWays-1{1'b0}},1'b1};
    end else if (lookup_valid_ic1) begin
      round_robin_way_q <= round_robin_way_ic1;
    end
  end

  assign sel_way_ic1 = |tag_invalid_ic1 ? lowest_invalid_way_ic1 :
                                          round_robin_way_q;

  // ECC checking logic
  if (ICacheECC) begin : gen_data_ecc_checking
    logic [NumWays-1:0] tag_err_ic1;
    logic [1:0]         data_err_ic1;
    logic               ecc_correction_write_d, ecc_correction_write_q;
    logic [NumWays-1:0] ecc_correction_ways_d, ecc_correction_ways_q;
    logic [INDEX_W-1:0] lookup_index_ic1, ecc_correction_index_q;

    // Tag ECC checking
    for (genvar way = 0; way < NumWays; way++) begin : gen_tag_ecc
      logic [1:0]  tag_err_bank_ic1;
      logic [27:0] tag_rdata_padded_ic1;

      // Expand the tag rdata with extra padding if the tag size is less than the maximum
      assign tag_rdata_padded_ic1 = {tag_rdata_ic1[way][TAG_SIZE_ECC-1-:6],
                                     {22-TAG_SIZE{1'b0}},
                                     tag_rdata_ic1[way][TAG_SIZE-1:0]};

      prim_secded_28_22_dec data_ecc_dec (
        .in         (tag_rdata_padded_ic1),
        .d_o        (),
        .syndrome_o (),
        .err_o      (tag_err_bank_ic1)
      );
      assign tag_err_ic1[way] = |tag_err_bank_ic1;
    end

    // Data ECC checking
    // Note - could generate for all ways and mux after
    prim_secded_72_64_dec data_ecc_dec (
      .in         (hit_data_ic1),
      .d_o        (),
      .syndrome_o (),
      .err_o      (data_err_ic1)
    );

    assign ecc_err_ic1 = lookup_valid_ic1 & ((|data_err_ic1) | (|tag_err_ic1));

    // Error correction
    // All ways will be invalidated on a tag error to prevent X-propagation from data_err_ic1 on
    // spurious hits. Also prevents the same line being allocated twice when there was a true
    // hit and a spurious hit.
    assign ecc_correction_ways_d  = {NumWays{|tag_err_ic1}} |
                                    (tag_match_ic1 & {NumWays{|data_err_ic1}});
    assign ecc_correction_write_d = ecc_err_ic1;

    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        ecc_correction_write_q <= 1'b0;
      end else begin
        ecc_correction_write_q <= ecc_correction_write_d;
      end
    end

    // The index is required in IC1 only when ECC is configured so is registered here
    always_ff @(posedge clk_i) begin
      if (lookup_grant_ic0) begin
        lookup_index_ic1 <= lookup_addr_ic0[INDEX_HI-:INDEX_W];
      end
    end

    // Store the ways with errors to be invalidated
    always_ff @(posedge clk_i) begin
      if (ecc_err_ic1) begin
        ecc_correction_ways_q  <= ecc_correction_ways_d;
        ecc_correction_index_q <= lookup_index_ic1;
      end
    end

    assign ecc_write_req   = ecc_correction_write_q;
    assign ecc_write_ways  = ecc_correction_ways_q;
    assign ecc_write_index = ecc_correction_index_q;

  end else begin : gen_no_data_ecc
    assign ecc_err_ic1     = 1'b0;
    assign ecc_write_req   = 1'b0;
    assign ecc_write_ways  = '0;
    assign ecc_write_index = '0;
  end

  ///////////////////////////////
  // Cache allocation decision //
  ///////////////////////////////

  if (BranchCache) begin : gen_caching_logic

    // Cache branch target + a number of subsequent lines
    localparam int unsigned CACHE_AHEAD = 2;
    localparam int unsigned CACHE_CNT_W = (CACHE_AHEAD == 1) ? 1 : $clog2(CACHE_AHEAD) + 1;
    logic                   cache_cnt_dec;
    logic [CACHE_CNT_W-1:0] cache_cnt_d, cache_cnt_q;

    assign cache_cnt_dec = lookup_grant_ic0 & (|cache_cnt_q);
    assign cache_cnt_d   = branch_i ? CACHE_AHEAD[CACHE_CNT_W-1:0] :
                                      (cache_cnt_q - {{CACHE_CNT_W-1{1'b0}},cache_cnt_dec});

    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        cache_cnt_q <= '0;
      end else begin
        cache_cnt_q <= cache_cnt_d;
      end
    end

    assign fill_cache_new = (branch_i | (|cache_cnt_q)) & icache_enable_i &
                            ~icache_inval_i & ~inval_prog_q;

  end else begin : gen_cache_all

    // Cache all missing fetches
    assign fill_cache_new = icache_enable_i & ~start_inval & ~inval_prog_q;
  end

  //////////////////////////
  // Fill buffer tracking //
  //////////////////////////

  // Count the non-stale busy fill buffers (used for lookup throttling)
  always_comb begin
    fb_fill_level = '0;
    for (int i = 0; i < NUM_FB; i++) begin
      if (fill_busy_q[i] & ~fill_stale_q[i]) begin
        fb_fill_level += {{$clog2(NUM_FB)-1{1'b0}},1'b1};
      end
    end
  end

  // PMP errors might not / don't need to be granted (since the external request is masked)
  assign gnt_or_pmp_err  = instr_gnt_i | instr_pmp_err_i;
  assign gnt_not_pmp_err = instr_gnt_i & ~instr_pmp_err_i;
  // Allocate a new buffer for every granted lookup
  assign fill_new_alloc = lookup_grant_ic0;
  // Track whether a speculative external request was made from IC0, and whether it was granted
  assign fill_spec_req  = (SpecRequest | branch_i) & ~|fill_ext_req;
  assign fill_spec_done = fill_spec_req & gnt_not_pmp_err;
  assign fill_spec_hold = fill_spec_req & ~gnt_or_pmp_err;

  for (genvar fb = 0; fb < NUM_FB; fb++) begin : gen_fbs

    /////////////////////////////
    // Fill buffer allocations //
    /////////////////////////////

    // Allocate the lowest available buffer
    if (fb == 0) begin : gen_fb_zero
      assign fill_alloc_sel[fb] = ~fill_busy_q[fb];
    end else begin : gen_fb_rest
      assign fill_alloc_sel[fb] = ~fill_busy_q[fb] & (&fill_busy_q[fb-1:0]);
    end

    assign fill_alloc[fb]  = fill_alloc_sel[fb] & fill_new_alloc;
    assign fill_busy_d[fb] = fill_alloc[fb] | (fill_busy_q[fb] & ~fill_done[fb]);

    // Track which other fill buffers are older than this one (for age-based arbitration)
    // TODO sparsify
    assign fill_older_d[fb] = (fill_alloc[fb] ? fill_busy_q : fill_older_q[fb]) & ~fill_done;

    // A fill buffer can release once all its actions are completed
                             // all data written to the cache (unless hit or error)
    assign fill_done[fb] = (fill_ram_done_q[fb] | fill_hit_q[fb] | ~fill_cache_q[fb] |
                            (|fill_err_q[fb])) &
                           // all data output unless stale due to intervening branch
                           (fill_out_done[fb] | fill_stale_q[fb] | branch_i) &
                           // all external requests completed
                           fill_rvd_done[fb];

    /////////////////////////////////
    // Fill buffer status tracking //
    /////////////////////////////////

    // Track staleness (requests become stale when a branch intervenes)
    assign fill_stale_d[fb] = fill_busy_q[fb] & (branch_i | fill_stale_q[fb]);
    // Track whether or not this request should allocate to the cache
    // Any invalidation or disabling of the cache while the buffer is busy will stop allocation
    assign fill_cache_d[fb] = (fill_alloc[fb] & fill_cache_new) |
                              (fill_cache_q[fb] & fill_busy_q[fb] &
                               icache_enable_i & ~icache_inval_i);
    // Record whether the request hit in the cache
    assign fill_hit_ic1[fb] = lookup_valid_ic1 & fill_in_ic1[fb] & tag_hit_ic1 & ~ecc_err_ic1;
    assign fill_hit_d[fb]   = fill_hit_ic1[fb] | (fill_hit_q[fb] & fill_busy_q[fb]);

    ///////////////////////////////////////////
    // Fill buffer external request tracking //
    ///////////////////////////////////////////

    // Make an external request
    assign fill_ext_req[fb] = fill_busy_q[fb] & ~fill_ext_done[fb];

    // Count the number of completed external requests (each line requires LINE_BEATS requests)
    // Don't count fake PMP error grants here since they will never receive an rvalid response
    assign fill_ext_cnt_d[fb] = fill_alloc[fb] ?
                                  {{LINE_BEATS_W{1'b0}},fill_spec_done} :
                                  (fill_ext_cnt_q[fb] +
                                   {{LINE_BEATS_W{1'b0}}, fill_ext_arb[fb] & gnt_not_pmp_err});
    // External request must be held until granted
    assign fill_ext_hold_d[fb] = (fill_alloc[fb] & fill_spec_hold) |
                                 (fill_ext_arb[fb] & ~gnt_or_pmp_err);
    // External requests are completed when the counter is filled or when the request is cancelled
    assign fill_ext_done[fb] = (fill_ext_cnt_q[fb][LINE_BEATS_W] |
                                // external requests are considered complete if the request hit
                                fill_hit_ic1[fb] | fill_hit_q[fb] |
                                // external requests will stop once any PMP error is received
                                fill_err_q[fb][fill_ext_off[fb]] |
                                // cancel if the line is stale and won't be cached
                                (~fill_cache_q[fb] & (branch_i | fill_stale_q[fb]))) &
                               // can't cancel while we are waiting for a grant on the bus
                               ~fill_ext_hold_q[fb];
    // Track whether this fill buffer expects to receive beats of data
    assign fill_rvd_exp[fb] = fill_busy_q[fb] & ~fill_rvd_done[fb];
    // Count the number of rvalid beats received
    assign fill_rvd_cnt_d[fb] = fill_alloc[fb] ? '0 :
                                                 (fill_rvd_cnt_q[fb] +
                                                  {{LINE_BEATS_W{1'b0}},fill_rvd_arb[fb]});
    // External data is complete when all issued external requests have received their data
    assign fill_rvd_done[fb] = fill_ext_done[fb] & (fill_rvd_cnt_q[fb] == fill_ext_cnt_q[fb]);

    //////////////////////////////////////
    // Fill buffer data output tracking //
    //////////////////////////////////////

    // Send data to the IF stage for requests that are not stale, have not completed their
    // data output, and have data available to send.
    // Data is available if:
    // - The request hit in the cache
    // - The current beat is an error (since a PMP error might not actually receive any data)
    // - Buffered data is available (fill_rvd_cnt_q is ahead of fill_out_cnt_q)
    // - Data is available from the bus this cycle (fill_rvd_arb)
    assign fill_out_req[fb] = fill_busy_q[fb] & ~fill_stale_q[fb] & ~fill_out_done[fb] &
                              (fill_hit_ic1[fb] | fill_hit_q[fb] |
                               (fill_err_q[fb][fill_out_cnt_q[fb][LINE_BEATS_W-1:0]]) |
                               (fill_rvd_beat[fb] > fill_out_cnt_q[fb]) | fill_rvd_arb[fb]);

    // Calculate when a beat of data is output. Any ECC error squashes the output that cycle.
    assign fill_out_grant[fb] = fill_out_arb[fb] & output_ready;

    // Count the beats of data output to the IF stage
    assign fill_out_cnt_d[fb] = fill_alloc[fb] ? {1'b0,lookup_addr_ic0[LINE_W-1:BUS_W]} :
                                                 (fill_out_cnt_q[fb] +
                                                  {{LINE_BEATS_W{1'b0}},fill_out_grant[fb]});
    // Data output complete when the counter fills
    assign fill_out_done[fb] = fill_out_cnt_q[fb][LINE_BEATS_W];

    //////////////////////////////////////
    // Fill buffer ram request tracking //
    //////////////////////////////////////

    // make a fill request once all data beats received
    assign fill_ram_req[fb] = fill_busy_q[fb] & fill_rvd_cnt_q[fb][LINE_BEATS_W] &
                              // unless the request hit, was non-allocating or got an error
                              ~fill_hit_q[fb] & fill_cache_q[fb] & ~|fill_err_q[fb] &
                              // or the request was already completed
                              ~fill_ram_done_q[fb];

    // Record when a cache allocation request has been completed
    assign fill_ram_done_d[fb] = fill_ram_arb[fb] | (fill_ram_done_q[fb] & fill_busy_q[fb]);

    //////////////////////////////
    // Fill buffer line offsets //
    //////////////////////////////

    // When we branch into the middle of a line, the output count will not start from zero. This
    // beat count is used to know which incoming rdata beats are relevant.
    assign fill_rvd_beat[fb] = {1'b0,fill_addr_q[fb][LINE_W-1:BUS_W]} +
                               fill_rvd_cnt_q[fb][LINE_BEATS_W:0];
    assign fill_ext_off[fb]  = fill_addr_q[fb][LINE_W-1:BUS_W] +
                               fill_ext_cnt_q[fb][LINE_BEATS_W-1:0];
    assign fill_rvd_off[fb]  = fill_rvd_beat[fb][LINE_BEATS_W-1:0];

    /////////////////////////////
    // Fill buffer arbitration //
    /////////////////////////////

    // Age based arbitration - all these signals are one-hot
    assign fill_ext_arb[fb]  = fill_ext_req[fb] & ~|(fill_ext_req & fill_older_q[fb]);
    assign fill_ram_arb[fb]  = fill_ram_req[fb] & fill_grant_ic0 &
                               ~|(fill_ram_req & fill_older_q[fb]);
    // Calculate which fill buffer is the oldest one which still needs to output data to IF
    assign fill_data_sel[fb] = ~|(fill_busy_q & ~fill_out_done & ~fill_stale_q &
                                  fill_older_q[fb]);
    // Arbitrate the request which has data available to send, and is the oldest outstanding
    assign fill_out_arb[fb]  = fill_out_req[fb] & fill_data_sel[fb];
    // Assign incoming rvalid data to the oldest fill buffer expecting it
    assign fill_rvd_arb[fb]  = instr_rvalid_i & fill_rvd_exp[fb] &
                               ~|(fill_rvd_exp & fill_older_q[fb]);

    /////////////////////////////
    // Fill buffer data muxing //
    /////////////////////////////

    // Output data muxing controls
    // 1. Select data from the fill buffer data register
    assign fill_data_reg[fb] = fill_busy_q[fb] & ~fill_stale_q[fb] &
                               ~fill_out_done[fb] & fill_data_sel[fb] &
                               // The incoming data is already ahead of the output count
                               ((fill_rvd_beat[fb] > fill_out_cnt_q[fb]) | fill_hit_q[fb] |
                                (|fill_err_q[fb]));
    // 2. Select IC1 hit data
    assign fill_data_hit[fb] = fill_busy_q[fb] & fill_hit_ic1[fb] & fill_data_sel[fb];
    // 3. Select incoming instr_rdata_i
    assign fill_data_rvd[fb] = fill_busy_q[fb] & fill_rvd_arb[fb] & ~fill_hit_q[fb] &
                               ~fill_hit_ic1[fb] & ~fill_stale_q[fb] & ~fill_out_done[fb] &
                               // The incoming data lines up with the output count
                               (fill_rvd_beat[fb] == fill_out_cnt_q[fb]) & fill_data_sel[fb];

    ///////////////////////////
    // Fill buffer registers //
    ///////////////////////////

    // Fill buffer general enable
    assign fill_entry_en[fb] = fill_alloc[fb] | fill_busy_q[fb];

    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        fill_busy_q[fb]     <= 1'b0;
        fill_older_q[fb]    <= '0;
        fill_stale_q[fb]    <= 1'b0;
        fill_cache_q[fb]    <= 1'b0;
        fill_hit_q[fb]      <= 1'b0;
        fill_ext_cnt_q[fb]  <= '0;
        fill_ext_hold_q[fb] <= 1'b0;
        fill_rvd_cnt_q[fb]  <= '0;
        fill_ram_done_q[fb] <= 1'b0;
        fill_out_cnt_q[fb]  <= '0;
      end else if (fill_entry_en[fb]) begin
        fill_busy_q[fb]     <= fill_busy_d[fb];
        fill_older_q[fb]    <= fill_older_d[fb];
        fill_stale_q[fb]    <= fill_stale_d[fb];
        fill_cache_q[fb]    <= fill_cache_d[fb];
        fill_hit_q[fb]      <= fill_hit_d[fb];
        fill_ext_cnt_q[fb]  <= fill_ext_cnt_d[fb];
        fill_ext_hold_q[fb] <= fill_ext_hold_d[fb];
        fill_rvd_cnt_q[fb]  <= fill_rvd_cnt_d[fb];
        fill_ram_done_q[fb] <= fill_ram_done_d[fb];
        fill_out_cnt_q[fb]  <= fill_out_cnt_d[fb];
      end
    end

    ////////////////////////////////////////
    // Fill buffer address / data storage //
    ////////////////////////////////////////

    assign fill_addr_en[fb] = fill_alloc[fb];
    assign fill_way_en[fb]  = (lookup_valid_ic1 & fill_in_ic1[fb]);

    always_ff @(posedge clk_i) begin
      if (fill_addr_en[fb]) begin
        fill_addr_q[fb] <= lookup_addr_ic0;
      end
    end

    always_ff @(posedge clk_i) begin
      if (fill_way_en[fb]) begin
        fill_way_q[fb] <= sel_way_ic1;
      end
    end

    // Data either comes from the cache or the bus. If there was an ECC error, we must take
    // the incoming bus data since the cache hit data is corrupted.
    assign fill_data_d[fb] = fill_hit_ic1[fb] ? hit_data_ic1[LineSize-1:0] :
                                                {LINE_BEATS{instr_rdata_i}};

    for (genvar b = 0; b < LINE_BEATS; b++) begin : gen_data_buf
      // Error tracking (per beat)
      //                           Either a PMP error on a speculative request,
      assign fill_err_d[fb][b] = (instr_pmp_err_i & fill_alloc[fb] & fill_spec_req &
                                  (lookup_addr_ic0[LINE_W-1:BUS_W] == b[LINE_BEATS_W-1:0])) |
                                 // a PMP error on a fill buffer ext req
                                 (instr_pmp_err_i & fill_ext_arb[fb] &
                                  (fill_ext_off[fb] == b[LINE_BEATS_W-1:0])) |
                                 // Or a data error with instr_rvalid_i
                                 (fill_rvd_arb[fb] & instr_err_i &
                                  (fill_rvd_off[fb] == b[LINE_BEATS_W-1:0])) |
                                 // Hold the error once recorded
                                 (fill_busy_q[fb] & fill_err_q[fb][b]);

      always_ff @(posedge clk_i or negedge rst_ni) begin
        if (!rst_ni) begin
          fill_err_q[fb][b] <= '0;
        end else if (fill_entry_en[fb]) begin
          fill_err_q[fb][b] <= fill_err_d[fb][b];
        end
      end

      // Enable the relevant part of the data register (or all for cache hits)
      // Ignore incoming rvalid data when we already have cache hit data
      assign fill_data_en[fb][b] = fill_hit_ic1[fb] |
                                   (fill_rvd_arb[fb] & ~fill_hit_q[fb] &
                                    (fill_rvd_off[fb] == b[LINE_BEATS_W-1:0]));

      always_ff @(posedge clk_i) begin
        if (fill_data_en[fb][b]) begin
          fill_data_q[fb][b*BusWidth+:BusWidth] <= fill_data_d[fb][b*BusWidth+:BusWidth];
        end
      end

    end
  end

  ////////////////////////////////
  // Fill buffer one-hot muxing //
  ////////////////////////////////

  // External req info
  always_comb begin
    fill_ext_req_addr = '0;
    for (int i = 0; i < NUM_FB; i++) begin
      if (fill_ext_arb[i]) begin
        fill_ext_req_addr |= {fill_addr_q[i][ADDR_W-1:LINE_W], fill_ext_off[i]};
      end
    end
  end

  // Cache req info
  always_comb begin
    fill_ram_req_addr = '0;
    fill_ram_req_way  = '0;
    fill_ram_req_data = '0;
    for (int i = 0; i < NUM_FB; i++) begin
      if (fill_ram_arb[i]) begin
        fill_ram_req_addr |= fill_addr_q[i];
        fill_ram_req_way  |= fill_way_q[i];
        fill_ram_req_data |= fill_data_q[i];
      end
    end
  end

  // IF stage output data
  always_comb begin
    fill_out_data = '0;
    fill_out_err  = '0;
    for (int i = 0; i < NUM_FB; i++) begin
      if (fill_data_reg[i]) begin
        fill_out_data |= fill_data_q[i];
        // Ignore any speculative errors accumulated on cache hits
        fill_out_err  |= (fill_err_q[i] & ~{LINE_BEATS{fill_hit_q[i]}});
      end
    end
  end

  ///////////////////////
  // External requests //
  ///////////////////////

  assign instr_req  = ((SpecRequest | branch_i) & lookup_grant_ic0) |
                      |fill_ext_req;

  assign instr_addr = |fill_ext_req ? fill_ext_req_addr :
                                      lookup_addr_ic0[ADDR_W-1:BUS_W];

  assign instr_req_o  = instr_req;
  assign instr_addr_o = {instr_addr[ADDR_W-1:BUS_W],{BUS_W{1'b0}}};

  ////////////////////////
  // Output data muxing //
  ////////////////////////

  // Mux between line-width data sources
  assign line_data = |fill_data_hit ? hit_data_ic1[LineSize-1:0] : fill_out_data;
  assign line_err  = |fill_data_hit ? {LINE_BEATS{1'b0}} : fill_out_err;

  // Mux the relevant beat of line data, based on the output address
  always_comb begin
    line_data_muxed = '0;
    line_err_muxed  = 1'b0;
    for (int i = 0; i < LINE_BEATS; i++) begin
      // When data has been skidded, the output address is behind by one
      if ((output_addr_q[LINE_W-1:BUS_W] + {{LINE_BEATS_W-1{1'b0}},skid_valid_q}) ==
          i[LINE_BEATS_W-1:0]) begin
        line_data_muxed |= line_data[i*32+:32];
        line_err_muxed  |= line_err[i];
      end
    end
  end

  // Mux between incoming rdata and the muxed line data
  assign output_data = |fill_data_rvd ? instr_rdata_i : line_data_muxed;
  assign output_err  = |fill_data_rvd ? instr_err_i : line_err_muxed;

  // Output data is valid (from any of the three possible sources). Note that fill_out_arb
  // must be used here rather than fill_out_req because data can become valid out of order
  // (e.g. cache hit data can become available ahead of an older outstanding miss).
  assign data_valid = |fill_out_arb;

  // Skid buffer data
  // The skid buffer holds the upper halfword of an output word so that an unaligned
  // (halfword-straddling) instruction can be assembled across two output words.
  assign skid_data_d = output_data[31:16];

  assign skid_en = data_valid & (ready_i | skid_ready);

  always_ff @(posedge clk_i) begin
    if (skid_en) begin
      skid_data_q <= skid_data_d;
      skid_err_q  <= output_err;
    end
  end

  // The data in the skid buffer is ready if it's a complete compressed instruction or if there's
  // an error (no need to wait for the second half)
  assign skid_complete_instr = skid_valid_q & ((skid_data_q[1:0] != 2'b11) | skid_err_q);

  // Data can be loaded into the skid buffer for an unaligned uncompressed instruction
  assign skid_ready = output_addr_q[1] & ~skid_valid_q & (~output_compressed | output_err);

  assign output_ready = (ready_i | skid_ready) & ~skid_complete_instr;

  assign output_compressed = (rdata_o[1:0] != 2'b11);

  assign skid_valid_d =
      // Branches invalidate the skid buffer
      branch_i ? 1'b0 :
      // Once valid, the skid buffer stays valid until a compressed instruction realigns the stream
      (skid_valid_q ? ~(ready_i & ((skid_data_q[1:0] != 2'b11) | skid_err_q)) :
      // The skid buffer becomes valid when:
                      // - we branch to an unaligned uncompressed instruction
                      (((output_addr_q[1] & (~output_compressed | output_err)) |
                        // - a compressed instruction misaligns the stream
                        (~output_addr_q[1] & output_compressed & ~output_err & ready_i)) &
                       data_valid));

  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      skid_valid_q <= 1'b0;
    end else begin
      skid_valid_q <= skid_valid_d;
    end
  end

  // Signal that valid data is available to the IF stage
  // Note that if the first half of an unaligned instruction reports an error, we do not need
  // to wait for the second half (and for PMP errors we might not have fetched the second half)
  //                    Compressed instruction completely satisfied by skid buffer
  assign output_valid = skid_complete_instr |
                        // Output data available and, output stream aligned, or skid data available,
                        (data_valid & (~output_addr_q[1] | skid_valid_q |
                                       // or this is an error or an unaligned compressed instruction
                                       output_err | (output_data[17:16] != 2'b11)));

  // Update the address on branches and every time an instruction is driven
  assign output_addr_en = branch_i | (ready_i & valid_o);

  // Increment the address by two every time a compressed instruction is popped
  assign addr_incr_two = output_compressed & ~err_o;

  assign output_addr_d = branch_i ? addr_i[31:1] :
                                    (output_addr_q[31:1] +
                                     // Increment address by 4 or 2
                                     {29'd0, ~addr_incr_two, addr_incr_two});

  always_ff @(posedge clk_i) begin
    if (output_addr_en) begin
      output_addr_q <= output_addr_d;
    end
  end

  // Mux the data from BusWidth to halfword
  // This muxing realigns data when instruction words are split across BUS_W e.g.
  // word 1 |----|*h1*|
  // word 0 |*h0*|----| --> |*h1*|*h0*|
  //        31   15   0     31   15   0
  always_comb begin
    output_data_lo = '0;
    for (int i = 0; i < OUTPUT_BEATS; i++) begin
      if (output_addr_q[BUS_W-1:1] == i[BUS_W-2:0]) begin
        output_data_lo |= output_data[i*16+:16];
      end
    end
  end

  always_comb begin
    output_data_hi = '0;
    for (int i = 0; i < OUTPUT_BEATS-1; i++) begin
      if (output_addr_q[BUS_W-1:1] == i[BUS_W-2:0]) begin
        output_data_hi |= output_data[(i+1)*16+:16];
      end
    end
    // The top halfword of the last word in a fetch wraps to the bottom of the next fetch
    if (&output_addr_q[BUS_W-1:1]) begin
      output_data_hi |= output_data[15:0];
    end
  end

  assign valid_o     = output_valid;
  assign rdata_o     = {output_data_hi, (skid_valid_q ? skid_data_q : output_data_lo)};
  assign addr_o      = {output_addr_q, 1'b0};
  assign err_o       = (skid_valid_q & skid_err_q) | (~skid_complete_instr & output_err);
  // Error caused by the second half of a misaligned uncompressed instruction
  // (only relevant when err_o is set)
  assign err_plus2_o = skid_valid_q & ~skid_err_q;

  ///////////////////
  // Invalidations //
  ///////////////////

  // Invalidate on reset, or when instructed. If an invalidation request is received while a
  // previous invalidation is ongoing, it does not need to be restarted.
  assign start_inval   = (~reset_inval_q | icache_inval_i) & ~inval_prog_q;
  assign inval_prog_d  = start_inval | (inval_prog_q & ~inval_done);
  assign inval_done    = &inval_index_q;
  assign inval_index_d = start_inval ? '0 :
                                       (inval_index_q + {{INDEX_W-1{1'b0}},1'b1});

  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      inval_prog_q  <= 1'b0;
      reset_inval_q <= 1'b0;
    end else begin
      inval_prog_q  <= inval_prog_d;
      reset_inval_q <= 1'b1;
    end
  end

  always_ff @(posedge clk_i) begin
    if (inval_prog_d) begin
      inval_index_q <= inval_index_d;
    end
  end

  /////////////////
  // Busy status //
  /////////////////

  // Only busy (for WFI purposes) while an invalidation is in-progress, or external requests are
  // outstanding.
  assign busy_o = inval_prog_q | (|(fill_busy_q & ~fill_rvd_done));

  ////////////////
  // Assertions //
  ////////////////

  `ASSERT_INIT(size_param_legal, (LineSize > 32))

  // ECC primitives will need to be changed for different sizes
  `ASSERT_INIT(ecc_tag_param_legal, (TAG_SIZE <= 27))
  `ASSERT_INIT(ecc_data_param_legal, (LineSize <= 121))

  // Lookups in the tag ram should always give a known result
  `ASSERT_KNOWN(TagHitKnown, lookup_valid_ic1 & tag_hit_ic1)
  `ASSERT_KNOWN(TagInvalidKnown, lookup_valid_ic1 & tag_invalid_ic1)

  // This is only used for the Yosys-based formal flow. Once we have working bind support, we can
  // get rid of it.
`ifdef FORMAL
 `ifdef YOSYS
  // Unfortunately, Yosys doesn't support passing unpacked arrays as ports. Explicitly pack up the
  // signals we need.
  logic [NUM_FB-1:0][ADDR_W-1:0] packed_fill_addr_q;
  always_comb begin
    for (int i = 0; i < NUM_FB; i++) begin
      packed_fill_addr_q[i][ADDR_W-1:0] = fill_addr_q[i];
    end
  end

 `include "formal_tb_frag.svh"
 `endif
`endif

endmodule