adding cache replacement policy

This commit is contained in:
Blaise Tine 2024-10-15 00:28:09 -07:00
parent 37757fab8f
commit 03a1e25828
12 changed files with 292 additions and 39 deletions

View file

@ -99,6 +99,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.WRITE_ENABLE (1),
.WRITEBACK (`L2_WRITEBACK),
.DIRTY_BYTES (`L2_DIRTYBYTES),
.REPL_POLICY (`L2_REPL_POLICY),
.UUID_WIDTH (`UUID_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.CORE_OUT_BUF (3),

View file

@ -151,6 +151,10 @@
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`endif
`ifndef MEMORY_BANKS
`define MEMORY_BANKS 2
`endif
`ifdef XLEN_64
`ifndef STACK_BASE_ADDR
@ -483,6 +487,11 @@
`define ICACHE_NUM_WAYS 4
`endif
// Replacement Policy
`ifndef ICACHE_REPL_POLICY
`define ICACHE_REPL_POLICY 1
`endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
// Cache Enable
@ -547,6 +556,11 @@
`define DCACHE_DIRTYBYTES `DCACHE_WRITEBACK
`endif
// Replacement Policy
`ifndef DCACHE_REPL_POLICY
`define DCACHE_REPL_POLICY 1
`endif
// LMEM Configurable Knobs ////////////////////////////////////////////////////
`ifndef LMEM_DISABLE
@ -612,6 +626,11 @@
`define L2_DIRTYBYTES `L2_WRITEBACK
`endif
// Replacement Policy
`ifndef L2_REPL_POLICY
`define L2_REPL_POLICY 1
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
@ -659,8 +678,9 @@
`define L3_DIRTYBYTES `L3_WRITEBACK
`endif
`ifndef MEMORY_BANKS
`define MEMORY_BANKS 2
// Replacement Policy
`ifndef L3_REPL_POLICY
`define L3_REPL_POLICY 1
`endif
// Number of Memory Ports from LLC

View file

@ -222,7 +222,7 @@ endgenerate
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
`define UP(x) (((x) != 0) ? (x) : 1)
`define UP(x) (((x) > 0) ? (x) : 1)
`define CDIV(n,d) ((n + d - 1) / (d))

View file

@ -103,6 +103,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.FLAGS_WIDTH (0),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (0),
.REPL_POLICY (`ICACHE_REPL_POLICY),
.NC_ENABLE (0),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (2)
@ -151,6 +152,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.WRITE_ENABLE (1),
.WRITEBACK (`DCACHE_WRITEBACK),
.DIRTY_BYTES (`DCACHE_DIRTYBYTES),
.REPL_POLICY (`DCACHE_REPL_POLICY),
.NC_ENABLE (1),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (2)

View file

@ -85,6 +85,7 @@ module Vortex import VX_gpu_pkg::*; (
.WRITE_ENABLE (1),
.WRITEBACK (`L3_WRITEBACK),
.DIRTY_BYTES (`L3_DIRTYBYTES),
.REPL_POLICY (`L3_REPL_POLICY),
.UUID_WIDTH (`UUID_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.CORE_OUT_BUF (3),

View file

@ -48,6 +48,9 @@ module VX_cache import VX_gpu_pkg::*; #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_CYCLIC,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -393,12 +396,13 @@ module VX_cache import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.REPL_POLICY (REPL_POLICY),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.DIRTY_BYTES (DIRTY_BYTES),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.FLAGS_WIDTH (FLAGS_WIDTH),

View file

@ -47,6 +47,9 @@ module VX_cache_bank #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_CYCLIC,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -324,6 +327,14 @@ module VX_cache_bank #(
wire do_write_st0 = valid_st0 && is_write_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_read_st1 = valid_st1 && is_read_st1;
wire do_write_st1 = valid_st1 && is_write_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1 && WRITEBACK;
assign write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
assign line_idx_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
@ -331,8 +342,32 @@ module VX_cache_bank #(
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st1;
wire [NUM_WAYS-1:0] tag_matches_st1;
wire is_hit_st1 = (| tag_matches_st1);
wire do_lookup_st0 = do_read_st0 || do_write_st0;
reg [NUM_WAYS-1:0] victim_way_st0;
VX_cache_repl #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.REPL_POLICY (REPL_POLICY)
) cache_repl (
.clk (clk),
.reset (reset),
.stall (pipe_stall),
.hit_valid ((do_read_st1 || do_write_st1) && is_hit_st1),
.hit_line (line_idx_st1),
.hit_way (tag_matches_st1),
.repl_valid (do_fill_st0),
.repl_line (line_idx_st0),
.repl_way (victim_way_st0)
);
assign evict_way_st0 = is_fill_st0 ? victim_way_st0 : flush_way_st0;
VX_cache_tags #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
@ -350,12 +385,11 @@ module VX_cache_bank #(
.fill (do_fill_st0 && ~pipe_stall),
.lookup (do_lookup_st0 && ~pipe_stall),
.line_addr (addr_st0),
.flush_way (flush_way_st0),
.evict_way (evict_way_st0),
// outputs
.tag_matches_r(tag_matches_st1),
.line_tag_r (line_tag_st1),
.evict_tag_r(evict_tag_st1),
.evict_way (evict_way_st0),
.evict_way_r(evict_way_st1)
);
@ -374,23 +408,12 @@ module VX_cache_bank #(
.data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, rw_st1, flags_st1, line_idx_st1, data_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_id_st1, mshr_pending_st1})
);
// we have a tag hit
wire is_hit_st1 = (| tag_matches_st1);
if (UUID_WIDTH != 0) begin : g_req_uuid_st1
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_req_uuid_st1_0
assign req_uuid_st1 = '0;
end
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_read_st1 = valid_st1 && is_read_st1;
wire do_write_st1 = valid_st1 && is_write_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1 && WRITEBACK;
assign addr_st1 = {line_tag_st1, line_idx_st1};
// ensure mshr replay always get a hit

View file

@ -52,6 +52,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_CYCLIC,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -150,13 +153,14 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.REPL_POLICY (REPL_POLICY),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.FLAGS_WIDTH (FLAGS_WIDTH),

View file

@ -73,4 +73,10 @@
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1))
///////////////////////////////////////////////////////////////////////////////
`define CS_REPL_RANDOM 0
`define CS_REPL_CYCLIC 1
`define CS_REPL_PLRU 2
`endif // VX_CACHE_DEFINE_VH

200
hw/rtl/cache/VX_cache_repl.sv vendored Normal file
View file

@ -0,0 +1,200 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Fast PLRU encoder and decoder utility
// Adapted from BaseJump STL: http://bjump.org/data_out.html
// plru_decoder: computes the next pseudo-LRU tree state after an access.
//
// The tree is stored as a heap: node 0 is the root and node i has children
// 2*i+1 (odd index) and 2*i+2 (even index). Given the accessed way index, every
// node on the root-to-leaf path of that way is overwritten with the inverse of
// the way_idx bit consumed at that node's level; all other nodes keep the
// value supplied on lru_in.
//
// Parameters:
//   NUM_WAYS      - number of ways tracked by the tree
//   WAY_IDX_BITS  - $clog2(NUM_WAYS)
//   WAY_IDX_WIDTH - port width of way_idx (at least 1)
// Ports:
//   way_idx - index of the accessed way
//   lru_in  - current tree bits
//   lru_out - updated tree bits (all zeros when NUM_WAYS == 1)
module plru_decoder #(
    parameter NUM_WAYS      = 1,
    parameter WAY_IDX_BITS  = $clog2(NUM_WAYS),
    parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS)
) (
    input wire [WAY_IDX_WIDTH-1:0]    way_idx,
    input wire [`UP(NUM_WAYS-1)-1:0]  lru_in,
    output wire [`UP(NUM_WAYS-1)-1:0] lru_out
);
    if (NUM_WAYS != 1) begin : g_plru_decoder
        // value each node takes if it lies on the accessed path
        wire [`UP(NUM_WAYS-1)-1:0] node_val;
        // on_path[i] is set when node i lies on the path to way_idx;
        // each bit is derived from its parent's bit, hence the lint guard
    `IGNORE_UNOPTFLAT_BEGIN
        wire [`UP(NUM_WAYS-1)-1:0] on_path;
    `IGNORE_UNOPTFLAT_END
        for (genvar i = 0; i < NUM_WAYS-1; ++i) begin : g_i
            // way_idx bit consumed at the level node i belongs to
            localparam LEVEL_BIT = WAY_IDX_BITS - $clog2(i+2);
            if (i == 0) begin : g_i_0
                // the root is on every path
                assign on_path[i] = 1'b1;
            end else if ((i & 1) != 0) begin : g_i_odd
                // odd child: reached when the parent's way_idx bit is 0
                assign on_path[i] = ~way_idx[LEVEL_BIT+1] & on_path[(i-1)/2];
            end else begin : g_i_even
                // even child: reached when the parent's way_idx bit is 1
                assign on_path[i] = way_idx[LEVEL_BIT+1] & on_path[(i-2)/2];
            end
            assign node_val[i] = ~way_idx[LEVEL_BIT];
            // overwrite nodes on the path, keep the rest
            assign lru_out[i] = (node_val[i] & on_path[i]) | (lru_in[i] & ~on_path[i]);
        end
    end else begin : g_plru_decoder_1
        // single way: there is no tree to update
        `UNUSED_VAR (way_idx)
        `UNUSED_VAR (lru_in)
        assign lru_out = '0;
    end
endmodule
// plru_encoder: walks the pseudo-LRU tree from the root and returns the index
// of the way the tree currently points at.
//
// The tree is stored as a heap in lru_in: level i occupies the 2**i bits that
// start at bit (2**i)-1. The most significant bit of way_idx is the root bit;
// each following bit is the tree bit of the node selected by the bits already
// resolved above it.
//
// Parameters:
//   NUM_WAYS      - number of ways tracked by the tree
//   WAY_IDX_BITS  - $clog2(NUM_WAYS)
//   WAY_IDX_WIDTH - port width of way_idx (at least 1)
// Ports:
//   lru_in  - current tree bits
//   way_idx - selected way index (zero when NUM_WAYS == 1)
module plru_encoder #(
    parameter NUM_WAYS      = 1,
    parameter WAY_IDX_BITS  = $clog2(NUM_WAYS),
    parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS)
) (
    input wire [`UP(NUM_WAYS-1)-1:0] lru_in,
    output wire [WAY_IDX_WIDTH-1:0]  way_idx
);
    if (NUM_WAYS != 1) begin : g_plru_encoder
        // lower bits of tmp are computed from its upper bits, hence the lint guard
    `IGNORE_UNOPTFLAT_BEGIN
        wire [WAY_IDX_WIDTH-1:0] tmp;
    `IGNORE_UNOPTFLAT_END
        for (genvar i = 0; i < WAY_IDX_WIDTH; ++i) begin : g_i
            if (i == 0) begin : g_i_0
                assign tmp[WAY_IDX_WIDTH-1] = lru_in[0];
            end else begin : g_i_n
                // first tree bit of level i
                localparam LEVEL_BASE = (1 << i) - 1;
                // position inside level i, given by the i bits resolved so far
                wire [i-1:0] level_sel = tmp[WAY_IDX_WIDTH-1 -: i];
                // A single computed bit-select is used instead of selecting a
                // bit out of a part-select (lru_in[base +: w][sel]): the
                // SystemVerilog select grammar only allows a part-select range
                // as the last select. LEVEL_BASE + level_sel never exceeds
                // NUM_WAYS-2, so it fits in WAY_IDX_WIDTH bits.
                wire [WAY_IDX_WIDTH-1:0] node_idx = WAY_IDX_WIDTH'(LEVEL_BASE) + WAY_IDX_WIDTH'(level_sel);
                assign tmp[WAY_IDX_WIDTH-1-i] = lru_in[node_idx];
            end
        end
        assign way_idx = tmp;
    end else begin : g_plru_encoder_1
        // single way: the only possible answer is way 0
        `UNUSED_VAR (lru_in)
        assign way_idx = '0;
    end
endmodule
// VX_cache_repl: selects the way to replace for a cache line, as a one-hot
// mask, according to REPL_POLICY.
//
//   `CS_REPL_PLRU   - per-line pseudo-LRU tree, updated on every reported hit
//   `CS_REPL_CYCLIC - per-line counter, advanced on every accepted replacement
//   otherwise       - one global one-hot token rotated every unstalled cycle
//
// Ports:
//   stall      - pipeline stall; freezes the cyclic counter and the rotating token
//   hit_valid  - a lookup hit is being reported this cycle
//   hit_line   - line index of the hit
//   hit_way    - way that hit (fed to a one-hot encoder)
//   repl_valid - a replacement is being requested this cycle
//   repl_line  - line index to replace in
//   repl_way   - one-hot victim way for repl_line
//
// NOTE(review): the PLRU and cyclic policies appear to assume NUM_WAYS is a
// power of two (decoder sized by $clog2(NUM_WAYS)) - confirm against VX_decoder.
module VX_cache_repl #(
    parameter CACHE_SIZE = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE  = 64,
    // Number of banks
    parameter NUM_BANKS  = 1,
    // Number of associative ways
    parameter NUM_WAYS   = 1,
    // replacement policy
    parameter REPL_POLICY = `CS_REPL_CYCLIC
) (
    input wire clk,
    input wire reset,
    input wire stall,
    input wire hit_valid,
    input wire [`CS_LINE_SEL_BITS-1:0] hit_line,
    input wire [NUM_WAYS-1:0] hit_way,
    input wire repl_valid,
    input wire [`CS_LINE_SEL_BITS-1:0] repl_line,
    output wire [NUM_WAYS-1:0] repl_way
);
    localparam WAY_IDX_BITS  = $clog2(NUM_WAYS);
    localparam WAY_IDX_WIDTH = `UP(WAY_IDX_BITS);

    if (NUM_WAYS == 1) begin : g_single_way

        // Direct-mapped: the only way is always the victim, whatever the policy.
        // Handled up front so that no zero-width decoder or toggling 1-bit
        // counter is generated for this configuration.
        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)
        `UNUSED_VAR (stall)
        `UNUSED_VAR (hit_valid)
        `UNUSED_VAR (hit_line)
        `UNUSED_VAR (hit_way)
        `UNUSED_VAR (repl_valid)
        `UNUSED_VAR (repl_line)
        assign repl_way = 1'b1;

    end else if (REPL_POLICY == `CS_REPL_PLRU) begin : g_plru

        // Pseudo Least Recently Used replacement policy
        localparam LRU_WIDTH = NUM_WAYS-1;
        `UNUSED_VAR (stall)
        `UNUSED_VAR (repl_valid)

        // one tree per line
        reg [`CS_LINES_PER_BANK-1:0][`UP(LRU_WIDTH)-1:0] plru_tree;

        wire [WAY_IDX_WIDTH-1:0] repl_way_idx;
        wire [WAY_IDX_WIDTH-1:0] hit_way_idx;
        wire [`UP(LRU_WIDTH)-1:0] plru_update;

        always @(posedge clk) begin
            if (reset) begin
                plru_tree <= '0;
            end else begin
                // a repeated hit report writes the same tree value again,
                // so no stall gating is needed here
                if (hit_valid) begin
                    plru_tree[hit_line] <= plru_update;
                end
            end
        end

        // one-hot hit way -> way index
        VX_onehot_encoder #(
            .N (NUM_WAYS)
        ) hit_way_enc (
            .data_in  (hit_way),
            .data_out (hit_way_idx),
            `UNUSED_PIN (valid_out)
        );

        // next tree state for the line that hit
        plru_decoder #(
            .NUM_WAYS (NUM_WAYS)
        ) plru_dec (
            .way_idx (hit_way_idx),
            .lru_in  (plru_tree[hit_line]),
            .lru_out (plru_update)
        );

        // victim way index for the line being replaced
        plru_encoder #(
            .NUM_WAYS (NUM_WAYS)
        ) plru_enc (
            .lru_in  (plru_tree[repl_line]),
            .way_idx (repl_way_idx)
        );

        // way index -> one-hot victim way
        VX_decoder #(
            .N (WAY_IDX_BITS)
        ) repl_way_dec (
            .sel_in   (repl_way_idx),
            .data_in  (1'b1),
            .data_out (repl_way)
        );

    end else if (REPL_POLICY == `CS_REPL_CYCLIC) begin : g_cyclic

        // Cyclic replacement policy
        localparam CTR_WIDTH = $clog2(NUM_WAYS);
        `UNUSED_VAR (hit_valid)
        `UNUSED_VAR (hit_line)
        `UNUSED_VAR (hit_way)

        // one victim counter per line
        reg [`CS_LINES_PER_BANK-1:0][`UP(CTR_WIDTH)-1:0] counters;

        always @(posedge clk) begin
            if (reset) begin
                counters <= '0;
            end else if (repl_valid && ~stall) begin
                // advance only when the replacement is actually accepted;
                // otherwise a request held during a stall would skip ways
                counters[repl_line] <= counters[repl_line] + CTR_WIDTH'(1);
            end
        end

        // counter value -> one-hot victim way
        VX_decoder #(
            .N (WAY_IDX_BITS)
        ) ctr_decoder (
            .sel_in   (counters[repl_line]),
            .data_in  (1'b1),
            .data_out (repl_way)
        );

    end else begin : g_random

        // Random replacement policy
        `UNUSED_VAR (hit_valid)
        `UNUSED_VAR (hit_line)
        `UNUSED_VAR (hit_way)
        `UNUSED_VAR (repl_valid)
        `UNUSED_VAR (repl_line)

        // single one-hot token shared by all lines, rotated left by one
        // position on every unstalled cycle
        reg [NUM_WAYS-1:0] victim_way;

        always @(posedge clk) begin
            if (reset) begin
                victim_way <= 1;
            end else if (~stall) begin
                victim_way <= {victim_way[NUM_WAYS-2:0], victim_way[NUM_WAYS-1]};
            end
        end

        assign repl_way = victim_way;

    end
endmodule

View file

@ -37,12 +37,11 @@ module VX_cache_tags #(
input wire fill,
input wire lookup,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [NUM_WAYS-1:0] flush_way,
input wire [NUM_WAYS-1:0] evict_way,
// outputs
output wire [NUM_WAYS-1:0] tag_matches_r,
output wire [`CS_TAG_SEL_BITS-1:0] line_tag_r,
output wire [NUM_WAYS-1:0] evict_way,
output wire [NUM_WAYS-1:0] evict_way_r,
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag_r
);
@ -56,20 +55,9 @@ module VX_cache_tags #(
wire [NUM_WAYS-1:0] read_valid;
if (NUM_WAYS > 1) begin : g_evict_way
reg [NUM_WAYS-1:0] victim_way;
// cyclic assignment of replacement way
always @(posedge clk) begin
if (reset) begin
victim_way <= 1;
end else if (~stall) begin
victim_way <= {victim_way[NUM_WAYS-2:0], victim_way[NUM_WAYS-1]};
end
end
assign evict_way = fill ? victim_way : flush_way;
`BUFFER_EX(evict_way_r, evict_way, ~stall, 1);
end else begin : g_evict_way_0
`UNUSED_VAR (flush_way)
assign evict_way = 1'b1;
`UNUSED_VAR (evict_way)
assign evict_way_r = 1'b1;
end

View file

@ -51,6 +51,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_CYCLIC,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -169,13 +172,14 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.REPL_POLICY (REPL_POLICY),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.FLAGS_WIDTH (FLAGS_WIDTH),