Adding support for cache flush and writeback

Crediting Yi-Lin Tsai's original work at https://github.com/richardyilin/GPU_writeback
This commit is contained in:
Blaise Tine 2024-07-27 13:57:36 -07:00
parent a5bde3693f
commit f5014e8975
17 changed files with 727 additions and 271 deletions

View file

@ -99,6 +99,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.MREQ_SIZE (`L2_MREQ_SIZE),
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L2_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),

View file

@ -535,6 +535,11 @@
`define DCACHE_NUM_WAYS 1
`endif
// Enable Cache Writeback (data cache).
// Defaults to 1 unless DCACHE_WRITEBACK is already defined before this point.
// NOTE(review): a value of 0 presumably selects write-through behavior - confirm against the cache RTL.
`ifndef DCACHE_WRITEBACK
`define DCACHE_WRITEBACK 1
`endif
// LMEM Configurable Knobs ////////////////////////////////////////////////////
`ifndef LMEM_DISABLE
@ -594,6 +599,11 @@
`define L2_NUM_WAYS 2
`endif
// Enable Cache Writeback (L2 cache).
// Defaults to 1 unless L2_WRITEBACK is already defined before this point.
// NOTE(review): a value of 0 presumably selects write-through behavior - confirm against the cache RTL.
`ifndef L2_WRITEBACK
`define L2_WRITEBACK 1
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
@ -635,6 +645,11 @@
`define L3_NUM_WAYS 4
`endif
// Enable Cache Writeback (L3 cache).
// Defaults to 1 unless L3_WRITEBACK is already defined before this point.
// NOTE(review): a value of 0 presumably selects write-through behavior - confirm against the cache RTL.
`ifndef L3_WRITEBACK
`define L3_WRITEBACK 1
`endif
// ISA Extensions /////////////////////////////////////////////////////////////
`ifdef EXT_A_ENABLE

View file

@ -149,6 +149,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`DCACHE_WRITEBACK),
.NC_ENABLE (1),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2)

View file

@ -83,6 +83,7 @@ module Vortex import VX_gpu_pkg::*; (
.MREQ_SIZE (`L3_MREQ_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L3_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),

109
hw/rtl/cache/VX_bank_flush.sv vendored Normal file
View file

@ -0,0 +1,109 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Per-bank line sequencer used for cache initialization and flush.
//
// A counter walks the bank's line indices and presents one request per
// counter value on the flush_out_* interface:
//
//   STATE_INIT  - entered on reset. Steps through every line index once,
//                 one per cycle (flush_out_ready is not consulted), with
//                 flush_out_init asserted.
//   STATE_FLUSH - entered from idle when flush_in_valid && mshr_empty.
//                 Asserts flush_out_valid and advances the counter each
//                 time flush_out_ready is high. When WRITEBACK is set the
//                 counter also spans the way index, so every (way, line)
//                 pair is visited individually.
//   idle        - any other state value; the counter is held at zero.
//
// flush_in_ready is a registered one-cycle pulse raised after the LAST
// flush request has been accepted (flush_out_valid && flush_out_ready).
//
// NOTE(review): flush_in_ready is high while the FSM is already back in
// idle; if the requester still drives flush_in_valid (with mshr_empty) in
// that cycle a new flush sequence starts immediately - confirm the driver
// treats flush_in_valid accordingly.
module VX_bank_flush #(
    // Size of cache in bytes
    parameter CACHE_SIZE = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE  = 64,
    // Number of banks
    parameter NUM_BANKS  = 1,
    // Number of associative ways
    parameter NUM_WAYS   = 1,
    // Enable cache writeback
    parameter WRITEBACK  = 0
) (
    input  wire clk,
    input  wire reset,
    // flush request handshake (from the cache-level flush controller)
    input  wire flush_in_valid,
    output wire flush_in_ready,
    // per-line init/flush requests (to the bank pipeline)
    output wire flush_out_init,                         // high during the reset-time init sweep
    output wire flush_out_valid,                        // high while a flush request is pending
    output wire [`CS_LINE_SEL_BITS-1:0] flush_out_line, // line index of the current request
    output wire [NUM_WAYS-1:0] flush_out_way,           // way mask of the current request
    input  wire flush_out_ready,                        // bank accepted the current flush request
    // a flush may only start while this is high
    input  wire mshr_empty
);
    // Internal constants (not meant to be overridden by the instantiator).
    // In writeback mode the counter is widened so that each way of each
    // line gets its own flush request.
    localparam CTR_WIDTH   = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0);
    localparam STATE_IDLE  = 2'd0;
    localparam STATE_INIT  = 2'd1;
    localparam STATE_FLUSH = 2'd2;

    reg [CTR_WIDTH-1:0] counter_r;
    reg [1:0] state_r, state_n;
    reg flush_in_ready_r, flush_in_ready_n;

    // next-state logic
    always @(*) begin
        state_n = state_r;
        flush_in_ready_n = 0;
        case (state_r)
            // STATE_IDLE (also catches the unused encoding 2'd3)
            default: begin
                if (flush_in_valid && mshr_empty) begin
                    state_n = STATE_FLUSH;
                end
            end
            STATE_INIT: begin
                // init only sweeps the line index, never the way bits
                if (counter_r == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
                    state_n = STATE_IDLE;
                end
            end
            STATE_FLUSH: begin
                // Leave the flush state only once the final request has
                // actually been accepted; otherwise a stalled pipeline
                // would silently drop the last line/way and completion
                // would be reported too early.
                if ((counter_r == ((2 ** CTR_WIDTH)-1)) && flush_out_ready) begin
                    state_n = STATE_IDLE;
                    flush_in_ready_n = 1;
                end
            end
        endcase
    end

    // state, completion pulse and line counter
    always @(posedge clk) begin
        if (reset) begin
            state_r          <= STATE_INIT;
            counter_r        <= '0;
            flush_in_ready_r <= '0;
        end else begin
            state_r          <= state_n;
            flush_in_ready_r <= flush_in_ready_n;
            if (state_r != STATE_IDLE) begin
                // init advances every cycle; flush advances on acceptance
                if ((state_r == STATE_INIT) || flush_out_ready) begin
                    counter_r <= counter_r + CTR_WIDTH'(1);
                end
            end else begin
                counter_r <= '0;
            end
        end
    end

    assign flush_in_ready  = flush_in_ready_r;

    assign flush_out_init  = (state_r == STATE_INIT);
    assign flush_out_valid = (state_r == STATE_FLUSH);
    assign flush_out_line  = counter_r[`CS_LINE_SEL_BITS-1:0];

    if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin
        // one-hot decode of the way index held in the upper counter bits
        reg [NUM_WAYS-1:0] flush_out_way_r;
        always @(*) begin
            flush_out_way_r = '0;
            flush_out_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
        end
        assign flush_out_way = flush_out_way_r;
    end else begin
        // no per-way stepping: address all ways at once
        assign flush_out_way = {NUM_WAYS{1'b1}};
    end

endmodule

View file

@ -42,6 +42,9 @@ module VX_cache import VX_gpu_pkg::*; #(
// Enable cache writes
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -67,6 +70,7 @@ module VX_cache import VX_gpu_pkg::*; #(
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter"))
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
@ -78,36 +82,46 @@ module VX_cache import VX_gpu_pkg::*; #(
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS);
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH;
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1;
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
`endif
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_ready;
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus2_if[NUM_REQS]();
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus_if[i].req_valid;
assign core_req_rw[i] = core_bus_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus_if[i].req_data.addr;
assign core_req_data[i] = core_bus_if[i].req_data.data;
assign core_req_tag[i] = core_bus_if[i].req_data.tag;
assign core_bus_if[i].req_ready = core_req_ready[i];
`UNUSED_VAR (core_bus_if[i].req_data.atype)
end
wire [NUM_BANKS-1:0] per_bank_flush_valid;
wire [NUM_BANKS-1:0] per_bank_flush_ready;
wire [NUM_BANKS-1:0] per_bank_core_req_fire;
// this reset relay is required to sync with bank initialization
`RESET_RELAY (flush_reset, reset);
VX_cache_flush #(
.NUM_REQS (NUM_REQS),
.NUM_BANKS (NUM_BANKS),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
) flush_unit (
.clk (clk),
.reset (flush_reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if (core_bus2_if),
.bank_req_fire (per_bank_core_req_fire),
.flush_valid (per_bank_flush_valid),
.flush_ready (per_bank_flush_ready)
);
///////////////////////////////////////////////////////////////////////////
@ -131,9 +145,9 @@ module VX_cache import VX_gpu_pkg::*; #(
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag}),
.valid_out (core_bus_if[i].rsp_valid),
.ready_out (core_bus_if[i].rsp_ready)
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
);
end
@ -146,12 +160,15 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_flush_s;
wire mem_req_ready_s;
wire mem_bus_if_flush;
`RESET_RELAY (mem_req_reset, reset);
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH),
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
@ -159,13 +176,13 @@ module VX_cache import VX_gpu_pkg::*; #(
.reset (mem_req_reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag}),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag, mem_bus_if_flush}),
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
);
assign mem_bus_if.req_data.atype = '0;
assign mem_bus_if.req_data.atype = mem_bus_if_flush ? `ADDR_TYPE_WIDTH'(1 << `ADDR_TYPE_FLUSH) : '0;
///////////////////////////////////////////////////////////////////////////
@ -192,27 +209,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.ready_out (mem_rsp_ready_s)
);
///////////////////////////////////////////////////////////////////////
wire [`CS_LINE_SEL_BITS-1:0] init_line_sel;
wire init_enable;
// this reset relay is required to sync with bank initialization
`RESET_RELAY (init_reset, reset);
VX_cache_init #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS)
) cache_init (
.clk (clk),
.reset (init_reset),
.addr_out (init_line_sel),
.valid_out (init_enable)
);
///////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
@ -222,6 +219,7 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
wire [NUM_BANKS-1:0] per_bank_core_req_flush;
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
@ -233,14 +231,16 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0] per_bank_mem_req_wsel;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0] per_bank_mem_req_flush;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
// a core request fires when it is valid and the bank accepts it on the
// core-side handshake (not the memory-side arbiter ready)
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_core_req_ready;
if (NUM_BANKS == 1) begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
end else begin
@ -249,12 +249,33 @@ module VX_cache import VX_gpu_pkg::*; #(
// Bank requests dispatch
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_flush;
wire [NUM_REQS-1:0] core_req_ready;
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] core_req_bid;
wire [NUM_REQS-1:0][WORD_SEL_WIDTH-1:0] core_req_wsel;
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus2_if[i].req_valid;
assign core_req_rw[i] = core_bus2_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus2_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus2_if[i].req_data.addr;
assign core_req_data[i] = core_bus2_if[i].req_data.data;
assign core_req_tag[i] = core_bus2_if[i].req_data.tag;
assign core_req_flush[i] = core_bus2_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
assign core_bus2_if[i].req_ready = core_req_ready[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (WORDS_PER_LINE > 1) begin
assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
@ -279,7 +300,9 @@ module VX_cache import VX_gpu_pkg::*; #(
core_req_wsel[i],
core_req_byteen[i],
core_req_data[i],
core_req_tag[i]};
core_req_tag[i],
core_req_flush[i]
};
end
`ifdef PERF_ENABLE
@ -288,12 +311,12 @@ module VX_cache import VX_gpu_pkg::*; #(
`RESET_RELAY (req_xbar_reset, reset);
VX_stream_xbar #(
VX_stream_xbar #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.OUT_BUF ((NUM_REQS > 4) ? 2 : 0)
.OUT_BUF (REQ_XBAR_BUF)
) req_xbar (
.clk (clk),
.reset (req_xbar_reset),
@ -319,11 +342,13 @@ module VX_cache import VX_gpu_pkg::*; #(
per_bank_core_req_wsel[i],
per_bank_core_req_byteen[i],
per_bank_core_req_data[i],
per_bank_core_req_tag[i]} = core_req_data_out[i];
per_bank_core_req_tag[i],
per_bank_core_req_flush[i]
} = core_req_data_out[i];
end
// Banks access
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : banks
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire curr_bank_mem_rsp_valid;
@ -348,6 +373,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
@ -371,6 +397,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.core_req_data (per_bank_core_req_data[bank_id]),
.core_req_tag (per_bank_core_req_tag[bank_id]),
.core_req_idx (per_bank_core_req_idx[bank_id]),
.core_req_flush (per_bank_core_req_flush[bank_id]),
.core_req_ready (per_bank_core_req_ready[bank_id]),
// Core response
@ -384,10 +411,10 @@ module VX_cache import VX_gpu_pkg::*; #(
.mem_req_valid (per_bank_mem_req_valid[bank_id]),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_rw (per_bank_mem_req_rw[bank_id]),
.mem_req_wsel (per_bank_mem_req_wsel[bank_id]),
.mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
.mem_req_data (per_bank_mem_req_data[bank_id]),
.mem_req_id (per_bank_mem_req_id[bank_id]),
.mem_req_flush (per_bank_mem_req_flush[bank_id]),
.mem_req_ready (per_bank_mem_req_ready[bank_id]),
// Memory response
@ -396,9 +423,8 @@ module VX_cache import VX_gpu_pkg::*; #(
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
// initialization
.init_enable (init_enable),
.init_line_sel (init_line_sel)
.flush_valid (per_bank_flush_valid[bank_id]),
.flush_ready (per_bank_flush_ready[bank_id])
);
if (NUM_BANKS == 1) begin
@ -446,31 +472,33 @@ module VX_cache import VX_gpu_pkg::*; #(
wire mem_req_valid_p;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p;
wire mem_req_rw_p;
wire [WORD_SEL_WIDTH-1:0] mem_req_wsel_p;
wire [WORD_SIZE-1:0] mem_req_byteen_p;
wire [`CS_WORD_WIDTH-1:0] mem_req_data_p;
wire [LINE_SIZE-1:0] mem_req_byteen_p;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_p;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p;
wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p;
wire mem_req_flush_p;
wire mem_req_ready_p;
// Memory request arbitration
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH)-1:0] data_in;
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + 1)-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign data_in[i] = {per_bank_mem_req_addr[i],
per_bank_mem_req_rw[i],
per_bank_mem_req_wsel[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_data[i],
per_bank_mem_req_id[i]};
assign data_in[i] = {
per_bank_mem_req_addr[i],
per_bank_mem_req_rw[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_data[i],
per_bank_mem_req_id[i],
per_bank_mem_req_flush[i]
};
end
`RESET_RELAY (mem_arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + MSHR_ADDR_WIDTH),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1),
.ARBITER ("F")
) mem_req_arb (
.clk (clk),
@ -478,7 +506,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.valid_in (per_bank_mem_req_valid),
.ready_in (per_bank_mem_req_ready),
.data_in (data_in),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_wsel_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p}),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p, mem_req_flush_p}),
.valid_out (mem_req_valid_p),
.ready_out (mem_req_ready_p),
`UNUSED_PIN (sel_out)
@ -496,31 +524,15 @@ module VX_cache import VX_gpu_pkg::*; #(
assign mem_req_valid_s = mem_req_valid_p;
assign mem_req_addr_s = mem_req_addr_p;
assign mem_req_tag_s = mem_req_tag_p;
assign mem_req_flush_s = mem_req_flush_p;
assign mem_req_ready_p = mem_req_ready_s;
if (WRITE_ENABLE != 0) begin
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] mem_req_byteen_r;
reg [`CS_LINE_WIDTH-1:0] mem_req_data_r;
always @(*) begin
mem_req_byteen_r = '0;
mem_req_data_r = 'x;
mem_req_byteen_r[mem_req_wsel_p * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p;
mem_req_data_r[mem_req_wsel_p * `CS_WORD_WIDTH +: `CS_WORD_WIDTH] = mem_req_data_p;
end
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_r;
assign mem_req_data_s = mem_req_data_r;
end else begin
`UNUSED_VAR (mem_req_wsel_p)
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_p;
assign mem_req_data_s = mem_req_data_p;
end
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_p;
assign mem_req_data_s = mem_req_data_p;
end else begin
`UNUSED_VAR (mem_req_byteen_p)
`UNUSED_VAR (mem_req_wsel_p)
`UNUSED_VAR (mem_req_data_p)
`UNUSED_VAR (mem_req_rw_p)
@ -554,7 +566,7 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
end
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);

View file

@ -41,6 +41,9 @@ module VX_cache_bank #(
// Enable cache writes
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -69,12 +72,13 @@ module VX_cache_bank #(
// Core Request
input wire core_req_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr,
input wire core_req_rw,
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel,
input wire [WORD_SIZE-1:0] core_req_byteen,
input wire [`CS_WORD_WIDTH-1:0] core_req_data,
input wire [TAG_WIDTH-1:0] core_req_tag,
input wire [REQ_SEL_WIDTH-1:0] core_req_idx,
input wire core_req_rw, // write enable
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel, // select the word in a cacheline, e.g. word size = 4 bytes, cacheline size = 64 bytes, it should have log(64/4)= 4 bits
input wire [WORD_SIZE-1:0] core_req_byteen,// which bytes in data to write
input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written
input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id)
input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array
input wire core_req_flush, // flush enable
output wire core_req_ready,
// Core Response
@ -88,10 +92,10 @@ module VX_cache_bank #(
output wire mem_req_valid,
output wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire mem_req_rw,
output wire [WORD_SEL_WIDTH-1:0] mem_req_wsel,
output wire [WORD_SIZE-1:0] mem_req_byteen,
output wire [`CS_WORD_WIDTH-1:0] mem_req_data,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, // index of the head entry in the mshr
output wire mem_req_flush,
input wire mem_req_ready,
// Memory response
@ -100,9 +104,9 @@ module VX_cache_bank #(
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
output wire mem_rsp_ready,
// initialization
input wire init_enable,
input wire [`CS_LINE_SEL_BITS-1:0] init_line_sel
// flush
input wire flush_valid,
output wire flush_ready
);
localparam PIPELINE_STAGES = 2;
@ -128,23 +132,56 @@ module VX_cache_bank #(
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
wire replay_ready;
wire is_init_st0;
wire is_flush_st0, is_flush_st1;
wire [NUM_WAYS-1:0] flush_way_st0;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
wire rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_st0, tag_st1;
wire rw_sel, rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1;
wire [`CS_WORD_WIDTH-1:0] read_data_st1;
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
wire valid_sel, valid_st0, valid_st1;
wire is_init_st0;
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire is_replay_st0, is_replay_st1;
wire creq_flush_st0, creq_flush_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] tag_matches_st0, tag_matches_st1;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1;
wire mshr_pending_st0, mshr_pending_st1;
wire mshr_empty;
wire line_flush_valid;
wire line_flush_init;
wire [`CS_LINE_SEL_BITS-1:0] line_flush_sel;
wire [NUM_WAYS-1:0] line_flush_way;
wire line_flush_ready;
// flush unit
VX_bank_flush #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WRITEBACK (WRITEBACK)
) flush_unit (
.clk (clk),
.reset (reset),
.flush_in_valid (flush_valid),
.flush_in_ready (flush_ready),
.flush_out_init (line_flush_init),
.flush_out_valid (line_flush_valid),
.flush_out_line (line_flush_sel),
.flush_out_way (line_flush_way),
.flush_out_ready (line_flush_ready),
.mshr_empty (mshr_empty)
);
wire rdw_hazard_st0;
reg rdw_hazard_st1;
@ -154,47 +191,50 @@ module VX_cache_bank #(
// inputs arbitration:
// mshr replay has highest priority to maximize utilization since there is no miss.
// handle memory responses next to prevent deadlock with potential memory request from a miss.
wire replay_grant = ~init_enable;
// flush has precedence over core requests to ensure that the cache is in a consistent state.
wire replay_grant = ~line_flush_init;
wire replay_enable = replay_grant && replay_valid;
wire fill_grant = ~init_enable && ~replay_enable;
wire fill_grant = ~line_flush_init && ~replay_enable;
wire fill_enable = fill_grant && mem_rsp_valid;
wire creq_grant = ~init_enable && ~replay_enable && ~fill_enable;
wire flush_grant = ~line_flush_init && ~replay_enable && ~fill_enable;
wire flush_enable = flush_grant && line_flush_valid;
wire creq_grant = ~line_flush_init && ~replay_enable && ~fill_enable && ~flush_enable;
wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant
&& ~rdw_hazard_st0
&& ~pipe_stall;
&& ~rdw_hazard_st0
&& ~pipe_stall;
assign mem_rsp_ready = fill_grant
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
assign line_flush_ready = flush_grant
&& ~mreq_queue_alm_full
&& ~pipe_stall;
wire init_fire = init_enable;
assign core_req_ready = creq_grant
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
wire init_fire = line_flush_init;
wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire flush_fire = line_flush_valid && line_flush_ready;
wire core_req_fire = core_req_valid && core_req_ready;
wire [TAG_WIDTH-1:0] mshr_creq_tag = replay_enable ? replay_tag : core_req_tag;
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
assign rw_sel = replay_valid ? replay_rw : core_req_rw;
assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen;
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = replay_valid ? replay_tag : core_req_tag;
if (UUID_WIDTH != 0) begin
assign req_uuid_sel = mshr_creq_tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_sel = 0;
end
`UNUSED_VAR (mshr_creq_tag)
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || core_req_fire;
assign addr_sel = init_enable ? `CS_LINE_ADDR_WIDTH'(init_line_sel) :
(replay_valid ? replay_addr :
(mem_rsp_valid ? mem_rsp_addr : core_req_addr));
assign addr_sel = (line_flush_init | line_flush_valid) ? `CS_LINE_ADDR_WIDTH'(line_flush_sel) :
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
if (WRITE_ENABLE) begin
assign data_sel[`CS_WORD_WIDTH-1:0] = replay_valid ? replay_data : (mem_rsp_valid ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : core_req_data);
@ -204,32 +244,24 @@ module VX_cache_bank #(
`UNUSED_VAR (replay_data)
end
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words od data_sel
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
end
if (UUID_WIDTH != 0) begin
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_sel = 0;
end
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({
valid_sel,
init_enable,
replay_enable,
fill_enable,
creq_enable,
addr_sel,
data_sel,
replay_valid ? replay_rw : core_req_rw,
replay_valid ? replay_byteen : core_req_byteen,
replay_valid ? replay_wsel : core_req_wsel,
replay_valid ? replay_idx : core_req_idx,
replay_valid ? replay_tag : core_req_tag,
replay_id
}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_creq_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
.data_in ({valid_sel, line_flush_init, replay_enable, fill_enable, flush_enable, creq_enable, core_req_flush, line_flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
);
if (UUID_WIDTH != 0) begin
@ -238,15 +270,18 @@ module VX_cache_bank #(
assign req_uuid_st0 = 0;
end
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0);
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire [NUM_WAYS-1:0] tag_matches_st0, tag_matches_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] repl_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] repl_tag_st0;
`RESET_RELAY (tag_reset, reset);
@ -267,26 +302,33 @@ module VX_cache_bank #(
.stall (pipe_stall),
// read/Fill
// init/fill/lookup/flush
.init (do_init_st0 || do_flush_st0),
.fill (do_fill_st0),
.lookup (do_lookup_st0),
.line_addr (addr_st0),
.fill (do_fill_st0),
.init (do_init_st0),
.way_sel (way_sel_st0),
.tag_matches(tag_matches_st0)
.tag_matches(tag_matches_st0),
// replacement
.repl_way (repl_way_st0),
.repl_tag (repl_tag_st0)
);
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
assign way_sel_st0 = is_fill_st0 ? repl_way_st0 : flush_way_st0;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_r_st0 = (is_fill_st0 || is_flush_st0) ? {repl_tag_st0, addr_st0[`CS_LINE_SEL_BITS-1:0]} : addr_st0;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + NUM_WAYS + 1),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + NUM_WAYS + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_st0, is_replay_st0, is_fill_st0, is_creq_st0, rw_st0, addr_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, tag_matches_st0, way_sel_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_replay_st1, is_fill_st1, is_creq_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, tag_matches_st1, way_sel_st1, mshr_pending_st1})
.data_in ({valid_st0, is_flush_st0, is_replay_st0, is_fill_st0, is_creq_st0, creq_flush_st0, rw_st0, addr_r_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, tag_matches_st0, way_sel_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_flush_st1, is_replay_st1, is_fill_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, tag_matches_st1, way_sel_st1, mshr_pending_st1})
);
// we have a tag hit
@ -298,8 +340,10 @@ module VX_cache_bank #(
assign req_uuid_st1 = 0;
end
wire do_creq_rd_st1 = valid_st1 && is_creq_st1 && ~rw_st1;
wire do_creq_wr_st1 = valid_st1 && is_creq_st1 && rw_st1;
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_creq_rd_st1 = valid_st1 && is_read_st1;
wire do_creq_wr_st1 = valid_st1 && is_write_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
@ -310,20 +354,41 @@ module VX_cache_bank #(
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1;
`UNUSED_VAR (do_write_miss_st1)
// ensure mshr replay always get a hit
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay"));
// detect BRAM's read-during-write hazard
assign rdw_hazard_st0 = do_fill_st0; // after a fill
assign rdw_hazard_st0 = do_fill_st0; // stall cycle after a fill
wire rdw_write_st1 = do_write_hit_st1 || do_replay_wr_st1;
// A stage-0 operation reads the data store when it is a core read, a replayed read,
// or - only when the cache is configured as writeback - a flush or fill, since those
// must read the line out to produce the dirty data.
// In write-through mode (WRITEBACK == 0) flush/fill do not contribute a read.
wire rdw_read_st0 = do_creq_rd_st0 || do_replay_rd_st0
|| (WRITEBACK && (do_flush_st0 || do_fill_st0)); // a writeback also does a data read
always @(posedge clk) begin // after a write to same address
rdw_hazard_st1 <= (do_creq_rd_st0 && do_write_hit_st1 && (addr_st0 == addr_st1))
rdw_hazard_st1 <= (rdw_read_st0 && rdw_write_st1 && (addr_st0 == addr_st1))
&& ~rdw_hazard_st1; // invalidate if pipeline stalled to avoid repeats
end
wire [`CS_WORD_WIDTH-1:0] write_data_st1 = data_st1[`CS_WORD_WIDTH-1:0];
wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}};
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
wire [LINE_SIZE-1:0] write_byteen_st1;
wire [`CS_LINE_WIDTH-1:0] dirty_data_st1;
wire [LINE_SIZE-1:0] dirty_byteen_st1;
wire dirty_valid_st1;
// Expand the per-word byte-enable (byteen_st1, WORD_SIZE bits) into a line-wide
// byte-enable mask (write_byteen_st1, LINE_SIZE bits).
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] write_byteen_r;
always @(*) begin
// start from an all-zero mask, then place the word's byte-enables
// at the slot selected by wsel_st1
write_byteen_r = '0;
write_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1;
end
assign write_byteen_st1 = write_byteen_r;
end else begin
// single word per line: the word byte-enable is used directly as the line mask
assign write_byteen_st1 = byteen_st1;
end
`RESET_RELAY (data_reset, reset);
@ -336,6 +401,7 @@ module VX_cache_bank #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH)
) cache_data (
.clk (clk),
@ -347,14 +413,18 @@ module VX_cache_bank #(
.read (do_read_hit_st1 || do_replay_rd_st1),
.fill (do_fill_st1),
.flush (do_flush_st1),
.write (do_write_hit_st1 || do_replay_wr_st1),
.way_sel (way_sel_st1 | tag_matches_st1),
.line_addr (addr_st1),
.wsel (wsel_st1),
.byteen (byteen_st1),
.fill_data (fill_data_st1),
.write_data (write_data_st1),
.read_data (read_data_st1)
.write_byteen(write_byteen_st1),
.read_data (read_data_st1),
.dirty_valid(dirty_valid_st1),
.dirty_data (dirty_data_st1),
.dirty_byteen(dirty_byteen_st1)
);
wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0;
@ -362,7 +432,17 @@ module VX_cache_bank #(
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
wire mshr_lookup_st0 = mshr_allocate_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
wire mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
// release allocated mshr entry if we had a hit
wire mshr_release_st1;
if (WRITEBACK) begin
assign mshr_release_st1 = is_hit_st1;
end else begin
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content
// this can happen when writes are sent late, when the fill was already in flight.
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
end
VX_pending_size #(
.SIZE (MSHR_SIZE)
@ -371,7 +451,7 @@ module VX_cache_bank #(
.reset (reset),
.incr (core_req_fire),
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
`UNUSED_PIN (empty),
.empty (mshr_empty),
`UNUSED_PIN (alm_empty),
.full (mshr_alm_full),
`UNUSED_PIN (alm_full),
@ -437,7 +517,7 @@ module VX_cache_bank #(
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign lookup_matches[i] = mshr_lookup_pending_st0[i]
&& (i != mshr_alloc_id_st0) // exclude current mshr id
&& ~mshr_lookup_rw_st0[i]; // exclude write requests
&& (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough
end
assign mshr_pending_st0 = (| lookup_matches);
@ -475,29 +555,38 @@ module VX_cache_bank #(
// schedule memory request
wire mreq_queue_push, mreq_queue_pop, mreq_queue_empty;
wire [`CS_WORD_WIDTH-1:0] mreq_queue_data;
wire [WORD_SIZE-1:0] mreq_queue_byteen;
wire [WORD_SEL_WIDTH-1:0] mreq_queue_wsel;
wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
wire [LINE_SIZE-1:0] mreq_queue_byteen;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id;
wire mreq_queue_rw;
wire mreq_queue_flush;
assign mreq_queue_push = (do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1;
wire is_evict_st1 = (is_fill_st1 || is_flush_st1) && dirty_valid_st1;
wire do_writeback_st1 = valid_st1 && is_evict_st1;
`UNUSED_VAR (do_writeback_st1)
// Select when a request is pushed into the memory request queue.
if (WRITEBACK) begin
// writeback mode: push on a read or write miss that has no pending MSHR match
// (mshr_pending_st1 low), or when a fill/flush hits a dirty line (do_writeback_st1)
assign mreq_queue_push = ((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1)
|| do_writeback_st1;
end else begin
// write-through mode: the dirty flag plays no role in the push decision
`UNUSED_VAR (dirty_valid_st1)
// push on a read miss that has no pending MSHR match, or on every core write request
assign mreq_queue_push = (do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1;
end
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
assign mreq_queue_rw = WRITE_ENABLE && rw_st1;
assign mreq_queue_rw = WRITE_ENABLE && (WRITEBACK ? is_evict_st1 : rw_st1);
assign mreq_queue_addr = addr_st1;
assign mreq_queue_id = mshr_id_st1;
assign mreq_queue_wsel = wsel_st1;
assign mreq_queue_byteen = byteen_st1;
assign mreq_queue_data = write_data_st1;
assign mreq_queue_data = is_write_st1 ? write_data_st1 : dirty_data_st1;
assign mreq_queue_byteen = is_write_st1 ? write_byteen_st1 : dirty_byteen_st1;
assign mreq_queue_flush = creq_flush_st1;
`RESET_RELAY (mreq_queue_reset, reset);
VX_fifo_queue #(
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH),
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1),
.DEPTH (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-PIPELINE_STAGES),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
@ -506,8 +595,8 @@ module VX_cache_bank #(
.reset (mreq_queue_reset),
.push (mreq_queue_push),
.pop (mreq_queue_pop),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_wsel, mreq_queue_data}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_data, mem_req_flush}),
.empty (mreq_queue_empty),
.alm_full (mreq_queue_alm_full),
`UNUSED_PIN (full),
@ -527,15 +616,12 @@ module VX_cache_bank #(
`ifdef DBG_TRACE_CACHE
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire);
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid || line_flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || line_flush_valid);
always @(posedge clk) begin
if (pipeline_stall) begin
`TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full));
end
if (init_enable) begin
`TRACE(2, ("%d: %s init: addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(init_line_sel, BANK_ID)));
end
if (mem_rsp_fire) begin
`TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
end
@ -552,8 +638,10 @@ module VX_cache_bank #(
`TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
end
if (mreq_queue_push) begin
if (do_creq_wr_st1)
if (do_creq_wr_st1 && !WRITEBACK)
`TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
else if (do_writeback_st1)
`TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
else
`TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
end

View file

@ -46,6 +46,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -151,6 +154,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),

View file

@ -28,6 +28,8 @@ module VX_cache_data #(
parameter WORD_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
@ -42,52 +44,90 @@ module VX_cache_data #(
input wire read,
input wire fill,
input wire flush,
input wire write,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,
input wire [WORD_SIZE-1:0] byteen,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
input wire [`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen,
input wire [NUM_WAYS-1:0] way_sel,
output wire [`CS_WORD_WIDTH-1:0] read_data
output wire [`CS_WORD_WIDTH-1:0] read_data,
output wire dirty_valid,
output wire [`CS_LINE_WIDTH-1:0] dirty_data,
output wire [LINE_SIZE-1:0] dirty_byteen
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset)
`UNUSED_VAR (stall)
`UNUSED_VAR (line_addr)
`UNUSED_VAR (read)
`UNUSED_VAR (flush)
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
// Dirty-state tracking; only generated when WRITEBACK is enabled.
if (WRITEBACK) begin
// per-entry byte mask of the bytes written since the last fill
reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0][LINE_SIZE-1:0] dirty_bytes_r;
// per-entry dirty flag
reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0] dirty_blocks_r;
// index into the dirty arrays: one entry per (line, way) pair
wire [`CLOG2(`CS_LINES_PER_BANK * NUM_WAYS)-1:0] way_addr;
if (NUM_WAYS > 1) begin
assign way_addr = {line_sel, way_idx};
end else begin
assign way_addr = line_sel;
end
// dirty byte mask: cleared on fill, accumulated (OR-ed) on write.
// This register has no reset term.
always @(posedge clk) begin
if (fill) begin
dirty_bytes_r[way_addr] <= '0;
end else if (write) begin
dirty_bytes_r[way_addr] <= dirty_bytes_r[way_addr] | write_byteen;
end
end
// dirty flag: all flags cleared on reset; the addressed flag is cleared on fill
// and set on write (fill takes priority over write).
always @(posedge clk) begin
if (reset) begin
dirty_blocks_r <= '0;
end else begin
if (fill) begin
dirty_blocks_r[way_addr] <= 0;
end else if (write) begin
dirty_blocks_r[way_addr] <= 1;
end
end
end
// report the dirty state of the currently addressed entry
assign dirty_byteen = dirty_bytes_r[way_addr];
assign dirty_valid = dirty_blocks_r[way_addr];
end else begin
// write-through: entries are never reported dirty
assign dirty_byteen = '0;
assign dirty_valid = 0;
end
// order the data layout to perform ways multiplexing last.
// this allows converting way index to binary in parallel with BRAM read.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata;
wire [BYTEENW-1:0] wren;
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
reg [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] wdata_r;
reg [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] wren_r;
always @(*) begin
wdata_r = {`CS_WORDS_PER_LINE{write_data}};
wren_r = '0;
wren_r[wsel] = byteen;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
assign wdata[i] = (fill || !WRITE_ENABLE) ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{write_data[i]}};
end
// order the data layout to perform ways multiplexing last
// this allows performing onehot encoding of the way index in parallel with BRAM read.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
assign wdata[i] = fill ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{wdata_r[i]}};
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign wren_w[i][j] = (fill ? {WORD_SIZE{1'b1}} : wren_r[i])
& {WORD_SIZE{((NUM_WAYS == 1) || way_sel[j])}};
assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i])
& {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}};
end
end
assign wren = wren_w;
end else begin
`UNUSED_VAR (write)
`UNUSED_VAR (byteen)
`UNUSED_VAR (write_byteen)
`UNUSED_VAR (write_data)
assign wdata = fill_data;
assign wren = fill;
@ -123,28 +163,35 @@ module VX_cache_data #(
);
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
if (`CS_WORDS_PER_LINE > 1) begin
assign per_way_rdata = rdata[wsel];
end else begin
`UNUSED_VAR (wsel)
assign per_way_rdata = rdata;
end
assign read_data = per_way_rdata[way_idx];
`UNUSED_VAR (stall)
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] dirty_data_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign dirty_data_w[j][i] = rdata[i][j];
end
end
assign dirty_data = dirty_data_w[way_idx];
`ifdef DBG_TRACE_CACHE
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b, byteen=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_valid, dirty_byteen));
end
if (read && ~stall) begin
`TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, read_data, req_uuid));
end
if (write && ~stall) begin
`TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, byteen, write_data, req_uuid));
`TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, write_byteen, write_data, req_uuid));
end
end
`endif

View file

@ -50,7 +50,7 @@
`define CS_TAG_SEL_ADDR_START (1+`CS_LINE_SEL_ADDR_END)
`define CS_TAG_SEL_ADDR_END (`CS_WORD_ADDR_WIDTH-1)
`define CS_LINE_TAG_ADDR(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
`define CS_LINE_ADDR_TAG(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
///////////////////////////////////////////////////////////////////////////////

154
hw/rtl/cache/VX_cache_flush.sv vendored Normal file
View file

@ -0,0 +1,154 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Cache flush controller.
//
// Sits between the core request ports (core_bus_in_if) and the rest of the cache
// (core_bus_out_if). A request is treated as a flush request when its
// atype[`ADDR_TYPE_FLUSH] bit is set. While any flush request is present on the
// inputs, every input port is blocked; the controller then:
//   IDLE  -> WAIT  : (only when BANK_SEL_LATENCY != 0) wait until the in-flight counter is empty
//   WAIT  -> FLUSH : assert flush_valid to all banks and collect flush_ready per bank
//   FLUSH -> DONE  : once every bank has reported flush_ready, unblock the ports
//                    that carried the flush request so it can proceed downstream
//   DONE  -> IDLE  : once every unblocked port has seen req_ready on the output side
//
// Responses are passed through unmodified.
// NOTE(review): flush_ready is presumably asserted by a bank when its flush has
// completed - confirm against the bank-side flush logic.
module VX_cache_flush #(
    // Number of Word requests per cycle
    parameter NUM_REQS = 4,
    // Number of banks
    parameter NUM_BANKS = 1,
    // Bank select latency
    parameter BANK_SEL_LATENCY = 1
) (
    input wire clk,
    input wire reset,
    VX_mem_bus_if.slave core_bus_in_if [NUM_REQS],
    VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
    input wire [NUM_BANKS-1:0] bank_req_fire,
    output wire [NUM_BANKS-1:0] flush_valid,
    input wire [NUM_BANKS-1:0] flush_ready
);
    localparam STATE_IDLE = 0;
    localparam STATE_WAIT = 1;
    localparam STATE_FLUSH = 2;
    localparam STATE_DONE = 3;

    // track in-flight core requests
    wire no_inflight_reqs;

    if (BANK_SEL_LATENCY != 0) begin
        localparam NUM_REQS_W = `CLOG2(NUM_REQS+1);
        localparam NUM_BANKS_W = `CLOG2(NUM_BANKS+1);

        // requests accepted on the output side this cycle
        wire [NUM_REQS-1:0] core_bus_out_fire;
        for (genvar i = 0; i < NUM_REQS; ++i) begin
            assign core_bus_out_fire[i] = core_bus_out_if[i].req_valid && core_bus_out_if[i].req_ready;
        end

        wire [NUM_REQS_W-1:0] core_bus_out_cnt;
        wire [NUM_BANKS_W-1:0] bank_req_cnt;
        `POP_COUNT(core_bus_out_cnt, core_bus_out_fire);
        `POP_COUNT(bank_req_cnt, bank_req_fire);
        `UNUSED_VAR (core_bus_out_cnt)

        // counter incremented by accepted core requests and decremented by
        // bank_req_fire; 'empty' is used as the "no requests in flight" indication
        VX_pending_size #(
            .SIZE (BANK_SEL_LATENCY * NUM_BANKS),
            .INCRW (NUM_BANKS_W),
            .DECRW (NUM_BANKS_W)
        ) pending_size (
            .clk (clk),
            .reset (reset),
            .incr (NUM_BANKS_W'(core_bus_out_cnt)),
            .decr (bank_req_cnt),
            .empty (no_inflight_reqs),
            `UNUSED_PIN (alm_empty),
            `UNUSED_PIN (full),
            `UNUSED_PIN (alm_full),
            `UNUSED_PIN (size)
        );
    end else begin
        // not consulted in this configuration: STATE_WAIT is never entered
        // when BANK_SEL_LATENCY == 0 (IDLE goes straight to FLUSH)
        assign no_inflight_reqs = 0;
        `UNUSED_VAR (bank_req_fire)
    end

    reg [1:0] state, state_n;
    reg [NUM_BANKS-1:0] flush_done, flush_done_n;

    // ports currently presenting a flush request
    wire [NUM_REQS-1:0] flush_req_mask;
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
    end
    wire flush_req_enable = (| flush_req_mask);

    // ports allowed through while a flush request is still present on the inputs
    reg [NUM_REQS-1:0] lock_released, lock_released_n;

    // request path: gate valid/ready on every port while a flush is pending,
    // except for ports whose lock has been released
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        wire input_enable = ~flush_req_enable || lock_released[i];
        assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && input_enable;
        assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
        assign core_bus_in_if[i].req_ready = core_bus_out_if[i].req_ready && input_enable;
    end

    // response path: straight pass-through
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign core_bus_in_if[i].rsp_valid = core_bus_out_if[i].rsp_valid;
        assign core_bus_in_if[i].rsp_data = core_bus_out_if[i].rsp_data;
        assign core_bus_out_if[i].rsp_ready = core_bus_in_if[i].rsp_ready;
    end

    wire [NUM_REQS-1:0] core_bus_out_ready;
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign core_bus_out_ready[i] = core_bus_out_if[i].req_ready;
    end

    // next-state logic
    always @(*) begin
        state_n = state;
        flush_done_n = flush_done;
        lock_released_n = lock_released;
        case (state)
            STATE_IDLE: begin
                if (flush_req_enable) begin
                    state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT : STATE_FLUSH;
                end
            end
            STATE_WAIT: begin
                // hold until the in-flight counter reports empty
                if (no_inflight_reqs) begin
                    state_n = STATE_FLUSH;
                end
            end
            STATE_FLUSH: begin
                // accumulate the banks that have reported flush_ready
                flush_done_n = flush_done | flush_ready;
                // proceed only once ALL banks have reported
                if (flush_done_n == {NUM_BANKS{1'b1}}) begin
                    state_n = STATE_DONE;
                    // clear the record so the next flush starts from scratch
                    flush_done_n = '0;
                    // only unblock the ports that carried the flush request
                    lock_released_n = flush_req_mask;
                end
            end
            STATE_DONE: begin
                // drop a port's release once the output side is ready for it
                lock_released_n = lock_released & ~core_bus_out_ready;
                if (lock_released_n == 0) begin
                    state_n = STATE_IDLE;
                end
            end
        endcase
    end

    always @(posedge clk) begin
        if (reset) begin
            state <= STATE_IDLE;
            flush_done <= '0;
            lock_released <= '0;
        end else begin
            state <= state_n;
            flush_done <= flush_done_n;
            lock_released <= lock_released_n;
        end
    end

    // the flush request is broadcast to every bank for the whole FLUSH state
    assign flush_valid = {NUM_BANKS{state == STATE_FLUSH}};

endmodule

View file

@ -13,6 +13,7 @@
`include "VX_cache_define.vh"
// cache flush unit
module VX_cache_init #(
// Size of cache in bytes
parameter CACHE_SIZE = 1024,

View file

@ -38,45 +38,63 @@ module VX_cache_tags #(
input wire stall,
// read/fill
// init/fill/lookup
input wire init,
input wire fill,
input wire lookup,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire fill,
input wire init,
output wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches
output wire [NUM_WAYS-1:0] tag_matches,
// replacement
output wire [NUM_WAYS-1:0] repl_way,
output wire [`CS_TAG_SEL_BITS-1:0] repl_tag
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (reset)
`UNUSED_VAR (lookup)
// valid, tag
localparam TAG_WIDTH = 1 + `CS_TAG_SEL_BITS;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_TAG_ADDR(line_addr);
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr);
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
wire [NUM_WAYS-1:0] read_valid;
if (NUM_WAYS > 1) begin
reg [NUM_WAYS-1:0] repl_way;
reg [NUM_WAYS-1:0] repl_way_r;
// cyclic assignment of replacement way
always @(posedge clk) begin
if (reset) begin
repl_way <= 1;
repl_way_r <= 1;
end else if (~stall) begin // hold the value on stalls prevent filling different slots twice
repl_way <= {repl_way[NUM_WAYS-2:0], repl_way[NUM_WAYS-1]};
repl_way_r <= {repl_way_r[NUM_WAYS-2:0], repl_way_r[NUM_WAYS-1]};
end
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign way_sel[i] = fill && repl_way[i];
end
assign repl_way = repl_way_r;
VX_onehot_mux #(
.DATAW (`CS_TAG_SEL_BITS),
.N (NUM_WAYS)
) repl_tag_sel (
.data_in (read_tag),
.sel_in (repl_way_r),
.data_out (repl_tag)
);
end else begin
`UNUSED_VAR (stall)
assign way_sel = fill;
assign repl_way = 1'b1;
assign repl_tag = read_tag;
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [`CS_TAG_SEL_BITS-1:0] read_tag;
wire read_valid;
wire do_fill = fill && repl_way[i];
wire do_write = init || do_fill;
wire line_valid = ~init;
VX_sp_ram #(
.DATAW (TAG_WIDTH),
@ -85,27 +103,29 @@ module VX_cache_tags #(
) tag_store (
.clk (clk),
.read (1'b1),
.write (way_sel[i] || init),
.write (do_write),
`UNUSED_PIN (wren),
.addr (line_sel),
.wdata ({~init, line_tag}),
.rdata ({read_valid, read_tag})
.wdata ({line_valid, line_tag}),
.rdata ({read_valid[i], read_tag[i]})
);
end
assign tag_matches[i] = read_valid && (line_tag == read_tag);
for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]);
end
`ifdef DBG_TRACE_CACHE
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag));
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), repl_way, line_sel, line_tag));
end
if (init) begin
`TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
end
if (lookup && ~stall) begin
if (tag_matches != 0) begin
`TRACE(3, ("%d: %s hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag, req_uuid));
`TRACE(3, ("%d: %s hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
end else begin
`TRACE(3, ("%d: %s miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -23,20 +23,20 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -45,6 +45,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -63,7 +66,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Memory request output buffer
parameter MEM_OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -80,7 +83,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
@ -98,7 +101,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
) mem_bus_cache_if();
if (NC_OR_BYPASS) begin
`RESET_RELAY (nc_bypass_reset, reset);
VX_cache_bypass #(
@ -108,13 +111,13 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.PASSTHRU (PASSTHRU),
.NC_ENABLE (PASSTHRU ? 0 : NC_ENABLE),
.WORD_SIZE (WORD_SIZE),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (LINE_SIZE),
.CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH),
.CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH),
.CORE_TAG_WIDTH (TAG_WIDTH),
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
@ -132,15 +135,15 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.mem_bus_in_if (mem_bus_cache_if),
.mem_bus_out_if (mem_bus_if)
);
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
end
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if);
end
end
if (PASSTHRU != 0) begin
@ -152,7 +155,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
assign core_bus_cache_if[i].rsp_valid = 0;
assign core_bus_cache_if[i].rsp_data = '0;
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
end
end
assign mem_bus_cache_if.req_valid = 0;
assign mem_bus_cache_if.req_data = '0;
@ -183,6 +186,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
@ -195,8 +199,8 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
`endif
.core_bus_if (core_bus_cache_if),
.mem_bus_if (mem_bus_cache_if)
);
);
end
`ifdef DBG_TRACE_CACHE
@ -225,9 +229,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
end
if (core_rsp_fire) begin
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
end
end
end
end
end
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
@ -246,17 +250,17 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_bus_if.req_data.rw)
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid));
else
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid));
end
end
end
`endif
endmodule

View file

@ -181,7 +181,7 @@ module VX_alu_int #(
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({br_enable, br_wid, br_taken, br_dest}),
.data_in ({br_enable, br_wid, br_taken, br_dest}),
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
);

View file

@ -269,7 +269,6 @@ module VX_operands import VX_gpu_pkg::*; #(
`endif
.DATAW (`XLEN * `NUM_THREADS),
.SIZE (PER_BANK_REGS * PER_ISSUE_WARPS),
.ADDR_MIN ((b == 0) ? PER_ISSUE_WARPS : 0),
.WRENW (BYTEENW),
.NO_RWCHECK (1)
) gpr_ram (

View file

@ -13,7 +13,7 @@
`include "VX_platform.vh"
`TRACING_OFF
//`TRACING_OFF
module VX_pending_size #(
parameter SIZE = 1,
parameter INCRW = 1,
@ -32,8 +32,8 @@ module VX_pending_size #(
output wire alm_full,
output wire [SIZEW-1:0] size
);
`STATIC_ASSERT(INCRW <= SIZEW, ("invalid parameter"))
`STATIC_ASSERT(DECRW <= SIZEW, ("invalid parameter"))
`STATIC_ASSERT(INCRW <= SIZEW, ("invalid parameter: %d vs %d", INCRW, SIZEW))
`STATIC_ASSERT(DECRW <= SIZEW, ("invalid parameter: %d vs %d", DECRW, SIZEW))
localparam ADDRW = `LOG2UP(SIZE);
reg empty_r, alm_empty_r;
@ -53,8 +53,8 @@ module VX_pending_size #(
full_r <= 0;
size_r <= '0;
end else begin
`ASSERT((incr >= decr) || (size_n >= size_r), ("runtime error: counter overflow"));
`ASSERT((incr <= decr) || (size_n <= size_r), ("runtime error: counter underflow"));
`ASSERT((SIZEW'(incr) >= SIZEW'(decr)) || (size_n >= size_r), ("runtime error: counter overflow"));
`ASSERT((SIZEW'(incr) <= SIZEW'(decr)) || (size_n <= size_r), ("runtime error: counter underflow"));
size_r <= size_n;
empty_r <= (size_n == SIZEW'(0));
alm_empty_r <= (size_n == SIZEW'(ALM_EMPTY));
@ -127,4 +127,4 @@ module VX_pending_size #(
assign full = full_r;
endmodule
`TRACING_ON
//`TRACING_ON