memory coalescing RTL implementation

This commit is contained in:
Blaise Tine 2024-03-04 22:18:39 -08:00
parent 274e6a4c52
commit 288147ac4f
5 changed files with 525 additions and 68 deletions

View file

@ -266,7 +266,7 @@
// LSU line size
`ifndef LSU_LINE_SIZE
`define LSU_LINE_SIZE (`XLEN / 8)
`define LSU_LINE_SIZE `MIN(`NUM_LSU_LANES * (`XLEN / 8), `L1_LINE_SIZE)
`endif
// LSU Duplicate Address Check

View file

@ -114,12 +114,11 @@ package VX_gpu_pkg;
// Input request size
localparam DCACHE_NUM_REQS = `UP((`NUM_LSU_LANES * (`XLEN / 8)) / DCACHE_WORD_SIZE);
// Batch select bits
localparam DCACHE_NUM_BATCHES = ((`NUM_LSU_LANES + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS);
localparam DCACHE_BATCH_SEL_BITS = `CLOG2(DCACHE_NUM_BATCHES);
// Core request tag Id bits
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_OUT_SIZE) + DCACHE_BATCH_SEL_BITS);
localparam DCACHE_MERGED_REQS = (`NUM_LSU_LANES * DCACHE_WORD_SIZE) / DCACHE_LINE_SIZE;
localparam DCACHE_MEM_BATCHES = (DCACHE_MERGED_REQS + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS;
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_OUT_SIZE) + `CLOG2(DCACHE_MEM_BATCHES));
// Core request tag bits
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);

View file

@ -77,8 +77,6 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
// tag = uuid + tag_id
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
// full address calculation
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;

View file

@ -0,0 +1,365 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`TRACING_OFF
// Memory request coalescer.
//
// Takes one input request made of NUM_REQS narrow lanes (DATA_IN_SIZE bytes
// each) and emits requests made of OUT_REQS wide lanes (DATA_OUT_SIZE bytes
// each). Input lanes are split into OUT_REQS groups of BATCH_SIZE consecutive
// lanes; group i feeds output lane i.
//
// Each round picks, per group, one "seed" lane among the active lanes that
// have not been served yet, and merges every active lane of the group whose
// upper address bits equal the seed's into a single wide access. Lanes that
// do not match are served in later rounds; in_req_ready is asserted in the
// round in which no active, unserved, non-matching lane remains.
//
// For reads, {tag id, merged-lane mask, per-lane word offsets} is written to
// an index buffer and the buffer slot is carried in the low bits of the
// outgoing tag. Responses use that slot to recover the original tag id and
// to pick each lane's word out of the wide response data.
module VX_mem_coalescer #(
    parameter `STRING INSTANCE_ID = "",
    parameter NUM_REQS       = 1,
    parameter ADDR_WIDTH     = 32,
    parameter DATA_IN_SIZE   = 4,
    parameter DATA_OUT_SIZE  = 64,
    parameter TAG_WIDTH      = 8,
    parameter UUID_WIDTH     = 0, // upper section of the request tag contains the UUID
    parameter QUEUE_SIZE     = 8,
    parameter DATA_IN_WIDTH  = DATA_IN_SIZE * 8,
    parameter DATA_OUT_WIDTH = DATA_OUT_SIZE * 8,
    parameter OUT_REQS       = (NUM_REQS * DATA_IN_WIDTH) / DATA_OUT_WIDTH,
    parameter BATCH_SIZE     = DATA_OUT_SIZE / DATA_IN_SIZE,
    parameter BATCH_SIZE_W   = `LOG2UP(BATCH_SIZE),
    parameter OUT_ADDR_WIDTH = ADDR_WIDTH - BATCH_SIZE_W,
    parameter QUEUE_ADDRW    = `CLOG2(QUEUE_SIZE),
    parameter OUT_TAG_WIDTH  = UUID_WIDTH + QUEUE_ADDRW
) (
    input wire clk,
    input wire reset,

    // Input request
    input wire                                      in_req_valid,
    input wire                                      in_req_rw,
    input wire [NUM_REQS-1:0]                       in_req_mask,
    input wire [NUM_REQS-1:0][DATA_IN_SIZE-1:0]     in_req_byteen,
    input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0]       in_req_addr,
    input wire [NUM_REQS-1:0][DATA_IN_WIDTH-1:0]    in_req_data,
    input wire [TAG_WIDTH-1:0]                      in_req_tag,
    output wire                                     in_req_ready,

    // Input response
    output wire                                     in_rsp_valid,
    output wire [NUM_REQS-1:0]                      in_rsp_mask,
    output wire [NUM_REQS-1:0][DATA_IN_WIDTH-1:0]   in_rsp_data,
    output wire [TAG_WIDTH-1:0]                     in_rsp_tag,
    input wire                                      in_rsp_ready,

    // Output request
    output wire                                     out_req_valid,
    output wire                                     out_req_rw,
    output wire [OUT_REQS-1:0]                      out_req_mask,
    output wire [OUT_REQS-1:0][DATA_OUT_SIZE-1:0]   out_req_byteen,
    output wire [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0]  out_req_addr,
    output wire [OUT_REQS-1:0][DATA_OUT_WIDTH-1:0]  out_req_data,
    output wire [OUT_TAG_WIDTH-1:0]                 out_req_tag,
    input wire                                      out_req_ready,

    // Output response
    input wire                                      out_rsp_valid,
    input wire [OUT_REQS-1:0]                       out_rsp_mask,
    input wire [OUT_REQS-1:0][DATA_OUT_WIDTH-1:0]   out_rsp_data,
    input wire [OUT_TAG_WIDTH-1:0]                  out_rsp_tag,
    output wire                                     out_rsp_ready
);
    `UNUSED_SPARAM (INSTANCE_ID)

    // the input lanes must pack into a whole, non-zero number of output lanes
    `STATIC_ASSERT (`IS_DIVISBLE(NUM_REQS * DATA_IN_WIDTH, DATA_OUT_WIDTH), ("invalid parameter"))
    `STATIC_ASSERT ((NUM_REQS * DATA_IN_WIDTH >= DATA_OUT_WIDTH), ("invalid parameter"))
    `RUNTIME_ASSERT ((~in_req_valid || in_req_mask != 0), ("invalid request mask"));
    `RUNTIME_ASSERT ((~out_rsp_valid || out_rsp_mask != 0), ("invalid response mask"));

    localparam TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH;
    localparam NUM_REQS_W   = `LOG2UP(NUM_REQS);

    // tag + mask + offset
    localparam IBUF_DATA_WIDTH = TAG_ID_WIDTH + NUM_REQS + (NUM_REQS * BATCH_SIZE_W);

    // STATE_SETUP: capture the seed address of every group.
    // STATE_SEND : build and register the merged outgoing request.
    localparam STATE_SETUP = 0;
    localparam STATE_SEND  = 1;

    logic state_r, state_n;

    // registered outgoing request (_r) and its next value (_n)
    logic out_req_valid_r, out_req_valid_n;
    logic out_req_rw_r, out_req_rw_n;
    logic [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n;
    logic [OUT_REQS-1:0][DATA_OUT_SIZE-1:0] out_req_byteen_r, out_req_byteen_n;
    logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n;
    logic [OUT_REQS-1:0][DATA_OUT_WIDTH-1:0] out_req_data_r, out_req_data_n;
    logic [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n;

    logic in_req_ready_n;

    // index buffer interface
    wire ibuf_push;
    wire ibuf_pop;
    wire [QUEUE_ADDRW-1:0] ibuf_waddr;
    wire [QUEUE_ADDRW-1:0] ibuf_raddr;
    wire ibuf_full;
    wire ibuf_empty;
    wire [IBUF_DATA_WIDTH-1:0] ibuf_din;
    wire [IBUF_DATA_WIDTH-1:0] ibuf_dout;

    // per-group: an unserved active lane exists / address of its seed lane
    logic [OUT_REQS-1:0] batch_valid_r, batch_valid_n;
    logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] seed_addr_r, seed_addr_n;

    // lanes of the current input request already merged in earlier rounds
    logic [NUM_REQS-1:0] processed_mask_r, processed_mask_n;

    wire [OUT_REQS-1:0][NUM_REQS_W-1:0] seed_idx;

    // split each lane address into wide-word address and word offset within it
    wire [NUM_REQS-1:0][OUT_ADDR_WIDTH-1:0] in_addr_base;
    wire [NUM_REQS-1:0][BATCH_SIZE_W-1:0] in_addr_offset;
    for (genvar i = 0; i < NUM_REQS; i++) begin
        assign in_addr_base[i]   = in_req_addr[i][ADDR_WIDTH-1:BATCH_SIZE_W];
        assign in_addr_offset[i] = in_req_addr[i][BATCH_SIZE_W-1:0];
    end

    // select the seed lane of every group among its active, unserved lanes
    for (genvar i = 0; i < OUT_REQS; ++i) begin
        wire [BATCH_SIZE-1:0] batch_mask = in_req_mask[BATCH_SIZE * i +: BATCH_SIZE] & ~processed_mask_r[BATCH_SIZE * i +: BATCH_SIZE];
        wire [BATCH_SIZE_W-1:0] batch_idx;
        // NOTE(review): presumably returns the lowest set index - confirm in VX_priority_encoder
        VX_priority_encoder #(
            .N (BATCH_SIZE)
        ) priority_encoder (
            .data_in   (batch_mask),
            .index     (batch_idx),
            `UNUSED_PIN (onehot),
            .valid_out (batch_valid_n[i])
        );
        assign seed_idx[i] = NUM_REQS_W'(BATCH_SIZE * i) + NUM_REQS_W'(batch_idx);
    end

    always @(posedge clk) begin
        if (reset) begin
            state_r          <= STATE_SETUP;
            processed_mask_r <= '0;
            // out_req_valid drives the external handshake and gates the
            // SETUP -> SEND transition, so it must be defined after reset
            out_req_valid_r  <= 0;
        end else begin
            state_r          <= state_n;
            out_req_valid_r  <= out_req_valid_n;
            batch_valid_r    <= batch_valid_n;
            seed_addr_r      <= seed_addr_n;
            out_req_rw_r     <= out_req_rw_n;
            out_req_mask_r   <= out_req_mask_n;
            out_req_addr_r   <= out_req_addr_n;
            out_req_byteen_r <= out_req_byteen_n;
            out_req_data_r   <= out_req_data_n;
            out_req_tag_r    <= out_req_tag_n;
            processed_mask_r <= processed_mask_n;
        end
    end

    // lanes whose wide-word address equals the registered seed of their group
    logic [NUM_REQS-1:0] addr_matches;
    always @(*) begin
        addr_matches = '0;
        for (integer i = 0; i < OUT_REQS; ++i) begin
            for (integer j = 0; j < BATCH_SIZE; j++) begin
                if (in_addr_base[BATCH_SIZE * i + j] == seed_addr_r[i]) begin
                    addr_matches[BATCH_SIZE * i + j] = 1;
                end
            end
        end
    end

    // active lanes merged into the request built in this round
    wire [NUM_REQS-1:0] current_pmask = in_req_mask & addr_matches;

    // outgoing tag = {uuid, index buffer slot}; the UUID slice only exists
    // when UUID_WIDTH != 0 (a zero-width part-select is not legal)
    wire [OUT_TAG_WIDTH-1:0] out_req_tag_w;
    if (UUID_WIDTH != 0) begin
        assign out_req_tag_w = {in_req_tag[TAG_WIDTH-1 -: UUID_WIDTH], ibuf_waddr};
    end else begin
        assign out_req_tag_w = ibuf_waddr;
    end

    always @(*) begin
        // defaults: hold every register
        state_n          = state_r;
        out_req_valid_n  = out_req_valid_r;
        seed_addr_n      = seed_addr_r;
        out_req_rw_n     = out_req_rw_r;
        out_req_mask_n   = out_req_mask_r;
        out_req_addr_n   = out_req_addr_r;
        out_req_byteen_n = out_req_byteen_r;
        out_req_data_n   = out_req_data_r;
        out_req_tag_n    = out_req_tag_r;
        processed_mask_n = processed_mask_r;
        in_req_ready_n   = 0;

        case (state_r)
        STATE_SETUP: begin
            // find the next seed address
            for (integer i = 0; i < OUT_REQS; ++i) begin
                seed_addr_n[i] = in_addr_base[seed_idx[i]];
            end
            // wait for pending outgoing request to submit
            if (out_req_valid && out_req_ready) begin
                out_req_valid_n = 0;
            end
            // start a new round once the output register is free and the
            // index buffer has room
            if (in_req_valid && ~out_req_valid_n && ~ibuf_full) begin
                state_n = STATE_SEND;
            end
        end
        default/*STATE_SEND*/: begin
            out_req_valid_n  = 1;
            out_req_rw_n     = in_req_rw;
            out_req_tag_n    = out_req_tag_w;
            in_req_ready_n   = 1;
            out_req_byteen_n = '0;
            for (integer i = 0; i < OUT_REQS; ++i) begin
                for (integer j = 0; j < BATCH_SIZE; j++) begin
                    if (in_req_mask[BATCH_SIZE * i + j]) begin
                        if (addr_matches[BATCH_SIZE * i + j]) begin
                            // place the lane's byte enables and data at its word offset
                            out_req_byteen_n[i][in_addr_offset[BATCH_SIZE * i + j] * DATA_IN_SIZE +: DATA_IN_SIZE] = in_req_byteen[BATCH_SIZE * i + j];
                            out_req_data_n[i][in_addr_offset[BATCH_SIZE * i + j] * DATA_IN_WIDTH +: DATA_IN_WIDTH] = in_req_data[BATCH_SIZE * i + j];
                        end else begin
                            // an active lane still unserved: another round is needed
                            if (!processed_mask_r[BATCH_SIZE * i + j]) begin
                                in_req_ready_n = 0;
                            end
                        end
                    end
                end
                out_req_mask_n[i] = batch_valid_r[i];
                out_req_addr_n[i] = seed_addr_r[i];
            end
            if (in_req_ready_n) begin
                // input request fully consumed: start fresh for the next one
                processed_mask_n = '0;
            end else begin
                processed_mask_n = processed_mask_r | current_pmask;
            end
            state_n = STATE_SETUP;
        end
        endcase
    end

    wire out_rsp_fire = out_rsp_valid && out_rsp_ready;

    wire out_rsp_eop;

    // only reads get a response, so only reads record an index buffer entry
    assign ibuf_push  = (state_r == STATE_SEND) && ~in_req_rw;
    assign ibuf_pop   = out_rsp_fire && out_rsp_eop;
    assign ibuf_raddr = out_rsp_tag[QUEUE_ADDRW-1:0];

    wire [TAG_ID_WIDTH-1:0] ibuf_din_tag = in_req_tag[TAG_ID_WIDTH-1:0];
    wire [NUM_REQS-1:0][BATCH_SIZE_W-1:0] ibuf_din_offset = in_addr_offset;
    wire [NUM_REQS-1:0] ibuf_din_pmask = current_pmask;

    assign ibuf_din = {ibuf_din_tag, ibuf_din_pmask, ibuf_din_offset};

    VX_index_buffer #(
        .DATAW (IBUF_DATA_WIDTH),
        .SIZE  (QUEUE_SIZE)
    ) req_ibuf (
        .clk        (clk),
        .reset      (reset),
        .acquire_en (ibuf_push),
        .write_addr (ibuf_waddr),
        .write_data (ibuf_din),
        .read_data  (ibuf_dout),
        .read_addr  (ibuf_raddr),
        .release_en (ibuf_pop),
        .full       (ibuf_full),
        .empty      (ibuf_empty)
    );
    `UNUSED_VAR (ibuf_empty)

    assign out_req_valid = out_req_valid_r;
    assign out_req_rw    = out_req_rw_r;
    for (genvar i = 0; i < OUT_REQS; ++i) begin
        assign out_req_mask[i]   = out_req_mask_r[i];
        assign out_req_byteen[i] = out_req_byteen_r[i];
        assign out_req_addr[i]   = out_req_addr_r[i];
        assign out_req_data[i]   = out_req_data_r[i];
    end
    assign out_req_tag = out_req_tag_r;

    assign in_req_ready = in_req_ready_n;

    // unmerge responses

    // per index buffer slot: output lanes whose response is still outstanding
    reg [QUEUE_SIZE-1:0][OUT_REQS-1:0] rsp_rem_mask;
    wire [OUT_REQS-1:0] rsp_rem_mask_n = rsp_rem_mask[ibuf_raddr] & ~out_rsp_mask;

    // end of packet: this response clears the last outstanding lane of its slot
    assign out_rsp_eop = ~(| rsp_rem_mask_n);

    always @(posedge clk) begin
        if (ibuf_push) begin
            rsp_rem_mask[ibuf_waddr] <= batch_valid_r;
        end
        if (out_rsp_fire) begin
            rsp_rem_mask[ibuf_raddr] <= rsp_rem_mask_n;
        end
    end

    wire [NUM_REQS-1:0][BATCH_SIZE_W-1:0] ibuf_dout_offset;
    wire [NUM_REQS-1:0] ibuf_dout_pmask;
    wire [TAG_ID_WIDTH-1:0] ibuf_dout_tag;

    assign {ibuf_dout_tag, ibuf_dout_pmask, ibuf_dout_offset} = ibuf_dout;

    logic [NUM_REQS-1:0][DATA_IN_WIDTH-1:0] in_rsp_data_n;
    logic [NUM_REQS-1:0] in_rsp_mask_n;

    // fan each wide response lane back out to the input lanes that were
    // merged into it, selecting each lane's word by its recorded offset
    always @(*) begin
        for (integer i = 0; i < OUT_REQS; ++i) begin
            for (integer j = 0; j < BATCH_SIZE; j++) begin
                in_rsp_mask_n[BATCH_SIZE * i + j] = out_rsp_mask[i] && ibuf_dout_pmask[BATCH_SIZE * i + j];
                in_rsp_data_n[BATCH_SIZE * i + j] = out_rsp_data[i][ibuf_dout_offset[BATCH_SIZE * i + j] * DATA_IN_WIDTH +: DATA_IN_WIDTH];
            end
        end
    end

    assign in_rsp_valid = out_rsp_valid;
    assign in_rsp_mask  = in_rsp_mask_n;
    assign in_rsp_data  = in_rsp_data_n;

    // response tag = {uuid from the memory tag, original tag id from the index buffer}
    if (UUID_WIDTH != 0) begin
        assign in_rsp_tag = {out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout_tag};
    end else begin
        assign in_rsp_tag = ibuf_dout_tag;
    end

    assign out_rsp_ready = in_rsp_ready;

`ifndef NDEBUG
    wire [`UP(UUID_WIDTH)-1:0] out_req_uuid;
    wire [`UP(UUID_WIDTH)-1:0] out_rsp_uuid;

    if (UUID_WIDTH != 0) begin
        assign out_req_uuid = out_req_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH];
        assign out_rsp_uuid = out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH];
    end else begin
        assign out_req_uuid = '0;
        assign out_rsp_uuid = '0;
    end

    // offsets/pmask captured at push time, used only by the request trace below
    reg [NUM_REQS-1:0][BATCH_SIZE_W-1:0] out_req_offset;
    reg [NUM_REQS-1:0] out_req_pmask;

    always @(posedge clk) begin
        if (ibuf_push) begin
            out_req_offset <= ibuf_din_offset;
            out_req_pmask  <= ibuf_din_pmask;
        end
    end

    wire out_req_fire = out_req_valid && out_req_ready;

    always @(posedge clk) begin
        if (out_req_fire) begin
            if (out_req_rw) begin
                `TRACE(1, ("%d: %s-out-req-wr: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask));
                `TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS);
                `TRACE(1, (", byteen="));
                `TRACE_ARRAY1D(1, "0x%h", out_req_byteen, OUT_REQS);
                `TRACE(1, (", data="));
                `TRACE_ARRAY1D(1, "0x%0h", out_req_data, OUT_REQS);
            end else begin
                `TRACE(1, ("%d: %s-out-req-rd: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask));
                `TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS);
            end
            `TRACE(1, (", offset="));
            `TRACE_ARRAY1D(1, "%0d", out_req_offset, NUM_REQS);
            `TRACE(1, (", pmask=%b, tag=0x%0h (#%0d)\n", out_req_pmask, out_req_tag, out_req_uuid));
            if ($countones(out_req_pmask) > 1) begin
                `TRACE(1, ("%t: *** %s: coalescing=%b (#%0d)\n", $time, INSTANCE_ID, out_req_pmask, out_req_uuid));
            end
        end
        if (out_rsp_fire) begin
            `TRACE(1, ("%d: %s-out-rsp: valid=%b, data=", $time, INSTANCE_ID, out_rsp_mask));
            `TRACE_ARRAY1D(1, "0x%0h", out_rsp_data, OUT_REQS);
            `TRACE(1, (", offset="));
            `TRACE_ARRAY1D(1, "%0d", ibuf_dout_offset, NUM_REQS);
            `TRACE(1, (", eop=%b, pmask=%b, tag=0x%0h (#%0d)\n", out_rsp_eop, ibuf_dout_pmask, out_rsp_tag, out_rsp_uuid));
        end
    end
`endif

endmodule
`TRACING_ON

View file

@ -13,7 +13,7 @@
`include "VX_platform.vh"
//`TRACING_OFF
`TRACING_OFF
module VX_mem_scheduler #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_REQS = 1,
@ -30,10 +30,13 @@ module VX_mem_scheduler #(
parameter MEM_OUT_BUF = 0,
parameter WORD_WIDTH = WORD_SIZE * 8,
parameter MEM_BATCHES = (CORE_REQS + MEM_CHANNELS - 1) / MEM_CHANNELS,
parameter MEM_QUEUE_ADDRW= `CLOG2(MEM_QUEUE_SIZE),
parameter LINE_WIDTH = LINE_SIZE * 8,
parameter PER_LINE_REQS = LINE_SIZE / WORD_SIZE,
parameter MERGED_REQS = CORE_REQS / PER_LINE_REQS,
parameter MEM_BATCHES = (MERGED_REQS + MEM_CHANNELS - 1) / MEM_CHANNELS,
parameter MEM_BATCH_BITS= `CLOG2(MEM_BATCHES),
parameter MEM_ADDR_WIDTH= ADDR_WIDTH,
parameter MEM_QUEUE_ADDRW= `CLOG2(MEM_QUEUE_SIZE),
parameter MEM_ADDR_WIDTH= ADDR_WIDTH - `CLOG2(PER_LINE_REQS),
parameter MEM_TAG_WIDTH = UUID_WIDTH + MEM_QUEUE_ADDRW + MEM_BATCH_BITS
) (
input wire clk,
@ -65,24 +68,28 @@ module VX_mem_scheduler #(
output wire [MEM_CHANNELS-1:0] mem_req_rw,
output wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen,
output wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [MEM_CHANNELS-1:0][WORD_WIDTH-1:0] mem_req_data,
output wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire [MEM_CHANNELS-1:0] mem_req_ready,
// Memory response
input wire [MEM_CHANNELS-1:0] mem_rsp_valid,
input wire [MEM_CHANNELS-1:0][WORD_WIDTH-1:0] mem_rsp_data,
input wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire [MEM_CHANNELS-1:0] mem_rsp_ready
);
localparam BATCH_SEL_WIDTH = `UP(MEM_BATCH_BITS);
localparam STALL_TIMEOUT = 10000000;
localparam CORE_QUEUE_ADDRW = `CLOG2(CORE_QUEUE_SIZE);
localparam TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH;
localparam REQQ_TAG_WIDTH = UUID_WIDTH + CORE_QUEUE_ADDRW;
localparam BATCH_SEL_WIDTH = `UP(MEM_BATCH_BITS);
localparam STALL_TIMEOUT = 10000000;
localparam CORE_QUEUE_ADDRW= `CLOG2(CORE_QUEUE_SIZE);
localparam TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH;
localparam REQQ_TAG_WIDTH = UUID_WIDTH + CORE_QUEUE_ADDRW;
localparam MERGED_TAG_WIDTH= UUID_WIDTH + MEM_QUEUE_ADDRW;
localparam COALESCE_ENABLE = (LINE_SIZE != WORD_SIZE);
localparam CORE_CHANNELS = COALESCE_ENABLE ? CORE_REQS : MEM_CHANNELS;
localparam CORE_BATCHES = COALESCE_ENABLE ? 1 : MEM_BATCHES;
localparam CORE_BATCH_BITS = `CLOG2(CORE_BATCHES);
`STATIC_ASSERT ((WORD_SIZE == LINE_SIZE), ("invalid parameter"))
`STATIC_ASSERT ((CORE_QUEUE_SIZE == MEM_QUEUE_SIZE), ("invalid parameter"))
`STATIC_ASSERT (`IS_DIVISBLE(CORE_REQS * WORD_SIZE, LINE_SIZE), ("invalid parameter"))
`STATIC_ASSERT ((TAG_WIDTH >= UUID_WIDTH), ("invalid parameter"))
`STATIC_ASSERT ((0 == RSP_PARTIAL) || (1 == RSP_PARTIAL), ("invalid parameter"))
`RUNTIME_ASSERT((~core_req_valid || core_req_mask != 0), ("invalid request mask"));
@ -105,19 +112,34 @@ module VX_mem_scheduler #(
wire [REQQ_TAG_WIDTH-1:0] reqq_tag;
wire reqq_ready;
wire reqq_valid_s;
wire [MERGED_REQS-1:0] reqq_mask_s;
wire reqq_rw_s;
wire [MERGED_REQS-1:0][LINE_SIZE-1:0] reqq_byteen_s;
wire [MERGED_REQS-1:0][MEM_ADDR_WIDTH-1:0] reqq_addr_s;
wire [MERGED_REQS-1:0][LINE_WIDTH-1:0] reqq_data_s;
wire [MERGED_TAG_WIDTH-1:0] reqq_tag_s;
wire reqq_ready_s;
wire [MEM_CHANNELS-1:0] mem_req_valid_s;
wire [MEM_CHANNELS-1:0] mem_req_mask_s;
wire mem_req_rw_s;
wire [MEM_CHANNELS-1:0][WORD_SIZE-1:0] mem_req_byteen_s;
wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_s;
wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
wire [MEM_CHANNELS-1:0][WORD_WIDTH-1:0] mem_req_data_s;
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire [MEM_CHANNELS-1:0] mem_req_ready_s;
wire mem_rsp_valid_s2;
wire [MEM_CHANNELS-1:0] mem_rsp_mask_s2;
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data_s2;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s2;
wire mem_rsp_ready_s2;
wire mem_rsp_valid_s;
wire [MEM_CHANNELS-1:0] mem_rsp_mask_s;
wire [MEM_CHANNELS-1:0][WORD_WIDTH-1:0] mem_rsp_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire [CORE_REQS-1:0] mem_rsp_mask_s;
wire [CORE_REQS-1:0][WORD_WIDTH-1:0] mem_rsp_data_s;
wire [REQQ_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
wire crsp_valid;
@ -174,7 +196,7 @@ module VX_mem_scheduler #(
assign ibuf_push = core_req_fire && ~core_req_rw;
assign ibuf_pop = crsp_fire && crsp_eop;
assign ibuf_raddr = mem_rsp_tag_s[MEM_BATCH_BITS +: MEM_QUEUE_ADDRW];
assign ibuf_raddr = mem_rsp_tag_s[CORE_BATCH_BITS +: CORE_QUEUE_ADDRW];
assign ibuf_din = core_req_tag[TAG_ID_WIDTH-1:0];
VX_index_buffer #(
@ -195,23 +217,96 @@ module VX_mem_scheduler #(
`UNUSED_VAR (ibuf_empty)
// Handle memory coalescing ///////////////////////////////////////////////
if (COALESCE_ENABLE) begin
`RESET_RELAY (coalescer_reset, reset);
VX_mem_coalescer #(
.INSTANCE_ID ($sformatf("%s-coalescer", INSTANCE_ID)),
.NUM_REQS (CORE_REQS),
.DATA_IN_SIZE (WORD_SIZE),
.DATA_OUT_SIZE (LINE_SIZE),
.ADDR_WIDTH (ADDR_WIDTH),
.TAG_WIDTH (REQQ_TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
.QUEUE_SIZE (MEM_QUEUE_SIZE)
) coalescer (
.clk (clk),
.reset (coalescer_reset),
// Input request
.in_req_valid (reqq_valid),
.in_req_mask (reqq_mask),
.in_req_rw (reqq_rw),
.in_req_byteen (reqq_byteen),
.in_req_addr (reqq_addr),
.in_req_data (reqq_data),
.in_req_tag (reqq_tag),
.in_req_ready (reqq_ready),
// Input response
.in_rsp_valid (mem_rsp_valid_s),
.in_rsp_mask (mem_rsp_mask_s),
.in_rsp_data (mem_rsp_data_s),
.in_rsp_tag (mem_rsp_tag_s),
.in_rsp_ready (mem_rsp_ready_s),
// Output request
.out_req_valid (reqq_valid_s),
.out_req_mask (reqq_mask_s),
.out_req_rw (reqq_rw_s),
.out_req_byteen (reqq_byteen_s),
.out_req_addr (reqq_addr_s),
.out_req_data (reqq_data_s),
.out_req_tag (reqq_tag_s),
.out_req_ready (reqq_ready_s),
// Output response
.out_rsp_valid (mem_rsp_valid_s2),
.out_rsp_mask (mem_rsp_mask_s2),
.out_rsp_data (mem_rsp_data_s2),
.out_rsp_tag (mem_rsp_tag_s2),
.out_rsp_ready (mem_rsp_ready_s2)
);
end else begin
assign reqq_valid_s = reqq_valid;
assign reqq_mask_s = reqq_mask;
assign reqq_rw_s = reqq_rw;
assign reqq_byteen_s= reqq_byteen;
assign reqq_addr_s = reqq_addr;
assign reqq_data_s = reqq_data;
assign reqq_tag_s = reqq_tag;
assign reqq_ready = reqq_ready_s;
assign mem_rsp_valid_s = mem_rsp_valid_s2;
assign mem_rsp_mask_s = mem_rsp_mask_s2;
assign mem_rsp_data_s = mem_rsp_data_s2;
assign mem_rsp_tag_s = mem_rsp_tag_s2;
assign mem_rsp_ready_s2 = mem_rsp_ready_s;
end
// Handle memory requests /////////////////////////////////////////////////
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0] mem_req_mask_b;
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][WORD_SIZE-1:0] mem_req_byteen_b;
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_b;
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_b;
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][WORD_WIDTH-1:0] mem_req_data_b;
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_b;
wire [BATCH_SEL_WIDTH-1:0] req_batch_idx;
for (genvar i = 0; i < MEM_BATCHES; ++i) begin
for (genvar j = 0; j < MEM_CHANNELS; ++j) begin
localparam r = i * MEM_CHANNELS + j;
if (r < CORE_REQS) begin
assign mem_req_mask_b[i][j] = reqq_mask[r];
assign mem_req_byteen_b[i][j] = reqq_byteen[r];
assign mem_req_addr_b[i][j] = reqq_addr[r];
assign mem_req_data_b[i][j] = reqq_data[r];
if (r < MERGED_REQS) begin
assign mem_req_mask_b[i][j] = reqq_mask_s[r];
assign mem_req_byteen_b[i][j] = reqq_byteen_s[r];
assign mem_req_addr_b[i][j] = reqq_addr_s[r];
assign mem_req_data_b[i][j] = reqq_data_s[r];
end else begin
assign mem_req_mask_b[i][j] = 0;
assign mem_req_byteen_b[i][j] = '0;
@ -222,7 +317,7 @@ module VX_mem_scheduler #(
end
assign mem_req_mask_s = mem_req_mask_b[req_batch_idx];
assign mem_req_rw_s = reqq_rw;
assign mem_req_rw_s = reqq_rw_s;
assign mem_req_byteen_s = mem_req_byteen_b[req_batch_idx];
assign mem_req_addr_s = mem_req_addr_b[req_batch_idx];
assign mem_req_data_s = mem_req_data_b[req_batch_idx];
@ -235,7 +330,7 @@ module VX_mem_scheduler #(
if (reset) begin
batch_sent_mask <= '0;
end else begin
if (reqq_valid) begin
if (reqq_valid_s) begin
if (batch_sent_all) begin
batch_sent_mask <= '0;
end else begin
@ -251,7 +346,7 @@ module VX_mem_scheduler #(
if (reset) begin
req_batch_idx_r <= '0;
end else begin
if (reqq_valid && batch_sent_all) begin
if (reqq_valid_s && batch_sent_all) begin
if (req_sent_all) begin
req_batch_idx_r <= '0;
end else begin
@ -283,22 +378,22 @@ module VX_mem_scheduler #(
assign req_batch_idx = req_batch_idx_r;
assign req_sent_all = batch_sent_all && (req_batch_idx_r == req_batch_idx_last);
assign mem_req_tag_s = {reqq_tag, req_batch_idx};
assign mem_req_tag_s = {reqq_tag_s, req_batch_idx};
end else begin
assign req_batch_idx = '0;
assign req_sent_all = batch_sent_all;
assign mem_req_tag_s = reqq_tag;
assign mem_req_tag_s = reqq_tag_s;
end
assign mem_req_valid_s = {MEM_CHANNELS{reqq_valid}} & mem_req_mask_s & ~batch_sent_mask;
assign reqq_ready = req_sent_all;
assign mem_req_valid_s = {MEM_CHANNELS{reqq_valid_s}} & mem_req_mask_s & ~batch_sent_mask;
assign reqq_ready_s = req_sent_all;
for (genvar i = 0; i < MEM_CHANNELS; ++i) begin
VX_elastic_buffer #(
.DATAW (1 + WORD_SIZE + MEM_ADDR_WIDTH + WORD_WIDTH + MEM_TAG_WIDTH),
.DATAW (1 + LINE_SIZE + MEM_ADDR_WIDTH + LINE_WIDTH + MEM_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
@ -318,7 +413,7 @@ module VX_mem_scheduler #(
// Select memory response
VX_mem_rsp_sel #(
.NUM_REQS (MEM_CHANNELS),
.DATA_WIDTH (WORD_WIDTH),
.DATA_WIDTH (LINE_WIDTH),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_BITS (MEM_TAG_WIDTH - UUID_WIDTH),
.OUT_BUF (2)
@ -329,26 +424,26 @@ module VX_mem_scheduler #(
.rsp_data_in (mem_rsp_data),
.rsp_tag_in (mem_rsp_tag),
.rsp_ready_in (mem_rsp_ready),
.rsp_valid_out (mem_rsp_valid_s),
.rsp_mask_out (mem_rsp_mask_s),
.rsp_data_out (mem_rsp_data_s),
.rsp_tag_out (mem_rsp_tag_s),
.rsp_ready_out (mem_rsp_ready_s)
.rsp_valid_out (mem_rsp_valid_s2),
.rsp_mask_out (mem_rsp_mask_s2),
.rsp_data_out (mem_rsp_data_s2),
.rsp_tag_out (mem_rsp_tag_s2),
.rsp_ready_out (mem_rsp_ready_s2)
);
reg [CORE_QUEUE_SIZE-1:0][CORE_REQS-1:0] rsp_rem_mask;
wire [CORE_REQS-1:0] rsp_rem_mask_n, curr_mask;
wire [BATCH_SEL_WIDTH-1:0] rsp_batch_idx;
if (MEM_BATCHES > 1) begin
assign rsp_batch_idx = mem_rsp_tag_s[MEM_BATCH_BITS-1:0];
if (CORE_BATCHES > 1) begin
assign rsp_batch_idx = mem_rsp_tag_s[CORE_BATCH_BITS-1:0];
end else begin
assign rsp_batch_idx = '0;
end
for (genvar r = 0; r < CORE_REQS; ++r) begin
localparam i = r / MEM_CHANNELS;
localparam j = r % MEM_CHANNELS;
localparam i = r / CORE_CHANNELS;
localparam j = r % CORE_CHANNELS;
assign curr_mask[r] = (BATCH_SEL_WIDTH'(i) == rsp_batch_idx) && mem_rsp_mask_s[j];
end
@ -385,7 +480,7 @@ module VX_mem_scheduler #(
assign crsp_sop = rsp_sop_r[ibuf_raddr];
for (genvar r = 0; r < CORE_REQS; ++r) begin
localparam j = r % MEM_CHANNELS;
localparam j = r % CORE_CHANNELS;
assign crsp_data[r] = mem_rsp_data_s[j];
end
@ -393,15 +488,15 @@ module VX_mem_scheduler #(
end else begin
reg [MEM_BATCHES*MEM_CHANNELS*WORD_WIDTH-1:0] rsp_store [CORE_QUEUE_SIZE-1:0];
reg [MEM_BATCHES*MEM_CHANNELS*WORD_WIDTH-1:0] rsp_store_n;
reg [CORE_BATCHES*CORE_CHANNELS*WORD_WIDTH-1:0] rsp_store [CORE_QUEUE_SIZE-1:0];
reg [CORE_BATCHES*CORE_CHANNELS*WORD_WIDTH-1:0] rsp_store_n;
reg [CORE_REQS-1:0] rsp_orig_mask [CORE_QUEUE_SIZE-1:0];
always @(*) begin
rsp_store_n = rsp_store[ibuf_raddr];
for (integer i = 0; i < MEM_CHANNELS; ++i) begin
if ((MEM_CHANNELS == 1) || mem_rsp_mask_s[i]) begin
rsp_store_n[(rsp_batch_idx * MEM_CHANNELS + i) * WORD_WIDTH +: WORD_WIDTH] = mem_rsp_data_s[i];
for (integer i = 0; i < CORE_CHANNELS; ++i) begin
if ((CORE_CHANNELS == 1) || mem_rsp_mask_s[i]) begin
rsp_store_n[(rsp_batch_idx * CORE_CHANNELS + i) * WORD_WIDTH +: WORD_WIDTH] = mem_rsp_data_s[i];
end
end
end
@ -420,16 +515,16 @@ module VX_mem_scheduler #(
assign crsp_sop = 1'b1;
for (genvar r = 0; r < CORE_REQS; ++r) begin
localparam i = r / MEM_CHANNELS;
localparam j = r % MEM_CHANNELS;
assign crsp_data[r] = rsp_store_n[(i * MEM_CHANNELS + j) * WORD_WIDTH +: WORD_WIDTH];
localparam i = r / CORE_CHANNELS;
localparam j = r % CORE_CHANNELS;
assign crsp_data[r] = rsp_store_n[(i * CORE_CHANNELS + j) * WORD_WIDTH +: WORD_WIDTH];
end
assign mem_rsp_ready_s = crsp_ready || ~rsp_complete;
end
if (UUID_WIDTH != 0) begin
assign crsp_tag = {mem_rsp_tag_s[MEM_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout};
assign crsp_tag = {mem_rsp_tag_s[REQQ_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout};
end else begin
assign crsp_tag = ibuf_dout;
end
@ -532,24 +627,24 @@ module VX_mem_scheduler #(
if (| mem_req_fire_s) begin
if (| mem_req_rw_s) begin
`TRACE(1, ("%d: %s-mem-req-wr: valid=%b, addr=", $time, INSTANCE_ID, mem_req_fire_s));
`TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, MEM_CHANNELS);
`TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS);
`TRACE(1, (", byteen="));
`TRACE_ARRAY1D(1, "0x%h", mem_req_byteen_s, MEM_CHANNELS);
`TRACE_ARRAY1D(1, "0x%h", mem_req_byteen_s, CORE_CHANNELS);
`TRACE(1, (", data="));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data_s, MEM_CHANNELS);
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data_s, CORE_CHANNELS);
end else begin
`TRACE(1, ("%d: %s-mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_fire_s));
`TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, MEM_CHANNELS);
`TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS);
end
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr, req_batch_idx, mem_req_dbg_uuid));
end
if (mem_rsp_fire_s) begin
`TRACE(1, ("%d: %s-mem-rsp: valid=%b, data=", $time, INSTANCE_ID, mem_rsp_mask_s));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data_s, MEM_CHANNELS);
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data_s, CORE_CHANNELS);
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_raddr, rsp_batch_idx, mem_rsp_dbg_uuid));
end
end
`endif
endmodule
//`TRACING_ON
`TRACING_ON