Using packed LSU memory requests within the code

This commit is contained in:
Blaise Tine 2024-03-18 21:22:02 -07:00
parent df38cc00f5
commit c175e11a18
14 changed files with 717 additions and 292 deletions

View file

@ -500,7 +500,7 @@
// Number of Banks
`ifndef LMEM_NUM_BANKS
`define LMEM_NUM_BANKS `DCACHE_NUM_BANKS
`define LMEM_NUM_BANKS `NUM_LSU_LANES
`endif
// L2cache Configurable Knobs /////////////////////////////////////////////////

View file

@ -349,6 +349,14 @@
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
assign dst.rsp_ready = src.rsp_ready
// Wires two VX_lsu_mem_if instances together with no width or tag adaptation.
// `src` is the requesting side and `dst` the serving side: request valid/data
// and response ready flow src -> dst, while request ready and response
// valid/data flow dst -> src.
// The final assign deliberately has no trailing ';' - the call site supplies it.
`define ASSIGN_VX_LSU_MEM_IF(dst, src) \
    assign src.req_ready = dst.req_ready; \
    assign src.rsp_valid = dst.rsp_valid; \
    assign src.rsp_data  = dst.rsp_data; \
    assign dst.req_valid = src.req_valid; \
    assign dst.req_data  = src.req_data; \
    assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
if (enable) begin \

View file

@ -77,6 +77,46 @@ package VX_gpu_pkg;
/* verilator lint_off UNUSED */
///////////////////////// LSU memory Parameters ///////////////////////////
localparam LSU_WORD_SIZE = `XLEN / 8;
localparam LSU_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(LSU_WORD_SIZE));
localparam LSU_MEM_BATCHES = 1;
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
////////////////////////// Dcache Parameters //////////////////////////////
// Word size in bytes
localparam DCACHE_WORD_SIZE = `LSU_LINE_SIZE;
localparam DCACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(DCACHE_WORD_SIZE));
// Block size in bytes
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
// Input request size
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
// Core request tag Id bits
localparam DCACHE_MERGED_REQS = (`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE;
localparam DCACHE_MEM_BATCHES = (DCACHE_MERGED_REQS + DCACHE_CHANNELS - 1) / DCACHE_CHANNELS;
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_OUT_SIZE) + `CLOG2(DCACHE_MEM_BATCHES));
// Core request tag bits
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
// Memory request data bits
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
////////////////////////// Icache Parameters //////////////////////////////
// Word size in bytes
@ -102,38 +142,6 @@ package VX_gpu_pkg;
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
// Word size in bytes
localparam DCACHE_WORD_SIZE = `LSU_LINE_SIZE;
localparam DCACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(DCACHE_WORD_SIZE));
// Block size in bytes
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
// Input request size
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * (`XLEN / 8)) / DCACHE_WORD_SIZE);
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
// Core request tag Id bits
localparam DCACHE_MERGED_REQS = (`NUM_LSU_LANES * (`XLEN / 8)) / DCACHE_WORD_SIZE;
localparam DCACHE_MEM_BATCHES = (DCACHE_MERGED_REQS + DCACHE_CHANNELS - 1) / DCACHE_CHANNELS;
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_OUT_SIZE) + `CLOG2(DCACHE_MEM_BATCHES));
// Core request tag bits
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
// Memory request data bits
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);

View file

@ -61,10 +61,11 @@ module VX_core import VX_gpu_pkg::*; #(
VX_commit_if commit_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_lmem_bus_if[DCACHE_NUM_REQS]();
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_mem_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
@ -176,7 +177,7 @@ module VX_core import VX_gpu_pkg::*; #(
.base_dcrs (base_dcrs),
.dcache_bus_if (dcache_lmem_bus_if),
.lsu_mem_if (lsu_mem_if),
.dispatch_if (dispatch_if),
.commit_if (commit_if),
@ -206,36 +207,131 @@ module VX_core import VX_gpu_pkg::*; #(
.sim_wb_value (sim_wb_value)
);
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
`ifdef LMEM_ENABLE
`RESET_RELAY (lmem_unit_reset, reset);
VX_lmem_unit #(
.CORE_ID (CORE_ID)
) lmem_unit (
.clk (clk),
.reset (reset),
.clk (clk),
.reset (lmem_unit_reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.lmem),
.cache_perf (mem_perf_tmp_if.lmem),
`endif
.dcache_bus_in_if (dcache_lmem_bus_if),
.dcache_bus_out_if (dcache_bus_if)
.lsu_mem_in_if (lsu_mem_if),
.lsu_mem_out_if (lsu_dcache_if)
);
`else
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_lmem_bus_if[i]);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
`ASSIGN_VX_LSU_MEM_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
`endif
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
`RESET_RELAY (coalescer_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin
VX_mem_coalescer #(
.INSTANCE_ID ($sformatf("core%0d-coalescer", CORE_ID)),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
) coalescer (
.clk (clk),
.reset (coalescer_reset),
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
.in_req_rw (lsu_dcache_if[i].req_data.rw),
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
.in_req_addr (lsu_dcache_if[i].req_data.addr),
.in_req_atype (lsu_dcache_if[i].req_data.atype),
.in_req_data (lsu_dcache_if[i].req_data.data),
.in_req_tag (lsu_dcache_if[i].req_data.tag),
.in_req_ready (lsu_dcache_if[i].req_ready),
// Input response
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if[i].req_valid),
.out_req_mask (dcache_coalesced_if[i].req_data.mask),
.out_req_rw (dcache_coalesced_if[i].req_data.rw),
.out_req_byteen (dcache_coalesced_if[i].req_data.byteen),
.out_req_addr (dcache_coalesced_if[i].req_data.addr),
.out_req_atype (dcache_coalesced_if[i].req_data.atype),
.out_req_data (dcache_coalesced_if[i].req_data.data),
.out_req_tag (dcache_coalesced_if[i].req_data.tag),
.out_req_ready (dcache_coalesced_if[i].req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if[i].rsp_valid),
.out_rsp_mask (dcache_coalesced_if[i].rsp_data.mask),
.out_rsp_data (dcache_coalesced_if[i].rsp_data.data),
.out_rsp_tag (dcache_coalesced_if[i].rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if[i].rsp_ready)
);
end else begin
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
end
end
`RESET_RELAY (lsu_adapter_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH)
) lsu_adapter (
.clk (clk),
.reset (lsu_adapter_reset),
.lsu_mem_if (dcache_coalesced_if[i]),
.mem_bus_if (dcache_bus_if[i * DCACHE_CHANNELS +: DCACHE_CHANNELS])
);
end
`ifdef PERF_ENABLE
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire [1:0] perf_icache_pending_read_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
wire [`CLOG2(LSU_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
@ -247,14 +343,16 @@ module VX_core import VX_gpu_pkg::*; #(
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
wire [LSU_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
wire [LSU_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [LSU_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && ~dcache_lmem_bus_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && dcache_lmem_bus_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_lmem_bus_if[i].rsp_valid && dcache_lmem_bus_if[i].rsp_ready;
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
assign perf_dcache_rd_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && ~lsu_mem_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && lsu_mem_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].rsp_valid && lsu_mem_if[i].rsp_data.mask[j] && lsu_mem_if[i].rsp_ready;
end
end
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);

View file

@ -29,7 +29,7 @@ module VX_execute import VX_gpu_pkg::*; #(
input base_dcrs_t base_dcrs,
// Dcache interface
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS],
// dispatch interface
VX_dispatch_if.slave dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
@ -77,7 +77,7 @@ module VX_execute import VX_gpu_pkg::*; #(
.reset (lsu_reset),
.dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.cache_bus_if (dcache_bus_if)
.lsu_mem_if (lsu_mem_if)
);
`ifdef EXT_F_ENABLE
@ -105,12 +105,12 @@ module VX_execute import VX_gpu_pkg::*; #(
`endif
.base_dcrs (base_dcrs),
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
`endif
.commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if),
.commit_if (commit_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.sched_csr_if (sched_csr_if),
.warp_ctl_if (warp_ctl_if)
);

View file

@ -23,47 +23,115 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
output cache_perf_t cache_perf,
`endif
VX_mem_bus_if.slave dcache_bus_in_if [DCACHE_NUM_REQS],
VX_mem_bus_if.master dcache_bus_out_if [DCACHE_NUM_REQS]
VX_lsu_mem_if.slave lsu_mem_in_if [`NUM_LSU_BLOCKS],
VX_lsu_mem_if.master lsu_mem_out_if [`NUM_LSU_BLOCKS]
);
`STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE);
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) lmem_bus_if[DCACHE_NUM_REQS]();
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_lsu_if[`NUM_LSU_BLOCKS]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
assign is_addr_local_mask[j] = lsu_mem_in_if[i].req_data.mask[j]
&& lsu_mem_in_if[i].req_data.atype[j][`ADDR_TYPE_LOCAL];
end
// is_addr_local_mask is already gated by the request lane mask, so a plain
// OR-reduction tells whether at least one ACTIVE lane targets local memory.
wire is_addr_local = (| is_addr_local_mask);
// The global test must also be restricted to active lanes. Inverting the
// local mask alone sets a bit for every inactive lane, which would flag a
// partially-masked, purely-local request as global and forward it to
// lsu_mem_out_if with an all-zero lane mask.
wire is_addr_global = (| (lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask));
assign lsu_mem_out_if[i].req_valid = lsu_mem_in_if[i].req_valid && is_addr_global;
assign lsu_mem_out_if[i].req_data.mask = lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask;
assign lsu_mem_out_if[i].req_data.rw = lsu_mem_in_if[i].req_data.rw;
assign lsu_mem_out_if[i].req_data.byteen= lsu_mem_in_if[i].req_data.byteen;
assign lsu_mem_out_if[i].req_data.addr = lsu_mem_in_if[i].req_data.addr;
assign lsu_mem_out_if[i].req_data.atype = lsu_mem_in_if[i].req_data.atype;
assign lsu_mem_out_if[i].req_data.data = lsu_mem_in_if[i].req_data.data;
assign lsu_mem_out_if[i].req_data.tag = lsu_mem_in_if[i].req_data.tag;
`RESET_RELAY (switch_reset, reset);
assign lmem_lsu_if[i].req_valid = lsu_mem_in_if[i].req_valid && is_addr_local;
assign lmem_lsu_if[i].req_data.mask = lsu_mem_in_if[i].req_data.mask & is_addr_local_mask;
assign lmem_lsu_if[i].req_data.rw = lsu_mem_in_if[i].req_data.rw;
assign lmem_lsu_if[i].req_data.byteen = lsu_mem_in_if[i].req_data.byteen;
assign lmem_lsu_if[i].req_data.addr = lsu_mem_in_if[i].req_data.addr;
assign lmem_lsu_if[i].req_data.atype = lsu_mem_in_if[i].req_data.atype;
assign lmem_lsu_if[i].req_data.data = lsu_mem_in_if[i].req_data.data;
assign lmem_lsu_if[i].req_data.tag = lsu_mem_in_if[i].req_data.tag;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
VX_mem_switch #(
.NUM_REQS (2),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (2)
) lmem_switch (
.clk (clk),
.reset (switch_reset),
.bus_sel (dcache_bus_in_if[i].req_data.atype[`ADDR_TYPE_LOCAL]),
.bus_in_if (dcache_bus_in_if[i]),
.bus_out_if (switch_out_bus_if[i * 2 +: 2])
assign lsu_mem_in_if[i].req_ready = (lsu_mem_out_if[i].req_ready && is_addr_global)
|| (lmem_lsu_if[i].req_ready && is_addr_local);
end
`RESET_RELAY (arb_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
wire rsp_arb_valid;
wire rsp_arb_index;
wire rsp_arb_ready;
VX_generic_arbiter #(
.NUM_REQS (2),
.LOCK_ENABLE (1),
.TYPE ("R")
) arbiter (
.clk (clk),
.reset (arb_reset),
.requests ({
lmem_lsu_if[i].rsp_valid,
lsu_mem_out_if[i].rsp_valid
}),
.grant_valid (rsp_arb_valid),
.grant_index (rsp_arb_index),
`UNUSED_PIN (grant_onehot),
.grant_unlock(rsp_arb_ready)
);
// output bus[0] goes to the dcache
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2 + 0]);
assign lsu_mem_in_if[i].rsp_valid = rsp_arb_valid;
assign lsu_mem_in_if[i].rsp_data.mask = rsp_arb_index ? lmem_lsu_if[i].rsp_data.mask : lsu_mem_out_if[i].rsp_data.mask;
assign lsu_mem_in_if[i].rsp_data.data = rsp_arb_index ? lmem_lsu_if[i].rsp_data.data : lsu_mem_out_if[i].rsp_data.data;
assign lsu_mem_in_if[i].rsp_data.tag = rsp_arb_index ? lmem_lsu_if[i].rsp_data.tag : lsu_mem_out_if[i].rsp_data.tag;
assign lsu_mem_out_if[i].rsp_ready = lsu_mem_in_if[i].rsp_ready && ~rsp_arb_index;
assign lmem_lsu_if[i].rsp_ready = lsu_mem_in_if[i].rsp_ready && rsp_arb_index;
assign rsp_arb_ready = lsu_mem_in_if[i].rsp_ready;
end
// output bus[1] goes to the local memory
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i], switch_out_bus_if[i * 2 + 1]);
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS]();
`RESET_RELAY (adapter_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH)
) lsu_adapter (
.clk (clk),
.reset (adapter_reset),
.lsu_mem_if (lmem_lsu_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
end
end
`RESET_RELAY (lmem_reset, reset);
@ -71,16 +139,15 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
VX_local_mem #(
.INSTANCE_ID($sformatf("core%0d-lmem", CORE_ID)),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (DCACHE_NUM_REQS),
.NUM_REQS (LSU_NUM_REQS),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (DCACHE_WORD_SIZE),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
.TAG_WIDTH (LSU_TAG_WIDTH)
) local_mem (
.clk (clk),
.reset (lmem_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif

View file

@ -0,0 +1,121 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
// Adapter between one packed, multi-lane VX_lsu_mem_if (slave side) and an
// array of NUM_LANES single-lane VX_mem_bus_if masters.
//
// Request path : per-lane fields of the packed request are bundled into one
//                vector per lane and handed to VX_stream_unpack; its per-lane
//                outputs drive mem_bus_if[lane].
// Response path: per-lane responses from mem_bus_if[lane] are fed to
//                VX_stream_pack, whose outputs drive the packed response.
//
// Parameters:
//   NUM_LANES    - number of lanes in the packed interface / size of mem_bus_if
//   DATA_SIZE    - per-lane data size in bytes (byteen is DATA_SIZE bits wide)
//   TAG_WIDTH    - request/response tag width
//   TAG_SEL_BITS - forwarded to VX_stream_pack
//   ARBITER      - forwarded to VX_stream_pack
//   REQ_OUT_BUF  - forwarded to VX_stream_unpack as OUT_BUF
//   RSP_OUT_BUF  - forwarded to VX_stream_pack as OUT_BUF
module VX_lsu_adapter #(
    parameter NUM_LANES       = 1,
    parameter DATA_SIZE       = 1,
    parameter TAG_WIDTH       = 1,
    parameter TAG_SEL_BITS    = 0,
    parameter `STRING ARBITER = "P",
    parameter REQ_OUT_BUF     = 0,
    parameter RSP_OUT_BUF     = 0
) (
    input wire              clk,
    input wire              reset,

    VX_lsu_mem_if.slave     lsu_mem_if,
    VX_mem_bus_if.master    mem_bus_if [NUM_LANES]
);
    // Width bookkeeping for one lane's bundled request / response payload.
    localparam LANE_ADDR_W = `MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE);
    localparam LANE_REQ_W  = 1 + DATA_SIZE + LANE_ADDR_W + `ADDR_TYPE_WIDTH + DATA_SIZE * 8;
    localparam LANE_RSP_W  = DATA_SIZE * 8;

    // Request side: bundled input and per-lane unpacker outputs.
    wire [NUM_LANES-1:0][LANE_REQ_W-1:0] lane_req_bundle;
    wire [NUM_LANES-1:0]                 lane_req_valid;
    wire [NUM_LANES-1:0][LANE_REQ_W-1:0] lane_req_payload;
    wire [NUM_LANES-1:0][TAG_WIDTH-1:0]  lane_req_tag;
    wire [NUM_LANES-1:0]                 lane_req_ready;

    // Response side: per-lane signals collected from the memory buses.
    wire [NUM_LANES-1:0]                 lane_rsp_valid;
    wire [NUM_LANES-1:0][LANE_RSP_W-1:0] lane_rsp_data;
    wire [NUM_LANES-1:0][TAG_WIDTH-1:0]  lane_rsp_tag;
    wire [NUM_LANES-1:0]                 lane_rsp_ready;

    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        // Bundle this lane's request fields; the single rw bit of the packed
        // request is replicated into every lane's bundle.
        assign lane_req_bundle[lane] = {
            lsu_mem_if.req_data.rw,
            lsu_mem_if.req_data.byteen[lane],
            lsu_mem_if.req_data.addr[lane],
            lsu_mem_if.req_data.atype[lane],
            lsu_mem_if.req_data.data[lane]
        };

        // Unbundle the unpacker output onto the lane's bus, in the same field
        // order used for bundling above.
        assign {
            mem_bus_if[lane].req_data.rw,
            mem_bus_if[lane].req_data.byteen,
            mem_bus_if[lane].req_data.addr,
            mem_bus_if[lane].req_data.atype,
            mem_bus_if[lane].req_data.data
        } = lane_req_payload[lane];
        assign mem_bus_if[lane].req_data.tag = lane_req_tag[lane];
        assign mem_bus_if[lane].req_valid    = lane_req_valid[lane];
        assign lane_req_ready[lane]          = mem_bus_if[lane].req_ready;

        // Collect the lane's response for the packer.
        assign lane_rsp_valid[lane]          = mem_bus_if[lane].rsp_valid;
        assign lane_rsp_data[lane]           = mem_bus_if[lane].rsp_data.data;
        assign lane_rsp_tag[lane]            = mem_bus_if[lane].rsp_data.tag;
        assign mem_bus_if[lane].rsp_ready    = lane_rsp_ready[lane];
    end

    // Request unpacking: one masked packed request in, per-lane requests out.
    VX_stream_unpack #(
        .NUM_REQS   (NUM_LANES),
        .DATA_WIDTH (LANE_REQ_W),
        .TAG_WIDTH  (TAG_WIDTH),
        .OUT_BUF    (REQ_OUT_BUF)
    ) stream_unpack (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (lsu_mem_if.req_valid),
        .mask_in   (lsu_mem_if.req_data.mask),
        .data_in   (lane_req_bundle),
        .tag_in    (lsu_mem_if.req_data.tag),
        .ready_in  (lsu_mem_if.req_ready),
        .valid_out (lane_req_valid),
        .data_out  (lane_req_payload),
        .tag_out   (lane_req_tag),
        .ready_out (lane_req_ready)
    );

    // Response packing: per-lane responses in, one masked packed response out.
    VX_stream_pack #(
        .NUM_REQS     (NUM_LANES),
        .DATA_WIDTH   (LANE_RSP_W),
        .TAG_WIDTH    (TAG_WIDTH),
        .TAG_SEL_BITS (TAG_SEL_BITS),
        .ARBITER      (ARBITER),
        .OUT_BUF      (RSP_OUT_BUF)
    ) stream_pack (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (lane_rsp_valid),
        .data_in   (lane_rsp_data),
        .tag_in    (lane_rsp_tag),
        .ready_in  (lane_rsp_ready),
        .valid_out (lsu_mem_if.rsp_valid),
        .mask_out  (lsu_mem_if.rsp_data.mask),
        .data_out  (lsu_mem_if.rsp_data.data),
        .tag_out   (lsu_mem_if.rsp_data.tag),
        .ready_out (lsu_mem_if.rsp_ready)
    );

endmodule

View file

@ -27,21 +27,19 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
// Outputs
VX_commit_if.master commit_if,
VX_mem_bus_if.master cache_bus_if [DCACHE_CHANNELS]
VX_lsu_mem_if.master lsu_mem_if
);
localparam WORD_SIZE = `XLEN / 8;
localparam ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(WORD_SIZE);
localparam NUM_LANES = `NUM_LSU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_IN_SIZE);
localparam REQ_ASHIFT = `CLOG2(WORD_SIZE);
localparam REQ_ASHIFT = `CLOG2(LSU_WORD_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
// tag_id = wid + PC + rd + op_type + align + pid + pkt_addr
localparam TAG_ID_WIDTH = `NW_WIDTH + `XLEN + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + PID_WIDTH + LSUQ_SIZEW;
localparam TAG_ID_WIDTH = `NW_WIDTH + `XLEN + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW;
// tag = uuid + tag_id
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
@ -99,15 +97,15 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire mem_req_valid;
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
reg [NUM_LANES-1:0][WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] mem_req_addr;
reg [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
wire [NUM_LANES-1:0] mem_rsp_mask;
wire [NUM_LANES-1:0][`XLEN-1:0] mem_rsp_data;
wire [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_rsp_data;
wire [TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_sop;
wire mem_rsp_eop;
@ -154,7 +152,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
default : mem_req_byteen[i] = {WORD_SIZE{1'b1}};
default : mem_req_byteen[i] = {LSU_WORD_SIZE{1'b1}};
endcase
end
end
@ -268,28 +266,31 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
pkt_waddr
};
wire [DCACHE_CHANNELS-1:0] cache_req_valid;
wire [DCACHE_CHANNELS-1:0] cache_req_rw;
wire [DCACHE_CHANNELS-1:0][DCACHE_WORD_SIZE-1:0] cache_req_byteen;
wire [DCACHE_CHANNELS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
wire [DCACHE_CHANNELS-1:0][`ADDR_TYPE_WIDTH-1:0] cache_req_atype;
wire [DCACHE_CHANNELS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_req_data;
wire [DCACHE_CHANNELS-1:0][DCACHE_TAG_WIDTH-1:0] cache_req_tag;
wire [DCACHE_CHANNELS-1:0] cache_req_ready;
wire [DCACHE_CHANNELS-1:0] cache_rsp_valid;
wire [DCACHE_CHANNELS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_rsp_data;
wire [DCACHE_CHANNELS-1:0][DCACHE_TAG_WIDTH-1:0] cache_rsp_tag;
wire [DCACHE_CHANNELS-1:0] cache_rsp_ready;
wire lsu_mem_req_valid;
wire lsu_mem_req_rw;
wire [NUM_LANES-1:0] lsu_mem_req_mask;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr;
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] lsu_mem_req_atype;
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data;
wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag;
wire lsu_mem_req_ready;
wire lsu_mem_rsp_valid;
wire [NUM_LANES-1:0] lsu_mem_rsp_mask;
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_rsp_data;
wire [LSU_TAG_WIDTH-1:0] lsu_mem_rsp_tag;
wire lsu_mem_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched%0d", CORE_ID, BLOCK_ID)),
.CORE_REQS (`NUM_LSU_LANES),
.MEM_CHANNELS(DCACHE_CHANNELS),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (ADDR_WIDTH),
.CORE_REQS (NUM_LANES),
.MEM_CHANNELS(NUM_LANES),
.WORD_SIZE (LSU_WORD_SIZE),
.LINE_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
@ -324,37 +325,39 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.core_rsp_ready (mem_rsp_ready),
// Memory request
.mem_req_valid (cache_req_valid),
.mem_req_rw (cache_req_rw),
.mem_req_byteen (cache_req_byteen),
.mem_req_addr (cache_req_addr),
.mem_req_atype (cache_req_atype),
.mem_req_data (cache_req_data),
.mem_req_tag (cache_req_tag),
.mem_req_ready (cache_req_ready),
.mem_req_valid (lsu_mem_req_valid),
.mem_req_rw (lsu_mem_req_rw),
.mem_req_mask (lsu_mem_req_mask),
.mem_req_byteen (lsu_mem_req_byteen),
.mem_req_addr (lsu_mem_req_addr),
.mem_req_atype (lsu_mem_req_atype),
.mem_req_data (lsu_mem_req_data),
.mem_req_tag (lsu_mem_req_tag),
.mem_req_ready (lsu_mem_req_ready),
// Memory response
.mem_rsp_valid (cache_rsp_valid),
.mem_rsp_data (cache_rsp_data),
.mem_rsp_tag (cache_rsp_tag),
.mem_rsp_ready (cache_rsp_ready)
.mem_rsp_valid (lsu_mem_rsp_valid),
.mem_rsp_mask (lsu_mem_rsp_mask),
.mem_rsp_data (lsu_mem_rsp_data),
.mem_rsp_tag (lsu_mem_rsp_tag),
.mem_rsp_ready (lsu_mem_rsp_ready)
);
for (genvar i = 0; i < DCACHE_CHANNELS; ++i) begin
assign cache_bus_if[i].req_valid = cache_req_valid[i];
assign cache_bus_if[i].req_data.rw = cache_req_rw[i];
assign cache_bus_if[i].req_data.byteen = cache_req_byteen[i];
assign cache_bus_if[i].req_data.addr = cache_req_addr[i];
assign cache_bus_if[i].req_data.atype = cache_req_atype[i];
assign cache_bus_if[i].req_data.data = cache_req_data[i];
assign cache_bus_if[i].req_data.tag = cache_req_tag[i];
assign cache_req_ready[i] = cache_bus_if[i].req_ready;
assign lsu_mem_if.req_valid = lsu_mem_req_valid;
assign lsu_mem_if.req_data.mask = lsu_mem_req_mask;
assign lsu_mem_if.req_data.rw = lsu_mem_req_rw;
assign lsu_mem_if.req_data.byteen = lsu_mem_req_byteen;
assign lsu_mem_if.req_data.addr = lsu_mem_req_addr;
assign lsu_mem_if.req_data.atype = lsu_mem_req_atype;
assign lsu_mem_if.req_data.data = lsu_mem_req_data;
assign lsu_mem_if.req_data.tag = lsu_mem_req_tag;
assign lsu_mem_req_ready = lsu_mem_if.req_ready;
assign cache_rsp_valid[i] = cache_bus_if[i].rsp_valid;
assign cache_rsp_data[i] = cache_bus_if[i].rsp_data.data;
assign cache_rsp_tag[i] = cache_bus_if[i].rsp_data.tag;
assign cache_bus_if[i].rsp_ready = cache_rsp_ready[i];
end
assign lsu_mem_rsp_valid = lsu_mem_if.rsp_valid;
assign lsu_mem_rsp_mask = lsu_mem_if.rsp_data.mask;
assign lsu_mem_rsp_data = lsu_mem_if.rsp_data.data;
assign lsu_mem_rsp_tag = lsu_mem_if.rsp_data.tag;
assign lsu_mem_if.rsp_ready = lsu_mem_rsp_ready;
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [`NW_WIDTH-1:0] rsp_wid;
@ -455,8 +458,6 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru
// lsu commit
`RESET_RELAY (commit_arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (2),
@ -464,7 +465,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (commit_arb_reset),
.reset (reset),
.valid_in ({commit_st_if.valid, commit_ld_if.valid}),
.ready_in ({commit_st_if.ready, commit_ld_if.ready}),
.data_in ({commit_st_if.data, commit_ld_if.data}),
@ -531,8 +532,8 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.clk (clk),
.probe0 ({mem_req_data_0, execute_if.data.uuid, execute_if.data.wid, execute_if.data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
.probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, mem_rsp_mask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
.probe2 ({cache_bus_if.req_data.data, cache_bus_if.req_data.tag, cache_bus_if.req_data.byteen, cache_bus_if.req_data.addr, cache_bus_if.req_data.rw, cache_bus_if.req_ready, cache_bus_if.req_valid}),
.probe3 ({cache_bus_if.rsp_data.data, cache_bus_if.rsp_data.tag, cache_bus_if.rsp_ready, cache_bus_if.rsp_valid})
.probe2 ({lsu_mem_if.req_data.data, lsu_mem_if.req_data.tag, lsu_mem_if.req_data.byteen, lsu_mem_if.req_data.addr, lsu_mem_if.req_data.rw, lsu_mem_if.req_ready, lsu_mem_if.req_valid}),
.probe3 ({lsu_mem_if.rsp_data.data, lsu_mem_if.rsp_data.tag, lsu_mem_if.rsp_ready, lsu_mem_if.rsp_valid})
);
`endif
end

View file

@ -26,11 +26,11 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
// Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_mem_bus_if.master cache_bus_if [DCACHE_NUM_REQS]
VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS]
);
localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
localparam NUM_LANES = `NUM_LSU_LANES;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) per_block_execute_if[BLOCK_SIZE]();
@ -51,16 +51,18 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`RESET_RELAY (slice_reset, reset);
`RESET_RELAY (block_reset, reset);
VX_lsu_slice #(
.CORE_ID (CORE_ID),
.BLOCK_ID (block_idx)
) lsu_slice(
.clk (clk),
.reset (slice_reset),
.execute_if (per_block_execute_if[block_idx]),
.commit_if (per_block_commit_if[block_idx]),
.cache_bus_if (cache_bus_if[block_idx * DCACHE_CHANNELS +: DCACHE_CHANNELS])
.clk (clk),
.reset (block_reset),
.execute_if (per_block_execute_if[block_idx]),
.commit_if (per_block_commit_if[block_idx]),
.lsu_mem_if (lsu_mem_if[block_idx])
);
end

View file

@ -0,0 +1,69 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Packed multi-lane LSU memory interface: a single valid/ready handshake
// carries the request (or response) of all NUM_LANES lanes at once, with a
// per-lane mask marking which lanes are populated.
//
// Parameters:
//   NUM_LANES      - number of lanes carried per request/response
//   DATA_SIZE      - per-lane data size in bytes
//   ATYPE_WIDTH    - per-lane address-type field width
//   TAG_WIDTH      - width of the tag shared by all lanes
//   MEM_ADDR_WIDTH - byte-address width used to derive ADDR_WIDTH
//   ADDR_WIDTH     - per-lane address width (MEM_ADDR_WIDTH minus the
//                    log2(DATA_SIZE) offset bits)
interface VX_lsu_mem_if #(
    parameter NUM_LANES      = 1,
    parameter DATA_SIZE      = 1,
    parameter ATYPE_WIDTH    = `ADDR_TYPE_WIDTH,
    parameter TAG_WIDTH      = 1,
    parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
    parameter ADDR_WIDTH     = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE)
) ();

    // Request payload. Field order defines the packed bit layout and must
    // not be rearranged. rw and tag are shared by every lane.
    typedef struct packed {
        logic                                   rw;
        logic [NUM_LANES-1:0]                   mask;
        logic [NUM_LANES-1:0][DATA_SIZE-1:0]    byteen;
        logic [NUM_LANES-1:0][ADDR_WIDTH-1:0]   addr;
        logic [NUM_LANES-1:0][ATYPE_WIDTH-1:0]  atype;
        logic [NUM_LANES-1:0][DATA_SIZE*8-1:0]  data;
        logic [TAG_WIDTH-1:0]                   tag;
    } req_data_t;

    // Response payload. Field order defines the packed bit layout.
    typedef struct packed {
        logic [NUM_LANES-1:0]                   mask;
        logic [NUM_LANES-1:0][DATA_SIZE*8-1:0]  data;
        logic [TAG_WIDTH-1:0]                   tag;
    } rsp_data_t;

    // Request channel
    logic       req_valid;
    req_data_t  req_data;
    logic       req_ready;

    // Response channel
    logic       rsp_valid;
    rsp_data_t  rsp_data;
    logic       rsp_ready;

    // Requester view: drives requests and rsp_ready, observes the rest.
    modport master (
        output req_valid, req_data, rsp_ready,
        input  req_ready, rsp_valid, rsp_data
    );

    // Responder view: mirror image of the master modport.
    modport slave (
        input  req_valid, req_data, rsp_ready,
        output req_ready, rsp_valid, rsp_data
    );

endinterface

View file

@ -31,12 +31,13 @@ module VX_mem_scheduler #(
parameter MEM_OUT_BUF = 0,
parameter WORD_WIDTH = WORD_SIZE * 8,
parameter LINE_WIDTH = LINE_SIZE * 8,
parameter LINE_WIDTH = LINE_SIZE * 8,
parameter COALESCE_ENABLE = (LINE_SIZE != WORD_SIZE),
parameter PER_LINE_REQS = LINE_SIZE / WORD_SIZE,
parameter MERGED_REQS = CORE_REQS / PER_LINE_REQS,
parameter MEM_BATCHES = (MERGED_REQS + MEM_CHANNELS - 1) / MEM_CHANNELS,
parameter MEM_BATCH_BITS= `CLOG2(MEM_BATCHES),
parameter MEM_QUEUE_ADDRW= `CLOG2(MEM_QUEUE_SIZE),
parameter MEM_QUEUE_ADDRW= `CLOG2(COALESCE_ENABLE ? MEM_QUEUE_SIZE : CORE_QUEUE_SIZE),
parameter MEM_ADDR_WIDTH= ADDR_WIDTH - `CLOG2(PER_LINE_REQS),
parameter MEM_TAG_WIDTH = UUID_WIDTH + MEM_QUEUE_ADDRW + MEM_BATCH_BITS
) (
@ -66,20 +67,22 @@ module VX_mem_scheduler #(
input wire core_rsp_ready,
// Memory request
output wire [MEM_CHANNELS-1:0] mem_req_valid,
output wire [MEM_CHANNELS-1:0] mem_req_rw,
output wire mem_req_valid,
output wire mem_req_rw,
output wire [MEM_CHANNELS-1:0] mem_req_mask,
output wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen,
output wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] mem_req_atype,
output wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire [MEM_CHANNELS-1:0] mem_req_ready,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire [MEM_CHANNELS-1:0] mem_rsp_valid,
input wire mem_rsp_valid,
input wire [MEM_CHANNELS-1:0] mem_rsp_mask,
input wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire [MEM_CHANNELS-1:0] mem_rsp_ready
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
);
localparam BATCH_SEL_WIDTH = `UP(MEM_BATCH_BITS);
localparam STALL_TIMEOUT = 10000000;
@ -87,7 +90,6 @@ module VX_mem_scheduler #(
localparam TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH;
localparam REQQ_TAG_WIDTH = UUID_WIDTH + CORE_QUEUE_ADDRW;
localparam MERGED_TAG_WIDTH= UUID_WIDTH + MEM_QUEUE_ADDRW;
localparam COALESCE_ENABLE = (LINE_SIZE != WORD_SIZE);
localparam CORE_CHANNELS = COALESCE_ENABLE ? CORE_REQS : MEM_CHANNELS;
localparam CORE_BATCHES = COALESCE_ENABLE ? 1 : MEM_BATCHES;
localparam CORE_BATCH_BITS = `CLOG2(CORE_BATCHES);
@ -126,7 +128,7 @@ module VX_mem_scheduler #(
wire [MERGED_TAG_WIDTH-1:0] reqq_tag_s;
wire reqq_ready_s;
wire [MEM_CHANNELS-1:0] mem_req_valid_s;
wire mem_req_valid_s;
wire [MEM_CHANNELS-1:0] mem_req_mask_s;
wire mem_req_rw_s;
wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_s;
@ -134,13 +136,7 @@ module VX_mem_scheduler #(
wire [MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] mem_req_atype_s;
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire [MEM_CHANNELS-1:0] mem_req_ready_s;
wire mem_rsp_valid_s2;
wire [MEM_CHANNELS-1:0] mem_rsp_mask_s2;
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data_s2;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s2;
wire mem_rsp_ready_s2;
wire mem_req_ready_s;
wire mem_rsp_valid_s;
wire [CORE_REQS-1:0] mem_rsp_mask_s;
@ -273,11 +269,11 @@ module VX_mem_scheduler #(
.out_req_ready (reqq_ready_s),
// Output response
.out_rsp_valid (mem_rsp_valid_s2),
.out_rsp_mask (mem_rsp_mask_s2),
.out_rsp_data (mem_rsp_data_s2),
.out_rsp_tag (mem_rsp_tag_s2),
.out_rsp_ready (mem_rsp_ready_s2)
.out_rsp_valid (mem_rsp_valid),
.out_rsp_mask (mem_rsp_mask),
.out_rsp_data (mem_rsp_data),
.out_rsp_tag (mem_rsp_tag),
.out_rsp_ready (mem_rsp_ready)
);
end else begin
@ -292,11 +288,11 @@ module VX_mem_scheduler #(
assign reqq_tag_s = reqq_tag;
assign reqq_ready = reqq_ready_s;
assign mem_rsp_valid_s = mem_rsp_valid_s2;
assign mem_rsp_mask_s = mem_rsp_mask_s2;
assign mem_rsp_data_s = mem_rsp_data_s2;
assign mem_rsp_tag_s = mem_rsp_tag_s2;
assign mem_rsp_ready_s2 = mem_rsp_ready_s;
assign mem_rsp_valid_s = mem_rsp_valid;
assign mem_rsp_mask_s = mem_rsp_mask;
assign mem_rsp_data_s = mem_rsp_data;
assign mem_rsp_tag_s = mem_rsp_tag;
assign mem_rsp_ready = mem_rsp_ready_s;
end
@ -335,24 +331,6 @@ module VX_mem_scheduler #(
assign mem_req_addr_s = mem_req_addr_b[req_batch_idx];
assign mem_req_atype_s = mem_req_atype_b[req_batch_idx];
assign mem_req_data_s = mem_req_data_b[req_batch_idx];
reg [MEM_CHANNELS-1:0] batch_sent_mask;
wire [MEM_CHANNELS-1:0] batch_sent_mask_n = batch_sent_mask | mem_req_ready_s;
wire batch_sent_all = (mem_req_mask_s & ~batch_sent_mask_n) == 0;
always @(posedge clk) begin
if (reset) begin
batch_sent_mask <= '0;
end else begin
if (reqq_valid_s) begin
if (batch_sent_all) begin
batch_sent_mask <= '0;
end else begin
batch_sent_mask <= batch_sent_mask_n;
end
end
end
end
if (MEM_BATCHES != 1) begin
reg [MEM_BATCH_BITS-1:0] req_batch_idx_r;
@ -360,7 +338,7 @@ module VX_mem_scheduler #(
if (reset) begin
req_batch_idx_r <= '0;
end else begin
if (reqq_valid_s && batch_sent_all) begin
if (reqq_valid_s && mem_req_ready_s) begin
if (req_sent_all) begin
req_batch_idx_r <= '0;
end else begin
@ -391,60 +369,37 @@ module VX_mem_scheduler #(
);
assign req_batch_idx = req_batch_idx_r;
assign req_sent_all = batch_sent_all && (req_batch_idx_r == req_batch_idx_last);
assign req_sent_all = mem_req_ready_s && (req_batch_idx_r == req_batch_idx_last);
assign mem_req_tag_s = {reqq_tag_s, req_batch_idx};
end else begin
assign req_batch_idx = '0;
assign req_sent_all = batch_sent_all;
assign req_sent_all = mem_req_ready_s;
assign mem_req_tag_s = reqq_tag_s;
end
assign mem_req_valid_s = {MEM_CHANNELS{reqq_valid_s}} & mem_req_mask_s & ~batch_sent_mask;
assign mem_req_valid_s = reqq_valid_s;
assign reqq_ready_s = req_sent_all;
for (genvar i = 0; i < MEM_CHANNELS; ++i) begin
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + MEM_ADDR_WIDTH + ATYPE_WIDTH + LINE_WIDTH + MEM_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid_s[i]),
.ready_in (mem_req_ready_s[i]),
.data_in ({mem_req_rw_s, mem_req_byteen_s[i], mem_req_addr_s[i], mem_req_atype_s[i], mem_req_data_s[i], mem_req_tag_s}),
.data_out ({mem_req_rw[i], mem_req_byteen[i], mem_req_addr[i], mem_req_atype[i], mem_req_data[i], mem_req_tag[i]}),
.valid_out (mem_req_valid[i]),
.ready_out (mem_req_ready[i])
);
end
VX_elastic_buffer #(
.DATAW (MEM_CHANNELS + 1 + MEM_CHANNELS * (LINE_SIZE + MEM_ADDR_WIDTH + ATYPE_WIDTH + LINE_WIDTH) + MEM_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_mask_s, mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_atype_s, mem_req_data_s, mem_req_tag_s}),
.data_out ({mem_req_mask, mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_atype, mem_req_data, mem_req_tag}),
.valid_out (mem_req_valid),
.ready_out (mem_req_ready)
);
// Handle memory responses ////////////////////////////////////////////////
// Merge memory responses
VX_stream_merge #(
.NUM_REQS (MEM_CHANNELS),
.DATA_WIDTH (LINE_WIDTH),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_BITS (MEM_TAG_WIDTH - UUID_WIDTH),
.OUT_BUF (2)
) rsp_merge (
.clk (clk),
.reset (reset),
.rsp_valid_in (mem_rsp_valid),
.rsp_data_in (mem_rsp_data),
.rsp_tag_in (mem_rsp_tag),
.rsp_ready_in (mem_rsp_ready),
.rsp_valid_out (mem_rsp_valid_s2),
.rsp_mask_out (mem_rsp_mask_s2),
.rsp_data_out (mem_rsp_data_s2),
.rsp_tag_out (mem_rsp_tag_s2),
.rsp_ready_out (mem_rsp_ready_s2)
);
reg [CORE_QUEUE_SIZE-1:0][CORE_REQS-1:0] rsp_rem_mask;
wire [CORE_REQS-1:0] rsp_rem_mask_n, curr_mask;
wire [BATCH_SEL_WIDTH-1:0] rsp_batch_idx;
@ -617,7 +572,8 @@ module VX_mem_scheduler #(
assign rsp_dbg_uuid = '0;
end
wire [MEM_CHANNELS-1:0] mem_req_fire_s = mem_req_valid_s & mem_req_ready_s;
wire mem_req_fire_s = mem_req_valid_s && mem_req_ready_s;
always @(posedge clk) begin
if (core_req_fire) begin
if (core_req_rw) begin
@ -640,14 +596,14 @@ module VX_mem_scheduler #(
end
if (| mem_req_fire_s) begin
if (| mem_req_rw_s) begin
`TRACE(1, ("%d: %s-mem-req-wr: valid=%b, addr=", $time, INSTANCE_ID, mem_req_fire_s));
`TRACE(1, ("%d: %s-mem-req-wr: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s));
`TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS);
`TRACE(1, (", byteen="));
`TRACE_ARRAY1D(1, "0x%h", mem_req_byteen_s, CORE_CHANNELS);
`TRACE(1, (", data="));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data_s, CORE_CHANNELS);
end else begin
`TRACE(1, ("%d: %s-mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_fire_s));
`TRACE(1, ("%d: %s-mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s));
`TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS);
end
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr, req_batch_idx, mem_req_dbg_uuid));

View file

@ -13,33 +13,31 @@
`include "VX_platform.vh"
`TRACING_OFF
module VX_stream_merge #(
//`TRACING_OFF
module VX_stream_pack #(
parameter NUM_REQS = 1,
parameter DATA_WIDTH = 1,
parameter TAG_WIDTH = 1,
parameter TAG_SEL_BITS = 0,
parameter `STRING ARBITER = "P",
parameter OUT_BUF = 0
) (
input wire clk,
input wire reset,
// input response
input wire [NUM_REQS-1:0] rsp_valid_in,
input wire [NUM_REQS-1:0][DATA_WIDTH-1:0] rsp_data_in,
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] rsp_tag_in,
output wire [NUM_REQS-1:0] rsp_ready_in,
// input
input wire [NUM_REQS-1:0] valid_in,
input wire [NUM_REQS-1:0][DATA_WIDTH-1:0] data_in,
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] tag_in,
output wire [NUM_REQS-1:0] ready_in,
// output responses
output wire rsp_valid_out,
output wire [NUM_REQS-1:0] rsp_mask_out,
output wire [NUM_REQS-1:0][DATA_WIDTH-1:0] rsp_data_out,
output wire [TAG_WIDTH-1:0] rsp_tag_out,
input wire rsp_ready_out
// output
output wire valid_out,
output wire [NUM_REQS-1:0] mask_out,
output wire [NUM_REQS-1:0][DATA_WIDTH-1:0] data_out,
output wire [TAG_WIDTH-1:0] tag_out,
input wire ready_out
);
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS);
if (NUM_REQS > 1) begin
@ -51,35 +49,35 @@ module VX_stream_merge #(
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (1),
.TYPE ("P")
.TYPE (ARBITER)
) arbiter (
.clk (clk),
.reset (reset),
.requests (rsp_valid_in),
.requests (valid_in),
.grant_valid (grant_valid),
.grant_index (grant_index),
`UNUSED_PIN (grant_onehot),
.grant_unlock(grant_ready)
);
reg [NUM_REQS-1:0] rsp_valid_sel;
reg [NUM_REQS-1:0] rsp_ready_sel;
wire rsp_ready_unqual;
reg [NUM_REQS-1:0] valid_sel;
reg [NUM_REQS-1:0] ready_sel;
wire ready_unqual;
wire [TAG_WIDTH-1:0] rsp_tag_sel = rsp_tag_in[grant_index];
wire [TAG_WIDTH-1:0] tag_sel = tag_in[grant_index];
always @(*) begin
rsp_valid_sel = '0;
rsp_ready_sel = '0;
valid_sel = '0;
ready_sel = '0;
for (integer i = 0; i < NUM_REQS; ++i) begin
if (rsp_tag_in[i][TAG_SEL_BITS-1:0] == rsp_tag_sel[TAG_SEL_BITS-1:0]) begin
rsp_valid_sel[i] = rsp_valid_in[i];
rsp_ready_sel[i] = rsp_ready_unqual;
if (tag_in[i][TAG_SEL_BITS-1:0] == tag_sel[TAG_SEL_BITS-1:0]) begin
valid_sel[i] = valid_in[i];
ready_sel[i] = ready_unqual;
end
end
end
assign grant_ready = rsp_ready_unqual;
assign grant_ready = ready_unqual;
VX_elastic_buffer #(
.DATAW (NUM_REQS + TAG_WIDTH + (NUM_REQS * DATA_WIDTH)),
@ -89,24 +87,26 @@ module VX_stream_merge #(
.clk (clk),
.reset (reset),
.valid_in (grant_valid),
.data_in ({rsp_valid_sel, rsp_tag_sel, rsp_data_in}),
.ready_in (rsp_ready_unqual),
.valid_out (rsp_valid_out),
.data_out ({rsp_mask_out, rsp_tag_out, rsp_data_out}),
.ready_out (rsp_ready_out)
.data_in ({valid_sel, tag_sel, data_in}),
.ready_in (ready_unqual),
.valid_out (valid_out),
.data_out ({mask_out, tag_out, data_out}),
.ready_out (ready_out)
);
assign rsp_ready_in = rsp_ready_sel;
assign ready_in = ready_sel;
end else begin
assign rsp_valid_out = rsp_valid_in;
assign rsp_mask_out = 1'b1;
assign rsp_tag_out = rsp_tag_in;
assign rsp_data_out = rsp_data_in;
assign rsp_ready_in = rsp_ready_out;
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign valid_out = valid_in;
assign mask_out = 1'b1;
assign data_out = data_in;
assign tag_out = tag_in;
assign ready_in = ready_out;
end
endmodule
`TRACING_ON
//`TRACING_ON

View file

@ -0,0 +1,93 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
//`TRACING_OFF
// Stream unpacker.
//
// Accepts one packed input transaction (a single valid/ready pair, a per-lane
// mask, per-lane data and one shared tag) and fans it out to NUM_REQS
// independent output streams, each with its own valid/ready handshake. Every
// output lane receives its own data slice plus a copy of the shared tag.
//
// Parameters:
//   NUM_REQS   - number of output lanes.
//   DATA_WIDTH - data bits per lane.
//   TAG_WIDTH  - width of the shared tag.
//   OUT_BUF    - output buffer selector, passed through `TO_OUT_BUF_SIZE /
//                `TO_OUT_BUF_REG to each per-lane VX_elastic_buffer. Only
//                used when NUM_REQS > 1.
module VX_stream_unpack #(
    parameter NUM_REQS   = 1,
    parameter DATA_WIDTH = 1,
    parameter TAG_WIDTH  = 1,
    parameter OUT_BUF    = 0
) (
    input wire clk,
    input wire reset,

    // packed input stream
    input wire                                  valid_in,
    input wire [NUM_REQS-1:0]                   mask_in,
    input wire [NUM_REQS-1:0][DATA_WIDTH-1:0]   data_in,
    input wire [TAG_WIDTH-1:0]                  tag_in,
    output wire                                 ready_in,

    // per-lane output streams
    output wire [NUM_REQS-1:0]                  valid_out,
    output wire [NUM_REQS-1:0][DATA_WIDTH-1:0]  data_out,
    output wire [NUM_REQS-1:0][TAG_WIDTH-1:0]   tag_out,
    input wire [NUM_REQS-1:0]                   ready_out
);
    if (NUM_REQS > 1) begin

        // Lanes already handed off for the transaction currently on the input.
        reg  [NUM_REQS-1:0] done_mask;

        // Input-side ready of each per-lane output buffer.
        wire [NUM_REQS-1:0] lane_ready;

        // Lanes that will count as handed off after this cycle.
        wire [NUM_REQS-1:0] done_mask_n = lane_ready | done_mask;

        // High when every lane is either masked off or handed off by the end
        // of this cycle.
        wire all_done = & (~mask_in | done_mask_n);

        // A lane is offered to its buffer only while the input is valid, the
        // lane is enabled, and it was not handed off in an earlier cycle.
        wire [NUM_REQS-1:0] lane_valid = {NUM_REQS{valid_in}} & mask_in & ~done_mask;

        always @(posedge clk) begin
            if (reset) begin
                done_mask <= '0;
            end else if (valid_in) begin
                if (all_done) begin
                    // transaction fully dispatched: start fresh for the next one
                    done_mask <= '0;
                end else begin
                    done_mask <= done_mask_n;
                end
            end
        end

        // The packed input is consumed in the cycle the last pending lane is accepted.
        assign ready_in = all_done;

        for (genvar lane = 0; lane < NUM_REQS; ++lane) begin
            VX_elastic_buffer #(
                .DATAW   (DATA_WIDTH + TAG_WIDTH),
                .SIZE    (`TO_OUT_BUF_SIZE(OUT_BUF)),
                .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
            ) out_buf (
                .clk       (clk),
                .reset     (reset),
                .valid_in  (lane_valid[lane]),
                .ready_in  (lane_ready[lane]),
                .data_in   ({data_in[lane], tag_in}),
                .data_out  ({data_out[lane], tag_out[lane]}),
                .valid_out (valid_out[lane]),
                .ready_out (ready_out[lane])
            );
        end

    end else begin

        // Single lane: plain pass-through. No buffer is instantiated here and
        // mask_in is not consulted.
        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)
        `UNUSED_VAR (mask_in)

        assign valid_out = valid_in;
        assign tag_out   = tag_in;
        assign data_out  = data_in;
        assign ready_in  = ready_out;

    end

endmodule
//`TRACING_ON

View file

@ -153,6 +153,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_rsp_tag;
wire [NUM_BANKS-1:0] per_bank_rsp_ready;
`RESET_RELAY (bank_reset, reset);
for (genvar i = 0; i < NUM_BANKS; ++i) begin
VX_sp_ram #(
.DATAW (WORD_WIDTH),
@ -178,7 +180,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
.SIZE (0)
) bank_buf (
.clk (clk),
.reset (reset),
.reset (bank_reset),
.valid_in (per_bank_req_valid_w),
.ready_in (per_bank_req_ready_w),
.data_in ({per_bank_req_idx[i], per_bank_req_tag[i]}),