mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Using packed LSU memory requests within the code
This commit is contained in:
parent
df38cc00f5
commit
c175e11a18
14 changed files with 717 additions and 292 deletions
|
@ -500,7 +500,7 @@
|
|||
|
||||
// Number of Banks
|
||||
`ifndef LMEM_NUM_BANKS
|
||||
`define LMEM_NUM_BANKS `DCACHE_NUM_BANKS
|
||||
`define LMEM_NUM_BANKS `NUM_LSU_LANES
|
||||
`endif
|
||||
|
||||
// L2cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
|
|
@ -349,6 +349,14 @@
|
|||
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
// Wires two VX_lsu_mem_if-style interfaces together.
// Request valid/data and response ready are driven from `src` into `dst`;
// request ready and response valid/data are driven from `dst` back into `src`.
// The final assign is intentionally left without a semicolon: the call site supplies it.
`define ASSIGN_VX_LSU_MEM_IF(dst, src) \
    assign dst.req_valid = src.req_valid; \
    assign dst.req_data  = src.req_data;  \
    assign src.req_ready = dst.req_ready; \
    assign src.rsp_valid = dst.rsp_valid; \
    assign src.rsp_data  = dst.rsp_data;  \
    assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
|
||||
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
|
||||
if (enable) begin \
|
||||
|
|
|
@ -77,6 +77,46 @@ package VX_gpu_pkg;
|
|||
|
||||
/* verilator lint_off UNUSED */
|
||||
|
||||
///////////////////////// LSU memory Parameters ///////////////////////////
|
||||
|
||||
localparam LSU_WORD_SIZE = `XLEN / 8;
|
||||
localparam LSU_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(LSU_WORD_SIZE));
|
||||
localparam LSU_MEM_BATCHES = 1;
|
||||
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
|
||||
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
|
||||
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
|
||||
|
||||
////////////////////////// Dcache Parameters //////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
localparam DCACHE_WORD_SIZE = `LSU_LINE_SIZE;
|
||||
localparam DCACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(DCACHE_WORD_SIZE));
|
||||
|
||||
// Block size in bytes
|
||||
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
|
||||
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
|
||||
|
||||
// Core request tag Id bits
|
||||
localparam DCACHE_MERGED_REQS = (`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE;
|
||||
localparam DCACHE_MEM_BATCHES = (DCACHE_MERGED_REQS + DCACHE_CHANNELS - 1) / DCACHE_CHANNELS;
|
||||
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_OUT_SIZE) + `CLOG2(DCACHE_MEM_BATCHES));
|
||||
|
||||
// Core request tag bits
|
||||
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
|
||||
|
||||
// Memory request data bits
|
||||
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
|
||||
|
||||
// Memory request tag bits
|
||||
`ifdef DCACHE_ENABLE
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`else
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Icache Parameters //////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
|
@ -102,38 +142,6 @@ package VX_gpu_pkg;
|
|||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Dcache Parameters //////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
localparam DCACHE_WORD_SIZE = `LSU_LINE_SIZE;
|
||||
localparam DCACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(DCACHE_WORD_SIZE));
|
||||
|
||||
// Block size in bytes
|
||||
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * (`XLEN / 8)) / DCACHE_WORD_SIZE);
|
||||
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
|
||||
|
||||
// Core request tag Id bits
|
||||
|
||||
localparam DCACHE_MERGED_REQS = (`NUM_LSU_LANES * (`XLEN / 8)) / DCACHE_WORD_SIZE;
|
||||
localparam DCACHE_MEM_BATCHES = (DCACHE_MERGED_REQS + DCACHE_CHANNELS - 1) / DCACHE_CHANNELS;
|
||||
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_OUT_SIZE) + `CLOG2(DCACHE_MEM_BATCHES));
|
||||
|
||||
// Core request tag bits
|
||||
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
|
||||
|
||||
// Memory request data bits
|
||||
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
|
||||
|
||||
// Memory request tag bits
|
||||
`ifdef DCACHE_ENABLE
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`else
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||
|
||||
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
|
|
|
@ -61,10 +61,11 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
VX_commit_if commit_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_lmem_bus_if[DCACHE_NUM_REQS]();
|
||||
VX_lsu_mem_if #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lsu_mem_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
|
@ -176,7 +177,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
.dcache_bus_if (dcache_lmem_bus_if),
|
||||
.lsu_mem_if (lsu_mem_if),
|
||||
|
||||
.dispatch_if (dispatch_if),
|
||||
.commit_if (commit_if),
|
||||
|
@ -206,36 +207,131 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.sim_wb_value (sim_wb_value)
|
||||
);
|
||||
|
||||
VX_lsu_mem_if #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
`ifdef LMEM_ENABLE
|
||||
|
||||
`RESET_RELAY (lmem_unit_reset, reset);
|
||||
|
||||
VX_lmem_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) lmem_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.clk (clk),
|
||||
.reset (lmem_unit_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.lmem),
|
||||
.cache_perf (mem_perf_tmp_if.lmem),
|
||||
`endif
|
||||
.dcache_bus_in_if (dcache_lmem_bus_if),
|
||||
.dcache_bus_out_if (dcache_bus_if)
|
||||
.lsu_mem_in_if (lsu_mem_if),
|
||||
.lsu_mem_out_if (lsu_dcache_if)
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_lmem_bus_if[i]);
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
`ASSIGN_VX_LSU_MEM_IF (lsu_dcache_if[i], lsu_mem_if[i]);
|
||||
end
|
||||
|
||||
`endif
|
||||
|
||||
VX_lsu_mem_if #(
|
||||
.NUM_LANES (DCACHE_CHANNELS),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
`RESET_RELAY (coalescer_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
|
||||
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin
|
||||
|
||||
VX_mem_coalescer #(
|
||||
.INSTANCE_ID ($sformatf("core%0d-coalescer", CORE_ID)),
|
||||
.NUM_REQS (`NUM_LSU_LANES),
|
||||
.DATA_IN_SIZE (LSU_WORD_SIZE),
|
||||
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
|
||||
.ADDR_WIDTH (LSU_ADDR_WIDTH),
|
||||
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
|
||||
) coalescer (
|
||||
.clk (clk),
|
||||
.reset (coalescer_reset),
|
||||
|
||||
// Input request
|
||||
.in_req_valid (lsu_dcache_if[i].req_valid),
|
||||
.in_req_mask (lsu_dcache_if[i].req_data.mask),
|
||||
.in_req_rw (lsu_dcache_if[i].req_data.rw),
|
||||
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
|
||||
.in_req_addr (lsu_dcache_if[i].req_data.addr),
|
||||
.in_req_atype (lsu_dcache_if[i].req_data.atype),
|
||||
.in_req_data (lsu_dcache_if[i].req_data.data),
|
||||
.in_req_tag (lsu_dcache_if[i].req_data.tag),
|
||||
.in_req_ready (lsu_dcache_if[i].req_ready),
|
||||
|
||||
// Input response
|
||||
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
|
||||
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
|
||||
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
|
||||
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
|
||||
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
|
||||
|
||||
// Output request
|
||||
.out_req_valid (dcache_coalesced_if[i].req_valid),
|
||||
.out_req_mask (dcache_coalesced_if[i].req_data.mask),
|
||||
.out_req_rw (dcache_coalesced_if[i].req_data.rw),
|
||||
.out_req_byteen (dcache_coalesced_if[i].req_data.byteen),
|
||||
.out_req_addr (dcache_coalesced_if[i].req_data.addr),
|
||||
.out_req_atype (dcache_coalesced_if[i].req_data.atype),
|
||||
.out_req_data (dcache_coalesced_if[i].req_data.data),
|
||||
.out_req_tag (dcache_coalesced_if[i].req_data.tag),
|
||||
.out_req_ready (dcache_coalesced_if[i].req_ready),
|
||||
|
||||
// Output response
|
||||
.out_rsp_valid (dcache_coalesced_if[i].rsp_valid),
|
||||
.out_rsp_mask (dcache_coalesced_if[i].rsp_data.mask),
|
||||
.out_rsp_data (dcache_coalesced_if[i].rsp_data.data),
|
||||
.out_rsp_tag (dcache_coalesced_if[i].rsp_data.tag),
|
||||
.out_rsp_ready (dcache_coalesced_if[i].rsp_ready)
|
||||
);
|
||||
|
||||
end else begin
|
||||
|
||||
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
|
||||
|
||||
end
|
||||
end
|
||||
|
||||
`RESET_RELAY (lsu_adapter_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
|
||||
VX_lsu_adapter #(
|
||||
.NUM_LANES (DCACHE_CHANNELS),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH)
|
||||
) lsu_adapter (
|
||||
.clk (clk),
|
||||
.reset (lsu_adapter_reset),
|
||||
.lsu_mem_if (dcache_coalesced_if[i]),
|
||||
.mem_bus_if (dcache_bus_if[i * DCACHE_CHANNELS +: DCACHE_CHANNELS])
|
||||
);
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||
|
||||
wire [1:0] perf_icache_pending_read_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
||||
|
@ -247,14 +343,16 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
|
||||
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
|
||||
wire [LSU_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
|
||||
wire [LSU_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
|
||||
wire [LSU_NUM_REQS-1:0] perf_dcache_rsp_fire;
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
assign perf_dcache_rd_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && ~dcache_lmem_bus_if[i].req_data.rw;
|
||||
assign perf_dcache_wr_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && dcache_lmem_bus_if[i].req_data.rw;
|
||||
assign perf_dcache_rsp_fire[i] = dcache_lmem_bus_if[i].rsp_valid && dcache_lmem_bus_if[i].rsp_ready;
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
|
||||
assign perf_dcache_rd_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && ~lsu_mem_if[i].req_data.rw;
|
||||
assign perf_dcache_wr_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && lsu_mem_if[i].req_data.rw;
|
||||
assign perf_dcache_rsp_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].rsp_valid && lsu_mem_if[i].rsp_data.mask[j] && lsu_mem_if[i].rsp_ready;
|
||||
end
|
||||
end
|
||||
|
||||
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
|
||||
|
|
|
@ -29,7 +29,7 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
input base_dcrs_t base_dcrs,
|
||||
|
||||
// Dcache interface
|
||||
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
||||
VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS],
|
||||
|
||||
// dispatch interface
|
||||
VX_dispatch_if.slave dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
|
@ -77,7 +77,7 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
.reset (lsu_reset),
|
||||
.dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.cache_bus_if (dcache_bus_if)
|
||||
.lsu_mem_if (lsu_mem_if)
|
||||
);
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -105,12 +105,12 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
.base_dcrs (base_dcrs),
|
||||
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
`endif
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.sched_csr_if (sched_csr_if),
|
||||
.commit_if (commit_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.sched_csr_if (sched_csr_if),
|
||||
.warp_ctl_if (warp_ctl_if)
|
||||
);
|
||||
|
||||
|
|
|
@ -23,47 +23,115 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
|
|||
output cache_perf_t cache_perf,
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if.slave dcache_bus_in_if [DCACHE_NUM_REQS],
|
||||
VX_mem_bus_if.master dcache_bus_out_if [DCACHE_NUM_REQS]
|
||||
VX_lsu_mem_if.slave lsu_mem_in_if [`NUM_LSU_BLOCKS],
|
||||
VX_lsu_mem_if.master lsu_mem_out_if [`NUM_LSU_BLOCKS]
|
||||
);
|
||||
`STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
|
||||
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
|
||||
|
||||
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE);
|
||||
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) lmem_bus_if[DCACHE_NUM_REQS]();
|
||||
VX_lsu_mem_if #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lmem_lsu_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
|
||||
wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
|
||||
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
|
||||
assign is_addr_local_mask[j] = lsu_mem_in_if[i].req_data.mask[j]
|
||||
&& lsu_mem_in_if[i].req_data.atype[j][`ADDR_TYPE_LOCAL];
|
||||
end
|
||||
|
||||
wire is_addr_local = (| is_addr_local_mask);
|
||||
// A request is "global" only when at least one ACTIVE lane targets non-local memory.
// is_addr_local_mask is already qualified by the request mask, so its plain inversion
// also sets the bits of inactive lanes; reducing that would flag a fully-local request
// with a partial mask as global and forward it downstream with an empty mask.
// Qualify with the request mask before reducing.
wire is_addr_global = (| (lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask));
|
||||
|
||||
assign lsu_mem_out_if[i].req_valid = lsu_mem_in_if[i].req_valid && is_addr_global;
|
||||
assign lsu_mem_out_if[i].req_data.mask = lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask;
|
||||
assign lsu_mem_out_if[i].req_data.rw = lsu_mem_in_if[i].req_data.rw;
|
||||
assign lsu_mem_out_if[i].req_data.byteen= lsu_mem_in_if[i].req_data.byteen;
|
||||
assign lsu_mem_out_if[i].req_data.addr = lsu_mem_in_if[i].req_data.addr;
|
||||
assign lsu_mem_out_if[i].req_data.atype = lsu_mem_in_if[i].req_data.atype;
|
||||
assign lsu_mem_out_if[i].req_data.data = lsu_mem_in_if[i].req_data.data;
|
||||
assign lsu_mem_out_if[i].req_data.tag = lsu_mem_in_if[i].req_data.tag;
|
||||
|
||||
`RESET_RELAY (switch_reset, reset);
|
||||
assign lmem_lsu_if[i].req_valid = lsu_mem_in_if[i].req_valid && is_addr_local;
|
||||
assign lmem_lsu_if[i].req_data.mask = lsu_mem_in_if[i].req_data.mask & is_addr_local_mask;
|
||||
assign lmem_lsu_if[i].req_data.rw = lsu_mem_in_if[i].req_data.rw;
|
||||
assign lmem_lsu_if[i].req_data.byteen = lsu_mem_in_if[i].req_data.byteen;
|
||||
assign lmem_lsu_if[i].req_data.addr = lsu_mem_in_if[i].req_data.addr;
|
||||
assign lmem_lsu_if[i].req_data.atype = lsu_mem_in_if[i].req_data.atype;
|
||||
assign lmem_lsu_if[i].req_data.data = lsu_mem_in_if[i].req_data.data;
|
||||
assign lmem_lsu_if[i].req_data.tag = lsu_mem_in_if[i].req_data.tag;
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
VX_mem_switch #(
|
||||
.NUM_REQS (2),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (2),
|
||||
.RSP_OUT_BUF (2)
|
||||
) lmem_switch (
|
||||
.clk (clk),
|
||||
.reset (switch_reset),
|
||||
.bus_sel (dcache_bus_in_if[i].req_data.atype[`ADDR_TYPE_LOCAL]),
|
||||
.bus_in_if (dcache_bus_in_if[i]),
|
||||
.bus_out_if (switch_out_bus_if[i * 2 +: 2])
|
||||
assign lsu_mem_in_if[i].req_ready = (lsu_mem_out_if[i].req_ready && is_addr_global)
|
||||
|| (lmem_lsu_if[i].req_ready && is_addr_local);
|
||||
end
|
||||
|
||||
`RESET_RELAY (arb_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
|
||||
wire rsp_arb_valid;
|
||||
wire rsp_arb_index;
|
||||
wire rsp_arb_ready;
|
||||
|
||||
VX_generic_arbiter #(
|
||||
.NUM_REQS (2),
|
||||
.LOCK_ENABLE (1),
|
||||
.TYPE ("R")
|
||||
) arbiter (
|
||||
.clk (clk),
|
||||
.reset (arb_reset),
|
||||
.requests ({
|
||||
lmem_lsu_if[i].rsp_valid,
|
||||
lsu_mem_out_if[i].rsp_valid
|
||||
}),
|
||||
.grant_valid (rsp_arb_valid),
|
||||
.grant_index (rsp_arb_index),
|
||||
`UNUSED_PIN (grant_onehot),
|
||||
.grant_unlock(rsp_arb_ready)
|
||||
);
|
||||
|
||||
// output bus[0] goes to the dcache
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2 + 0]);
|
||||
assign lsu_mem_in_if[i].rsp_valid = rsp_arb_valid;
|
||||
assign lsu_mem_in_if[i].rsp_data.mask = rsp_arb_index ? lmem_lsu_if[i].rsp_data.mask : lsu_mem_out_if[i].rsp_data.mask;
|
||||
assign lsu_mem_in_if[i].rsp_data.data = rsp_arb_index ? lmem_lsu_if[i].rsp_data.data : lsu_mem_out_if[i].rsp_data.data;
|
||||
assign lsu_mem_in_if[i].rsp_data.tag = rsp_arb_index ? lmem_lsu_if[i].rsp_data.tag : lsu_mem_out_if[i].rsp_data.tag;
|
||||
assign lsu_mem_out_if[i].rsp_ready = lsu_mem_in_if[i].rsp_ready && ~rsp_arb_index;
|
||||
assign lmem_lsu_if[i].rsp_ready = lsu_mem_in_if[i].rsp_ready && rsp_arb_index;
|
||||
assign rsp_arb_ready = lsu_mem_in_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
// output bus[1] goes to the local memory
|
||||
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i], switch_out_bus_if[i * 2 + 1]);
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lmem_bus_if[LSU_NUM_REQS]();
|
||||
|
||||
`RESET_RELAY (adapter_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
|
||||
|
||||
VX_lsu_adapter #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH)
|
||||
) lsu_adapter (
|
||||
.clk (clk),
|
||||
.reset (adapter_reset),
|
||||
.lsu_mem_if (lmem_lsu_if[i]),
|
||||
.mem_bus_if (lmem_bus_tmp_if)
|
||||
);
|
||||
|
||||
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
|
||||
end
|
||||
end
|
||||
|
||||
`RESET_RELAY (lmem_reset, reset);
|
||||
|
@ -71,16 +139,15 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
|
|||
VX_local_mem #(
|
||||
.INSTANCE_ID($sformatf("core%0d-lmem", CORE_ID)),
|
||||
.SIZE (1 << `LMEM_LOG_SIZE),
|
||||
.NUM_REQS (DCACHE_NUM_REQS),
|
||||
.NUM_REQS (LSU_NUM_REQS),
|
||||
.NUM_BANKS (`LMEM_NUM_BANKS),
|
||||
.WORD_SIZE (DCACHE_WORD_SIZE),
|
||||
.WORD_SIZE (LSU_WORD_SIZE),
|
||||
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) local_mem (
|
||||
.clk (clk),
|
||||
.reset (lmem_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (cache_perf),
|
||||
`endif
|
||||
|
|
121
hw/rtl/core/VX_lsu_adapter.sv
Normal file
121
hw/rtl/core/VX_lsu_adapter.sv
Normal file
|
@ -0,0 +1,121 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
// Adapter between one packed, multi-lane LSU memory interface (lsu_mem_if) and an
// array of NUM_LANES single-lane memory buses (mem_bus_if).
//
// Request path : each lane's {rw, byteen, addr, atype, data} is concatenated into one
//                payload and handed, together with the lane mask and the shared tag,
//                to VX_stream_unpack; its per-lane outputs drive mem_bus_if[i].req_*.
// Response path: each lane's rsp valid/data/tag is fed to VX_stream_pack, whose outputs
//                drive lsu_mem_if.rsp_* (valid, mask, data, tag).
//
// NOTE(review): the exact scheduling/merging semantics live in VX_stream_unpack and
// VX_stream_pack, which are not visible here — confirm against those modules.
module VX_lsu_adapter #(
    parameter NUM_LANES       = 1,
    parameter DATA_SIZE       = 1,
    parameter TAG_WIDTH       = 1,
    parameter TAG_SEL_BITS    = 0,
    parameter `STRING ARBITER = "P",
    parameter REQ_OUT_BUF     = 0,
    parameter RSP_OUT_BUF     = 0
) (
    input wire              clk,
    input wire              reset,

    VX_lsu_mem_if.slave     lsu_mem_if,
    VX_mem_bus_if.master    mem_bus_if [NUM_LANES]
);
    // Field widths of the per-lane request payload {rw, byteen, addr, atype, data}
    localparam LANE_ADDR_BITS = `MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE);
    localparam LANE_DATA_BITS = DATA_SIZE * 8;
    localparam LANE_REQ_BITS  = 1 + DATA_SIZE + LANE_ADDR_BITS + `ADDR_TYPE_WIDTH + LANE_DATA_BITS;

    // Request side: packed payloads going into / coming out of the unpacker
    wire [NUM_LANES-1:0][LANE_REQ_BITS-1:0]  packed_req_payload;
    wire [NUM_LANES-1:0]                     lane_req_valid;
    wire [NUM_LANES-1:0][LANE_REQ_BITS-1:0]  lane_req_payload;
    wire [NUM_LANES-1:0][TAG_WIDTH-1:0]      lane_req_tag;
    wire [NUM_LANES-1:0]                     lane_req_ready;

    // Response side: per-lane signals going into the packer
    wire [NUM_LANES-1:0]                     lane_rsp_valid;
    wire [NUM_LANES-1:0][LANE_DATA_BITS-1:0] lane_rsp_data;
    wire [NUM_LANES-1:0][TAG_WIDTH-1:0]      lane_rsp_tag;
    wire [NUM_LANES-1:0]                     lane_rsp_ready;

    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        // Build this lane's request payload; rw is shared by all lanes of the request.
        assign packed_req_payload[lane] = {
            lsu_mem_if.req_data.rw,
            lsu_mem_if.req_data.byteen[lane],
            lsu_mem_if.req_data.addr[lane],
            lsu_mem_if.req_data.atype[lane],
            lsu_mem_if.req_data.data[lane]
        };

        // Drive the lane's bus request from the unpacker outputs (same field order).
        assign mem_bus_if[lane].req_valid = lane_req_valid[lane];
        assign {
            mem_bus_if[lane].req_data.rw,
            mem_bus_if[lane].req_data.byteen,
            mem_bus_if[lane].req_data.addr,
            mem_bus_if[lane].req_data.atype,
            mem_bus_if[lane].req_data.data
        } = lane_req_payload[lane];
        assign mem_bus_if[lane].req_data.tag = lane_req_tag[lane];
        assign lane_req_ready[lane] = mem_bus_if[lane].req_ready;

        // Collect the lane's bus response for the packer.
        assign lane_rsp_valid[lane] = mem_bus_if[lane].rsp_valid;
        assign lane_rsp_data[lane]  = mem_bus_if[lane].rsp_data.data;
        assign lane_rsp_tag[lane]   = mem_bus_if[lane].rsp_data.tag;
        assign mem_bus_if[lane].rsp_ready = lane_rsp_ready[lane];
    end

    // Request unpacking: one masked multi-lane request in, per-lane requests out
    VX_stream_unpack #(
        .NUM_REQS   (NUM_LANES),
        .DATA_WIDTH (LANE_REQ_BITS),
        .TAG_WIDTH  (TAG_WIDTH),
        .OUT_BUF    (REQ_OUT_BUF)
    ) stream_unpack (
        .clk        (clk),
        .reset      (reset),
        .valid_in   (lsu_mem_if.req_valid),
        .mask_in    (lsu_mem_if.req_data.mask),
        .data_in    (packed_req_payload),
        .tag_in     (lsu_mem_if.req_data.tag),
        .ready_in   (lsu_mem_if.req_ready),
        .valid_out  (lane_req_valid),
        .data_out   (lane_req_payload),
        .tag_out    (lane_req_tag),
        .ready_out  (lane_req_ready)
    );

    // Response packing: per-lane responses in, one masked multi-lane response out
    VX_stream_pack #(
        .NUM_REQS     (NUM_LANES),
        .DATA_WIDTH   (LANE_DATA_BITS),
        .TAG_WIDTH    (TAG_WIDTH),
        .TAG_SEL_BITS (TAG_SEL_BITS),
        .ARBITER      (ARBITER),
        .OUT_BUF      (RSP_OUT_BUF)
    ) stream_pack (
        .clk        (clk),
        .reset      (reset),
        .valid_in   (lane_rsp_valid),
        .data_in    (lane_rsp_data),
        .tag_in     (lane_rsp_tag),
        .ready_in   (lane_rsp_ready),
        .valid_out  (lsu_mem_if.rsp_valid),
        .mask_out   (lsu_mem_if.rsp_data.mask),
        .data_out   (lsu_mem_if.rsp_data.data),
        .tag_out    (lsu_mem_if.rsp_data.tag),
        .ready_out  (lsu_mem_if.rsp_ready)
    );

endmodule
|
|
@ -27,21 +27,19 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
// Outputs
|
||||
VX_commit_if.master commit_if,
|
||||
VX_mem_bus_if.master cache_bus_if [DCACHE_CHANNELS]
|
||||
VX_lsu_mem_if.master lsu_mem_if
|
||||
);
|
||||
localparam WORD_SIZE = `XLEN / 8;
|
||||
localparam ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(WORD_SIZE);
|
||||
localparam NUM_LANES = `NUM_LSU_LANES;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_IN_SIZE);
|
||||
localparam REQ_ASHIFT = `CLOG2(WORD_SIZE);
|
||||
localparam REQ_ASHIFT = `CLOG2(LSU_WORD_SIZE);
|
||||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
|
||||
|
||||
// tag_id = wid + PC + rd + op_type + align + pid + pkt_addr
|
||||
localparam TAG_ID_WIDTH = `NW_WIDTH + `XLEN + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + PID_WIDTH + LSUQ_SIZEW;
|
||||
localparam TAG_ID_WIDTH = `NW_WIDTH + `XLEN + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW;
|
||||
|
||||
// tag = uuid + tag_id
|
||||
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
|
||||
|
@ -99,15 +97,15 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire mem_req_valid;
|
||||
wire [NUM_LANES-1:0] mem_req_mask;
|
||||
wire mem_req_rw;
|
||||
wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
|
||||
reg [NUM_LANES-1:0][WORD_SIZE-1:0] mem_req_byteen;
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
|
||||
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] mem_req_addr;
|
||||
reg [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] mem_req_byteen;
|
||||
reg [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_req_data;
|
||||
wire [TAG_WIDTH-1:0] mem_req_tag;
|
||||
wire mem_req_ready;
|
||||
|
||||
wire mem_rsp_valid;
|
||||
wire [NUM_LANES-1:0] mem_rsp_mask;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] mem_rsp_data;
|
||||
wire [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_rsp_data;
|
||||
wire [TAG_WIDTH-1:0] mem_rsp_tag;
|
||||
wire mem_rsp_sop;
|
||||
wire mem_rsp_eop;
|
||||
|
@ -154,7 +152,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
|
||||
end
|
||||
`endif
|
||||
default : mem_req_byteen[i] = {WORD_SIZE{1'b1}};
|
||||
default : mem_req_byteen[i] = {LSU_WORD_SIZE{1'b1}};
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
@ -268,28 +266,31 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
pkt_waddr
|
||||
};
|
||||
|
||||
wire [DCACHE_CHANNELS-1:0] cache_req_valid;
|
||||
wire [DCACHE_CHANNELS-1:0] cache_req_rw;
|
||||
wire [DCACHE_CHANNELS-1:0][DCACHE_WORD_SIZE-1:0] cache_req_byteen;
|
||||
wire [DCACHE_CHANNELS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
|
||||
wire [DCACHE_CHANNELS-1:0][`ADDR_TYPE_WIDTH-1:0] cache_req_atype;
|
||||
wire [DCACHE_CHANNELS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_req_data;
|
||||
wire [DCACHE_CHANNELS-1:0][DCACHE_TAG_WIDTH-1:0] cache_req_tag;
|
||||
wire [DCACHE_CHANNELS-1:0] cache_req_ready;
|
||||
wire [DCACHE_CHANNELS-1:0] cache_rsp_valid;
|
||||
wire [DCACHE_CHANNELS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_rsp_data;
|
||||
wire [DCACHE_CHANNELS-1:0][DCACHE_TAG_WIDTH-1:0] cache_rsp_tag;
|
||||
wire [DCACHE_CHANNELS-1:0] cache_rsp_ready;
|
||||
wire lsu_mem_req_valid;
|
||||
wire lsu_mem_req_rw;
|
||||
wire [NUM_LANES-1:0] lsu_mem_req_mask;
|
||||
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen;
|
||||
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr;
|
||||
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] lsu_mem_req_atype;
|
||||
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data;
|
||||
wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag;
|
||||
wire lsu_mem_req_ready;
|
||||
|
||||
wire lsu_mem_rsp_valid;
|
||||
wire [NUM_LANES-1:0] lsu_mem_rsp_mask;
|
||||
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_rsp_data;
|
||||
wire [LSU_TAG_WIDTH-1:0] lsu_mem_rsp_tag;
|
||||
wire lsu_mem_rsp_ready;
|
||||
|
||||
`RESET_RELAY (mem_scheduler_reset, reset);
|
||||
|
||||
VX_mem_scheduler #(
|
||||
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched%0d", CORE_ID, BLOCK_ID)),
|
||||
.CORE_REQS (`NUM_LSU_LANES),
|
||||
.MEM_CHANNELS(DCACHE_CHANNELS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.LINE_SIZE (DCACHE_WORD_SIZE),
|
||||
.ADDR_WIDTH (ADDR_WIDTH),
|
||||
.CORE_REQS (NUM_LANES),
|
||||
.MEM_CHANNELS(NUM_LANES),
|
||||
.WORD_SIZE (LSU_WORD_SIZE),
|
||||
.LINE_SIZE (LSU_WORD_SIZE),
|
||||
.ADDR_WIDTH (LSU_ADDR_WIDTH),
|
||||
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
|
||||
|
@ -324,37 +325,39 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.core_rsp_ready (mem_rsp_ready),
|
||||
|
||||
// Memory request
|
||||
.mem_req_valid (cache_req_valid),
|
||||
.mem_req_rw (cache_req_rw),
|
||||
.mem_req_byteen (cache_req_byteen),
|
||||
.mem_req_addr (cache_req_addr),
|
||||
.mem_req_atype (cache_req_atype),
|
||||
.mem_req_data (cache_req_data),
|
||||
.mem_req_tag (cache_req_tag),
|
||||
.mem_req_ready (cache_req_ready),
|
||||
.mem_req_valid (lsu_mem_req_valid),
|
||||
.mem_req_rw (lsu_mem_req_rw),
|
||||
.mem_req_mask (lsu_mem_req_mask),
|
||||
.mem_req_byteen (lsu_mem_req_byteen),
|
||||
.mem_req_addr (lsu_mem_req_addr),
|
||||
.mem_req_atype (lsu_mem_req_atype),
|
||||
.mem_req_data (lsu_mem_req_data),
|
||||
.mem_req_tag (lsu_mem_req_tag),
|
||||
.mem_req_ready (lsu_mem_req_ready),
|
||||
|
||||
// Memory response
|
||||
.mem_rsp_valid (cache_rsp_valid),
|
||||
.mem_rsp_data (cache_rsp_data),
|
||||
.mem_rsp_tag (cache_rsp_tag),
|
||||
.mem_rsp_ready (cache_rsp_ready)
|
||||
.mem_rsp_valid (lsu_mem_rsp_valid),
|
||||
.mem_rsp_mask (lsu_mem_rsp_mask),
|
||||
.mem_rsp_data (lsu_mem_rsp_data),
|
||||
.mem_rsp_tag (lsu_mem_rsp_tag),
|
||||
.mem_rsp_ready (lsu_mem_rsp_ready)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < DCACHE_CHANNELS; ++i) begin
|
||||
assign cache_bus_if[i].req_valid = cache_req_valid[i];
|
||||
assign cache_bus_if[i].req_data.rw = cache_req_rw[i];
|
||||
assign cache_bus_if[i].req_data.byteen = cache_req_byteen[i];
|
||||
assign cache_bus_if[i].req_data.addr = cache_req_addr[i];
|
||||
assign cache_bus_if[i].req_data.atype = cache_req_atype[i];
|
||||
assign cache_bus_if[i].req_data.data = cache_req_data[i];
|
||||
assign cache_bus_if[i].req_data.tag = cache_req_tag[i];
|
||||
assign cache_req_ready[i] = cache_bus_if[i].req_ready;
|
||||
assign lsu_mem_if.req_valid = lsu_mem_req_valid;
|
||||
assign lsu_mem_if.req_data.mask = lsu_mem_req_mask;
|
||||
assign lsu_mem_if.req_data.rw = lsu_mem_req_rw;
|
||||
assign lsu_mem_if.req_data.byteen = lsu_mem_req_byteen;
|
||||
assign lsu_mem_if.req_data.addr = lsu_mem_req_addr;
|
||||
assign lsu_mem_if.req_data.atype = lsu_mem_req_atype;
|
||||
assign lsu_mem_if.req_data.data = lsu_mem_req_data;
|
||||
assign lsu_mem_if.req_data.tag = lsu_mem_req_tag;
|
||||
assign lsu_mem_req_ready = lsu_mem_if.req_ready;
|
||||
|
||||
assign cache_rsp_valid[i] = cache_bus_if[i].rsp_valid;
|
||||
assign cache_rsp_data[i] = cache_bus_if[i].rsp_data.data;
|
||||
assign cache_rsp_tag[i] = cache_bus_if[i].rsp_data.tag;
|
||||
assign cache_bus_if[i].rsp_ready = cache_rsp_ready[i];
|
||||
end
|
||||
assign lsu_mem_rsp_valid = lsu_mem_if.rsp_valid;
|
||||
assign lsu_mem_rsp_mask = lsu_mem_if.rsp_data.mask;
|
||||
assign lsu_mem_rsp_data = lsu_mem_if.rsp_data.data;
|
||||
assign lsu_mem_rsp_tag = lsu_mem_if.rsp_data.tag;
|
||||
assign lsu_mem_if.rsp_ready = lsu_mem_rsp_ready;
|
||||
|
||||
wire [`UUID_WIDTH-1:0] rsp_uuid;
|
||||
wire [`NW_WIDTH-1:0] rsp_wid;
|
||||
|
@ -455,8 +458,6 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru
|
||||
|
||||
// lsu commit
|
||||
|
||||
`RESET_RELAY (commit_arb_reset, reset);
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
|
@ -464,7 +465,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.OUT_BUF (3)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (commit_arb_reset),
|
||||
.reset (reset),
|
||||
.valid_in ({commit_st_if.valid, commit_ld_if.valid}),
|
||||
.ready_in ({commit_st_if.ready, commit_ld_if.ready}),
|
||||
.data_in ({commit_st_if.data, commit_ld_if.data}),
|
||||
|
@ -531,8 +532,8 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.probe0 ({mem_req_data_0, execute_if.data.uuid, execute_if.data.wid, execute_if.data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
|
||||
.probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, mem_rsp_mask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
|
||||
.probe2 ({cache_bus_if.req_data.data, cache_bus_if.req_data.tag, cache_bus_if.req_data.byteen, cache_bus_if.req_data.addr, cache_bus_if.req_data.rw, cache_bus_if.req_ready, cache_bus_if.req_valid}),
|
||||
.probe3 ({cache_bus_if.rsp_data.data, cache_bus_if.rsp_data.tag, cache_bus_if.rsp_ready, cache_bus_if.rsp_valid})
|
||||
.probe2 ({lsu_mem_if.req_data.data, lsu_mem_if.req_data.tag, lsu_mem_if.req_data.byteen, lsu_mem_if.req_data.addr, lsu_mem_if.req_data.rw, lsu_mem_if.req_ready, lsu_mem_if.req_valid}),
|
||||
.probe3 ({lsu_mem_if.rsp_data.data, lsu_mem_if.rsp_data.tag, lsu_mem_if.rsp_ready, lsu_mem_if.rsp_valid})
|
||||
);
|
||||
`endif
|
||||
end
|
||||
|
|
|
@ -26,11 +26,11 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
// Outputs
|
||||
VX_commit_if.master commit_if [`ISSUE_WIDTH],
|
||||
VX_mem_bus_if.master cache_bus_if [DCACHE_NUM_REQS]
|
||||
VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS]
|
||||
);
|
||||
localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
|
||||
localparam NUM_LANES = `NUM_LSU_LANES;
|
||||
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) per_block_execute_if[BLOCK_SIZE]();
|
||||
|
@ -51,16 +51,18 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
) per_block_commit_if[BLOCK_SIZE]();
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
`RESET_RELAY (slice_reset, reset);
|
||||
|
||||
`RESET_RELAY (block_reset, reset);
|
||||
|
||||
VX_lsu_slice #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.BLOCK_ID (block_idx)
|
||||
) lsu_slice(
|
||||
.clk (clk),
|
||||
.reset (slice_reset),
|
||||
.execute_if (per_block_execute_if[block_idx]),
|
||||
.commit_if (per_block_commit_if[block_idx]),
|
||||
.cache_bus_if (cache_bus_if[block_idx * DCACHE_CHANNELS +: DCACHE_CHANNELS])
|
||||
.clk (clk),
|
||||
.reset (block_reset),
|
||||
.execute_if (per_block_execute_if[block_idx]),
|
||||
.commit_if (per_block_commit_if[block_idx]),
|
||||
.lsu_mem_if (lsu_mem_if[block_idx])
|
||||
);
|
||||
end
|
||||
|
||||
|
|
69
hw/rtl/interfaces/VX_lsu_mem_if.sv
Normal file
69
hw/rtl/interfaces/VX_lsu_mem_if.sv
Normal file
|
@ -0,0 +1,69 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Packed LSU memory interface.
//
// Bundles one request channel and one response channel, each carrying the
// payload of all NUM_LANES lanes in a single packed struct qualified by a
// single valid/ready pair (instead of one valid/ready pair per lane).
//
// Parameters:
//   NUM_LANES      - number of lanes packed into each request/response.
//   DATA_SIZE      - per-lane data size in bytes (data is DATA_SIZE*8 bits,
//                    byteen is DATA_SIZE bits).
//   ATYPE_WIDTH    - width of the per-lane address-type field.
//   TAG_WIDTH      - width of the tag shared by all lanes of a transfer.
//   MEM_ADDR_WIDTH - memory address width used to derive ADDR_WIDTH.
//   ADDR_WIDTH     - per-lane address width; defaults to MEM_ADDR_WIDTH minus
//                    CLOG2(DATA_SIZE).
interface VX_lsu_mem_if #(
    parameter NUM_LANES      = 1,
    parameter DATA_SIZE      = 1,
    parameter ATYPE_WIDTH    = `ADDR_TYPE_WIDTH,
    parameter TAG_WIDTH      = 1,
    parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
    parameter ADDR_WIDTH     = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE)
) ();

    // Request payload. Field order defines the packed bit layout, so it must
    // not be rearranged.
    typedef struct packed {
        logic                                   rw;     // single bit shared by all lanes
        logic [NUM_LANES-1:0]                   mask;   // one bit per lane
        logic [NUM_LANES-1:0][DATA_SIZE-1:0]    byteen; // per-lane byte enables
        logic [NUM_LANES-1:0][ADDR_WIDTH-1:0]   addr;   // per-lane address
        logic [NUM_LANES-1:0][ATYPE_WIDTH-1:0]  atype;  // per-lane address type
        logic [NUM_LANES-1:0][DATA_SIZE*8-1:0]  data;   // per-lane data
        logic [TAG_WIDTH-1:0]                   tag;    // one tag for the whole request
    } req_data_t;

    // Response payload. Field order defines the packed bit layout.
    typedef struct packed {
        logic [NUM_LANES-1:0]                   mask;   // one bit per lane
        logic [NUM_LANES-1:0][DATA_SIZE*8-1:0]  data;   // per-lane data
        logic [TAG_WIDTH-1:0]                   tag;    // one tag for the whole response
    } rsp_data_t;

    // Request channel
    logic      req_valid;
    req_data_t req_data;
    logic      req_ready;

    // Response channel
    logic      rsp_valid;
    rsp_data_t rsp_data;
    logic      rsp_ready;

    // Requester side: drives the request channel and rsp_ready.
    modport master (
        output req_valid, req_data, rsp_ready,
        input  req_ready, rsp_valid, rsp_data
    );

    // Responder side: drives the response channel and req_ready.
    modport slave (
        input  req_valid, req_data, rsp_ready,
        output req_ready, rsp_valid, rsp_data
    );

endinterface
|
|
@ -31,12 +31,13 @@ module VX_mem_scheduler #(
|
|||
parameter MEM_OUT_BUF = 0,
|
||||
|
||||
parameter WORD_WIDTH = WORD_SIZE * 8,
|
||||
parameter LINE_WIDTH = LINE_SIZE * 8,
|
||||
parameter LINE_WIDTH = LINE_SIZE * 8,
|
||||
parameter COALESCE_ENABLE = (LINE_SIZE != WORD_SIZE),
|
||||
parameter PER_LINE_REQS = LINE_SIZE / WORD_SIZE,
|
||||
parameter MERGED_REQS = CORE_REQS / PER_LINE_REQS,
|
||||
parameter MEM_BATCHES = (MERGED_REQS + MEM_CHANNELS - 1) / MEM_CHANNELS,
|
||||
parameter MEM_BATCH_BITS= `CLOG2(MEM_BATCHES),
|
||||
parameter MEM_QUEUE_ADDRW= `CLOG2(MEM_QUEUE_SIZE),
|
||||
parameter MEM_QUEUE_ADDRW= `CLOG2(COALESCE_ENABLE ? MEM_QUEUE_SIZE : CORE_QUEUE_SIZE),
|
||||
parameter MEM_ADDR_WIDTH= ADDR_WIDTH - `CLOG2(PER_LINE_REQS),
|
||||
parameter MEM_TAG_WIDTH = UUID_WIDTH + MEM_QUEUE_ADDRW + MEM_BATCH_BITS
|
||||
) (
|
||||
|
@ -66,20 +67,22 @@ module VX_mem_scheduler #(
|
|||
input wire core_rsp_ready,
|
||||
|
||||
// Memory request
|
||||
output wire [MEM_CHANNELS-1:0] mem_req_valid,
|
||||
output wire [MEM_CHANNELS-1:0] mem_req_rw,
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [MEM_CHANNELS-1:0] mem_req_mask,
|
||||
output wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] mem_req_atype,
|
||||
output wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire [MEM_CHANNELS-1:0] mem_req_ready,
|
||||
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire [MEM_CHANNELS-1:0] mem_rsp_valid,
|
||||
input wire mem_rsp_valid,
|
||||
input wire [MEM_CHANNELS-1:0] mem_rsp_mask,
|
||||
input wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire [MEM_CHANNELS-1:0] mem_rsp_ready
|
||||
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready
|
||||
);
|
||||
localparam BATCH_SEL_WIDTH = `UP(MEM_BATCH_BITS);
|
||||
localparam STALL_TIMEOUT = 10000000;
|
||||
|
@ -87,7 +90,6 @@ module VX_mem_scheduler #(
|
|||
localparam TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH;
|
||||
localparam REQQ_TAG_WIDTH = UUID_WIDTH + CORE_QUEUE_ADDRW;
|
||||
localparam MERGED_TAG_WIDTH= UUID_WIDTH + MEM_QUEUE_ADDRW;
|
||||
localparam COALESCE_ENABLE = (LINE_SIZE != WORD_SIZE);
|
||||
localparam CORE_CHANNELS = COALESCE_ENABLE ? CORE_REQS : MEM_CHANNELS;
|
||||
localparam CORE_BATCHES = COALESCE_ENABLE ? 1 : MEM_BATCHES;
|
||||
localparam CORE_BATCH_BITS = `CLOG2(CORE_BATCHES);
|
||||
|
@ -126,7 +128,7 @@ module VX_mem_scheduler #(
|
|||
wire [MERGED_TAG_WIDTH-1:0] reqq_tag_s;
|
||||
wire reqq_ready_s;
|
||||
|
||||
wire [MEM_CHANNELS-1:0] mem_req_valid_s;
|
||||
wire mem_req_valid_s;
|
||||
wire [MEM_CHANNELS-1:0] mem_req_mask_s;
|
||||
wire mem_req_rw_s;
|
||||
wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_s;
|
||||
|
@ -134,13 +136,7 @@ module VX_mem_scheduler #(
|
|||
wire [MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] mem_req_atype_s;
|
||||
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_s;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
|
||||
wire [MEM_CHANNELS-1:0] mem_req_ready_s;
|
||||
|
||||
wire mem_rsp_valid_s2;
|
||||
wire [MEM_CHANNELS-1:0] mem_rsp_mask_s2;
|
||||
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data_s2;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s2;
|
||||
wire mem_rsp_ready_s2;
|
||||
wire mem_req_ready_s;
|
||||
|
||||
wire mem_rsp_valid_s;
|
||||
wire [CORE_REQS-1:0] mem_rsp_mask_s;
|
||||
|
@ -273,11 +269,11 @@ module VX_mem_scheduler #(
|
|||
.out_req_ready (reqq_ready_s),
|
||||
|
||||
// Output response
|
||||
.out_rsp_valid (mem_rsp_valid_s2),
|
||||
.out_rsp_mask (mem_rsp_mask_s2),
|
||||
.out_rsp_data (mem_rsp_data_s2),
|
||||
.out_rsp_tag (mem_rsp_tag_s2),
|
||||
.out_rsp_ready (mem_rsp_ready_s2)
|
||||
.out_rsp_valid (mem_rsp_valid),
|
||||
.out_rsp_mask (mem_rsp_mask),
|
||||
.out_rsp_data (mem_rsp_data),
|
||||
.out_rsp_tag (mem_rsp_tag),
|
||||
.out_rsp_ready (mem_rsp_ready)
|
||||
);
|
||||
|
||||
end else begin
|
||||
|
@ -292,11 +288,11 @@ module VX_mem_scheduler #(
|
|||
assign reqq_tag_s = reqq_tag;
|
||||
assign reqq_ready = reqq_ready_s;
|
||||
|
||||
assign mem_rsp_valid_s = mem_rsp_valid_s2;
|
||||
assign mem_rsp_mask_s = mem_rsp_mask_s2;
|
||||
assign mem_rsp_data_s = mem_rsp_data_s2;
|
||||
assign mem_rsp_tag_s = mem_rsp_tag_s2;
|
||||
assign mem_rsp_ready_s2 = mem_rsp_ready_s;
|
||||
assign mem_rsp_valid_s = mem_rsp_valid;
|
||||
assign mem_rsp_mask_s = mem_rsp_mask;
|
||||
assign mem_rsp_data_s = mem_rsp_data;
|
||||
assign mem_rsp_tag_s = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_rsp_ready_s;
|
||||
|
||||
end
|
||||
|
||||
|
@ -335,24 +331,6 @@ module VX_mem_scheduler #(
|
|||
assign mem_req_addr_s = mem_req_addr_b[req_batch_idx];
|
||||
assign mem_req_atype_s = mem_req_atype_b[req_batch_idx];
|
||||
assign mem_req_data_s = mem_req_data_b[req_batch_idx];
|
||||
|
||||
reg [MEM_CHANNELS-1:0] batch_sent_mask;
|
||||
wire [MEM_CHANNELS-1:0] batch_sent_mask_n = batch_sent_mask | mem_req_ready_s;
|
||||
wire batch_sent_all = (mem_req_mask_s & ~batch_sent_mask_n) == 0;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
batch_sent_mask <= '0;
|
||||
end else begin
|
||||
if (reqq_valid_s) begin
|
||||
if (batch_sent_all) begin
|
||||
batch_sent_mask <= '0;
|
||||
end else begin
|
||||
batch_sent_mask <= batch_sent_mask_n;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
if (MEM_BATCHES != 1) begin
|
||||
reg [MEM_BATCH_BITS-1:0] req_batch_idx_r;
|
||||
|
@ -360,7 +338,7 @@ module VX_mem_scheduler #(
|
|||
if (reset) begin
|
||||
req_batch_idx_r <= '0;
|
||||
end else begin
|
||||
if (reqq_valid_s && batch_sent_all) begin
|
||||
if (reqq_valid_s && mem_req_ready_s) begin
|
||||
if (req_sent_all) begin
|
||||
req_batch_idx_r <= '0;
|
||||
end else begin
|
||||
|
@ -391,60 +369,37 @@ module VX_mem_scheduler #(
|
|||
);
|
||||
|
||||
assign req_batch_idx = req_batch_idx_r;
|
||||
assign req_sent_all = batch_sent_all && (req_batch_idx_r == req_batch_idx_last);
|
||||
assign req_sent_all = mem_req_ready_s && (req_batch_idx_r == req_batch_idx_last);
|
||||
assign mem_req_tag_s = {reqq_tag_s, req_batch_idx};
|
||||
|
||||
end else begin
|
||||
|
||||
assign req_batch_idx = '0;
|
||||
assign req_sent_all = batch_sent_all;
|
||||
assign req_sent_all = mem_req_ready_s;
|
||||
assign mem_req_tag_s = reqq_tag_s;
|
||||
|
||||
end
|
||||
|
||||
assign mem_req_valid_s = {MEM_CHANNELS{reqq_valid_s}} & mem_req_mask_s & ~batch_sent_mask;
|
||||
assign mem_req_valid_s = reqq_valid_s;
|
||||
assign reqq_ready_s = req_sent_all;
|
||||
|
||||
for (genvar i = 0; i < MEM_CHANNELS; ++i) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + MEM_ADDR_WIDTH + ATYPE_WIDTH + LINE_WIDTH + MEM_TAG_WIDTH),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_req_valid_s[i]),
|
||||
.ready_in (mem_req_ready_s[i]),
|
||||
.data_in ({mem_req_rw_s, mem_req_byteen_s[i], mem_req_addr_s[i], mem_req_atype_s[i], mem_req_data_s[i], mem_req_tag_s}),
|
||||
.data_out ({mem_req_rw[i], mem_req_byteen[i], mem_req_addr[i], mem_req_atype[i], mem_req_data[i], mem_req_tag[i]}),
|
||||
.valid_out (mem_req_valid[i]),
|
||||
.ready_out (mem_req_ready[i])
|
||||
);
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (MEM_CHANNELS + 1 + MEM_CHANNELS * (LINE_SIZE + MEM_ADDR_WIDTH + ATYPE_WIDTH + LINE_WIDTH) + MEM_TAG_WIDTH),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_req_valid_s),
|
||||
.ready_in (mem_req_ready_s),
|
||||
.data_in ({mem_req_mask_s, mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_atype_s, mem_req_data_s, mem_req_tag_s}),
|
||||
.data_out ({mem_req_mask, mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_atype, mem_req_data, mem_req_tag}),
|
||||
.valid_out (mem_req_valid),
|
||||
.ready_out (mem_req_ready)
|
||||
);
|
||||
|
||||
// Handle memory responses ////////////////////////////////////////////////
|
||||
|
||||
// Merge memory responses
|
||||
VX_stream_merge #(
|
||||
.NUM_REQS (MEM_CHANNELS),
|
||||
.DATA_WIDTH (LINE_WIDTH),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (MEM_TAG_WIDTH - UUID_WIDTH),
|
||||
.OUT_BUF (2)
|
||||
) rsp_merge (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.rsp_valid_in (mem_rsp_valid),
|
||||
.rsp_data_in (mem_rsp_data),
|
||||
.rsp_tag_in (mem_rsp_tag),
|
||||
.rsp_ready_in (mem_rsp_ready),
|
||||
.rsp_valid_out (mem_rsp_valid_s2),
|
||||
.rsp_mask_out (mem_rsp_mask_s2),
|
||||
.rsp_data_out (mem_rsp_data_s2),
|
||||
.rsp_tag_out (mem_rsp_tag_s2),
|
||||
.rsp_ready_out (mem_rsp_ready_s2)
|
||||
);
|
||||
|
||||
reg [CORE_QUEUE_SIZE-1:0][CORE_REQS-1:0] rsp_rem_mask;
|
||||
wire [CORE_REQS-1:0] rsp_rem_mask_n, curr_mask;
|
||||
wire [BATCH_SEL_WIDTH-1:0] rsp_batch_idx;
|
||||
|
@ -617,7 +572,8 @@ module VX_mem_scheduler #(
|
|||
assign rsp_dbg_uuid = '0;
|
||||
end
|
||||
|
||||
wire [MEM_CHANNELS-1:0] mem_req_fire_s = mem_req_valid_s & mem_req_ready_s;
|
||||
wire mem_req_fire_s = mem_req_valid_s && mem_req_ready_s;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (core_req_fire) begin
|
||||
if (core_req_rw) begin
|
||||
|
@ -640,14 +596,14 @@ module VX_mem_scheduler #(
|
|||
end
|
||||
if (| mem_req_fire_s) begin
|
||||
if (| mem_req_rw_s) begin
|
||||
`TRACE(1, ("%d: %s-mem-req-wr: valid=%b, addr=", $time, INSTANCE_ID, mem_req_fire_s));
|
||||
`TRACE(1, ("%d: %s-mem-req-wr: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s));
|
||||
`TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS);
|
||||
`TRACE(1, (", byteen="));
|
||||
`TRACE_ARRAY1D(1, "0x%h", mem_req_byteen_s, CORE_CHANNELS);
|
||||
`TRACE(1, (", data="));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data_s, CORE_CHANNELS);
|
||||
end else begin
|
||||
`TRACE(1, ("%d: %s-mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_fire_s));
|
||||
`TRACE(1, ("%d: %s-mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s));
|
||||
`TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS);
|
||||
end
|
||||
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr, req_batch_idx, mem_req_dbg_uuid));
|
||||
|
|
|
@ -13,33 +13,31 @@
|
|||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
`TRACING_OFF
|
||||
module VX_stream_merge #(
|
||||
//`TRACING_OFF
|
||||
module VX_stream_pack #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter DATA_WIDTH = 1,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter TAG_SEL_BITS = 0,
|
||||
parameter `STRING ARBITER = "P",
|
||||
parameter OUT_BUF = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// input response
|
||||
input wire [NUM_REQS-1:0] rsp_valid_in,
|
||||
input wire [NUM_REQS-1:0][DATA_WIDTH-1:0] rsp_data_in,
|
||||
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] rsp_tag_in,
|
||||
output wire [NUM_REQS-1:0] rsp_ready_in,
|
||||
// input
|
||||
input wire [NUM_REQS-1:0] valid_in,
|
||||
input wire [NUM_REQS-1:0][DATA_WIDTH-1:0] data_in,
|
||||
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] tag_in,
|
||||
output wire [NUM_REQS-1:0] ready_in,
|
||||
|
||||
// output responses
|
||||
output wire rsp_valid_out,
|
||||
output wire [NUM_REQS-1:0] rsp_mask_out,
|
||||
output wire [NUM_REQS-1:0][DATA_WIDTH-1:0] rsp_data_out,
|
||||
output wire [TAG_WIDTH-1:0] rsp_tag_out,
|
||||
input wire rsp_ready_out
|
||||
// output
|
||||
output wire valid_out,
|
||||
output wire [NUM_REQS-1:0] mask_out,
|
||||
output wire [NUM_REQS-1:0][DATA_WIDTH-1:0] data_out,
|
||||
output wire [TAG_WIDTH-1:0] tag_out,
|
||||
input wire ready_out
|
||||
);
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS);
|
||||
|
||||
if (NUM_REQS > 1) begin
|
||||
|
@ -51,35 +49,35 @@ module VX_stream_merge #(
|
|||
VX_generic_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.LOCK_ENABLE (1),
|
||||
.TYPE ("P")
|
||||
.TYPE (ARBITER)
|
||||
) arbiter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.requests (rsp_valid_in),
|
||||
.requests (valid_in),
|
||||
.grant_valid (grant_valid),
|
||||
.grant_index (grant_index),
|
||||
`UNUSED_PIN (grant_onehot),
|
||||
.grant_unlock(grant_ready)
|
||||
);
|
||||
|
||||
reg [NUM_REQS-1:0] rsp_valid_sel;
|
||||
reg [NUM_REQS-1:0] rsp_ready_sel;
|
||||
wire rsp_ready_unqual;
|
||||
reg [NUM_REQS-1:0] valid_sel;
|
||||
reg [NUM_REQS-1:0] ready_sel;
|
||||
wire ready_unqual;
|
||||
|
||||
wire [TAG_WIDTH-1:0] rsp_tag_sel = rsp_tag_in[grant_index];
|
||||
wire [TAG_WIDTH-1:0] tag_sel = tag_in[grant_index];
|
||||
|
||||
always @(*) begin
|
||||
rsp_valid_sel = '0;
|
||||
rsp_ready_sel = '0;
|
||||
valid_sel = '0;
|
||||
ready_sel = '0;
|
||||
for (integer i = 0; i < NUM_REQS; ++i) begin
|
||||
if (rsp_tag_in[i][TAG_SEL_BITS-1:0] == rsp_tag_sel[TAG_SEL_BITS-1:0]) begin
|
||||
rsp_valid_sel[i] = rsp_valid_in[i];
|
||||
rsp_ready_sel[i] = rsp_ready_unqual;
|
||||
if (tag_in[i][TAG_SEL_BITS-1:0] == tag_sel[TAG_SEL_BITS-1:0]) begin
|
||||
valid_sel[i] = valid_in[i];
|
||||
ready_sel[i] = ready_unqual;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign grant_ready = rsp_ready_unqual;
|
||||
assign grant_ready = ready_unqual;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (NUM_REQS + TAG_WIDTH + (NUM_REQS * DATA_WIDTH)),
|
||||
|
@ -89,24 +87,26 @@ module VX_stream_merge #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (grant_valid),
|
||||
.data_in ({rsp_valid_sel, rsp_tag_sel, rsp_data_in}),
|
||||
.ready_in (rsp_ready_unqual),
|
||||
.valid_out (rsp_valid_out),
|
||||
.data_out ({rsp_mask_out, rsp_tag_out, rsp_data_out}),
|
||||
.ready_out (rsp_ready_out)
|
||||
.data_in ({valid_sel, tag_sel, data_in}),
|
||||
.ready_in (ready_unqual),
|
||||
.valid_out (valid_out),
|
||||
.data_out ({mask_out, tag_out, data_out}),
|
||||
.ready_out (ready_out)
|
||||
);
|
||||
|
||||
assign rsp_ready_in = rsp_ready_sel;
|
||||
assign ready_in = ready_sel;
|
||||
|
||||
end else begin
|
||||
|
||||
assign rsp_valid_out = rsp_valid_in;
|
||||
assign rsp_mask_out = 1'b1;
|
||||
assign rsp_tag_out = rsp_tag_in;
|
||||
assign rsp_data_out = rsp_data_in;
|
||||
assign rsp_ready_in = rsp_ready_out;
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
assign valid_out = valid_in;
|
||||
assign mask_out = 1'b1;
|
||||
assign data_out = data_in;
|
||||
assign tag_out = tag_in;
|
||||
assign ready_in = ready_out;
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
||||
`TRACING_ON
|
||||
//`TRACING_ON
|
93
hw/rtl/libs/VX_stream_unpack.sv
Normal file
93
hw/rtl/libs/VX_stream_unpack.sv
Normal file
|
@ -0,0 +1,93 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
//`TRACING_OFF
|
||||
// Unpacks one masked multi-lane stream into NUM_REQS independent
// valid/ready streams.
//
// A packed input (valid_in, mask_in, data_in[], tag_in) is offered on every
// lane whose mask bit is set. Lanes may accept in different cycles; the input
// is released (ready_in high) once every masked lane has accepted.
// The shared input tag is replicated onto each output lane.
//
// Parameters:
//   NUM_REQS   - number of output lanes.
//   DATA_WIDTH - per-lane data width in bits.
//   TAG_WIDTH  - tag width in bits.
//   OUT_BUF    - output buffer selector forwarded to the per-lane
//                VX_elastic_buffer through the TO_OUT_BUF_* macros
//                (only used when NUM_REQS > 1).
module VX_stream_unpack #(
    parameter NUM_REQS   = 1,
    parameter DATA_WIDTH = 1,
    parameter TAG_WIDTH  = 1,
    parameter OUT_BUF    = 0
) (
    input  wire                                 clk,
    input  wire                                 reset,

    // input
    input  wire                                 valid_in,
    input  wire [NUM_REQS-1:0]                  mask_in,
    input  wire [NUM_REQS-1:0][DATA_WIDTH-1:0]  data_in,
    input  wire [TAG_WIDTH-1:0]                 tag_in,
    output wire                                 ready_in,

    // output
    output wire [NUM_REQS-1:0]                  valid_out,
    output wire [NUM_REQS-1:0][DATA_WIDTH-1:0]  data_out,
    output wire [NUM_REQS-1:0][TAG_WIDTH-1:0]   tag_out,
    input  wire [NUM_REQS-1:0]                  ready_out
);
    if (NUM_REQS > 1) begin

        // Lanes that already accepted the current input in an earlier cycle.
        reg  [NUM_REQS-1:0] lanes_done_r;

        // Input-side ready of each per-lane buffer.
        wire [NUM_REQS-1:0] lane_accept;

        // Lanes done by the end of this cycle, and masked lanes still pending.
        wire [NUM_REQS-1:0] lanes_done_n  = lanes_done_r | lane_accept;
        wire [NUM_REQS-1:0] lanes_pending = mask_in & ~lanes_done_n;
        wire                all_done      = ~(| lanes_pending);

        // A lane is offered the input only if it is masked in and has not
        // accepted it yet.
        wire [NUM_REQS-1:0] lane_valid = {NUM_REQS{valid_in}} & mask_in & ~lanes_done_r;

        // Tracking restarts on reset or when the current input completes;
        // otherwise newly accepted lanes accumulate while the input is valid.
        always @(posedge clk) begin
            if (reset || (valid_in && all_done)) begin
                lanes_done_r <= '0;
            end else if (valid_in) begin
                lanes_done_r <= lanes_done_n;
            end
        end

        assign ready_in = all_done;

        for (genvar i = 0; i < NUM_REQS; ++i) begin
            VX_elastic_buffer #(
                .DATAW   (DATA_WIDTH + TAG_WIDTH),
                .SIZE    (`TO_OUT_BUF_SIZE(OUT_BUF)),
                .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
            ) out_buf (
                .clk       (clk),
                .reset     (reset),
                .valid_in  (lane_valid[i]),
                .ready_in  (lane_accept[i]),
                .data_in   ({data_in[i], tag_in}),
                .data_out  ({data_out[i], tag_out[i]}),
                .valid_out (valid_out[i]),
                .ready_out (ready_out[i])
            );
        end

    end else begin

        // Single lane: plain wire-through, mask is ignored.
        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)
        `UNUSED_VAR (mask_in)
        assign valid_out = valid_in;
        assign data_out  = data_in;
        assign tag_out   = tag_in;
        assign ready_in  = ready_out;

    end

endmodule
|
||||
//`TRACING_ON
|
|
@ -153,6 +153,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_rsp_tag;
|
||||
wire [NUM_BANKS-1:0] per_bank_rsp_ready;
|
||||
|
||||
`RESET_RELAY (bank_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin
|
||||
VX_sp_ram #(
|
||||
.DATAW (WORD_WIDTH),
|
||||
|
@ -178,7 +180,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
.SIZE (0)
|
||||
) bank_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (bank_reset),
|
||||
.valid_in (per_bank_req_valid_w),
|
||||
.ready_in (per_bank_req_ready_w),
|
||||
.data_in ({per_bank_req_idx[i], per_bank_req_tag[i]}),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue