extending memory interface with address type

This commit is contained in:
Blaise Tine 2024-03-18 00:35:03 -07:00
parent dc19d25bcc
commit 6556e8c66d
16 changed files with 155 additions and 58 deletions

View file

@ -282,6 +282,10 @@
`define L1_ENABLE
`endif
// Address-type (atype) tag attached to memory requests.
// ADDR_TYPE_IO and ADDR_TYPE_LOCAL are bit positions inside the atype vector:
// consumers test a single bit, e.g. atype[ADDR_TYPE_IO] or atype[ADDR_TYPE_LOCAL].
`define ADDR_TYPE_IO 0
`define ADDR_TYPE_LOCAL 1
// Width of the atype vector: the I/O bit is always present and LMEM_ENABLED
// further bits are added on top of it.
// NOTE(review): this assumes LMEM_ENABLED evaluates to 0 or 1 so that the
// ADDR_TYPE_LOCAL bit exists exactly when local memory is enabled - confirm
// against the LMEM_ENABLED definition.
`define ADDR_TYPE_WIDTH (`LMEM_ENABLED + 1)
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
@ -333,6 +337,7 @@
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.atype = src.req_data.atype; \
assign dst.req_data.data = src.req_data.data; \
if (TD != TS) \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \

View file

@ -102,6 +102,7 @@ module Vortex import VX_gpu_pkg::*; (
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;

View file

@ -514,6 +514,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_rsp_tag_out (cci_vx_mem_bus_if[1].rsp_data.tag),
.mem_rsp_ready_out (cci_vx_mem_bus_if[1].rsp_ready)
);
assign cci_vx_mem_bus_if[1].req_data.atype = '0;
`UNUSED_VAR (cci_vx_mem_bus_if[1].req_data.atype)
//--
@ -563,6 +566,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_rsp_ready_out (cci_vx_mem_bus_if[0].rsp_ready)
);
assign cci_vx_mem_bus_if[0].req_data.atype = '0;
`UNUSED_VAR (cci_vx_mem_bus_if[0].req_data.atype)
//--
VX_mem_bus_if #(
.DATA_SIZE (LMEM_DATA_SIZE),
@ -631,6 +637,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.avs_readdatavalid(avs_readdatavalid)
);
assign mem_bus_if[0].req_data.atype = '0;
`UNUSED_VAR (mem_bus_if[0].req_data.atype)
// CCI-P Read Request ///////////////////////////////////////////////////////////
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr;

View file

@ -100,13 +100,14 @@ module VX_cache import VX_gpu_pkg::*; #(
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus_if[i].req_valid;
assign core_req_addr[i] = core_bus_if[i].req_data.addr;
assign core_req_rw[i] = core_bus_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus_if[i].req_data.addr;
assign core_req_data[i] = core_bus_if[i].req_data.data;
assign core_req_tag[i] = core_bus_if[i].req_data.tag;
assign core_bus_if[i].req_ready = core_req_ready[i];
end
`UNUSED_VAR (core_bus_if[i].req_data.atype)
end
///////////////////////////////////////////////////////////////////////////
@ -163,6 +164,8 @@ module VX_cache import VX_gpu_pkg::*; #(
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
);
assign mem_bus_if.req_data.atype = '0;
///////////////////////////////////////////////////////////////////////////
@ -288,7 +291,8 @@ module VX_cache import VX_gpu_pkg::*; #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS)
.PERF_CTR_BITS (`PERF_CTR_BITS),
.OUT_BUF ((NUM_REQS > 4) ? 2 : 0)
) req_xbar (
.clk (clk),
.reset (req_xbar_reset),

View file

@ -56,7 +56,7 @@ module VX_cache_bypass #(
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
localparam MUX_DATAW = CORE_TAG_WIDTH + CORE_DATA_WIDTH + WORD_SIZE + CORE_ADDR_WIDTH + 1;
localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `ADDR_TYPE_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
@ -65,9 +65,6 @@ module VX_cache_bypass #(
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
// handle core requests ///////////////////////////////////////////////////
@ -83,8 +80,7 @@ module VX_cache_bypass #(
if (PASSTHRU != 0) begin
assign core_req_nc_idxs[i] = 1'b1;
end else if (NC_ENABLE) begin
wire [MEM_ADDRW-1:0] block_addr = core_bus_in_if[i].req_data.addr[CORE_ADDR_WIDTH-1 -: MEM_ADDRW];
assign core_req_nc_idxs[i] = (block_addr >= MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT));
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.atype[`ADDR_TYPE_IO];
end else begin
assign core_req_nc_idxs[i] = 1'b0;
end
@ -118,15 +114,17 @@ module VX_cache_bypass #(
wire mem_req_out_rw;
wire [LINE_SIZE-1:0] mem_req_out_byteen;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
wire [`ADDR_TYPE_WIDTH-1:0] mem_req_out_atype;
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
wire mem_req_out_ready;
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire core_req_nc_sel_rw;
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
wire core_req_nc_sel_rw;
wire [`ADDR_TYPE_WIDTH-1:0] core_req_nc_sel_atype;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
@ -134,6 +132,7 @@ module VX_cache_bypass #(
core_bus_in_if[i].req_data.rw,
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.addr,
core_bus_in_if[i].req_data.atype,
core_bus_in_if[i].req_data.data,
core_bus_in_if[i].req_data.tag
};
@ -143,6 +142,7 @@ module VX_cache_bypass #(
core_req_nc_sel_rw,
core_req_nc_sel_byteen,
core_req_nc_sel_addr,
core_req_nc_sel_atype,
core_req_nc_sel_data,
core_req_nc_sel_tag
} = core_req_nc_mux_in[core_req_nc_idx];
@ -152,6 +152,7 @@ module VX_cache_bypass #(
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign mem_req_out_atype = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.atype : core_req_nc_sel_atype;
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
@ -218,7 +219,7 @@ module VX_cache_bypass #(
assign mem_bus_in_if.req_ready = mem_req_out_ready;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
@ -226,8 +227,8 @@ module VX_cache_bypass #(
.reset (reset),
.valid_in (mem_req_out_valid),
.ready_in (mem_req_out_ready),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_data, mem_req_out_tag}),
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}),
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.atype, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
.valid_out (mem_bus_out_if.req_valid),
.ready_out (mem_bus_out_if.req_ready)
);

View file

@ -69,6 +69,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
input wire [NUM_REQS-1:0] core_req_rw,
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] core_req_atype,
input wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_REQS-1:0] core_req_ready,
@ -110,6 +111,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
assign core_bus_if[i].req_data.rw = core_req_rw[i];
assign core_bus_if[i].req_data.byteen = core_req_byteen[i];
assign core_bus_if[i].req_data.addr = core_req_addr[i];
assign core_bus_if[i].req_data.atype = core_req_atype[i];
assign core_bus_if[i].req_data.data = core_req_data[i];
assign core_bus_if[i].req_data.tag = core_req_tag[i];
assign core_req_ready[i] = core_bus_if[i].req_ready;
@ -131,6 +133,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
// Memory response
assign mem_bus_if.rsp_valid = mem_rsp_valid;

View file

@ -32,6 +32,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
output wire [DCACHE_NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] dcache_req_atype,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
@ -100,6 +101,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw;
assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen;
assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr;
assign dcache_req_atype[i] = dcache_bus_if[i].req_data.atype;
assign dcache_req_data[i] = dcache_bus_if[i].req_data.data;
assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag;
assign dcache_bus_if[i].req_ready = dcache_req_ready[i];
@ -122,6 +124,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_req_data = icache_bus_if.req_data.data;
assign icache_req_tag = icache_bus_if.req_data.tag;
assign icache_bus_if.req_ready = icache_req_ready;
`UNUSED_VAR (icache_bus_if.req_data.atype)
assign icache_bus_if.rsp_valid = icache_rsp_valid;
assign icache_bus_if.rsp_data.tag = icache_rsp_tag;

View file

@ -113,6 +113,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
.ready_out (icache_bus_if.req_ready)
);
assign icache_bus_if.req_data.atype = '0;
assign icache_bus_if.req_data.rw = 0;
assign icache_bus_if.req_data.byteen = 4'b1111;
assign icache_bus_if.req_data.data = '0;

View file

@ -30,10 +30,6 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
localparam LMEM_START_B = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
localparam LMEM_END_B = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + (1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
@ -47,11 +43,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
`RESET_RELAY (switch_reset, reset);
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
wire [MEM_ADDRW-1:0] block_addr = dcache_bus_in_if[i].req_data.addr[DCACHE_ADDR_WIDTH-1 -: MEM_ADDRW];
wire bus_sel = (block_addr >= LMEM_START_B) && (block_addr < LMEM_END_B);
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
VX_mem_switch #(
.NUM_REQS (2),
.DATA_SIZE (DCACHE_WORD_SIZE),
@ -62,7 +54,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
) lmem_switch (
.clk (clk),
.reset (switch_reset),
.bus_sel (bus_sel),
.bus_sel (dcache_bus_in_if[i].req_data.atype[`ADDR_TYPE_LOCAL]),
.bus_in_if (dcache_bus_in_if[i]),
.bus_out_if (switch_out_bus_if[i * 2 +: 2])
);

View file

@ -37,6 +37,8 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_IN_SIZE);
localparam REQ_ASHIFT = `CLOG2(WORD_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
// tag_id = wid + PC + rd + op_type + align + pid + pkt_addr
localparam TAG_ID_WIDTH = `NW_WIDTH + `XLEN + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + PID_WIDTH + LSUQ_SIZEW;
@ -88,6 +90,22 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
assign full_addr[i] = execute_if[block_idx].data.rs1_data[i][`XLEN-1:0] + execute_if[block_idx].data.imm;
end
// address type calculation
// Each lane gets an atype vector with one bit per address class, derived by
// comparing the lane's block address against the region boundaries below.
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// block address: MEM_ADDRW bits of the lane's full address starting at bit MEM_ASHIFT
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is I/O address
// the I/O bit is set for every block at or above the IO_BASE_ADDR block (no upper bound is checked)
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start);
`ifdef LMEM_ENABLE
// is local memory address
// the local bit is set for blocks in the half-open range [lmem_addr_start, lmem_addr_end),
// where the range spans (1 << LMEM_LOG_SIZE) bytes from LMEM_BASE_ADDR
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
`endif
end
wire mem_req_empty;
wire st_rsp_ready;
wire lsu_valid, lsu_ready;
@ -277,6 +295,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
wire [DCACHE_CHANNELS-1:0] cache_req_rw;
wire [DCACHE_CHANNELS-1:0][DCACHE_WORD_SIZE-1:0] cache_req_byteen;
wire [DCACHE_CHANNELS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
wire [DCACHE_CHANNELS-1:0][`ADDR_TYPE_WIDTH-1:0] cache_req_atype;
wire [DCACHE_CHANNELS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_req_data;
wire [DCACHE_CHANNELS-1:0][DCACHE_TAG_WIDTH-1:0] cache_req_tag;
wire [DCACHE_CHANNELS-1:0] cache_req_ready;
@ -294,6 +313,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
@ -310,6 +330,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.core_req_mask (mem_req_mask),
.core_req_byteen(mem_req_byteen),
.core_req_addr (mem_req_addr),
.core_req_atype (mem_req_atype),
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
@ -330,6 +351,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.mem_req_rw (cache_req_rw),
.mem_req_byteen (cache_req_byteen),
.mem_req_addr (cache_req_addr),
.mem_req_atype (cache_req_atype),
.mem_req_data (cache_req_data),
.mem_req_tag (cache_req_tag),
.mem_req_ready (cache_req_ready),
@ -346,6 +368,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.rw = cache_req_rw[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.byteen = cache_req_byteen[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.addr = cache_req_addr[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.atype = cache_req_atype[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.data = cache_req_data[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.tag = cache_req_tag[i];
assign cache_req_ready[i] = cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_ready;
@ -483,20 +506,24 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
if (mem_req_rw) begin
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[block_idx].data.wid, execute_if[block_idx].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, data=", mem_req_tag, mem_req_byteen));
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
`TRACE(1, (" (#%0d)\n", execute_if[block_idx].data.uuid));
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if[block_idx].data.uuid));
end else begin
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[block_idx].data.wid, execute_if[block_idx].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, rd=%0d (#%0d)\n", mem_req_tag, mem_req_byteen, execute_if[block_idx].data.rd, execute_if[block_idx].data.uuid));
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if[block_idx].data.rd, mem_req_tag, execute_if[block_idx].data.uuid));
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, tag=0x%0h, rd=%0d, sop=%b, eop=%b, data=",
$time, CORE_ID, rsp_wid, rsp_pc, mem_rsp_mask, mem_rsp_tag, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, CORE_ID, rsp_wid, rsp_pc, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
`TRACE(1, (" (#%0d)\n", rsp_uuid));
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
end
end
`endif

View file

@ -18,6 +18,7 @@ module VX_mem_coalescer #(
parameter `STRING INSTANCE_ID = "",
parameter NUM_REQS = 1,
parameter ADDR_WIDTH = 32,
parameter ATYPE_WIDTH = 1,
parameter DATA_IN_SIZE = 4,
parameter DATA_OUT_SIZE = 64,
parameter TAG_WIDTH = 8,
@ -42,6 +43,7 @@ module VX_mem_coalescer #(
input wire [NUM_REQS-1:0] in_req_mask,
input wire [NUM_REQS-1:0][DATA_IN_SIZE-1:0] in_req_byteen,
input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] in_req_addr,
input wire [NUM_REQS-1:0][ATYPE_WIDTH-1:0] in_req_atype,
input wire [NUM_REQS-1:0][DATA_IN_WIDTH-1:0] in_req_data,
input wire [TAG_WIDTH-1:0] in_req_tag,
output wire in_req_ready,
@ -59,6 +61,7 @@ module VX_mem_coalescer #(
output wire [OUT_REQS-1:0] out_req_mask,
output wire [OUT_REQS-1:0][DATA_OUT_SIZE-1:0] out_req_byteen,
output wire [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr,
output wire [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype,
output wire [OUT_REQS-1:0][DATA_OUT_WIDTH-1:0] out_req_data,
output wire [OUT_TAG_WIDTH-1:0] out_req_tag,
input wire out_req_ready,
@ -91,6 +94,7 @@ module VX_mem_coalescer #(
logic [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n;
logic [OUT_REQS-1:0][DATA_OUT_SIZE-1:0] out_req_byteen_r, out_req_byteen_n;
logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n;
logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n;
logic [OUT_REQS-1:0][DATA_OUT_WIDTH-1:0] out_req_data_r, out_req_data_n;
logic [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n;
@ -107,6 +111,7 @@ module VX_mem_coalescer #(
logic [OUT_REQS-1:0] batch_valid_r, batch_valid_n;
logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] seed_addr_r, seed_addr_n;
logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] seed_atype_r, seed_atype_n;
logic [NUM_REQS-1:0] processed_mask_r, processed_mask_n;
wire [OUT_REQS-1:0][NUM_REQS_W-1:0] seed_idx;
@ -141,10 +146,12 @@ module VX_mem_coalescer #(
state_r <= state_n;
out_req_valid_r <= out_req_valid_n;
batch_valid_r <= batch_valid_n;
seed_addr_r <= seed_addr_n;
seed_addr_r <= seed_addr_n;
seed_atype_r <= seed_atype_n;
out_req_rw_r <= out_req_rw_n;
out_req_mask_r <= out_req_mask_n;
out_req_addr_r <= out_req_addr_n;
out_req_atype_r <= out_req_atype_n;
out_req_byteen_r <= out_req_byteen_n;
out_req_data_r <= out_req_data_n;
out_req_tag_r <= out_req_tag_n;
@ -171,9 +178,11 @@ module VX_mem_coalescer #(
state_n = state_r;
out_req_valid_n = out_req_valid_r;
seed_addr_n = seed_addr_r;
seed_atype_n = seed_atype_r;
out_req_rw_n = out_req_rw_r;
out_req_mask_n = out_req_mask_r;
out_req_addr_n = out_req_addr_r;
out_req_atype_n = out_req_atype_r;
out_req_byteen_n = out_req_byteen_r;
out_req_data_n = out_req_data_r;
out_req_tag_n = out_req_tag_r;
@ -185,6 +194,7 @@ module VX_mem_coalescer #(
// find the next seed address
for (integer i = 0; i < OUT_REQS; ++i) begin
seed_addr_n[i] = in_addr_base[seed_idx[i]];
seed_atype_n[i] = in_req_atype[seed_idx[i]];
end
// wait for pending outgoing request to submit
if (out_req_valid && out_req_ready) begin
@ -220,6 +230,7 @@ module VX_mem_coalescer #(
end
out_req_mask_n[i] = batch_valid_r[i];
out_req_addr_n[i] = seed_addr_r[i];
out_req_atype_n[i]= seed_atype_r[i];
end
if (in_req_ready_n) begin
processed_mask_n = '0;
@ -262,15 +273,14 @@ module VX_mem_coalescer #(
);
`UNUSED_VAR (ibuf_empty)
assign out_req_valid = out_req_valid_r;
assign out_req_rw = out_req_rw_r;
for (genvar i = 0; i < OUT_REQS; ++i) begin
assign out_req_mask[i] = out_req_mask_r[i];
assign out_req_byteen[i] = out_req_byteen_r[i];
assign out_req_addr[i] = out_req_addr_r[i];
assign out_req_data[i] = out_req_data_r[i];
end
assign out_req_tag = out_req_tag_r;
assign out_req_valid = out_req_valid_r;
assign out_req_rw = out_req_rw_r;
assign out_req_mask = out_req_mask_r;
assign out_req_byteen = out_req_byteen_r;
assign out_req_addr = out_req_addr_r;
assign out_req_atype = out_req_atype_r;
assign out_req_data = out_req_data_r;
assign out_req_tag = out_req_tag_r;
assign in_req_ready = in_req_ready_n;
@ -341,14 +351,18 @@ module VX_mem_coalescer #(
if (out_req_fire) begin
if (out_req_rw) begin
`TRACE(1, ("%d: %s-out-req-wr: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask));
`TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS);
`TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", out_req_atype, OUT_REQS);
`TRACE(1, (", byteen="));
`TRACE_ARRAY1D(1, "0x%h", out_req_byteen, OUT_REQS);
`TRACE(1, (", data="));
`TRACE_ARRAY1D(1, "0x%0h", out_req_data, OUT_REQS);
end else begin
`TRACE(1, ("%d: %s-out-req-rd: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask));
`TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS);
`TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", out_req_atype, OUT_REQS);
end
`TRACE(1, (", offset="));
`TRACE_ARRAY1D(1, "%0d", out_req_offset, NUM_REQS);

View file

@ -21,6 +21,7 @@ module VX_mem_scheduler #(
parameter WORD_SIZE = 4,
parameter LINE_SIZE = WORD_SIZE,
parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE),
parameter ATYPE_WIDTH = 1,
parameter TAG_WIDTH = 8,
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
parameter CORE_QUEUE_SIZE= 8,
@ -48,6 +49,7 @@ module VX_mem_scheduler #(
input wire [CORE_REQS-1:0] core_req_mask,
input wire [CORE_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [CORE_REQS-1:0][ADDR_WIDTH-1:0] core_req_addr,
input wire [CORE_REQS-1:0][ATYPE_WIDTH-1:0] core_req_atype,
input wire [CORE_REQS-1:0][WORD_WIDTH-1:0] core_req_data,
input wire [TAG_WIDTH-1:0] core_req_tag,
output wire core_req_ready,
@ -68,6 +70,7 @@ module VX_mem_scheduler #(
output wire [MEM_CHANNELS-1:0] mem_req_rw,
output wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen,
output wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] mem_req_atype,
output wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire [MEM_CHANNELS-1:0] mem_req_ready,
@ -108,6 +111,7 @@ module VX_mem_scheduler #(
wire reqq_rw;
wire [CORE_REQS-1:0][WORD_SIZE-1:0] reqq_byteen;
wire [CORE_REQS-1:0][ADDR_WIDTH-1:0] reqq_addr;
wire [CORE_REQS-1:0][ATYPE_WIDTH-1:0] reqq_atype;
wire [CORE_REQS-1:0][WORD_WIDTH-1:0] reqq_data;
wire [REQQ_TAG_WIDTH-1:0] reqq_tag;
wire reqq_ready;
@ -117,6 +121,7 @@ module VX_mem_scheduler #(
wire reqq_rw_s;
wire [MERGED_REQS-1:0][LINE_SIZE-1:0] reqq_byteen_s;
wire [MERGED_REQS-1:0][MEM_ADDR_WIDTH-1:0] reqq_addr_s;
wire [MERGED_REQS-1:0][ATYPE_WIDTH-1:0] reqq_atype_s;
wire [MERGED_REQS-1:0][LINE_WIDTH-1:0] reqq_data_s;
wire [MERGED_TAG_WIDTH-1:0] reqq_tag_s;
wire reqq_ready_s;
@ -126,6 +131,7 @@ module VX_mem_scheduler #(
wire mem_req_rw_s;
wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_s;
wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
wire [MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] mem_req_atype_s;
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire [MEM_CHANNELS-1:0] mem_req_ready_s;
@ -166,7 +172,7 @@ module VX_mem_scheduler #(
end
VX_elastic_buffer #(
.DATAW (1 + CORE_REQS * (1 + WORD_SIZE + ADDR_WIDTH + WORD_WIDTH) + REQQ_TAG_WIDTH),
.DATAW (1 + CORE_REQS * (1 + WORD_SIZE + ADDR_WIDTH + ATYPE_WIDTH + WORD_WIDTH) + REQQ_TAG_WIDTH),
.SIZE (CORE_QUEUE_SIZE),
.OUT_REG (1)
) req_queue (
@ -174,8 +180,8 @@ module VX_mem_scheduler #(
.reset (reset),
.valid_in (reqq_valid_in),
.ready_in (reqq_ready_in),
.data_in ({core_req_rw, core_req_mask, core_req_byteen, core_req_addr, core_req_data, reqq_tag_u}),
.data_out ({reqq_rw, reqq_mask, reqq_byteen, reqq_addr, reqq_data, reqq_tag}),
.data_in ({core_req_rw, core_req_mask, core_req_byteen, core_req_addr, core_req_atype, core_req_data, reqq_tag_u}),
.data_out ({reqq_rw, reqq_mask, reqq_byteen, reqq_addr, reqq_atype, reqq_data, reqq_tag}),
.valid_out(reqq_valid),
.ready_out(reqq_ready)
);
@ -229,6 +235,7 @@ module VX_mem_scheduler #(
.DATA_IN_SIZE (WORD_SIZE),
.DATA_OUT_SIZE (LINE_SIZE),
.ADDR_WIDTH (ADDR_WIDTH),
// Forward this module's own ATYPE_WIDTH parameter rather than the global
// ADDR_TYPE_WIDTH macro: the nets wired to in_req_atype/out_req_atype
// (reqq_atype, reqq_atype_s) are sized by ATYPE_WIDTH, so the coalescer
// ports must use the same width for every instantiation of the scheduler.
.ATYPE_WIDTH (ATYPE_WIDTH),
.TAG_WIDTH (REQQ_TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
.QUEUE_SIZE (MEM_QUEUE_SIZE)
@ -242,6 +249,7 @@ module VX_mem_scheduler #(
.in_req_rw (reqq_rw),
.in_req_byteen (reqq_byteen),
.in_req_addr (reqq_addr),
.in_req_atype (reqq_atype),
.in_req_data (reqq_data),
.in_req_tag (reqq_tag),
.in_req_ready (reqq_ready),
@ -259,6 +267,7 @@ module VX_mem_scheduler #(
.out_req_rw (reqq_rw_s),
.out_req_byteen (reqq_byteen_s),
.out_req_addr (reqq_addr_s),
.out_req_atype (reqq_atype_s),
.out_req_data (reqq_data_s),
.out_req_tag (reqq_tag_s),
.out_req_ready (reqq_ready_s),
@ -277,7 +286,8 @@ module VX_mem_scheduler #(
assign reqq_mask_s = reqq_mask;
assign reqq_rw_s = reqq_rw;
assign reqq_byteen_s= reqq_byteen;
assign reqq_addr_s = reqq_addr;
assign reqq_addr_s = reqq_addr;
assign reqq_atype_s = reqq_atype;
assign reqq_data_s = reqq_data;
assign reqq_tag_s = reqq_tag;
assign reqq_ready = reqq_ready_s;
@ -295,6 +305,7 @@ module VX_mem_scheduler #(
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0] mem_req_mask_b;
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_b;
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_b;
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] mem_req_atype_b;
wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_b;
wire [BATCH_SEL_WIDTH-1:0] req_batch_idx;
@ -306,11 +317,13 @@ module VX_mem_scheduler #(
assign mem_req_mask_b[i][j] = reqq_mask_s[r];
assign mem_req_byteen_b[i][j] = reqq_byteen_s[r];
assign mem_req_addr_b[i][j] = reqq_addr_s[r];
assign mem_req_atype_b[i][j] = reqq_atype_s[r];
assign mem_req_data_b[i][j] = reqq_data_s[r];
end else begin
assign mem_req_mask_b[i][j] = 0;
assign mem_req_byteen_b[i][j] = '0;
assign mem_req_addr_b[i][j] = '0;
assign mem_req_atype_b[i][j] = '0;
assign mem_req_data_b[i][j] = '0;
end
end
@ -320,6 +333,7 @@ module VX_mem_scheduler #(
assign mem_req_rw_s = reqq_rw_s;
assign mem_req_byteen_s = mem_req_byteen_b[req_batch_idx];
assign mem_req_addr_s = mem_req_addr_b[req_batch_idx];
assign mem_req_atype_s = mem_req_atype_b[req_batch_idx];
assign mem_req_data_s = mem_req_data_b[req_batch_idx];
reg [MEM_CHANNELS-1:0] batch_sent_mask;
@ -393,7 +407,7 @@ module VX_mem_scheduler #(
for (genvar i = 0; i < MEM_CHANNELS; ++i) begin
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + MEM_ADDR_WIDTH + LINE_WIDTH + MEM_TAG_WIDTH),
.DATAW (1 + LINE_SIZE + MEM_ADDR_WIDTH + ATYPE_WIDTH + LINE_WIDTH + MEM_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
@ -401,8 +415,8 @@ module VX_mem_scheduler #(
.reset (reset),
.valid_in (mem_req_valid_s[i]),
.ready_in (mem_req_ready_s[i]),
.data_in ({mem_req_rw_s, mem_req_byteen_s[i], mem_req_addr_s[i], mem_req_data_s[i], mem_req_tag_s}),
.data_out ({mem_req_rw[i], mem_req_byteen[i], mem_req_addr[i], mem_req_data[i], mem_req_tag[i]}),
.data_in ({mem_req_rw_s, mem_req_byteen_s[i], mem_req_addr_s[i], mem_req_atype_s[i], mem_req_data_s[i], mem_req_tag_s}),
.data_out ({mem_req_rw[i], mem_req_byteen[i], mem_req_addr[i], mem_req_atype[i], mem_req_data[i], mem_req_tag[i]}),
.valid_out (mem_req_valid[i]),
.ready_out (mem_req_ready[i])
);

View file

@ -77,6 +77,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][BANK_ADDR_WIDTH-1:0] req_bank_addr;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign req_bank_addr[i] = mem_bus_if[i].req_data.addr[BANK_SEL_BITS +: BANK_ADDR_WIDTH];
`UNUSED_VAR (mem_bus_if[i].req_data.atype)
end
// bank requests dispatch

View file

@ -33,7 +33,7 @@ module VX_mem_arb #(
);
localparam DATA_WIDTH = (8 * DATA_SIZE);
localparam LOG_NUM_REQS = `ARB_SEL_BITS(NUM_INPUTS, NUM_OUTPUTS);
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `ADDR_TYPE_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH;
`STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter"))
@ -49,7 +49,14 @@ module VX_mem_arb #(
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
assign req_valid_in[i] = bus_in_if[i].req_valid;
assign req_data_in[i] = {bus_in_if[i].req_data.tag, bus_in_if[i].req_data.addr, bus_in_if[i].req_data.rw, bus_in_if[i].req_data.byteen, bus_in_if[i].req_data.data};
assign req_data_in[i] = {
bus_in_if[i].req_data.rw,
bus_in_if[i].req_data.byteen,
bus_in_if[i].req_data.addr,
bus_in_if[i].req_data.atype,
bus_in_if[i].req_data.data,
bus_in_if[i].req_data.tag
};
assign bus_in_if[i].req_ready = req_ready_in[i];
end
@ -83,7 +90,14 @@ module VX_mem_arb #(
.data_out (bus_out_if[i].req_data.tag)
);
assign bus_out_if[i].req_valid = req_valid_out[i];
assign {req_tag_out, bus_out_if[i].req_data.addr, bus_out_if[i].req_data.rw, bus_out_if[i].req_data.byteen, bus_out_if[i].req_data.data} = req_data_out[i];
assign {
bus_out_if[i].req_data.rw,
bus_out_if[i].req_data.byteen,
bus_out_if[i].req_data.addr,
bus_out_if[i].req_data.atype,
bus_out_if[i].req_data.data,
req_tag_out
} = req_data_out[i];
assign req_ready_out[i] = bus_out_if[i].req_ready;
end
@ -144,7 +158,10 @@ module VX_mem_arb #(
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
assign rsp_data_in[i] = {bus_out_if[i].rsp_data.tag, bus_out_if[i].rsp_data.data};
assign rsp_data_in[i] = {
bus_out_if[i].rsp_data.tag,
bus_out_if[i].rsp_data.data
};
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
end
@ -170,7 +187,10 @@ module VX_mem_arb #(
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
assign bus_in_if[i].rsp_valid = rsp_valid_out[i];
assign {bus_in_if[i].rsp_data.tag, bus_in_if[i].rsp_data.data} = rsp_data_out[i];
assign {
bus_in_if[i].rsp_data.tag,
bus_in_if[i].rsp_data.data
} = rsp_data_out[i];
assign rsp_ready_out[i] = bus_in_if[i].rsp_ready;
end

View file

@ -15,15 +15,17 @@
interface VX_mem_bus_if #(
parameter DATA_SIZE = 1,
parameter ATYPE_WIDTH= `ADDR_TYPE_WIDTH,
parameter TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE)
) ();
typedef struct packed {
logic rw;
logic rw;
logic [DATA_SIZE-1:0] byteen;
logic [ADDR_WIDTH-1:0] addr;
logic [ATYPE_WIDTH-1:0] atype;
logic [DATA_SIZE*8-1:0] data;
logic [TAG_WIDTH-1:0] tag;
} req_data_t;

View file

@ -32,7 +32,7 @@ module VX_mem_switch import VX_gpu_pkg::*; #(
);
localparam ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE));
localparam DATA_WIDTH = (8 * DATA_SIZE);
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `ADDR_TYPE_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH;
// handle requests ////////////////////////////////////////////////////////