multiport

This commit is contained in:
tinebp 2024-12-10 23:25:05 -08:00
parent aa6a47eb11
commit 70ade222b1
39 changed files with 1636 additions and 1129 deletions

View file

@ -323,8 +323,10 @@ config2()
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress
# test memory ports
CONFIGS="-DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=demo
CONFIGS="-DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=demo --threads=32
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=4" ./ci/blackbox.sh --driver=simx --app=sgemmx --threads=16
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --threads=16
echo "configuration-2 tests done!"
}

View file

@ -33,7 +33,13 @@ The recommended method to enable debugging is to pass the `--debug` flag to `bla
// Running demo program on rtlsim in debug mode
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution. You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc..). [GTKwave] (http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc.). A waveform trace `trace.vcd` is also generated in the current directory during the program execution.
By default, all library modules under the /libs/ folder are excluded from the trace to reduce the waveform file size. You can change that behavior by either explicitly commenting out `TRACING_OFF`/`TRACING_ON` inside a lib module source (e.g. VX_stream_buffer.sv) or simply enabling a full trace using the following command.
// Debugging the demo program with rtlsim in full tracing mode
$ CONFIGS="-DTRACING_ALL" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc.). [GTKwave](http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
## FPGA Debugging

View file

@ -31,7 +31,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
VX_dcr_bus_if.slave dcr_bus_if,
// Memory
VX_mem_bus_if.master mem_bus_if,
VX_mem_bus_if.master mem_bus_if [`L2_MEM_PORTS],
// Status
output wire busy
@ -79,7 +79,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) per_socket_mem_bus_if[`NUM_SOCKETS]();
) per_socket_mem_bus_if[`NUM_SOCKETS * `L1_MEM_PORTS]();
`RESET_RELAY (l2_reset, reset);
@ -91,6 +91,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.NUM_WAYS (`L2_NUM_WAYS),
.WORD_SIZE (L2_WORD_SIZE),
.NUM_REQS (L2_NUM_REQS),
.MEM_PORTS (`L2_MEM_PORTS),
.CRSQ_SIZE (`L2_CRSQ_SIZE),
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
@ -144,7 +145,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.dcr_bus_if (socket_dcr_bus_if),
.mem_bus_if (per_socket_mem_bus_if[socket_id]),
.mem_bus_if (per_socket_mem_bus_if[socket_id * `L1_MEM_PORTS +: `L1_MEM_PORTS]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[socket_id]),

View file

@ -270,14 +270,14 @@
///////////////////////////////////////////////////////////////////////////////
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, uuid_width) \
(uuid_width + `CLOG2(mshr_size) + `CLOG2(num_banks))
// Width of a cache's memory-side request tag: UUID bits + MSHR index bits +
// the bits selecting a bank among those sharing one memory port.
// The banks-per-port count uses a ceiling division (`CDIV) so the result matches
// VX_cache's MEM_ARB_SEL_BITS when num_banks is not a multiple of mem_ports;
// a truncating division would under-size the tag in that case.
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width) \
(uuid_width + `CLOG2(mshr_size) + `CLOG2(`CDIV(num_banks, mem_ports)))
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width) \
(`CLOG2(`CDIV(num_reqs, mem_ports)) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, uuid_width) \
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, uuid_width), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, uuid_width) \
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), `CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width)) + 1)
///////////////////////////////////////////////////////////////////////////////
@ -287,14 +287,14 @@
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, uuid_width), num_caches)
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), num_caches)
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches)
///////////////////////////////////////////////////////////////////////////////
@ -311,6 +311,7 @@
`define MEM_REQ_FLAG_LOCAL 2 // should be last since optional
`define MEM_REQ_FLAGS_WIDTH (`MEM_REQ_FLAG_LOCAL + `LMEM_ENABLED)
`define VX_MEM_PORTS `L3_MEM_PORTS
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
@ -388,7 +389,7 @@
assign src.rsp_data.tag = dst.rsp_data.tag; \
assign dst.rsp_ready = src.rsp_ready
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
`define ASSIGN_VX_MEM_BUS_IF_EX(dst, src, TD, TS, UUID) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.addr = src.req_data.addr; \
@ -397,7 +398,19 @@
assign dst.req_data.flags = src.req_data.flags; \
/* verilator lint_off GENUNNAMED */ \
if (TD != TS) begin \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
if (UUID != 0) begin \
if (TD > TS) begin \
assign dst.req_data.tag = {src.req_data.tag.uuid, {(TD-TS){1'b0}}, src.req_data.tag.value}; \
end else begin \
assign dst.req_data.tag = {src.req_data.tag.uuid, src.req_data.tag.value[TD-UUID-1:0]}; \
end \
end else begin \
if (TD > TS) begin \
assign dst.req_data.tag = {{(TD-TS){1'b0}}, src.req_data.tag}; \
end else begin \
assign dst.req_data.tag = src.req_data.tag[TD-1:0]; \
end \
end \
end else begin \
assign dst.req_data.tag = src.req_data.tag; \
end \
@ -405,7 +418,25 @@
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
/* verilator lint_off GENUNNAMED */ \
if (TD != TS) begin \
if (UUID != 0) begin \
if (TD > TS) begin \
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, dst.rsp_data.tag.value[TS-UUID-1:0]}; \
end else begin \
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, {(TS-TD){1'b0}}, dst.rsp_data.tag.value}; \
end \
end else begin \
if (TD > TS) begin \
assign src.rsp_data.tag = dst.rsp_data.tag[TS-1:0]; \
end else begin \
assign src.rsp_data.tag = {{(TS-TD){1'b0}}, dst.rsp_data.tag}; \
end \
end \
end else begin \
assign src.rsp_data.tag = dst.rsp_data.tag; \
end \
/* verilator lint_on GENUNNAMED */ \
assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, ena, latency) \

View file

@ -166,9 +166,9 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES, `UUID_WIDTH);
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, 1, `NUM_ICACHES, `UUID_WIDTH);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, 1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
@ -180,7 +180,7 @@ package VX_gpu_pkg;
// Block size in bytes
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
// Input request size
// Input request size (using coalesced memory blocks)
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
@ -197,26 +197,27 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES, `UUID_WIDTH);
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES, `UUID_WIDTH);
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
// arbitrate between icache and dcache
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
/////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
// Word size in bytes
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
// Input request size
localparam L2_NUM_REQS = `NUM_SOCKETS;
localparam L2_NUM_REQS = `NUM_SOCKETS * `L1_MEM_PORTS;
// Core request tag bits
localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH;
@ -226,9 +227,9 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef L2_ENABLE
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH, `UUID_WIDTH);
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH, `UUID_WIDTH);
`else
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif
/////////////////////////////// L3 Parameters /////////////////////////////
@ -237,7 +238,7 @@ package VX_gpu_pkg;
localparam L3_WORD_SIZE = `L2_LINE_SIZE;
// Input request size
localparam L3_NUM_REQS = `NUM_CLUSTERS;
localparam L3_NUM_REQS = `NUM_CLUSTERS * `L2_MEM_PORTS;
// Core request tag bits
localparam L3_TAG_WIDTH = L2_MEM_TAG_WIDTH;
@ -247,9 +248,9 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef L3_ENABLE
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH, `UUID_WIDTH);
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH, `UUID_WIDTH);
`else
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif
/////////////////////////////// Issue parameters //////////////////////////

View file

@ -25,11 +25,9 @@
`ifdef SIMULATION
`define STATIC_ASSERT(cond, msg) \
generate \
/* verilator lint_off GENUNNAMED */ \
if (!(cond)) $error msg; \
/* verilator lint_on GENUNNAMED */ \
endgenerate
`define ERROR(msg) \
$error msg
@ -103,7 +101,7 @@ endgenerate
`define UNUSED_VAR(x) /* verilator lint_off GENUNNAMED */ \
if (1) begin \
/* verilator lint_off UNUSED */ \
wire [$bits(x)-1:0] __x = x; \
wire [$bits(x)-1:0] __unused = x; \
/* verilator lint_on UNUSED */ \
end \
/* verilator lint_on GENUNNAMED */

View file

@ -31,7 +31,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_dcr_bus_if.slave dcr_bus_if,
// Memory
VX_mem_bus_if.master mem_bus_if,
VX_mem_bus_if.master mem_bus_if [`L1_MEM_PORTS],
`ifdef GBAR_ENABLE
// Barrier
@ -80,7 +80,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if();
) icache_mem_bus_if[1]();
`RESET_RELAY (icache_reset, reset);
@ -95,6 +95,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.NUM_WAYS (`ICACHE_NUM_WAYS),
.WORD_SIZE (ICACHE_WORD_SIZE),
.NUM_REQS (1),
.MEM_PORTS (1),
.CRSQ_SIZE (`ICACHE_CRSQ_SIZE),
.MSHR_SIZE (`ICACHE_MSHR_SIZE),
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
@ -127,7 +128,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if();
) dcache_mem_bus_if[`L1_MEM_PORTS]();
`RESET_RELAY (dcache_reset, reset);
@ -142,6 +143,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.NUM_WAYS (`DCACHE_NUM_WAYS),
.WORD_SIZE (DCACHE_WORD_SIZE),
.NUM_REQS (DCACHE_NUM_REQS),
.MEM_PORTS (`L1_MEM_PORTS),
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
@ -168,35 +170,47 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) l1_mem_bus_if[2]();
for (genvar i = 0; i < `L1_MEM_PORTS; ++i) begin : g_mem_bus_if
if (i == 0) begin : g_i0
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) l1_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("P"), // prioritize the icache
.REQ_OUT_BUF(3),
.RSP_OUT_BUF(3)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("P"), // prioritize the icache
.REQ_OUT_BUF(3),
.RSP_OUT_BUF(3)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[0], l1_mem_arb_bus_if[0]);
end else begin : g_i
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if();
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_arb_bus_if, dcache_mem_bus_if[i], L1_MEM_ARB_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], l1_mem_arb_bus_if);
end
end
///////////////////////////////////////////////////////////////////////////

View file

@ -21,19 +21,19 @@ module Vortex import VX_gpu_pkg::*; (
input wire reset,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// One valid bit per memory port. Declared as [N] (i.e. [0:N-1]) like the sibling
// ports: unpacked arrays connect left-to-right, so a [N-1:0] range here would
// reverse the port order relative to mem_req_ready and the other signals.
output wire mem_req_valid [`VX_MEM_PORTS],
output wire mem_req_rw [`VX_MEM_PORTS],
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS],
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS],
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS],
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS],
input wire mem_req_ready [`VX_MEM_PORTS],
// Memory response
input wire mem_rsp_valid,
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
input wire mem_rsp_valid [`VX_MEM_PORTS],
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS],
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS],
output wire mem_rsp_ready [`VX_MEM_PORTS],
// DCR write request
input wire dcr_wr_valid,
@ -60,12 +60,12 @@ module Vortex import VX_gpu_pkg::*; (
VX_mem_bus_if #(
.DATA_SIZE (`L2_LINE_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
) per_cluster_mem_bus_if[`NUM_CLUSTERS * `L2_MEM_PORTS]();
VX_mem_bus_if #(
.DATA_SIZE (`L3_LINE_SIZE),
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
) mem_bus_if();
) mem_bus_if[`L3_MEM_PORTS]();
`RESET_RELAY (l3_reset, reset);
@ -77,6 +77,7 @@ module Vortex import VX_gpu_pkg::*; (
.NUM_WAYS (`L3_NUM_WAYS),
.WORD_SIZE (L3_WORD_SIZE),
.NUM_REQS (L3_NUM_REQS),
.MEM_PORTS (`L3_MEM_PORTS),
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
@ -104,24 +105,21 @@ module Vortex import VX_gpu_pkg::*; (
.mem_bus_if (mem_bus_if)
);
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen= mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.flags)
for (genvar i = 0; i < `L3_MEM_PORTS; ++i) begin : g_mem_bus_if
assign mem_req_valid[i] = mem_bus_if[i].req_valid;
assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
assign mem_req_byteen[i]= mem_bus_if[i].req_data.byteen;
assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
assign mem_req_data[i] = mem_bus_if[i].req_data.data;
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
`UNUSED_VAR (mem_bus_if[i].req_data.flags)
assign mem_bus_if[i].req_ready = mem_req_ready[i];
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
end
VX_dcr_bus_if dcr_bus_if();
assign dcr_bus_if.write_valid = dcr_wr_valid;
@ -153,7 +151,7 @@ module Vortex import VX_gpu_pkg::*; (
.dcr_bus_if (cluster_dcr_bus_if),
.mem_bus_if (per_cluster_mem_bus_if[cluster_id]),
.mem_bus_if (per_cluster_mem_bus_if[cluster_id * `L2_MEM_PORTS +: `L2_MEM_PORTS]),
.busy (per_cluster_busy[cluster_id])
);
@ -163,6 +161,26 @@ module Vortex import VX_gpu_pkg::*; (
`ifdef PERF_ENABLE
localparam MEM_PORTS_CTR_W = `CLOG2(`VX_MEM_PORTS+1);
wire [`VX_MEM_PORTS-1:0] mem_req_fire, mem_rsp_fire;
wire [`VX_MEM_PORTS-1:0] mem_rd_req_fire, mem_wr_req_fire;
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_perf_ctrs
assign mem_req_fire[i] = mem_req_valid[i] & mem_req_ready[i];
assign mem_rsp_fire[i] = mem_rsp_valid[i] & mem_rsp_ready[i];
assign mem_rd_req_fire[i] = mem_req_fire[i] & ~mem_req_rw[i];
assign mem_wr_req_fire[i] = mem_req_fire[i] & mem_req_rw[i];
end
wire [MEM_PORTS_CTR_W-1:0] perf_mem_reads_per_cycle;
wire [MEM_PORTS_CTR_W-1:0] perf_mem_writes_per_cycle;
wire [MEM_PORTS_CTR_W-1:0] perf_mem_rsps_per_cycle;
`POP_COUNT(perf_mem_reads_per_cycle, mem_rd_req_fire);
`POP_COUNT(perf_mem_writes_per_cycle, mem_wr_req_fire);
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
@ -171,19 +189,16 @@ module Vortex import VX_gpu_pkg::*; (
perf_mem_pending_reads <= '0;
end else begin
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
`PERF_CTR_BITS'($signed((MEM_PORTS_CTR_W+1)'(perf_mem_reads_per_cycle) - (MEM_PORTS_CTR_W+1)'(perf_mem_rsps_per_cycle)));
end
end
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
always @(posedge clk) begin
if (reset) begin
mem_perf <= '0;
end else begin
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(perf_mem_reads_per_cycle);
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(perf_mem_writes_per_cycle);
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end
end
@ -198,19 +213,18 @@ module Vortex import VX_gpu_pkg::*; (
end
`ifdef DBG_TRACE_MEM
wire [`UUID_WIDTH-1:0] mem_req_uuid = mem_req_tag[`VX_MEM_TAG_WIDTH-1 -: `UUID_WIDTH];
wire [`UUID_WIDTH-1:0] mem_rsp_uuid = mem_rsp_tag[`VX_MEM_TAG_WIDTH-1 -: `UUID_WIDTH];
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(2, ("%t: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data, mem_req_uuid))
end else begin
`TRACE(2, ("%t: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_uuid))
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_trace
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: MEM Wr Req[%0d]: addr=0x%0h, byteen=0x%h data=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: MEM Rd Req[%0d]: addr=0x%0h, byteen=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end
end
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: MEM Rd Rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n", $time, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end
end
if (mem_rsp_fire) begin
`TRACE(2, ("%t: MEM Rd Rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, mem_rsp_tag, mem_rsp_data, mem_rsp_uuid))
end
end
`endif

View file

@ -16,6 +16,7 @@
`else
`include "vortex_afu.vh"
`endif
`include "VX_define.vh"
`ifndef PLATFORM_MEMORY_INTERLEAVE

View file

@ -19,6 +19,9 @@ module VX_cache import VX_gpu_pkg::*; #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 32768,
// Size of line inside a bank in bytes
@ -75,17 +78,18 @@ module VX_cache import VX_gpu_pkg::*; #(
input wire reset,
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
VX_mem_bus_if.master mem_bus_if
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2"))
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable"))
`STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback"))
`STATIC_ASSERT(NUM_BANKS >= MEM_PORTS, ("invalid parameter: number of banks must be greater or equal to number of memory ports"))
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, UUID_WIDTH);
localparam MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WORD_WIDTH = WORD_SIZE * 8;
localparam WORD_SEL_BITS = `CLOG2(WORDS_PER_LINE);
@ -95,6 +99,11 @@ module VX_cache import VX_gpu_pkg::*; #(
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + `UP(FLAGS_WIDTH);
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
localparam BANK_MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH;
localparam MEM_REQ_DATAW = (`CS_LINE_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH));
localparam MEM_RSP_DATAW = `CS_LINE_WIDTH + MEM_TAG_WIDTH;
localparam MEM_PORTS_SEL_BITS = `CLOG2(MEM_PORTS);
localparam MEM_ARB_SEL_BITS = `CLOG2(`CDIV(NUM_BANKS, MEM_PORTS));
localparam MEM_ARB_SEL_WIDTH = `UP(MEM_ARB_SEL_BITS);
localparam CORE_RSP_REG_DISABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_REG_DISABLE = (NUM_BANKS != 1);
@ -135,113 +144,97 @@ module VX_cache import VX_gpu_pkg::*; #(
.flush_end (per_bank_flush_end)
);
///////////////////////////////////////////////////////////////////////////
// Core response buffering
wire [NUM_REQS-1:0] core_rsp_valid_s;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE (CORE_RSP_REG_DISABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
);
end
///////////////////////////////////////////////////////////////////////////
// Memory response gather /////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_tmp_if();
) mem_bus_tmp_if[MEM_PORTS]();
// Memory response buffering
wire [MEM_PORTS-1:0] mem_rsp_queue_valid;
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-1:0] mem_rsp_queue_data;
wire [MEM_PORTS-1:0] mem_rsp_queue_ready;
wire mem_rsp_valid_s;
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
VX_elastic_buffer #(
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.valid_in (mem_bus_tmp_if.rsp_valid),
.ready_in (mem_bus_tmp_if.rsp_ready),
.data_in ({mem_bus_tmp_if.rsp_data.tag, mem_bus_tmp_if.rsp_data.data}),
.data_out ({mem_rsp_tag_s, mem_rsp_data_s}),
.valid_out (mem_rsp_valid_s),
.ready_out (mem_rsp_ready_s)
);
wire [BANK_MEM_TAG_WIDTH-1:0] bank_mem_rsp_tag;
wire [`UP(`CS_BANK_SEL_BITS)-1:0] mem_rsp_bank_id;
if (NUM_BANKS > 1) begin : g_mem_rsp_tag_s_with_banks
assign bank_mem_rsp_tag = mem_rsp_tag_s[MEM_TAG_WIDTH-1:`CS_BANK_SEL_BITS];
assign mem_rsp_bank_id = mem_rsp_tag_s[`CS_BANK_SEL_BITS-1:0];
end else begin : g_mem_rsp_tag_s_no_bank
assign bank_mem_rsp_tag = mem_rsp_tag_s;
assign mem_rsp_bank_id = 0;
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue
VX_elastic_buffer #(
.DATAW (MEM_RSP_DATAW),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.valid_in (mem_bus_tmp_if[i].rsp_valid),
.data_in (mem_bus_tmp_if[i].rsp_data),
.ready_in (mem_bus_tmp_if[i].rsp_ready),
.valid_out (mem_rsp_queue_valid[i]),
.data_out (mem_rsp_queue_data[i]),
.ready_out (mem_rsp_queue_ready[i])
);
end
// Memory request buffering
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] mem_rsp_queue_data_s;
wire [MEM_PORTS-1:0][BANK_SEL_WIDTH-1:0] mem_rsp_queue_sel;
wire mem_req_valid;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr;
wire mem_req_rw;
wire [LINE_SIZE-1:0] mem_req_byteen;
wire [`CS_LINE_WIDTH-1:0] mem_req_data;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag;
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags;
wire mem_req_ready;
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_data_s
wire [BANK_MEM_TAG_WIDTH-1:0] mem_rsp_tag_s = mem_rsp_queue_data[i][MEM_TAG_WIDTH-1:MEM_ARB_SEL_BITS];
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s = mem_rsp_queue_data[i][MEM_RSP_DATAW-1:MEM_TAG_WIDTH];
assign mem_rsp_queue_data_s[i] = {mem_rsp_data_s, mem_rsp_tag_s};
end
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flush_b;
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_sel
if (NUM_BANKS > 1) begin : g_multibanks
if (MEM_ARB_SEL_BITS != 0) begin : g_arb_sel
VX_bits_concat #(
.L (MEM_ARB_SEL_BITS),
.R (MEM_PORTS_SEL_BITS)
) mem_rsp_sel_concat (
.left_in (mem_rsp_queue_data[i][MEM_ARB_SEL_BITS-1:0]),
.right_in (MEM_PORTS_SEL_BITS'(i)),
.data_out (mem_rsp_queue_sel[i])
);
end else begin : g_no_arb_sel
assign mem_rsp_queue_sel[i] = MEM_PORTS_SEL_BITS'(i);
end
end else begin : g_singlebank
assign mem_rsp_queue_sel[i] = 0;
end
end
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
.SIZE (MEM_REQ_REG_DISABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
wire [NUM_BANKS-1:0] per_bank_mem_rsp_valid;
wire [NUM_BANKS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] per_bank_mem_rsp_pdata;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
VX_stream_omega #(
.NUM_INPUTS (MEM_PORTS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (MEM_RSP_DATAW-MEM_ARB_SEL_BITS),
.ARBITER ("R"),
.OUT_BUF (3)
) mem_rsp_xbar (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid),
.ready_in (mem_req_ready),
.data_in ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data, mem_req_tag, mem_req_flags}),
.data_out ({mem_bus_tmp_if.req_data.rw, mem_bus_tmp_if.req_data.byteen, mem_bus_tmp_if.req_data.addr, mem_bus_tmp_if.req_data.data, mem_bus_tmp_if.req_data.tag, mem_req_flush_b}),
.valid_out (mem_bus_tmp_if.req_valid),
.ready_out (mem_bus_tmp_if.req_ready)
.valid_in (mem_rsp_queue_valid),
.data_in (mem_rsp_queue_data_s),
.sel_in (mem_rsp_queue_sel),
.ready_in (mem_rsp_queue_ready),
.valid_out (per_bank_mem_rsp_valid),
.data_out (per_bank_mem_rsp_pdata),
`UNUSED_PIN (sel_out),
.ready_out (per_bank_mem_rsp_ready),
`UNUSED_PIN (collisions)
);
if (FLAGS_WIDTH != 0) begin : g_mem_req_flags
assign mem_bus_tmp_if.req_data.flags = mem_req_flush_b;
end else begin : g_no_mem_req_flags
assign mem_bus_tmp_if.req_data.flags = '0;
`UNUSED_VAR (mem_req_flush_b)
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_rsp_data;
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_rsp_tag;
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_rsp_data
assign {
per_bank_mem_rsp_data[i],
per_bank_mem_rsp_tag[i]
} = per_bank_mem_rsp_pdata[i];
end
if (WRITE_ENABLE) begin : g_mem_bus_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if);
end else begin : g_mem_bus_if_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if, mem_bus_tmp_if);
end
///////////////////////////////////////////////////////////////////////////
// Core requests dispatch /////////////////////////////////////////////////
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
@ -261,7 +254,7 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
@ -269,14 +262,6 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_mem_req_flags;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
assign mem_rsp_ready_s = per_bank_mem_rsp_ready[mem_rsp_bank_id];
// Bank requests dispatch
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
@ -336,6 +321,8 @@ module VX_cache import VX_gpu_pkg::*; #(
};
end
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
`ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_collisions;
`endif
@ -377,12 +364,9 @@ module VX_cache import VX_gpu_pkg::*; #(
} = core_req_data_out[i];
end
// Banks access
// Banks access ///////////////////////////////////////////////////////////
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : g_banks
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire curr_bank_mem_rsp_valid = mem_rsp_valid_s && (mem_rsp_bank_id == bank_id);
VX_cache_bank #(
.BANK_ID (bank_id),
.INSTANCE_ID (`SFORMATF(("%s-bank%0d", INSTANCE_ID, bank_id))),
@ -409,9 +393,9 @@ module VX_cache import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.perf_read_misses (perf_read_miss_per_bank[bank_id]),
.perf_write_misses (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]),
.perf_read_miss (perf_read_miss_per_bank[bank_id]),
.perf_write_miss (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stall (perf_mshr_stall_per_bank[bank_id]),
`endif
// Core request
@ -435,7 +419,7 @@ module VX_cache import VX_gpu_pkg::*; #(
// Memory request
.mem_req_valid (per_bank_mem_req_valid[bank_id]),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_addr (per_bank_mem_req_addr[bank_id]),
.mem_req_rw (per_bank_mem_req_rw[bank_id]),
.mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
.mem_req_data (per_bank_mem_req_data[bank_id]),
@ -444,9 +428,9 @@ module VX_cache import VX_gpu_pkg::*; #(
.mem_req_ready (per_bank_mem_req_ready[bank_id]),
// Memory response
.mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_data (mem_rsp_data_s),
.mem_rsp_tag (bank_mem_rsp_tag),
.mem_rsp_valid (per_bank_mem_rsp_valid[bank_id]),
.mem_rsp_data (per_bank_mem_rsp_data[bank_id]),
.mem_rsp_tag (per_bank_mem_rsp_tag[bank_id]),
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
// Flush request
@ -454,19 +438,18 @@ module VX_cache import VX_gpu_pkg::*; #(
.flush_uuid (flush_uuid),
.flush_end (per_bank_flush_end[bank_id])
);
if (NUM_BANKS == 1) begin : g_per_bank_mem_req_addr_multibanks
assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr;
end else begin : g_per_bank_mem_req_addr_singlebank
assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id);
end
end
// Bank responses gather
// Core responses gather //////////////////////////////////////////////////
wire [NUM_BANKS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_in;
wire [NUM_REQS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_out;
wire [NUM_REQS-1:0] core_rsp_valid_s;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_rsp_data_in
assign core_rsp_data_in[i] = {per_bank_core_rsp_data[i], per_bank_core_rsp_tag[i]};
end
@ -494,77 +477,166 @@ module VX_cache import VX_gpu_pkg::*; #(
assign {core_rsp_data_s[i], core_rsp_tag_s[i]} = core_rsp_data_out[i];
end
// Memory request arbitration
// Core response output stage: one elastic buffer per core request lane.
// Each buffer takes the gathered response for lane i (word data + tag) and
// drives the response side of core_bus2_if[i] using valid/ready handshakes
// on both ends.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
VX_elastic_buffer #(
// payload = response word followed by its tag (same order as data_in below)
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
// depth is derived from CORE_OUT_BUF only when CORE_RSP_REG_DISABLE is set,
// otherwise SIZE is 0 (presumably a pass-through; confirm in VX_elastic_buffer)
.SIZE (CORE_RSP_REG_DISABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
);
end
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH))-1:0] data_in;
// Memory request arbitration /////////////////////////////////////////////
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_data_in
assign data_in[i] = {
per_bank_mem_req_addr[i],
wire [NUM_BANKS-1:0][MEM_REQ_DATAW-1:0] per_bank_mem_req_pdata;
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_req_pdata
assign per_bank_mem_req_pdata[i] = {
per_bank_mem_req_rw[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_addr[i],
per_bank_mem_req_data[i],
per_bank_mem_req_tag[i],
per_bank_mem_req_flags[i]
per_bank_mem_req_byteen[i],
per_bank_mem_req_flags[i],
per_bank_mem_req_tag[i]
};
end
wire [BANK_MEM_TAG_WIDTH-1:0] bank_mem_req_tag;
wire [MEM_PORTS-1:0] mem_req_valid;
wire [MEM_PORTS-1:0][MEM_REQ_DATAW-1:0] mem_req_pdata;
wire [MEM_PORTS-1:0] mem_req_ready;
wire [MEM_PORTS-1:0][MEM_ARB_SEL_WIDTH-1:0] mem_req_sel_out;
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
.NUM_OUTPUTS(MEM_PORTS),
.DATAW (MEM_REQ_DATAW),
.ARBITER ("R")
) mem_req_arb (
.clk (clk),
.reset (reset),
.valid_in (per_bank_mem_req_valid),
.data_in (per_bank_mem_req_pdata),
.ready_in (per_bank_mem_req_ready),
.data_in (data_in),
.data_out ({mem_req_addr, mem_req_rw, mem_req_byteen, mem_req_data, bank_mem_req_tag, mem_req_flags}),
.valid_out (mem_req_valid),
.data_out (mem_req_pdata),
.ready_out (mem_req_ready),
`UNUSED_PIN (sel_out)
.sel_out (mem_req_sel_out)
);
if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id = `CS_MEM_ADDR_TO_BANK_ID(mem_req_addr);
assign mem_req_tag = MEM_TAG_WIDTH'({bank_mem_req_tag, mem_req_bank_id});
end else begin : g_mem_req_tag
assign mem_req_tag = MEM_TAG_WIDTH'(bank_mem_req_tag);
// Per-memory-port request path. For each of the MEM_PORTS arbiter outputs:
//   1. unpack the arbitrated bank request word,
//   2. widen the line address and the bank tag to memory-level widths,
//   3. buffer the request onto mem_bus_tmp_if[i],
//   4. forward mem_bus_tmp_if[i] to the external mem_bus_if[i].
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_req_buf
wire mem_req_rw;
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr;
wire [`CS_LINE_WIDTH-1:0] mem_req_data;
wire [LINE_SIZE-1:0] mem_req_byteen;
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags;
wire [BANK_MEM_TAG_WIDTH-1:0] mem_req_tag;
// Unpack the arbiter output word: rw is the MSB and the tag sits in the LSBs.
// This field order must stay identical to the order used when the
// per-bank request words fed to the arbiter are packed.
assign {
mem_req_rw,
mem_req_addr,
mem_req_data,
mem_req_byteen,
mem_req_flags,
mem_req_tag
} = mem_req_pdata[i];
// Memory-level (widened) versions of address and tag, plus the flags as they
// come out of the request buffer.
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_w;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_w;
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags_w;
if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks
if (MEM_ARB_SEL_BITS != 0) begin : g_arb_sel
// The arbiter produces select bits for this port: the bank id is built from
// the arbiter select (left) and this port's index (right).
// NOTE(review): presumably data_out = {left_in, right_in}; confirm in VX_bits_concat.
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id;
VX_bits_concat #(
.L (MEM_ARB_SEL_BITS),
.R (MEM_PORTS_SEL_BITS)
) bank_id_concat (
.left_in (mem_req_sel_out[i]),
.right_in (MEM_PORTS_SEL_BITS'(i)),
.data_out (mem_req_bank_id)
);
// Memory address = line address with the bank id appended as low-order bits.
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, mem_req_bank_id});
// The arbiter select is appended to the low bits of the tag (presumably so the
// response can be steered back to the originating bank; verify on the response path).
assign mem_req_tag_w = {mem_req_tag, mem_req_sel_out[i]};
end else begin : g_no_arb_sel
// No arbiter select bits: the port index alone supplies the low address bits
// and the bank tag is only cast to the memory tag width.
`UNUSED_VAR (mem_req_sel_out)
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, MEM_PORTS_SEL_BITS'(i)});
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
end
end else begin : g_mem_req_tag
// Single bank: address and tag are passed through with a width cast only.
`UNUSED_VAR (mem_req_sel_out)
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'(mem_req_addr);
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
end
// Request output buffer. DATAW lists the field widths in the same order as the
// data_in/data_out concatenations: rw, byteen, addr, data, tag, flags.
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
.SIZE (MEM_REQ_REG_DISABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid[i]),
.ready_in (mem_req_ready[i]),
.data_in ({mem_req_rw, mem_req_byteen, mem_req_addr_w, mem_req_data, mem_req_tag_w, mem_req_flags}),
.data_out ({mem_bus_tmp_if[i].req_data.rw, mem_bus_tmp_if[i].req_data.byteen, mem_bus_tmp_if[i].req_data.addr, mem_bus_tmp_if[i].req_data.data, mem_bus_tmp_if[i].req_data.tag, mem_req_flags_w}),
.valid_out (mem_bus_tmp_if[i].req_valid),
.ready_out (mem_bus_tmp_if[i].req_ready)
);
// Flags are buffered through mem_req_flags_w rather than written straight to
// the interface so that they can be forced to zero when FLAGS_WIDTH is 0.
if (FLAGS_WIDTH != 0) begin : g_mem_req_flags
assign mem_bus_tmp_if[i].req_data.flags = mem_req_flags_w;
end else begin : g_no_mem_req_flags
assign mem_bus_tmp_if[i].req_data.flags = '0;
`UNUSED_VAR (mem_req_flags_w)
end
// Connect the internal interface to the external memory port; a different
// assignment macro is used when the cache is not write-enabled.
// NOTE(review): the exact difference between the two macros is defined
// elsewhere; confirm what the RO variant ties off.
if (WRITE_ENABLE) begin : g_mem_bus_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end else begin : g_mem_bus_if_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end
end
`ifdef PERF_ENABLE
// per cycle: core_reads, core_writes
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
end
// per cycle: read misses, write misses, mshr stalls, pipeline stalls
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
end
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
wire perf_mem_stall_per_cycle = mem_bus_if.req_valid && ~mem_bus_if.req_ready;
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;

View file

@ -74,9 +74,9 @@ module VX_cache_bank #(
input wire reset,
`ifdef PERF_ENABLE
output wire perf_read_misses,
output wire perf_write_misses,
output wire perf_mshr_stalls,
output wire perf_read_miss,
output wire perf_write_miss,
output wire perf_mshr_stall,
`endif
// Core Request
@ -682,9 +682,9 @@ module VX_cache_bank #(
///////////////////////////////////////////////////////////////////////////////
`ifdef PERF_ENABLE
assign perf_read_misses = do_read_st1 && ~is_hit_st1;
assign perf_write_misses = do_write_st1 && ~is_hit_st1;
assign perf_mshr_stalls = mshr_alm_full;
assign perf_read_miss = do_read_st1 && ~is_hit_st1;
assign perf_write_miss = do_write_st1 && ~is_hit_st1;
assign perf_mshr_stall = mshr_alm_full;
`endif
`ifdef DBG_TRACE_CACHE

View file

@ -15,6 +15,7 @@
module VX_cache_bypass #(
parameter NUM_REQS = 1,
parameter MEM_PORTS = 1,
parameter TAG_SEL_IDX = 0,
parameter PASSTHRU = 0,
@ -29,14 +30,11 @@ module VX_cache_bypass #(
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1,
parameter UUID_WIDTH = 0,
parameter CORE_OUT_BUF = 0,
parameter MEM_OUT_BUF = 0,
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
parameter MEM_OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -48,296 +46,223 @@ module VX_cache_bypass #(
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
// Memory request in
VX_mem_bus_if.slave mem_bus_in_if,
VX_mem_bus_if.slave mem_bus_in_if [MEM_PORTS],
// Memory request out
VX_mem_bus_if.master mem_bus_out_if
VX_mem_bus_if.master mem_bus_out_if [MEM_PORTS]
);
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
localparam CORE_DATA_WIDTH = WORD_SIZE * 8;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH;
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
localparam CORE_TAG_ID_WIDTH = CORE_TAG_WIDTH - UUID_WIDTH;
localparam MEM_TAG_ID_WIDTH = `CLOG2(NUM_REQS / MEM_PORTS) + CORE_TAG_ID_WIDTH;
localparam MEM_TAG_NC1_WIDTH = UUID_WIDTH + MEM_TAG_ID_WIDTH;
localparam MEM_TAG_NC2_WIDTH = WSEL_BITS + MEM_TAG_NC1_WIDTH;
localparam MEM_TAG_OUT_WIDTH = `MAX(MEM_TAG_IN_WIDTH, MEM_TAG_NC2_WIDTH);
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
// handle core requests ///////////////////////////////////////////////////
// handle non-cacheable core request switch ///////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH)
) core_bus_nc_switch_if[2 * NUM_REQS]();
wire core_req_nc_valid;
wire [NUM_REQS-1:0] core_req_nc_valids;
wire [NUM_REQS-1:0] core_req_nc_idxs;
wire [REQ_SEL_WIDTH-1:0] core_req_nc_idx;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_nc
if (PASSTHRU != 0) begin : g_passthru
assign core_req_nc_idxs[i] = 1'b1;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_is_nc
if (PASSTHRU) begin : g_passthru
assign core_req_nc_sel[i] = 1'b1;
end else if (NC_ENABLE) begin : g_nc
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO];
assign core_req_nc_sel[i] = core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO];
end else begin : g_no_nc
assign core_req_nc_idxs[i] = 1'b0;
assign core_req_nc_sel[i] = 1'b0;
end
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
end
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P")
) core_req_nc_arb (
.clk (clk),
.reset (reset),
.requests (core_req_nc_valids),
.grant_index (core_req_nc_idx),
.grant_onehot (core_req_nc_sel),
.grant_valid (core_req_nc_valid),
.grant_ready (core_req_nc_ready)
VX_mem_switch #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS (2 * NUM_REQS),
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH),
.ARBITER ("R"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(CORE_OUT_BUF))
) core_bus_nc_switch (
.clk (clk),
.reset (reset),
.bus_sel (core_req_nc_sel),
.bus_in_if (core_bus_in_if),
.bus_out_if(core_bus_nc_switch_if)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_if
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
: core_bus_out_if[i].req_ready;
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH)
) core_bus_in_nc_if[NUM_REQS]();
// Wire the first NUM_REQS outputs of the request switch (indices
// 0*NUM_REQS + i) to core_bus_out_if[i]. Request valid/data flow from the
// switch to core_bus_out_if; request ready and the whole response channel
// flow back the other way.
// NOTE(review): presumably this is the group selected when core_req_nc_sel[i]
// is 0 (the cacheable path); confirm against VX_mem_switch.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_in_cs
assign core_bus_out_if[i].req_valid = core_bus_nc_switch_if[0 * NUM_REQS + i].req_valid;
assign core_bus_out_if[i].req_data = core_bus_nc_switch_if[0 * NUM_REQS + i].req_data;
assign core_bus_nc_switch_if[0 * NUM_REQS + i].req_ready = core_bus_out_if[i].req_ready;
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_valid = core_bus_out_if[i].rsp_valid;
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_data = core_bus_out_if[i].rsp_data;
assign core_bus_out_if[i].rsp_ready = core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_ready;
end
// Wire the second NUM_REQS outputs of the request switch (indices
// 1*NUM_REQS + i) to the internal core_bus_in_nc_if[i] interfaces. Request
// valid/data flow from the switch into core_bus_in_nc_if; request ready and
// the response channel flow back to the switch.
// NOTE(review): presumably this is the group selected when core_req_nc_sel[i]
// is 1 (the non-cacheable / bypass path); confirm against VX_mem_switch.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_in_nc
assign core_bus_in_nc_if[i].req_valid = core_bus_nc_switch_if[1 * NUM_REQS + i].req_valid;
assign core_bus_in_nc_if[i].req_data = core_bus_nc_switch_if[1 * NUM_REQS + i].req_data;
assign core_bus_nc_switch_if[1 * NUM_REQS + i].req_ready = core_bus_in_nc_if[i].req_ready;
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_valid = core_bus_in_nc_if[i].rsp_valid;
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_data = core_bus_in_nc_if[i].rsp_data;
assign core_bus_in_nc_if[i].rsp_ready = core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_ready;
end
// handle memory requests /////////////////////////////////////////////////
wire mem_req_out_valid;
wire mem_req_out_rw;
wire [LINE_SIZE-1:0] mem_req_out_byteen;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
wire [`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_out_flags;
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
wire mem_req_out_ready;
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (MEM_TAG_NC1_WIDTH)
) core_bus_nc_arb_if[MEM_PORTS]();
wire core_req_nc_sel_rw;
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_sel_flags;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
VX_mem_arb #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS(MEM_PORTS),
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH),
.TAG_SEL_IDX(TAG_SEL_IDX),
.ARBITER (PASSTHRU ? "R" : "P"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(0)
) core_bus_nc_arb (
.clk (clk),
.reset (reset),
.bus_in_if (core_bus_in_nc_if),
.bus_out_if (core_bus_nc_arb_if)
);
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_nc_mux_in
assign core_req_nc_mux_in[i] = {
core_bus_in_if[i].req_data.rw,
core_bus_in_if[i].req_data.addr,
core_bus_in_if[i].req_data.data,
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.flags,
core_bus_in_if[i].req_data.tag
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_NC2_WIDTH)
) mem_bus_out_nc_if[MEM_PORTS]();
// Word-to-line adaptation for bypass (non-cached) traffic, one instance per
// memory port. Converts the word-sized request on core_bus_nc_arb_if[i] into
// a line-sized request on mem_bus_out_nc_if[i], and converts the line-sized
// response back into a word-sized response.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_nc
wire core_req_nc_arb_rw;
wire [WORD_SIZE-1:0] core_req_nc_arb_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_arb_addr;
wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_arb_flags;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_arb_data;
wire [MEM_TAG_NC1_WIDTH-1:0] core_req_nc_arb_tag;
// Unpack the arbitrated word request. The order here relies on the field
// order of the interface's req_data (rw first, tag last).
assign {
core_req_nc_arb_rw,
core_req_nc_arb_addr,
core_req_nc_arb_data,
core_req_nc_arb_byteen,
core_req_nc_arb_flags,
core_req_nc_arb_tag
} = core_bus_nc_arb_if[i].req_data;
// Line-level request fields (suffix _w) and word-level response fields.
logic [MEM_ADDR_WIDTH-1:0] core_req_nc_arb_addr_w;
logic [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] core_req_nc_arb_byteen_w;
logic [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] core_req_nc_arb_data_w;
logic [CORE_DATA_WIDTH-1:0] core_rsp_nc_arb_data_w;
wire [MEM_TAG_NC2_WIDTH-1:0] core_req_nc_arb_tag_w;
wire [MEM_TAG_NC1_WIDTH-1:0] core_rsp_nc_arb_tag_w;
if (PASSTHRU || NC_ENABLE) begin : g_mem_req_out_tag_nc
if (WORDS_PER_LINE > 1) begin : g_multi_word_line
wire [WSEL_BITS-1:0] rsp_wsel;
// The low WSEL_BITS of the word address select the word slot within the line.
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_arb_addr[WSEL_BITS-1:0];
// Place the word in its slot: byte enables are zero for every other slot,
// and data for the other slots is left as don't-care ('x).
always @(*) begin
core_req_nc_arb_byteen_w = '0;
core_req_nc_arb_byteen_w[req_wsel] = core_req_nc_arb_byteen;
core_req_nc_arb_data_w = 'x;
core_req_nc_arb_data_w[req_wsel] = core_req_nc_arb_data;
end
// Carry the word select inside the request tag, at bit position
// MEM_TAG_ID_WIDTH, growing the tag from NC1 to NC2 width.
// NOTE(review): exact insert semantics live in VX_bits_insert; confirm.
VX_bits_insert #(
.N (MEM_TAG_NC1_WIDTH),
.S (WSEL_BITS),
.POS (MEM_TAG_ID_WIDTH)
) wsel_insert (
.data_in (core_req_nc_arb_tag),
.ins_in (req_wsel),
.data_out (core_req_nc_arb_tag_w)
);
// Mirror operation on the response: pull the word select back out of the
// returned tag (same position) and recover the NC1-width tag.
VX_bits_remove #(
.N (MEM_TAG_NC2_WIDTH),
.S (WSEL_BITS),
.POS (MEM_TAG_ID_WIDTH)
) wsel_remove (
.data_in (mem_bus_out_nc_if[i].rsp_data.tag),
.sel_out (rsp_wsel),
.data_out (core_rsp_nc_arb_tag_w)
);
// Line address = word address with the word-select bits dropped.
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
// Response word = the slot of the returned line indicated by rsp_wsel.
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end else begin : g_single_word_line
// One word per line: every field passes straight through; tags are only
// cast between the NC1 and NC2 widths.
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr;
assign core_req_nc_arb_byteen_w = core_req_nc_arb_byteen;
assign core_req_nc_arb_data_w = core_req_nc_arb_data;
assign core_req_nc_arb_tag_w = MEM_TAG_NC2_WIDTH'(core_req_nc_arb_tag);
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data;
assign core_rsp_nc_arb_tag_w = MEM_TAG_NC1_WIDTH'(mem_bus_out_nc_if[i].rsp_data.tag);
end
end else begin : g_mem_req_out_tag
// NOTE(review): in this branch (neither PASSTHRU nor NC_ENABLE) only the
// request tag is driven; the _w address/byteen/data signals and the response
// data/tag are left undriven here. Confirm no traffic uses this path in
// that configuration.
assign core_req_nc_arb_tag_w = core_req_nc_arb_tag;
end
// Drive the line-sized request; field order matches the unpack order above.
assign mem_bus_out_nc_if[i].req_valid = core_bus_nc_arb_if[i].req_valid;
assign mem_bus_out_nc_if[i].req_data = {
core_req_nc_arb_rw,
core_req_nc_arb_addr_w,
core_req_nc_arb_data_w,
core_req_nc_arb_byteen_w,
core_req_nc_arb_flags,
core_req_nc_arb_tag_w
};
assign core_bus_nc_arb_if[i].req_ready = mem_bus_out_nc_if[i].req_ready;
// Return the word-sized response (data, then tag) with its handshake.
assign core_bus_nc_arb_if[i].rsp_valid = mem_bus_out_nc_if[i].rsp_valid;
assign core_bus_nc_arb_if[i].rsp_data = {
core_rsp_nc_arb_data_w,
core_rsp_nc_arb_tag_w
};
assign mem_bus_out_nc_if[i].rsp_ready = core_bus_nc_arb_if[i].rsp_ready;
end
assign {
core_req_nc_sel_rw,
core_req_nc_sel_addr,
core_req_nc_sel_data,
core_req_nc_sel_byteen,
core_req_nc_sel_flags,
core_req_nc_sel_tag
} = core_req_nc_mux_in[core_req_nc_idx];
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_OUT_WIDTH)
) mem_bus_out_src_if[(PASSTHRU ? 1 : 2) * MEM_PORTS]();
assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign mem_req_out_flags = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.flags : core_req_nc_sel_flags;
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
if (WORDS_PER_LINE > 1) begin : g_mem_req_multi_word_line
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_w;
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_w;
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
always @(*) begin
mem_req_byteen_in_w = '0;
mem_req_byteen_in_w[req_wsel] = core_req_nc_sel_byteen;
mem_req_data_in_w = 'x;
mem_req_data_in_w[req_wsel] = core_req_nc_sel_data;
end
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_w;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_w;
if (NUM_REQS > 1) begin : g_multiple_requests
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
end else begin : g_single_request
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
end
end else begin : g_mem_req_single_word_line
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
if (NUM_REQS > 1) begin : g_multiple_requests
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
end else begin : g_single_request
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_in_id});
// Build the source array for the output memory arbiter. Slots
// [0*MEM_PORTS + i] carry the bypass (nc) requests; when PASSTHRU is not set,
// slots [1*MEM_PORTS + i] additionally carry the incoming memory requests from
// mem_bus_in_if[i].
// NOTE(review): the _EX macro is given destination and source tag widths plus
// UUID_WIDTH, so it presumably resizes the tag to MEM_TAG_OUT_WIDTH; confirm
// in its definition.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_src
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[0 * MEM_PORTS + i], mem_bus_out_nc_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_NC2_WIDTH, UUID_WIDTH);
if (!PASSTHRU) begin : g_not_passthru
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[1 * MEM_PORTS + i], mem_bus_in_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_IN_WIDTH, UUID_WIDTH);
end
end
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
if (UUID_WIDTH != 0) begin : g_mem_req_tag_bypass_with_uuid
assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
end else begin : g_mem_req_tag_bypass
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
end
if (PASSTHRU != 0) begin : g_mem_req_out_tag_passthru
assign mem_req_out_tag = mem_req_tag_bypass;
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
end else if (NC_ENABLE) begin : g_mem_req_out_tag_nc
VX_bits_insert #(
.N (MEM_TAG_OUT_WIDTH-1),
.S (1),
.POS (TAG_SEL_IDX)
) mem_req_tag_in_nc_insert (
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
.ins_in (~mem_bus_in_if.req_valid),
.data_out (mem_req_out_tag)
);
end else begin : g_mem_req_out_tag
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
end
assign mem_bus_in_if.req_ready = mem_req_out_ready;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.SIZE (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_out_valid),
.ready_in (mem_req_out_ready),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_flags, mem_req_out_data, mem_req_out_tag}),
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.flags, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
.valid_out (mem_bus_out_if.req_valid),
.ready_out (mem_bus_out_if.req_ready)
VX_mem_arb #(
.NUM_INPUTS ((PASSTHRU ? 1 : 2) * MEM_PORTS),
.NUM_OUTPUTS(MEM_PORTS),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_OUT_WIDTH),
.ARBITER ("R"),
.REQ_OUT_BUF(DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.RSP_OUT_BUF(0)
) mem_bus_out_arb (
.clk (clk),
.reset (reset),
.bus_in_if (mem_bus_out_src_if),
.bus_out_if (mem_bus_out_if)
);
// handle core responses //////////////////////////////////////////////////
wire [NUM_REQS-1:0] core_rsp_in_valid;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data;
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag;
wire [NUM_REQS-1:0] core_rsp_in_ready;
wire is_mem_rsp_nc;
if (PASSTHRU != 0) begin : g_is_mem_rsp_nc_passthru
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
end else if (NC_ENABLE) begin : g_is_mem_rsp_nc
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
end else begin : g_is_no_mem_rsp_nc
assign is_mem_rsp_nc = 1'b0;
end
wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
VX_bits_remove #(
.N (MEM_TAG_OUT_WIDTH),
.S (NC_ENABLE),
.POS (TAG_SEL_IDX)
) mem_rsp_tag_in_nc_remove (
.data_in (mem_bus_out_if.rsp_data.tag),
.data_out (mem_rsp_tag_id_nc)
);
wire [REQ_SEL_WIDTH-1:0] rsp_idx;
if (NUM_REQS > 1) begin : g_rsp_idx
assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
end else begin : g_rsp_idx_0
assign rsp_idx = 1'b0;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_valid
assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || (is_mem_rsp_nc && rsp_idx == REQ_SEL_WIDTH'(i));
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_ready
assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_data
if (WORDS_PER_LINE > 1) begin : g_wsel
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end else begin : g_no_wsel
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
end
end
wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
if (UUID_WIDTH != 0) begin : g_mem_rsp_tag_in_nc2_uuid
assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
end else begin : g_mem_rsp_tag_in_nc2
assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_tag
if (PASSTHRU) begin : g_passthru
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
end else if (NC_ENABLE) begin : g_nc
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
end else begin : g_no_nc
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
.SIZE (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(CORE_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_in_valid[i]),
.ready_in (core_rsp_in_ready[i]),
.data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}),
.data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}),
.valid_out (core_bus_in_if[i].rsp_valid),
.ready_out (core_bus_in_if[i].rsp_ready)
);
end
// handle memory responses ////////////////////////////////////////////////
if (PASSTHRU != 0) begin : g_mem_bus_in_if_passthru
assign mem_bus_in_if.rsp_valid = 1'b0;
assign mem_bus_in_if.rsp_data.data = '0;
assign mem_bus_in_if.rsp_data.tag = '0;
end else if (NC_ENABLE) begin : g_mem_bus_in_if_nc
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
end else begin : g_mem_bus_in_if
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
end
wire [NUM_REQS-1:0] core_rsp_out_valid;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_out_valid
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
end
assign mem_bus_out_if.rsp_ready = is_mem_rsp_nc ? (~core_rsp_out_valid[rsp_idx] && core_rsp_in_ready[rsp_idx]) : mem_bus_in_if.rsp_ready;
endmodule

View file

@ -23,6 +23,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Number of requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 32768,
// Size of line inside a bank in bytes
@ -82,14 +85,16 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`endif
VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS],
VX_mem_bus_if.master mem_bus_if
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
);
localparam NUM_CACHES = `UP(NUM_UNITS);
localparam PASSTHRU = (NUM_UNITS == 0);
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH, UUID_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, UUID_WIDTH));
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH);
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
@ -101,7 +106,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) cache_mem_bus_if[NUM_CACHES]();
) cache_mem_bus_if[NUM_CACHES * MEM_PORTS]();
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
@ -153,6 +158,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
@ -176,34 +182,46 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.core_bus_if (arb_core_bus_if[i * NUM_REQS +: NUM_REQS]),
.mem_bus_if (cache_mem_bus_if[i])
.mem_bus_if (cache_mem_bus_if[i * MEM_PORTS +: MEM_PORTS])
);
end
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
) mem_bus_tmp_if[1]();
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) arb_core_bus_tmp_if[NUM_CACHES]();
VX_mem_arb #(
.NUM_INPUTS (NUM_CACHES),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_CACHES > 1) ? MEM_OUT_BUF : 0),
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if)
);
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
) mem_bus_tmp_if[1]();
if (WRITE_ENABLE) begin : g_mem_bus_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
end else begin : g_mem_bus_if_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if, mem_bus_tmp_if[0]);
for (genvar j = 0; j < NUM_CACHES; ++j) begin : g_arb_core_bus_tmp_if
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_tmp_if[j], cache_mem_bus_if[j * MEM_PORTS + i]);
end
VX_mem_arb #(
.NUM_INPUTS (NUM_CACHES),
.NUM_OUTPUTS (1),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_CACHES > 1) ? MEM_OUT_BUF : 0),
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (arb_core_bus_tmp_if),
.bus_out_if (mem_bus_tmp_if)
);
if (WRITE_ENABLE) begin : g_we
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
end else begin : g_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
end
end
endmodule

View file

@ -55,10 +55,6 @@
///////////////////////////////////////////////////////////////////////////////
`define CS_LINE_TO_MEM_ADDR(x, i) {x, `CS_BANK_SEL_BITS'(i)}
`define CS_MEM_ADDR_TO_BANK_ID(x) x[0 +: `CS_BANK_SEL_BITS]
`define CS_MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0]
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}

View file

@ -34,6 +34,8 @@ module VX_cache_flush #(
output wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
input wire [NUM_BANKS-1:0] flush_end
);
`UNUSED_PARAM (TAG_WIDTH)
localparam STATE_IDLE = 0;
localparam STATE_WAIT1 = 1;
localparam STATE_FLUSH = 2;
@ -112,7 +114,7 @@ module VX_cache_flush #(
wire [NUM_REQS-1:0] core_bus_out_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_uuid
if (UUID_WIDTH != 0) begin : g_uuid
assign core_bus_out_uuid[i] = core_bus_in_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign core_bus_out_uuid[i] = core_bus_in_if[i].req_data.tag.uuid;
end else begin : g_no_uuid
assign core_bus_out_uuid[i] = 0;
end

View file

@ -45,8 +45,8 @@ module VX_cache_tags #(
output wire evict_dirty,
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag
);
// valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
// valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
wire [NUM_WAYS-1:0] read_valid;

View file

@ -19,6 +19,9 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 65536,
// Size of line inside a bank in bytes
@ -60,7 +63,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Memory request output buffer
parameter MEM_OUT_BUF = 3,
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS)
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS / MEM_PORTS)
) (
input wire clk,
input wire reset,
@ -155,6 +158,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),

View file

@ -21,6 +21,8 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
@ -85,16 +87,15 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
`endif
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
VX_mem_bus_if.master mem_bus_if
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, UUID_WIDTH);
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH, UUID_WIDTH) :
CACHE_MEM_TAG_WIDTH);
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, TAG_WIDTH);
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
@ -106,17 +107,18 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
) mem_bus_cache_if();
) mem_bus_cache_if[MEM_PORTS]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_tmp_if();
) mem_bus_tmp_if[MEM_PORTS]();
if (NC_OR_BYPASS) begin : g_bypass
VX_cache_bypass #(
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.TAG_SEL_IDX (TAG_SEL_IDX),
.PASSTHRU (PASSTHRU),
@ -130,7 +132,6 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
@ -153,13 +154,17 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
end
`ASSIGN_VX_MEM_BUS_IF (mem_bus_tmp_if, mem_bus_cache_if);
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_tmp_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_tmp_if[i], mem_bus_cache_if[i]);
end
end
if (WRITE_ENABLE) begin : g_mem_bus_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if);
end else begin : g_mem_bus_if_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if, mem_bus_tmp_if);
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
if (WRITE_ENABLE) begin : g_we
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end else begin : g_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end
end
if (PASSTHRU == 0) begin : g_cache
@ -172,6 +177,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
@ -207,13 +213,15 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
end
assign mem_bus_cache_if.req_valid = 0;
assign mem_bus_cache_if.req_data = '0;
`UNUSED_VAR (mem_bus_cache_if.req_ready)
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_cache_if
assign mem_bus_cache_if[i].req_valid = 0;
assign mem_bus_cache_if[i].req_data = '0;
`UNUSED_VAR (mem_bus_cache_if[i].req_ready)
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
assign mem_bus_cache_if.rsp_ready = 0;
`UNUSED_VAR (mem_bus_cache_if[i].rsp_valid)
`UNUSED_VAR (mem_bus_cache_if[i].rsp_data)
assign mem_bus_cache_if[i].rsp_ready = 0;
end
`ifdef PERF_ENABLE
assign cache_perf = '0;
@ -222,62 +230,36 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
end
`ifdef DBG_TRACE_CACHE
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_trace
wire [`UP(UUID_WIDTH)-1:0] core_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] core_rsp_uuid;
if (UUID_WIDTH != 0) begin : g_core_rsp_uuid
assign core_req_uuid = core_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign core_rsp_uuid = core_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_no_core_rsp_uuid
assign core_req_uuid = 0;
assign core_rsp_uuid = 0;
end
wire core_req_fire = core_bus_if[i].req_valid && core_bus_if[i].req_ready;
wire core_rsp_fire = core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_trace_core
always @(posedge clk) begin
if (core_req_fire) begin
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
if (core_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid))
`TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid))
`TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.tag.uuid))
end
end
if (core_rsp_fire) begin
`TRACE(2, ("%t: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid))
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag.value, i, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
end
end
end
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin : g_mem_req_uuid
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_no_mem_req_uuid
assign mem_req_uuid = 0;
assign mem_rsp_uuid = 0;
end
wire mem_req_fire = mem_bus_if.req_valid && mem_bus_if.req_ready;
wire mem_rsp_fire = mem_bus_if.rsp_valid && mem_bus_if.rsp_ready;
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_bus_if.req_data.rw) begin
`TRACE(2, ("%t: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid))
end else begin
`TRACE(2, ("%t: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid))
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_trace_mem
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end
end
// Trace a memory read response on port i once the handshake completes.
// The full response line is printed: here `i` is the memory-port index, so
// indexing rsp_data.data with it would log a single data bit instead of the line.
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
    `TRACE(2, ("%t: %s mem-rd-rsp: data=0x%h, tag=0x%0h (#%0d)\n",
        $time, INSTANCE_ID, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end
end
if (mem_rsp_fire) begin
`TRACE(2, ("%t: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid))
end
end
`endif

View file

@ -137,8 +137,6 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire [`UUID_WIDTH-1:0] icache_bus_req_uuid = icache_bus_if.req_data.tag[ICACHE_TAG_WIDTH-1 -: `UUID_WIDTH];
wire [`UUID_WIDTH-1:0] icache_bus_rsp_uuid = icache_bus_if.rsp_data.tag[ICACHE_TAG_WIDTH-1 -: `UUID_WIDTH];
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 1, 6, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
@ -157,8 +155,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
icache_bus_rsp_fire
},{
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
icache_bus_req_uuid, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_rsp_uuid, icache_bus_if.rsp_data.data
icache_bus_if.req_data.tag.uuid, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.tag.uuid, icache_bus_if.rsp_data.data
},
reset_negedge, 1'b0, 4096
);

View file

@ -40,6 +40,7 @@ module VX_pe_switch import VX_gpu_pkg::*; #(
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_INPUTS (1),
.NUM_OUTPUTS (PE_COUNT),
.OUT_BUF (REQ_OUT_BUF)
) req_switch (

View file

@ -171,9 +171,9 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
end
`ifdef GBAR_ENABLE
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_data.id)) begin
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
barrier_masks_n[gbar_bus_if.rsp_id] = '0; // reset barrier mask
barrier_masks_n[gbar_bus_if.rsp_data.id] = '0; // reset barrier mask
stalled_warps_n = '0; // unlock all warps
end
`endif
@ -281,10 +281,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
// barrier handling
`ifdef GBAR_ENABLE
assign gbar_bus_if.req_valid = gbar_req_valid;
assign gbar_bus_if.req_id = gbar_req_id;
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
assign gbar_bus_if.req_valid = gbar_req_valid;
assign gbar_bus_if.req_data.id = gbar_req_id;
assign gbar_bus_if.req_data.size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_data.core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
`endif
// split/join handling

View file

@ -0,0 +1,36 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`TRACING_OFF
// Joins two bit fields into a single (L+R)-bit vector while tolerating a
// zero-width side.
//
// Parameters:
//   L - width of the left (most-significant) field; 0 means the field is absent.
//   R - width of the right (least-significant) field; 0 means the field is absent.
//
// Ports:
//   left_in / right_in - input fields, declared `UP(L) / `UP(R) bits wide
//                        (presumably `UP clamps a 0 width to 1 so the port
//                        still exists -- confirm in VX_platform.vh).
//   data_out           - {left_in, right_in} when both fields exist, otherwise
//                        only the field that exists.
module VX_bits_concat #(
    parameter L = 1,
    parameter R = 1
) (
    input wire [`UP(L)-1:0] left_in,
    input wire [`UP(R)-1:0] right_in,
    output wire [(L+R)-1:0] data_out
);
    if ((L != 0) && (R != 0)) begin : g_concat
        // both fields present: plain concatenation
        assign data_out = {left_in, right_in};
    end else if (L == 0) begin : g_right_only
        // no left field: the placeholder left port is ignored
        `UNUSED_VAR (left_in)
        assign data_out = right_in;
    end else begin : g_left_only
        // no right field: the placeholder right port is ignored
        `UNUSED_VAR (right_in)
        assign data_out = left_in;
    end
endmodule
`TRACING_ON

View file

@ -20,17 +20,22 @@ module VX_bits_remove #(
parameter POS = 0
) (
input wire [N-1:0] data_in,
output wire [`UP(S)-1:0] sel_out,
output wire [N-S-1:0] data_out
);
`STATIC_ASSERT (((0 == S) || ((POS + S) <= N)), ("invalid parameter"))
if (S == 0) begin : g_passthru
assign sel_out = 0;
assign data_out = data_in;
end else if (POS == 0) begin : g_pos_0
assign sel_out = data_in[0 +: S];
assign data_out = data_in[N-1:S];
end else if ((POS + S) == N) begin : g_pos_N
assign sel_out = data_in[POS +: S];
assign data_out = data_in[POS-1:0];
end else begin : g_pos
assign sel_out = data_in[POS +: S];
assign data_out = {data_in[N-1:(POS+S)], data_in[POS-1:0]};
end

View file

@ -21,7 +21,8 @@ module VX_stream_arb #(
parameter `STRING ARBITER = "R",
parameter MAX_FANOUT = `MAX_FANOUT,
parameter OUT_BUF = 0,
parameter NUM_REQS = `CDIV(NUM_INPUTS, NUM_OUTPUTS),
parameter NUM_REQS = (NUM_INPUTS > NUM_OUTPUTS) ? `CDIV(NUM_INPUTS, NUM_OUTPUTS) : `CDIV(NUM_OUTPUTS, NUM_INPUTS),
parameter SEL_COUNT = `MIN(NUM_INPUTS, NUM_OUTPUTS),
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)
) (
@ -34,65 +35,38 @@ module VX_stream_arb #(
output wire [NUM_OUTPUTS-1:0] valid_out,
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
output wire [NUM_OUTPUTS-1:0][NUM_REQS_W-1:0] sel_out,
input wire [NUM_OUTPUTS-1:0] ready_out
input wire [NUM_OUTPUTS-1:0] ready_out,
output wire [SEL_COUNT-1:0][NUM_REQS_W-1:0] sel_out
);
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_more_inputs
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_input_select
if (NUM_OUTPUTS > 1) begin : g_multiple_outputs
// #Inputs > #Outputs
// (#inputs > #outputs) and (#outputs > 1)
if (MAX_FANOUT != 0 && (NUM_REQS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_arb_slices
localparam SLICE_BEGIN = i * NUM_REQS;
localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_INPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
VX_stream_arb #(
.NUM_INPUTS (SLICE_SIZE),
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
) arb_slice (
.clk (clk),
.reset (reset),
.valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]),
.ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]),
.data_in (data_in[SLICE_END-1: SLICE_BEGIN]),
.data_out (data_out[i]),
.sel_out (sel_out[i]),
.valid_out (valid_out[i]),
.ready_out (ready_out[i])
);
end
end else if (MAX_FANOUT != 0 && (NUM_INPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout
// (#inputs > max_fanout) and (#outputs == 1)
localparam NUM_SLICES = `CDIV(NUM_INPUTS, MAX_FANOUT);
localparam NUM_SLICES = `CDIV(NUM_REQS, MAX_FANOUT);
localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT);
localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES);
localparam DATAW2 = DATAW + LOG_NUM_REQS2;
wire [NUM_SLICES-1:0] valid_tmp;
wire [NUM_SLICES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp;
wire [NUM_SLICES-1:0] ready_tmp;
wire [NUM_SLICES-1:0][NUM_OUTPUTS-1:0] valid_tmp;
wire [NUM_SLICES-1:0][NUM_OUTPUTS-1:0][DATAW2-1:0] data_tmp;
wire [NUM_SLICES-1:0][NUM_OUTPUTS-1:0] ready_tmp;
for (genvar i = 0; i < NUM_SLICES; ++i) begin : g_fanout_slice_arbs
for (genvar s = 0; s < NUM_SLICES; ++s) begin : g_slice_arbs
localparam SLICE_BEGIN = i * MAX_FANOUT;
localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_INPUTS);
localparam SLICE_STRIDE= MAX_FANOUT * NUM_OUTPUTS;
localparam SLICE_BEGIN = s * SLICE_STRIDE;
localparam SLICE_END = `MIN(SLICE_BEGIN + SLICE_STRIDE, NUM_INPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
wire [DATAW-1:0] data_tmp_u;
wire [`LOG2UP(SLICE_SIZE)-1:0] sel_tmp_u;
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_tmp_u;
wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS2-1:0] sel_tmp_u;
VX_stream_arb #(
.NUM_INPUTS (SLICE_SIZE),
.NUM_OUTPUTS (1),
.NUM_OUTPUTS (NUM_OUTPUTS),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
@ -103,22 +77,24 @@ module VX_stream_arb #(
.valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]),
.data_in (data_in[SLICE_END-1: SLICE_BEGIN]),
.ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]),
.valid_out (valid_tmp[i]),
.valid_out (valid_tmp[s]),
.data_out (data_tmp_u),
.sel_out (sel_tmp_u),
.ready_out (ready_tmp[i])
.ready_out (ready_tmp[s]),
.sel_out (sel_tmp_u)
);
assign data_tmp[i] = {data_tmp_u, LOG_NUM_REQS2'(sel_tmp_u)};
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_tmp
assign data_tmp[s][o] = {data_tmp_u[o], sel_tmp_u[o]};
end
end
wire [DATAW+LOG_NUM_REQS2-1:0] data_out_u;
wire [LOG_NUM_REQS3-1:0] sel_out_u;
wire [NUM_OUTPUTS-1:0][DATAW2-1:0] data_out_u;
wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS3-1:0] sel_out_u;
VX_stream_arb #(
.NUM_INPUTS (NUM_SLICES),
.NUM_OUTPUTS (1),
.DATAW (DATAW + LOG_NUM_REQS2),
.NUM_INPUTS (NUM_SLICES * NUM_OUTPUTS),
.NUM_OUTPUTS (NUM_OUTPUTS),
.DATAW (DATAW2),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
@ -134,109 +110,107 @@ module VX_stream_arb #(
.ready_out (ready_out)
);
assign data_out = data_out_u[LOG_NUM_REQS2 +: DATAW];
assign sel_out = {sel_out_u, data_out_u[0 +: LOG_NUM_REQS2]};
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_out
assign sel_out[o] = {sel_out_u[o], data_out_u[o][LOG_NUM_REQS2-1:0]};
assign data_out[o] = data_out_u[o][DATAW2-1:LOG_NUM_REQS2];
end
end else begin : g_one_output
// (#inputs <= max_fanout) and (#outputs == 1)
wire valid_in_w;
wire [DATAW-1:0] data_in_w;
wire ready_in_w;
end else begin : g_arbiter
wire [NUM_REQS-1:0] arb_requests;
wire arb_valid;
wire [NUM_REQS_W-1:0] arb_index;
wire [NUM_REQS-1:0] arb_onehot;
wire arb_ready;
// Build one arbiter request per group r. Group r covers the inputs
// i = r * NUM_OUTPUTS + o (one per output lane o) and requests arbitration
// when any of its inputs is valid.
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_requests
    wire [NUM_OUTPUTS-1:0] requests;
    for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_o
        localparam i = r * NUM_OUTPUTS + o;
        // NUM_REQS is a ceiling division, so the last group may be only
        // partially populated when NUM_INPUTS is not a multiple of
        // NUM_OUTPUTS; missing inputs never request.
        if (i < NUM_INPUTS) begin : g_valid
            assign requests[o] = valid_in[i];
        end else begin : g_padding
            assign requests[o] = 1'b0;
        end
    end
    assign arb_requests[r] = (| requests);
end
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
.reset (reset),
.requests (valid_in),
.requests (arb_requests),
.grant_valid (arb_valid),
.grant_index (arb_index),
.grant_onehot (arb_onehot),
.grant_ready (arb_ready)
);
assign valid_in_w = arb_valid;
assign data_in_w = data_in[arb_index];
assign arb_ready = ready_in_w;
wire [NUM_OUTPUTS-1:0] valid_out_w;
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w;
wire [NUM_OUTPUTS-1:0] ready_out_w;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_ready_in
assign ready_in[i] = ready_in_w && arb_onehot[i];
// For every output lane o, collect the inputs that map onto it
// (input i = r * NUM_OUTPUTS + o for each request group r) and forward the
// entry that belongs to the group currently granted by the arbiter.
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_out_w
    wire [NUM_REQS-1:0] valid_in_w;
    wire [NUM_REQS-1:0][DATAW-1:0] data_in_w;
    for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
        localparam i = r * NUM_OUTPUTS + o;
        // Bound-check the flattened input index rather than the group
        // index: r is always below NUM_INPUTS, whereas i can run past the
        // last input when NUM_INPUTS is not a multiple of NUM_OUTPUTS.
        if (i < NUM_INPUTS) begin : g_valid
            assign valid_in_w[r] = valid_in[i];
            assign data_in_w[r] = data_in[i];
        end else begin : g_padding
            assign valid_in_w[r] = 0;
            assign data_in_w[r] = '0;
        end
    end
    // The lane is valid only if its input in the *granted* group is valid;
    // a valid input on this lane in some other group must not flag the
    // granted group's data as valid. With a single output the grant itself
    // already implies the selected input requested.
    assign valid_out_w[o] = ((NUM_OUTPUTS == 1) || valid_in_w[arb_index]) && arb_valid;
    assign data_out_w[o] = data_in_w[arb_index];
end
VX_elastic_buffer #(
.DATAW (LOG_NUM_REQS + DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (`TO_OUT_BUF_LUTRAM(OUT_BUF))
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_in_w),
.ready_in (ready_in_w),
.data_in ({arb_index, data_in_w}),
.data_out ({sel_out, data_out}),
.valid_out (valid_out),
.ready_out (ready_out)
);
end
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in
localparam o = i % NUM_OUTPUTS;
localparam r = i / NUM_OUTPUTS;
assign ready_in[i] = ready_out_w[o] && arb_onehot[r];
end
end else if (NUM_OUTPUTS > NUM_INPUTS) begin : g_more_outputs
assign arb_ready = (| ready_out_w);
if (NUM_INPUTS > 1) begin : g_multiple_inputs
// (#inputs > 1) and (#outputs > #inputs)
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_arb_slices
localparam SLICE_BEGIN = i * NUM_REQS;
localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_OUTPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (SLICE_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
) arb_slice (
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
VX_elastic_buffer #(
.DATAW (LOG_NUM_REQS + DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (`TO_OUT_BUF_LUTRAM(OUT_BUF))
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_in[i]),
.ready_in (ready_in[i]),
.data_in (data_in[i]),
.data_out (data_out[SLICE_END-1: SLICE_BEGIN]),
.valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]),
.ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]),
`UNUSED_PIN (sel_out)
.valid_in (valid_out_w[o]),
.ready_in (ready_out_w[o]),
.data_in ({arb_index, data_out_w[o]}),
.data_out ({sel_out[o], data_out[o]}),
.valid_out (valid_out[o]),
.ready_out (ready_out[o])
);
for (genvar j = SLICE_BEGIN; j < SLICE_END; ++j) begin : g_sel_out
assign sel_out[j] = i;
end
end
end
end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout
end else if (NUM_INPUTS < NUM_OUTPUTS) begin : g_output_select
// (#inputs == 1) and (#outputs > max_fanout)
// #Inputs < #Outputs
localparam NUM_SLICES = `CDIV(NUM_OUTPUTS, MAX_FANOUT);
if (MAX_FANOUT != 0 && (NUM_REQS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout
wire [NUM_SLICES-1:0] valid_tmp;
wire [NUM_SLICES-1:0][DATAW-1:0] data_tmp;
wire [NUM_SLICES-1:0] ready_tmp;
localparam NUM_SLICES = `CDIV(NUM_REQS, MAX_FANOUT);
localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT);
localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES);
wire [NUM_SLICES-1:0][NUM_INPUTS-1:0] valid_tmp;
wire [NUM_SLICES-1:0][NUM_INPUTS-1:0][DATAW-1:0] data_tmp;
wire [NUM_SLICES-1:0][NUM_INPUTS-1:0] ready_tmp;
wire [NUM_INPUTS-1:0][LOG_NUM_REQS3-1:0] sel_tmp;
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (NUM_SLICES),
.NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (NUM_SLICES * NUM_INPUTS),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
@ -250,17 +224,22 @@ module VX_stream_arb #(
.data_out (data_tmp),
.valid_out (valid_tmp),
.ready_out (ready_tmp),
`UNUSED_PIN (sel_out)
.sel_out (sel_tmp)
);
for (genvar i = 0; i < NUM_SLICES; ++i) begin : g_fanout_slice_arbs
wire [NUM_SLICES-1:0][NUM_INPUTS-1:0][LOG_NUM_REQS2-1:0] sel_out_w;
localparam SLICE_BEGIN = i * MAX_FANOUT;
localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_OUTPUTS);
for (genvar s = 0; s < NUM_SLICES; ++s) begin : g_slice_arbs
localparam SLICE_STRIDE= MAX_FANOUT * NUM_INPUTS;
localparam SLICE_BEGIN = s * SLICE_STRIDE;
localparam SLICE_END = `MIN(SLICE_BEGIN + SLICE_STRIDE, NUM_OUTPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
wire [NUM_INPUTS-1:0][LOG_NUM_REQS2-1:0] sel_out_u;
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (SLICE_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
@ -269,45 +248,73 @@ module VX_stream_arb #(
) fanout_slice_arb (
.clk (clk),
.reset (reset),
.valid_in (valid_tmp[i]),
.ready_in (ready_tmp[i]),
.data_in (data_tmp[i]),
.valid_in (valid_tmp[s]),
.ready_in (ready_tmp[s]),
.data_in (data_tmp[s]),
.data_out (data_out[SLICE_END-1: SLICE_BEGIN]),
.valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]),
.ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]),
`UNUSED_PIN (sel_out)
.sel_out (sel_out_w[s])
);
end
end else begin : g_one_input
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_sel_out
assign sel_out[i] = {sel_tmp[i], sel_out_w[sel_tmp[i]][i]};
end
// (#inputs == 1) and (#outputs <= max_fanout)
end else begin : g_arbiter
wire [NUM_OUTPUTS-1:0] ready_in_w;
wire [NUM_OUTPUTS-1:0] arb_requests;
wire [NUM_REQS-1:0] arb_requests;
wire arb_valid;
wire [NUM_OUTPUTS-1:0] arb_onehot;
wire [NUM_REQS_W-1:0] arb_index;
wire [NUM_REQS-1:0] arb_onehot;
wire arb_ready;
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_requests
wire [NUM_INPUTS-1:0] requests;
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_i
localparam o = r * NUM_INPUTS + i;
assign requests[i] = ready_out[o];
end
assign arb_requests[r] = (| requests);
end
VX_generic_arbiter #(
.NUM_REQS (NUM_OUTPUTS),
.NUM_REQS (NUM_REQS),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
.reset (reset),
.requests (arb_requests),
.grant_valid (arb_valid),
`UNUSED_PIN (grant_index),
.grant_index (arb_index),
.grant_onehot (arb_onehot),
.grant_ready (arb_ready)
);
assign arb_requests = ready_in_w;
assign arb_ready = valid_in[0];
assign ready_in = arb_valid;
wire [NUM_OUTPUTS-1:0] valid_out_w;
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w;
wire [NUM_OUTPUTS-1:0] ready_out_w;
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_out_buf
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_out_w
localparam i = o % NUM_INPUTS;
localparam r = o / NUM_INPUTS;
assign valid_out_w[o] = valid_in[i] && arb_onehot[r];
assign data_out_w[o] = data_in[i];
end
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in
wire [NUM_REQS-1:0] ready_out_s;
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
localparam o = r * NUM_INPUTS + i;
assign ready_out_s[r] = ready_out_w[o];
end
assign ready_in[i] = ((NUM_INPUTS == 1) || (| ready_out_s)) && arb_valid;
end
assign arb_ready = (| valid_in);
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
@ -316,23 +323,25 @@ module VX_stream_arb #(
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_in && arb_onehot[i]),
.ready_in (ready_in_w[i]),
.data_in (data_in),
.data_out (data_out[i]),
.valid_out (valid_out[i]),
.ready_out (ready_out[i])
.valid_in (valid_out_w[o]),
.ready_in (ready_out_w[o]),
.data_in (data_out_w[o]),
.data_out (data_out[o]),
.valid_out (valid_out[o]),
.ready_out (ready_out[o])
);
end
end
assign sel_out = 0;
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_sel_out
assign sel_out[i] = arb_index;
end
end
end else begin : g_passthru
// #Inputs == #Outputs
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_out_buf
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
@ -341,14 +350,14 @@ module VX_stream_arb #(
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_in[i]),
.ready_in (ready_in[i]),
.data_in (data_in[i]),
.data_out (data_out[i]),
.valid_out (valid_out[i]),
.ready_out (ready_out[i])
.valid_in (valid_in[o]),
.ready_in (ready_in[o]),
.data_in (data_in[o]),
.data_out (data_out[o]),
.valid_out (valid_out[o]),
.ready_out (ready_out[o])
);
assign sel_out[i] = NUM_REQS_W'(i);
assign sel_out[o] = NUM_REQS_W'(0);
end
end

View file

@ -0,0 +1,215 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`TRACING_OFF
// VX_stream_omega: multi-stage omega (shuffle-exchange) stream interconnect.
//
// Routes NUM_INPUTS valid/ready streams to NUM_OUTPUTS valid/ready streams.
// Each input carries the index of its destination output in sel_in; each
// output reports the index of the originating input in sel_out.
//
// Parameters:
//   NUM_INPUTS / NUM_OUTPUTS : number of input / output streams.
//   RADIX                    : ports per switch element (must be a power of 2).
//   DATAW                    : payload width.
//   ARBITER, OUT_BUF, MAX_FANOUT : forwarded unchanged to every VX_stream_xbar.
//   PERF_CTR_BITS            : width of the collisions counter.
//   IN_WIDTH / OUT_WIDTH     : widths of sel_out / sel_in (derived).
//
// Structure:
//   - When both port counts fit in a single switch, one VX_stream_xbar is used.
//   - Otherwise the lane count is rounded up to N_INPUTS = RADIX**NUM_STAGES and
//     NUM_STAGES columns of N_INPUTS/RADIX crossbars are instantiated. A perfect
//     shuffle (rotate-left of the lane index by one radix digit) precedes every
//     stage, and stage s routes on destination digit (NUM_STAGES-1-s), i.e. most
//     significant digit first, so the lane index after the last stage equals the
//     destination index.
module VX_stream_omega #(
    parameter NUM_INPUTS    = 4,
    parameter NUM_OUTPUTS   = 4,
    parameter RADIX         = 2,
    parameter DATAW         = 4,
    parameter ARBITER       = "R",
    parameter OUT_BUF       = 0,
    parameter MAX_FANOUT    = `MAX_FANOUT,
    parameter PERF_CTR_BITS = 32,
    parameter IN_WIDTH      = `LOG2UP(NUM_INPUTS),
    parameter OUT_WIDTH     = `LOG2UP(NUM_OUTPUTS)
) (
    input wire clk,
    input wire reset,

    input wire [NUM_INPUTS-1:0]                 valid_in,
    input wire [NUM_INPUTS-1:0][DATAW-1:0]      data_in,
    input wire [NUM_INPUTS-1:0][OUT_WIDTH-1:0]  sel_in,
    output wire [NUM_INPUTS-1:0]                ready_in,

    output wire [NUM_OUTPUTS-1:0]               valid_out,
    output wire [NUM_OUTPUTS-1:0][DATAW-1:0]    data_out,
    output wire [NUM_OUTPUTS-1:0][IN_WIDTH-1:0] sel_out,
    input wire [NUM_OUTPUTS-1:0]                ready_out,

    output wire [PERF_CTR_BITS-1:0]             collisions
);
    `STATIC_ASSERT (`IS_POW2(RADIX), ("invalid parameters"))

    // If network size smaller than radix, simply use a crossbar.
    if (NUM_INPUTS <= RADIX && NUM_OUTPUTS <= RADIX) begin : g_fallback
        VX_stream_xbar #(
            .NUM_INPUTS    (NUM_INPUTS),
            .NUM_OUTPUTS   (NUM_OUTPUTS),
            .DATAW         (DATAW),
            .ARBITER       (ARBITER),
            .OUT_BUF       (OUT_BUF),
            .MAX_FANOUT    (MAX_FANOUT),
            .PERF_CTR_BITS (PERF_CTR_BITS)
        ) xbar_switch (
            .clk,
            .reset,
            .valid_in,
            .data_in,
            .sel_in,
            .ready_in,
            .valid_out,
            .data_out,
            .sel_out,
            .ready_out,
            .collisions
        );
    end else begin : g_omega

        localparam RADIX_LG     = `LOG2UP(RADIX);                       // bits per radix digit
        localparam N_INPUTS_M   = `MAX(NUM_INPUTS, NUM_OUTPUTS);
        localparam N_INPUTS_LG  = `CDIV(`CLOG2(N_INPUTS_M), RADIX_LG);  // radix digits per lane index
        localparam N_INPUTS     = RADIX ** N_INPUTS_LG;                 // padded lane count
        localparam NUM_STAGES   = `LOG2UP(N_INPUTS) / RADIX_LG;
        localparam NUM_SWITCHES = N_INPUTS / RADIX;
        // Width in bits of a lane / destination index. This equals N_INPUTS_LG
        // only when RADIX == 2; higher radices need RADIX_LG bits per digit.
        localparam LANE_BITS    = NUM_STAGES * RADIX_LG;

        // Payload carried between stages: destination index (consumed one digit
        // per stage), user data, and the originating input index.
        typedef struct packed {
            logic [LANE_BITS-1:0] sel_in;
            logic [DATAW-1:0]     data;
            logic [IN_WIDTH-1:0]  sel_out;
        } omega_t;

        // Wires for internal connections between stages
        wire [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] switch_valid_in, switch_valid_out;
        omega_t [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] switch_data_in, switch_data_out;
        wire [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0][RADIX_LG-1:0] switch_sel_in;
        wire [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] switch_ready_in, switch_ready_out;

        // Connect inputs to first stage
        for (genvar i = 0; i < N_INPUTS; ++i) begin : g_tie_inputs
            // perfect shuffle: rotate the lane index left by one radix digit
            localparam DST_IDX = ((i << RADIX_LG) | (i >> (LANE_BITS-RADIX_LG))) & (N_INPUTS-1);
            localparam switch = DST_IDX / RADIX;
            localparam port = DST_IDX % RADIX;
            if (i < NUM_INPUTS) begin : g_valid
                assign switch_valid_in[0][switch][port] = valid_in[i];
                assign switch_data_in[0][switch][port] = '{
                    sel_in:  LANE_BITS'(sel_in[i]),
                    data:    data_in[i],
                    sel_out: IN_WIDTH'(i)
                };
                assign ready_in[i] = switch_ready_in[0][switch][port];
            end else begin : g_padding
                // lanes beyond NUM_INPUTS never inject traffic
                assign switch_valid_in[0][switch][port] = 0;
                assign switch_data_in[0][switch][port] = 'x;
                `UNUSED_VAR (switch_ready_in[0][switch][port])
            end
        end

        // Connect switch sel_in
        for (genvar stage = 0; stage < NUM_STAGES; ++stage) begin : g_sel_in
            for (genvar switch = 0; switch < NUM_SWITCHES; ++switch) begin : g_switches
                for (genvar port = 0; port < RADIX; ++port) begin : g_ports
                    // stage s routes on destination digit (NUM_STAGES-1-s): MSB digit first
                    assign switch_sel_in[stage][switch][port] = switch_data_in[stage][switch][port].sel_in[(NUM_STAGES-1-stage) * RADIX_LG +: RADIX_LG];
                end
            end
        end

        // Connect internal stages
        for (genvar stage = 0; stage < NUM_STAGES-1; ++stage) begin : g_stages
            for (genvar switch = 0; switch < NUM_SWITCHES; ++switch) begin : g_switches
                for (genvar port = 0; port < RADIX; port++) begin : g_ports
                    localparam lane = switch * RADIX + port;
                    // same perfect shuffle as on the network inputs
                    localparam dst_lane = ((lane << RADIX_LG) | (lane >> (LANE_BITS-RADIX_LG))) & (N_INPUTS-1);
                    localparam dst_switch = dst_lane / RADIX;
                    localparam dst_port = dst_lane % RADIX;
                    assign switch_valid_in[stage+1][dst_switch][dst_port] = switch_valid_out[stage][switch][port];
                    assign switch_data_in[stage+1][dst_switch][dst_port] = switch_data_out[stage][switch][port];
                    assign switch_ready_out[stage][switch][port] = switch_ready_in[stage+1][dst_switch][dst_port];
                end
            end
        end

        // Connect network switches
        for (genvar switch = 0; switch < NUM_SWITCHES; ++switch) begin : g_switches
            for (genvar stage = 0; stage < NUM_STAGES; ++stage) begin : g_stages
                VX_stream_xbar #(
                    .NUM_INPUTS   (RADIX),
                    .NUM_OUTPUTS  (RADIX),
                    .DATAW        ($bits(omega_t)),
                    .ARBITER      (ARBITER),
                    .OUT_BUF      (OUT_BUF),
                    .MAX_FANOUT   (MAX_FANOUT),
                    .PERF_CTR_BITS(PERF_CTR_BITS)
                ) xbar_switch (
                    .clk       (clk),
                    .reset     (reset),
                    .valid_in  (switch_valid_in[stage][switch]),
                    .data_in   (switch_data_in[stage][switch]),
                    .sel_in    (switch_sel_in[stage][switch]),
                    .ready_in  (switch_ready_in[stage][switch]),
                    .valid_out (switch_valid_out[stage][switch]),
                    .data_out  (switch_data_out[stage][switch]),
                    `UNUSED_PIN (sel_out),
                    .ready_out (switch_ready_out[stage][switch]),
                    `UNUSED_PIN (collisions)
                );
            end
        end

        // Connect outputs to last stage
        for (genvar i = 0; i < N_INPUTS; ++i) begin : g_tie_outputs
            localparam switch = i / RADIX;
            localparam port = i % RADIX;
            if (i < NUM_OUTPUTS) begin : g_valid
                assign valid_out[i] = switch_valid_out[NUM_STAGES-1][switch][port];
                assign data_out[i]  = switch_data_out[NUM_STAGES-1][switch][port].data;
                assign sel_out[i]   = switch_data_out[NUM_STAGES-1][switch][port].sel_out;
                assign switch_ready_out[NUM_STAGES-1][switch][port] = ready_out[i];
            end else begin : g_padding
                // lanes beyond NUM_OUTPUTS are never drained
                `UNUSED_VAR (switch_valid_out[NUM_STAGES-1][switch][port])
                `UNUSED_VAR (switch_data_out[NUM_STAGES-1][switch][port])
                assign switch_ready_out[NUM_STAGES-1][switch][port] = 0;
            end
        end

        // compute inputs collision
        // we have a collision when there exists a valid transfer with multiple input candidates
        // we count the unique duplicates each cycle.

        reg [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] per_cycle_collision, per_cycle_collision_r;
        wire [`CLOG2(NUM_STAGES*NUM_SWITCHES*RADIX+1)-1:0] collision_count;
        reg [PERF_CTR_BITS-1:0] collisions_r;

        always @(*) begin
            per_cycle_collision = 0;
            for (integer stage = 0; stage < NUM_STAGES; ++stage) begin
                for (integer switch = 0; switch < NUM_SWITCHES; ++switch) begin
                    for (integer port_a = 0; port_a < RADIX; ++port_a) begin
                        // flag port_a when a higher-numbered valid port of the same switch
                        // targets the same switch output and either of the two is accepted
                        for (integer port_b = port_a + 1; port_b < RADIX; ++port_b) begin
                            per_cycle_collision[stage][switch][port_a] |= switch_valid_in[stage][switch][port_a]
                                                                       && switch_valid_in[stage][switch][port_b]
                                                                       && (switch_sel_in[stage][switch][port_a] == switch_sel_in[stage][switch][port_b])
                                                                       && (switch_ready_in[stage][switch][port_a] | switch_ready_in[stage][switch][port_b]);
                        end
                    end
                end
            end
        end

        `BUFFER(per_cycle_collision_r, per_cycle_collision);
        `POP_COUNT(collision_count, per_cycle_collision_r);

        // accumulate the per-cycle collision count; cleared on reset
        always @(posedge clk) begin
            if (reset) begin
                collisions_r <= '0;
            end else begin
                collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count);
            end
        end

        assign collisions = collisions_r;
    end

endmodule
`TRACING_ON

View file

@ -36,42 +36,27 @@ module VX_stream_switch #(
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
input wire [NUM_OUTPUTS-1:0] ready_out
);
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_more_inputs
wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0] valid_in_w;
wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0][DATAW-1:0] data_in_w;
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_input_select
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_data_in
for (genvar j = 0; j < NUM_REQS; ++j) begin : g_j
localparam ii = i * NUM_REQS + j;
if (ii < NUM_INPUTS) begin : g_valid
assign valid_in_w[i][j] = valid_in[ii];
assign data_in_w[i][j] = data_in[ii];
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
wire [NUM_REQS-1:0] valid_in_w;
wire [NUM_REQS-1:0][DATAW-1:0] data_in_w;
wire [NUM_REQS-1:0] ready_in_w;
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
localparam i = r * NUM_OUTPUTS + o;
if (i < NUM_INPUTS) begin : g_valid
assign valid_in_w[r] = valid_in[i];
assign data_in_w[r] = data_in[i];
assign ready_in[i] = ready_in_w[r];
end else begin : g_padding
assign valid_in_w[i][j] = 0;
assign data_in_w[i][j] = '0;
assign valid_in_w[r] = 0;
assign data_in_w[r] = '0;
`UNUSED_VAR (ready_in_w[r])
end
end
end
wire [NUM_OUTPUTS-1:0] valid_out_w;
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w;
wire [NUM_OUTPUTS-1:0] ready_out_w;
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_data_out_w
assign valid_out_w[i] = valid_in_w[i][sel_in[i]];
assign data_out_w[i] = data_in_w[i][sel_in[i]];
end
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_ready_out_w
for (genvar j = 0; j < NUM_REQS; ++j) begin : g_j
localparam ii = i * NUM_REQS + j;
if (ii < NUM_INPUTS) begin : g_valid
assign ready_in[ii] = ready_out_w[i] && (sel_in[i] == LOG_NUM_REQS'(j));
end
end
end
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_out_buf
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
@ -79,34 +64,27 @@ module VX_stream_switch #(
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_out_w[i]),
.ready_in (ready_out_w[i]),
.data_in (data_out_w[i]),
.data_out (data_out[i]),
.valid_out (valid_out[i]),
.ready_out (ready_out[i])
.valid_in (valid_in_w[sel_in[o]]),
.ready_in (ready_in_w[sel_in[o]]),
.data_in (data_in_w[sel_in[o]]),
.data_out (data_out[o]),
.valid_out (valid_out[o]),
.ready_out (ready_out[o])
);
end
end else if (NUM_OUTPUTS > NUM_INPUTS) begin : g_more_outputs
end else if (NUM_OUTPUTS > NUM_INPUTS) begin : g_output_select
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] valid_out_w;
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] ready_out_w;
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_valid_out_w
for (genvar j = 0; j < NUM_REQS; ++j) begin : g_j
assign valid_out_w[i][j] = valid_in[i] && (sel_in[i] == LOG_NUM_REQS'(j));
end
end
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in
assign ready_in[i] = ready_out_w[i][sel_in[i]];
end
// Inputs < Outputs
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_out_buf
for (genvar j = 0; j < NUM_REQS; ++j) begin : g_j
localparam ii = i * NUM_REQS + j;
if (ii < NUM_OUTPUTS) begin : g_valid
wire [NUM_REQS-1:0] ready_out_w;
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
localparam o = r * NUM_INPUTS + i;
if (o < NUM_OUTPUTS) begin : g_valid
wire valid_out_w = valid_in[i] && (sel_in[i] == LOG_NUM_REQS'(r));
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
@ -114,18 +92,19 @@ module VX_stream_switch #(
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_out_w[i][j]),
.ready_in (ready_out_w[i][j]),
.valid_in (valid_out_w),
.ready_in (ready_out_w[r]),
.data_in (data_in[i]),
.data_out (data_out[ii]),
.valid_out (valid_out[ii]),
.ready_out (ready_out[ii])
.data_out (data_out[o]),
.valid_out (valid_out[o]),
.ready_out (ready_out[o])
);
end else begin : g_padding
`UNUSED_VAR (valid_out_w[i][j])
assign ready_out_w[i][j] = '0;
assign ready_out_w[r] = '0;
end
end
assign ready_in[i] = ready_out_w[sel_in[i]];
end
end else begin : g_passthru
@ -150,7 +129,6 @@ module VX_stream_switch #(
.ready_out (ready_out[i])
);
end
end
endmodule

View file

@ -18,18 +18,16 @@ module VX_stream_xbar #(
parameter NUM_INPUTS = 4,
parameter NUM_OUTPUTS = 4,
parameter DATAW = 4,
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS),
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
parameter ARBITER = "R",
parameter OUT_BUF = 0,
parameter MAX_FANOUT = `MAX_FANOUT,
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1),
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS),
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS)
) (
input wire clk,
input wire reset,
output wire [PERF_CTR_BITS-1:0] collisions,
input wire [NUM_INPUTS-1:0] valid_in,
input wire [NUM_INPUTS-1:0][DATAW-1:0] data_in,
input wire [NUM_INPUTS-1:0][OUT_WIDTH-1:0] sel_in,
@ -38,12 +36,14 @@ module VX_stream_xbar #(
output wire [NUM_OUTPUTS-1:0] valid_out,
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
output wire [NUM_OUTPUTS-1:0][IN_WIDTH-1:0] sel_out,
input wire [NUM_OUTPUTS-1:0] ready_out
input wire [NUM_OUTPUTS-1:0] ready_out,
output wire [PERF_CTR_BITS-1:0] collisions
);
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
if (NUM_INPUTS != 1) begin : g_multiple_inputs
if (NUM_INPUTS != 1) begin : g_multi_inputs
if (NUM_OUTPUTS != 1) begin : g_multiple_outputs
@ -130,7 +130,7 @@ module VX_stream_xbar #(
`UNUSED_VAR (sel_in)
end
end else if (NUM_OUTPUTS != 1) begin : g_one_input
end else if (NUM_OUTPUTS != 1) begin : g_single_input
// (#inputs == 1) and (#outputs > 1)

View file

@ -35,7 +35,7 @@ module VX_gbar_arb #(
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_data_in
assign req_valid_in[i] = bus_in_if[i].req_valid;
assign req_data_in[i] = {bus_in_if[i].req_id, bus_in_if[i].req_size_m1, bus_in_if[i].req_core_id};
assign req_data_in[i] = bus_in_if[i].req_data;
assign bus_in_if[i].req_ready = req_ready_in[i];
end
@ -51,7 +51,7 @@ module VX_gbar_arb #(
.valid_in (req_valid_in),
.ready_in (req_ready_in),
.data_in (req_data_in),
.data_out ({bus_out_if.req_id, bus_out_if.req_size_m1, bus_out_if.req_core_id}),
.data_out (bus_out_if.req_data),
.valid_out (bus_out_if.req_valid),
.ready_out (bus_out_if.req_ready),
`UNUSED_PIN (sel_out)
@ -60,7 +60,7 @@ module VX_gbar_arb #(
// broadcast response
reg rsp_valid;
reg [`NB_WIDTH-1:0] rsp_id;
reg [`NB_WIDTH-1:0] rsp_data;
always @(posedge clk) begin
if (reset) begin
@ -68,12 +68,12 @@ module VX_gbar_arb #(
end else begin
rsp_valid <= bus_out_if.rsp_valid;
end
rsp_id <= bus_out_if.rsp_id;
rsp_data <= bus_out_if.rsp_data;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_bus_in_if
assign bus_in_if[i].rsp_valid = rsp_valid;
assign bus_in_if[i].rsp_id = rsp_id;
assign bus_in_if[i].rsp_data = rsp_data;
end
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,35 +15,39 @@
interface VX_gbar_bus_if ();
wire req_valid;
wire [`NB_WIDTH-1:0] req_id;
wire [`NC_WIDTH-1:0] req_size_m1;
wire [`NC_WIDTH-1:0] req_core_id;
wire req_ready;
typedef struct packed {
logic [`NB_WIDTH-1:0] id;
logic [`NC_WIDTH-1:0] size_m1;
logic [`NC_WIDTH-1:0] core_id;
} req_data_t;
wire rsp_valid;
wire [`NB_WIDTH-1:0] rsp_id;
typedef struct packed {
logic [`NB_WIDTH-1:0] id;
} rsp_data_t;
logic req_valid;
req_data_t req_data;
logic req_ready;
logic rsp_valid;
rsp_data_t rsp_data;
modport master (
output req_valid,
output req_id,
output req_size_m1,
output req_core_id,
input req_ready,
output req_valid,
output req_data,
input req_ready,
input rsp_valid,
input rsp_id
input rsp_valid,
input rsp_data
);
modport slave (
input req_valid,
input req_id,
input req_size_m1,
input req_core_id,
output req_ready,
output rsp_valid,
output rsp_id
input req_valid,
input req_data,
output req_ready,
output rsp_valid,
output rsp_data
);
endinterface

View file

@ -25,7 +25,7 @@ module VX_gbar_unit #(
reg [`NB_WIDTH-1:0][`NUM_CORES-1:0] barrier_masks;
wire [`CLOG2(`NUM_CORES+1)-1:0] active_barrier_count;
wire [`NUM_CORES-1:0] curr_barrier_mask = barrier_masks[gbar_bus_if.req_id];
wire [`NUM_CORES-1:0] curr_barrier_mask = barrier_masks[gbar_bus_if.req_data.id];
`POP_COUNT(active_barrier_count, curr_barrier_mask);
`UNUSED_VAR (active_barrier_count)
@ -42,29 +42,29 @@ module VX_gbar_unit #(
rsp_valid <= 0;
end
if (gbar_bus_if.req_valid) begin
if (active_barrier_count[`NC_WIDTH-1:0] == gbar_bus_if.req_size_m1) begin
barrier_masks[gbar_bus_if.req_id] <= '0;
rsp_bar_id <= gbar_bus_if.req_id;
if (active_barrier_count[`NC_WIDTH-1:0] == gbar_bus_if.req_data.size_m1) begin
barrier_masks[gbar_bus_if.req_data.id] <= '0;
rsp_bar_id <= gbar_bus_if.req_data.id;
rsp_valid <= 1;
end else begin
barrier_masks[gbar_bus_if.req_id][gbar_bus_if.req_core_id] <= 1;
barrier_masks[gbar_bus_if.req_data.id][gbar_bus_if.req_data.core_id] <= 1;
end
end
end
end
assign gbar_bus_if.rsp_valid = rsp_valid;
assign gbar_bus_if.rsp_id = rsp_bar_id;
assign gbar_bus_if.rsp_data.id = rsp_bar_id;
assign gbar_bus_if.req_ready = 1; // global barrier unit is always ready (no dependencies)
`ifdef DBG_TRACE_GBAR
always @(posedge clk) begin
if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
`TRACE(2, ("%t: %s acquire: bar_id=%0d, size=%0d, core_id=%0d\n",
$time, INSTANCE_ID, gbar_bus_if.req_id, gbar_bus_if.req_size_m1, gbar_bus_if.req_core_id))
$time, INSTANCE_ID, gbar_bus_if.req_data.id, gbar_bus_if.req_data.size_m1, gbar_bus_if.req_data.core_id))
end
if (gbar_bus_if.rsp_valid) begin
`TRACE(2, ("%t: %s release: bar_id=%0d\n", $time, INSTANCE_ID, gbar_bus_if.rsp_id))
`TRACE(2, ("%t: %s release: bar_id=%0d\n", $time, INSTANCE_ID, gbar_bus_if.rsp_data.id))
end
end
`endif

View file

@ -61,15 +61,7 @@ module VX_lmem_switch import VX_gpu_pkg::*; #(
}),
.ready_in (req_global_ready),
.valid_out (global_out_if.req_valid),
.data_out ({
global_out_if.req_data.mask,
global_out_if.req_data.rw,
global_out_if.req_data.addr,
global_out_if.req_data.data,
global_out_if.req_data.byteen,
global_out_if.req_data.flags,
global_out_if.req_data.tag
}),
.data_out (global_out_if.req_data),
.ready_out (global_out_if.req_ready)
);
@ -92,15 +84,7 @@ module VX_lmem_switch import VX_gpu_pkg::*; #(
}),
.ready_in (req_local_ready),
.valid_out (local_out_if.req_valid),
.data_out ({
local_out_if.req_data.mask,
local_out_if.req_data.rw,
local_out_if.req_data.addr,
local_out_if.req_data.data,
local_out_if.req_data.byteen,
local_out_if.req_data.flags,
local_out_if.req_data.tag
}),
.data_out (local_out_if.req_data),
.ready_out (local_out_if.req_ready)
);

View file

@ -109,8 +109,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
assign req_data_in[i] = {
mem_bus_if[i].req_data.rw,
req_bank_addr[i],
mem_bus_if[i].req_data.byteen,
mem_bus_if[i].req_data.data,
mem_bus_if[i].req_data.byteen,
mem_bus_if[i].req_data.tag
};
assign mem_bus_if[i].req_ready = req_ready_in[i];
@ -145,8 +145,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
assign {
per_bank_req_rw[i],
per_bank_req_addr[i],
per_bank_req_byteen[i],
per_bank_req_data[i],
per_bank_req_byteen[i],
per_bank_req_tag[i]
} = per_bank_req_data_aos[i];
end
@ -245,7 +245,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_mem_bus_if
assign mem_bus_if[i].rsp_valid = rsp_valid_out[i];
assign mem_bus_if[i].rsp_data = rsp_data_out[i];
assign mem_bus_if[i].rsp_data = rsp_data_out[i];
assign rsp_ready_out[i] = mem_bus_if[i].rsp_ready;
end
@ -299,23 +299,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
`ifdef DBG_TRACE_MEM
wire [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] req_uuid;
wire [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] rsp_uuid;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_uuid
if (UUID_WIDTH != 0) begin : g_uuid
assign req_uuid[i] = mem_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign rsp_uuid[i] = mem_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_no_uuid
assign req_uuid[i] = 0;
assign rsp_uuid[i] = 0;
end
end
wire [NUM_BANKS-1:0][TAG_WIDTH-UUID_WIDTH-1:0] per_bank_req_tag_value;
wire [NUM_BANKS-1:0][`UP(UUID_WIDTH)-1:0] per_bank_req_uuid;
wire [NUM_BANKS-1:0][TAG_WIDTH-UUID_WIDTH-1:0] per_bank_rsp_tag_value;
wire [NUM_BANKS-1:0][`UP(UUID_WIDTH)-1:0] per_bank_rsp_uuid;
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_req_uuid
assign per_bank_req_tag_value[i] = per_bank_req_tag[i][TAG_WIDTH-UUID_WIDTH-1:0];
assign per_bank_rsp_tag_value[i] = per_bank_rsp_tag[i][TAG_WIDTH-UUID_WIDTH-1:0];
if (UUID_WIDTH != 0) begin : g_uuid
assign per_bank_req_uuid[i] = per_bank_req_tag[i][TAG_WIDTH-1 -: UUID_WIDTH];
assign per_bank_rsp_uuid[i] = per_bank_rsp_tag[i][TAG_WIDTH-1 -: UUID_WIDTH];
@ -329,16 +321,16 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, req_uuid[i]))
`TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, req_uuid[i]))
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end
end
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.tag, mem_bus_if[i].rsp_data.data[i], rsp_uuid[i]))
`TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end
end
end
@ -347,16 +339,16 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
if (per_bank_req_rw[i]) begin
`TRACE(2, ("%t: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_uuid[i]))
`TRACE(2, ("%t: %s-bank%0d wr-req: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
end else begin
`TRACE(2, ("%t: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag[i], per_bank_req_uuid[i]))
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
end
end
if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin
`TRACE(2, ("%t: %s-bank%0d rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_rsp_tag[i], per_bank_rsp_data[i], per_bank_rsp_uuid[i]))
`TRACE(2, ("%t: %s-bank%0d rd-rsp: data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_rsp_data[i], per_bank_rsp_tag_value[i], per_bank_rsp_uuid[i]))
end
end
end

View file

@ -92,8 +92,8 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_bus_rsp
assign rsp_valid_out[i] = mem_bus_if[i].rsp_valid;
assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data;
assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag;
assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data;
assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag;
assign mem_bus_if[i].rsp_ready = rsp_ready_out[i];
end

View file

@ -0,0 +1,185 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_lsu_mem_arb: arbitrates NUM_INPUTS multi-lane LSU memory interfaces
// (VX_lsu_mem_if) onto NUM_OUTPUTS interfaces; NUM_INPUTS >= NUM_OUTPUTS is
// enforced by a static assertion below.
//
// Request path : each input's req_data is flattened, arbitrated by
//                VX_stream_arb, then unpacked on the output side. The arbiter's
//                sel_out is inserted into the outgoing tag at bit position
//                TAG_SEL_IDX through VX_bits_insert.
// Response path: when NUM_INPUTS > NUM_OUTPUTS, VX_bits_remove splits the
//                select bits back out of the response tag and VX_stream_switch
//                steers the response using them. Otherwise responses go
//                through a VX_stream_arb whose sel_out is left unconnected.
//
// REQ_OUT_BUF / RSP_OUT_BUF and ARBITER are forwarded to the stream primitives.
module VX_lsu_mem_arb #(
parameter NUM_INPUTS = 1,
parameter NUM_OUTPUTS = 1,
parameter NUM_LANES = 1,
parameter DATA_SIZE = 1,
parameter TAG_WIDTH = 1,
parameter TAG_SEL_IDX = 0,
parameter REQ_OUT_BUF = 0,
parameter RSP_OUT_BUF = 0,
parameter `STRING ARBITER = "R",
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
parameter FLAGS_WIDTH = `MEM_REQ_FLAGS_WIDTH
) (
input wire clk,
input wire reset,
VX_lsu_mem_if.slave bus_in_if [NUM_INPUTS],
VX_lsu_mem_if.master bus_out_if [NUM_OUTPUTS]
);
localparam DATA_WIDTH = (8 * DATA_SIZE);
localparam LOG_NUM_REQS = `ARB_SEL_BITS(NUM_INPUTS, NUM_OUTPUTS);
// flattened request: rw (1) + per-lane {mask, addr, data, byteen, flags} + tag
localparam REQ_DATAW = 1 + NUM_LANES * (1 + ADDR_WIDTH + DATA_WIDTH + DATA_SIZE + FLAGS_WIDTH) + TAG_WIDTH;
// flattened response: per-lane {mask, data} + tag
localparam RSP_DATAW = NUM_LANES * (1 + DATA_WIDTH) + TAG_WIDTH;
`STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter: NUM_INPUTS=%0d, NUM_OUTPUTS=%0d", NUM_INPUTS, NUM_OUTPUTS));
// request path: input-side and output-side flattened streams
wire [NUM_INPUTS-1:0] req_valid_in;
wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in;
wire [NUM_INPUTS-1:0] req_ready_in;
wire [NUM_OUTPUTS-1:0] req_valid_out;
wire [NUM_OUTPUTS-1:0][REQ_DATAW-1:0] req_data_out;
wire [NUM_OUTPUTS-1:0][`UP(LOG_NUM_REQS)-1:0] req_sel_out;
wire [NUM_OUTPUTS-1:0] req_ready_out;
// flatten every input interface; the packed req_data struct is assigned as a whole
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in
assign req_valid_in[i] = bus_in_if[i].req_valid;
assign req_data_in[i] = bus_in_if[i].req_data;
assign bus_in_if[i].req_ready = req_ready_in[i];
end
VX_stream_arb #(
.NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (NUM_OUTPUTS),
.DATAW (REQ_DATAW),
.ARBITER (ARBITER),
.OUT_BUF (REQ_OUT_BUF)
) req_arb (
.clk (clk),
.reset (reset),
.valid_in (req_valid_in),
.ready_in (req_ready_in),
.data_in (req_data_in),
.data_out (req_data_out),
.sel_out (req_sel_out),
.valid_out (req_valid_out),
.ready_out (req_ready_out)
);
// unpack each arbitrated request; the outgoing tag is produced by VX_bits_insert
// from the original tag and the arbiter's sel_out
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_bus_out_if
wire [TAG_WIDTH-1:0] req_tag_out;
VX_bits_insert #(
.N (TAG_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_insert (
.data_in (req_tag_out),
.ins_in (req_sel_out[i]),
.data_out (bus_out_if[i].req_data.tag)
);
assign bus_out_if[i].req_valid = req_valid_out[i];
// NOTE(review): field order here must match the req_data_t packing of
// VX_lsu_mem_if, which is declared outside this module - confirm on change
assign {
bus_out_if[i].req_data.mask,
bus_out_if[i].req_data.rw,
bus_out_if[i].req_data.addr,
bus_out_if[i].req_data.data,
bus_out_if[i].req_data.byteen,
bus_out_if[i].req_data.flags,
req_tag_out
} = req_data_out[i];
assign req_ready_out[i] = bus_out_if[i].req_ready;
end
///////////////////////////////////////////////////////////////////////////
// response path: *_in signals come from bus_out_if, *_out signals feed bus_in_if
wire [NUM_INPUTS-1:0] rsp_valid_out;
wire [NUM_INPUTS-1:0][RSP_DATAW-1:0] rsp_data_out;
wire [NUM_INPUTS-1:0] rsp_ready_out;
wire [NUM_OUTPUTS-1:0] rsp_valid_in;
wire [NUM_OUTPUTS-1:0][RSP_DATAW-1:0] rsp_data_in;
wire [NUM_OUTPUTS-1:0] rsp_ready_in;
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_rsp_enabled
wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS-1:0] rsp_sel_in;
// split each response tag into the select bits (rsp_sel_in) and the remaining tag
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
wire [TAG_WIDTH-1:0] rsp_tag_out;
VX_bits_remove #(
.N (TAG_WIDTH + LOG_NUM_REQS),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_remove (
.data_in (bus_out_if[i].rsp_data.tag),
.sel_out (rsp_sel_in[i]),
.data_out (rsp_tag_out)
);
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
assign rsp_data_in[i] = {
bus_out_if[i].rsp_data.mask,
bus_out_if[i].rsp_data.data,
rsp_tag_out
};
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
end
// note the swapped port counts: responses flow from bus_out_if back to bus_in_if
VX_stream_switch #(
.NUM_INPUTS (NUM_OUTPUTS),
.NUM_OUTPUTS (NUM_INPUTS),
.DATAW (RSP_DATAW),
.OUT_BUF (RSP_OUT_BUF)
) rsp_switch (
.clk (clk),
.reset (reset),
.sel_in (rsp_sel_in),
.valid_in (rsp_valid_in),
.ready_in (rsp_ready_in),
.data_in (rsp_data_in),
.data_out (rsp_data_out),
.valid_out (rsp_valid_out),
.ready_out (rsp_ready_out)
);
end else begin : g_passthru
// given the static assertion above, this branch is taken when NUM_INPUTS == NUM_OUTPUTS;
// the response struct is forwarded whole, without touching the tag
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
end
// NOTE(review): this instance sits on the response path but is named req_arb;
// it is scoped inside g_passthru, so it does not clash with the request arbiter
VX_stream_arb #(
.NUM_INPUTS (NUM_OUTPUTS),
.NUM_OUTPUTS (NUM_INPUTS),
.DATAW (RSP_DATAW),
.ARBITER (ARBITER),
.OUT_BUF (RSP_OUT_BUF)
) req_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_valid_in),
.ready_in (rsp_ready_in),
.data_in (rsp_data_in),
.data_out (rsp_data_out),
.valid_out (rsp_valid_out),
.ready_out (rsp_ready_out),
`UNUSED_PIN (sel_out)
);
end
// drive the response side of every input interface from the flattened stream
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_output
assign bus_in_if[i].rsp_valid = rsp_valid_out[i];
assign bus_in_if[i].rsp_data = rsp_data_out[i];
assign rsp_ready_out[i] = bus_in_if[i].rsp_ready;
end
endmodule

View file

@ -16,26 +16,32 @@
interface VX_lsu_mem_if #(
parameter NUM_LANES = 1,
parameter DATA_SIZE = 1,
parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH,
parameter TAG_WIDTH = 1,
parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE)
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE),
parameter UUID_WIDTH = `UUID_WIDTH
) ();
typedef struct packed {
logic rw;
logic [NUM_LANES-1:0] mask;
logic [`UP(UUID_WIDTH)-1:0] uuid;
logic [TAG_WIDTH-`UP(UUID_WIDTH)-1:0] value;
} tag_t;
typedef struct packed {
logic [NUM_LANES-1:0] mask;
logic rw;
logic [NUM_LANES-1:0][ADDR_WIDTH-1:0] addr;
logic [NUM_LANES-1:0][DATA_SIZE*8-1:0] data;
logic [NUM_LANES-1:0][DATA_SIZE-1:0] byteen;
logic [NUM_LANES-1:0][FLAGS_WIDTH-1:0] flags;
logic [TAG_WIDTH-1:0] tag;
tag_t tag;
} req_data_t;
typedef struct packed {
logic [NUM_LANES-1:0] mask;
logic [NUM_LANES-1:0] mask;
logic [NUM_LANES-1:0][DATA_SIZE*8-1:0] data;
logic [TAG_WIDTH-1:0] tag;
tag_t tag;
} rsp_data_t;
logic req_valid;

View file

@ -17,13 +17,14 @@ module VX_mem_arb #(
parameter NUM_INPUTS = 1,
parameter NUM_OUTPUTS = 1,
parameter DATA_SIZE = 1,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
parameter TAG_WIDTH = 1,
parameter TAG_SEL_IDX = 0,
parameter REQ_OUT_BUF = 0,
parameter RSP_OUT_BUF = 0,
parameter `STRING ARBITER = "R"
parameter `STRING ARBITER = "R",
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
parameter FLAGS_WIDTH = `MEM_REQ_FLAGS_WIDTH
) (
input wire clk,
input wire reset,
@ -33,10 +34,10 @@ module VX_mem_arb #(
);
localparam DATA_WIDTH = (8 * DATA_SIZE);
localparam LOG_NUM_REQS = `ARB_SEL_BITS(NUM_INPUTS, NUM_OUTPUTS);
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH;
localparam REQ_DATAW = 1 + ADDR_WIDTH + DATA_WIDTH + DATA_SIZE + FLAGS_WIDTH + TAG_WIDTH;
localparam RSP_DATAW = DATA_WIDTH + TAG_WIDTH;
`STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter"))
`STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter: NUM_INPUTS=%0d, NUM_OUTPUTS=%0d", NUM_INPUTS, NUM_OUTPUTS));
wire [NUM_INPUTS-1:0] req_valid_in;
wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in;
@ -49,14 +50,7 @@ module VX_mem_arb #(
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in
assign req_valid_in[i] = bus_in_if[i].req_valid;
assign req_data_in[i] = {
bus_in_if[i].req_data.rw,
bus_in_if[i].req_data.byteen,
bus_in_if[i].req_data.addr,
bus_in_if[i].req_data.flags,
bus_in_if[i].req_data.data,
bus_in_if[i].req_data.tag
};
assign req_data_in[i] = bus_in_if[i].req_data;
assign bus_in_if[i].req_ready = req_ready_in[i];
end
@ -92,10 +86,10 @@ module VX_mem_arb #(
assign bus_out_if[i].req_valid = req_valid_out[i];
assign {
bus_out_if[i].req_data.rw,
bus_out_if[i].req_data.byteen,
bus_out_if[i].req_data.addr,
bus_out_if[i].req_data.flags,
bus_out_if[i].req_data.data,
bus_out_if[i].req_data.byteen,
bus_out_if[i].req_data.flags,
req_tag_out
} = req_data_out[i];
assign req_ready_out[i] = bus_out_if[i].req_ready;
@ -123,18 +117,12 @@ module VX_mem_arb #(
.POS (TAG_SEL_IDX)
) bits_remove (
.data_in (bus_out_if[i].rsp_data.tag),
.sel_out (rsp_sel_in[i]),
.data_out (rsp_tag_out)
);
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
assign rsp_data_in[i] = {rsp_tag_out, bus_out_if[i].rsp_data.data};
assign rsp_data_in[i] = {bus_out_if[i].rsp_data.data, rsp_tag_out};
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
if (NUM_INPUTS > 1) begin : g_rsp_sel_in
assign rsp_sel_in[i] = bus_out_if[i].rsp_data.tag[TAG_SEL_IDX +: LOG_NUM_REQS];
end else begin : g_no_rsp_sel_in
assign rsp_sel_in[i] = '0;
end
end
VX_stream_switch #(
@ -158,10 +146,7 @@ module VX_mem_arb #(
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
assign rsp_data_in[i] = {
bus_out_if[i].rsp_data.tag,
bus_out_if[i].rsp_data.data
};
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
end
@ -187,10 +172,7 @@ module VX_mem_arb #(
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_output
assign bus_in_if[i].rsp_valid = rsp_valid_out[i];
assign {
bus_in_if[i].rsp_data.tag,
bus_in_if[i].rsp_data.data
} = rsp_data_out[i];
assign bus_in_if[i].rsp_data = rsp_data_out[i];
assign rsp_ready_out[i] = bus_in_if[i].rsp_ready;
end

View file

@ -18,21 +18,27 @@ interface VX_mem_bus_if #(
parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH,
parameter TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE)
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE),
parameter UUID_WIDTH = `UUID_WIDTH
) ();
// Request/response tag split into two fields. Being a packed struct, the
// first member (uuid) occupies the most-significant bits and value takes the
// remaining TAG_WIDTH - `UP(UUID_WIDTH) bits.
// NOTE(review): `UP is defined elsewhere; presumably it clamps a zero width
// up to 1 - confirm, since value's width depends on it.
typedef struct packed {
logic [`UP(UUID_WIDTH)-1:0] uuid;
logic [TAG_WIDTH-`UP(UUID_WIDTH)-1:0] value;
} tag_t;
// Memory request payload. Packed MSB-first in declaration order:
// rw, addr, data, byteen, flags, tag.
// NOTE(review): tag is declared twice below (once as a plain TAG_WIDTH
// vector, once as tag_t). This looks like the before/after pair of a diff
// whose +/- markers were lost - confirm which declaration is current.
typedef struct packed {
logic rw;
logic [ADDR_WIDTH-1:0] addr;
logic [DATA_SIZE*8-1:0] data;
logic [DATA_SIZE-1:0] byteen;
logic [FLAGS_WIDTH-1:0] flags;
logic [TAG_WIDTH-1:0] tag;
tag_t tag;
} req_data_t;
// Memory response payload. Packed MSB-first in declaration order: data, tag.
// NOTE(review): tag is declared twice below (plain TAG_WIDTH vector and
// tag_t); this looks like the before/after pair of a diff whose +/- markers
// were lost - confirm which declaration is current.
typedef struct packed {
logic [DATA_SIZE*8-1:0] data;
logic [TAG_WIDTH-1:0] tag;
tag_t tag;
} rsp_data_t;
logic req_valid;

View file

@ -14,21 +14,25 @@
`include "VX_define.vh"
// VX_mem_switch: forwards memory-bus requests from bus_in_if to bus_out_if
// through a VX_stream_switch instance steered by bus_sel, and returns the
// responses from bus_out_if to bus_in_if through a VX_stream_arb instance.
// NOTE(review): this listing appears to be a diff with its +/- markers
// stripped, so superseded and replacement lines sit side by side (NUM_REQS,
// ADDR_WIDTH, bus_sel, bus_in_if and bus_out_if are each declared twice
// below, and the "@ ..." lines are hunk headers hiding unchanged text).
// Confirm which line of each pair is current before editing.
module VX_mem_switch import VX_gpu_pkg::*; #(
parameter NUM_REQS = 1,
parameter NUM_INPUTS = 1,
parameter NUM_OUTPUTS = 1,
parameter DATA_SIZE = 1,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
parameter TAG_WIDTH = 1,
parameter ADDR_WIDTH = 1,
parameter REQ_OUT_BUF = 0,
parameter RSP_OUT_BUF = 0,
parameter `STRING ARBITER = "R",
// `CDIV of the larger port count by the smaller one
parameter NUM_REQS = (NUM_INPUTS > NUM_OUTPUTS) ? `CDIV(NUM_INPUTS, NUM_OUTPUTS) : `CDIV(NUM_OUTPUTS, NUM_INPUTS),
// number of bus_sel entries: the smaller of the two port counts
parameter SEL_COUNT = `MIN(NUM_INPUTS, NUM_OUTPUTS),
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS)
) (
input wire clk,
input wire reset,
input wire [`UP(LOG_NUM_REQS)-1:0] bus_sel,
VX_mem_bus_if.slave bus_in_if,
VX_mem_bus_if.master bus_out_if [NUM_REQS]
input wire [SEL_COUNT-1:0][`UP(LOG_NUM_REQS)-1:0] bus_sel,
VX_mem_bus_if.slave bus_in_if [NUM_INPUTS],
VX_mem_bus_if.master bus_out_if [NUM_OUTPUTS]
);
localparam DATA_WIDTH = (8 * DATA_SIZE);
// request payload width: tag + addr + flags + 1 + DATA_SIZE + data bits
// (presumably the 1 is rw and DATA_SIZE is byteen - confirm against req_data_t)
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
@ -36,46 +40,62 @@ module VX_mem_switch import VX_gpu_pkg::*; #(
// handle requests ////////////////////////////////////////////////////////
wire [NUM_REQS-1:0] req_valid_out;
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_out;
wire [NUM_REQS-1:0] req_ready_out;
wire [NUM_INPUTS-1:0] req_valid_in;
wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in;
wire [NUM_INPUTS-1:0] req_ready_in;
wire [NUM_OUTPUTS-1:0] req_valid_out;
wire [NUM_OUTPUTS-1:0][REQ_DATAW-1:0] req_data_out;
wire [NUM_OUTPUTS-1:0] req_ready_out;
// flatten each input interface's request channel into plain vectors
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in
assign req_valid_in[i] = bus_in_if[i].req_valid;
assign req_data_in[i] = bus_in_if[i].req_data;
assign bus_in_if[i].req_ready = req_ready_in[i];
end
// request path: stream switch whose select input is bus_sel
VX_stream_switch #(
.NUM_OUTPUTS (NUM_REQS),
.NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (NUM_OUTPUTS),
.DATAW (REQ_DATAW),
.OUT_BUF (REQ_OUT_BUF)
) req_switch (
.clk (clk),
.reset (reset),
.sel_in (bus_sel),
.valid_in (bus_in_if.req_valid),
.data_in (bus_in_if.req_data),
.ready_in (bus_in_if.req_ready),
.valid_in (req_valid_in),
.data_in (req_data_in),
.ready_in (req_ready_in),
.valid_out (req_valid_out),
.data_out (req_data_out),
.ready_out (req_ready_out)
);
// drive each output interface's request channel from the switch outputs
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_req_data_out
assign bus_out_if[i].req_valid = req_valid_out[i];
assign bus_out_if[i].req_data = req_data_out[i];
assign bus_out_if[i].req_data = req_data_out[i];
assign req_ready_out[i] = bus_out_if[i].req_ready;
end
// handle responses ///////////////////////////////////////////////////////
wire [NUM_REQS-1:0] rsp_valid_in;
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_in;
wire [NUM_REQS-1:0] rsp_ready_in;
wire [NUM_OUTPUTS-1:0] rsp_valid_in;
wire [NUM_OUTPUTS-1:0][RSP_DATAW-1:0] rsp_data_in;
wire [NUM_OUTPUTS-1:0] rsp_ready_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [NUM_INPUTS-1:0] rsp_valid_out;
wire [NUM_INPUTS-1:0][RSP_DATAW-1:0] rsp_data_out;
wire [NUM_INPUTS-1:0] rsp_ready_out;
// flatten each output interface's response channel into plain vectors
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
end
// response path: arbiter with the port counts reversed relative to the
// request switch (NUM_OUTPUTS arbiter inputs, NUM_INPUTS arbiter outputs)
VX_stream_arb #(
.NUM_INPUTS (NUM_REQS),
.NUM_INPUTS (NUM_OUTPUTS),
.NUM_OUTPUTS(NUM_INPUTS),
.DATAW (RSP_DATAW),
.ARBITER (ARBITER),
.OUT_BUF (RSP_OUT_BUF)
@ -85,10 +105,16 @@ module VX_mem_switch import VX_gpu_pkg::*; #(
.valid_in (rsp_valid_in),
.data_in (rsp_data_in),
.ready_in (rsp_ready_in),
.valid_out (bus_in_if.rsp_valid),
.data_out (bus_in_if.rsp_data),
.ready_out (bus_in_if.rsp_ready),
.valid_out (rsp_valid_out),
.data_out (rsp_data_out),
.ready_out (rsp_ready_out),
`UNUSED_PIN (sel_out)
);
// drive each input interface's response channel from the arbiter outputs
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_rsp_data_out
assign bus_in_if[i].rsp_valid = rsp_valid_out[i];
assign bus_in_if[i].rsp_data = rsp_data_out[i];
assign rsp_ready_out[i] = bus_in_if[i].rsp_ready;
end
endmodule

View file

@ -152,7 +152,9 @@ public:
// start
device_->reset = 0;
device_->mem_req_ready = 1;
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
device_->mem_req_ready[b] = 1;
}
// wait on device to go busy
while (!device_->busy) {
@ -186,11 +188,14 @@ private:
this->dcr_bus_reset();
print_bufs_.clear();
pending_mem_reqs_.clear();
{
for (auto& reqs : pending_mem_reqs_) {
reqs.clear();
}
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
std::queue<mem_req_t*> empty;
std::swap(dram_queue_, empty);
std::swap(dram_queue_[b], empty);
}
device_->reset = 1;
@ -217,17 +222,19 @@ private:
dram_sim_.tick();
if (!dram_queue_.empty()) {
auto mem_req = dram_queue_.front();
if (dram_sim_.send_request(mem_req->write, mem_req->addr, 0, [](void* arg) {
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
if (orig_req->ready) {
delete orig_req;
} else {
orig_req->ready = true;
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
if (!dram_queue_[b].empty()) {
auto mem_req = dram_queue_[b].front();
if (dram_sim_.send_request(mem_req->write, mem_req->addr, b, [](void* arg) {
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
if (orig_req->ready) {
delete orig_req;
} else {
orig_req->ready = true;
}
}, mem_req)) {
dram_queue_[b].pop();
}
}, mem_req)) {
dram_queue_.pop();
}
}
@ -247,101 +254,107 @@ private:
}
// Clears the memory-bus handshake signals the simulator drives into the
// device: mem_req_ready and mem_rsp_valid are set to 0.
// NOTE(review): both a scalar form and a per-bank loop form appear below;
// this looks like the before/after of a diff whose +/- markers were lost
// (the scalar assignments would not fit signals indexed by bank) - confirm
// which form is current.
void mem_bus_reset() {
device_->mem_req_ready = 0;
device_->mem_rsp_valid = 0;
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
device_->mem_req_ready[b] = 0;
device_->mem_rsp_valid[b] = 0;
}
}
// Services the device's memory bus for one clock phase.
//
// When clk is false, only the device's mem_rsp_ready is latched into
// mem_rd_rsp_ready_ and the function returns. Otherwise:
//  - a response that was valid and accepted (per the latched ready) is retired;
//  - if no response is being presented and the oldest pending read is marked
//    ready, its data and tag are driven onto mem_rsp_data / mem_rsp_tag and
//    the request object is erased from pending_mem_reqs_ and deleted;
//  - an accepted request (mem_req_valid && mem_req_ready) is decoded using
//    byte_addr = mem_req_addr * MEM_BLOCK_SIZE:
//      * writes inside [IO_COUT_ADDR, IO_COUT_ADDR + IO_COUT_SIZE) append the
//        enabled bytes to print_bufs_ (one buffer per byte index) and flush a
//        buffer to std::cout when it receives '\n';
//      * other writes copy the byte-enabled bytes into ram_ and push a request
//        with write = true, ready = true onto dram_queue_;
//      * reads copy MEM_BLOCK_SIZE bytes from ram_ into a new request
//        (write = false, ready = false), append it to pending_mem_reqs_ and
//        push it onto dram_queue_.
//
// NOTE(review): this listing interleaves a single-port version and a per-bank
// version (indexed by b over PLATFORM_MEMORY_BANKS) of the same logic, as if
// it were a diff with its +/- markers stripped; the block-comment delimiters
// of the two versions also interleave. Confirm which lines are current before
// relying on the brace structure below.
void mem_bus_eval(bool clk) {
// clk low: sample the device's response-ready and do nothing else
if (!clk) {
mem_rd_rsp_ready_ = device_->mem_rsp_ready;
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
mem_rd_rsp_ready_[b] = device_->mem_rsp_ready[b];
}
return;
}
// process memory read responses
if (device_->mem_rsp_valid && mem_rd_rsp_ready_) {
device_->mem_rsp_valid = 0;
}
if (!device_->mem_rsp_valid) {
if (!pending_mem_reqs_.empty()
&& (*pending_mem_reqs_.begin())->ready) {
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_rsp = *mem_rsp_it;
/*printf("%0ld: [sim] MEM Rd Rsp: tag=0x%0lx, addr=0x%0lx, data=0x", timestamp, mem_rsp->tag, mem_rsp->addr);
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
printf("%02x", mem_rsp->data[i]);
}
printf("\n");
*/
device_->mem_rsp_valid = 1;
memcpy(VDataCast<void*, MEM_BLOCK_SIZE>::get(device_->mem_rsp_data), mem_rsp->data.data(), MEM_BLOCK_SIZE);
device_->mem_rsp_tag = mem_rsp->tag;
pending_mem_reqs_.erase(mem_rsp_it);
delete mem_rsp;
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
// process memory read responses
if (device_->mem_rsp_valid[b] && mem_rd_rsp_ready_[b]) {
device_->mem_rsp_valid[b] = 0;
}
}
// process memory requests
if (device_->mem_req_valid && device_->mem_req_ready) {
uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
if (device_->mem_req_rw) {
auto byteen = device_->mem_req_byteen;
auto data = VDataCast<uint8_t*, MEM_BLOCK_SIZE>::get(device_->mem_req_data);
if (byte_addr >= uint64_t(IO_COUT_ADDR)
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
// process console output
for (int i = 0; i < IO_COUT_SIZE; i++) {
if ((byteen >> i) & 0x1) {
auto& ss_buf = print_bufs_[i];
char c = data[i];
ss_buf << c;
if (c == '\n') {
std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
ss_buf.str("");
}
}
}
} else {
// process writes
/*
printf("%0ld: [sim] MEM Wr Req: tag=0x%0lx, addr=0x%0lx, byteen=0x", timestamp, device_->mem_req_tag, byte_addr);
for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) {
printf("%x", (int)((byteen >> (4 * i)) & 0xf));
}
printf(", data=0x");
if (!device_->mem_rsp_valid[b]) {
if (!pending_mem_reqs_[b].empty()
&& (*pending_mem_reqs_[b].begin())->ready) {
auto mem_rsp_it = pending_mem_reqs_[b].begin();
auto mem_rsp = *mem_rsp_it;
/*printf("%0ld: [sim] MEM Rd Rsp: tag=0x%0lx, addr=0x%0lx, data=0x", timestamp, mem_rsp->tag, mem_rsp->addr);
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
printf("%d=%02x,", i, data[i]);
printf("%02x", mem_rsp->data[i]);
}
printf("\n");
*/
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
(*ram_)[byte_addr + i] = data[i];
}
}
device_->mem_rsp_valid[b] = 1;
memcpy(VDataCast<void*, MEM_BLOCK_SIZE>::get(device_->mem_rsp_data[b]), mem_rsp->data.data(), MEM_BLOCK_SIZE);
device_->mem_rsp_tag[b] = mem_rsp->tag;
pending_mem_reqs_[b].erase(mem_rsp_it);
delete mem_rsp;
}
}
// process memory requests
if (device_->mem_req_valid[b] && device_->mem_req_ready[b]) {
uint64_t byte_addr = (device_->mem_req_addr[b] * MEM_BLOCK_SIZE);
if (device_->mem_req_rw[b]) {
auto byteen = device_->mem_req_byteen[b];
auto data = VDataCast<uint8_t*, MEM_BLOCK_SIZE>::get(device_->mem_req_data[b]);
if (byte_addr >= uint64_t(IO_COUT_ADDR)
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
// process console output
for (int i = 0; i < IO_COUT_SIZE; i++) {
if ((byteen >> i) & 0x1) {
auto& ss_buf = print_bufs_[i];
char c = data[i];
ss_buf << c;
if (c == '\n') {
std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
ss_buf.str("");
}
}
}
} else {
// process writes
/*
printf("%0ld: [sim] MEM Wr Req: tag=0x%0lx, addr=0x%0lx, byteen=0x", timestamp, device_->mem_req_tag, byte_addr);
for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) {
printf("%x", (int)((byteen >> (4 * i)) & 0xf));
}
printf(", data=0x");
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
printf("%d=%02x,", i, data[i]);
}
printf("\n");
*/
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
(*ram_)[byte_addr + i] = data[i];
}
}
auto mem_req = new mem_req_t();
mem_req->tag = device_->mem_req_tag[b];
mem_req->addr = byte_addr;
mem_req->write = true;
mem_req->ready = true;
// send dram request
dram_queue_[b].push(mem_req);
}
} else {
// process reads
auto mem_req = new mem_req_t();
mem_req->tag = device_->mem_req_tag;
mem_req->tag = device_->mem_req_tag[b];
mem_req->addr = byte_addr;
mem_req->write = true;
mem_req->ready = true;
mem_req->write = false;
mem_req->ready = false;
ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE);
pending_mem_reqs_[b].emplace_back(mem_req);
//printf("%0ld: [sim] MEM Rd Req: addr=0x%0lx, tag=0x%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
// send dram request
dram_queue_.push(mem_req);
dram_queue_[b].push(mem_req);
}
} else {
// process reads
auto mem_req = new mem_req_t();
mem_req->tag = device_->mem_req_tag;
mem_req->addr = byte_addr;
mem_req->write = false;
mem_req->ready = false;
ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE);
pending_mem_reqs_.emplace_back(mem_req);
//printf("%0ld: [sim] MEM Rd Req: addr=0x%0lx, tag=0x%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
// send dram request
dram_queue_.push(mem_req);
}
}
}
@ -369,21 +382,21 @@ private:
// NOTE(review): several members below are declared twice (scalar form and
// per-bank / reordered form); this looks like the before/after of a diff
// whose +/- markers were lost - confirm which declarations are current.

// console-output line buffers, keyed by the byte index within the
// console-output address window
std::unordered_map<int, std::stringstream> print_bufs_;
// read requests awaiting a response, in arrival order
std::list<mem_req_t*> pending_mem_reqs_;
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS];
// requests (reads and writes) waiting to be submitted to dram_sim_
std::queue<mem_req_t*> dram_queue_;
std::queue<mem_req_t*> dram_queue_[PLATFORM_MEMORY_BANKS];
// device mem_rsp_ready as sampled while clk was low
std::array<bool, PLATFORM_MEMORY_BANKS> mem_rd_rsp_ready_;
DramSim dram_sim_;
VVortex* device_;
// backing memory that writes are applied to and reads are served from
RAM* ram_;
#ifdef VCD_OUTPUT
VerilatedVcdC *tfp_;
#endif
bool mem_rd_rsp_ready_;
RAM* ram_;
};
///////////////////////////////////////////////////////////////////////////////