mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-18 19:16:51 -04:00
multiport
This commit is contained in:
parent
aa6a47eb11
commit
70ade222b1
39 changed files with 1636 additions and 1129 deletions
|
@ -323,8 +323,10 @@ config2()
|
|||
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
|
||||
# test memory ports
|
||||
CONFIGS="-DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=demo
|
||||
CONFIGS="-DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=demo --threads=32
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=4" ./ci/blackbox.sh --driver=simx --app=sgemmx --threads=16
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --threads=16
|
||||
|
||||
echo "configuration-2 tests done!"
|
||||
}
|
||||
|
|
|
@ -33,7 +33,13 @@ The recommended method to enable debugging is to pass the `--debug` flag to `bla
|
|||
// Running demo program on rtlsim in debug mode
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
|
||||
|
||||
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution. You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc..). [GTKwave] (http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
|
||||
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc.). A waveform trace `trace.vcd` is also generated in the current directory during the program execution.
|
||||
By default, all library modules under the /libs/ folder are excluded from the trace to reduce the waveform file size. You can change that behavior by either explicitly commenting out `TRACING_OFF`/`TRACING_ON` inside a lib module source (e.g. VX_stream_buffer.sv) or simply enabling a full trace using the following command.
|
||||
|
||||
// Debugging the demo program with rtlsim in full tracing mode
|
||||
$ CONFIGS="-DTRACING_ALL" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
|
||||
|
||||
You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc.). [GTKWave](http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
|
||||
|
||||
## FPGA Debugging
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
// Memory
|
||||
VX_mem_bus_if.master mem_bus_if,
|
||||
VX_mem_bus_if.master mem_bus_if [`L2_MEM_PORTS],
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
|
@ -79,7 +79,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) per_socket_mem_bus_if[`NUM_SOCKETS]();
|
||||
) per_socket_mem_bus_if[`NUM_SOCKETS * `L1_MEM_PORTS]();
|
||||
|
||||
`RESET_RELAY (l2_reset, reset);
|
||||
|
||||
|
@ -91,6 +91,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (`L2_NUM_WAYS),
|
||||
.WORD_SIZE (L2_WORD_SIZE),
|
||||
.NUM_REQS (L2_NUM_REQS),
|
||||
.MEM_PORTS (`L2_MEM_PORTS),
|
||||
.CRSQ_SIZE (`L2_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`L2_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
||||
|
@ -144,7 +145,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
|
||||
.dcr_bus_if (socket_dcr_bus_if),
|
||||
|
||||
.mem_bus_if (per_socket_mem_bus_if[socket_id]),
|
||||
.mem_bus_if (per_socket_mem_bus_if[socket_id * `L1_MEM_PORTS +: `L1_MEM_PORTS]),
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (per_socket_gbar_bus_if[socket_id]),
|
||||
|
|
|
@ -270,14 +270,14 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, uuid_width) \
|
||||
(uuid_width + `CLOG2(mshr_size) + `CLOG2(num_banks))
|
||||
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width) \
|
||||
(uuid_width + `CLOG2(mshr_size) + `CLOG2(num_banks / mem_ports))
|
||||
|
||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
|
||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width) \
|
||||
(`CLOG2(`CDIV(num_reqs, mem_ports)) + `CLOG2(line_size / word_size) + tag_width)
|
||||
|
||||
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, uuid_width) \
|
||||
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, uuid_width), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
|
||||
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, uuid_width) \
|
||||
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), `CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width)) + 1)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -287,14 +287,14 @@
|
|||
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
|
||||
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
|
||||
|
||||
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches, uuid_width) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, uuid_width), num_caches)
|
||||
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, num_caches, uuid_width) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches)
|
||||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -311,6 +311,7 @@
|
|||
`define MEM_REQ_FLAG_LOCAL 2 // should be last since optional
|
||||
`define MEM_REQ_FLAGS_WIDTH (`MEM_REQ_FLAG_LOCAL + `LMEM_ENABLED)
|
||||
|
||||
`define VX_MEM_PORTS `L3_MEM_PORTS
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
||||
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
||||
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
|
||||
|
@ -388,7 +389,7 @@
|
|||
assign src.rsp_data.tag = dst.rsp_data.tag; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
|
||||
`define ASSIGN_VX_MEM_BUS_IF_EX(dst, src, TD, TS, UUID) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data.rw = src.req_data.rw; \
|
||||
assign dst.req_data.addr = src.req_data.addr; \
|
||||
|
@ -397,7 +398,19 @@
|
|||
assign dst.req_data.flags = src.req_data.flags; \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (TD != TS) begin \
|
||||
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
|
||||
if (UUID != 0) begin \
|
||||
if (TD > TS) begin \
|
||||
assign dst.req_data.tag = {src.req_data.tag.uuid, {(TD-TS){1'b0}}, src.req_data.tag.value}; \
|
||||
end else begin \
|
||||
assign dst.req_data.tag = {src.req_data.tag.uuid, src.req_data.tag.value[TD-UUID-1:0]}; \
|
||||
end \
|
||||
end else begin \
|
||||
if (TD > TS) begin \
|
||||
assign dst.req_data.tag = {{(TD-TS){1'b0}}, src.req_data.tag}; \
|
||||
end else begin \
|
||||
assign dst.req_data.tag = src.req_data.tag[TD-1:0]; \
|
||||
end \
|
||||
end \
|
||||
end else begin \
|
||||
assign dst.req_data.tag = src.req_data.tag; \
|
||||
end \
|
||||
|
@ -405,7 +418,25 @@
|
|||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data.data = dst.rsp_data.data; \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (TD != TS) begin \
|
||||
if (UUID != 0) begin \
|
||||
if (TD > TS) begin \
|
||||
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, dst.rsp_data.tag.value[TS-UUID-1:0]}; \
|
||||
end else begin \
|
||||
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, {(TS-TD){1'b0}}, dst.rsp_data.tag.value}; \
|
||||
end \
|
||||
end else begin \
|
||||
if (TD > TS) begin \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag[TS-1:0]; \
|
||||
end else begin \
|
||||
assign src.rsp_data.tag = {{(TS-TD){1'b0}}, dst.rsp_data.tag}; \
|
||||
end \
|
||||
end \
|
||||
end else begin \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */ \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define BUFFER_DCR_BUS_IF(dst, src, ena, latency) \
|
||||
|
|
|
@ -166,9 +166,9 @@ package VX_gpu_pkg;
|
|||
|
||||
// Memory request tag bits
|
||||
`ifdef ICACHE_ENABLE
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES, `UUID_WIDTH);
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, 1, `NUM_ICACHES, `UUID_WIDTH);
|
||||
`else
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, 1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Dcache Parameters //////////////////////////////
|
||||
|
@ -180,7 +180,7 @@ package VX_gpu_pkg;
|
|||
// Block size in bytes
|
||||
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
// Input request size (using coalesced memory blocks)
|
||||
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
|
||||
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
|
||||
|
||||
|
@ -197,26 +197,27 @@ package VX_gpu_pkg;
|
|||
|
||||
// Memory request tag bits
|
||||
`ifdef DCACHE_ENABLE
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES, `UUID_WIDTH);
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES, `UUID_WIDTH);
|
||||
`else
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||
|
||||
// arbitrate between icache and dcache
|
||||
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
|
||||
|
||||
/////////////////////////////// L2 Parameters /////////////////////////////
|
||||
|
||||
localparam ICACHE_MEM_ARB_IDX = 0;
|
||||
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
|
||||
localparam ICACHE_MEM_ARB_IDX = 0;
|
||||
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
|
||||
|
||||
// Word size in bytes
|
||||
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam L2_NUM_REQS = `NUM_SOCKETS;
|
||||
localparam L2_NUM_REQS = `NUM_SOCKETS * `L1_MEM_PORTS;
|
||||
|
||||
// Core request tag bits
|
||||
localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH;
|
||||
|
@ -226,9 +227,9 @@ package VX_gpu_pkg;
|
|||
|
||||
// Memory request tag bits
|
||||
`ifdef L2_ENABLE
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH, `UUID_WIDTH);
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH, `UUID_WIDTH);
|
||||
`else
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L3 Parameters /////////////////////////////
|
||||
|
@ -237,7 +238,7 @@ package VX_gpu_pkg;
|
|||
localparam L3_WORD_SIZE = `L2_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam L3_NUM_REQS = `NUM_CLUSTERS;
|
||||
localparam L3_NUM_REQS = `NUM_CLUSTERS * `L2_MEM_PORTS;
|
||||
|
||||
// Core request tag bits
|
||||
localparam L3_TAG_WIDTH = L2_MEM_TAG_WIDTH;
|
||||
|
@ -247,9 +248,9 @@ package VX_gpu_pkg;
|
|||
|
||||
// Memory request tag bits
|
||||
`ifdef L3_ENABLE
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH, `UUID_WIDTH);
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH, `UUID_WIDTH);
|
||||
`else
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// Issue parameters //////////////////////////
|
||||
|
|
|
@ -25,11 +25,9 @@
|
|||
`ifdef SIMULATION
|
||||
|
||||
`define STATIC_ASSERT(cond, msg) \
|
||||
generate \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (!(cond)) $error msg; \
|
||||
/* verilator lint_on GENUNNAMED */ \
|
||||
endgenerate
|
||||
|
||||
`define ERROR(msg) \
|
||||
$error msg
|
||||
|
@ -103,7 +101,7 @@ endgenerate
|
|||
`define UNUSED_VAR(x) /* verilator lint_off GENUNNAMED */ \
|
||||
if (1) begin \
|
||||
/* verilator lint_off UNUSED */ \
|
||||
wire [$bits(x)-1:0] __x = x; \
|
||||
wire [$bits(x)-1:0] __unused = x; \
|
||||
/* verilator lint_on UNUSED */ \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */
|
||||
|
|
|
@ -31,7 +31,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
// Memory
|
||||
VX_mem_bus_if.master mem_bus_if,
|
||||
VX_mem_bus_if.master mem_bus_if [`L1_MEM_PORTS],
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
// Barrier
|
||||
|
@ -80,7 +80,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||
) icache_mem_bus_if();
|
||||
) icache_mem_bus_if[1]();
|
||||
|
||||
`RESET_RELAY (icache_reset, reset);
|
||||
|
||||
|
@ -95,6 +95,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (`ICACHE_NUM_WAYS),
|
||||
.WORD_SIZE (ICACHE_WORD_SIZE),
|
||||
.NUM_REQS (1),
|
||||
.MEM_PORTS (1),
|
||||
.CRSQ_SIZE (`ICACHE_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`ICACHE_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
|
||||
|
@ -127,7 +128,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||
) dcache_mem_bus_if();
|
||||
) dcache_mem_bus_if[`L1_MEM_PORTS]();
|
||||
|
||||
`RESET_RELAY (dcache_reset, reset);
|
||||
|
||||
|
@ -142,6 +143,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (`DCACHE_NUM_WAYS),
|
||||
.WORD_SIZE (DCACHE_WORD_SIZE),
|
||||
.NUM_REQS (DCACHE_NUM_REQS),
|
||||
.MEM_PORTS (`L1_MEM_PORTS),
|
||||
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
|
||||
|
@ -168,35 +170,47 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||
) l1_mem_bus_if[2]();
|
||||
for (genvar i = 0; i < `L1_MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
if (i == 0) begin : g_i0
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||
) l1_mem_bus_if[2]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) l1_mem_arb_bus_if[1]();
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) l1_mem_arb_bus_if[1]();
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX(0),
|
||||
.ARBITER ("P"), // prioritize the icache
|
||||
.REQ_OUT_BUF(3),
|
||||
.RSP_OUT_BUF(3)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (l1_mem_bus_if),
|
||||
.bus_out_if (l1_mem_arb_bus_if)
|
||||
);
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX(0),
|
||||
.ARBITER ("P"), // prioritize the icache
|
||||
.REQ_OUT_BUF(3),
|
||||
.RSP_OUT_BUF(3)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (l1_mem_bus_if),
|
||||
.bus_out_if (l1_mem_arb_bus_if)
|
||||
);
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[0], l1_mem_arb_bus_if[0]);
|
||||
end else begin : g_i
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) l1_mem_arb_bus_if();
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_arb_bus_if, dcache_mem_bus_if[i], L1_MEM_ARB_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], l1_mem_arb_bus_if);
|
||||
end
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
|
112
hw/rtl/Vortex.sv
112
hw/rtl/Vortex.sv
|
@ -21,19 +21,19 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
input wire reset,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
|
||||
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
output wire mem_req_valid [`VX_MEM_PORTS-1:0],
|
||||
output wire mem_req_rw [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS],
|
||||
input wire mem_req_ready [`VX_MEM_PORTS],
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
input wire mem_rsp_valid [`VX_MEM_PORTS],
|
||||
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS],
|
||||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS],
|
||||
output wire mem_rsp_ready [`VX_MEM_PORTS],
|
||||
|
||||
// DCR write request
|
||||
input wire dcr_wr_valid,
|
||||
|
@ -60,12 +60,12 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L2_LINE_SIZE),
|
||||
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
|
||||
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
|
||||
) per_cluster_mem_bus_if[`NUM_CLUSTERS * `L2_MEM_PORTS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L3_LINE_SIZE),
|
||||
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
|
||||
) mem_bus_if();
|
||||
) mem_bus_if[`L3_MEM_PORTS]();
|
||||
|
||||
`RESET_RELAY (l3_reset, reset);
|
||||
|
||||
|
@ -77,6 +77,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.NUM_WAYS (`L3_NUM_WAYS),
|
||||
.WORD_SIZE (L3_WORD_SIZE),
|
||||
.NUM_REQS (L3_NUM_REQS),
|
||||
.MEM_PORTS (`L3_MEM_PORTS),
|
||||
.CRSQ_SIZE (`L3_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`L3_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`L3_MRSQ_SIZE),
|
||||
|
@ -104,24 +105,21 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.mem_bus_if (mem_bus_if)
|
||||
);
|
||||
|
||||
assign mem_req_valid = mem_bus_if.req_valid;
|
||||
assign mem_req_rw = mem_bus_if.req_data.rw;
|
||||
assign mem_req_byteen= mem_bus_if.req_data.byteen;
|
||||
assign mem_req_addr = mem_bus_if.req_data.addr;
|
||||
assign mem_req_data = mem_bus_if.req_data.data;
|
||||
assign mem_req_tag = mem_bus_if.req_data.tag;
|
||||
assign mem_bus_if.req_ready = mem_req_ready;
|
||||
`UNUSED_VAR (mem_bus_if.req_data.flags)
|
||||
for (genvar i = 0; i < `L3_MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
assign mem_req_valid[i] = mem_bus_if[i].req_valid;
|
||||
assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
|
||||
assign mem_req_byteen[i]= mem_bus_if[i].req_data.byteen;
|
||||
assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
|
||||
assign mem_req_data[i] = mem_bus_if[i].req_data.data;
|
||||
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
|
||||
`UNUSED_VAR (mem_bus_if[i].req_data.flags)
|
||||
assign mem_bus_if[i].req_ready = mem_req_ready[i];
|
||||
|
||||
assign mem_bus_if.rsp_valid = mem_rsp_valid;
|
||||
assign mem_bus_if.rsp_data.data = mem_rsp_data;
|
||||
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_bus_if.rsp_ready;
|
||||
|
||||
wire mem_req_fire = mem_req_valid && mem_req_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
`UNUSED_VAR (mem_req_fire)
|
||||
`UNUSED_VAR (mem_rsp_fire)
|
||||
assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
|
||||
assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
|
||||
assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
|
||||
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
VX_dcr_bus_if dcr_bus_if();
|
||||
assign dcr_bus_if.write_valid = dcr_wr_valid;
|
||||
|
@ -153,7 +151,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
|
||||
.dcr_bus_if (cluster_dcr_bus_if),
|
||||
|
||||
.mem_bus_if (per_cluster_mem_bus_if[cluster_id]),
|
||||
.mem_bus_if (per_cluster_mem_bus_if[cluster_id * `L2_MEM_PORTS +: `L2_MEM_PORTS]),
|
||||
|
||||
.busy (per_cluster_busy[cluster_id])
|
||||
);
|
||||
|
@ -163,6 +161,26 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
localparam MEM_PORTS_CTR_W = `CLOG2(`VX_MEM_PORTS+1);
|
||||
|
||||
wire [`VX_MEM_PORTS-1:0] mem_req_fire, mem_rsp_fire;
|
||||
wire [`VX_MEM_PORTS-1:0] mem_rd_req_fire, mem_wr_req_fire;
|
||||
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_perf_ctrs
|
||||
assign mem_req_fire[i] = mem_req_valid[i] & mem_req_ready[i];
|
||||
assign mem_rsp_fire[i] = mem_rsp_valid[i] & mem_rsp_ready[i];
|
||||
assign mem_rd_req_fire[i] = mem_req_fire[i] & ~mem_req_rw[i];
|
||||
assign mem_wr_req_fire[i] = mem_req_fire[i] & mem_req_rw[i];
|
||||
end
|
||||
|
||||
wire [MEM_PORTS_CTR_W-1:0] perf_mem_reads_per_cycle;
|
||||
wire [MEM_PORTS_CTR_W-1:0] perf_mem_writes_per_cycle;
|
||||
wire [MEM_PORTS_CTR_W-1:0] perf_mem_rsps_per_cycle;
|
||||
|
||||
`POP_COUNT(perf_mem_reads_per_cycle, mem_rd_req_fire);
|
||||
`POP_COUNT(perf_mem_writes_per_cycle, mem_wr_req_fire);
|
||||
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
|
@ -171,19 +189,16 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
perf_mem_pending_reads <= '0;
|
||||
end else begin
|
||||
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
|
||||
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
|
||||
`PERF_CTR_BITS'($signed((MEM_PORTS_CTR_W+1)'(perf_mem_reads_per_cycle) - (MEM_PORTS_CTR_W+1)'(perf_mem_rsps_per_cycle)));
|
||||
end
|
||||
end
|
||||
|
||||
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
|
||||
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
mem_perf <= '0;
|
||||
end else begin
|
||||
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
|
||||
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
|
||||
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(perf_mem_reads_per_cycle);
|
||||
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(perf_mem_writes_per_cycle);
|
||||
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
|
@ -198,19 +213,18 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
end
|
||||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
wire [`UUID_WIDTH-1:0] mem_req_uuid = mem_req_tag[`VX_MEM_TAG_WIDTH-1 -: `UUID_WIDTH];
|
||||
wire [`UUID_WIDTH-1:0] mem_rsp_uuid = mem_rsp_tag[`VX_MEM_TAG_WIDTH-1 -: `UUID_WIDTH];
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (mem_req_fire) begin
|
||||
if (mem_req_rw) begin
|
||||
`TRACE(2, ("%t: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data, mem_req_uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_uuid))
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_trace
|
||||
always @(posedge clk) begin
|
||||
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
|
||||
if (mem_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: MEM Wr Req[%0d]: addr=0x%0h, byteen=0x%h data=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: MEM Rd Req[%0d]: addr=0x%0h, byteen=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: MEM Rd Rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n", $time, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (mem_rsp_fire) begin
|
||||
`TRACE(2, ("%t: MEM Rd Rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, mem_rsp_tag, mem_rsp_data, mem_rsp_uuid))
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
`else
|
||||
`include "vortex_afu.vh"
|
||||
`endif
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifndef PLATFORM_MEMORY_INTERLEAVE
|
||||
|
|
388
hw/rtl/cache/VX_cache.sv
vendored
388
hw/rtl/cache/VX_cache.sv
vendored
|
@ -19,6 +19,9 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Number of memory ports
|
||||
parameter MEM_PORTS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 32768,
|
||||
// Size of line inside a bank in bytes
|
||||
|
@ -75,17 +78,18 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
|
||||
VX_mem_bus_if.master mem_bus_if
|
||||
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
|
||||
);
|
||||
|
||||
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2"))
|
||||
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable"))
|
||||
`STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback"))
|
||||
`STATIC_ASSERT(NUM_BANKS >= MEM_PORTS, ("invalid parameter: number of banks must be greater or equal to number of memory ports"))
|
||||
|
||||
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
|
||||
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
|
||||
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
|
||||
localparam MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, UUID_WIDTH);
|
||||
localparam MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
|
||||
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
|
||||
localparam WORD_WIDTH = WORD_SIZE * 8;
|
||||
localparam WORD_SEL_BITS = `CLOG2(WORDS_PER_LINE);
|
||||
|
@ -95,6 +99,11 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + `UP(FLAGS_WIDTH);
|
||||
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
|
||||
localparam BANK_MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH;
|
||||
localparam MEM_REQ_DATAW = (`CS_LINE_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH));
|
||||
localparam MEM_RSP_DATAW = `CS_LINE_WIDTH + MEM_TAG_WIDTH;
|
||||
localparam MEM_PORTS_SEL_BITS = `CLOG2(MEM_PORTS);
|
||||
localparam MEM_ARB_SEL_BITS = `CLOG2(`CDIV(NUM_BANKS, MEM_PORTS));
|
||||
localparam MEM_ARB_SEL_WIDTH = `UP(MEM_ARB_SEL_BITS);
|
||||
|
||||
localparam CORE_RSP_REG_DISABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
|
||||
localparam MEM_REQ_REG_DISABLE = (NUM_BANKS != 1);
|
||||
|
@ -135,113 +144,97 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.flush_end (per_bank_flush_end)
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Core response buffering
|
||||
wire [NUM_REQS-1:0] core_rsp_valid_s;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
|
||||
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
|
||||
wire [NUM_REQS-1:0] core_rsp_ready_s;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
|
||||
.SIZE (CORE_RSP_REG_DISABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (core_rsp_valid_s[i]),
|
||||
.ready_in (core_rsp_ready_s[i]),
|
||||
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
|
||||
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
|
||||
.valid_out (core_bus2_if[i].rsp_valid),
|
||||
.ready_out (core_bus2_if[i].rsp_ready)
|
||||
);
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Memory response gather /////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) mem_bus_tmp_if();
|
||||
) mem_bus_tmp_if[MEM_PORTS]();
|
||||
|
||||
// Memory response buffering
|
||||
wire [MEM_PORTS-1:0] mem_rsp_queue_valid;
|
||||
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-1:0] mem_rsp_queue_data;
|
||||
wire [MEM_PORTS-1:0] mem_rsp_queue_ready;
|
||||
|
||||
wire mem_rsp_valid_s;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
|
||||
wire mem_rsp_ready_s;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
|
||||
.SIZE (MRSQ_SIZE),
|
||||
.OUT_REG (MRSQ_SIZE > 2)
|
||||
) mem_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_bus_tmp_if.rsp_valid),
|
||||
.ready_in (mem_bus_tmp_if.rsp_ready),
|
||||
.data_in ({mem_bus_tmp_if.rsp_data.tag, mem_bus_tmp_if.rsp_data.data}),
|
||||
.data_out ({mem_rsp_tag_s, mem_rsp_data_s}),
|
||||
.valid_out (mem_rsp_valid_s),
|
||||
.ready_out (mem_rsp_ready_s)
|
||||
);
|
||||
|
||||
wire [BANK_MEM_TAG_WIDTH-1:0] bank_mem_rsp_tag;
|
||||
wire [`UP(`CS_BANK_SEL_BITS)-1:0] mem_rsp_bank_id;
|
||||
|
||||
if (NUM_BANKS > 1) begin : g_mem_rsp_tag_s_with_banks
|
||||
assign bank_mem_rsp_tag = mem_rsp_tag_s[MEM_TAG_WIDTH-1:`CS_BANK_SEL_BITS];
|
||||
assign mem_rsp_bank_id = mem_rsp_tag_s[`CS_BANK_SEL_BITS-1:0];
|
||||
end else begin : g_mem_rsp_tag_s_no_bank
|
||||
assign bank_mem_rsp_tag = mem_rsp_tag_s;
|
||||
assign mem_rsp_bank_id = 0;
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (MEM_RSP_DATAW),
|
||||
.SIZE (MRSQ_SIZE),
|
||||
.OUT_REG (MRSQ_SIZE > 2)
|
||||
) mem_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_bus_tmp_if[i].rsp_valid),
|
||||
.data_in (mem_bus_tmp_if[i].rsp_data),
|
||||
.ready_in (mem_bus_tmp_if[i].rsp_ready),
|
||||
.valid_out (mem_rsp_queue_valid[i]),
|
||||
.data_out (mem_rsp_queue_data[i]),
|
||||
.ready_out (mem_rsp_queue_ready[i])
|
||||
);
|
||||
end
|
||||
|
||||
// Memory request buffering
|
||||
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] mem_rsp_queue_data_s;
|
||||
wire [MEM_PORTS-1:0][BANK_SEL_WIDTH-1:0] mem_rsp_queue_sel;
|
||||
|
||||
wire mem_req_valid;
|
||||
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr;
|
||||
wire mem_req_rw;
|
||||
wire [LINE_SIZE-1:0] mem_req_byteen;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_data;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_req_tag;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags;
|
||||
wire mem_req_ready;
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_data_s
|
||||
wire [BANK_MEM_TAG_WIDTH-1:0] mem_rsp_tag_s = mem_rsp_queue_data[i][MEM_TAG_WIDTH-1:MEM_ARB_SEL_BITS];
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s = mem_rsp_queue_data[i][MEM_RSP_DATAW-1:MEM_TAG_WIDTH];
|
||||
assign mem_rsp_queue_data_s[i] = {mem_rsp_data_s, mem_rsp_tag_s};
|
||||
end
|
||||
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flush_b;
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_sel
|
||||
if (NUM_BANKS > 1) begin : g_multibanks
|
||||
if (MEM_ARB_SEL_BITS != 0) begin : g_arb_sel
|
||||
VX_bits_concat #(
|
||||
.L (MEM_ARB_SEL_BITS),
|
||||
.R (MEM_PORTS_SEL_BITS)
|
||||
) mem_rsp_sel_concat (
|
||||
.left_in (mem_rsp_queue_data[i][MEM_ARB_SEL_BITS-1:0]),
|
||||
.right_in (MEM_PORTS_SEL_BITS'(i)),
|
||||
.data_out (mem_rsp_queue_sel[i])
|
||||
);
|
||||
end else begin : g_no_arb_sel
|
||||
assign mem_rsp_queue_sel[i] = MEM_PORTS_SEL_BITS'(i);
|
||||
end
|
||||
end else begin : g_singlebank
|
||||
assign mem_rsp_queue_sel[i] = 0;
|
||||
end
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
|
||||
.SIZE (MEM_REQ_REG_DISABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_rsp_valid;
|
||||
wire [NUM_BANKS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] per_bank_mem_rsp_pdata;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
|
||||
|
||||
VX_stream_omega #(
|
||||
.NUM_INPUTS (MEM_PORTS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (MEM_RSP_DATAW-MEM_ARB_SEL_BITS),
|
||||
.ARBITER ("R"),
|
||||
.OUT_BUF (3)
|
||||
) mem_rsp_xbar (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_req_valid),
|
||||
.ready_in (mem_req_ready),
|
||||
.data_in ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data, mem_req_tag, mem_req_flags}),
|
||||
.data_out ({mem_bus_tmp_if.req_data.rw, mem_bus_tmp_if.req_data.byteen, mem_bus_tmp_if.req_data.addr, mem_bus_tmp_if.req_data.data, mem_bus_tmp_if.req_data.tag, mem_req_flush_b}),
|
||||
.valid_out (mem_bus_tmp_if.req_valid),
|
||||
.ready_out (mem_bus_tmp_if.req_ready)
|
||||
.valid_in (mem_rsp_queue_valid),
|
||||
.data_in (mem_rsp_queue_data_s),
|
||||
.sel_in (mem_rsp_queue_sel),
|
||||
.ready_in (mem_rsp_queue_ready),
|
||||
.valid_out (per_bank_mem_rsp_valid),
|
||||
.data_out (per_bank_mem_rsp_pdata),
|
||||
`UNUSED_PIN (sel_out),
|
||||
.ready_out (per_bank_mem_rsp_ready),
|
||||
`UNUSED_PIN (collisions)
|
||||
);
|
||||
|
||||
if (FLAGS_WIDTH != 0) begin : g_mem_req_flags
|
||||
assign mem_bus_tmp_if.req_data.flags = mem_req_flush_b;
|
||||
end else begin : g_no_mem_req_flags
|
||||
assign mem_bus_tmp_if.req_data.flags = '0;
|
||||
`UNUSED_VAR (mem_req_flush_b)
|
||||
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_rsp_data;
|
||||
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_rsp_tag;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_rsp_data
|
||||
assign {
|
||||
per_bank_mem_rsp_data[i],
|
||||
per_bank_mem_rsp_tag[i]
|
||||
} = per_bank_mem_rsp_pdata[i];
|
||||
end
|
||||
|
||||
if (WRITE_ENABLE) begin : g_mem_bus_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if);
|
||||
end else begin : g_mem_bus_if_ro
|
||||
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if, mem_bus_tmp_if);
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Core requests dispatch /////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
|
||||
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
|
||||
|
@ -261,7 +254,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
|
||||
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
|
||||
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
|
||||
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
|
||||
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
|
||||
|
@ -269,14 +262,6 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_mem_req_flags;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
|
||||
|
||||
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
|
||||
|
||||
assign mem_rsp_ready_s = per_bank_mem_rsp_ready[mem_rsp_bank_id];
|
||||
|
||||
// Bank requests dispatch
|
||||
|
||||
wire [NUM_REQS-1:0] core_req_valid;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
|
||||
wire [NUM_REQS-1:0] core_req_rw;
|
||||
|
@ -336,6 +321,8 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
};
|
||||
end
|
||||
|
||||
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] perf_collisions;
|
||||
`endif
|
||||
|
@ -377,12 +364,9 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
} = core_req_data_out[i];
|
||||
end
|
||||
|
||||
// Banks access
|
||||
// Banks access ///////////////////////////////////////////////////////////
|
||||
|
||||
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : g_banks
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
|
||||
|
||||
wire curr_bank_mem_rsp_valid = mem_rsp_valid_s && (mem_rsp_bank_id == bank_id);
|
||||
|
||||
VX_cache_bank #(
|
||||
.BANK_ID (bank_id),
|
||||
.INSTANCE_ID (`SFORMATF(("%s-bank%0d", INSTANCE_ID, bank_id))),
|
||||
|
@ -409,9 +393,9 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_read_misses (perf_read_miss_per_bank[bank_id]),
|
||||
.perf_write_misses (perf_write_miss_per_bank[bank_id]),
|
||||
.perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]),
|
||||
.perf_read_miss (perf_read_miss_per_bank[bank_id]),
|
||||
.perf_write_miss (perf_write_miss_per_bank[bank_id]),
|
||||
.perf_mshr_stall (perf_mshr_stall_per_bank[bank_id]),
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
|
@ -435,7 +419,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
|
||||
// Memory request
|
||||
.mem_req_valid (per_bank_mem_req_valid[bank_id]),
|
||||
.mem_req_addr (curr_bank_mem_req_addr),
|
||||
.mem_req_addr (per_bank_mem_req_addr[bank_id]),
|
||||
.mem_req_rw (per_bank_mem_req_rw[bank_id]),
|
||||
.mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
|
||||
.mem_req_data (per_bank_mem_req_data[bank_id]),
|
||||
|
@ -444,9 +428,9 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.mem_req_ready (per_bank_mem_req_ready[bank_id]),
|
||||
|
||||
// Memory response
|
||||
.mem_rsp_valid (curr_bank_mem_rsp_valid),
|
||||
.mem_rsp_data (mem_rsp_data_s),
|
||||
.mem_rsp_tag (bank_mem_rsp_tag),
|
||||
.mem_rsp_valid (per_bank_mem_rsp_valid[bank_id]),
|
||||
.mem_rsp_data (per_bank_mem_rsp_data[bank_id]),
|
||||
.mem_rsp_tag (per_bank_mem_rsp_tag[bank_id]),
|
||||
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
|
||||
|
||||
// Flush request
|
||||
|
@ -454,19 +438,18 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.flush_uuid (flush_uuid),
|
||||
.flush_end (per_bank_flush_end[bank_id])
|
||||
);
|
||||
|
||||
if (NUM_BANKS == 1) begin : g_per_bank_mem_req_addr_multibanks
|
||||
assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr;
|
||||
end else begin : g_per_bank_mem_req_addr_singlebank
|
||||
assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id);
|
||||
end
|
||||
end
|
||||
|
||||
// Bank responses gather
|
||||
// Core responses gather //////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_BANKS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_in;
|
||||
wire [NUM_REQS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_out;
|
||||
|
||||
wire [NUM_REQS-1:0] core_rsp_valid_s;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
|
||||
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
|
||||
wire [NUM_REQS-1:0] core_rsp_ready_s;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_rsp_data_in
|
||||
assign core_rsp_data_in[i] = {per_bank_core_rsp_data[i], per_bank_core_rsp_tag[i]};
|
||||
end
|
||||
|
@ -494,77 +477,166 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
assign {core_rsp_data_s[i], core_rsp_tag_s[i]} = core_rsp_data_out[i];
|
||||
end
|
||||
|
||||
// Memory request arbitration
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
|
||||
.SIZE (CORE_RSP_REG_DISABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (core_rsp_valid_s[i]),
|
||||
.ready_in (core_rsp_ready_s[i]),
|
||||
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
|
||||
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
|
||||
.valid_out (core_bus2_if[i].rsp_valid),
|
||||
.ready_out (core_bus2_if[i].rsp_ready)
|
||||
);
|
||||
end
|
||||
|
||||
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH))-1:0] data_in;
|
||||
// Memory request arbitration /////////////////////////////////////////////
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_data_in
|
||||
assign data_in[i] = {
|
||||
per_bank_mem_req_addr[i],
|
||||
wire [NUM_BANKS-1:0][MEM_REQ_DATAW-1:0] per_bank_mem_req_pdata;
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_req_pdata
|
||||
assign per_bank_mem_req_pdata[i] = {
|
||||
per_bank_mem_req_rw[i],
|
||||
per_bank_mem_req_byteen[i],
|
||||
per_bank_mem_req_addr[i],
|
||||
per_bank_mem_req_data[i],
|
||||
per_bank_mem_req_tag[i],
|
||||
per_bank_mem_req_flags[i]
|
||||
per_bank_mem_req_byteen[i],
|
||||
per_bank_mem_req_flags[i],
|
||||
per_bank_mem_req_tag[i]
|
||||
};
|
||||
end
|
||||
|
||||
wire [BANK_MEM_TAG_WIDTH-1:0] bank_mem_req_tag;
|
||||
wire [MEM_PORTS-1:0] mem_req_valid;
|
||||
wire [MEM_PORTS-1:0][MEM_REQ_DATAW-1:0] mem_req_pdata;
|
||||
wire [MEM_PORTS-1:0] mem_req_ready;
|
||||
wire [MEM_PORTS-1:0][MEM_ARB_SEL_WIDTH-1:0] mem_req_sel_out;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_BANKS),
|
||||
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
|
||||
.NUM_OUTPUTS(MEM_PORTS),
|
||||
.DATAW (MEM_REQ_DATAW),
|
||||
.ARBITER ("R")
|
||||
) mem_req_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (per_bank_mem_req_valid),
|
||||
.data_in (per_bank_mem_req_pdata),
|
||||
.ready_in (per_bank_mem_req_ready),
|
||||
.data_in (data_in),
|
||||
.data_out ({mem_req_addr, mem_req_rw, mem_req_byteen, mem_req_data, bank_mem_req_tag, mem_req_flags}),
|
||||
.valid_out (mem_req_valid),
|
||||
.data_out (mem_req_pdata),
|
||||
.ready_out (mem_req_ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
.sel_out (mem_req_sel_out)
|
||||
);
|
||||
|
||||
if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks
|
||||
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id = `CS_MEM_ADDR_TO_BANK_ID(mem_req_addr);
|
||||
assign mem_req_tag = MEM_TAG_WIDTH'({bank_mem_req_tag, mem_req_bank_id});
|
||||
end else begin : g_mem_req_tag
|
||||
assign mem_req_tag = MEM_TAG_WIDTH'(bank_mem_req_tag);
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_req_buf
|
||||
wire mem_req_rw;
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_data;
|
||||
wire [LINE_SIZE-1:0] mem_req_byteen;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags;
|
||||
wire [BANK_MEM_TAG_WIDTH-1:0] mem_req_tag;
|
||||
|
||||
assign {
|
||||
mem_req_rw,
|
||||
mem_req_addr,
|
||||
mem_req_data,
|
||||
mem_req_byteen,
|
||||
mem_req_flags,
|
||||
mem_req_tag
|
||||
} = mem_req_pdata[i];
|
||||
|
||||
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_w;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_w;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags_w;
|
||||
|
||||
if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks
|
||||
if (MEM_ARB_SEL_BITS != 0) begin : g_arb_sel
|
||||
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id;
|
||||
VX_bits_concat #(
|
||||
.L (MEM_ARB_SEL_BITS),
|
||||
.R (MEM_PORTS_SEL_BITS)
|
||||
) bank_id_concat (
|
||||
.left_in (mem_req_sel_out[i]),
|
||||
.right_in (MEM_PORTS_SEL_BITS'(i)),
|
||||
.data_out (mem_req_bank_id)
|
||||
);
|
||||
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, mem_req_bank_id});
|
||||
assign mem_req_tag_w = {mem_req_tag, mem_req_sel_out[i]};
|
||||
end else begin : g_no_arb_sel
|
||||
`UNUSED_VAR (mem_req_sel_out)
|
||||
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, MEM_PORTS_SEL_BITS'(i)});
|
||||
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
|
||||
end
|
||||
end else begin : g_mem_req_tag
|
||||
`UNUSED_VAR (mem_req_sel_out)
|
||||
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'(mem_req_addr);
|
||||
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
|
||||
.SIZE (MEM_REQ_REG_DISABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_req_valid[i]),
|
||||
.ready_in (mem_req_ready[i]),
|
||||
.data_in ({mem_req_rw, mem_req_byteen, mem_req_addr_w, mem_req_data, mem_req_tag_w, mem_req_flags}),
|
||||
.data_out ({mem_bus_tmp_if[i].req_data.rw, mem_bus_tmp_if[i].req_data.byteen, mem_bus_tmp_if[i].req_data.addr, mem_bus_tmp_if[i].req_data.data, mem_bus_tmp_if[i].req_data.tag, mem_req_flags_w}),
|
||||
.valid_out (mem_bus_tmp_if[i].req_valid),
|
||||
.ready_out (mem_bus_tmp_if[i].req_ready)
|
||||
);
|
||||
|
||||
if (FLAGS_WIDTH != 0) begin : g_mem_req_flags
|
||||
assign mem_bus_tmp_if[i].req_data.flags = mem_req_flags_w;
|
||||
end else begin : g_no_mem_req_flags
|
||||
assign mem_bus_tmp_if[i].req_data.flags = '0;
|
||||
`UNUSED_VAR (mem_req_flags_w)
|
||||
end
|
||||
|
||||
if (WRITE_ENABLE) begin : g_mem_bus_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
|
||||
end else begin : g_mem_bus_if_ro
|
||||
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
// per cycle: core_reads, core_writes
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_req;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_req;
|
||||
|
||||
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_req;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_req;
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
|
||||
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
|
||||
|
||||
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
|
||||
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
|
||||
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
|
||||
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
|
||||
end
|
||||
|
||||
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
||||
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
|
||||
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
|
||||
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
|
||||
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
|
||||
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
|
||||
assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||
|
||||
wire perf_mem_stall_per_cycle = mem_bus_if.req_valid && ~mem_bus_if.req_ready;
|
||||
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
|
|
12
hw/rtl/cache/VX_cache_bank.sv
vendored
12
hw/rtl/cache/VX_cache_bank.sv
vendored
|
@ -74,9 +74,9 @@ module VX_cache_bank #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output wire perf_read_misses,
|
||||
output wire perf_write_misses,
|
||||
output wire perf_mshr_stalls,
|
||||
output wire perf_read_miss,
|
||||
output wire perf_write_miss,
|
||||
output wire perf_mshr_stall,
|
||||
`endif
|
||||
|
||||
// Core Request
|
||||
|
@ -682,9 +682,9 @@ module VX_cache_bank #(
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_read_misses = do_read_st1 && ~is_hit_st1;
|
||||
assign perf_write_misses = do_write_st1 && ~is_hit_st1;
|
||||
assign perf_mshr_stalls = mshr_alm_full;
|
||||
assign perf_read_miss = do_read_st1 && ~is_hit_st1;
|
||||
assign perf_write_miss = do_write_st1 && ~is_hit_st1;
|
||||
assign perf_mshr_stall = mshr_alm_full;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CACHE
|
||||
|
|
453
hw/rtl/cache/VX_cache_bypass.sv
vendored
453
hw/rtl/cache/VX_cache_bypass.sv
vendored
|
@ -15,6 +15,7 @@
|
|||
|
||||
module VX_cache_bypass #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter MEM_PORTS = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
|
||||
parameter PASSTHRU = 0,
|
||||
|
@ -29,14 +30,11 @@ module VX_cache_bypass #(
|
|||
|
||||
parameter MEM_ADDR_WIDTH = 1,
|
||||
parameter MEM_TAG_IN_WIDTH = 1,
|
||||
parameter MEM_TAG_OUT_WIDTH = 1,
|
||||
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
parameter CORE_OUT_BUF = 0,
|
||||
parameter MEM_OUT_BUF = 0,
|
||||
|
||||
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
|
||||
parameter MEM_OUT_BUF = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -48,296 +46,223 @@ module VX_cache_bypass #(
|
|||
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
|
||||
|
||||
// Memory request in
|
||||
VX_mem_bus_if.slave mem_bus_in_if,
|
||||
VX_mem_bus_if.slave mem_bus_in_if [MEM_PORTS],
|
||||
|
||||
// Memory request out
|
||||
VX_mem_bus_if.master mem_bus_out_if
|
||||
VX_mem_bus_if.master mem_bus_out_if [MEM_PORTS]
|
||||
);
|
||||
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
|
||||
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
|
||||
localparam CORE_DATA_WIDTH = WORD_SIZE * 8;
|
||||
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
|
||||
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
|
||||
|
||||
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
|
||||
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
|
||||
localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
|
||||
|
||||
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
|
||||
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
|
||||
|
||||
localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH;
|
||||
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
|
||||
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
|
||||
localparam CORE_TAG_ID_WIDTH = CORE_TAG_WIDTH - UUID_WIDTH;
|
||||
localparam MEM_TAG_ID_WIDTH = `CLOG2(NUM_REQS / MEM_PORTS) + CORE_TAG_ID_WIDTH;
|
||||
localparam MEM_TAG_NC1_WIDTH = UUID_WIDTH + MEM_TAG_ID_WIDTH;
|
||||
localparam MEM_TAG_NC2_WIDTH = WSEL_BITS + MEM_TAG_NC1_WIDTH;
|
||||
localparam MEM_TAG_OUT_WIDTH = `MAX(MEM_TAG_IN_WIDTH, MEM_TAG_NC2_WIDTH);
|
||||
|
||||
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
|
||||
|
||||
// handle core requests ///////////////////////////////////////////////////
|
||||
// handle non-cacheable core request switch ///////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (CORE_TAG_WIDTH)
|
||||
) core_bus_nc_switch_if[2 * NUM_REQS]();
|
||||
|
||||
wire core_req_nc_valid;
|
||||
wire [NUM_REQS-1:0] core_req_nc_valids;
|
||||
wire [NUM_REQS-1:0] core_req_nc_idxs;
|
||||
wire [REQ_SEL_WIDTH-1:0] core_req_nc_idx;
|
||||
wire [NUM_REQS-1:0] core_req_nc_sel;
|
||||
wire core_req_nc_ready;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_nc
|
||||
if (PASSTHRU != 0) begin : g_passthru
|
||||
assign core_req_nc_idxs[i] = 1'b1;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_is_nc
|
||||
if (PASSTHRU) begin : g_passthru
|
||||
assign core_req_nc_sel[i] = 1'b1;
|
||||
end else if (NC_ENABLE) begin : g_nc
|
||||
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO];
|
||||
assign core_req_nc_sel[i] = core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO];
|
||||
end else begin : g_no_nc
|
||||
assign core_req_nc_idxs[i] = 1'b0;
|
||||
assign core_req_nc_sel[i] = 1'b0;
|
||||
end
|
||||
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
|
||||
end
|
||||
|
||||
VX_generic_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.TYPE (PASSTHRU ? "R" : "P")
|
||||
) core_req_nc_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.requests (core_req_nc_valids),
|
||||
.grant_index (core_req_nc_idx),
|
||||
.grant_onehot (core_req_nc_sel),
|
||||
.grant_valid (core_req_nc_valid),
|
||||
.grant_ready (core_req_nc_ready)
|
||||
VX_mem_switch #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
.NUM_OUTPUTS (2 * NUM_REQS),
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (CORE_TAG_WIDTH),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF (0),
|
||||
.RSP_OUT_BUF (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(CORE_OUT_BUF))
|
||||
) core_bus_nc_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_sel (core_req_nc_sel),
|
||||
.bus_in_if (core_bus_in_if),
|
||||
.bus_out_if(core_bus_nc_switch_if)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_if
|
||||
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
|
||||
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
|
||||
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
|
||||
: core_bus_out_if[i].req_ready;
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (CORE_TAG_WIDTH)
|
||||
) core_bus_in_nc_if[NUM_REQS]();
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_in_cs
|
||||
assign core_bus_out_if[i].req_valid = core_bus_nc_switch_if[0 * NUM_REQS + i].req_valid;
|
||||
assign core_bus_out_if[i].req_data = core_bus_nc_switch_if[0 * NUM_REQS + i].req_data;
|
||||
assign core_bus_nc_switch_if[0 * NUM_REQS + i].req_ready = core_bus_out_if[i].req_ready;
|
||||
|
||||
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_valid = core_bus_out_if[i].rsp_valid;
|
||||
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_data = core_bus_out_if[i].rsp_data;
|
||||
assign core_bus_out_if[i].rsp_ready = core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_ready;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_in_nc
|
||||
assign core_bus_in_nc_if[i].req_valid = core_bus_nc_switch_if[1 * NUM_REQS + i].req_valid;
|
||||
assign core_bus_in_nc_if[i].req_data = core_bus_nc_switch_if[1 * NUM_REQS + i].req_data;
|
||||
assign core_bus_nc_switch_if[1 * NUM_REQS + i].req_ready = core_bus_in_nc_if[i].req_ready;
|
||||
|
||||
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_valid = core_bus_in_nc_if[i].rsp_valid;
|
||||
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_data = core_bus_in_nc_if[i].rsp_data;
|
||||
assign core_bus_in_nc_if[i].rsp_ready = core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_ready;
|
||||
end
|
||||
|
||||
// handle memory requests /////////////////////////////////////////////////
|
||||
|
||||
wire mem_req_out_valid;
|
||||
wire mem_req_out_rw;
|
||||
wire [LINE_SIZE-1:0] mem_req_out_byteen;
|
||||
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
|
||||
wire [`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_out_flags;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
|
||||
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
|
||||
wire mem_req_out_ready;
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_NC1_WIDTH)
|
||||
) core_bus_nc_arb_if[MEM_PORTS]();
|
||||
|
||||
wire core_req_nc_sel_rw;
|
||||
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
|
||||
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
|
||||
wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_sel_flags;
|
||||
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
|
||||
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
.NUM_OUTPUTS(MEM_PORTS),
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (CORE_TAG_WIDTH),
|
||||
.TAG_SEL_IDX(TAG_SEL_IDX),
|
||||
.ARBITER (PASSTHRU ? "R" : "P"),
|
||||
.REQ_OUT_BUF(0),
|
||||
.RSP_OUT_BUF(0)
|
||||
) core_bus_nc_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (core_bus_in_nc_if),
|
||||
.bus_out_if (core_bus_nc_arb_if)
|
||||
);
|
||||
|
||||
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_nc_mux_in
|
||||
assign core_req_nc_mux_in[i] = {
|
||||
core_bus_in_if[i].req_data.rw,
|
||||
core_bus_in_if[i].req_data.addr,
|
||||
core_bus_in_if[i].req_data.data,
|
||||
core_bus_in_if[i].req_data.byteen,
|
||||
core_bus_in_if[i].req_data.flags,
|
||||
core_bus_in_if[i].req_data.tag
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_NC2_WIDTH)
|
||||
) mem_bus_out_nc_if[MEM_PORTS]();
|
||||
|
||||
// Per-memory-port adapter between the arbitrated word-sized bus
// core_bus_nc_arb_if[i] and the line-sized bus mem_bus_out_nc_if[i].
// Request path: unpack the request, widen byteen/data to a full line,
// derive the line address and the outgoing tag, then repack.
// Response path: pick the response word out of the line and rebuild the
// narrower tag before handing the response back to the arbiter.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_nc
|
||||
// Unpacked fields of the arbitrated (word-sized) request.
wire core_req_nc_arb_rw;
|
||||
wire [WORD_SIZE-1:0] core_req_nc_arb_byteen;
|
||||
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_arb_addr;
|
||||
wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_arb_flags;
|
||||
wire [CORE_DATA_WIDTH-1:0] core_req_nc_arb_data;
|
||||
wire [MEM_TAG_NC1_WIDTH-1:0] core_req_nc_arb_tag;
|
||||
|
||||
// The field order here must match the packing order of req_data.
assign {
|
||||
core_req_nc_arb_rw,
|
||||
core_req_nc_arb_addr,
|
||||
core_req_nc_arb_data,
|
||||
core_req_nc_arb_byteen,
|
||||
core_req_nc_arb_flags,
|
||||
core_req_nc_arb_tag
|
||||
} = core_bus_nc_arb_if[i].req_data;
|
||||
|
||||
// Line-sized request fields (*_req_*_w) and word-sized response fields
// (*_rsp_*_w) produced by the generate branches below.
logic [MEM_ADDR_WIDTH-1:0] core_req_nc_arb_addr_w;
|
||||
logic [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] core_req_nc_arb_byteen_w;
|
||||
logic [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] core_req_nc_arb_data_w;
|
||||
logic [CORE_DATA_WIDTH-1:0] core_rsp_nc_arb_data_w;
|
||||
wire [MEM_TAG_NC2_WIDTH-1:0] core_req_nc_arb_tag_w;
|
||||
wire [MEM_TAG_NC1_WIDTH-1:0] core_rsp_nc_arb_tag_w;
|
||||
|
||||
if (PASSTHRU || NC_ENABLE) begin : g_mem_req_out_tag_nc
|
||||
if (WORDS_PER_LINE > 1) begin : g_multi_word_line
|
||||
wire [WSEL_BITS-1:0] rsp_wsel;
|
||||
// The low WSEL_BITS of the word address select the word within the line.
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_arb_addr[WSEL_BITS-1:0];
|
||||
// Place the word into its slot: byte enables are zero everywhere else,
// data is left as don't-care ('x) everywhere else.
always @(*) begin
|
||||
core_req_nc_arb_byteen_w = '0;
|
||||
core_req_nc_arb_byteen_w[req_wsel] = core_req_nc_arb_byteen;
|
||||
core_req_nc_arb_data_w = 'x;
|
||||
core_req_nc_arb_data_w[req_wsel] = core_req_nc_arb_data;
|
||||
end
|
||||
// Carry the word select in the outgoing tag (NC1-width tag in, NC2-width
// tag out). NOTE(review): presumably VX_bits_insert splices ins_in into
// data_in at bit POS - confirm against the helper's definition.
VX_bits_insert #(
|
||||
.N (MEM_TAG_NC1_WIDTH),
|
||||
.S (WSEL_BITS),
|
||||
.POS (MEM_TAG_ID_WIDTH)
|
||||
) wsel_insert (
|
||||
.data_in (core_req_nc_arb_tag),
|
||||
.ins_in (req_wsel),
|
||||
.data_out (core_req_nc_arb_tag_w)
|
||||
);
|
||||
// Mirror of wsel_insert on the response tag: yields rsp_wsel and the
// NC1-width tag returned to the arbiter.
VX_bits_remove #(
|
||||
.N (MEM_TAG_NC2_WIDTH),
|
||||
.S (WSEL_BITS),
|
||||
.POS (MEM_TAG_ID_WIDTH)
|
||||
) wsel_remove (
|
||||
.data_in (mem_bus_out_nc_if[i].rsp_data.tag),
|
||||
.sel_out (rsp_wsel),
|
||||
.data_out (core_rsp_nc_arb_tag_w)
|
||||
);
|
||||
// Line address = word address with the word-select bits dropped.
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
|
||||
// Extract the requested word from the returned line.
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
|
||||
end else begin : g_single_word_line
|
||||
// One word per line: address, byteen and data pass straight through;
// tags are only cast between the NC1 and NC2 widths.
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr;
|
||||
assign core_req_nc_arb_byteen_w = core_req_nc_arb_byteen;
|
||||
assign core_req_nc_arb_data_w = core_req_nc_arb_data;
|
||||
assign core_req_nc_arb_tag_w = MEM_TAG_NC2_WIDTH'(core_req_nc_arb_tag);
|
||||
|
||||
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data;
|
||||
assign core_rsp_nc_arb_tag_w = MEM_TAG_NC1_WIDTH'(mem_bus_out_nc_if[i].rsp_data.tag);
|
||||
end
|
||||
end else begin : g_mem_req_out_tag
|
||||
// NOTE(review): this branch drives only core_req_nc_arb_tag_w; the
// addr/byteen/data request signals and both response signals are not
// assigned here. Confirm this configuration is never elaborated.
assign core_req_nc_arb_tag_w = core_req_nc_arb_tag;
|
||||
end
|
||||
|
||||
// Repacked line-sized request towards mem_bus_out_nc_if[i].
assign mem_bus_out_nc_if[i].req_valid = core_bus_nc_arb_if[i].req_valid;
|
||||
assign mem_bus_out_nc_if[i].req_data = {
|
||||
core_req_nc_arb_rw,
|
||||
core_req_nc_arb_addr_w,
|
||||
core_req_nc_arb_data_w,
|
||||
core_req_nc_arb_byteen_w,
|
||||
core_req_nc_arb_flags,
|
||||
core_req_nc_arb_tag_w
|
||||
};
|
||||
assign core_bus_nc_arb_if[i].req_ready = mem_bus_out_nc_if[i].req_ready;
|
||||
|
||||
// Word-sized response back to the arbiter.
assign core_bus_nc_arb_if[i].rsp_valid = mem_bus_out_nc_if[i].rsp_valid;
|
||||
assign core_bus_nc_arb_if[i].rsp_data = {
|
||||
core_rsp_nc_arb_data_w,
|
||||
core_rsp_nc_arb_tag_w
|
||||
};
|
||||
assign mem_bus_out_nc_if[i].rsp_ready = core_bus_nc_arb_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
assign {
|
||||
core_req_nc_sel_rw,
|
||||
core_req_nc_sel_addr,
|
||||
core_req_nc_sel_data,
|
||||
core_req_nc_sel_byteen,
|
||||
core_req_nc_sel_flags,
|
||||
core_req_nc_sel_tag
|
||||
} = core_req_nc_mux_in[core_req_nc_idx];
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_OUT_WIDTH)
|
||||
) mem_bus_out_src_if[(PASSTHRU ? 1 : 2) * MEM_PORTS]();
|
||||
|
||||
assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
|
||||
|
||||
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
|
||||
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
|
||||
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
|
||||
assign mem_req_out_flags = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.flags : core_req_nc_sel_flags;
|
||||
|
||||
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
|
||||
|
||||
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
|
||||
|
||||
if (WORDS_PER_LINE > 1) begin : g_mem_req_multi_word_line
|
||||
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_w;
|
||||
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_w;
|
||||
|
||||
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
|
||||
|
||||
always @(*) begin
|
||||
mem_req_byteen_in_w = '0;
|
||||
mem_req_byteen_in_w[req_wsel] = core_req_nc_sel_byteen;
|
||||
|
||||
mem_req_data_in_w = 'x;
|
||||
mem_req_data_in_w[req_wsel] = core_req_nc_sel_data;
|
||||
end
|
||||
|
||||
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_w;
|
||||
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_w;
|
||||
if (NUM_REQS > 1) begin : g_multiple_requests
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
|
||||
end else begin : g_single_request
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
|
||||
end
|
||||
end else begin : g_mem_req_single_word_line
|
||||
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
|
||||
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
|
||||
if (NUM_REQS > 1) begin : g_multiple_requests
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
|
||||
end else begin : g_single_request
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_in_id});
|
||||
// Fills the source array of the output arbiter, one entry group per source:
//   indices 0 * MEM_PORTS + i : the bypass bus mem_bus_out_nc_if[i]
//   indices 1 * MEM_PORTS + i : the cache memory bus mem_bus_in_if[i],
//                               connected only when PASSTHRU is not set
// NOTE(review): the _EX macro receives the destination and source tag widths
// plus UUID_WIDTH, presumably to resize the tag while keeping the UUID field
// in place - confirm against the macro definition.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_src
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[0 * MEM_PORTS + i], mem_bus_out_nc_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_NC2_WIDTH, UUID_WIDTH);
|
||||
if (!PASSTHRU) begin : g_not_passthru
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[1 * MEM_PORTS + i], mem_bus_in_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_IN_WIDTH, UUID_WIDTH);
|
||||
end
|
||||
end
|
||||
|
||||
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
|
||||
|
||||
if (UUID_WIDTH != 0) begin : g_mem_req_tag_bypass_with_uuid
|
||||
assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
|
||||
end else begin : g_mem_req_tag_bypass
|
||||
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
|
||||
end
|
||||
|
||||
if (PASSTHRU != 0) begin : g_mem_req_out_tag_passthru
|
||||
assign mem_req_out_tag = mem_req_tag_bypass;
|
||||
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
|
||||
end else if (NC_ENABLE) begin : g_mem_req_out_tag_nc
|
||||
VX_bits_insert #(
|
||||
.N (MEM_TAG_OUT_WIDTH-1),
|
||||
.S (1),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) mem_req_tag_in_nc_insert (
|
||||
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
|
||||
.ins_in (~mem_bus_in_if.req_valid),
|
||||
.data_out (mem_req_out_tag)
|
||||
);
|
||||
end else begin : g_mem_req_out_tag
|
||||
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
|
||||
end
|
||||
|
||||
assign mem_bus_in_if.req_ready = mem_req_out_ready;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
|
||||
.SIZE (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_req_out_valid),
|
||||
.ready_in (mem_req_out_ready),
|
||||
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_flags, mem_req_out_data, mem_req_out_tag}),
|
||||
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.flags, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
|
||||
.valid_out (mem_bus_out_if.req_valid),
|
||||
.ready_out (mem_bus_out_if.req_ready)
|
||||
// Output arbiter: merges the (PASSTHRU ? 1 : 2) * MEM_PORTS source buses in
// mem_bus_out_src_if onto the MEM_PORTS ports of mem_bus_out_if.
// Request output buffering is disabled when DIRECT_PASSTHRU is set; responses
// are never buffered here (RSP_OUT_BUF = 0).
// NOTE(review): "R" presumably selects a round-robin policy - confirm against
// VX_mem_arb.
VX_mem_arb #(
|
||||
.NUM_INPUTS ((PASSTHRU ? 1 : 2) * MEM_PORTS),
|
||||
.NUM_OUTPUTS(MEM_PORTS),
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_OUT_WIDTH),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF(DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
|
||||
.RSP_OUT_BUF(0)
|
||||
) mem_bus_out_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (mem_bus_out_src_if),
|
||||
.bus_out_if (mem_bus_out_if)
|
||||
);
|
||||
|
||||
// handle core responses //////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0] core_rsp_in_valid;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data;
|
||||
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag;
|
||||
wire [NUM_REQS-1:0] core_rsp_in_ready;
|
||||
|
||||
wire is_mem_rsp_nc;
|
||||
if (PASSTHRU != 0) begin : g_is_mem_rsp_nc_passthru
|
||||
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
|
||||
end else if (NC_ENABLE) begin : g_is_mem_rsp_nc
|
||||
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
|
||||
end else begin : g_is_no_mem_rsp_nc
|
||||
assign is_mem_rsp_nc = 1'b0;
|
||||
end
|
||||
|
||||
wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (MEM_TAG_OUT_WIDTH),
|
||||
.S (NC_ENABLE),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) mem_rsp_tag_in_nc_remove (
|
||||
.data_in (mem_bus_out_if.rsp_data.tag),
|
||||
.data_out (mem_rsp_tag_id_nc)
|
||||
);
|
||||
|
||||
wire [REQ_SEL_WIDTH-1:0] rsp_idx;
|
||||
if (NUM_REQS > 1) begin : g_rsp_idx
|
||||
assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
|
||||
end else begin : g_rsp_idx_0
|
||||
assign rsp_idx = 1'b0;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_valid
|
||||
assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || (is_mem_rsp_nc && rsp_idx == REQ_SEL_WIDTH'(i));
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_ready
|
||||
assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_data
|
||||
if (WORDS_PER_LINE > 1) begin : g_wsel
|
||||
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
|
||||
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
|
||||
core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
|
||||
end else begin : g_no_wsel
|
||||
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
|
||||
end
|
||||
end
|
||||
|
||||
wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
|
||||
if (UUID_WIDTH != 0) begin : g_mem_rsp_tag_in_nc2_uuid
|
||||
assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
|
||||
end else begin : g_mem_rsp_tag_in_nc2
|
||||
assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_tag
|
||||
if (PASSTHRU) begin : g_passthru
|
||||
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
|
||||
end else if (NC_ENABLE) begin : g_nc
|
||||
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
|
||||
end else begin : g_no_nc
|
||||
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
|
||||
.SIZE (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(CORE_OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (core_rsp_in_valid[i]),
|
||||
.ready_in (core_rsp_in_ready[i]),
|
||||
.data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}),
|
||||
.data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}),
|
||||
.valid_out (core_bus_in_if[i].rsp_valid),
|
||||
.ready_out (core_bus_in_if[i].rsp_ready)
|
||||
);
|
||||
end
|
||||
|
||||
// handle memory responses ////////////////////////////////////////////////
|
||||
|
||||
if (PASSTHRU != 0) begin : g_mem_bus_in_if_passthru
|
||||
assign mem_bus_in_if.rsp_valid = 1'b0;
|
||||
assign mem_bus_in_if.rsp_data.data = '0;
|
||||
assign mem_bus_in_if.rsp_data.tag = '0;
|
||||
end else if (NC_ENABLE) begin : g_mem_bus_in_if_nc
|
||||
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
|
||||
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
|
||||
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
|
||||
end else begin : g_mem_bus_in_if
|
||||
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
|
||||
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
|
||||
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
|
||||
end
|
||||
|
||||
wire [NUM_REQS-1:0] core_rsp_out_valid;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_out_valid
|
||||
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
|
||||
end
|
||||
|
||||
assign mem_bus_out_if.rsp_ready = is_mem_rsp_nc ? (~core_rsp_out_valid[rsp_idx] && core_rsp_in_ready[rsp_idx]) : mem_bus_in_if.rsp_ready;
|
||||
|
||||
endmodule
|
||||
|
|
74
hw/rtl/cache/VX_cache_cluster.sv
vendored
74
hw/rtl/cache/VX_cache_cluster.sv
vendored
|
@ -23,6 +23,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
// Number of requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Number of memory ports
|
||||
parameter MEM_PORTS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 32768,
|
||||
// Size of line inside a bank in bytes
|
||||
|
@ -82,14 +85,16 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS],
|
||||
VX_mem_bus_if.master mem_bus_if
|
||||
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
|
||||
);
|
||||
localparam NUM_CACHES = `UP(NUM_UNITS);
|
||||
localparam PASSTHRU = (NUM_UNITS == 0);
|
||||
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH, UUID_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, UUID_WIDTH));
|
||||
|
||||
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
|
||||
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH);
|
||||
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
|
||||
|
||||
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
||||
|
||||
|
@ -101,7 +106,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) cache_mem_bus_if[NUM_CACHES]();
|
||||
) cache_mem_bus_if[NUM_CACHES * MEM_PORTS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
|
@ -153,6 +158,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.MEM_PORTS (MEM_PORTS),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
|
@ -176,34 +182,46 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.core_bus_if (arb_core_bus_if[i * NUM_REQS +: NUM_REQS]),
|
||||
.mem_bus_if (cache_mem_bus_if[i])
|
||||
.mem_bus_if (cache_mem_bus_if[i * MEM_PORTS +: MEM_PORTS])
|
||||
);
|
||||
end
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
|
||||
) mem_bus_tmp_if[1]();
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) arb_core_bus_tmp_if[NUM_CACHES]();
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (NUM_CACHES),
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF ((NUM_CACHES > 1) ? MEM_OUT_BUF : 0),
|
||||
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (cache_mem_bus_if),
|
||||
.bus_out_if (mem_bus_tmp_if)
|
||||
);
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
|
||||
) mem_bus_tmp_if[1]();
|
||||
|
||||
if (WRITE_ENABLE) begin : g_mem_bus_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
|
||||
end else begin : g_mem_bus_if_ro
|
||||
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if, mem_bus_tmp_if[0]);
|
||||
for (genvar j = 0; j < NUM_CACHES; ++j) begin : g_arb_core_bus_tmp_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_tmp_if[j], cache_mem_bus_if[j * MEM_PORTS + i]);
|
||||
end
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (NUM_CACHES),
|
||||
.NUM_OUTPUTS (1),
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF ((NUM_CACHES > 1) ? MEM_OUT_BUF : 0),
|
||||
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (arb_core_bus_tmp_if),
|
||||
.bus_out_if (mem_bus_tmp_if)
|
||||
);
|
||||
|
||||
if (WRITE_ENABLE) begin : g_we
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
|
||||
end else begin : g_ro
|
||||
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
|
||||
end
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
|
4
hw/rtl/cache/VX_cache_define.vh
vendored
4
hw/rtl/cache/VX_cache_define.vh
vendored
|
@ -55,10 +55,6 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CS_LINE_TO_MEM_ADDR(x, i) {x, `CS_BANK_SEL_BITS'(i)}
|
||||
`define CS_MEM_ADDR_TO_BANK_ID(x) x[0 +: `CS_BANK_SEL_BITS]
|
||||
`define CS_MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0]
|
||||
|
||||
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
|
||||
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
|
||||
|
||||
|
|
4
hw/rtl/cache/VX_cache_flush.sv
vendored
4
hw/rtl/cache/VX_cache_flush.sv
vendored
|
@ -34,6 +34,8 @@ module VX_cache_flush #(
|
|||
output wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
|
||||
input wire [NUM_BANKS-1:0] flush_end
|
||||
);
|
||||
`UNUSED_PARAM (TAG_WIDTH)
|
||||
|
||||
localparam STATE_IDLE = 0;
|
||||
localparam STATE_WAIT1 = 1;
|
||||
localparam STATE_FLUSH = 2;
|
||||
|
@ -112,7 +114,7 @@ module VX_cache_flush #(
|
|||
wire [NUM_REQS-1:0] core_bus_out_ready;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_uuid
|
||||
if (UUID_WIDTH != 0) begin : g_uuid
|
||||
assign core_bus_out_uuid[i] = core_bus_in_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign core_bus_out_uuid[i] = core_bus_in_if[i].req_data.tag.uuid;
|
||||
end else begin : g_no_uuid
|
||||
assign core_bus_out_uuid[i] = 0;
|
||||
end
|
||||
|
|
4
hw/rtl/cache/VX_cache_tags.sv
vendored
4
hw/rtl/cache/VX_cache_tags.sv
vendored
|
@ -45,8 +45,8 @@ module VX_cache_tags #(
|
|||
output wire evict_dirty,
|
||||
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag
|
||||
);
|
||||
// valid, dirty, tag
|
||||
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
|
||||
// valid, dirty, tag
|
||||
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
|
||||
|
||||
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
|
||||
wire [NUM_WAYS-1:0] read_valid;
|
||||
|
|
6
hw/rtl/cache/VX_cache_top.sv
vendored
6
hw/rtl/cache/VX_cache_top.sv
vendored
|
@ -19,6 +19,9 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Number of memory ports
|
||||
parameter MEM_PORTS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 65536,
|
||||
// Size of line inside a bank in bytes
|
||||
|
@ -60,7 +63,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
// Memory request output buffer
|
||||
parameter MEM_OUT_BUF = 3,
|
||||
|
||||
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS)
|
||||
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS / MEM_PORTS)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -155,6 +158,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.MEM_PORTS (MEM_PORTS),
|
||||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
|
|
114
hw/rtl/cache/VX_cache_wrap.sv
vendored
114
hw/rtl/cache/VX_cache_wrap.sv
vendored
|
@ -21,6 +21,8 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Number of memory ports
|
||||
parameter MEM_PORTS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 4096,
|
||||
|
@ -85,16 +87,15 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
|
||||
VX_mem_bus_if.master mem_bus_if
|
||||
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
|
||||
);
|
||||
|
||||
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
|
||||
|
||||
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, UUID_WIDTH);
|
||||
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH, UUID_WIDTH) :
|
||||
CACHE_MEM_TAG_WIDTH);
|
||||
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
|
||||
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, TAG_WIDTH);
|
||||
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
|
||||
|
||||
localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
|
||||
|
||||
|
@ -106,17 +107,18 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
|
||||
) mem_bus_cache_if();
|
||||
) mem_bus_cache_if[MEM_PORTS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) mem_bus_tmp_if();
|
||||
) mem_bus_tmp_if[MEM_PORTS]();
|
||||
|
||||
if (NC_OR_BYPASS) begin : g_bypass
|
||||
|
||||
VX_cache_bypass #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.MEM_PORTS (MEM_PORTS),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
|
||||
.PASSTHRU (PASSTHRU),
|
||||
|
@ -130,7 +132,6 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
|
||||
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
|
||||
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
|
||||
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
|
||||
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
|
||||
|
@ -153,13 +154,17 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
|
||||
end
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_tmp_if, mem_bus_cache_if);
|
||||
// Connects each per-port cache memory bus directly to the matching entry of
// mem_bus_tmp_if, one assignment per memory port.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_tmp_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_tmp_if[i], mem_bus_cache_if[i]);
|
||||
end
|
||||
end
|
||||
|
||||
if (WRITE_ENABLE) begin : g_mem_bus_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if);
|
||||
end else begin : g_mem_bus_if_ro
|
||||
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if, mem_bus_tmp_if);
|
||||
// Drives every external memory port from its mem_bus_tmp_if entry.
// WRITE_ENABLE selects the full assignment macro; otherwise the read-only
// (_RO_) variant is used.
// NOTE(review): presumably the _RO_ macro ties off the write-related request
// fields - confirm against its definition.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
if (WRITE_ENABLE) begin : g_we
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
|
||||
end else begin : g_ro
|
||||
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
|
||||
end
|
||||
end
|
||||
|
||||
if (PASSTHRU == 0) begin : g_cache
|
||||
|
@ -172,6 +177,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.MEM_PORTS (MEM_PORTS),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
|
@ -207,13 +213,15 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
|
||||
end
|
||||
|
||||
assign mem_bus_cache_if.req_valid = 0;
|
||||
assign mem_bus_cache_if.req_data = '0;
|
||||
`UNUSED_VAR (mem_bus_cache_if.req_ready)
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_cache_if
|
||||
assign mem_bus_cache_if[i].req_valid = 0;
|
||||
assign mem_bus_cache_if[i].req_data = '0;
|
||||
`UNUSED_VAR (mem_bus_cache_if[i].req_ready)
|
||||
|
||||
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
|
||||
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
|
||||
assign mem_bus_cache_if.rsp_ready = 0;
|
||||
`UNUSED_VAR (mem_bus_cache_if[i].rsp_valid)
|
||||
`UNUSED_VAR (mem_bus_cache_if[i].rsp_data)
|
||||
assign mem_bus_cache_if[i].rsp_ready = 0;
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign cache_perf = '0;
|
||||
|
@ -222,62 +230,36 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
`ifdef DBG_TRACE_CACHE
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_trace
|
||||
wire [`UP(UUID_WIDTH)-1:0] core_req_uuid;
|
||||
wire [`UP(UUID_WIDTH)-1:0] core_rsp_uuid;
|
||||
|
||||
if (UUID_WIDTH != 0) begin : g_core_rsp_uuid
|
||||
assign core_req_uuid = core_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign core_rsp_uuid = core_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin : g_no_core_rsp_uuid
|
||||
assign core_req_uuid = 0;
|
||||
assign core_rsp_uuid = 0;
|
||||
end
|
||||
|
||||
wire core_req_fire = core_bus_if[i].req_valid && core_bus_if[i].req_ready;
|
||||
wire core_rsp_fire = core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_trace_core
|
||||
always @(posedge clk) begin
|
||||
if (core_req_fire) begin
|
||||
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
|
||||
if (core_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid))
|
||||
`TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid))
|
||||
`TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (core_rsp_fire) begin
|
||||
`TRACE(2, ("%t: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid))
|
||||
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag.value, i, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
|
||||
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
|
||||
|
||||
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin : g_mem_req_uuid
|
||||
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin : g_no_mem_req_uuid
|
||||
assign mem_req_uuid = 0;
|
||||
assign mem_rsp_uuid = 0;
|
||||
end
|
||||
|
||||
wire mem_req_fire = mem_bus_if.req_valid && mem_bus_if.req_ready;
|
||||
wire mem_rsp_fire = mem_bus_if.rsp_valid && mem_bus_if.rsp_ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (mem_req_fire) begin
|
||||
if (mem_bus_if.req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid))
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_trace_mem
|
||||
always @(posedge clk) begin
|
||||
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
|
||||
if (mem_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
// Trace every accepted memory response on port i (valid && ready).
// The whole response line is printed: rsp_data.data is a packed vector, so
// the previous `rsp_data.data[i]` selected only bit i (the port index) and
// the "data=0x%h" field showed a single bit instead of the returned data.
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: %s mem-rd-rsp: data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (mem_rsp_fire) begin
|
||||
`TRACE(2, ("%t: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid))
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -137,8 +137,6 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
wire schedule_fire = schedule_if.valid && schedule_if.ready;
|
||||
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
wire [`UUID_WIDTH-1:0] icache_bus_req_uuid = icache_bus_if.req_data.tag[ICACHE_TAG_WIDTH-1 -: `UUID_WIDTH];
|
||||
wire [`UUID_WIDTH-1:0] icache_bus_rsp_uuid = icache_bus_if.rsp_data.tag[ICACHE_TAG_WIDTH-1 -: `UUID_WIDTH];
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 1, 6, 3, (
|
||||
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
|
||||
|
@ -157,8 +155,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
icache_bus_rsp_fire
|
||||
},{
|
||||
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
|
||||
icache_bus_req_uuid, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
|
||||
icache_bus_rsp_uuid, icache_bus_if.rsp_data.data
|
||||
icache_bus_if.req_data.tag.uuid, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
|
||||
icache_bus_if.rsp_data.tag.uuid, icache_bus_if.rsp_data.data
|
||||
},
|
||||
reset_negedge, 1'b0, 4096
|
||||
);
|
||||
|
|
|
@ -40,6 +40,7 @@ module VX_pe_switch import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_stream_switch #(
|
||||
.DATAW (REQ_DATAW),
|
||||
.NUM_INPUTS (1),
|
||||
.NUM_OUTPUTS (PE_COUNT),
|
||||
.OUT_BUF (REQ_OUT_BUF)
|
||||
) req_switch (
|
||||
|
|
|
@ -171,9 +171,9 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
`ifdef GBAR_ENABLE
|
||||
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
|
||||
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_data.id)) begin
|
||||
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
|
||||
barrier_masks_n[gbar_bus_if.rsp_id] = '0; // reset barrier mask
|
||||
barrier_masks_n[gbar_bus_if.rsp_data.id] = '0; // reset barrier mask
|
||||
stalled_warps_n = '0; // unlock all warps
|
||||
end
|
||||
`endif
|
||||
|
@ -281,10 +281,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
// barrier handling
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
assign gbar_bus_if.req_valid = gbar_req_valid;
|
||||
assign gbar_bus_if.req_id = gbar_req_id;
|
||||
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
|
||||
assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
|
||||
assign gbar_bus_if.req_valid = gbar_req_valid;
|
||||
assign gbar_bus_if.req_data.id = gbar_req_id;
|
||||
assign gbar_bus_if.req_data.size_m1 = gbar_req_size_m1;
|
||||
assign gbar_bus_if.req_data.core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
|
||||
`endif
|
||||
|
||||
// split/join handling
|
||||
|
|
36
hw/rtl/libs/VX_bits_concat.sv
Normal file
36
hw/rtl/libs/VX_bits_concat.sv
Normal file
|
@ -0,0 +1,36 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
`TRACING_OFF
|
||||
// Concatenates two bit vectors whose widths may individually be zero.
//
// Parameters:
//   L - width of the left (most-significant) operand; 0 means "absent".
//   R - width of the right (least-significant) operand; 0 means "absent".
//
// Ports:
//   left_in / right_in - operands; each port is declared `UP(width) bits wide
//                        so that it remains a legal port when the width is 0.
//   data_out           - {left_in, right_in} when both operands are present,
//                        otherwise the single operand that is present.
module VX_bits_concat #(
    parameter L = 1,
    parameter R = 1
) (
    input wire [`UP(L)-1:0]  left_in,
    input wire [`UP(R)-1:0]  right_in,
    output wire [(L+R)-1:0]  data_out
);
    if (L != 0 && R != 0) begin : g_concat
        // both operands present: plain concatenation
        assign data_out = {left_in, right_in};
    end else if (L == 0) begin : g_right_only
        // no left operand: forward the right one, ignore the placeholder port
        `UNUSED_VAR (left_in)
        assign data_out = right_in;
    end else begin : g_left_only
        // no right operand (R == 0): forward the left one
        `UNUSED_VAR (right_in)
        assign data_out = left_in;
    end

endmodule
|
||||
`TRACING_ON
|
|
@ -20,17 +20,22 @@ module VX_bits_remove #(
|
|||
parameter POS = 0
|
||||
) (
|
||||
input wire [N-1:0] data_in,
|
||||
output wire [`UP(S)-1:0] sel_out,
|
||||
output wire [N-S-1:0] data_out
|
||||
);
|
||||
`STATIC_ASSERT (((0 == S) || ((POS + S) <= N)), ("invalid parameter"))
|
||||
|
||||
if (S == 0) begin : g_passthru
|
||||
assign sel_out = 0;
|
||||
assign data_out = data_in;
|
||||
end else if (POS == 0) begin : g_pos_0
|
||||
assign sel_out = data_in[0 +: S];
|
||||
assign data_out = data_in[N-1:S];
|
||||
end else if ((POS + S) == N) begin : g_pos_N
|
||||
assign sel_out = data_in[POS +: S];
|
||||
assign data_out = data_in[POS-1:0];
|
||||
end else begin : g_pos
|
||||
assign sel_out = data_in[POS +: S];
|
||||
assign data_out = {data_in[N-1:(POS+S)], data_in[POS-1:0]};
|
||||
end
|
||||
|
||||
|
|
|
@ -21,7 +21,8 @@ module VX_stream_arb #(
|
|||
parameter `STRING ARBITER = "R",
|
||||
parameter MAX_FANOUT = `MAX_FANOUT,
|
||||
parameter OUT_BUF = 0,
|
||||
parameter NUM_REQS = `CDIV(NUM_INPUTS, NUM_OUTPUTS),
|
||||
parameter NUM_REQS = (NUM_INPUTS > NUM_OUTPUTS) ? `CDIV(NUM_INPUTS, NUM_OUTPUTS) : `CDIV(NUM_OUTPUTS, NUM_INPUTS),
|
||||
parameter SEL_COUNT = `MIN(NUM_INPUTS, NUM_OUTPUTS),
|
||||
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
|
||||
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)
|
||||
) (
|
||||
|
@ -34,65 +35,38 @@ module VX_stream_arb #(
|
|||
|
||||
output wire [NUM_OUTPUTS-1:0] valid_out,
|
||||
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
|
||||
output wire [NUM_OUTPUTS-1:0][NUM_REQS_W-1:0] sel_out,
|
||||
input wire [NUM_OUTPUTS-1:0] ready_out
|
||||
input wire [NUM_OUTPUTS-1:0] ready_out,
|
||||
|
||||
output wire [SEL_COUNT-1:0][NUM_REQS_W-1:0] sel_out
|
||||
);
|
||||
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_more_inputs
|
||||
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_input_select
|
||||
|
||||
if (NUM_OUTPUTS > 1) begin : g_multiple_outputs
|
||||
// #Inputs > #Outputs
|
||||
|
||||
// (#inputs > #outputs) and (#outputs > 1)
|
||||
if (MAX_FANOUT != 0 && (NUM_REQS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_arb_slices
|
||||
|
||||
localparam SLICE_BEGIN = i * NUM_REQS;
|
||||
localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_INPUTS);
|
||||
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (SLICE_SIZE),
|
||||
.NUM_OUTPUTS (1),
|
||||
.DATAW (DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.MAX_FANOUT (MAX_FANOUT),
|
||||
.OUT_BUF (OUT_BUF)
|
||||
) arb_slice (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]),
|
||||
.ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]),
|
||||
.data_in (data_in[SLICE_END-1: SLICE_BEGIN]),
|
||||
.data_out (data_out[i]),
|
||||
.sel_out (sel_out[i]),
|
||||
.valid_out (valid_out[i]),
|
||||
.ready_out (ready_out[i])
|
||||
);
|
||||
end
|
||||
|
||||
end else if (MAX_FANOUT != 0 && (NUM_INPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout
|
||||
|
||||
// (#inputs > max_fanout) and (#outputs == 1)
|
||||
|
||||
localparam NUM_SLICES = `CDIV(NUM_INPUTS, MAX_FANOUT);
|
||||
localparam NUM_SLICES = `CDIV(NUM_REQS, MAX_FANOUT);
|
||||
localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT);
|
||||
localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES);
|
||||
localparam DATAW2 = DATAW + LOG_NUM_REQS2;
|
||||
|
||||
wire [NUM_SLICES-1:0] valid_tmp;
|
||||
wire [NUM_SLICES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp;
|
||||
wire [NUM_SLICES-1:0] ready_tmp;
|
||||
wire [NUM_SLICES-1:0][NUM_OUTPUTS-1:0] valid_tmp;
|
||||
wire [NUM_SLICES-1:0][NUM_OUTPUTS-1:0][DATAW2-1:0] data_tmp;
|
||||
wire [NUM_SLICES-1:0][NUM_OUTPUTS-1:0] ready_tmp;
|
||||
|
||||
for (genvar i = 0; i < NUM_SLICES; ++i) begin : g_fanout_slice_arbs
|
||||
for (genvar s = 0; s < NUM_SLICES; ++s) begin : g_slice_arbs
|
||||
|
||||
localparam SLICE_BEGIN = i * MAX_FANOUT;
|
||||
localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_INPUTS);
|
||||
localparam SLICE_STRIDE= MAX_FANOUT * NUM_OUTPUTS;
|
||||
localparam SLICE_BEGIN = s * SLICE_STRIDE;
|
||||
localparam SLICE_END = `MIN(SLICE_BEGIN + SLICE_STRIDE, NUM_INPUTS);
|
||||
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
|
||||
|
||||
wire [DATAW-1:0] data_tmp_u;
|
||||
wire [`LOG2UP(SLICE_SIZE)-1:0] sel_tmp_u;
|
||||
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_tmp_u;
|
||||
wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS2-1:0] sel_tmp_u;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (SLICE_SIZE),
|
||||
.NUM_OUTPUTS (1),
|
||||
.NUM_OUTPUTS (NUM_OUTPUTS),
|
||||
.DATAW (DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.MAX_FANOUT (MAX_FANOUT),
|
||||
|
@ -103,22 +77,24 @@ module VX_stream_arb #(
|
|||
.valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]),
|
||||
.data_in (data_in[SLICE_END-1: SLICE_BEGIN]),
|
||||
.ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]),
|
||||
.valid_out (valid_tmp[i]),
|
||||
.valid_out (valid_tmp[s]),
|
||||
.data_out (data_tmp_u),
|
||||
.sel_out (sel_tmp_u),
|
||||
.ready_out (ready_tmp[i])
|
||||
.ready_out (ready_tmp[s]),
|
||||
.sel_out (sel_tmp_u)
|
||||
);
|
||||
|
||||
assign data_tmp[i] = {data_tmp_u, LOG_NUM_REQS2'(sel_tmp_u)};
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_tmp
|
||||
assign data_tmp[s][o] = {data_tmp_u[o], sel_tmp_u[o]};
|
||||
end
|
||||
end
|
||||
|
||||
wire [DATAW+LOG_NUM_REQS2-1:0] data_out_u;
|
||||
wire [LOG_NUM_REQS3-1:0] sel_out_u;
|
||||
wire [NUM_OUTPUTS-1:0][DATAW2-1:0] data_out_u;
|
||||
wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS3-1:0] sel_out_u;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_SLICES),
|
||||
.NUM_OUTPUTS (1),
|
||||
.DATAW (DATAW + LOG_NUM_REQS2),
|
||||
.NUM_INPUTS (NUM_SLICES * NUM_OUTPUTS),
|
||||
.NUM_OUTPUTS (NUM_OUTPUTS),
|
||||
.DATAW (DATAW2),
|
||||
.ARBITER (ARBITER),
|
||||
.MAX_FANOUT (MAX_FANOUT),
|
||||
.OUT_BUF (OUT_BUF)
|
||||
|
@ -134,109 +110,107 @@ module VX_stream_arb #(
|
|||
.ready_out (ready_out)
|
||||
);
|
||||
|
||||
assign data_out = data_out_u[LOG_NUM_REQS2 +: DATAW];
|
||||
assign sel_out = {sel_out_u, data_out_u[0 +: LOG_NUM_REQS2]};
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_out
|
||||
assign sel_out[o] = {sel_out_u[o], data_out_u[o][LOG_NUM_REQS2-1:0]};
|
||||
assign data_out[o] = data_out_u[o][DATAW2-1:LOG_NUM_REQS2];
|
||||
end
|
||||
|
||||
end else begin : g_one_output
|
||||
|
||||
// (#inputs <= max_fanout) and (#outputs == 1)
|
||||
|
||||
wire valid_in_w;
|
||||
wire [DATAW-1:0] data_in_w;
|
||||
wire ready_in_w;
|
||||
end else begin : g_arbiter
|
||||
|
||||
wire [NUM_REQS-1:0] arb_requests;
|
||||
wire arb_valid;
|
||||
wire [NUM_REQS_W-1:0] arb_index;
|
||||
wire [NUM_REQS-1:0] arb_onehot;
|
||||
wire arb_ready;
|
||||
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_requests
|
||||
wire [NUM_OUTPUTS-1:0] requests;
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_o
|
||||
localparam i = r * NUM_OUTPUTS + o;
|
||||
assign requests[o] = valid_in[i];
|
||||
end
|
||||
assign arb_requests[r] = (| requests);
|
||||
end
|
||||
|
||||
VX_generic_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.TYPE (ARBITER)
|
||||
) arbiter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.requests (valid_in),
|
||||
.requests (arb_requests),
|
||||
.grant_valid (arb_valid),
|
||||
.grant_index (arb_index),
|
||||
.grant_onehot (arb_onehot),
|
||||
.grant_ready (arb_ready)
|
||||
);
|
||||
|
||||
assign valid_in_w = arb_valid;
|
||||
assign data_in_w = data_in[arb_index];
|
||||
assign arb_ready = ready_in_w;
|
||||
wire [NUM_OUTPUTS-1:0] valid_out_w;
|
||||
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w;
|
||||
wire [NUM_OUTPUTS-1:0] ready_out_w;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_ready_in
|
||||
assign ready_in[i] = ready_in_w && arb_onehot[i];
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_out_w
|
||||
wire [NUM_REQS-1:0] valid_in_w;
|
||||
wire [NUM_REQS-1:0][DATAW-1:0] data_in_w;
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
|
||||
localparam i = r * NUM_OUTPUTS + o;
|
||||
if (r < NUM_INPUTS) begin : g_valid
|
||||
assign valid_in_w[r] = valid_in[i];
|
||||
assign data_in_w[r] = data_in[i];
|
||||
end else begin : g_padding
|
||||
assign valid_in_w[r] = 0;
|
||||
assign data_in_w[r] = '0;
|
||||
end
|
||||
end
|
||||
assign valid_out_w[o] = ((NUM_OUTPUTS == 1) || (| valid_in_w)) && arb_valid;
|
||||
assign data_out_w[o] = data_in_w[arb_index];
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (LOG_NUM_REQS + DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
|
||||
.LUTRAM (`TO_OUT_BUF_LUTRAM(OUT_BUF))
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in_w),
|
||||
.ready_in (ready_in_w),
|
||||
.data_in ({arb_index, data_in_w}),
|
||||
.data_out ({sel_out, data_out}),
|
||||
.valid_out (valid_out),
|
||||
.ready_out (ready_out)
|
||||
);
|
||||
end
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in
|
||||
localparam o = i % NUM_OUTPUTS;
|
||||
localparam r = i / NUM_OUTPUTS;
|
||||
assign ready_in[i] = ready_out_w[o] && arb_onehot[r];
|
||||
end
|
||||
|
||||
end else if (NUM_OUTPUTS > NUM_INPUTS) begin : g_more_outputs
|
||||
assign arb_ready = (| ready_out_w);
|
||||
|
||||
if (NUM_INPUTS > 1) begin : g_multiple_inputs
|
||||
|
||||
// (#inputs > 1) and (#outputs > #inputs)
|
||||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_arb_slices
|
||||
|
||||
localparam SLICE_BEGIN = i * NUM_REQS;
|
||||
localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_OUTPUTS);
|
||||
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (1),
|
||||
.NUM_OUTPUTS (SLICE_SIZE),
|
||||
.DATAW (DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.MAX_FANOUT (MAX_FANOUT),
|
||||
.OUT_BUF (OUT_BUF)
|
||||
) arb_slice (
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (LOG_NUM_REQS + DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
|
||||
.LUTRAM (`TO_OUT_BUF_LUTRAM(OUT_BUF))
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in[i]),
|
||||
.ready_in (ready_in[i]),
|
||||
.data_in (data_in[i]),
|
||||
.data_out (data_out[SLICE_END-1: SLICE_BEGIN]),
|
||||
.valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]),
|
||||
.ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]),
|
||||
`UNUSED_PIN (sel_out)
|
||||
.valid_in (valid_out_w[o]),
|
||||
.ready_in (ready_out_w[o]),
|
||||
.data_in ({arb_index, data_out_w[o]}),
|
||||
.data_out ({sel_out[o], data_out[o]}),
|
||||
.valid_out (valid_out[o]),
|
||||
.ready_out (ready_out[o])
|
||||
);
|
||||
|
||||
for (genvar j = SLICE_BEGIN; j < SLICE_END; ++j) begin : g_sel_out
|
||||
assign sel_out[j] = i;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout
|
||||
end else if (NUM_INPUTS < NUM_OUTPUTS) begin : g_output_select
|
||||
|
||||
// (#inputs == 1) and (#outputs > max_fanout)
|
||||
// #Inputs < #Outputs
|
||||
|
||||
localparam NUM_SLICES = `CDIV(NUM_OUTPUTS, MAX_FANOUT);
|
||||
if (MAX_FANOUT != 0 && (NUM_REQS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout
|
||||
|
||||
wire [NUM_SLICES-1:0] valid_tmp;
|
||||
wire [NUM_SLICES-1:0][DATAW-1:0] data_tmp;
|
||||
wire [NUM_SLICES-1:0] ready_tmp;
|
||||
localparam NUM_SLICES = `CDIV(NUM_REQS, MAX_FANOUT);
|
||||
localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT);
|
||||
localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES);
|
||||
|
||||
wire [NUM_SLICES-1:0][NUM_INPUTS-1:0] valid_tmp;
|
||||
wire [NUM_SLICES-1:0][NUM_INPUTS-1:0][DATAW-1:0] data_tmp;
|
||||
wire [NUM_SLICES-1:0][NUM_INPUTS-1:0] ready_tmp;
|
||||
wire [NUM_INPUTS-1:0][LOG_NUM_REQS3-1:0] sel_tmp;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (1),
|
||||
.NUM_OUTPUTS (NUM_SLICES),
|
||||
.NUM_INPUTS (NUM_INPUTS),
|
||||
.NUM_OUTPUTS (NUM_SLICES * NUM_INPUTS),
|
||||
.DATAW (DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.MAX_FANOUT (MAX_FANOUT),
|
||||
|
@ -250,17 +224,22 @@ module VX_stream_arb #(
|
|||
.data_out (data_tmp),
|
||||
.valid_out (valid_tmp),
|
||||
.ready_out (ready_tmp),
|
||||
`UNUSED_PIN (sel_out)
|
||||
.sel_out (sel_tmp)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_SLICES; ++i) begin : g_fanout_slice_arbs
|
||||
wire [NUM_SLICES-1:0][NUM_INPUTS-1:0][LOG_NUM_REQS2-1:0] sel_out_w;
|
||||
|
||||
localparam SLICE_BEGIN = i * MAX_FANOUT;
|
||||
localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_OUTPUTS);
|
||||
for (genvar s = 0; s < NUM_SLICES; ++s) begin : g_slice_arbs
|
||||
|
||||
localparam SLICE_STRIDE= MAX_FANOUT * NUM_INPUTS;
|
||||
localparam SLICE_BEGIN = s * SLICE_STRIDE;
|
||||
localparam SLICE_END = `MIN(SLICE_BEGIN + SLICE_STRIDE, NUM_OUTPUTS);
|
||||
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
|
||||
|
||||
wire [NUM_INPUTS-1:0][LOG_NUM_REQS2-1:0] sel_out_u;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (1),
|
||||
.NUM_INPUTS (NUM_INPUTS),
|
||||
.NUM_OUTPUTS (SLICE_SIZE),
|
||||
.DATAW (DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
|
@ -269,45 +248,73 @@ module VX_stream_arb #(
|
|||
) fanout_slice_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_tmp[i]),
|
||||
.ready_in (ready_tmp[i]),
|
||||
.data_in (data_tmp[i]),
|
||||
.valid_in (valid_tmp[s]),
|
||||
.ready_in (ready_tmp[s]),
|
||||
.data_in (data_tmp[s]),
|
||||
.data_out (data_out[SLICE_END-1: SLICE_BEGIN]),
|
||||
.valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]),
|
||||
.ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]),
|
||||
`UNUSED_PIN (sel_out)
|
||||
.sel_out (sel_out_w[s])
|
||||
);
|
||||
end
|
||||
|
||||
end else begin : g_one_input
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_sel_out
|
||||
assign sel_out[i] = {sel_tmp[i], sel_out_w[sel_tmp[i]][i]};
|
||||
end
|
||||
|
||||
// (#inputs == 1) and (#outputs <= max_fanout)
|
||||
end else begin : g_arbiter
|
||||
|
||||
wire [NUM_OUTPUTS-1:0] ready_in_w;
|
||||
|
||||
wire [NUM_OUTPUTS-1:0] arb_requests;
|
||||
wire [NUM_REQS-1:0] arb_requests;
|
||||
wire arb_valid;
|
||||
wire [NUM_OUTPUTS-1:0] arb_onehot;
|
||||
wire [NUM_REQS_W-1:0] arb_index;
|
||||
wire [NUM_REQS-1:0] arb_onehot;
|
||||
wire arb_ready;
|
||||
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_requests
|
||||
wire [NUM_INPUTS-1:0] requests;
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_i
|
||||
localparam o = r * NUM_INPUTS + i;
|
||||
assign requests[i] = ready_out[o];
|
||||
end
|
||||
assign arb_requests[r] = (| requests);
|
||||
end
|
||||
|
||||
VX_generic_arbiter #(
|
||||
.NUM_REQS (NUM_OUTPUTS),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.TYPE (ARBITER)
|
||||
) arbiter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.requests (arb_requests),
|
||||
.grant_valid (arb_valid),
|
||||
`UNUSED_PIN (grant_index),
|
||||
.grant_index (arb_index),
|
||||
.grant_onehot (arb_onehot),
|
||||
.grant_ready (arb_ready)
|
||||
);
|
||||
|
||||
assign arb_requests = ready_in_w;
|
||||
assign arb_ready = valid_in[0];
|
||||
assign ready_in = arb_valid;
|
||||
wire [NUM_OUTPUTS-1:0] valid_out_w;
|
||||
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w;
|
||||
wire [NUM_OUTPUTS-1:0] ready_out_w;
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_out_buf
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_out_w
|
||||
localparam i = o % NUM_INPUTS;
|
||||
localparam r = o / NUM_INPUTS;
|
||||
assign valid_out_w[o] = valid_in[i] && arb_onehot[r];
|
||||
assign data_out_w[o] = data_in[i];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in
|
||||
wire [NUM_REQS-1:0] ready_out_s;
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
|
||||
localparam o = r * NUM_INPUTS + i;
|
||||
assign ready_out_s[r] = ready_out_w[o];
|
||||
end
|
||||
assign ready_in[i] = ((NUM_INPUTS == 1) || (| ready_out_s)) && arb_valid;
|
||||
end
|
||||
|
||||
assign arb_ready = (| valid_in);
|
||||
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
|
@ -316,23 +323,25 @@ module VX_stream_arb #(
|
|||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && arb_onehot[i]),
|
||||
.ready_in (ready_in_w[i]),
|
||||
.data_in (data_in),
|
||||
.data_out (data_out[i]),
|
||||
.valid_out (valid_out[i]),
|
||||
.ready_out (ready_out[i])
|
||||
.valid_in (valid_out_w[o]),
|
||||
.ready_in (ready_out_w[o]),
|
||||
.data_in (data_out_w[o]),
|
||||
.data_out (data_out[o]),
|
||||
.valid_out (valid_out[o]),
|
||||
.ready_out (ready_out[o])
|
||||
);
|
||||
end
|
||||
end
|
||||
|
||||
assign sel_out = 0;
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_sel_out
|
||||
assign sel_out[i] = arb_index;
|
||||
end
|
||||
end
|
||||
|
||||
end else begin : g_passthru
|
||||
|
||||
// #Inputs == #Outputs
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_out_buf
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
|
@ -341,14 +350,14 @@ module VX_stream_arb #(
|
|||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in[i]),
|
||||
.ready_in (ready_in[i]),
|
||||
.data_in (data_in[i]),
|
||||
.data_out (data_out[i]),
|
||||
.valid_out (valid_out[i]),
|
||||
.ready_out (ready_out[i])
|
||||
.valid_in (valid_in[o]),
|
||||
.ready_in (ready_in[o]),
|
||||
.data_in (data_in[o]),
|
||||
.data_out (data_out[o]),
|
||||
.valid_out (valid_out[o]),
|
||||
.ready_out (ready_out[o])
|
||||
);
|
||||
assign sel_out[i] = NUM_REQS_W'(i);
|
||||
assign sel_out[o] = NUM_REQS_W'(0);
|
||||
end
|
||||
end
|
||||
|
||||
|
|
215
hw/rtl/libs/VX_stream_omega.sv
Normal file
215
hw/rtl/libs/VX_stream_omega.sv
Normal file
|
@ -0,0 +1,215 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`TRACING_OFF
|
||||
// Multistage (omega) interconnect for valid/ready streams.
//
// Every input presents a payload (data_in) together with the index of the
// output it wants to reach (sel_in). Every output delivers the payload along
// with the index of the input it came from (sel_out).
//
// Structure:
//   * When both NUM_INPUTS and NUM_OUTPUTS fit in a single RADIX x RADIX
//     switch, the module is just a VX_stream_xbar.
//   * Otherwise the port count is padded up to N_INPUTS = RADIX**NUM_STAGES
//     lanes and NUM_STAGES columns of RADIX x RADIX VX_stream_xbar switches
//     are instantiated. A perfect-shuffle permutation is applied in front of
//     every column, and column s steers each packet using radix digit
//     (NUM_STAGES-1-s) of its destination index (most-significant first).
//
// Parameters:
//   NUM_INPUTS / NUM_OUTPUTS - number of stream ports on each side.
//   RADIX                    - switch size; must be a power of two.
//   DATAW                    - payload width.
//   ARBITER, OUT_BUF,
//   MAX_FANOUT, PERF_CTR_BITS- forwarded unchanged to each VX_stream_xbar.
//   IN_WIDTH / OUT_WIDTH     - derived widths of input/output indices.
//
// Ports:
//   valid_in/data_in/sel_in/ready_in     - input streams and their targets.
//   valid_out/data_out/sel_out/ready_out - output streams and their sources.
//   collisions                           - running count of detected routing
//                                          conflicts (see the end of g_omega).
module VX_stream_omega #(
    parameter NUM_INPUTS    = 4,
    parameter NUM_OUTPUTS   = 4,
    parameter RADIX         = 2,
    parameter DATAW         = 4,
    parameter ARBITER       = "R",
    parameter OUT_BUF       = 0,
    parameter MAX_FANOUT    = `MAX_FANOUT,
    parameter PERF_CTR_BITS = 32,
    parameter IN_WIDTH      = `LOG2UP(NUM_INPUTS),
    parameter OUT_WIDTH     = `LOG2UP(NUM_OUTPUTS)
) (
    input wire clk,
    input wire reset,

    input wire [NUM_INPUTS-1:0]                 valid_in,
    input wire [NUM_INPUTS-1:0][DATAW-1:0]      data_in,
    input wire [NUM_INPUTS-1:0][OUT_WIDTH-1:0]  sel_in,
    output wire [NUM_INPUTS-1:0]                ready_in,

    output wire [NUM_OUTPUTS-1:0]               valid_out,
    output wire [NUM_OUTPUTS-1:0][DATAW-1:0]    data_out,
    output wire [NUM_OUTPUTS-1:0][IN_WIDTH-1:0] sel_out,
    input wire [NUM_OUTPUTS-1:0]                ready_out,

    output wire [PERF_CTR_BITS-1:0]             collisions
);
    `STATIC_ASSERT (`IS_POW2(RADIX), ("inavlid parameters"))

    // If network size smaller than radix, simply use a crossbar.
    if (NUM_INPUTS <= RADIX && NUM_OUTPUTS <= RADIX) begin : g_fallback
        VX_stream_xbar #(
            .NUM_INPUTS    (NUM_INPUTS),
            .NUM_OUTPUTS   (NUM_OUTPUTS),
            .DATAW         (DATAW),
            .ARBITER       (ARBITER),
            .OUT_BUF       (OUT_BUF),
            .MAX_FANOUT    (MAX_FANOUT),
            .PERF_CTR_BITS (PERF_CTR_BITS)
        ) xbar_switch (
            .clk,
            .reset,
            .valid_in,
            .data_in,
            .sel_in,
            .ready_in,
            .valid_out,
            .data_out,
            .sel_out,
            .ready_out,
            .collisions
        );
    end else begin : g_omega
        localparam RADIX_LG      = `LOG2UP(RADIX);                        // bits per radix digit
        localparam N_INPUTS_M    = `MAX(NUM_INPUTS, NUM_OUTPUTS);
        localparam N_INPUTS_LG   = `CDIV(`CLOG2(N_INPUTS_M), RADIX_LG);   // number of radix digits
        localparam N_INPUTS      = RADIX ** N_INPUTS_LG;                  // padded lane count
        // Width in BITS of a lane index. N_INPUTS_LG counts radix digits, so
        // it equals the bit width only when RADIX == 2; using it as a bit
        // width truncates the destination field for larger radices.
        localparam N_INPUTS_BITS = N_INPUTS_LG * RADIX_LG;
        localparam NUM_STAGES    = `LOG2UP(N_INPUTS) / RADIX_LG;
        localparam NUM_SWITCHES  = N_INPUTS / RADIX;

        // Packet carried through the network.
        typedef struct packed {
            logic [N_INPUTS_BITS-1:0] sel_in;  // destination lane, one radix digit per stage
            logic [DATAW-1:0]         data;    // payload
            logic [IN_WIDTH-1:0]      sel_out; // originating input index
        } omega_t;

        // Wires for internal connections between stages
        wire [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] switch_valid_in, switch_valid_out;
        omega_t [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] switch_data_in, switch_data_out;
        wire [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0][RADIX_LG-1:0] switch_sel_in;
        wire [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] switch_ready_in, switch_ready_out;

        // Connect inputs to first stage
        for (genvar i = 0; i < N_INPUTS; ++i) begin : g_tie_inputs
            // Perfect shuffle: rotate the lane index left by one radix digit
            // (RADIX_LG bits) within N_INPUTS_BITS bits.
            localparam DST_IDX = ((i << RADIX_LG) | (i >> (N_INPUTS_BITS - RADIX_LG))) & (N_INPUTS-1);
            localparam switch = DST_IDX / RADIX;
            localparam port = DST_IDX % RADIX;
            if (i < NUM_INPUTS) begin : g_valid
                assign switch_valid_in[0][switch][port] = valid_in[i];
                assign switch_data_in[0][switch][port] = '{
                    sel_in:  N_INPUTS_BITS'(sel_in[i]),
                    data:    data_in[i],
                    sel_out: IN_WIDTH'(i)
                };
                assign ready_in[i] = switch_ready_in[0][switch][port];
            end else begin : g_padding
                // lane beyond NUM_INPUTS: never valid
                assign switch_valid_in[0][switch][port] = 0;
                assign switch_data_in[0][switch][port] = 'x;
                `UNUSED_VAR (switch_ready_in[0][switch][port])
            end
        end

        // Connect switch sel_in: stage s consumes destination digit (NUM_STAGES-1-s)
        for (genvar stage = 0; stage < NUM_STAGES; ++stage) begin : g_sel_in
            for (genvar switch = 0; switch < NUM_SWITCHES; ++switch) begin : g_switches
                for (genvar port = 0; port < RADIX; ++port) begin : g_ports
                    assign switch_sel_in[stage][switch][port] = switch_data_in[stage][switch][port].sel_in[(NUM_STAGES-1-stage) * RADIX_LG +: RADIX_LG];
                end
            end
        end

        // Connect internal stages
        for (genvar stage = 0; stage < NUM_STAGES-1; ++stage) begin : g_stages
            for (genvar switch = 0; switch < NUM_SWITCHES; ++switch) begin : g_switches
                for (genvar port = 0; port < RADIX; port++) begin : g_ports
                    localparam lane = switch * RADIX + port;
                    // same perfect shuffle as in front of the first stage
                    localparam dst_lane = ((lane << RADIX_LG) | (lane >> (N_INPUTS_BITS - RADIX_LG))) & (N_INPUTS-1);
                    localparam dst_switch = dst_lane / RADIX;
                    localparam dst_port = dst_lane % RADIX;
                    assign switch_valid_in[stage+1][dst_switch][dst_port] = switch_valid_out[stage][switch][port];
                    assign switch_data_in[stage+1][dst_switch][dst_port] = switch_data_out[stage][switch][port];
                    assign switch_ready_out[stage][switch][port] = switch_ready_in[stage+1][dst_switch][dst_port];
                end
            end
        end

        // Connect network switches
        for (genvar switch = 0; switch < NUM_SWITCHES; ++switch) begin : g_switches
            for (genvar stage = 0; stage < NUM_STAGES; ++stage) begin : g_stages
                VX_stream_xbar #(
                    .NUM_INPUTS   (RADIX),
                    .NUM_OUTPUTS  (RADIX),
                    .DATAW        ($bits(omega_t)),
                    .ARBITER      (ARBITER),
                    .OUT_BUF      (OUT_BUF),
                    .MAX_FANOUT   (MAX_FANOUT),
                    .PERF_CTR_BITS(PERF_CTR_BITS)
                ) xbar_switch (
                    .clk       (clk),
                    .reset     (reset),
                    .valid_in  (switch_valid_in[stage][switch]),
                    .data_in   (switch_data_in[stage][switch]),
                    .sel_in    (switch_sel_in[stage][switch]),
                    .ready_in  (switch_ready_in[stage][switch]),
                    .valid_out (switch_valid_out[stage][switch]),
                    .data_out  (switch_data_out[stage][switch]),
                    `UNUSED_PIN (sel_out),
                    .ready_out (switch_ready_out[stage][switch]),
                    `UNUSED_PIN (collisions)
                );
            end
        end

        // Connect outputs to last stage
        for (genvar i = 0; i < N_INPUTS; ++i) begin : g_tie_outputs
            localparam switch = i / RADIX;
            localparam port = i % RADIX;
            if (i < NUM_OUTPUTS) begin : g_valid
                assign valid_out[i] = switch_valid_out[NUM_STAGES-1][switch][port];
                assign data_out[i]  = switch_data_out[NUM_STAGES-1][switch][port].data;
                assign sel_out[i]   = switch_data_out[NUM_STAGES-1][switch][port].sel_out;
                assign switch_ready_out[NUM_STAGES-1][switch][port] = ready_out[i];
            end else begin : g_padding
                // lane beyond NUM_OUTPUTS: no consumer, never ready
                `UNUSED_VAR (switch_valid_out[NUM_STAGES-1][switch][port])
                `UNUSED_VAR (switch_data_out[NUM_STAGES-1][switch][port])
                assign switch_ready_out[NUM_STAGES-1][switch][port] = 0;
            end
        end

        // compute inputs collision
        // we have a collision when there exists a valid transfer with multiple input candidates
        // we count the unique duplicates each cycle.

        reg [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] per_cycle_collision, per_cycle_collision_r;
        wire [`CLOG2(NUM_STAGES*NUM_SWITCHES*RADIX+1)-1:0] collision_count;
        reg [PERF_CTR_BITS-1:0] collisions_r;

        // A switch port is flagged when it and a higher-numbered port of the
        // same switch are both valid, request the same switch output, and at
        // least one of the two is being accepted this cycle.
        always @(*) begin
            per_cycle_collision = 0;
            for (integer stage = 0; stage < NUM_STAGES; ++stage) begin
                for (integer switch = 0; switch < NUM_SWITCHES; ++switch) begin
                    for (integer port_a = 0; port_a < RADIX; ++port_a) begin
                        for (integer port_b = port_a + 1; port_b < RADIX; ++port_b) begin
                            per_cycle_collision[stage][switch][port_a] |= switch_valid_in[stage][switch][port_a]
                                                                       && switch_valid_in[stage][switch][port_b]
                                                                       && (switch_sel_in[stage][switch][port_a] == switch_sel_in[stage][switch][port_b])
                                                                       && (switch_ready_in[stage][switch][port_a] | switch_ready_in[stage][switch][port_b]);
                        end
                    end
                end
            end
        end

        // NOTE(review): `BUFFER is a project macro; presumably it registers the
        // flags for one cycle before they are counted - confirm in VX_platform.vh.
        `BUFFER(per_cycle_collision_r, per_cycle_collision);
        `POP_COUNT(collision_count, per_cycle_collision_r);

        // Accumulate the per-cycle count into the exported counter.
        always @(posedge clk) begin
            if (reset) begin
                collisions_r <= '0;
            end else begin
                collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count);
            end
        end

        assign collisions = collisions_r;
    end

endmodule
|
||||
`TRACING_ON
|
|
@ -36,42 +36,27 @@ module VX_stream_switch #(
|
|||
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
|
||||
input wire [NUM_OUTPUTS-1:0] ready_out
|
||||
);
|
||||
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_more_inputs
|
||||
wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0] valid_in_w;
|
||||
wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0][DATAW-1:0] data_in_w;
|
||||
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_input_select
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_data_in
|
||||
for (genvar j = 0; j < NUM_REQS; ++j) begin : g_j
|
||||
localparam ii = i * NUM_REQS + j;
|
||||
if (ii < NUM_INPUTS) begin : g_valid
|
||||
assign valid_in_w[i][j] = valid_in[ii];
|
||||
assign data_in_w[i][j] = data_in[ii];
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
|
||||
|
||||
wire [NUM_REQS-1:0] valid_in_w;
|
||||
wire [NUM_REQS-1:0][DATAW-1:0] data_in_w;
|
||||
wire [NUM_REQS-1:0] ready_in_w;
|
||||
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
|
||||
localparam i = r * NUM_OUTPUTS + o;
|
||||
if (i < NUM_INPUTS) begin : g_valid
|
||||
assign valid_in_w[r] = valid_in[i];
|
||||
assign data_in_w[r] = data_in[i];
|
||||
assign ready_in[i] = ready_in_w[r];
|
||||
end else begin : g_padding
|
||||
assign valid_in_w[i][j] = 0;
|
||||
assign data_in_w[i][j] = '0;
|
||||
assign valid_in_w[r] = 0;
|
||||
assign data_in_w[r] = '0;
|
||||
`UNUSED_VAR (ready_in_w[r])
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire [NUM_OUTPUTS-1:0] valid_out_w;
|
||||
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w;
|
||||
wire [NUM_OUTPUTS-1:0] ready_out_w;
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_data_out_w
|
||||
assign valid_out_w[i] = valid_in_w[i][sel_in[i]];
|
||||
assign data_out_w[i] = data_in_w[i][sel_in[i]];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_ready_out_w
|
||||
for (genvar j = 0; j < NUM_REQS; ++j) begin : g_j
|
||||
localparam ii = i * NUM_REQS + j;
|
||||
if (ii < NUM_INPUTS) begin : g_valid
|
||||
assign ready_in[ii] = ready_out_w[i] && (sel_in[i] == LOG_NUM_REQS'(j));
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_out_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
|
@ -79,34 +64,27 @@ module VX_stream_switch #(
|
|||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_out_w[i]),
|
||||
.ready_in (ready_out_w[i]),
|
||||
.data_in (data_out_w[i]),
|
||||
.data_out (data_out[i]),
|
||||
.valid_out (valid_out[i]),
|
||||
.ready_out (ready_out[i])
|
||||
.valid_in (valid_in_w[sel_in[o]]),
|
||||
.ready_in (ready_in_w[sel_in[o]]),
|
||||
.data_in (data_in_w[sel_in[o]]),
|
||||
.data_out (data_out[o]),
|
||||
.valid_out (valid_out[o]),
|
||||
.ready_out (ready_out[o])
|
||||
);
|
||||
end
|
||||
|
||||
end else if (NUM_OUTPUTS > NUM_INPUTS) begin : g_more_outputs
|
||||
end else if (NUM_OUTPUTS > NUM_INPUTS) begin : g_output_select
|
||||
|
||||
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] valid_out_w;
|
||||
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] ready_out_w;
|
||||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_valid_out_w
|
||||
for (genvar j = 0; j < NUM_REQS; ++j) begin : g_j
|
||||
assign valid_out_w[i][j] = valid_in[i] && (sel_in[i] == LOG_NUM_REQS'(j));
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in
|
||||
assign ready_in[i] = ready_out_w[i][sel_in[i]];
|
||||
end
|
||||
// Inputs < Outputs
|
||||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_out_buf
|
||||
for (genvar j = 0; j < NUM_REQS; ++j) begin : g_j
|
||||
localparam ii = i * NUM_REQS + j;
|
||||
if (ii < NUM_OUTPUTS) begin : g_valid
|
||||
|
||||
wire [NUM_REQS-1:0] ready_out_w;
|
||||
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
|
||||
localparam o = r * NUM_INPUTS + i;
|
||||
if (o < NUM_OUTPUTS) begin : g_valid
|
||||
wire valid_out_w = valid_in[i] && (sel_in[i] == LOG_NUM_REQS'(r));
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
|
@ -114,18 +92,19 @@ module VX_stream_switch #(
|
|||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_out_w[i][j]),
|
||||
.ready_in (ready_out_w[i][j]),
|
||||
.valid_in (valid_out_w),
|
||||
.ready_in (ready_out_w[r]),
|
||||
.data_in (data_in[i]),
|
||||
.data_out (data_out[ii]),
|
||||
.valid_out (valid_out[ii]),
|
||||
.ready_out (ready_out[ii])
|
||||
.data_out (data_out[o]),
|
||||
.valid_out (valid_out[o]),
|
||||
.ready_out (ready_out[o])
|
||||
);
|
||||
end else begin : g_padding
|
||||
`UNUSED_VAR (valid_out_w[i][j])
|
||||
assign ready_out_w[i][j] = '0;
|
||||
assign ready_out_w[r] = '0;
|
||||
end
|
||||
end
|
||||
|
||||
assign ready_in[i] = ready_out_w[sel_in[i]];
|
||||
end
|
||||
|
||||
end else begin : g_passthru
|
||||
|
@ -150,7 +129,6 @@ module VX_stream_switch #(
|
|||
.ready_out (ready_out[i])
|
||||
);
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -18,18 +18,16 @@ module VX_stream_xbar #(
|
|||
parameter NUM_INPUTS = 4,
|
||||
parameter NUM_OUTPUTS = 4,
|
||||
parameter DATAW = 4,
|
||||
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS),
|
||||
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
|
||||
parameter ARBITER = "R",
|
||||
parameter OUT_BUF = 0,
|
||||
parameter MAX_FANOUT = `MAX_FANOUT,
|
||||
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
|
||||
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1),
|
||||
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS),
|
||||
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
output wire [PERF_CTR_BITS-1:0] collisions,
|
||||
|
||||
input wire [NUM_INPUTS-1:0] valid_in,
|
||||
input wire [NUM_INPUTS-1:0][DATAW-1:0] data_in,
|
||||
input wire [NUM_INPUTS-1:0][OUT_WIDTH-1:0] sel_in,
|
||||
|
@ -38,12 +36,14 @@ module VX_stream_xbar #(
|
|||
output wire [NUM_OUTPUTS-1:0] valid_out,
|
||||
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
|
||||
output wire [NUM_OUTPUTS-1:0][IN_WIDTH-1:0] sel_out,
|
||||
input wire [NUM_OUTPUTS-1:0] ready_out
|
||||
input wire [NUM_OUTPUTS-1:0] ready_out,
|
||||
|
||||
output wire [PERF_CTR_BITS-1:0] collisions
|
||||
);
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
if (NUM_INPUTS != 1) begin : g_multiple_inputs
|
||||
if (NUM_INPUTS != 1) begin : g_multi_inputs
|
||||
|
||||
if (NUM_OUTPUTS != 1) begin : g_multiple_outputs
|
||||
|
||||
|
@ -130,7 +130,7 @@ module VX_stream_xbar #(
|
|||
`UNUSED_VAR (sel_in)
|
||||
end
|
||||
|
||||
end else if (NUM_OUTPUTS != 1) begin : g_one_input
|
||||
end else if (NUM_OUTPUTS != 1) begin : g_single_input
|
||||
|
||||
// (#inputs == 1) and (#outputs > 1)
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ module VX_gbar_arb #(
|
|||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_data_in
|
||||
assign req_valid_in[i] = bus_in_if[i].req_valid;
|
||||
assign req_data_in[i] = {bus_in_if[i].req_id, bus_in_if[i].req_size_m1, bus_in_if[i].req_core_id};
|
||||
assign req_data_in[i] = bus_in_if[i].req_data;
|
||||
assign bus_in_if[i].req_ready = req_ready_in[i];
|
||||
end
|
||||
|
||||
|
@ -51,7 +51,7 @@ module VX_gbar_arb #(
|
|||
.valid_in (req_valid_in),
|
||||
.ready_in (req_ready_in),
|
||||
.data_in (req_data_in),
|
||||
.data_out ({bus_out_if.req_id, bus_out_if.req_size_m1, bus_out_if.req_core_id}),
|
||||
.data_out (bus_out_if.req_data),
|
||||
.valid_out (bus_out_if.req_valid),
|
||||
.ready_out (bus_out_if.req_ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
|
@ -60,7 +60,7 @@ module VX_gbar_arb #(
|
|||
// broadcast response
|
||||
|
||||
reg rsp_valid;
|
||||
reg [`NB_WIDTH-1:0] rsp_id;
|
||||
reg [`NB_WIDTH-1:0] rsp_data;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -68,12 +68,12 @@ module VX_gbar_arb #(
|
|||
end else begin
|
||||
rsp_valid <= bus_out_if.rsp_valid;
|
||||
end
|
||||
rsp_id <= bus_out_if.rsp_id;
|
||||
rsp_data <= bus_out_if.rsp_data;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_bus_in_if
|
||||
assign bus_in_if[i].rsp_valid = rsp_valid;
|
||||
assign bus_in_if[i].rsp_id = rsp_id;
|
||||
assign bus_in_if[i].rsp_data = rsp_data;
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -15,35 +15,39 @@
|
|||
|
||||
interface VX_gbar_bus_if ();
|
||||
|
||||
wire req_valid;
|
||||
wire [`NB_WIDTH-1:0] req_id;
|
||||
wire [`NC_WIDTH-1:0] req_size_m1;
|
||||
wire [`NC_WIDTH-1:0] req_core_id;
|
||||
wire req_ready;
|
||||
typedef struct packed {
|
||||
logic [`NB_WIDTH-1:0] id;
|
||||
logic [`NC_WIDTH-1:0] size_m1;
|
||||
logic [`NC_WIDTH-1:0] core_id;
|
||||
} req_data_t;
|
||||
|
||||
wire rsp_valid;
|
||||
wire [`NB_WIDTH-1:0] rsp_id;
|
||||
typedef struct packed {
|
||||
logic [`NB_WIDTH-1:0] id;
|
||||
} rsp_data_t;
|
||||
|
||||
logic req_valid;
|
||||
req_data_t req_data;
|
||||
logic req_ready;
|
||||
|
||||
logic rsp_valid;
|
||||
rsp_data_t rsp_data;
|
||||
|
||||
modport master (
|
||||
output req_valid,
|
||||
output req_id,
|
||||
output req_size_m1,
|
||||
output req_core_id,
|
||||
input req_ready,
|
||||
output req_valid,
|
||||
output req_data,
|
||||
input req_ready,
|
||||
|
||||
input rsp_valid,
|
||||
input rsp_id
|
||||
input rsp_valid,
|
||||
input rsp_data
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input req_valid,
|
||||
input req_id,
|
||||
input req_size_m1,
|
||||
input req_core_id,
|
||||
output req_ready,
|
||||
|
||||
output rsp_valid,
|
||||
output rsp_id
|
||||
input req_valid,
|
||||
input req_data,
|
||||
output req_ready,
|
||||
|
||||
output rsp_valid,
|
||||
output rsp_data
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
|
|
@ -25,7 +25,7 @@ module VX_gbar_unit #(
|
|||
|
||||
reg [`NB_WIDTH-1:0][`NUM_CORES-1:0] barrier_masks;
|
||||
wire [`CLOG2(`NUM_CORES+1)-1:0] active_barrier_count;
|
||||
wire [`NUM_CORES-1:0] curr_barrier_mask = barrier_masks[gbar_bus_if.req_id];
|
||||
wire [`NUM_CORES-1:0] curr_barrier_mask = barrier_masks[gbar_bus_if.req_data.id];
|
||||
|
||||
`POP_COUNT(active_barrier_count, curr_barrier_mask);
|
||||
`UNUSED_VAR (active_barrier_count)
|
||||
|
@ -42,29 +42,29 @@ module VX_gbar_unit #(
|
|||
rsp_valid <= 0;
|
||||
end
|
||||
if (gbar_bus_if.req_valid) begin
|
||||
if (active_barrier_count[`NC_WIDTH-1:0] == gbar_bus_if.req_size_m1) begin
|
||||
barrier_masks[gbar_bus_if.req_id] <= '0;
|
||||
rsp_bar_id <= gbar_bus_if.req_id;
|
||||
if (active_barrier_count[`NC_WIDTH-1:0] == gbar_bus_if.req_data.size_m1) begin
|
||||
barrier_masks[gbar_bus_if.req_data.id] <= '0;
|
||||
rsp_bar_id <= gbar_bus_if.req_data.id;
|
||||
rsp_valid <= 1;
|
||||
end else begin
|
||||
barrier_masks[gbar_bus_if.req_id][gbar_bus_if.req_core_id] <= 1;
|
||||
barrier_masks[gbar_bus_if.req_data.id][gbar_bus_if.req_data.core_id] <= 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign gbar_bus_if.rsp_valid = rsp_valid;
|
||||
assign gbar_bus_if.rsp_id = rsp_bar_id;
|
||||
assign gbar_bus_if.rsp_data.id = rsp_bar_id;
|
||||
assign gbar_bus_if.req_ready = 1; // global barrier unit is always ready (no dependencies)
|
||||
|
||||
`ifdef DBG_TRACE_GBAR
|
||||
always @(posedge clk) begin
|
||||
if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
|
||||
`TRACE(2, ("%t: %s acquire: bar_id=%0d, size=%0d, core_id=%0d\n",
|
||||
$time, INSTANCE_ID, gbar_bus_if.req_id, gbar_bus_if.req_size_m1, gbar_bus_if.req_core_id))
|
||||
$time, INSTANCE_ID, gbar_bus_if.req_data.id, gbar_bus_if.req_data.size_m1, gbar_bus_if.req_data.core_id))
|
||||
end
|
||||
if (gbar_bus_if.rsp_valid) begin
|
||||
`TRACE(2, ("%t: %s release: bar_id=%0d\n", $time, INSTANCE_ID, gbar_bus_if.rsp_id))
|
||||
`TRACE(2, ("%t: %s release: bar_id=%0d\n", $time, INSTANCE_ID, gbar_bus_if.rsp_data.id))
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -61,15 +61,7 @@ module VX_lmem_switch import VX_gpu_pkg::*; #(
|
|||
}),
|
||||
.ready_in (req_global_ready),
|
||||
.valid_out (global_out_if.req_valid),
|
||||
.data_out ({
|
||||
global_out_if.req_data.mask,
|
||||
global_out_if.req_data.rw,
|
||||
global_out_if.req_data.addr,
|
||||
global_out_if.req_data.data,
|
||||
global_out_if.req_data.byteen,
|
||||
global_out_if.req_data.flags,
|
||||
global_out_if.req_data.tag
|
||||
}),
|
||||
.data_out (global_out_if.req_data),
|
||||
.ready_out (global_out_if.req_ready)
|
||||
);
|
||||
|
||||
|
@ -92,15 +84,7 @@ module VX_lmem_switch import VX_gpu_pkg::*; #(
|
|||
}),
|
||||
.ready_in (req_local_ready),
|
||||
.valid_out (local_out_if.req_valid),
|
||||
.data_out ({
|
||||
local_out_if.req_data.mask,
|
||||
local_out_if.req_data.rw,
|
||||
local_out_if.req_data.addr,
|
||||
local_out_if.req_data.data,
|
||||
local_out_if.req_data.byteen,
|
||||
local_out_if.req_data.flags,
|
||||
local_out_if.req_data.tag
|
||||
}),
|
||||
.data_out (local_out_if.req_data),
|
||||
.ready_out (local_out_if.req_ready)
|
||||
);
|
||||
|
||||
|
|
|
@ -109,8 +109,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
assign req_data_in[i] = {
|
||||
mem_bus_if[i].req_data.rw,
|
||||
req_bank_addr[i],
|
||||
mem_bus_if[i].req_data.byteen,
|
||||
mem_bus_if[i].req_data.data,
|
||||
mem_bus_if[i].req_data.byteen,
|
||||
mem_bus_if[i].req_data.tag
|
||||
};
|
||||
assign mem_bus_if[i].req_ready = req_ready_in[i];
|
||||
|
@ -145,8 +145,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
assign {
|
||||
per_bank_req_rw[i],
|
||||
per_bank_req_addr[i],
|
||||
per_bank_req_byteen[i],
|
||||
per_bank_req_data[i],
|
||||
per_bank_req_byteen[i],
|
||||
per_bank_req_tag[i]
|
||||
} = per_bank_req_data_aos[i];
|
||||
end
|
||||
|
@ -245,7 +245,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_mem_bus_if
|
||||
assign mem_bus_if[i].rsp_valid = rsp_valid_out[i];
|
||||
assign mem_bus_if[i].rsp_data = rsp_data_out[i];
|
||||
assign mem_bus_if[i].rsp_data = rsp_data_out[i];
|
||||
assign rsp_ready_out[i] = mem_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
|
@ -299,23 +299,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
|
||||
wire [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] req_uuid;
|
||||
wire [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] rsp_uuid;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_uuid
|
||||
if (UUID_WIDTH != 0) begin : g_uuid
|
||||
assign req_uuid[i] = mem_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign rsp_uuid[i] = mem_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin : g_no_uuid
|
||||
assign req_uuid[i] = 0;
|
||||
assign rsp_uuid[i] = 0;
|
||||
end
|
||||
end
|
||||
|
||||
wire [NUM_BANKS-1:0][TAG_WIDTH-UUID_WIDTH-1:0] per_bank_req_tag_value;
|
||||
wire [NUM_BANKS-1:0][`UP(UUID_WIDTH)-1:0] per_bank_req_uuid;
|
||||
|
||||
wire [NUM_BANKS-1:0][TAG_WIDTH-UUID_WIDTH-1:0] per_bank_rsp_tag_value;
|
||||
wire [NUM_BANKS-1:0][`UP(UUID_WIDTH)-1:0] per_bank_rsp_uuid;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_req_uuid
|
||||
assign per_bank_req_tag_value[i] = per_bank_req_tag[i][TAG_WIDTH-UUID_WIDTH-1:0];
|
||||
assign per_bank_rsp_tag_value[i] = per_bank_rsp_tag[i][TAG_WIDTH-UUID_WIDTH-1:0];
|
||||
if (UUID_WIDTH != 0) begin : g_uuid
|
||||
assign per_bank_req_uuid[i] = per_bank_req_tag[i][TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign per_bank_rsp_uuid[i] = per_bank_rsp_tag[i][TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
|
@ -329,16 +321,16 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
always @(posedge clk) begin
|
||||
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
|
||||
if (mem_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, req_uuid[i]))
|
||||
`TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, req_uuid[i]))
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.tag, mem_bus_if[i].rsp_data.data[i], rsp_uuid[i]))
|
||||
`TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -347,16 +339,16 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
always @(posedge clk) begin
|
||||
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
|
||||
if (per_bank_req_rw[i]) begin
|
||||
`TRACE(2, ("%t: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_uuid[i]))
|
||||
`TRACE(2, ("%t: %s-bank%0d wr-req: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag[i], per_bank_req_uuid[i]))
|
||||
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
|
||||
end
|
||||
end
|
||||
if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin
|
||||
`TRACE(2, ("%t: %s-bank%0d rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_rsp_tag[i], per_bank_rsp_data[i], per_bank_rsp_uuid[i]))
|
||||
`TRACE(2, ("%t: %s-bank%0d rd-rsp: data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_rsp_data[i], per_bank_rsp_tag_value[i], per_bank_rsp_uuid[i]))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -92,8 +92,8 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
|
|||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_bus_rsp
|
||||
assign rsp_valid_out[i] = mem_bus_if[i].rsp_valid;
|
||||
assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data;
|
||||
assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag;
|
||||
assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data;
|
||||
assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag;
|
||||
assign mem_bus_if[i].rsp_ready = rsp_ready_out[i];
|
||||
end
|
||||
|
||||
|
|
185
hw/rtl/mem/VX_lsu_mem_arb.sv
Normal file
185
hw/rtl/mem/VX_lsu_mem_arb.sv
Normal file
|
@ -0,0 +1,185 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_lsu_mem_arb #(
|
||||
parameter NUM_INPUTS = 1,
|
||||
parameter NUM_OUTPUTS = 1,
|
||||
parameter NUM_LANES = 1,
|
||||
parameter DATA_SIZE = 1,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter REQ_OUT_BUF = 0,
|
||||
parameter RSP_OUT_BUF = 0,
|
||||
parameter `STRING ARBITER = "R",
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
||||
parameter FLAGS_WIDTH = `MEM_REQ_FLAGS_WIDTH
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
VX_lsu_mem_if.slave bus_in_if [NUM_INPUTS],
|
||||
VX_lsu_mem_if.master bus_out_if [NUM_OUTPUTS]
|
||||
);
|
||||
localparam DATA_WIDTH = (8 * DATA_SIZE);
|
||||
localparam LOG_NUM_REQS = `ARB_SEL_BITS(NUM_INPUTS, NUM_OUTPUTS);
|
||||
localparam REQ_DATAW = 1 + NUM_LANES * (1 + ADDR_WIDTH + DATA_WIDTH + DATA_SIZE + FLAGS_WIDTH) + TAG_WIDTH;
|
||||
localparam RSP_DATAW = NUM_LANES * (1 + DATA_WIDTH) + TAG_WIDTH;
|
||||
|
||||
`STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter: NUM_INPUTS=%0d, NUM_OUTPUTS=%0d", NUM_INPUTS, NUM_OUTPUTS));
|
||||
|
||||
wire [NUM_INPUTS-1:0] req_valid_in;
|
||||
wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in;
|
||||
wire [NUM_INPUTS-1:0] req_ready_in;
|
||||
|
||||
wire [NUM_OUTPUTS-1:0] req_valid_out;
|
||||
wire [NUM_OUTPUTS-1:0][REQ_DATAW-1:0] req_data_out;
|
||||
wire [NUM_OUTPUTS-1:0][`UP(LOG_NUM_REQS)-1:0] req_sel_out;
|
||||
wire [NUM_OUTPUTS-1:0] req_ready_out;
|
||||
|
||||
// Flatten the request side of each input interface into the per-input
// valid/data/ready vectors that feed the request arbiter.
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in
|
||||
assign req_valid_in[i] = bus_in_if[i].req_valid;
|
||||
// the whole request payload is forwarded as one REQ_DATAW-wide vector
assign req_data_in[i] = bus_in_if[i].req_data;
|
||||
// back-pressure from the arbiter is returned to the requester
assign bus_in_if[i].req_ready = req_ready_in[i];
|
||||
end
|
||||
|
||||
// Request arbiter: NUM_INPUTS request streams are arbitrated onto NUM_OUTPUTS
// output streams of REQ_DATAW bits each. sel_out is captured in req_sel_out
// and later merged into the outgoing request tag.
// NOTE(review): sel_out presumably identifies the granted input per output - confirm against VX_stream_arb.
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_INPUTS),
|
||||
.NUM_OUTPUTS (NUM_OUTPUTS),
|
||||
.DATAW (REQ_DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.OUT_BUF (REQ_OUT_BUF)
|
||||
) req_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (req_valid_in),
|
||||
.ready_in (req_ready_in),
|
||||
.data_in (req_data_in),
|
||||
.data_out (req_data_out),
|
||||
.sel_out (req_sel_out),
|
||||
.valid_out (req_valid_out),
|
||||
.ready_out (req_ready_out)
|
||||
);
|
||||
|
||||
// Drive the request side of each output interface from the arbiter outputs.
// The arbitrated payload is unpacked field by field; the tag goes through
// VX_bits_insert together with req_sel_out[i] before reaching the interface.
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_bus_out_if
|
||||
// tag as it left the arbiter, TAG_WIDTH bits wide
wire [TAG_WIDTH-1:0] req_tag_out;
|
||||
// NOTE(review): presumably inserts the LOG_NUM_REQS-bit selector at bit TAG_SEL_IDX
// of the tag, widening it - confirm against VX_bits_insert.
VX_bits_insert #(
|
||||
.N (TAG_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_insert (
|
||||
.data_in (req_tag_out),
|
||||
.ins_in (req_sel_out[i]),
|
||||
.data_out (bus_out_if[i].req_data.tag)
|
||||
);
|
||||
assign bus_out_if[i].req_valid = req_valid_out[i];
|
||||
// unpack the REQ_DATAW-wide arbiter output; the tag lands in req_tag_out,
// not directly on the interface, so it can be extended above
assign {
|
||||
bus_out_if[i].req_data.mask,
|
||||
bus_out_if[i].req_data.rw,
|
||||
bus_out_if[i].req_data.addr,
|
||||
bus_out_if[i].req_data.data,
|
||||
bus_out_if[i].req_data.byteen,
|
||||
bus_out_if[i].req_data.flags,
|
||||
req_tag_out
|
||||
} = req_data_out[i];
|
||||
assign req_ready_out[i] = bus_out_if[i].req_ready;
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_INPUTS-1:0] rsp_valid_out;
|
||||
wire [NUM_INPUTS-1:0][RSP_DATAW-1:0] rsp_data_out;
|
||||
wire [NUM_INPUTS-1:0] rsp_ready_out;
|
||||
|
||||
wire [NUM_OUTPUTS-1:0] rsp_valid_in;
|
||||
wire [NUM_OUTPUTS-1:0][RSP_DATAW-1:0] rsp_data_in;
|
||||
wire [NUM_OUTPUTS-1:0] rsp_ready_in;
|
||||
|
||||
// Response path: carries responses from the NUM_OUTPUTS output interfaces
// back to the NUM_INPUTS input interfaces.
// - g_rsp_enabled (NUM_INPUTS > NUM_OUTPUTS): selector bits are taken out of the
//   returning tag and used as sel_in of a VX_stream_switch.
// - g_passthru (otherwise): responses go through a VX_stream_arb with no tag handling.
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_rsp_enabled
|
||||
|
||||
// per-output selector extracted from the response tag
wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS-1:0] rsp_sel_in;
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
|
||||
// response tag with the selector bits removed (TAG_WIDTH bits)
wire [TAG_WIDTH-1:0] rsp_tag_out;
|
||||
// NOTE(review): presumably the inverse of the request-side VX_bits_insert - splits
// LOG_NUM_REQS selector bits at TAG_SEL_IDX from the tag; confirm against VX_bits_remove.
VX_bits_remove #(
|
||||
.N (TAG_WIDTH + LOG_NUM_REQS),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_remove (
|
||||
.data_in (bus_out_if[i].rsp_data.tag),
|
||||
.sel_out (rsp_sel_in[i]),
|
||||
.data_out (rsp_tag_out)
|
||||
);
|
||||
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
|
||||
// repack the response with the narrowed tag
assign rsp_data_in[i] = {
|
||||
bus_out_if[i].rsp_data.mask,
|
||||
bus_out_if[i].rsp_data.data,
|
||||
rsp_tag_out
|
||||
};
|
||||
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
|
||||
end
|
||||
|
||||
// switch roles are mirrored relative to the request side: the module's outputs
// are the switch inputs and the module's inputs are the switch outputs
VX_stream_switch #(
|
||||
.NUM_INPUTS (NUM_OUTPUTS),
|
||||
.NUM_OUTPUTS (NUM_INPUTS),
|
||||
.DATAW (RSP_DATAW),
|
||||
.OUT_BUF (RSP_OUT_BUF)
|
||||
) rsp_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (rsp_sel_in),
|
||||
.valid_in (rsp_valid_in),
|
||||
.ready_in (rsp_ready_in),
|
||||
.data_in (rsp_data_in),
|
||||
.data_out (rsp_data_out),
|
||||
.valid_out (rsp_valid_out),
|
||||
.ready_out (rsp_ready_out)
|
||||
);
|
||||
|
||||
end else begin : g_passthru
|
||||
|
||||
// no selector handling here: the response payload is forwarded unchanged
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
|
||||
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
|
||||
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
|
||||
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
|
||||
end
|
||||
|
||||
// NOTE(review): this instance carries responses but is named req_arb; it does not
// clash with the request arbiter because it lives inside the g_passthru scope.
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_OUTPUTS),
|
||||
.NUM_OUTPUTS (NUM_INPUTS),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.OUT_BUF (RSP_OUT_BUF)
|
||||
) req_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (rsp_valid_in),
|
||||
.ready_in (rsp_ready_in),
|
||||
.data_in (rsp_data_in),
|
||||
.data_out (rsp_data_out),
|
||||
.valid_out (rsp_valid_out),
|
||||
.ready_out (rsp_ready_out),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
end
|
||||
|
||||
// Hand the routed responses back to each input interface and collect the
// requester-side ready signals for the response path.
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_output
|
||||
assign bus_in_if[i].rsp_valid = rsp_valid_out[i];
|
||||
// the RSP_DATAW-wide vector is assigned to the response payload as a whole
assign bus_in_if[i].rsp_data = rsp_data_out[i];
|
||||
assign rsp_ready_out[i] = bus_in_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
endmodule
|
|
@ -16,26 +16,32 @@
|
|||
interface VX_lsu_mem_if #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter DATA_SIZE = 1,
|
||||
parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH,
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE)
|
||||
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE),
|
||||
parameter UUID_WIDTH = `UUID_WIDTH
|
||||
) ();
|
||||
|
||||
typedef struct packed {
|
||||
logic rw;
|
||||
logic [NUM_LANES-1:0] mask;
|
||||
logic [`UP(UUID_WIDTH)-1:0] uuid;
|
||||
logic [TAG_WIDTH-`UP(UUID_WIDTH)-1:0] value;
|
||||
} tag_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [NUM_LANES-1:0] mask;
|
||||
logic rw;
|
||||
logic [NUM_LANES-1:0][ADDR_WIDTH-1:0] addr;
|
||||
logic [NUM_LANES-1:0][DATA_SIZE*8-1:0] data;
|
||||
logic [NUM_LANES-1:0][DATA_SIZE-1:0] byteen;
|
||||
logic [NUM_LANES-1:0][FLAGS_WIDTH-1:0] flags;
|
||||
logic [TAG_WIDTH-1:0] tag;
|
||||
tag_t tag;
|
||||
} req_data_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [NUM_LANES-1:0] mask;
|
||||
logic [NUM_LANES-1:0] mask;
|
||||
logic [NUM_LANES-1:0][DATA_SIZE*8-1:0] data;
|
||||
logic [TAG_WIDTH-1:0] tag;
|
||||
tag_t tag;
|
||||
} rsp_data_t;
|
||||
|
||||
logic req_valid;
|
||||
|
|
|
@ -17,13 +17,14 @@ module VX_mem_arb #(
|
|||
parameter NUM_INPUTS = 1,
|
||||
parameter NUM_OUTPUTS = 1,
|
||||
parameter DATA_SIZE = 1,
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter REQ_OUT_BUF = 0,
|
||||
parameter RSP_OUT_BUF = 0,
|
||||
parameter `STRING ARBITER = "R"
|
||||
parameter `STRING ARBITER = "R",
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
||||
parameter FLAGS_WIDTH = `MEM_REQ_FLAGS_WIDTH
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -33,10 +34,10 @@ module VX_mem_arb #(
|
|||
);
|
||||
localparam DATA_WIDTH = (8 * DATA_SIZE);
|
||||
localparam LOG_NUM_REQS = `ARB_SEL_BITS(NUM_INPUTS, NUM_OUTPUTS);
|
||||
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH;
|
||||
localparam REQ_DATAW = 1 + ADDR_WIDTH + DATA_WIDTH + DATA_SIZE + FLAGS_WIDTH + TAG_WIDTH;
|
||||
localparam RSP_DATAW = DATA_WIDTH + TAG_WIDTH;
|
||||
|
||||
`STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter"))
|
||||
`STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter: NUM_INPUTS=%0d, NUM_OUTPUTS=%0d", NUM_INPUTS, NUM_OUTPUTS));
|
||||
|
||||
wire [NUM_INPUTS-1:0] req_valid_in;
|
||||
wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in;
|
||||
|
@ -49,14 +50,7 @@ module VX_mem_arb #(
|
|||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in
|
||||
assign req_valid_in[i] = bus_in_if[i].req_valid;
|
||||
assign req_data_in[i] = {
|
||||
bus_in_if[i].req_data.rw,
|
||||
bus_in_if[i].req_data.byteen,
|
||||
bus_in_if[i].req_data.addr,
|
||||
bus_in_if[i].req_data.flags,
|
||||
bus_in_if[i].req_data.data,
|
||||
bus_in_if[i].req_data.tag
|
||||
};
|
||||
assign req_data_in[i] = bus_in_if[i].req_data;
|
||||
assign bus_in_if[i].req_ready = req_ready_in[i];
|
||||
end
|
||||
|
||||
|
@ -92,10 +86,10 @@ module VX_mem_arb #(
|
|||
assign bus_out_if[i].req_valid = req_valid_out[i];
|
||||
assign {
|
||||
bus_out_if[i].req_data.rw,
|
||||
bus_out_if[i].req_data.byteen,
|
||||
bus_out_if[i].req_data.addr,
|
||||
bus_out_if[i].req_data.flags,
|
||||
bus_out_if[i].req_data.data,
|
||||
bus_out_if[i].req_data.byteen,
|
||||
bus_out_if[i].req_data.flags,
|
||||
req_tag_out
|
||||
} = req_data_out[i];
|
||||
assign req_ready_out[i] = bus_out_if[i].req_ready;
|
||||
|
@ -123,18 +117,12 @@ module VX_mem_arb #(
|
|||
.POS (TAG_SEL_IDX)
|
||||
) bits_remove (
|
||||
.data_in (bus_out_if[i].rsp_data.tag),
|
||||
.sel_out (rsp_sel_in[i]),
|
||||
.data_out (rsp_tag_out)
|
||||
);
|
||||
|
||||
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
|
||||
assign rsp_data_in[i] = {rsp_tag_out, bus_out_if[i].rsp_data.data};
|
||||
assign rsp_data_in[i] = {bus_out_if[i].rsp_data.data, rsp_tag_out};
|
||||
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
|
||||
|
||||
if (NUM_INPUTS > 1) begin : g_rsp_sel_in
|
||||
assign rsp_sel_in[i] = bus_out_if[i].rsp_data.tag[TAG_SEL_IDX +: LOG_NUM_REQS];
|
||||
end else begin : g_no_rsp_sel_in
|
||||
assign rsp_sel_in[i] = '0;
|
||||
end
|
||||
end
|
||||
|
||||
VX_stream_switch #(
|
||||
|
@ -158,10 +146,7 @@ module VX_mem_arb #(
|
|||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
|
||||
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
|
||||
assign rsp_data_in[i] = {
|
||||
bus_out_if[i].rsp_data.tag,
|
||||
bus_out_if[i].rsp_data.data
|
||||
};
|
||||
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
|
||||
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
|
||||
end
|
||||
|
||||
|
@ -187,10 +172,7 @@ module VX_mem_arb #(
|
|||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_output
|
||||
assign bus_in_if[i].rsp_valid = rsp_valid_out[i];
|
||||
assign {
|
||||
bus_in_if[i].rsp_data.tag,
|
||||
bus_in_if[i].rsp_data.data
|
||||
} = rsp_data_out[i];
|
||||
assign bus_in_if[i].rsp_data = rsp_data_out[i];
|
||||
assign rsp_ready_out[i] = bus_in_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
|
|
|
@ -18,21 +18,27 @@ interface VX_mem_bus_if #(
|
|||
parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE)
|
||||
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE),
|
||||
parameter UUID_WIDTH = `UUID_WIDTH
|
||||
) ();
|
||||
|
||||
typedef struct packed {
|
||||
logic [`UP(UUID_WIDTH)-1:0] uuid;
|
||||
logic [TAG_WIDTH-`UP(UUID_WIDTH)-1:0] value;
|
||||
} tag_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic rw;
|
||||
logic [ADDR_WIDTH-1:0] addr;
|
||||
logic [DATA_SIZE*8-1:0] data;
|
||||
logic [DATA_SIZE-1:0] byteen;
|
||||
logic [FLAGS_WIDTH-1:0] flags;
|
||||
logic [TAG_WIDTH-1:0] tag;
|
||||
tag_t tag;
|
||||
} req_data_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [DATA_SIZE*8-1:0] data;
|
||||
logic [TAG_WIDTH-1:0] tag;
|
||||
tag_t tag;
|
||||
} rsp_data_t;
|
||||
|
||||
logic req_valid;
|
||||
|
|
|
@ -14,21 +14,25 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_mem_switch import VX_gpu_pkg::*; #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter NUM_INPUTS = 1,
|
||||
parameter NUM_OUTPUTS = 1,
|
||||
parameter DATA_SIZE = 1,
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter ADDR_WIDTH = 1,
|
||||
parameter REQ_OUT_BUF = 0,
|
||||
parameter RSP_OUT_BUF = 0,
|
||||
parameter `STRING ARBITER = "R",
|
||||
parameter NUM_REQS = (NUM_INPUTS > NUM_OUTPUTS) ? `CDIV(NUM_INPUTS, NUM_OUTPUTS) : `CDIV(NUM_OUTPUTS, NUM_INPUTS),
|
||||
parameter SEL_COUNT = `MIN(NUM_INPUTS, NUM_OUTPUTS),
|
||||
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire [`UP(LOG_NUM_REQS)-1:0] bus_sel,
|
||||
VX_mem_bus_if.slave bus_in_if,
|
||||
VX_mem_bus_if.master bus_out_if [NUM_REQS]
|
||||
input wire [SEL_COUNT-1:0][`UP(LOG_NUM_REQS)-1:0] bus_sel,
|
||||
VX_mem_bus_if.slave bus_in_if [NUM_INPUTS],
|
||||
VX_mem_bus_if.master bus_out_if [NUM_OUTPUTS]
|
||||
);
|
||||
localparam DATA_WIDTH = (8 * DATA_SIZE);
|
||||
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
|
@ -36,46 +40,62 @@ module VX_mem_switch import VX_gpu_pkg::*; #(
|
|||
|
||||
// handle requests ////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0] req_valid_out;
|
||||
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_out;
|
||||
wire [NUM_REQS-1:0] req_ready_out;
|
||||
wire [NUM_INPUTS-1:0] req_valid_in;
|
||||
wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in;
|
||||
wire [NUM_INPUTS-1:0] req_ready_in;
|
||||
|
||||
wire [NUM_OUTPUTS-1:0] req_valid_out;
|
||||
wire [NUM_OUTPUTS-1:0][REQ_DATAW-1:0] req_data_out;
|
||||
wire [NUM_OUTPUTS-1:0] req_ready_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in
|
||||
assign req_valid_in[i] = bus_in_if[i].req_valid;
|
||||
assign req_data_in[i] = bus_in_if[i].req_data;
|
||||
assign bus_in_if[i].req_ready = req_ready_in[i];
|
||||
end
|
||||
|
||||
VX_stream_switch #(
|
||||
.NUM_OUTPUTS (NUM_REQS),
|
||||
.NUM_INPUTS (NUM_INPUTS),
|
||||
.NUM_OUTPUTS (NUM_OUTPUTS),
|
||||
.DATAW (REQ_DATAW),
|
||||
.OUT_BUF (REQ_OUT_BUF)
|
||||
) req_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (bus_sel),
|
||||
.valid_in (bus_in_if.req_valid),
|
||||
.data_in (bus_in_if.req_data),
|
||||
.ready_in (bus_in_if.req_ready),
|
||||
.valid_in (req_valid_in),
|
||||
.data_in (req_data_in),
|
||||
.ready_in (req_ready_in),
|
||||
.valid_out (req_valid_out),
|
||||
.data_out (req_data_out),
|
||||
.ready_out (req_ready_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_req_data_out
|
||||
assign bus_out_if[i].req_valid = req_valid_out[i];
|
||||
assign bus_out_if[i].req_data = req_data_out[i];
|
||||
assign bus_out_if[i].req_data = req_data_out[i];
|
||||
assign req_ready_out[i] = bus_out_if[i].req_ready;
|
||||
end
|
||||
|
||||
// handle responses ///////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0] rsp_valid_in;
|
||||
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_in;
|
||||
wire [NUM_REQS-1:0] rsp_ready_in;
|
||||
wire [NUM_OUTPUTS-1:0] rsp_valid_in;
|
||||
wire [NUM_OUTPUTS-1:0][RSP_DATAW-1:0] rsp_data_in;
|
||||
wire [NUM_OUTPUTS-1:0] rsp_ready_in;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
wire [NUM_INPUTS-1:0] rsp_valid_out;
|
||||
wire [NUM_INPUTS-1:0][RSP_DATAW-1:0] rsp_data_out;
|
||||
wire [NUM_INPUTS-1:0] rsp_ready_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
|
||||
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
|
||||
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
|
||||
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
|
||||
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
|
||||
end
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
.NUM_INPUTS (NUM_OUTPUTS),
|
||||
.NUM_OUTPUTS(NUM_INPUTS),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.OUT_BUF (RSP_OUT_BUF)
|
||||
|
@ -85,10 +105,16 @@ module VX_mem_switch import VX_gpu_pkg::*; #(
|
|||
.valid_in (rsp_valid_in),
|
||||
.data_in (rsp_data_in),
|
||||
.ready_in (rsp_ready_in),
|
||||
.valid_out (bus_in_if.rsp_valid),
|
||||
.data_out (bus_in_if.rsp_data),
|
||||
.ready_out (bus_in_if.rsp_ready),
|
||||
.valid_out (rsp_valid_out),
|
||||
.data_out (rsp_data_out),
|
||||
.ready_out (rsp_ready_out),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_rsp_data_out
|
||||
assign bus_in_if[i].rsp_valid = rsp_valid_out[i];
|
||||
assign bus_in_if[i].rsp_data = rsp_data_out[i];
|
||||
assign rsp_ready_out[i] = bus_in_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -152,7 +152,9 @@ public:
|
|||
|
||||
// start
|
||||
device_->reset = 0;
|
||||
device_->mem_req_ready = 1;
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
device_->mem_req_ready[b] = 1;
|
||||
}
|
||||
|
||||
// wait on device to go busy
|
||||
while (!device_->busy) {
|
||||
|
@ -186,11 +188,14 @@ private:
|
|||
this->dcr_bus_reset();
|
||||
|
||||
print_bufs_.clear();
|
||||
pending_mem_reqs_.clear();
|
||||
|
||||
{
|
||||
for (auto& reqs : pending_mem_reqs_) {
|
||||
reqs.clear();
|
||||
}
|
||||
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
std::queue<mem_req_t*> empty;
|
||||
std::swap(dram_queue_, empty);
|
||||
std::swap(dram_queue_[b], empty);
|
||||
}
|
||||
|
||||
device_->reset = 1;
|
||||
|
@ -217,17 +222,19 @@ private:
|
|||
|
||||
dram_sim_.tick();
|
||||
|
||||
if (!dram_queue_.empty()) {
|
||||
auto mem_req = dram_queue_.front();
|
||||
if (dram_sim_.send_request(mem_req->write, mem_req->addr, 0, [](void* arg) {
|
||||
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
|
||||
if (orig_req->ready) {
|
||||
delete orig_req;
|
||||
} else {
|
||||
orig_req->ready = true;
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
if (!dram_queue_[b].empty()) {
|
||||
auto mem_req = dram_queue_[b].front();
|
||||
if (dram_sim_.send_request(mem_req->write, mem_req->addr, b, [](void* arg) {
|
||||
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
|
||||
if (orig_req->ready) {
|
||||
delete orig_req;
|
||||
} else {
|
||||
orig_req->ready = true;
|
||||
}
|
||||
}, mem_req)) {
|
||||
dram_queue_[b].pop();
|
||||
}
|
||||
}, mem_req)) {
|
||||
dram_queue_.pop();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -247,101 +254,107 @@ private:
|
|||
}
|
||||
|
||||
void mem_bus_reset() {
|
||||
device_->mem_req_ready = 0;
|
||||
device_->mem_rsp_valid = 0;
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
device_->mem_req_ready[b] = 0;
|
||||
device_->mem_rsp_valid[b] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void mem_bus_eval(bool clk) {
|
||||
if (!clk) {
|
||||
mem_rd_rsp_ready_ = device_->mem_rsp_ready;
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
mem_rd_rsp_ready_[b] = device_->mem_rsp_ready[b];
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// process memory read responses
|
||||
if (device_->mem_rsp_valid && mem_rd_rsp_ready_) {
|
||||
device_->mem_rsp_valid = 0;
|
||||
}
|
||||
if (!device_->mem_rsp_valid) {
|
||||
if (!pending_mem_reqs_.empty()
|
||||
&& (*pending_mem_reqs_.begin())->ready) {
|
||||
auto mem_rsp_it = pending_mem_reqs_.begin();
|
||||
auto mem_rsp = *mem_rsp_it;
|
||||
/*printf("%0ld: [sim] MEM Rd Rsp: tag=0x%0lx, addr=0x%0lx, data=0x", timestamp, mem_rsp->tag, mem_rsp->addr);
|
||||
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
|
||||
printf("%02x", mem_rsp->data[i]);
|
||||
}
|
||||
printf("\n");
|
||||
*/
|
||||
device_->mem_rsp_valid = 1;
|
||||
memcpy(VDataCast<void*, MEM_BLOCK_SIZE>::get(device_->mem_rsp_data), mem_rsp->data.data(), MEM_BLOCK_SIZE);
|
||||
device_->mem_rsp_tag = mem_rsp->tag;
|
||||
pending_mem_reqs_.erase(mem_rsp_it);
|
||||
delete mem_rsp;
|
||||
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
|
||||
// process memory read responses
|
||||
if (device_->mem_rsp_valid[b] && mem_rd_rsp_ready_[b]) {
|
||||
device_->mem_rsp_valid[b] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// process memory requests
|
||||
if (device_->mem_req_valid && device_->mem_req_ready) {
|
||||
uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
|
||||
if (device_->mem_req_rw) {
|
||||
auto byteen = device_->mem_req_byteen;
|
||||
auto data = VDataCast<uint8_t*, MEM_BLOCK_SIZE>::get(device_->mem_req_data);
|
||||
if (byte_addr >= uint64_t(IO_COUT_ADDR)
|
||||
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
// process console output
|
||||
for (int i = 0; i < IO_COUT_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
auto& ss_buf = print_bufs_[i];
|
||||
char c = data[i];
|
||||
ss_buf << c;
|
||||
if (c == '\n') {
|
||||
std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
|
||||
ss_buf.str("");
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// process writes
|
||||
/*
|
||||
printf("%0ld: [sim] MEM Wr Req: tag=0x%0lx, addr=0x%0lx, byteen=0x", timestamp, device_->mem_req_tag, byte_addr);
|
||||
for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) {
|
||||
printf("%x", (int)((byteen >> (4 * i)) & 0xf));
|
||||
}
|
||||
printf(", data=0x");
|
||||
if (!device_->mem_rsp_valid[b]) {
|
||||
if (!pending_mem_reqs_[b].empty()
|
||||
&& (*pending_mem_reqs_[b].begin())->ready) {
|
||||
auto mem_rsp_it = pending_mem_reqs_[b].begin();
|
||||
auto mem_rsp = *mem_rsp_it;
|
||||
/*printf("%0ld: [sim] MEM Rd Rsp: tag=0x%0lx, addr=0x%0lx, data=0x", timestamp, mem_rsp->tag, mem_rsp->addr);
|
||||
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
|
||||
printf("%d=%02x,", i, data[i]);
|
||||
printf("%02x", mem_rsp->data[i]);
|
||||
}
|
||||
printf("\n");
|
||||
*/
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
(*ram_)[byte_addr + i] = data[i];
|
||||
}
|
||||
}
|
||||
device_->mem_rsp_valid[b] = 1;
|
||||
memcpy(VDataCast<void*, MEM_BLOCK_SIZE>::get(device_->mem_rsp_data[b]), mem_rsp->data.data(), MEM_BLOCK_SIZE);
|
||||
device_->mem_rsp_tag[b] = mem_rsp->tag;
|
||||
pending_mem_reqs_[b].erase(mem_rsp_it);
|
||||
delete mem_rsp;
|
||||
}
|
||||
}
|
||||
|
||||
// process memory requests
|
||||
if (device_->mem_req_valid[b] && device_->mem_req_ready[b]) {
|
||||
uint64_t byte_addr = (device_->mem_req_addr[b] * MEM_BLOCK_SIZE);
|
||||
if (device_->mem_req_rw[b]) {
|
||||
auto byteen = device_->mem_req_byteen[b];
|
||||
auto data = VDataCast<uint8_t*, MEM_BLOCK_SIZE>::get(device_->mem_req_data[b]);
|
||||
if (byte_addr >= uint64_t(IO_COUT_ADDR)
|
||||
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
// process console output
|
||||
for (int i = 0; i < IO_COUT_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
auto& ss_buf = print_bufs_[i];
|
||||
char c = data[i];
|
||||
ss_buf << c;
|
||||
if (c == '\n') {
|
||||
std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
|
||||
ss_buf.str("");
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// process writes
|
||||
/*
|
||||
printf("%0ld: [sim] MEM Wr Req: tag=0x%0lx, addr=0x%0lx, byteen=0x", timestamp, device_->mem_req_tag, byte_addr);
|
||||
for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) {
|
||||
printf("%x", (int)((byteen >> (4 * i)) & 0xf));
|
||||
}
|
||||
printf(", data=0x");
|
||||
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
|
||||
printf("%d=%02x,", i, data[i]);
|
||||
}
|
||||
printf("\n");
|
||||
*/
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
(*ram_)[byte_addr + i] = data[i];
|
||||
}
|
||||
}
|
||||
|
||||
auto mem_req = new mem_req_t();
|
||||
mem_req->tag = device_->mem_req_tag[b];
|
||||
mem_req->addr = byte_addr;
|
||||
mem_req->write = true;
|
||||
mem_req->ready = true;
|
||||
|
||||
// send dram request
|
||||
dram_queue_[b].push(mem_req);
|
||||
}
|
||||
} else {
|
||||
// process reads
|
||||
auto mem_req = new mem_req_t();
|
||||
mem_req->tag = device_->mem_req_tag;
|
||||
mem_req->tag = device_->mem_req_tag[b];
|
||||
mem_req->addr = byte_addr;
|
||||
mem_req->write = true;
|
||||
mem_req->ready = true;
|
||||
mem_req->write = false;
|
||||
mem_req->ready = false;
|
||||
ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE);
|
||||
pending_mem_reqs_[b].emplace_back(mem_req);
|
||||
|
||||
//printf("%0ld: [sim] MEM Rd Req: addr=0x%0lx, tag=0x%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
|
||||
|
||||
// send dram request
|
||||
dram_queue_.push(mem_req);
|
||||
dram_queue_[b].push(mem_req);
|
||||
}
|
||||
} else {
|
||||
// process reads
|
||||
auto mem_req = new mem_req_t();
|
||||
mem_req->tag = device_->mem_req_tag;
|
||||
mem_req->addr = byte_addr;
|
||||
mem_req->write = false;
|
||||
mem_req->ready = false;
|
||||
ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE);
|
||||
pending_mem_reqs_.emplace_back(mem_req);
|
||||
|
||||
//printf("%0ld: [sim] MEM Rd Req: addr=0x%0lx, tag=0x%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
|
||||
|
||||
// send dram request
|
||||
dram_queue_.push(mem_req);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -369,21 +382,21 @@ private:
|
|||
|
||||
std::unordered_map<int, std::stringstream> print_bufs_;
|
||||
|
||||
std::list<mem_req_t*> pending_mem_reqs_;
|
||||
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS];
|
||||
|
||||
std::queue<mem_req_t*> dram_queue_;
|
||||
std::queue<mem_req_t*> dram_queue_[PLATFORM_MEMORY_BANKS];
|
||||
|
||||
std::array<bool, PLATFORM_MEMORY_BANKS> mem_rd_rsp_ready_;
|
||||
|
||||
DramSim dram_sim_;
|
||||
|
||||
VVortex* device_;
|
||||
|
||||
RAM* ram_;
|
||||
|
||||
#ifdef VCD_OUTPUT
|
||||
VerilatedVcdC *tfp_;
|
||||
#endif
|
||||
|
||||
bool mem_rd_rsp_ready_;
|
||||
|
||||
RAM* ram_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
|
Loading…
Add table
Reference in a new issue