ramulator memory addressing bug fix + platform memory refactoring
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions

This commit is contained in:
tinebp 2025-01-26 06:28:51 -08:00
parent e80ee2c819
commit 22398c991d
33 changed files with 310 additions and 281 deletions

View file

@ -301,11 +301,11 @@ config2()
# test single-bank memory
if [ "$XLEN" == "64" ]; then
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=48" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=48" ./ci/blackbox.sh --driver=xrt --app=mstress
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
else
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32" ./ci/blackbox.sh --driver=xrt --app=mstress
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
fi
# test larger memory address
@ -322,10 +322,10 @@ config2()
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress
# test memory ports
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=opae --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=xrt --app=mstress --threads=8

View file

@ -172,8 +172,26 @@
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`endif
`ifndef PLATFORM_MEMORY_BANKS
`define PLATFORM_MEMORY_BANKS 2
// Platform memory parameters
`ifndef PLATFORM_MEMORY_NUM_BANKS
`define PLATFORM_MEMORY_NUM_BANKS 2
`endif
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
`ifdef XLEN_64
`define PLATFORM_MEMORY_ADDR_WIDTH 48
`else
`define PLATFORM_MEMORY_ADDR_WIDTH 32
`endif
`endif
`ifndef PLATFORM_MEMORY_DATA_SIZE
`define PLATFORM_MEMORY_DATA_SIZE 64
`endif
`ifndef PLATFORM_MEMORY_INTERLEAVE
`define PLATFORM_MEMORY_INTERLEAVE 1
`endif
`ifdef XLEN_64
@ -656,9 +674,9 @@
// Number of Memory Ports
`ifndef L1_MEM_PORTS
`ifdef L1_DISABLE
`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_BANKS)
`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`else
`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_BANKS)
`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`endif
`endif
@ -735,9 +753,9 @@
// Number of Memory Ports
`ifndef L2_MEM_PORTS
`ifdef L2_ENABLE
`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_BANKS)
`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`else
`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_BANKS)
`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`endif
`endif
@ -796,9 +814,9 @@
// Number of Memory Ports
`ifndef L3_MEM_PORTS
`ifdef L3_ENABLE
`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_BANKS)
`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`else
`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_BANKS)
`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`endif
`endif

View file

@ -193,7 +193,7 @@ module Vortex_axi import VX_gpu_pkg::*; #(
.TAG_WIDTH_OUT (AXI_TID_WIDTH),
.NUM_PORTS_IN (`VX_MEM_PORTS),
.NUM_BANKS_OUT (AXI_NUM_BANKS),
.INTERLEAVE (0),
.INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter (

View file

@ -28,18 +28,18 @@
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//`include "platform_afu_top_config.vh"
`include "VX_define.vh"
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH (`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_DATA_WIDTH/8))
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH ((`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS)) - $clog2(`PLATFORM_MEMORY_DATA_SIZE))
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH `PLATFORM_MEMORY_DATA_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH (`PLATFORM_MEMORY_DATA_SIZE * 8)
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH `PLATFORM_MEMORY_BURST_CNT_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
`endif
package local_mem_cfg_pkg;

View file

@ -11,18 +11,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`ifndef NOPAE
`include "afu_json_info.vh"
`else
`include "vortex_afu.vh"
`endif
`include "VX_define.vh"
`ifndef PLATFORM_MEMORY_INTERLEAVE
`define PLATFORM_MEMORY_INTERLEAVE 1
`endif
module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_gpu_pkg::*; #(
parameter NUM_LOCAL_MEM_BANKS = 2
) (

View file

@ -134,10 +134,12 @@ module VX_afu_ctrl #(
RSTATE_RESP = 2'd2,
RSTATE_WIDTH = 2;
localparam MEMORY_BANK_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - `CLOG2(`PLATFORM_MEMORY_NUM_BANKS);
// device caps
wire [63:0] dev_caps = {8'b0,
5'(`PLATFORM_MEMORY_ADDR_WIDTH-20),
3'(`CLOG2(`PLATFORM_MEMORY_BANKS)),
5'(MEMORY_BANK_ADDR_WIDTH-20),
3'(`CLOG2(`PLATFORM_MEMORY_NUM_BANKS)),
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),

View file

@ -31,7 +31,7 @@ module VX_afu_wrap #(
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif
// AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid,
@ -58,11 +58,7 @@ module VX_afu_wrap #(
output wire interrupt
);
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH + $clog2(`PLATFORM_MEMORY_BANKS);
`else
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH;
`endif
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - $clog2(C_M_AXI_MEM_NUM_BANKS);
typedef enum logic [1:0] {
STATE_IDLE = 0,
@ -71,8 +67,8 @@ module VX_afu_wrap #(
STATE_DONE = 3
} state_e;
localparam PENDING_SIZEW = 12; // max outstanding requests size
localparam C_M_AXI_MEM_NUM_BANKS_SW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
localparam PENDING_WR_SIZEW = 12; // max outstanding requests size
localparam NUM_MEM_BANKS_SIZEW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS];
@ -108,11 +104,11 @@ module VX_afu_wrap #(
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`else
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`endif
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
reg [PENDING_SIZEW-1:0] vx_pending_writes;
reg [PENDING_WR_SIZEW-1:0] vx_pending_writes;
reg vx_reset = 1; // asserted at initialization
wire vx_busy;
@ -200,7 +196,7 @@ module VX_afu_wrap #(
end
wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire;
wire [C_M_AXI_MEM_NUM_BANKS_SW-1:0] cur_wr_reqs, cur_wr_rsps;
wire [NUM_MEM_BANKS_SIZEW-1:0] cur_wr_reqs, cur_wr_rsps;
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire
VX_axi_write_ack axi_write_ack (
@ -224,14 +220,14 @@ module VX_afu_wrap #(
`POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire);
`POP_COUNT(cur_wr_rsps, m_axi_wr_rsp_fire);
wire signed [C_M_AXI_MEM_NUM_BANKS_SW:0] reqs_sub = (C_M_AXI_MEM_NUM_BANKS_SW+1)'(cur_wr_reqs) -
(C_M_AXI_MEM_NUM_BANKS_SW+1)'(cur_wr_rsps);
wire signed [NUM_MEM_BANKS_SIZEW:0] reqs_sub = (NUM_MEM_BANKS_SIZEW+1)'(cur_wr_reqs) -
(NUM_MEM_BANKS_SIZEW+1)'(cur_wr_rsps);
always @(posedge clk) begin
if (reset) begin
vx_pending_writes <= '0;
end else begin
vx_pending_writes <= vx_pending_writes + PENDING_SIZEW'(reqs_sub);
vx_pending_writes <= vx_pending_writes + PENDING_WR_SIZEW'(reqs_sub);
end
end
@ -270,7 +266,7 @@ module VX_afu_wrap #(
.ap_ready (ap_ready),
.ap_idle (ap_idle),
.interrupt (interrupt),
.ap_ctrl_read (ap_ctrl_read),
`ifdef SCOPE

View file

@ -17,12 +17,12 @@ module vortex_afu #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
parameter C_M_AXI_MEM_NUM_BANKS = 1
`else
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
`endif
) (
// System signals
@ -33,7 +33,7 @@ module vortex_afu #(
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif
// AXI4-Lite slave interface
@ -75,7 +75,7 @@ module vortex_afu #(
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_ARGS, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
`endif
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
.s_axi_ctrl_awready (s_axi_ctrl_awready),
@ -94,7 +94,7 @@ module vortex_afu #(
.s_axi_ctrl_rready (s_axi_ctrl_rready),
.s_axi_ctrl_rdata (s_axi_ctrl_rdata),
.s_axi_ctrl_rresp (s_axi_ctrl_rresp),
.s_axi_ctrl_bvalid (s_axi_ctrl_bvalid),
.s_axi_ctrl_bready (s_axi_ctrl_bready),
.s_axi_ctrl_bresp (s_axi_ctrl_bresp),

View file

@ -14,18 +14,6 @@
`ifndef VORTEX_AFU_VH
`define VORTEX_AFU_VH
`ifndef PLATFORM_MEMORY_BANKS
`define PLATFORM_MEMORY_BANKS 2
`endif
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
`define PLATFORM_MEMORY_ADDR_WIDTH 31
`endif
`ifndef PLATFORM_MEMORY_DATA_WIDTH
`define PLATFORM_MEMORY_DATA_WIDTH 512
`endif
`ifndef PLATFORM_MEMORY_OFFSET
`define PLATFORM_MEMORY_OFFSET 0
`endif

View file

@ -221,7 +221,7 @@ module VX_async_ram_patch #(
VX_placeholder #(
.O (1)
) placeholder2 (
.in (),
.in (1'b0),
.out (is_raddr_reg)
);
wire [DATAW-1:0] rdata_a;

View file

@ -280,7 +280,13 @@ module VX_axi_adapter #(
end
assign m_axi_arvalid[i] = req_xbar_valid_out[i] && ~xbar_rw_out;
assign m_axi_araddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE;
// convert address to byte-addressable space
if (INTERLEAVE) begin : g_m_axi_araddr_i
assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << (BANK_SEL_BITS + LOG2_DATA_SIZE)) | (ADDR_WIDTH_OUT'(i) << LOG2_DATA_SIZE);
end else begin : g_m_axi_araddr_ni
assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE) | (ADDR_WIDTH_OUT'(i) << (BANK_ADDR_WIDTH + LOG2_DATA_SIZE));
end
assign m_axi_arid[i] = TAG_WIDTH_OUT'(xbar_tag_r_out);
assign m_axi_arlen[i] = 8'b00000000;
assign m_axi_arsize[i] = 3'(LOG2_DATA_SIZE);

View file

@ -7,22 +7,6 @@ include ../../common.mk
# AFU parameters
CONFIGS += -DNOPAE
CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
ifeq ($(XLEN),64)
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
else
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
endif
endif
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
endif
ifeq (,$(findstring PLATFORM_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BURST_CNT_WIDTH=4
endif
#CONFIGS += -DNUM_CORES=2
#CONFIGS += -DNUM_WARPS=32

View file

@ -99,7 +99,7 @@ ifdef PERF
endif
# ast dump flags
XML_CFLAGS = $(filter-out -DSYNTHESIS -DQUARTUS, $(CFLAGS)) $(RTL_PKGS) -I$(AFU_DIR)/ccip -I$(DPI_DIR) -DPLATFORM_PROVIDES_LOCAL_MEMORY -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32 -DPLATFORM_MEMORY_DATA_WIDTH=512 -DPLATFORM_MEMORY_BURST_CNT_WIDTH=4 -DNOPAE -DSV_DPI
XML_CFLAGS = $(filter-out -DSYNTHESIS -DQUARTUS, $(CFLAGS)) $(RTL_PKGS) -I$(AFU_DIR)/ccip -I$(DPI_DIR) -DPLATFORM_PROVIDES_LOCAL_MEMORY -DPLATFORM_MEMORY_NUM_BANKS=1 -DNOPAE -DSV_DPI
all: swconfig ip-gen setup build

View file

@ -52,7 +52,7 @@ foreach def $vdefines_list {
if { $name == "CHIPSCOPE" } {
set chipscope 1
}
if { $name == "PLATFORM_MEMORY_BANKS" } {
if { $name == "PLATFORM_MEMORY_NUM_BANKS" } {
set num_banks [lindex $fields 1]
}
if { $name == "PLATFORM_MERGED_MEMORY_INTERFACE" } {

View file

@ -5,31 +5,36 @@ CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
ifeq ($(DEV_ARCH), zynquplus)
# zynquplus
CONFIGS += -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
else ifeq ($(DEV_ARCH), versal)
# versal
CONFIGS += -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
ifneq ($(findstring xilinx_vck5000,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_OFFSET=40'hC000000000
endif
else
# alveo
ifneq ($(findstring xilinx_u55c,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=29
# 16 GB of HBM2 with 32 channels (512 MB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=34
CONFIGS += -DPLATFORM_MERGED_MEMORY_INTERFACE
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
#VPP_FLAGS += $(foreach i,$(shell seq 0 31), --connectivity.sp vortex_afu_1.m_axi_mem_$(i):HBM[$(i)])
else ifneq ($(findstring xilinx_u50,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=28
# 8 GB of HBM2 with 32 channels (256 MB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
else ifneq ($(findstring xilinx_u280,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=28
# 8 GB of HBM2 with 32 channels (256 MB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
else ifneq ($(findstring xilinx_u250,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=34
# 64 GB of DDR4 with 4 channels (16 GB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36
else ifneq ($(findstring xilinx_u200,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=34
# 64 GB of DDR4 with 4 channels (16 GB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36
else
CONFIGS += -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
endif
endif

View file

@ -78,10 +78,10 @@ public:
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
break;
case VX_CAPS_NUM_MEM_BANKS:
_value = PLATFORM_MEMORY_BANKS;
_value = PLATFORM_MEMORY_NUM_BANKS;
break;
case VX_CAPS_MEM_BANK_SIZE:
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_BANKS);
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS);
break;
default:
std::cout << "invalid caps id: " << caps_id << std::endl;

View file

@ -113,10 +113,10 @@ public:
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
break;
case VX_CAPS_NUM_MEM_BANKS:
_value = PLATFORM_MEMORY_BANKS;
_value = PLATFORM_MEMORY_NUM_BANKS;
break;
case VX_CAPS_MEM_BANK_SIZE:
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_BANKS);
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS);
break;
default:
std::cout << "invalid caps id: " << caps_id << std::endl;

View file

@ -29,19 +29,54 @@ using namespace vortex;
class DramSim::Impl {
private:
// One memory request waiting in pending_reqs_ to be submitted to Ramulator.
// Instances are created by positional brace-initialization, so the field
// order below is part of the contract.
struct mem_req_t {
// address handed to the Ramulator frontend for this request
uint64_t addr;
// true for a write request, false for a read request
bool is_write;
// completion callback; may be null when no notification is wanted
ResponseCallback callback;
// opaque pointer passed back to callback when it is invoked
void* arg;
};
Ramulator::IFrontEnd* ramulator_frontend_;
Ramulator::IMemorySystem* ramulator_memorysystem_;
uint32_t cpu_channel_size_;
uint64_t cpu_cycles_;
uint32_t scaled_dram_cycles_;
static const uint32_t tick_cycles_ = 1000;
static const uint32_t dram_channel_size_ = 16; // 128 bits
std::queue<mem_req_t> pending_reqs_;
// Tries to submit the oldest queued request to the Ramulator frontend.
//
// At most one request is submitted per call. If the frontend rejects the
// request, it is left at the head of pending_reqs_ so the next call sees the
// same entry again. For accepted writes the user callback is invoked here.
void handle_pending_requests() {
  if (pending_reqs_.empty())
    return;

  auto& req = pending_reqs_.front();
  const auto req_type = req.is_write ? Ramulator::Request::Type::Write
                                     : Ramulator::Request::Type::Read;

  // Adapt the user callback (if any) to the signature Ramulator expects.
  // The callback and its argument are captured by copy rather than moved:
  // the queued entry is read again below and must stay intact if the
  // request is rejected and retried.
  std::function<void(Ramulator::Request&)> callback = nullptr;
  if (req.callback) {
    callback = [req_callback = req.callback, req_arg = req.arg](Ramulator::Request& /*dram_req*/) {
      req_callback(req_arg);
    };
  }

  if (!ramulator_frontend_->receive_external_requests(req_type, req.addr, 0, callback))
    return; // not accepted: keep the request queued

  // Ramulator does not handle write responses, so we fire the callback ourselves.
  if (req.is_write && req.callback) {
    req.callback(req.arg);
  }

  pending_reqs_.pop();
}
public:
Impl(int clock_ratio) {
Impl(uint32_t num_channels, uint32_t channel_size, float clock_ratio) {
YAML::Node dram_config;
dram_config["Frontend"]["impl"] = "GEM5";
dram_config["MemorySystem"]["impl"] = "GenericDRAM";
dram_config["MemorySystem"]["clock_ratio"] = clock_ratio;
dram_config["MemorySystem"]["clock_ratio"] = 1;
dram_config["MemorySystem"]["DRAM"]["impl"] = "HBM2";
dram_config["MemorySystem"]["DRAM"]["org"]["preset"] = "HBM2_8Gb";
dram_config["MemorySystem"]["DRAM"]["org"]["density"] = 8192;
dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = 8;
dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = num_channels;
dram_config["MemorySystem"]["DRAM"]["timing"]["preset"] = "HBM2_2Gbps";
dram_config["MemorySystem"]["Controller"]["impl"] = "Generic";
dram_config["MemorySystem"]["Controller"]["Scheduler"]["impl"] = "FRFCFS";
@ -59,6 +94,10 @@ public:
ramulator_memorysystem_ = Ramulator::Factory::create_memory_system(dram_config);
ramulator_frontend_->connect_memory_system(ramulator_memorysystem_);
ramulator_memorysystem_->connect_frontend(ramulator_frontend_);
cpu_channel_size_ = channel_size;
scaled_dram_cycles_ = static_cast<uint64_t>(clock_ratio * tick_cycles_);
this->reset();
}
~Impl() {
@ -66,41 +105,49 @@ public:
auto original_buf = std::cout.rdbuf();
std::cout.rdbuf(nullstream.rdbuf());
ramulator_frontend_->finalize();
ramulator_memorysystem_->finalize();
ramulator_memorysystem_->finalize();
std::cout.rdbuf(original_buf);
}
void reset() {
//--
cpu_cycles_ = 0;
}
void tick() {
ramulator_memorysystem_->tick();
cpu_cycles_ += tick_cycles_;
while (cpu_cycles_ >= scaled_dram_cycles_) {
this->handle_pending_requests();
ramulator_memorysystem_->tick();
cpu_cycles_ -= scaled_dram_cycles_;
}
}
bool send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback response_cb, void* arg) {
if (!ramulator_frontend_->receive_external_requests(
is_write ? Ramulator::Request::Type::Write : Ramulator::Request::Type::Read,
addr,
source_id,
[callback_ = std::move(response_cb), arg_ = std::move(arg)](Ramulator::Request& /*dram_req*/) {
callback_(arg_);
void send_request(uint64_t addr, bool is_write, ResponseCallback response_cb, void* arg) {
// enqueue the request
if (cpu_channel_size_ > dram_channel_size_) {
uint32_t n = cpu_channel_size_ / dram_channel_size_;
for (uint32_t i = 0; i < n; ++i) {
uint64_t dram_byte_addr = (addr / cpu_channel_size_) * dram_channel_size_ + (i * dram_channel_size_);
if (i == 0) {
pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg});
} else {
pending_reqs_.push({dram_byte_addr, is_write, nullptr, nullptr});
}
}
)) {
return false;
} else if (cpu_channel_size_ < dram_channel_size_) {
uint64_t dram_byte_addr = (addr / cpu_channel_size_) * dram_channel_size_;
pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg});
} else {
uint64_t dram_byte_addr = addr;
pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg});
}
if (is_write) {
// Ramulator does not handle write responses, so we call the callback ourselves
response_cb(arg);
}
return true;
}
}
};
///////////////////////////////////////////////////////////////////////////////
DramSim::DramSim(int clock_ratio)
: impl_(new Impl(clock_ratio))
DramSim::DramSim(uint32_t num_channels, uint32_t channel_size, float clock_ratio)
: impl_(new Impl(num_channels, channel_size, clock_ratio))
{}
DramSim::~DramSim() {
@ -115,6 +162,6 @@ void DramSim::tick() {
impl_->tick();
}
bool DramSim::send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback callback, void* arg) {
return impl_->send_request(is_write, addr, source_id, callback, arg);
void DramSim::send_request(uint64_t addr, bool is_write, ResponseCallback callback, void* arg) {
impl_->send_request(addr, is_write, callback, arg);
}

View file

@ -19,14 +19,15 @@ class DramSim {
public:
typedef void (*ResponseCallback)(void *arg);
DramSim(int clock_ratio);
DramSim(uint32_t num_channels, uint32_t channel_size, float clock_ratio);
~DramSim();
void reset();
void tick();
bool send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback response_cb, void* arg);
// addr: byte address of the requested memory block (the implementation rescales it when the CPU and DRAM channel sizes differ)
void send_request(uint64_t addr, bool is_write, ResponseCallback response_cb, void* arg);
private:
class Impl;

View file

@ -31,24 +31,6 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
# Platform parameters
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
ifeq ($(XLEN),64)
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
else
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
endif
endif
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
endif
ifeq (,$(findstring PLATFORM_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BURST_CNT_WIDTH=4
endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp

View file

@ -35,8 +35,6 @@
#include <unordered_map>
#include <util.h>
#define PLATFORM_MEMORY_DATA_SIZE (PLATFORM_MEMORY_DATA_WIDTH/8)
#ifndef MEM_CLOCK_RATIO
#define MEM_CLOCK_RATIO 1
#endif
@ -66,6 +64,8 @@
using namespace vortex;
static uint32_t g_mem_bank_addr_width = (PLATFORM_MEMORY_ADDR_WIDTH - log2ceil(PLATFORM_MEMORY_NUM_BANKS));
static uint64_t timestamp = 0;
double sc_time_stamp() {
@ -95,7 +95,7 @@ public:
Impl()
: device_(nullptr)
, ram_(nullptr)
, dram_sim_(MEM_CLOCK_RATIO)
, dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO)
, stop_(false)
, host_buffer_ids_(0)
#ifdef VCD_OUTPUT
@ -146,9 +146,6 @@ public:
// allocate RAM
ram_ = new RAM(0, RAM_PAGE_SIZE);
// calculate memory bank size
mem_bank_size_ = 1ull << PLATFORM_MEMORY_ADDR_WIDTH;
// reset the device
this->reset();
@ -274,16 +271,15 @@ private:
if (!dram_queue_.empty()) {
auto mem_req = dram_queue_.front();
if (dram_sim_.send_request(mem_req->write, mem_req->addr, mem_req->bank_id, [](void* arg) {
dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) {
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
if (orig_req->ready) {
delete orig_req;
} else {
orig_req->ready = true;
}
}, mem_req)) {
dram_queue_.pop();
}
}, mem_req);
dram_queue_.pop();
}
dram_sim_.tick();
@ -407,14 +403,14 @@ private:
}
void avs_bus_reset() {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
device_->avs_readdatavalid[b] = 0;
device_->avs_waitrequest[b] = 0;
}
}
void avs_bus_eval() {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
// process memory responses
device_->avs_readdatavalid[b] = 0;
if (!pending_mem_reqs_[b].empty()
@ -430,7 +426,12 @@ private:
// process memory requests
assert(!device_->avs_read[b] || !device_->avs_write[b]);
uint64_t byte_addr = b * mem_bank_size_ + uint64_t(device_->avs_address[b]) * PLATFORM_MEMORY_DATA_SIZE;
#if PLATFORM_MEMORY_INTERLEAVE == 1
uint64_t byte_addr = (uint64_t(device_->avs_address[b]) * PLATFORM_MEMORY_NUM_BANKS + b) * PLATFORM_MEMORY_DATA_SIZE;
#else
uint64_t byte_addr = (uint64_t(device_->avs_address[b]) + (b << g_mem_bank_addr_width)) * PLATFORM_MEMORY_DATA_SIZE;
#endif
if (device_->avs_write[b]) {
// process write request
uint64_t byteen = device_->avs_byteenable[b];
@ -515,9 +516,8 @@ private:
std::unordered_map<int64_t, host_buffer_t> host_buffers_;
uint64_t host_buffer_ids_;
uint64_t mem_bank_size_;
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS];
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS];
std::list<cci_rd_req_t> cci_reads_;
std::list<cci_wr_req_t> cci_writes_;

View file

@ -78,22 +78,22 @@ module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; (
output t_ccip_mmioData af2cp_sTxPort_c2_data,
// Avalon signals for local memory access
output t_local_mem_data avs_writedata [`PLATFORM_MEMORY_BANKS],
input t_local_mem_data avs_readdata [`PLATFORM_MEMORY_BANKS],
output t_local_mem_addr avs_address [`PLATFORM_MEMORY_BANKS],
input logic avs_waitrequest [`PLATFORM_MEMORY_BANKS],
output logic avs_write [`PLATFORM_MEMORY_BANKS],
output logic avs_read [`PLATFORM_MEMORY_BANKS],
output t_local_mem_byte_mask avs_byteenable [`PLATFORM_MEMORY_BANKS],
output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_MEMORY_BANKS],
input avs_readdatavalid [`PLATFORM_MEMORY_BANKS]
output t_local_mem_data avs_writedata [`PLATFORM_MEMORY_NUM_BANKS],
input t_local_mem_data avs_readdata [`PLATFORM_MEMORY_NUM_BANKS],
output t_local_mem_addr avs_address [`PLATFORM_MEMORY_NUM_BANKS],
input logic avs_waitrequest [`PLATFORM_MEMORY_NUM_BANKS],
output logic avs_write [`PLATFORM_MEMORY_NUM_BANKS],
output logic avs_read [`PLATFORM_MEMORY_NUM_BANKS],
output t_local_mem_byte_mask avs_byteenable [`PLATFORM_MEMORY_NUM_BANKS],
output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_MEMORY_NUM_BANKS],
input avs_readdatavalid [`PLATFORM_MEMORY_NUM_BANKS]
);
t_if_ccip_Rx cp2af_sRxPort;
t_if_ccip_Tx af2cp_sTxPort;
vortex_afu #(
.NUM_LOCAL_MEM_BANKS(`PLATFORM_MEMORY_BANKS)
.NUM_LOCAL_MEM_BANKS(`PLATFORM_MEMORY_NUM_BANKS)
) afu (
.clk(clk),
.reset(reset),

View file

@ -24,21 +24,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR
# Platform parameters
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
ifeq ($(XLEN),64)
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
else
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
endif
endif
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv

View file

@ -35,8 +35,6 @@
#include <dram_sim.h>
#include <util.h>
#define PLATFORM_MEMORY_DATA_SIZE (PLATFORM_MEMORY_DATA_WIDTH/8)
#ifndef MEM_CLOCK_RATIO
#define MEM_CLOCK_RATIO 1
#endif
@ -66,6 +64,8 @@ typedef uint64_t Word;
using namespace vortex;
static uint32_t g_mem_bank_addr_width = (PLATFORM_MEMORY_ADDR_WIDTH - log2ceil(PLATFORM_MEMORY_NUM_BANKS));
static uint64_t timestamp = 0;
double sc_time_stamp() {
@ -93,7 +93,7 @@ void sim_trace_enable(bool enable) {
class Processor::Impl {
public:
Impl() : dram_sim_(MEM_CLOCK_RATIO) {
Impl() : dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO) {
// force random values for uninitialized signals
Verilated::randReset(VERILATOR_RESET_VALUE);
Verilated::randSeed(50);
@ -154,7 +154,7 @@ public:
// start
device_->reset = 0;
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
device_->mem_req_ready[b] = 1;
}
@ -195,7 +195,7 @@ private:
reqs.clear();
}
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
std::queue<mem_req_t*> empty;
std::swap(dram_queue_[b], empty);
}
@ -224,17 +224,15 @@ private:
dram_sim_.tick();
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
if (!dram_queue_[b].empty()) {
auto mem_req = dram_queue_[b].front();
if (dram_sim_.send_request(mem_req->write, mem_req->addr, b, [](void* arg) {
dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) {
// mark completed request as ready
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
orig_req->ready = true;
}, mem_req)) {
// was successfully sent to dram, remove from queue
dram_queue_[b].pop();
}
}, mem_req);
dram_queue_[b].pop();
}
}
@ -254,7 +252,7 @@ private:
}
void mem_bus_reset() {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
device_->mem_req_ready[b] = 0;
device_->mem_rsp_valid[b] = 0;
}
@ -262,13 +260,13 @@ private:
void mem_bus_eval(bool clk) {
if (!clk) {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
mem_rd_rsp_ready_[b] = device_->mem_rsp_ready[b];
}
return;
}
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
// process memory responses
if (device_->mem_rsp_valid[b] && mem_rd_rsp_ready_[b]) {
device_->mem_rsp_valid[b] = 0;
@ -293,11 +291,16 @@ private:
// process memory requests
if (device_->mem_req_valid[b] && device_->mem_req_ready[b]) {
uint64_t byte_addr = (device_->mem_req_addr[b] * PLATFORM_MEMORY_DATA_SIZE);
#if PLATFORM_MEMORY_INTERLEAVE == 1
// interleaved layout: the bank-local block index is scaled by the bank count
// and offset by the bank id, then converted from block units to bytes
uint64_t byte_addr = (uint64_t(device_->mem_req_addr[b]) * PLATFORM_MEMORY_NUM_BANKS + b) * PLATFORM_MEMORY_DATA_SIZE;
#else
// linear layout: the bank id is placed above the bank-local block index
// (g_mem_bank_addr_width is presumably the per-bank block-address width - confirm).
// b is widened to 64 bits before shifting: shifting a 32-bit int by 32 or more
// bits is undefined behavior and smaller shifts can still overflow int.
uint64_t byte_addr = (uint64_t(device_->mem_req_addr[b]) + (uint64_t(b) << g_mem_bank_addr_width)) * PLATFORM_MEMORY_DATA_SIZE;
#endif
// check read/write
if (device_->mem_req_rw[b]) {
auto byteen = device_->mem_req_byteen[b];
auto data = VDataCast<uint8_t*, PLATFORM_MEMORY_DATA_SIZE>::get(device_->mem_req_data[b]);
// check address range
// check if console output address
if (byte_addr >= uint64_t(IO_COUT_ADDR)
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
// process console output
@ -313,21 +316,23 @@ private:
}
}
} else {
// process writes
// process memory writes
/*printf("%0ld: [sim] MEM Wr Req[%d]: addr=0x%0lx, tag=0x%0lx, byteen=0x", timestamp, b, byte_addr, device_->mem_req_tag[b]);
for (int i = (PLATFORM_MEMORY_DATA_SIZE/4)-1; i >= 0; --i) {
printf("%x", (int)((byteen >> (4 * i)) & 0xf));
}
printf(", data=0x");
for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) {
printf("%d=%02x,", i, data[i]);
printf("%02x", data[i]);
}
printf("\n");*/
for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; i++) {
if ((byteen >> i) & 0x1) {
(*ram_)[byte_addr + i] = data[i];
}
}
auto mem_req = new mem_req_t();
mem_req->tag = device_->mem_req_tag[b];
mem_req->addr = byte_addr;
@ -341,7 +346,7 @@ private:
pending_mem_reqs_[b].emplace_back(mem_req);
}
} else {
// process reads
// process memory reads
auto mem_req = new mem_req_t();
mem_req->tag = device_->mem_req_tag[b];
mem_req->addr = byte_addr;
@ -388,11 +393,11 @@ private:
std::unordered_map<int, std::stringstream> print_bufs_;
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS];
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS];
std::queue<mem_req_t*> dram_queue_[PLATFORM_MEMORY_BANKS];
std::queue<mem_req_t*> dram_queue_[PLATFORM_MEMORY_NUM_BANKS];
std::array<bool, PLATFORM_MEMORY_BANKS> mem_rd_rsp_ready_;
std::array<bool, PLATFORM_MEMORY_NUM_BANKS> mem_rd_rsp_ready_;
DramSim dram_sim_;

View file

@ -14,9 +14,9 @@
`include "VX_define.vh"
module rtlsim_shim import VX_gpu_pkg::*; #(
parameter MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH,
parameter MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH,
parameter MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS,
parameter MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
parameter MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS),
parameter MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS,
parameter MEM_TAG_WIDTH = 64
) (
`SCOPE_IO_DECL
@ -159,7 +159,7 @@ module rtlsim_shim import VX_gpu_pkg::*; #(
.TAG_WIDTH_OUT (MEM_TAG_WIDTH),
.NUM_PORTS_IN (`VX_MEM_PORTS),
.NUM_BANKS_OUT (MEM_NUM_BANKS),
.INTERLEAVE (0),
.INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || MEM_NUM_BANKS > 1) ? 2 : 0)
) mem_bank_adapter (

View file

@ -43,8 +43,13 @@ public:
char sname[100];
snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
uint32_t wsel_bits = log2ceil(config_.line_size);
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, (1 << config.B), wsel_bits);
uint32_t lg2_line_size = log2ceil(config_.line_size);
uint32_t num_banks = 1 << config.B;
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, num_banks, 1,
[lg2_line_size, num_banks](const MemCrossBar::ReqType& req) {
// Custom logic to calculate the output index using bank interleaving
return (uint32_t)((req.addr >> lg2_line_size) & (num_banks-1));
});
for (uint32_t i = 0; i < config.num_reqs; ++i) {
simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i));
mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i));

View file

@ -30,7 +30,6 @@ private:
MemCrossBar::Ptr mem_xbar_;
DramSim dram_sim_;
mutable PerfStats perf_stats_;
struct DramCallbackArgs {
MemSim::Impl* memsim;
MemReq request;
@ -41,11 +40,15 @@ public:
Impl(MemSim* simobject, const Config& config)
: simobject_(simobject)
, config_(config)
, dram_sim_(MEM_CLOCK_RATIO)
, dram_sim_(config.num_banks, config.block_size, config.clock_ratio)
{
char sname[100];
snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_ports, config.num_banks);
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_ports, config.num_banks, 1,
[lg2_block_size = log2ceil(config.block_size), num_banks = config.num_banks](const MemCrossBar::ReqType& req) {
// Custom logic to calculate the output index using bank interleaving
return (uint32_t)((req.addr >> lg2_block_size) & (num_banks-1));
});
for (uint32_t i = 0; i < config.num_ports; ++i) {
simobject->MemReqPorts.at(i).bind(&mem_xbar_->ReqIn.at(i));
mem_xbar_->RspIn.at(i).bind(&simobject->MemRspPorts.at(i));
@ -74,16 +77,15 @@ public:
auto& mem_req = mem_xbar_->ReqOut.at(i).front();
// try to enqueue the request to the memory system
// enqueue the request to the memory system
auto req_args = new DramCallbackArgs{this, mem_req, i};
auto enqueue_success = dram_sim_.send_request(
mem_req.write,
dram_sim_.send_request(
mem_req.addr,
0,
mem_req.write,
[](void* arg) {
auto rsp_args = reinterpret_cast<const DramCallbackArgs*>(arg);
// only send a response for read requests
if (!rsp_args->request.write) {
// only send a response for read requests
MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid};
rsp_args->memsim->mem_xbar_->RspOut.at(rsp_args->bank_id).push(mem_rsp, 1);
DT(3, rsp_args->memsim->simobject_->name() << "-mem-rsp[" << rsp_args->bank_id << "]: " << mem_rsp);
@ -93,14 +95,7 @@ public:
req_args
);
// check if the request was enqueued successfully
if (!enqueue_success) {
delete req_args;
continue;
}
DT(3, simobject_->name() << "-mem-req[" << i << "]: " << mem_req);
mem_xbar_->ReqOut.at(i).pop();
}
}

View file

@ -23,6 +23,8 @@ public:
// Construction parameters for the memory simulator.
struct Config {
// number of memory banks; used as the crossbar output count and handed to the DRAM model
uint32_t num_banks;
// number of request/response port pairs bound to the crossbar inputs
uint32_t num_ports;
// memory block size handed to the DRAM model; its log2 is the shift applied
// to a request address before the bank index is extracted
// (presumably in bytes - confirm against the DRAM model)
uint32_t block_size;
// clock ratio forwarded to the DRAM model
// (presumably memory clock relative to the core clock - confirm)
float clock_ratio;
};
struct PerfStats {

View file

@ -22,10 +22,14 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
{
SimPlatform::instance().initialize();
assert(PLATFORM_MEMORY_DATA_SIZE == MEM_BLOCK_SIZE);
// create memory simulator
memsim_ = MemSim::Create("dram", MemSim::Config{
PLATFORM_MEMORY_BANKS,
L3_MEM_PORTS
PLATFORM_MEMORY_NUM_BANKS,
L3_MEM_PORTS,
MEM_BLOCK_SIZE,
MEM_CLOCK_RATIO
});
// create clusters

View file

@ -398,6 +398,8 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
template <typename T>
class HashTable {
public:
typedef T DataType;
HashTable(uint32_t capacity)
: entries_(capacity)
, size_(0)
@ -470,6 +472,8 @@ private:
template <typename Type>
class Arbiter : public SimObject<Arbiter<Type>> {
public:
typedef Type ReqType;
std::vector<SimPort<Type>> Inputs;
std::vector<SimPort<Type>> Outputs;
@ -556,6 +560,8 @@ protected:
template <typename Type>
class CrossBar : public SimObject<CrossBar<Type>> {
public:
typedef Type ReqType;
std::vector<SimPort<Type>> Inputs;
std::vector<SimPort<Type>> Outputs;
@ -565,8 +571,8 @@ public:
ArbiterType type,
uint32_t num_inputs,
uint32_t num_outputs = 1,
uint32_t addr_start = 0,
uint32_t delay = 1
uint32_t delay = 1,
std::function<uint32_t(const Type& req)> output_sel = nullptr
)
: SimObject<CrossBar<Type>>(ctx, name)
, Inputs(num_inputs, this)
@ -576,12 +582,18 @@ public:
, grants_(num_outputs, 0)
, lg2_inputs_(log2ceil(num_inputs))
, lg2_outputs_(log2ceil(num_outputs))
, addr_start_(addr_start)
, collisions_(0) {
assert(delay != 0);
assert(num_inputs <= 64);
assert(num_outputs <= 64);
assert(ispow2(num_outputs));
if (output_sel != nullptr) {
output_sel_ = output_sel;
} else {
output_sel_ = [this](const Type& req) {
return (uint32_t)bit_getw(req.addr, 0, (lg2_outputs_-1));
};
}
}
void reset() {
@ -609,7 +621,8 @@ public:
auto& req = req_in.front();
uint32_t output_idx = 0;
if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
// select output index
output_idx = output_sel_(req);
// skip if input is not going to current output
if (output_idx != o)
continue;
@ -649,7 +662,7 @@ protected:
std::vector<uint32_t> grants_;
uint32_t lg2_inputs_;
uint32_t lg2_outputs_;
uint32_t addr_start_;
std::function<uint32_t(const Type& req)> output_sel_;
uint64_t collisions_;
};
@ -658,6 +671,9 @@ protected:
template <typename Req, typename Rsp>
class TxArbiter : public SimObject<TxArbiter<Req, Rsp>> {
public:
typedef Req ReqType;
typedef Rsp RspType;
std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn;
@ -771,6 +787,9 @@ protected:
template <typename Req, typename Rsp>
class TxCrossBar : public SimObject<TxCrossBar<Req, Rsp>> {
public:
typedef Req ReqType;
typedef Rsp RspType;
std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn;
@ -783,8 +802,8 @@ public:
ArbiterType type,
uint32_t num_inputs,
uint32_t num_outputs = 1,
uint32_t addr_start = 0,
uint32_t delay = 1
uint32_t delay = 1,
std::function<uint32_t(const Req& req)> output_sel = nullptr
)
: SimObject<TxCrossBar<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this)
@ -797,7 +816,6 @@ public:
, rsp_grants_(num_inputs, 0)
, lg2_inputs_(log2ceil(num_inputs))
, lg2_outputs_(log2ceil(num_outputs))
, addr_start_(addr_start)
, req_collisions_(0)
, rsp_collisions_(0) {
assert(delay != 0);
@ -805,6 +823,13 @@ public:
assert(num_outputs <= 64);
assert(ispow2(num_inputs));
assert(ispow2(num_outputs));
if (output_sel != nullptr) {
output_sel_ = output_sel;
} else {
output_sel_ = [this](const Req& req) {
return (uint32_t)bit_getw(req.addr, 0, (lg2_outputs_-1));
};
}
}
void reset() {
@ -875,7 +900,8 @@ public:
auto& req = req_in.front();
uint32_t output_idx = 0;
if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
// select output index
output_idx = output_sel_(req);
// skip if request is not going to current output
if (output_idx != o)
continue;
@ -929,7 +955,7 @@ protected:
std::vector<uint32_t> rsp_grants_;
uint32_t lg2_inputs_;
uint32_t lg2_outputs_;
uint32_t addr_start_;
std::function<uint32_t(const Req& req)> output_sel_;
uint64_t req_collisions_;
uint64_t rsp_collisions_;
};

View file

@ -31,21 +31,6 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
# Platform parameters
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
ifeq ($(XLEN),64)
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
else
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
endif
endif
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp

View file

@ -17,16 +17,16 @@ module vortex_afu_shim #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
) (
// System signals
input wire ap_clk,
input wire ap_rst_n,
// AXI4 master interface
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
// AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid,
@ -61,7 +61,7 @@ module vortex_afu_shim #(
.clk (ap_clk),
.reset (~ap_rst_n),
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
.s_axi_ctrl_awready (s_axi_ctrl_awready),

View file

@ -37,8 +37,6 @@
#include <iostream>
#define PLATFORM_MEMORY_DATA_SIZE (PLATFORM_MEMORY_DATA_WIDTH/8)
#ifndef MEM_CLOCK_RATIO
#define MEM_CLOCK_RATIO 1
#endif
@ -61,10 +59,10 @@
#define CPU_GPU_LATENCY 200
#if PLATFORM_MEMORY_DATA_WIDTH > 64
typedef VlWide<(PLATFORM_MEMORY_DATA_WIDTH/32)> Vl_m_data_t;
#if PLATFORM_MEMORY_DATA_SIZE > 8
typedef VlWide<(PLATFORM_MEMORY_DATA_SIZE/4)> Vl_m_data_t;
#else
#if PLATFORM_MEMORY_DATA_WIDTH > 32
#if PLATFORM_MEMORY_DATA_SIZE > 4
typedef QData Vl_m_data_t;
#else
typedef IData Vl_m_data_t;
@ -130,7 +128,7 @@ public:
Impl()
: device_(nullptr)
, ram_(nullptr)
, dram_sim_(MEM_CLOCK_RATIO)
, dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO)
, stop_(false)
#ifdef VCD_OUTPUT
, tfp_(nullptr)
@ -142,7 +140,7 @@ public:
if (future_.valid()) {
future_.wait();
}
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
delete mem_alloc_[b];
}
if (ram_) {
@ -178,16 +176,16 @@ public:
#endif
// calculate memory bank size
mem_bank_size_ = 1ull << PLATFORM_MEMORY_ADDR_WIDTH;
mem_bank_size_ = (1ull << PLATFORM_MEMORY_ADDR_WIDTH) / PLATFORM_MEMORY_NUM_BANKS;
// allocate RAM
ram_ = new RAM(0, RAM_PAGE_SIZE);
// initialize AXI memory interfaces
MP_M_AXI_MEM(PLATFORM_MEMORY_BANKS);
MP_M_AXI_MEM(PLATFORM_MEMORY_NUM_BANKS);
// initialize memory allocator
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
mem_alloc_[b] = new MemoryAllocator(0, mem_bank_size_, 4096, 64);
}
@ -209,13 +207,13 @@ public:
}
// Allocates `size` from the allocator that belongs to bank `bank_id`.
// Returns -1 when the bank index is outside the configured bank count;
// otherwise returns the result of that bank allocator's allocate() call.
// `addr` is forwarded to allocate() (presumably receives the allocated
// bank-relative address - confirm against MemoryAllocator).
int mem_alloc(uint64_t size, uint32_t bank_id, uint64_t* addr) {
if (bank_id >= PLATFORM_MEMORY_BANKS)
if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
return -1;
return mem_alloc_[bank_id]->allocate(size, addr);
}
// Releases the allocation at `addr` through the allocator of bank `bank_id`.
// Returns -1 when the bank index is outside the configured bank count;
// otherwise returns the result of that bank allocator's release() call.
int mem_free(uint32_t bank_id, uint64_t addr) {
if (bank_id >= PLATFORM_MEMORY_BANKS)
if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
return -1;
return mem_alloc_[bank_id]->release(addr);
}
@ -223,7 +221,7 @@ public:
int mem_write(uint32_t bank_id, uint64_t addr, uint64_t size, const void* data) {
std::lock_guard<std::mutex> guard(mutex_);
if (bank_id >= PLATFORM_MEMORY_BANKS)
if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
return -1;
uint64_t base_addr = bank_id * mem_bank_size_ + addr;
ram_->write(data, base_addr, size);
@ -238,7 +236,7 @@ public:
int mem_read(uint32_t bank_id, uint64_t addr, uint64_t size, void* data) {
std::lock_guard<std::mutex> guard(mutex_);
if (bank_id >= PLATFORM_MEMORY_BANKS)
if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
return -1;
uint64_t base_addr = bank_id * mem_bank_size_ + addr;
ram_->read(data, base_addr, size);
@ -321,7 +319,7 @@ private:
reqs.clear();
}
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
std::queue<mem_req_t*> empty;
std::swap(dram_queues_[b], empty);
}
@ -338,7 +336,7 @@ private:
device_->ap_rst_n = 1;
// this AXI device is always ready to accept new requests
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
*m_axi_mem_[b].arready = 1;
*m_axi_mem_[b].awready = 1;
*m_axi_mem_[b].wready = 1;
@ -358,19 +356,18 @@ private:
dram_sim_.tick();
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
if (!dram_queues_[b].empty()) {
auto mem_req = dram_queues_[b].front();
if (dram_sim_.send_request(mem_req->write, mem_req->addr, b, [](void* arg) {
dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) {
auto orig_req = reinterpret_cast<mem_req_t*>(arg);
if (orig_req->ready) {
delete orig_req;
} else {
orig_req->ready = true;
}
}, mem_req)) {
dram_queues_[b].pop();
}
}, mem_req);
dram_queues_[b].pop();
}
}
@ -411,7 +408,7 @@ private:
}
void axi_mem_bus_reset() {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
// read request address
*m_axi_mem_[b].arready = 0;
@ -435,14 +432,14 @@ private:
void axi_mem_bus_eval(bool clk) {
if (!clk) {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
m_axi_states_[b].read_rsp_ready = *m_axi_mem_[b].rready;
m_axi_states_[b].write_rsp_ready = *m_axi_mem_[b].bready;
}
return;
}
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) {
for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
// handle read responses
if (*m_axi_mem_[b].rvalid && m_axi_states_[b].read_rsp_ready) {
*m_axi_mem_[b].rvalid = 0;
@ -607,15 +604,15 @@ private:
std::mutex mutex_;
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS];
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS];
m_axi_mem_t m_axi_mem_[PLATFORM_MEMORY_BANKS];
m_axi_mem_t m_axi_mem_[PLATFORM_MEMORY_NUM_BANKS];
MemoryAllocator* mem_alloc_[PLATFORM_MEMORY_BANKS];
MemoryAllocator* mem_alloc_[PLATFORM_MEMORY_NUM_BANKS];
m_axi_state_t m_axi_states_[PLATFORM_MEMORY_BANKS];
m_axi_state_t m_axi_states_[PLATFORM_MEMORY_NUM_BANKS];
std::queue<mem_req_t*> dram_queues_[PLATFORM_MEMORY_BANKS];
std::queue<mem_req_t*> dram_queues_[PLATFORM_MEMORY_NUM_BANKS];
#ifdef VCD_OUTPUT
VerilatedVcdC* tfp_;