ramulator memory addressing bug fix + platform memory refactoring
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions

This commit is contained in:
tinebp 2025-01-26 06:28:51 -08:00
parent e80ee2c819
commit 22398c991d
33 changed files with 310 additions and 281 deletions

View file

@ -301,11 +301,11 @@ config2()
# test single-bank memory # test single-bank memory
if [ "$XLEN" == "64" ]; then if [ "$XLEN" == "64" ]; then
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=48" ./ci/blackbox.sh --driver=opae --app=mstress CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=48" ./ci/blackbox.sh --driver=xrt --app=mstress CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
else else
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32" ./ci/blackbox.sh --driver=opae --app=mstress CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32" ./ci/blackbox.sh --driver=xrt --app=mstress CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
fi fi
# test larger memory address # test larger memory address
@ -322,10 +322,10 @@ config2()
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress
# test memory ports # test memory ports
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8 CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8 CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=opae --app=mstress --threads=8 CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=opae --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=xrt --app=mstress --threads=8 CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=xrt --app=mstress --threads=8

View file

@ -172,8 +172,26 @@
`define L3_LINE_SIZE `MEM_BLOCK_SIZE `define L3_LINE_SIZE `MEM_BLOCK_SIZE
`endif `endif
`ifndef PLATFORM_MEMORY_BANKS // Platform memory parameters
`define PLATFORM_MEMORY_BANKS 2
`ifndef PLATFORM_MEMORY_NUM_BANKS
`define PLATFORM_MEMORY_NUM_BANKS 2
`endif
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
`ifdef XLEN_64
`define PLATFORM_MEMORY_ADDR_WIDTH 48
`else
`define PLATFORM_MEMORY_ADDR_WIDTH 32
`endif
`endif
`ifndef PLATFORM_MEMORY_DATA_SIZE
`define PLATFORM_MEMORY_DATA_SIZE 64
`endif
`ifndef PLATFORM_MEMORY_INTERLEAVE
`define PLATFORM_MEMORY_INTERLEAVE 1
`endif `endif
`ifdef XLEN_64 `ifdef XLEN_64
@ -656,9 +674,9 @@
// Number of Memory Ports // Number of Memory Ports
`ifndef L1_MEM_PORTS `ifndef L1_MEM_PORTS
`ifdef L1_DISABLE `ifdef L1_DISABLE
`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_BANKS) `define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`else `else
`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_BANKS) `define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`endif `endif
`endif `endif
@ -735,9 +753,9 @@
// Number of Memory Ports // Number of Memory Ports
`ifndef L2_MEM_PORTS `ifndef L2_MEM_PORTS
`ifdef L2_ENABLE `ifdef L2_ENABLE
`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_BANKS) `define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`else `else
`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_BANKS) `define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`endif `endif
`endif `endif
@ -796,9 +814,9 @@
// Number of Memory Ports // Number of Memory Ports
`ifndef L3_MEM_PORTS `ifndef L3_MEM_PORTS
`ifdef L3_ENABLE `ifdef L3_ENABLE
`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_BANKS) `define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`else `else
`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_BANKS) `define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`endif `endif
`endif `endif

View file

@ -193,7 +193,7 @@ module Vortex_axi import VX_gpu_pkg::*; #(
.TAG_WIDTH_OUT (AXI_TID_WIDTH), .TAG_WIDTH_OUT (AXI_TID_WIDTH),
.NUM_PORTS_IN (`VX_MEM_PORTS), .NUM_PORTS_IN (`VX_MEM_PORTS),
.NUM_BANKS_OUT (AXI_NUM_BANKS), .NUM_BANKS_OUT (AXI_NUM_BANKS),
.INTERLEAVE (0), .INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0), .REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 2 : 0) .RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter ( ) axi_adapter (

View file

@ -28,18 +28,18 @@
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE. // POSSIBILITY OF SUCH DAMAGE.
//`include "platform_afu_top_config.vh" `include "VX_define.vh"
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH `ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH (`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_DATA_WIDTH/8)) `define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH ((`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS)) - $clog2(`PLATFORM_MEMORY_DATA_SIZE))
`endif `endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH `ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH `PLATFORM_MEMORY_DATA_WIDTH `define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH (`PLATFORM_MEMORY_DATA_SIZE * 8)
`endif `endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH `ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH `PLATFORM_MEMORY_BURST_CNT_WIDTH `define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
`endif `endif
package local_mem_cfg_pkg; package local_mem_cfg_pkg;

View file

@ -11,18 +11,14 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
`include "VX_define.vh"
`ifndef NOPAE `ifndef NOPAE
`include "afu_json_info.vh" `include "afu_json_info.vh"
`else `else
`include "vortex_afu.vh" `include "vortex_afu.vh"
`endif `endif
`include "VX_define.vh"
`ifndef PLATFORM_MEMORY_INTERLEAVE
`define PLATFORM_MEMORY_INTERLEAVE 1
`endif
module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_gpu_pkg::*; #( module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_gpu_pkg::*; #(
parameter NUM_LOCAL_MEM_BANKS = 2 parameter NUM_LOCAL_MEM_BANKS = 2
) ( ) (

View file

@ -134,10 +134,12 @@ module VX_afu_ctrl #(
RSTATE_RESP = 2'd2, RSTATE_RESP = 2'd2,
RSTATE_WIDTH = 2; RSTATE_WIDTH = 2;
localparam MEMORY_BANK_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - `CLOG2(`PLATFORM_MEMORY_NUM_BANKS);
// device caps // device caps
wire [63:0] dev_caps = {8'b0, wire [63:0] dev_caps = {8'b0,
5'(`PLATFORM_MEMORY_ADDR_WIDTH-20), 5'(MEMORY_BANK_ADDR_WIDTH-20),
3'(`CLOG2(`PLATFORM_MEMORY_BANKS)), 3'(`CLOG2(`PLATFORM_MEMORY_NUM_BANKS)),
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0), 8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS), 16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS), 8'(`NUM_WARPS),

View file

@ -31,7 +31,7 @@ module VX_afu_wrap #(
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE `ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA), `REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else `else
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA), `REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif `endif
// AXI4-Lite slave interface // AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid, input wire s_axi_ctrl_awvalid,
@ -58,11 +58,7 @@ module VX_afu_wrap #(
output wire interrupt output wire interrupt
); );
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - $clog2(C_M_AXI_MEM_NUM_BANKS);
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH + $clog2(`PLATFORM_MEMORY_BANKS);
`else
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH;
`endif
typedef enum logic [1:0] { typedef enum logic [1:0] {
STATE_IDLE = 0, STATE_IDLE = 0,
@ -71,8 +67,8 @@ module VX_afu_wrap #(
STATE_DONE = 3 STATE_DONE = 3
} state_e; } state_e;
localparam PENDING_SIZEW = 12; // max outstanding requests size localparam PENDING_WR_SIZEW = 12; // max outstanding requests size
localparam C_M_AXI_MEM_NUM_BANKS_SW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1); localparam NUM_MEM_BANKS_SIZEW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS]; wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS]; wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS];
@ -108,11 +104,11 @@ module VX_afu_wrap #(
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE `ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON); `REPEAT (1, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`else `else
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON); `REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`endif `endif
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr; reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
reg [PENDING_SIZEW-1:0] vx_pending_writes; reg [PENDING_WR_SIZEW-1:0] vx_pending_writes;
reg vx_reset = 1; // asserted at initialization reg vx_reset = 1; // asserted at initialization
wire vx_busy; wire vx_busy;
@ -200,7 +196,7 @@ module VX_afu_wrap #(
end end
wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire; wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire;
wire [C_M_AXI_MEM_NUM_BANKS_SW-1:0] cur_wr_reqs, cur_wr_rsps; wire [NUM_MEM_BANKS_SIZEW-1:0] cur_wr_reqs, cur_wr_rsps;
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire
VX_axi_write_ack axi_write_ack ( VX_axi_write_ack axi_write_ack (
@ -224,14 +220,14 @@ module VX_afu_wrap #(
`POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire); `POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire);
`POP_COUNT(cur_wr_rsps, m_axi_wr_rsp_fire); `POP_COUNT(cur_wr_rsps, m_axi_wr_rsp_fire);
wire signed [C_M_AXI_MEM_NUM_BANKS_SW:0] reqs_sub = (C_M_AXI_MEM_NUM_BANKS_SW+1)'(cur_wr_reqs) - wire signed [NUM_MEM_BANKS_SIZEW:0] reqs_sub = (NUM_MEM_BANKS_SIZEW+1)'(cur_wr_reqs) -
(C_M_AXI_MEM_NUM_BANKS_SW+1)'(cur_wr_rsps); (NUM_MEM_BANKS_SIZEW+1)'(cur_wr_rsps);
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
vx_pending_writes <= '0; vx_pending_writes <= '0;
end else begin end else begin
vx_pending_writes <= vx_pending_writes + PENDING_SIZEW'(reqs_sub); vx_pending_writes <= vx_pending_writes + PENDING_WR_SIZEW'(reqs_sub);
end end
end end
@ -270,7 +266,7 @@ module VX_afu_wrap #(
.ap_ready (ap_ready), .ap_ready (ap_ready),
.ap_idle (ap_idle), .ap_idle (ap_idle),
.interrupt (interrupt), .interrupt (interrupt),
.ap_ctrl_read (ap_ctrl_read), .ap_ctrl_read (ap_ctrl_read),
`ifdef SCOPE `ifdef SCOPE

View file

@ -17,12 +17,12 @@ module vortex_afu #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8, parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32, parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH, parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH, parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
parameter C_M_AXI_MEM_ADDR_WIDTH = 64, parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE `ifdef PLATFORM_MERGED_MEMORY_INTERFACE
parameter C_M_AXI_MEM_NUM_BANKS = 1 parameter C_M_AXI_MEM_NUM_BANKS = 1
`else `else
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
`endif `endif
) ( ) (
// System signals // System signals
@ -33,7 +33,7 @@ module vortex_afu #(
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE `ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA), `REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else `else
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA), `REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif `endif
// AXI4-Lite slave interface // AXI4-Lite slave interface
@ -75,7 +75,7 @@ module vortex_afu #(
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE `ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_ARGS, REPEAT_COMMA), `REPEAT (1, AXI_MEM_ARGS, REPEAT_COMMA),
`else `else
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_ARGS, REPEAT_COMMA), `REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
`endif `endif
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid), .s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
.s_axi_ctrl_awready (s_axi_ctrl_awready), .s_axi_ctrl_awready (s_axi_ctrl_awready),
@ -94,7 +94,7 @@ module vortex_afu #(
.s_axi_ctrl_rready (s_axi_ctrl_rready), .s_axi_ctrl_rready (s_axi_ctrl_rready),
.s_axi_ctrl_rdata (s_axi_ctrl_rdata), .s_axi_ctrl_rdata (s_axi_ctrl_rdata),
.s_axi_ctrl_rresp (s_axi_ctrl_rresp), .s_axi_ctrl_rresp (s_axi_ctrl_rresp),
.s_axi_ctrl_bvalid (s_axi_ctrl_bvalid), .s_axi_ctrl_bvalid (s_axi_ctrl_bvalid),
.s_axi_ctrl_bready (s_axi_ctrl_bready), .s_axi_ctrl_bready (s_axi_ctrl_bready),
.s_axi_ctrl_bresp (s_axi_ctrl_bresp), .s_axi_ctrl_bresp (s_axi_ctrl_bresp),

View file

@ -14,18 +14,6 @@
`ifndef VORTEX_AFU_VH `ifndef VORTEX_AFU_VH
`define VORTEX_AFU_VH `define VORTEX_AFU_VH
`ifndef PLATFORM_MEMORY_BANKS
`define PLATFORM_MEMORY_BANKS 2
`endif
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
`define PLATFORM_MEMORY_ADDR_WIDTH 31
`endif
`ifndef PLATFORM_MEMORY_DATA_WIDTH
`define PLATFORM_MEMORY_DATA_WIDTH 512
`endif
`ifndef PLATFORM_MEMORY_OFFSET `ifndef PLATFORM_MEMORY_OFFSET
`define PLATFORM_MEMORY_OFFSET 0 `define PLATFORM_MEMORY_OFFSET 0
`endif `endif

View file

@ -221,7 +221,7 @@ module VX_async_ram_patch #(
VX_placeholder #( VX_placeholder #(
.O (1) .O (1)
) placeholder2 ( ) placeholder2 (
.in (), .in (1'b0),
.out (is_raddr_reg) .out (is_raddr_reg)
); );
wire [DATAW-1:0] rdata_a; wire [DATAW-1:0] rdata_a;

View file

@ -280,7 +280,13 @@ module VX_axi_adapter #(
end end
assign m_axi_arvalid[i] = req_xbar_valid_out[i] && ~xbar_rw_out; assign m_axi_arvalid[i] = req_xbar_valid_out[i] && ~xbar_rw_out;
assign m_axi_araddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE;
// convert address to byte-addressable space
// Both variants OR together two non-overlapping fields built from the request
// address (xbar_addr_out) and the generate index i, each widened to ADDR_WIDTH_OUT.
// NOTE(review): i is presumably the output bank index of this generate iteration - confirm against the enclosing loop.
if (INTERLEAVE) begin : g_m_axi_araddr_i
// interleaved layout: i is placed directly above the LOG2_DATA_SIZE offset bits,
// and xbar_addr_out is placed above it (shifted by a further BANK_SEL_BITS).
assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << (BANK_SEL_BITS + LOG2_DATA_SIZE)) | (ADDR_WIDTH_OUT'(i) << LOG2_DATA_SIZE);
end else begin : g_m_axi_araddr_ni
// non-interleaved layout: xbar_addr_out is placed directly above the offset bits,
// and i is placed BANK_ADDR_WIDTH bits higher.
assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE) | (ADDR_WIDTH_OUT'(i) << (BANK_ADDR_WIDTH + LOG2_DATA_SIZE));
end
assign m_axi_arid[i] = TAG_WIDTH_OUT'(xbar_tag_r_out); assign m_axi_arid[i] = TAG_WIDTH_OUT'(xbar_tag_r_out);
assign m_axi_arlen[i] = 8'b00000000; assign m_axi_arlen[i] = 8'b00000000;
assign m_axi_arsize[i] = 3'(LOG2_DATA_SIZE); assign m_axi_arsize[i] = 3'(LOG2_DATA_SIZE);

View file

@ -7,22 +7,6 @@ include ../../common.mk
# AFU parameters # AFU parameters
CONFIGS += -DNOPAE CONFIGS += -DNOPAE
CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
ifeq ($(XLEN),64)
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
else
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
endif
endif
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
endif
ifeq (,$(findstring PLATFORM_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BURST_CNT_WIDTH=4
endif
#CONFIGS += -DNUM_CORES=2 #CONFIGS += -DNUM_CORES=2
#CONFIGS += -DNUM_WARPS=32 #CONFIGS += -DNUM_WARPS=32

View file

@ -99,7 +99,7 @@ ifdef PERF
endif endif
# ast dump flags # ast dump flags
XML_CFLAGS = $(filter-out -DSYNTHESIS -DQUARTUS, $(CFLAGS)) $(RTL_PKGS) -I$(AFU_DIR)/ccip -I$(DPI_DIR) -DPLATFORM_PROVIDES_LOCAL_MEMORY -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32 -DPLATFORM_MEMORY_DATA_WIDTH=512 -DPLATFORM_MEMORY_BURST_CNT_WIDTH=4 -DNOPAE -DSV_DPI XML_CFLAGS = $(filter-out -DSYNTHESIS -DQUARTUS, $(CFLAGS)) $(RTL_PKGS) -I$(AFU_DIR)/ccip -I$(DPI_DIR) -DPLATFORM_PROVIDES_LOCAL_MEMORY -DPLATFORM_MEMORY_NUM_BANKS=1 -DNOPAE -DSV_DPI
all: swconfig ip-gen setup build all: swconfig ip-gen setup build

View file

@ -52,7 +52,7 @@ foreach def $vdefines_list {
if { $name == "CHIPSCOPE" } { if { $name == "CHIPSCOPE" } {
set chipscope 1 set chipscope 1
} }
if { $name == "PLATFORM_MEMORY_BANKS" } { if { $name == "PLATFORM_MEMORY_NUM_BANKS" } {
set num_banks [lindex $fields 1] set num_banks [lindex $fields 1]
} }
if { $name == "PLATFORM_MERGED_MEMORY_INTERFACE" } { if { $name == "PLATFORM_MERGED_MEMORY_INTERFACE" } {

View file

@ -5,31 +5,36 @@ CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
ifeq ($(DEV_ARCH), zynquplus) ifeq ($(DEV_ARCH), zynquplus)
# zynquplus # zynquplus
CONFIGS += -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32 CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
else ifeq ($(DEV_ARCH), versal) else ifeq ($(DEV_ARCH), versal)
# versal # versal
CONFIGS += -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32 CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
ifneq ($(findstring xilinx_vck5000,$(XSA)),) ifneq ($(findstring xilinx_vck5000,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_OFFSET=40'hC000000000 CONFIGS += -DPLATFORM_MEMORY_OFFSET=40'hC000000000
endif endif
else else
# alveo # alveo
ifneq ($(findstring xilinx_u55c,$(XSA)),) ifneq ($(findstring xilinx_u55c,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=29 # 16 GB of HBM2 with 32 channels (512 MB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=34
CONFIGS += -DPLATFORM_MERGED_MEMORY_INTERFACE CONFIGS += -DPLATFORM_MERGED_MEMORY_INTERFACE
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31] VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
#VPP_FLAGS += $(foreach i,$(shell seq 0 31), --connectivity.sp vortex_afu_1.m_axi_mem_$(i):HBM[$(i)]) #VPP_FLAGS += $(foreach i,$(shell seq 0 31), --connectivity.sp vortex_afu_1.m_axi_mem_$(i):HBM[$(i)])
else ifneq ($(findstring xilinx_u50,$(XSA)),) else ifneq ($(findstring xilinx_u50,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=28 # 8 GB of HBM2 with 32 channels (256 MB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31] VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
else ifneq ($(findstring xilinx_u280,$(XSA)),) else ifneq ($(findstring xilinx_u280,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=28 # 8 GB of HBM2 with 32 channels (256 MB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31] VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
else ifneq ($(findstring xilinx_u250,$(XSA)),) else ifneq ($(findstring xilinx_u250,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=34 # 64 GB of DDR4 with 4 channels (16 GB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36
else ifneq ($(findstring xilinx_u200,$(XSA)),) else ifneq ($(findstring xilinx_u200,$(XSA)),)
CONFIGS += -DPLATFORM_MEMORY_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=34 # 64 GB of DDR4 with 4 channels (16 GB per channel)
CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36
else else
CONFIGS += -DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32 CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32
endif endif
endif endif

View file

@ -78,10 +78,10 @@ public:
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
break; break;
case VX_CAPS_NUM_MEM_BANKS: case VX_CAPS_NUM_MEM_BANKS:
_value = PLATFORM_MEMORY_BANKS; _value = PLATFORM_MEMORY_NUM_BANKS;
break; break;
case VX_CAPS_MEM_BANK_SIZE: case VX_CAPS_MEM_BANK_SIZE:
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_BANKS); _value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS);
break; break;
default: default:
std::cout << "invalid caps id: " << caps_id << std::endl; std::cout << "invalid caps id: " << caps_id << std::endl;

View file

@ -113,10 +113,10 @@ public:
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
break; break;
case VX_CAPS_NUM_MEM_BANKS: case VX_CAPS_NUM_MEM_BANKS:
_value = PLATFORM_MEMORY_BANKS; _value = PLATFORM_MEMORY_NUM_BANKS;
break; break;
case VX_CAPS_MEM_BANK_SIZE: case VX_CAPS_MEM_BANK_SIZE:
_value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_BANKS); _value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS);
break; break;
default: default:
std::cout << "invalid caps id: " << caps_id << std::endl; std::cout << "invalid caps id: " << caps_id << std::endl;

View file

@ -29,19 +29,54 @@ using namespace vortex;
class DramSim::Impl { class DramSim::Impl {
private: private:
// One memory access queued in pending_reqs_ until the Ramulator frontend accepts it.
// Field order matters: entries are created with positional brace-initialisation.
struct mem_req_t {
uint64_t addr; // address handed to receive_external_requests()
bool is_write; // true selects Request::Type::Write, false Request::Type::Read
ResponseCallback callback; // completion hook; may be null (then nothing is invoked)
void* arg; // opaque pointer passed back to callback
};
Ramulator::IFrontEnd* ramulator_frontend_; Ramulator::IFrontEnd* ramulator_frontend_;
Ramulator::IMemorySystem* ramulator_memorysystem_; Ramulator::IMemorySystem* ramulator_memorysystem_;
uint32_t cpu_channel_size_;
uint64_t cpu_cycles_;
uint32_t scaled_dram_cycles_;
static const uint32_t tick_cycles_ = 1000;
static const uint32_t dram_channel_size_ = 16; // 128 bits
std::queue<mem_req_t> pending_reqs_;
// Tries to submit the request at the head of pending_reqs_ to the Ramulator frontend.
//
// At most one request is attempted per call. If the frontend rejects it
// (receive_external_requests() returns false), the request stays at the head of
// the queue and is retried on the next call.
//
// Completion signalling:
//  - reads : the callback is wrapped and handed to Ramulator, which invokes it.
//  - writes: Ramulator does not handle write responses, so the callback is
//            invoked here as soon as the request is accepted. No hook is given
//            to Ramulator for writes, so the callback can never run twice.
void handle_pending_requests() {
  if (pending_reqs_.empty())
    return;

  const auto& req = pending_reqs_.front();
  const auto req_type = req.is_write ? Ramulator::Request::Type::Write
                                     : Ramulator::Request::Type::Read;

  // Capture by copy (not std::move): the queue entry is still read below and
  // must stay intact if the frontend rejects the request and we retry later.
  std::function<void(Ramulator::Request&)> callback = nullptr;
  if (!req.is_write && req.callback) {
    callback = [req_callback = req.callback, req_arg = req.arg](Ramulator::Request& /*dram_req*/) {
      req_callback(req_arg);
    };
  }

  if (!ramulator_frontend_->receive_external_requests(req_type, req.addr, 0, callback))
    return; // not accepted this cycle: leave it queued

  if (req.is_write && req.callback) {
    // Ramulator does not handle write responses, so we fire the callback ourselves.
    req.callback(req.arg);
  }
  pending_reqs_.pop();
}
public: public:
Impl(int clock_ratio) { Impl(uint32_t num_channels, uint32_t channel_size, float clock_ratio) {
YAML::Node dram_config; YAML::Node dram_config;
dram_config["Frontend"]["impl"] = "GEM5"; dram_config["Frontend"]["impl"] = "GEM5";
dram_config["MemorySystem"]["impl"] = "GenericDRAM"; dram_config["MemorySystem"]["impl"] = "GenericDRAM";
dram_config["MemorySystem"]["clock_ratio"] = clock_ratio; dram_config["MemorySystem"]["clock_ratio"] = 1;
dram_config["MemorySystem"]["DRAM"]["impl"] = "HBM2"; dram_config["MemorySystem"]["DRAM"]["impl"] = "HBM2";
dram_config["MemorySystem"]["DRAM"]["org"]["preset"] = "HBM2_8Gb"; dram_config["MemorySystem"]["DRAM"]["org"]["preset"] = "HBM2_8Gb";
dram_config["MemorySystem"]["DRAM"]["org"]["density"] = 8192; dram_config["MemorySystem"]["DRAM"]["org"]["density"] = 8192;
dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = 8; dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = num_channels;
dram_config["MemorySystem"]["DRAM"]["timing"]["preset"] = "HBM2_2Gbps"; dram_config["MemorySystem"]["DRAM"]["timing"]["preset"] = "HBM2_2Gbps";
dram_config["MemorySystem"]["Controller"]["impl"] = "Generic"; dram_config["MemorySystem"]["Controller"]["impl"] = "Generic";
dram_config["MemorySystem"]["Controller"]["Scheduler"]["impl"] = "FRFCFS"; dram_config["MemorySystem"]["Controller"]["Scheduler"]["impl"] = "FRFCFS";
@ -59,6 +94,10 @@ public:
ramulator_memorysystem_ = Ramulator::Factory::create_memory_system(dram_config); ramulator_memorysystem_ = Ramulator::Factory::create_memory_system(dram_config);
ramulator_frontend_->connect_memory_system(ramulator_memorysystem_); ramulator_frontend_->connect_memory_system(ramulator_memorysystem_);
ramulator_memorysystem_->connect_frontend(ramulator_frontend_); ramulator_memorysystem_->connect_frontend(ramulator_frontend_);
cpu_channel_size_ = channel_size;
scaled_dram_cycles_ = static_cast<uint64_t>(clock_ratio * tick_cycles_);
this->reset();
} }
~Impl() { ~Impl() {
@ -66,41 +105,49 @@ public:
auto original_buf = std::cout.rdbuf(); auto original_buf = std::cout.rdbuf();
std::cout.rdbuf(nullstream.rdbuf()); std::cout.rdbuf(nullstream.rdbuf());
ramulator_frontend_->finalize(); ramulator_frontend_->finalize();
ramulator_memorysystem_->finalize(); ramulator_memorysystem_->finalize();
std::cout.rdbuf(original_buf); std::cout.rdbuf(original_buf);
} }
void reset() { void reset() {
//-- cpu_cycles_ = 0;
} }
void tick() { void tick() {
ramulator_memorysystem_->tick(); cpu_cycles_ += tick_cycles_;
while (cpu_cycles_ >= scaled_dram_cycles_) {
this->handle_pending_requests();
ramulator_memorysystem_->tick();
cpu_cycles_ -= scaled_dram_cycles_;
}
} }
bool send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback response_cb, void* arg) { void send_request(uint64_t addr, bool is_write, ResponseCallback response_cb, void* arg) {
if (!ramulator_frontend_->receive_external_requests( // enqueue the request
is_write ? Ramulator::Request::Type::Write : Ramulator::Request::Type::Read, if (cpu_channel_size_ > dram_channel_size_) {
addr, uint32_t n = cpu_channel_size_ / dram_channel_size_;
source_id, for (uint32_t i = 0; i < n; ++i) {
[callback_ = std::move(response_cb), arg_ = std::move(arg)](Ramulator::Request& /*dram_req*/) { uint64_t dram_byte_addr = (addr / cpu_channel_size_) * dram_channel_size_ + (i * dram_channel_size_);
callback_(arg_); if (i == 0) {
pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg});
} else {
pending_reqs_.push({dram_byte_addr, is_write, nullptr, nullptr});
}
} }
)) { } else if (cpu_channel_size_ < dram_channel_size_) {
return false; uint64_t dram_byte_addr = (addr / cpu_channel_size_) * dram_channel_size_;
pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg});
} else {
uint64_t dram_byte_addr = addr;
pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg});
} }
if (is_write) { }
// Ramulator does not handle write responses, so we call the callback ourselves
response_cb(arg);
}
return true;
}
}; };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
DramSim::DramSim(int clock_ratio) DramSim::DramSim(uint32_t num_channels, uint32_t channel_size, float clock_ratio)
: impl_(new Impl(clock_ratio)) : impl_(new Impl(num_channels, channel_size, clock_ratio))
{} {}
DramSim::~DramSim() { DramSim::~DramSim() {
@ -115,6 +162,6 @@ void DramSim::tick() {
impl_->tick(); impl_->tick();
} }
bool DramSim::send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback callback, void* arg) { void DramSim::send_request(uint64_t addr, bool is_write, ResponseCallback callback, void* arg) {
return impl_->send_request(is_write, addr, source_id, callback, arg); impl_->send_request(addr, is_write, callback, arg);
} }

View file

@ -19,14 +19,15 @@ class DramSim {
public: public:
typedef void (*ResponseCallback)(void *arg); typedef void (*ResponseCallback)(void *arg);
DramSim(int clock_ratio); DramSim(uint32_t num_channels, uint32_t channel_size, float clock_ratio);
~DramSim(); ~DramSim();
void reset(); void reset();
void tick(); void tick();
bool send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback response_cb, void* arg); // addr: per-channel block address
void send_request(uint64_t addr, bool is_write, ResponseCallback response_cb, void* arg);
private: private:
class Impl; class Impl;

View file

@ -31,24 +31,6 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
# Platform parameters
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
ifeq ($(XLEN),64)
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
else
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
endif
endif
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
endif
ifeq (,$(findstring PLATFORM_MEMORY_BURST_CNT_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BURST_CNT_WIDTH=4
endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS) DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp

View file

@ -35,8 +35,6 @@
#include <unordered_map> #include <unordered_map>
#include <util.h> #include <util.h>
#define PLATFORM_MEMORY_DATA_SIZE (PLATFORM_MEMORY_DATA_WIDTH/8)
#ifndef MEM_CLOCK_RATIO #ifndef MEM_CLOCK_RATIO
#define MEM_CLOCK_RATIO 1 #define MEM_CLOCK_RATIO 1
#endif #endif
@ -66,6 +64,8 @@
using namespace vortex; using namespace vortex;
static uint32_t g_mem_bank_addr_width = (PLATFORM_MEMORY_ADDR_WIDTH - log2ceil(PLATFORM_MEMORY_NUM_BANKS));
static uint64_t timestamp = 0; static uint64_t timestamp = 0;
double sc_time_stamp() { double sc_time_stamp() {
@ -95,7 +95,7 @@ public:
Impl() Impl()
: device_(nullptr) : device_(nullptr)
, ram_(nullptr) , ram_(nullptr)
, dram_sim_(MEM_CLOCK_RATIO) , dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO)
, stop_(false) , stop_(false)
, host_buffer_ids_(0) , host_buffer_ids_(0)
#ifdef VCD_OUTPUT #ifdef VCD_OUTPUT
@ -146,9 +146,6 @@ public:
// allocate RAM // allocate RAM
ram_ = new RAM(0, RAM_PAGE_SIZE); ram_ = new RAM(0, RAM_PAGE_SIZE);
// calculate memory bank size
mem_bank_size_ = 1ull << PLATFORM_MEMORY_ADDR_WIDTH;
// reset the device // reset the device
this->reset(); this->reset();
@ -274,16 +271,15 @@ private:
if (!dram_queue_.empty()) { if (!dram_queue_.empty()) {
auto mem_req = dram_queue_.front(); auto mem_req = dram_queue_.front();
if (dram_sim_.send_request(mem_req->write, mem_req->addr, mem_req->bank_id, [](void* arg) { dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) {
auto orig_req = reinterpret_cast<mem_req_t*>(arg); auto orig_req = reinterpret_cast<mem_req_t*>(arg);
if (orig_req->ready) { if (orig_req->ready) {
delete orig_req; delete orig_req;
} else { } else {
orig_req->ready = true; orig_req->ready = true;
} }
}, mem_req)) { }, mem_req);
dram_queue_.pop(); dram_queue_.pop();
}
} }
dram_sim_.tick(); dram_sim_.tick();
@ -407,14 +403,14 @@ private:
} }
void avs_bus_reset() { void avs_bus_reset() {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
device_->avs_readdatavalid[b] = 0; device_->avs_readdatavalid[b] = 0;
device_->avs_waitrequest[b] = 0; device_->avs_waitrequest[b] = 0;
} }
} }
void avs_bus_eval() { void avs_bus_eval() {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
// process memory responses // process memory responses
device_->avs_readdatavalid[b] = 0; device_->avs_readdatavalid[b] = 0;
if (!pending_mem_reqs_[b].empty() if (!pending_mem_reqs_[b].empty()
@ -430,7 +426,12 @@ private:
// process memory requests // process memory requests
assert(!device_->avs_read[b] || !device_->avs_write[b]); assert(!device_->avs_read[b] || !device_->avs_write[b]);
uint64_t byte_addr = b * mem_bank_size_ + uint64_t(device_->avs_address[b]) * PLATFORM_MEMORY_DATA_SIZE; #if PLATFORM_MEMORY_INTERLEAVE == 1
uint64_t byte_addr = (uint64_t(device_->avs_address[b]) * PLATFORM_MEMORY_NUM_BANKS + b) * PLATFORM_MEMORY_DATA_SIZE;
#else
uint64_t byte_addr = (uint64_t(device_->avs_address[b]) + (b << g_mem_bank_addr_width)) * PLATFORM_MEMORY_DATA_SIZE;
#endif
if (device_->avs_write[b]) { if (device_->avs_write[b]) {
// process write request // process write request
uint64_t byteen = device_->avs_byteenable[b]; uint64_t byteen = device_->avs_byteenable[b];
@ -515,9 +516,8 @@ private:
std::unordered_map<int64_t, host_buffer_t> host_buffers_; std::unordered_map<int64_t, host_buffer_t> host_buffers_;
uint64_t host_buffer_ids_; uint64_t host_buffer_ids_;
uint64_t mem_bank_size_;
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS]; std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS];
std::list<cci_rd_req_t> cci_reads_; std::list<cci_rd_req_t> cci_reads_;
std::list<cci_wr_req_t> cci_writes_; std::list<cci_wr_req_t> cci_writes_;

View file

@ -78,22 +78,22 @@ module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; (
output t_ccip_mmioData af2cp_sTxPort_c2_data, output t_ccip_mmioData af2cp_sTxPort_c2_data,
// Avalon signals for local memory access // Avalon signals for local memory access
output t_local_mem_data avs_writedata [`PLATFORM_MEMORY_BANKS], output t_local_mem_data avs_writedata [`PLATFORM_MEMORY_NUM_BANKS],
input t_local_mem_data avs_readdata [`PLATFORM_MEMORY_BANKS], input t_local_mem_data avs_readdata [`PLATFORM_MEMORY_NUM_BANKS],
output t_local_mem_addr avs_address [`PLATFORM_MEMORY_BANKS], output t_local_mem_addr avs_address [`PLATFORM_MEMORY_NUM_BANKS],
input logic avs_waitrequest [`PLATFORM_MEMORY_BANKS], input logic avs_waitrequest [`PLATFORM_MEMORY_NUM_BANKS],
output logic avs_write [`PLATFORM_MEMORY_BANKS], output logic avs_write [`PLATFORM_MEMORY_NUM_BANKS],
output logic avs_read [`PLATFORM_MEMORY_BANKS], output logic avs_read [`PLATFORM_MEMORY_NUM_BANKS],
output t_local_mem_byte_mask avs_byteenable [`PLATFORM_MEMORY_BANKS], output t_local_mem_byte_mask avs_byteenable [`PLATFORM_MEMORY_NUM_BANKS],
output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_MEMORY_BANKS], output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_MEMORY_NUM_BANKS],
input avs_readdatavalid [`PLATFORM_MEMORY_BANKS] input avs_readdatavalid [`PLATFORM_MEMORY_NUM_BANKS]
); );
t_if_ccip_Rx cp2af_sRxPort; t_if_ccip_Rx cp2af_sRxPort;
t_if_ccip_Tx af2cp_sTxPort; t_if_ccip_Tx af2cp_sTxPort;
vortex_afu #( vortex_afu #(
.NUM_LOCAL_MEM_BANKS(`PLATFORM_MEMORY_BANKS) .NUM_LOCAL_MEM_BANKS(`PLATFORM_MEMORY_NUM_BANKS)
) afu ( ) afu (
.clk(clk), .clk(clk),
.reset(reset), .reset(reset),

View file

@ -24,21 +24,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_AFU
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR
# Platform parameters
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
ifeq ($(XLEN),64)
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
else
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
endif
endif
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS) DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv

View file

@ -35,8 +35,6 @@
#include <dram_sim.h> #include <dram_sim.h>
#include <util.h> #include <util.h>
#define PLATFORM_MEMORY_DATA_SIZE (PLATFORM_MEMORY_DATA_WIDTH/8)
#ifndef MEM_CLOCK_RATIO #ifndef MEM_CLOCK_RATIO
#define MEM_CLOCK_RATIO 1 #define MEM_CLOCK_RATIO 1
#endif #endif
@ -66,6 +64,8 @@ typedef uint64_t Word;
using namespace vortex; using namespace vortex;
static uint32_t g_mem_bank_addr_width = (PLATFORM_MEMORY_ADDR_WIDTH - log2ceil(PLATFORM_MEMORY_NUM_BANKS));
static uint64_t timestamp = 0; static uint64_t timestamp = 0;
double sc_time_stamp() { double sc_time_stamp() {
@ -93,7 +93,7 @@ void sim_trace_enable(bool enable) {
class Processor::Impl { class Processor::Impl {
public: public:
Impl() : dram_sim_(MEM_CLOCK_RATIO) { Impl() : dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO) {
// force random values for uninitialized signals // force random values for uninitialized signals
Verilated::randReset(VERILATOR_RESET_VALUE); Verilated::randReset(VERILATOR_RESET_VALUE);
Verilated::randSeed(50); Verilated::randSeed(50);
@ -154,7 +154,7 @@ public:
// start // start
device_->reset = 0; device_->reset = 0;
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
device_->mem_req_ready[b] = 1; device_->mem_req_ready[b] = 1;
} }
@ -195,7 +195,7 @@ private:
reqs.clear(); reqs.clear();
} }
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
std::queue<mem_req_t*> empty; std::queue<mem_req_t*> empty;
std::swap(dram_queue_[b], empty); std::swap(dram_queue_[b], empty);
} }
@ -224,17 +224,15 @@ private:
dram_sim_.tick(); dram_sim_.tick();
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
if (!dram_queue_[b].empty()) { if (!dram_queue_[b].empty()) {
auto mem_req = dram_queue_[b].front(); auto mem_req = dram_queue_[b].front();
if (dram_sim_.send_request(mem_req->write, mem_req->addr, b, [](void* arg) { dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) {
// mark completed request as ready // mark completed request as ready
auto orig_req = reinterpret_cast<mem_req_t*>(arg); auto orig_req = reinterpret_cast<mem_req_t*>(arg);
orig_req->ready = true; orig_req->ready = true;
}, mem_req)) { }, mem_req);
// was successfully sent to dram, remove from queue dram_queue_[b].pop();
dram_queue_[b].pop();
}
} }
} }
@ -254,7 +252,7 @@ private:
} }
void mem_bus_reset() { void mem_bus_reset() {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
device_->mem_req_ready[b] = 0; device_->mem_req_ready[b] = 0;
device_->mem_rsp_valid[b] = 0; device_->mem_rsp_valid[b] = 0;
} }
@ -262,13 +260,13 @@ private:
void mem_bus_eval(bool clk) { void mem_bus_eval(bool clk) {
if (!clk) { if (!clk) {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
mem_rd_rsp_ready_[b] = device_->mem_rsp_ready[b]; mem_rd_rsp_ready_[b] = device_->mem_rsp_ready[b];
} }
return; return;
} }
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
// process memory responses // process memory responses
if (device_->mem_rsp_valid[b] && mem_rd_rsp_ready_[b]) { if (device_->mem_rsp_valid[b] && mem_rd_rsp_ready_[b]) {
device_->mem_rsp_valid[b] = 0; device_->mem_rsp_valid[b] = 0;
@ -293,11 +291,16 @@ private:
// process memory requests // process memory requests
if (device_->mem_req_valid[b] && device_->mem_req_ready[b]) { if (device_->mem_req_valid[b] && device_->mem_req_ready[b]) {
uint64_t byte_addr = (device_->mem_req_addr[b] * PLATFORM_MEMORY_DATA_SIZE); #if PLATFORM_MEMORY_INTERLEAVE == 1
uint64_t byte_addr = (uint64_t(device_->mem_req_addr[b]) * PLATFORM_MEMORY_NUM_BANKS + b) * PLATFORM_MEMORY_DATA_SIZE;
#else
uint64_t byte_addr = (uint64_t(device_->mem_req_addr[b]) + (b << g_mem_bank_addr_width)) * PLATFORM_MEMORY_DATA_SIZE;
#endif
// check read/write
if (device_->mem_req_rw[b]) { if (device_->mem_req_rw[b]) {
auto byteen = device_->mem_req_byteen[b]; auto byteen = device_->mem_req_byteen[b];
auto data = VDataCast<uint8_t*, PLATFORM_MEMORY_DATA_SIZE>::get(device_->mem_req_data[b]); auto data = VDataCast<uint8_t*, PLATFORM_MEMORY_DATA_SIZE>::get(device_->mem_req_data[b]);
// check address range // check if console output address
if (byte_addr >= uint64_t(IO_COUT_ADDR) if (byte_addr >= uint64_t(IO_COUT_ADDR)
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) { && byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
// process console output // process console output
@ -313,21 +316,23 @@ private:
} }
} }
} else { } else {
// process writes // process memory writes
/*printf("%0ld: [sim] MEM Wr Req[%d]: addr=0x%0lx, tag=0x%0lx, byteen=0x", timestamp, b, byte_addr, device_->mem_req_tag[b]); /*printf("%0ld: [sim] MEM Wr Req[%d]: addr=0x%0lx, tag=0x%0lx, byteen=0x", timestamp, b, byte_addr, device_->mem_req_tag[b]);
for (int i = (PLATFORM_MEMORY_DATA_SIZE/4)-1; i >= 0; --i) { for (int i = (PLATFORM_MEMORY_DATA_SIZE/4)-1; i >= 0; --i) {
printf("%x", (int)((byteen >> (4 * i)) & 0xf)); printf("%x", (int)((byteen >> (4 * i)) & 0xf));
} }
printf(", data=0x"); printf(", data=0x");
for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) { for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) {
printf("%d=%02x,", i, data[i]); printf("%02x", data[i]);
} }
printf("\n");*/ printf("\n");*/
for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; i++) { for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; i++) {
if ((byteen >> i) & 0x1) { if ((byteen >> i) & 0x1) {
(*ram_)[byte_addr + i] = data[i]; (*ram_)[byte_addr + i] = data[i];
} }
} }
auto mem_req = new mem_req_t(); auto mem_req = new mem_req_t();
mem_req->tag = device_->mem_req_tag[b]; mem_req->tag = device_->mem_req_tag[b];
mem_req->addr = byte_addr; mem_req->addr = byte_addr;
@ -341,7 +346,7 @@ private:
pending_mem_reqs_[b].emplace_back(mem_req); pending_mem_reqs_[b].emplace_back(mem_req);
} }
} else { } else {
// process reads // process memory reads
auto mem_req = new mem_req_t(); auto mem_req = new mem_req_t();
mem_req->tag = device_->mem_req_tag[b]; mem_req->tag = device_->mem_req_tag[b];
mem_req->addr = byte_addr; mem_req->addr = byte_addr;
@ -388,11 +393,11 @@ private:
std::unordered_map<int, std::stringstream> print_bufs_; std::unordered_map<int, std::stringstream> print_bufs_;
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS]; std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS];
std::queue<mem_req_t*> dram_queue_[PLATFORM_MEMORY_BANKS]; std::queue<mem_req_t*> dram_queue_[PLATFORM_MEMORY_NUM_BANKS];
std::array<bool, PLATFORM_MEMORY_BANKS> mem_rd_rsp_ready_; std::array<bool, PLATFORM_MEMORY_NUM_BANKS> mem_rd_rsp_ready_;
DramSim dram_sim_; DramSim dram_sim_;

View file

@ -14,9 +14,9 @@
`include "VX_define.vh" `include "VX_define.vh"
module rtlsim_shim import VX_gpu_pkg::*; #( module rtlsim_shim import VX_gpu_pkg::*; #(
parameter MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH, parameter MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
parameter MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH, parameter MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS),
parameter MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS, parameter MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS,
parameter MEM_TAG_WIDTH = 64 parameter MEM_TAG_WIDTH = 64
) ( ) (
`SCOPE_IO_DECL `SCOPE_IO_DECL
@ -159,7 +159,7 @@ module rtlsim_shim import VX_gpu_pkg::*; #(
.TAG_WIDTH_OUT (MEM_TAG_WIDTH), .TAG_WIDTH_OUT (MEM_TAG_WIDTH),
.NUM_PORTS_IN (`VX_MEM_PORTS), .NUM_PORTS_IN (`VX_MEM_PORTS),
.NUM_BANKS_OUT (MEM_NUM_BANKS), .NUM_BANKS_OUT (MEM_NUM_BANKS),
.INTERLEAVE (0), .INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0), .REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || MEM_NUM_BANKS > 1) ? 2 : 0) .RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || MEM_NUM_BANKS > 1) ? 2 : 0)
) mem_bank_adapter ( ) mem_bank_adapter (

View file

@ -43,8 +43,13 @@ public:
char sname[100]; char sname[100];
snprintf(sname, 100, "%s-xbar", simobject->name().c_str()); snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
uint32_t wsel_bits = log2ceil(config_.line_size); uint32_t lg2_line_size = log2ceil(config_.line_size);
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, (1 << config.B), wsel_bits); uint32_t num_banks = 1 << config.B;
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, num_banks, 1,
[lg2_line_size, num_banks](const MemCrossBar::ReqType& req) {
// Custom logic to calculate the output index using bank interleaving
return (uint32_t)((req.addr >> lg2_line_size) & (num_banks-1));
});
for (uint32_t i = 0; i < config.num_reqs; ++i) { for (uint32_t i = 0; i < config.num_reqs; ++i) {
simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i)); simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i));
mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i)); mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i));

View file

@ -30,7 +30,6 @@ private:
MemCrossBar::Ptr mem_xbar_; MemCrossBar::Ptr mem_xbar_;
DramSim dram_sim_; DramSim dram_sim_;
mutable PerfStats perf_stats_; mutable PerfStats perf_stats_;
struct DramCallbackArgs { struct DramCallbackArgs {
MemSim::Impl* memsim; MemSim::Impl* memsim;
MemReq request; MemReq request;
@ -41,11 +40,15 @@ public:
Impl(MemSim* simobject, const Config& config) Impl(MemSim* simobject, const Config& config)
: simobject_(simobject) : simobject_(simobject)
, config_(config) , config_(config)
, dram_sim_(MEM_CLOCK_RATIO) , dram_sim_(config.num_banks, config.block_size, config.clock_ratio)
{ {
char sname[100]; char sname[100];
snprintf(sname, 100, "%s-xbar", simobject->name().c_str()); snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_ports, config.num_banks); mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_ports, config.num_banks, 1,
[lg2_block_size = log2ceil(config.block_size), num_banks = config.num_banks](const MemCrossBar::ReqType& req) {
// Custom logic to calculate the output index using bank interleaving
return (uint32_t)((req.addr >> lg2_block_size) & (num_banks-1));
});
for (uint32_t i = 0; i < config.num_ports; ++i) { for (uint32_t i = 0; i < config.num_ports; ++i) {
simobject->MemReqPorts.at(i).bind(&mem_xbar_->ReqIn.at(i)); simobject->MemReqPorts.at(i).bind(&mem_xbar_->ReqIn.at(i));
mem_xbar_->RspIn.at(i).bind(&simobject->MemRspPorts.at(i)); mem_xbar_->RspIn.at(i).bind(&simobject->MemRspPorts.at(i));
@ -74,16 +77,15 @@ public:
auto& mem_req = mem_xbar_->ReqOut.at(i).front(); auto& mem_req = mem_xbar_->ReqOut.at(i).front();
// try to enqueue the request to the memory system // enqueue the request to the memory system
auto req_args = new DramCallbackArgs{this, mem_req, i}; auto req_args = new DramCallbackArgs{this, mem_req, i};
auto enqueue_success = dram_sim_.send_request( dram_sim_.send_request(
mem_req.write,
mem_req.addr, mem_req.addr,
0, mem_req.write,
[](void* arg) { [](void* arg) {
auto rsp_args = reinterpret_cast<const DramCallbackArgs*>(arg); auto rsp_args = reinterpret_cast<const DramCallbackArgs*>(arg);
// only send a response for read requests
if (!rsp_args->request.write) { if (!rsp_args->request.write) {
// only send a response for read requests
MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid}; MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid};
rsp_args->memsim->mem_xbar_->RspOut.at(rsp_args->bank_id).push(mem_rsp, 1); rsp_args->memsim->mem_xbar_->RspOut.at(rsp_args->bank_id).push(mem_rsp, 1);
DT(3, rsp_args->memsim->simobject_->name() << "-mem-rsp[" << rsp_args->bank_id << "]: " << mem_rsp); DT(3, rsp_args->memsim->simobject_->name() << "-mem-rsp[" << rsp_args->bank_id << "]: " << mem_rsp);
@ -93,14 +95,7 @@ public:
req_args req_args
); );
// check if the request was enqueued successfully
if (!enqueue_success) {
delete req_args;
continue;
}
DT(3, simobject_->name() << "-mem-req[" << i << "]: " << mem_req); DT(3, simobject_->name() << "-mem-req[" << i << "]: " << mem_req);
mem_xbar_->ReqOut.at(i).pop(); mem_xbar_->ReqOut.at(i).pop();
} }
} }

View file

@ -23,6 +23,8 @@ public:
struct Config { struct Config {
uint32_t num_banks; uint32_t num_banks;
uint32_t num_ports; uint32_t num_ports;
uint32_t block_size;
float clock_ratio;
}; };
struct PerfStats { struct PerfStats {

View file

@ -22,10 +22,14 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
{ {
SimPlatform::instance().initialize(); SimPlatform::instance().initialize();
assert(PLATFORM_MEMORY_DATA_SIZE == MEM_BLOCK_SIZE);
// create memory simulator // create memory simulator
memsim_ = MemSim::Create("dram", MemSim::Config{ memsim_ = MemSim::Create("dram", MemSim::Config{
PLATFORM_MEMORY_BANKS, PLATFORM_MEMORY_NUM_BANKS,
L3_MEM_PORTS L3_MEM_PORTS,
MEM_BLOCK_SIZE,
MEM_CLOCK_RATIO
}); });
// create clusters // create clusters

View file

@ -398,6 +398,8 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
template <typename T> template <typename T>
class HashTable { class HashTable {
public: public:
typedef T DataType;
HashTable(uint32_t capacity) HashTable(uint32_t capacity)
: entries_(capacity) : entries_(capacity)
, size_(0) , size_(0)
@ -470,6 +472,8 @@ private:
template <typename Type> template <typename Type>
class Arbiter : public SimObject<Arbiter<Type>> { class Arbiter : public SimObject<Arbiter<Type>> {
public: public:
typedef Type ReqType;
std::vector<SimPort<Type>> Inputs; std::vector<SimPort<Type>> Inputs;
std::vector<SimPort<Type>> Outputs; std::vector<SimPort<Type>> Outputs;
@ -556,6 +560,8 @@ protected:
template <typename Type> template <typename Type>
class CrossBar : public SimObject<CrossBar<Type>> { class CrossBar : public SimObject<CrossBar<Type>> {
public: public:
typedef Type ReqType;
std::vector<SimPort<Type>> Inputs; std::vector<SimPort<Type>> Inputs;
std::vector<SimPort<Type>> Outputs; std::vector<SimPort<Type>> Outputs;
@ -565,8 +571,8 @@ public:
ArbiterType type, ArbiterType type,
uint32_t num_inputs, uint32_t num_inputs,
uint32_t num_outputs = 1, uint32_t num_outputs = 1,
uint32_t addr_start = 0, uint32_t delay = 1,
uint32_t delay = 1 std::function<uint32_t(const Type& req)> output_sel = nullptr
) )
: SimObject<CrossBar<Type>>(ctx, name) : SimObject<CrossBar<Type>>(ctx, name)
, Inputs(num_inputs, this) , Inputs(num_inputs, this)
@ -576,12 +582,18 @@ public:
, grants_(num_outputs, 0) , grants_(num_outputs, 0)
, lg2_inputs_(log2ceil(num_inputs)) , lg2_inputs_(log2ceil(num_inputs))
, lg2_outputs_(log2ceil(num_outputs)) , lg2_outputs_(log2ceil(num_outputs))
, addr_start_(addr_start)
, collisions_(0) { , collisions_(0) {
assert(delay != 0); assert(delay != 0);
assert(num_inputs <= 64); assert(num_inputs <= 64);
assert(num_outputs <= 64); assert(num_outputs <= 64);
assert(ispow2(num_outputs)); assert(ispow2(num_outputs));
if (output_sel != nullptr) {
output_sel_ = output_sel;
} else {
output_sel_ = [this](const Type& req) {
return (uint32_t)bit_getw(req.addr, 0, (lg2_outputs_-1));
};
}
} }
void reset() { void reset() {
@ -609,7 +621,8 @@ public:
auto& req = req_in.front(); auto& req = req_in.front();
uint32_t output_idx = 0; uint32_t output_idx = 0;
if (lg2_outputs_ != 0) { if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1)); // select output index
output_idx = output_sel_(req);
// skip if input is not going to current output // skip if input is not going to current output
if (output_idx != o) if (output_idx != o)
continue; continue;
@ -649,7 +662,7 @@ protected:
std::vector<uint32_t> grants_; std::vector<uint32_t> grants_;
uint32_t lg2_inputs_; uint32_t lg2_inputs_;
uint32_t lg2_outputs_; uint32_t lg2_outputs_;
uint32_t addr_start_; std::function<uint32_t(const Type& req)> output_sel_;
uint64_t collisions_; uint64_t collisions_;
}; };
@ -658,6 +671,9 @@ protected:
template <typename Req, typename Rsp> template <typename Req, typename Rsp>
class TxArbiter : public SimObject<TxArbiter<Req, Rsp>> { class TxArbiter : public SimObject<TxArbiter<Req, Rsp>> {
public: public:
typedef Req ReqType;
typedef Rsp RspType;
std::vector<SimPort<Req>> ReqIn; std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn; std::vector<SimPort<Rsp>> RspIn;
@ -771,6 +787,9 @@ protected:
template <typename Req, typename Rsp> template <typename Req, typename Rsp>
class TxCrossBar : public SimObject<TxCrossBar<Req, Rsp>> { class TxCrossBar : public SimObject<TxCrossBar<Req, Rsp>> {
public: public:
typedef Req ReqType;
typedef Rsp RspType;
std::vector<SimPort<Req>> ReqIn; std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn; std::vector<SimPort<Rsp>> RspIn;
@ -783,8 +802,8 @@ public:
ArbiterType type, ArbiterType type,
uint32_t num_inputs, uint32_t num_inputs,
uint32_t num_outputs = 1, uint32_t num_outputs = 1,
uint32_t addr_start = 0, uint32_t delay = 1,
uint32_t delay = 1 std::function<uint32_t(const Req& req)> output_sel = nullptr
) )
: SimObject<TxCrossBar<Req, Rsp>>(ctx, name) : SimObject<TxCrossBar<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this) , ReqIn(num_inputs, this)
@ -797,7 +816,6 @@ public:
, rsp_grants_(num_inputs, 0) , rsp_grants_(num_inputs, 0)
, lg2_inputs_(log2ceil(num_inputs)) , lg2_inputs_(log2ceil(num_inputs))
, lg2_outputs_(log2ceil(num_outputs)) , lg2_outputs_(log2ceil(num_outputs))
, addr_start_(addr_start)
, req_collisions_(0) , req_collisions_(0)
, rsp_collisions_(0) { , rsp_collisions_(0) {
assert(delay != 0); assert(delay != 0);
@ -805,6 +823,13 @@ public:
assert(num_outputs <= 64); assert(num_outputs <= 64);
assert(ispow2(num_inputs)); assert(ispow2(num_inputs));
assert(ispow2(num_outputs)); assert(ispow2(num_outputs));
if (output_sel != nullptr) {
output_sel_ = output_sel;
} else {
output_sel_ = [this](const Req& req) {
return (uint32_t)bit_getw(req.addr, 0, (lg2_outputs_-1));
};
}
} }
void reset() { void reset() {
@ -875,7 +900,8 @@ public:
auto& req = req_in.front(); auto& req = req_in.front();
uint32_t output_idx = 0; uint32_t output_idx = 0;
if (lg2_outputs_ != 0) { if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1)); // select output index
output_idx = output_sel_(req);
// skip if request is not going to current output // skip if request is not going to current output
if (output_idx != o) if (output_idx != o)
continue; continue;
@ -929,7 +955,7 @@ protected:
std::vector<uint32_t> rsp_grants_; std::vector<uint32_t> rsp_grants_;
uint32_t lg2_inputs_; uint32_t lg2_inputs_;
uint32_t lg2_outputs_; uint32_t lg2_outputs_;
uint32_t addr_start_; std::function<uint32_t(const Req& req)> output_sel_;
uint64_t req_collisions_; uint64_t req_collisions_;
uint64_t rsp_collisions_; uint64_t rsp_collisions_;
}; };

View file

@ -31,21 +31,6 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
# Platform parameters
ifeq (,$(findstring PLATFORM_MEMORY_BANKS,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_BANKS=2
endif
ifeq (,$(findstring PLATFORM_MEMORY_ADDR_WIDTH,$(CONFIGS)))
ifeq ($(XLEN),64)
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=47
else
CONFIGS += -DPLATFORM_MEMORY_ADDR_WIDTH=31
endif
endif
ifeq (,$(findstring PLATFORM_MEMORY_DATA_WIDTH,$(CONFIGS)))
CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512
endif
DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS) DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp

View file

@ -17,16 +17,16 @@ module vortex_afu_shim #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8, parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32, parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH, parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH, parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
parameter C_M_AXI_MEM_ADDR_WIDTH = 64, parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
) ( ) (
// System signals // System signals
input wire ap_clk, input wire ap_clk,
input wire ap_rst_n, input wire ap_rst_n,
// AXI4 master interface // AXI4 master interface
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA), `REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
// AXI4-Lite slave interface // AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid, input wire s_axi_ctrl_awvalid,
@ -61,7 +61,7 @@ module vortex_afu_shim #(
.clk (ap_clk), .clk (ap_clk),
.reset (~ap_rst_n), .reset (~ap_rst_n),
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_ARGS, REPEAT_COMMA), `REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid), .s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
.s_axi_ctrl_awready (s_axi_ctrl_awready), .s_axi_ctrl_awready (s_axi_ctrl_awready),

View file

@ -37,8 +37,6 @@
#include <iostream> #include <iostream>
#define PLATFORM_MEMORY_DATA_SIZE (PLATFORM_MEMORY_DATA_WIDTH/8)
#ifndef MEM_CLOCK_RATIO #ifndef MEM_CLOCK_RATIO
#define MEM_CLOCK_RATIO 1 #define MEM_CLOCK_RATIO 1
#endif #endif
@ -61,10 +59,10 @@
#define CPU_GPU_LATENCY 200 #define CPU_GPU_LATENCY 200
#if PLATFORM_MEMORY_DATA_WIDTH > 64 #if PLATFORM_MEMORY_DATA_SIZE > 8
typedef VlWide<(PLATFORM_MEMORY_DATA_WIDTH/32)> Vl_m_data_t; typedef VlWide<(PLATFORM_MEMORY_DATA_SIZE/4)> Vl_m_data_t;
#else #else
#if PLATFORM_MEMORY_DATA_WIDTH > 32 #if PLATFORM_MEMORY_DATA_SIZE > 4
typedef QData Vl_m_data_t; typedef QData Vl_m_data_t;
#else #else
typedef IData Vl_m_data_t; typedef IData Vl_m_data_t;
@ -130,7 +128,7 @@ public:
Impl() Impl()
: device_(nullptr) : device_(nullptr)
, ram_(nullptr) , ram_(nullptr)
, dram_sim_(MEM_CLOCK_RATIO) , dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO)
, stop_(false) , stop_(false)
#ifdef VCD_OUTPUT #ifdef VCD_OUTPUT
, tfp_(nullptr) , tfp_(nullptr)
@ -142,7 +140,7 @@ public:
if (future_.valid()) { if (future_.valid()) {
future_.wait(); future_.wait();
} }
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
delete mem_alloc_[b]; delete mem_alloc_[b];
} }
if (ram_) { if (ram_) {
@ -178,16 +176,16 @@ public:
#endif #endif
// calculate memory bank size // calculate memory bank size
mem_bank_size_ = 1ull << PLATFORM_MEMORY_ADDR_WIDTH; mem_bank_size_ = (1ull << PLATFORM_MEMORY_ADDR_WIDTH) / PLATFORM_MEMORY_NUM_BANKS;
// allocate RAM // allocate RAM
ram_ = new RAM(0, RAM_PAGE_SIZE); ram_ = new RAM(0, RAM_PAGE_SIZE);
// initialize AXI memory interfaces // initialize AXI memory interfaces
MP_M_AXI_MEM(PLATFORM_MEMORY_BANKS); MP_M_AXI_MEM(PLATFORM_MEMORY_NUM_BANKS);
// initialize memory allocator // initialize memory allocator
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
mem_alloc_[b] = new MemoryAllocator(0, mem_bank_size_, 4096, 64); mem_alloc_[b] = new MemoryAllocator(0, mem_bank_size_, 4096, 64);
} }
@ -209,13 +207,13 @@ public:
} }
int mem_alloc(uint64_t size, uint32_t bank_id, uint64_t* addr) { int mem_alloc(uint64_t size, uint32_t bank_id, uint64_t* addr) {
if (bank_id >= PLATFORM_MEMORY_BANKS) if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
return -1; return -1;
return mem_alloc_[bank_id]->allocate(size, addr); return mem_alloc_[bank_id]->allocate(size, addr);
} }
int mem_free(uint32_t bank_id, uint64_t addr) { int mem_free(uint32_t bank_id, uint64_t addr) {
if (bank_id >= PLATFORM_MEMORY_BANKS) if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
return -1; return -1;
return mem_alloc_[bank_id]->release(addr); return mem_alloc_[bank_id]->release(addr);
} }
@ -223,7 +221,7 @@ public:
int mem_write(uint32_t bank_id, uint64_t addr, uint64_t size, const void* data) { int mem_write(uint32_t bank_id, uint64_t addr, uint64_t size, const void* data) {
std::lock_guard<std::mutex> guard(mutex_); std::lock_guard<std::mutex> guard(mutex_);
if (bank_id >= PLATFORM_MEMORY_BANKS) if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
return -1; return -1;
uint64_t base_addr = bank_id * mem_bank_size_ + addr; uint64_t base_addr = bank_id * mem_bank_size_ + addr;
ram_->write(data, base_addr, size); ram_->write(data, base_addr, size);
@ -238,7 +236,7 @@ public:
int mem_read(uint32_t bank_id, uint64_t addr, uint64_t size, void* data) { int mem_read(uint32_t bank_id, uint64_t addr, uint64_t size, void* data) {
std::lock_guard<std::mutex> guard(mutex_); std::lock_guard<std::mutex> guard(mutex_);
if (bank_id >= PLATFORM_MEMORY_BANKS) if (bank_id >= PLATFORM_MEMORY_NUM_BANKS)
return -1; return -1;
uint64_t base_addr = bank_id * mem_bank_size_ + addr; uint64_t base_addr = bank_id * mem_bank_size_ + addr;
ram_->read(data, base_addr, size); ram_->read(data, base_addr, size);
@ -321,7 +319,7 @@ private:
reqs.clear(); reqs.clear();
} }
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
std::queue<mem_req_t*> empty; std::queue<mem_req_t*> empty;
std::swap(dram_queues_[b], empty); std::swap(dram_queues_[b], empty);
} }
@ -338,7 +336,7 @@ private:
device_->ap_rst_n = 1; device_->ap_rst_n = 1;
// this AXI device is always ready to accept new requests // this AXI device is always ready to accept new requests
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
*m_axi_mem_[b].arready = 1; *m_axi_mem_[b].arready = 1;
*m_axi_mem_[b].awready = 1; *m_axi_mem_[b].awready = 1;
*m_axi_mem_[b].wready = 1; *m_axi_mem_[b].wready = 1;
@ -358,19 +356,18 @@ private:
dram_sim_.tick(); dram_sim_.tick();
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
if (!dram_queues_[b].empty()) { if (!dram_queues_[b].empty()) {
auto mem_req = dram_queues_[b].front(); auto mem_req = dram_queues_[b].front();
if (dram_sim_.send_request(mem_req->write, mem_req->addr, b, [](void* arg) { dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) {
auto orig_req = reinterpret_cast<mem_req_t*>(arg); auto orig_req = reinterpret_cast<mem_req_t*>(arg);
if (orig_req->ready) { if (orig_req->ready) {
delete orig_req; delete orig_req;
} else { } else {
orig_req->ready = true; orig_req->ready = true;
} }
}, mem_req)) { }, mem_req);
dram_queues_[b].pop(); dram_queues_[b].pop();
}
} }
} }
@ -411,7 +408,7 @@ private:
} }
void axi_mem_bus_reset() { void axi_mem_bus_reset() {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
// read request address // read request address
*m_axi_mem_[b].arready = 0; *m_axi_mem_[b].arready = 0;
@ -435,14 +432,14 @@ private:
void axi_mem_bus_eval(bool clk) { void axi_mem_bus_eval(bool clk) {
if (!clk) { if (!clk) {
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
m_axi_states_[b].read_rsp_ready = *m_axi_mem_[b].rready; m_axi_states_[b].read_rsp_ready = *m_axi_mem_[b].rready;
m_axi_states_[b].write_rsp_ready = *m_axi_mem_[b].bready; m_axi_states_[b].write_rsp_ready = *m_axi_mem_[b].bready;
} }
return; return;
} }
for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) {
// handle read responses // handle read responses
if (*m_axi_mem_[b].rvalid && m_axi_states_[b].read_rsp_ready) { if (*m_axi_mem_[b].rvalid && m_axi_states_[b].read_rsp_ready) {
*m_axi_mem_[b].rvalid = 0; *m_axi_mem_[b].rvalid = 0;
@ -607,15 +604,15 @@ private:
std::mutex mutex_; std::mutex mutex_;
std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_BANKS]; std::list<mem_req_t*> pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS];
m_axi_mem_t m_axi_mem_[PLATFORM_MEMORY_BANKS]; m_axi_mem_t m_axi_mem_[PLATFORM_MEMORY_NUM_BANKS];
MemoryAllocator* mem_alloc_[PLATFORM_MEMORY_BANKS]; MemoryAllocator* mem_alloc_[PLATFORM_MEMORY_NUM_BANKS];
m_axi_state_t m_axi_states_[PLATFORM_MEMORY_BANKS]; m_axi_state_t m_axi_states_[PLATFORM_MEMORY_NUM_BANKS];
std::queue<mem_req_t*> dram_queues_[PLATFORM_MEMORY_BANKS]; std::queue<mem_req_t*> dram_queues_[PLATFORM_MEMORY_NUM_BANKS];
#ifdef VCD_OUTPUT #ifdef VCD_OUTPUT
VerilatedVcdC* tfp_; VerilatedVcdC* tfp_;