Merge branch 'bug_fixes'

This commit is contained in:
tinebp 2025-01-22 02:49:55 -08:00
commit 9dc1d3f688
74 changed files with 1361 additions and 1293 deletions

View file

@ -24,7 +24,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
input sysmem_perf_t sysmem_perf,
`endif
// DCRs
@ -43,12 +43,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
// Counters produced by this cluster's L2 cache instance (bound to its
// cache_perf port).
cache_perf_t l2_perf;
// Copy of the incoming sysmem_perf record with the l2cache field replaced by
// the local L2 counters; this is the record forwarded to the sockets.
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
// start from the record received from the parent, then override l2cache only
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.l2cache = l2_perf;
end
`endif
`ifdef GBAR_ENABLE
@ -111,7 +111,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.clk (clk),
.reset (l2_reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.l2cache),
.cache_perf (l2_perf),
`endif
.core_bus_if (per_socket_mem_bus_if),
.mem_bus_if (mem_bus_if)
@ -140,7 +140,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.reset (socket_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.sysmem_perf (sysmem_perf_tmp),
`endif
.dcr_bus_if (socket_dcr_bus_if),

View file

@ -325,23 +325,22 @@
///////////////////////////////////////////////////////////////////////////////
`define NEG_EDGE(dst, src) \
wire dst; \
VX_edge_trigger #( \
.POS (0), \
.INIT (0) \
) __``dst``__ ( \
) __neg_edge`__LINE__ ( \
.clk (clk), \
.reset (1'b0), \
.data_in (src), \
.data_out (dst) \
)
`define BUFFER_EX(dst, src, ena, RSTW, latency) \
`define BUFFER_EX(dst, src, ena, resetw, latency) \
VX_pipe_register #( \
.DATAW ($bits(dst)), \
.RESETW (RSTW), \
.RESETW (resetw), \
.DEPTH (latency) \
) __``dst``__ ( \
) __buffer_ex`__LINE__ ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
@ -349,13 +348,13 @@
.data_out (dst) \
)
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 0, 1)
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, $bits(dst), 1)
`define POP_COUNT_EX(out, in, model) \
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out``__ ( \
) __pop_count_ex`__LINE__ ( \
.data_in (in), \
.data_out (out) \
)
@ -482,7 +481,7 @@
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_field[__i] = src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
__reduce_add_i_field, \
__reduce_add_o_field \
); \

View file

@ -73,6 +73,17 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
} cache_perf_t;
// Local-memory (lmem) performance counters.
// Every field is a PERF_CTR_BITS-wide counter.
// NOTE(review): field meanings below are inferred from the names and from the
// matching cache_perf_t fields - confirm against the lmem implementation.
typedef struct packed {
// presumably number of read requests
logic [`PERF_CTR_BITS-1:0] reads;
// presumably number of write requests
logic [`PERF_CTR_BITS-1:0] writes;
// presumably bank-conflict stalls
logic [`PERF_CTR_BITS-1:0] bank_stalls;
// presumably core-response stalls (response valid but not accepted)
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
} lmem_perf_t;
// Memory coalescer performance counters (single PERF_CTR_BITS-wide field).
typedef struct packed {
// coalescer miss count
// NOTE(review): exact definition of a "miss" is not visible here - confirm
logic [`PERF_CTR_BITS-1:0] misses;
} coalescer_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
@ -92,6 +103,26 @@ package VX_gpu_pkg;
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t;
// Aggregate record of all memory-system performance counters.
// It is passed down the hierarchy as a plain struct input (sysmem_perf);
// each level copies the incoming record and overrides the fields it owns.
typedef struct packed {
// instruction cache counters
cache_perf_t icache;
// data cache counters
cache_perf_t dcache;
// L2 cache counters
cache_perf_t l2cache;
// L3 cache counters
cache_perf_t l3cache;
// local memory counters
lmem_perf_t lmem;
// memory coalescer counters
coalescer_perf_t coalescer;
// memory counters (mem_perf_t)
mem_perf_t mem;
} sysmem_perf_t;
// Aggregate record of core pipeline performance counters.
// NOTE(review): per-field meanings are inferred from the field names only;
// the logic that drives them is not visible here - confirm.
typedef struct packed {
// scheduler counters
sched_perf_t sched;
// issue-stage counters
issue_perf_t issue;
// presumably number of instruction fetches
logic [`PERF_CTR_BITS-1:0] ifetches;
// presumably number of load requests
logic [`PERF_CTR_BITS-1:0] loads;
// presumably number of store requests
logic [`PERF_CTR_BITS-1:0] stores;
// presumably accumulated instruction-fetch latency
logic [`PERF_CTR_BITS-1:0] ifetch_latency;
// presumably accumulated load latency
logic [`PERF_CTR_BITS-1:0] load_latency;
} pipeline_perf_t;
//////////////////////// instruction arguments ////////////////////////////
typedef struct packed {
@ -145,6 +176,7 @@ package VX_gpu_pkg;
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
localparam LMEM_TAG_WIDTH = LSU_TAG_WIDTH + `CLOG2(`NUM_LSU_BLOCKS);
////////////////////////// Icache Parameters //////////////////////////////

View file

@ -157,7 +157,7 @@
`ifdef QUARTUS
`define MAX_FANOUT 8
`define MAX_LUTRAM 1024
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
`define USE_BLOCK_BRAM (* ramstyle = "block" *)
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
@ -168,7 +168,7 @@
`define STRING string
`elsif VIVADO
`define MAX_FANOUT 8
`define MAX_LUTRAM 1024
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
`define USE_BLOCK_BRAM (* ram_style = "block" *)
`define USE_FAST_BRAM (* ram_style = "distributed" *)
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
@ -177,9 +177,12 @@
`define PRESERVE_NET (* keep = "true" *)
`define BLACKBOX_CELL (* black_box *)
`define STRING
`ifndef SIMULATION
`define ASYNC_BRAM_PATCH
`endif
`else
`define MAX_FANOUT 8
`define MAX_LUTRAM 1024
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
`define USE_BLOCK_BRAM
`define USE_FAST_BRAM
`define NO_RW_RAM_CHECK

View file

@ -24,7 +24,7 @@ module VX_socket import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
input sysmem_perf_t sysmem_perf,
`endif
// DCRs
@ -63,11 +63,13 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
// Counters produced by this socket's icache and dcache instances (bound to
// their cache_perf ports).
cache_perf_t icache_perf, dcache_perf;
// Copy of the incoming sysmem_perf record with the icache/dcache fields
// replaced by the local cache counters; this is the record forwarded to the
// cores.
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
// start from the record received from the parent, then override the two
// fields owned by this socket
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.icache = icache_perf;
sysmem_perf_tmp.dcache = dcache_perf;
end
`endif
///////////////////////////////////////////////////////////////////////////
@ -110,7 +112,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_BUF (2)
) icache (
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.icache),
.cache_perf (icache_perf),
`endif
.clk (clk),
.reset (icache_reset),
@ -160,7 +162,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_BUF (2)
) dcache (
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.dcache),
.cache_perf (dcache_perf),
`endif
.clk (clk),
.reset (dcache_reset),
@ -187,6 +189,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_arb #(
.NUM_INPUTS (2),
.NUM_OUTPUTS(1),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX(0),
@ -234,7 +237,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.reset (core_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.sysmem_perf (sysmem_perf_tmp),
`endif
.dcr_bus_if (core_dcr_bus_if),

View file

@ -166,10 +166,8 @@
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
`define VX_CSR_MPM_MEM_BANK_CNTR 12'hB1E // memory bank requests
`define VX_CSR_MPM_MEM_BANK_CNTR_H 12'hB9E
`define VX_CSR_MPM_MEM_BANK_TICK 12'hB1F // memory ticks
`define VX_CSR_MPM_MEM_BANK_TICK_H 12'hB9F
`define VX_CSR_MPM_MEM_BANK_ST 12'hB1E // bank conflicts
`define VX_CSR_MPM_MEM_BANK_ST_H 12'hB9E
// PERF: lmem
`define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_LMEM_READS_H 12'hB9B
@ -177,6 +175,9 @@
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
// PERF: coalescer
`define VX_CSR_MPM_COALESCER_MISS 12'hB1F // coalescer misses
`define VX_CSR_MPM_COALESCER_MISS_H 12'hB9F
// Machine Performance-monitoring memory counters (class 3) ///////////////////
// <Add your own counters: use addresses hB03..hB1F, hB83..hB9F>

View file

@ -50,11 +50,14 @@ module Vortex import VX_gpu_pkg::*; (
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.lmem = 'x;
// Counters produced by the L3 cache instance (bound to its cache_perf port).
cache_perf_t l3_perf;
// Memory counters accumulated by the clocked block further down in this module.
mem_perf_t mem_perf;
// Top-level memory-system perf record handed to the clusters.
sysmem_perf_t sysmem_perf;
always @(*) begin
// default every field to zero so that fields not driven at this level
// (everything except l3cache and mem) have a defined value
sysmem_perf = '0;
sysmem_perf.l3cache = l3_perf;
sysmem_perf.mem = mem_perf;
end
`endif
VX_mem_bus_if #(
@ -98,7 +101,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (l3_reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_if.l3cache),
.cache_perf (l3_perf),
`endif
.core_bus_if (per_cluster_mem_bus_if),
@ -146,7 +149,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (cluster_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.sysmem_perf (sysmem_perf),
`endif
.dcr_bus_if (cluster_dcr_bus_if),
@ -182,7 +185,6 @@ module Vortex import VX_gpu_pkg::*; (
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin
if (reset) begin
@ -202,7 +204,6 @@ module Vortex import VX_gpu_pkg::*; (
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end
end
assign mem_perf_if.mem = mem_perf;
`endif

View file

@ -620,6 +620,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
VX_mem_arb #(
.NUM_INPUTS (2),
.NUM_OUTPUTS (1),
.DATA_SIZE (LMEM_DATA_SIZE),
.ADDR_WIDTH (CCI_VX_ADDR_WIDTH),
.TAG_WIDTH (CCI_VX_TAG_WIDTH),
@ -1097,7 +1098,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire vx_mem_req_fire = vx_mem_req_valid[0] && vx_mem_req_ready[0];
wire vx_mem_rsp_fire = vx_mem_rsp_valid[0] && vx_mem_rsp_ready[0];
wire avs_req_fire = (avs_write[0] || avs_read[0]) && ~avs_waitrequest[0];
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, {
vx_reset,

View file

@ -50,6 +50,8 @@ module VX_afu_ctrl #(
input wire ap_idle,
output wire interrupt,
output wire ap_ctrl_read,
`ifdef SCOPE
input wire scope_bus_in,
output wire scope_bus_out,
@ -368,7 +370,7 @@ module VX_afu_ctrl #(
end else begin
case (rstate)
RSTATE_ADDR: rstate <= s_axi_ar_fire ? RSTATE_DATA : RSTATE_ADDR;
RSTATE_DATA: rstate <= (~rvalid_stall) ? RSTATE_RESP : RSTATE_DATA;
RSTATE_DATA: rstate <= rvalid_stall ? RSTATE_DATA : RSTATE_RESP;
RSTATE_RESP: rstate <= s_axi_r_fire ? RSTATE_ADDR : RSTATE_RESP;
default: rstate <= RSTATE_ADDR;
endcase
@ -430,6 +432,8 @@ module VX_afu_ctrl #(
assign ap_start = ap_start_r;
assign interrupt = gie_r & (| isr_r);
assign ap_ctrl_read = s_axi_r_fire && (raddr == ADDR_AP_CTRL);
assign dcr_wr_valid = dcr_wr_valid_r;
assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r);
assign dcr_wr_data = `VX_DCR_DATA_WIDTH'(dcrv_r);

View file

@ -10,6 +10,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Reference: https://www.xilinx.com/developer/articles/porting-rtl-designs-to-vitis-rtl-kernels.html
`include "vortex_afu.vh"
@ -35,17 +37,21 @@ module VX_afu_wrap #(
input wire s_axi_ctrl_awvalid,
output wire s_axi_ctrl_awready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
input wire s_axi_ctrl_wvalid,
output wire s_axi_ctrl_wready,
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
input wire s_axi_ctrl_arvalid,
output wire s_axi_ctrl_arready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
output wire s_axi_ctrl_rvalid,
input wire s_axi_ctrl_rready,
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
output wire [1:0] s_axi_ctrl_rresp,
output wire s_axi_ctrl_bvalid,
input wire s_axi_ctrl_bready,
output wire [1:0] s_axi_ctrl_bresp,
@ -58,8 +64,12 @@ module VX_afu_wrap #(
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH;
`endif
localparam STATE_IDLE = 0;
localparam STATE_RUN = 1;
// Kernel control FSM states (2-bit encoding).
// Sequence: IDLE -> INIT -> RUN -> DONE -> IDLE.
typedef enum logic [1:0] {
// waiting for ap_start; ap_idle is asserted in this state
STATE_IDLE = 0,
// reset sequence: hold vx_reset until the reset counter reaches zero, then
// wait for the processor to report busy
STATE_INIT = 1,
// processor executing: wait until it is no longer busy
STATE_RUN = 2,
// execution finished: ap_done is asserted once no writes are pending, and the
// FSM waits for the host's done acknowledgement before returning to IDLE
STATE_DONE = 3
} state_e;
localparam PENDING_SIZEW = 12; // max outstanding requests size
localparam C_M_AXI_MEM_NUM_BANKS_SW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
@ -69,20 +79,24 @@ module VX_afu_wrap #(
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_awid_a [C_M_AXI_MEM_NUM_BANKS];
wire [7:0] m_axi_mem_awlen_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_wdata_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_bid_a [C_M_AXI_MEM_NUM_BANKS];
wire [1:0] m_axi_mem_bresp_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_arvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_arready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_arid_a [C_M_AXI_MEM_NUM_BANKS];
wire [7:0] m_axi_mem_arlen_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_rvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_rready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_rdata_a [C_M_AXI_MEM_NUM_BANKS];
@ -99,7 +113,6 @@ module VX_afu_wrap #(
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
reg [PENDING_SIZEW-1:0] vx_pending_writes;
reg vx_busy_wait;
reg vx_reset = 1; // asserted at initialization
wire vx_busy;
@ -107,13 +120,16 @@ module VX_afu_wrap #(
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
reg state;
state_e state;
wire ap_reset;
wire ap_start;
wire ap_idle = vx_reset;
wire ap_done = (state == STATE_IDLE) && (vx_pending_writes == '0);
wire ap_ready = 1'b1;
wire ap_ctrl_read;
wire ap_idle = (state == STATE_IDLE);
wire ap_done = (state == STATE_DONE) && (vx_pending_writes == '0);
wire ap_ready = ap_done;
wire ap_done_ack = ap_done && ap_ctrl_read;
`ifdef SCOPE
wire scope_bus_in;
@ -130,41 +146,50 @@ module VX_afu_wrap #(
STATE_IDLE: begin
if (ap_start) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Goto STATE RUN\n", $time))
`TRACE(2, ("%t: AFU: Begin initialization\n", $time))
`endif
state <= STATE_RUN;
state <= STATE_INIT;
vx_reset_ctr <= (`RESET_DELAY-1);
vx_reset <= 1;
end
end
STATE_RUN: begin
STATE_INIT: begin
if (vx_reset) begin
// wait until the reset network is ready
// wait for reset to complete
if (vx_reset_ctr == 0) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Begin execution\n", $time))
`TRACE(2, ("%t: AFU: Initialization completed\n", $time))
`endif
vx_busy_wait <= 1;
vx_reset <= 0;
end
end else begin
if (vx_busy_wait) begin
// wait until processor goes busy
if (vx_busy) begin
vx_busy_wait <= 0;
end
end else begin
// wait until the processor is not busy
if (~vx_busy) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: End execution\n", $time))
`TRACE(2, ("%t: AFU: Goto STATE IDLE\n", $time))
`endif
state <= STATE_IDLE;
end
// wait until processor goes busy
if (vx_busy) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Begin execution\n", $time))
`endif
state <= STATE_RUN;
end
end
end
STATE_RUN: begin
// wait until the processor is not busy
if (~vx_busy) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Execution completed\n", $time))
`endif
state <= STATE_DONE;
end
end
STATE_DONE: begin
// wait for host's done acknowledgement
if (ap_done_ack) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Processor idle\n", $time))
`endif
state <= STATE_IDLE;
end
end
endcase
// ensure reset network initialization
@ -177,7 +202,7 @@ module VX_afu_wrap #(
wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire;
wire [C_M_AXI_MEM_NUM_BANKS_SW-1:0] cur_wr_reqs, cur_wr_rsps;
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_awfire
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire
VX_axi_write_ack axi_write_ack (
.clk (clk),
.reset (reset),
@ -190,7 +215,10 @@ module VX_afu_wrap #(
`UNUSED_PIN (w_ack),
`UNUSED_PIN (tx_rdy)
);
assign m_axi_wr_rsp_fire[i] = m_axi_mem_bvalid_a[i] & m_axi_mem_bready_a[i];
end
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_rsp_fire
assign m_axi_wr_rsp_fire[i] = m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
end
`POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire);
@ -217,17 +245,21 @@ module VX_afu_wrap #(
.s_axi_awvalid (s_axi_ctrl_awvalid),
.s_axi_awready (s_axi_ctrl_awready),
.s_axi_awaddr (s_axi_ctrl_awaddr),
.s_axi_wvalid (s_axi_ctrl_wvalid),
.s_axi_wready (s_axi_ctrl_wready),
.s_axi_wdata (s_axi_ctrl_wdata),
.s_axi_wstrb (s_axi_ctrl_wstrb),
.s_axi_arvalid (s_axi_ctrl_arvalid),
.s_axi_arready (s_axi_ctrl_arready),
.s_axi_araddr (s_axi_ctrl_araddr),
.s_axi_rvalid (s_axi_ctrl_rvalid),
.s_axi_rready (s_axi_ctrl_rready),
.s_axi_rdata (s_axi_ctrl_rdata),
.s_axi_rresp (s_axi_ctrl_rresp),
.s_axi_bvalid (s_axi_ctrl_bvalid),
.s_axi_bready (s_axi_ctrl_bready),
.s_axi_bresp (s_axi_ctrl_bresp),
@ -238,6 +270,8 @@ module VX_afu_wrap #(
.ap_ready (ap_ready),
.ap_idle (ap_idle),
.interrupt (interrupt),
.ap_ctrl_read (ap_ctrl_read),
`ifdef SCOPE
.scope_bus_in (scope_bus_out),
@ -328,9 +362,9 @@ module VX_afu_wrap #(
`ifdef DBG_SCOPE_AFU
wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0];
wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0];
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0];
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, {
ap_reset,
@ -340,6 +374,7 @@ module VX_afu_wrap #(
interrupt,
vx_reset,
vx_busy,
state,
m_axi_mem_awvalid_a[0],
m_axi_mem_awready_a[0],
m_axi_mem_wvalid_a[0],
@ -356,7 +391,7 @@ module VX_afu_wrap #(
m_axi_mem_arfire_0,
m_axi_mem_wfire_0,
m_axi_mem_bfire_0
},{
}, {
dcr_wr_addr,
dcr_wr_data,
vx_pending_writes,
@ -383,11 +418,11 @@ module VX_afu_wrap #(
ap_start,
ap_done,
ap_idle,
state,
interrupt
}),
.probe1 ({
vx_pending_writes,
vx_busy_wait,
vx_busy,
vx_reset,
dcr_wr_valid,
@ -428,16 +463,19 @@ module VX_afu_wrap #(
always @(posedge clk) begin
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin
`TRACE(2, ("%t: AXI Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]))
`TRACE(2, ("%t: AXI Wr Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]))
end
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
`TRACE(2, ("%t: AXI Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i]))
`TRACE(2, ("%t: AXI Wr Req [%0d]: strb=0x%h, data=0x%h\n", $time, i, m_axi_mem_wstrb_a[i], m_axi_mem_wdata_a[i]))
end
if (m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i]) begin
`TRACE(2, ("%t: AXI Wr Rsp [%0d]: id=0x%0h\n", $time, i, m_axi_mem_bid_a[i]))
end
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
`TRACE(2, ("%t: AXI Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]))
`TRACE(2, ("%t: AXI Rd Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]))
end
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
`TRACE(2, ("%t: AXI Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]))
`TRACE(2, ("%t: AXI Rd Rsp [%0d]: data=0x%h, id=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]))
end
end
end

View file

@ -40,18 +40,22 @@ module vortex_afu #(
input wire s_axi_ctrl_awvalid,
output wire s_axi_ctrl_awready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
input wire s_axi_ctrl_wvalid,
output wire s_axi_ctrl_wready,
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
input wire s_axi_ctrl_arvalid,
input wire s_axi_ctrl_arvalid,
output wire s_axi_ctrl_arready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
output wire s_axi_ctrl_rvalid,
input wire s_axi_ctrl_rready,
input wire s_axi_ctrl_rready,
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
output wire [1:0] s_axi_ctrl_rresp,
output wire s_axi_ctrl_bvalid,
output wire s_axi_ctrl_bvalid,
input wire s_axi_ctrl_bready,
output wire [1:0] s_axi_ctrl_bresp,
@ -76,17 +80,21 @@ module vortex_afu #(
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
.s_axi_ctrl_awready (s_axi_ctrl_awready),
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
.s_axi_ctrl_wvalid (s_axi_ctrl_wvalid),
.s_axi_ctrl_wready (s_axi_ctrl_wready),
.s_axi_ctrl_wdata (s_axi_ctrl_wdata),
.s_axi_ctrl_wstrb (s_axi_ctrl_wstrb),
.s_axi_ctrl_arvalid (s_axi_ctrl_arvalid),
.s_axi_ctrl_arready (s_axi_ctrl_arready),
.s_axi_ctrl_araddr (s_axi_ctrl_araddr),
.s_axi_ctrl_rvalid (s_axi_ctrl_rvalid),
.s_axi_ctrl_rready (s_axi_ctrl_rready),
.s_axi_ctrl_rdata (s_axi_ctrl_rdata),
.s_axi_ctrl_rresp (s_axi_ctrl_rresp),
.s_axi_ctrl_bvalid (s_axi_ctrl_bvalid),
.s_axi_ctrl_bready (s_axi_ctrl_bready),
.s_axi_ctrl_bresp (s_axi_ctrl_bresp),

View file

@ -52,7 +52,7 @@ module VX_cache import VX_gpu_pkg::*; #(
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_CYCLIC,
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -106,10 +106,9 @@ module VX_cache import VX_gpu_pkg::*; #(
localparam MEM_ARB_SEL_BITS = `CLOG2(`CDIV(NUM_BANKS, MEM_PORTS));
localparam MEM_ARB_SEL_WIDTH = `UP(MEM_ARB_SEL_BITS);
localparam CORE_RSP_REG_DISABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_REG_DISABLE = (NUM_BANKS != 1);
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
localparam REQ_XBAR_BUF = (NUM_REQS > 2) ? 2 : 0;
localparam CORE_RSP_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
@ -133,7 +132,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.NUM_BANKS (NUM_BANKS),
.UUID_WIDTH(UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // request xbar latency
) flush_unit (
.clk (clk),
.reset (reset),
@ -387,8 +386,8 @@ module VX_cache import VX_gpu_pkg::*; #(
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.FLAGS_WIDTH (FLAGS_WIDTH),
.CORE_OUT_REG (CORE_RSP_REG_DISABLE ? 0 : 1),
.MEM_OUT_REG (MEM_REQ_REG_DISABLE ? 0 : 1)
.CORE_OUT_REG (CORE_RSP_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)),
.MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(MEM_OUT_BUF))
) bank (
.clk (clk),
.reset (reset),
@ -481,7 +480,7 @@ module VX_cache import VX_gpu_pkg::*; #(
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE (CORE_RSP_REG_DISABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.SIZE (CORE_RSP_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
@ -578,7 +577,7 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
.SIZE (MEM_REQ_REG_DISABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),

View file

@ -48,7 +48,7 @@ module VX_cache_bank #(
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_CYCLIC,
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -353,9 +353,11 @@ module VX_cache_bank #(
.clk (clk),
.reset (reset),
.stall (pipe_stall),
.hit_valid (do_lookup_st1 && is_hit_st1 && ~pipe_stall),
.hit_line (line_idx_st1),
.hit_way (way_idx_st1),
.init (do_init_st0),
.lookup_valid(do_lookup_st1 && ~pipe_stall),
.lookup_hit (is_hit_st1),
.lookup_line(line_idx_st1),
.lookup_way (way_idx_st1),
.repl_valid (do_fill_st0 && ~pipe_stall),
.repl_line (line_idx_st0),
.repl_way (victim_way_st0)
@ -443,7 +445,6 @@ module VX_cache_bank #(
) cache_data (
.clk (clk),
.reset (reset),
.stall (pipe_stall),
// inputs
.init (do_init_st0),
.fill (do_fill_st0 && ~pipe_stall),

View file

@ -56,7 +56,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_CYCLIC,
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,

View file

@ -33,7 +33,6 @@ module VX_cache_data #(
) (
input wire clk,
input wire reset,
input wire stall,
// inputs
input wire init,
input wire fill,
@ -53,7 +52,6 @@ module VX_cache_data #(
output wire [LINE_SIZE-1:0] evict_byteen
);
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (stall)
wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_mask;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin : g_write_mask

View file

@ -73,7 +73,7 @@
///////////////////////////////////////////////////////////////////////////////
`define CS_REPL_RANDOM 0
`define CS_REPL_CYCLIC 1
`define CS_REPL_FIFO 1
`define CS_REPL_PLRU 2
`endif // VX_CACHE_DEFINE_VH

View file

@ -90,19 +90,23 @@ module VX_cache_repl #(
// Number of associative ways
parameter NUM_WAYS = 1,
// replacement policy
parameter REPL_POLICY = `CS_REPL_CYCLIC
parameter REPL_POLICY = `CS_REPL_FIFO
) (
input wire clk,
input wire reset,
input wire stall,
input wire hit_valid,
input wire [`CS_LINE_SEL_BITS-1:0] hit_line,
input wire [`CS_WAY_SEL_WIDTH-1:0] hit_way,
input wire init,
input wire lookup_valid,
input wire lookup_hit,
input wire [`CS_LINE_SEL_BITS-1:0] lookup_line,
input wire [`CS_WAY_SEL_WIDTH-1:0] lookup_way,
input wire repl_valid,
input wire [`CS_LINE_SEL_BITS-1:0] repl_line,
output wire [`CS_WAY_SEL_WIDTH-1:0] repl_way
);
localparam WAY_SEL_WIDTH = `CS_WAY_SEL_WIDTH;
`UNUSED_VAR (reset)
`UNUSED_VAR (init)
`UNUSED_VAR (stall)
if (NUM_WAYS > 1) begin : g_enable
@ -122,20 +126,20 @@ module VX_cache_repl #(
.RADDR_REG (1)
) plru_store (
.clk (clk),
.reset (reset),
.reset (1'b0),
.read (repl_valid),
.write (hit_valid),
.wren (plru_wmask),
.waddr (hit_line),
.write (init || (lookup_valid && lookup_hit)),
.wren (init ? '1 : plru_wmask),
.waddr (lookup_line),
.raddr (repl_line),
.wdata (plru_wdata),
.wdata (init ? '0 : plru_wdata),
.rdata (plru_rdata)
);
plru_decoder #(
.NUM_WAYS (NUM_WAYS)
) plru_dec (
.way_idx (hit_way),
.way_idx (lookup_way),
.lru_data (plru_wdata),
.lru_mask (plru_wmask)
);
@ -147,37 +151,39 @@ module VX_cache_repl #(
.way_idx (repl_way)
);
end else if (REPL_POLICY == `CS_REPL_CYCLIC) begin : g_cyclic
// Cyclic replacement policy
`UNUSED_VAR (hit_valid)
`UNUSED_VAR (hit_line)
`UNUSED_VAR (hit_way)
end else if (REPL_POLICY == `CS_REPL_FIFO) begin : g_fifo
// FIFO replacement policy
`UNUSED_VAR (lookup_valid)
`UNUSED_VAR (lookup_hit)
`UNUSED_VAR (lookup_line)
`UNUSED_VAR (lookup_way)
wire [WAY_SEL_WIDTH-1:0] ctr_rdata;
wire [WAY_SEL_WIDTH-1:0] ctr_wdata = ctr_rdata + 1;
wire [WAY_SEL_WIDTH-1:0] fifo_rdata;
wire [WAY_SEL_WIDTH-1:0] fifo_wdata = fifo_rdata + 1;
VX_sp_ram #(
.DATAW (WAY_SEL_WIDTH),
.SIZE (`CS_LINES_PER_BANK),
.RDW_MODE ("R"),
.RADDR_REG (1)
) ctr_store (
) fifo_store (
.clk (clk),
.reset (reset),
.reset (1'b0),
.read (repl_valid),
.write (repl_valid),
.write (init || repl_valid),
.wren (1'b1),
.addr (repl_line),
.wdata (ctr_wdata),
.rdata (ctr_rdata)
.wdata (init ? '0 : fifo_wdata),
.rdata (fifo_rdata)
);
assign repl_way = ctr_rdata;
assign repl_way = fifo_rdata;
end else begin : g_random
// Random replacement policy
`UNUSED_VAR (hit_valid)
`UNUSED_VAR (hit_line)
`UNUSED_VAR (hit_way)
`UNUSED_VAR (lookup_valid)
`UNUSED_VAR (lookup_hit)
`UNUSED_VAR (lookup_line)
`UNUSED_VAR (lookup_way)
`UNUSED_VAR (repl_valid)
`UNUSED_VAR (repl_line)
reg [WAY_SEL_WIDTH-1:0] victim_idx;
@ -192,10 +198,10 @@ module VX_cache_repl #(
end
end else begin : g_disable
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (hit_valid)
`UNUSED_VAR (hit_line)
`UNUSED_VAR (hit_way)
`UNUSED_VAR (lookup_valid)
`UNUSED_VAR (lookup_hit)
`UNUSED_VAR (lookup_line)
`UNUSED_VAR (lookup_way)
`UNUSED_VAR (repl_valid)
`UNUSED_VAR (repl_line)
assign repl_way = 1'b0;

View file

@ -153,7 +153,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
end
VX_cache #(
VX_cache_wrap #(
.INSTANCE_ID (INSTANCE_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),

View file

@ -54,7 +54,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_CYCLIC,
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -210,7 +210,59 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
end
`ifdef PERF_ENABLE
assign cache_perf = '0;
// Performance counters computed directly from the core-side and memory-side
// bus handshakes. Only reads, writes, mem_stalls and crsp_stalls are counted
// here; the remaining cache_perf fields are tied to zero below.
// Per-port event flags for the current cycle.
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
// NOTE: despite its label, this generate loop derives the read and write
// flags as well as the core-response stall flag for each core port.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
// accepted request (valid && ready) that is a read (rw == 0)
assign perf_core_reads_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && ~core_bus_if[i].req_data.rw;
// accepted request (valid && ready) that is a write (rw == 1)
assign perf_core_writes_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && core_bus_if[i].req_data.rw;
// response is valid but the core side is not ready to take it
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
// memory request is valid but the memory side is not ready to take it
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
end
// per cycle: core reads, core writes, core response stalls, memory request stalls
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
// population count of each flag vector gives the number of events this cycle
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
// running totals, PERF_CTR_BITS wide
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
// accumulate the per-cycle counts; all totals are cleared on reset
always @(posedge clk) begin
if (reset) begin
perf_core_reads <= '0;
perf_core_writes <= '0;
perf_mem_stalls <= '0;
perf_crsp_stalls <= '0;
end else begin
// per-cycle counts are cast up to the counter width before adding
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
end
end
// export the totals; miss, bank-stall and MSHR-stall counters are not tracked
// in this block and are reported as zero
assign cache_perf.reads = perf_core_reads;
assign cache_perf.writes = perf_core_writes;
assign cache_perf.read_misses = '0;
assign cache_perf.write_misses = '0;
assign cache_perf.bank_stalls = '0;
assign cache_perf.mshr_stalls = '0;
assign cache_perf.mem_stalls = perf_mem_stalls;
assign cache_perf.crsp_stalls = perf_crsp_stalls;
`endif
end
@ -220,13 +272,13 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
if (core_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.tag.uuid))
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.tag.uuid))
end
end
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, i, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
end
end
end

View file

@ -101,7 +101,7 @@ module VX_commit import VX_gpu_pkg::*; #(
.data_out ({commit_fire_any_r, commit_size_r})
);
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (COMMIT_SIZEW),
.DATAW_OUT (COMMIT_ALL_SIZEW),
.N (`ISSUE_WIDTH),

View file

@ -28,7 +28,7 @@ module VX_core import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
input sysmem_perf_t sysmem_perf,
`endif
VX_dcr_bus_if.slave dcr_bus_if,
@ -65,14 +65,15 @@ module VX_core import VX_gpu_pkg::*; #(
) lsu_mem_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
VX_pipeline_perf_if pipeline_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
// Counters produced inside this core: local-memory and coalescer counters are
// driven by the memory unit instance; pipeline_perf is filled in by the
// schedule/issue stages and the assigns further down in this module.
lmem_perf_t lmem_perf;
coalescer_perf_t coalescer_perf;
pipeline_perf_t pipeline_perf;
// Copy of the incoming sysmem_perf with the core-local fields replaced; this is
// what gets forwarded to the execute stage.
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
// start from the counters received on the sysmem_perf input ...
sysmem_perf_tmp = sysmem_perf;
// ... then override the two fields that originate in this core
// (must come after the whole-struct copy above)
sysmem_perf_tmp.lmem = lmem_perf;
sysmem_perf_tmp.coalescer = coalescer_perf;
end
`endif
base_dcrs_t base_dcrs;
@ -94,7 +95,7 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.sched_perf (pipeline_perf_if.sched),
.sched_perf (pipeline_perf.sched),
`endif
.base_dcrs (base_dcrs),
@ -144,7 +145,7 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.issue_perf (pipeline_perf_if.issue),
.issue_perf (pipeline_perf.issue),
`endif
.decode_if (decode_if),
@ -162,8 +163,8 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if),
.sysmem_perf (sysmem_perf_tmp),
.pipeline_perf (pipeline_perf),
`endif
.base_dcrs (base_dcrs),
@ -200,7 +201,8 @@ module VX_core import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.lmem_perf (mem_perf_tmp_if.lmem),
.lmem_perf (lmem_perf),
.coalescer_perf(coalescer_perf),
`endif
.lsu_mem_if (lsu_mem_if),
.dcache_bus_if (dcache_bus_if)
@ -276,12 +278,11 @@ module VX_core import VX_gpu_pkg::*; #(
end
end
assign pipeline_perf_if.ifetches = perf_ifetches;
assign pipeline_perf_if.loads = perf_loads;
assign pipeline_perf_if.stores = perf_stores;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf.ifetches = perf_ifetches;
assign pipeline_perf.loads = perf_loads;
assign pipeline_perf.stores = perf_stores;
assign pipeline_perf.ifetch_latency = perf_icache_lat;
assign pipeline_perf.load_latency = perf_dcache_lat;
`endif

View file

@ -127,13 +127,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_rsp_ready = icache_bus_if.rsp_ready;
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0;
assign mem_perf_if.lmem = '0;
assign mem_perf_if.mem = '0;
// System-memory performance counters handed to the core instance through its
// .sysmem_perf port. The signal must be named sysmem_perf because that is the
// identifier used in the port connection; declaring it under any other name
// leaves the port bound to an undeclared signal when PERF_ENABLE is defined.
// Every field is tied to zero here, including the coalescer field that the
// core's CSR logic also reads, so no counter is left undriven.
sysmem_perf_t sysmem_perf;
assign sysmem_perf.icache = '0;
assign sysmem_perf.dcache = '0;
assign sysmem_perf.l2cache = '0;
assign sysmem_perf.l3cache = '0;
assign sysmem_perf.lmem = '0;
assign sysmem_perf.mem = '0;
assign sysmem_perf.coalescer = '0;
`endif
`ifdef SCOPE
@ -152,7 +152,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.sysmem_perf (sysmem_perf),
`endif
.dcr_bus_if (dcr_bus_if),

View file

@ -41,8 +41,8 @@ import VX_fpu_pkg::*;
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
VX_commit_csr_if.slave commit_csr_if,
@ -212,65 +212,67 @@ import VX_fpu_pkg::*;
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf_if.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf_if.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf_if.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf_if.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf_if.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_FPU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]);
`else
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
`endif
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf_if.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf_if.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf_if.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf_if.load_latency);
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf.load_latency);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, mem_perf_if.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, mem_perf_if.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, mem_perf_if.icache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
// PERF: dcache
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, mem_perf_if.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, mem_perf_if.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, mem_perf_if.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, mem_perf_if.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, mem_perf_if.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, mem_perf_if.dcache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls);
// PERF: lmem
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, mem_perf_if.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, mem_perf_if.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, mem_perf_if.lmem.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
// PERF: l2cache
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, mem_perf_if.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, mem_perf_if.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, mem_perf_if.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, mem_perf_if.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l2cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls);
// PERF: l3cache
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, mem_perf_if.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, mem_perf_if.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, mem_perf_if.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, mem_perf_if.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l3cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, mem_perf_if.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, mem_perf_if.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, mem_perf_if.mem.latency);
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency);
// PERF: coalescer
`CSR_READ_64(`VX_CSR_MPM_COALESCER_MISS, read_data_ro_w, sysmem_perf.coalescer.misses);
default:;
endcase
end
@ -290,8 +292,8 @@ import VX_fpu_pkg::*;
`RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE
`UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.lmem);
`UNUSED_VAR (sysmem_perf.icache);
`UNUSED_VAR (sysmem_perf.lmem);
`endif
endmodule

View file

@ -24,8 +24,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
`ifdef EXT_F_ENABLE
@ -82,8 +82,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.base_dcrs (base_dcrs),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
`endif
.commit_csr_if (commit_csr_if),

View file

@ -23,8 +23,8 @@ module VX_execute import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
input base_dcrs_t base_dcrs,
@ -93,8 +93,8 @@ module VX_execute import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if (pipeline_perf_if),
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
`endif
.base_dcrs (base_dcrs),
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),

View file

@ -53,7 +53,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
VX_dp_ram #(
.DATAW (`PC_BITS + `NUM_THREADS),
.SIZE (`NUM_WARPS),
.RDW_MODE ("R")
.RDW_MODE ("R"),
.LUTRAM (1)
) tag_store (
.clk (clk),
.reset (reset),
@ -137,6 +138,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 1, 6, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +

View file

@ -93,6 +93,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
`SCOPE_IO_SWITCH (1);
wire decode_fire = decode_if.valid && decode_if.ready;
wire operands_fire = operands_if.valid && operands_if.ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 2, 4, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 +

View file

@ -535,6 +535,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`ifdef SCOPE
`ifdef DBG_SCOPE_LSU
`SCOPE_IO_SWITCH (1);
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 3, 4, 2, (
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH

View file

@ -20,7 +20,8 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
output cache_perf_t lmem_perf,
output lmem_perf_t lmem_perf,
output coalescer_perf_t coalescer_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS],
@ -39,7 +40,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
VX_lsu_mem_if #(
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
@ -60,46 +61,58 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
);
end
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_arb_if[1]();
VX_lsu_mem_arb #(
.NUM_INPUTS (`NUM_LSU_BLOCKS),
.NUM_OUTPUTS(1),
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(2)
) lmem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (lsu_lmem_if),
.bus_out_if (lmem_arb_if)
);
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS]();
.TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_adapt_if[`NUM_LSU_LANES]();
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_adapters
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (2)
) lmem_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (lsu_lmem_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_lmem_bus_if
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
end
end
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LMEM_TAG_WIDTH),
.TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (0)
) lmem_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (lmem_arb_if[0]),
.mem_bus_if (lmem_adapt_if)
);
VX_local_mem #(
.INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (LSU_NUM_REQS),
.NUM_REQS (`NUM_LSU_LANES),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_WIDTH (LMEM_TAG_WIDTH),
.OUT_BUF (3)
) local_mem (
.clk (clk),
@ -107,7 +120,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.lmem_perf (lmem_perf),
`endif
.mem_bus_if (lmem_bus_if)
.mem_bus_if (lmem_adapt_if)
);
`else
@ -115,6 +128,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
assign lmem_perf = '0;
`endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if
`ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
@ -127,6 +141,21 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
// One miss counter per LSU block. Each entry is driven by that block's
// coalescer instance, or tied to zero in the branch that instantiates none.
wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses;
wire [`PERF_CTR_BITS-1:0] coalescer_misses;
// Add the per-block counters together into a single total.
// NOTE(review): DATAW_OUT equals DATAW_IN, so the sum presumably wraps at
// PERF_CTR_BITS — confirm against VX_reduce_tree.
VX_reduce_tree #(
.DATAW_IN (`PERF_CTR_BITS),
.DATAW_OUT (`PERF_CTR_BITS),
.N (`NUM_LSU_BLOCKS),
.OP ("+")
) coalescer_reduce (
.data_in (per_block_coalescer_misses),
.data_out (coalescer_misses)
);
// Pass the total through a one-stage pipeline register before driving the
// coalescer_perf output.
`BUFFER(coalescer_perf.misses, coalescer_misses);
`endif
if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers
@ -139,11 +168,18 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
.QUEUE_SIZE (`LSUQ_OUT_SIZE),
.PERF_CTR_BITS (`PERF_CTR_BITS)
) mem_coalescer (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.misses (per_block_coalescer_misses[i]),
`else
`UNUSED_PIN (misses),
`endif
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
@ -186,6 +222,9 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if
`ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
`ifdef PERF_ENABLE
assign per_block_coalescer_misses[i] = '0;
`endif
end
end

View file

@ -106,7 +106,6 @@ module VX_operands import VX_gpu_pkg::*; #(
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (0) // no output buffering
) req_xbar (
.clk (clk),
@ -271,7 +270,7 @@ module VX_operands import VX_gpu_pkg::*; #(
.RESET_RAM (1),
`endif
.OUT_REG (1),
.RDW_MODE ("U")
.RDW_MODE ("R")
) gpr_ram (
.clk (clk),
.reset (reset),

View file

@ -68,8 +68,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] cycles;
reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;
wire schedule_fire = schedule_valid && schedule_ready;
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
@ -113,6 +111,16 @@ module VX_schedule import VX_gpu_pkg::*; #(
barrier_stalls_n= barrier_stalls;
warp_pcs_n = warp_pcs;
// decode unlock
if (decode_sched_if.valid && decode_sched_if.unlock) begin
stalled_warps_n[decode_sched_if.wid] = 0;
end
// CSR unlock
if (sched_csr_if.unlock_warp) begin
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
end
// wspawn handling
if (wspawn.valid && is_single_warp) begin
active_warps_n |= wspawn.wmask;
@ -170,6 +178,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
end
`ifdef GBAR_ENABLE
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_data.id)) begin
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
@ -188,16 +197,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
end
// decode unlock
if (decode_sched_if.valid && decode_sched_if.unlock) begin
stalled_warps_n[decode_sched_if.wid] = 0;
end
// CSR unlock
if (sched_csr_if.unlock_warp) begin
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
end
// stall the warp until decode stage
if (schedule_fire) begin
stalled_warps_n[schedule_wid] = 1;
@ -223,7 +222,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
active_warps <= '0;
thread_masks <= '0;
barrier_stalls <= '0;
issued_instrs <= '0;
cycles <= '0;
wspawn.valid <= 0;
@ -268,10 +266,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
`endif
if (schedule_if_fire) begin
issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
end
if (busy) begin
cycles <= cycles + 1;
end

View file

@ -44,7 +44,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (`NUM_EX_UNITS),
.N (PER_ISSUE_WARPS),
.OP ("|")
@ -53,7 +53,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
.data_out (perf_units_per_cycle)
);
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (PER_ISSUE_WARPS),
.OP ("|")
@ -151,11 +151,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`endif
always @(*) begin
for (integer i = 0; i < NUM_OPDS; ++i) begin
for (genvar i = 0; i < NUM_OPDS; ++i) begin : g_operands_busy_n
always @(*) begin
operands_busy_n[i] = operands_busy[i];
if (ibuffer_fire) begin
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 1;
end
end
if (writeback_fire) begin
if (ibuffer_fire) begin
@ -168,9 +171,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
end
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 1;
end
end
end
@ -185,8 +185,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inuse_regs[staging_if[w].data.rd] <= 1;
end
end
operands_busy <= operands_busy_n;
operands_ready[w] <= ~(| operands_busy_n);
`ifdef PERF_ENABLE
if (staging_fire && staging_if[w].data.wb) begin
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;

View file

@ -21,8 +21,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
input base_dcrs_t base_dcrs,
@ -121,8 +121,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.execute_if (pe_execute_if[PE_IDX_CSRS]),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
`endif
`ifdef EXT_F_ENABLE

View file

@ -1,46 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Bundle of pipeline performance counters: the scheduler and issue counter
// structs plus five PERF_CTR_BITS-wide counters for fetch/load/store activity.
// The 'master' modport drives every member; the 'slave' modport only reads them.
interface VX_pipeline_perf_if import VX_gpu_pkg::*; ();

    // structured counters (types come from VX_gpu_pkg)
    sched_perf_t sched;
    issue_perf_t issue;

    // request counts
    wire [`PERF_CTR_BITS-1:0] ifetches, loads, stores;

    // latency counters
    wire [`PERF_CTR_BITS-1:0] ifetch_latency, load_latency;

    // producer view
    modport master (
        output sched, issue,
        output ifetches, loads, stores,
        output ifetch_latency, load_latency
    );

    // consumer view
    modport slave (
        input sched, issue,
        input ifetches, loads, stores,
        input ifetch_latency, load_latency
    );

endinterface

View file

@ -1,27 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Carries a single PERF_CTR_BITS-wide counter, wctl_stalls, from the side that
// drives it ('master' modport) to the side that reads it ('slave' modport).
interface VX_sfu_perf_if ();

    wire [`PERF_CTR_BITS-1:0] wctl_stalls;

    // producer view
    modport master (output wctl_stalls);

    // consumer view
    modport slave (input wctl_stalls);

endinterface

View file

@ -31,10 +31,10 @@
`RAM_INITIALIZATION \
reg [ADDRW-1:0] raddr_r; \
always @(posedge clk) begin \
if (__re || __we) begin \
if (__we) begin \
ram[__wa] <= wdata; \
end \
if (__we) begin \
ram[__wa] <= wdata; \
end \
if (__re) begin \
raddr_r <= __ra; \
end \
end \
@ -45,14 +45,14 @@
`RAM_INITIALIZATION \
reg [ADDRW-1:0] raddr_r; \
always @(posedge clk) begin \
if (__re || __we) begin \
if (__we) begin \
for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
end \
if (__we) begin \
for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
end \
end \
end \
if (__re) begin \
raddr_r <= __ra; \
end \
end \
@ -63,10 +63,10 @@
`RAM_INITIALIZATION \
reg [DATAW-1:0] rdata_r; \
always @(posedge clk) begin \
if (__re || __we) begin \
if (__we) begin \
ram[__wa] <= wdata; \
end \
if (__we) begin \
ram[__wa] <= wdata; \
end \
if (__re) begin \
rdata_r <= ram[__ra]; \
end \
end \
@ -77,14 +77,14 @@
`RAM_INITIALIZATION \
reg [DATAW-1:0] rdata_r; \
always @(posedge clk) begin \
if (__re || __we) begin \
if (__we) begin \
for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
end \
if (__we) begin \
for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
end \
end \
end \
if (__re) begin \
rdata_r <= ram[__ra]; \
end \
end \
@ -122,6 +122,7 @@ module VX_async_ram_patch #(
parameter DUAL_PORT = 0,
parameter FORCE_BRAM = 0,
parameter RADDR_REG = 0, // read address registered hint
parameter RADDR_RESET = 0, // read address has reset
parameter WRITE_FIRST = 0,
parameter INIT_ENABLE = 0,
parameter INIT_FILE = "",
@ -143,16 +144,24 @@ module VX_async_ram_patch #(
`UNUSED_VAR (reset)
(* keep = "true" *) wire [ADDRW-1:0] raddr_w, raddr_s;
(* keep = "true" *) wire read_s, is_raddr_reg;
(* keep = "true" *) wire read_s;
assign raddr_w = raddr;
// Extra bit fed into the placeholder alongside the read address.
// With RADDR_RESET set, the bit is routed through a dedicated wire marked
// keep="true" (tied to 0); otherwise it is a plain constant 0.
// NOTE(review): the kept wire presumably exists so a later tool pass can find
// it by name — confirm against the synthesis patch flow.
wire raddr_reset_w;
if (RADDR_RESET) begin : g_raddr_reset
(* keep = "true" *) wire raddr_reset;
assign raddr_reset = 0;
assign raddr_reset_w = raddr_reset;
end else begin : g_no_raddr_reset
assign raddr_reset_w = 0;
end
VX_placeholder #(
.I (ADDRW),
.O (ADDRW + 1 + 1)
) placeholder (
.in (raddr_w),
.out ({raddr_s, read_s, is_raddr_reg})
.I (ADDRW + 1),
.O (ADDRW + 1)
) placeholder1 (
.in ({raddr_w, raddr_reset_w}),
.out ({raddr_s, read_s})
);
wire [DATAW-1:0] rdata_s;
@ -206,9 +215,15 @@ module VX_async_ram_patch #(
end
if (RADDR_REG) begin : g_raddr_reg
`UNUSED_VAR (is_raddr_reg)
assign rdata = rdata_s;
end else begin : g_async_ram
(* keep = "true" *) wire is_raddr_reg;
VX_placeholder #(
.O (1)
) placeholder2 (
.in (),
.out (is_raddr_reg)
);
wire [DATAW-1:0] rdata_a;
if (DUAL_PORT) begin : g_dp
if (WRENW != 1) begin : g_wren

View file

@ -23,10 +23,10 @@ module VX_axi_adapter #(
parameter NUM_PORTS_IN = 1,
parameter NUM_BANKS_OUT = 1,
parameter INTERLEAVE = 0,
parameter TAG_BUFFER_SIZE= 32,
parameter TAG_BUFFER_SIZE= 16,
parameter ARBITER = "R",
parameter REQ_OUT_BUF = 1,
parameter RSP_OUT_BUF = 1,
parameter REQ_OUT_BUF = 0,
parameter RSP_OUT_BUF = 0,
parameter DATA_SIZE = DATA_WIDTH/8
) (
input wire clk,
@ -99,7 +99,7 @@ module VX_axi_adapter #(
localparam LOG2_DATA_SIZE = `CLOG2(DATA_SIZE);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS_OUT);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam DST_ADDR_WDITH = (ADDR_WIDTH_OUT - LOG2_DATA_SIZE) + BANK_SEL_BITS; // convert output addresss to byte-addressable input space
localparam DST_ADDR_WDITH = (ADDR_WIDTH_OUT - LOG2_DATA_SIZE) + BANK_SEL_BITS; // convert byte-addressable output address to block-addressable input space
localparam BANK_ADDR_WIDTH = DST_ADDR_WDITH - BANK_SEL_BITS;
localparam NUM_PORTS_IN_BITS = `CLOG2(NUM_PORTS_IN);
localparam NUM_PORTS_IN_WIDTH = `UP(NUM_PORTS_IN_BITS);
@ -109,8 +109,8 @@ module VX_axi_adapter #(
localparam READ_FULL_TAG_WIDTH = READ_TAG_WIDTH + NUM_PORTS_IN_BITS;
localparam WRITE_TAG_WIDTH = `MIN(TAG_WIDTH_IN, TAG_WIDTH_OUT);
localparam DST_TAG_WIDTH = `MAX(READ_FULL_TAG_WIDTH, WRITE_TAG_WIDTH);
localparam ARB_TAG_WIDTH = `MAX(READ_TAG_WIDTH, WRITE_TAG_WIDTH);
localparam ARB_DATAW = 1 + BANK_ADDR_WIDTH + DATA_SIZE + DATA_WIDTH + ARB_TAG_WIDTH;
localparam XBAR_TAG_WIDTH = `MAX(READ_TAG_WIDTH, WRITE_TAG_WIDTH);
localparam REQ_XBAR_DATAW = 1 + BANK_ADDR_WIDTH + DATA_SIZE + DATA_WIDTH + XBAR_TAG_WIDTH;
localparam RSP_XBAR_DATAW = DATA_WIDTH + READ_TAG_WIDTH;
`STATIC_ASSERT ((DST_ADDR_WDITH >= ADDR_WIDTH_IN), ("invalid address width: current=%0d, expected=%0d", DST_ADDR_WDITH, ADDR_WIDTH_IN))
@ -174,117 +174,85 @@ module VX_axi_adapter #(
end
end
// Request ack
// AXI request handling
wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN-1:0] arb_ready_in;
wire [NUM_PORTS_IN-1:0] req_xbar_valid_in;
wire [NUM_PORTS_IN-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_in;
wire [NUM_PORTS_IN-1:0] req_xbar_ready_in;
if (NUM_PORTS_IN > 1) begin : g_multi_inputs
wire [NUM_PORTS_IN-1:0][NUM_BANKS_OUT-1:0] arb_ready_in_w;
VX_transpose #(
.N (NUM_BANKS_OUT),
.M (NUM_PORTS_IN)
) rdy_in_transpose (
.data_in (arb_ready_in),
.data_out (arb_ready_in_w)
);
for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_ready_in
assign mem_req_ready[i] = | arb_ready_in_w[i];
end
end else begin : g_single_input
assign mem_req_ready[0] = arb_ready_in[req_bank_sel[0]][0];
wire [NUM_BANKS_OUT-1:0] req_xbar_valid_out;
wire [NUM_BANKS_OUT-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_out;
wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN_WIDTH-1:0] req_xbar_sel_out;
wire [NUM_BANKS_OUT-1:0] req_xbar_ready_out;
// Build the crossbar input for every input port.
// A write request uses the caller's tag directly; a read request uses
// mem_rd_req_tag and is only presented once mem_rd_req_tag_ready is asserted.
for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_req_xbar_data_in
// writes are always tag-ready; reads wait on mem_rd_req_tag_ready
wire tag_ready = mem_req_rw[i] || mem_rd_req_tag_ready[i];
// both tag sources are cast to the common crossbar tag width
wire [XBAR_TAG_WIDTH-1:0] tag_value = mem_req_rw[i] ? XBAR_TAG_WIDTH'(mem_req_tag[i]) : XBAR_TAG_WIDTH'(mem_rd_req_tag[i]);
assign req_xbar_valid_in[i] = mem_req_valid[i] && tag_ready;
// packing order: {rw, bank address, byte enables, data, tag}
assign req_xbar_data_in[i] = {mem_req_rw[i], req_bank_addr[i], mem_req_byteen[i], mem_req_data[i], tag_value};
// the request is accepted only when the crossbar is ready and the tag is ready
assign mem_req_ready[i] = req_xbar_ready_in[i] && tag_ready;
end
// AXI write request synchronization
VX_stream_xbar #(
.NUM_INPUTS (NUM_PORTS_IN),
.NUM_OUTPUTS(NUM_BANKS_OUT),
.DATAW (REQ_XBAR_DATAW),
.ARBITER (ARBITER),
.OUT_BUF (REQ_OUT_BUF)
) req_xbar (
.clk (clk),
.reset (reset),
.sel_in (req_bank_sel),
.valid_in (req_xbar_valid_in),
.data_in (req_xbar_data_in),
.ready_in (req_xbar_ready_in),
.valid_out (req_xbar_valid_out),
.data_out (req_xbar_data_out),
.ready_out (req_xbar_ready_out),
.sel_out (req_xbar_sel_out),
`UNUSED_PIN (collisions)
);
wire [NUM_BANKS_OUT-1:0] m_axi_awvalid_w, m_axi_wvalid_w;
wire [NUM_BANKS_OUT-1:0] m_axi_awready_w, m_axi_wready_w;
reg [NUM_BANKS_OUT-1:0] m_axi_aw_ack, m_axi_w_ack, axi_write_ready;
for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_axi_reqs
wire xbar_rw_out;
wire [BANK_ADDR_WIDTH-1:0] xbar_addr_out;
wire [XBAR_TAG_WIDTH-1:0] xbar_tag_out;
wire [DATA_WIDTH-1:0] xbar_data_out;
wire [DATA_SIZE-1:0] xbar_byteen_out;
assign {
xbar_rw_out,
xbar_addr_out,
xbar_byteen_out,
xbar_data_out,
xbar_tag_out
} = req_xbar_data_out[i];
// AXI request handshake
wire m_axi_aw_ack, m_axi_w_ack, axi_write_ready;
for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_axi_write_ready
VX_axi_write_ack axi_write_ack (
.clk (clk),
.reset (reset),
.awvalid(m_axi_awvalid_w[i]),
.awready(m_axi_awready_w[i]),
.wvalid (m_axi_wvalid_w[i]),
.wready (m_axi_wready_w[i]),
.aw_ack (m_axi_aw_ack[i]),
.w_ack (m_axi_w_ack[i]),
.tx_rdy (axi_write_ready[i]),
.awvalid(m_axi_awvalid[i]),
.awready(m_axi_awready[i]),
.wvalid (m_axi_wvalid[i]),
.wready (m_axi_wready[i]),
.aw_ack (m_axi_aw_ack),
.w_ack (m_axi_w_ack),
.tx_rdy (axi_write_ready),
`UNUSED_PIN (tx_ack)
);
end
// AXI request handling
for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_axi_write_req
wire [BANK_ADDR_WIDTH-1:0] arb_addr_out, buf_addr_r_out, buf_addr_w_out;
wire [ARB_TAG_WIDTH-1:0] arb_tag_out;
wire [WRITE_TAG_WIDTH-1:0] buf_tag_w_out;
wire [READ_FULL_TAG_WIDTH-1:0] arb_tag_r_out, buf_tag_r_out;
wire [NUM_PORTS_IN_WIDTH-1:0] arb_sel_out;
wire [DATA_WIDTH-1:0] arb_data_out;
wire [DATA_SIZE-1:0] arb_byteen_out;
wire arb_valid_out, arb_ready_out;
wire arb_rw_out;
wire [NUM_PORTS_IN-1:0][ARB_DATAW-1:0] arb_data_in;
wire [NUM_PORTS_IN-1:0] arb_valid_in;
for (genvar j = 0; j < NUM_PORTS_IN; ++j) begin : g_valid_in
wire tag_ready = mem_req_rw[j] || mem_rd_req_tag_ready[j];
assign arb_valid_in[j] = mem_req_valid[j] && tag_ready && (req_bank_sel[j] == i);
end
for (genvar j = 0; j < NUM_PORTS_IN; ++j) begin : g_data_in
wire [ARB_TAG_WIDTH-1:0] tag_value = mem_req_rw[j] ? ARB_TAG_WIDTH'(mem_req_tag[j]) : ARB_TAG_WIDTH'(mem_rd_req_tag[j]);
assign arb_data_in[j] = {mem_req_rw[j], req_bank_addr[j], mem_req_byteen[j], mem_req_data[j], tag_value};
end
VX_stream_arb #(
.NUM_INPUTS (NUM_PORTS_IN),
.NUM_OUTPUTS(1),
.DATAW (ARB_DATAW),
.ARBITER (ARBITER)
) aw_arb (
.clk (clk),
.reset (reset),
.valid_in (arb_valid_in),
.ready_in (arb_ready_in[i]),
.data_in (arb_data_in),
.data_out ({arb_rw_out, arb_addr_out, arb_byteen_out, arb_data_out, arb_tag_out}),
.valid_out (arb_valid_out),
.ready_out (arb_ready_out),
.sel_out (arb_sel_out)
);
wire m_axi_arready_w;
assign arb_ready_out = axi_write_ready[i] || m_axi_arready_w;
assign req_xbar_ready_out[i] = xbar_rw_out ? axi_write_ready : m_axi_arready[i];
// AXI write address channel
assign m_axi_awvalid_w[i] = arb_valid_out && arb_rw_out && ~m_axi_aw_ack[i];
VX_elastic_buffer #(
.DATAW (BANK_ADDR_WIDTH + WRITE_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)),
.LUTRAM (`TO_OUT_BUF_LUTRAM(REQ_OUT_BUF))
) aw_buf (
.clk (clk),
.reset (reset),
.valid_in (m_axi_awvalid_w[i]),
.ready_in (m_axi_awready_w[i]),
.data_in ({arb_addr_out, WRITE_TAG_WIDTH'(arb_tag_out)}),
.data_out ({buf_addr_w_out, buf_tag_w_out}),
.valid_out (m_axi_awvalid[i]),
.ready_out (m_axi_awready[i])
);
assign m_axi_awaddr[i] = ADDR_WIDTH_OUT'(buf_addr_w_out) << LOG2_DATA_SIZE;
assign m_axi_awid[i] = TAG_WIDTH_OUT'(buf_tag_w_out);
assign m_axi_awvalid[i] = req_xbar_valid_out[i] && xbar_rw_out && ~m_axi_aw_ack;
assign m_axi_awaddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE;
assign m_axi_awid[i] = TAG_WIDTH_OUT'(xbar_tag_out);
assign m_axi_awlen[i] = 8'b00000000;
assign m_axi_awsize[i] = 3'(LOG2_DATA_SIZE);
assign m_axi_awburst[i] = 2'b00;
@ -296,53 +264,24 @@ module VX_axi_adapter #(
// AXI write data channel
assign m_axi_wvalid_w[i] = arb_valid_out && arb_rw_out && ~m_axi_w_ack[i];
VX_elastic_buffer #(
.DATAW (DATA_SIZE + DATA_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)),
.LUTRAM (`TO_OUT_BUF_LUTRAM(REQ_OUT_BUF))
) w_buf (
.clk (clk),
.reset (reset),
.valid_in (m_axi_wvalid_w[i]),
.ready_in (m_axi_wready_w[i]),
.data_in ({arb_byteen_out, arb_data_out}),
.data_out ({m_axi_wstrb[i], m_axi_wdata[i]}),
.valid_out (m_axi_wvalid[i]),
.ready_out (m_axi_wready[i])
);
assign m_axi_wlast[i] = 1'b1;
assign m_axi_wvalid[i] = req_xbar_valid_out[i] && xbar_rw_out && ~m_axi_w_ack;
assign m_axi_wstrb[i] = xbar_byteen_out;
assign m_axi_wdata[i] = xbar_data_out;
assign m_axi_wlast[i] = 1'b1;
// AXI read address channel
if (NUM_PORTS_IN > 1) begin : g_input_sel
assign arb_tag_r_out = READ_FULL_TAG_WIDTH'({arb_tag_out, arb_sel_out});
wire [READ_FULL_TAG_WIDTH-1:0] xbar_tag_r_out;
if (NUM_PORTS_IN > 1) begin : g_xbar_tag_r_out
assign xbar_tag_r_out = READ_FULL_TAG_WIDTH'({xbar_tag_out, req_xbar_sel_out[i]});
end else begin : g_no_input_sel
`UNUSED_VAR (arb_sel_out)
assign arb_tag_r_out = READ_TAG_WIDTH'(arb_tag_out);
`UNUSED_VAR (req_xbar_sel_out)
assign xbar_tag_r_out = READ_TAG_WIDTH'(xbar_tag_out);
end
VX_elastic_buffer #(
.DATAW (BANK_ADDR_WIDTH + READ_FULL_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)),
.LUTRAM (`TO_OUT_BUF_LUTRAM(REQ_OUT_BUF))
) ar_buf (
.clk (clk),
.reset (reset),
.valid_in (arb_valid_out && ~arb_rw_out),
.ready_in (m_axi_arready_w),
.data_in ({arb_addr_out, arb_tag_r_out}),
.data_out ({buf_addr_r_out, buf_tag_r_out}),
.valid_out (m_axi_arvalid[i]),
.ready_out (m_axi_arready[i])
);
assign m_axi_araddr[i] = ADDR_WIDTH_OUT'(buf_addr_r_out) << LOG2_DATA_SIZE;
assign m_axi_arid[i] = TAG_WIDTH_OUT'(buf_tag_r_out);
assign m_axi_arvalid[i] = req_xbar_valid_out[i] && ~xbar_rw_out;
assign m_axi_araddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE;
assign m_axi_arid[i] = TAG_WIDTH_OUT'(xbar_tag_r_out);
assign m_axi_arlen[i] = 8'b00000000;
assign m_axi_arsize[i] = 3'(LOG2_DATA_SIZE);
assign m_axi_arburst[i] = 2'b00;

View file

@ -26,35 +26,35 @@ module VX_axi_write_ack (
output wire tx_ack,
output wire tx_rdy
);
reg awfired;
reg wfired;
reg aw_fired;
reg w_fired;
wire awfire = awvalid && awready;
wire wfire = wvalid && wready;
wire aw_fire = awvalid && awready;
wire w_fire = wvalid && wready;
always @(posedge clk) begin
if (reset) begin
awfired <= 0;
wfired <= 0;
aw_fired <= 0;
w_fired <= 0;
end else begin
if (awfire) begin
awfired <= 1;
if (aw_fire) begin
aw_fired <= 1;
end
if (wfire) begin
wfired <= 1;
if (w_fire) begin
w_fired <= 1;
end
if (tx_ack) begin
awfired <= 0;
wfired <= 0;
aw_fired <= 0;
w_fired <= 0;
end
end
end
assign aw_ack = awfired;
assign w_ack = wfired;
assign aw_ack = aw_fired;
assign w_ack = w_fired;
assign tx_ack = (awfire || awfired) && (wfire || wfired);
assign tx_rdy = (awready || awfired) && (wready || wfired);
assign tx_ack = (aw_fire || aw_fired) && (w_fire || w_fired);
assign tx_rdy = (awready || aw_fired) && (wready || w_fired);
endmodule
`TRACING_ON

View file

@ -26,18 +26,39 @@
end \
end
// RAM_RESET_BLOCK: optional reset prefix for the RAM write statements.
// With SIMULATION defined it expands to an `if (RESET_RAM && reset) ... else`
// that sets every one of the SIZE entries to INIT_VALUE; the expansion ends in
// a dangling `else`, so a statement must follow the macro at its use site.
// Without SIMULATION it expands to nothing.
`ifdef SIMULATION
`define RAM_RESET_BLOCK if (RESET_RAM && reset) begin \
for (integer i = 0; i < SIZE; ++i) begin \
ram[i] <= DATAW'(INIT_VALUE); \
end \
end else
`else
`define RAM_RESET_BLOCK
`endif
// RAM_WRITE_ALL: full-word write of wdata to ram[waddr] when `write` is set,
// preceded by the optional reset prefix above.
`define RAM_WRITE_ALL `RAM_RESET_BLOCK \
if (write) begin \
ram[waddr] <= wdata; \
end
`ifdef QUARTUS
`define RAM_ARRAY_WREN reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1];
`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[waddr][i] <= wdata[i * WSELW +: WSELW]; \
`define RAM_WRITE_WREN `RAM_RESET_BLOCK \
if (write) begin \
for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[waddr][i] <= wdata[i * WSELW +: WSELW]; \
end \
end \
end
`else
`define RAM_ARRAY_WREN reg [DATAW-1:0] ram [0:SIZE-1];
`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
`define RAM_WRITE_WREN `RAM_RESET_BLOCK \
if (write) begin \
for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
end \
end \
end
`endif
@ -49,8 +70,9 @@ module VX_dp_ram #(
parameter WRENW = 1,
parameter OUT_REG = 0,
parameter LUTRAM = 0,
parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first, U: undefined
parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first
parameter RADDR_REG = 0, // read address registered hint
parameter RADDR_RESET = 0, // read address has reset
parameter RDW_ASSERT = 0,
parameter RESET_RAM = 0,
parameter INIT_ENABLE = 0,
@ -71,13 +93,14 @@ module VX_dp_ram #(
localparam WSELW = DATAW / WRENW;
`UNUSED_PARAM (LUTRAM)
`UNUSED_PARAM (RADDR_REG)
`UNUSED_PARAM (RADDR_RESET)
`STATIC_ASSERT(!(WRENW * WSELW != DATAW), ("invalid parameter"))
`STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W" || RDW_MODE == "U"), ("invalid parameter"))
`STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W"), ("invalid parameter"))
`UNUSED_PARAM (RDW_ASSERT)
`ifdef SYNTHESIS
localparam FORCE_BRAM = !LUTRAM && (SIZE * DATAW >= `MAX_LUTRAM);
localparam FORCE_BRAM = !LUTRAM && `FORCE_BRAM(SIZE, DATAW);
if (OUT_REG) begin : g_sync
if (FORCE_BRAM) begin : g_bram
if (RDW_MODE == "W") begin : g_write_first
@ -86,10 +109,8 @@ module VX_dp_ram #(
`RAM_INITIALIZATION
reg [ADDRW-1:0] raddr_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
if (read) begin
raddr_r <= raddr;
end
end
@ -99,10 +120,8 @@ module VX_dp_ram #(
`RAM_INITIALIZATION
reg [ADDRW-1:0] raddr_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
ram[waddr] <= wdata;
end
`RAM_WRITE_ALL
if (read) begin
raddr_r <= raddr;
end
end
@ -114,37 +133,7 @@ module VX_dp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
end else begin : g_no_wren
`USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
ram[waddr] <= wdata;
end
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
end
end else if (RDW_MODE == "U") begin : g_undefined
if (WRENW != 1) begin : g_wren
`USE_BLOCK_BRAM `RAM_ARRAY_WREN
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
if (read) begin
rdata_r <= ram[raddr];
end
@ -155,9 +144,7 @@ module VX_dp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
`RAM_WRITE_ALL
if (read) begin
rdata_r <= ram[raddr];
end
@ -172,10 +159,8 @@ module VX_dp_ram #(
`RAM_INITIALIZATION
reg [ADDRW-1:0] raddr_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
if (read) begin
raddr_r <= raddr;
end
end
@ -185,10 +170,8 @@ module VX_dp_ram #(
`RAM_INITIALIZATION
reg [ADDRW-1:0] raddr_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
ram[waddr] <= wdata;
end
`RAM_WRITE_ALL
if (read) begin
raddr_r <= raddr;
end
end
@ -200,37 +183,7 @@ module VX_dp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
end else begin : g_no_wren
reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
ram[waddr] <= wdata;
end
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
end
end else if (RDW_MODE == "U") begin : g_undefined
if (WRENW != 1) begin : g_wren
`RAM_ARRAY_WREN
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
if (read) begin
rdata_r <= ram[raddr];
end
@ -241,9 +194,7 @@ module VX_dp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
`RAM_WRITE_ALL
if (read) begin
rdata_r <= ram[raddr];
end
@ -255,7 +206,7 @@ module VX_dp_ram #(
end else begin : g_async
`UNUSED_VAR (read)
if (FORCE_BRAM) begin : g_bram
`ifdef VIVADO
`ifdef ASYNC_BRAM_PATCH
VX_async_ram_patch #(
.DATAW (DATAW),
.SIZE (SIZE),
@ -263,6 +214,7 @@ module VX_dp_ram #(
.DUAL_PORT (1),
.FORCE_BRAM (FORCE_BRAM),
.RADDR_REG (RADDR_REG),
.RADDR_RESET(RADDR_RESET),
.WRITE_FIRST(RDW_MODE == "W"),
.INIT_ENABLE(INIT_ENABLE),
.INIT_FILE (INIT_FILE),
@ -284,18 +236,14 @@ module VX_dp_ram #(
`RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
end
assign rdata = ram[raddr];
end else begin : g_no_wren
`RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
`RAM_WRITE_ALL
end
assign rdata = ram[raddr];
end
@ -304,18 +252,14 @@ module VX_dp_ram #(
`NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
end
assign rdata = ram[raddr];
end else begin : g_no_wren
`NO_RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
`RAM_WRITE_ALL
end
assign rdata = ram[raddr];
end
@ -327,18 +271,14 @@ module VX_dp_ram #(
`RW_RAM_CHECK `RAM_ARRAY_WREN
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
end
assign rdata = ram[raddr];
end else begin : g_no_wren
`RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
`RAM_WRITE_ALL
end
assign rdata = ram[raddr];
end
@ -347,18 +287,14 @@ module VX_dp_ram #(
`NO_RW_RAM_CHECK `RAM_ARRAY_WREN
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
end
assign rdata = ram[raddr];
end else begin : g_no_wren
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
`RAM_WRITE_ALL
end
assign rdata = ram[raddr];
end
@ -371,37 +307,19 @@ module VX_dp_ram #(
`RAM_INITIALIZATION
always @(posedge clk) begin
if (RESET_RAM && reset) begin
for (integer i = 0; i < SIZE; ++i) begin
ram[i] <= DATAW'(INIT_VALUE);
end
end else if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i]) begin
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
end
`RAM_WRITE_WREN
end
if (OUT_REG) begin : g_sync
if (RDW_MODE == "W") begin : g_write_first
reg [ADDRW-1:0] raddr_r;
always @(posedge clk) begin
if (read || write) begin
if (read) begin
raddr_r <= raddr;
end
end
assign rdata = ram[raddr_r];
end else if (RDW_MODE == "R") begin : g_read_first
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
end else begin : g_undefined
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read) begin

View file

@ -95,7 +95,8 @@ module VX_fifo_queue #(
.SIZE (DEPTH),
.LUTRAM (LUTRAM),
.RDW_MODE ("W"),
.RADDR_REG (1)
.RADDR_REG (1),
.RADDR_RESET (1)
) dp_ram (
.clk (clk),
.reset (reset),

View file

@ -179,25 +179,31 @@ module VX_mem_bank_adapter #(
for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_req_xbar_data_out
wire rw_out;
wire [BANK_ADDR_WIDTH-1:0] addr_out;
wire [XBAR_TAG_WIDTH-1:0] tag_out;
wire [DATA_WIDTH-1:0] data_out;
wire [DATA_SIZE-1:0] byteen_out;
wire xbar_rw_out;
wire [BANK_ADDR_WIDTH-1:0] xbar_addr_out;
wire [XBAR_TAG_WIDTH-1:0] xbar_tag_out;
wire [DATA_WIDTH-1:0] xbar_data_out;
wire [DATA_SIZE-1:0] xbar_byteen_out;
assign {rw_out, addr_out, byteen_out, data_out, tag_out} = req_xbar_data_out[i];
assign {
xbar_rw_out,
xbar_addr_out,
xbar_byteen_out,
xbar_data_out,
xbar_tag_out
} = req_xbar_data_out[i];
assign mem_req_valid_out[i] = req_xbar_valid_out[i];
assign mem_req_rw_out[i] = rw_out;
assign mem_req_addr_out[i] = ADDR_WIDTH_OUT'(addr_out);
assign mem_req_byteen_out[i] = byteen_out;
assign mem_req_data_out[i] = data_out;
assign mem_req_rw_out[i] = xbar_rw_out;
assign mem_req_addr_out[i] = ADDR_WIDTH_OUT'(xbar_addr_out);
assign mem_req_byteen_out[i] = xbar_byteen_out;
assign mem_req_data_out[i] = xbar_data_out;
if (NUM_PORTS_IN > 1) begin : g_input_sel
assign mem_req_tag_out[i] = TAG_WIDTH_OUT'({tag_out, req_xbar_sel_out[i]});
assign mem_req_tag_out[i] = TAG_WIDTH_OUT'({xbar_tag_out, req_xbar_sel_out[i]});
end else begin : g_no_input_sel
`UNUSED_VAR (req_xbar_sel_out[i])
assign mem_req_tag_out[i] = TAG_WIDTH_OUT'(tag_out);
assign mem_req_tag_out[i] = TAG_WIDTH_OUT'(xbar_tag_out);
end
assign req_xbar_ready_out[i] = mem_req_ready_out[i];

View file

@ -24,6 +24,7 @@ module VX_mem_coalescer #(
parameter TAG_WIDTH = 8,
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
parameter QUEUE_SIZE = 8,
parameter PERF_CTR_BITS = `CLOG2(NUM_REQS+1),
parameter DATA_IN_WIDTH = DATA_IN_SIZE * 8,
parameter DATA_OUT_WIDTH= DATA_OUT_SIZE * 8,
@ -37,6 +38,8 @@ module VX_mem_coalescer #(
input wire clk,
input wire reset,
output wire [PERF_CTR_BITS-1:0] misses,
// Input request
input wire in_req_valid,
input wire in_req_rw,
@ -323,6 +326,23 @@ module VX_mem_coalescer #(
assign in_rsp_tag = {out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout_tag};
assign out_rsp_ready = in_rsp_ready;
// Count coalescing misses.
// A miss is a partial transfer (not fully coalesced): an output request that
// fires while req_rem_mask_r is not all ones.
// NOTE(review): misses_r is a running count that is only cleared by reset; with
// the default PERF_CTR_BITS = `CLOG2(NUM_REQS+1) it would wrap after a few
// events - presumably instantiations that use `misses` override the width; confirm.
reg [PERF_CTR_BITS-1:0] misses_r;
wire partial_transfer = (out_req_fire && req_rem_mask_r != '1);
always @(posedge clk) begin
if (reset) begin
misses_r <= '0;
end else begin
// add 1 on every cycle in which a partial transfer is issued
misses_r <= misses_r + PERF_CTR_BITS'(partial_transfer);
end
end
assign misses = misses_r;
`ifdef DBG_TRACE_MEM
wire [`UP(UUID_WIDTH)-1:0] out_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] out_rsp_uuid;

View file

@ -237,6 +237,8 @@ module VX_mem_scheduler #(
.clk (clk),
.reset (reset),
`UNUSED_PIN (misses),
// Input request
.in_req_valid (reqq_valid),
.in_req_mask (reqq_mask),

View file

@ -14,7 +14,7 @@
`include "VX_platform.vh"
`TRACING_OFF
module VX_reduce #(
module VX_reduce_tree #(
parameter DATAW_IN = 1,
parameter DATAW_OUT = DATAW_IN,
parameter N = 1,
@ -41,7 +41,7 @@ module VX_reduce #(
assign in_B[i] = data_in[N_A + i];
end
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (DATAW_IN),
.DATAW_OUT (DATAW_OUT),
.N (N_A),
@ -51,7 +51,7 @@ module VX_reduce #(
.data_out (out_A)
);
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (DATAW_IN),
.DATAW_OUT (DATAW_OUT),
.N (N_B),

View file

@ -26,18 +26,39 @@
end \
end
// RAM_RESET_BLOCK: optional reset prefix for the RAM write statements.
// With SIMULATION defined it expands to an `if (RESET_RAM && reset) ... else`
// that sets every one of the SIZE entries to INIT_VALUE; the expansion ends in
// a dangling `else`, so a statement must follow the macro at its use site.
// Without SIMULATION it expands to nothing.
`ifdef SIMULATION
`define RAM_RESET_BLOCK if (RESET_RAM && reset) begin \
for (integer i = 0; i < SIZE; ++i) begin \
ram[i] <= DATAW'(INIT_VALUE); \
end \
end else
`else
`define RAM_RESET_BLOCK
`endif
// RAM_WRITE_ALL: full-word write of wdata to ram[addr] when `write` is set,
// preceded by the optional reset prefix above.
`define RAM_WRITE_ALL `RAM_RESET_BLOCK \
if (write) begin \
ram[addr] <= wdata; \
end
`ifdef QUARTUS
`define RAM_ARRAY_WREN reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1];
`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[addr][i] <= wdata[i * WSELW +: WSELW]; \
`define RAM_WRITE_WREN `RAM_RESET_BLOCK \
if (write) begin \
for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[addr][i] <= wdata[i * WSELW +: WSELW]; \
end \
end \
end
`else
`define RAM_ARRAY_WREN reg [DATAW-1:0] ram [0:SIZE-1];
`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[addr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
`define RAM_WRITE_WREN `RAM_RESET_BLOCK \
if (write) begin \
for (integer i = 0; i < WRENW; ++i) begin \
if (wren[i]) begin \
ram[addr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
end \
end \
end
`endif
@ -49,8 +70,9 @@ module VX_sp_ram #(
parameter WRENW = 1,
parameter OUT_REG = 0,
parameter LUTRAM = 0,
parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first, N: no-change, U: undefined
parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first, N: no-change
parameter RADDR_REG = 0, // read address registered hint
parameter RADDR_RESET = 0, // read address has reset
parameter RDW_ASSERT = 0,
parameter RESET_RAM = 0,
parameter INIT_ENABLE = 0,
@ -70,13 +92,14 @@ module VX_sp_ram #(
localparam WSELW = DATAW / WRENW;
`UNUSED_PARAM (LUTRAM)
`UNUSED_PARAM (RADDR_REG)
`UNUSED_PARAM (RADDR_RESET)
`STATIC_ASSERT(!(WRENW * WSELW != DATAW), ("invalid parameter"))
`STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W" || RDW_MODE == "N" || RDW_MODE == "U"), ("invalid parameter"))
`STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W" || RDW_MODE == "N"), ("invalid parameter"))
`UNUSED_PARAM (RDW_ASSERT)
`ifdef SYNTHESIS
localparam FORCE_BRAM = !LUTRAM && (SIZE * DATAW >= `MAX_LUTRAM);
localparam FORCE_BRAM = !LUTRAM && `FORCE_BRAM(SIZE, DATAW);
if (OUT_REG) begin : g_sync
if (FORCE_BRAM) begin : g_bram
if (RDW_MODE == "W") begin : g_write_first
@ -85,10 +108,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [ADDRW-1:0] addr_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
if (read) begin
addr_r <= addr;
end
end
@ -98,9 +119,9 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
`RAM_WRITE_ALL
if (read) begin
if (write) begin
ram[addr] <= wdata;
rdata_r <= wdata;
end else begin
rdata_r <= ram[addr];
@ -115,10 +136,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
if (read) begin
rdata_r <= ram[addr];
end
end
@ -128,10 +147,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
ram[addr] <= wdata;
end
`RAM_WRITE_ALL
if (read) begin
rdata_r <= ram[addr];
end
end
@ -143,40 +160,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end else begin
rdata_r <= ram[addr];
end
end
end
assign rdata = rdata_r;
end else begin : g_no_wren
`USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
ram[addr] <= wdata;
end else begin
rdata_r <= ram[addr];
end
end
end
assign rdata = rdata_r;
end
end else if (RDW_MODE == "U") begin : g_undefined
if (WRENW != 1) begin : g_wren
`USE_BLOCK_BRAM `RAM_ARRAY_WREN
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
if (read) begin
`RAM_WRITE_WREN
else if (read) begin
rdata_r <= ram[addr];
end
end
@ -186,10 +171,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
ram[addr] <= wdata;
end
if (read) begin
`RAM_WRITE_ALL
else if (read) begin
rdata_r <= ram[addr];
end
end
@ -203,10 +186,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [ADDRW-1:0] addr_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
if (read) begin
addr_r <= addr;
end
end
@ -216,9 +197,9 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
`RAM_WRITE_ALL
if (read) begin
if (write) begin
ram[addr] <= wdata;
rdata_r <= wdata;
end else begin
rdata_r <= ram[addr];
@ -233,10 +214,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
if (read) begin
rdata_r <= ram[addr];
end
end
@ -246,10 +225,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
ram[addr] <= wdata;
end
`RAM_WRITE_ALL
if (read) begin
rdata_r <= ram[addr];
end
end
@ -261,40 +238,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
`RAM_WRITE_WREN
end else begin
rdata_r <= ram[addr];
end
end
end
assign rdata = rdata_r;
end else begin : g_no_wren
reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (write) begin
ram[addr] <= wdata;
end else begin
rdata_r <= ram[addr];
end
end
end
assign rdata = rdata_r;
end
end else if (RDW_MODE == "U") begin : g_undefined
if (WRENW != 1) begin : g_wren
`RAM_ARRAY_WREN
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
if (read) begin
`RAM_WRITE_WREN
else if (read) begin
rdata_r <= ram[addr];
end
end
@ -304,10 +249,8 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
ram[addr] <= wdata;
end
if (read) begin
`RAM_WRITE_ALL
else if (read) begin
rdata_r <= ram[addr];
end
end
@ -318,7 +261,7 @@ module VX_sp_ram #(
end else begin : g_async
`UNUSED_VAR (read)
if (FORCE_BRAM) begin : g_bram
`ifdef VIVADO
`ifdef ASYNC_BRAM_PATCH
VX_async_ram_patch #(
.DATAW (DATAW),
.SIZE (SIZE),
@ -326,6 +269,7 @@ module VX_sp_ram #(
.DUAL_PORT (0),
.FORCE_BRAM (FORCE_BRAM),
.RADDR_REG (RADDR_REG),
.RADDR_RESET(RADDR_RESET),
.WRITE_FIRST(RDW_MODE == "W"),
.INIT_ENABLE(INIT_ENABLE),
.INIT_FILE (INIT_FILE),
@ -347,18 +291,14 @@ module VX_sp_ram #(
`RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
end
assign rdata = ram[addr];
end else begin : g_no_wren
`RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[addr] <= wdata;
end
`RAM_WRITE_ALL
end
assign rdata = ram[addr];
end
@ -367,18 +307,14 @@ module VX_sp_ram #(
`NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
end
assign rdata = ram[addr];
end else begin : g_no_wren
`NO_RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[addr] <= wdata;
end
`RAM_WRITE_ALL
end
assign rdata = ram[addr];
end
@ -390,18 +326,14 @@ module VX_sp_ram #(
`RW_RAM_CHECK `RAM_ARRAY_WREN
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
end
assign rdata = ram[addr];
end else begin : g_no_wren
`RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[addr] <= wdata;
end
`RAM_WRITE_ALL
end
assign rdata = ram[addr];
end
@ -410,18 +342,14 @@ module VX_sp_ram #(
`NO_RW_RAM_CHECK `RAM_ARRAY_WREN
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
`RAM_WRITE_WREN
end
`RAM_WRITE_WREN
end
assign rdata = ram[addr];
end else begin : g_no_wren
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[addr] <= wdata;
end
`RAM_WRITE_ALL
end
assign rdata = ram[addr];
end
@ -434,24 +362,14 @@ module VX_sp_ram #(
`RAM_INITIALIZATION
always @(posedge clk) begin
if (RESET_RAM && reset) begin
for (integer i = 0; i < SIZE; ++i) begin
ram[i] <= DATAW'(INIT_VALUE);
end
end else if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i]) begin
ram[addr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
end
`RAM_WRITE_WREN
end
if (OUT_REG) begin : g_sync
if (RDW_MODE == "W") begin : g_write_first
reg [ADDRW-1:0] addr_r;
always @(posedge clk) begin
if (read || write) begin
if (read) begin
addr_r <= addr;
end
end
@ -459,7 +377,7 @@ module VX_sp_ram #(
end else if (RDW_MODE == "R") begin : g_read_first
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read || write) begin
if (read) begin
rdata_r <= ram[addr];
end
end
@ -472,14 +390,6 @@ module VX_sp_ram #(
end
end
assign rdata = rdata_r;
end else if (RDW_MODE == "U") begin : g_unknown
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (read) begin
rdata_r <= ram[addr];
end
end
assign rdata = rdata_r;
end
end else begin : g_async
`UNUSED_VAR (read)

View file

@ -206,13 +206,13 @@ module VX_stream_xbar #(
reg [PERF_CTR_BITS-1:0] collisions_r;
// Per-cycle collision detection over all unordered input pairs.
// Bit i is set when input i and some higher-indexed input are both valid,
// select the same output (equal sel_in), and at least one of the two is
// being accepted (ready_in).
always @(*) begin
per_cycle_collision = 0;
per_cycle_collision = '0;
for (integer i = 0; i < NUM_INPUTS; ++i) begin
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
for (integer j = i + 1; j < NUM_INPUTS; ++j) begin
per_cycle_collision[i] |= valid_in[i]
&& valid_in[j+i]
&& (sel_in[i] == sel_in[j+i])
&& (ready_in[i] | ready_in[j+i]);
&& valid_in[j]
&& (sel_in[i] == sel_in[j])
&& (ready_in[i] | ready_in[j]);
end
end
end

View file

@ -43,7 +43,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
// PERF
`ifdef PERF_ENABLE
output cache_perf_t lmem_perf,
output lmem_perf_t lmem_perf,
`endif
VX_mem_bus_if.slave mem_bus_if [NUM_REQS]
@ -286,14 +286,10 @@ module VX_local_mem import VX_gpu_pkg::*; #(
end
end
assign lmem_perf.reads = perf_reads;
assign lmem_perf.writes = perf_writes;
assign lmem_perf.read_misses = '0;
assign lmem_perf.write_misses = '0;
assign lmem_perf.bank_stalls = perf_collisions;
assign lmem_perf.mshr_stalls = '0;
assign lmem_perf.mem_stalls = '0;
assign lmem_perf.crsp_stalls = perf_crsp_stalls;
assign lmem_perf.reads = perf_reads;
assign lmem_perf.writes = perf_writes;
assign lmem_perf.bank_stalls = perf_collisions;
assign lmem_perf.crsp_stalls = perf_crsp_stalls;
`endif
@ -321,15 +317,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end
end
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, data=0x%h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end
end
@ -339,15 +335,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
if (per_bank_req_rw[i]) begin
`TRACE(2, ("%t: %s-bank%0d wr-req: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s bank-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
end else begin
`TRACE(2, ("%t: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s bank-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
end
end
if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin
`TRACE(2, ("%t: %s-bank%0d rd-rsp: data=0x%h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s bank-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_rsp_data[i], per_bank_rsp_tag_value[i], per_bank_rsp_uuid[i]))
end
end

View file

@ -1,43 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Bundle of performance records: five signals of type cache_perf_t
// (icache, dcache, l2cache, l3cache, lmem) and one of type mem_perf_t (mem).
// NOTE(review): both types are presumably declared in VX_gpu_pkg, which is
// imported below - confirm.
interface VX_mem_perf_if import VX_gpu_pkg::*; ();

    cache_perf_t icache, dcache, l2cache, l3cache, lmem;
    mem_perf_t   mem;

    // master side: every record is an output
    modport master (
        output icache, dcache, l2cache, l3cache, lmem, mem
    );

    // slave side: every record is an input
    modport slave (
        input icache, dcache, l2cache, l3cache, lmem, mem
    );

endinterface

View file

@ -13,7 +13,6 @@
namespace eval vortex {
variable info 0
variable debug 0
proc print_error {msg {do_exit 1}} {
@ -21,7 +20,8 @@ proc print_error {msg {do_exit 1}} {
puts "ERROR: $msg"
exit -1
} else {
puts "WARNING: $msg"
variable debug
if {$debug} {puts "WARNING: $msg"}
}
}
@ -132,6 +132,17 @@ proc find_cell_nets {cell name_match {should_exist 1}} {
return $matching_nets
}
# Single-net variant of find_cell_nets.
#   cell         - cell handed to find_cell_nets
#   name_match   - name pattern handed to find_cell_nets
#   should_exist - forwarded unchanged to find_cell_nets (default 1)
# Returns the one matching net, or "" when find_cell_nets returns nothing.
# Prints an error and exits with -1 when more than one net matches.
proc find_cell_net {cell name_match {should_exist 1}} {
    set matches [find_cell_nets $cell $name_match $should_exist]
    set count [llength $matches]
    if {$count > 1} {
        puts "ERROR: Multiple matching nets found for '$cell' matching '$name_match'."
        exit -1
    }
    if {$count == 0} {
        return ""
    }
    return [lindex $matches 0]
}
proc get_cell_net {cell name} {
set net [get_nets -hierarchical -filter "PARENT_CELL == $cell && NAME == $name"]
if {[llength $net] == 0} {
@ -168,88 +179,52 @@ proc get_cell_pin {cell name} {
}
proc remove_cell_from_netlist {cell} {
variable info
variable debug
# Disconnect all pins of the cell
foreach pin [get_pins -quiet -of_objects $cell] {
foreach net [get_nets -quiet -of_objects $pin] {
disconnect_net -net $net -objects $pin
if {$info} {puts "INFO: Disconnected net '$net' from pin '$pin'."}
if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."}
}
}
# Remove the cell
remove_cell $cell
if {$info} {puts "INFO: Cell '$cell' was removed successfully."}
if {$debug} {puts "DEBUG: Cell '$cell' was removed successfully."}
}
# Re-drive $pin from $source_pin.
# The pin is first detached from the net it currently sits on (a warning is
# printed when it has none), then attached to the net of $source_pin. If the
# source pin has no net yet, a new one is created and connected to it first.
# More than one net on either pin is reported as an error and exits with -1.
proc replace_pin_source {pin source_pin} {
    variable debug

    # Step 1: detach the pin from its present net
    set net [get_nets -of_objects $pin]
    set net_count [llength $net]
    if {$net_count > 1} {
        puts "ERROR: Multiple nets connected to pin '$pin'."
        exit -1
    }
    if {$net_count == 1} {
        disconnect_net -net $net -objects $pin
        if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."}
    } else {
        puts "WARNING: No net connected to pin '$pin'."
    }

    # Step 2: find the net on the source pin, creating it when missing
    set source_net [get_nets -quiet -of_objects $source_pin]
    set source_count [llength $source_net]
    if {$source_count > 1} {
        puts "ERROR: Multiple nets connected to pin '$source_pin'."
        exit -1
    }
    if {$source_count == 0} {
        set source_cell [get_cells -of_objects $source_pin]
        set source_net [create_net [unique_net_name "${source_cell}_net"]]
        if {$debug} {puts "DEBUG: Created source_net: '$source_net'"}
        # The fresh net has no driver yet, so hook the source pin onto it
        connect_net -net $source_net -objects $source_pin -hierarchical
        if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$source_pin'."}
    }

    # Step 3: attach the pin to the source net
    connect_net -net $source_net -objects $pin -hierarchical
    if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$pin'."}
}
proc find_net_driver {input_net {should_exist 1}} {
set driverPins [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "OUT"}]
proc find_net_driver {taregt_net {should_exist 1}} {
set driverPins [get_pins -quiet -leaf -of_objects $taregt_net -filter {DIRECTION == "OUT"}]
if {[llength $driverPins] == 0} {
set driverPorts [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "IN"}]
set driverPorts [get_ports -quiet -of_objects $taregt_net -filter {DIRECTION == "IN"}]
if {[llength $driverPorts] == 0} {
print_error "No driver found for '$input_net'." $should_exist
print_error "No driver found for '$taregt_net'." $should_exist
} elseif {[llength $driverPorts] > 1} {
puts "WARNING: Multiple driver ports found for '$input_net'."
puts "WARNING: Multiple driver ports found for '$taregt_net'."
return [lindex $driverPorts 0]
}
return $driverPorts
} elseif {[llength $driverPins] > 1} {
puts "WARNING: Multiple driver pins found for '$input_net'."
puts "WARNING: Multiple driver pins found for '$taregt_net'."
return [lindex $driverPins 0]
}
return $driverPins
}
proc find_pin_driver {input_pin {should_exist 1}} {
set net [get_nets -quiet -of_objects $input_pin]
proc find_pin_driver {target_pin {should_exist 1}} {
set net [get_nets -quiet -of_objects $target_pin]
if {[llength $net] == 0} {
print_error "No net connected to pin '$input_pin'." $should_exist
print_error "No net connected to pin '$target_pin'." $should_exist
return ""
} elseif {[llength $net] > 1} {
puts "ERROR: Multiple nets connected to pin '$input_pin'."
puts "ERROR: Multiple nets connected to pin '$target_pin'."
exit -1
}
return [find_net_driver $net]
}
proc create_register_next {parent reg_cell} {
variable info
proc create_register_next {parent reg_cell raddr_reset} {
variable debug
set hier_sep [get_hierarchy_separator]
@ -273,6 +248,10 @@ proc create_register_next {parent reg_cell} {
if {$debug} {puts "DEBUG: reg_d_src_pin: '$reg_d_src_pin'"}
if {$raddr_reset == ""} {
return $reg_d_src_pin
}
set reg_r_src_pin ""
set register_type [get_property REF_NAME $reg_cell]
@ -341,7 +320,7 @@ proc create_register_next {parent reg_cell} {
# FDSE: O = I1 ? 1 : I0; where I0=D, I1=S
set lut_name [unique_cell_name "${parent}${hier_sep}raddr_next"]
set lut_cell [create_cell -reference LUT2 $lut_name]
if {$info} {puts "INFO: Created lut cell: '$lut_cell'"}
if {$debug} {puts "DEBUG: Created lut cell: '$lut_cell'"}
if {$register_type == "FDRE"} {
set_property INIT 4'b0010 $lut_cell
@ -389,7 +368,6 @@ proc create_register_next {parent reg_cell} {
}
proc getOrCreateVCCPin {parent} {
variable info
variable debug
set hier_sep [get_hierarchy_separator]
@ -398,7 +376,7 @@ proc getOrCreateVCCPin {parent} {
set vcc_cell [get_cells -quiet $cell_name]
if {[llength $vcc_cell] == 0} {
set vcc_cell [create_cell -reference VCC $cell_name]
if {$info} {puts "INFO: Created VCC cell: '$vcc_cell'"}
if {$debug} {puts "DEBUG: Created VCC cell: '$vcc_cell'"}
} elseif {[llength $vcc_cell] > 1} {
puts "ERROR: Multiple VCC cells found with name '$cell_name'."
exit -1
@ -417,7 +395,6 @@ proc getOrCreateVCCPin {parent} {
}
proc getOrCreateGNDPin {parent} {
variable info
variable debug
set hier_sep [get_hierarchy_separator]
@ -426,7 +403,7 @@ proc getOrCreateGNDPin {parent} {
set gnd_cell [get_cells -quiet $cell_name]
if {[llength $gnd_cell] == 0} {
set gnd_cell [create_cell -reference GND $cell_name]
if {$info} {puts "INFO: Created GND cell: '$gnd_cell'"}
if {$debug} {puts "DEBUG: Created GND cell: '$gnd_cell'"}
} elseif {[llength $gnd_cell] > 1} {
puts "ERROR: Multiple GND cells found with name '$cell_name'."
exit -1
@ -444,16 +421,28 @@ proc getOrCreateGNDPin {parent} {
return $gnd_pin
}
proc find_net_sinks {input_net {should_exist 1}} {
proc find_net_sinks {source_net {should_exist 1}} {
set sink_pins {}
foreach pin [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "IN"}] {
lappend sink_pins $pin
# Iterate through all pins connected to the source net
foreach pin [get_pins -quiet -of_objects $source_net] {
set direction [get_property DIRECTION $pin]
# Input pins of nested cells
if {$direction == "IN"} {
lappend sink_pins $pin
}
# Output pins of the parent cell
set pin_cell [get_cells -of_objects $pin]
set is_primitive [get_property IS_PRIMITIVE $pin_cell]
if {$direction == "OUT" && !$is_primitive} {
lappend sink_pins $pin
}
}
foreach port [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "OUT"}] {
# Add any top-module output ports connected to the source net
foreach port [get_ports -quiet -of_objects $source_net -filter {DIRECTION == "OUT"}] {
lappend sink_pins $port
}
if {[llength $sink_pins] == 0} {
print_error "No sink found for '$input_net'." $should_exist
print_error "No sink found for '$source_net'." $should_exist
}
return $sink_pins
}
@ -497,13 +486,49 @@ proc find_matching_pins {cell pins match repl} {
}
proc replace_net_source {net source_pin} {
variable debug
foreach pin [find_net_sinks $net 0] {
replace_pin_source $pin $source_pin
# disconnect net from pin
disconnect_net -net $net -objects $pin
if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."}
# find/create source net
set source_net [get_nets -quiet -of_objects $source_pin]
if {[llength $source_net] == 0} {
# Create a new net (in source_cell's parent) if none exists
set source_cell [get_cells -of_objects $source_pin]
set net_name [unique_net_name "${source_cell}_tmp_net"]
set source_net [create_net $net_name]
if {$debug} {puts "DEBUG: Created source_net: '$source_net'"}
# Connect the source pin to the new net
connect_net -net $source_net -objects $source_pin -hierarchical
if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$source_pin'."}
} elseif {[llength $source_net] > 1} {
puts "ERROR: Multiple nets connected to pin '$source_pin'."
exit -1
}
set external_net [get_nets -quiet -of_objects $pin]
if {[llength $external_net] == 0} {
# Connect pin to source net
connect_net -net $source_net -objects $pin -hierarchical
if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$pin'."}
} elseif {[llength $external_net] == 1} {
foreach external_pin [get_pins -of_objects $external_net] {
# disconnect external net from pin
disconnect_net -net $external_net -objects $pin
if {$debug} {puts "DEBUG: Disconnected net '$external_net' from pin '$pin'."}
# recurse-connect external net's pins to source_pin
replace_net_source $external_net $source_pin
}
} else {
puts "ERROR: Multiple nets connected to pin '$pin'."
exit -1
}
}
}
proc resolve_async_bram {inst} {
variable info
variable debug
puts "INFO: Resolving asynchronous BRAM patch: '$inst'."
@ -511,20 +536,32 @@ proc resolve_async_bram {inst} {
set hier_sep [get_hierarchy_separator]
set raddr_w_nets [find_cell_nets $inst "raddr_w(\\\[\\d+\\\])?$"]
set read_s_net [find_cell_nets $inst "read_s$"]
set is_raddr_reg_net [find_cell_nets $inst "is_raddr_reg$"]
set read_s_net [find_cell_net $inst "read_s$"]
if {$debug} {puts "DEBUG: read_s_net: '$read_s_net'"}
set is_raddr_reg_net [find_cell_net $inst "g_async_ram.is_raddr_reg$" 0]
if {$debug} {puts "DEBUG: is_raddr_reg_net: '$is_raddr_reg_net'"}
set raddr_s_nets [find_matching_nets $inst $raddr_w_nets "raddr_w(\\\[\\d+\\\])?$" "raddr_s\\1"]
set reg_next_pins {}
set reg_ce_src_pin ""
set raddr_reset_net [find_cell_net $inst "raddr_reset$" 0]
if {$debug} {puts "DEBUG: raddr_reset: '$raddr_reset_net'"}
# Process each raddr_w net
foreach raddr_w_net $raddr_w_nets {
if {$debug} {puts "DEBUG: Processing raddr_w net: '$raddr_w_net'"}
# Find raddr_w_net's driver pin
set raddr_src_pin [find_net_driver $raddr_w_net]
if {$debug} {puts "DEBUG: raddr_src_pin: '$raddr_src_pin'"}
if {[get_ports -quiet $raddr_src_pin] ne ""} {
puts "WARNING: Net '$raddr_w_net' is not registered, driver_type=port"
break
}
# Get the driver cell
set raddr_src_cell [get_cells -of_objects $raddr_src_pin]
@ -541,12 +578,12 @@ proc resolve_async_bram {inst} {
if {$driver_type == "FDRE" || $driver_type == "FDSE"} {
if {$debug} {puts "DEBUG: Net '$raddr_w_net' is registered, driver_type='$driver_type'"}
} else {
puts "WARNING: Net '$raddr_w_net' is not be registered, driver_type='$driver_type'"
puts "WARNING: Net '$raddr_w_net' is not registered, driver_type='$driver_type'"
break
}
# Create register next cell and return output pin
set reg_next_pin [create_register_next $inst $raddr_src_cell]
set reg_next_pin [create_register_next $inst $raddr_src_cell $raddr_reset_net]
if {$reg_next_pin == ""} {
puts "ERROR: failed to create register next value for '$raddr_src_cell'."
exit -1
@ -576,61 +613,75 @@ proc resolve_async_bram {inst} {
}
}
set addr_width [llength $raddr_w_nets]
# do we have a fully registered read address?
if {[llength $reg_next_pins] == [llength $raddr_w_nets]} {
if {$info} {puts "INFO: Fully registered read address detected."}
if {[llength $reg_next_pins] == $addr_width} {
if {$debug} {puts "DEBUG: Fully registered read address detected."}
# Connect all reg_next_pins to all input pins attached to raddr_s_nets
set addr_width [llength $raddr_w_nets]
for {set addr_idx 0} {$addr_idx < $addr_width} {incr addr_idx} {
set raddr_s_net [lindex $raddr_s_nets $addr_idx]
set reg_next_pin [lindex $reg_next_pins $addr_idx]
if {$info} {puts "INFO: Connecting pin '$reg_next_pin' to '$raddr_s_net's pins."}
# Connect reg_next_pin to all input pins attached to raddr_s_net
if {$debug} {puts "DEBUG: Connecting pin '$reg_next_pin' net to '$raddr_s_net's pins."}
replace_net_source $raddr_s_net $reg_next_pin
}
# Connect reg_ce_src_pin to all input pins attached to read_s_net
if {$info} {puts "INFO: Connecting pin '$reg_ce_src_pin' to '$read_s_net's pins."}
if {$debug} {puts "DEBUG: Connecting pin '$reg_ce_src_pin' net to '$read_s_net's pins."}
replace_net_source $read_s_net $reg_ce_src_pin
# Create Const<1>'s pin
set vcc_pin [getOrCreateVCCPin $inst]
if {$is_raddr_reg_net != ""} {
# Create Const<1>'s pin
set vcc_pin [getOrCreateVCCPin $inst]
# Connect vcc_pin to all input pins attached to is_raddr_reg_net
if {$info} {puts "INFO: Connecting pin '$vcc_pin' to '$is_raddr_reg_net's pins."}
replace_net_source $is_raddr_reg_net $vcc_pin
# Remove all async_ram cells
foreach cell [find_nested_cells $inst "g_async_ram.*" 0] {
remove_cell_from_netlist $cell
# Connect vcc_pin to all input pins attached to is_raddr_reg_net
if {$debug} {puts "DEBUG: Connecting pin '$vcc_pin' to net '$is_raddr_reg_net's pins."}
replace_net_source $is_raddr_reg_net $vcc_pin
}
} else {
puts "WARNING: Not all read addresses are registered!"
if {$is_raddr_reg_net == ""} {
puts "ERROR: read address not fully registered!"
exit -1
} else {
puts "WARNING: read address not fully registered!"
}
# Create Const<0>'s pin
set gnd_pin [getOrCreateGNDPin $inst]
# Connect gnd_pin to all input pins attached to is_raddr_reg_net
if {$info} {puts "INFO: Connecting pin '$gnd_pin' to '$is_raddr_reg_net's pins."}
replace_net_source $is_raddr_reg_net $gnd_pin
# Remove all sync_ram cells
foreach cell [find_nested_cells $inst "g_sync_ram.*" 0] {
remove_cell_from_netlist $cell
# Connect GND to all input pins attached to raddr_s_nets
for {set addr_idx 0} {$addr_idx < $addr_width} {incr addr_idx} {
set raddr_s_net [lindex $raddr_s_nets $addr_idx]
if {$debug} {puts "DEBUG: Connecting pin '$gnd_pin' net to '$raddr_s_net's pins."}
replace_net_source $raddr_s_net $gnd_pin
}
# Connect GND to all input pins attached to read_s_net
if {$debug} {puts "DEBUG: Connecting pin '$gnd_pin' net to '$read_s_net's pins."}
replace_net_source $read_s_net $gnd_pin
# Connect gnd_pin to all input pins attached to is_raddr_reg_net
if {$debug} {puts "DEBUG: Connecting pin '$gnd_pin' to net '$is_raddr_reg_net's pins."}
replace_net_source $is_raddr_reg_net $gnd_pin
}
# Remove placeholder cell
foreach cell [find_nested_cells $inst "placeholder$"] {
# Remove placeholder cells
foreach cell [find_nested_cells $inst "placeholder1$"] {
remove_cell_from_netlist $cell
}
if {$is_raddr_reg_net != ""} {
foreach cell [find_nested_cells $inst "g_async_ram.placeholder2$"] {
remove_cell_from_netlist $cell
}
}
}
proc resolve_async_brams {} {
variable debug
set bram_patch_cells {}
foreach cell [get_cells -hierarchical -filter {REF_NAME =~ "*VX_async_ram_patch*"}] {
puts "INFO: Found async BRAM patch cell: '$cell'."
if {$debug} {puts "DEBUG: Found async BRAM patch cell: '$cell'."}
lappend bram_patch_cells $cell
}
if {[llength $bram_patch_cells] != 0} {

View file

@ -8,4 +8,5 @@ FPU_INCLUDE = -I$(RTL_DIR)/fpu
ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src
endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE)
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE)
RTL_INCLUDE = -I..

View file

@ -5,7 +5,6 @@ DEVICE_FAMILY ?= arria10
PREFIX ?= build$(XLEN)
TARGET ?= fpga
NUM_CORES ?= 1
SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/opae
@ -44,6 +43,7 @@ ifeq ($(DEVICE_FAMILY), arria10)
CONFIGS += -DALTERA_A10
endif
ifdef NUM_CORES
# cluster configuration
CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
@ -53,6 +53,7 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
endif
# include sources
RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv

View file

@ -47,14 +47,18 @@ TARGET=hw PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 make chipscope
# analyze build report
vitis_analyzer build_xilinx_u50_gen3x16_xdma_5_202210_1_hw_4c/bin/vortex_afu.xclbin.link_summary
# resuming build for routing
# resuming builds
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.synth" make > build.log 2>&1 &
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl" make > build.log 2>&1 &
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.opt_design" make > build.log 2>&1 &
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.place_design" make > build.log 2>&1 &
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.phys_opt_design" make > build.log 2>&1 &
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.route_design" make > build.log 2>&1 &
# running test
FPGA_BIN_DIR=<bin_dir> TARGET=hw_emu ./ci/blackbox.sh --driver=xrt --app=demo
FPGA_BIN_DIR=<bin_dir> TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo
FPGA_BIN_DIR=<bin_dir> TARGET=hw_emu ./ci/blackbox.sh --driver=xrt --app=demo
FPGA_BIN_DIR=<bin_dir> XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo
FPGA_BIN_DIR=<bin_dir> TARGET=hw ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n1024"
FPGA_BIN_DIR=<bin_dir> XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n1024"
# build report logs
<build_dir>/bin/vortex_afu.xclbin.info

View file

@ -37,10 +37,15 @@ else
endif
clean:
ifndef RESUME
rm -rf project_1
rm -rf .Xil
rm -f *.rpt
rm -f vivado*.log
rm -f vivado*.jou
rm -f *.log
rm -f *.jou
rm -f *.dcp
else
@echo "RESUME is defined, skipping clean."
endif
.PHONY: all gen-sources build clean

View file

@ -11,9 +11,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Start time
set start_time [clock seconds]
if { $::argc != 4 } {
puts "ERROR: Program \"$::argv0\" requires 4 arguments!\n"
puts "Usage: $::argv0 <top_module> <device_part> <vcs_file> <xdc_file>\n"
@ -46,95 +43,134 @@ if {[info exists ::env(MAX_JOBS)]} {
set num_jobs 0
}
# create fpu ip
if {[info exists ::env(FPU_IP)]} {
set ip_dir $::env(FPU_IP)
set argv [list $ip_dir $device_part]
set argc 2
source ${script_dir}/xilinx_ip_gen.tcl
# Project setup stage: creates the project, loads constraints, sources and
# defines, optionally adds the FPU IP cores, and configures the synth_1 and
# impl_1 runs. It launches no runs itself.
# All inputs are read from global variables; the FPU steps run only when
# the FPU_IP environment variable is set.
proc run_setup {} {
global project_name
global top_module device_part vcs_file xdc_file
global script_dir source_dir
global num_jobs
global argv argc ;# Using global system variables: argv and argc
# create fpu ip
# argv/argc are overwritten before sourcing the generator script
# NOTE(review): presumably xilinx_ip_gen.tcl reads its arguments from argv/argc - confirm
if {[info exists ::env(FPU_IP)]} {
set ip_dir $::env(FPU_IP)
set argv [list $ip_dir $device_part]
set argc 2
source ${script_dir}/xilinx_ip_gen.tcl
}
# Load the file-list parser and split its result into sources, includes and defines
source "${script_dir}/parse_vcs_list.tcl"
set vlist [parse_vcs_list "${vcs_file}"]
set vsources_list [lindex $vlist 0]
set vincludes_list [lindex $vlist 1]
set vdefines_list [lindex $vlist 2]
#puts $vsources_list
#puts $vincludes_list
#puts $vdefines_list
# Create project (the project name is also used as the project directory)
create_project $project_name $project_name -force -part $device_part
# Add constraints file
read_xdc $xdc_file
# Add the design sources
add_files -norecurse -verbose $vsources_list
# Apply the parsed defines to the current fileset
set_property verilog_define ${vdefines_list} [current_fileset]
# Add the three FPU IP cores (fma, fdiv, fsqrt) from the FPU_IP directory
if {[info exists ::env(FPU_IP)]} {
set ip_dir $::env(FPU_IP)
add_files -norecurse -verbose ${ip_dir}/xil_fma/xil_fma.xci
add_files -norecurse -verbose ${ip_dir}/xil_fdiv/xil_fdiv.xci
add_files -norecurse -verbose ${ip_dir}/xil_fsqrt/xil_fsqrt.xci
}
# Synthesis settings: top module, and out-of-context mode for synth_1
set_property top $top_module [current_fileset]
set_property \
-name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} \
-value {-mode out_of_context} \
-objects [get_runs synth_1]
# Register compilation hooks: only the async BRAM patch, run before
# opt_design on impl_1, is active; the other hooks are left disabled
#set_property STEPS.SYNTH_DESIGN.TCL.PRE ${source_dir}/pre_synth_hook.tcl [get_runs synth_1]
#set_property STEPS.SYNTH_DESIGN.TCL.POST ${source_dir}/post_synth_hook.tcl [get_runs synth_1]
set_property STEPS.OPT_DESIGN.TCL.PRE ${script_dir}/xilinx_async_bram_patch.tcl [get_runs impl_1]
#set_property STEPS.OPT_DESIGN.TCL.POST ${source_dir}/post_opt_hook.tcl [get_runs impl_1]
#set_property STEPS.ROUTE_DESIGN.TCL.PRE ${source_dir}/pre_route_hook.tcl [get_runs impl_1]
#set_property STEPS.ROUTE_DESIGN.TCL.POST ${source_dir}/post_route_hook.tcl [get_runs impl_1]
update_compile_order -fileset sources_1
}
source "${script_dir}/parse_vcs_list.tcl"
set vlist [parse_vcs_list "${vcs_file}"]
proc run_synthesis {} {
global num_jobs
set vsources_list [lindex $vlist 0]
set vincludes_list [lindex $vlist 1]
set vdefines_list [lindex $vlist 2]
#puts $vsources_list
#puts $vincludes_list
#puts $vdefines_list
# Create project
create_project $project_name $project_name -force -part $device_part
# Add constraints file
read_xdc $xdc_file
# Add the design sources
add_files -norecurse -verbose $vsources_list
# process defines
set_property verilog_define ${vdefines_list} [current_fileset]
# add fpu ip
if {[info exists ::env(FPU_IP)]} {
set ip_dir $::env(FPU_IP)
add_files -norecurse -verbose ${ip_dir}/xil_fma/xil_fma.xci
add_files -norecurse -verbose ${ip_dir}/xil_fdiv/xil_fdiv.xci
add_files -norecurse -verbose ${ip_dir}/xil_fsqrt/xil_fsqrt.xci
if {$num_jobs != 0} {
launch_runs synth_1 -verbose -jobs $num_jobs
} else {
launch_runs synth_1 -verbose
}
wait_on_run synth_1
open_run synth_1
report_utilization -file post_synth_util.rpt -hierarchical -hierarchical_percentages
write_checkpoint -force post_synth.dcp
}
update_compile_order -fileset sources_1
proc run_implementation {} {
global num_jobs
# Synthesis
set_property top $top_module [current_fileset]
if {$num_jobs != 0} {
launch_runs impl_1 -verbose -jobs $num_jobs
} else {
launch_runs impl_1 -verbose
}
wait_on_run impl_1
open_run impl_1
report_utilization -file post_impl_util.rpt -hierarchical -hierarchical_percentages
write_checkpoint -force post_impl.dcp
}
set_property \
-name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} \
-value {-mode out_of_context -flatten_hierarchy "rebuilt"} \
-objects [get_runs synth_1]
proc run_report {} {
# Generate the synthesis report
report_place_status -file place.rpt
report_route_status -file route.rpt
# register compilation hooks
#set_property STEPS.SYNTH_DESIGN.TCL.PRE ${source_dir}/pre_synth_hook.tcl [get_runs synth_1]
#set_property STEPS.SYNTH_DESIGN.TCL.POST ${source_dir}/post_synth_hook.tcl [get_runs synth_1]
set_property STEPS.OPT_DESIGN.TCL.PRE ${script_dir}/xilinx_async_bram_patch.tcl [get_runs impl_1]
#set_property STEPS.OPT_DESIGN.TCL.POST ${source_dir}/post_opt_hook.tcl [get_runs impl_1]
#set_property STEPS.ROUTE_DESIGN.TCL.PRE ${source_dir}/pre_route_hook.tcl [get_runs impl_1]
#set_property STEPS.ROUTE_DESIGN.TCL.POST ${source_dir}/post_route_hook.tcl [get_runs impl_1]
# Generate timing report
report_timing -nworst 100 -delay_type max -sort_by group -file timing.rpt
if {$num_jobs != 0} {
launch_runs synth_1 -verbose -jobs $num_jobs
# Generate power and drc reports
report_power -file power.rpt
report_drc -file drc.rpt
}
###############################################################################
# Start time
set start_time [clock seconds]
set checkpoint_synth "post_synth.dcp"
set checkpoint_impl "post_impl.dcp"
if { [file exists $checkpoint_impl] } {
puts "Resuming from post-implementation checkpoint: $checkpoint_impl"
open_checkpoint $checkpoint_impl
run_report
} elseif { [file exists $checkpoint_synth] } {
puts "Resuming from post-synthesis checkpoint: $checkpoint_synth"
open_checkpoint $checkpoint_synth
run_implementation
run_report
} else {
launch_runs synth_1 -verbose
# Execute full pipeline
run_setup
run_synthesis
run_implementation
run_report
}
wait_on_run synth_1
open_run synth_1
write_checkpoint -force post_synth.dcp
report_utilization -file post_synth_util.rpt -hierarchical -hierarchical_percentages
# Implementation
if {$num_jobs != 0} {
launch_runs impl_1 -verbose -jobs $num_jobs
} else {
launch_runs impl_1 -verbose
}
wait_on_run impl_1
open_run impl_1
write_checkpoint -force post_impl.dcp
report_utilization -file post_impl_util.rpt -hierarchical -hierarchical_percentages
# Generate the placement, routing and timing summary reports
report_place_status -file place.rpt
report_route_status -file route.rpt
report_timing_summary -file timing.rpt
# Generate timing report
report_timing -nworst 10 -delay_type max -sort_by group -file timing.rpt
# Generate power and drc reports
report_power -file power.rpt
report_drc -file drc.rpt
# End time and calculation
set elapsed_time [expr {[clock seconds] - $start_time}]

View file

@ -458,7 +458,7 @@ if { [file exists post_impl.dcp] } {
run_implementation
run_report
} else {
# execute full pipeline
# Execute full pipeline
run_setup
run_synthesis
run_implementation

View file

@ -15,7 +15,6 @@ endif
TARGET ?= hw
PLATFORM ?=
NUM_CORES ?= 1
PREFIX ?= build$(XLEN)
MAX_JOBS ?= 8
@ -64,6 +63,7 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
ifdef NUM_CORES
# cluster configuration
CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
@ -73,6 +73,7 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
endif
# include sources
RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
@ -115,12 +116,12 @@ endif
# Debugging
ifdef DEBUG
VPP_FLAGS += -g --optimize 0 --debug.protocol all
ifneq ($(TARGET), hw)
VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all
CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS)
else
ifeq ($(TARGET), hw)
VPP_FLAGS += --debug.chipscope vortex_afu_1
CFLAGS += -DNDEBUG -DCHIPSCOPE $(DBG_SCOPE_FLAGS)
else
VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all
CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS)
endif
else
VPP_FLAGS += --optimize 3

View file

@ -5,7 +5,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/yosys
TOP_LEVEL_ENTITY ?= Vortex
PREFIX ?= build
NUM_CORES ?= 1
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
RTL_DIR := $(VORTEX_HOME)/hw/rtl
@ -30,7 +29,7 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
ifdef NUM_CORES
# cluster configuration
CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
@ -40,6 +39,7 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
endif
# include paths
FPU_INCLUDE = -I$(RTL_DIR)/fpu

View file

@ -211,8 +211,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t mem_reads = 0;
uint64_t mem_writes = 0;
uint64_t mem_lat = 0;
uint64_t mem_req_counter = 0;
uint64_t mem_ticks = 0;
uint64_t mem_bank_stalls = 0;
uint64_t num_cores;
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
@ -223,7 +222,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
return err;
});
uint64_t num_mem_bank_ports;
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_MEM_BANKS, &num_mem_bank_ports), {
return err;
@ -437,6 +436,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
}
uint64_t dcache_requests_per_core = 0;
if (dcache_enable) {
// PERF: Dcache
uint64_t dcache_reads;
@ -447,6 +448,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
return err;
});
dcache_requests_per_core += dcache_reads + dcache_writes;
uint64_t dcache_read_misses;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
return err;
@ -475,6 +477,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
}
// PERF: coalescer
uint64_t coalescer_misses;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_COALESCER_MISS, core_id, &coalescer_misses), {
return err;
});
int coalescer_utilization = calcAvgPercent(dcache_requests_per_core - coalescer_misses, dcache_requests_per_core);
fprintf(stream, "PERF: core%d: coalescer misses=%ld (hit ratio=%d%%)\n", core_id, coalescer_misses, coalescer_utilization);
if (l2cache_enable) {
// PERF: L2cache
uint64_t tmp;
@ -540,10 +550,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), {
return err;
});
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_CNTR, core_id, &mem_req_counter), {
return err;
});
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_TICK, core_id, &mem_ticks), {
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_ST, core_id, &mem_bank_stalls), {
return err;
});
}
@ -612,7 +619,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);
@ -621,11 +628,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization);
}
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
int memory_bank_port_utilization = calcAvgPercent(mem_req_counter, (mem_ticks * num_mem_bank_ports));
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
fprintf(stream, "PERF: memory bank port utilization=%d%%\n", memory_bank_port_utilization);
{
uint64_t mem_requests = mem_reads + mem_writes;
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
int mem_bank_utilization = calcAvgPercent(mem_requests, mem_requests + mem_bank_stalls);
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", mem_requests, mem_reads, mem_writes);
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
}
} break;
default:
break;

View file

@ -19,7 +19,7 @@
// XRT includes
#ifdef XRTSIM
#include <xrt.h>
#include <xrt_c.h>
#else
#include "experimental/xrt_bo.h"
#include "experimental/xrt_device.h"

View file

@ -104,6 +104,27 @@ inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
return (bits << shift) >> (shift + start);
}
// Reverses the order of all 64 bits of `bits` (bit 0 <-> bit 63, bit 1 <-> bit 62, ...).
// Works by swapping neighbouring groups of 1, 2, 4, 8 and 16 bits, then the two 32-bit halves.
inline uint64_t bit_reverse(uint64_t bits) {
  // kLowMasks[k] selects the lower group of every pair of (1 << k)-bit groups
  constexpr uint64_t kLowMasks[] = {
    0x5555555555555555ULL,
    0x3333333333333333ULL,
    0x0F0F0F0F0F0F0F0FULL,
    0x00FF00FF00FF00FFULL,
    0x0000FFFF0000FFFFULL
  };
  uint32_t group = 1;
  for (uint64_t low : kLowMasks) {
    uint64_t upper_moved_down = (bits >> group) & low;
    uint64_t lower_moved_up   = (bits & low) << group;
    bits = upper_moved_down | lower_moved_up;
    group <<= 1;
  }
  // final step: exchange the two 32-bit halves
  return (bits << 32) | (bits >> 32);
}
// Reverses the lowest `width` bits of `bits` (bit i moves to bit width-1-i).
// Bits at position `width` and above are ignored; the result has them cleared.
// `width` must not exceed 64 (checked with assert).
inline uint64_t bit_reverse(uint64_t bits, uint32_t width) {
  assert(width <= 64);
  uint64_t result = 0;
  // peel bits off the low end of the input and push them into the low end
  // of the result, so the first bit consumed ends up highest
  for (uint32_t remaining = width; remaining != 0; --remaining) {
    result = (result << 1) | (bits & 1ULL);
    bits >>= 1;
  }
  return result;
}
template <typename T = uint32_t>
T sext(const T& word, uint32_t width) {
assert(width > 1);

View file

@ -21,32 +21,32 @@ template <typename T = uint32_t>
class BitVector {
private:
static constexpr size_t BITS_PER_WORD = sizeof(T) * 8;
std::vector<T> bits_;
std::vector<T> words_;
size_t size_;
bool all_zero_;
size_t wordIndex(size_t pos) const {
constexpr size_t wordIndex(size_t pos) const {
return pos / BITS_PER_WORD;
}
T bitMask(size_t pos) const {
constexpr T bitMask(size_t pos) const {
return T(1) << (pos % BITS_PER_WORD);
}
void updateAllZero() {
all_zero_ = std::all_of(bits_.begin(), bits_.end(), [](T word) { return word == 0; });
all_zero_ = std::all_of(words_.begin(), words_.end(), [](T word) { return word == 0; });
}
public:
explicit BitVector(size_t size = 0)
: bits_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD)
: words_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD)
, size_(size)
, all_zero_(true)
{}
void set(size_t pos) {
if (pos >= size_) throw std::out_of_range("Index out of range");
bits_[this->wordIndex(pos)] |= this->bitMask(pos);
words_[this->wordIndex(pos)] |= this->bitMask(pos);
all_zero_ = false;
}
@ -59,19 +59,19 @@ public:
}
void reset() {
std::fill(bits_.begin(), bits_.end(), 0);
std::fill(words_.begin(), words_.end(), 0);
all_zero_ = true;
}
void reset(size_t pos) {
if (pos >= size_) throw std::out_of_range("Index out of range");
bits_[this->wordIndex(pos)] &= ~this->bitMask(pos);
words_[this->wordIndex(pos)] &= ~this->bitMask(pos);
this->updateAllZero();
}
bool test(size_t pos) const {
if (pos >= size_) throw std::out_of_range("Index out of range");
return bits_[this->wordIndex(pos)] & this->bitMask(pos);
return words_[this->wordIndex(pos)] & this->bitMask(pos);
}
size_t size() const {
@ -80,12 +80,12 @@ public:
void resize(size_t new_size) {
size_ = new_size;
bits_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0);
words_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0);
this->updateAllZero();
}
bool operator==(const BitVector& other) const {
return (size_ == other.size_) && (bits_ == other.bits_);
return (size_ == other.size_) && (words_ == other.words_);
}
bool operator!=(const BitVector& other) const {
@ -98,8 +98,8 @@ public:
BitVector& operator&=(const BitVector& other) {
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
for (size_t i = 0; i < bits_.size(); ++i) {
bits_[i] &= other.bits_[i];
for (size_t i = 0; i < words_.size(); ++i) {
words_[i] &= other.words_[i];
}
this->updateAllZero();
return *this;
@ -107,8 +107,8 @@ public:
BitVector& operator|=(const BitVector& other) {
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
for (size_t i = 0; i < bits_.size(); ++i) {
bits_[i] |= other.bits_[i];
for (size_t i = 0; i < words_.size(); ++i) {
words_[i] |= other.words_[i];
}
this->updateAllZero();
return *this;
@ -116,8 +116,8 @@ public:
BitVector& operator^=(const BitVector& other) {
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
for (size_t i = 0; i < bits_.size(); ++i) {
bits_[i] ^= other.bits_[i];
for (size_t i = 0; i < words_.size(); ++i) {
words_[i] ^= other.words_[i];
}
this->updateAllZero();
return *this;
@ -125,23 +125,48 @@ public:
BitVector operator~() const {
BitVector result(size_);
for (size_t i = 0; i < bits_.size(); ++i) {
result.bits_[i] = ~bits_[i];
for (size_t i = 0; i < words_.size(); ++i) {
result.words_[i] = ~words_[i];
}
result.updateAllZero();
return result;
}
void flip() {
for (auto &word : bits_) {
for (auto &word : words_) {
word = ~word;
}
this->updateAllZero();
}
// Reverses the bit order of the vector in place: the bit at position i moves
// to position size_ - 1 - i. An empty vector is left untouched.
void reverse() {
  if (size_ == 0)
    return;
  if ((size_ % BITS_PER_WORD) != 0) {
    // The last word is only partially used, so whole-word tricks do not
    // apply: relocate every set bit individually into a fresh buffer.
    std::vector<T> reversed_words(words_.size(), 0);
    for (size_t i = 0; i < size_; ++i) {
      size_t src_word = i / BITS_PER_WORD;
      size_t src_offset = i % BITS_PER_WORD;
      if (words_[src_word] & (T(1) << src_offset)) {
        size_t reversed_pos = size_ - 1 - i;
        size_t dst_word = reversed_pos / BITS_PER_WORD;
        size_t dst_offset = reversed_pos % BITS_PER_WORD;
        reversed_words[dst_word] |= (T(1) << dst_offset);
      }
    }
    words_ = std::move(reversed_words);
  } else {
    // Size is a whole number of words: reverse the word order, then reverse
    // the bits inside each word.
    static_assert(BITS_PER_WORD <= 64, "bit_reverse operates on at most 64 bits");
    // bit_reverse() mirrors across 64 bits, which leaves the reversed word in
    // the TOP BITS_PER_WORD bits of the result. Shift it back down before
    // narrowing to T; without the shift every word narrower than 64 bits
    // would be truncated to zero.
    constexpr size_t unused_bits = 64 - BITS_PER_WORD;
    std::reverse(words_.begin(), words_.end());
    for (auto &word : words_) {
      word = static_cast<T>(bit_reverse(static_cast<uint64_t>(word)) >> unused_bits);
    }
  }
}
size_t count() const {
size_t count = 0;
for (const auto &word : bits_) {
for (const auto &word : words_) {
count += std::bitset<BITS_PER_WORD>(word).count();
}
return count;
@ -160,12 +185,12 @@ public:
size_t remaining_bits = size_ % BITS_PER_WORD;
T full_mask = ~T(0);
for (size_t i = 0; i < full_bits; ++i) {
if (bits_[i] != full_mask)
if (words_[i] != full_mask)
return false;
}
if (remaining_bits > 0) {
T partial_mask = (T(1) << remaining_bits) - 1;
if ((bits_[full_bits] & partial_mask) != partial_mask)
if ((words_[full_bits] & partial_mask) != partial_mask)
return false;
}
return true;
@ -181,17 +206,17 @@ public:
size_t bit_shift = pos % BITS_PER_WORD;
if (word_shift > 0) {
for (size_t i = bits_.size() - 1; i >= word_shift; --i) {
bits_[i] = bits_[i - word_shift];
for (size_t i = words_.size() - 1; i >= word_shift; --i) {
words_[i] = words_[i - word_shift];
}
std::fill(bits_.begin(), bits_.begin() + word_shift, 0);
std::fill(words_.begin(), words_.begin() + word_shift, 0);
}
if (bit_shift > 0) {
for (size_t i = bits_.size() - 1; i > 0; --i) {
bits_[i] = (bits_[i] << bit_shift) | (bits_[i - 1] >> (BITS_PER_WORD - bit_shift));
for (size_t i = words_.size() - 1; i > 0; --i) {
words_[i] = (words_[i] << bit_shift) | (words_[i - 1] >> (BITS_PER_WORD - bit_shift));
}
bits_[0] <<= bit_shift;
words_[0] <<= bit_shift;
}
this->updateAllZero();
@ -208,17 +233,17 @@ public:
size_t bit_shift = pos % BITS_PER_WORD;
if (word_shift > 0) {
for (size_t i = 0; i < bits_.size() - word_shift; ++i) {
bits_[i] = bits_[i + word_shift];
for (size_t i = 0; i < words_.size() - word_shift; ++i) {
words_[i] = words_[i + word_shift];
}
std::fill(bits_.end() - word_shift, bits_.end(), 0);
std::fill(words_.end() - word_shift, words_.end(), 0);
}
if (bit_shift > 0) {
for (size_t i = 0; i < bits_.size() - 1; ++i) {
bits_[i] = (bits_[i] >> bit_shift) | (bits_[i + 1] << (BITS_PER_WORD - bit_shift));
for (size_t i = 0; i < words_.size() - 1; ++i) {
words_[i] = (words_[i] >> bit_shift) | (words_[i + 1] << (BITS_PER_WORD - bit_shift));
}
bits_.back() >>= bit_shift;
words_.back() >>= bit_shift;
}
this->updateAllZero();

View file

@ -53,25 +53,25 @@ public:
SimPort(SimObjectBase* module)
: SimPortBase(module)
, peer_(nullptr)
, sink_(nullptr)
, tx_cb_(nullptr)
{}
void bind(SimPort<Pkt>* peer) {
assert(peer_ == nullptr);
peer_ = peer;
void bind(SimPort<Pkt>* sink) {
assert(sink_ == nullptr);
sink_ = sink;
}
void unbind() {
peer_ = nullptr;
sink_ = nullptr;
}
bool connected() const {
return (peer_ != nullptr);
return (sink_ != nullptr);
}
SimPort* peer() const {
return peer_;
SimPort* sink() const {
return sink_;
}
bool empty() const {
@ -111,15 +111,15 @@ protected:
};
std::queue<timed_pkt_t> queue_;
SimPort* peer_;
SimPort* sink_;
TxCallback tx_cb_;
void transfer(const Pkt& data, uint64_t cycles) {
if (tx_cb_) {
tx_cb_(data, cycles);
}
if (peer_) {
peer_->transfer(data, cycles);
if (sink_) {
sink_->transfer(data, cycles);
} else {
queue_.push({data, cycles});
}
@ -402,8 +402,8 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
template <typename Pkt>
void SimPort<Pkt>::push(const Pkt& pkt, uint64_t delay) const {
if (peer_ && !tx_cb_) {
reinterpret_cast<const SimPort<Pkt>*>(peer_)->push(pkt, delay);
if (sink_ && !tx_cb_) {
reinterpret_cast<const SimPort<Pkt>*>(sink_)->push(pkt, delay);
} else {
SimPlatform::instance().schedule(this, pkt, delay);
}

View file

@ -46,8 +46,6 @@ Core::Core(const SimContext& ctx,
, func_units_((uint32_t)FUType::Count)
, lmem_switch_(NUM_LSU_BLOCKS)
, mem_coalescers_(NUM_LSU_BLOCKS)
, lsu_dcache_adapter_(NUM_LSU_BLOCKS)
, lsu_lmem_adapter_(NUM_LSU_BLOCKS)
, pending_icache_(arch_.num_warps())
, commit_arbs_(ISSUE_WIDTH)
{
@ -64,11 +62,11 @@ Core::Core(const SimContext& ctx,
}
// create local memory
snprintf(sname, 100, "%s-local_mem", this->name().c_str());
snprintf(sname, 100, "%s-lmem", this->name().c_str());
local_mem_ = LocalMem::Create(sname, LocalMem::Config{
(1 << LMEM_LOG_SIZE),
LSU_WORD_SIZE,
LSU_NUM_REQS,
LSU_CHANNELS,
log2ceil(LMEM_NUM_BANKS),
false
});
@ -79,48 +77,52 @@ Core::Core(const SimContext& ctx,
lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1);
}
// create lsu dcache adapter
// create dcache adapter
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter(NUM_LSU_BLOCKS);
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "%s-lsu_dcache_adapter%d", this->name().c_str(), i);
lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
lsu_dcache_adapter.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
}
// create lsu lmem adapter
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "%s-lsu_lmem_adapter%d", this->name().c_str(), i);
lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
}
// create lmem arbiter
snprintf(sname, 100, "%s-lmem_arb", this->name().c_str());
auto lmem_arb = LsuArbiter::Create(sname, ArbiterType::RoundRobin, NUM_LSU_BLOCKS, 1);
// connect lsu demux
// create lmem adapter
snprintf(sname, 100, "%s-lsu_lmem_adapter", this->name().c_str());
auto lsu_lmem_adapter = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
// connect lmem switch
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
lmem_switch_.at(b)->ReqLmem.bind(&lmem_arb->ReqIn.at(b));
mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC);
lmem_switch_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
lsu_lmem_adapter_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspLmem);
lmem_arb->RspIn.at(b).bind(&lmem_switch_.at(b)->RspLmem);
}
// connect coalescer-adapter
// connect lmem arbiter
lmem_arb->ReqOut.at(0).bind(&lsu_lmem_adapter->ReqIn);
lsu_lmem_adapter->RspIn.bind(&lmem_arb->RspOut.at(0));
// connect lmem adapter
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
lsu_lmem_adapter->ReqOut.at(c).bind(&local_mem_->Inputs.at(c));
local_mem_->Outputs.at(c).bind(&lsu_lmem_adapter->RspOut.at(c));
}
// connect dcache coalescer
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter_.at(b)->ReqIn);
lsu_dcache_adapter_.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter.at(b)->ReqIn);
lsu_dcache_adapter.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
}
// connect adapter-dcache
// connect dcache adapter
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
uint32_t i = b * DCACHE_CHANNELS + c;
lsu_dcache_adapter_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter_.at(b)->RspOut.at(c));
}
}
// connect adapter-lmem
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
uint32_t i = b * LSU_CHANNELS + c;
lsu_lmem_adapter_.at(b)->ReqOut.at(c).bind(&local_mem_->Inputs.at(i));
local_mem_->Outputs.at(i).bind(&lsu_lmem_adapter_.at(b)->RspOut.at(c));
lsu_dcache_adapter.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter.at(b)->RspOut.at(c));
}
}

View file

@ -127,6 +127,10 @@ public:
return local_mem_;
}
// Returns the memory coalescer stored at position `idx` of mem_coalescers_.
// Access goes through at(), so an out-of-range `idx` is bounds-checked
// rather than read past the end of the container.
const MemCoalescer::Ptr& mem_coalescer(uint32_t idx) const {
return mem_coalescers_.at(idx);
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
@ -156,8 +160,6 @@ private:
LocalMem::Ptr local_mem_;
std::vector<LocalMemSwitch::Ptr> lmem_switch_;
std::vector<MemCoalescer::Ptr> mem_coalescers_;
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter_;
std::vector<LsuMemAdapter::Ptr> lsu_lmem_adapter_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;

View file

@ -360,7 +360,6 @@ void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) {
} else {
mmu_.read(data, addr, size, 0);
}
DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl);
}
#endif
@ -567,6 +566,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
auto cluster_perf = core_->socket()->cluster()->perf_stats();
auto socket_perf = core_->socket()->perf_stats();
auto lmem_perf = core_->local_mem()->perf_stats();
uint64_t coalescer_misses = 0;
for (uint i = 0; i < NUM_LSU_BLOCKS; ++i) {
coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses;
}
switch (addr) {
CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads);
CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses);
@ -596,8 +601,9 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads);
CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes);
CSR_READ_64(VX_CSR_MPM_MEM_LT, proc_perf.mem_latency);
CSR_READ_64(VX_CSR_MPM_MEM_BANK_CNTR, proc_perf.memsim.counter);
CSR_READ_64(VX_CSR_MPM_MEM_BANK_TICK, proc_perf.memsim.ticks);
CSR_READ_64(VX_CSR_MPM_MEM_BANK_ST, proc_perf.memsim.bank_stalls);
CSR_READ_64(VX_CSR_MPM_COALESCER_MISS, coalescer_misses);
CSR_READ_64(VX_CSR_MPM_LMEM_READS, lmem_perf.reads);
CSR_READ_64(VX_CSR_MPM_LMEM_WRITES, lmem_perf.writes);

View file

@ -24,14 +24,12 @@ protected:
LocalMem* simobject_;
Config config_;
RAM ram_;
uint32_t line_bits_;
MemCrossBar::Ptr mem_xbar_;
mutable PerfStats perf_stats_;
uint64_t to_local_addr(uint64_t addr) {
uint32_t total_lines = config_.capacity / config_.line_size;
uint32_t line_bits = log2ceil(total_lines);
uint32_t offset = bit_getw(addr, 0, line_bits-1);
return offset;
return bit_getw(addr, 0, line_bits_-1);
}
public:
@ -40,9 +38,13 @@ public:
, config_(config)
, ram_(config.capacity)
{
uint32_t total_lines = config.capacity / config.line_size;
line_bits_ = log2ceil(total_lines);
char sname[100];
snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_reqs, (1 << config.B));
uint32_t wsel_bits = log2ceil(config_.line_size);
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, (1 << config.B), wsel_bits);
for (uint32_t i = 0; i < config.num_reqs; ++i) {
simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i));
mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i));
@ -56,15 +58,15 @@ public:
}
void read(void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl);
ram_.read(data, s_addr, size);
auto l_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
ram_.read(data, l_addr, size);
}
void write(const void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl);
ram_.write(data, s_addr, size);
auto l_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
ram_.write(data, l_addr, size);
}
void tick() {
@ -94,7 +96,7 @@ public:
}
const PerfStats& perf_stats() const {
perf_stats_.bank_stalls = mem_xbar_->collisions();
perf_stats_.bank_stalls = mem_xbar_->req_collisions();
return perf_stats_;
}
};

View file

@ -147,10 +147,17 @@ void MemCoalescer::tick() {
ReqOut.push(out_req, delay_);
DT(4, this->name() << "-mem-req: coalesced=" << cur_mask.count() << ", " << out_req);
// track partial responses
perf_stats_.misses += (cur_mask.count() != in_req.mask.count());
// update sent mask
sent_mask_ |= cur_mask;
if (sent_mask_ == in_req.mask) {
ReqIn.pop();
sent_mask_.reset();
}
}
// Returns a read-only reference to this coalescer's performance counters.
// The reference aliases the internal perf_stats_ member, so it reflects any
// later updates made to it.
const MemCoalescer::PerfStats& MemCoalescer::perf_stats() const {
return perf_stats_;
}

View file

@ -23,6 +23,19 @@ public:
SimPort<LsuReq> ReqOut;
SimPort<LsuRsp> RspOut;
// Performance counters collected by the coalescer.
struct PerfStats {
  // running miss count; starts at zero
  uint64_t misses{0};

  PerfStats() {}

  // Accumulates the counters of `other` into this object and returns *this
  // so that additions can be chained.
  PerfStats& operator+=(const PerfStats& other) {
    misses += other.misses;
    return *this;
  }
};
MemCoalescer(
const SimContext& ctx,
const char* name,
@ -37,6 +50,8 @@ public:
void tick();
const PerfStats& perf_stats() const;
private:
struct pending_req_t {
@ -52,6 +67,7 @@ private:
BitVector<> sent_mask_;
uint32_t line_size_;
uint32_t delay_;
PerfStats perf_stats_;
};
}

View file

@ -29,7 +29,7 @@ private:
Config config_;
MemCrossBar::Ptr mem_xbar_;
DramSim dram_sim_;
PerfStats perf_stats_;
mutable PerfStats perf_stats_;
struct DramCallbackArgs {
MemSim::Impl* memsim;
@ -57,6 +57,7 @@ public:
}
// Returns the performance counters. bank_stalls is refreshed on every call
// from the crossbar's request-collision count before the reference is
// handed out; writing to perf_stats_ from this const method relies on the
// member being declared mutable.
const PerfStats& perf_stats() const {
perf_stats_.bank_stalls = mem_xbar_->req_collisions();
return perf_stats_;
}
@ -66,7 +67,6 @@ public:
void tick() {
dram_sim_.tick();
uint32_t counter = 0;
for (uint32_t i = 0; i < config_.num_banks; ++i) {
if (mem_xbar_->ReqOut.at(i).empty())
@ -102,12 +102,6 @@ public:
DT(3, simobject_->name() << "-mem-req[" << i << "]: " << mem_req);
mem_xbar_->ReqOut.at(i).pop();
counter++;
}
perf_stats_.counter += counter;
if (counter > 0) {
++perf_stats_.ticks;
}
}
};

View file

@ -26,17 +26,14 @@ public:
};
struct PerfStats {
uint64_t counter;
uint64_t ticks;
uint64_t bank_stalls;
PerfStats()
: counter(0)
, ticks(0)
: bank_stalls(0)
{}
PerfStats& operator+=(const PerfStats& rhs) {
this->counter += rhs.counter;
this->ticks += rhs.ticks;
this->bank_stalls += rhs.bank_stalls;
return *this;
}
};

View file

@ -527,6 +527,7 @@ public:
auto& req_in = Inputs.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
DT(4, this->name() << "-req" << o << ": " << req);
Outputs.at(o).push(req, delay_);
req_in.pop();
this->update_grant(o, g);
@ -597,37 +598,36 @@ public:
// process incoming requests
for (uint32_t o = 0; o < O; ++o) {
int32_t input_idx = -1;
bool has_collision = false;
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (grants_.at(o) + r) & (R-1);
if (i >= I)
continue;
auto& req_in = Inputs.at(i);
if (!req_in.empty()) {
auto& req = req_in.front();
if (req_in.empty())
continue;
auto& req = req_in.front();
uint32_t output_idx = 0;
if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
// skip if input is not going to current output
uint32_t output_idx = 0;
if (O != 1) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
}
if (output_idx != o)
continue;
if (input_idx != -1) {
++collisions_;
continue;
}
input_idx = i;
}
if (input_idx != -1) {
has_collision = true;
continue;
}
input_idx = i;
}
if (input_idx != -1) {
auto& req_in = Inputs.at(input_idx);
auto& req = req_in.front();
if (lg2_inputs_ != 0) {
req.tag = (req.tag << lg2_inputs_) | input_idx;
}
DT(4, this->name() << "-req" << input_idx << ": " << req);
DT(4, this->name() << "-req" << o << ": " << req);
Outputs.at(o).push(req, delay_);
req_in.pop();
this->update_grant(o, input_idx);
collisions_ += has_collision;
}
}
}
@ -721,8 +721,8 @@ public:
g = rsp.tag & (R-1);
rsp.tag >>= lg2_num_reqs_;
}
DT(4, this->name() << "-rsp" << o << ": " << rsp);
uint32_t j = o * R + g;
DT(4, this->name() << "-rsp" << j << ": " << rsp);
RspIn.at(j).push(rsp, 1);
rsp_out.pop();
}
@ -742,7 +742,7 @@ public:
if (lg2_num_reqs_ != 0) {
req.tag = (req.tag << lg2_num_reqs_) | g;
}
DT(4, this->name() << "-req" << j << ": " << req);
DT(4, this->name() << "-req" << o << ": " << req);
ReqOut.at(o).push(req, delay_);
req_in.pop();
this->update_grant(o, g);
@ -798,7 +798,8 @@ public:
, lg2_inputs_(log2ceil(num_inputs))
, lg2_outputs_(log2ceil(num_outputs))
, addr_start_(addr_start)
, collisions_(0) {
, req_collisions_(0)
, rsp_collisions_(0) {
assert(delay != 0);
assert(num_inputs <= 64);
assert(num_outputs <= 64);
@ -824,65 +825,66 @@ public:
// process outgoing responses
for (uint32_t i = 0; i < I; ++i) {
int32_t output_idx = -1;
bool has_collision = false;
for (uint32_t t = 0; t < T; ++t) {
uint32_t o = (rsp_grants_.at(i) + t) & (T-1);
if (o >= O)
continue;
auto& rsp_out = RspOut.at(o);
if (!rsp_out.empty()) {
auto& rsp = rsp_out.front();
// skip if response is not going to current input
uint32_t input_idx = 0;
if (lg2_inputs_ != 0) {
input_idx = rsp.tag & (R-1);
}
if (input_idx != i)
continue;
if (output_idx != -1) {
++collisions_;
continue;
}
output_idx = o;
}
}
if (output_idx != -1) {
auto& rsp_out = RspOut.at(output_idx);
if (rsp_out.empty())
continue;
auto& rsp = rsp_out.front();
uint32_t input_idx = 0;
if (lg2_inputs_ != 0) {
input_idx = rsp.tag & (R-1);
// skip if response is not going to current input
if (input_idx != i)
continue;
}
if (output_idx != -1) {
has_collision = true;
continue;
}
output_idx = o;
}
if (output_idx != -1) {
auto& rsp_out = RspOut.at(output_idx);
auto& rsp = rsp_out.front();
if (lg2_inputs_ != 0) {
rsp.tag >>= lg2_inputs_;
}
DT(4, this->name() << "-rsp" << output_idx << ": " << rsp);
RspIn.at(input_idx).push(rsp, 1);
DT(4, this->name() << "-rsp" << i << ": " << rsp);
RspIn.at(i).push(rsp, 1);
rsp_out.pop();
this->update_rsp_grant(i, output_idx);
rsp_collisions_ += has_collision;
}
}
// process incoming requests
for (uint32_t o = 0; o < O; ++o) {
int32_t input_idx = -1;
bool has_collision = false;
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (req_grants_.at(o) + r) & (R-1);
if (i >= I)
continue;
auto& req_in = ReqIn.at(i);
if (!req_in.empty()) {
auto& req = req_in.front();
if (req_in.empty())
continue;
auto& req = req_in.front();
uint32_t output_idx = 0;
if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
// skip if request is not going to current output
uint32_t output_idx = 0;
if (O != 1) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
}
if (output_idx != o)
continue;
if (input_idx != -1) {
++collisions_;
continue;
}
input_idx = i;
}
if (input_idx != -1) {
has_collision = true;
continue;
}
input_idx = i;
}
if (input_idx != -1) {
auto& req_in = ReqIn.at(input_idx);
@ -890,16 +892,21 @@ public:
if (lg2_inputs_ != 0) {
req.tag = (req.tag << lg2_inputs_) | input_idx;
}
DT(4, this->name() << "-req" << input_idx << ": " << req);
DT(4, this->name() << "-req" << o << ": " << req);
ReqOut.at(o).push(req, delay_);
req_in.pop();
this->update_req_grant(o, input_idx);
req_collisions_ += has_collision;
}
}
}
uint64_t collisions() const {
return collisions_;
uint64_t req_collisions() const {
return req_collisions_;
}
uint64_t rsp_collisions() const {
return rsp_collisions_;
}
protected:
@ -923,7 +930,8 @@ protected:
uint32_t lg2_inputs_;
uint32_t lg2_outputs_;
uint32_t addr_start_;
uint64_t collisions_;
uint64_t req_collisions_;
uint64_t rsp_collisions_;
};
///////////////////////////////////////////////////////////////////////////////
@ -978,7 +986,8 @@ private:
uint32_t delay_;
};
using MemArbiter = TxArbiter<MemReq, MemRsp>;
using LsuArbiter = TxArbiter<LsuReq, LsuRsp>;
using MemArbiter = TxArbiter<MemReq, MemRsp>;
using MemCrossBar = TxCrossBar<MemReq, MemRsp>;
}

View file

@ -50,7 +50,7 @@ DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
SRCS += $(SRC_DIR)/xrt.cpp $(SRC_DIR)/xrt_sim.cpp
SRCS += $(SRC_DIR)/xrt_c.cpp $(SRC_DIR)/xrt_sim.cpp
RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv

View file

@ -19,7 +19,7 @@
#include <cstring>
#include <unistd.h>
#include <assert.h>
#include "xrt.h"
#include "xrt_c.h"
#include "xrt_sim.h"
#include <VX_config.h>
#include <util.h>