mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
Merge branch 'bug_fixes'
This commit is contained in:
commit
9dc1d3f688
74 changed files with 1361 additions and 1293 deletions
|
@ -24,7 +24,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
`endif
|
||||
|
||||
// DCRs
|
||||
|
@ -43,12 +43,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
assign mem_perf_tmp_if.icache = 'x;
|
||||
assign mem_perf_tmp_if.dcache = 'x;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.lmem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
cache_perf_t l2_perf;
|
||||
sysmem_perf_t sysmem_perf_tmp;
|
||||
always @(*) begin
|
||||
sysmem_perf_tmp = sysmem_perf;
|
||||
sysmem_perf_tmp.l2cache = l2_perf;
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
|
@ -111,7 +111,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (l2_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.l2cache),
|
||||
.cache_perf (l2_perf),
|
||||
`endif
|
||||
.core_bus_if (per_socket_mem_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
|
@ -140,7 +140,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.reset (socket_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.sysmem_perf (sysmem_perf_tmp),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (socket_dcr_bus_if),
|
||||
|
|
|
@ -325,23 +325,22 @@
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define NEG_EDGE(dst, src) \
|
||||
wire dst; \
|
||||
VX_edge_trigger #( \
|
||||
.POS (0), \
|
||||
.INIT (0) \
|
||||
) __``dst``__ ( \
|
||||
) __neg_edge`__LINE__ ( \
|
||||
.clk (clk), \
|
||||
.reset (1'b0), \
|
||||
.data_in (src), \
|
||||
.data_out (dst) \
|
||||
)
|
||||
|
||||
`define BUFFER_EX(dst, src, ena, RSTW, latency) \
|
||||
`define BUFFER_EX(dst, src, ena, resetw, latency) \
|
||||
VX_pipe_register #( \
|
||||
.DATAW ($bits(dst)), \
|
||||
.RESETW (RSTW), \
|
||||
.RESETW (resetw), \
|
||||
.DEPTH (latency) \
|
||||
) __``dst``__ ( \
|
||||
) __buffer_ex`__LINE__ ( \
|
||||
.clk (clk), \
|
||||
.reset (reset), \
|
||||
.enable (ena), \
|
||||
|
@ -349,13 +348,13 @@
|
|||
.data_out (dst) \
|
||||
)
|
||||
|
||||
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 0, 1)
|
||||
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, $bits(dst), 1)
|
||||
|
||||
`define POP_COUNT_EX(out, in, model) \
|
||||
VX_popcount #( \
|
||||
.N ($bits(in)), \
|
||||
.MODEL (model) \
|
||||
) __``out``__ ( \
|
||||
) __pop_count_ex`__LINE__ ( \
|
||||
.data_in (in), \
|
||||
.data_out (out) \
|
||||
)
|
||||
|
@ -482,7 +481,7 @@
|
|||
for (genvar __i = 0; __i < count; ++__i) begin \
|
||||
assign __reduce_add_i_field[__i] = src[__i].``field; \
|
||||
end \
|
||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
|
||||
VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
|
||||
__reduce_add_i_field, \
|
||||
__reduce_add_o_field \
|
||||
); \
|
||||
|
|
|
@ -73,6 +73,17 @@ package VX_gpu_pkg;
|
|||
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
|
||||
} cache_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`PERF_CTR_BITS-1:0] reads;
|
||||
logic [`PERF_CTR_BITS-1:0] writes;
|
||||
logic [`PERF_CTR_BITS-1:0] bank_stalls;
|
||||
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
|
||||
} lmem_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`PERF_CTR_BITS-1:0] misses;
|
||||
} coalescer_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`PERF_CTR_BITS-1:0] reads;
|
||||
logic [`PERF_CTR_BITS-1:0] writes;
|
||||
|
@ -92,6 +103,26 @@ package VX_gpu_pkg;
|
|||
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
|
||||
} issue_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
cache_perf_t icache;
|
||||
cache_perf_t dcache;
|
||||
cache_perf_t l2cache;
|
||||
cache_perf_t l3cache;
|
||||
lmem_perf_t lmem;
|
||||
coalescer_perf_t coalescer;
|
||||
mem_perf_t mem;
|
||||
} sysmem_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
sched_perf_t sched;
|
||||
issue_perf_t issue;
|
||||
logic [`PERF_CTR_BITS-1:0] ifetches;
|
||||
logic [`PERF_CTR_BITS-1:0] loads;
|
||||
logic [`PERF_CTR_BITS-1:0] stores;
|
||||
logic [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||
logic [`PERF_CTR_BITS-1:0] load_latency;
|
||||
} pipeline_perf_t;
|
||||
|
||||
//////////////////////// instruction arguments ////////////////////////////
|
||||
|
||||
typedef struct packed {
|
||||
|
@ -145,6 +176,7 @@ package VX_gpu_pkg;
|
|||
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
|
||||
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
|
||||
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
|
||||
localparam LMEM_TAG_WIDTH = LSU_TAG_WIDTH + `CLOG2(`NUM_LSU_BLOCKS);
|
||||
|
||||
////////////////////////// Icache Parameters //////////////////////////////
|
||||
|
||||
|
|
|
@ -157,7 +157,7 @@
|
|||
|
||||
`ifdef QUARTUS
|
||||
`define MAX_FANOUT 8
|
||||
`define MAX_LUTRAM 1024
|
||||
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
|
||||
`define USE_BLOCK_BRAM (* ramstyle = "block" *)
|
||||
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
|
||||
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
|
||||
|
@ -168,7 +168,7 @@
|
|||
`define STRING string
|
||||
`elsif VIVADO
|
||||
`define MAX_FANOUT 8
|
||||
`define MAX_LUTRAM 1024
|
||||
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
|
||||
`define USE_BLOCK_BRAM (* ram_style = "block" *)
|
||||
`define USE_FAST_BRAM (* ram_style = "distributed" *)
|
||||
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
|
||||
|
@ -177,9 +177,12 @@
|
|||
`define PRESERVE_NET (* keep = "true" *)
|
||||
`define BLACKBOX_CELL (* black_box *)
|
||||
`define STRING
|
||||
`ifndef SIMULATION
|
||||
`define ASYNC_BRAM_PATCH
|
||||
`endif
|
||||
`else
|
||||
`define MAX_FANOUT 8
|
||||
`define MAX_LUTRAM 1024
|
||||
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
|
||||
`define USE_BLOCK_BRAM
|
||||
`define USE_FAST_BRAM
|
||||
`define NO_RW_RAM_CHECK
|
||||
|
|
|
@ -24,7 +24,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
`endif
|
||||
|
||||
// DCRs
|
||||
|
@ -63,11 +63,13 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.lmem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
cache_perf_t icache_perf, dcache_perf;
|
||||
sysmem_perf_t sysmem_perf_tmp;
|
||||
always @(*) begin
|
||||
sysmem_perf_tmp = sysmem_perf;
|
||||
sysmem_perf_tmp.icache = icache_perf;
|
||||
sysmem_perf_tmp.dcache = dcache_perf;
|
||||
end
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
@ -110,7 +112,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.MEM_OUT_BUF (2)
|
||||
) icache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.icache),
|
||||
.cache_perf (icache_perf),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (icache_reset),
|
||||
|
@ -160,7 +162,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.MEM_OUT_BUF (2)
|
||||
) dcache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.dcache),
|
||||
.cache_perf (dcache_perf),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (dcache_reset),
|
||||
|
@ -187,6 +189,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.NUM_OUTPUTS(1),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX(0),
|
||||
|
@ -234,7 +237,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.reset (core_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.sysmem_perf (sysmem_perf_tmp),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (core_dcr_bus_if),
|
||||
|
|
|
@ -166,10 +166,8 @@
|
|||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
|
||||
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
|
||||
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
|
||||
`define VX_CSR_MPM_MEM_BANK_CNTR 12'hB1E // memory bank requests
|
||||
`define VX_CSR_MPM_MEM_BANK_CNTR_H 12'hB9E
|
||||
`define VX_CSR_MPM_MEM_BANK_TICK 12'hB1F // memory ticks
|
||||
`define VX_CSR_MPM_MEM_BANK_TICK_H 12'hB9F
|
||||
`define VX_CSR_MPM_MEM_BANK_ST 12'hB1E // bank conflicts
|
||||
`define VX_CSR_MPM_MEM_BANK_ST_H 12'hB9E
|
||||
// PERF: lmem
|
||||
`define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads
|
||||
`define VX_CSR_MPM_LMEM_READS_H 12'hB9B
|
||||
|
@ -177,6 +175,9 @@
|
|||
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
|
||||
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
|
||||
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
|
||||
// PERF: coalescer
|
||||
`define VX_CSR_MPM_COALESCER_MISS 12'hB1F // coalescer misses
|
||||
`define VX_CSR_MPM_COALESCER_MISS_H 12'hB9F
|
||||
|
||||
// Machine Performance-monitoring memory counters (class 3) ///////////////////
|
||||
// <Add your own counters: use addresses hB03..B1F, hB83..hB9F>
|
||||
|
|
|
@ -50,11 +50,14 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = 'x;
|
||||
assign mem_perf_if.dcache = 'x;
|
||||
assign mem_perf_if.l2cache = 'x;
|
||||
assign mem_perf_if.lmem = 'x;
|
||||
cache_perf_t l3_perf;
|
||||
mem_perf_t mem_perf;
|
||||
sysmem_perf_t sysmem_perf;
|
||||
always @(*) begin
|
||||
sysmem_perf = '0;
|
||||
sysmem_perf.l3cache = l3_perf;
|
||||
sysmem_perf.mem = mem_perf;
|
||||
end
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
|
@ -98,7 +101,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.reset (l3_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_if.l3cache),
|
||||
.cache_perf (l3_perf),
|
||||
`endif
|
||||
|
||||
.core_bus_if (per_cluster_mem_bus_if),
|
||||
|
@ -146,7 +149,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.reset (cluster_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (cluster_dcr_bus_if),
|
||||
|
@ -182,7 +185,6 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -202,7 +204,6 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
|
||||
`endif
|
||||
|
||||
|
|
|
@ -620,6 +620,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.NUM_OUTPUTS (1),
|
||||
.DATA_SIZE (LMEM_DATA_SIZE),
|
||||
.ADDR_WIDTH (CCI_VX_ADDR_WIDTH),
|
||||
.TAG_WIDTH (CCI_VX_TAG_WIDTH),
|
||||
|
@ -1097,7 +1098,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
wire vx_mem_req_fire = vx_mem_req_valid[0] && vx_mem_req_ready[0];
|
||||
wire vx_mem_rsp_fire = vx_mem_rsp_valid[0] && vx_mem_rsp_ready[0];
|
||||
wire avs_req_fire = (avs_write[0] || avs_read[0]) && ~avs_waitrequest[0];
|
||||
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP (0, 0, {
|
||||
vx_reset,
|
||||
|
|
|
@ -50,6 +50,8 @@ module VX_afu_ctrl #(
|
|||
input wire ap_idle,
|
||||
output wire interrupt,
|
||||
|
||||
output wire ap_ctrl_read,
|
||||
|
||||
`ifdef SCOPE
|
||||
input wire scope_bus_in,
|
||||
output wire scope_bus_out,
|
||||
|
@ -368,7 +370,7 @@ module VX_afu_ctrl #(
|
|||
end else begin
|
||||
case (rstate)
|
||||
RSTATE_ADDR: rstate <= s_axi_ar_fire ? RSTATE_DATA : RSTATE_ADDR;
|
||||
RSTATE_DATA: rstate <= (~rvalid_stall) ? RSTATE_RESP : RSTATE_DATA;
|
||||
RSTATE_DATA: rstate <= rvalid_stall ? RSTATE_DATA : RSTATE_RESP;
|
||||
RSTATE_RESP: rstate <= s_axi_r_fire ? RSTATE_ADDR : RSTATE_RESP;
|
||||
default: rstate <= RSTATE_ADDR;
|
||||
endcase
|
||||
|
@ -430,6 +432,8 @@ module VX_afu_ctrl #(
|
|||
assign ap_start = ap_start_r;
|
||||
assign interrupt = gie_r & (| isr_r);
|
||||
|
||||
assign ap_ctrl_read = s_axi_r_fire && (raddr == ADDR_AP_CTRL);
|
||||
|
||||
assign dcr_wr_valid = dcr_wr_valid_r;
|
||||
assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r);
|
||||
assign dcr_wr_data = `VX_DCR_DATA_WIDTH'(dcrv_r);
|
||||
|
|
|
@ -10,6 +10,8 @@
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Reference: https://www.xilinx.com/developer/articles/porting-rtl-designs-to-vitis-rtl-kernels.html
|
||||
|
||||
`include "vortex_afu.vh"
|
||||
|
||||
|
@ -35,17 +37,21 @@ module VX_afu_wrap #(
|
|||
input wire s_axi_ctrl_awvalid,
|
||||
output wire s_axi_ctrl_awready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
|
||||
|
||||
input wire s_axi_ctrl_wvalid,
|
||||
output wire s_axi_ctrl_wready,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
|
||||
|
||||
input wire s_axi_ctrl_arvalid,
|
||||
output wire s_axi_ctrl_arready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
|
||||
|
||||
output wire s_axi_ctrl_rvalid,
|
||||
input wire s_axi_ctrl_rready,
|
||||
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
|
||||
output wire [1:0] s_axi_ctrl_rresp,
|
||||
|
||||
output wire s_axi_ctrl_bvalid,
|
||||
input wire s_axi_ctrl_bready,
|
||||
output wire [1:0] s_axi_ctrl_bresp,
|
||||
|
@ -58,8 +64,12 @@ module VX_afu_wrap #(
|
|||
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH;
|
||||
`endif
|
||||
|
||||
localparam STATE_IDLE = 0;
|
||||
localparam STATE_RUN = 1;
|
||||
typedef enum logic [1:0] {
|
||||
STATE_IDLE = 0,
|
||||
STATE_INIT = 1,
|
||||
STATE_RUN = 2,
|
||||
STATE_DONE = 3
|
||||
} state_e;
|
||||
|
||||
localparam PENDING_SIZEW = 12; // max outstanding requests size
|
||||
localparam C_M_AXI_MEM_NUM_BANKS_SW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
|
||||
|
@ -69,20 +79,24 @@ module VX_afu_wrap #(
|
|||
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_awid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [7:0] m_axi_mem_awlen_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire m_axi_mem_wvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_wready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_wdata_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire m_axi_mem_bvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_bready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_bid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [1:0] m_axi_mem_bresp_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire m_axi_mem_arvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_arready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_arid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [7:0] m_axi_mem_arlen_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire m_axi_mem_rvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_rready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_rdata_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
@ -99,7 +113,6 @@ module VX_afu_wrap #(
|
|||
|
||||
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
|
||||
reg [PENDING_SIZEW-1:0] vx_pending_writes;
|
||||
reg vx_busy_wait;
|
||||
reg vx_reset = 1; // asserted at initialization
|
||||
wire vx_busy;
|
||||
|
||||
|
@ -107,13 +120,16 @@ module VX_afu_wrap #(
|
|||
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
|
||||
wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
|
||||
|
||||
reg state;
|
||||
state_e state;
|
||||
|
||||
wire ap_reset;
|
||||
wire ap_start;
|
||||
wire ap_idle = vx_reset;
|
||||
wire ap_done = (state == STATE_IDLE) && (vx_pending_writes == '0);
|
||||
wire ap_ready = 1'b1;
|
||||
wire ap_ctrl_read;
|
||||
wire ap_idle = (state == STATE_IDLE);
|
||||
wire ap_done = (state == STATE_DONE) && (vx_pending_writes == '0);
|
||||
wire ap_ready = ap_done;
|
||||
|
||||
wire ap_done_ack = ap_done && ap_ctrl_read;
|
||||
|
||||
`ifdef SCOPE
|
||||
wire scope_bus_in;
|
||||
|
@ -130,41 +146,50 @@ module VX_afu_wrap #(
|
|||
STATE_IDLE: begin
|
||||
if (ap_start) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Goto STATE RUN\n", $time))
|
||||
`TRACE(2, ("%t: AFU: Begin initialization\n", $time))
|
||||
`endif
|
||||
state <= STATE_RUN;
|
||||
state <= STATE_INIT;
|
||||
vx_reset_ctr <= (`RESET_DELAY-1);
|
||||
vx_reset <= 1;
|
||||
end
|
||||
end
|
||||
STATE_RUN: begin
|
||||
STATE_INIT: begin
|
||||
if (vx_reset) begin
|
||||
// wait until the reset network is ready
|
||||
// wait for reset to complete
|
||||
if (vx_reset_ctr == 0) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Begin execution\n", $time))
|
||||
`TRACE(2, ("%t: AFU: Initialization completed\n", $time))
|
||||
`endif
|
||||
vx_busy_wait <= 1;
|
||||
vx_reset <= 0;
|
||||
end
|
||||
end else begin
|
||||
if (vx_busy_wait) begin
|
||||
// wait until processor goes busy
|
||||
if (vx_busy) begin
|
||||
vx_busy_wait <= 0;
|
||||
end
|
||||
end else begin
|
||||
// wait until the processor is not busy
|
||||
if (~vx_busy) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: End execution\n", $time))
|
||||
`TRACE(2, ("%t: AFU: Goto STATE IDLE\n", $time))
|
||||
`endif
|
||||
state <= STATE_IDLE;
|
||||
end
|
||||
// wait until processor goes busy
|
||||
if (vx_busy) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Begin execution\n", $time))
|
||||
`endif
|
||||
state <= STATE_RUN;
|
||||
end
|
||||
end
|
||||
end
|
||||
STATE_RUN: begin
|
||||
// wait until the processor is not busy
|
||||
if (~vx_busy) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Execution completed\n", $time))
|
||||
`endif
|
||||
state <= STATE_DONE;
|
||||
end
|
||||
end
|
||||
STATE_DONE: begin
|
||||
// wait for host's done acknowledgement
|
||||
if (ap_done_ack) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Processor idle\n", $time))
|
||||
`endif
|
||||
state <= STATE_IDLE;
|
||||
end
|
||||
end
|
||||
endcase
|
||||
|
||||
// ensure reset network initialization
|
||||
|
@ -177,7 +202,7 @@ module VX_afu_wrap #(
|
|||
wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire;
|
||||
wire [C_M_AXI_MEM_NUM_BANKS_SW-1:0] cur_wr_reqs, cur_wr_rsps;
|
||||
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_awfire
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire
|
||||
VX_axi_write_ack axi_write_ack (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -190,7 +215,10 @@ module VX_afu_wrap #(
|
|||
`UNUSED_PIN (w_ack),
|
||||
`UNUSED_PIN (tx_rdy)
|
||||
);
|
||||
assign m_axi_wr_rsp_fire[i] = m_axi_mem_bvalid_a[i] & m_axi_mem_bready_a[i];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_rsp_fire
|
||||
assign m_axi_wr_rsp_fire[i] = m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
|
||||
end
|
||||
|
||||
`POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire);
|
||||
|
@ -217,17 +245,21 @@ module VX_afu_wrap #(
|
|||
.s_axi_awvalid (s_axi_ctrl_awvalid),
|
||||
.s_axi_awready (s_axi_ctrl_awready),
|
||||
.s_axi_awaddr (s_axi_ctrl_awaddr),
|
||||
|
||||
.s_axi_wvalid (s_axi_ctrl_wvalid),
|
||||
.s_axi_wready (s_axi_ctrl_wready),
|
||||
.s_axi_wdata (s_axi_ctrl_wdata),
|
||||
.s_axi_wstrb (s_axi_ctrl_wstrb),
|
||||
|
||||
.s_axi_arvalid (s_axi_ctrl_arvalid),
|
||||
.s_axi_arready (s_axi_ctrl_arready),
|
||||
.s_axi_araddr (s_axi_ctrl_araddr),
|
||||
|
||||
.s_axi_rvalid (s_axi_ctrl_rvalid),
|
||||
.s_axi_rready (s_axi_ctrl_rready),
|
||||
.s_axi_rdata (s_axi_ctrl_rdata),
|
||||
.s_axi_rresp (s_axi_ctrl_rresp),
|
||||
|
||||
.s_axi_bvalid (s_axi_ctrl_bvalid),
|
||||
.s_axi_bready (s_axi_ctrl_bready),
|
||||
.s_axi_bresp (s_axi_ctrl_bresp),
|
||||
|
@ -238,6 +270,8 @@ module VX_afu_wrap #(
|
|||
.ap_ready (ap_ready),
|
||||
.ap_idle (ap_idle),
|
||||
.interrupt (interrupt),
|
||||
|
||||
.ap_ctrl_read (ap_ctrl_read),
|
||||
|
||||
`ifdef SCOPE
|
||||
.scope_bus_in (scope_bus_out),
|
||||
|
@ -328,9 +362,9 @@ module VX_afu_wrap #(
|
|||
`ifdef DBG_SCOPE_AFU
|
||||
wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0];
|
||||
wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0];
|
||||
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
|
||||
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
|
||||
wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0];
|
||||
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP (0, 0, {
|
||||
ap_reset,
|
||||
|
@ -340,6 +374,7 @@ module VX_afu_wrap #(
|
|||
interrupt,
|
||||
vx_reset,
|
||||
vx_busy,
|
||||
state,
|
||||
m_axi_mem_awvalid_a[0],
|
||||
m_axi_mem_awready_a[0],
|
||||
m_axi_mem_wvalid_a[0],
|
||||
|
@ -356,7 +391,7 @@ module VX_afu_wrap #(
|
|||
m_axi_mem_arfire_0,
|
||||
m_axi_mem_wfire_0,
|
||||
m_axi_mem_bfire_0
|
||||
},{
|
||||
}, {
|
||||
dcr_wr_addr,
|
||||
dcr_wr_data,
|
||||
vx_pending_writes,
|
||||
|
@ -383,11 +418,11 @@ module VX_afu_wrap #(
|
|||
ap_start,
|
||||
ap_done,
|
||||
ap_idle,
|
||||
state,
|
||||
interrupt
|
||||
}),
|
||||
.probe1 ({
|
||||
vx_pending_writes,
|
||||
vx_busy_wait,
|
||||
vx_busy,
|
||||
vx_reset,
|
||||
dcr_wr_valid,
|
||||
|
@ -428,16 +463,19 @@ module VX_afu_wrap #(
|
|||
always @(posedge clk) begin
|
||||
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
|
||||
if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]))
|
||||
`TRACE(2, ("%t: AXI Wr Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]))
|
||||
end
|
||||
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i]))
|
||||
`TRACE(2, ("%t: AXI Wr Req [%0d]: strb=0x%h, data=0x%h\n", $time, i, m_axi_mem_wstrb_a[i], m_axi_mem_wdata_a[i]))
|
||||
end
|
||||
if (m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Wr Rsp [%0d]: id=0x%0h\n", $time, i, m_axi_mem_bid_a[i]))
|
||||
end
|
||||
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]))
|
||||
`TRACE(2, ("%t: AXI Rd Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]))
|
||||
end
|
||||
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]))
|
||||
`TRACE(2, ("%t: AXI Rd Rsp [%0d]: data=0x%h, id=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -40,18 +40,22 @@ module vortex_afu #(
|
|||
input wire s_axi_ctrl_awvalid,
|
||||
output wire s_axi_ctrl_awready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
|
||||
|
||||
input wire s_axi_ctrl_wvalid,
|
||||
output wire s_axi_ctrl_wready,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
|
||||
input wire s_axi_ctrl_arvalid,
|
||||
|
||||
input wire s_axi_ctrl_arvalid,
|
||||
output wire s_axi_ctrl_arready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
|
||||
|
||||
output wire s_axi_ctrl_rvalid,
|
||||
input wire s_axi_ctrl_rready,
|
||||
input wire s_axi_ctrl_rready,
|
||||
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
|
||||
output wire [1:0] s_axi_ctrl_rresp,
|
||||
output wire s_axi_ctrl_bvalid,
|
||||
|
||||
output wire s_axi_ctrl_bvalid,
|
||||
input wire s_axi_ctrl_bready,
|
||||
output wire [1:0] s_axi_ctrl_bresp,
|
||||
|
||||
|
@ -76,17 +80,21 @@ module vortex_afu #(
|
|||
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
|
||||
.s_axi_ctrl_awready (s_axi_ctrl_awready),
|
||||
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
|
||||
|
||||
.s_axi_ctrl_wvalid (s_axi_ctrl_wvalid),
|
||||
.s_axi_ctrl_wready (s_axi_ctrl_wready),
|
||||
.s_axi_ctrl_wdata (s_axi_ctrl_wdata),
|
||||
.s_axi_ctrl_wstrb (s_axi_ctrl_wstrb),
|
||||
|
||||
.s_axi_ctrl_arvalid (s_axi_ctrl_arvalid),
|
||||
.s_axi_ctrl_arready (s_axi_ctrl_arready),
|
||||
.s_axi_ctrl_araddr (s_axi_ctrl_araddr),
|
||||
|
||||
.s_axi_ctrl_rvalid (s_axi_ctrl_rvalid),
|
||||
.s_axi_ctrl_rready (s_axi_ctrl_rready),
|
||||
.s_axi_ctrl_rdata (s_axi_ctrl_rdata),
|
||||
.s_axi_ctrl_rresp (s_axi_ctrl_rresp),
|
||||
|
||||
.s_axi_ctrl_bvalid (s_axi_ctrl_bvalid),
|
||||
.s_axi_ctrl_bready (s_axi_ctrl_bready),
|
||||
.s_axi_ctrl_bresp (s_axi_ctrl_bresp),
|
||||
|
|
19
hw/rtl/cache/VX_cache.sv
vendored
19
hw/rtl/cache/VX_cache.sv
vendored
|
@ -52,7 +52,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
parameter DIRTY_BYTES = 0,
|
||||
|
||||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_CYCLIC,
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
@ -106,10 +106,9 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
localparam MEM_ARB_SEL_BITS = `CLOG2(`CDIV(NUM_BANKS, MEM_PORTS));
|
||||
localparam MEM_ARB_SEL_WIDTH = `UP(MEM_ARB_SEL_BITS);
|
||||
|
||||
localparam CORE_RSP_REG_DISABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
|
||||
localparam MEM_REQ_REG_DISABLE = (NUM_BANKS != 1);
|
||||
|
||||
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
|
||||
localparam REQ_XBAR_BUF = (NUM_REQS > 2) ? 2 : 0;
|
||||
localparam CORE_RSP_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
|
||||
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
|
||||
|
@ -133,7 +132,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.NUM_BANKS (NUM_BANKS),
|
||||
.UUID_WIDTH(UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
|
||||
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // request xbar latency
|
||||
) flush_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -387,8 +386,8 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.FLAGS_WIDTH (FLAGS_WIDTH),
|
||||
.CORE_OUT_REG (CORE_RSP_REG_DISABLE ? 0 : 1),
|
||||
.MEM_OUT_REG (MEM_REQ_REG_DISABLE ? 0 : 1)
|
||||
.CORE_OUT_REG (CORE_RSP_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)),
|
||||
.MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) bank (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -481,7 +480,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
|
||||
.SIZE (CORE_RSP_REG_DISABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
|
||||
.SIZE (CORE_RSP_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_buf (
|
||||
.clk (clk),
|
||||
|
@ -578,7 +577,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
|
||||
.SIZE (MEM_REQ_REG_DISABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
|
||||
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
.clk (clk),
|
||||
|
|
11
hw/rtl/cache/VX_cache_bank.sv
vendored
11
hw/rtl/cache/VX_cache_bank.sv
vendored
|
@ -48,7 +48,7 @@ module VX_cache_bank #(
|
|||
parameter DIRTY_BYTES = 0,
|
||||
|
||||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_CYCLIC,
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
@ -353,9 +353,11 @@ module VX_cache_bank #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (pipe_stall),
|
||||
.hit_valid (do_lookup_st1 && is_hit_st1 && ~pipe_stall),
|
||||
.hit_line (line_idx_st1),
|
||||
.hit_way (way_idx_st1),
|
||||
.init (do_init_st0),
|
||||
.lookup_valid(do_lookup_st1 && ~pipe_stall),
|
||||
.lookup_hit (is_hit_st1),
|
||||
.lookup_line(line_idx_st1),
|
||||
.lookup_way (way_idx_st1),
|
||||
.repl_valid (do_fill_st0 && ~pipe_stall),
|
||||
.repl_line (line_idx_st0),
|
||||
.repl_way (victim_way_st0)
|
||||
|
@ -443,7 +445,6 @@ module VX_cache_bank #(
|
|||
) cache_data (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (pipe_stall),
|
||||
// inputs
|
||||
.init (do_init_st0),
|
||||
.fill (do_fill_st0 && ~pipe_stall),
|
||||
|
|
2
hw/rtl/cache/VX_cache_cluster.sv
vendored
2
hw/rtl/cache/VX_cache_cluster.sv
vendored
|
@ -56,7 +56,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
parameter DIRTY_BYTES = 0,
|
||||
|
||||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_CYCLIC,
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
|
2
hw/rtl/cache/VX_cache_data.sv
vendored
2
hw/rtl/cache/VX_cache_data.sv
vendored
|
@ -33,7 +33,6 @@ module VX_cache_data #(
|
|||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire stall,
|
||||
// inputs
|
||||
input wire init,
|
||||
input wire fill,
|
||||
|
@ -53,7 +52,6 @@ module VX_cache_data #(
|
|||
output wire [LINE_SIZE-1:0] evict_byteen
|
||||
);
|
||||
`UNUSED_PARAM (WORD_SIZE)
|
||||
`UNUSED_VAR (stall)
|
||||
|
||||
wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_mask;
|
||||
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin : g_write_mask
|
||||
|
|
2
hw/rtl/cache/VX_cache_define.vh
vendored
2
hw/rtl/cache/VX_cache_define.vh
vendored
|
@ -73,7 +73,7 @@
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CS_REPL_RANDOM 0
|
||||
`define CS_REPL_CYCLIC 1
|
||||
`define CS_REPL_FIFO 1
|
||||
`define CS_REPL_PLRU 2
|
||||
|
||||
`endif // VX_CACHE_DEFINE_VH
|
||||
|
|
66
hw/rtl/cache/VX_cache_repl.sv
vendored
66
hw/rtl/cache/VX_cache_repl.sv
vendored
|
@ -90,19 +90,23 @@ module VX_cache_repl #(
|
|||
// Number of associative ways
|
||||
parameter NUM_WAYS = 1,
|
||||
// replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_CYCLIC
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire stall,
|
||||
input wire hit_valid,
|
||||
input wire [`CS_LINE_SEL_BITS-1:0] hit_line,
|
||||
input wire [`CS_WAY_SEL_WIDTH-1:0] hit_way,
|
||||
input wire init,
|
||||
input wire lookup_valid,
|
||||
input wire lookup_hit,
|
||||
input wire [`CS_LINE_SEL_BITS-1:0] lookup_line,
|
||||
input wire [`CS_WAY_SEL_WIDTH-1:0] lookup_way,
|
||||
input wire repl_valid,
|
||||
input wire [`CS_LINE_SEL_BITS-1:0] repl_line,
|
||||
output wire [`CS_WAY_SEL_WIDTH-1:0] repl_way
|
||||
);
|
||||
localparam WAY_SEL_WIDTH = `CS_WAY_SEL_WIDTH;
|
||||
`UNUSED_VAR (reset)
|
||||
`UNUSED_VAR (init)
|
||||
`UNUSED_VAR (stall)
|
||||
|
||||
if (NUM_WAYS > 1) begin : g_enable
|
||||
|
@ -122,20 +126,20 @@ module VX_cache_repl #(
|
|||
.RADDR_REG (1)
|
||||
) plru_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (1'b0),
|
||||
.read (repl_valid),
|
||||
.write (hit_valid),
|
||||
.wren (plru_wmask),
|
||||
.waddr (hit_line),
|
||||
.write (init || (lookup_valid && lookup_hit)),
|
||||
.wren (init ? '1 : plru_wmask),
|
||||
.waddr (lookup_line),
|
||||
.raddr (repl_line),
|
||||
.wdata (plru_wdata),
|
||||
.wdata (init ? '0 : plru_wdata),
|
||||
.rdata (plru_rdata)
|
||||
);
|
||||
|
||||
plru_decoder #(
|
||||
.NUM_WAYS (NUM_WAYS)
|
||||
) plru_dec (
|
||||
.way_idx (hit_way),
|
||||
.way_idx (lookup_way),
|
||||
.lru_data (plru_wdata),
|
||||
.lru_mask (plru_wmask)
|
||||
);
|
||||
|
@ -147,37 +151,39 @@ module VX_cache_repl #(
|
|||
.way_idx (repl_way)
|
||||
);
|
||||
|
||||
end else if (REPL_POLICY == `CS_REPL_CYCLIC) begin : g_cyclic
|
||||
// Cyclic replacement policy
|
||||
`UNUSED_VAR (hit_valid)
|
||||
`UNUSED_VAR (hit_line)
|
||||
`UNUSED_VAR (hit_way)
|
||||
end else if (REPL_POLICY == `CS_REPL_FIFO) begin : g_fifo
|
||||
// Fifo replacement policy
|
||||
`UNUSED_VAR (lookup_valid)
|
||||
`UNUSED_VAR (lookup_hit)
|
||||
`UNUSED_VAR (lookup_line)
|
||||
`UNUSED_VAR (lookup_way)
|
||||
|
||||
wire [WAY_SEL_WIDTH-1:0] ctr_rdata;
|
||||
wire [WAY_SEL_WIDTH-1:0] ctr_wdata = ctr_rdata + 1;
|
||||
wire [WAY_SEL_WIDTH-1:0] fifo_rdata;
|
||||
wire [WAY_SEL_WIDTH-1:0] fifo_wdata = fifo_rdata + 1;
|
||||
|
||||
VX_sp_ram #(
|
||||
.DATAW (WAY_SEL_WIDTH),
|
||||
.SIZE (`CS_LINES_PER_BANK),
|
||||
.RDW_MODE ("R"),
|
||||
.RADDR_REG (1)
|
||||
) ctr_store (
|
||||
) fifo_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (1'b0),
|
||||
.read (repl_valid),
|
||||
.write (repl_valid),
|
||||
.write (init || repl_valid),
|
||||
.wren (1'b1),
|
||||
.addr (repl_line),
|
||||
.wdata (ctr_wdata),
|
||||
.rdata (ctr_rdata)
|
||||
.wdata (init ? '0 : fifo_wdata),
|
||||
.rdata (fifo_rdata)
|
||||
);
|
||||
|
||||
assign repl_way = ctr_rdata;
|
||||
assign repl_way = fifo_rdata;
|
||||
end else begin : g_random
|
||||
// Random replacement policy
|
||||
`UNUSED_VAR (hit_valid)
|
||||
`UNUSED_VAR (hit_line)
|
||||
`UNUSED_VAR (hit_way)
|
||||
`UNUSED_VAR (lookup_valid)
|
||||
`UNUSED_VAR (lookup_hit)
|
||||
`UNUSED_VAR (lookup_line)
|
||||
`UNUSED_VAR (lookup_way)
|
||||
`UNUSED_VAR (repl_valid)
|
||||
`UNUSED_VAR (repl_line)
|
||||
reg [WAY_SEL_WIDTH-1:0] victim_idx;
|
||||
|
@ -192,10 +198,10 @@ module VX_cache_repl #(
|
|||
end
|
||||
end else begin : g_disable
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
`UNUSED_VAR (hit_valid)
|
||||
`UNUSED_VAR (hit_line)
|
||||
`UNUSED_VAR (hit_way)
|
||||
`UNUSED_VAR (lookup_valid)
|
||||
`UNUSED_VAR (lookup_hit)
|
||||
`UNUSED_VAR (lookup_line)
|
||||
`UNUSED_VAR (lookup_way)
|
||||
`UNUSED_VAR (repl_valid)
|
||||
`UNUSED_VAR (repl_line)
|
||||
assign repl_way = 1'b0;
|
||||
|
|
2
hw/rtl/cache/VX_cache_top.sv
vendored
2
hw/rtl/cache/VX_cache_top.sv
vendored
|
@ -153,7 +153,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
VX_cache #(
|
||||
VX_cache_wrap #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
|
|
62
hw/rtl/cache/VX_cache_wrap.sv
vendored
62
hw/rtl/cache/VX_cache_wrap.sv
vendored
|
@ -54,7 +54,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
parameter DIRTY_BYTES = 0,
|
||||
|
||||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_CYCLIC,
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
@ -210,7 +210,59 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
// Performance counters for this branch of VX_cache_wrap (compiled only with PERF_ENABLE).
// Counts accepted core read requests, accepted core write requests, stalled core
// responses and stalled memory requests, then exports them through cache_perf.
`ifdef PERF_ENABLE
|
||||
// NOTE(review): this whole-struct zero assignment and the per-field assigns at the end of
// this region both drive cache_perf. In this diff view it appears to be the pre-change
// line that the per-field assigns replace - confirm it is absent from the final source.
assign cache_perf = '0;
|
||||
// One event flag per core request port / per memory port for the current cycle.
wire [NUM_REQS-1:0] perf_core_reads_per_req;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_req;
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
|
||||
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
|
||||
|
||||
// Despite its label, this generate loop produces the read, write and response-stall flags.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
|
||||
// read accepted: request handshake (valid && ready) with rw == 0
assign perf_core_reads_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && ~core_bus_if[i].req_data.rw;
|
||||
// write accepted: request handshake (valid && ready) with rw == 1
assign perf_core_writes_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && core_bus_if[i].req_data.rw;
|
||||
// response stall: a response is valid but the core side is not ready to take it
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
|
||||
// memory stall: a memory request is valid but the memory side is not ready
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
|
||||
end
|
||||
|
||||
// per cycle: core reads, core writes, core response stalls, memory request stalls
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
|
||||
|
||||
// Reduce each flag vector to a count (presumably a population count, per the macro name;
// the macro itself is defined outside this view).
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
|
||||
|
||||
// Running totals, PERF_CTR_BITS wide.
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
// Synchronous accumulate: cleared while reset is asserted, otherwise each total grows by
// this cycle's count (cast up to the counter width before the add).
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_core_reads <= '0;
|
||||
perf_core_writes <= '0;
|
||||
perf_mem_stalls <= '0;
|
||||
perf_crsp_stalls <= '0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
// Export: only reads, writes, mem_stalls and crsp_stalls carry live counters here;
// the miss, bank-stall and MSHR-stall fields are tied to zero in this region.
assign cache_perf.reads = perf_core_reads;
|
||||
assign cache_perf.writes = perf_core_writes;
|
||||
assign cache_perf.read_misses = '0;
|
||||
assign cache_perf.write_misses = '0;
|
||||
assign cache_perf.bank_stalls = '0;
|
||||
assign cache_perf.mshr_stalls = '0;
|
||||
assign cache_perf.mem_stalls = perf_mem_stalls;
|
||||
assign cache_perf.crsp_stalls = perf_crsp_stalls;
|
||||
`endif
|
||||
|
||||
end
|
||||
|
@ -220,13 +272,13 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
always @(posedge clk) begin
|
||||
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
|
||||
if (core_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
|
||||
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.tag.uuid))
|
||||
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, i, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
|
||||
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -101,7 +101,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
.data_out ({commit_fire_any_r, commit_size_r})
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (COMMIT_SIZEW),
|
||||
.DATAW_OUT (COMMIT_ALL_SIZEW),
|
||||
.N (`ISSUE_WIDTH),
|
||||
|
|
|
@ -28,7 +28,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
`endif
|
||||
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
@ -65,14 +65,15 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
) lsu_mem_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
lmem_perf_t lmem_perf;
|
||||
coalescer_perf_t coalescer_perf;
|
||||
pipeline_perf_t pipeline_perf;
|
||||
sysmem_perf_t sysmem_perf_tmp;
|
||||
always @(*) begin
|
||||
sysmem_perf_tmp = sysmem_perf;
|
||||
sysmem_perf_tmp.lmem = lmem_perf;
|
||||
sysmem_perf_tmp.coalescer = coalescer_perf;
|
||||
end
|
||||
`endif
|
||||
|
||||
base_dcrs_t base_dcrs;
|
||||
|
@ -94,7 +95,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sched_perf (pipeline_perf_if.sched),
|
||||
.sched_perf (pipeline_perf.sched),
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
@ -144,7 +145,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.issue_perf (pipeline_perf_if.issue),
|
||||
.issue_perf (pipeline_perf.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
|
@ -162,8 +163,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sysmem_perf (sysmem_perf_tmp),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
@ -200,7 +201,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.lmem_perf (mem_perf_tmp_if.lmem),
|
||||
.lmem_perf (lmem_perf),
|
||||
.coalescer_perf(coalescer_perf),
|
||||
`endif
|
||||
.lsu_mem_if (lsu_mem_if),
|
||||
.dcache_bus_if (dcache_bus_if)
|
||||
|
@ -276,12 +278,11 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
assign pipeline_perf_if.ifetches = perf_ifetches;
|
||||
assign pipeline_perf_if.loads = perf_loads;
|
||||
assign pipeline_perf_if.stores = perf_stores;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
assign pipeline_perf.ifetches = perf_ifetches;
|
||||
assign pipeline_perf.loads = perf_loads;
|
||||
assign pipeline_perf.stores = perf_stores;
|
||||
assign pipeline_perf.ifetch_latency = perf_icache_lat;
|
||||
assign pipeline_perf.load_latency = perf_dcache_lat;
|
||||
|
||||
`endif
|
||||
|
||||
|
|
|
@ -127,13 +127,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
assign icache_rsp_ready = icache_bus_if.rsp_ready;
|
||||
|
||||
// Memory-system perf inputs for the core instantiated by VX_core_top (PERF_ENABLE only).
// This diff view interleaves the removed interface-based form (VX_mem_perf_if mem_perf_if)
// with the added struct-based form (sysmem_perf_t mem_perf); in both, every listed
// field is tied to zero.
// NOTE(review): VX_core also writes a `coalescer` field of sysmem_perf_t
// (sysmem_perf_tmp.coalescer); that field is not assigned here - confirm whether it
// should be zeroed as well.
// NOTE(review): the core instance further down this file connects
// `.sysmem_perf (sysmem_perf)`, but the struct declared here is named `mem_perf` and no
// `sysmem_perf` declaration is visible - confirm the intended signal name.
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = '0;
|
||||
assign mem_perf_if.dcache = '0;
|
||||
assign mem_perf_if.l2cache = '0;
|
||||
assign mem_perf_if.l3cache = '0;
|
||||
assign mem_perf_if.lmem = '0;
|
||||
assign mem_perf_if.mem = '0;
|
||||
sysmem_perf_t mem_perf;
|
||||
assign mem_perf.icache = '0;
|
||||
assign mem_perf.dcache = '0;
|
||||
assign mem_perf.l2cache = '0;
|
||||
assign mem_perf.l3cache = '0;
|
||||
assign mem_perf.lmem = '0;
|
||||
assign mem_perf.mem = '0;
|
||||
`endif
|
||||
|
||||
`ifdef SCOPE
|
||||
|
@ -152,7 +152,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (dcr_bus_if),
|
||||
|
|
|
@ -41,8 +41,8 @@ import VX_fpu_pkg::*;
|
|||
input base_dcrs_t base_dcrs,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
`endif
|
||||
|
||||
VX_commit_csr_if.slave commit_csr_if,
|
||||
|
@ -212,65 +212,67 @@ import VX_fpu_pkg::*;
|
|||
`VX_DCR_MPM_CLASS_CORE: begin
|
||||
case (read_addr)
|
||||
// PERF: pipeline
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf_if.sched.idles);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf_if.sched.stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf_if.issue.ibf_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf_if.issue.scb_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf_if.issue.opd_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_ALU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf.sched.idles);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf.sched.stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_FPU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]);
|
||||
`else
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
|
||||
`endif
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]);
|
||||
// PERF: memory
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf_if.ifetches);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf_if.loads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf_if.stores);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf_if.ifetch_latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf_if.load_latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf.ifetch_latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf.load_latency);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`VX_DCR_MPM_CLASS_MEM: begin
|
||||
case (read_addr)
|
||||
// PERF: icache
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, mem_perf_if.icache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, mem_perf_if.icache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, mem_perf_if.icache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
|
||||
// PERF: dcache
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, mem_perf_if.dcache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, mem_perf_if.dcache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, mem_perf_if.dcache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, mem_perf_if.dcache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, mem_perf_if.dcache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, mem_perf_if.dcache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls);
|
||||
// PERF: lmem
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, mem_perf_if.lmem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, mem_perf_if.lmem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, mem_perf_if.lmem.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
|
||||
// PERF: l2cache
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, mem_perf_if.l2cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, mem_perf_if.l2cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, mem_perf_if.l2cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, mem_perf_if.l2cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l2cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l2cache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls);
|
||||
// PERF: l3cache
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, mem_perf_if.l3cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, mem_perf_if.l3cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, mem_perf_if.l3cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, mem_perf_if.l3cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l3cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l3cache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls);
|
||||
// PERF: memory
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, mem_perf_if.mem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, mem_perf_if.mem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, mem_perf_if.mem.latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency);
|
||||
// PERF: coalescer
|
||||
`CSR_READ_64(`VX_CSR_MPM_COALESCER_MISS, read_data_ro_w, sysmem_perf.coalescer.misses);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
@ -290,8 +292,8 @@ import VX_fpu_pkg::*;
|
|||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`UNUSED_VAR (mem_perf_if.icache);
|
||||
`UNUSED_VAR (mem_perf_if.lmem);
|
||||
`UNUSED_VAR (sysmem_perf.icache);
|
||||
`UNUSED_VAR (sysmem_perf.lmem);
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -24,8 +24,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
input base_dcrs_t base_dcrs,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -82,8 +82,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
.base_dcrs (base_dcrs),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
|
|
|
@ -23,8 +23,8 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
`endif
|
||||
|
||||
input base_dcrs_t base_dcrs,
|
||||
|
@ -93,8 +93,8 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if (pipeline_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
`endif
|
||||
.base_dcrs (base_dcrs),
|
||||
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
|
|
|
@ -53,7 +53,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
VX_dp_ram #(
|
||||
.DATAW (`PC_BITS + `NUM_THREADS),
|
||||
.SIZE (`NUM_WARPS),
|
||||
.RDW_MODE ("R")
|
||||
.RDW_MODE ("R"),
|
||||
.LUTRAM (1)
|
||||
) tag_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -137,6 +138,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
wire schedule_fire = schedule_if.valid && schedule_if.ready;
|
||||
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 1, 6, 3, (
|
||||
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
|
||||
|
|
|
@ -93,6 +93,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
|
|||
`SCOPE_IO_SWITCH (1);
|
||||
wire decode_fire = decode_if.valid && decode_if.ready;
|
||||
wire operands_fire = operands_if.valid && operands_if.ready;
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 2, 4, 3, (
|
||||
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 +
|
||||
|
|
|
@ -535,6 +535,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
`ifdef SCOPE
|
||||
`ifdef DBG_SCOPE_LSU
|
||||
`SCOPE_IO_SWITCH (1);
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 3, 4, 2, (
|
||||
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH
|
||||
|
|
|
@ -20,7 +20,8 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output cache_perf_t lmem_perf,
|
||||
output lmem_perf_t lmem_perf,
|
||||
output coalescer_perf_t coalescer_perf,
|
||||
`endif
|
||||
|
||||
VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS],
|
||||
|
@ -39,7 +40,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
|
||||
|
||||
VX_lsu_mem_if #(
|
||||
VX_lsu_mem_if #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
|
@ -60,46 +61,58 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
);
|
||||
end
|
||||
|
||||
// Single-entry interface array that receives the arbitrated local-memory request stream.
// Its tag width is LMEM_TAG_WIDTH, while the arbiter below is parameterised with
// LSU_TAG_WIDTH (presumably the arbiter widens the tag to record the selected input;
// LMEM_TAG_WIDTH is defined outside this view - confirm).
VX_lsu_mem_if #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH)
|
||||
) lmem_arb_if[1]();
|
||||
|
||||
// Funnels the NUM_LSU_BLOCKS per-block local-memory interfaces (lsu_lmem_if) into the one
// output interface lmem_arb_if. Requests leave unbuffered (REQ_OUT_BUF = 0); responses
// use RSP_OUT_BUF = 2 (buffer-mode encodings are defined by the arbiter module, not shown).
// ARBITER "R" selects the arbitration policy (presumably round-robin - confirm against
// VX_lsu_mem_arb).
VX_lsu_mem_arb #(
|
||||
.NUM_INPUTS (`NUM_LSU_BLOCKS),
|
||||
.NUM_OUTPUTS(1),
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.TAG_SEL_IDX(0),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF(0),
|
||||
.RSP_OUT_BUF(2)
|
||||
) lmem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (lsu_lmem_if),
|
||||
.bus_out_if (lmem_arb_if)
|
||||
);
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lmem_bus_if[LSU_NUM_REQS]();
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH)
|
||||
) lmem_adapt_if[`NUM_LSU_LANES]();
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_adapters
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
|
||||
|
||||
VX_lsu_adapter #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (3),
|
||||
.RSP_OUT_BUF (2)
|
||||
) lmem_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.lsu_mem_if (lsu_lmem_if[i]),
|
||||
.mem_bus_if (lmem_bus_tmp_if)
|
||||
);
|
||||
|
||||
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_lmem_bus_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
|
||||
end
|
||||
end
|
||||
VX_lsu_adapter #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (3),
|
||||
.RSP_OUT_BUF (0)
|
||||
) lmem_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.lsu_mem_if (lmem_arb_if[0]),
|
||||
.mem_bus_if (lmem_adapt_if)
|
||||
);
|
||||
|
||||
VX_local_mem #(
|
||||
.INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))),
|
||||
.SIZE (1 << `LMEM_LOG_SIZE),
|
||||
.NUM_REQS (LSU_NUM_REQS),
|
||||
.NUM_REQS (`NUM_LSU_LANES),
|
||||
.NUM_BANKS (`LMEM_NUM_BANKS),
|
||||
.WORD_SIZE (LSU_WORD_SIZE),
|
||||
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH),
|
||||
.OUT_BUF (3)
|
||||
) local_mem (
|
||||
.clk (clk),
|
||||
|
@ -107,7 +120,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
.lmem_perf (lmem_perf),
|
||||
`endif
|
||||
.mem_bus_if (lmem_bus_if)
|
||||
.mem_bus_if (lmem_adapt_if)
|
||||
);
|
||||
|
||||
`else
|
||||
|
@ -115,6 +128,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
assign lmem_perf = '0;
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]);
|
||||
end
|
||||
|
@ -127,6 +141,21 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
// Coalescer miss statistics (PERF_ENABLE only): each LSU block supplies one
// PERF_CTR_BITS-wide miss counter in per_block_coalescer_misses; these are combined
// into a single total that feeds coalescer_perf.misses.
`ifdef PERF_ENABLE
|
||||
// One counter per LSU block; driven elsewhere in this module (by each coalescer instance,
// or tied to zero when coalescing is not generated).
wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] coalescer_misses;
|
||||
// Reduction with OP "+" over the NUM_LSU_BLOCKS counters; input and output widths are
// both PERF_CTR_BITS, so the sum is not widened.
VX_reduce_tree #(
|
||||
.DATAW_IN (`PERF_CTR_BITS),
|
||||
.DATAW_OUT (`PERF_CTR_BITS),
|
||||
.N (`NUM_LSU_BLOCKS),
|
||||
.OP ("+")
|
||||
) coalescer_reduce (
|
||||
.data_in (per_block_coalescer_misses),
|
||||
.data_out (coalescer_misses)
|
||||
);
|
||||
// Pass the total to the output struct through the BUFFER macro (presumably a pipeline
// register, per its name; the macro is defined outside this view).
`BUFFER(coalescer_perf.misses, coalescer_misses);
|
||||
`endif
|
||||
|
||||
if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers
|
||||
|
@ -139,11 +168,18 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
|
||||
.QUEUE_SIZE (`LSUQ_OUT_SIZE),
|
||||
.PERF_CTR_BITS (`PERF_CTR_BITS)
|
||||
) mem_coalescer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.misses (per_block_coalescer_misses[i]),
|
||||
`else
|
||||
`UNUSED_PIN (misses),
|
||||
`endif
|
||||
|
||||
// Input request
|
||||
.in_req_valid (lsu_dcache_if[i].req_valid),
|
||||
.in_req_mask (lsu_dcache_if[i].req_data.mask),
|
||||
|
@ -186,6 +222,9 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
|
||||
`ifdef PERF_ENABLE
|
||||
assign per_block_coalescer_misses[i] = '0;
|
||||
`endif
|
||||
end
|
||||
|
||||
end
|
||||
|
|
|
@ -106,7 +106,6 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (PER_BANK_ADDRW),
|
||||
.ARBITER ("P"), // use priority arbiter
|
||||
.PERF_CTR_BITS(`PERF_CTR_BITS),
|
||||
.OUT_BUF (0) // no output buffering
|
||||
) req_xbar (
|
||||
.clk (clk),
|
||||
|
@ -271,7 +270,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
.RESET_RAM (1),
|
||||
`endif
|
||||
.OUT_REG (1),
|
||||
.RDW_MODE ("U")
|
||||
.RDW_MODE ("R")
|
||||
) gpr_ram (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -68,8 +68,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
|
||||
reg [`PERF_CTR_BITS-1:0] cycles;
|
||||
|
||||
reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;
|
||||
|
||||
wire schedule_fire = schedule_valid && schedule_ready;
|
||||
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
|
||||
|
||||
|
@ -113,6 +111,16 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
barrier_stalls_n= barrier_stalls;
|
||||
warp_pcs_n = warp_pcs;
|
||||
|
||||
// decode unlock
|
||||
if (decode_sched_if.valid && decode_sched_if.unlock) begin
|
||||
stalled_warps_n[decode_sched_if.wid] = 0;
|
||||
end
|
||||
|
||||
// CSR unlock
|
||||
if (sched_csr_if.unlock_warp) begin
|
||||
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
|
||||
end
|
||||
|
||||
// wspawn handling
|
||||
if (wspawn.valid && is_single_warp) begin
|
||||
active_warps_n |= wspawn.wmask;
|
||||
|
@ -170,6 +178,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_data.id)) begin
|
||||
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
|
||||
|
@ -188,16 +197,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
// decode unlock
|
||||
if (decode_sched_if.valid && decode_sched_if.unlock) begin
|
||||
stalled_warps_n[decode_sched_if.wid] = 0;
|
||||
end
|
||||
|
||||
// CSR unlock
|
||||
if (sched_csr_if.unlock_warp) begin
|
||||
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
|
||||
end
|
||||
|
||||
// stall the warp until decode stage
|
||||
if (schedule_fire) begin
|
||||
stalled_warps_n[schedule_wid] = 1;
|
||||
|
@ -223,7 +222,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
active_warps <= '0;
|
||||
thread_masks <= '0;
|
||||
barrier_stalls <= '0;
|
||||
issued_instrs <= '0;
|
||||
cycles <= '0;
|
||||
wspawn.valid <= 0;
|
||||
|
||||
|
@ -268,10 +266,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
end
|
||||
`endif
|
||||
|
||||
if (schedule_if_fire) begin
|
||||
issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
|
||||
end
|
||||
|
||||
if (busy) begin
|
||||
cycles <= cycles + 1;
|
||||
end
|
||||
|
|
|
@ -44,7 +44,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
|
||||
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
|
||||
VX_reduce #(
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
|
@ -53,7 +53,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
.data_out (perf_units_per_cycle)
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (`NUM_SFU_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
|
@ -151,11 +151,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
`endif
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_OPDS; ++i) begin
|
||||
for (genvar i = 0; i < NUM_OPDS; ++i) begin : g_operands_busy_n
|
||||
always @(*) begin
|
||||
operands_busy_n[i] = operands_busy[i];
|
||||
if (ibuffer_fire) begin
|
||||
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
|
||||
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
|
||||
operands_busy_n[i] = 1;
|
||||
end
|
||||
end
|
||||
if (writeback_fire) begin
|
||||
if (ibuffer_fire) begin
|
||||
|
@ -168,9 +171,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
end
|
||||
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
|
||||
operands_busy_n[i] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -185,8 +185,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
inuse_regs[staging_if[w].data.rd] <= 1;
|
||||
end
|
||||
end
|
||||
|
||||
operands_busy <= operands_busy_n;
|
||||
operands_ready[w] <= ~(| operands_busy_n);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
|
||||
|
|
|
@ -21,8 +21,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
`endif
|
||||
|
||||
input base_dcrs_t base_dcrs,
|
||||
|
@ -121,8 +121,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
.execute_if (pe_execute_if[PE_IDX_CSRS]),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
|
|
@ -1,46 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Interface bundling a set of pipeline-level signals: two struct-typed fields
// (sched, issue) and five `PERF_CTR_BITS-wide wires. The `master` modport
// drives every signal; the `slave` modport only reads them.
// NOTE(review): names suggest these are performance counters — confirm against
// the producers that drive the master side.
interface VX_pipeline_perf_if import VX_gpu_pkg::*; ();
|
||||
// struct-typed fields; the types are referenced here without a local
// declaration (the interface imports VX_gpu_pkg::*)
sched_perf_t sched;
|
||||
issue_perf_t issue;
|
||||
|
||||
// presumably instruction-fetch / load / store request counts — TODO confirm
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||
wire [`PERF_CTR_BITS-1:0] loads;
|
||||
wire [`PERF_CTR_BITS-1:0] stores;
|
||||
// presumably accumulated latency for fetches and loads — TODO confirm units
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||
|
||||
// master: the side that produces the values (all signals are outputs)
modport master (
|
||||
output sched,
|
||||
output issue,
|
||||
output ifetches,
|
||||
output loads,
|
||||
output stores,
|
||||
output ifetch_latency,
|
||||
output load_latency
|
||||
);
|
||||
|
||||
// slave: the side that consumes the values (all signals are inputs)
modport slave (
|
||||
input sched,
|
||||
input issue,
|
||||
input ifetches,
|
||||
input loads,
|
||||
input stores,
|
||||
input ifetch_latency,
|
||||
input load_latency
|
||||
);
|
||||
|
||||
endinterface
|
|
@ -1,27 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Interface carrying a single `PERF_CTR_BITS-wide signal, wctl_stalls.
// The `master` modport drives it; the `slave` modport reads it.
// NOTE(review): the name suggests a warp-control stall counter — confirm
// against the unit that drives the master side.
interface VX_sfu_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] wctl_stalls;
|
||||
|
||||
// master: producer side (wctl_stalls is an output)
modport master (
|
||||
output wctl_stalls
|
||||
);
|
||||
|
||||
// slave: consumer side (wctl_stalls is an input)
modport slave (
|
||||
input wctl_stalls
|
||||
);
|
||||
|
||||
endinterface
|
|
@ -31,10 +31,10 @@
|
|||
`RAM_INITIALIZATION \
|
||||
reg [ADDRW-1:0] raddr_r; \
|
||||
always @(posedge clk) begin \
|
||||
if (__re || __we) begin \
|
||||
if (__we) begin \
|
||||
ram[__wa] <= wdata; \
|
||||
end \
|
||||
if (__we) begin \
|
||||
ram[__wa] <= wdata; \
|
||||
end \
|
||||
if (__re) begin \
|
||||
raddr_r <= __ra; \
|
||||
end \
|
||||
end \
|
||||
|
@ -45,14 +45,14 @@
|
|||
`RAM_INITIALIZATION \
|
||||
reg [ADDRW-1:0] raddr_r; \
|
||||
always @(posedge clk) begin \
|
||||
if (__re || __we) begin \
|
||||
if (__we) begin \
|
||||
for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
|
||||
end \
|
||||
if (__we) begin \
|
||||
for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
|
||||
end \
|
||||
end \
|
||||
end \
|
||||
if (__re) begin \
|
||||
raddr_r <= __ra; \
|
||||
end \
|
||||
end \
|
||||
|
@ -63,10 +63,10 @@
|
|||
`RAM_INITIALIZATION \
|
||||
reg [DATAW-1:0] rdata_r; \
|
||||
always @(posedge clk) begin \
|
||||
if (__re || __we) begin \
|
||||
if (__we) begin \
|
||||
ram[__wa] <= wdata; \
|
||||
end \
|
||||
if (__we) begin \
|
||||
ram[__wa] <= wdata; \
|
||||
end \
|
||||
if (__re) begin \
|
||||
rdata_r <= ram[__ra]; \
|
||||
end \
|
||||
end \
|
||||
|
@ -77,14 +77,14 @@
|
|||
`RAM_INITIALIZATION \
|
||||
reg [DATAW-1:0] rdata_r; \
|
||||
always @(posedge clk) begin \
|
||||
if (__re || __we) begin \
|
||||
if (__we) begin \
|
||||
for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
|
||||
end \
|
||||
if (__we) begin \
|
||||
for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
|
||||
end \
|
||||
end \
|
||||
end \
|
||||
if (__re) begin \
|
||||
rdata_r <= ram[__ra]; \
|
||||
end \
|
||||
end \
|
||||
|
@ -122,6 +122,7 @@ module VX_async_ram_patch #(
|
|||
parameter DUAL_PORT = 0,
|
||||
parameter FORCE_BRAM = 0,
|
||||
parameter RADDR_REG = 0, // read address registered hint
|
||||
parameter RADDR_RESET = 0, // read address has reset
|
||||
parameter WRITE_FIRST = 0,
|
||||
parameter INIT_ENABLE = 0,
|
||||
parameter INIT_FILE = "",
|
||||
|
@ -143,16 +144,24 @@ module VX_async_ram_patch #(
|
|||
`UNUSED_VAR (reset)
|
||||
|
||||
(* keep = "true" *) wire [ADDRW-1:0] raddr_w, raddr_s;
|
||||
(* keep = "true" *) wire read_s, is_raddr_reg;
|
||||
|
||||
(* keep = "true" *) wire read_s;
|
||||
assign raddr_w = raddr;
|
||||
|
||||
wire raddr_reset_w;
|
||||
if (RADDR_RESET) begin : g_raddr_reset
|
||||
(* keep = "true" *) wire raddr_reset;
|
||||
assign raddr_reset = 0;
|
||||
assign raddr_reset_w = raddr_reset;
|
||||
end else begin : g_no_raddr_reset
|
||||
assign raddr_reset_w = 0;
|
||||
end
|
||||
|
||||
VX_placeholder #(
|
||||
.I (ADDRW),
|
||||
.O (ADDRW + 1 + 1)
|
||||
) placeholder (
|
||||
.in (raddr_w),
|
||||
.out ({raddr_s, read_s, is_raddr_reg})
|
||||
.I (ADDRW + 1),
|
||||
.O (ADDRW + 1)
|
||||
) placeholder1 (
|
||||
.in ({raddr_w, raddr_reset_w}),
|
||||
.out ({raddr_s, read_s})
|
||||
);
|
||||
|
||||
wire [DATAW-1:0] rdata_s;
|
||||
|
@ -206,9 +215,15 @@ module VX_async_ram_patch #(
|
|||
end
|
||||
|
||||
if (RADDR_REG) begin : g_raddr_reg
|
||||
`UNUSED_VAR (is_raddr_reg)
|
||||
assign rdata = rdata_s;
|
||||
end else begin : g_async_ram
|
||||
(* keep = "true" *) wire is_raddr_reg;
|
||||
VX_placeholder #(
|
||||
.O (1)
|
||||
) placeholder2 (
|
||||
.in (),
|
||||
.out (is_raddr_reg)
|
||||
);
|
||||
wire [DATAW-1:0] rdata_a;
|
||||
if (DUAL_PORT) begin : g_dp
|
||||
if (WRENW != 1) begin : g_wren
|
||||
|
|
|
@ -23,10 +23,10 @@ module VX_axi_adapter #(
|
|||
parameter NUM_PORTS_IN = 1,
|
||||
parameter NUM_BANKS_OUT = 1,
|
||||
parameter INTERLEAVE = 0,
|
||||
parameter TAG_BUFFER_SIZE= 32,
|
||||
parameter TAG_BUFFER_SIZE= 16,
|
||||
parameter ARBITER = "R",
|
||||
parameter REQ_OUT_BUF = 1,
|
||||
parameter RSP_OUT_BUF = 1,
|
||||
parameter REQ_OUT_BUF = 0,
|
||||
parameter RSP_OUT_BUF = 0,
|
||||
parameter DATA_SIZE = DATA_WIDTH/8
|
||||
) (
|
||||
input wire clk,
|
||||
|
@ -99,7 +99,7 @@ module VX_axi_adapter #(
|
|||
localparam LOG2_DATA_SIZE = `CLOG2(DATA_SIZE);
|
||||
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS_OUT);
|
||||
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
|
||||
localparam DST_ADDR_WDITH = (ADDR_WIDTH_OUT - LOG2_DATA_SIZE) + BANK_SEL_BITS; // convert output address to byte-addressable input space
|
||||
localparam DST_ADDR_WDITH = (ADDR_WIDTH_OUT - LOG2_DATA_SIZE) + BANK_SEL_BITS; // convert byte-addressable output address to block-addressable input space
|
||||
localparam BANK_ADDR_WIDTH = DST_ADDR_WDITH - BANK_SEL_BITS;
|
||||
localparam NUM_PORTS_IN_BITS = `CLOG2(NUM_PORTS_IN);
|
||||
localparam NUM_PORTS_IN_WIDTH = `UP(NUM_PORTS_IN_BITS);
|
||||
|
@ -109,8 +109,8 @@ module VX_axi_adapter #(
|
|||
localparam READ_FULL_TAG_WIDTH = READ_TAG_WIDTH + NUM_PORTS_IN_BITS;
|
||||
localparam WRITE_TAG_WIDTH = `MIN(TAG_WIDTH_IN, TAG_WIDTH_OUT);
|
||||
localparam DST_TAG_WIDTH = `MAX(READ_FULL_TAG_WIDTH, WRITE_TAG_WIDTH);
|
||||
localparam ARB_TAG_WIDTH = `MAX(READ_TAG_WIDTH, WRITE_TAG_WIDTH);
|
||||
localparam ARB_DATAW = 1 + BANK_ADDR_WIDTH + DATA_SIZE + DATA_WIDTH + ARB_TAG_WIDTH;
|
||||
localparam XBAR_TAG_WIDTH = `MAX(READ_TAG_WIDTH, WRITE_TAG_WIDTH);
|
||||
localparam REQ_XBAR_DATAW = 1 + BANK_ADDR_WIDTH + DATA_SIZE + DATA_WIDTH + XBAR_TAG_WIDTH;
|
||||
localparam RSP_XBAR_DATAW = DATA_WIDTH + READ_TAG_WIDTH;
|
||||
|
||||
`STATIC_ASSERT ((DST_ADDR_WDITH >= ADDR_WIDTH_IN), ("invalid address width: current=%0d, expected=%0d", DST_ADDR_WDITH, ADDR_WIDTH_IN))
|
||||
|
@ -174,117 +174,85 @@ module VX_axi_adapter #(
|
|||
end
|
||||
end
|
||||
|
||||
// Request ack
|
||||
// AXI request handling
|
||||
|
||||
wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN-1:0] arb_ready_in;
|
||||
wire [NUM_PORTS_IN-1:0] req_xbar_valid_in;
|
||||
wire [NUM_PORTS_IN-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_in;
|
||||
wire [NUM_PORTS_IN-1:0] req_xbar_ready_in;
|
||||
|
||||
if (NUM_PORTS_IN > 1) begin : g_multi_inputs
|
||||
wire [NUM_PORTS_IN-1:0][NUM_BANKS_OUT-1:0] arb_ready_in_w;
|
||||
VX_transpose #(
|
||||
.N (NUM_BANKS_OUT),
|
||||
.M (NUM_PORTS_IN)
|
||||
) rdy_in_transpose (
|
||||
.data_in (arb_ready_in),
|
||||
.data_out (arb_ready_in_w)
|
||||
);
|
||||
for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_ready_in
|
||||
assign mem_req_ready[i] = | arb_ready_in_w[i];
|
||||
end
|
||||
end else begin : g_single_input
|
||||
assign mem_req_ready[0] = arb_ready_in[req_bank_sel[0]][0];
|
||||
wire [NUM_BANKS_OUT-1:0] req_xbar_valid_out;
|
||||
wire [NUM_BANKS_OUT-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_out;
|
||||
wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN_WIDTH-1:0] req_xbar_sel_out;
|
||||
wire [NUM_BANKS_OUT-1:0] req_xbar_ready_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_req_xbar_data_in
|
||||
wire tag_ready = mem_req_rw[i] || mem_rd_req_tag_ready[i];
|
||||
wire [XBAR_TAG_WIDTH-1:0] tag_value = mem_req_rw[i] ? XBAR_TAG_WIDTH'(mem_req_tag[i]) : XBAR_TAG_WIDTH'(mem_rd_req_tag[i]);
|
||||
assign req_xbar_valid_in[i] = mem_req_valid[i] && tag_ready;
|
||||
assign req_xbar_data_in[i] = {mem_req_rw[i], req_bank_addr[i], mem_req_byteen[i], mem_req_data[i], tag_value};
|
||||
assign mem_req_ready[i] = req_xbar_ready_in[i] && tag_ready;
|
||||
end
|
||||
|
||||
// AXI write request synchronization
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_PORTS_IN),
|
||||
.NUM_OUTPUTS(NUM_BANKS_OUT),
|
||||
.DATAW (REQ_XBAR_DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.OUT_BUF (REQ_OUT_BUF)
|
||||
) req_xbar (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (req_bank_sel),
|
||||
.valid_in (req_xbar_valid_in),
|
||||
.data_in (req_xbar_data_in),
|
||||
.ready_in (req_xbar_ready_in),
|
||||
.valid_out (req_xbar_valid_out),
|
||||
.data_out (req_xbar_data_out),
|
||||
.ready_out (req_xbar_ready_out),
|
||||
.sel_out (req_xbar_sel_out),
|
||||
`UNUSED_PIN (collisions)
|
||||
);
|
||||
|
||||
wire [NUM_BANKS_OUT-1:0] m_axi_awvalid_w, m_axi_wvalid_w;
|
||||
wire [NUM_BANKS_OUT-1:0] m_axi_awready_w, m_axi_wready_w;
|
||||
reg [NUM_BANKS_OUT-1:0] m_axi_aw_ack, m_axi_w_ack, axi_write_ready;
|
||||
for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_axi_reqs
|
||||
|
||||
wire xbar_rw_out;
|
||||
wire [BANK_ADDR_WIDTH-1:0] xbar_addr_out;
|
||||
wire [XBAR_TAG_WIDTH-1:0] xbar_tag_out;
|
||||
wire [DATA_WIDTH-1:0] xbar_data_out;
|
||||
wire [DATA_SIZE-1:0] xbar_byteen_out;
|
||||
|
||||
assign {
|
||||
xbar_rw_out,
|
||||
xbar_addr_out,
|
||||
xbar_byteen_out,
|
||||
xbar_data_out,
|
||||
xbar_tag_out
|
||||
} = req_xbar_data_out[i];
|
||||
|
||||
// AXI request handshake
|
||||
|
||||
wire m_axi_aw_ack, m_axi_w_ack, axi_write_ready;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_axi_write_ready
|
||||
VX_axi_write_ack axi_write_ack (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.awvalid(m_axi_awvalid_w[i]),
|
||||
.awready(m_axi_awready_w[i]),
|
||||
.wvalid (m_axi_wvalid_w[i]),
|
||||
.wready (m_axi_wready_w[i]),
|
||||
.aw_ack (m_axi_aw_ack[i]),
|
||||
.w_ack (m_axi_w_ack[i]),
|
||||
.tx_rdy (axi_write_ready[i]),
|
||||
.awvalid(m_axi_awvalid[i]),
|
||||
.awready(m_axi_awready[i]),
|
||||
.wvalid (m_axi_wvalid[i]),
|
||||
.wready (m_axi_wready[i]),
|
||||
.aw_ack (m_axi_aw_ack),
|
||||
.w_ack (m_axi_w_ack),
|
||||
.tx_rdy (axi_write_ready),
|
||||
`UNUSED_PIN (tx_ack)
|
||||
);
|
||||
end
|
||||
|
||||
// AXI request handling
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_axi_write_req
|
||||
|
||||
wire [BANK_ADDR_WIDTH-1:0] arb_addr_out, buf_addr_r_out, buf_addr_w_out;
|
||||
wire [ARB_TAG_WIDTH-1:0] arb_tag_out;
|
||||
wire [WRITE_TAG_WIDTH-1:0] buf_tag_w_out;
|
||||
wire [READ_FULL_TAG_WIDTH-1:0] arb_tag_r_out, buf_tag_r_out;
|
||||
wire [NUM_PORTS_IN_WIDTH-1:0] arb_sel_out;
|
||||
wire [DATA_WIDTH-1:0] arb_data_out;
|
||||
wire [DATA_SIZE-1:0] arb_byteen_out;
|
||||
wire arb_valid_out, arb_ready_out;
|
||||
wire arb_rw_out;
|
||||
|
||||
wire [NUM_PORTS_IN-1:0][ARB_DATAW-1:0] arb_data_in;
|
||||
wire [NUM_PORTS_IN-1:0] arb_valid_in;
|
||||
|
||||
for (genvar j = 0; j < NUM_PORTS_IN; ++j) begin : g_valid_in
|
||||
wire tag_ready = mem_req_rw[j] || mem_rd_req_tag_ready[j];
|
||||
assign arb_valid_in[j] = mem_req_valid[j] && tag_ready && (req_bank_sel[j] == i);
|
||||
end
|
||||
|
||||
for (genvar j = 0; j < NUM_PORTS_IN; ++j) begin : g_data_in
|
||||
wire [ARB_TAG_WIDTH-1:0] tag_value = mem_req_rw[j] ? ARB_TAG_WIDTH'(mem_req_tag[j]) : ARB_TAG_WIDTH'(mem_rd_req_tag[j]);
|
||||
assign arb_data_in[j] = {mem_req_rw[j], req_bank_addr[j], mem_req_byteen[j], mem_req_data[j], tag_value};
|
||||
end
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_PORTS_IN),
|
||||
.NUM_OUTPUTS(1),
|
||||
.DATAW (ARB_DATAW),
|
||||
.ARBITER (ARBITER)
|
||||
) aw_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (arb_valid_in),
|
||||
.ready_in (arb_ready_in[i]),
|
||||
.data_in (arb_data_in),
|
||||
.data_out ({arb_rw_out, arb_addr_out, arb_byteen_out, arb_data_out, arb_tag_out}),
|
||||
.valid_out (arb_valid_out),
|
||||
.ready_out (arb_ready_out),
|
||||
.sel_out (arb_sel_out)
|
||||
);
|
||||
|
||||
wire m_axi_arready_w;
|
||||
|
||||
assign arb_ready_out = axi_write_ready[i] || m_axi_arready_w;
|
||||
assign req_xbar_ready_out[i] = xbar_rw_out ? axi_write_ready : m_axi_arready[i];
|
||||
|
||||
// AXI write address channel
|
||||
|
||||
assign m_axi_awvalid_w[i] = arb_valid_out && arb_rw_out && ~m_axi_aw_ack[i];
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (BANK_ADDR_WIDTH + WRITE_TAG_WIDTH),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)),
|
||||
.LUTRAM (`TO_OUT_BUF_LUTRAM(REQ_OUT_BUF))
|
||||
) aw_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (m_axi_awvalid_w[i]),
|
||||
.ready_in (m_axi_awready_w[i]),
|
||||
.data_in ({arb_addr_out, WRITE_TAG_WIDTH'(arb_tag_out)}),
|
||||
.data_out ({buf_addr_w_out, buf_tag_w_out}),
|
||||
.valid_out (m_axi_awvalid[i]),
|
||||
.ready_out (m_axi_awready[i])
|
||||
);
|
||||
|
||||
assign m_axi_awaddr[i] = ADDR_WIDTH_OUT'(buf_addr_w_out) << LOG2_DATA_SIZE;
|
||||
assign m_axi_awid[i] = TAG_WIDTH_OUT'(buf_tag_w_out);
|
||||
assign m_axi_awvalid[i] = req_xbar_valid_out[i] && xbar_rw_out && ~m_axi_aw_ack;
|
||||
assign m_axi_awaddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE;
|
||||
assign m_axi_awid[i] = TAG_WIDTH_OUT'(xbar_tag_out);
|
||||
assign m_axi_awlen[i] = 8'b00000000;
|
||||
assign m_axi_awsize[i] = 3'(LOG2_DATA_SIZE);
|
||||
assign m_axi_awburst[i] = 2'b00;
|
||||
|
@ -296,53 +264,24 @@ module VX_axi_adapter #(
|
|||
|
||||
// AXI write data channel
|
||||
|
||||
assign m_axi_wvalid_w[i] = arb_valid_out && arb_rw_out && ~m_axi_w_ack[i];
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATA_SIZE + DATA_WIDTH),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)),
|
||||
.LUTRAM (`TO_OUT_BUF_LUTRAM(REQ_OUT_BUF))
|
||||
) w_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (m_axi_wvalid_w[i]),
|
||||
.ready_in (m_axi_wready_w[i]),
|
||||
.data_in ({arb_byteen_out, arb_data_out}),
|
||||
.data_out ({m_axi_wstrb[i], m_axi_wdata[i]}),
|
||||
.valid_out (m_axi_wvalid[i]),
|
||||
.ready_out (m_axi_wready[i])
|
||||
);
|
||||
|
||||
assign m_axi_wlast[i] = 1'b1;
|
||||
assign m_axi_wvalid[i] = req_xbar_valid_out[i] && xbar_rw_out && ~m_axi_w_ack;
|
||||
assign m_axi_wstrb[i] = xbar_byteen_out;
|
||||
assign m_axi_wdata[i] = xbar_data_out;
|
||||
assign m_axi_wlast[i] = 1'b1;
|
||||
|
||||
// AXI read address channel
|
||||
|
||||
if (NUM_PORTS_IN > 1) begin : g_input_sel
|
||||
assign arb_tag_r_out = READ_FULL_TAG_WIDTH'({arb_tag_out, arb_sel_out});
|
||||
wire [READ_FULL_TAG_WIDTH-1:0] xbar_tag_r_out;
|
||||
if (NUM_PORTS_IN > 1) begin : g_xbar_tag_r_out
|
||||
assign xbar_tag_r_out = READ_FULL_TAG_WIDTH'({xbar_tag_out, req_xbar_sel_out[i]});
|
||||
end else begin : g_no_input_sel
|
||||
`UNUSED_VAR (arb_sel_out)
|
||||
assign arb_tag_r_out = READ_TAG_WIDTH'(arb_tag_out);
|
||||
`UNUSED_VAR (req_xbar_sel_out)
|
||||
assign xbar_tag_r_out = READ_TAG_WIDTH'(xbar_tag_out);
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (BANK_ADDR_WIDTH + READ_FULL_TAG_WIDTH),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)),
|
||||
.LUTRAM (`TO_OUT_BUF_LUTRAM(REQ_OUT_BUF))
|
||||
) ar_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (arb_valid_out && ~arb_rw_out),
|
||||
.ready_in (m_axi_arready_w),
|
||||
.data_in ({arb_addr_out, arb_tag_r_out}),
|
||||
.data_out ({buf_addr_r_out, buf_tag_r_out}),
|
||||
.valid_out (m_axi_arvalid[i]),
|
||||
.ready_out (m_axi_arready[i])
|
||||
);
|
||||
|
||||
assign m_axi_araddr[i] = ADDR_WIDTH_OUT'(buf_addr_r_out) << LOG2_DATA_SIZE;
|
||||
assign m_axi_arid[i] = TAG_WIDTH_OUT'(buf_tag_r_out);
|
||||
assign m_axi_arvalid[i] = req_xbar_valid_out[i] && ~xbar_rw_out;
|
||||
assign m_axi_araddr[i] = ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE;
|
||||
assign m_axi_arid[i] = TAG_WIDTH_OUT'(xbar_tag_r_out);
|
||||
assign m_axi_arlen[i] = 8'b00000000;
|
||||
assign m_axi_arsize[i] = 3'(LOG2_DATA_SIZE);
|
||||
assign m_axi_arburst[i] = 2'b00;
|
||||
|
|
|
@ -26,35 +26,35 @@ module VX_axi_write_ack (
|
|||
output wire tx_ack,
|
||||
output wire tx_rdy
|
||||
);
|
||||
reg awfired;
|
||||
reg wfired;
|
||||
reg aw_fired;
|
||||
reg w_fired;
|
||||
|
||||
wire awfire = awvalid && awready;
|
||||
wire wfire = wvalid && wready;
|
||||
wire aw_fire = awvalid && awready;
|
||||
wire w_fire = wvalid && wready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
awfired <= 0;
|
||||
wfired <= 0;
|
||||
aw_fired <= 0;
|
||||
w_fired <= 0;
|
||||
end else begin
|
||||
if (awfire) begin
|
||||
awfired <= 1;
|
||||
if (aw_fire) begin
|
||||
aw_fired <= 1;
|
||||
end
|
||||
if (wfire) begin
|
||||
wfired <= 1;
|
||||
if (w_fire) begin
|
||||
w_fired <= 1;
|
||||
end
|
||||
if (tx_ack) begin
|
||||
awfired <= 0;
|
||||
wfired <= 0;
|
||||
aw_fired <= 0;
|
||||
w_fired <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign aw_ack = awfired;
|
||||
assign w_ack = wfired;
|
||||
assign aw_ack = aw_fired;
|
||||
assign w_ack = w_fired;
|
||||
|
||||
assign tx_ack = (awfire || awfired) && (wfire || wfired);
|
||||
assign tx_rdy = (awready || awfired) && (wready || wfired);
|
||||
assign tx_ack = (aw_fire || aw_fired) && (w_fire || w_fired);
|
||||
assign tx_rdy = (awready || aw_fired) && (wready || w_fired);
|
||||
|
||||
endmodule
|
||||
`TRACING_ON
|
||||
|
|
|
@ -26,18 +26,39 @@
|
|||
end \
|
||||
end
|
||||
|
||||
// RAM_RESET_BLOCK: statement prefix meant to be placed in front of a write
// statement inside a clocked block.
// With SIMULATION defined it expands to an `if (RESET_RAM && reset)` branch
// that assigns DATAW'(INIT_VALUE) to every ram[] entry, and it ends with a
// dangling `else`, so the statement that follows the macro becomes the
// else-branch (i.e. it is skipped while the reset branch is taken).
// It relies on RESET_RAM, reset, SIZE, ram, DATAW and INIT_VALUE being
// visible at the expansion site.
`ifdef SIMULATION
|
||||
`define RAM_RESET_BLOCK if (RESET_RAM && reset) begin \
|
||||
for (integer i = 0; i < SIZE; ++i) begin \
|
||||
ram[i] <= DATAW'(INIT_VALUE); \
|
||||
end \
|
||||
end else
|
||||
// Without SIMULATION the macro expands to nothing, so the following
// statement stands on its own.
`else
|
||||
`define RAM_RESET_BLOCK
|
||||
`endif
|
||||
|
||||
// RAM_WRITE_ALL: full-word write. Expands `RAM_RESET_BLOCK first and then
// `if (write) ram[waddr] <= wdata;`, so the write sits behind whatever
// prefix RAM_RESET_BLOCK expands to. Relies on write, waddr, wdata and ram
// being visible at the expansion site.
`define RAM_WRITE_ALL `RAM_RESET_BLOCK \
|
||||
if (write) begin \
|
||||
ram[waddr] <= wdata; \
|
||||
end
|
||||
|
||||
`ifdef QUARTUS
|
||||
`define RAM_ARRAY_WREN reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1];
|
||||
`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[waddr][i] <= wdata[i * WSELW +: WSELW]; \
|
||||
`define RAM_WRITE_WREN `RAM_RESET_BLOCK \
|
||||
if (write) begin \
|
||||
for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[waddr][i] <= wdata[i * WSELW +: WSELW]; \
|
||||
end \
|
||||
end \
|
||||
end
|
||||
`else
|
||||
`define RAM_ARRAY_WREN reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
|
||||
`define RAM_WRITE_WREN `RAM_RESET_BLOCK \
|
||||
if (write) begin \
|
||||
for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
|
||||
end \
|
||||
end \
|
||||
end
|
||||
`endif
|
||||
|
@ -49,8 +70,9 @@ module VX_dp_ram #(
|
|||
parameter WRENW = 1,
|
||||
parameter OUT_REG = 0,
|
||||
parameter LUTRAM = 0,
|
||||
parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first, U: undefined
|
||||
parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first
|
||||
parameter RADDR_REG = 0, // read address registered hint
|
||||
parameter RADDR_RESET = 0, // read address has reset
|
||||
parameter RDW_ASSERT = 0,
|
||||
parameter RESET_RAM = 0,
|
||||
parameter INIT_ENABLE = 0,
|
||||
|
@ -71,13 +93,14 @@ module VX_dp_ram #(
|
|||
localparam WSELW = DATAW / WRENW;
|
||||
`UNUSED_PARAM (LUTRAM)
|
||||
`UNUSED_PARAM (RADDR_REG)
|
||||
`UNUSED_PARAM (RADDR_RESET)
|
||||
|
||||
`STATIC_ASSERT(!(WRENW * WSELW != DATAW), ("invalid parameter"))
|
||||
`STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W" || RDW_MODE == "U"), ("invalid parameter"))
|
||||
`STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W"), ("invalid parameter"))
|
||||
`UNUSED_PARAM (RDW_ASSERT)
|
||||
|
||||
`ifdef SYNTHESIS
|
||||
localparam FORCE_BRAM = !LUTRAM && (SIZE * DATAW >= `MAX_LUTRAM);
|
||||
localparam FORCE_BRAM = !LUTRAM && `FORCE_BRAM(SIZE, DATAW);
|
||||
if (OUT_REG) begin : g_sync
|
||||
if (FORCE_BRAM) begin : g_bram
|
||||
if (RDW_MODE == "W") begin : g_write_first
|
||||
|
@ -86,10 +109,8 @@ module VX_dp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [ADDRW-1:0] raddr_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
if (read) begin
|
||||
raddr_r <= raddr;
|
||||
end
|
||||
end
|
||||
|
@ -99,10 +120,8 @@ module VX_dp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [ADDRW-1:0] raddr_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
if (read) begin
|
||||
raddr_r <= raddr;
|
||||
end
|
||||
end
|
||||
|
@ -114,37 +133,7 @@ module VX_dp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
rdata_r <= ram[raddr];
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end else begin : g_no_wren
|
||||
`USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
rdata_r <= ram[raddr];
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end
|
||||
end else if (RDW_MODE == "U") begin : g_undefined
|
||||
if (WRENW != 1) begin : g_wren
|
||||
`USE_BLOCK_BRAM `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
if (read) begin
|
||||
rdata_r <= ram[raddr];
|
||||
end
|
||||
|
@ -155,9 +144,7 @@ module VX_dp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
if (read) begin
|
||||
rdata_r <= ram[raddr];
|
||||
end
|
||||
|
@ -172,10 +159,8 @@ module VX_dp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [ADDRW-1:0] raddr_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
if (read) begin
|
||||
raddr_r <= raddr;
|
||||
end
|
||||
end
|
||||
|
@ -185,10 +170,8 @@ module VX_dp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [ADDRW-1:0] raddr_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
if (read) begin
|
||||
raddr_r <= raddr;
|
||||
end
|
||||
end
|
||||
|
@ -200,37 +183,7 @@ module VX_dp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
rdata_r <= ram[raddr];
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end else begin : g_no_wren
|
||||
reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
rdata_r <= ram[raddr];
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end
|
||||
end else if (RDW_MODE == "U") begin : g_undefined
|
||||
if (WRENW != 1) begin : g_wren
|
||||
`RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
if (read) begin
|
||||
rdata_r <= ram[raddr];
|
||||
end
|
||||
|
@ -241,9 +194,7 @@ module VX_dp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
if (read) begin
|
||||
rdata_r <= ram[raddr];
|
||||
end
|
||||
|
@ -255,7 +206,7 @@ module VX_dp_ram #(
|
|||
end else begin : g_async
|
||||
`UNUSED_VAR (read)
|
||||
if (FORCE_BRAM) begin : g_bram
|
||||
`ifdef VIVADO
|
||||
`ifdef ASYNC_BRAM_PATCH
|
||||
VX_async_ram_patch #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (SIZE),
|
||||
|
@ -263,6 +214,7 @@ module VX_dp_ram #(
|
|||
.DUAL_PORT (1),
|
||||
.FORCE_BRAM (FORCE_BRAM),
|
||||
.RADDR_REG (RADDR_REG),
|
||||
.RADDR_RESET(RADDR_RESET),
|
||||
.WRITE_FIRST(RDW_MODE == "W"),
|
||||
.INIT_ENABLE(INIT_ENABLE),
|
||||
.INIT_FILE (INIT_FILE),
|
||||
|
@ -284,18 +236,14 @@ module VX_dp_ram #(
|
|||
`RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
assign rdata = ram[raddr];
|
||||
end else begin : g_no_wren
|
||||
`RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
end
|
||||
assign rdata = ram[raddr];
|
||||
end
|
||||
|
@ -304,18 +252,14 @@ module VX_dp_ram #(
|
|||
`NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
assign rdata = ram[raddr];
|
||||
end else begin : g_no_wren
|
||||
`NO_RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
end
|
||||
assign rdata = ram[raddr];
|
||||
end
|
||||
|
@ -327,18 +271,14 @@ module VX_dp_ram #(
|
|||
`RW_RAM_CHECK `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
assign rdata = ram[raddr];
|
||||
end else begin : g_no_wren
|
||||
`RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
end
|
||||
assign rdata = ram[raddr];
|
||||
end
|
||||
|
@ -347,18 +287,14 @@ module VX_dp_ram #(
|
|||
`NO_RW_RAM_CHECK `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
assign rdata = ram[raddr];
|
||||
end else begin : g_no_wren
|
||||
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[waddr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
end
|
||||
assign rdata = ram[raddr];
|
||||
end
|
||||
|
@ -371,37 +307,19 @@ module VX_dp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (RESET_RAM && reset) begin
|
||||
for (integer i = 0; i < SIZE; ++i) begin
|
||||
ram[i] <= DATAW'(INIT_VALUE);
|
||||
end
|
||||
end else if (write) begin
|
||||
for (integer i = 0; i < WRENW; ++i) begin
|
||||
if (wren[i]) begin
|
||||
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
|
||||
end
|
||||
end
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
|
||||
if (OUT_REG) begin : g_sync
|
||||
if (RDW_MODE == "W") begin : g_write_first
|
||||
reg [ADDRW-1:0] raddr_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (read) begin
|
||||
raddr_r <= raddr;
|
||||
end
|
||||
end
|
||||
assign rdata = ram[raddr_r];
|
||||
end else if (RDW_MODE == "R") begin : g_read_first
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
rdata_r <= ram[raddr];
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end else begin : g_undefined
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read) begin
|
||||
|
|
|
@ -95,7 +95,8 @@ module VX_fifo_queue #(
|
|||
.SIZE (DEPTH),
|
||||
.LUTRAM (LUTRAM),
|
||||
.RDW_MODE ("W"),
|
||||
.RADDR_REG (1)
|
||||
.RADDR_REG (1),
|
||||
.RADDR_RESET (1)
|
||||
) dp_ram (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -179,25 +179,31 @@ module VX_mem_bank_adapter #(
|
|||
|
||||
for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_req_xbar_data_out
|
||||
|
||||
wire rw_out;
|
||||
wire [BANK_ADDR_WIDTH-1:0] addr_out;
|
||||
wire [XBAR_TAG_WIDTH-1:0] tag_out;
|
||||
wire [DATA_WIDTH-1:0] data_out;
|
||||
wire [DATA_SIZE-1:0] byteen_out;
|
||||
wire xbar_rw_out;
|
||||
wire [BANK_ADDR_WIDTH-1:0] xbar_addr_out;
|
||||
wire [XBAR_TAG_WIDTH-1:0] xbar_tag_out;
|
||||
wire [DATA_WIDTH-1:0] xbar_data_out;
|
||||
wire [DATA_SIZE-1:0] xbar_byteen_out;
|
||||
|
||||
assign {rw_out, addr_out, byteen_out, data_out, tag_out} = req_xbar_data_out[i];
|
||||
assign {
|
||||
xbar_rw_out,
|
||||
xbar_addr_out,
|
||||
xbar_byteen_out,
|
||||
xbar_data_out,
|
||||
xbar_tag_out
|
||||
} = req_xbar_data_out[i];
|
||||
|
||||
assign mem_req_valid_out[i] = req_xbar_valid_out[i];
|
||||
assign mem_req_rw_out[i] = rw_out;
|
||||
assign mem_req_addr_out[i] = ADDR_WIDTH_OUT'(addr_out);
|
||||
assign mem_req_byteen_out[i] = byteen_out;
|
||||
assign mem_req_data_out[i] = data_out;
|
||||
assign mem_req_rw_out[i] = xbar_rw_out;
|
||||
assign mem_req_addr_out[i] = ADDR_WIDTH_OUT'(xbar_addr_out);
|
||||
assign mem_req_byteen_out[i] = xbar_byteen_out;
|
||||
assign mem_req_data_out[i] = xbar_data_out;
|
||||
|
||||
if (NUM_PORTS_IN > 1) begin : g_input_sel
|
||||
assign mem_req_tag_out[i] = TAG_WIDTH_OUT'({tag_out, req_xbar_sel_out[i]});
|
||||
assign mem_req_tag_out[i] = TAG_WIDTH_OUT'({xbar_tag_out, req_xbar_sel_out[i]});
|
||||
end else begin : g_no_input_sel
|
||||
`UNUSED_VAR (req_xbar_sel_out[i])
|
||||
assign mem_req_tag_out[i] = TAG_WIDTH_OUT'(tag_out);
|
||||
assign mem_req_tag_out[i] = TAG_WIDTH_OUT'(xbar_tag_out);
|
||||
end
|
||||
|
||||
assign req_xbar_ready_out[i] = mem_req_ready_out[i];
|
||||
|
|
|
@ -24,6 +24,7 @@ module VX_mem_coalescer #(
|
|||
parameter TAG_WIDTH = 8,
|
||||
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
|
||||
parameter QUEUE_SIZE = 8,
|
||||
parameter PERF_CTR_BITS = `CLOG2(NUM_REQS+1),
|
||||
|
||||
parameter DATA_IN_WIDTH = DATA_IN_SIZE * 8,
|
||||
parameter DATA_OUT_WIDTH= DATA_OUT_SIZE * 8,
|
||||
|
@ -37,6 +38,8 @@ module VX_mem_coalescer #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
output wire [PERF_CTR_BITS-1:0] misses,
|
||||
|
||||
// Input request
|
||||
input wire in_req_valid,
|
||||
input wire in_req_rw,
|
||||
|
@ -323,6 +326,23 @@ module VX_mem_coalescer #(
|
|||
assign in_rsp_tag = {out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout_tag};
|
||||
assign out_rsp_ready = in_rsp_ready;
|
||||
|
||||
// compute coalescing misses
|
||||
// misses are partial transfers (not fully coalesced)
|
||||
|
||||
reg [PERF_CTR_BITS-1:0] misses_r;
|
||||
|
||||
wire partial_transfer = (out_req_fire && req_rem_mask_r != '1);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
misses_r <= '0;
|
||||
end else begin
|
||||
misses_r <= misses_r + PERF_CTR_BITS'(partial_transfer);
|
||||
end
|
||||
end
|
||||
|
||||
assign misses = misses_r;
|
||||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
wire [`UP(UUID_WIDTH)-1:0] out_req_uuid;
|
||||
wire [`UP(UUID_WIDTH)-1:0] out_rsp_uuid;
|
||||
|
|
|
@ -237,6 +237,8 @@ module VX_mem_scheduler #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
`UNUSED_PIN (misses),
|
||||
|
||||
// Input request
|
||||
.in_req_valid (reqq_valid),
|
||||
.in_req_mask (reqq_mask),
|
||||
|
|
|
@ -14,7 +14,7 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
`TRACING_OFF
|
||||
module VX_reduce #(
|
||||
module VX_reduce_tree #(
|
||||
parameter DATAW_IN = 1,
|
||||
parameter DATAW_OUT = DATAW_IN,
|
||||
parameter N = 1,
|
||||
|
@ -41,7 +41,7 @@ module VX_reduce #(
|
|||
assign in_B[i] = data_in[N_A + i];
|
||||
end
|
||||
|
||||
VX_reduce #(
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (DATAW_IN),
|
||||
.DATAW_OUT (DATAW_OUT),
|
||||
.N (N_A),
|
||||
|
@ -51,7 +51,7 @@ module VX_reduce #(
|
|||
.data_out (out_A)
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (DATAW_IN),
|
||||
.DATAW_OUT (DATAW_OUT),
|
||||
.N (N_B),
|
|
@ -26,18 +26,39 @@
|
|||
end \
|
||||
end
|
||||
|
||||
`ifdef SIMULATION
|
||||
`define RAM_RESET_BLOCK if (RESET_RAM && reset) begin \
|
||||
for (integer i = 0; i < SIZE; ++i) begin \
|
||||
ram[i] <= DATAW'(INIT_VALUE); \
|
||||
end \
|
||||
end else
|
||||
`else
|
||||
`define RAM_RESET_BLOCK
|
||||
`endif
|
||||
|
||||
`define RAM_WRITE_ALL `RAM_RESET_BLOCK \
|
||||
if (write) begin \
|
||||
ram[addr] <= wdata; \
|
||||
end
|
||||
|
||||
`ifdef QUARTUS
|
||||
`define RAM_ARRAY_WREN reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1];
|
||||
`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[addr][i] <= wdata[i * WSELW +: WSELW]; \
|
||||
`define RAM_WRITE_WREN `RAM_RESET_BLOCK \
|
||||
if (write) begin \
|
||||
for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[addr][i] <= wdata[i * WSELW +: WSELW]; \
|
||||
end \
|
||||
end \
|
||||
end
|
||||
`else
|
||||
`define RAM_ARRAY_WREN reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[addr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
|
||||
`define RAM_WRITE_WREN `RAM_RESET_BLOCK \
|
||||
if (write) begin \
|
||||
for (integer i = 0; i < WRENW; ++i) begin \
|
||||
if (wren[i]) begin \
|
||||
ram[addr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \
|
||||
end \
|
||||
end \
|
||||
end
|
||||
`endif
|
||||
|
@ -49,8 +70,9 @@ module VX_sp_ram #(
|
|||
parameter WRENW = 1,
|
||||
parameter OUT_REG = 0,
|
||||
parameter LUTRAM = 0,
|
||||
parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first, N: no-change, U: undefined
|
||||
parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first, N: no-change
|
||||
parameter RADDR_REG = 0, // read address registered hint
|
||||
parameter RADDR_RESET = 0, // read address has reset
|
||||
parameter RDW_ASSERT = 0,
|
||||
parameter RESET_RAM = 0,
|
||||
parameter INIT_ENABLE = 0,
|
||||
|
@ -70,13 +92,14 @@ module VX_sp_ram #(
|
|||
localparam WSELW = DATAW / WRENW;
|
||||
`UNUSED_PARAM (LUTRAM)
|
||||
`UNUSED_PARAM (RADDR_REG)
|
||||
`UNUSED_PARAM (RADDR_RESET)
|
||||
|
||||
`STATIC_ASSERT(!(WRENW * WSELW != DATAW), ("invalid parameter"))
|
||||
`STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W" || RDW_MODE == "N" || RDW_MODE == "U"), ("invalid parameter"))
|
||||
`STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W" || RDW_MODE == "N"), ("invalid parameter"))
|
||||
`UNUSED_PARAM (RDW_ASSERT)
|
||||
|
||||
`ifdef SYNTHESIS
|
||||
localparam FORCE_BRAM = !LUTRAM && (SIZE * DATAW >= `MAX_LUTRAM);
|
||||
localparam FORCE_BRAM = !LUTRAM && `FORCE_BRAM(SIZE, DATAW);
|
||||
if (OUT_REG) begin : g_sync
|
||||
if (FORCE_BRAM) begin : g_bram
|
||||
if (RDW_MODE == "W") begin : g_write_first
|
||||
|
@ -85,10 +108,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [ADDRW-1:0] addr_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
if (read) begin
|
||||
addr_r <= addr;
|
||||
end
|
||||
end
|
||||
|
@ -98,9 +119,9 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
`RAM_WRITE_ALL
|
||||
if (read) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
rdata_r <= wdata;
|
||||
end else begin
|
||||
rdata_r <= ram[addr];
|
||||
|
@ -115,10 +136,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
|
@ -128,10 +147,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
|
@ -143,40 +160,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end else begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end else begin : g_no_wren
|
||||
`USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end else begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end
|
||||
end else if (RDW_MODE == "U") begin : g_undefined
|
||||
if (WRENW != 1) begin : g_wren
|
||||
`USE_BLOCK_BRAM `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
if (read) begin
|
||||
`RAM_WRITE_WREN
|
||||
else if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
|
@ -186,10 +171,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end
|
||||
if (read) begin
|
||||
`RAM_WRITE_ALL
|
||||
else if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
|
@ -203,10 +186,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [ADDRW-1:0] addr_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
if (read) begin
|
||||
addr_r <= addr;
|
||||
end
|
||||
end
|
||||
|
@ -216,9 +197,9 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
`RAM_WRITE_ALL
|
||||
if (read) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
rdata_r <= wdata;
|
||||
end else begin
|
||||
rdata_r <= ram[addr];
|
||||
|
@ -233,10 +214,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
|
@ -246,10 +225,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
|
@ -261,40 +238,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end else begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end else begin : g_no_wren
|
||||
reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end else begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end
|
||||
end else if (RDW_MODE == "U") begin : g_undefined
|
||||
if (WRENW != 1) begin : g_wren
|
||||
`RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
if (read) begin
|
||||
`RAM_WRITE_WREN
|
||||
else if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
|
@ -304,10 +249,8 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end
|
||||
if (read) begin
|
||||
`RAM_WRITE_ALL
|
||||
else if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
|
@ -318,7 +261,7 @@ module VX_sp_ram #(
|
|||
end else begin : g_async
|
||||
`UNUSED_VAR (read)
|
||||
if (FORCE_BRAM) begin : g_bram
|
||||
`ifdef VIVADO
|
||||
`ifdef ASYNC_BRAM_PATCH
|
||||
VX_async_ram_patch #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (SIZE),
|
||||
|
@ -326,6 +269,7 @@ module VX_sp_ram #(
|
|||
.DUAL_PORT (0),
|
||||
.FORCE_BRAM (FORCE_BRAM),
|
||||
.RADDR_REG (RADDR_REG),
|
||||
.RADDR_RESET(RADDR_RESET),
|
||||
.WRITE_FIRST(RDW_MODE == "W"),
|
||||
.INIT_ENABLE(INIT_ENABLE),
|
||||
.INIT_FILE (INIT_FILE),
|
||||
|
@ -347,18 +291,14 @@ module VX_sp_ram #(
|
|||
`RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
assign rdata = ram[addr];
|
||||
end else begin : g_no_wren
|
||||
`RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
end
|
||||
assign rdata = ram[addr];
|
||||
end
|
||||
|
@ -367,18 +307,14 @@ module VX_sp_ram #(
|
|||
`NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
assign rdata = ram[addr];
|
||||
end else begin : g_no_wren
|
||||
`NO_RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
end
|
||||
assign rdata = ram[addr];
|
||||
end
|
||||
|
@ -390,18 +326,14 @@ module VX_sp_ram #(
|
|||
`RW_RAM_CHECK `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
assign rdata = ram[addr];
|
||||
end else begin : g_no_wren
|
||||
`RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
end
|
||||
assign rdata = ram[addr];
|
||||
end
|
||||
|
@ -410,18 +342,14 @@ module VX_sp_ram #(
|
|||
`NO_RW_RAM_CHECK `RAM_ARRAY_WREN
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
assign rdata = ram[addr];
|
||||
end else begin : g_no_wren
|
||||
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1];
|
||||
`RAM_INITIALIZATION
|
||||
always @(posedge clk) begin
|
||||
if (write) begin
|
||||
ram[addr] <= wdata;
|
||||
end
|
||||
`RAM_WRITE_ALL
|
||||
end
|
||||
assign rdata = ram[addr];
|
||||
end
|
||||
|
@ -434,24 +362,14 @@ module VX_sp_ram #(
|
|||
`RAM_INITIALIZATION
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (RESET_RAM && reset) begin
|
||||
for (integer i = 0; i < SIZE; ++i) begin
|
||||
ram[i] <= DATAW'(INIT_VALUE);
|
||||
end
|
||||
end else if (write) begin
|
||||
for (integer i = 0; i < WRENW; ++i) begin
|
||||
if (wren[i]) begin
|
||||
ram[addr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
|
||||
end
|
||||
end
|
||||
end
|
||||
`RAM_WRITE_WREN
|
||||
end
|
||||
|
||||
if (OUT_REG) begin : g_sync
|
||||
if (RDW_MODE == "W") begin : g_write_first
|
||||
reg [ADDRW-1:0] addr_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (read) begin
|
||||
addr_r <= addr;
|
||||
end
|
||||
end
|
||||
|
@ -459,7 +377,7 @@ module VX_sp_ram #(
|
|||
end else if (RDW_MODE == "R") begin : g_read_first
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read || write) begin
|
||||
if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
|
@ -472,14 +390,6 @@ module VX_sp_ram #(
|
|||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end else if (RDW_MODE == "U") begin : g_unknown
|
||||
reg [DATAW-1:0] rdata_r;
|
||||
always @(posedge clk) begin
|
||||
if (read) begin
|
||||
rdata_r <= ram[addr];
|
||||
end
|
||||
end
|
||||
assign rdata = rdata_r;
|
||||
end
|
||||
end else begin : g_async
|
||||
`UNUSED_VAR (read)
|
||||
|
|
|
@ -206,13 +206,13 @@ module VX_stream_xbar #(
|
|||
reg [PERF_CTR_BITS-1:0] collisions_r;
|
||||
|
||||
always @(*) begin
|
||||
per_cycle_collision = 0;
|
||||
per_cycle_collision = '0;
|
||||
for (integer i = 0; i < NUM_INPUTS; ++i) begin
|
||||
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
|
||||
for (integer j = i + 1; j < NUM_INPUTS; ++j) begin
|
||||
per_cycle_collision[i] |= valid_in[i]
|
||||
&& valid_in[j+i]
|
||||
&& (sel_in[i] == sel_in[j+i])
|
||||
&& (ready_in[i] | ready_in[j+i]);
|
||||
&& valid_in[j]
|
||||
&& (sel_in[i] == sel_in[j])
|
||||
&& (ready_in[i] | ready_in[j]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -43,7 +43,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
output cache_perf_t lmem_perf,
|
||||
output lmem_perf_t lmem_perf,
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if.slave mem_bus_if [NUM_REQS]
|
||||
|
@ -286,14 +286,10 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
assign lmem_perf.reads = perf_reads;
|
||||
assign lmem_perf.writes = perf_writes;
|
||||
assign lmem_perf.read_misses = '0;
|
||||
assign lmem_perf.write_misses = '0;
|
||||
assign lmem_perf.bank_stalls = perf_collisions;
|
||||
assign lmem_perf.mshr_stalls = '0;
|
||||
assign lmem_perf.mem_stalls = '0;
|
||||
assign lmem_perf.crsp_stalls = perf_crsp_stalls;
|
||||
assign lmem_perf.reads = perf_reads;
|
||||
assign lmem_perf.writes = perf_writes;
|
||||
assign lmem_perf.bank_stalls = perf_collisions;
|
||||
assign lmem_perf.crsp_stalls = perf_crsp_stalls;
|
||||
|
||||
`endif
|
||||
|
||||
|
@ -321,15 +317,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
always @(posedge clk) begin
|
||||
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
|
||||
if (mem_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
|
@ -339,15 +335,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
always @(posedge clk) begin
|
||||
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
|
||||
if (per_bank_req_rw[i]) begin
|
||||
`TRACE(2, ("%t: %s-bank%0d wr-req: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s bank-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s bank-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
|
||||
end
|
||||
end
|
||||
if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin
|
||||
`TRACE(2, ("%t: %s-bank%0d rd-rsp: data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s bank-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_rsp_data[i], per_bank_rsp_tag_value[i], per_bank_rsp_uuid[i]))
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,43 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Interface bundling the memory-hierarchy performance records:
// one cache_perf_t per cache level (icache, dcache, l2cache, l3cache, lmem)
// plus a mem_perf_t record for memory. Types come from VX_gpu_pkg.
interface VX_mem_perf_if import VX_gpu_pkg::*; ();
|
||||
|
||||
cache_perf_t icache;
|
||||
cache_perf_t dcache;
|
||||
cache_perf_t l2cache;
|
||||
cache_perf_t l3cache;
|
||||
cache_perf_t lmem;
|
||||
mem_perf_t mem;
|
||||
|
||||
// master: the side that drives (outputs) every performance record
modport master (
|
||||
output icache,
|
||||
output dcache,
|
||||
output l2cache,
|
||||
output l3cache,
|
||||
output lmem,
|
||||
output mem
|
||||
);
|
||||
|
||||
// slave: the side that only reads (inputs) every performance record
modport slave (
|
||||
input icache,
|
||||
input dcache,
|
||||
input l2cache,
|
||||
input l3cache,
|
||||
input lmem,
|
||||
input mem
|
||||
);
|
||||
|
||||
endinterface
|
|
@ -13,7 +13,6 @@
|
|||
|
||||
namespace eval vortex {
|
||||
|
||||
variable info 0
|
||||
variable debug 0
|
||||
|
||||
proc print_error {msg {do_exit 1}} {
|
||||
|
@ -21,7 +20,8 @@ proc print_error {msg {do_exit 1}} {
|
|||
puts "ERROR: $msg"
|
||||
exit -1
|
||||
} else {
|
||||
puts "WARNING: $msg"
|
||||
variable debug
|
||||
if {$debug} {puts "WARNING: $msg"}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -132,6 +132,17 @@ proc find_cell_nets {cell name_match {should_exist 1}} {
|
|||
return $matching_nets
|
||||
}
|
||||
|
||||
# Single-result wrapper around find_cell_nets.
#   cell         - cell forwarded to find_cell_nets
#   name_match   - name pattern forwarded to find_cell_nets
#   should_exist - forwarded unchanged to find_cell_nets (default 1)
# Returns "" when find_cell_nets yields an empty list, the sole net when it
# yields exactly one, and prints an error then exits with -1 when it yields
# more than one.
proc find_cell_net {cell name_match {should_exist 1}} {
|
||||
set nets [find_cell_nets $cell $name_match $should_exist]
|
||||
if {[llength $nets] == 0} {
|
||||
return ""
|
||||
} elseif {[llength $nets] > 1} {
|
||||
puts "ERROR: Multiple matching nets found for '$cell' matching '$name_match'."
|
||||
exit -1
|
||||
}
|
||||
return [lindex $nets 0]
|
||||
}
|
||||
|
||||
proc get_cell_net {cell name} {
|
||||
set net [get_nets -hierarchical -filter "PARENT_CELL == $cell && NAME == $name"]
|
||||
if {[llength $net] == 0} {
|
||||
|
@ -168,88 +179,52 @@ proc get_cell_pin {cell name} {
|
|||
}
|
||||
|
||||
proc remove_cell_from_netlist {cell} {
|
||||
variable info
|
||||
variable debug
|
||||
|
||||
# Disconnect all pins of the cell
|
||||
foreach pin [get_pins -quiet -of_objects $cell] {
|
||||
foreach net [get_nets -quiet -of_objects $pin] {
|
||||
disconnect_net -net $net -objects $pin
|
||||
if {$info} {puts "INFO: Disconnected net '$net' from pin '$pin'."}
|
||||
if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."}
|
||||
}
|
||||
}
|
||||
|
||||
# Remove the cell
|
||||
remove_cell $cell
|
||||
if {$info} {puts "INFO: Cell '$cell' was removed successfully."}
|
||||
if {$debug} {puts "DEBUG: Cell '$cell' was removed successfully."}
|
||||
}
|
||||
|
||||
# Re-drive 'pin' from the net attached to 'source_pin'.
#   pin        - pin whose current net connection is replaced
#   source_pin - pin whose net becomes the new connection for 'pin'
# Steps: disconnect the net currently on 'pin' (warn if there is none, exit -1
# if there are several); look up the net on 'source_pin', creating and
# connecting a fresh one if none exists (exit -1 if there are several);
# finally connect 'pin' to that net. DEBUG messages are printed only when the
# namespace variable 'debug' is true.
proc replace_pin_source {pin source_pin} {
|
||||
variable debug
|
||||
|
||||
# Disconnect existing net from pin
|
||||
set net [get_nets -of_objects $pin]
|
||||
if {[llength $net] == 1} {
|
||||
disconnect_net -net $net -objects $pin
|
||||
if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."}
|
||||
} elseif {[llength $net] > 1} {
|
||||
puts "ERROR: Multiple nets connected to pin '$pin'."
|
||||
exit -1
|
||||
} else {
|
||||
puts "WARNING: No net connected to pin '$pin'."
|
||||
}
|
||||
|
||||
set source_net [get_nets -quiet -of_objects $source_pin]
|
||||
if {[llength $source_net] == 0} {
|
||||
# Create a new net if none exists
|
||||
set source_cell [get_cells -of_objects $source_pin]
|
||||
set net_name [unique_net_name "${source_cell}_net"]
|
||||
set source_net [create_net $net_name]
|
||||
if {$debug} {puts "DEBUG: Created source_net: '$source_net'"}
|
||||
# Connect the source pin to the new net
|
||||
connect_net -net $source_net -objects $source_pin -hierarchical
|
||||
if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$source_pin'."}
|
||||
} elseif {[llength $source_net] > 1} {
|
||||
puts "ERROR: Multiple nets connected to pin '$source_pin'."
|
||||
exit -1
|
||||
}
|
||||
|
||||
# Connect pin to the new source net
|
||||
connect_net -net $source_net -objects $pin -hierarchical
|
||||
if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$pin'."}
|
||||
}
|
||||
|
||||
proc find_net_driver {input_net {should_exist 1}} {
|
||||
set driverPins [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "OUT"}]
|
||||
proc find_net_driver {taregt_net {should_exist 1}} {
|
||||
set driverPins [get_pins -quiet -leaf -of_objects $taregt_net -filter {DIRECTION == "OUT"}]
|
||||
if {[llength $driverPins] == 0} {
|
||||
set driverPorts [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "IN"}]
|
||||
set driverPorts [get_ports -quiet -of_objects $taregt_net -filter {DIRECTION == "IN"}]
|
||||
if {[llength $driverPorts] == 0} {
|
||||
print_error "No driver found for '$input_net'." $should_exist
|
||||
print_error "No driver found for '$taregt_net'." $should_exist
|
||||
} elseif {[llength $driverPorts] > 1} {
|
||||
puts "WARNING: Multiple driver ports found for '$input_net'."
|
||||
puts "WARNING: Multiple driver ports found for '$taregt_net'."
|
||||
return [lindex $driverPorts 0]
|
||||
}
|
||||
return $driverPorts
|
||||
} elseif {[llength $driverPins] > 1} {
|
||||
puts "WARNING: Multiple driver pins found for '$input_net'."
|
||||
puts "WARNING: Multiple driver pins found for '$taregt_net'."
|
||||
return [lindex $driverPins 0]
|
||||
}
|
||||
return $driverPins
|
||||
}
|
||||
|
||||
proc find_pin_driver {input_pin {should_exist 1}} {
|
||||
set net [get_nets -quiet -of_objects $input_pin]
|
||||
proc find_pin_driver {target_pin {should_exist 1}} {
|
||||
set net [get_nets -quiet -of_objects $target_pin]
|
||||
if {[llength $net] == 0} {
|
||||
print_error "No net connected to pin '$input_pin'." $should_exist
|
||||
print_error "No net connected to pin '$target_pin'." $should_exist
|
||||
return ""
|
||||
} elseif {[llength $net] > 1} {
|
||||
puts "ERROR: Multiple nets connected to pin '$input_pin'."
|
||||
puts "ERROR: Multiple nets connected to pin '$target_pin'."
|
||||
exit -1
|
||||
}
|
||||
return [find_net_driver $net]
|
||||
}
|
||||
|
||||
proc create_register_next {parent reg_cell} {
|
||||
variable info
|
||||
proc create_register_next {parent reg_cell raddr_reset} {
|
||||
variable debug
|
||||
|
||||
set hier_sep [get_hierarchy_separator]
|
||||
|
@ -273,6 +248,10 @@ proc create_register_next {parent reg_cell} {
|
|||
|
||||
if {$debug} {puts "DEBUG: reg_d_src_pin: '$reg_d_src_pin'"}
|
||||
|
||||
if {$raddr_reset == ""} {
|
||||
return $reg_d_src_pin
|
||||
}
|
||||
|
||||
set reg_r_src_pin ""
|
||||
|
||||
set register_type [get_property REF_NAME $reg_cell]
|
||||
|
@ -341,7 +320,7 @@ proc create_register_next {parent reg_cell} {
|
|||
# FDSE: O = I1 ? 1 : I0; where I0=D, I1=S
|
||||
set lut_name [unique_cell_name "${parent}${hier_sep}raddr_next"]
|
||||
set lut_cell [create_cell -reference LUT2 $lut_name]
|
||||
if {$info} {puts "INFO: Created lut cell: '$lut_cell'"}
|
||||
if {$debug} {puts "DEBUG: Created lut cell: '$lut_cell'"}
|
||||
|
||||
if {$register_type == "FDRE"} {
|
||||
set_property INIT 4'b0010 $lut_cell
|
||||
|
@ -389,7 +368,6 @@ proc create_register_next {parent reg_cell} {
|
|||
}
|
||||
|
||||
proc getOrCreateVCCPin {parent} {
|
||||
variable info
|
||||
variable debug
|
||||
|
||||
set hier_sep [get_hierarchy_separator]
|
||||
|
@ -398,7 +376,7 @@ proc getOrCreateVCCPin {parent} {
|
|||
set vcc_cell [get_cells -quiet $cell_name]
|
||||
if {[llength $vcc_cell] == 0} {
|
||||
set vcc_cell [create_cell -reference VCC $cell_name]
|
||||
if {$info} {puts "INFO: Created VCC cell: '$vcc_cell'"}
|
||||
if {$debug} {puts "DEBUG: Created VCC cell: '$vcc_cell'"}
|
||||
} elseif {[llength $vcc_cell] > 1} {
|
||||
puts "ERROR: Multiple VCC cells found with name '$cell_name'."
|
||||
exit -1
|
||||
|
@ -417,7 +395,6 @@ proc getOrCreateVCCPin {parent} {
|
|||
}
|
||||
|
||||
proc getOrCreateGNDPin {parent} {
|
||||
variable info
|
||||
variable debug
|
||||
|
||||
set hier_sep [get_hierarchy_separator]
|
||||
|
@ -426,7 +403,7 @@ proc getOrCreateGNDPin {parent} {
|
|||
set gnd_cell [get_cells -quiet $cell_name]
|
||||
if {[llength $gnd_cell] == 0} {
|
||||
set gnd_cell [create_cell -reference GND $cell_name]
|
||||
if {$info} {puts "INFO: Created GND cell: '$gnd_cell'"}
|
||||
if {$debug} {puts "DEBUG: Created GND cell: '$gnd_cell'"}
|
||||
} elseif {[llength $gnd_cell] > 1} {
|
||||
puts "ERROR: Multiple GND cells found with name '$cell_name'."
|
||||
exit -1
|
||||
|
@ -444,16 +421,28 @@ proc getOrCreateGNDPin {parent} {
|
|||
return $gnd_pin
|
||||
}
|
||||
|
||||
proc find_net_sinks {input_net {should_exist 1}} {
|
||||
proc find_net_sinks {source_net {should_exist 1}} {
|
||||
set sink_pins {}
|
||||
foreach pin [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "IN"}] {
|
||||
lappend sink_pins $pin
|
||||
# Iterate through all pins connected to the source net
|
||||
foreach pin [get_pins -quiet -of_objects $source_net] {
|
||||
set direction [get_property DIRECTION $pin]
|
||||
# Input pins of nested cells
|
||||
if {$direction == "IN"} {
|
||||
lappend sink_pins $pin
|
||||
}
|
||||
# Output pins of the parent cell
|
||||
set pin_cell [get_cells -of_objects $pin]
|
||||
set is_primitive [get_property IS_PRIMITIVE $pin_cell]
|
||||
if {$direction == "OUT" && !$is_primitive} {
|
||||
lappend sink_pins $pin
|
||||
}
|
||||
}
|
||||
foreach port [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "OUT"}] {
|
||||
# Add any top-module output ports connected to the source net
|
||||
foreach port [get_ports -quiet -of_objects $source_net -filter {DIRECTION == "OUT"}] {
|
||||
lappend sink_pins $port
|
||||
}
|
||||
if {[llength $sink_pins] == 0} {
|
||||
print_error "No sink found for '$input_net'." $should_exist
|
||||
print_error "No sink found for '$source_net'." $should_exist
|
||||
}
|
||||
return $sink_pins
|
||||
}
|
||||
|
@ -497,13 +486,49 @@ proc find_matching_pins {cell pins match repl} {
|
|||
}
|
||||
|
||||
proc replace_net_source {net source_pin} {
|
||||
variable debug
|
||||
foreach pin [find_net_sinks $net 0] {
|
||||
replace_pin_source $pin $source_pin
|
||||
# disconnect net from pin
|
||||
disconnect_net -net $net -objects $pin
|
||||
if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."}
|
||||
|
||||
# find/create source net
|
||||
set source_net [get_nets -quiet -of_objects $source_pin]
|
||||
if {[llength $source_net] == 0} {
|
||||
# Create a new net (in source_cell's parent) if none exists
|
||||
set source_cell [get_cells -of_objects $source_pin]
|
||||
set net_name [unique_net_name "${source_cell}_tmp_net"]
|
||||
set source_net [create_net $net_name]
|
||||
if {$debug} {puts "DEBUG: Created source_net: '$source_net'"}
|
||||
# Connect the source pin to the new net
|
||||
connect_net -net $source_net -objects $source_pin -hierarchical
|
||||
if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$source_pin'."}
|
||||
} elseif {[llength $source_net] > 1} {
|
||||
puts "ERROR: Multiple nets connected to pin '$source_pin'."
|
||||
exit -1
|
||||
}
|
||||
|
||||
set external_net [get_nets -quiet -of_objects $pin]
|
||||
if {[llength $external_net] == 0} {
|
||||
# Connect pin to source net
|
||||
connect_net -net $source_net -objects $pin -hierarchical
|
||||
if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$pin'."}
|
||||
} elseif {[llength $external_net] == 1} {
|
||||
foreach external_pin [get_pins -of_objects $external_net] {
|
||||
# disconnect external net from pin
|
||||
disconnect_net -net $external_net -objects $pin
|
||||
if {$debug} {puts "DEBUG: Disconnected net '$external_net' from pin '$pin'."}
|
||||
# recurse-connect external net's pins to source_pin
|
||||
replace_net_source $external_net $source_pin
|
||||
}
|
||||
} else {
|
||||
puts "ERROR: Multiple nets connected to pin '$pin'."
|
||||
exit -1
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
proc resolve_async_bram {inst} {
|
||||
variable info
|
||||
variable debug
|
||||
|
||||
puts "INFO: Resolving asynchronous BRAM patch: '$inst'."
|
||||
|
@ -511,20 +536,32 @@ proc resolve_async_bram {inst} {
|
|||
set hier_sep [get_hierarchy_separator]
|
||||
|
||||
set raddr_w_nets [find_cell_nets $inst "raddr_w(\\\[\\d+\\\])?$"]
|
||||
set read_s_net [find_cell_nets $inst "read_s$"]
|
||||
set is_raddr_reg_net [find_cell_nets $inst "is_raddr_reg$"]
|
||||
|
||||
set read_s_net [find_cell_net $inst "read_s$"]
|
||||
if {$debug} {puts "DEBUG: read_s_net: '$read_s_net'"}
|
||||
|
||||
set is_raddr_reg_net [find_cell_net $inst "g_async_ram.is_raddr_reg$" 0]
|
||||
if {$debug} {puts "DEBUG: is_raddr_reg_net: '$is_raddr_reg_net'"}
|
||||
|
||||
set raddr_s_nets [find_matching_nets $inst $raddr_w_nets "raddr_w(\\\[\\d+\\\])?$" "raddr_s\\1"]
|
||||
|
||||
set reg_next_pins {}
|
||||
set reg_ce_src_pin ""
|
||||
|
||||
set raddr_reset_net [find_cell_net $inst "raddr_reset$" 0]
|
||||
if {$debug} {puts "DEBUG: raddr_reset: '$raddr_reset_net'"}
|
||||
|
||||
# Process each raddr_w net
|
||||
foreach raddr_w_net $raddr_w_nets {
|
||||
if {$debug} {puts "DEBUG: Processing raddr_w net: '$raddr_w_net'"}
|
||||
|
||||
# Find raddr_w_net's driver pin
|
||||
set raddr_src_pin [find_net_driver $raddr_w_net]
|
||||
if {$debug} {puts "DEBUG: raddr_src_pin: '$raddr_src_pin'"}
|
||||
if {[get_ports -quiet $raddr_src_pin] ne ""} {
|
||||
puts "WARNING: Net '$raddr_w_net' is not registered, driver_type=port"
|
||||
break
|
||||
}
|
||||
|
||||
# Get the driver cell
|
||||
set raddr_src_cell [get_cells -of_objects $raddr_src_pin]
|
||||
|
@ -541,12 +578,12 @@ proc resolve_async_bram {inst} {
|
|||
if {$driver_type == "FDRE" || $driver_type == "FDSE"} {
|
||||
if {$debug} {puts "DEBUG: Net '$raddr_w_net' is registered, driver_type='$driver_type'"}
|
||||
} else {
|
||||
puts "WARNING: Net '$raddr_w_net' is not be registered, driver_type='$driver_type'"
|
||||
puts "WARNING: Net '$raddr_w_net' is not registered, driver_type='$driver_type'"
|
||||
break
|
||||
}
|
||||
|
||||
# Create register next cell and return output pin
|
||||
set reg_next_pin [create_register_next $inst $raddr_src_cell]
|
||||
set reg_next_pin [create_register_next $inst $raddr_src_cell $raddr_reset_net]
|
||||
if {$reg_next_pin == ""} {
|
||||
puts "ERROR: failed to create register next value for '$raddr_src_cell'."
|
||||
exit -1
|
||||
|
@ -576,61 +613,75 @@ proc resolve_async_bram {inst} {
|
|||
}
|
||||
}
|
||||
|
||||
set addr_width [llength $raddr_w_nets]
|
||||
|
||||
# do we have a fully registered read address?
|
||||
if {[llength $reg_next_pins] == [llength $raddr_w_nets]} {
|
||||
if {$info} {puts "INFO: Fully registered read address detected."}
|
||||
if {[llength $reg_next_pins] == $addr_width} {
|
||||
if {$debug} {puts "DEBUG: Fully registered read address detected."}
|
||||
|
||||
# Connect all reg_next_pins to all input pins attached to raddr_s_nets
|
||||
set addr_width [llength $raddr_w_nets]
|
||||
for {set addr_idx 0} {$addr_idx < $addr_width} {incr addr_idx} {
|
||||
set raddr_s_net [lindex $raddr_s_nets $addr_idx]
|
||||
set reg_next_pin [lindex $reg_next_pins $addr_idx]
|
||||
if {$info} {puts "INFO: Connecting pin '$reg_next_pin' to '$raddr_s_net's pins."}
|
||||
# Connect reg_next_pin to all input pins attached to raddr_s_net
|
||||
if {$debug} {puts "DEBUG: Connecting pin '$reg_next_pin' net to '$raddr_s_net's pins."}
|
||||
replace_net_source $raddr_s_net $reg_next_pin
|
||||
}
|
||||
|
||||
# Connect reg_ce_src_pin to all input pins attached to read_s_net
|
||||
if {$info} {puts "INFO: Connecting pin '$reg_ce_src_pin' to '$read_s_net's pins."}
|
||||
if {$debug} {puts "DEBUG: Connecting pin '$reg_ce_src_pin' net to '$read_s_net's pins."}
|
||||
replace_net_source $read_s_net $reg_ce_src_pin
|
||||
|
||||
# Create Const<1>'s pin
|
||||
set vcc_pin [getOrCreateVCCPin $inst]
|
||||
if {$is_raddr_reg_net != ""} {
|
||||
# Create Const<1>'s pin
|
||||
set vcc_pin [getOrCreateVCCPin $inst]
|
||||
|
||||
# Connect vcc_pin to all input pins attached to is_raddr_reg_net
|
||||
if {$info} {puts "INFO: Connecting pin '$vcc_pin' to '$is_raddr_reg_net's pins."}
|
||||
replace_net_source $is_raddr_reg_net $vcc_pin
|
||||
|
||||
# Remove all async_ram cells
|
||||
foreach cell [find_nested_cells $inst "g_async_ram.*" 0] {
|
||||
remove_cell_from_netlist $cell
|
||||
# Connect vcc_pin to all input pins attached to is_raddr_reg_net
|
||||
if {$debug} {puts "DEBUG: Connecting pin '$vcc_pin' to net '$is_raddr_reg_net's pins."}
|
||||
replace_net_source $is_raddr_reg_net $vcc_pin
|
||||
}
|
||||
} else {
|
||||
puts "WARNING: Not all read addresses are registered!"
|
||||
if {$is_raddr_reg_net == ""} {
|
||||
puts "ERROR: read address not fully registered!"
|
||||
exit -1
|
||||
} else {
|
||||
puts "WARNING: read address not fully registered!"
|
||||
}
|
||||
|
||||
# Create Const<0>'s pin
|
||||
set gnd_pin [getOrCreateGNDPin $inst]
|
||||
|
||||
# Connect gnd_pin to all input pins attached to is_raddr_reg_net
|
||||
if {$info} {puts "INFO: Connecting pin '$gnd_pin' to '$is_raddr_reg_net's pins."}
|
||||
replace_net_source $is_raddr_reg_net $gnd_pin
|
||||
|
||||
# Remove all sync_ram cells
|
||||
foreach cell [find_nested_cells $inst "g_sync_ram.*" 0] {
|
||||
remove_cell_from_netlist $cell
|
||||
# Connect GND to all input pins attached to raddr_s_nets
|
||||
for {set addr_idx 0} {$addr_idx < $addr_width} {incr addr_idx} {
|
||||
set raddr_s_net [lindex $raddr_s_nets $addr_idx]
|
||||
if {$debug} {puts "DEBUG: Connecting pin '$gnd_pin' net to '$raddr_s_net's pins."}
|
||||
replace_net_source $raddr_s_net $gnd_pin
|
||||
}
|
||||
|
||||
# Connect GND to all input pins attached to read_s_net
|
||||
if {$debug} {puts "DEBUG: Connecting pin '$gnd_pin' net to '$read_s_net's pins."}
|
||||
replace_net_source $read_s_net $gnd_pin
|
||||
|
||||
# Connect gnd_pin to all input pins attached to is_raddr_reg_net
|
||||
if {$debug} {puts "DEBUG: Connecting pin '$gnd_pin' to net '$is_raddr_reg_net's pins."}
|
||||
replace_net_source $is_raddr_reg_net $gnd_pin
|
||||
}
|
||||
|
||||
# Remove placeholder cell
|
||||
foreach cell [find_nested_cells $inst "placeholder$"] {
|
||||
# Remove placeholder cells
|
||||
foreach cell [find_nested_cells $inst "placeholder1$"] {
|
||||
remove_cell_from_netlist $cell
|
||||
}
|
||||
if {$is_raddr_reg_net != ""} {
|
||||
foreach cell [find_nested_cells $inst "g_async_ram.placeholder2$"] {
|
||||
remove_cell_from_netlist $cell
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
proc resolve_async_brams {} {
|
||||
variable debug
|
||||
set bram_patch_cells {}
|
||||
foreach cell [get_cells -hierarchical -filter {REF_NAME =~ "*VX_async_ram_patch*"}] {
|
||||
puts "INFO: Found async BRAM patch cell: '$cell'."
|
||||
if {$debug} {puts "DEBUG: Found async BRAM patch cell: '$cell'."}
|
||||
lappend bram_patch_cells $cell
|
||||
}
|
||||
if {[llength $bram_patch_cells] != 0} {
|
||||
|
|
|
@ -8,4 +8,5 @@ FPU_INCLUDE = -I$(RTL_DIR)/fpu
|
|||
ifneq (,$(findstring FPU_FPNEW,$(CONFIGS)))
|
||||
FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src
|
||||
endif
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE)
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE)
|
||||
RTL_INCLUDE = -I..
|
|
@ -5,7 +5,6 @@ DEVICE_FAMILY ?= arria10
|
|||
|
||||
PREFIX ?= build$(XLEN)
|
||||
TARGET ?= fpga
|
||||
NUM_CORES ?= 1
|
||||
|
||||
SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/opae
|
||||
|
||||
|
@ -44,6 +43,7 @@ ifeq ($(DEVICE_FAMILY), arria10)
|
|||
CONFIGS += -DALTERA_A10
|
||||
endif
|
||||
|
||||
ifdef NUM_CORES
|
||||
# cluster configuration
|
||||
CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1
|
||||
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
|
||||
|
@ -53,6 +53,7 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16
|
|||
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
|
||||
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
|
||||
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
|
||||
endif
|
||||
|
||||
# include sources
|
||||
RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv
|
||||
|
|
|
@ -47,14 +47,18 @@ TARGET=hw PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 make chipscope
|
|||
# analyze build report
|
||||
vitis_analyzer build_xilinx_u50_gen3x16_xdma_5_202210_1_hw_4c/bin/vortex_afu.xclbin.link_summary
|
||||
|
||||
# resuming build for routing
|
||||
# resuming builds
|
||||
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.synth" make > build.log 2>&1 &
|
||||
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl" make > build.log 2>&1 &
|
||||
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.opt_design" make > build.log 2>&1 &
|
||||
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.place_design" make > build.log 2>&1 &
|
||||
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.phys_opt_design" make > build.log 2>&1 &
|
||||
TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.route_design" make > build.log 2>&1 &
|
||||
|
||||
# running test
|
||||
FPGA_BIN_DIR=<bin_dir> TARGET=hw_emu ./ci/blackbox.sh --driver=xrt --app=demo
|
||||
FPGA_BIN_DIR=<bin_dir> TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo
|
||||
FPGA_BIN_DIR=<bin_dir> TARGET=hw_emu ./ci/blackbox.sh --driver=xrt --app=demo
|
||||
FPGA_BIN_DIR=<bin_dir> XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo
|
||||
FPGA_BIN_DIR=<bin_dir> TARGET=hw ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n1024"
|
||||
FPGA_BIN_DIR=<bin_dir> XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n1024"
|
||||
|
||||
# build report logs
|
||||
<build_dir>/bin/vortex_afu.xclbin.info
|
||||
|
|
|
@ -37,10 +37,15 @@ else
|
|||
endif
|
||||
|
||||
clean:
|
||||
ifndef RESUME
|
||||
rm -rf project_1
|
||||
rm -rf .Xil
|
||||
rm -f *.rpt
|
||||
rm -f vivado*.log
|
||||
rm -f vivado*.jou
|
||||
rm -f *.log
|
||||
rm -f *.jou
|
||||
rm -f *.dcp
|
||||
else
|
||||
@echo "RESUME is defined, skipping clean."
|
||||
endif
|
||||
|
||||
.PHONY: all gen-sources build clean
|
|
@ -11,9 +11,6 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Start time
|
||||
set start_time [clock seconds]
|
||||
|
||||
if { $::argc != 4 } {
|
||||
puts "ERROR: Program \"$::argv0\" requires 4 arguments!\n"
|
||||
puts "Usage: $::argv0 <top_module> <device_part> <vcs_file> <xdc_file>\n"
|
||||
|
@ -46,95 +43,134 @@ if {[info exists ::env(MAX_JOBS)]} {
|
|||
set num_jobs 0
|
||||
}
|
||||
|
||||
# create fpu ip
|
||||
if {[info exists ::env(FPU_IP)]} {
|
||||
set ip_dir $::env(FPU_IP)
|
||||
set argv [list $ip_dir $device_part]
|
||||
set argc 2
|
||||
source ${script_dir}/xilinx_ip_gen.tcl
|
||||
proc run_setup {} {
|
||||
global project_name
|
||||
global top_module device_part vcs_file xdc_file
|
||||
global script_dir source_dir
|
||||
global num_jobs
|
||||
global argv argc ;# Using global system variables: argv and argc
|
||||
|
||||
# create fpu ip
|
||||
if {[info exists ::env(FPU_IP)]} {
|
||||
set ip_dir $::env(FPU_IP)
|
||||
set argv [list $ip_dir $device_part]
|
||||
set argc 2
|
||||
source ${script_dir}/xilinx_ip_gen.tcl
|
||||
}
|
||||
|
||||
source "${script_dir}/parse_vcs_list.tcl"
|
||||
set vlist [parse_vcs_list "${vcs_file}"]
|
||||
|
||||
set vsources_list [lindex $vlist 0]
|
||||
set vincludes_list [lindex $vlist 1]
|
||||
set vdefines_list [lindex $vlist 2]
|
||||
|
||||
#puts $vsources_list
|
||||
#puts $vincludes_list
|
||||
#puts $vdefines_list
|
||||
# Create project
|
||||
create_project $project_name $project_name -force -part $device_part
|
||||
|
||||
# Add constrains file
|
||||
read_xdc $xdc_file
|
||||
|
||||
# Add the design sources
|
||||
add_files -norecurse -verbose $vsources_list
|
||||
|
||||
# process defines
|
||||
set_property verilog_define ${vdefines_list} [current_fileset]
|
||||
|
||||
# add fpu ip
|
||||
if {[info exists ::env(FPU_IP)]} {
|
||||
set ip_dir $::env(FPU_IP)
|
||||
add_files -norecurse -verbose ${ip_dir}/xil_fma/xil_fma.xci
|
||||
add_files -norecurse -verbose ${ip_dir}/xil_fdiv/xil_fdiv.xci
|
||||
add_files -norecurse -verbose ${ip_dir}/xil_fsqrt/xil_fsqrt.xci
|
||||
}
|
||||
|
||||
# Synthesis
|
||||
set_property top $top_module [current_fileset]
|
||||
set_property \
|
||||
-name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} \
|
||||
-value {-mode out_of_context} \
|
||||
-objects [get_runs synth_1]
|
||||
|
||||
# register compilation hooks
|
||||
#set_property STEPS.SYNTH_DESIGN.TCL.PRE ${source_dir}/pre_synth_hook.tcl [get_runs synth_1]
|
||||
#set_property STEPS.SYNTH_DESIGN.TCL.POST ${source_dir}/post_synth_hook.tcl [get_runs synth_1]
|
||||
set_property STEPS.OPT_DESIGN.TCL.PRE ${script_dir}/xilinx_async_bram_patch.tcl [get_runs impl_1]
|
||||
#set_property STEPS.OPT_DESIGN.TCL.POST ${source_dir}/post_opt_hook.tcl [get_runs impl_1]
|
||||
#set_property STEPS.ROUTE_DESIGN.TCL.PRE ${source_dir}/pre_route_hook.tcl [get_runs impl_1]
|
||||
#set_property STEPS.ROUTE_DESIGN.TCL.POST ${source_dir}/post_route_hook.tcl [get_runs impl_1]
|
||||
|
||||
update_compile_order -fileset sources_1
|
||||
}
|
||||
|
||||
source "${script_dir}/parse_vcs_list.tcl"
|
||||
set vlist [parse_vcs_list "${vcs_file}"]
|
||||
proc run_synthesis {} {
|
||||
global num_jobs
|
||||
|
||||
set vsources_list [lindex $vlist 0]
|
||||
set vincludes_list [lindex $vlist 1]
|
||||
set vdefines_list [lindex $vlist 2]
|
||||
|
||||
#puts $vsources_list
|
||||
#puts $vincludes_list
|
||||
#puts $vdefines_list
|
||||
|
||||
# Create project
|
||||
create_project $project_name $project_name -force -part $device_part
|
||||
|
||||
# Add constrains file
|
||||
read_xdc $xdc_file
|
||||
|
||||
# Add the design sources
|
||||
add_files -norecurse -verbose $vsources_list
|
||||
|
||||
# process defines
|
||||
set_property verilog_define ${vdefines_list} [current_fileset]
|
||||
|
||||
# add fpu ip
|
||||
if {[info exists ::env(FPU_IP)]} {
|
||||
set ip_dir $::env(FPU_IP)
|
||||
add_files -norecurse -verbose ${ip_dir}/xil_fma/xil_fma.xci
|
||||
add_files -norecurse -verbose ${ip_dir}/xil_fdiv/xil_fdiv.xci
|
||||
add_files -norecurse -verbose ${ip_dir}/xil_fsqrt/xil_fsqrt.xci
|
||||
if {$num_jobs != 0} {
|
||||
launch_runs synth_1 -verbose -jobs $num_jobs
|
||||
} else {
|
||||
launch_runs synth_1 -verbose
|
||||
}
|
||||
wait_on_run synth_1
|
||||
open_run synth_1
|
||||
report_utilization -file post_synth_util.rpt -hierarchical -hierarchical_percentages
|
||||
write_checkpoint -force post_synth.dcp
|
||||
}
|
||||
|
||||
update_compile_order -fileset sources_1
|
||||
proc run_implementation {} {
|
||||
global num_jobs
|
||||
|
||||
# Synthesis
|
||||
set_property top $top_module [current_fileset]
|
||||
if {$num_jobs != 0} {
|
||||
launch_runs impl_1 -verbose -jobs $num_jobs
|
||||
} else {
|
||||
launch_runs impl_1 -verbose
|
||||
}
|
||||
wait_on_run impl_1
|
||||
open_run impl_1
|
||||
report_utilization -file post_impl_util.rpt -hierarchical -hierarchical_percentages
|
||||
write_checkpoint -force post_impl.dcp
|
||||
}
|
||||
|
||||
set_property \
|
||||
-name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} \
|
||||
-value {-mode out_of_context -flatten_hierarchy "rebuilt"} \
|
||||
-objects [get_runs synth_1]
|
||||
proc run_report {} {
|
||||
# Generate the synthesis report
|
||||
report_place_status -file place.rpt
|
||||
report_route_status -file route.rpt
|
||||
|
||||
# register compilation hooks
|
||||
#set_property STEPS.SYNTH_DESIGN.TCL.PRE ${source_dir}/pre_synth_hook.tcl [get_runs synth_1]
|
||||
#set_property STEPS.SYNTH_DESIGN.TCL.POST ${source_dir}/post_synth_hook.tcl [get_runs synth_1]
|
||||
set_property STEPS.OPT_DESIGN.TCL.PRE ${script_dir}/xilinx_async_bram_patch.tcl [get_runs impl_1]
|
||||
#set_property STEPS.OPT_DESIGN.TCL.POST ${source_dir}/post_opt_hook.tcl [get_runs impl_1]
|
||||
#set_property STEPS.ROUTE_DESIGN.TCL.PRE ${source_dir}/pre_route_hook.tcl [get_runs impl_1]
|
||||
#set_property STEPS.ROUTE_DESIGN.TCL.POST ${source_dir}/post_route_hook.tcl [get_runs impl_1]
|
||||
# Generate timing report
|
||||
report_timing -nworst 100 -delay_type max -sort_by group -file timing.rpt
|
||||
|
||||
if {$num_jobs != 0} {
|
||||
launch_runs synth_1 -verbose -jobs $num_jobs
|
||||
# Generate power and drc reports
|
||||
report_power -file power.rpt
|
||||
report_drc -file drc.rpt
|
||||
}
|
||||
|
||||
###############################################################################
|
||||
|
||||
# Start time
|
||||
set start_time [clock seconds]
|
||||
|
||||
set checkpoint_synth "post_synth.dcp"
|
||||
set checkpoint_impl "post_impl.dcp"
|
||||
|
||||
if { [file exists $checkpoint_impl] } {
|
||||
puts "Resuming from post-implementation checkpoint: $checkpoint_impl"
|
||||
open_checkpoint $checkpoint_impl
|
||||
run_report
|
||||
} elseif { [file exists $checkpoint_synth] } {
|
||||
puts "Resuming from post-synthesis checkpoint: $checkpoint_synth"
|
||||
open_checkpoint $checkpoint_synth
|
||||
run_implementation
|
||||
run_report
|
||||
} else {
|
||||
launch_runs synth_1 -verbose
|
||||
# Execute full pipeline
|
||||
run_setup
|
||||
run_synthesis
|
||||
run_implementation
|
||||
run_report
|
||||
}
|
||||
wait_on_run synth_1
|
||||
open_run synth_1
|
||||
write_checkpoint -force post_synth.dcp
|
||||
report_utilization -file post_synth_util.rpt -hierarchical -hierarchical_percentages
|
||||
|
||||
# Implementation
|
||||
if {$num_jobs != 0} {
|
||||
launch_runs impl_1 -verbose -jobs $num_jobs
|
||||
} else {
|
||||
launch_runs impl_1 -verbose
|
||||
}
|
||||
wait_on_run impl_1
|
||||
open_run impl_1
|
||||
write_checkpoint -force post_impl.dcp
|
||||
report_utilization -file post_impl_util.rpt -hierarchical -hierarchical_percentages
|
||||
|
||||
# Generate the synthesis report
|
||||
report_place_status -file place.rpt
|
||||
report_route_status -file route.rpt
|
||||
report_timing_summary -file timing.rpt
|
||||
|
||||
# Generate timing report
|
||||
report_timing -nworst 10 -delay_type max -sort_by group -file timing.rpt
|
||||
|
||||
# Generate power and drc reports
|
||||
report_power -file power.rpt
|
||||
report_drc -file drc.rpt
|
||||
|
||||
# End time and calculation
|
||||
set elapsed_time [expr {[clock seconds] - $start_time}]
|
||||
|
|
|
@ -458,7 +458,7 @@ if { [file exists post_impl.dcp] } {
|
|||
run_implementation
|
||||
run_report
|
||||
} else {
|
||||
# execute full pipeline
|
||||
# Execute full pipeline
|
||||
run_setup
|
||||
run_synthesis
|
||||
run_implementation
|
||||
|
|
|
@ -15,7 +15,6 @@ endif
|
|||
TARGET ?= hw
|
||||
PLATFORM ?=
|
||||
|
||||
NUM_CORES ?= 1
|
||||
PREFIX ?= build$(XLEN)
|
||||
MAX_JOBS ?= 8
|
||||
|
||||
|
@ -64,6 +63,7 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
|
|||
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
|
||||
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
|
||||
|
||||
ifdef NUM_CORES
|
||||
# cluster configuration
|
||||
CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1
|
||||
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
|
||||
|
@ -73,6 +73,7 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16
|
|||
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16
|
||||
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16
|
||||
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
|
||||
endif
|
||||
|
||||
# include sources
|
||||
RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
|
||||
|
@ -115,12 +116,12 @@ endif
|
|||
# Debugging
|
||||
ifdef DEBUG
|
||||
VPP_FLAGS += -g --optimize 0 --debug.protocol all
|
||||
ifneq ($(TARGET), hw)
|
||||
VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all
|
||||
CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS)
|
||||
else
|
||||
ifeq ($(TARGET), hw)
|
||||
VPP_FLAGS += --debug.chipscope vortex_afu_1
|
||||
CFLAGS += -DNDEBUG -DCHIPSCOPE $(DBG_SCOPE_FLAGS)
|
||||
else
|
||||
VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all
|
||||
CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS)
|
||||
endif
|
||||
else
|
||||
VPP_FLAGS += --optimize 3
|
||||
|
|
|
@ -5,7 +5,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/yosys
|
|||
|
||||
TOP_LEVEL_ENTITY ?= Vortex
|
||||
PREFIX ?= build
|
||||
NUM_CORES ?= 1
|
||||
|
||||
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
|
||||
RTL_DIR := $(VORTEX_HOME)/hw/rtl
|
||||
|
@ -30,7 +29,7 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE
|
|||
DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH
|
||||
DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU
|
||||
|
||||
|
||||
ifdef NUM_CORES
|
||||
# cluster configuration
|
||||
CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1
|
||||
CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2
|
||||
|
@ -40,6 +39,7 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE
|
|||
CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE
|
||||
CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE
|
||||
CONFIGS += $(CONFIGS_$(NUM_CORES)c)
|
||||
endif
|
||||
|
||||
# include paths
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fpu
|
||||
|
|
|
@ -211,8 +211,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t mem_reads = 0;
|
||||
uint64_t mem_writes = 0;
|
||||
uint64_t mem_lat = 0;
|
||||
uint64_t mem_req_counter = 0;
|
||||
uint64_t mem_ticks = 0;
|
||||
uint64_t mem_bank_stalls = 0;
|
||||
|
||||
uint64_t num_cores;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
|
||||
|
@ -223,7 +222,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
|
||||
return err;
|
||||
});
|
||||
|
||||
|
||||
uint64_t num_mem_bank_ports;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_MEM_BANKS, &num_mem_bank_ports), {
|
||||
return err;
|
||||
|
@ -437,6 +436,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
uint64_t dcache_requests_per_core = 0;
|
||||
|
||||
if (dcache_enable) {
|
||||
// PERF: Dcache
|
||||
uint64_t dcache_reads;
|
||||
|
@ -447,6 +448,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
|
||||
return err;
|
||||
});
|
||||
dcache_requests_per_core += dcache_reads + dcache_writes;
|
||||
uint64_t dcache_read_misses;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
|
||||
return err;
|
||||
|
@ -475,6 +477,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
// PERF: coalescer
|
||||
uint64_t coalescer_misses;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_COALESCER_MISS, core_id, &coalescer_misses), {
|
||||
return err;
|
||||
});
|
||||
int coalescer_utilization = calcAvgPercent(dcache_requests_per_core - coalescer_misses, dcache_requests_per_core);
|
||||
fprintf(stream, "PERF: core%d: coalescer misses=%ld (hit ratio=%d%%)\n", core_id, coalescer_misses, coalescer_utilization);
|
||||
|
||||
if (l2cache_enable) {
|
||||
// PERF: L2cache
|
||||
uint64_t tmp;
|
||||
|
@ -540,10 +550,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_CNTR, core_id, &mem_req_counter), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_TICK, core_id, &mem_ticks), {
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_ST, core_id, &mem_bank_stalls), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
|
@ -612,7 +619,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
|
||||
int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
|
||||
int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
|
||||
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
|
||||
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
|
||||
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
|
||||
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
|
||||
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);
|
||||
|
@ -621,11 +628,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
|
||||
int memory_bank_port_utilization = calcAvgPercent(mem_req_counter, (mem_ticks * num_mem_bank_ports));
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
|
||||
fprintf(stream, "PERF: memory bank port utilization=%d%%\n", memory_bank_port_utilization);
|
||||
{
|
||||
uint64_t mem_requests = mem_reads + mem_writes;
|
||||
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
|
||||
int mem_bank_utilization = calcAvgPercent(mem_requests, mem_requests + mem_bank_stalls);
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", mem_requests, mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
|
||||
fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
|
||||
// XRT includes
|
||||
#ifdef XRTSIM
|
||||
#include <xrt.h>
|
||||
#include <xrt_c.h>
|
||||
#else
|
||||
#include "experimental/xrt_bo.h"
|
||||
#include "experimental/xrt_device.h"
|
||||
|
|
|
@ -104,6 +104,27 @@ inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
|
|||
return (bits << shift) >> (shift + start);
|
||||
}
|
||||
|
||||
// Reverses the order of all 64 bits of `bits`: bit 0 becomes bit 63, bit 1
// becomes bit 62, and so on.
//
// Works by swapping progressively larger groups: adjacent bits, then pairs,
// nibbles, bytes, 16-bit halves, and finally the two 32-bit halves.
inline uint64_t bit_reverse(uint64_t bits) {
  bits = ((bits & 0xAAAAAAAAAAAAAAAA) >> 1) | ((bits & 0x5555555555555555) << 1);
  bits = ((bits & 0xCCCCCCCCCCCCCCCC) >> 2) | ((bits & 0x3333333333333333) << 2);
  bits = ((bits & 0xF0F0F0F0F0F0F0F0) >> 4) | ((bits & 0x0F0F0F0F0F0F0F0F) << 4);
  bits = ((bits & 0xFF00FF00FF00FF00) >> 8) | ((bits & 0x00FF00FF00FF00FF) << 8);
  bits = ((bits & 0xFFFF0000FFFF0000) >> 16) | ((bits & 0x0000FFFF0000FFFF) << 16);
  bits = (bits >> 32) | (bits << 32);
  return bits;
}

// Reverses the order of the low `width` bits of `bits`: bit i (for i < width)
// moves to bit (width - 1 - i). Bits at or above `width` are ignored and the
// corresponding bits of the result are zero.
//
// `width` must be in [0, 64]; a width of 0 yields 0.
inline uint64_t bit_reverse(uint64_t bits, uint32_t width) {
  assert(width <= 64);
  if (width == 0) {
    // Guard: the shift below would be by 64, which is undefined for a 64-bit value.
    return 0;
  }
  // A full 64-bit reversal places original bit i at bit (63 - i). Shifting
  // right by (64 - width) moves it to (width - 1 - i) and discards every bit
  // that originated at or above `width`.
  return bit_reverse(bits) >> (64 - width);
}
|
||||
|
||||
template <typename T = uint32_t>
|
||||
T sext(const T& word, uint32_t width) {
|
||||
assert(width > 1);
|
||||
|
|
|
@ -21,32 +21,32 @@ template <typename T = uint32_t>
|
|||
class BitVector {
|
||||
private:
|
||||
static constexpr size_t BITS_PER_WORD = sizeof(T) * 8;
|
||||
std::vector<T> bits_;
|
||||
std::vector<T> words_;
|
||||
size_t size_;
|
||||
bool all_zero_;
|
||||
|
||||
size_t wordIndex(size_t pos) const {
|
||||
constexpr size_t wordIndex(size_t pos) const {
|
||||
return pos / BITS_PER_WORD;
|
||||
}
|
||||
|
||||
T bitMask(size_t pos) const {
|
||||
constexpr T bitMask(size_t pos) const {
|
||||
return T(1) << (pos % BITS_PER_WORD);
|
||||
}
|
||||
|
||||
void updateAllZero() {
|
||||
all_zero_ = std::all_of(bits_.begin(), bits_.end(), [](T word) { return word == 0; });
|
||||
all_zero_ = std::all_of(words_.begin(), words_.end(), [](T word) { return word == 0; });
|
||||
}
|
||||
|
||||
public:
|
||||
explicit BitVector(size_t size = 0)
|
||||
: bits_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD)
|
||||
: words_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD)
|
||||
, size_(size)
|
||||
, all_zero_(true)
|
||||
{}
|
||||
|
||||
void set(size_t pos) {
|
||||
if (pos >= size_) throw std::out_of_range("Index out of range");
|
||||
bits_[this->wordIndex(pos)] |= this->bitMask(pos);
|
||||
words_[this->wordIndex(pos)] |= this->bitMask(pos);
|
||||
all_zero_ = false;
|
||||
}
|
||||
|
||||
|
@ -59,19 +59,19 @@ public:
|
|||
}
|
||||
|
||||
void reset() {
|
||||
std::fill(bits_.begin(), bits_.end(), 0);
|
||||
std::fill(words_.begin(), words_.end(), 0);
|
||||
all_zero_ = true;
|
||||
}
|
||||
|
||||
void reset(size_t pos) {
|
||||
if (pos >= size_) throw std::out_of_range("Index out of range");
|
||||
bits_[this->wordIndex(pos)] &= ~this->bitMask(pos);
|
||||
words_[this->wordIndex(pos)] &= ~this->bitMask(pos);
|
||||
this->updateAllZero();
|
||||
}
|
||||
|
||||
bool test(size_t pos) const {
|
||||
if (pos >= size_) throw std::out_of_range("Index out of range");
|
||||
return bits_[this->wordIndex(pos)] & this->bitMask(pos);
|
||||
return words_[this->wordIndex(pos)] & this->bitMask(pos);
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
|
@ -80,12 +80,12 @@ public:
|
|||
|
||||
void resize(size_t new_size) {
|
||||
size_ = new_size;
|
||||
bits_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0);
|
||||
words_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0);
|
||||
this->updateAllZero();
|
||||
}
|
||||
|
||||
bool operator==(const BitVector& other) const {
|
||||
return (size_ == other.size_) && (bits_ == other.bits_);
|
||||
return (size_ == other.size_) && (words_ == other.words_);
|
||||
}
|
||||
|
||||
bool operator!=(const BitVector& other) const {
|
||||
|
@ -98,8 +98,8 @@ public:
|
|||
|
||||
BitVector& operator&=(const BitVector& other) {
|
||||
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
|
||||
for (size_t i = 0; i < bits_.size(); ++i) {
|
||||
bits_[i] &= other.bits_[i];
|
||||
for (size_t i = 0; i < words_.size(); ++i) {
|
||||
words_[i] &= other.words_[i];
|
||||
}
|
||||
this->updateAllZero();
|
||||
return *this;
|
||||
|
@ -107,8 +107,8 @@ public:
|
|||
|
||||
BitVector& operator|=(const BitVector& other) {
|
||||
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
|
||||
for (size_t i = 0; i < bits_.size(); ++i) {
|
||||
bits_[i] |= other.bits_[i];
|
||||
for (size_t i = 0; i < words_.size(); ++i) {
|
||||
words_[i] |= other.words_[i];
|
||||
}
|
||||
this->updateAllZero();
|
||||
return *this;
|
||||
|
@ -116,8 +116,8 @@ public:
|
|||
|
||||
BitVector& operator^=(const BitVector& other) {
|
||||
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
|
||||
for (size_t i = 0; i < bits_.size(); ++i) {
|
||||
bits_[i] ^= other.bits_[i];
|
||||
for (size_t i = 0; i < words_.size(); ++i) {
|
||||
words_[i] ^= other.words_[i];
|
||||
}
|
||||
this->updateAllZero();
|
||||
return *this;
|
||||
|
@ -125,23 +125,48 @@ public:
|
|||
|
||||
BitVector operator~() const {
|
||||
BitVector result(size_);
|
||||
for (size_t i = 0; i < bits_.size(); ++i) {
|
||||
result.bits_[i] = ~bits_[i];
|
||||
for (size_t i = 0; i < words_.size(); ++i) {
|
||||
result.words_[i] = ~words_[i];
|
||||
}
|
||||
result.updateAllZero();
|
||||
return result;
|
||||
}
|
||||
|
||||
void flip() {
|
||||
for (auto &word : bits_) {
|
||||
for (auto &word : words_) {
|
||||
word = ~word;
|
||||
}
|
||||
this->updateAllZero();
|
||||
}
|
||||
|
||||
void reverse() {
|
||||
if (size_ == 0)
|
||||
return;
|
||||
size_t remaining_bits = size_ % BITS_PER_WORD;
|
||||
if (remaining_bits != 0) {
|
||||
std::vector<T> reversed_words(words_.size(), 0);
|
||||
for (size_t i = 0; i < size_; ++i) {
|
||||
size_t reversed_pos = size_ - 1 - i;
|
||||
size_t src_word = i / BITS_PER_WORD;
|
||||
size_t src_offset = i % BITS_PER_WORD;
|
||||
size_t dst_word = reversed_pos / BITS_PER_WORD;
|
||||
size_t dst_offset = reversed_pos % BITS_PER_WORD;
|
||||
if (words_[src_word] & (T(1) << src_offset)) {
|
||||
reversed_words[dst_word] |= (T(1) << dst_offset);
|
||||
}
|
||||
}
|
||||
words_ = std::move(reversed_words);
|
||||
} else {
|
||||
std::reverse(words_.begin(), words_.end());
|
||||
for (auto &word : words_) {
|
||||
word = static_cast<T>(bit_reverse(static_cast<uint64_t>(word)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t count() const {
|
||||
size_t count = 0;
|
||||
for (const auto &word : bits_) {
|
||||
for (const auto &word : words_) {
|
||||
count += std::bitset<BITS_PER_WORD>(word).count();
|
||||
}
|
||||
return count;
|
||||
|
@ -160,12 +185,12 @@ public:
|
|||
size_t remaining_bits = size_ % BITS_PER_WORD;
|
||||
T full_mask = ~T(0);
|
||||
for (size_t i = 0; i < full_bits; ++i) {
|
||||
if (bits_[i] != full_mask)
|
||||
if (words_[i] != full_mask)
|
||||
return false;
|
||||
}
|
||||
if (remaining_bits > 0) {
|
||||
T partial_mask = (T(1) << remaining_bits) - 1;
|
||||
if ((bits_[full_bits] & partial_mask) != partial_mask)
|
||||
if ((words_[full_bits] & partial_mask) != partial_mask)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -181,17 +206,17 @@ public:
|
|||
size_t bit_shift = pos % BITS_PER_WORD;
|
||||
|
||||
if (word_shift > 0) {
|
||||
for (size_t i = bits_.size() - 1; i >= word_shift; --i) {
|
||||
bits_[i] = bits_[i - word_shift];
|
||||
for (size_t i = words_.size() - 1; i >= word_shift; --i) {
|
||||
words_[i] = words_[i - word_shift];
|
||||
}
|
||||
std::fill(bits_.begin(), bits_.begin() + word_shift, 0);
|
||||
std::fill(words_.begin(), words_.begin() + word_shift, 0);
|
||||
}
|
||||
|
||||
if (bit_shift > 0) {
|
||||
for (size_t i = bits_.size() - 1; i > 0; --i) {
|
||||
bits_[i] = (bits_[i] << bit_shift) | (bits_[i - 1] >> (BITS_PER_WORD - bit_shift));
|
||||
for (size_t i = words_.size() - 1; i > 0; --i) {
|
||||
words_[i] = (words_[i] << bit_shift) | (words_[i - 1] >> (BITS_PER_WORD - bit_shift));
|
||||
}
|
||||
bits_[0] <<= bit_shift;
|
||||
words_[0] <<= bit_shift;
|
||||
}
|
||||
|
||||
this->updateAllZero();
|
||||
|
@ -208,17 +233,17 @@ public:
|
|||
size_t bit_shift = pos % BITS_PER_WORD;
|
||||
|
||||
if (word_shift > 0) {
|
||||
for (size_t i = 0; i < bits_.size() - word_shift; ++i) {
|
||||
bits_[i] = bits_[i + word_shift];
|
||||
for (size_t i = 0; i < words_.size() - word_shift; ++i) {
|
||||
words_[i] = words_[i + word_shift];
|
||||
}
|
||||
std::fill(bits_.end() - word_shift, bits_.end(), 0);
|
||||
std::fill(words_.end() - word_shift, words_.end(), 0);
|
||||
}
|
||||
|
||||
if (bit_shift > 0) {
|
||||
for (size_t i = 0; i < bits_.size() - 1; ++i) {
|
||||
bits_[i] = (bits_[i] >> bit_shift) | (bits_[i + 1] << (BITS_PER_WORD - bit_shift));
|
||||
for (size_t i = 0; i < words_.size() - 1; ++i) {
|
||||
words_[i] = (words_[i] >> bit_shift) | (words_[i + 1] << (BITS_PER_WORD - bit_shift));
|
||||
}
|
||||
bits_.back() >>= bit_shift;
|
||||
words_.back() >>= bit_shift;
|
||||
}
|
||||
|
||||
this->updateAllZero();
|
||||
|
|
|
@ -53,25 +53,25 @@ public:
|
|||
|
||||
SimPort(SimObjectBase* module)
|
||||
: SimPortBase(module)
|
||||
, peer_(nullptr)
|
||||
, sink_(nullptr)
|
||||
, tx_cb_(nullptr)
|
||||
{}
|
||||
|
||||
void bind(SimPort<Pkt>* peer) {
|
||||
assert(peer_ == nullptr);
|
||||
peer_ = peer;
|
||||
void bind(SimPort<Pkt>* sink) {
|
||||
assert(sink_ == nullptr);
|
||||
sink_ = sink;
|
||||
}
|
||||
|
||||
void unbind() {
|
||||
peer_ = nullptr;
|
||||
sink_ = nullptr;
|
||||
}
|
||||
|
||||
bool connected() const {
|
||||
return (peer_ != nullptr);
|
||||
return (sink_ != nullptr);
|
||||
}
|
||||
|
||||
SimPort* peer() const {
|
||||
return peer_;
|
||||
SimPort* sink() const {
|
||||
return sink_;
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
|
@ -111,15 +111,15 @@ protected:
|
|||
};
|
||||
|
||||
std::queue<timed_pkt_t> queue_;
|
||||
SimPort* peer_;
|
||||
SimPort* sink_;
|
||||
TxCallback tx_cb_;
|
||||
|
||||
void transfer(const Pkt& data, uint64_t cycles) {
|
||||
if (tx_cb_) {
|
||||
tx_cb_(data, cycles);
|
||||
}
|
||||
if (peer_) {
|
||||
peer_->transfer(data, cycles);
|
||||
if (sink_) {
|
||||
sink_->transfer(data, cycles);
|
||||
} else {
|
||||
queue_.push({data, cycles});
|
||||
}
|
||||
|
@ -402,8 +402,8 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
|
|||
|
||||
template <typename Pkt>
|
||||
void SimPort<Pkt>::push(const Pkt& pkt, uint64_t delay) const {
|
||||
if (peer_ && !tx_cb_) {
|
||||
reinterpret_cast<const SimPort<Pkt>*>(peer_)->push(pkt, delay);
|
||||
if (sink_ && !tx_cb_) {
|
||||
reinterpret_cast<const SimPort<Pkt>*>(sink_)->push(pkt, delay);
|
||||
} else {
|
||||
SimPlatform::instance().schedule(this, pkt, delay);
|
||||
}
|
||||
|
|
|
@ -46,8 +46,6 @@ Core::Core(const SimContext& ctx,
|
|||
, func_units_((uint32_t)FUType::Count)
|
||||
, lmem_switch_(NUM_LSU_BLOCKS)
|
||||
, mem_coalescers_(NUM_LSU_BLOCKS)
|
||||
, lsu_dcache_adapter_(NUM_LSU_BLOCKS)
|
||||
, lsu_lmem_adapter_(NUM_LSU_BLOCKS)
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, commit_arbs_(ISSUE_WIDTH)
|
||||
{
|
||||
|
@ -64,11 +62,11 @@ Core::Core(const SimContext& ctx,
|
|||
}
|
||||
|
||||
// create local memory
|
||||
snprintf(sname, 100, "%s-local_mem", this->name().c_str());
|
||||
snprintf(sname, 100, "%s-lmem", this->name().c_str());
|
||||
local_mem_ = LocalMem::Create(sname, LocalMem::Config{
|
||||
(1 << LMEM_LOG_SIZE),
|
||||
LSU_WORD_SIZE,
|
||||
LSU_NUM_REQS,
|
||||
LSU_CHANNELS,
|
||||
log2ceil(LMEM_NUM_BANKS),
|
||||
false
|
||||
});
|
||||
|
@ -79,48 +77,52 @@ Core::Core(const SimContext& ctx,
|
|||
lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1);
|
||||
}
|
||||
|
||||
// create lsu dcache adapter
|
||||
// create dcache adapter
|
||||
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter(NUM_LSU_BLOCKS);
|
||||
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
|
||||
snprintf(sname, 100, "%s-lsu_dcache_adapter%d", this->name().c_str(), i);
|
||||
lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
|
||||
lsu_dcache_adapter.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
|
||||
}
|
||||
|
||||
// create lsu lmem adapter
|
||||
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
|
||||
snprintf(sname, 100, "%s-lsu_lmem_adapter%d", this->name().c_str(), i);
|
||||
lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
|
||||
}
|
||||
// create lmem arbiter
|
||||
snprintf(sname, 100, "%s-lmem_arb", this->name().c_str());
|
||||
auto lmem_arb = LsuArbiter::Create(sname, ArbiterType::RoundRobin, NUM_LSU_BLOCKS, 1);
|
||||
|
||||
// connect lsu demux
|
||||
// create lmem adapter
|
||||
snprintf(sname, 100, "%s-lsu_lmem_adapter", this->name().c_str());
|
||||
auto lsu_lmem_adapter = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
|
||||
|
||||
// connect lmem switch
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
|
||||
lmem_switch_.at(b)->ReqLmem.bind(&lmem_arb->ReqIn.at(b));
|
||||
|
||||
mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC);
|
||||
|
||||
lmem_switch_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
|
||||
lsu_lmem_adapter_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspLmem);
|
||||
lmem_arb->RspIn.at(b).bind(&lmem_switch_.at(b)->RspLmem);
|
||||
}
|
||||
|
||||
// connect coalescer-adapter
|
||||
// connect lmem arbiter
|
||||
lmem_arb->ReqOut.at(0).bind(&lsu_lmem_adapter->ReqIn);
|
||||
lsu_lmem_adapter->RspIn.bind(&lmem_arb->RspOut.at(0));
|
||||
|
||||
// connect lmem adapter
|
||||
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
|
||||
lsu_lmem_adapter->ReqOut.at(c).bind(&local_mem_->Inputs.at(c));
|
||||
local_mem_->Outputs.at(c).bind(&lsu_lmem_adapter->RspOut.at(c));
|
||||
}
|
||||
|
||||
// connect dcache coalescer
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter_.at(b)->ReqIn);
|
||||
lsu_dcache_adapter_.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
|
||||
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter.at(b)->ReqIn);
|
||||
lsu_dcache_adapter.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
|
||||
}
|
||||
|
||||
// connect adapter-dcache
|
||||
// connect dcache adapter
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
|
||||
uint32_t i = b * DCACHE_CHANNELS + c;
|
||||
lsu_dcache_adapter_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
|
||||
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter_.at(b)->RspOut.at(c));
|
||||
}
|
||||
}
|
||||
|
||||
// connect adapter-lmem
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
|
||||
uint32_t i = b * LSU_CHANNELS + c;
|
||||
lsu_lmem_adapter_.at(b)->ReqOut.at(c).bind(&local_mem_->Inputs.at(i));
|
||||
local_mem_->Outputs.at(i).bind(&lsu_lmem_adapter_.at(b)->RspOut.at(c));
|
||||
lsu_dcache_adapter.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
|
||||
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter.at(b)->RspOut.at(c));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -127,6 +127,10 @@ public:
|
|||
return local_mem_;
|
||||
}
|
||||
|
||||
// Returns the memory coalescer stored at position `idx` of mem_coalescers_.
// Access goes through std::vector::at(), so an out-of-range `idx` throws
// std::out_of_range rather than reading past the end.
const MemCoalescer::Ptr& mem_coalescer(uint32_t idx) const {
  return mem_coalescers_.at(idx);
}
|
||||
|
||||
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
@ -156,8 +160,6 @@ private:
|
|||
LocalMem::Ptr local_mem_;
|
||||
std::vector<LocalMemSwitch::Ptr> lmem_switch_;
|
||||
std::vector<MemCoalescer::Ptr> mem_coalescers_;
|
||||
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter_;
|
||||
std::vector<LsuMemAdapter::Ptr> lsu_lmem_adapter_;
|
||||
|
||||
PipelineLatch fetch_latch_;
|
||||
PipelineLatch decode_latch_;
|
||||
|
|
|
@ -360,7 +360,6 @@ void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) {
|
|||
} else {
|
||||
mmu_.read(data, addr, size, 0);
|
||||
}
|
||||
|
||||
DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl);
|
||||
}
|
||||
#endif
|
||||
|
@ -567,6 +566,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
auto cluster_perf = core_->socket()->cluster()->perf_stats();
|
||||
auto socket_perf = core_->socket()->perf_stats();
|
||||
auto lmem_perf = core_->local_mem()->perf_stats();
|
||||
|
||||
uint64_t coalescer_misses = 0;
|
||||
for (uint i = 0; i < NUM_LSU_BLOCKS; ++i) {
|
||||
coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses;
|
||||
}
|
||||
|
||||
switch (addr) {
|
||||
CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads);
|
||||
CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses);
|
||||
|
@ -596,8 +601,9 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads);
|
||||
CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes);
|
||||
CSR_READ_64(VX_CSR_MPM_MEM_LT, proc_perf.mem_latency);
|
||||
CSR_READ_64(VX_CSR_MPM_MEM_BANK_CNTR, proc_perf.memsim.counter);
|
||||
CSR_READ_64(VX_CSR_MPM_MEM_BANK_TICK, proc_perf.memsim.ticks);
|
||||
CSR_READ_64(VX_CSR_MPM_MEM_BANK_ST, proc_perf.memsim.bank_stalls);
|
||||
|
||||
CSR_READ_64(VX_CSR_MPM_COALESCER_MISS, coalescer_misses);
|
||||
|
||||
CSR_READ_64(VX_CSR_MPM_LMEM_READS, lmem_perf.reads);
|
||||
CSR_READ_64(VX_CSR_MPM_LMEM_WRITES, lmem_perf.writes);
|
||||
|
|
|
@ -24,14 +24,12 @@ protected:
|
|||
LocalMem* simobject_;
|
||||
Config config_;
|
||||
RAM ram_;
|
||||
uint32_t line_bits_;
|
||||
MemCrossBar::Ptr mem_xbar_;
|
||||
mutable PerfStats perf_stats_;
|
||||
|
||||
uint64_t to_local_addr(uint64_t addr) {
|
||||
uint32_t total_lines = config_.capacity / config_.line_size;
|
||||
uint32_t line_bits = log2ceil(total_lines);
|
||||
uint32_t offset = bit_getw(addr, 0, line_bits-1);
|
||||
return offset;
|
||||
return bit_getw(addr, 0, line_bits_-1);
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -40,9 +38,13 @@ public:
|
|||
, config_(config)
|
||||
, ram_(config.capacity)
|
||||
{
|
||||
uint32_t total_lines = config.capacity / config.line_size;
|
||||
line_bits_ = log2ceil(total_lines);
|
||||
|
||||
char sname[100];
|
||||
snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
|
||||
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_reqs, (1 << config.B));
|
||||
uint32_t wsel_bits = log2ceil(config_.line_size);
|
||||
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, (1 << config.B), wsel_bits);
|
||||
for (uint32_t i = 0; i < config.num_reqs; ++i) {
|
||||
simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i));
|
||||
mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i));
|
||||
|
@ -56,15 +58,15 @@ public:
|
|||
}
|
||||
|
||||
void read(void* data, uint64_t addr, uint32_t size) {
|
||||
auto s_addr = to_local_addr(addr);
|
||||
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl);
|
||||
ram_.read(data, s_addr, size);
|
||||
auto l_addr = to_local_addr(addr);
|
||||
DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
|
||||
ram_.read(data, l_addr, size);
|
||||
}
|
||||
|
||||
void write(const void* data, uint64_t addr, uint32_t size) {
|
||||
auto s_addr = to_local_addr(addr);
|
||||
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl);
|
||||
ram_.write(data, s_addr, size);
|
||||
auto l_addr = to_local_addr(addr);
|
||||
DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
|
||||
ram_.write(data, l_addr, size);
|
||||
}
|
||||
|
||||
void tick() {
|
||||
|
@ -94,7 +96,7 @@ public:
|
|||
}
|
||||
|
||||
const PerfStats& perf_stats() const {
|
||||
perf_stats_.bank_stalls = mem_xbar_->collisions();
|
||||
perf_stats_.bank_stalls = mem_xbar_->req_collisions();
|
||||
return perf_stats_;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -147,10 +147,17 @@ void MemCoalescer::tick() {
|
|||
ReqOut.push(out_req, delay_);
|
||||
DT(4, this->name() << "-mem-req: coalesced=" << cur_mask.count() << ", " << out_req);
|
||||
|
||||
// track partial responses
|
||||
perf_stats_.misses += (cur_mask.count() != in_req.mask.count());
|
||||
|
||||
// update sent mask
|
||||
sent_mask_ |= cur_mask;
|
||||
if (sent_mask_ == in_req.mask) {
|
||||
ReqIn.pop();
|
||||
sent_mask_.reset();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns a read-only reference to this coalescer's accumulated performance
// counters.
const MemCoalescer::PerfStats& MemCoalescer::perf_stats() const {
  return perf_stats_;
}
|
|
@ -23,6 +23,19 @@ public:
|
|||
SimPort<LsuReq> ReqOut;
|
||||
SimPort<LsuRsp> RspOut;
|
||||
|
||||
// Performance counters collected by the coalescer.
struct PerfStats {
  // Running total of coalescer misses; starts at zero.
  uint64_t misses;

  PerfStats()
    : misses(0)
  {}

  // Adds the counters of `rhs` into this object and returns *this so that
  // accumulations can be chained.
  PerfStats& operator+=(const PerfStats& rhs) {
    this->misses += rhs.misses;
    return *this;
  }
};
|
||||
|
||||
MemCoalescer(
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
|
@ -37,6 +50,8 @@ public:
|
|||
|
||||
void tick();
|
||||
|
||||
const PerfStats& perf_stats() const;
|
||||
|
||||
private:
|
||||
|
||||
struct pending_req_t {
|
||||
|
@ -52,6 +67,7 @@ private:
|
|||
BitVector<> sent_mask_;
|
||||
uint32_t line_size_;
|
||||
uint32_t delay_;
|
||||
PerfStats perf_stats_;
|
||||
};
|
||||
|
||||
}
|
|
@ -29,7 +29,7 @@ private:
|
|||
Config config_;
|
||||
MemCrossBar::Ptr mem_xbar_;
|
||||
DramSim dram_sim_;
|
||||
PerfStats perf_stats_;
|
||||
mutable PerfStats perf_stats_;
|
||||
|
||||
struct DramCallbackArgs {
|
||||
MemSim::Impl* memsim;
|
||||
|
@ -57,6 +57,7 @@ public:
|
|||
}
|
||||
|
||||
// Returns the performance counters. bank_stalls is refreshed from the
// crossbar's request-collision count on every call, so this const accessor
// writes to perf_stats_ before returning it.
const PerfStats& perf_stats() const {
  perf_stats_.bank_stalls = mem_xbar_->req_collisions();
  return perf_stats_;
}
|
||||
|
||||
|
@ -66,7 +67,6 @@ public:
|
|||
|
||||
void tick() {
|
||||
dram_sim_.tick();
|
||||
uint32_t counter = 0;
|
||||
|
||||
for (uint32_t i = 0; i < config_.num_banks; ++i) {
|
||||
if (mem_xbar_->ReqOut.at(i).empty())
|
||||
|
@ -102,12 +102,6 @@ public:
|
|||
DT(3, simobject_->name() << "-mem-req[" << i << "]: " << mem_req);
|
||||
|
||||
mem_xbar_->ReqOut.at(i).pop();
|
||||
counter++;
|
||||
}
|
||||
|
||||
perf_stats_.counter += counter;
|
||||
if (counter > 0) {
|
||||
++perf_stats_.ticks;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
|
|
@ -26,17 +26,14 @@ public:
|
|||
};
|
||||
|
||||
struct PerfStats {
|
||||
uint64_t counter;
|
||||
uint64_t ticks;
|
||||
uint64_t bank_stalls;
|
||||
|
||||
PerfStats()
|
||||
: counter(0)
|
||||
, ticks(0)
|
||||
: bank_stalls(0)
|
||||
{}
|
||||
|
||||
PerfStats& operator+=(const PerfStats& rhs) {
|
||||
this->counter += rhs.counter;
|
||||
this->ticks += rhs.ticks;
|
||||
this->bank_stalls += rhs.bank_stalls;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
|
117
sim/simx/types.h
117
sim/simx/types.h
|
@ -527,6 +527,7 @@ public:
|
|||
auto& req_in = Inputs.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
DT(4, this->name() << "-req" << o << ": " << req);
|
||||
Outputs.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_grant(o, g);
|
||||
|
@ -597,37 +598,36 @@ public:
|
|||
// process incoming requests
|
||||
for (uint32_t o = 0; o < O; ++o) {
|
||||
int32_t input_idx = -1;
|
||||
bool has_collision = false;
|
||||
for (uint32_t r = 0; r < R; ++r) {
|
||||
uint32_t i = (grants_.at(o) + r) & (R-1);
|
||||
if (i >= I)
|
||||
continue;
|
||||
auto& req_in = Inputs.at(i);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
if (req_in.empty())
|
||||
continue;
|
||||
auto& req = req_in.front();
|
||||
uint32_t output_idx = 0;
|
||||
if (lg2_outputs_ != 0) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
|
||||
// skip if input is not going to current output
|
||||
uint32_t output_idx = 0;
|
||||
if (O != 1) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
|
||||
}
|
||||
if (output_idx != o)
|
||||
continue;
|
||||
if (input_idx != -1) {
|
||||
++collisions_;
|
||||
continue;
|
||||
}
|
||||
input_idx = i;
|
||||
}
|
||||
if (input_idx != -1) {
|
||||
has_collision = true;
|
||||
continue;
|
||||
}
|
||||
input_idx = i;
|
||||
}
|
||||
if (input_idx != -1) {
|
||||
auto& req_in = Inputs.at(input_idx);
|
||||
auto& req = req_in.front();
|
||||
if (lg2_inputs_ != 0) {
|
||||
req.tag = (req.tag << lg2_inputs_) | input_idx;
|
||||
}
|
||||
DT(4, this->name() << "-req" << input_idx << ": " << req);
|
||||
DT(4, this->name() << "-req" << o << ": " << req);
|
||||
Outputs.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_grant(o, input_idx);
|
||||
collisions_ += has_collision;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -721,8 +721,8 @@ public:
|
|||
g = rsp.tag & (R-1);
|
||||
rsp.tag >>= lg2_num_reqs_;
|
||||
}
|
||||
DT(4, this->name() << "-rsp" << o << ": " << rsp);
|
||||
uint32_t j = o * R + g;
|
||||
DT(4, this->name() << "-rsp" << j << ": " << rsp);
|
||||
RspIn.at(j).push(rsp, 1);
|
||||
rsp_out.pop();
|
||||
}
|
||||
|
@ -742,7 +742,7 @@ public:
|
|||
if (lg2_num_reqs_ != 0) {
|
||||
req.tag = (req.tag << lg2_num_reqs_) | g;
|
||||
}
|
||||
DT(4, this->name() << "-req" << j << ": " << req);
|
||||
DT(4, this->name() << "-req" << o << ": " << req);
|
||||
ReqOut.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_grant(o, g);
|
||||
|
@ -798,7 +798,8 @@ public:
|
|||
, lg2_inputs_(log2ceil(num_inputs))
|
||||
, lg2_outputs_(log2ceil(num_outputs))
|
||||
, addr_start_(addr_start)
|
||||
, collisions_(0) {
|
||||
, req_collisions_(0)
|
||||
, rsp_collisions_(0) {
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 64);
|
||||
assert(num_outputs <= 64);
|
||||
|
@ -824,65 +825,66 @@ public:
|
|||
// process outgoing responses
|
||||
for (uint32_t i = 0; i < I; ++i) {
|
||||
int32_t output_idx = -1;
|
||||
bool has_collision = false;
|
||||
for (uint32_t t = 0; t < T; ++t) {
|
||||
uint32_t o = (rsp_grants_.at(i) + t) & (T-1);
|
||||
if (o >= O)
|
||||
continue;
|
||||
auto& rsp_out = RspOut.at(o);
|
||||
if (!rsp_out.empty()) {
|
||||
auto& rsp = rsp_out.front();
|
||||
// skip if response is not going to current input
|
||||
uint32_t input_idx = 0;
|
||||
if (lg2_inputs_ != 0) {
|
||||
input_idx = rsp.tag & (R-1);
|
||||
}
|
||||
if (input_idx != i)
|
||||
continue;
|
||||
if (output_idx != -1) {
|
||||
++collisions_;
|
||||
continue;
|
||||
}
|
||||
output_idx = o;
|
||||
}
|
||||
}
|
||||
if (output_idx != -1) {
|
||||
auto& rsp_out = RspOut.at(output_idx);
|
||||
if (rsp_out.empty())
|
||||
continue;
|
||||
auto& rsp = rsp_out.front();
|
||||
uint32_t input_idx = 0;
|
||||
if (lg2_inputs_ != 0) {
|
||||
input_idx = rsp.tag & (R-1);
|
||||
// skip if response is not going to current input
|
||||
if (input_idx != i)
|
||||
continue;
|
||||
}
|
||||
if (output_idx != -1) {
|
||||
has_collision = true;
|
||||
continue;
|
||||
}
|
||||
output_idx = o;
|
||||
}
|
||||
if (output_idx != -1) {
|
||||
auto& rsp_out = RspOut.at(output_idx);
|
||||
auto& rsp = rsp_out.front();
|
||||
if (lg2_inputs_ != 0) {
|
||||
rsp.tag >>= lg2_inputs_;
|
||||
}
|
||||
DT(4, this->name() << "-rsp" << output_idx << ": " << rsp);
|
||||
RspIn.at(input_idx).push(rsp, 1);
|
||||
DT(4, this->name() << "-rsp" << i << ": " << rsp);
|
||||
RspIn.at(i).push(rsp, 1);
|
||||
rsp_out.pop();
|
||||
this->update_rsp_grant(i, output_idx);
|
||||
rsp_collisions_ += has_collision;
|
||||
}
|
||||
}
|
||||
|
||||
// process incoming requests
|
||||
for (uint32_t o = 0; o < O; ++o) {
|
||||
int32_t input_idx = -1;
|
||||
bool has_collision = false;
|
||||
for (uint32_t r = 0; r < R; ++r) {
|
||||
uint32_t i = (req_grants_.at(o) + r) & (R-1);
|
||||
if (i >= I)
|
||||
continue;
|
||||
auto& req_in = ReqIn.at(i);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
if (req_in.empty())
|
||||
continue;
|
||||
auto& req = req_in.front();
|
||||
uint32_t output_idx = 0;
|
||||
if (lg2_outputs_ != 0) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
|
||||
// skip if request is not going to current output
|
||||
uint32_t output_idx = 0;
|
||||
if (O != 1) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
|
||||
}
|
||||
if (output_idx != o)
|
||||
continue;
|
||||
if (input_idx != -1) {
|
||||
++collisions_;
|
||||
continue;
|
||||
}
|
||||
input_idx = i;
|
||||
}
|
||||
if (input_idx != -1) {
|
||||
has_collision = true;
|
||||
continue;
|
||||
}
|
||||
input_idx = i;
|
||||
}
|
||||
if (input_idx != -1) {
|
||||
auto& req_in = ReqIn.at(input_idx);
|
||||
|
@ -890,16 +892,21 @@ public:
|
|||
if (lg2_inputs_ != 0) {
|
||||
req.tag = (req.tag << lg2_inputs_) | input_idx;
|
||||
}
|
||||
DT(4, this->name() << "-req" << input_idx << ": " << req);
|
||||
DT(4, this->name() << "-req" << o << ": " << req);
|
||||
ReqOut.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_req_grant(o, input_idx);
|
||||
req_collisions_ += has_collision;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t collisions() const {
|
||||
return collisions_;
|
||||
uint64_t req_collisions() const {
|
||||
return req_collisions_;
|
||||
}
|
||||
|
||||
uint64_t rsp_collisions() const {
|
||||
return rsp_collisions_;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
@ -923,7 +930,8 @@ protected:
|
|||
uint32_t lg2_inputs_;
|
||||
uint32_t lg2_outputs_;
|
||||
uint32_t addr_start_;
|
||||
uint64_t collisions_;
|
||||
uint64_t req_collisions_;
|
||||
uint64_t rsp_collisions_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -978,7 +986,8 @@ private:
|
|||
uint32_t delay_;
|
||||
};
|
||||
|
||||
using MemArbiter = TxArbiter<MemReq, MemRsp>;
|
||||
using LsuArbiter = TxArbiter<LsuReq, LsuRsp>;
|
||||
using MemArbiter = TxArbiter<MemReq, MemRsp>;
|
||||
using MemCrossBar = TxCrossBar<MemReq, MemRsp>;
|
||||
|
||||
}
|
||||
|
|
|
@ -50,7 +50,7 @@ DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS)
|
|||
|
||||
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
|
||||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
SRCS += $(SRC_DIR)/xrt.cpp $(SRC_DIR)/xrt_sim.cpp
|
||||
SRCS += $(SRC_DIR)/xrt_c.cpp $(SRC_DIR)/xrt_sim.cpp
|
||||
|
||||
RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@
|
|||
#include <cstring>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include "xrt.h"
|
||||
#include "xrt_c.h"
|
||||
#include "xrt_sim.h"
|
||||
#include <VX_config.h>
|
||||
#include <util.h>
|
Loading…
Add table
Add a link
Reference in a new issue