minor updates

This commit is contained in:
Blaise Tine 2024-08-31 01:44:41 -07:00
parent 6eee0728fb
commit 7d0c141129
14 changed files with 146 additions and 105 deletions

View file

@ -254,9 +254,6 @@
// lut(x): (x & 8) != 0
`define TO_OUT_BUF_LUTRAM(s) ((s & 8) != 0)
// rbuf(x): (x <= 2) ? 3 : x
`define TO_OUT_RBUF(s) ((s & 8) | `MAX(s & 7, 3))
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
`define _REPEAT_0(f,s)
`define _REPEAT_1(f,s) `f(0)

View file

@ -325,7 +325,7 @@ module VX_alu_muldiv #(
.NUM_INPUTS (2),
.DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)),
.ARBITER ("P"),
.OUT_BUF (1)
.OUT_BUF (2)
) rsp_buf (
.clk (clk),
.reset (reset),

View file

@ -109,6 +109,7 @@ module VX_alu_unit #(
`endif
// can accept new request?
assign per_block_execute_if[block_idx].ready =
`ifdef EXT_M_ENABLE
is_muldiv_op ? muldiv_execute_if.ready :

View file

@ -212,7 +212,7 @@ module VX_core import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.lmem_perf (mem_perf_tmp_if.lmem),
`endif
.lsu_mem_in_if (lsu_mem_if),
.lsu_mem_if (lsu_mem_if),
.dcache_bus_if (dcache_bus_if)
);

View file

@ -23,7 +23,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
output cache_perf_t lmem_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_in_if [`NUM_LSU_BLOCKS],
VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS],
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS]
);
VX_lsu_mem_if #(
@ -54,7 +54,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
) lmem_switch (
.clk (clk),
.reset (reset),
.lsu_in_if (lsu_mem_in_if[i]),
.lsu_in_if (lsu_mem_if[i]),
.global_out_if(lsu_dcache_if[i]),
.local_out_if (lsu_lmem_if[i])
);

View file

@ -54,11 +54,23 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
localparam NUM_FPCORES = 4;
localparam FPCORES_BITS = `LOG2UP(NUM_FPCORES);
localparam REQ_DATAW = NUM_LANES + TAG_WIDTH + `INST_FPU_BITS + `INST_FMT_BITS + `INST_FRM_BITS + 3 * (NUM_LANES * `XLEN);
localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAG_WIDTH;
`UNUSED_VAR (fmt)
wire [NUM_FPCORES-1:0] per_core_valid_in;
wire [NUM_FPCORES-1:0][REQ_DATAW-1:0] per_core_data_in;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0] per_core_mask_in;
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_in;
wire [NUM_FPCORES-1:0][`INST_FPU_BITS-1:0] per_core_op_type;
wire [NUM_FPCORES-1:0][`INST_FMT_BITS-1:0] per_core_fmt;
wire [NUM_FPCORES-1:0][`INST_FRM_BITS-1:0] per_core_frm;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_dataa;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datab;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datac;
wire [NUM_FPCORES-1:0] per_core_ready_in;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_result;
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_out;
wire [NUM_FPCORES-1:0] per_core_ready_out;
@ -94,18 +106,44 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_VAR (datab)
`UNUSED_VAR (datac)
// Decode instruction type
// Decode fpu core type
wire [FPCORES_BITS-1:0] core_select = op_type[3:2];
wire is_sqrt = op_type[0];
wire is_itof = op_type[1];
wire is_signed = ~op_type[0];
wire is_madd = op_type[1];
wire is_neg = op_type[0];
wire is_sub = fmt[1];
// can accept new request?
assign per_core_ready_in[FPU_DIVSQRT] = div_sqrt_ready_in[is_sqrt];
assign ready_in = per_core_ready_in[core_select];
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_INPUTS (1),
.NUM_OUTPUTS (NUM_FPCORES),
.OUT_BUF (0)
) req_switch (
.clk (clk),
.reset (reset),
.sel_in (core_select),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in ({mask_in, tag_in, op_type, fmt, frm, dataa_s, datab_s, datac_s}),
.data_out (per_core_data_in),
.valid_out (per_core_valid_in),
.ready_out (per_core_ready_in)
);
for (genvar i = 0; i < NUM_FPCORES; ++i) begin
assign {
per_core_mask_in[i],
per_core_tag_in[i],
per_core_op_type[i],
per_core_fmt[i],
per_core_frm[i],
per_core_dataa[i],
per_core_datab[i],
per_core_datac[i]
} = per_core_data_in[i];
end
// FMA core
wire is_madd = per_core_op_type[FPU_FMA][1];
wire is_neg = per_core_op_type[FPU_FMA][0];
wire is_sub = per_core_fmt[FPU_FMA][1];
VX_fpu_fma #(
.NUM_LANES (NUM_LANES),
@ -113,17 +151,17 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
) fpu_fma (
.clk (clk),
.reset (fma_reset),
.valid_in (valid_in && (core_select == FPU_FMA)),
.valid_in (per_core_valid_in[FPU_FMA]),
.ready_in (per_core_ready_in[FPU_FMA]),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.mask_in (per_core_mask_in[FPU_FMA]),
.tag_in (per_core_tag_in[FPU_FMA]),
.frm (per_core_frm[FPU_FMA]),
.is_madd (is_madd),
.is_sub (is_sub),
.is_neg (is_neg),
.dataa (dataa_s),
.datab (datab_s),
.datac (datac_s),
.dataa (per_core_dataa[FPU_FMA]),
.datab (per_core_datab[FPU_FMA]),
.datac (per_core_datac[FPU_FMA]),
.has_fflags (per_core_has_fflags[FPU_FMA]),
.fflags (per_core_fflags[FPU_FMA]),
.result (per_core_result[FPU_FMA]),
@ -132,19 +170,24 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
.valid_out (per_core_valid_out[FPU_FMA])
);
// Div/Sqrt cores
wire is_sqrt = per_core_op_type[FPU_DIVSQRT][0];
assign per_core_ready_in[FPU_DIVSQRT] = div_sqrt_ready_in[is_sqrt];
VX_fpu_div #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH)
) fpu_div (
.clk (clk),
.reset (div_reset),
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && ~is_sqrt),
.valid_in (per_core_valid_in[FPU_DIVSQRT] && ~is_sqrt),
.ready_in (div_sqrt_ready_in[0]),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa_s),
.datab (datab_s),
.mask_in (per_core_mask_in[FPU_DIVSQRT]),
.tag_in (per_core_tag_in[FPU_DIVSQRT]),
.frm (per_core_frm[FPU_DIVSQRT]),
.dataa (per_core_dataa[FPU_DIVSQRT]),
.datab (per_core_datab[FPU_DIVSQRT]),
.has_fflags (div_sqrt_has_fflags[0]),
.fflags (div_sqrt_fflags[0]),
.result (div_sqrt_result[0]),
@ -159,12 +202,12 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
) fpu_sqrt (
.clk (clk),
.reset (sqrt_reset),
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && is_sqrt),
.valid_in (per_core_valid_in[FPU_DIVSQRT] && is_sqrt),
.ready_in (div_sqrt_ready_in[1]),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa_s),
.mask_in (per_core_mask_in[FPU_DIVSQRT]),
.tag_in (per_core_tag_in[FPU_DIVSQRT]),
.frm (per_core_frm[FPU_DIVSQRT]),
.dataa (per_core_dataa[FPU_DIVSQRT]),
.has_fflags (div_sqrt_has_fflags[1]),
.fflags (div_sqrt_fflags[1]),
.result (div_sqrt_result[1]),
@ -173,23 +216,27 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
.ready_out (div_sqrt_ready_out[1])
);
// CVT core
wire is_itof = per_core_op_type[FPU_CVT][1];
wire is_signed = ~per_core_op_type[FPU_CVT][0];
wire cvt_ret_int_in = ~is_itof;
wire cvt_ret_int_out;
VX_fpu_cvt #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH+1)
.TAG_WIDTH (1+TAG_WIDTH)
) fpu_cvt (
.clk (clk),
.reset (cvt_reset),
.valid_in (valid_in && (core_select == FPU_CVT)),
.valid_in (per_core_valid_in[FPU_CVT]),
.ready_in (per_core_ready_in[FPU_CVT]),
.mask_in (mask_in),
.tag_in ({cvt_ret_int_in, tag_in}),
.frm (frm),
.mask_in (per_core_mask_in[FPU_CVT]),
.tag_in ({cvt_ret_int_in, per_core_tag_in[FPU_CVT]}),
.frm (per_core_frm[FPU_CVT]),
.is_itof (is_itof),
.is_signed (is_signed),
.dataa (dataa_s),
.dataa (per_core_dataa[FPU_CVT]),
.has_fflags (per_core_has_fflags[FPU_CVT]),
.fflags (per_core_fflags[FPU_CVT]),
.result (per_core_result[FPU_CVT]),
@ -198,12 +245,14 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
.ready_out (per_core_ready_out[FPU_CVT])
);
wire ncp_ret_int_in = (op_type == `INST_FPU_CMP)
|| `INST_FPU_IS_CLASS(op_type, frm)
|| `INST_FPU_IS_MVXW(op_type, frm);
// NCP core
wire ncp_ret_int_in = (per_core_op_type[FPU_NCP] == `INST_FPU_CMP)
|| `INST_FPU_IS_CLASS(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP])
|| `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
wire ncp_ret_int_out;
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
wire ncp_ret_sext_out;
VX_fpu_ncp #(
@ -212,14 +261,14 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
) fpu_ncp (
.clk (clk),
.reset (ncp_reset),
.valid_in (valid_in && (core_select == FPU_NCP)),
.valid_in (per_core_valid_in[FPU_NCP]),
.ready_in (per_core_ready_in[FPU_NCP]),
.mask_in (mask_in),
.tag_in ({ncp_ret_sext_in, ncp_ret_int_in, tag_in}),
.op_type (op_type),
.frm (frm),
.dataa (dataa_s),
.datab (datab_s),
.mask_in (per_core_mask_in[FPU_NCP]),
.tag_in ({ncp_ret_sext_in, ncp_ret_int_in, per_core_tag_in[FPU_NCP]}),
.op_type (per_core_op_type[FPU_NCP]),
.frm (per_core_frm[FPU_NCP]),
.dataa (per_core_dataa[FPU_NCP]),
.datab (per_core_datab[FPU_NCP]),
.result (per_core_result[FPU_NCP]),
.has_fflags (per_core_has_fflags[FPU_NCP]),
.fflags (per_core_fflags[FPU_NCP]),

View file

@ -80,7 +80,6 @@ module VX_mem_coalescer #(
`RUNTIME_ASSERT ((~out_rsp_valid || out_rsp_mask != 0), ("invalid request mask"));
localparam TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH;
localparam NUM_REQS_W = `LOG2UP(NUM_REQS);
// tag + mask + offset
localparam IBUF_DATA_WIDTH = TAG_ID_WIDTH + NUM_REQS + (NUM_REQS * DATA_RATIO_W);
@ -115,13 +114,8 @@ module VX_mem_coalescer #(
logic [NUM_REQS-1:0] addr_matches_r, addr_matches_n;
logic [NUM_REQS-1:0] req_rem_mask_r, req_rem_mask_n;
wire [OUT_REQS-1:0][NUM_REQS_W-1:0] seed_idx;
wire [NUM_REQS-1:0][OUT_ADDR_WIDTH-1:0] in_addr_base;
wire [NUM_REQS-1:0][DATA_RATIO_W-1:0] in_addr_offset;
for (genvar i = 0; i < NUM_REQS; i++) begin
assign in_addr_base[i] = in_req_addr[i][ADDR_WIDTH-1:DATA_RATIO_W];
assign in_addr_offset[i] = in_req_addr[i][DATA_RATIO_W-1:0];
end
@ -140,21 +134,18 @@ module VX_mem_coalescer #(
.valid_out (batch_valid_n[i])
);
if (OUT_REQS > 1) begin
assign seed_idx[i] = {(NUM_REQS_W-DATA_RATIO_W)'(i), batch_idx};
end else begin
assign seed_idx[i] = batch_idx;
end
end
for (genvar i = 0; i < OUT_REQS; ++i) begin
assign seed_addr_n[i] = in_addr_base[seed_idx[i]];
assign seed_flags_n[i] = in_req_flags[seed_idx[i]];
end
for (genvar i = 0; i < OUT_REQS; ++i) begin
wire [DATA_RATIO-1:0][OUT_ADDR_WIDTH-1:0] addr_base;
wire [DATA_RATIO-1:0][FLAGS_WIDTH-1:0] req_flags;
for (genvar j = 0; j < DATA_RATIO; ++j) begin
assign addr_matches_n[i * DATA_RATIO + j] = (in_addr_base[i * DATA_RATIO + j] == seed_addr_n[i]);
assign addr_base[j] = in_req_addr[DATA_RATIO * i + j][ADDR_WIDTH-1:DATA_RATIO_W];
assign req_flags[j] = in_req_flags[DATA_RATIO * i + j];
end
assign seed_addr_n[i] = addr_base[batch_idx];
assign seed_flags_n[i] = req_flags[batch_idx];
for (genvar j = 0; j < DATA_RATIO; ++j) begin
assign addr_matches_n[i * DATA_RATIO + j] = (addr_base[j] == seed_addr_n[i]);
end
end

View file

@ -128,7 +128,7 @@ module VX_pe_serializer #(
data_out_r <= data_out_n;
end
assign enable = ready_out_u || ~batch_out_done;
assign enable = ready_out_u || ~valid_out_u;
assign ready_in = enable && batch_in_done;
assign valid_out_u = batch_out_done;

View file

@ -97,7 +97,7 @@ module VX_stream_arb #(
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (`TO_OUT_RBUF(OUT_BUF)) // to registered output
.OUT_BUF (3)
) fanout_slice_arb (
.clk (clk),
.reset (reset),
@ -242,7 +242,7 @@ module VX_stream_arb #(
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (`TO_OUT_RBUF(OUT_BUF)) // to registered output
.OUT_BUF (3)
) fanout_fork_arb (
.clk (clk),
.reset (reset),

View file

@ -38,36 +38,36 @@ module VX_stream_switch #(
);
if (NUM_INPUTS > NUM_OUTPUTS) begin
wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0] valid_in_r;
wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0][DATAW-1:0] data_in_r;
wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0] valid_in_w;
wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0][DATAW-1:0] data_in_w;
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
for (genvar j = 0; j < NUM_REQS; ++j) begin
localparam ii = i * NUM_REQS + j;
if (ii < NUM_INPUTS) begin
assign valid_in_r[i][j] = valid_in[ii];
assign data_in_r[i][j] = data_in[ii];
assign valid_in_w[i][j] = valid_in[ii];
assign data_in_w[i][j] = data_in[ii];
end else begin
assign valid_in_r[i][j] = 0;
assign data_in_r[i][j] = '0;
assign valid_in_w[i][j] = 0;
assign data_in_w[i][j] = '0;
end
end
end
wire [NUM_OUTPUTS-1:0] valid_out_r;
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_r;
wire [NUM_OUTPUTS-1:0] ready_out_r;
wire [NUM_OUTPUTS-1:0] valid_out_w;
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w;
wire [NUM_OUTPUTS-1:0] ready_out_w;
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
assign valid_out_r[i] = valid_in_r[i][sel_in[i]];
assign data_out_r[i] = data_in_r[i][sel_in[i]];
assign valid_out_w[i] = valid_in_w[i][sel_in[i]];
assign data_out_w[i] = data_in_w[i][sel_in[i]];
end
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
for (genvar j = 0; j < NUM_REQS; ++j) begin
localparam ii = i * NUM_REQS + j;
if (ii < NUM_INPUTS) begin
assign ready_in[ii] = ready_out_r[i] & (sel_in[i] == LOG_NUM_REQS'(j));
assign ready_in[ii] = ready_out_w[i] && (sel_in[i] == LOG_NUM_REQS'(j));
end
end
end
@ -80,9 +80,9 @@ module VX_stream_switch #(
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_out_r[i]),
.ready_in (ready_out_r[i]),
.data_in (data_out_r[i]),
.valid_in (valid_out_w[i]),
.ready_in (ready_out_w[i]),
.data_in (data_out_w[i]),
.data_out (data_out[i]),
.valid_out (valid_out[i]),
.ready_out (ready_out[i])
@ -91,14 +91,14 @@ module VX_stream_switch #(
end else if (NUM_OUTPUTS > NUM_INPUTS) begin
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] valid_out_r;
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] ready_out_r;
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] valid_out_w;
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] ready_out_w;
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar j = 0; j < NUM_REQS; ++j) begin
assign valid_out_r[i][j] = valid_in[i] & (sel_in[i] == LOG_NUM_REQS'(j));
assign valid_out_w[i][j] = valid_in[i] && (sel_in[i] == LOG_NUM_REQS'(j));
end
assign ready_in[i] = ready_out_r[i][sel_in[i]];
assign ready_in[i] = ready_out_w[i][sel_in[i]];
end
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
@ -112,17 +112,16 @@ module VX_stream_switch #(
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_out_r[i][j]),
.ready_in (ready_out_r[i][j]),
.valid_in (valid_out_w[i][j]),
.ready_in (ready_out_w[i][j]),
.data_in (data_in[i]),
.data_out (data_out[ii]),
.valid_out (valid_out[ii]),
.ready_out (ready_out[ii])
);
end else begin
`UNUSED_VAR (reset)
`UNUSED_VAR (valid_out_r[i][j])
assign ready_out_r[i][j] = '0;
`UNUSED_VAR (valid_out_w[i][j])
assign ready_out_w[i][j] = '0;
end
end
end

View file

@ -39,9 +39,9 @@ module VX_stream_unpack #(
if (NUM_REQS > 1) begin
reg [NUM_REQS-1:0] rem_mask;
wire [NUM_REQS-1:0] ready_out_r;
wire [NUM_REQS-1:0] ready_out_w;
wire [NUM_REQS-1:0] rem_mask_n = rem_mask & ~ready_out_r;
wire [NUM_REQS-1:0] rem_mask_n = rem_mask & ~ready_out_w;
wire sent_all = ~(| (mask_in & rem_mask_n));
always @(posedge clk) begin
@ -65,7 +65,7 @@ module VX_stream_unpack #(
.clk (clk),
.reset (reset),
.valid_in (valid_in && mask_in[i] && rem_mask[i]),
.ready_in (ready_out_r[i]),
.ready_in (ready_out_w[i]),
.data_in ({data_in[i], tag_in}),
.data_out ({data_out[i], tag_out[i]}),
.valid_out (valid_out[i]),

View file

@ -90,7 +90,11 @@ else ifeq ($(DEV_ARCH), versal)
# versal
else
# alveo
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:15]
ifneq ($(findstring xilinx_u55c,$(XSA)),)
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31]
else
VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:15]
endif
endif
VPP_FLAGS += --report_level 2

View file

@ -71,7 +71,7 @@ VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION -DSV_DPI
VL_FLAGS += -DXLEN_$(XLEN)
VL_FLAGS += $(CONFIGS)
VL_FLAGS += $(SRC_DIR)/verilator.vlt
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)
VL_FLAGS += $(RTL_PKGS)

View file

@ -2,4 +2,4 @@
lint_off -rule BLKANDNBLK -file "@VORTEX_HOME@/third_party/cvfpu/*"
lint_off -rule UNOPTFLAT -file "@VORTEX_HOME@/third_party/cvfpu/*"
lint_off -file "@VORTEX_HOME@/third_party/cvfpu/*"
lint_off -file "@VORTEX_HOME@/third_party/cvfpu/*"