bug fixes
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions

This commit is contained in:
tinebp 2025-02-28 00:00:46 -08:00
parent 7c7ec029fc
commit d23bca3a3d
9 changed files with 97 additions and 72 deletions

View file

@ -171,7 +171,7 @@ module VX_commit import VX_gpu_pkg::*; #(
assign writeback_if[i].data.data = commit_arb_if[i].data.data;
assign writeback_if[i].data.sop = commit_arb_if[i].data.sop;
assign writeback_if[i].data.eop = commit_arb_if[i].data.eop;
assign commit_arb_if[i].ready = writeback_if[i].ready;
assign commit_arb_if[i].ready = 1;
end
`ifdef DBG_TRACE_PIPELINE

View file

@ -61,6 +61,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
.perf_sfu_uses (issue_perf.sfu_uses),
`endif
.writeback_if (writeback_if),
.operands_if (operands_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);

View file

@ -31,7 +31,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
localparam NUM_LANES = `NUM_LSU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_DATAW= UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + 1 + NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_IN_SIZE);
localparam REQ_ASHIFT = `CLOG2(LSU_WORD_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
@ -105,7 +105,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
wire no_rsp_buf_valid, no_rsp_buf_ready;
wire [LSUQ_SIZEW-1:0] reqq_waddr, reqq_raddr;
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
// fence handling
@ -215,52 +215,69 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
if (PID_BITS != 0) begin : g_pid
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_IN_SIZE-1:0] pkt_sop;
reg [`LSUQ_IN_SIZE-1:0] pkt_eop;
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
wire mem_req_rd_fire = mem_req_fire && ~mem_req_rw;
wire mem_req_rd_fire = mem_req_fire && ~mem_req_rw;
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if.data.sop;
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if.data.eop;
wire mem_rsp_sop_fire = mem_rsp_fire && mem_rsp_sop;
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
wire mem_rsp_eop_pkt_fire= mem_rsp_fire && mem_rsp_eop_pkt;
wire full;
assign mem_rsp_sop_pkt = pkt_sop[reqq_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop && pkt_eop[reqq_raddr] && (pkt_ctr[reqq_raddr] == 1);
VX_allocator #(
.SIZE (`LSUQ_IN_SIZE)
) pkt_allocator (
.clk (clk),
.reset (reset),
.acquire_en (mem_req_rd_eop_fire),
.acquire_addr(pkt_waddr),
.release_en (mem_rsp_eop_pkt_fire),
.release_addr(pkt_raddr),
`UNUSED_PIN (empty),
.full (full)
);
wire rw_collision = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);
always @(posedge clk) begin
if (reset) begin
for (integer i = 0; i < `LSUQ_IN_SIZE; ++i) begin
pkt_ctr[i] <= '0;
pkt_sop[i] <= 1;
pkt_eop[i] <= 0;
end
pkt_ctr <= '0;
pkt_sop <= '0;
pkt_eop <= '0;
end else begin
if (mem_req_rd_eop_fire) begin
pkt_eop[reqq_waddr] <= 1;
if (mem_req_rd_sop_fire) begin
pkt_sop[pkt_waddr] <= 1;
end
if (~(mem_req_rd_fire && mem_rsp_eop_fire && (reqq_raddr == reqq_waddr))) begin
if (mem_req_rd_eop_fire) begin
pkt_eop[pkt_waddr] <= 1;
end
if (mem_rsp_fire) begin
pkt_sop[pkt_raddr] <= 0;
end
if (mem_rsp_eop_pkt_fire) begin
pkt_eop[pkt_raddr] <= 0;
end
if (~rw_collision) begin
if (mem_req_rd_fire) begin
pkt_ctr[reqq_waddr] <= pkt_ctr[reqq_waddr] + PID_BITS'(1);
pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
end
if (mem_rsp_eop_fire) begin
pkt_ctr[reqq_raddr] <= pkt_ctr[reqq_raddr] - PID_BITS'(1);
pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
end
end
if (mem_rsp_sop_fire) begin
pkt_sop[reqq_raddr] <= 0;
end
if (mem_rsp_eop_fire && mem_rsp_eop_pkt) begin
pkt_sop[reqq_raddr] <= 1;
pkt_eop[reqq_raddr] <= 0;
end
end
end
`RUNTIME_ASSERT(~(mem_req_rd_fire && pkt_eop[reqq_waddr]), ("%t: oops! broken eop request! (#%0d)", $time, execute_if.data.uuid))
`RUNTIME_ASSERT(~(mem_req_rd_fire && (2**PID_BITS-1) == pkt_ctr[reqq_waddr]), ("%t: oops! broken ctr request! (#%0d)", $time, execute_if.data.uuid))
`RUNTIME_ASSERT(~(mem_rsp_fire && 0 == pkt_ctr[reqq_raddr]), ("%t: oops! broken ctr response! (#%0d)", $time, rsp_uuid))
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("%t: allocator full!", $time))
`RUNTIME_ASSERT(~(mem_req_rd_sop_fire && pkt_ctr[pkt_waddr] != 0), ("%t: oops! broken sop request!", $time))
`UNUSED_VAR (mem_rsp_sop)
end else begin : g_no_pid
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
`UNUSED_VAR (reqq_raddr)
`UNUSED_VAR (pkt_raddr)
end
// pack memory request tag
@ -273,7 +290,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
execute_if.data.op_type,
req_align,
execute_if.data.pid,
reqq_waddr,
pkt_waddr,
req_is_fence
};
@ -322,12 +339,10 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
.core_req_queue_id (reqq_waddr),
// request queue info
`UNUSED_PIN (req_queue_empty),
`UNUSED_PIN (req_queue_pop),
`UNUSED_PIN (req_queue_id),
`UNUSED_PIN (req_queue_rw_notify),
// Output response
.core_rsp_valid (mem_rsp_valid),
@ -393,7 +408,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
rsp_op_type,
rsp_align,
rsp_pid,
reqq_raddr,
pkt_raddr,
rsp_is_fence
} = mem_rsp_tag;
@ -448,7 +463,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.reset (reset),
.valid_in (mem_rsp_valid),
.ready_in (mem_rsp_ready),
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_wb, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_wb, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
.data_out ({result_rsp_if.data.uuid, result_rsp_if.data.wid, result_rsp_if.data.tmask, result_rsp_if.data.PC, result_rsp_if.data.wb, result_rsp_if.data.rd, result_rsp_if.data.data, result_rsp_if.data.pid, result_rsp_if.data.sop, result_rsp_if.data.eop}),
.valid_out (result_rsp_if.valid),
.ready_out (result_rsp_if.ready)
@ -462,7 +477,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.reset (reset),
.valid_in (no_rsp_buf_valid),
.ready_in (no_rsp_buf_ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_out ({result_no_rsp_if.data.uuid, result_no_rsp_if.data.wid, result_no_rsp_if.data.tmask, result_no_rsp_if.data.PC, result_no_rsp_if.data.pid, result_no_rsp_if.data.sop, result_no_rsp_if.data.eop}),
.valid_out (result_no_rsp_if.valid),
.ready_out (result_no_rsp_if.ready)
@ -513,7 +528,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
end
if (mem_rsp_fire) begin
`TRACE(2, ("%t: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, pid=%0d, sop=%b, eop=%b, data=",
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, rsp_pid, mem_rsp_sop, mem_rsp_eop))
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt))
`TRACE_ARRAY1D(2, "0x%0h", mem_rsp_data, NUM_LANES)
`TRACE(2, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid))
end

View file

@ -51,9 +51,9 @@ module VX_operands import VX_gpu_pkg::*; #(
// collector selection
reg [`NUM_OPCS-1:0] ready_opcs;
reg [`NUM_OPCS-1:0] select_opcs;
always @(*) begin
ready_opcs = per_opc_scoreboard_ready;
select_opcs = '1;
if (`NUM_OPCS > 1 && SIMD_COUNT > 1) begin
// SFU cannot handle multiple inflight WCTL instructions, always assign them to the same collector
// LD/ST instructions should also be ordered via the same collector
@ -61,12 +61,12 @@ module VX_operands import VX_gpu_pkg::*; #(
&& inst_sfu_is_wctl(scoreboard_if.data.op_type)) begin
// select collector 0
for (int i = 0; i < `NUM_OPCS; ++i) begin
if (i != 0) ready_opcs[i] = 0;
if (i != 0) select_opcs[i] = 0;
end
end else if (scoreboard_if.data.ex_type == EX_LSU) begin
// select collector 1
for (int i = 0; i < `NUM_OPCS; ++i) begin
if (i != 1) ready_opcs[i] = 0;
if (i != 1) select_opcs[i] = 0;
end
end
end
@ -75,6 +75,8 @@ module VX_operands import VX_gpu_pkg::*; #(
wire opc_sel_valid;
wire [`NUM_OPCS-1:0] opc_sel_mask;
wire [`NUM_OPCS-1:0] ready_opcs = select_opcs & per_opc_scoreboard_ready;
VX_priority_encoder #(
.N (`NUM_OPCS)
) opc_sel (
@ -123,12 +125,7 @@ module VX_operands import VX_gpu_pkg::*; #(
);
wire war_dp_check = (opc_pending_regs[writeback_if.data.rd] == 0);
VX_writeback_if writeback_if_s();
assign writeback_if_s.valid = writeback_if.valid && war_dp_check;
assign writeback_if_s.data = writeback_if.data;
assign writeback_if.ready = war_dp_check;
`UNUSED_VAR (writeback_if_s.ready)
`UNUSED_VAR (war_dp_check)
VX_gpr_unit #(
.INSTANCE_ID (`SFORMATF(("%s-gpr", INSTANCE_ID))),
@ -140,7 +137,7 @@ module VX_operands import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.perf_stalls (perf_stalls),
`endif
.writeback_if (writeback_if_s),
.writeback_if (writeback_if),
.gpr_if (per_opc_gpr_if)
);
@ -170,13 +167,13 @@ module VX_operands import VX_gpu_pkg::*; #(
if (reset) begin
timeout_ctr <= '0;
end else begin
if (writeback_if.valid && ~writeback_if.ready) begin
if (writeback_if.valid) begin
`ifdef DBG_TRACE_PIPELINE
`TRACE(4, ("%t: *** %s-stall: wid=%0d, sid=%0d, tmask=%b, PC=0x%0h, cycles=%0d (#%0d)\n",
$time, INSTANCE_ID, wis_to_wid(writeback_if.data.wis, ISSUE_ID), writeback_if.data.sid, writeback_if.data.tmask, {writeback_if.data.PC, 1'b0}, timeout_ctr, writeback_if.data.uuid))
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (writeback_if.valid && writeback_if.ready) begin
end else if (writeback_if.valid) begin
timeout_ctr <= '0;
end
end

View file

@ -27,6 +27,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`endif
VX_writeback_if.slave writeback_if,
VX_operands_if.slave operands_if,
VX_ibuffer_if.slave ibuffer_if [PER_ISSUE_WARPS],
VX_scoreboard_if.master scoreboard_if
);
@ -38,7 +39,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
localparam DATAW = UUID_WIDTH + `NUM_THREADS + PC_BITS + EX_BITS + INST_OP_BITS + INST_ARGS_BITS + NUM_OPDS + (REG_IDX_BITS * NUM_OPDS);
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
reg [PER_ISSUE_WARPS-1:0] operands_ready;
wire [PER_ISSUE_WARPS-1:0] operands_ready;
`ifdef PERF_ENABLE
reg [PER_ISSUE_WARPS-1:0][NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
@ -122,14 +123,19 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_scoreboard
reg [NUM_REGS-1:0] inuse_regs, inuse_regs_n;
wire [NUM_OPDS-1:0] operands_busy;
reg in_use_warp, in_use_warp_n;
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
wire staging_fire = staging_if[w].valid && staging_if[w].ready;
wire writeback_fire = writeback_if.valid && writeback_if.ready
wire writeback_fire = writeback_if.valid
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
&& writeback_if.data.eop;
wire operands_fire = operands_if.valid && operands_if.ready
&& (operands_if.data.wis == ISSUE_WIS_W'(w))
&& operands_if.data.eop;
reg_idx_t [NUM_OPDS-1:0] ibf_opds, stg_opds;
assign ibf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd};
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
@ -174,6 +180,15 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
always @(*) begin
in_use_warp_n = in_use_warp;
if (operands_fire) begin
in_use_warp_n = 0;
end else if (staging_fire) begin
in_use_warp_n = 1;
end
end
wire [REG_TYPES-1:0][31:0] in_use_mask;
for (genvar i = 0; i < REG_TYPES; ++i) begin : g_in_use_mask
wire [31:0] ibf_reg_mask = ibf_opd_mask[0][i] | ibf_opd_mask[1][i] | ibf_opd_mask[2][i] | ibf_opd_mask[3][i];
@ -192,15 +207,21 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
assign operands_busy[i] = (in_use_mask[rtype] & stg_opd_mask[i][rtype]) != 0;
end
reg operands_ready_r;
always @(posedge clk) begin
if (reset) begin
inuse_regs <= '0;
in_use_warp <= 0;
end else begin
inuse_regs <= inuse_regs_n;
in_use_warp <= in_use_warp_n;
end
operands_ready[w] <= ~(| regs_busy);
operands_ready_r <= ~(| regs_busy);
end
assign operands_ready[w] = operands_ready_r && ~in_use_warp;
`ifdef PERF_ENABLE
always @(posedge clk) begin
if (staging_fire && staging_if[w].data.wb) begin

View file

@ -29,18 +29,15 @@ interface VX_writeback_if import VX_gpu_pkg::*; ();
logic valid;
data_t data;
logic ready;
modport master (
output valid,
output data,
input ready
output data
);
modport slave (
input valid,
input data,
output ready
input data
);
endinterface

View file

@ -55,12 +55,10 @@ module VX_mem_scheduler #(
input wire [CORE_REQS-1:0][WORD_WIDTH-1:0] core_req_data,
input wire [TAG_WIDTH-1:0] core_req_tag,
output wire core_req_ready,
output wire [CORE_QUEUE_ADDRW-1:0] core_req_queue_id,
// Core request queue
output wire req_queue_empty,
output wire req_queue_pop,
output wire [CORE_QUEUE_ADDRW-1:0] req_queue_id,
output wire req_queue_rw_notify,
// Core response
output wire core_rsp_valid,
@ -189,13 +187,9 @@ module VX_mem_scheduler #(
// can accept another request?
assign core_req_ready = reqq_ready_in && ibuf_ready;
// return core queue id
assign core_req_queue_id = ibuf_waddr;
// request queue info
assign req_queue_pop = reqq_valid && reqq_ready;
assign req_queue_rw_notify = reqq_valid && reqq_ready && reqq_rw;
assign req_queue_empty = !reqq_valid && ibuf_empty;
assign req_queue_id = reqq_tag[CORE_QUEUE_ADDRW-1:0];
// Index buffer ///////////////////////////////////////////////////////////

View file

@ -107,7 +107,7 @@ module VX_lsu_mem_arb import VX_gpu_pkg::*; #(
wire [NUM_OUTPUTS-1:0][RSP_DATAW-1:0] rsp_data_in;
wire [NUM_OUTPUTS-1:0] rsp_ready_in;
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_rsp_enabled
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_rsp_select
wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS-1:0] rsp_sel_in;
@ -148,7 +148,7 @@ module VX_lsu_mem_arb import VX_gpu_pkg::*; #(
.ready_out (rsp_ready_out)
);
end else begin : g_passthru
end else begin : g_rsp_arb
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;

View file

@ -105,7 +105,7 @@ module VX_mem_arb import VX_gpu_pkg::*; #(
wire [NUM_OUTPUTS-1:0][RSP_DATAW-1:0] rsp_data_in;
wire [NUM_OUTPUTS-1:0] rsp_ready_in;
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_rsp_enabled
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_rsp_select
wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS-1:0] rsp_sel_in;
@ -142,7 +142,7 @@ module VX_mem_arb import VX_gpu_pkg::*; #(
.ready_out (rsp_ready_out)
);
end else begin : g_passthru
end else begin : g_rsp_arb
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;