per-warp ibuffer scoreboard scheduling optimization

This commit is contained in:
Blaise Tine 2024-02-14 14:24:47 -08:00
parent a78ac7a246
commit bb4c150aaf
10 changed files with 173 additions and 92 deletions

View file

@ -269,7 +269,7 @@
// Size of Instruction Buffer
`ifndef IBUF_SIZE
`define IBUF_SIZE (2 * (`NUM_WARPS / `ISSUE_WIDTH))
`define IBUF_SIZE 4
`endif
// Size of LSU Request Queue

View file

@ -67,25 +67,23 @@ module VX_fetch import VX_gpu_pkg::*; #(
`ifndef L1_ENABLE
// Ensure that the ibuffer doesn't fill up.
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request.
// This issue is particularly prevalent when the icache and dcache is disabled and both requests share the same bus.
wire [ISSUE_ISW-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache requests.
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
wire [`NUM_WARPS-1:0] pending_ibuf_full;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_pending_size #(
.SIZE (`IBUF_SIZE)
) pending_reads (
.clk (clk),
.reset (reset),
.incr (icache_req_fire && schedule_isw == i),
.incr (icache_req_fire && schedule_if.data.wid == i),
.decr (fetch_if.ibuf_pop[i]),
.full (pending_ibuf_full[i]),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
);
end
wire ibuf_ready = ~pending_ibuf_full[schedule_isw];
wire ibuf_ready = ~pending_ibuf_full[schedule_if.data.wid];
`else
wire ibuf_ready = 1'b1;
`endif

View file

@ -23,20 +23,16 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
VX_decode_if.slave decode_if,
// outputs
VX_ibuffer_if.master ibuffer_if [`ISSUE_WIDTH]
VX_ibuffer_if.master ibuffer_if [`NUM_WARPS]
);
`UNUSED_PARAM (CORE_ID)
localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH);
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4);
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4);
wire [`ISSUE_WIDTH-1:0] ibuf_ready_in;
wire [ISW_WIDTH-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
wire [`NUM_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_isw];
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
@ -44,11 +40,10 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
) instr_buf (
.clk (clk),
.reset (reset),
.valid_in (decode_if.valid && decode_isw == i),
.valid_in (decode_if.valid && decode_if.data.wid == i),
.ready_in (ibuf_ready_in[i]),
.data_in ({
decode_if.data.uuid,
decode_wis,
decode_if.data.tmask,
decode_if.data.ex_type,
decode_if.data.op_type,
@ -63,7 +58,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
decode_if.data.rs2,
decode_if.data.rs3}),
.data_out(ibuffer_if[i].data),
.valid_out (ibuffer_if[i].valid),
.valid_out(ibuffer_if[i].valid),
.ready_out(ibuffer_if[i].ready)
);
`ifndef L1_ENABLE

View file

@ -36,8 +36,8 @@ module VX_issue #(
`endif
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
);
VX_ibuffer_if ibuffer_if [`ISSUE_WIDTH]();
VX_ibuffer_if scoreboard_if [`ISSUE_WIDTH]();
VX_ibuffer_if ibuffer_if [`NUM_WARPS]();
VX_scoreboard_if scoreboard_if [`ISSUE_WIDTH]();
VX_operands_if operands_if [`ISSUE_WIDTH]();
`RESET_RELAY (ibuf_reset, reset);

View file

@ -21,7 +21,7 @@ module VX_operands import VX_gpu_pkg::*; #(
input wire reset,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
VX_scoreboard_if.slave scoreboard_if [`ISSUE_WIDTH],
VX_operands_if.master operands_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)

View file

@ -26,27 +26,27 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
VX_ibuffer_if.slave ibuffer_if [`NUM_WARPS],
VX_scoreboard_if.master scoreboard_if [`ISSUE_WIDTH]
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
`ifdef PERF_ENABLE
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
reg [`NUM_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
reg [`NUM_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
wire [`NUM_WARPS-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`NUM_WARPS+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.N (`NUM_WARPS),
.OP ("|")
) perf_units_reduce (
.data_in (perf_issue_units_per_cycle),
@ -55,7 +55,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (`ISSUE_WIDTH),
.N (`NUM_WARPS),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_issue_sfu_per_cycle),
@ -93,21 +93,34 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
end
`endif
`endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
`RESET_RELAY (arb_reset, reset);
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
reg [`NUM_WARPS-1:0] operands_ready;
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
reg [`NUM_REGS-1:0] inuse_regs;
reg [3:0] operands_busy;
localparam iw = i % `ISSUE_WIDTH;
localparam wis = i / `ISSUE_WIDTH;
wire writeback_fire = writeback_if[iw].valid
&& (writeback_if[iw].data.wis == ISSUE_WIS_W'(wis))
&& writeback_if[iw].data.eop;
wire inuse_rd = inuse_regs[ibuffer_if[i].data.rd];
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.rs1];
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.rs2];
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.rs3];
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
assign operands_ready[i] = ~(| operands_busy);
`ifdef PERF_ENABLE
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`SFU_WIDTH-1:0] sfu_type;
always @(*) begin
@ -119,32 +132,32 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
endcase
end
always @(*) begin
always @(*) begin
perf_issue_units_per_cycle[i] = '0;
perf_issue_sfu_per_cycle[i] = '0;
if (ibuffer_if[i].valid) begin
if (inuse_rd) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.rd]] = 1;
if (inuse_units[ibuffer_if[i].data.rd] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.rd]] = 1;
end
end
if (inuse_rs1) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.rs1]] = 1;
if (inuse_units[ibuffer_if[i].data.rs1] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.rs1]] = 1;
end
end
if (inuse_rs2) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.rs2]] = 1;
if (inuse_units[ibuffer_if[i].data.rs2] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.rs2]] = 1;
end
end
if (inuse_rs3) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.rs3]] = 1;
if (inuse_units[ibuffer_if[i].data.rs3] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.rs3]] = 1;
end
end
end
@ -152,43 +165,23 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
`endif
wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
wire operands_ready = ~(| operands_busy);
wire stg_valid_in, stg_ready_in;
assign stg_valid_in = ibuffer_if[i].valid && operands_ready;
assign ibuffer_if[i].ready = stg_ready_in && operands_ready;
VX_stream_buffer #(
.DATAW (DATAW)
) staging_buffer (
.clk (clk),
.reset (reset),
.valid_in (stg_valid_in),
.data_in (ibuffer_if[i].data),
.ready_in (stg_ready_in),
.valid_out (scoreboard_if[i].valid),
.data_out (scoreboard_if[i].data),
.ready_out (scoreboard_if[i].ready)
);
always @(posedge clk) begin
if (reset) begin
inuse_regs <= '0;
end else begin
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
inuse_regs[writeback_if[iw].data.rd] <= 0;
end
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
inuse_regs[ibuffer_if[i].data.rd] <= 1;
end
end
`ifdef PERF_ENABLE
if (ibuffer_if[i].valid && ibuffer_if[i].ready && ibuffer_if[i].data.wb) begin
inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= ibuffer_if[i].data.ex_type;
inuse_units[ibuffer_if[i].data.rd] <= ibuffer_if[i].data.ex_type;
if (ibuffer_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= sfu_type;
end
inuse_sfu[ibuffer_if[i].data.rd] <= sfu_type;
end
end
`endif
end
@ -203,7 +196,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
`ifdef DBG_TRACE_CORE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
$time, CORE_ID, i, ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
operands_busy, ibuffer_if[i].data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
@ -215,14 +208,58 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
$time, CORE_ID, i, ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
operands_busy, ibuffer_if[i].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[iw].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
$time, CORE_ID, i, writeback_if[iw].data.PC, writeback_if[iw].data.tmask, writeback_if[iw].data.rd, writeback_if[iw].data.uuid));
`endif
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [ISSUE_RATIO-1:0] valid_in;
wire [ISSUE_RATIO-1:0][DATAW-1:0] data_in;
wire [ISSUE_RATIO-1:0] ready_in;
for (genvar j = 0; j < ISSUE_RATIO; ++j) begin
assign valid_in[j] = ibuffer_if[j * `ISSUE_WIDTH + i].valid && operands_ready[j * `ISSUE_WIDTH + i];
assign data_in[j] = ibuffer_if[j * `ISSUE_WIDTH + i].data;
assign ibuffer_if[j * `ISSUE_WIDTH + i].ready = ready_in[j] && operands_ready[j * `ISSUE_WIDTH + i];
end
VX_stream_arb #(
.NUM_INPUTS (ISSUE_RATIO),
.DATAW (DATAW),
.ARBITER ("F"),
.OUT_REG (2)
) out_arb (
.clk (clk),
.reset (arb_reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
.data_out ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd,
scoreboard_if[i].data.rs1,
scoreboard_if[i].data.rs2,
scoreboard_if[i].data.rs3
}),
.valid_out (scoreboard_if[i].valid),
.ready_out (scoreboard_if[i].ready),
.sel_out (scoreboard_if[i].data.wis)
);
end
endmodule

View file

@ -36,8 +36,8 @@ interface VX_decode_if ();
logic valid;
data_t data;
logic ready;
`ifndef L1_ENABLE
logic [`ISSUE_WIDTH-1:0] ibuf_pop;
`ifndef L1_ENABLE
wire [`NUM_WARPS-1:0] ibuf_pop;
`endif
modport master (

View file

@ -26,8 +26,8 @@ interface VX_fetch_if ();
logic valid;
data_t data;
logic ready;
`ifndef L1_ENABLE
logic [`ISSUE_WIDTH-1:0] ibuf_pop;
`ifndef L1_ENABLE
logic [`NUM_WARPS-1:0] ibuf_pop;
`endif
modport master (

View file

@ -17,7 +17,6 @@ interface VX_ibuffer_if import VX_gpu_pkg::*; ();
typedef struct packed {
logic [`UUID_WIDTH-1:0] uuid;
logic [ISSUE_WIS_W-1:0] wis;
logic [`NUM_THREADS-1:0] tmask;
logic [`EX_BITS-1:0] ex_type;
logic [`INST_OP_BITS-1:0] op_type;

View file

@ -0,0 +1,52 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
interface VX_scoreboard_if import VX_gpu_pkg::*; ();
typedef struct packed {
logic [`UUID_WIDTH-1:0] uuid;
logic [ISSUE_WIS_W-1:0] wis;
logic [`NUM_THREADS-1:0] tmask;
logic [`EX_BITS-1:0] ex_type;
logic [`INST_OP_BITS-1:0] op_type;
logic [`INST_MOD_BITS-1:0] op_mod;
logic wb;
logic use_PC;
logic use_imm;
logic [`XLEN-1:0] PC;
logic [`XLEN-1:0] imm;
logic [`NR_BITS-1:0] rd;
logic [`NR_BITS-1:0] rs1;
logic [`NR_BITS-1:0] rs2;
logic [`NR_BITS-1:0] rs3;
} data_t;
logic valid;
data_t data;
logic ready;
modport master (
output valid,
output data,
input ready
);
modport slave (
input valid,
input data,
output ready
);
endinterface