scoreboard support for multi-length registers

This commit is contained in:
tinebp 2025-02-12 22:38:55 -08:00
parent 3e7f9cf81c
commit 5d4abc48d0
9 changed files with 140 additions and 44 deletions

View file

@ -349,6 +349,10 @@
// Pipeline Configuration /////////////////////////////////////////////////////
`ifndef SIMD_WIDTH
`define SIMD_WIDTH `MAX(`NUM_THREADS, 16)
`endif
// Issue width
`ifndef ISSUE_WIDTH
`define ISSUE_WIDTH `UP(`NUM_WARPS / 8)

View file

@ -20,16 +20,14 @@
///////////////////////////////////////////////////////////////////////////////
`define NW_BITS `CLOG2(`NUM_WARPS)
`define NC_WIDTH `UP(`NC_BITS)
`define NT_BITS `CLOG2(`NUM_THREADS)
`define NW_WIDTH `UP(`NW_BITS)
`define NC_BITS `CLOG2(`NUM_CORES)
`define NT_WIDTH `UP(`NT_BITS)
`define NW_BITS `CLOG2(`NUM_WARPS)
`define NT_BITS `CLOG2(`NUM_THREADS)
`define NB_BITS `CLOG2(`NUM_BARRIERS)
`define NC_WIDTH `UP(`NC_BITS)
`define NW_WIDTH `UP(`NW_BITS)
`define NT_WIDTH `UP(`NT_BITS)
`define NB_WIDTH `UP(`NB_BITS)
`define NUM_IREGS 32
@ -37,13 +35,26 @@
`define NRI_BITS `CLOG2(`NUM_IREGS)
`ifdef EXT_F_ENABLE
`define NUM_REGS (2 * `NUM_IREGS)
`define REG_TYPES 2
`else
`define NUM_REGS `NUM_IREGS
`define REG_TYPES 1
`endif
`define REG_TYPE_BITS `CLOG2(`REG_TYPES)
`define REG_TYPE_WIDTH `UP(`REG_TYPE_BITS)
`define NUM_REGS (`REG_TYPES * `NUM_IREGS)
`define NR_BITS `CLOG2(`NUM_REGS)
`define REG_EXT_VAL(ext, type) 32'h1
//32'((1 << ((type == 1) ? ext[2:0] : ext[1:0]))-1)
`define IREG_EXT_BITS 2
`define FREG_EXT_BITS 3
`define VREG_EXT_BITS 3
`define REG_EXT_BITS `MAX(`MAX(`IREG_EXT_BITS, `FREG_EXT_BITS), `VREG_EXT_BITS)
`define DV_STACK_SIZE `UP(`NUM_THREADS-1)
`define DV_STACK_SIZEW `UP(`CLOG2(`DV_STACK_SIZE))
@ -156,7 +167,6 @@
`define INST_ALU_XOR 4'b1110
`define INST_ALU_SLL 4'b1111
`define ALU_TYPE_BITS 2
`define ALU_TYPE_ARITH 0
`define ALU_TYPE_BRANCH 1

View file

@ -16,14 +16,17 @@
// Decoder helper macros that mark a register operand x as used.
// Each macro expands to three assignments and deliberately omits the final
// ';' so the call site supplies it:
//   x_v   : the register index written to the operand's *_v signal
//   x_ext : the operand's extent field, set to 1 by every variant here
//   use_x : set to 1 to record that the instruction references operand x
`ifdef EXT_F_ENABLE
// EXT_F_ENABLE variant: the index is widened by one leading bit, 1'b0 for USED_IREG.
`define USED_IREG(x) \
x``_v = {1'b0, ``x}; \
x``_ext = 1; \
use_``x = 1
// Same as USED_IREG, but the leading bit is 1'b1.
// NOTE(review): presumably this selects the floating-point register bank - confirm.
`define USED_FREG(x) \
x``_v = {1'b1, ``x}; \
x``_ext = 1; \
use_``x = 1
`else
// Without EXT_F_ENABLE the index is passed through unchanged and no
// USED_FREG macro is defined.
`define USED_IREG(x) \
x``_v = ``x; \
x``_ext = 1; \
use_``x = 1
`endif
@ -41,7 +44,7 @@ module VX_decode import VX_gpu_pkg::*; #(
VX_decode_sched_if.master decode_sched_if
);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4) + (`REG_EXT_BITS * 4);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_VAR (clk)
@ -53,6 +56,7 @@ module VX_decode import VX_gpu_pkg::*; #(
reg [`NR_BITS-1:0] rd_v, rs1_v, rs2_v, rs3_v;
reg use_rd, use_rs1, use_rs2, use_rs3;
reg is_wstall;
reg [`REG_EXT_BITS-1:0] rd_ext, rs1_ext, rs2_ext, rs3_ext;
wire [31:0] instr = fetch_if.data.instr;
wire [6:0] opcode = instr[6:0];
@ -155,6 +159,10 @@ module VX_decode import VX_gpu_pkg::*; #(
ex_type = 'x;
op_type = 'x;
op_args = 'x;
rd_ext = 'x;
rs1_ext = 'x;
rs2_ext = 'x;
rs3_ext = 'x;
rd_v = '0;
rs1_v = '0;
rs2_v = '0;
@ -547,8 +555,8 @@ module VX_decode import VX_gpu_pkg::*; #(
.reset (reset),
.valid_in (fetch_if.valid),
.ready_in (fetch_if.ready),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v, rd_ext, rs1_ext, rs2_ext, rs3_ext}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, decode_if.data.rd_ext, decode_if.data.rs1_ext, decode_if.data.rs2_ext, decode_if.data.rs3_ext}),
.valid_out (decode_if.valid),
.ready_out (decode_if.ready)
);

View file

@ -30,7 +30,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
VX_ibuffer_if.master ibuffer_if [PER_ISSUE_WARPS]
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4);
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + (`REG_EXT_BITS * 4);
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
@ -55,7 +55,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
decode_if.data.rd,
decode_if.data.rs1,
decode_if.data.rs2,
decode_if.data.rs3
decode_if.data.rs3,
decode_if.data.rd_ext,
decode_if.data.rs1_ext,
decode_if.data.rs2_ext,
decode_if.data.rs3_ext
}),
.ready_in (ibuf_ready_in[w]),
.valid_out(ibuffer_if[w].valid),

View file

@ -101,6 +101,11 @@ module VX_operands import VX_gpu_pkg::*; #(
assign req_valid_in = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid;
`UNUSED_VAR (scoreboard_if.data.rd_ext)
`UNUSED_VAR (scoreboard_if.data.rs1_ext)
`UNUSED_VAR (scoreboard_if.data.rs2_ext)
`UNUSED_VAR (scoreboard_if.data.rs3_ext)
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_OPDS),
.NUM_OUTPUTS (NUM_BANKS),

View file

@ -29,10 +29,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
VX_ibuffer_if.slave ibuffer_if [PER_ISSUE_WARPS],
VX_scoreboard_if.master scoreboard_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_OPDS = 3;
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
//`UNUSED_SPARAM (INSTANCE_ID)
//localparam NUM_SRC_OPDS = 3;
//localparam NUM_OPDS = NUM_SRC_OPDS + 1;
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + (`REG_EXT_BITS * 4) + 1;
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
reg [PER_ISSUE_WARPS-1:0] operands_ready;
@ -117,21 +117,55 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_scoreboard
reg [`NUM_REGS-1:0] inuse_regs;
reg [NUM_OPDS-1:0] operands_busy, operands_busy_n;
reg [`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
reg [`REG_TYPES-1:0] operands_busy, operands_busy_n;
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
wire staging_fire = staging_if[w].valid && staging_if[w].ready;
wire writeback_fire = writeback_if.valid
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
&& writeback_if.data.eop;
wire [NUM_OPDS-1:0][`NR_BITS-1:0] ibuf_opds, stg_opds;
assign ibuf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd};
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
wire [`REG_TYPE_WIDTH-1:0] ibf_rs1_type, ibf_rs2_type, ibf_rs3_type, ibf_rd_type;
wire [`REG_TYPES-1:0][31:0] ibf_rs1_mask, ibf_rs2_mask, ibf_rs3_mask, ibf_rd_mask;
wire [`REG_TYPE_WIDTH-1:0] stg_rs1_type, stg_rs2_type, stg_rs3_type, stg_rd_type;
wire [`REG_TYPES-1:0][31:0] stg_rs1_mask, stg_rs2_mask, stg_rs3_mask, stg_rd_mask;
`ifdef EXT_F_ENABLE
assign ibf_rs1_type = ibuffer_if[w].data.rs1[5];
assign ibf_rs2_type = ibuffer_if[w].data.rs2[5];
assign ibf_rs3_type = ibuffer_if[w].data.rs3[5];
assign ibf_rd_type = ibuffer_if[w].data.rd[5];
assign stg_rs1_type = staging_if[w].data.rs1[5];
assign stg_rs2_type = staging_if[w].data.rs2[5];
assign stg_rs3_type = staging_if[w].data.rs3[5];
assign stg_rd_type = staging_if[w].data.rd[5];
`else
assign ibf_rs1_type = 0;
assign ibf_rs2_type = 0;
assign ibf_rs3_type = 0;
assign ibf_rd_type = 0;
assign stg_rs1_type = 0;
assign stg_rs2_type = 0;
assign stg_rs3_type = 0;
assign stg_rd_type = 0;
`endif
for (genvar i = 0; i < `REG_TYPES; ++i) begin : g_opd_masks
assign ibf_rs1_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs1_ext, i) << ibuffer_if[w].data.rs1[4:0]) & {32{ibf_rs1_type == i}};
assign ibf_rs2_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs2_ext, i) << ibuffer_if[w].data.rs2[4:0]) & {32{ibf_rs2_type == i}};
assign ibf_rs3_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs3_ext, i) << ibuffer_if[w].data.rs3[4:0]) & {32{ibf_rs3_type == i}};
assign ibf_rd_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rd_ext, i) << ibuffer_if[w].data.rd[4:0]) & {32{ibf_rd_type == i}};
assign stg_rs1_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs1_ext, i) << staging_if[w].data.rs1[4:0]) & {32{stg_rs1_type == i}};
assign stg_rs2_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs2_ext, i) << staging_if[w].data.rs2[4:0]) & {32{stg_rs2_type == i}};
assign stg_rs3_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs3_ext, i) << staging_if[w].data.rs3[4:0]) & {32{stg_rs3_type == i}};
assign stg_rd_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rd_ext, i) << staging_if[w].data.rd[4:0]) & {32{stg_rd_type == i}};
end
`ifdef PERF_ENABLE
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
@ -151,14 +185,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`endif
for (genvar i = 0; i < NUM_OPDS; ++i) begin : g_operands_busy_n
/*or (genvar i = 0; i < `REG_TYPES; ++i) begin : g_operands_busy_n
wire [31:0] ibf_reg_mask = ibf_rs1_mask[i] | ibf_rs2_mask[i] | ibf_rs3_mask[i] | ibf_rd_mask[i];
wire in_use_check = (inuse_regs_n[i * 32 +: 32] & ibf_reg_mask) != 0;;
wire in_stg_check = staging_fire && staging_if[w].data.wb && ((ibf_reg_mask & stg_rd_mask[i]) != 0);
always @(*) begin
operands_busy_n[i] = operands_busy[i];
if (ibuffer_fire) begin
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 1;
end
operands_busy_n[i] = in_use_check | in_stg_check;
end
if (writeback_fire) begin
if (ibuffer_fire) begin
@ -168,36 +202,51 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end else begin
if (writeback_if.data.rd == stg_opds[i]) begin
operands_busy_n[i] = 0;
end
end
end
end
end
end*/
always @(*) begin
inuse_regs_n = inuse_regs;
if (writeback_fire) begin
inuse_regs_n[writeback_if.data.rd] = 0;
end
if (staging_fire && staging_if[w].data.wb) begin
inuse_regs_n |= stg_rd_mask;
end
end
for (genvar i = 0; i < `REG_TYPES; ++i) begin : g_operands_busy_n
wire [31:0] ibf_reg_mask = ibf_rs1_mask[i] | ibf_rs2_mask[i] | ibf_rs3_mask[i] | ibf_rd_mask[i];
wire [31:0] stg_reg_mask = stg_rs1_mask[i] | stg_rs2_mask[i] | stg_rs3_mask[i] | stg_rd_mask[i];
wire [31:0] reg_mask = ibuffer_fire ? ibf_reg_mask : stg_reg_mask;
assign operands_busy_n[i] = (inuse_regs_n[i * 32 +: 32] & reg_mask) != 0;
end
always @(posedge clk) begin
if (reset) begin
inuse_regs <= '0;
operands_busy <= '0;
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if.data.rd] <= 0;
end
if (staging_fire && staging_if[w].data.wb) begin
inuse_regs[staging_if[w].data.rd] <= 1;
end
inuse_regs <= inuse_regs_n;
operands_busy <= operands_busy_n;
end
end
operands_busy <= operands_busy_n;
operands_ready[w] <= ~(| operands_busy_n);
assign operands_ready[w] = ~(| operands_busy);
`ifdef PERF_ENABLE
`ifdef PERF_ENABLE
always @(posedge clk) begin
if (staging_fire && staging_if[w].data.wb) begin
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
if (staging_if[w].data.ex_type == `EX_SFU) begin
inuse_sfu[staging_if[w].data.rd] <= op_to_sfu_type(staging_if[w].data.op_type);
end
end
`endif
end
`endif
`ifdef SIMULATION
reg [31:0] timeout_ctr;
@ -263,7 +312,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
scoreboard_if.data.rd,
scoreboard_if.data.rs1,
scoreboard_if.data.rs2,
scoreboard_if.data.rs3
scoreboard_if.data.rs3,
scoreboard_if.data.rd_ext,
scoreboard_if.data.rs1_ext,
scoreboard_if.data.rs2_ext,
scoreboard_if.data.rs3_ext
}),
.valid_out (scoreboard_if.valid),
.ready_out (scoreboard_if.ready),

View file

@ -31,6 +31,10 @@ interface VX_decode_if import VX_gpu_pkg::*; #(
logic [`NR_BITS-1:0] rs1;
logic [`NR_BITS-1:0] rs2;
logic [`NR_BITS-1:0] rs3;
logic [`REG_EXT_BITS-1:0] rd_ext;
logic [`REG_EXT_BITS-1:0] rs1_ext;
logic [`REG_EXT_BITS-1:0] rs2_ext;
logic [`REG_EXT_BITS-1:0] rs3_ext;
} data_t;
logic valid;

View file

@ -27,6 +27,10 @@ interface VX_ibuffer_if import VX_gpu_pkg::*; ();
logic [`NR_BITS-1:0] rs1;
logic [`NR_BITS-1:0] rs2;
logic [`NR_BITS-1:0] rs3;
logic [`REG_EXT_BITS-1:0] rd_ext;
logic [`REG_EXT_BITS-1:0] rs1_ext;
logic [`REG_EXT_BITS-1:0] rs2_ext;
logic [`REG_EXT_BITS-1:0] rs3_ext;
} data_t;
logic valid;

View file

@ -28,6 +28,10 @@ interface VX_scoreboard_if import VX_gpu_pkg::*; ();
logic [`NR_BITS-1:0] rs1;
logic [`NR_BITS-1:0] rs2;
logic [`NR_BITS-1:0] rs3;
logic [`REG_EXT_BITS-1:0] rd_ext;
logic [`REG_EXT_BITS-1:0] rs1_ext;
logic [`REG_EXT_BITS-1:0] rs2_ext;
logic [`REG_EXT_BITS-1:0] rs3_ext;
} data_t;
logic valid;