mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
scoreboard support for multi-length registers
This commit is contained in:
parent
3e7f9cf81c
commit
5d4abc48d0
9 changed files with 140 additions and 44 deletions
|
@ -349,6 +349,10 @@
|
|||
|
||||
// Pipeline Configuration /////////////////////////////////////////////////////
|
||||
|
||||
`ifndef SIMD_WIDTH
|
||||
`define SIMD_WIDTH `MAX(`NUM_THREADS, 16)
|
||||
`endif
|
||||
|
||||
// Issue width
|
||||
`ifndef ISSUE_WIDTH
|
||||
`define ISSUE_WIDTH `UP(`NUM_WARPS / 8)
|
||||
|
|
|
@ -20,16 +20,14 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define NW_BITS `CLOG2(`NUM_WARPS)
|
||||
`define NC_WIDTH `UP(`NC_BITS)
|
||||
|
||||
`define NT_BITS `CLOG2(`NUM_THREADS)
|
||||
`define NW_WIDTH `UP(`NW_BITS)
|
||||
|
||||
`define NC_BITS `CLOG2(`NUM_CORES)
|
||||
`define NT_WIDTH `UP(`NT_BITS)
|
||||
|
||||
`define NW_BITS `CLOG2(`NUM_WARPS)
|
||||
`define NT_BITS `CLOG2(`NUM_THREADS)
|
||||
`define NB_BITS `CLOG2(`NUM_BARRIERS)
|
||||
|
||||
`define NC_WIDTH `UP(`NC_BITS)
|
||||
`define NW_WIDTH `UP(`NW_BITS)
|
||||
`define NT_WIDTH `UP(`NT_BITS)
|
||||
`define NB_WIDTH `UP(`NB_BITS)
|
||||
|
||||
`define NUM_IREGS 32
|
||||
|
@ -37,13 +35,26 @@
|
|||
`define NRI_BITS `CLOG2(`NUM_IREGS)
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define NUM_REGS (2 * `NUM_IREGS)
|
||||
`define REG_TYPES 2
|
||||
`else
|
||||
`define NUM_REGS `NUM_IREGS
|
||||
`define REG_TYPES 1
|
||||
`endif
|
||||
|
||||
`define REG_TYPE_BITS `CLOG2(`REG_TYPES)
|
||||
`define REG_TYPE_WIDTH `UP(`REG_TYPE_BITS)
|
||||
|
||||
`define NUM_REGS (`REG_TYPES * `NUM_IREGS)
|
||||
|
||||
`define NR_BITS `CLOG2(`NUM_REGS)
|
||||
|
||||
`define REG_EXT_VAL(ext, type) 32'h1
|
||||
//32'((1 << ((type == 1) ? ext[2:0] : ext[1:0]))-1)
|
||||
|
||||
`define IREG_EXT_BITS 2
|
||||
`define FREG_EXT_BITS 3
|
||||
`define VREG_EXT_BITS 3
|
||||
`define REG_EXT_BITS `MAX(`MAX(`IREG_EXT_BITS, `FREG_EXT_BITS), `VREG_EXT_BITS)
|
||||
|
||||
`define DV_STACK_SIZE `UP(`NUM_THREADS-1)
|
||||
`define DV_STACK_SIZEW `UP(`CLOG2(`DV_STACK_SIZE))
|
||||
|
||||
|
@ -156,7 +167,6 @@
|
|||
`define INST_ALU_XOR 4'b1110
|
||||
`define INST_ALU_SLL 4'b1111
|
||||
|
||||
|
||||
`define ALU_TYPE_BITS 2
|
||||
`define ALU_TYPE_ARITH 0
|
||||
`define ALU_TYPE_BRANCH 1
|
||||
|
|
|
@ -16,14 +16,17 @@
|
|||
`ifdef EXT_F_ENABLE
|
||||
`define USED_IREG(x) \
|
||||
x``_v = {1'b0, ``x}; \
|
||||
x``_ext = 1; \
|
||||
use_``x = 1
|
||||
|
||||
`define USED_FREG(x) \
|
||||
x``_v = {1'b1, ``x}; \
|
||||
x``_ext = 1; \
|
||||
use_``x = 1
|
||||
`else
|
||||
`define USED_IREG(x) \
|
||||
x``_v = ``x; \
|
||||
x``_ext = 1; \
|
||||
use_``x = 1
|
||||
`endif
|
||||
|
||||
|
@ -41,7 +44,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
VX_decode_sched_if.master decode_sched_if
|
||||
);
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4);
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4) + (`REG_EXT_BITS * 4);
|
||||
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
`UNUSED_VAR (clk)
|
||||
|
@ -53,6 +56,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
reg [`NR_BITS-1:0] rd_v, rs1_v, rs2_v, rs3_v;
|
||||
reg use_rd, use_rs1, use_rs2, use_rs3;
|
||||
reg is_wstall;
|
||||
reg [`REG_EXT_BITS-1:0] rd_ext, rs1_ext, rs2_ext, rs3_ext;
|
||||
|
||||
wire [31:0] instr = fetch_if.data.instr;
|
||||
wire [6:0] opcode = instr[6:0];
|
||||
|
@ -155,6 +159,10 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
ex_type = 'x;
|
||||
op_type = 'x;
|
||||
op_args = 'x;
|
||||
rd_ext = 'x;
|
||||
rs1_ext = 'x;
|
||||
rs2_ext = 'x;
|
||||
rs3_ext = 'x;
|
||||
rd_v = '0;
|
||||
rs1_v = '0;
|
||||
rs2_v = '0;
|
||||
|
@ -547,8 +555,8 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
.valid_in (fetch_if.valid),
|
||||
.ready_in (fetch_if.ready),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v}),
|
||||
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v, rd_ext, rs1_ext, rs2_ext, rs3_ext}),
|
||||
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, decode_if.data.rd_ext, decode_if.data.rs1_ext, decode_if.data.rs2_ext, decode_if.data.rs3_ext}),
|
||||
.valid_out (decode_if.valid),
|
||||
.ready_out (decode_if.ready)
|
||||
);
|
||||
|
|
|
@ -30,7 +30,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
VX_ibuffer_if.master ibuffer_if [PER_ISSUE_WARPS]
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4);
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + (`REG_EXT_BITS * 4);
|
||||
|
||||
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
|
||||
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
|
||||
|
@ -55,7 +55,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
decode_if.data.rd,
|
||||
decode_if.data.rs1,
|
||||
decode_if.data.rs2,
|
||||
decode_if.data.rs3
|
||||
decode_if.data.rs3,
|
||||
decode_if.data.rd_ext,
|
||||
decode_if.data.rs1_ext,
|
||||
decode_if.data.rs2_ext,
|
||||
decode_if.data.rs3_ext
|
||||
}),
|
||||
.ready_in (ibuf_ready_in[w]),
|
||||
.valid_out(ibuffer_if[w].valid),
|
||||
|
|
|
@ -101,6 +101,11 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
|
||||
assign req_valid_in = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid;
|
||||
|
||||
`UNUSED_VAR (scoreboard_if.data.rd_ext)
|
||||
`UNUSED_VAR (scoreboard_if.data.rs1_ext)
|
||||
`UNUSED_VAR (scoreboard_if.data.rs2_ext)
|
||||
`UNUSED_VAR (scoreboard_if.data.rs3_ext)
|
||||
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_SRC_OPDS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
|
|
|
@ -29,10 +29,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
VX_ibuffer_if.slave ibuffer_if [PER_ISSUE_WARPS],
|
||||
VX_scoreboard_if.master scoreboard_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam NUM_SRC_OPDS = 3;
|
||||
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
|
||||
//`UNUSED_SPARAM (INSTANCE_ID)
|
||||
//localparam NUM_SRC_OPDS = 3;
|
||||
//localparam NUM_OPDS = NUM_SRC_OPDS + 1;
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + (`REG_EXT_BITS * 4) + 1;
|
||||
|
||||
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
|
||||
reg [PER_ISSUE_WARPS-1:0] operands_ready;
|
||||
|
@ -117,21 +117,55 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_scoreboard
|
||||
reg [`NUM_REGS-1:0] inuse_regs;
|
||||
|
||||
reg [NUM_OPDS-1:0] operands_busy, operands_busy_n;
|
||||
reg [`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
||||
reg [`REG_TYPES-1:0] operands_busy, operands_busy_n;
|
||||
|
||||
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
|
||||
|
||||
wire staging_fire = staging_if[w].valid && staging_if[w].ready;
|
||||
|
||||
wire writeback_fire = writeback_if.valid
|
||||
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
|
||||
&& writeback_if.data.eop;
|
||||
|
||||
wire [NUM_OPDS-1:0][`NR_BITS-1:0] ibuf_opds, stg_opds;
|
||||
assign ibuf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd};
|
||||
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
|
||||
wire [`REG_TYPE_WIDTH-1:0] ibf_rs1_type, ibf_rs2_type, ibf_rs3_type, ibf_rd_type;
|
||||
wire [`REG_TYPES-1:0][31:0] ibf_rs1_mask, ibf_rs2_mask, ibf_rs3_mask, ibf_rd_mask;
|
||||
|
||||
wire [`REG_TYPE_WIDTH-1:0] stg_rs1_type, stg_rs2_type, stg_rs3_type, stg_rd_type;
|
||||
wire [`REG_TYPES-1:0][31:0] stg_rs1_mask, stg_rs2_mask, stg_rs3_mask, stg_rd_mask;
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign ibf_rs1_type = ibuffer_if[w].data.rs1[5];
|
||||
assign ibf_rs2_type = ibuffer_if[w].data.rs2[5];
|
||||
assign ibf_rs3_type = ibuffer_if[w].data.rs3[5];
|
||||
assign ibf_rd_type = ibuffer_if[w].data.rd[5];
|
||||
|
||||
assign stg_rs1_type = staging_if[w].data.rs1[5];
|
||||
assign stg_rs2_type = staging_if[w].data.rs2[5];
|
||||
assign stg_rs3_type = staging_if[w].data.rs3[5];
|
||||
assign stg_rd_type = staging_if[w].data.rd[5];
|
||||
`else
|
||||
assign ibf_rs1_type = 0;
|
||||
assign ibf_rs2_type = 0;
|
||||
assign ibf_rs3_type = 0;
|
||||
assign ibf_rd_type = 0;
|
||||
|
||||
assign stg_rs1_type = 0;
|
||||
assign stg_rs2_type = 0;
|
||||
assign stg_rs3_type = 0;
|
||||
assign stg_rd_type = 0;
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `REG_TYPES; ++i) begin : g_opd_masks
|
||||
assign ibf_rs1_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs1_ext, i) << ibuffer_if[w].data.rs1[4:0]) & {32{ibf_rs1_type == i}};
|
||||
assign ibf_rs2_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs2_ext, i) << ibuffer_if[w].data.rs2[4:0]) & {32{ibf_rs2_type == i}};
|
||||
assign ibf_rs3_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs3_ext, i) << ibuffer_if[w].data.rs3[4:0]) & {32{ibf_rs3_type == i}};
|
||||
assign ibf_rd_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rd_ext, i) << ibuffer_if[w].data.rd[4:0]) & {32{ibf_rd_type == i}};
|
||||
|
||||
assign stg_rs1_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs1_ext, i) << staging_if[w].data.rs1[4:0]) & {32{stg_rs1_type == i}};
|
||||
assign stg_rs2_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs2_ext, i) << staging_if[w].data.rs2[4:0]) & {32{stg_rs2_type == i}};
|
||||
assign stg_rs3_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs3_ext, i) << staging_if[w].data.rs3[4:0]) & {32{stg_rs3_type == i}};
|
||||
assign stg_rd_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rd_ext, i) << staging_if[w].data.rd[4:0]) & {32{stg_rd_type == i}};
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
|
@ -151,14 +185,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < NUM_OPDS; ++i) begin : g_operands_busy_n
|
||||
/*or (genvar i = 0; i < `REG_TYPES; ++i) begin : g_operands_busy_n
|
||||
wire [31:0] ibf_reg_mask = ibf_rs1_mask[i] | ibf_rs2_mask[i] | ibf_rs3_mask[i] | ibf_rd_mask[i];
|
||||
wire in_use_check = (inuse_regs_n[i * 32 +: 32] & ibf_reg_mask) != 0;;
|
||||
wire in_stg_check = staging_fire && staging_if[w].data.wb && ((ibf_reg_mask & stg_rd_mask[i]) != 0);
|
||||
always @(*) begin
|
||||
operands_busy_n[i] = operands_busy[i];
|
||||
if (ibuffer_fire) begin
|
||||
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
|
||||
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
|
||||
operands_busy_n[i] = 1;
|
||||
end
|
||||
operands_busy_n[i] = in_use_check | in_stg_check;
|
||||
end
|
||||
if (writeback_fire) begin
|
||||
if (ibuffer_fire) begin
|
||||
|
@ -168,36 +202,51 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end else begin
|
||||
if (writeback_if.data.rd == stg_opds[i]) begin
|
||||
operands_busy_n[i] = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end*/
|
||||
|
||||
always @(*) begin
|
||||
inuse_regs_n = inuse_regs;
|
||||
if (writeback_fire) begin
|
||||
inuse_regs_n[writeback_if.data.rd] = 0;
|
||||
end
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
inuse_regs_n |= stg_rd_mask;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `REG_TYPES; ++i) begin : g_operands_busy_n
|
||||
wire [31:0] ibf_reg_mask = ibf_rs1_mask[i] | ibf_rs2_mask[i] | ibf_rs3_mask[i] | ibf_rd_mask[i];
|
||||
wire [31:0] stg_reg_mask = stg_rs1_mask[i] | stg_rs2_mask[i] | stg_rs3_mask[i] | stg_rd_mask[i];
|
||||
wire [31:0] reg_mask = ibuffer_fire ? ibf_reg_mask : stg_reg_mask;
|
||||
assign operands_busy_n[i] = (inuse_regs_n[i * 32 +: 32] & reg_mask) != 0;
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
inuse_regs <= '0;
|
||||
operands_busy <= '0;
|
||||
end else begin
|
||||
if (writeback_fire) begin
|
||||
inuse_regs[writeback_if.data.rd] <= 0;
|
||||
end
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
inuse_regs[staging_if[w].data.rd] <= 1;
|
||||
end
|
||||
inuse_regs <= inuse_regs_n;
|
||||
operands_busy <= operands_busy_n;
|
||||
end
|
||||
end
|
||||
|
||||
operands_busy <= operands_busy_n;
|
||||
operands_ready[w] <= ~(| operands_busy_n);
|
||||
assign operands_ready[w] = ~(| operands_busy);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef PERF_ENABLE
|
||||
always @(posedge clk) begin
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
|
||||
if (staging_if[w].data.ex_type == `EX_SFU) begin
|
||||
inuse_sfu[staging_if[w].data.rd] <= op_to_sfu_type(staging_if[w].data.op_type);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef SIMULATION
|
||||
reg [31:0] timeout_ctr;
|
||||
|
@ -263,7 +312,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
scoreboard_if.data.rd,
|
||||
scoreboard_if.data.rs1,
|
||||
scoreboard_if.data.rs2,
|
||||
scoreboard_if.data.rs3
|
||||
scoreboard_if.data.rs3,
|
||||
scoreboard_if.data.rd_ext,
|
||||
scoreboard_if.data.rs1_ext,
|
||||
scoreboard_if.data.rs2_ext,
|
||||
scoreboard_if.data.rs3_ext
|
||||
}),
|
||||
.valid_out (scoreboard_if.valid),
|
||||
.ready_out (scoreboard_if.ready),
|
||||
|
|
|
@ -31,6 +31,10 @@ interface VX_decode_if import VX_gpu_pkg::*; #(
|
|||
logic [`NR_BITS-1:0] rs1;
|
||||
logic [`NR_BITS-1:0] rs2;
|
||||
logic [`NR_BITS-1:0] rs3;
|
||||
logic [`REG_EXT_BITS-1:0] rd_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs1_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs2_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs3_ext;
|
||||
} data_t;
|
||||
|
||||
logic valid;
|
||||
|
|
|
@ -27,6 +27,10 @@ interface VX_ibuffer_if import VX_gpu_pkg::*; ();
|
|||
logic [`NR_BITS-1:0] rs1;
|
||||
logic [`NR_BITS-1:0] rs2;
|
||||
logic [`NR_BITS-1:0] rs3;
|
||||
logic [`REG_EXT_BITS-1:0] rd_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs1_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs2_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs3_ext;
|
||||
} data_t;
|
||||
|
||||
logic valid;
|
||||
|
|
|
@ -28,6 +28,10 @@ interface VX_scoreboard_if import VX_gpu_pkg::*; ();
|
|||
logic [`NR_BITS-1:0] rs1;
|
||||
logic [`NR_BITS-1:0] rs2;
|
||||
logic [`NR_BITS-1:0] rs3;
|
||||
logic [`REG_EXT_BITS-1:0] rd_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs1_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs2_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs3_ext;
|
||||
} data_t;
|
||||
|
||||
logic valid;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue