scoreboard timing optimization

This commit is contained in:
Blaise Tine 2024-08-13 21:38:33 -07:00
parent d6f1393627
commit aef1411af5
3 changed files with 58 additions and 104 deletions

View file

@ -37,15 +37,15 @@ module VX_operands import VX_gpu_pkg::*; #(
VX_operands_if.master operands_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_REGS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS);
localparam NUM_SRC_OPDS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_OPDS);
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW;
localparam DATAW = META_DATAW + NUM_SRC_OPDS * REGS_DATAW;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
@ -53,10 +53,10 @@ module VX_operands import VX_gpu_pkg::*; #(
`UNUSED_VAR (writeback_if.data.sop)
wire [NUM_SRC_REGS-1:0] src_valid;
wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready;
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_SRC_OPDS-1:0] src_valid;
wire [NUM_SRC_OPDS-1:0] req_in_valid, req_in_ready;
wire [NUM_SRC_OPDS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
wire [NUM_SRC_OPDS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
@ -68,40 +68,39 @@ module VX_operands import VX_gpu_pkg::*; #(
wire pipe_valid_st2, pipe_ready_st2;
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
reg [NUM_SRC_OPDS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
wire [NUM_SRC_OPDS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
reg [NUM_SRC_REGS-1:0] data_fetched_n;
wire [NUM_SRC_REGS-1:0] data_fetched_st1;
reg [NUM_SRC_OPDS-1:0] data_fetched_n;
wire [NUM_SRC_OPDS-1:0] data_fetched_st1;
reg has_collision_n;
wire has_collision_st1;
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
scoreboard_if.data.rs2,
scoreboard_if.data.rs1};
wire [NUM_SRC_OPDS-1:0][`NR_BITS-1:0] src_opds;
assign src_opds = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, scoreboard_if.data.rs1};
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin
if (ISSUE_WIS != 0) begin
assign req_in_data[i] = {src_regs[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
assign req_in_data[i] = {src_opds[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
end else begin
assign req_in_data[i] = src_regs[i][`NR_BITS-1:BANK_SEL_BITS];
assign req_in_data[i] = src_opds[i][`NR_BITS-1:BANK_SEL_BITS];
end
if (NUM_BANKS != 1) begin
assign req_bank_idx[i] = src_regs[i][BANK_SEL_BITS-1:0];
assign req_bank_idx[i] = src_opds[i][BANK_SEL_BITS-1:0];
end else begin
assign req_bank_idx[i] = '0;
end
end
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i];
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin
assign src_valid[i] = (src_opds[i] != 0) && ~data_fetched_st1[i];
end
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
assign req_in_valid = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid;
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_REGS),
.NUM_INPUTS (NUM_SRC_OPDS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter
@ -132,8 +131,8 @@ module VX_operands import VX_gpu_pkg::*; #(
always @(*) begin
has_collision_n = 0;
for (integer i = 0; i < NUM_SRC_REGS; ++i) begin
for (integer j = 1; j < (NUM_SRC_REGS-i); ++j) begin
for (integer i = 0; i < NUM_SRC_OPDS; ++i) begin
for (integer j = 1; j < (NUM_SRC_OPDS-i); ++j) begin
has_collision_n |= src_valid[i]
&& src_valid[j+i]
&& (req_bank_idx[i] == req_bank_idx[j+i]);
@ -163,8 +162,8 @@ module VX_operands import VX_gpu_pkg::*; #(
};
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
.RESETW (1 + NUM_SRC_REGS)
.DATAW (1 + NUM_SRC_OPDS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
.RESETW (1 + NUM_SRC_OPDS)
) pipe_reg1 (
.clk (clk),
.reset (reset),
@ -182,8 +181,8 @@ module VX_operands import VX_gpu_pkg::*; #(
`RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
.RESETW (1 + NUM_SRC_REGS * REGS_DATAW)
.DATAW (1 + NUM_SRC_OPDS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
.RESETW (1 + NUM_SRC_OPDS * REGS_DATAW)
) pipe_reg2 (
.clk (clk),
.reset (pipe2_reset),

View file

@ -357,7 +357,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH),
.SIZE (2), // need a skid buffer to buffer out schedule_ready
.SIZE (2), // need to buffer out ready_in
.OUT_REG (1) // should be registered for BRAM access in fetch unit
) out_buf (
.clk (clk),

View file

@ -30,6 +30,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
VX_scoreboard_if.master scoreboard_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_OPDS = 3;
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
@ -100,9 +102,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`endif
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (1)
VX_pipe_buffer #(
.DATAW (DATAW)
) stanging_buf (
.clk (clk),
.reset (reset),
@ -118,7 +119,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
reg [`NUM_REGS-1:0] inuse_regs;
reg [3:0] operands_busy, operands_busy_n;
reg [NUM_OPDS-1:0] operands_busy, operands_busy_n;
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
@ -128,6 +129,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
&& writeback_if.data.eop;
wire [NUM_OPDS-1:0][`NR_BITS-1:0] ibuf_opds, stg_opds;
assign ibuf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd};
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
`ifdef PERF_ENABLE
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
@ -135,29 +140,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
always @(*) begin
perf_inuse_units_per_cycle[w] = '0;
perf_inuse_sfu_per_cycle[w] = '0;
if (staging_if[w].valid) begin
if (operands_busy[0]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1;
if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1;
end
end
if (operands_busy[1]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1;
if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1;
end
end
if (operands_busy[2]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1;
if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1;
end
end
if (operands_busy[3]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1;
if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1;
for (integer i = 0; i < NUM_OPDS; ++i) begin
if (staging_if[w].valid && operands_busy[i]) begin
perf_inuse_units_per_cycle[w][inuse_units[stg_opds[i]]] = 1;
if (inuse_units[stg_opds[i]] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[stg_opds[i]]] = 1;
end
end
end
@ -165,56 +152,24 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`endif
always @(*) begin
operands_busy_n = operands_busy;
if (ibuffer_fire) begin
operands_busy_n = {
inuse_regs[ibuffer_if[w].data.rs3],
inuse_regs[ibuffer_if[w].data.rs2],
inuse_regs[ibuffer_if[w].data.rs1],
inuse_regs[ibuffer_if[w].data.rd]
};
end
if (writeback_fire) begin
for (integer i = 0; i < NUM_OPDS; ++i) begin
operands_busy_n[i] = operands_busy[i];
if (ibuffer_fire) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end else begin
if (writeback_if.data.rd == staging_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs3) begin
operands_busy_n[3] = 0;
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
end
if (writeback_fire) begin
if (ibuffer_fire) begin
if (writeback_if.data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 0;
end
end else begin
if (writeback_if.data.rd == stg_opds[i]) begin
operands_busy_n[i] = 0;
end
end
end
end
if (staging_fire && staging_if[w].data.wb) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 1;
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 1;
end
end
end
@ -289,7 +244,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (PER_ISSUE_WARPS),
.DATAW (DATAW),
.ARBITER ("R"),
.ARBITER ("C"),
.OUT_BUF (3)
) out_arb (
.clk (clk),