mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
scoreboard timing optimization
This commit is contained in:
parent
d6f1393627
commit
aef1411af5
3 changed files with 58 additions and 104 deletions
|
@ -37,15 +37,15 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
VX_operands_if.master operands_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam NUM_SRC_REGS = 3;
|
||||
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS);
|
||||
localparam NUM_SRC_OPDS = 3;
|
||||
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_OPDS);
|
||||
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
|
||||
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
|
||||
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
|
||||
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
|
||||
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
|
||||
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
|
||||
localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW;
|
||||
localparam DATAW = META_DATAW + NUM_SRC_OPDS * REGS_DATAW;
|
||||
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
|
||||
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
|
||||
localparam XLEN_SIZE = `XLEN / 8;
|
||||
|
@ -53,10 +53,10 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
|
||||
`UNUSED_VAR (writeback_if.data.sop)
|
||||
|
||||
wire [NUM_SRC_REGS-1:0] src_valid;
|
||||
wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready;
|
||||
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
|
||||
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
|
||||
wire [NUM_SRC_OPDS-1:0] src_valid;
|
||||
wire [NUM_SRC_OPDS-1:0] req_in_valid, req_in_ready;
|
||||
wire [NUM_SRC_OPDS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
|
||||
wire [NUM_SRC_OPDS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
|
||||
|
||||
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
|
||||
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
|
||||
|
@ -68,40 +68,39 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
wire pipe_valid_st2, pipe_ready_st2;
|
||||
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
|
||||
|
||||
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
|
||||
wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
|
||||
reg [NUM_SRC_OPDS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
|
||||
wire [NUM_SRC_OPDS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
|
||||
|
||||
reg [NUM_SRC_REGS-1:0] data_fetched_n;
|
||||
wire [NUM_SRC_REGS-1:0] data_fetched_st1;
|
||||
reg [NUM_SRC_OPDS-1:0] data_fetched_n;
|
||||
wire [NUM_SRC_OPDS-1:0] data_fetched_st1;
|
||||
|
||||
reg has_collision_n;
|
||||
wire has_collision_st1;
|
||||
|
||||
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
|
||||
scoreboard_if.data.rs2,
|
||||
scoreboard_if.data.rs1};
|
||||
wire [NUM_SRC_OPDS-1:0][`NR_BITS-1:0] src_opds;
|
||||
assign src_opds = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, scoreboard_if.data.rs1};
|
||||
|
||||
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
|
||||
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin
|
||||
if (ISSUE_WIS != 0) begin
|
||||
assign req_in_data[i] = {src_regs[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
|
||||
assign req_in_data[i] = {src_opds[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
|
||||
end else begin
|
||||
assign req_in_data[i] = src_regs[i][`NR_BITS-1:BANK_SEL_BITS];
|
||||
assign req_in_data[i] = src_opds[i][`NR_BITS-1:BANK_SEL_BITS];
|
||||
end
|
||||
if (NUM_BANKS != 1) begin
|
||||
assign req_bank_idx[i] = src_regs[i][BANK_SEL_BITS-1:0];
|
||||
assign req_bank_idx[i] = src_opds[i][BANK_SEL_BITS-1:0];
|
||||
end else begin
|
||||
assign req_bank_idx[i] = '0;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
|
||||
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i];
|
||||
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin
|
||||
assign src_valid[i] = (src_opds[i] != 0) && ~data_fetched_st1[i];
|
||||
end
|
||||
|
||||
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
|
||||
assign req_in_valid = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid;
|
||||
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_SRC_REGS),
|
||||
.NUM_INPUTS (NUM_SRC_OPDS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (PER_BANK_ADDRW),
|
||||
.ARBITER ("P"), // use priority arbiter
|
||||
|
@ -132,8 +131,8 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
|
||||
always @(*) begin
|
||||
has_collision_n = 0;
|
||||
for (integer i = 0; i < NUM_SRC_REGS; ++i) begin
|
||||
for (integer j = 1; j < (NUM_SRC_REGS-i); ++j) begin
|
||||
for (integer i = 0; i < NUM_SRC_OPDS; ++i) begin
|
||||
for (integer j = 1; j < (NUM_SRC_OPDS-i); ++j) begin
|
||||
has_collision_n |= src_valid[i]
|
||||
&& src_valid[j+i]
|
||||
&& (req_bank_idx[i] == req_bank_idx[j+i]);
|
||||
|
@ -163,8 +162,8 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
};
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
|
||||
.RESETW (1 + NUM_SRC_REGS)
|
||||
.DATAW (1 + NUM_SRC_OPDS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
|
||||
.RESETW (1 + NUM_SRC_OPDS)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -182,8 +181,8 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
`RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
|
||||
.RESETW (1 + NUM_SRC_REGS * REGS_DATAW)
|
||||
.DATAW (1 + NUM_SRC_OPDS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
|
||||
.RESETW (1 + NUM_SRC_OPDS * REGS_DATAW)
|
||||
) pipe_reg2 (
|
||||
.clk (clk),
|
||||
.reset (pipe2_reset),
|
||||
|
|
|
@ -357,7 +357,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH),
|
||||
.SIZE (2), // need a skid buffer to buffer out schedule_ready
|
||||
.SIZE (2), // need to buffer out ready_in
|
||||
.OUT_REG (1) // should be registered for BRAM access in fetch unit
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
|
|
|
@ -30,6 +30,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
VX_scoreboard_if.master scoreboard_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam NUM_SRC_OPDS = 3;
|
||||
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
|
||||
|
||||
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
|
||||
|
@ -100,9 +102,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (1)
|
||||
VX_pipe_buffer #(
|
||||
.DATAW (DATAW)
|
||||
) stanging_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -118,7 +119,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
reg [`NUM_REGS-1:0] inuse_regs;
|
||||
|
||||
reg [3:0] operands_busy, operands_busy_n;
|
||||
reg [NUM_OPDS-1:0] operands_busy, operands_busy_n;
|
||||
|
||||
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
|
||||
|
||||
|
@ -128,6 +129,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
|
||||
&& writeback_if.data.eop;
|
||||
|
||||
wire [NUM_OPDS-1:0][`NR_BITS-1:0] ibuf_opds, stg_opds;
|
||||
assign ibuf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd};
|
||||
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
|
@ -135,29 +140,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
always @(*) begin
|
||||
perf_inuse_units_per_cycle[w] = '0;
|
||||
perf_inuse_sfu_per_cycle[w] = '0;
|
||||
if (staging_if[w].valid) begin
|
||||
if (operands_busy[0]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[1]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[2]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[3]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1;
|
||||
for (integer i = 0; i < NUM_OPDS; ++i) begin
|
||||
if (staging_if[w].valid && operands_busy[i]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[stg_opds[i]]] = 1;
|
||||
if (inuse_units[stg_opds[i]] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[stg_opds[i]]] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -165,56 +152,24 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
always @(*) begin
|
||||
operands_busy_n = operands_busy;
|
||||
if (ibuffer_fire) begin
|
||||
operands_busy_n = {
|
||||
inuse_regs[ibuffer_if[w].data.rs3],
|
||||
inuse_regs[ibuffer_if[w].data.rs2],
|
||||
inuse_regs[ibuffer_if[w].data.rs1],
|
||||
inuse_regs[ibuffer_if[w].data.rd]
|
||||
};
|
||||
end
|
||||
if (writeback_fire) begin
|
||||
for (integer i = 0; i < NUM_OPDS; ++i) begin
|
||||
operands_busy_n[i] = operands_busy[i];
|
||||
if (ibuffer_fire) begin
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin
|
||||
operands_busy_n[0] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin
|
||||
operands_busy_n[1] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin
|
||||
operands_busy_n[2] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin
|
||||
operands_busy_n[3] = 0;
|
||||
end
|
||||
end else begin
|
||||
if (writeback_if.data.rd == staging_if[w].data.rd) begin
|
||||
operands_busy_n[0] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == staging_if[w].data.rs1) begin
|
||||
operands_busy_n[1] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == staging_if[w].data.rs2) begin
|
||||
operands_busy_n[2] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == staging_if[w].data.rs3) begin
|
||||
operands_busy_n[3] = 0;
|
||||
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
|
||||
end
|
||||
if (writeback_fire) begin
|
||||
if (ibuffer_fire) begin
|
||||
if (writeback_if.data.rd == ibuf_opds[i]) begin
|
||||
operands_busy_n[i] = 0;
|
||||
end
|
||||
end else begin
|
||||
if (writeback_if.data.rd == stg_opds[i]) begin
|
||||
operands_busy_n[i] = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin
|
||||
operands_busy_n[0] = 1;
|
||||
end
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin
|
||||
operands_busy_n[1] = 1;
|
||||
end
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin
|
||||
operands_busy_n[2] = 1;
|
||||
end
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin
|
||||
operands_busy_n[3] = 1;
|
||||
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
|
||||
operands_busy_n[i] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -289,7 +244,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
VX_stream_arb #(
|
||||
.NUM_INPUTS (PER_ISSUE_WARPS),
|
||||
.DATAW (DATAW),
|
||||
.ARBITER ("R"),
|
||||
.ARBITER ("C"),
|
||||
.OUT_BUF (3)
|
||||
) out_arb (
|
||||
.clk (clk),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue