timing optimization for the TCU
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (cupbop, 32) (push) Blocked by required conditions
CI / tests (cupbop, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (tensor, 32) (push) Blocked by required conditions
CI / tests (tensor, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions

This commit is contained in:
tinebp 2025-06-27 00:00:40 -07:00
parent b06811a201
commit e7b3caf489
10 changed files with 48 additions and 82 deletions

View file

@@ -428,18 +428,6 @@
`define LSUQ_OUT_SIZE `MAX(`LSUQ_IN_SIZE, `LSU_LINE_SIZE / (`XLEN / 8))
`endif
`ifndef LATENCY_IMUL
`ifdef VIVADO
`define LATENCY_IMUL 4
`endif
`ifdef QUARTUS
`define LATENCY_IMUL 3
`endif
`ifndef LATENCY_IMUL
`define LATENCY_IMUL 4
`endif
`endif
// Floating-Point Units ///////////////////////////////////////////////////////
// Size of FPU Request Queue

View file

@@ -160,6 +160,7 @@
`ifdef QUARTUS
`define MAX_FANOUT 8
`define LATENCY_IMUL 3
`define FORCE_BRAM(d,w) (((d) >= 64 || (w) >= 16 || ((d) * (w)) >= 512) && ((d) * (w)) >= 64)
`define USE_BLOCK_BRAM (* ramstyle = "block" *)
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
@@ -171,6 +172,7 @@
`define STRING string
`elsif VIVADO
`define MAX_FANOUT 8
`define LATENCY_IMUL 2
`define FORCE_BRAM(d,w) (((d) >= 64 || (w) >= 16 || ((d) * (w)) >= 512) && ((d) * (w)) >= 64)
`define USE_BLOCK_BRAM (* ram_style = "block" *)
`define USE_FAST_BRAM (* ram_style = "distributed" *)
@@ -185,6 +187,7 @@
`endif
`else
`define MAX_FANOUT 8
`define LATENCY_IMUL 2
`define FORCE_BRAM(d,w) (((d) >= 64 || (w) >= 16 || ((d) * (w)) >= 512) && ((d) * (w)) >= 64)
`define USE_BLOCK_BRAM
`define USE_FAST_BRAM

View file

@@ -27,6 +27,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
VX_result_if.master result_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam IMUL_LATENCY = `LATENCY_IMUL;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam TAG_WIDTH = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NUM_REGS_BITS + 1 + PID_WIDTH + 1 + 1;
@@ -80,7 +81,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
VX_shift_register #(
.DATAW (1 + TAG_WIDTH + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.DEPTH (IMUL_LATENCY),
.RESETW (1)
) mul_shift_reg (
.clk (clk),
@@ -158,7 +159,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
.B_WIDTH (`XLEN+1),
.R_WIDTH (2*(`XLEN+1)),
.SIGNED (1),
.LATENCY (`LATENCY_IMUL)
.LATENCY (IMUL_LATENCY)
) multiplier (
.clk (clk),
.enable (mul_ready_in),
@@ -170,7 +171,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
VX_shift_register #(
.DATAW (1 + TAG_WIDTH + 1 + 1),
.DEPTH (`LATENCY_IMUL),
.DEPTH (IMUL_LATENCY),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
@@ -245,7 +246,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
VX_shift_register #(
.DATAW (1 + TAG_WIDTH + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.DEPTH (IMUL_LATENCY),
.RESETW (1)
) div_shift_reg (
.clk(clk),

View file

@@ -31,7 +31,7 @@ module VX_tcu_fedp_dpi #(
);
localparam TCK = 2 * N;
localparam LEVELS = $clog2(TCK);
localparam FMUL_LATENCY = `LATENCY_IMUL;
localparam FMUL_LATENCY = 2;
localparam FADD_LATENCY = 1;
localparam FRND_LATENCY = 1;
localparam RED_LATENCY = LEVELS * FADD_LATENCY;

View file

@@ -30,7 +30,7 @@ module VX_tcu_fedp_int #(
output wire [`XLEN-1:0] d_val
);
localparam LEVELS = $clog2(N);
localparam MUL_LATENCY = 3;
localparam MUL_LATENCY = 2;
localparam ADD_LATENCY = 1;
localparam RED_LATENCY = LEVELS * ADD_LATENCY;
localparam ACC_LATENCY = RED_LATENCY + ADD_LATENCY;
@@ -39,56 +39,6 @@ module VX_tcu_fedp_int #(
`UNUSED_VAR ({a_row, b_col, c_val});
`UNUSED_VAR (fmt_d);
wire [31:0] prod_i32 [N];
wire [31:0] prod_i16 [N];
wire [31:0] prod_i8 [N];
// multiplication stage
for (genvar i = 0; i < N; i++) begin : g_prod_i32
reg [31:0] prod1, prod2, prod3;
always @(posedge clk) begin
if (enable) begin
prod1 <= $signed(a_row[i][31:0]) * $signed(b_col[i][31:0]);
prod2 <= prod1;
prod3 <= prod2;
end
end
assign prod_i32[i] = prod3;
end
for (genvar i = 0; i < N; i++) begin : g_prod_i16
reg [31:0] prod1_0, prod1_1, prod2_0, prod2_1;
reg [31:0] sum3;
always @(posedge clk) begin
if (enable) begin
prod1_0 <= $signed(a_row[i][15:0]) * $signed(b_col[i][15:0]);
prod1_1 <= $signed(a_row[i][31:16]) * $signed(b_col[i][31:16]);
prod2_0 <= prod1_0;
prod2_1 <= prod1_1;
sum3 <= prod2_0 + prod2_1;
end
end
assign prod_i16[i] = sum3;
end
for (genvar i = 0; i < N; i++) begin : g_prod_i8
reg [16:0] prod1_0, prod1_1, prod1_2, prod1_3;
reg [17:0] sum2_0, sum2_1;
reg [18:0] sum3;
always @(posedge clk) begin
if (enable) begin
prod1_0 <= $signed(a_row[i][7:0]) * $signed(b_col[i][7:0]);
prod1_1 <= $signed(a_row[i][15:8]) * $signed(b_col[i][15:8]);
prod1_2 <= $signed(a_row[i][23:16]) * $signed(b_col[i][23:16]);
prod1_3 <= $signed(a_row[i][31:24]) * $signed(b_col[i][31:24]);
sum2_0 <= prod1_0 + prod1_1;
sum2_1 <= prod1_2 + prod1_3;
sum3 <= sum2_0 + sum2_1;
end
end
assign prod_i8[i] = 32'(sum3);
end
wire [2:0] delayed_fmt_s;
VX_pipe_register #(
.DATAW (3),
@@ -102,13 +52,37 @@ module VX_tcu_fedp_int #(
);
wire [31:0] mult_result [N];
for (genvar i = 0; i < N; i++) begin : g_mul_sel
// multiplication stage
for (genvar i = 0; i < N; i++) begin : g_prod
reg [31:0] prod_i32_1, prod_i32_2;
reg [31:0] prod_i16_1, prod_i16_2;
reg [16:0] prod_i8_1a, prod_i8_1b;
reg [17:0] prod_i8_2;
always @(posedge clk) begin
if (enable) begin
prod_i32_1 <= $signed(a_row[i][31:0]) * $signed(b_col[i][31:0]);
prod_i32_2 <= prod_i32_1;
prod_i16_1 <= ($signed(a_row[i][15:0]) * $signed(b_col[i][15:0]))
+ ($signed(a_row[i][31:16]) * $signed(b_col[i][31:16]));
prod_i16_2 <= prod_i16_1;
prod_i8_1a <= ($signed(a_row[i][7:0]) * $signed(b_col[i][7:0]))
+ ($signed(a_row[i][15:8]) * $signed(b_col[i][15:8]));
prod_i8_1b <= ($signed(a_row[i][23:16]) * $signed(b_col[i][23:16]))
+ ($signed(a_row[i][31:24]) * $signed(b_col[i][31:24]));
prod_i8_2 <= prod_i8_1a + prod_i8_1b;
end
end
reg [31:0] mult_sel;
always @(*) begin
case (delayed_fmt_s)
3'd0: mult_sel = prod_i32[i];
3'd1: mult_sel = prod_i16[i];
3'd2: mult_sel = prod_i8[i];
3'd0: mult_sel = prod_i32_2;
3'd1: mult_sel = prod_i16_2;
3'd2: mult_sel = 32'(prod_i8_2);
default: mult_sel = 'x;
endcase
end
@@ -116,7 +90,6 @@ module VX_tcu_fedp_int #(
end
wire [31:0] red_in [LEVELS+1][N];
for (genvar i = 0; i < N; i++) begin : g_red_inputs
assign red_in[0][i] = mult_result[i];
end

View file

@@ -32,13 +32,13 @@ module VX_tcu_fp import VX_gpu_pkg::*, VX_tcu_pkg::*; #(
localparam MDATA_WIDTH = UUID_WIDTH + NW_WIDTH + PC_BITS + NUM_REGS_BITS;
`ifdef TCU_DPI
localparam FMUL_LATENCY = `LATENCY_IMUL;
localparam FMUL_LATENCY = 2;
localparam FADD_LATENCY = 1;
localparam FRND_LATENCY = 1;
localparam ACC_LATENCY = $clog2(2 * TCU_TC_K) * FADD_LATENCY + FADD_LATENCY;
localparam FEDP_LATENCY = FMUL_LATENCY + ACC_LATENCY + FRND_LATENCY;
`elsif TCU_BHF
localparam FMUL_LATENCY = `LATENCY_IMUL;
localparam FMUL_LATENCY = 2;
localparam FADD_LATENCY = 1;
localparam FRND_LATENCY = 1;
localparam ACC_LATENCY = $clog2(2 * TCU_TC_K) * FADD_LATENCY + FADD_LATENCY;

View file

@@ -29,10 +29,11 @@ module VX_tcu_int import VX_gpu_pkg::*, VX_tcu_pkg::*; #(
);
`UNUSED_SPARAM (INSTANCE_ID);
localparam MDATA_WIDTH = UUID_WIDTH + NW_WIDTH + PC_BITS + NUM_REGS_BITS;
localparam MUL_LATENCY = 3;
localparam ADD_LATENCY = 1;
localparam FEDP_LATENCY = MUL_LATENCY + $clog2(TCU_TC_K) * ADD_LATENCY + ADD_LATENCY;
localparam MDATA_WIDTH = UUID_WIDTH + NW_WIDTH + PC_BITS + NUM_REGS_BITS;
localparam MUL_LATENCY = 2;
localparam ADD_LATENCY = 1;
localparam ACC_LATENCY = $clog2(TCU_TC_K) * ADD_LATENCY + ADD_LATENCY;
localparam FEDP_LATENCY = MUL_LATENCY + ACC_LATENCY;
localparam PIPE_LATENCY = FEDP_LATENCY + 1;
localparam MDATA_QUEUE_DEPTH = 1 << $clog2(PIPE_LATENCY);

View file

@@ -68,7 +68,7 @@ module VX_tcu_unit import VX_gpu_pkg::*; #(
.PE_COUNT (PE_COUNT),
.NUM_LANES (NUM_LANES),
.ARBITER ("R"),
.REQ_OUT_BUF (0),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (3)
) pe_switch (
.clk (clk),

View file

@@ -77,7 +77,7 @@ void AluUnit::tick() {
case MdvType::MULHU:
case MdvType::MULH:
case MdvType::MULHSU:
delay = LATENCY_IMUL+2;
delay = 2;
break;
case MdvType::DIV:
case MdvType::DIVU:

View file

@@ -63,7 +63,7 @@ public:
delay = 1;
break;
case VpuOpType::IMUL:
delay = LATENCY_IMUL;
delay = 2;
break;
case VpuOpType::IDIV:
delay = XLEN;