mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-06-27 17:01:10 -04:00
timing optimization for the TCU
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (cupbop, 32) (push) Blocked by required conditions
CI / tests (cupbop, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (tensor, 32) (push) Blocked by required conditions
CI / tests (tensor, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (cupbop, 32) (push) Blocked by required conditions
CI / tests (cupbop, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (tensor, 32) (push) Blocked by required conditions
CI / tests (tensor, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
This commit is contained in:
parent
b06811a201
commit
e7b3caf489
10 changed files with 48 additions and 82 deletions
|
@@ -428,18 +428,6 @@
|
|||
`define LSUQ_OUT_SIZE `MAX(`LSUQ_IN_SIZE, `LSU_LINE_SIZE / (`XLEN / 8))
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_IMUL
|
||||
`ifdef VIVADO
|
||||
`define LATENCY_IMUL 4
|
||||
`endif
|
||||
`ifdef QUARTUS
|
||||
`define LATENCY_IMUL 3
|
||||
`endif
|
||||
`ifndef LATENCY_IMUL
|
||||
`define LATENCY_IMUL 4
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// Floating-Point Units ///////////////////////////////////////////////////////
|
||||
|
||||
// Size of FPU Request Queue
|
||||
|
|
|
@@ -160,6 +160,7 @@
|
|||
|
||||
`ifdef QUARTUS
|
||||
`define MAX_FANOUT 8
|
||||
`define LATENCY_IMUL 3
|
||||
`define FORCE_BRAM(d,w) (((d) >= 64 || (w) >= 16 || ((d) * (w)) >= 512) && ((d) * (w)) >= 64)
|
||||
`define USE_BLOCK_BRAM (* ramstyle = "block" *)
|
||||
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
|
||||
|
@@ -171,6 +172,7 @@
|
|||
`define STRING string
|
||||
`elsif VIVADO
|
||||
`define MAX_FANOUT 8
|
||||
`define LATENCY_IMUL 2
|
||||
`define FORCE_BRAM(d,w) (((d) >= 64 || (w) >= 16 || ((d) * (w)) >= 512) && ((d) * (w)) >= 64)
|
||||
`define USE_BLOCK_BRAM (* ram_style = "block" *)
|
||||
`define USE_FAST_BRAM (* ram_style = "distributed" *)
|
||||
|
@@ -185,6 +187,7 @@
|
|||
`endif
|
||||
`else
|
||||
`define MAX_FANOUT 8
|
||||
`define LATENCY_IMUL 2
|
||||
`define FORCE_BRAM(d,w) (((d) >= 64 || (w) >= 16 || ((d) * (w)) >= 512) && ((d) * (w)) >= 64)
|
||||
`define USE_BLOCK_BRAM
|
||||
`define USE_FAST_BRAM
|
||||
|
|
|
@@ -27,6 +27,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
|
|||
VX_result_if.master result_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam IMUL_LATENCY = `LATENCY_IMUL;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam TAG_WIDTH = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NUM_REGS_BITS + 1 + PID_WIDTH + 1 + 1;
|
||||
|
@@ -80,7 +81,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAG_WIDTH + (NUM_LANES * `XLEN)),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.DEPTH (IMUL_LATENCY),
|
||||
.RESETW (1)
|
||||
) mul_shift_reg (
|
||||
.clk (clk),
|
||||
|
@@ -158,7 +159,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
|
|||
.B_WIDTH (`XLEN+1),
|
||||
.R_WIDTH (2*(`XLEN+1)),
|
||||
.SIGNED (1),
|
||||
.LATENCY (`LATENCY_IMUL)
|
||||
.LATENCY (IMUL_LATENCY)
|
||||
) multiplier (
|
||||
.clk (clk),
|
||||
.enable (mul_ready_in),
|
||||
|
@@ -170,7 +171,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAG_WIDTH + 1 + 1),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.DEPTH (IMUL_LATENCY),
|
||||
.RESETW (1)
|
||||
) mul_shift_reg (
|
||||
.clk(clk),
|
||||
|
@@ -245,7 +246,7 @@ module VX_alu_muldiv import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAG_WIDTH + (NUM_LANES * `XLEN)),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.DEPTH (IMUL_LATENCY),
|
||||
.RESETW (1)
|
||||
) div_shift_reg (
|
||||
.clk(clk),
|
||||
|
|
|
@@ -31,7 +31,7 @@ module VX_tcu_fedp_dpi #(
|
|||
);
|
||||
localparam TCK = 2 * N;
|
||||
localparam LEVELS = $clog2(TCK);
|
||||
localparam FMUL_LATENCY = `LATENCY_IMUL;
|
||||
localparam FMUL_LATENCY = 2;
|
||||
localparam FADD_LATENCY = 1;
|
||||
localparam FRND_LATENCY = 1;
|
||||
localparam RED_LATENCY = LEVELS * FADD_LATENCY;
|
||||
|
|
|
@@ -30,7 +30,7 @@ module VX_tcu_fedp_int #(
|
|||
output wire [`XLEN-1:0] d_val
|
||||
);
|
||||
localparam LEVELS = $clog2(N);
|
||||
localparam MUL_LATENCY = 3;
|
||||
localparam MUL_LATENCY = 2;
|
||||
localparam ADD_LATENCY = 1;
|
||||
localparam RED_LATENCY = LEVELS * ADD_LATENCY;
|
||||
localparam ACC_LATENCY = RED_LATENCY + ADD_LATENCY;
|
||||
|
@@ -39,56 +39,6 @@ module VX_tcu_fedp_int #(
|
|||
`UNUSED_VAR ({a_row, b_col, c_val});
|
||||
`UNUSED_VAR (fmt_d);
|
||||
|
||||
wire [31:0] prod_i32 [N];
|
||||
wire [31:0] prod_i16 [N];
|
||||
wire [31:0] prod_i8 [N];
|
||||
|
||||
// multiplication stage
|
||||
for (genvar i = 0; i < N; i++) begin : g_prod_i32
|
||||
reg [31:0] prod1, prod2, prod3;
|
||||
always @(posedge clk) begin
|
||||
if (enable) begin
|
||||
prod1 <= $signed(a_row[i][31:0]) * $signed(b_col[i][31:0]);
|
||||
prod2 <= prod1;
|
||||
prod3 <= prod2;
|
||||
end
|
||||
end
|
||||
assign prod_i32[i] = prod3;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < N; i++) begin : g_prod_i16
|
||||
reg [31:0] prod1_0, prod1_1, prod2_0, prod2_1;
|
||||
reg [31:0] sum3;
|
||||
always @(posedge clk) begin
|
||||
if (enable) begin
|
||||
prod1_0 <= $signed(a_row[i][15:0]) * $signed(b_col[i][15:0]);
|
||||
prod1_1 <= $signed(a_row[i][31:16]) * $signed(b_col[i][31:16]);
|
||||
prod2_0 <= prod1_0;
|
||||
prod2_1 <= prod1_1;
|
||||
sum3 <= prod2_0 + prod2_1;
|
||||
end
|
||||
end
|
||||
assign prod_i16[i] = sum3;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < N; i++) begin : g_prod_i8
|
||||
reg [16:0] prod1_0, prod1_1, prod1_2, prod1_3;
|
||||
reg [17:0] sum2_0, sum2_1;
|
||||
reg [18:0] sum3;
|
||||
always @(posedge clk) begin
|
||||
if (enable) begin
|
||||
prod1_0 <= $signed(a_row[i][7:0]) * $signed(b_col[i][7:0]);
|
||||
prod1_1 <= $signed(a_row[i][15:8]) * $signed(b_col[i][15:8]);
|
||||
prod1_2 <= $signed(a_row[i][23:16]) * $signed(b_col[i][23:16]);
|
||||
prod1_3 <= $signed(a_row[i][31:24]) * $signed(b_col[i][31:24]);
|
||||
sum2_0 <= prod1_0 + prod1_1;
|
||||
sum2_1 <= prod1_2 + prod1_3;
|
||||
sum3 <= sum2_0 + sum2_1;
|
||||
end
|
||||
end
|
||||
assign prod_i8[i] = 32'(sum3);
|
||||
end
|
||||
|
||||
wire [2:0] delayed_fmt_s;
|
||||
VX_pipe_register #(
|
||||
.DATAW (3),
|
||||
|
@@ -102,13 +52,37 @@ module VX_tcu_fedp_int #(
|
|||
);
|
||||
|
||||
wire [31:0] mult_result [N];
|
||||
for (genvar i = 0; i < N; i++) begin : g_mul_sel
|
||||
|
||||
// multiplication stage
|
||||
for (genvar i = 0; i < N; i++) begin : g_prod
|
||||
reg [31:0] prod_i32_1, prod_i32_2;
|
||||
reg [31:0] prod_i16_1, prod_i16_2;
|
||||
reg [16:0] prod_i8_1a, prod_i8_1b;
|
||||
reg [17:0] prod_i8_2;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (enable) begin
|
||||
prod_i32_1 <= $signed(a_row[i][31:0]) * $signed(b_col[i][31:0]);
|
||||
prod_i32_2 <= prod_i32_1;
|
||||
|
||||
prod_i16_1 <= ($signed(a_row[i][15:0]) * $signed(b_col[i][15:0]))
|
||||
+ ($signed(a_row[i][31:16]) * $signed(b_col[i][31:16]));
|
||||
prod_i16_2 <= prod_i16_1;
|
||||
|
||||
prod_i8_1a <= ($signed(a_row[i][7:0]) * $signed(b_col[i][7:0]))
|
||||
+ ($signed(a_row[i][15:8]) * $signed(b_col[i][15:8]));
|
||||
prod_i8_1b <= ($signed(a_row[i][23:16]) * $signed(b_col[i][23:16]))
|
||||
+ ($signed(a_row[i][31:24]) * $signed(b_col[i][31:24]));
|
||||
prod_i8_2 <= prod_i8_1a + prod_i8_1b;
|
||||
end
|
||||
end
|
||||
|
||||
reg [31:0] mult_sel;
|
||||
always @(*) begin
|
||||
case (delayed_fmt_s)
|
||||
3'd0: mult_sel = prod_i32[i];
|
||||
3'd1: mult_sel = prod_i16[i];
|
||||
3'd2: mult_sel = prod_i8[i];
|
||||
3'd0: mult_sel = prod_i32_2;
|
||||
3'd1: mult_sel = prod_i16_2;
|
||||
3'd2: mult_sel = 32'(prod_i8_2);
|
||||
default: mult_sel = 'x;
|
||||
endcase
|
||||
end
|
||||
|
@@ -116,7 +90,6 @@ module VX_tcu_fedp_int #(
|
|||
end
|
||||
|
||||
wire [31:0] red_in [LEVELS+1][N];
|
||||
|
||||
for (genvar i = 0; i < N; i++) begin : g_red_inputs
|
||||
assign red_in[0][i] = mult_result[i];
|
||||
end
|
||||
|
|
|
@@ -32,13 +32,13 @@ module VX_tcu_fp import VX_gpu_pkg::*, VX_tcu_pkg::*; #(
|
|||
localparam MDATA_WIDTH = UUID_WIDTH + NW_WIDTH + PC_BITS + NUM_REGS_BITS;
|
||||
|
||||
`ifdef TCU_DPI
|
||||
localparam FMUL_LATENCY = `LATENCY_IMUL;
|
||||
localparam FMUL_LATENCY = 2;
|
||||
localparam FADD_LATENCY = 1;
|
||||
localparam FRND_LATENCY = 1;
|
||||
localparam ACC_LATENCY = $clog2(2 * TCU_TC_K) * FADD_LATENCY + FADD_LATENCY;
|
||||
localparam FEDP_LATENCY = FMUL_LATENCY + ACC_LATENCY + FRND_LATENCY;
|
||||
`elsif TCU_BHF
|
||||
localparam FMUL_LATENCY = `LATENCY_IMUL;
|
||||
localparam FMUL_LATENCY = 2;
|
||||
localparam FADD_LATENCY = 1;
|
||||
localparam FRND_LATENCY = 1;
|
||||
localparam ACC_LATENCY = $clog2(2 * TCU_TC_K) * FADD_LATENCY + FADD_LATENCY;
|
||||
|
|
|
@@ -29,10 +29,11 @@ module VX_tcu_int import VX_gpu_pkg::*, VX_tcu_pkg::*; #(
|
|||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID);
|
||||
|
||||
localparam MDATA_WIDTH = UUID_WIDTH + NW_WIDTH + PC_BITS + NUM_REGS_BITS;
|
||||
localparam MUL_LATENCY = 3;
|
||||
localparam ADD_LATENCY = 1;
|
||||
localparam FEDP_LATENCY = MUL_LATENCY + $clog2(TCU_TC_K) * ADD_LATENCY + ADD_LATENCY;
|
||||
localparam MDATA_WIDTH = UUID_WIDTH + NW_WIDTH + PC_BITS + NUM_REGS_BITS;
|
||||
localparam MUL_LATENCY = 2;
|
||||
localparam ADD_LATENCY = 1;
|
||||
localparam ACC_LATENCY = $clog2(TCU_TC_K) * ADD_LATENCY + ADD_LATENCY;
|
||||
localparam FEDP_LATENCY = MUL_LATENCY + ACC_LATENCY;
|
||||
localparam PIPE_LATENCY = FEDP_LATENCY + 1;
|
||||
localparam MDATA_QUEUE_DEPTH = 1 << $clog2(PIPE_LATENCY);
|
||||
|
||||
|
|
|
@@ -68,7 +68,7 @@ module VX_tcu_unit import VX_gpu_pkg::*; #(
|
|||
.PE_COUNT (PE_COUNT),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF (0),
|
||||
.REQ_OUT_BUF (3),
|
||||
.RSP_OUT_BUF (3)
|
||||
) pe_switch (
|
||||
.clk (clk),
|
||||
|
|
|
@@ -77,7 +77,7 @@ void AluUnit::tick() {
|
|||
case MdvType::MULHU:
|
||||
case MdvType::MULH:
|
||||
case MdvType::MULHSU:
|
||||
delay = LATENCY_IMUL+2;
|
||||
delay = 2;
|
||||
break;
|
||||
case MdvType::DIV:
|
||||
case MdvType::DIVU:
|
||||
|
|
|
@@ -63,7 +63,7 @@ public:
|
|||
delay = 1;
|
||||
break;
|
||||
case VpuOpType::IMUL:
|
||||
delay = LATENCY_IMUL;
|
||||
delay = 2;
|
||||
break;
|
||||
case VpuOpType::IDIV:
|
||||
delay = XLEN;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue