fmax optimization bundle (250 MHz).

This commit is contained in:
Blaise Tine 2021-09-08 02:26:39 -07:00
parent 05bc970900
commit 3d052e9428
9 changed files with 71 additions and 65 deletions

View file

@ -2,15 +2,14 @@
`include "VX_print_instr.vh"
`ifdef EXT_F_ENABLE
`define USED_IREG(r) \
used_regs[{1'b0, r}] = 1
`define USED_IREG(r) \
r``_r = {1'b0, ``r}
`define USED_FREG(r) \
r``_r[5] = 1; \
used_regs[{1'b1, r}] = 1
`define USED_FREG(r) \
r``_r = {1'b1, ``r}
`else
`define USED_IREG(r) \
used_regs[r] = 1
r``_r = ``r
`endif
module VX_decode #(
@ -38,7 +37,6 @@ module VX_decode #(
reg [31:0] imm;
reg use_rd, use_PC, use_imm;
reg is_join, is_wstall;
reg [`NUM_REGS-1:0] used_regs;
wire [31:0] instr = ifetch_rsp_if.data;
wire [6:0] opcode = instr[6:0];
@ -57,23 +55,24 @@ module VX_decode #(
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
wire [11:0] jalr_imm = {func7, rs2};
`UNUSED_VAR (rs3)
always @(*) begin
ex_type = 0;
op_type = 'x;
op_mod = 0;
rd_r = `NR_BITS'(rd);
rs1_r = `NR_BITS'(rs1);
rs2_r = `NR_BITS'(rs2);
rs3_r = `NR_BITS'(rs3);
rd_r = 0;
rs1_r = 0;
rs2_r = 0;
rs3_r = 0;
imm = 'x;
use_imm = 0;
use_PC = 0;
use_rd = 0;
is_join = 0;
is_wstall = 0;
used_regs = 0;
case (opcode)
`INST_I: begin
@ -399,7 +398,6 @@ module VX_decode #(
assign decode_if.imm = imm;
assign decode_if.use_PC = use_PC;
assign decode_if.use_imm = use_imm;
assign decode_if.used_regs = used_regs;
///////////////////////////////////////////////////////////////////////////
@ -421,7 +419,7 @@ module VX_decode #(
print_ex_type(decode_if.ex_type);
dpi_trace(", op=");
print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_regs=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.used_regs);
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm);
end
end
`endif

View file

@ -10,12 +10,12 @@ module VX_ibuffer #(
VX_decode_if decode_if,
// outputs
VX_ibuffer_if ibuffer_if
VX_ibuffer_if ibuffer_if
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + `NUM_REGS;
localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1;
localparam ADDRW = $clog2(`IBUF_SIZE+1);
localparam NWARPSW = $clog2(`NUM_WARPS+1);
@ -35,16 +35,16 @@ module VX_ibuffer #(
wire writing = enq_fire && (i == decode_if.wid);
wire reading = deq_fire && (i == ibuffer_if.wid);
wire is_head_ptr = empty_r[i] || (alm_empty_r[i] && reading);
wire going_empty = empty_r[i] || (alm_empty_r[i] && reading);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUTPUT_REG (`IBUF_SIZE > 2)
.OUTPUT_REG (1)
) queue (
.clk (clk),
.reset (reset),
.valid_in (writing && !is_head_ptr),
.valid_in (writing && !going_empty),
.data_in (q_data_in),
.ready_out(reading),
.data_out (q_data_prev[i]),
@ -77,7 +77,7 @@ module VX_ibuffer #(
used_r[i] <= used_r[i] + ADDRW'($signed(2'(writing) - 2'(reading)));
end
if (writing && is_head_ptr) begin
if (writing && going_empty) begin
q_data_out[i] <= q_data_in;
end else if (reading) begin
q_data_out[i] <= q_data_prev[i];
@ -173,15 +173,14 @@ module VX_ibuffer #(
decode_if.ex_type,
decode_if.op_type,
decode_if.op_mod,
decode_if.wb,
decode_if.wb,
decode_if.use_PC,
decode_if.use_imm,
decode_if.imm,
decode_if.rd,
decode_if.rs1,
decode_if.rs2,
decode_if.rs3,
decode_if.imm,
decode_if.use_PC,
decode_if.use_imm,
decode_if.used_regs};
decode_if.rs3};
assign ibuffer_if.valid = deq_valid;
assign ibuffer_if.wid = deq_wid;
@ -190,16 +189,20 @@ module VX_ibuffer #(
ibuffer_if.ex_type,
ibuffer_if.op_type,
ibuffer_if.op_mod,
ibuffer_if.wb,
ibuffer_if.wb,
ibuffer_if.use_PC,
ibuffer_if.use_imm,
ibuffer_if.imm,
ibuffer_if.rd,
ibuffer_if.rs1,
ibuffer_if.rs2,
ibuffer_if.rs3,
ibuffer_if.imm,
ibuffer_if.use_PC,
ibuffer_if.use_imm} = deq_instr[DATAW-1:`NUM_REGS];
ibuffer_if.rs3} = deq_instr;
assign ibuffer_if.used_regs_n = deq_instr_n[`NUM_REGS-1:0];
// scoreboard forwarding
// Exposes the warp id and the four register indices taken from deq_wid_n /
// deq_instr_n so the scoreboard can index its in-use table with them.
// The indices are sliced from the low 4*`NR_BITS bits of deq_instr_n:
// rs3 at bit 0, then rs2, rs1, and rd at 3*`NR_BITS.
// NOTE(review): this assumes deq_instr_n is packed like deq_instr, i.e. with
// {rd, rs1, rs2, rs3} as the least-significant fields of the concatenation.
// Any reordering of that concatenation must be mirrored here - confirm
// against the logic that produces deq_instr_n.
assign ibuffer_if.wid_n = deq_wid_n;
assign ibuffer_if.rd_n = deq_instr_n[3*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs1_n = deq_instr_n[2*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs2_n = deq_instr_n[1*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs3_n = deq_instr_n[0*`NR_BITS +: `NR_BITS];
endmodule

View file

@ -42,8 +42,7 @@ module VX_instr_demux (
wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32))
) alu_buffer (
.clk (clk),
.reset (reset),
@ -62,8 +61,7 @@ module VX_instr_demux (
wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32))
) lsu_buffer (
.clk (clk),
.reset (reset),
@ -84,8 +82,7 @@ module VX_instr_demux (
wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid];
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32),
.OUTPUT_REG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32)
) csr_buffer (
.clk (clk),
.reset (reset),
@ -104,8 +101,7 @@ module VX_instr_demux (
wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
.OUTPUT_REG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32))
) fpu_buffer (
.clk (clk),
.reset (reset),
@ -127,8 +123,7 @@ module VX_instr_demux (
wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid];
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)),
.OUTPUT_REG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32))
) gpu_buffer (
.clk (clk),
.reset (reset),

View file

@ -207,7 +207,7 @@ module VX_mem_unit # (
.DATA_SIZE (4),
.TAG_IN_WIDTH (`DCORE_TAG_WIDTH),
.TYPE ("P"),
.BUFFERED_REQ (2),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) smem_arb (
.clk (clk),
@ -319,7 +319,7 @@ module VX_mem_unit # (
.TYPE ("R"),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.BUFFERED_REQ (1),
.BUFFERED_RSP (2)
.BUFFERED_RSP (1)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),

View file

@ -12,10 +12,6 @@ module VX_scoreboard #(
);
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
reg [`NUM_REGS-1:0] deq_inuse_regs;
assign delay = (| deq_inuse_regs);
wire reserve_reg = ibuffer_if.valid && ibuffer_if.ready && ibuffer_if.wb;
wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop;
@ -27,7 +23,7 @@ module VX_scoreboard #(
end
if (release_reg) begin
inuse_regs_n[writeback_if.wid][writeback_if.rd] = 0;
end
end
end
always @(posedge clk) begin
@ -36,8 +32,20 @@ module VX_scoreboard #(
end else begin
inuse_regs <= inuse_regs_n;
end
deq_inuse_regs <= inuse_regs_n[ibuffer_if.wid_n] & ibuffer_if.used_regs_n;
end
// Per-operand hazard flags: one registered bit each for rd, rs1, rs2 and rs3.
reg deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3;
// Every clock, sample the in-use table at the forwarded warp id (wid_n) and
// the forwarded register indices (rd_n / rs1_n / rs2_n / rs3_n).
// The lookup reads inuse_regs_n, the combinational next-state of the table,
// rather than the registered inuse_regs, so a reserve or release applied to
// inuse_regs_n in the same cycle is reflected in the sampled flags.
// This block has no reset branch; the flags simply follow the lookup.
always @(posedge clk) begin
deq_inuse_rd <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rd_n];
deq_inuse_rs1 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs1_n];
deq_inuse_rs2 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs2_n];
deq_inuse_rs3 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs3_n];
end
// delay is asserted when any of the four sampled operand flags is set.
assign delay = deq_inuse_rd | deq_inuse_rs1 | deq_inuse_rs2 | deq_inuse_rs3;
`UNUSED_VAR (writeback_if.PC)
// Cycle counter compared against deadlock_timeout by the deadlock assertion.
reg [31:0] deadlock_ctr;
// Threshold for the deadlock assertion.
// NOTE(review): the base of the power is 1, and 1 ** n evaluates to 1 for
// any known n, so this is a constant 10000 regardless of `L2_ENABLE and
// `L3_ENABLE. If the timeout was meant to grow with the number of enabled
// cache levels, the base is likely wrong - confirm the intended scaling.
wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
@ -46,10 +54,10 @@ module VX_scoreboard #(
deadlock_ctr <= 0;
end else begin
`ifdef DBG_PRINT_PIPELINE
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
deq_inuse_regs[ibuffer_if.rd], deq_inuse_regs[ibuffer_if.rs1], deq_inuse_regs[ibuffer_if.rs2], deq_inuse_regs[ibuffer_if.rs3]);
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3);
end
`endif
if (release_reg) begin
@ -61,7 +69,7 @@ module VX_scoreboard #(
deadlock_ctr <= deadlock_ctr + 1;
assert(deadlock_ctr < deadlock_timeout) else $error("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
deq_inuse_regs[ibuffer_if.rd], deq_inuse_regs[ibuffer_if.rs1], deq_inuse_regs[ibuffer_if.rs2], deq_inuse_regs[ibuffer_if.rs3]);
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3);
end else if (ibuffer_if.valid && ibuffer_if.ready) begin
deadlock_ctr <= 0;
end

View file

@ -85,8 +85,7 @@ module VX_fp_cvt #(
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
assign fmt_mantissa = INT_MAN_WIDTH'({fp_clss[i].is_normal, dataa[i][MAN_BITS-1:0]});
assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} +
{1'b0, fp_clss[i].is_subnormal} +
(FMT_SHIFT_COMPENSATION - EXP_BIAS);
{1'b0, fp_clss[i].is_subnormal};
assign encoded_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
`IGNORE_WARNINGS_END
@ -144,7 +143,7 @@ module VX_fp_cvt #(
assign input_mant_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
// Unbias exponent and compensate for shift
wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] - {1'b0, renorm_shamt_s0[i]};
wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] + (FMT_SHIFT_COMPENSATION - EXP_BIAS) - {1'b0, renorm_shamt_s0[i]};
wire [INT_EXP_WIDTH-1:0] int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
assign input_exp_s0[i] = is_itof_s0 ? int_input_exp : fp_input_exp;

View file

@ -100,7 +100,7 @@ module VX_fp_ncomp #(
VX_pipe_register #(
.DATAW (1 + TAGW + `INST_FPU_BITS + `INST_FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_class_t) + 1 + 1)),
.RESETW (1),
.DEPTH (0)
.DEPTH (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),

View file

@ -13,14 +13,13 @@ interface VX_decode_if ();
wire [`INST_OP_BITS-1:0] op_type;
wire [`INST_MOD_BITS-1:0] op_mod;
wire wb;
wire use_PC;
wire use_imm;
wire [31:0] imm;
wire [`NR_BITS-1:0] rd;
wire [`NR_BITS-1:0] rs1;
wire [`NR_BITS-1:0] rs2;
wire [`NR_BITS-1:0] rs3;
wire [31:0] imm;
wire use_PC;
wire use_imm;
wire [`NUM_REGS-1:0] used_regs;
wire ready;
endinterface

View file

@ -13,16 +13,20 @@ interface VX_ibuffer_if ();
wire [`INST_OP_BITS-1:0] op_type;
wire [`INST_MOD_BITS-1:0] op_mod;
wire wb;
wire use_PC;
wire use_imm;
wire [31:0] imm;
wire [`NR_BITS-1:0] rd;
wire [`NR_BITS-1:0] rs1;
wire [`NR_BITS-1:0] rs2;
wire [`NR_BITS-1:0] rs3;
wire [31:0] imm;
wire use_PC;
wire use_imm;
wire ready;
wire [`NUM_REGS-1:0] used_regs_n;
// scoreboard forwarding
// Side-band signals carrying a warp id and four register indices.
// The scoreboard uses them to index its per-warp in-use table and register
// the per-operand hazard flags.
// NOTE(review): the _n suffix presumably marks the instruction that will be
// presented on this interface next, not the current one - confirm against
// the ibuffer logic that drives these wires.
wire [`NR_BITS-1:0] rd_n;
wire [`NR_BITS-1:0] rs1_n;
wire [`NR_BITS-1:0] rs2_n;
wire [`NR_BITS-1:0] rs3_n;
wire [`NW_BITS-1:0] wid_n;
endinterface