mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
fmax optimization bundle (250 MHz).
This commit is contained in:
parent
05bc970900
commit
3d052e9428
9 changed files with 71 additions and 65 deletions
|
@ -2,15 +2,14 @@
|
|||
`include "VX_print_instr.vh"
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define USED_IREG(r) \
|
||||
used_regs[{1'b0, r}] = 1
|
||||
`define USED_IREG(r) \
|
||||
r``_r = {1'b0, ``r}
|
||||
|
||||
`define USED_FREG(r) \
|
||||
r``_r[5] = 1; \
|
||||
used_regs[{1'b1, r}] = 1
|
||||
`define USED_FREG(r) \
|
||||
r``_r = {1'b1, ``r}
|
||||
`else
|
||||
`define USED_IREG(r) \
|
||||
used_regs[r] = 1
|
||||
r``_r = ``r
|
||||
`endif
|
||||
|
||||
module VX_decode #(
|
||||
|
@ -38,7 +37,6 @@ module VX_decode #(
|
|||
reg [31:0] imm;
|
||||
reg use_rd, use_PC, use_imm;
|
||||
reg is_join, is_wstall;
|
||||
reg [`NUM_REGS-1:0] used_regs;
|
||||
|
||||
wire [31:0] instr = ifetch_rsp_if.data;
|
||||
wire [6:0] opcode = instr[6:0];
|
||||
|
@ -57,23 +55,24 @@ module VX_decode #(
|
|||
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
|
||||
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
|
||||
wire [11:0] jalr_imm = {func7, rs2};
|
||||
|
||||
`UNUSED_VAR (rs3)
|
||||
|
||||
always @(*) begin
|
||||
|
||||
ex_type = 0;
|
||||
op_type = 'x;
|
||||
op_mod = 0;
|
||||
rd_r = `NR_BITS'(rd);
|
||||
rs1_r = `NR_BITS'(rs1);
|
||||
rs2_r = `NR_BITS'(rs2);
|
||||
rs3_r = `NR_BITS'(rs3);
|
||||
rd_r = 0;
|
||||
rs1_r = 0;
|
||||
rs2_r = 0;
|
||||
rs3_r = 0;
|
||||
imm = 'x;
|
||||
use_imm = 0;
|
||||
use_PC = 0;
|
||||
use_rd = 0;
|
||||
is_join = 0;
|
||||
is_wstall = 0;
|
||||
used_regs = 0;
|
||||
|
||||
case (opcode)
|
||||
`INST_I: begin
|
||||
|
@ -399,7 +398,6 @@ module VX_decode #(
|
|||
assign decode_if.imm = imm;
|
||||
assign decode_if.use_PC = use_PC;
|
||||
assign decode_if.use_imm = use_imm;
|
||||
assign decode_if.used_regs = used_regs;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -421,7 +419,7 @@ module VX_decode #(
|
|||
print_ex_type(decode_if.ex_type);
|
||||
dpi_trace(", op=");
|
||||
print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
|
||||
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_regs=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.used_regs);
|
||||
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -10,12 +10,12 @@ module VX_ibuffer #(
|
|||
VX_decode_if decode_if,
|
||||
|
||||
// outputs
|
||||
VX_ibuffer_if ibuffer_if
|
||||
VX_ibuffer_if ibuffer_if
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + `NUM_REGS;
|
||||
localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1;
|
||||
localparam ADDRW = $clog2(`IBUF_SIZE+1);
|
||||
localparam NWARPSW = $clog2(`NUM_WARPS+1);
|
||||
|
||||
|
@ -35,16 +35,16 @@ module VX_ibuffer #(
|
|||
wire writing = enq_fire && (i == decode_if.wid);
|
||||
wire reading = deq_fire && (i == ibuffer_if.wid);
|
||||
|
||||
wire is_head_ptr = empty_r[i] || (alm_empty_r[i] && reading);
|
||||
wire going_empty = empty_r[i] || (alm_empty_r[i] && reading);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`IBUF_SIZE),
|
||||
.OUTPUT_REG (`IBUF_SIZE > 2)
|
||||
.OUTPUT_REG (1)
|
||||
) queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (writing && !is_head_ptr),
|
||||
.valid_in (writing && !going_empty),
|
||||
.data_in (q_data_in),
|
||||
.ready_out(reading),
|
||||
.data_out (q_data_prev[i]),
|
||||
|
@ -77,7 +77,7 @@ module VX_ibuffer #(
|
|||
used_r[i] <= used_r[i] + ADDRW'($signed(2'(writing) - 2'(reading)));
|
||||
end
|
||||
|
||||
if (writing && is_head_ptr) begin
|
||||
if (writing && going_empty) begin
|
||||
q_data_out[i] <= q_data_in;
|
||||
end else if (reading) begin
|
||||
q_data_out[i] <= q_data_prev[i];
|
||||
|
@ -173,15 +173,14 @@ module VX_ibuffer #(
|
|||
decode_if.ex_type,
|
||||
decode_if.op_type,
|
||||
decode_if.op_mod,
|
||||
decode_if.wb,
|
||||
decode_if.wb,
|
||||
decode_if.use_PC,
|
||||
decode_if.use_imm,
|
||||
decode_if.imm,
|
||||
decode_if.rd,
|
||||
decode_if.rs1,
|
||||
decode_if.rs2,
|
||||
decode_if.rs3,
|
||||
decode_if.imm,
|
||||
decode_if.use_PC,
|
||||
decode_if.use_imm,
|
||||
decode_if.used_regs};
|
||||
decode_if.rs3};
|
||||
|
||||
assign ibuffer_if.valid = deq_valid;
|
||||
assign ibuffer_if.wid = deq_wid;
|
||||
|
@ -190,16 +189,20 @@ module VX_ibuffer #(
|
|||
ibuffer_if.ex_type,
|
||||
ibuffer_if.op_type,
|
||||
ibuffer_if.op_mod,
|
||||
ibuffer_if.wb,
|
||||
ibuffer_if.wb,
|
||||
ibuffer_if.use_PC,
|
||||
ibuffer_if.use_imm,
|
||||
ibuffer_if.imm,
|
||||
ibuffer_if.rd,
|
||||
ibuffer_if.rs1,
|
||||
ibuffer_if.rs2,
|
||||
ibuffer_if.rs3,
|
||||
ibuffer_if.imm,
|
||||
ibuffer_if.use_PC,
|
||||
ibuffer_if.use_imm} = deq_instr[DATAW-1:`NUM_REGS];
|
||||
ibuffer_if.rs3} = deq_instr;
|
||||
|
||||
assign ibuffer_if.used_regs_n = deq_instr_n[`NUM_REGS-1:0];
|
||||
// scoreboard forwarding
|
||||
assign ibuffer_if.wid_n = deq_wid_n;
|
||||
assign ibuffer_if.rd_n = deq_instr_n[3*`NR_BITS +: `NR_BITS];
|
||||
assign ibuffer_if.rs1_n = deq_instr_n[2*`NR_BITS +: `NR_BITS];
|
||||
assign ibuffer_if.rs2_n = deq_instr_n[1*`NR_BITS +: `NR_BITS];
|
||||
assign ibuffer_if.rs3_n = deq_instr_n[0*`NR_BITS +: `NR_BITS];
|
||||
|
||||
endmodule
|
|
@ -42,8 +42,7 @@ module VX_instr_demux (
|
|||
wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
|
||||
.OUTPUT_REG (1)
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32))
|
||||
) alu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -62,8 +61,7 @@ module VX_instr_demux (
|
|||
wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)),
|
||||
.OUTPUT_REG (1)
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32))
|
||||
) lsu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -84,8 +82,7 @@ module VX_instr_demux (
|
|||
wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid];
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32),
|
||||
.OUTPUT_REG (1)
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32)
|
||||
) csr_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -104,8 +101,7 @@ module VX_instr_demux (
|
|||
wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
|
||||
.OUTPUT_REG (1)
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32))
|
||||
) fpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -127,8 +123,7 @@ module VX_instr_demux (
|
|||
wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid];
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)),
|
||||
.OUTPUT_REG (1)
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32))
|
||||
) gpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -207,7 +207,7 @@ module VX_mem_unit # (
|
|||
.DATA_SIZE (4),
|
||||
.TAG_IN_WIDTH (`DCORE_TAG_WIDTH),
|
||||
.TYPE ("P"),
|
||||
.BUFFERED_REQ (2),
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (1)
|
||||
) smem_arb (
|
||||
.clk (clk),
|
||||
|
@ -319,7 +319,7 @@ module VX_mem_unit # (
|
|||
.TYPE ("R"),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (2)
|
||||
.BUFFERED_RSP (1)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (mem_arb_reset),
|
||||
|
|
|
@ -12,10 +12,6 @@ module VX_scoreboard #(
|
|||
);
|
||||
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
||||
|
||||
reg [`NUM_REGS-1:0] deq_inuse_regs;
|
||||
|
||||
assign delay = (| deq_inuse_regs);
|
||||
|
||||
wire reserve_reg = ibuffer_if.valid && ibuffer_if.ready && ibuffer_if.wb;
|
||||
|
||||
wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop;
|
||||
|
@ -27,7 +23,7 @@ module VX_scoreboard #(
|
|||
end
|
||||
if (release_reg) begin
|
||||
inuse_regs_n[writeback_if.wid][writeback_if.rd] = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -36,8 +32,20 @@ module VX_scoreboard #(
|
|||
end else begin
|
||||
inuse_regs <= inuse_regs_n;
|
||||
end
|
||||
deq_inuse_regs <= inuse_regs_n[ibuffer_if.wid_n] & ibuffer_if.used_regs_n;
|
||||
end
|
||||
|
||||
reg deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3;
|
||||
|
||||
always @(posedge clk) begin
|
||||
deq_inuse_rd <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rd_n];
|
||||
deq_inuse_rs1 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs1_n];
|
||||
deq_inuse_rs2 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs2_n];
|
||||
deq_inuse_rs3 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs3_n];
|
||||
end
|
||||
|
||||
assign delay = deq_inuse_rd | deq_inuse_rs1 | deq_inuse_rs2 | deq_inuse_rs3;
|
||||
|
||||
`UNUSED_VAR (writeback_if.PC)
|
||||
|
||||
reg [31:0] deadlock_ctr;
|
||||
wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
|
||||
|
@ -46,10 +54,10 @@ module VX_scoreboard #(
|
|||
deadlock_ctr <= 0;
|
||||
end else begin
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
|
||||
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
|
||||
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n",
|
||||
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
|
||||
deq_inuse_regs[ibuffer_if.rd], deq_inuse_regs[ibuffer_if.rs1], deq_inuse_regs[ibuffer_if.rs2], deq_inuse_regs[ibuffer_if.rs3]);
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3);
|
||||
end
|
||||
`endif
|
||||
if (release_reg) begin
|
||||
|
@ -61,7 +69,7 @@ module VX_scoreboard #(
|
|||
deadlock_ctr <= deadlock_ctr + 1;
|
||||
assert(deadlock_ctr < deadlock_timeout) else $error("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
|
||||
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
|
||||
deq_inuse_regs[ibuffer_if.rd], deq_inuse_regs[ibuffer_if.rs1], deq_inuse_regs[ibuffer_if.rs2], deq_inuse_regs[ibuffer_if.rs3]);
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3);
|
||||
end else if (ibuffer_if.valid && ibuffer_if.ready) begin
|
||||
deadlock_ctr <= 0;
|
||||
end
|
||||
|
|
|
@ -85,8 +85,7 @@ module VX_fp_cvt #(
|
|||
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
|
||||
assign fmt_mantissa = INT_MAN_WIDTH'({fp_clss[i].is_normal, dataa[i][MAN_BITS-1:0]});
|
||||
assign fmt_exponent[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} +
|
||||
{1'b0, fp_clss[i].is_subnormal} +
|
||||
(FMT_SHIFT_COMPENSATION - EXP_BIAS);
|
||||
{1'b0, fp_clss[i].is_subnormal};
|
||||
assign encoded_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
|
||||
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
|
||||
`IGNORE_WARNINGS_END
|
||||
|
@ -144,7 +143,7 @@ module VX_fp_cvt #(
|
|||
assign input_mant_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
|
||||
|
||||
// Unbias exponent and compensate for shift
|
||||
wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] - {1'b0, renorm_shamt_s0[i]};
|
||||
wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] + (FMT_SHIFT_COMPENSATION - EXP_BIAS) - {1'b0, renorm_shamt_s0[i]};
|
||||
wire [INT_EXP_WIDTH-1:0] int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
|
||||
|
||||
assign input_exp_s0[i] = is_itof_s0 ? int_input_exp : fp_input_exp;
|
||||
|
|
|
@ -100,7 +100,7 @@ module VX_fp_ncomp #(
|
|||
VX_pipe_register #(
|
||||
.DATAW (1 + TAGW + `INST_FPU_BITS + `INST_FRM_BITS + LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fp_class_t) + 1 + 1)),
|
||||
.RESETW (1),
|
||||
.DEPTH (0)
|
||||
.DEPTH (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -13,14 +13,13 @@ interface VX_decode_if ();
|
|||
wire [`INST_OP_BITS-1:0] op_type;
|
||||
wire [`INST_MOD_BITS-1:0] op_mod;
|
||||
wire wb;
|
||||
wire use_PC;
|
||||
wire use_imm;
|
||||
wire [31:0] imm;
|
||||
wire [`NR_BITS-1:0] rd;
|
||||
wire [`NR_BITS-1:0] rs1;
|
||||
wire [`NR_BITS-1:0] rs2;
|
||||
wire [`NR_BITS-1:0] rs3;
|
||||
wire [31:0] imm;
|
||||
wire use_PC;
|
||||
wire use_imm;
|
||||
wire [`NUM_REGS-1:0] used_regs;
|
||||
wire ready;
|
||||
|
||||
endinterface
|
||||
|
|
|
@ -13,16 +13,20 @@ interface VX_ibuffer_if ();
|
|||
wire [`INST_OP_BITS-1:0] op_type;
|
||||
wire [`INST_MOD_BITS-1:0] op_mod;
|
||||
wire wb;
|
||||
wire use_PC;
|
||||
wire use_imm;
|
||||
wire [31:0] imm;
|
||||
wire [`NR_BITS-1:0] rd;
|
||||
wire [`NR_BITS-1:0] rs1;
|
||||
wire [`NR_BITS-1:0] rs2;
|
||||
wire [`NR_BITS-1:0] rs3;
|
||||
wire [31:0] imm;
|
||||
wire use_PC;
|
||||
wire use_imm;
|
||||
wire ready;
|
||||
|
||||
wire [`NUM_REGS-1:0] used_regs_n;
|
||||
// scoreboard forwarding
|
||||
wire [`NR_BITS-1:0] rd_n;
|
||||
wire [`NR_BITS-1:0] rs1_n;
|
||||
wire [`NR_BITS-1:0] rs2_n;
|
||||
wire [`NR_BITS-1:0] rs3_n;
|
||||
wire [`NW_BITS-1:0] wid_n;
|
||||
|
||||
endinterface
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue