fmax optimizations bundles

This commit is contained in:
Blaise Tine 2021-09-06 01:36:57 -07:00
parent b52ace5142
commit 3e014c8285
8 changed files with 93 additions and 103 deletions

View file

@ -90,11 +90,6 @@ module VX_alu_unit #(
wire is_less = cmp_result[32];
wire is_equal = ~(| cmp_result[31:0]);
wire br_neg = `INST_BR_NEG(br_op);
wire br_less = `INST_BR_LESS(br_op);
wire br_static = `INST_BR_STATIC(br_op);
wire br_taken = ((br_less ? is_less : is_equal) ^ br_neg) | br_static;
// output
wire result_valid;
@ -178,24 +173,33 @@ module VX_alu_unit #(
`endif
wire [`INST_BR_BITS-1:0] br_op_r;
wire is_less_r;
wire is_equal_r;
wire is_br_op_r;
assign stall_out = ~alu_commit_if.ready && alu_commit_if.valid;
VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + 1 + 32),
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_out),
.data_in ({result_valid, result_wid, result_tmask, result_PC, result_rd, result_wb, result_data, result_is_br, br_taken, br_dest}),
.data_out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.tmask, alu_commit_if.PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_r, branch_ctl_if.taken, branch_ctl_if.dest})
.data_in ({result_valid, result_wid, result_tmask, result_PC, result_rd, result_wb, result_data, result_is_br, br_op, is_less, is_equal, br_dest}),
.data_out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.tmask, alu_commit_if.PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_r, br_op_r, is_less_r, is_equal_r, branch_ctl_if.dest})
);
assign alu_commit_if.eop = 1'b1;
`UNUSED_VAR (br_op_r)
wire br_neg = `INST_BR_NEG(br_op_r);
wire br_less = `INST_BR_LESS(br_op_r);
wire br_static = `INST_BR_STATIC(br_op_r);
assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_r;
assign branch_ctl_if.taken = ((br_less ? is_less_r : is_equal_r) ^ br_neg) | br_static;
assign branch_ctl_if.wid = alu_commit_if.wid;
// can accept new request?

View file

@ -22,91 +22,67 @@ module VX_gpr_stage #(
// ensure r0 never gets written, which can happen before the reset
wire write_enable = writeback_if.valid && (writeback_if.rd != 0);
wire [(`NUM_THREADS * 4)-1:0] wren;
wire [`NUM_THREADS-1:0] wren;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign wren [i * 4 +: 4] = {4{write_enable && writeback_if.tmask[i]}};
assign wren[i] = write_enable && writeback_if.tmask[i];
end
reg [`NUM_THREADS-1:0][31:0] last_wdata;
reg [$clog2(RAM_SIZE)-1:0] last_waddr;
reg [`NUM_THREADS-1:0] last_wmask;
always @(posedge clk) begin
last_wdata <= writeback_if.data;
last_wmask <= {`NUM_THREADS{write_enable}} & writeback_if.tmask;
last_waddr <= waddr;
end
wire [`NUM_THREADS-1:0][31:0] rdata1, rdata2;
wire [$clog2(RAM_SIZE)-1:0] waddr, raddr1, raddr2;
assign waddr = {writeback_if.wid, writeback_if.rd};
assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1};
assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2};
VX_dp_ram #(
.DATAW (32 * `NUM_THREADS),
.SIZE (RAM_SIZE),
.BYTEENW (`NUM_THREADS * 4),
.INIT_ENABLE (1),
.INIT_VALUE (0),
.NO_RWCHECK (1)
) dp_ram1 (
.clk (clk),
.wren (wren),
.waddr (waddr),
.wdata (writeback_if.data),
.rden (1'b1),
.raddr (raddr1),
.rdata (rdata1)
);
VX_dp_ram #(
.DATAW (32 * `NUM_THREADS),
.SIZE (RAM_SIZE),
.BYTEENW (`NUM_THREADS * 4),
.INIT_ENABLE (1),
.INIT_VALUE (0),
.NO_RWCHECK (1)
) dp_ram2 (
.clk (clk),
.wren (wren),
.waddr (waddr),
.wdata (writeback_if.data),
.rden (1'b1),
.raddr (raddr2),
.rdata (rdata2)
);
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign gpr_rsp_if.rs1_data[i] = (last_wmask[i] && (raddr1 == last_waddr)) ? last_wdata[i] : rdata1[i];
assign gpr_rsp_if.rs2_data[i] = (last_wmask[i] && (raddr2 == last_waddr)) ? last_wdata[i] : rdata2[i];
VX_dp_ram #(
.DATAW (32),
.SIZE (RAM_SIZE),
.INIT_ENABLE (1),
.INIT_VALUE (0)
) dp_ram1 (
.clk (clk),
.wren (wren[i]),
.waddr (waddr),
.wdata (writeback_if.data[i]),
.rden (1'b1),
.raddr (raddr1),
.rdata (gpr_rsp_if.rs1_data[i])
);
VX_dp_ram #(
.DATAW (32),
.SIZE (RAM_SIZE),
.INIT_ENABLE (1),
.INIT_VALUE (0)
) dp_ram2 (
.clk (clk),
.wren (wren[i]),
.waddr (waddr),
.wdata (writeback_if.data[i]),
.rden (1'b1),
.raddr (raddr2),
.rdata (gpr_rsp_if.rs2_data[i])
);
end
`ifdef EXT_F_ENABLE
wire [`NUM_THREADS-1:0][31:0] rdata3;
wire [$clog2(RAM_SIZE)-1:0] raddr3;
assign raddr3 = {gpr_req_if.wid, gpr_req_if.rs3};
VX_dp_ram #(
.DATAW (32 * `NUM_THREADS),
.SIZE (RAM_SIZE),
.BYTEENW (`NUM_THREADS * 4),
.INIT_ENABLE (1),
.INIT_VALUE (0),
.NO_RWCHECK (1)
) dp_ram3 (
.clk (clk),
.wren (wren),
.waddr (waddr),
.wdata (writeback_if.data),
.rden (1'b1),
.raddr (raddr3),
.rdata (rdata3)
);
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign gpr_rsp_if.rs3_data[i] = (last_wmask[i] && (raddr3 == last_waddr)) ? last_wdata[i] : rdata3[i];
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
VX_dp_ram #(
.DATAW (32),
.SIZE (RAM_SIZE),
.INIT_ENABLE (1),
.INIT_VALUE (0)
) dp_ram3 (
.clk (clk),
.wren (wren[i]),
.waddr (waddr),
.wdata (writeback_if.data[i]),
.rden (1'b1),
.raddr (raddr3),
.rdata (gpr_rsp_if.rs3_data[i])
);
end
`else
`UNUSED_VAR (gpr_req_if.rs3)

View file

@ -38,8 +38,8 @@ module VX_ibuffer #(
wire is_head_ptr = empty_r[i] || (alm_empty_r[i] && reading);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUTPUT_REG (`IBUF_SIZE > 2)
) queue (
.clk (clk),
@ -98,6 +98,8 @@ module VX_ibuffer #(
reg [DATAW-1:0] deq_instr, deq_instr_n;
reg [NWARPSW-1:0] num_warps;
`UNUSED_VAR (deq_instr)
// calculate valid table
always @(*) begin
valid_table_n = valid_table;
@ -147,11 +149,10 @@ module VX_ibuffer #(
valid_table <= 0;
deq_valid <= 0;
num_warps <= 0;
deq_wid_rr <= 0;
end else begin
valid_table <= valid_table_n;
deq_valid <= deq_valid_n;
deq_wid_rr <= deq_wid_rr_n;
if (warp_added && !warp_removed) begin
num_warps <= num_warps + NWARPSW'(1);
@ -160,8 +161,9 @@ module VX_ibuffer #(
end
end
deq_wid <= deq_wid_n;
deq_instr <= deq_instr_n;
deq_wid <= deq_wid_n;
deq_wid_rr <= deq_wid_rr_n;
deq_instr <= deq_instr_n;
end
assign decode_if.ready = ~q_full[decode_if.wid];
@ -183,7 +185,6 @@ module VX_ibuffer #(
assign ibuffer_if.valid = deq_valid;
assign ibuffer_if.wid = deq_wid;
assign ibuffer_if.wid_n = deq_wid_n;
assign {ibuffer_if.tmask,
ibuffer_if.PC,
ibuffer_if.ex_type,
@ -195,8 +196,10 @@ module VX_ibuffer #(
ibuffer_if.rs2,
ibuffer_if.rs3,
ibuffer_if.imm,
ibuffer_if.use_PC,
ibuffer_if.use_imm,
ibuffer_if.used_regs} = deq_instr;
ibuffer_if.use_PC,
ibuffer_if.use_imm} = deq_instr[DATAW-1:`NUM_REGS];
assign ibuffer_if.used_regs_n = deq_instr_n[`NUM_REGS-1:0];
assign ibuffer_if.wid_n = deq_wid_n;
endmodule

View file

@ -27,7 +27,7 @@ module VX_instr_demux (
wire gpu_req_ready;
VX_lzc #(
.WIDTH (`NUM_THREADS)
.N (`NUM_THREADS)
) tid_select (
.in_i (ibuffer_if.tmask),
.cnt_o (tid),

View file

@ -283,13 +283,20 @@ module VX_mem_unit # (
);
end else begin
// core to D-cache request
assign dcache_req_tmp_if.valid = dcache_req_if.valid;
assign dcache_req_tmp_if.addr = dcache_req_if.addr;
assign dcache_req_tmp_if.rw = dcache_req_if.rw;
assign dcache_req_tmp_if.byteen = dcache_req_if.byteen;
assign dcache_req_tmp_if.data = dcache_req_if.data;
assign dcache_req_tmp_if.tag = dcache_req_if.tag;
assign dcache_req_if.ready = dcache_req_tmp_if.ready;
for (genvar i = 0; i < `DNUM_REQS; ++i) begin
VX_skid_buffer #(
.DATAW ((32-`CLOG2(`DWORD_SIZE)) + 1 + `DWORD_SIZE + (8*`DWORD_SIZE) + `DCORE_TAG_WIDTH)
) req_buf (
.clk (clk),
.reset (reset),
.valid_in (dcache_req_if.valid[i]),
.data_in ({dcache_req_if.addr[i], dcache_req_if.rw[i], dcache_req_if.byteen[i], dcache_req_if.data[i], dcache_req_if.tag[i]}),
.ready_in (dcache_req_if.ready[i]),
.valid_out (dcache_req_tmp_if.valid[i]),
.data_out ({dcache_req_tmp_if.addr[i], dcache_req_tmp_if.rw[i], dcache_req_tmp_if.byteen[i], dcache_req_tmp_if.data[i], dcache_req_tmp_if.tag[i]}),
.ready_out (dcache_req_tmp_if.ready[i])
);
end
// D-cache to core response
assign dcache_rsp_if.valid = dcache_rsp_tmp_if.valid;

View file

@ -14,7 +14,7 @@ module VX_scoreboard #(
reg [`NUM_REGS-1:0] deq_inuse_regs;
assign delay = |(deq_inuse_regs & ibuffer_if.used_regs);
assign delay = (| deq_inuse_regs);
wire reserve_reg = ibuffer_if.valid && ibuffer_if.ready && ibuffer_if.wb;
@ -36,7 +36,7 @@ module VX_scoreboard #(
end else begin
inuse_regs <= inuse_regs_n;
end
deq_inuse_regs <= inuse_regs_n[ibuffer_if.wid_n];
deq_inuse_regs <= inuse_regs_n[ibuffer_if.wid_n] & ibuffer_if.used_regs_n;
end
reg [31:0] deadlock_ctr;

View file

@ -118,9 +118,8 @@ module VX_bank #(
wire creq_valid, creq_ready;
VX_elastic_buffer #(
.DATAW (1 + `LINE_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH)),
.SIZE (CREQ_SIZE),
.OUTPUT_REG (CREQ_SIZE > 2)
.DATAW (1 + `LINE_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH)),
.SIZE (CREQ_SIZE)
) core_req_queue (
.clk (clk),
.reset (reset),

View file

@ -7,7 +7,6 @@ interface VX_ibuffer_if ();
wire valid;
wire [`NW_BITS-1:0] wid;
wire [`NW_BITS-1:0] wid_n;
wire [`NUM_THREADS-1:0] tmask;
wire [31:0] PC;
wire [`EX_BITS-1:0] ex_type;
@ -21,9 +20,11 @@ interface VX_ibuffer_if ();
wire [31:0] imm;
wire use_PC;
wire use_imm;
wire [`NUM_REGS-1:0] used_regs;
wire ready;
wire [`NUM_REGS-1:0] used_regs_n;
wire [`NW_BITS-1:0] wid_n;
endinterface
`endif