mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
fmax optimizations bundles
This commit is contained in:
parent
b52ace5142
commit
3e014c8285
8 changed files with 93 additions and 103 deletions
|
@ -90,11 +90,6 @@ module VX_alu_unit #(
|
|||
wire is_less = cmp_result[32];
|
||||
wire is_equal = ~(| cmp_result[31:0]);
|
||||
|
||||
wire br_neg = `INST_BR_NEG(br_op);
|
||||
wire br_less = `INST_BR_LESS(br_op);
|
||||
wire br_static = `INST_BR_STATIC(br_op);
|
||||
wire br_taken = ((br_less ? is_less : is_equal) ^ br_neg) | br_static;
|
||||
|
||||
// output
|
||||
|
||||
wire result_valid;
|
||||
|
@ -178,24 +173,33 @@ module VX_alu_unit #(
|
|||
|
||||
`endif
|
||||
|
||||
wire [`INST_BR_BITS-1:0] br_op_r;
|
||||
wire is_less_r;
|
||||
wire is_equal_r;
|
||||
wire is_br_op_r;
|
||||
|
||||
assign stall_out = ~alu_commit_if.ready && alu_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + 1 + 32),
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({result_valid, result_wid, result_tmask, result_PC, result_rd, result_wb, result_data, result_is_br, br_taken, br_dest}),
|
||||
.data_out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.tmask, alu_commit_if.PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_r, branch_ctl_if.taken, branch_ctl_if.dest})
|
||||
.data_in ({result_valid, result_wid, result_tmask, result_PC, result_rd, result_wb, result_data, result_is_br, br_op, is_less, is_equal, br_dest}),
|
||||
.data_out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.tmask, alu_commit_if.PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_r, br_op_r, is_less_r, is_equal_r, branch_ctl_if.dest})
|
||||
);
|
||||
|
||||
assign alu_commit_if.eop = 1'b1;
|
||||
|
||||
`UNUSED_VAR (br_op_r)
|
||||
wire br_neg = `INST_BR_NEG(br_op_r);
|
||||
wire br_less = `INST_BR_LESS(br_op_r);
|
||||
wire br_static = `INST_BR_STATIC(br_op_r);
|
||||
|
||||
assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_r;
|
||||
assign branch_ctl_if.taken = ((br_less ? is_less_r : is_equal_r) ^ br_neg) | br_static;
|
||||
assign branch_ctl_if.wid = alu_commit_if.wid;
|
||||
|
||||
// can accept new request?
|
||||
|
|
|
@ -22,91 +22,67 @@ module VX_gpr_stage #(
|
|||
// ensure r0 never gets written, which can happen before the reset
|
||||
wire write_enable = writeback_if.valid && (writeback_if.rd != 0);
|
||||
|
||||
wire [(`NUM_THREADS * 4)-1:0] wren;
|
||||
wire [`NUM_THREADS-1:0] wren;
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign wren [i * 4 +: 4] = {4{write_enable && writeback_if.tmask[i]}};
|
||||
assign wren[i] = write_enable && writeback_if.tmask[i];
|
||||
end
|
||||
|
||||
reg [`NUM_THREADS-1:0][31:0] last_wdata;
|
||||
reg [$clog2(RAM_SIZE)-1:0] last_waddr;
|
||||
reg [`NUM_THREADS-1:0] last_wmask;
|
||||
|
||||
always @(posedge clk) begin
|
||||
last_wdata <= writeback_if.data;
|
||||
last_wmask <= {`NUM_THREADS{write_enable}} & writeback_if.tmask;
|
||||
last_waddr <= waddr;
|
||||
end
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] rdata1, rdata2;
|
||||
wire [$clog2(RAM_SIZE)-1:0] waddr, raddr1, raddr2;
|
||||
|
||||
assign waddr = {writeback_if.wid, writeback_if.rd};
|
||||
assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1};
|
||||
assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2};
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (32 * `NUM_THREADS),
|
||||
.SIZE (RAM_SIZE),
|
||||
.BYTEENW (`NUM_THREADS * 4),
|
||||
.INIT_ENABLE (1),
|
||||
.INIT_VALUE (0),
|
||||
.NO_RWCHECK (1)
|
||||
) dp_ram1 (
|
||||
.clk (clk),
|
||||
.wren (wren),
|
||||
.waddr (waddr),
|
||||
.wdata (writeback_if.data),
|
||||
.rden (1'b1),
|
||||
.raddr (raddr1),
|
||||
.rdata (rdata1)
|
||||
);
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (32 * `NUM_THREADS),
|
||||
.SIZE (RAM_SIZE),
|
||||
.BYTEENW (`NUM_THREADS * 4),
|
||||
.INIT_ENABLE (1),
|
||||
.INIT_VALUE (0),
|
||||
.NO_RWCHECK (1)
|
||||
) dp_ram2 (
|
||||
.clk (clk),
|
||||
.wren (wren),
|
||||
.waddr (waddr),
|
||||
.wdata (writeback_if.data),
|
||||
.rden (1'b1),
|
||||
.raddr (raddr2),
|
||||
.rdata (rdata2)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign gpr_rsp_if.rs1_data[i] = (last_wmask[i] && (raddr1 == last_waddr)) ? last_wdata[i] : rdata1[i];
|
||||
assign gpr_rsp_if.rs2_data[i] = (last_wmask[i] && (raddr2 == last_waddr)) ? last_wdata[i] : rdata2[i];
|
||||
VX_dp_ram #(
|
||||
.DATAW (32),
|
||||
.SIZE (RAM_SIZE),
|
||||
.INIT_ENABLE (1),
|
||||
.INIT_VALUE (0)
|
||||
) dp_ram1 (
|
||||
.clk (clk),
|
||||
.wren (wren[i]),
|
||||
.waddr (waddr),
|
||||
.wdata (writeback_if.data[i]),
|
||||
.rden (1'b1),
|
||||
.raddr (raddr1),
|
||||
.rdata (gpr_rsp_if.rs1_data[i])
|
||||
);
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (32),
|
||||
.SIZE (RAM_SIZE),
|
||||
.INIT_ENABLE (1),
|
||||
.INIT_VALUE (0)
|
||||
) dp_ram2 (
|
||||
.clk (clk),
|
||||
.wren (wren[i]),
|
||||
.waddr (waddr),
|
||||
.wdata (writeback_if.data[i]),
|
||||
.rden (1'b1),
|
||||
.raddr (raddr2),
|
||||
.rdata (gpr_rsp_if.rs2_data[i])
|
||||
);
|
||||
end
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [`NUM_THREADS-1:0][31:0] rdata3;
|
||||
wire [$clog2(RAM_SIZE)-1:0] raddr3;
|
||||
assign raddr3 = {gpr_req_if.wid, gpr_req_if.rs3};
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (32 * `NUM_THREADS),
|
||||
.SIZE (RAM_SIZE),
|
||||
.BYTEENW (`NUM_THREADS * 4),
|
||||
.INIT_ENABLE (1),
|
||||
.INIT_VALUE (0),
|
||||
.NO_RWCHECK (1)
|
||||
) dp_ram3 (
|
||||
.clk (clk),
|
||||
.wren (wren),
|
||||
.waddr (waddr),
|
||||
.wdata (writeback_if.data),
|
||||
.rden (1'b1),
|
||||
.raddr (raddr3),
|
||||
.rdata (rdata3)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
assign gpr_rsp_if.rs3_data[i] = (last_wmask[i] && (raddr3 == last_waddr)) ? last_wdata[i] : rdata3[i];
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
VX_dp_ram #(
|
||||
.DATAW (32),
|
||||
.SIZE (RAM_SIZE),
|
||||
.INIT_ENABLE (1),
|
||||
.INIT_VALUE (0)
|
||||
) dp_ram3 (
|
||||
.clk (clk),
|
||||
.wren (wren[i]),
|
||||
.waddr (waddr),
|
||||
.wdata (writeback_if.data[i]),
|
||||
.rden (1'b1),
|
||||
.raddr (raddr3),
|
||||
.rdata (gpr_rsp_if.rs3_data[i])
|
||||
);
|
||||
end
|
||||
`else
|
||||
`UNUSED_VAR (gpr_req_if.rs3)
|
||||
|
|
|
@ -38,8 +38,8 @@ module VX_ibuffer #(
|
|||
wire is_head_ptr = empty_r[i] || (alm_empty_r[i] && reading);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`IBUF_SIZE),
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`IBUF_SIZE),
|
||||
.OUTPUT_REG (`IBUF_SIZE > 2)
|
||||
) queue (
|
||||
.clk (clk),
|
||||
|
@ -98,6 +98,8 @@ module VX_ibuffer #(
|
|||
reg [DATAW-1:0] deq_instr, deq_instr_n;
|
||||
reg [NWARPSW-1:0] num_warps;
|
||||
|
||||
`UNUSED_VAR (deq_instr)
|
||||
|
||||
// calculate valid table
|
||||
always @(*) begin
|
||||
valid_table_n = valid_table;
|
||||
|
@ -147,11 +149,10 @@ module VX_ibuffer #(
|
|||
valid_table <= 0;
|
||||
deq_valid <= 0;
|
||||
num_warps <= 0;
|
||||
deq_wid_rr <= 0;
|
||||
end else begin
|
||||
valid_table <= valid_table_n;
|
||||
deq_valid <= deq_valid_n;
|
||||
deq_wid_rr <= deq_wid_rr_n;
|
||||
|
||||
|
||||
if (warp_added && !warp_removed) begin
|
||||
num_warps <= num_warps + NWARPSW'(1);
|
||||
|
@ -160,8 +161,9 @@ module VX_ibuffer #(
|
|||
end
|
||||
end
|
||||
|
||||
deq_wid <= deq_wid_n;
|
||||
deq_instr <= deq_instr_n;
|
||||
deq_wid <= deq_wid_n;
|
||||
deq_wid_rr <= deq_wid_rr_n;
|
||||
deq_instr <= deq_instr_n;
|
||||
end
|
||||
|
||||
assign decode_if.ready = ~q_full[decode_if.wid];
|
||||
|
@ -183,7 +185,6 @@ module VX_ibuffer #(
|
|||
|
||||
assign ibuffer_if.valid = deq_valid;
|
||||
assign ibuffer_if.wid = deq_wid;
|
||||
assign ibuffer_if.wid_n = deq_wid_n;
|
||||
assign {ibuffer_if.tmask,
|
||||
ibuffer_if.PC,
|
||||
ibuffer_if.ex_type,
|
||||
|
@ -195,8 +196,10 @@ module VX_ibuffer #(
|
|||
ibuffer_if.rs2,
|
||||
ibuffer_if.rs3,
|
||||
ibuffer_if.imm,
|
||||
ibuffer_if.use_PC,
|
||||
ibuffer_if.use_imm,
|
||||
ibuffer_if.used_regs} = deq_instr;
|
||||
ibuffer_if.use_PC,
|
||||
ibuffer_if.use_imm} = deq_instr[DATAW-1:`NUM_REGS];
|
||||
|
||||
assign ibuffer_if.used_regs_n = deq_instr_n[`NUM_REGS-1:0];
|
||||
assign ibuffer_if.wid_n = deq_wid_n;
|
||||
|
||||
endmodule
|
|
@ -27,7 +27,7 @@ module VX_instr_demux (
|
|||
wire gpu_req_ready;
|
||||
|
||||
VX_lzc #(
|
||||
.WIDTH (`NUM_THREADS)
|
||||
.N (`NUM_THREADS)
|
||||
) tid_select (
|
||||
.in_i (ibuffer_if.tmask),
|
||||
.cnt_o (tid),
|
||||
|
|
|
@ -283,13 +283,20 @@ module VX_mem_unit # (
|
|||
);
|
||||
end else begin
|
||||
// core to D-cache request
|
||||
assign dcache_req_tmp_if.valid = dcache_req_if.valid;
|
||||
assign dcache_req_tmp_if.addr = dcache_req_if.addr;
|
||||
assign dcache_req_tmp_if.rw = dcache_req_if.rw;
|
||||
assign dcache_req_tmp_if.byteen = dcache_req_if.byteen;
|
||||
assign dcache_req_tmp_if.data = dcache_req_if.data;
|
||||
assign dcache_req_tmp_if.tag = dcache_req_if.tag;
|
||||
assign dcache_req_if.ready = dcache_req_tmp_if.ready;
|
||||
for (genvar i = 0; i < `DNUM_REQS; ++i) begin
|
||||
VX_skid_buffer #(
|
||||
.DATAW ((32-`CLOG2(`DWORD_SIZE)) + 1 + `DWORD_SIZE + (8*`DWORD_SIZE) + `DCORE_TAG_WIDTH)
|
||||
) req_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (dcache_req_if.valid[i]),
|
||||
.data_in ({dcache_req_if.addr[i], dcache_req_if.rw[i], dcache_req_if.byteen[i], dcache_req_if.data[i], dcache_req_if.tag[i]}),
|
||||
.ready_in (dcache_req_if.ready[i]),
|
||||
.valid_out (dcache_req_tmp_if.valid[i]),
|
||||
.data_out ({dcache_req_tmp_if.addr[i], dcache_req_tmp_if.rw[i], dcache_req_tmp_if.byteen[i], dcache_req_tmp_if.data[i], dcache_req_tmp_if.tag[i]}),
|
||||
.ready_out (dcache_req_tmp_if.ready[i])
|
||||
);
|
||||
end
|
||||
|
||||
// D-cache to core response
|
||||
assign dcache_rsp_if.valid = dcache_rsp_tmp_if.valid;
|
||||
|
|
|
@ -14,7 +14,7 @@ module VX_scoreboard #(
|
|||
|
||||
reg [`NUM_REGS-1:0] deq_inuse_regs;
|
||||
|
||||
assign delay = |(deq_inuse_regs & ibuffer_if.used_regs);
|
||||
assign delay = (| deq_inuse_regs);
|
||||
|
||||
wire reserve_reg = ibuffer_if.valid && ibuffer_if.ready && ibuffer_if.wb;
|
||||
|
||||
|
@ -36,7 +36,7 @@ module VX_scoreboard #(
|
|||
end else begin
|
||||
inuse_regs <= inuse_regs_n;
|
||||
end
|
||||
deq_inuse_regs <= inuse_regs_n[ibuffer_if.wid_n];
|
||||
deq_inuse_regs <= inuse_regs_n[ibuffer_if.wid_n] & ibuffer_if.used_regs_n;
|
||||
end
|
||||
|
||||
reg [31:0] deadlock_ctr;
|
||||
|
|
5
hw/rtl/cache/VX_bank.v
vendored
5
hw/rtl/cache/VX_bank.v
vendored
|
@ -118,9 +118,8 @@ module VX_bank #(
|
|||
wire creq_valid, creq_ready;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + `LINE_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH)),
|
||||
.SIZE (CREQ_SIZE),
|
||||
.OUTPUT_REG (CREQ_SIZE > 2)
|
||||
.DATAW (1 + `LINE_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH)),
|
||||
.SIZE (CREQ_SIZE)
|
||||
) core_req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -7,7 +7,6 @@ interface VX_ibuffer_if ();
|
|||
|
||||
wire valid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NW_BITS-1:0] wid_n;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
wire [`EX_BITS-1:0] ex_type;
|
||||
|
@ -21,9 +20,11 @@ interface VX_ibuffer_if ();
|
|||
wire [31:0] imm;
|
||||
wire use_PC;
|
||||
wire use_imm;
|
||||
wire [`NUM_REGS-1:0] used_regs;
|
||||
wire ready;
|
||||
|
||||
wire [`NUM_REGS-1:0] used_regs_n;
|
||||
wire [`NW_BITS-1:0] wid_n;
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
Loading…
Add table
Add a link
Reference in a new issue