mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
removing pipeline additional registers
This commit is contained in:
parent
efbe4a07ef
commit
b211b29670
9 changed files with 74 additions and 91 deletions
|
@ -19,19 +19,6 @@ module VX_alu_unit #(
|
|||
reg [`NUM_THREADS-1:0][31:0] shift_result;
|
||||
reg [`NUM_THREADS-1:0][31:0] misc_result;
|
||||
|
||||
wire valid_r;
|
||||
wire [`NW_BITS-1:0] wid_r;
|
||||
wire [`NUM_THREADS-1:0] thread_mask_r;
|
||||
wire [31:0] curr_PC_r;
|
||||
wire [`NR_BITS-1:0] rd_r;
|
||||
wire wb_r;
|
||||
wire [`NT_BITS-1:0] tid_r;
|
||||
wire is_sub_r;
|
||||
wire [`BR_BITS-1:0] br_op_r;
|
||||
wire is_br_op_r, is_br_op_s;
|
||||
wire [1:0] alu_op_class_r;
|
||||
wire [31:0] next_PC_r;
|
||||
|
||||
wire is_br_op = alu_req_if.is_br_op;
|
||||
wire [`ALU_BITS-1:0] alu_op = `ALU_OP(alu_req_if.op_type);
|
||||
wire [`BR_BITS-1:0] br_op = `BR_OP(alu_req_if.op_type);
|
||||
|
@ -47,16 +34,16 @@ module VX_alu_unit #(
|
|||
wire [`NUM_THREADS-1:0][31:0] alu_in2_less = (alu_req_if.rs2_is_imm && ~is_br_op) ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
always @(posedge clk) begin
|
||||
add_result[i] <= alu_in1_PC[i] + alu_in2_imm[i];
|
||||
always @(*) begin
|
||||
add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
wire [32:0] sub_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]};
|
||||
wire [32:0] sub_in2 = {alu_signed & alu_in2_less[i][31], alu_in2_less[i]};
|
||||
always @(posedge clk) begin
|
||||
sub_result[i] <= $signed(sub_in1) - $signed(sub_in2);
|
||||
always @(*) begin
|
||||
sub_result[i] = $signed(sub_in1) - $signed(sub_in2);
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -65,79 +52,70 @@ module VX_alu_unit #(
|
|||
`IGNORE_WARNINGS_BEGIN
|
||||
wire [32:0] shift_value = $signed(shift_in1) >>> alu_in2_imm[i][4:0];
|
||||
`IGNORE_WARNINGS_END
|
||||
always @(posedge clk) begin
|
||||
shift_result[i] <= shift_value[31:0];
|
||||
always @(*) begin
|
||||
shift_result[i] = shift_value[31:0];
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
always @(posedge clk) begin
|
||||
always @(*) begin
|
||||
case (alu_op)
|
||||
`ALU_AND: misc_result[i] <= alu_in1[i] & alu_in2_imm[i];
|
||||
`ALU_OR: misc_result[i] <= alu_in1[i] | alu_in2_imm[i];
|
||||
`ALU_XOR: misc_result[i] <= alu_in1[i] ^ alu_in2_imm[i];
|
||||
`ALU_AND: misc_result[i] = alu_in1[i] & alu_in2_imm[i];
|
||||
`ALU_OR: misc_result[i] = alu_in1[i] | alu_in2_imm[i];
|
||||
`ALU_XOR: misc_result[i] = alu_in1[i] ^ alu_in2_imm[i];
|
||||
//`ALU_SLL,
|
||||
default: misc_result[i] <= alu_in1[i] << alu_in2_imm[i][4:0];
|
||||
default: misc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0];
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
wire [31:0] next_PC = alu_req_if.curr_PC + 4;
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `NT_BITS + 1 + 1 + `BR_BITS + 2 + 32),
|
||||
.DEPTH(1)
|
||||
) alu_shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(alu_req_if.ready),
|
||||
.in({alu_req_if.valid, alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, is_sub, is_br_op, br_op, alu_op_class, next_PC}),
|
||||
.out({valid_r, wid_r, thread_mask_r, curr_PC_r, rd_r, wb_r, tid_r, is_sub_r, is_br_op_r, br_op_r, alu_op_class_r, next_PC_r})
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
always @(*) begin
|
||||
case (alu_op_class_r)
|
||||
0: alu_result[i] = is_sub_r ? sub_result[i][31:0] : add_result[i];
|
||||
case (alu_op_class)
|
||||
0: alu_result[i] = is_sub ? sub_result[i][31:0] : add_result[i];
|
||||
1: alu_result[i] = {31'b0, sub_result[i][32]};
|
||||
2: alu_result[i] = shift_result[i];
|
||||
default: alu_result[i] = misc_result[i];
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
// branch handling
|
||||
|
||||
wire br_neg = `BR_NEG(br_op_r);
|
||||
wire br_less = `BR_LESS(br_op_r);
|
||||
wire br_static = `BR_STATIC(br_op_r);
|
||||
wire is_jal = is_br_op_r && (br_op_r == `BR_JAL || br_op_r == `BR_JALR);
|
||||
|
||||
wire [31:0] br_dest = add_result[tid_r];
|
||||
wire [32:0] cmp_result = sub_result[tid_r];
|
||||
wire is_less = cmp_result[32];
|
||||
wire is_equal = ~(| cmp_result[31:0]);
|
||||
wire br_taken = ((br_less ? is_less : is_equal) ^ br_neg) | br_static;
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{next_PC_r}} : alu_result;
|
||||
wire is_jal = is_br_op && (br_op == `BR_JAL || br_op == `BR_JALR);
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result;
|
||||
|
||||
wire [31:0] br_dest = add_result[alu_req_if.tid];
|
||||
wire [32:0] cmp_result = sub_result[alu_req_if.tid];
|
||||
wire is_less = cmp_result[32];
|
||||
wire is_equal = ~(| cmp_result[31:0]);
|
||||
|
||||
wire is_br_op_r, is_less_r, is_equal_r;
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
wire [`BR_BITS-1:0] br_op_r;
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
// output
|
||||
|
||||
wire stall_out = ~alu_commit_if.ready && alu_commit_if.valid;
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + 1 + 32)
|
||||
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `BR_BITS + 1 + 1 + 32)
|
||||
) alu_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (stall_out),
|
||||
.flush (1'b0),
|
||||
.in ({valid_r, wid_r, thread_mask_r, curr_PC_r, rd_r, wb_r, alu_jal_result, is_br_op_r, br_taken, br_dest}),
|
||||
.out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.thread_mask, alu_commit_if.curr_PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_s, branch_ctl_if.taken, branch_ctl_if.dest})
|
||||
.in ({alu_req_if.valid, alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}),
|
||||
.out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.thread_mask, alu_commit_if.curr_PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_r, br_op_r, is_less_r, is_equal_r, branch_ctl_if.dest})
|
||||
);
|
||||
|
||||
assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_s;
|
||||
wire br_neg = `BR_NEG(br_op_r);
|
||||
wire br_less = `BR_LESS(br_op_r);
|
||||
wire br_static = `BR_STATIC(br_op_r);
|
||||
wire br_taken = ((br_less ? is_less_r : is_equal_r) ^ br_neg) | br_static;
|
||||
|
||||
assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_r;
|
||||
assign branch_ctl_if.wid = alu_commit_if.wid;
|
||||
assign branch_ctl_if.taken = br_taken;
|
||||
|
||||
// can accept new request?
|
||||
assign alu_req_if.ready = ~stall_out;
|
||||
|
|
|
@ -13,6 +13,9 @@ module VX_gpu_unit #(
|
|||
VX_warp_ctl_if warp_ctl_if,
|
||||
VX_exu_to_cmt_if gpu_commit_if
|
||||
);
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
gpu_tmc_t tmc;
|
||||
gpu_wspawn_t wspawn;
|
||||
gpu_barrier_t barrier;
|
||||
|
@ -58,7 +61,7 @@ module VX_gpu_unit #(
|
|||
assign split.diverged = (| split_then_mask) && (| split_else_mask);
|
||||
assign split.then_mask = split_then_mask;
|
||||
assign split.else_mask = split_else_mask;
|
||||
assign split.pc = gpu_req_if.curr_PC + 4;
|
||||
assign split.pc = gpu_req_if.next_PC;
|
||||
|
||||
// barrier
|
||||
|
||||
|
@ -68,23 +71,21 @@ module VX_gpu_unit #(
|
|||
|
||||
// output
|
||||
|
||||
wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid;
|
||||
assign warp_ctl_if.valid = gpu_req_if.valid && gpu_commit_if.ready;
|
||||
assign warp_ctl_if.wid = gpu_commit_if.wid;
|
||||
assign warp_ctl_if.tmc = tmc;
|
||||
assign warp_ctl_if.wspawn = wspawn;
|
||||
assign warp_ctl_if.split = split;
|
||||
assign warp_ctl_if.barrier = barrier;
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + $bits(gpu_tmc_t) + $bits(gpu_wspawn_t) + $bits(gpu_split_t) + $bits(gpu_barrier_t))
|
||||
) gpu_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (stall),
|
||||
.flush (1'b0),
|
||||
.in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.thread_mask, gpu_req_if.curr_PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}),
|
||||
.out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.thread_mask, gpu_commit_if.curr_PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier})
|
||||
);
|
||||
|
||||
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready;
|
||||
assign warp_ctl_if.wid = gpu_commit_if.wid;
|
||||
assign gpu_commit_if.valid = gpu_req_if.valid;
|
||||
assign gpu_commit_if.wid = gpu_req_if.wid;
|
||||
assign gpu_commit_if.thread_mask = gpu_req_if.thread_mask;
|
||||
assign gpu_commit_if.curr_PC = gpu_req_if.curr_PC;
|
||||
assign gpu_commit_if.rd = gpu_req_if.rd;
|
||||
assign gpu_commit_if.wb = gpu_req_if.wb;
|
||||
|
||||
// can accept new request?
|
||||
assign gpu_req_if.ready = ~stall;
|
||||
assign gpu_req_if.ready = gpu_commit_if.ready;
|
||||
|
||||
endmodule
|
|
@ -17,12 +17,6 @@ module VX_instr_demux (
|
|||
VX_fpu_req_if fpu_req_if,
|
||||
VX_gpu_req_if gpu_req_if
|
||||
);
|
||||
// ALU unit
|
||||
|
||||
wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU);
|
||||
wire alu_req_ready;
|
||||
wire is_br_op = `IS_BR_MOD(execute_if.op_mod);
|
||||
|
||||
wire [`NT_BITS-1:0] tid;
|
||||
VX_priority_encoder #(
|
||||
.N(`NUM_THREADS)
|
||||
|
@ -32,15 +26,23 @@ module VX_instr_demux (
|
|||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
|
||||
wire [31:0] next_PC = execute_if.curr_PC + 4;
|
||||
|
||||
// ALU unit
|
||||
|
||||
wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU);
|
||||
wire alu_req_ready;
|
||||
wire is_br_op = `IS_BR_MOD(execute_if.op_mod);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `ALU_BR_BITS + 1 + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS)
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BR_BITS + 1 + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS)
|
||||
) alu_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.ready_in (alu_req_ready),
|
||||
.valid_in (alu_req_valid),
|
||||
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `ALU_BR_OP(execute_if.op_type), is_br_op, execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}),
|
||||
.data_out ({alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.op_type, alu_req_if.is_br_op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid}),
|
||||
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, next_PC, `ALU_BR_OP(execute_if.op_type), is_br_op, execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}),
|
||||
.data_out ({alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.is_br_op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid}),
|
||||
.ready_out (alu_req_if.ready),
|
||||
.valid_out (alu_req_if.valid)
|
||||
);
|
||||
|
@ -196,14 +198,14 @@ module VX_instr_demux (
|
|||
wire gpu_req_ready;
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `GPU_BITS + `NR_BITS + 1)
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1)
|
||||
) gpu_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.ready_in (gpu_req_ready),
|
||||
.valid_in (gpu_req_valid),
|
||||
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `GPU_OP(execute_if.op_type), execute_if.rd, execute_if.wb}),
|
||||
.data_out ({gpu_req_if.wid, gpu_req_if.thread_mask, gpu_req_if.curr_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb}),
|
||||
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, next_PC, `GPU_OP(execute_if.op_type), execute_if.rd, execute_if.wb}),
|
||||
.data_out ({gpu_req_if.wid, gpu_req_if.thread_mask, gpu_req_if.curr_PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb}),
|
||||
.ready_out (gpu_req_if.ready),
|
||||
.valid_out (gpu_req_if.valid)
|
||||
);
|
||||
|
|
|
@ -16,9 +16,9 @@ module VX_scoreboard #(
|
|||
reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0];
|
||||
reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
|
||||
|
||||
wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs;
|
||||
wire [`NUM_REGS-1:0] inuse_regs = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs;
|
||||
|
||||
assign delay = (| inuse_mask);
|
||||
assign delay = (| inuse_regs);
|
||||
|
||||
wire reserve_reg = ibuf_deq_if.valid && ibuf_deq_if.ready && (ibuf_deq_if.wb != 0);
|
||||
|
||||
|
@ -55,7 +55,7 @@ module VX_scoreboard #(
|
|||
if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin
|
||||
$display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b, exe=%b, gpr=%b",
|
||||
$time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.curr_PC, ibuf_deq_if.rd, ibuf_deq_if.wb,
|
||||
inuse_mask[ibuf_deq_if.rd], inuse_mask[ibuf_deq_if.rs1], inuse_mask[ibuf_deq_if.rs2], inuse_mask[ibuf_deq_if.rs3], exe_delay, gpr_delay);
|
||||
inuse_reg_mask[ibuf_deq_if.rd], inuse_reg_mask[ibuf_deq_if.rs1], inuse_reg_mask[ibuf_deq_if.rs2], inuse_reg_mask[ibuf_deq_if.rs3], exe_delay, gpr_delay);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -284,7 +284,7 @@ module VX_fp_fpga #(
|
|||
tag_out_r = 'x;
|
||||
for (integer i = 0; i < NUM_FPC; i++) begin
|
||||
if (per_core_valid_out[i]) begin
|
||||
per_core_ready_out[i] = 1;
|
||||
per_core_ready_out[i] = ready_out;
|
||||
valid_out_r = 1;
|
||||
has_fflags_r = fpnew_has_fflags && (i == 0);
|
||||
result_r = per_core_result[i];
|
||||
|
|
|
@ -82,7 +82,7 @@ module VX_fp_noncomp #(
|
|||
.o_type(tmp_b_type)
|
||||
);
|
||||
|
||||
wire tmp_a_smaller = (dataa[i] < datab[i]) ^ (tmp_a_sign || tmp_b_sign);
|
||||
wire tmp_a_smaller = $signed(dataa[i]) < $signed(datab[i]);
|
||||
wire tmp_ab_equal = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]);
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
|
|
@ -10,6 +10,7 @@ interface VX_alu_req_if ();
|
|||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] thread_mask;
|
||||
wire [31:0] curr_PC;
|
||||
wire [31:0] next_PC;
|
||||
wire [`ALU_BR_BITS-1:0] op_type;
|
||||
wire is_br_op;
|
||||
wire rs1_is_PC;
|
||||
|
|
|
@ -10,6 +10,7 @@ interface VX_gpu_req_if();
|
|||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] thread_mask;
|
||||
wire [31:0] curr_PC;
|
||||
wire [31:0] next_PC;
|
||||
wire [`GPU_BITS-1:0] op_type;
|
||||
wire [`NUM_THREADS-1:0][31:0] rs1_data;
|
||||
wire [31:0] rs2_data;
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#include <fstream>
|
||||
#include <iomanip>
|
||||
|
||||
#define ALL_TESTS
|
||||
//#define ALL_TESTS
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
if (argc == 1) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue