mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
pipeline optimization
This commit is contained in:
parent
60e05ae19a
commit
27e95530ef
20 changed files with 184 additions and 340 deletions
|
@ -46,9 +46,9 @@ module VX_alu_unit #(
|
|||
default: alu_result[i] = alu_in1[i] + alu_in2[i]; // ADD, LUI, AUIPC
|
||||
endcase
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire [`NT_BITS-1:0] br_result_index;
|
||||
wire [`NT_BITS-1:0] br_result_index, br_result_index_o;
|
||||
|
||||
VX_priority_encoder #(
|
||||
.N(`NUM_THREADS)
|
||||
|
@ -58,15 +58,35 @@ module VX_alu_unit #(
|
|||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
|
||||
wire [32:0] br_result = sub_result[br_result_index];
|
||||
wire br_sign = br_result[32];
|
||||
wire [`BR_BITS-1:0] br_op = `IS_BR_OP(alu_req_if.alu_op) ? `BR_OP(alu_req_if.alu_op) : 0;
|
||||
wire [`BR_BITS-1:0] br_op_o;
|
||||
|
||||
wire [31:0] br_addr = (br_op == `BR_JALR) ? alu_req_if.rs1_data[br_result_index] : alu_req_if.curr_PC;
|
||||
wire [31:0] br_dest = $signed(br_addr) + $signed(alu_req_if.offset);
|
||||
|
||||
wire is_jal = (alu_op == `ALU_JAL || alu_op == `ALU_JALR);
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result;
|
||||
|
||||
wire stall = ~alu_commit_if.ready && alu_commit_if.valid;
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + `NW_BITS + `ISTAG_BITS + (`NUM_THREADS * 32) + `BR_BITS + 32 + `NT_BITS)
|
||||
) alu_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (stall),
|
||||
.flush (0),
|
||||
.in ({alu_req_if.valid, alu_req_if.warp_num, alu_req_if.issue_tag, alu_jal_result, br_op, br_dest, br_result_index}),
|
||||
.out ({alu_commit_if.valid, branch_ctl_if.warp_num, alu_commit_if.issue_tag, alu_commit_if.data, br_op_o, branch_ctl_if.dest, br_result_index_o})
|
||||
);
|
||||
|
||||
wire [31:0] br_result = alu_commit_if.data[br_result_index_o];
|
||||
wire br_sign = br_result[31];
|
||||
wire br_nzero = (| br_result[31:0]);
|
||||
|
||||
wire [`BR_BITS-1:0] br_op = `BR_OP(alu_req_if.alu_op);
|
||||
|
||||
reg br_taken;
|
||||
always @(*) begin
|
||||
case (br_op)
|
||||
case (br_op_o)
|
||||
`BR_NE: br_taken = br_nzero;
|
||||
`BR_EQ: br_taken = ~br_nzero;
|
||||
`BR_LT,
|
||||
|
@ -75,39 +95,10 @@ module VX_alu_unit #(
|
|||
`BR_GEU: br_taken = ~br_sign;
|
||||
default: br_taken = 1'b1;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
wire [31:0] br_addr = (br_op == `BR_JALR) ? alu_req_if.rs1_data[br_result_index] : alu_req_if.curr_PC;
|
||||
wire [31:0] br_dest = $signed(br_addr) + $signed(alu_req_if.offset);
|
||||
|
||||
wire is_jal = (alu_op == `ALU_JAL || alu_op == `ALU_JALR);
|
||||
wire is_br_valid = `IS_BR_OP(alu_op) && alu_req_if.valid;
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result;
|
||||
|
||||
wire stall = ~alu_commit_if.ready && alu_commit_if.valid;
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + `NW_BITS + 1 + 32)
|
||||
) branch_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (stall),
|
||||
.flush (0),
|
||||
.in ({is_br_valid, alu_req_if.warp_num, br_taken, br_dest}),
|
||||
.out ({branch_ctl_if.valid, branch_ctl_if.warp_num, branch_ctl_if.taken, branch_ctl_if.dest})
|
||||
);
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + `ISTAG_BITS + (`NUM_THREADS * 32))
|
||||
) alu_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (stall),
|
||||
.flush (0),
|
||||
.in ({alu_req_if.valid, alu_req_if.issue_tag, alu_jal_result}),
|
||||
.out ({alu_commit_if.valid, alu_commit_if.issue_tag, alu_commit_if.data})
|
||||
);
|
||||
assign branch_ctl_if.valid = alu_req_if.valid && (br_op_o != 0);
|
||||
assign branch_ctl_if.taken = br_taken;
|
||||
|
||||
assign alu_req_if.ready = ~stall;
|
||||
|
||||
|
|
|
@ -28,7 +28,7 @@
|
|||
`endif
|
||||
|
||||
`ifndef NUM_CSRS
|
||||
`define NUM_CSRS 1024
|
||||
`define NUM_CSRS 64
|
||||
`endif
|
||||
|
||||
`ifndef STARTUP_ADDR
|
||||
|
@ -57,7 +57,7 @@
|
|||
|
||||
`define EXT_M_ENABLE
|
||||
|
||||
//`define EXT_F_ENABLE
|
||||
`define EXT_F_ENABLE
|
||||
|
||||
// Configuration Values =======================================================
|
||||
|
||||
|
|
|
@ -35,10 +35,10 @@ module VX_decode #(
|
|||
wire [6:0] func7 = instr[31:25];
|
||||
wire [11:0] u_12 = instr[31:20];
|
||||
|
||||
wire [`NR_BITS-1:0] rd = instr[11:7];
|
||||
wire [`NR_BITS-1:0] rs1 = instr[19:15];
|
||||
wire [`NR_BITS-1:0] rs2 = instr[24:20];
|
||||
wire [`NR_BITS-1:0] rs3 = instr[31:27];
|
||||
wire [4:0] rd = instr[11:7];
|
||||
wire [4:0] rs1 = instr[19:15];
|
||||
wire [4:0] rs2 = instr[24:20];
|
||||
wire [4:0] rs3 = instr[31:27];
|
||||
|
||||
// opcode types
|
||||
wire is_rtype = (opcode == `INST_R);
|
||||
|
@ -202,7 +202,7 @@ module VX_decode #(
|
|||
wire is_fcvtf = is_fci && (func7 == 7'h68); // convert to float
|
||||
wire is_fmvcls = is_fci && (func7 == 7'h70 || func7 == 7'h78); // move + class
|
||||
wire is_fr4 = is_fmadd || is_fmsub || is_fnmsub || is_fnmadd;
|
||||
wire is_fpu = (is_fl || is_fs || is_fci || is_fr4);
|
||||
wire is_fpu = (is_fl || is_fs || is_fci || is_fr4);
|
||||
|
||||
always @(*) begin
|
||||
fpu_op = `FPU_OTHER;
|
||||
|
@ -242,7 +242,8 @@ module VX_decode #(
|
|||
wire is_fcvtf = 0;
|
||||
wire is_fmvcls = 0;
|
||||
wire is_fr4 = 0;
|
||||
wire is_fpu = 0;
|
||||
wire is_fpu = 0;
|
||||
|
||||
always @(*) begin
|
||||
fpu_op = `FPU_OTHER;
|
||||
end
|
||||
|
@ -271,6 +272,29 @@ module VX_decode #(
|
|||
endcase
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire use_rd = (is_fl || is_fci || is_fr4)
|
||||
|| ((rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype));
|
||||
|
||||
wire use_rs1 = is_fpu
|
||||
|| is_gpu
|
||||
|| ((is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || ~is_csr_imm || is_gpu) && (rs1 != 0));
|
||||
|
||||
wire use_rs2 = (is_fpu && ~(is_fl || (fpu_op == `FPU_SQRT) || is_fcvti || is_fcvtf || is_fmvcls))
|
||||
|| (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN))
|
||||
|| ((is_btype || is_stype || is_rtype) && (rs2 != 0));
|
||||
|
||||
wire use_rs3 = is_fr4;
|
||||
|
||||
wire rd_is_fp = is_fpu && ~(is_fcmp || is_fcvti || (fpu_op == `FPU_MVXW || fpu_op == `FPU_CLASS));
|
||||
wire rs1_is_fp = is_fr4 || (is_fci && ~(is_fcvtf || (fpu_op == `FPU_MVWX)));
|
||||
wire rs2_is_fp = is_fs || is_fr4 || is_fci;
|
||||
|
||||
wire [4:0] rs1_qual = is_lui ? 5'h0 : rs1;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_decode_if decode_tmp_if();
|
||||
|
||||
assign decode_tmp_if.valid = ifetch_rsp_if.valid;
|
||||
|
@ -297,9 +321,26 @@ module VX_decode #(
|
|||
(is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) :
|
||||
0;
|
||||
|
||||
assign decode_tmp_if.rd = rd;
|
||||
assign decode_tmp_if.rs1 = is_lui ? `NR_BITS'(0) : rs1;
|
||||
assign decode_tmp_if.rs2 = rs2;
|
||||
assign decode_tmp_if.wb = use_rd;
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign decode_tmp_if.rd = {rd_is_fp, rd};
|
||||
assign decode_tmp_if.rs1 = {rs1_is_fp, rs1_qual};
|
||||
assign decode_tmp_if.rs2 = {rs2_is_fp, rs2};
|
||||
assign decode_tmp_if.rs3 = {1'b1, rs3};
|
||||
`else
|
||||
assign decode_tmp_if.rd = rd;
|
||||
assign decode_tmp_if.rs1 = rs1_qual;
|
||||
assign decode_tmp_if.rs2 = rs2;
|
||||
assign decode_tmp_if.rs3 = rs3;
|
||||
`endif
|
||||
|
||||
assign decode_tmp_if.use_rs3 = use_rs3;
|
||||
|
||||
assign decode_tmp_if.reg_use_mask = ((`NUM_REGS)'(use_rd) << rd)
|
||||
| ((`NUM_REGS)'(use_rs1) << rs1_qual)
|
||||
| ((`NUM_REGS)'(use_rs2) << rs2)
|
||||
| ((`NUM_REGS)'(use_rs3) << rs3);
|
||||
|
||||
assign decode_tmp_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} :
|
||||
(is_jal || is_jalr || is_jals) ? jalx_offset :
|
||||
|
@ -308,26 +349,8 @@ module VX_decode #(
|
|||
|
||||
assign decode_tmp_if.rs1_is_PC = is_auipc;
|
||||
assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm;
|
||||
|
||||
assign decode_tmp_if.use_rs1 = is_fpu
|
||||
|| is_gpu
|
||||
|| ((is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || ~is_csr_imm || is_gpu)
|
||||
&& (decode_tmp_if.rs1 != 0));
|
||||
|
||||
assign decode_tmp_if.use_rs2 = (is_fpu && ~(is_fl || (fpu_op == `FPU_SQRT) || is_fcvti || is_fcvtf || is_fmvcls))
|
||||
|| (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN))
|
||||
|| ((is_btype || is_stype || is_rtype)
|
||||
&& (decode_tmp_if.rs2 != 0));
|
||||
|
||||
assign decode_tmp_if.rd_is_fp = is_fpu && ~(is_fcmp || is_fcvti || (fpu_op == `FPU_MVXW || fpu_op == `FPU_CLASS));
|
||||
assign decode_tmp_if.rs1_is_fp = is_fr4 || (is_fci && ~(is_fcvtf || (fpu_op == `FPU_MVWX)));
|
||||
assign decode_tmp_if.rs2_is_fp = is_fs || is_fr4 || is_fci;
|
||||
assign decode_tmp_if.rs3 = rs3;
|
||||
assign decode_tmp_if.use_rs3 = is_fr4;
|
||||
assign decode_tmp_if.frm = func3;
|
||||
|
||||
assign decode_tmp_if.wb = (is_fl || is_fci || is_fr4)
|
||||
|| ((rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype));
|
||||
|
||||
assign decode_tmp_if.frm = func3;
|
||||
|
||||
assign join_if.is_join = in_valid && is_gpu && (gpu_op == `GPU_JOIN);
|
||||
assign join_if.warp_num = ifetch_rsp_if.warp_num;
|
||||
|
@ -338,14 +361,14 @@ module VX_decode #(
|
|||
wire stall = ~decode_if.ready && decode_if.valid;
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + 1 + `FRM_BITS)
|
||||
.N(1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + `FRM_BITS + `NUM_REGS)
|
||||
) decode_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (stall),
|
||||
.flush (0),
|
||||
.in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.rd_is_fp, decode_tmp_if.frm}),
|
||||
.out ({decode_if.valid, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.rd_is_fp, decode_if.frm})
|
||||
.in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, decode_tmp_if.reg_use_mask}),
|
||||
.out ({decode_if.valid, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, decode_if.reg_use_mask})
|
||||
);
|
||||
|
||||
assign ifetch_rsp_if.ready = ~stall;
|
||||
|
@ -357,7 +380,7 @@ module VX_decode #(
|
|||
print_ex_type(decode_tmp_if.ex_type);
|
||||
$write(", op=");
|
||||
print_ex_op(decode_tmp_if.ex_type, decode_tmp_if.ex_op);
|
||||
$write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b, use_rs3=%b, rd_is_fp=%b, rs1_is_fp=%b, rs2_is_fp=%b, frm=", decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.rs3, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.use_rs3, decode_tmp_if.rd_is_fp,decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp);
|
||||
$write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, frm=", decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.rs3, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm);
|
||||
print_frm(decode_tmp_if.frm);
|
||||
$write("\n");
|
||||
|
||||
|
|
|
@ -7,7 +7,6 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define QUEUE_FORCE_MLAB 1
|
||||
// `define SYNTHESIS 1
|
||||
// `define ASIC 1
|
||||
|
||||
|
@ -23,7 +22,11 @@
|
|||
|
||||
`define REQS_BITS `LOG2UP(NUM_REQUESTS)
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define NUM_REGS 64
|
||||
`else
|
||||
`define NUM_REGS 32
|
||||
`endif
|
||||
|
||||
`define NR_BITS `LOG2UP(`NUM_REGS)
|
||||
|
||||
|
@ -33,7 +36,9 @@
|
|||
|
||||
`define ISTAG_BITS `LOG2UP(`ISSUEQ_SIZE)
|
||||
|
||||
`define LATENCY_IDIV 23
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define LATENCY_IDIV 24
|
||||
`define LATENCY_IMUL 2
|
||||
|
||||
`define LATENCY_FMULADD 2
|
||||
|
@ -408,7 +413,6 @@ typedef struct packed {
|
|||
logic [`NUM_THREADS-1:0] thread_mask;
|
||||
logic [31:0] curr_PC;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic rd_is_fp;
|
||||
logic wb;
|
||||
} is_data_t;
|
||||
|
||||
|
|
|
@ -3,98 +3,49 @@
|
|||
// control module to support multi-cycle read for fp register
|
||||
|
||||
module VX_gpr_fp_ctrl (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire [`NUM_THREADS-1:0][31:0] rs1_int_data,
|
||||
input wire [`NUM_THREADS-1:0][31:0] rs2_int_data,
|
||||
input wire [`NUM_THREADS-1:0][31:0] rs1_fp_data,
|
||||
input wire [`NUM_THREADS-1:0][31:0] rs2_fp_data,
|
||||
input wire [`NUM_THREADS-1:0][31:0] rs1_data,
|
||||
input wire [`NUM_THREADS-1:0][31:0] rs2_data,
|
||||
|
||||
// outputs
|
||||
output wire [`NR_BITS-1:0] raddr1,
|
||||
output wire [`NR_BITS-1:0] raddr2,
|
||||
output wire [`NR_BITS-1:0] raddr1,
|
||||
|
||||
VX_gpr_read_if gpr_read_if
|
||||
VX_gpr_read_if gpr_read_if
|
||||
);
|
||||
// param
|
||||
localparam GPR_DELAY_WID = 1;
|
||||
reg [GPR_DELAY_WID-1:0] multi_cyc_state;
|
||||
|
||||
reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data;
|
||||
reg [`NUM_THREADS-1:0][31:0] tmp_rs2_data;
|
||||
reg [`NUM_THREADS-1:0][31:0] rs1_data;
|
||||
reg [`NUM_THREADS-1:0][31:0] rs2_data;
|
||||
reg [`NUM_THREADS-1:0][31:0] rs3_data;
|
||||
reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data;
|
||||
reg read_rs3;
|
||||
|
||||
wire gpr_delay;
|
||||
wire gpr_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && ~read_rs3;
|
||||
|
||||
wire gpr_fire = gpr_read_if.valid && gpr_read_if.ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
multi_cyc_state <= 0;
|
||||
read_rs3 <= 0;
|
||||
end else if (gpr_delay) begin
|
||||
multi_cyc_state <= 1;
|
||||
read_rs3 <= 1;
|
||||
end else if (gpr_fire) begin
|
||||
multi_cyc_state <= 0;
|
||||
read_rs3 <= 0;
|
||||
end
|
||||
end
|
||||
|
||||
// select rs1 data
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
tmp_rs1_data <= 0;
|
||||
end else begin
|
||||
if (gpr_delay) begin
|
||||
if (gpr_read_if.rs1_is_fp) begin
|
||||
tmp_rs1_data <= rs1_fp_data;
|
||||
end else begin
|
||||
tmp_rs1_data <= rs1_int_data;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// select rs2 data
|
||||
|
||||
always @(posedge clk) begin
|
||||
if(reset) begin
|
||||
tmp_rs2_data <= 0;
|
||||
end else begin
|
||||
if (gpr_delay) begin
|
||||
if (gpr_read_if.rs2_is_fp) begin
|
||||
tmp_rs2_data <= rs2_fp_data;
|
||||
end else begin
|
||||
tmp_rs2_data <= rs2_int_data;
|
||||
end
|
||||
end
|
||||
// backup original rs1 data
|
||||
always @(posedge clk) begin
|
||||
if (gpr_delay) begin
|
||||
tmp_rs1_data <= rs1_data;
|
||||
end
|
||||
end
|
||||
|
||||
// outputs
|
||||
|
||||
assign gpr_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && (0 == multi_cyc_state);
|
||||
|
||||
assign raddr1 = multi_cyc_state ? gpr_read_if.rs3 : gpr_read_if.rs1;
|
||||
assign raddr2 = gpr_read_if.rs2;
|
||||
|
||||
always @(*) begin
|
||||
if (gpr_read_if.use_rs3) begin
|
||||
rs1_data = tmp_rs1_data;
|
||||
rs2_data = tmp_rs2_data;
|
||||
rs3_data = rs1_fp_data;
|
||||
end else begin
|
||||
rs1_data = gpr_read_if.rs1_is_fp ? rs1_fp_data : rs1_int_data;
|
||||
rs2_data = gpr_read_if.rs2_is_fp ? rs2_fp_data : rs2_int_data;
|
||||
rs3_data = {`NUM_THREADS{32'h8000_0000}}; // default value: -0 in single fp
|
||||
end
|
||||
end
|
||||
assign raddr1 = read_rs3 ? gpr_read_if.rs3 : gpr_read_if.rs1;
|
||||
|
||||
assign gpr_read_if.ready = ~gpr_delay;
|
||||
assign gpr_read_if.rs1_data = rs1_data;
|
||||
assign gpr_read_if.rs1_data = gpr_read_if.use_rs3 ? tmp_rs1_data : rs1_data;
|
||||
assign gpr_read_if.rs2_data = rs2_data;
|
||||
assign gpr_read_if.rs3_data = rs3_data;
|
||||
assign gpr_read_if.rs3_data = rs1_data;
|
||||
|
||||
endmodule
|
|
@ -14,79 +14,52 @@ module VX_gpr_stage #(
|
|||
);
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] rs1_int_data [`NUM_WARPS-1:0];
|
||||
wire [`NUM_THREADS-1:0][31:0] rs2_int_data [`NUM_WARPS-1:0];
|
||||
wire [`NUM_THREADS-1:0][31:0] rs1_data [`NUM_WARPS-1:0];
|
||||
wire [`NUM_THREADS-1:0][31:0] rs2_data [`NUM_WARPS-1:0];
|
||||
|
||||
wire [`NR_BITS-1:0] raddr1;
|
||||
wire [`NR_BITS-1:0] raddr2;
|
||||
|
||||
genvar i;
|
||||
|
||||
for (i = 0; i < `NUM_WARPS; i++) begin
|
||||
wire [`NUM_WARPS-1:0] we = writeback_if.thread_mask & {`NUM_THREADS{writeback_if.valid && ~writeback_if.rd_is_fp && (i == writeback_if.warp_num)}};
|
||||
wire [`NUM_THREADS-1:0] we = writeback_if.thread_mask
|
||||
& {`NUM_THREADS{writeback_if.valid && (i == writeback_if.warp_num)}};
|
||||
VX_gpr_ram gpr_int_ram (
|
||||
.clk (clk),
|
||||
.we (we),
|
||||
.waddr (writeback_if.rd),
|
||||
.wdata (writeback_if.data),
|
||||
.rs1 (raddr1),
|
||||
.rs2 (raddr2),
|
||||
.rs1_data (rs1_int_data[i]),
|
||||
.rs2_data (rs2_int_data[i])
|
||||
.rs2 (gpr_read_if.rs2),
|
||||
.rs1_data (rs1_data[i]),
|
||||
.rs2_data (rs2_data[i])
|
||||
);
|
||||
end
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] rs1_fp_data [`NUM_WARPS-1:0];
|
||||
wire [`NUM_THREADS-1:0][31:0] rs2_fp_data [`NUM_WARPS-1:0];
|
||||
|
||||
for (i = 0; i < `NUM_WARPS; i++) begin
|
||||
wire [`NUM_WARPS-1:0] we = writeback_if.thread_mask & {`NUM_THREADS{writeback_if.valid && writeback_if.rd_is_fp && (i == writeback_if.warp_num)}};
|
||||
VX_gpr_ram gpr_fp_ram (
|
||||
.clk (clk),
|
||||
.we (we),
|
||||
.waddr (writeback_if.rd),
|
||||
.wdata (writeback_if.data),
|
||||
.rs1 (raddr1),
|
||||
.rs2 (raddr2),
|
||||
.rs1_data (rs1_fp_data[i]),
|
||||
.rs2_data (rs2_fp_data[i])
|
||||
);
|
||||
end
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_gpr_fp_ctrl VX_gpr_fp_ctrl (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
//inputs
|
||||
.rs1_int_data (rs1_int_data[gpr_read_if.warp_num]),
|
||||
.rs2_int_data (rs2_int_data[gpr_read_if.warp_num]),
|
||||
.rs1_fp_data (rs1_fp_data[gpr_read_if.warp_num]),
|
||||
.rs2_fp_data (rs2_fp_data[gpr_read_if.warp_num]),
|
||||
.rs1_data (rs1_data[gpr_read_if.warp_num]),
|
||||
.rs2_data (rs2_data[gpr_read_if.warp_num]),
|
||||
|
||||
// outputs
|
||||
.raddr1 (raddr1),
|
||||
.raddr2 (raddr2),
|
||||
.gpr_read_if (gpr_read_if)
|
||||
.raddr1 (raddr1),
|
||||
.gpr_read_if(gpr_read_if)
|
||||
);
|
||||
|
||||
`else
|
||||
assign raddr1 = gpr_read_if.rs1;
|
||||
assign raddr2 = gpr_read_if.rs2;
|
||||
assign gpr_read_if.rs1_data = rs1_int_data[gpr_read_if.warp_num];
|
||||
assign gpr_read_if.rs2_data = rs2_int_data[gpr_read_if.warp_num];
|
||||
assign gpr_read_if.rs1_data = rs1_data[gpr_read_if.warp_num];
|
||||
assign gpr_read_if.rs2_data = rs2_data[gpr_read_if.warp_num];
|
||||
assign gpr_read_if.rs3_data = 0;
|
||||
assign gpr_read_if.ready = 1;
|
||||
|
||||
wire valid = gpr_read_if.valid;
|
||||
wire rs1_is_fp = gpr_read_if.rs1_is_fp;
|
||||
wire rs2_is_fp = gpr_read_if.rs2_is_fp;
|
||||
wire use_rs3 = gpr_read_if.use_rs3;
|
||||
wire [`NR_BITS-1:0] rs3 = gpr_read_if.rs3;
|
||||
`UNUSED_VAR (valid);
|
||||
`UNUSED_VAR (rs1_is_fp);
|
||||
`UNUSED_VAR (rs2_is_fp);
|
||||
`UNUSED_VAR (use_rs3);
|
||||
`UNUSED_VAR (rs3);
|
||||
`endif
|
||||
|
|
|
@ -53,7 +53,7 @@ module VX_gpu_unit #(
|
|||
wire[`NUM_THREADS-1:0] split_new_use_mask;
|
||||
wire[`NUM_THREADS-1:0] split_new_later_mask;
|
||||
|
||||
for (i = 0; i < `NUM_THREADS; i++) begin : masks_init
|
||||
for (i = 0; i < `NUM_THREADS; i++) begin
|
||||
wire curr_bool = (gpu_req_if.rs1_data[i] == 32'b1);
|
||||
assign split_new_use_mask[i] = gpu_req_if.thread_mask[i] & (curr_bool);
|
||||
assign split_new_later_mask[i] = gpu_req_if.thread_mask[i] & (!curr_bool);
|
||||
|
|
|
@ -23,8 +23,6 @@ module VX_issue #(
|
|||
assign gpr_read_if.rs1 = decode_if.rs1;
|
||||
assign gpr_read_if.rs2 = decode_if.rs2;
|
||||
assign gpr_read_if.rs3 = decode_if.rs3;
|
||||
assign gpr_read_if.rs1_is_fp = decode_if.rs1_is_fp;
|
||||
assign gpr_read_if.rs2_is_fp = decode_if.rs2_is_fp;
|
||||
assign gpr_read_if.use_rs3 = decode_if.use_rs3;
|
||||
|
||||
wire [`ISTAG_BITS-1:0] issue_tag, issue_tmp_tag;
|
||||
|
@ -52,8 +50,7 @@ module VX_issue #(
|
|||
.mul_busy (mul_busy),
|
||||
.fpu_busy (fpu_busy),
|
||||
.gpu_busy (gpu_busy),
|
||||
.issue_tag (issue_tag),
|
||||
`UNUSED_PIN (is_empty)
|
||||
.issue_tag (issue_tag)
|
||||
);
|
||||
|
||||
VX_gpr_stage #(
|
||||
|
@ -72,14 +69,14 @@ module VX_issue #(
|
|||
wire flush = alu_req_if.ready && ~decode_if.ready;
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32))
|
||||
) decode_reg (
|
||||
.N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32))
|
||||
) issue_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (stall),
|
||||
.flush (flush),
|
||||
.in ({decode_if.valid, issue_tag, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.frm, gpr_read_if.rs1_data, gpr_read_if.rs2_data, gpr_read_if.rs3_data}),
|
||||
.out ({decode_tmp_if.valid, issue_tmp_tag, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.frm, gpr_data_tmp_if.rs1_data, gpr_data_tmp_if.rs2_data, gpr_data_tmp_if.rs3_data})
|
||||
.in ({decode_if.valid, issue_tag, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, gpr_read_if.rs1_data, gpr_read_if.rs2_data, gpr_read_if.rs3_data}),
|
||||
.out ({decode_tmp_if.valid, issue_tmp_tag, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, gpr_data_tmp_if.rs1_data, gpr_data_tmp_if.rs2_data, gpr_data_tmp_if.rs3_data})
|
||||
);
|
||||
|
||||
VX_issue_demux issue_demux (
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
`ifndef VX_PLATFORM
|
||||
`define VX_PLATFORM
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifndef NDEBUG
|
||||
`define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \
|
||||
x \
|
||||
|
@ -45,6 +47,12 @@
|
|||
`define ENABLE_TRACING /* verilator tracing_on */
|
||||
`define DISABLE_TRACING /* verilator tracing_off */
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define USE_FAST_BRAM (* syn_ramstyle = "mlab" *)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CLOG2(x) $clog2(x)
|
||||
`define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > (x)) ? 1 : 0))
|
||||
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)
|
||||
|
|
|
@ -16,106 +16,55 @@ module VX_scheduler #(
|
|||
input wire mul_busy,
|
||||
input wire fpu_busy,
|
||||
input wire gpu_busy,
|
||||
output wire [`ISTAG_BITS-1:0] issue_tag,
|
||||
output wire is_empty
|
||||
output wire [`ISTAG_BITS-1:0] issue_tag
|
||||
);
|
||||
localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1);
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
localparam NREGS = (`NUM_REGS * 2);
|
||||
reg inuse_table [`NUM_WARPS-1:0][NREGS-1:0];
|
||||
wire [`NR_BITS:0] read_rs1 = {decode_if.rs1_is_fp, decode_if.rs1};
|
||||
wire [`NR_BITS:0] read_rs2 = {decode_if.rs2_is_fp, decode_if.rs2};
|
||||
wire [`NR_BITS:0] read_rs3 = {1'b1, decode_if.rs3};
|
||||
wire [`NR_BITS:0] read_rd = {decode_if.rd_is_fp, decode_if.rd};
|
||||
wire [`NR_BITS:0] write_rd = {writeback_if.rd_is_fp, writeback_if.rd};
|
||||
wire rs3_inuse = inuse_table[decode_if.warp_num][read_rs3];
|
||||
`else
|
||||
localparam NREGS = `NUM_REGS;
|
||||
reg inuse_table [`NUM_WARPS-1:0][NREGS-1:0];
|
||||
wire [`NR_BITS-1:0] read_rs1 = decode_if.rs1;
|
||||
wire [`NR_BITS-1:0] read_rs2 = decode_if.rs2;
|
||||
wire [`NR_BITS-1:0] read_rd = decode_if.rd;
|
||||
wire [`NR_BITS-1:0] write_rd = writeback_if.rd;
|
||||
wire rs3_inuse = 0;
|
||||
`endif
|
||||
|
||||
reg [`NUM_THREADS-1:0] inuse_registers [`NUM_WARPS-1:0][NREGS-1:0];
|
||||
reg [CTVW-1:0] count_valid;
|
||||
localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1);
|
||||
reg [`NUM_THREADS-1:0] inuse_registers [`NUM_WARPS-1:0][`NUM_REGS-1:0];
|
||||
reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
|
||||
|
||||
wire rs1_inuse = inuse_table[decode_if.warp_num][read_rs1];
|
||||
wire rs2_inuse = inuse_table[decode_if.warp_num][read_rs2];
|
||||
wire rd_inuse = inuse_table[decode_if.warp_num][read_rd];
|
||||
wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[decode_if.warp_num] & decode_if.reg_use_mask;
|
||||
wire inuse_hazard = (inuse_mask != 0);
|
||||
|
||||
wire rs1_inuse_qual = rs1_inuse && decode_if.use_rs1;
|
||||
wire rs2_inuse_qual = rs2_inuse && decode_if.use_rs2;
|
||||
wire rs3_inuse_qual = rs3_inuse && decode_if.use_rs3;
|
||||
wire rd_inuse_qual = rd_inuse && decode_if.wb;
|
||||
|
||||
wire inuse_valid = (rd_inuse_qual || rs1_inuse_qual || rs2_inuse_qual || rs3_inuse_qual);
|
||||
|
||||
wire ex_stalled = ((gpr_busy)
|
||||
|| (alu_busy && (decode_if.ex_type == `EX_ALU))
|
||||
wire exu_stalled = (alu_busy && (decode_if.ex_type == `EX_ALU))
|
||||
|| (lsu_busy && (decode_if.ex_type == `EX_LSU))
|
||||
|| (csr_busy && (decode_if.ex_type == `EX_CSR))
|
||||
|| (mul_busy && (decode_if.ex_type == `EX_MUL))
|
||||
|| (fpu_busy && (decode_if.ex_type == `EX_FPU))
|
||||
|| (gpu_busy && (decode_if.ex_type == `EX_GPU)));
|
||||
|| (gpu_busy && (decode_if.ex_type == `EX_GPU));
|
||||
|
||||
wire issue_buf_full;
|
||||
|
||||
wire stall = (ex_stalled || inuse_valid || issue_buf_full) && decode_if.valid;
|
||||
wire stall = (gpr_busy || exu_stalled || inuse_hazard || issue_buf_full) && decode_if.valid;
|
||||
|
||||
wire acquire_rd = decode_if.valid && (decode_if.wb != 0) && ~stall;
|
||||
|
||||
wire release_rd = writeback_if.valid;
|
||||
|
||||
wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[writeback_if.warp_num][write_rd] & ~writeback_if.thread_mask;
|
||||
wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[writeback_if.warp_num][writeback_if.rd] & ~writeback_if.thread_mask;
|
||||
|
||||
reg [CTVW-1:0] count_valid_next = (acquire_rd && !(release_rd && (0 == inuse_registers_n))) ? (count_valid + 1) :
|
||||
(~acquire_rd && (release_rd && (0 == inuse_registers_n))) ? (count_valid - 1) :
|
||||
count_valid;
|
||||
always @(posedge clk) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
integer i, w;
|
||||
for (w = 0; w < `NUM_WARPS; w++) begin
|
||||
for (i = 0; i < NREGS; i++) begin
|
||||
inuse_registers[w][i] <= 0;
|
||||
inuse_table[w][i] <= 0;
|
||||
for (i = 0; i < `NUM_REGS; i++) begin
|
||||
inuse_registers[w][i] <= 0;
|
||||
end
|
||||
inuse_reg_mask[w] <= 0;
|
||||
end
|
||||
count_valid <= 0;
|
||||
end else begin
|
||||
if (acquire_rd) begin
|
||||
inuse_registers[decode_if.warp_num][read_rd] <= decode_if.thread_mask;
|
||||
inuse_table[decode_if.warp_num][read_rd] <= 1;
|
||||
inuse_registers[decode_if.warp_num][decode_if.rd] <= decode_if.thread_mask;
|
||||
inuse_reg_mask[decode_if.warp_num][decode_if.rd] <= 1;
|
||||
end
|
||||
if (release_rd) begin
|
||||
assert(inuse_table[writeback_if.warp_num][write_rd] != 0);
|
||||
inuse_registers[writeback_if.warp_num][write_rd] <= inuse_registers_n;
|
||||
inuse_table[writeback_if.warp_num][write_rd] <= (| inuse_registers_n);
|
||||
assert(inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] != 0);
|
||||
inuse_registers[writeback_if.warp_num][writeback_if.rd] <= inuse_registers_n;
|
||||
inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] <= (| inuse_registers_n);
|
||||
end
|
||||
count_valid <= count_valid_next;
|
||||
end
|
||||
end
|
||||
|
||||
wire ib_acquire = decode_if.valid && ~stall;
|
||||
|
||||
`DEBUG_BLOCK(
|
||||
wire [`NW_BITS-1:0] cis_alu_warp_num = cmt_to_issue_if.alu_data.warp_num;
|
||||
wire [`NUM_THREADS-1:0] cis_alu_thread_mask = cmt_to_issue_if.alu_data.thread_mask;
|
||||
wire [31:0] cis_alu_curr_PC = cmt_to_issue_if.alu_data.curr_PC;
|
||||
wire [`NR_BITS-1:0] cis_alu_rd = cmt_to_issue_if.alu_data.rd;
|
||||
wire cis_alu_rd_is_fp = cmt_to_issue_if.alu_data.rd_is_fp;
|
||||
wire cis_alu_wb = cmt_to_issue_if.alu_data.wb;
|
||||
|
||||
wire [`NW_BITS-1:0] cis_fpu_warp_num = cmt_to_issue_if.fpu_data.warp_num;
|
||||
wire [`NUM_THREADS-1:0] cis_fpu_thread_mask = cmt_to_issue_if.fpu_data.thread_mask;
|
||||
wire [31:0] cis_fpu_curr_PC = cmt_to_issue_if.fpu_data.curr_PC;
|
||||
wire [`NR_BITS-1:0] cis_fpu_rd = cmt_to_issue_if.fpu_data.rd;
|
||||
wire cis_fpu_rd_is_fp = cmt_to_issue_if.fpu_data.rd_is_fp;
|
||||
wire cis_fpu_wb = cmt_to_issue_if.fpu_data.wb;
|
||||
)
|
||||
wire issue_fire = decode_if.valid && ~stall;
|
||||
|
||||
VX_cam_buffer #(
|
||||
.DATAW ($bits(is_data_t)),
|
||||
|
@ -124,9 +73,9 @@ module VX_scheduler #(
|
|||
) issue_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rd_is_fp, decode_if.wb}),
|
||||
.write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.wb}),
|
||||
.write_addr (issue_tag),
|
||||
.acquire_slot (ib_acquire),
|
||||
.acquire_slot (issue_fire),
|
||||
.release_slot ({cmt_to_issue_if.alu_valid, cmt_to_issue_if.lsu_valid, cmt_to_issue_if.csr_valid, cmt_to_issue_if.mul_valid, cmt_to_issue_if.fpu_valid, cmt_to_issue_if.gpu_valid}),
|
||||
.read_addr ({cmt_to_issue_if.alu_tag, cmt_to_issue_if.lsu_tag, cmt_to_issue_if.csr_tag, cmt_to_issue_if.mul_tag, cmt_to_issue_if.fpu_tag, cmt_to_issue_if.gpu_tag}),
|
||||
.read_data ({cmt_to_issue_if.alu_data, cmt_to_issue_if.lsu_data, cmt_to_issue_if.csr_data, cmt_to_issue_if.mul_data, cmt_to_issue_if.fpu_data, cmt_to_issue_if.gpu_data}),
|
||||
|
@ -135,14 +84,12 @@ module VX_scheduler #(
|
|||
|
||||
assign decode_if.ready = ~stall;
|
||||
|
||||
assign is_empty = (0 == count_valid);
|
||||
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (stall) begin
|
||||
$display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, ib_full=%b, inuse=%b%b%b%b, gpr=%b, alu=%b, lsu=%b, csr=%b, mul=%b, fpu=%b, gpu=%b",
|
||||
$time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, rd_inuse_qual, rs1_inuse_qual,
|
||||
rs2_inuse_qual, rs3_inuse_qual, gpr_busy, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy);
|
||||
$time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, inuse_mask[decode_if.rd], inuse_mask[decode_if.rs1],
|
||||
inuse_mask[decode_if.rs2], inuse_mask[decode_if.rs3], gpr_busy, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -205,24 +205,26 @@ module VX_warp_sched #(
|
|||
assign {join_fall, join_pc, join_tm} = ipdom[join_if.warp_num];
|
||||
|
||||
genvar i;
|
||||
for (i = 0; i < `NUM_WARPS; i++) begin : stacks
|
||||
for (i = 0; i < `NUM_WARPS; i++) begin
|
||||
wire correct_warp_s = (i == warp_ctl_if.warp_num);
|
||||
wire correct_warp_j = (i == join_if.warp_num);
|
||||
|
||||
wire push = (warp_ctl_if.is_split && warp_ctl_if.do_split) && correct_warp_s;
|
||||
wire pop = join_if.is_join && correct_warp_j;
|
||||
|
||||
VX_generic_stack #(
|
||||
VX_ipdom_stack #(
|
||||
.WIDTH(1+32+`NUM_THREADS),
|
||||
.DEPTH($clog2(`NUM_THREADS)+1)
|
||||
) ipdom_stack(
|
||||
.DEPTH(`NT_BITS+1)
|
||||
) ipdom_stack (
|
||||
.clk (clk),
|
||||
.reset(reset),
|
||||
.push (push),
|
||||
.pop (pop),
|
||||
.d (ipdom[i]),
|
||||
.q1 (q1),
|
||||
.q2 (q2)
|
||||
.q2 (q2),
|
||||
`UNUSED_PIN (empty),
|
||||
`UNUSED_PIN (full)
|
||||
);
|
||||
end
|
||||
|
||||
|
|
|
@ -24,7 +24,6 @@ module VX_writeback #(
|
|||
reg [`NUM_THREADS-1:0] wb_thread_mask [`ISSUEQ_SIZE-1:0];
|
||||
reg [31:0] wb_curr_PC [`ISSUEQ_SIZE-1:0];
|
||||
reg [`NR_BITS-1:0] wb_rd [`ISSUEQ_SIZE-1:0];
|
||||
reg wb_rd_is_fp [`ISSUEQ_SIZE-1:0];
|
||||
reg [`ISSUEQ_SIZE-1:0] wb_pending, wb_pending_n;
|
||||
|
||||
reg [`ISTAG_BITS-1:0] wb_index;
|
||||
|
@ -75,7 +74,6 @@ module VX_writeback #(
|
|||
wb_thread_mask [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.thread_mask;
|
||||
wb_curr_PC [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.curr_PC;
|
||||
wb_rd [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.rd;
|
||||
wb_rd_is_fp [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.rd_is_fp;
|
||||
end
|
||||
if (lsu_commit_if.valid) begin
|
||||
wb_data [lsu_commit_if.issue_tag] <= lsu_commit_if.data;
|
||||
|
@ -83,7 +81,6 @@ module VX_writeback #(
|
|||
wb_thread_mask [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.thread_mask;
|
||||
wb_curr_PC [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.curr_PC;
|
||||
wb_rd [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.rd;
|
||||
wb_rd_is_fp [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.rd_is_fp;
|
||||
end
|
||||
if (csr_commit_if.valid) begin
|
||||
wb_data [csr_commit_if.issue_tag] <= csr_commit_if.data;
|
||||
|
@ -91,7 +88,6 @@ module VX_writeback #(
|
|||
wb_thread_mask [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.thread_mask;
|
||||
wb_curr_PC [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.curr_PC;
|
||||
wb_rd [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.rd;
|
||||
wb_rd_is_fp [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.rd_is_fp;
|
||||
end
|
||||
if (mul_commit_if.valid) begin
|
||||
wb_data [mul_commit_if.issue_tag] <= mul_commit_if.data;
|
||||
|
@ -99,7 +95,6 @@ module VX_writeback #(
|
|||
wb_thread_mask [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.thread_mask;
|
||||
wb_curr_PC [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.curr_PC;
|
||||
wb_rd [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.rd;
|
||||
wb_rd_is_fp [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.rd_is_fp;
|
||||
end
|
||||
if (fpu_commit_if.valid) begin
|
||||
wb_data [fpu_commit_if.issue_tag] <= fpu_commit_if.data;
|
||||
|
@ -107,7 +102,6 @@ module VX_writeback #(
|
|||
wb_thread_mask [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.thread_mask;
|
||||
wb_curr_PC [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.curr_PC;
|
||||
wb_rd [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.rd;
|
||||
wb_rd_is_fp [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.rd_is_fp;
|
||||
end
|
||||
|
||||
wb_pending <= wb_pending_n;
|
||||
|
@ -122,7 +116,6 @@ module VX_writeback #(
|
|||
assign writeback_if.thread_mask = wb_thread_mask [wb_index];
|
||||
assign writeback_if.curr_PC = wb_curr_PC [wb_index];
|
||||
assign writeback_if.rd = wb_rd [wb_index];
|
||||
assign writeback_if.rd_is_fp = wb_rd_is_fp [wb_index];
|
||||
assign writeback_if.data = wb_data [wb_index];
|
||||
|
||||
// commit back-pressure
|
||||
|
|
|
@ -21,16 +21,12 @@ interface VX_decode_if ();
|
|||
|
||||
wire rs1_is_PC;
|
||||
wire rs2_is_imm;
|
||||
|
||||
wire use_rs1;
|
||||
wire use_rs2;
|
||||
|
||||
wire [`NUM_REGS-1:0] reg_use_mask;
|
||||
|
||||
// FP states
|
||||
wire [`NR_BITS-1:0] rs3;
|
||||
wire use_rs3;
|
||||
wire rd_is_fp;
|
||||
wire rs1_is_fp;
|
||||
wire rs2_is_fp;
|
||||
wire use_rs3;
|
||||
wire [`FRM_BITS-1:0] frm;
|
||||
|
||||
wire wb;
|
||||
|
|
|
@ -13,10 +13,7 @@ interface VX_gpr_read_if ();
|
|||
wire [`NR_BITS-1:0] rs2;
|
||||
wire [`NR_BITS-1:0] rs3;
|
||||
|
||||
wire use_rs3;
|
||||
|
||||
wire rs1_is_fp;
|
||||
wire rs2_is_fp;
|
||||
wire use_rs3;
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] rs1_data;
|
||||
wire [`NUM_THREADS-1:0][31:0] rs2_data;
|
||||
|
|
|
@ -14,8 +14,8 @@ interface VX_wb_if ();
|
|||
`IGNORE_WARNINGS_END
|
||||
|
||||
wire [`NR_BITS-1:0] rd;
|
||||
wire rd_is_fp;
|
||||
wire [`NUM_THREADS-1:0][31:0] data;
|
||||
|
||||
wire ready;
|
||||
|
||||
endinterface
|
||||
|
|
|
@ -12,7 +12,7 @@ module VX_cam_buffer #(
|
|||
output wire [ADDRW-1:0] write_addr,
|
||||
input wire acquire_slot,
|
||||
input wire [RPORTS-1:0][ADDRW-1:0] read_addr,
|
||||
output reg [RPORTS-1:0][DATAW-1:0] read_data,
|
||||
output reg [RPORTS-1:0][DATAW-1:0] read_data,
|
||||
input wire [RPORTS-1:0] release_slot,
|
||||
output wire full
|
||||
);
|
||||
|
|
|
@ -52,11 +52,7 @@ module VX_generic_queue #(
|
|||
|
||||
end else begin // (SIZE > 1)
|
||||
|
||||
`ifdef QUEUE_FORCE_MLAB
|
||||
(* syn_ramstyle = "mlab" *) reg [DATAW-1:0] data [SIZE-1:0];
|
||||
`else
|
||||
reg [DATAW-1:0] data [SIZE-1:0];
|
||||
`endif
|
||||
`USE_FAST_BRAM reg [DATAW-1:0] data [SIZE-1:0];
|
||||
|
||||
if (0 == BUFFERED) begin
|
||||
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
module VX_generic_stack #(
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire push,
|
||||
input wire pop,
|
||||
input reg [WIDTH - 1:0] q1,
|
||||
input reg [WIDTH - 1:0] q2,
|
||||
output wire[WIDTH - 1:0] d
|
||||
);
|
||||
|
||||
reg [DEPTH - 1:0] ptr;
|
||||
reg [WIDTH - 1:0] stack [0:(1 << DEPTH) - 1];
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
ptr <= 0;
|
||||
end else if (push) begin
|
||||
stack[ptr] <= q1;
|
||||
stack[ptr+1] <= q2;
|
||||
ptr <= ptr + 2;
|
||||
end else if (pop) begin
|
||||
ptr <= ptr - 1;
|
||||
end
|
||||
end
|
||||
|
||||
assign d = stack[ptr - 1];
|
||||
|
||||
endmodule
|
|
@ -15,7 +15,7 @@ module VX_index_queue #(
|
|||
input wire [`LOG2UP(SIZE)-1:0] read_addr,
|
||||
output wire [DATAW-1:0] read_data
|
||||
);
|
||||
reg [DATAW-1:0] data [SIZE-1:0];
|
||||
`USE_FAST_BRAM reg [DATAW-1:0] data [SIZE-1:0];
|
||||
reg [SIZE-1:0] valid;
|
||||
reg [`LOG2UP(SIZE):0] rd_ptr, wr_ptr;
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ module VX_tb_divide();
|
|||
|
||||
genvar i;
|
||||
generate
|
||||
for (i = 0; i < 8; i++) begin : div_loop
|
||||
for (i = 0; i < 8; i++) begin
|
||||
VX_divide#(
|
||||
.WIDTHN(32),
|
||||
.WIDTHD(32),
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue