pipeline optimization

This commit is contained in:
Blaise Tine 2020-07-30 03:06:01 -07:00
parent 60e05ae19a
commit 27e95530ef
20 changed files with 184 additions and 340 deletions

View file

@ -46,9 +46,9 @@ module VX_alu_unit #(
default: alu_result[i] = alu_in1[i] + alu_in2[i]; // ADD, LUI, AUIPC
endcase
end
end
end
wire [`NT_BITS-1:0] br_result_index;
wire [`NT_BITS-1:0] br_result_index, br_result_index_o;
VX_priority_encoder #(
.N(`NUM_THREADS)
@ -58,15 +58,35 @@ module VX_alu_unit #(
`UNUSED_PIN (valid_out)
);
wire [32:0] br_result = sub_result[br_result_index];
wire br_sign = br_result[32];
wire [`BR_BITS-1:0] br_op = `IS_BR_OP(alu_req_if.alu_op) ? `BR_OP(alu_req_if.alu_op) : 0;
wire [`BR_BITS-1:0] br_op_o;
wire [31:0] br_addr = (br_op == `BR_JALR) ? alu_req_if.rs1_data[br_result_index] : alu_req_if.curr_PC;
wire [31:0] br_dest = $signed(br_addr) + $signed(alu_req_if.offset);
wire is_jal = (alu_op == `ALU_JAL || alu_op == `ALU_JALR);
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result;
wire stall = ~alu_commit_if.ready && alu_commit_if.valid;
VX_generic_register #(
.N(1 + `NW_BITS + `ISTAG_BITS + (`NUM_THREADS * 32) + `BR_BITS + 32 + `NT_BITS)
) alu_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (0),
.in ({alu_req_if.valid, alu_req_if.warp_num, alu_req_if.issue_tag, alu_jal_result, br_op, br_dest, br_result_index}),
.out ({alu_commit_if.valid, branch_ctl_if.warp_num, alu_commit_if.issue_tag, alu_commit_if.data, br_op_o, branch_ctl_if.dest, br_result_index_o})
);
wire [31:0] br_result = alu_commit_if.data[br_result_index_o];
wire br_sign = br_result[31];
wire br_nzero = (| br_result[31:0]);
wire [`BR_BITS-1:0] br_op = `BR_OP(alu_req_if.alu_op);
reg br_taken;
always @(*) begin
case (br_op)
case (br_op_o)
`BR_NE: br_taken = br_nzero;
`BR_EQ: br_taken = ~br_nzero;
`BR_LT,
@ -75,39 +95,10 @@ module VX_alu_unit #(
`BR_GEU: br_taken = ~br_sign;
default: br_taken = 1'b1;
endcase
end
end
wire [31:0] br_addr = (br_op == `BR_JALR) ? alu_req_if.rs1_data[br_result_index] : alu_req_if.curr_PC;
wire [31:0] br_dest = $signed(br_addr) + $signed(alu_req_if.offset);
wire is_jal = (alu_op == `ALU_JAL || alu_op == `ALU_JALR);
wire is_br_valid = `IS_BR_OP(alu_op) && alu_req_if.valid;
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result;
wire stall = ~alu_commit_if.ready && alu_commit_if.valid;
VX_generic_register #(
.N(1 + `NW_BITS + 1 + 32)
) branch_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (0),
.in ({is_br_valid, alu_req_if.warp_num, br_taken, br_dest}),
.out ({branch_ctl_if.valid, branch_ctl_if.warp_num, branch_ctl_if.taken, branch_ctl_if.dest})
);
VX_generic_register #(
.N(1 + `ISTAG_BITS + (`NUM_THREADS * 32))
) alu_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (0),
.in ({alu_req_if.valid, alu_req_if.issue_tag, alu_jal_result}),
.out ({alu_commit_if.valid, alu_commit_if.issue_tag, alu_commit_if.data})
);
assign branch_ctl_if.valid = alu_req_if.valid && (br_op_o != 0);
assign branch_ctl_if.taken = br_taken;
assign alu_req_if.ready = ~stall;

View file

@ -28,7 +28,7 @@
`endif
`ifndef NUM_CSRS
`define NUM_CSRS 1024
`define NUM_CSRS 64
`endif
`ifndef STARTUP_ADDR
@ -57,7 +57,7 @@
`define EXT_M_ENABLE
//`define EXT_F_ENABLE
`define EXT_F_ENABLE
// Configuration Values =======================================================

View file

@ -35,10 +35,10 @@ module VX_decode #(
wire [6:0] func7 = instr[31:25];
wire [11:0] u_12 = instr[31:20];
wire [`NR_BITS-1:0] rd = instr[11:7];
wire [`NR_BITS-1:0] rs1 = instr[19:15];
wire [`NR_BITS-1:0] rs2 = instr[24:20];
wire [`NR_BITS-1:0] rs3 = instr[31:27];
wire [4:0] rd = instr[11:7];
wire [4:0] rs1 = instr[19:15];
wire [4:0] rs2 = instr[24:20];
wire [4:0] rs3 = instr[31:27];
// opcode types
wire is_rtype = (opcode == `INST_R);
@ -202,7 +202,7 @@ module VX_decode #(
wire is_fcvtf = is_fci && (func7 == 7'h68); // convert to float
wire is_fmvcls = is_fci && (func7 == 7'h70 || func7 == 7'h78); // move + class
wire is_fr4 = is_fmadd || is_fmsub || is_fnmsub || is_fnmadd;
wire is_fpu = (is_fl || is_fs || is_fci || is_fr4);
wire is_fpu = (is_fl || is_fs || is_fci || is_fr4);
always @(*) begin
fpu_op = `FPU_OTHER;
@ -242,7 +242,8 @@ module VX_decode #(
wire is_fcvtf = 0;
wire is_fmvcls = 0;
wire is_fr4 = 0;
wire is_fpu = 0;
wire is_fpu = 0;
always @(*) begin
fpu_op = `FPU_OTHER;
end
@ -271,6 +272,29 @@ module VX_decode #(
endcase
end
///////////////////////////////////////////////////////////////////////////
wire use_rd = (is_fl || is_fci || is_fr4)
|| ((rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype));
wire use_rs1 = is_fpu
|| is_gpu
|| ((is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || ~is_csr_imm || is_gpu) && (rs1 != 0));
wire use_rs2 = (is_fpu && ~(is_fl || (fpu_op == `FPU_SQRT) || is_fcvti || is_fcvtf || is_fmvcls))
|| (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN))
|| ((is_btype || is_stype || is_rtype) && (rs2 != 0));
wire use_rs3 = is_fr4;
wire rd_is_fp = is_fpu && ~(is_fcmp || is_fcvti || (fpu_op == `FPU_MVXW || fpu_op == `FPU_CLASS));
wire rs1_is_fp = is_fr4 || (is_fci && ~(is_fcvtf || (fpu_op == `FPU_MVWX)));
wire rs2_is_fp = is_fs || is_fr4 || is_fci;
wire [4:0] rs1_qual = is_lui ? 5'h0 : rs1;
///////////////////////////////////////////////////////////////////////////
VX_decode_if decode_tmp_if();
assign decode_tmp_if.valid = ifetch_rsp_if.valid;
@ -297,9 +321,26 @@ module VX_decode #(
(is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) :
0;
assign decode_tmp_if.rd = rd;
assign decode_tmp_if.rs1 = is_lui ? `NR_BITS'(0) : rs1;
assign decode_tmp_if.rs2 = rs2;
assign decode_tmp_if.wb = use_rd;
`ifdef EXT_F_ENABLE
assign decode_tmp_if.rd = {rd_is_fp, rd};
assign decode_tmp_if.rs1 = {rs1_is_fp, rs1_qual};
assign decode_tmp_if.rs2 = {rs2_is_fp, rs2};
assign decode_tmp_if.rs3 = {1'b1, rs3};
`else
assign decode_tmp_if.rd = rd;
assign decode_tmp_if.rs1 = rs1_qual;
assign decode_tmp_if.rs2 = rs2;
assign decode_tmp_if.rs3 = rs3;
`endif
assign decode_tmp_if.use_rs3 = use_rs3;
assign decode_tmp_if.reg_use_mask = ((`NUM_REGS)'(use_rd) << rd)
| ((`NUM_REGS)'(use_rs1) << rs1_qual)
| ((`NUM_REGS)'(use_rs2) << rs2)
| ((`NUM_REGS)'(use_rs3) << rs3);
assign decode_tmp_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} :
(is_jal || is_jalr || is_jals) ? jalx_offset :
@ -308,26 +349,8 @@ module VX_decode #(
assign decode_tmp_if.rs1_is_PC = is_auipc;
assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm;
assign decode_tmp_if.use_rs1 = is_fpu
|| is_gpu
|| ((is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || ~is_csr_imm || is_gpu)
&& (decode_tmp_if.rs1 != 0));
assign decode_tmp_if.use_rs2 = (is_fpu && ~(is_fl || (fpu_op == `FPU_SQRT) || is_fcvti || is_fcvtf || is_fmvcls))
|| (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN))
|| ((is_btype || is_stype || is_rtype)
&& (decode_tmp_if.rs2 != 0));
assign decode_tmp_if.rd_is_fp = is_fpu && ~(is_fcmp || is_fcvti || (fpu_op == `FPU_MVXW || fpu_op == `FPU_CLASS));
assign decode_tmp_if.rs1_is_fp = is_fr4 || (is_fci && ~(is_fcvtf || (fpu_op == `FPU_MVWX)));
assign decode_tmp_if.rs2_is_fp = is_fs || is_fr4 || is_fci;
assign decode_tmp_if.rs3 = rs3;
assign decode_tmp_if.use_rs3 = is_fr4;
assign decode_tmp_if.frm = func3;
assign decode_tmp_if.wb = (is_fl || is_fci || is_fr4)
|| ((rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype));
assign decode_tmp_if.frm = func3;
assign join_if.is_join = in_valid && is_gpu && (gpu_op == `GPU_JOIN);
assign join_if.warp_num = ifetch_rsp_if.warp_num;
@ -338,14 +361,14 @@ module VX_decode #(
wire stall = ~decode_if.ready && decode_if.valid;
VX_generic_register #(
.N(1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + 1 + `FRM_BITS)
.N(1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + `FRM_BITS + `NUM_REGS)
) decode_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (0),
.in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.rd_is_fp, decode_tmp_if.frm}),
.out ({decode_if.valid, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.rd_is_fp, decode_if.frm})
.in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, decode_tmp_if.reg_use_mask}),
.out ({decode_if.valid, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, decode_if.reg_use_mask})
);
assign ifetch_rsp_if.ready = ~stall;
@ -357,7 +380,7 @@ module VX_decode #(
print_ex_type(decode_tmp_if.ex_type);
$write(", op=");
print_ex_op(decode_tmp_if.ex_type, decode_tmp_if.ex_op);
$write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b, use_rs3=%b, rd_is_fp=%b, rs1_is_fp=%b, rs2_is_fp=%b, frm=", decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.rs3, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.use_rs3, decode_tmp_if.rd_is_fp,decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp);
$write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, frm=", decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.rs3, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm);
print_frm(decode_tmp_if.frm);
$write("\n");

View file

@ -7,7 +7,6 @@
///////////////////////////////////////////////////////////////////////////////
`define QUEUE_FORCE_MLAB 1
// `define SYNTHESIS 1
// `define ASIC 1
@ -23,7 +22,11 @@
`define REQS_BITS `LOG2UP(NUM_REQUESTS)
`ifdef EXT_F_ENABLE
`define NUM_REGS 64
`else
`define NUM_REGS 32
`endif
`define NR_BITS `LOG2UP(`NUM_REGS)
@ -33,7 +36,9 @@
`define ISTAG_BITS `LOG2UP(`ISSUEQ_SIZE)
`define LATENCY_IDIV 23
///////////////////////////////////////////////////////////////////////////////
`define LATENCY_IDIV 24
`define LATENCY_IMUL 2
`define LATENCY_FMULADD 2
@ -408,7 +413,6 @@ typedef struct packed {
logic [`NUM_THREADS-1:0] thread_mask;
logic [31:0] curr_PC;
logic [`NR_BITS-1:0] rd;
logic rd_is_fp;
logic wb;
} is_data_t;

View file

@ -3,98 +3,49 @@
// control module to support multi-cycle read for fp register
module VX_gpr_fp_ctrl (
input wire clk,
input wire reset,
input wire clk,
input wire reset,
input wire [`NUM_THREADS-1:0][31:0] rs1_int_data,
input wire [`NUM_THREADS-1:0][31:0] rs2_int_data,
input wire [`NUM_THREADS-1:0][31:0] rs1_fp_data,
input wire [`NUM_THREADS-1:0][31:0] rs2_fp_data,
input wire [`NUM_THREADS-1:0][31:0] rs1_data,
input wire [`NUM_THREADS-1:0][31:0] rs2_data,
// outputs
output wire [`NR_BITS-1:0] raddr1,
output wire [`NR_BITS-1:0] raddr2,
output wire [`NR_BITS-1:0] raddr1,
VX_gpr_read_if gpr_read_if
VX_gpr_read_if gpr_read_if
);
// param
localparam GPR_DELAY_WID = 1;
reg [GPR_DELAY_WID-1:0] multi_cyc_state;
reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data;
reg [`NUM_THREADS-1:0][31:0] tmp_rs2_data;
reg [`NUM_THREADS-1:0][31:0] rs1_data;
reg [`NUM_THREADS-1:0][31:0] rs2_data;
reg [`NUM_THREADS-1:0][31:0] rs3_data;
reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data;
reg read_rs3;
wire gpr_delay;
wire gpr_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && ~read_rs3;
wire gpr_fire = gpr_read_if.valid && gpr_read_if.ready;
always @(posedge clk) begin
if (reset) begin
multi_cyc_state <= 0;
read_rs3 <= 0;
end else if (gpr_delay) begin
multi_cyc_state <= 1;
read_rs3 <= 1;
end else if (gpr_fire) begin
multi_cyc_state <= 0;
read_rs3 <= 0;
end
end
// select rs1 data
always @(posedge clk) begin
if (reset) begin
tmp_rs1_data <= 0;
end else begin
if (gpr_delay) begin
if (gpr_read_if.rs1_is_fp) begin
tmp_rs1_data <= rs1_fp_data;
end else begin
tmp_rs1_data <= rs1_int_data;
end
end
end
end
// select rs2 data
always @(posedge clk) begin
if(reset) begin
tmp_rs2_data <= 0;
end else begin
if (gpr_delay) begin
if (gpr_read_if.rs2_is_fp) begin
tmp_rs2_data <= rs2_fp_data;
end else begin
tmp_rs2_data <= rs2_int_data;
end
end
// backup original rs1 data
always @(posedge clk) begin
if (gpr_delay) begin
tmp_rs1_data <= rs1_data;
end
end
// outputs
assign gpr_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && (0 == multi_cyc_state);
assign raddr1 = multi_cyc_state ? gpr_read_if.rs3 : gpr_read_if.rs1;
assign raddr2 = gpr_read_if.rs2;
always @(*) begin
if (gpr_read_if.use_rs3) begin
rs1_data = tmp_rs1_data;
rs2_data = tmp_rs2_data;
rs3_data = rs1_fp_data;
end else begin
rs1_data = gpr_read_if.rs1_is_fp ? rs1_fp_data : rs1_int_data;
rs2_data = gpr_read_if.rs2_is_fp ? rs2_fp_data : rs2_int_data;
rs3_data = {`NUM_THREADS{32'h8000_0000}}; // default value: -0 in single fp
end
end
assign raddr1 = read_rs3 ? gpr_read_if.rs3 : gpr_read_if.rs1;
assign gpr_read_if.ready = ~gpr_delay;
assign gpr_read_if.rs1_data = rs1_data;
assign gpr_read_if.rs1_data = gpr_read_if.use_rs3 ? tmp_rs1_data : rs1_data;
assign gpr_read_if.rs2_data = rs2_data;
assign gpr_read_if.rs3_data = rs3_data;
assign gpr_read_if.rs3_data = rs1_data;
endmodule

View file

@ -14,79 +14,52 @@ module VX_gpr_stage #(
);
`UNUSED_VAR (reset)
wire [`NUM_THREADS-1:0][31:0] rs1_int_data [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0][31:0] rs2_int_data [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0][31:0] rs1_data [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0][31:0] rs2_data [`NUM_WARPS-1:0];
wire [`NR_BITS-1:0] raddr1;
wire [`NR_BITS-1:0] raddr2;
genvar i;
for (i = 0; i < `NUM_WARPS; i++) begin
wire [`NUM_WARPS-1:0] we = writeback_if.thread_mask & {`NUM_THREADS{writeback_if.valid && ~writeback_if.rd_is_fp && (i == writeback_if.warp_num)}};
wire [`NUM_THREADS-1:0] we = writeback_if.thread_mask
& {`NUM_THREADS{writeback_if.valid && (i == writeback_if.warp_num)}};
VX_gpr_ram gpr_int_ram (
.clk (clk),
.we (we),
.waddr (writeback_if.rd),
.wdata (writeback_if.data),
.rs1 (raddr1),
.rs2 (raddr2),
.rs1_data (rs1_int_data[i]),
.rs2_data (rs2_int_data[i])
.rs2 (gpr_read_if.rs2),
.rs1_data (rs1_data[i]),
.rs2_data (rs2_data[i])
);
end
`ifdef EXT_F_ENABLE
wire [`NUM_THREADS-1:0][31:0] rs1_fp_data [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0][31:0] rs2_fp_data [`NUM_WARPS-1:0];
for (i = 0; i < `NUM_WARPS; i++) begin
wire [`NUM_WARPS-1:0] we = writeback_if.thread_mask & {`NUM_THREADS{writeback_if.valid && writeback_if.rd_is_fp && (i == writeback_if.warp_num)}};
VX_gpr_ram gpr_fp_ram (
.clk (clk),
.we (we),
.waddr (writeback_if.rd),
.wdata (writeback_if.data),
.rs1 (raddr1),
.rs2 (raddr2),
.rs1_data (rs1_fp_data[i]),
.rs2_data (rs2_fp_data[i])
);
end
`ifdef EXT_F_ENABLE
VX_gpr_fp_ctrl VX_gpr_fp_ctrl (
.clk (clk),
.reset (reset),
.clk (clk),
.reset (reset),
//inputs
.rs1_int_data (rs1_int_data[gpr_read_if.warp_num]),
.rs2_int_data (rs2_int_data[gpr_read_if.warp_num]),
.rs1_fp_data (rs1_fp_data[gpr_read_if.warp_num]),
.rs2_fp_data (rs2_fp_data[gpr_read_if.warp_num]),
.rs1_data (rs1_data[gpr_read_if.warp_num]),
.rs2_data (rs2_data[gpr_read_if.warp_num]),
// outputs
.raddr1 (raddr1),
.raddr2 (raddr2),
.gpr_read_if (gpr_read_if)
.raddr1 (raddr1),
.gpr_read_if(gpr_read_if)
);
`else
assign raddr1 = gpr_read_if.rs1;
assign raddr2 = gpr_read_if.rs2;
assign gpr_read_if.rs1_data = rs1_int_data[gpr_read_if.warp_num];
assign gpr_read_if.rs2_data = rs2_int_data[gpr_read_if.warp_num];
assign gpr_read_if.rs1_data = rs1_data[gpr_read_if.warp_num];
assign gpr_read_if.rs2_data = rs2_data[gpr_read_if.warp_num];
assign gpr_read_if.rs3_data = 0;
assign gpr_read_if.ready = 1;
wire valid = gpr_read_if.valid;
wire rs1_is_fp = gpr_read_if.rs1_is_fp;
wire rs2_is_fp = gpr_read_if.rs2_is_fp;
wire use_rs3 = gpr_read_if.use_rs3;
wire [`NR_BITS-1:0] rs3 = gpr_read_if.rs3;
`UNUSED_VAR (valid);
`UNUSED_VAR (rs1_is_fp);
`UNUSED_VAR (rs2_is_fp);
`UNUSED_VAR (use_rs3);
`UNUSED_VAR (rs3);
`endif

View file

@ -53,7 +53,7 @@ module VX_gpu_unit #(
wire[`NUM_THREADS-1:0] split_new_use_mask;
wire[`NUM_THREADS-1:0] split_new_later_mask;
for (i = 0; i < `NUM_THREADS; i++) begin : masks_init
for (i = 0; i < `NUM_THREADS; i++) begin
wire curr_bool = (gpu_req_if.rs1_data[i] == 32'b1);
assign split_new_use_mask[i] = gpu_req_if.thread_mask[i] & (curr_bool);
assign split_new_later_mask[i] = gpu_req_if.thread_mask[i] & (!curr_bool);

View file

@ -23,8 +23,6 @@ module VX_issue #(
assign gpr_read_if.rs1 = decode_if.rs1;
assign gpr_read_if.rs2 = decode_if.rs2;
assign gpr_read_if.rs3 = decode_if.rs3;
assign gpr_read_if.rs1_is_fp = decode_if.rs1_is_fp;
assign gpr_read_if.rs2_is_fp = decode_if.rs2_is_fp;
assign gpr_read_if.use_rs3 = decode_if.use_rs3;
wire [`ISTAG_BITS-1:0] issue_tag, issue_tmp_tag;
@ -52,8 +50,7 @@ module VX_issue #(
.mul_busy (mul_busy),
.fpu_busy (fpu_busy),
.gpu_busy (gpu_busy),
.issue_tag (issue_tag),
`UNUSED_PIN (is_empty)
.issue_tag (issue_tag)
);
VX_gpr_stage #(
@ -72,14 +69,14 @@ module VX_issue #(
wire flush = alu_req_if.ready && ~decode_if.ready;
VX_generic_register #(
.N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32))
) decode_reg (
.N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + `FRM_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32))
) issue_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (flush),
.in ({decode_if.valid, issue_tag, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.frm, gpr_read_if.rs1_data, gpr_read_if.rs2_data, gpr_read_if.rs3_data}),
.out ({decode_tmp_if.valid, issue_tmp_tag, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.frm, gpr_data_tmp_if.rs1_data, gpr_data_tmp_if.rs2_data, gpr_data_tmp_if.rs3_data})
.in ({decode_if.valid, issue_tag, decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, gpr_read_if.rs1_data, gpr_read_if.rs2_data, gpr_read_if.rs3_data}),
.out ({decode_tmp_if.valid, issue_tmp_tag, decode_tmp_if.warp_num, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, gpr_data_tmp_if.rs1_data, gpr_data_tmp_if.rs2_data, gpr_data_tmp_if.rs3_data})
);
VX_issue_demux issue_demux (

View file

@ -1,6 +1,8 @@
`ifndef VX_PLATFORM
`define VX_PLATFORM
///////////////////////////////////////////////////////////////////////////////
`ifndef NDEBUG
`define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \
x \
@ -45,6 +47,12 @@
`define ENABLE_TRACING /* verilator tracing_on */
`define DISABLE_TRACING /* verilator tracing_off */
///////////////////////////////////////////////////////////////////////////////
`define USE_FAST_BRAM (* syn_ramstyle = "mlab" *)
///////////////////////////////////////////////////////////////////////////////
`define CLOG2(x) $clog2(x)
`define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > (x)) ? 1 : 0))
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)

View file

@ -16,106 +16,55 @@ module VX_scheduler #(
input wire mul_busy,
input wire fpu_busy,
input wire gpu_busy,
output wire [`ISTAG_BITS-1:0] issue_tag,
output wire is_empty
output wire [`ISTAG_BITS-1:0] issue_tag
);
localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1);
`ifdef EXT_F_ENABLE
localparam NREGS = (`NUM_REGS * 2);
reg inuse_table [`NUM_WARPS-1:0][NREGS-1:0];
wire [`NR_BITS:0] read_rs1 = {decode_if.rs1_is_fp, decode_if.rs1};
wire [`NR_BITS:0] read_rs2 = {decode_if.rs2_is_fp, decode_if.rs2};
wire [`NR_BITS:0] read_rs3 = {1'b1, decode_if.rs3};
wire [`NR_BITS:0] read_rd = {decode_if.rd_is_fp, decode_if.rd};
wire [`NR_BITS:0] write_rd = {writeback_if.rd_is_fp, writeback_if.rd};
wire rs3_inuse = inuse_table[decode_if.warp_num][read_rs3];
`else
localparam NREGS = `NUM_REGS;
reg inuse_table [`NUM_WARPS-1:0][NREGS-1:0];
wire [`NR_BITS-1:0] read_rs1 = decode_if.rs1;
wire [`NR_BITS-1:0] read_rs2 = decode_if.rs2;
wire [`NR_BITS-1:0] read_rd = decode_if.rd;
wire [`NR_BITS-1:0] write_rd = writeback_if.rd;
wire rs3_inuse = 0;
`endif
reg [`NUM_THREADS-1:0] inuse_registers [`NUM_WARPS-1:0][NREGS-1:0];
reg [CTVW-1:0] count_valid;
localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1);
reg [`NUM_THREADS-1:0] inuse_registers [`NUM_WARPS-1:0][`NUM_REGS-1:0];
reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
wire rs1_inuse = inuse_table[decode_if.warp_num][read_rs1];
wire rs2_inuse = inuse_table[decode_if.warp_num][read_rs2];
wire rd_inuse = inuse_table[decode_if.warp_num][read_rd];
wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[decode_if.warp_num] & decode_if.reg_use_mask;
wire inuse_hazard = (inuse_mask != 0);
wire rs1_inuse_qual = rs1_inuse && decode_if.use_rs1;
wire rs2_inuse_qual = rs2_inuse && decode_if.use_rs2;
wire rs3_inuse_qual = rs3_inuse && decode_if.use_rs3;
wire rd_inuse_qual = rd_inuse && decode_if.wb;
wire inuse_valid = (rd_inuse_qual || rs1_inuse_qual || rs2_inuse_qual || rs3_inuse_qual);
wire ex_stalled = ((gpr_busy)
|| (alu_busy && (decode_if.ex_type == `EX_ALU))
wire exu_stalled = (alu_busy && (decode_if.ex_type == `EX_ALU))
|| (lsu_busy && (decode_if.ex_type == `EX_LSU))
|| (csr_busy && (decode_if.ex_type == `EX_CSR))
|| (mul_busy && (decode_if.ex_type == `EX_MUL))
|| (fpu_busy && (decode_if.ex_type == `EX_FPU))
|| (gpu_busy && (decode_if.ex_type == `EX_GPU)));
|| (gpu_busy && (decode_if.ex_type == `EX_GPU));
wire issue_buf_full;
wire stall = (ex_stalled || inuse_valid || issue_buf_full) && decode_if.valid;
wire stall = (gpr_busy || exu_stalled || inuse_hazard || issue_buf_full) && decode_if.valid;
wire acquire_rd = decode_if.valid && (decode_if.wb != 0) && ~stall;
wire release_rd = writeback_if.valid;
wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[writeback_if.warp_num][write_rd] & ~writeback_if.thread_mask;
wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[writeback_if.warp_num][writeback_if.rd] & ~writeback_if.thread_mask;
reg [CTVW-1:0] count_valid_next = (acquire_rd && !(release_rd && (0 == inuse_registers_n))) ? (count_valid + 1) :
(~acquire_rd && (release_rd && (0 == inuse_registers_n))) ? (count_valid - 1) :
count_valid;
always @(posedge clk) begin
always @(posedge clk) begin
if (reset) begin
integer i, w;
for (w = 0; w < `NUM_WARPS; w++) begin
for (i = 0; i < NREGS; i++) begin
inuse_registers[w][i] <= 0;
inuse_table[w][i] <= 0;
for (i = 0; i < `NUM_REGS; i++) begin
inuse_registers[w][i] <= 0;
end
inuse_reg_mask[w] <= 0;
end
count_valid <= 0;
end else begin
if (acquire_rd) begin
inuse_registers[decode_if.warp_num][read_rd] <= decode_if.thread_mask;
inuse_table[decode_if.warp_num][read_rd] <= 1;
inuse_registers[decode_if.warp_num][decode_if.rd] <= decode_if.thread_mask;
inuse_reg_mask[decode_if.warp_num][decode_if.rd] <= 1;
end
if (release_rd) begin
assert(inuse_table[writeback_if.warp_num][write_rd] != 0);
inuse_registers[writeback_if.warp_num][write_rd] <= inuse_registers_n;
inuse_table[writeback_if.warp_num][write_rd] <= (| inuse_registers_n);
assert(inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] != 0);
inuse_registers[writeback_if.warp_num][writeback_if.rd] <= inuse_registers_n;
inuse_reg_mask[writeback_if.warp_num][writeback_if.rd] <= (| inuse_registers_n);
end
count_valid <= count_valid_next;
end
end
wire ib_acquire = decode_if.valid && ~stall;
`DEBUG_BLOCK(
wire [`NW_BITS-1:0] cis_alu_warp_num = cmt_to_issue_if.alu_data.warp_num;
wire [`NUM_THREADS-1:0] cis_alu_thread_mask = cmt_to_issue_if.alu_data.thread_mask;
wire [31:0] cis_alu_curr_PC = cmt_to_issue_if.alu_data.curr_PC;
wire [`NR_BITS-1:0] cis_alu_rd = cmt_to_issue_if.alu_data.rd;
wire cis_alu_rd_is_fp = cmt_to_issue_if.alu_data.rd_is_fp;
wire cis_alu_wb = cmt_to_issue_if.alu_data.wb;
wire [`NW_BITS-1:0] cis_fpu_warp_num = cmt_to_issue_if.fpu_data.warp_num;
wire [`NUM_THREADS-1:0] cis_fpu_thread_mask = cmt_to_issue_if.fpu_data.thread_mask;
wire [31:0] cis_fpu_curr_PC = cmt_to_issue_if.fpu_data.curr_PC;
wire [`NR_BITS-1:0] cis_fpu_rd = cmt_to_issue_if.fpu_data.rd;
wire cis_fpu_rd_is_fp = cmt_to_issue_if.fpu_data.rd_is_fp;
wire cis_fpu_wb = cmt_to_issue_if.fpu_data.wb;
)
wire issue_fire = decode_if.valid && ~stall;
VX_cam_buffer #(
.DATAW ($bits(is_data_t)),
@ -124,9 +73,9 @@ module VX_scheduler #(
) issue_buffer (
.clk (clk),
.reset (reset),
.write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rd_is_fp, decode_if.wb}),
.write_data ({decode_if.warp_num, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.wb}),
.write_addr (issue_tag),
.acquire_slot (ib_acquire),
.acquire_slot (issue_fire),
.release_slot ({cmt_to_issue_if.alu_valid, cmt_to_issue_if.lsu_valid, cmt_to_issue_if.csr_valid, cmt_to_issue_if.mul_valid, cmt_to_issue_if.fpu_valid, cmt_to_issue_if.gpu_valid}),
.read_addr ({cmt_to_issue_if.alu_tag, cmt_to_issue_if.lsu_tag, cmt_to_issue_if.csr_tag, cmt_to_issue_if.mul_tag, cmt_to_issue_if.fpu_tag, cmt_to_issue_if.gpu_tag}),
.read_data ({cmt_to_issue_if.alu_data, cmt_to_issue_if.lsu_data, cmt_to_issue_if.csr_data, cmt_to_issue_if.mul_data, cmt_to_issue_if.fpu_data, cmt_to_issue_if.gpu_data}),
@ -135,14 +84,12 @@ module VX_scheduler #(
assign decode_if.ready = ~stall;
assign is_empty = (0 == count_valid);
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (stall) begin
$display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, ib_full=%b, inuse=%b%b%b%b, gpr=%b, alu=%b, lsu=%b, csr=%b, mul=%b, fpu=%b, gpu=%b",
$time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, rd_inuse_qual, rs1_inuse_qual,
rs2_inuse_qual, rs3_inuse_qual, gpr_busy, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy);
$time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full, inuse_mask[decode_if.rd], inuse_mask[decode_if.rs1],
inuse_mask[decode_if.rs2], inuse_mask[decode_if.rs3], gpr_busy, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy);
end
end
`endif

View file

@ -205,24 +205,26 @@ module VX_warp_sched #(
assign {join_fall, join_pc, join_tm} = ipdom[join_if.warp_num];
genvar i;
for (i = 0; i < `NUM_WARPS; i++) begin : stacks
for (i = 0; i < `NUM_WARPS; i++) begin
wire correct_warp_s = (i == warp_ctl_if.warp_num);
wire correct_warp_j = (i == join_if.warp_num);
wire push = (warp_ctl_if.is_split && warp_ctl_if.do_split) && correct_warp_s;
wire pop = join_if.is_join && correct_warp_j;
VX_generic_stack #(
VX_ipdom_stack #(
.WIDTH(1+32+`NUM_THREADS),
.DEPTH($clog2(`NUM_THREADS)+1)
) ipdom_stack(
.DEPTH(`NT_BITS+1)
) ipdom_stack (
.clk (clk),
.reset(reset),
.push (push),
.pop (pop),
.d (ipdom[i]),
.q1 (q1),
.q2 (q2)
.q2 (q2),
`UNUSED_PIN (empty),
`UNUSED_PIN (full)
);
end

View file

@ -24,7 +24,6 @@ module VX_writeback #(
reg [`NUM_THREADS-1:0] wb_thread_mask [`ISSUEQ_SIZE-1:0];
reg [31:0] wb_curr_PC [`ISSUEQ_SIZE-1:0];
reg [`NR_BITS-1:0] wb_rd [`ISSUEQ_SIZE-1:0];
reg wb_rd_is_fp [`ISSUEQ_SIZE-1:0];
reg [`ISSUEQ_SIZE-1:0] wb_pending, wb_pending_n;
reg [`ISTAG_BITS-1:0] wb_index;
@ -75,7 +74,6 @@ module VX_writeback #(
wb_thread_mask [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.thread_mask;
wb_curr_PC [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.curr_PC;
wb_rd [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.rd;
wb_rd_is_fp [alu_commit_if.issue_tag] <= cmt_to_issue_if.alu_data.rd_is_fp;
end
if (lsu_commit_if.valid) begin
wb_data [lsu_commit_if.issue_tag] <= lsu_commit_if.data;
@ -83,7 +81,6 @@ module VX_writeback #(
wb_thread_mask [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.thread_mask;
wb_curr_PC [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.curr_PC;
wb_rd [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.rd;
wb_rd_is_fp [lsu_commit_if.issue_tag] <= cmt_to_issue_if.lsu_data.rd_is_fp;
end
if (csr_commit_if.valid) begin
wb_data [csr_commit_if.issue_tag] <= csr_commit_if.data;
@ -91,7 +88,6 @@ module VX_writeback #(
wb_thread_mask [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.thread_mask;
wb_curr_PC [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.curr_PC;
wb_rd [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.rd;
wb_rd_is_fp [csr_commit_if.issue_tag] <= cmt_to_issue_if.csr_data.rd_is_fp;
end
if (mul_commit_if.valid) begin
wb_data [mul_commit_if.issue_tag] <= mul_commit_if.data;
@ -99,7 +95,6 @@ module VX_writeback #(
wb_thread_mask [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.thread_mask;
wb_curr_PC [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.curr_PC;
wb_rd [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.rd;
wb_rd_is_fp [mul_commit_if.issue_tag] <= cmt_to_issue_if.mul_data.rd_is_fp;
end
if (fpu_commit_if.valid) begin
wb_data [fpu_commit_if.issue_tag] <= fpu_commit_if.data;
@ -107,7 +102,6 @@ module VX_writeback #(
wb_thread_mask [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.thread_mask;
wb_curr_PC [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.curr_PC;
wb_rd [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.rd;
wb_rd_is_fp [fpu_commit_if.issue_tag] <= cmt_to_issue_if.fpu_data.rd_is_fp;
end
wb_pending <= wb_pending_n;
@ -122,7 +116,6 @@ module VX_writeback #(
assign writeback_if.thread_mask = wb_thread_mask [wb_index];
assign writeback_if.curr_PC = wb_curr_PC [wb_index];
assign writeback_if.rd = wb_rd [wb_index];
assign writeback_if.rd_is_fp = wb_rd_is_fp [wb_index];
assign writeback_if.data = wb_data [wb_index];
// commit back-pressure

View file

@ -21,16 +21,12 @@ interface VX_decode_if ();
wire rs1_is_PC;
wire rs2_is_imm;
wire use_rs1;
wire use_rs2;
wire [`NUM_REGS-1:0] reg_use_mask;
// FP states
wire [`NR_BITS-1:0] rs3;
wire use_rs3;
wire rd_is_fp;
wire rs1_is_fp;
wire rs2_is_fp;
wire use_rs3;
wire [`FRM_BITS-1:0] frm;
wire wb;

View file

@ -13,10 +13,7 @@ interface VX_gpr_read_if ();
wire [`NR_BITS-1:0] rs2;
wire [`NR_BITS-1:0] rs3;
wire use_rs3;
wire rs1_is_fp;
wire rs2_is_fp;
wire use_rs3;
wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [`NUM_THREADS-1:0][31:0] rs2_data;

View file

@ -14,8 +14,8 @@ interface VX_wb_if ();
`IGNORE_WARNINGS_END
wire [`NR_BITS-1:0] rd;
wire rd_is_fp;
wire [`NUM_THREADS-1:0][31:0] data;
wire ready;
endinterface

View file

@ -12,7 +12,7 @@ module VX_cam_buffer #(
output wire [ADDRW-1:0] write_addr,
input wire acquire_slot,
input wire [RPORTS-1:0][ADDRW-1:0] read_addr,
output reg [RPORTS-1:0][DATAW-1:0] read_data,
output reg [RPORTS-1:0][DATAW-1:0] read_data,
input wire [RPORTS-1:0] release_slot,
output wire full
);

View file

@ -52,11 +52,7 @@ module VX_generic_queue #(
end else begin // (SIZE > 1)
`ifdef QUEUE_FORCE_MLAB
(* syn_ramstyle = "mlab" *) reg [DATAW-1:0] data [SIZE-1:0];
`else
reg [DATAW-1:0] data [SIZE-1:0];
`endif
`USE_FAST_BRAM reg [DATAW-1:0] data [SIZE-1:0];
if (0 == BUFFERED) begin

View file

@ -1,34 +0,0 @@
`include "VX_platform.vh"
// Stack that pushes two entries at once and pops one entry at a time.
//
// Parameters:
//   WIDTH - bit width of each stack entry.
//   DEPTH - bit width of the stack pointer; storage holds (1 << DEPTH) entries.
//
// Ports:
//   clk, reset - clock and synchronous reset (reset clears only the pointer,
//                the storage array keeps its contents).
//   push       - stores q1 at ptr and q2 at ptr+1, then advances ptr by 2.
//   pop        - moves ptr back by 1; ignored in a cycle where push is set,
//                because push has priority in the if/else chain below.
//   q1, q2     - entries written on push; q2 lands on top, so it is the
//                first one exposed on d and the first one popped.
//   d          - entry located just below the pointer (current top of stack).
//
// NOTE(review): there is no overflow/underflow protection. While ptr is 0 the
// read index (ptr - 1) does not address a pushed entry, so d is only
// meaningful after at least one push - confirm callers respect this.
module VX_generic_stack #(
    parameter WIDTH = 1,
    parameter DEPTH = 1
) (
    input  wire               clk,
    input  wire               reset,
    input  wire               push,
    input  wire               pop,
    // Data inputs are plain nets: an input port may not be declared 'reg'
    // under Verilog-2001 rules, and every other port here is a wire.
    input  wire [WIDTH - 1:0] q1,
    input  wire [WIDTH - 1:0] q2,
    output wire [WIDTH - 1:0] d
);
    // Points at the next free slot; (ptr - 1) is the current top of stack.
    reg [DEPTH - 1:0] ptr;
    reg [WIDTH - 1:0] stack [0:(1 << DEPTH) - 1];

    always @(posedge clk) begin
        if (reset) begin
            ptr <= 0;
        end else if (push) begin
            // Both entries are committed in the same cycle.
            stack[ptr]   <= q1;
            stack[ptr+1] <= q2;
            ptr          <= ptr + 2;
        end else if (pop) begin
            // Only the pointer moves; the popped entry is not cleared.
            ptr <= ptr - 1;
        end
    end

    // Combinational read of the top-of-stack entry.
    assign d = stack[ptr - 1];
endmodule

View file

@ -15,7 +15,7 @@ module VX_index_queue #(
input wire [`LOG2UP(SIZE)-1:0] read_addr,
output wire [DATAW-1:0] read_data
);
reg [DATAW-1:0] data [SIZE-1:0];
`USE_FAST_BRAM reg [DATAW-1:0] data [SIZE-1:0];
reg [SIZE-1:0] valid;
reg [`LOG2UP(SIZE):0] rd_ptr, wr_ptr;

View file

@ -19,7 +19,7 @@ module VX_tb_divide();
genvar i;
generate
for (i = 0; i < 8; i++) begin : div_loop
for (i = 0; i < 8; i++) begin
VX_divide#(
.WIDTHN(32),
.WIDTHD(32),