mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 13:27:29 -04:00
Use a serial divider to save area cost
This commit is contained in:
parent
df25bae456
commit
ee81e81818
10 changed files with 239 additions and 135 deletions
|
@ -102,4 +102,8 @@ make -C top clean && make -C top > top/build.log 2>&1 &
|
|||
|
||||
# How to calculate the maximum operating frequency?
|
||||
200 MHz -> period = 1/(200x10^6) s = 5 ns
|
||||
if slack = +1.664 ns -> minimal period = 5 - 1.664 = 3.336 ns -> fmax = 1/3.336 ns ~= 300 MHz
|
||||
if slack = +1.664 ns -> minimal period = 5 - 1.664 = 3.336 ns -> fmax = 1/3.336 ns ~= 300 MHz
|
||||
|
||||
|
||||
# build rtlsim from driver tests
|
||||
make -C ../../rtlsim clean && reset && make -C ../../rtlsim
|
|
@ -7,16 +7,16 @@ module VX_csr_data #(
|
|||
input wire reset,
|
||||
|
||||
VX_cmt_to_csr_if cmt_to_csr_if,
|
||||
VX_csr_to_issue_if csr_to_issue_if,
|
||||
|
||||
input wire[`NW_BITS-1:0] wid,
|
||||
VX_csr_to_issue_if csr_to_issue_if,
|
||||
|
||||
input wire read_enable,
|
||||
input wire[`CSR_ADDR_BITS-1:0] read_addr,
|
||||
input wire[`NW_BITS-1:0] read_wid,
|
||||
output wire[31:0] read_data,
|
||||
|
||||
input wire write_enable,
|
||||
input wire[`CSR_ADDR_BITS-1:0] write_addr,
|
||||
input wire[`NW_BITS-1:0] write_wid,
|
||||
input wire[`CSR_WIDTH-1:0] write_data
|
||||
);
|
||||
reg [`CSR_WIDTH-1:0] csr_satp;
|
||||
|
@ -33,7 +33,7 @@ module VX_csr_data #(
|
|||
|
||||
reg [`FFG_BITS-1:0] csr_fflags [`NUM_WARPS-1:0];
|
||||
reg [`FRM_BITS-1:0] csr_frm [`NUM_WARPS-1:0];
|
||||
reg [`FRM_BITS+`FFG_BITS-1:0] csr_fcsr [`NUM_WARPS-1:0]; // fflags + frm
|
||||
reg [`FRM_BITS+`FFG_BITS-1:0] csr_fcsr [`NUM_WARPS-1:0]; // fflags + frm
|
||||
|
||||
reg [31:0] read_data_r;
|
||||
|
||||
|
@ -46,29 +46,32 @@ module VX_csr_data #(
|
|||
if (write_enable) begin
|
||||
case (write_addr)
|
||||
`CSR_FFLAGS: begin
|
||||
csr_fcsr[wid][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0];
|
||||
csr_fflags[wid] <= write_data[`FFG_BITS-1:0];
|
||||
csr_fcsr[write_wid][`FFG_BITS-1:0] <= write_data[`FFG_BITS-1:0];
|
||||
csr_fflags[write_wid] <= write_data[`FFG_BITS-1:0];
|
||||
end
|
||||
|
||||
`CSR_FRM: begin
|
||||
csr_fcsr[wid][`FFG_BITS+`FRM_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0];
|
||||
csr_frm[wid] <= write_data[`FRM_BITS-1:0];
|
||||
csr_fcsr[write_wid][`FFG_BITS+`FRM_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0];
|
||||
csr_frm[write_wid] <= write_data[`FRM_BITS-1:0];
|
||||
end
|
||||
|
||||
`CSR_FCSR: begin
|
||||
csr_fcsr[wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0];
|
||||
csr_frm[wid] <= write_data[`FFG_BITS+`FRM_BITS-1:`FFG_BITS];
|
||||
csr_fflags[wid] <= write_data[`FFG_BITS-1:0];
|
||||
csr_fcsr[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0];
|
||||
csr_frm[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:`FFG_BITS];
|
||||
csr_fflags[write_wid] <= write_data[`FFG_BITS-1:0];
|
||||
end
|
||||
`CSR_SATP: csr_satp <= write_data;
|
||||
|
||||
`CSR_SATP: csr_satp <= write_data;
|
||||
|
||||
`CSR_MSTATUS: csr_mstatus <= write_data;
|
||||
`CSR_MEDELEG: csr_medeleg <= write_data;
|
||||
`CSR_MIDELEG: csr_mideleg <= write_data;
|
||||
`CSR_MIE: csr_mie <= write_data;
|
||||
`CSR_MTVEC: csr_mtvec <= write_data;
|
||||
`CSR_MIE: csr_mie <= write_data;
|
||||
`CSR_MTVEC: csr_mtvec <= write_data;
|
||||
|
||||
`CSR_MEPC: csr_mepc <= write_data;
|
||||
`CSR_MEPC: csr_mepc <= write_data;
|
||||
|
||||
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data;
|
||||
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data;
|
||||
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data;
|
||||
|
||||
default: begin
|
||||
|
@ -93,15 +96,15 @@ module VX_csr_data #(
|
|||
always @(*) begin
|
||||
read_data_r = 'x;
|
||||
case (read_addr)
|
||||
`CSR_FFLAGS : read_data_r = 32'(csr_fflags[wid]);
|
||||
`CSR_FRM : read_data_r = 32'(csr_frm[wid]);
|
||||
`CSR_FCSR : read_data_r = 32'(csr_fcsr[wid]);
|
||||
`CSR_FFLAGS : read_data_r = 32'(csr_fflags[read_wid]);
|
||||
`CSR_FRM : read_data_r = 32'(csr_frm[read_wid]);
|
||||
`CSR_FCSR : read_data_r = 32'(csr_fcsr[read_wid]);
|
||||
|
||||
`CSR_LWID : read_data_r = 32'(wid);
|
||||
`CSR_LWID : read_data_r = 32'(read_wid);
|
||||
`CSR_LTID ,
|
||||
`CSR_GTID ,
|
||||
`CSR_MHARTID ,
|
||||
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(wid);
|
||||
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
|
||||
`CSR_GCID : read_data_r = CORE_ID;
|
||||
`CSR_NT : read_data_r = `NUM_THREADS;
|
||||
`CSR_NW : read_data_r = `NUM_WARPS;
|
||||
|
|
|
@ -45,14 +45,15 @@ module VX_csr_unit #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.csr_to_issue_if (csr_to_issue_if),
|
||||
.csr_to_issue_if(csr_to_issue_if),
|
||||
.read_enable (csr_pipe_req_if.valid),
|
||||
.read_addr (csr_pipe_req_if.csr_addr),
|
||||
.read_wid (csr_pipe_req_if.wid),
|
||||
.read_data (csr_read_data),
|
||||
.write_enable (csr_we_s1),
|
||||
.write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]),
|
||||
.write_enable (csr_we_s1),
|
||||
.write_addr (csr_addr_s1),
|
||||
.wid (csr_pipe_req_if.wid)
|
||||
.write_wid (csr_pipe_rsp_if.wid),
|
||||
.write_data (csr_updated_data_s1[`CSR_WIDTH-1:0])
|
||||
);
|
||||
|
||||
wire csr_hazard = (csr_addr_s1 == csr_pipe_req_if.csr_addr)
|
||||
|
|
|
@ -393,7 +393,7 @@ module VX_decode #(
|
|||
print_ex_type(decode_if.ex_type);
|
||||
$write(", op=");
|
||||
print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
|
||||
$write("mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.thread_mask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm);
|
||||
$write(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.thread_mask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -35,7 +35,6 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define LATENCY_IDIV 33
|
||||
`define LATENCY_IMUL 3
|
||||
|
||||
`define LATENCY_FDIV 16
|
||||
|
|
|
@ -70,7 +70,7 @@ module VX_ibuffer #(
|
|||
end
|
||||
end
|
||||
|
||||
assign q_data_prev[i] = (wr_ptr_r != rd_ptr_r) ? entries[i][rd_ptr_a] : q_data_in;
|
||||
assign q_data_prev[i] = entries[i][rd_ptr_a];
|
||||
assign q_full[i] = (size_r[i] == SIZE);
|
||||
assign q_size[i] = size_r[i];
|
||||
end
|
||||
|
@ -83,31 +83,38 @@ module VX_ibuffer #(
|
|||
reg [`NW_BITS-1:0] deq_wid, deq_wid_n;
|
||||
reg deq_valid, deq_valid_n;
|
||||
reg [DATAW-1:0] deq_instr, deq_instr_n;
|
||||
reg deq_is_size1, deq_is_size1_n;
|
||||
|
||||
always @(*) begin
|
||||
valid_table_n = valid_table;
|
||||
if (deq_fire) begin
|
||||
valid_table_n[ibuf_deq_if.wid] = (q_size[ibuf_deq_if.wid] != 1);
|
||||
if (deq_fire && deq_is_size1) begin
|
||||
valid_table_n[ibuf_deq_if.wid] = 0;
|
||||
end
|
||||
if (enq_fire) begin
|
||||
valid_table_n[ibuf_enq_if.wid] = 1;
|
||||
end
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
deq_wid_n = 0;
|
||||
deq_valid_n = 0;
|
||||
deq_instr_n = 'x;
|
||||
schedule_table_n = schedule_table;
|
||||
if (deq_fire) begin
|
||||
schedule_table_n[ibuf_deq_if.wid] = (q_size[ibuf_deq_if.wid] != 1);
|
||||
always @(*) begin
|
||||
deq_valid_n = 0;
|
||||
deq_wid_n = 'x;
|
||||
deq_instr_n = 'x;
|
||||
deq_is_size1_n = 'x;
|
||||
|
||||
schedule_table_n = schedule_table;
|
||||
if (deq_fire && deq_is_size1) begin
|
||||
schedule_table_n[ibuf_deq_if.wid] = 0;
|
||||
end
|
||||
|
||||
for (integer i = 0; i < `NUM_WARPS; i++) begin
|
||||
if (schedule_table_n[i]) begin
|
||||
deq_wid_n = `NW_BITS'(i);
|
||||
deq_valid_n = 1;
|
||||
deq_instr_n = (deq_fire && (ibuf_deq_if.wid == `NW_BITS'(i))) ? q_data_prev[i] : q_data_out[i];
|
||||
schedule_table_n[i] = 0;
|
||||
if (schedule_table_n[i]) begin
|
||||
deq_valid_n = 1;
|
||||
deq_wid_n = `NW_BITS'(i);
|
||||
deq_instr_n = (deq_fire && (ibuf_deq_if.wid == `NW_BITS'(i))) ? q_data_prev[i] : q_data_out[i];
|
||||
deq_is_size1_n = (~(enq_fire && ibuf_enq_if.wid == `NW_BITS'(i))
|
||||
&& (((deq_fire && ibuf_deq_if.wid == `NW_BITS'(i)) && (SIZEW'(2) == q_size[i]))
|
||||
|| (SIZEW'(1) == q_size[i])));
|
||||
schedule_table_n[i] = 0;
|
||||
break;
|
||||
end
|
||||
end
|
||||
|
@ -123,17 +130,19 @@ module VX_ibuffer #(
|
|||
deq_valid <= 0;
|
||||
num_warps <= 0;
|
||||
end else begin
|
||||
valid_table <= valid_table_n;
|
||||
schedule_table <= (| schedule_table_n) ? schedule_table_n : valid_table_n;
|
||||
valid_table <= valid_table_n;
|
||||
schedule_table <= (| schedule_table_n) ? schedule_table_n : valid_table_n;
|
||||
|
||||
if (enq_fire && (0 == num_warps)) begin
|
||||
deq_valid <= 1;
|
||||
deq_wid <= ibuf_enq_if.wid;
|
||||
deq_instr <= q_data_in;
|
||||
deq_valid <= 1;
|
||||
deq_wid <= ibuf_enq_if.wid;
|
||||
deq_instr <= q_data_in;
|
||||
deq_is_size1 <= 1;
|
||||
end else if (!freeze) begin
|
||||
deq_valid <= deq_valid_n;
|
||||
deq_wid <= deq_wid_n;
|
||||
deq_instr <= deq_instr_n;
|
||||
deq_valid <= deq_valid_n;
|
||||
deq_wid <= deq_wid_n;
|
||||
deq_instr <= deq_instr_n;
|
||||
deq_is_size1 <= deq_is_size1_n;
|
||||
end
|
||||
|
||||
if (warp_added && !warp_removed) begin
|
||||
|
|
|
@ -15,6 +15,7 @@ module VX_mul_unit #(
|
|||
localparam MULQ_BITS = `LOG2UP(`MULQ_SIZE);
|
||||
|
||||
wire [`MUL_BITS-1:0] alu_op = mul_req_if.op_type;
|
||||
wire is_div_op = `IS_DIV_OP(alu_op);
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_in1 = mul_req_if.rs1_data;
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_in2 = mul_req_if.rs2_data;
|
||||
|
||||
|
@ -81,7 +82,7 @@ module VX_mul_unit #(
|
|||
wire [MULQ_BITS-1:0] mul_tag;
|
||||
wire mul_valid_out;
|
||||
|
||||
wire mul_fire = mul_req_if.valid && mul_req_if.ready && ~`IS_DIV_OP(alu_op);
|
||||
wire mul_fire = mul_req_if.valid && mul_req_if.ready && !is_div_op;
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW(1 + MULQ_BITS + 1),
|
||||
|
@ -96,88 +97,50 @@ module VX_mul_unit #(
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result;
|
||||
wire is_div = (alu_op == `MUL_DIV || alu_op == `MUL_DIVU);
|
||||
wire is_signed_div = (alu_op == `MUL_DIV || alu_op == `MUL_REM);
|
||||
reg [`NUM_THREADS-1:0] is_div_qual;
|
||||
wire is_div_out;
|
||||
wire stall_div;
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
|
||||
reg [31:0] div_in1_qual, div_in2_qual;
|
||||
reg [32:0] div_in1, div_in2;
|
||||
wire [31:0] div_result_tmp, rem_result_tmp;
|
||||
|
||||
// handle divide by zero
|
||||
always @(*) begin
|
||||
is_div_qual[i] = is_div;
|
||||
div_in1_qual = alu_in1[i];
|
||||
div_in2_qual = alu_in2[i];
|
||||
if (0 == alu_in2[i]) begin
|
||||
div_in2_qual = 1;
|
||||
if (is_div) begin
|
||||
div_in1_qual = 32'hFFFFFFFF; // quotient = (0xFFFFFFFF / 1)
|
||||
end else begin
|
||||
is_div_qual[i] = 1; // remainder = (in1 / 1)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// latch divider inputs
|
||||
always @(posedge clk) begin
|
||||
if (~stall_div) begin
|
||||
div_in1 <= {is_signed_div & alu_in1[i][31], div_in1_qual};
|
||||
div_in2 <= {is_signed_div & alu_in2[i][31], div_in2_qual};
|
||||
end
|
||||
end
|
||||
|
||||
VX_divide #(
|
||||
.WIDTHN(33),
|
||||
.WIDTHD(33),
|
||||
.WIDTHQ(32),
|
||||
.WIDTHR(32),
|
||||
.NSIGNED(1),
|
||||
.DSIGNED(1),
|
||||
.PIPELINE(`LATENCY_IDIV)
|
||||
) divide (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.clk_en(~stall_div),
|
||||
.numer(div_in1),
|
||||
.denom(div_in2),
|
||||
.quotient(div_result_tmp),
|
||||
.remainder(rem_result_tmp)
|
||||
);
|
||||
|
||||
assign div_result[i] = is_div_out ? div_result_tmp : rem_result_tmp;
|
||||
end
|
||||
|
||||
wire [MULQ_BITS-1:0] div_tag;
|
||||
wire div_valid_out;
|
||||
|
||||
wire div_fire = mul_req_if.valid && mul_req_if.ready && `IS_DIV_OP(alu_op);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW(1 + MULQ_BITS + 1),
|
||||
.DEPTH(`LATENCY_IDIV + 1)
|
||||
) div_shift_reg (
|
||||
wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU);
|
||||
wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM);
|
||||
wire div_valid_in = mul_req_if.valid && is_div_op;
|
||||
wire div_ready_in;
|
||||
wire div_ready_out;
|
||||
wire div_valid_out;
|
||||
wire is_div_out;
|
||||
wire [MULQ_BITS-1:0] div_tag;
|
||||
|
||||
VX_serial_div #(
|
||||
.WIDTHN(32),
|
||||
.WIDTHD(32),
|
||||
.WIDTHQ(32),
|
||||
.WIDTHR(32),
|
||||
.LANES(`NUM_THREADS),
|
||||
.TAGW(MULQ_BITS + 1)
|
||||
) divide (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(~stall_div),
|
||||
.in({div_fire, tag_in, (| is_div_qual)}),
|
||||
.out({div_valid_out, div_tag, is_div_out})
|
||||
.ready_in(div_ready_in),
|
||||
.valid_in(div_valid_in),
|
||||
.signed_mode(is_signed_div),
|
||||
.tag_in({tag_in, is_div_only}),
|
||||
.numer(alu_in1),
|
||||
.denom(alu_in2),
|
||||
.quotient(div_result_tmp),
|
||||
.remainder(rem_result_tmp),
|
||||
.ready_out(div_ready_out),
|
||||
.valid_out(div_valid_out),
|
||||
.tag_out({div_tag, is_div_out})
|
||||
);
|
||||
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result = is_div_out ? div_result_tmp : rem_result_tmp;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire arbiter_hazard = mul_valid_out && div_valid_out;
|
||||
|
||||
assign stall_out = ~mul_commit_if.ready && mul_commit_if.valid;
|
||||
assign stall_mul = stall_out || mulq_full;
|
||||
assign stall_div = stall_out || mulq_full
|
||||
|| arbiter_hazard; // arbitration prioritizes MUL
|
||||
wire stall_in = stall_mul || stall_div;
|
||||
assign stall_mul = (stall_out && !is_div_op) || mulq_full;
|
||||
assign div_ready_out = ~stall_out && ~arbiter_hazard; // arbitration prioritizes MUL
|
||||
wire stall_in = stall_mul || ~div_ready_in;
|
||||
|
||||
assign valid_out = mul_valid_out || div_valid_out;
|
||||
assign tag_out = mul_valid_out ? mul_tag : div_tag;
|
||||
|
@ -186,7 +149,7 @@ module VX_mul_unit #(
|
|||
|
||||
VX_generic_register #(
|
||||
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32))
|
||||
) alu_reg (
|
||||
) mul_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (stall_out),
|
||||
|
|
|
@ -210,9 +210,10 @@ module VX_warp_sched #(
|
|||
wire [`NUM_WARPS-1:0] schedule_ready = schedule_table & ~(stalled_warps | total_barrier_stall | fetch_lock);
|
||||
|
||||
always @(*) begin
|
||||
schedule_valid = 0;
|
||||
thread_mask = 'x;
|
||||
warp_pc = 'x;
|
||||
schedule_valid = 0;
|
||||
thread_mask = 'x;
|
||||
warp_pc = 'x;
|
||||
warp_to_schedule = 'x;
|
||||
for (integer i = 0; i < `NUM_WARPS; ++i) begin
|
||||
if (schedule_ready[i]) begin
|
||||
schedule_valid = 1;
|
||||
|
|
|
@ -74,7 +74,7 @@ module VX_fp_fpga #(
|
|||
.ready_in (per_core_ready_in[0]),
|
||||
.tag_in (tag_in),
|
||||
.op_type (op_type),
|
||||
.frm (op_mod),
|
||||
.frm (frm),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.result (per_core_result[0]),
|
||||
|
@ -278,14 +278,14 @@ module VX_fp_fpga #(
|
|||
|
||||
always @(*) begin
|
||||
per_core_ready_out = 0;
|
||||
valid_out_r = 0;
|
||||
has_fflags_r = 0;
|
||||
result_r = 'x;
|
||||
tag_out_r = 'x;
|
||||
valid_out_r = 0;
|
||||
has_fflags_r = 'x;
|
||||
result_r = 'x;
|
||||
tag_out_r = 'x;
|
||||
for (integer i = 0; i < NUM_FPC; i++) begin
|
||||
if (per_core_valid_out[i]) begin
|
||||
per_core_ready_out[i] = 1;
|
||||
valid_out_r = i;
|
||||
valid_out_r = 1;
|
||||
has_fflags_r = fpnew_has_fflags && (i == 0);
|
||||
result_r = per_core_result[i];
|
||||
tag_out_r = per_core_tag_out[i];
|
||||
|
|
124
hw/rtl/libs/VX_serial_div.v
Normal file
124
hw/rtl/libs/VX_serial_div.v
Normal file
|
@ -0,0 +1,124 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
// Multi-lane serial (one quotient bit per clock) integer divider.
//
// All lanes share a single control path: one request is accepted when the
// unit is idle, every lane is iterated in lock-step for WIDTHN cycles, and
// the result is then held on the outputs until the consumer accepts it.
//
// Handshake:
//   - input  side: a request is taken on a cycle where valid_in && ready_in;
//                  ready_in is low for the whole time a request is in flight.
//   - output side: valid_out is raised once the iteration counter reaches
//                  zero and stays high until valid_out && ready_out.
//
// Signed operation (signed_mode = 1): operands are converted to magnitudes
// before the iteration and the sign is re-applied to the results afterwards.
// The quotient is negated when the operand signs differ, the remainder takes
// the sign of the numerator. Quotient negation is suppressed when the divisor
// is zero, so the raw result of the unsigned iteration is returned unchanged.
//
// NOTE(review): the datapath sizing below is exercised in SOURCE only with
// WIDTHN == WIDTHD == WIDTHQ == WIDTHR; other combinations rely on implicit
// truncation/extension and should be verified before use.
module VX_serial_div #(
    parameter WIDTHN = 1,   // numerator width
    parameter WIDTHD = 1,   // denominator width
    parameter WIDTHQ = 1,   // quotient width
    parameter WIDTHR = 1,   // remainder width
    parameter LANES  = 1,   // number of independent divisions done in lock-step
    parameter TAGW   = 1    // width of the opaque tag carried with a request
) (
    input wire                          clk,
    input wire                          reset,

    // request
    input wire                          valid_in,
    output wire                         ready_in,
    input wire [LANES-1:0][WIDTHN-1:0]  numer,
    input wire [LANES-1:0][WIDTHD-1:0]  denom,
    input wire                          signed_mode,
    input wire [TAGW-1:0]               tag_in,

    // response
    output wire [LANES-1:0][WIDTHQ-1:0] quotient,
    output wire [LANES-1:0][WIDTHR-1:0] remainder,
    input wire                          ready_out,
    output wire                         valid_out,
    output wire [TAGW-1:0]              tag_out
);
    localparam MIN_ND = (WIDTHN < WIDTHD) ? WIDTHN : WIDTHD;
    localparam CNTRW  = $clog2(WIDTHN+1);   // counter must be able to hold WIDTHN

    // Per-lane shift register: the upper part holds the partial remainder,
    // the lower part holds the not-yet-consumed numerator bits which are
    // progressively replaced by quotient bits shifted in at bit 0.
    reg [LANES-1:0][WIDTHN + MIN_ND:0] working;
    reg [LANES-1:0][WIDTHD-1:0]        denom_r;     // latched divisor magnitude

    wire [LANES-1:0][WIDTHN-1:0] numer_qual;        // |numer| in signed mode
    wire [LANES-1:0][WIDTHD-1:0] denom_qual;        // |denom| in signed mode
    wire [LANES-1:0][WIDTHD:0]   sub_result;        // trial subtraction, MSB = borrow

    reg [LANES-1:0] inv_quot, inv_rem;              // negate results on output

    reg [CNTRW-1:0] cntr;                           // remaining iterations
    reg             is_busy;                        // a request is in flight

    reg [TAGW-1:0]  tag_r;

    wire done = ~(| cntr);

    wire push = valid_in && ready_in;
    wire pop  = valid_out && ready_out;

    for (genvar i = 0; i < LANES; ++i) begin
        // two's-complement negate negative operands so the iteration is unsigned
        wire negate_numer = signed_mode && numer[i][WIDTHN-1];
        wire negate_denom = signed_mode && denom[i][WIDTHD-1];
        assign numer_qual[i] = (numer[i] ^ {WIDTHN{negate_numer}}) + WIDTHN'(negate_numer);
        assign denom_qual[i] = (denom[i] ^ {WIDTHD{negate_denom}}) + WIDTHD'(negate_denom);
        // trial subtract of the divisor from the current partial remainder
        assign sub_result[i] = working[i][WIDTHN + MIN_ND : WIDTHN] - denom_r[i];
    end

    always @(posedge clk) begin
        if (reset) begin
            cntr    <= 0;
            is_busy <= 0;
        end
        else begin
            if (push) begin
                for (integer i = 0; i < LANES; ++i) begin
                    // numerator is loaded pre-shifted by one position
                    working[i]  <= {{WIDTHD{1'b0}}, numer_qual[i], 1'b0};
                    denom_r[i]  <= denom_qual[i];
                    // Use the operands' actual sign bits (WIDTHN-1 / WIDTHD-1)
                    // rather than a fixed bit 31 so the sign fix-up follows the
                    // configured widths, matching negate_numer/negate_denom.
                    inv_quot[i] <= (denom[i] != 0) && signed_mode && (numer[i][WIDTHN-1] ^ denom[i][WIDTHD-1]);
                    inv_rem[i]  <= signed_mode && numer[i][WIDTHN-1];
                end
                tag_r   <= tag_in;
                cntr    <= CNTRW'(WIDTHN);
                is_busy <= 1;
            end
            else begin
                if (!done) begin
                    for (integer i = 0; i < LANES; ++i) begin
                        // borrow  -> keep partial remainder, shift in quotient bit 0
                        // no borrow -> take the difference,  shift in quotient bit 1
                        working[i] <= sub_result[i][WIDTHD] ? {working[i][WIDTHN+MIN_ND-1:0], 1'b0} :
                                                              {sub_result[i][WIDTHD-1:0], working[i][WIDTHN-1:0], 1'b1};
                    end
                    cntr <= cntr - CNTRW'(1);
                end
            end
            // result accepted by the consumer: become idle again
            if (pop) begin
                is_busy <= 0;
            end
        end
    end

    for (genvar i = 0; i < LANES; ++i) begin
        // re-apply the sign (conditional two's-complement negate)
        assign quotient[i]  = (working[i][WIDTHQ-1:0] ^ {WIDTHQ{inv_quot[i]}}) + WIDTHQ'(inv_quot[i]);
        assign remainder[i] = (working[i][WIDTHN+WIDTHR:WIDTHN+1] ^ {WIDTHR{inv_rem[i]}}) + WIDTHR'(inv_rem[i]);
    end

    assign ready_in  = !is_busy;
    assign tag_out   = tag_r;
    assign valid_out = is_busy && done;

    // Disabled alternative with registered outputs, kept for reference.
    /*reg [LANES-1:0][WIDTHQ-1:0] quotient_r;
    reg [LANES-1:0][WIDTHR-1:0] remainder_r;
    reg [TAGW-1:0] tag_out_r;
    reg valid_out_r;

    wire stall_out = !ready_out && valid_out_r;
    assign pop = is_busy && done && !stall_out;

    always @(posedge clk) begin
        if (reset) begin
            valid_out_r <= 0;
        end else if (~stall_out) begin
            for (integer i = 0; i < LANES; ++i) begin
                quotient_r[i] <= (working[i][WIDTHQ-1:0] ^ {WIDTHQ{inv_quot[i]}}) + WIDTHQ'(inv_quot[i]);
                remainder_r[i] <= ((working[i][WIDTHN+WIDTHR-1:WIDTHN] >> 1) ^ {WIDTHR{inv_rem[i]}}) + WIDTHR'(inv_rem[i]);
            end
            tag_out_r <= tag_r;
            valid_out_r <= is_busy && done;
        end
    end

    assign ready_in = !is_busy;
    assign quotient = quotient_r;
    assign remainder = remainder_r;
    assign tag_out = tag_out_r;
    assign valid_out = valid_out_r;*/

endmodule
|
Loading…
Add table
Add a link
Reference in a new issue