Div improvements

This commit is contained in:
Eric Matthews 2021-03-29 10:45:40 -07:00
parent 8cc7e217ee
commit 40f5e808fb
9 changed files with 189 additions and 329 deletions

View file

@ -39,26 +39,26 @@ module clz
//////////////////////////////////////////
//31-28 index: 0, 3-0 index: 7
const logic [1:0] clz_low_table [8] = '{2'd3, 2'd2, 2'd1, 2'd1, 2'd0, 2'd0, 2'd0, 2'd0};
always_comb begin
for (int i=0; i<8; i++) begin
sub_clz[7-i] = ~|clz_input[(i*4)+:4];
low_order_clz[7-i][1] = ~(clz_input[(i*4)+3] | clz_input[(i*4)+2]);
low_order_clz[7-i][0] = ~(clz_input[(i*4)+3] | (~clz_input[(i*4)+2] & clz_input[(i*4)+1]));
sub_clz[7-i] = ~|clz_input[(i*4) +: 4];
low_order_clz[7-i] = clz_low_table[clz_input[(i*4) + 1 +: 3]];
end
clz[4] = &sub_clz[3:0]; //upper 16 all zero
clz[3] = &sub_clz[1:0] & (~&sub_clz[3:2] | &sub_clz[5:4]);
clz[3] = clz[4] ? &sub_clz[5:4] : &sub_clz[1:0];//upper 24 zero, or first 8 zero
clz[2] =
(sub_clz[0] & ~sub_clz[1]) |
(&sub_clz[2:0] & ~sub_clz[3]) |
(&sub_clz[4:0] & ~sub_clz[5]) |
(&sub_clz[6:0]);
for (int i=0; i<4; i++) begin
upper_lower[i] = (sub_clz[(2*i)]) ? low_order_clz[(2*i)+1] : low_order_clz[(2*i)];
for (int i=0; i<8; i+=2) begin
upper_lower[i/2] = low_order_clz[{i[2:1], sub_clz[i]}];
end
clz[1:0] = upper_lower[clz[4:3]];
end
endmodule
endmodule

View file

@ -1,45 +0,0 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
* Alec Lu <alec_lu@sfu.ca>
*/
import taiga_config::*;
import taiga_types::*;
// Wrapper that picks the unsigned divider implementation at elaboration time.
//
// The choice is driven by the DIV_ALGORITHM constant (brought into scope by the
// package imports that precede this module). Exactly one of the two divider
// modules is instantiated as "div_block"; both are configured for a 32-bit
// datapath and share the same clock, reset and division interface.
//
// Ports:
//   clk - clock forwarded to the selected divider
//   rst - reset forwarded to the selected divider
//   div - divider-side modport of the unsigned division interface
module div_algorithm
(
    input logic clk,
    input logic rst,
    unsigned_division_interface.divider div
);

    generate
        if (DIV_ALGORITHM == RADIX_2)
            // Shift/subtract divider
            div_radix2 #(.DIV_WIDTH(32)) div_block (
                .clk (clk),
                .rst (rst),
                .div (div)
            );
        else if (DIV_ALGORITHM == QUICK_CLZ)
            // Count-leading-zeros assisted divider
            div_quick_clz #(.DIV_WIDTH(32)) div_block (
                .clk (clk),
                .rst (rst),
                .div (div)
            );
    endgenerate

endmodule

View file

@ -1,140 +0,0 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
* Alec Lu <alec_lu@sfu.ca>
*/
// Iterative unsigned divider that uses count-leading-zeros (CLZ) results to
// align the divisor with the current remainder on every iteration.
//
// Each iteration subtracts one aligned multiple of the divisor from the
// remainder and sets one quotient bit. Iteration stops as soon as the
// remainder is smaller than the divisor.
//
// Parameters:
//   DIV_WIDTH - width of the dividend/divisor/quotient/remainder datapath.
//               NOTE(review): the clz instances below take no width parameter;
//               presumably they are fixed at 32 bits - confirm before changing
//               DIV_WIDTH.
// Ports:
//   clk - clock
//   rst - synchronous reset; only clears the "running" state bit
//   div - divider-side modport: reads start/dividend/divisor, drives
//         remainder/quotient/done/divisor_is_zero
module div_quick_clz
#(
parameter DIV_WIDTH = 32
)
(
input logic clk,
input logic rst,
unsigned_division_interface.divider div
);
// High from the cycle after a (non-zero-divisor) start until terminate is seen
logic running;
// High when the current remainder is smaller than the latched divisor
logic terminate;
// Divisor captured on start, and the same value shifted left by its CLZ
logic [DIV_WIDTH-1:0] divisor_r;
logic [DIV_WIDTH-1:0] normalized_divisor;
// Borrow out of (remainder - test_multiple1)
logic overflow;
logic [DIV_WIDTH-1:0] subtraction1;
logic [DIV_WIDTH-1:0] subtraction2;
logic [DIV_WIDTH-1:0] new_remainder;
logic [DIV_WIDTH-1:0] new_quotient;
// One-hot quotient bit candidates matching test_multiple1 / test_multiple2
logic [DIV_WIDTH-1:0] new_Q_bit1;
logic [DIV_WIDTH-1:0] new_Q_bit2;
// Divisor multiples tested against the remainder this iteration
logic [DIV_WIDTH-1:0] test_multiple1;
logic [DIV_WIDTH-1:0] test_multiple2;
localparam CLZ_W = $clog2(DIV_WIDTH);
logic [CLZ_W-1:0] remainder_CLZ;
logic [CLZ_W-1:0] divisor_CLZ;
logic [CLZ_W-1:0] divisor_CLZ_r;
logic [CLZ_W-1:0] CLZ_delta;
logic divisor_is_zero_first_cycle;
////////////////////////////////////////////////////
//Implementation
// CLZ of the running remainder (re-evaluated every iteration) and of the
// incoming divisor (only consumed in the start cycle).
clz remainder_clz_block (.clz_input(div.remainder), .clz(remainder_CLZ));
clz divisor_clz_block (.clz_input(div.divisor), .clz(divisor_CLZ));
////////////////////////////////////////////////////
//Control Signals
// Zero is detected as "CLZ output all ones AND bit 0 clear".
// Presumably the clz block reports the same all-ones value for inputs 0 and 1,
// which is why bit 0 is needed to tell them apart - TODO confirm against clz.
assign divisor_is_zero_first_cycle = (&divisor_CLZ) & ~div.divisor[0];
// Latched on start so the flag stays valid for the whole operation
always @ (posedge clk) begin
if (div.start)
div.divisor_is_zero <= divisor_is_zero_first_cycle;
end
// Priority: reset, then start (skipped for a zero divisor), then terminate
always_ff @ (posedge clk) begin
if (rst)
running <= 0;
else if (div.start & ~divisor_is_zero_first_cycle)
running <= 1;
else if (terminate)
running <= 0;
end
// done is registered: asserted the cycle after terminate is seen while running,
// or the cycle after a start with a zero divisor (no iterations are performed).
always_ff @ (posedge clk) begin
div.done <= (running & terminate) | (div.start & divisor_is_zero_first_cycle);
end
assign terminate = div.remainder < divisor_r;
////////////////////////////////////////////////////
//Divisor Pre-processing
// Captured once per operation. normalized_divisor is the divisor shifted left
// by its own CLZ value.
always_ff @ (posedge clk) begin
if (div.start) begin
divisor_r <= div.divisor;
divisor_CLZ_r <= divisor_CLZ;
normalized_divisor <= div.divisor << divisor_CLZ;
end
end
////////////////////////////////////////////////////
//Remainder Determination
// test_multiple1 shifts the normalized divisor right by the remainder's CLZ.
// Assuming clz returns the leading-zero count, this lines up the divisor's top
// set bit with the remainder's top set bit.
assign test_multiple1 = normalized_divisor >> remainder_CLZ;
// overflow is the borrow bit: set when test_multiple1 is larger than the remainder
assign {overflow, subtraction1} = div.remainder - test_multiple1;
// Fallback candidate: half of test_multiple1
assign test_multiple2 = test_multiple1 >> 1;
assign subtraction2 = div.remainder - test_multiple2;
assign new_remainder = overflow ? subtraction2 : subtraction1;
// Gives div.remainder a defined value before the first start.
// NOTE(review): this initial block is presumably why the register below uses a
// plain always block rather than always_ff - confirm.
initial begin
div.remainder = 0;
end
// Loaded with the dividend on start, then replaced by the selected difference
// on every iteration that has not terminated.
always @ (posedge clk) begin
if (div.start)
div.remainder <= div.dividend;
else if (~terminate & running)
div.remainder <= new_remainder;
end
////////////////////////////////////////////////////
//Quotient Determination
// Index of the quotient bit that corresponds to test_multiple1
assign CLZ_delta = divisor_CLZ_r - remainder_CLZ;
// One-hot encode CLZ_delta
always_comb begin
new_Q_bit1 = 0;
new_Q_bit1[CLZ_delta] = 1;
end
// Quotient bit that corresponds to test_multiple2 (one position lower)
assign new_Q_bit2 = new_Q_bit1 >> 1;
// Select the bit that matches the subtraction chosen for new_remainder
assign new_quotient = div.quotient | (overflow ? new_Q_bit2 : new_Q_bit1);
// Cleared on start, accumulated on every iteration that has not terminated
always_ff @ (posedge clk) begin
if (div.start)
div.quotient <= '0;
else if (~terminate & running)
div.quotient <= new_quotient;
end
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Assertions
endmodule

View file

@ -1,96 +0,0 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
* Alec Lu <alec_lu@sfu.ca>
*/
// Unsigned shift-and-subtract divider that produces one quotient bit per clock.
//
// div.quotient doubles as the dividend shift register: dividend bits leave from
// its MSB into the partial remainder (PR) while new quotient bits enter at its
// LSB. The operation length is fixed and tracked by a shift register that
// carries the start pulse.
//
// Parameters:
//   DIV_WIDTH - width of the dividend/divisor/quotient/remainder datapath
// Ports:
//   clk - clock
//   rst - synchronous reset for terminate and div.done
//   div - divider-side modport: reads start/dividend/divisor, drives
//         remainder/quotient/done/divisor_is_zero
module div_radix2
#(
parameter DIV_WIDTH = 32
)
(
input logic clk,
input logic rst,
unsigned_division_interface.divider div
);
// Set once the start pulse has reached the top of shift_count; freezes PR,
// div.quotient and div.divisor_is_zero until the next start
logic terminate;
logic [DIV_WIDTH-1:0] divisor_r;
// Partial remainder (one bit wider than the datapath) and the trial difference
logic [DIV_WIDTH:0] new_PR;
logic [DIV_WIDTH:0] PR;
logic [DIV_WIDTH-1:0] shift_count;
// Top bit of the trial difference: set when the subtraction went negative
logic negative_sub_rst;
//implementation
////////////////////////////////////////////////////
assign new_PR = PR - {1'b0, divisor_r};
assign negative_sub_rst = new_PR[DIV_WIDTH];
//Shift register for tracking progress: the start pulse enters at bit 0 and moves
//up one position per clock; its arrival at bit DIV_WIDTH-1 ends the operation.
//This register has no reset.
always_ff @ (posedge clk) begin
shift_count <= {shift_count[DIV_WIDTH-2:0], div.start};
end
always_ff @ (posedge clk) begin
if (div.start) begin
// Latch the divisor, seed PR with the dividend MSB and park the remaining
// dividend bits (shifted left by one) in the quotient register
divisor_r <= div.divisor;
PR <= {(DIV_WIDTH)'(1'b0), div.dividend[DIV_WIDTH-1]};
div.quotient <= {div.dividend[DIV_WIDTH-2:0], 1'b0};
end
else if (~terminate) begin
// Keep PR if the subtraction went negative, otherwise take the difference;
// in both cases shift in the next dividend bit from the quotient MSB.
PR <= negative_sub_rst ? {PR[DIV_WIDTH-1:0], div.quotient[DIV_WIDTH-1]} : {new_PR[DIV_WIDTH-1:0], div.quotient[DIV_WIDTH-1]};
// New quotient bit is 1 when the subtraction succeeded
div.quotient <= {div.quotient[DIV_WIDTH-2:0], ~negative_sub_rst};
end
end
// The remainder is PR without its LSB (the bit most recently shifted in from div.quotient)
assign div.remainder = PR[DIV_WIDTH:1];
// Starts as "divisor bit 0 is clear" and is cleared by any iteration whose
// subtraction goes negative, so it remains set only if no subtraction was negative.
always_ff @ (posedge clk) begin
if (div.start)
div.divisor_is_zero <= ~div.divisor[0];
else if (~terminate)
div.divisor_is_zero <= div.divisor_is_zero & ~negative_sub_rst;
end
// The two ifs in the else branch are independent: if both conditions hold, the later
// assignment (terminate <= 1) takes effect.
always_ff @ (posedge clk) begin
if (rst)
terminate <= 0;
else begin
if (div.start)
terminate <= 0;
if (shift_count[DIV_WIDTH-1])
terminate <= 1;
end
end
// div.done is set when the start pulse reaches the top of shift_count and is
// cleared on the following clock unless that bit is set again.
always_ff @ (posedge clk) begin
if (rst)
div.done <= 0;
else begin
if (shift_count[DIV_WIDTH-1])
div.done <= 1;
else if (div.done)
div.done <= 0;
end
end
endmodule

122
core/div_core.sv Normal file
View file

@ -0,0 +1,122 @@
/*
* Copyright © 2021 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*
*/
// Unsigned divider core that retires two quotient bits per clock.
//
// The requester supplies the dividend, the divisor, their count-leading-zeros
// (CLZ) values and a divisor_is_zero flag through the interface. The CLZ
// difference is used to (a) skip the operation entirely when the divisor has
// fewer leading zeros than the dividend, and (b) pre-shift the divisor so that
// only floor(CLZ_delta/2)+1 iterations are executed.
//
// Parameters:
//   DIV_WIDTH - width of the dividend/divisor/quotient/remainder datapath
// Ports:
//   clk - clock
//   rst - synchronous reset for running and div.done
//   div - divider-side modport: reads start, dividend, divisor, dividend_CLZ,
//         divisor_CLZ and divisor_is_zero; drives remainder, quotient and done
module div_core
#(
parameter DIV_WIDTH = 32
)
(
input logic clk,
input logic rst,
unsigned_division_interface.divider div
);
localparam CLZ_W = $clog2(DIV_WIDTH);
// divisor_CLZ - dividend_CLZ, and the borrow out of that subtraction
logic [CLZ_W-1:0] CLZ_delta;
logic divisor_greater_than_dividend;
// Operation completes without iterating (quotient stays 0, remainder = dividend)
logic first_cycle_abort;
// Divisor aligned for the current pair of quotient bits
logic [DIV_WIDTH-1:0] shifted_divisor;
logic [1:0] new_quotient_bits;
// Results and borrow flags of the two chained trial subtractions
logic [DIV_WIDTH-1:0] sub_1x;
logic [DIV_WIDTH-1:0] sub_2x;
logic sub_1x_overflow;
logic sub_2x_overflow;
// Down-counter of iterations left after the current one
logic [CLZ_W-2:0] cycles_remaining;
logic [CLZ_W-2:0] cycles_remaining_next;
logic running;
// Borrow out of the counter decrement: set when cycles_remaining is zero
logic terminate;
////////////////////////////////////////////////////
//Implementation
//First cycle
// The extra top bit of the result is the borrow, set when divisor_CLZ < dividend_CLZ
// (the divisor has fewer leading zeros than the dividend, so the quotient is zero).
assign {divisor_greater_than_dividend, CLZ_delta} = div.divisor_CLZ - div.dividend_CLZ;
// While idle, continuously load the divisor shifted left by CLZ_delta rounded
// down to an even amount; while running, move down two bit positions per cycle.
always_ff @ (posedge clk) begin
if (running)
shifted_divisor <= {2'b0, shifted_divisor[DIV_WIDTH-1:2]};
else
shifted_divisor <= div.divisor << {CLZ_delta[CLZ_W-1:1], 1'b0};//Rounding down when CLZ_delta is odd
end
//Subtractions
// sub2x_toss absorbs the extra result bit created by the one-bit-wider 2x operand
logic sub2x_toss;
// Trial 1: remainder - 2*shifted_divisor; sub_2x_overflow is the borrow
assign {sub_2x_overflow, sub2x_toss, sub_2x} = {1'b0, div.remainder} - {shifted_divisor, 1'b0};
// Trial 2: subtract 1*shifted_divisor from whichever value survived trial 1
assign {sub_1x_overflow, sub_1x} = {1'b0, (sub_2x_overflow ? div.remainder : sub_2x)} - {1'b0, shifted_divisor};
// A quotient bit is 1 when its trial subtraction did not borrow
assign new_quotient_bits[1] = ~sub_2x_overflow;
assign new_quotient_bits[0] = ~sub_1x_overflow;
// Cleared on start, then two new bits are shifted in at the LSB end every running cycle
always_ff @ (posedge clk) begin
if (div.start)
div.quotient <= '0;
else if (running)
div.quotient <= {div.quotient[(DIV_WIDTH-3):0], new_quotient_bits};
end
//Remainder mux, when quotient bits are zero value is held
// Selector is {div.start, sub_1x_overflow}:
//   0       - trial 2 succeeded, take sub_1x
//   1       - trial 2 borrowed; the enable guarantees the bit pair is non-zero in this
//             case, so trial 1 succeeded and sub_2x is the new remainder
//   default - div.start is set, load the dividend
always_ff @ (posedge clk) begin
if (div.start | (running & |new_quotient_bits)) begin //enable: on div.start for init and so long as we are in the running state and the quotient pair is not zero
case ({div.start, sub_1x_overflow})
0 : div.remainder <= sub_1x;
1 : div.remainder <= sub_2x;
default : div.remainder <= div.dividend;//Overloading the quotient zero case to fit the initial loading of the dividend in
endcase
end
end
////////////////////////////////////////////////////
//Control Signals
// Skip iterating when the quotient is known to be zero or the divisor is zero
assign first_cycle_abort = divisor_greater_than_dividend | div.divisor_is_zero;
// terminate is the borrow of the decrement, i.e. cycles_remaining == 0
assign {terminate, cycles_remaining_next} = cycles_remaining - 1;
// While idle, track floor(CLZ_delta/2); while running, count down
always_ff @ (posedge clk) begin
cycles_remaining <= running ? cycles_remaining_next : CLZ_delta[CLZ_W-1:1];
end
// Priority: reset, then start (enters running unless aborted), then terminate
always_ff @ (posedge clk) begin
if (rst)
running <= 0;
else if (div.start)
running <= ~first_cycle_abort;
else if (terminate)
running <= 0;
end
// done is registered: asserted the cycle after the final iteration, or the
// cycle after an aborted start
always_ff @ (posedge clk) begin
if (rst)
div.done <= 0;
else
div.done <= (running & terminate) | (div.start & first_cycle_abort);
end
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Assertions
endmodule

View file

@ -41,15 +41,19 @@ module div_unit
logic negate_remainder;
logic negate_dividend;
logic negate_divisor;
logic remainder_op;
logic [31:0] unsigned_dividend;
logic [31:0] unsigned_divisor;
logic remainder_op;
logic [$clog2(32)-1:0] dividend_CLZ;
logic [$clog2(32)-1:0] divisor_CLZ;
logic divisor_is_zero;
typedef struct packed{
logic remainder_op;
logic negate_quotient;
logic negate_remainder;
logic negate_result;
logic divisor_is_zero;
logic reuse_result;
id_t id;
} div_attributes_t;
@ -57,6 +61,8 @@ module div_unit
typedef struct packed{
logic [XLEN-1:0] unsigned_dividend;
logic [XLEN-1:0] unsigned_divisor;
logic [$clog2(32)-1:0] dividend_CLZ;
logic [$clog2(32)-1:0] divisor_CLZ;
div_attributes_t attr;
} div_fifo_inputs_t;
@ -64,11 +70,10 @@ module div_unit
div_fifo_inputs_t div_op;
div_attributes_t in_progress_attr;
unsigned_division_interface #(.DATA_WIDTH(32)) div_core();
unsigned_division_interface #(.DATA_WIDTH(32)) div();
logic in_progress;
logic div_done;
logic negate_result;
fifo_interface #(.DATA_WIDTH($bits(div_fifo_inputs_t))) input_fifo();
fifo_interface #(.DATA_WIDTH(XLEN)) wb_fifo();
@ -93,39 +98,55 @@ module div_unit
assign unsigned_dividend = negate_if (div_inputs.rs1, negate_dividend);
assign unsigned_divisor = negate_if (div_inputs.rs2, negate_divisor);
//Note: If this becomes the critical path, we can use the one's complemented input instead.
//It will potentially overestimate (only when the input is a negative power-of-two), and
//the divisor width will need to be increased by one to safely handle the case where the divisor CLZ is overestimated
clz dividend_clz_block (.clz_input(unsigned_dividend), .clz(dividend_CLZ));
clz divisor_clz_block (.clz_input(unsigned_divisor), .clz(divisor_CLZ));
assign divisor_is_zero = (&divisor_CLZ) & ~div_inputs.rs2[0];
assign fifo_inputs.unsigned_dividend = unsigned_dividend;
assign fifo_inputs.unsigned_divisor = unsigned_divisor;
assign fifo_inputs.dividend_CLZ = dividend_CLZ;
assign fifo_inputs.divisor_CLZ = divisor_CLZ;
assign fifo_inputs.attr.remainder_op = div_inputs.op[1];
assign fifo_inputs.attr.negate_quotient = negate_quotient;
assign fifo_inputs.attr.negate_remainder = negate_remainder;
assign fifo_inputs.attr.negate_result = div_inputs.op[1] ? negate_remainder : (~divisor_is_zero & negate_quotient);
assign fifo_inputs.attr.divisor_is_zero = divisor_is_zero;
assign fifo_inputs.attr.reuse_result = div_inputs.reuse_result;
assign fifo_inputs.attr.id = issue.id;
////////////////////////////////////////////////////
//Input FIFO
//Currently just a register (DEPTH=1). As one div instruction can be in-progress
//and one in this input "fifo," we can support two in-flight div ops.
taiga_fifo #(.DATA_WIDTH($bits(div_fifo_inputs_t)), .FIFO_DEPTH(1))
div_input_fifo (
.clk (clk),
.rst (rst),
.fifo (input_fifo)
);
div_input_fifo (
.clk (clk),
.rst (rst),
.fifo (input_fifo)
);
logic div_ready;
assign div_ready = (~in_progress) | wb.ack;
assign input_fifo.data_in = fifo_inputs;
assign input_fifo.push = issue.new_request;
assign input_fifo.potential_push = issue.possible_issue;
assign issue.ready = ~input_fifo.full | input_fifo.pop; //As FIFO depth is the same as MAX_INFLIGHT_COUNT
assign input_fifo.pop = input_fifo.valid & (~in_progress);//wb.done & wb.ack;
assign issue.ready = ~input_fifo.full | (~in_progress);
assign input_fifo.pop = input_fifo.valid & div_ready;
assign div_op = input_fifo.data_out;
////////////////////////////////////////////////////
//Control Signals
assign div_core.start = input_fifo.valid & (~in_progress) & ~div_op.attr.reuse_result;
assign div_done = div_core.done | (in_progress & in_progress_attr.reuse_result);
assign div.start = input_fifo.valid & div_ready & ~div_op.attr.reuse_result;
assign div_done = div.done | (in_progress & in_progress_attr.reuse_result);
//If more than one cycle, set in_progress so that multiple div.start signals are not sent to the div unit.
set_clr_reg_with_rst #(.SET_OVER_CLR(1), .WIDTH(1), .RST_VALUE('0)) in_progress_m (
set_clr_reg_with_rst #(.SET_OVER_CLR(1), .WIDTH(1), .RST_VALUE('0))
in_progress_m (
.clk, .rst,
.set(input_fifo.valid & (~in_progress)),
.set(input_fifo.pop),
.clr(wb.ack),
.result(in_progress)
);
@ -136,25 +157,30 @@ module div_unit
////////////////////////////////////////////////////
//Div core
assign div_core.dividend = div_op.unsigned_dividend;
assign div_core.divisor = div_op.unsigned_divisor;
div_algorithm divider_block (
.clk (clk),
.rst (rst),
.div (div_core)
assign div.dividend = div_op.unsigned_dividend;
assign div.divisor = div_op.unsigned_divisor;
assign div.dividend_CLZ = div_op.dividend_CLZ;
assign div.divisor_CLZ = div_op.divisor_CLZ;
assign div.divisor_is_zero = div_op.attr.divisor_is_zero;
div_core #(.DIV_WIDTH(32))
divider_block (
.clk(clk),
.rst(rst),
.div(div)
);
////////////////////////////////////////////////////
//Output
logic done_r;
assign negate_result = in_progress_attr.remainder_op ? in_progress_attr.negate_remainder : (~div_core.divisor_is_zero & in_progress_attr.negate_quotient);
assign wb.rd = negate_if (in_progress_attr.remainder_op ? div_core.remainder : ({32{div_core.divisor_is_zero}} | div_core.quotient), negate_result);
assign wb.rd = negate_if (in_progress_attr.remainder_op ? div.remainder : ({32{in_progress_attr.divisor_is_zero}} | div.quotient), in_progress_attr.negate_result);
always_ff @ (posedge clk) begin
if (wb.ack)
if (rst)
done_r <= 0;
else if (div_done)
done_r <= 1;
else
done_r <= (div_done | done_r) & ~wb.ack;
end
assign wb.done = div_done | done_r;
assign wb.id = in_progress_attr.id;

View file

@ -262,13 +262,15 @@ endinterface
interface unsigned_division_interface #(parameter DATA_WIDTH = 32);
logic start;
logic [DATA_WIDTH-1:0] dividend;
logic [$clog2(DATA_WIDTH)-1:0] dividend_CLZ;
logic [DATA_WIDTH-1:0] divisor;
logic [$clog2(DATA_WIDTH)-1:0] divisor_CLZ;
logic [DATA_WIDTH-1:0] remainder;
logic [DATA_WIDTH-1:0] quotient;
logic done;
logic divisor_is_zero;
modport requester (input remainder, quotient, done, divisor_is_zero, output dividend, divisor, start);
modport divider (output remainder, quotient, done, divisor_is_zero, input dividend, divisor, start);
modport requester (input remainder, quotient, done, output dividend, dividend_CLZ, divisor, divisor_CLZ, divisor_is_zero, start);
modport divider (output remainder, quotient, done, input dividend, dividend_CLZ, divisor, divisor_CLZ, divisor_is_zero, start);
endinterface
interface renamer_interface;

View file

@ -48,13 +48,6 @@ package taiga_config;
localparam USE_MUL = 1;
localparam USE_DIV = 1;
//Division algorithm selection
typedef enum {
RADIX_2,//Smallest
QUICK_CLZ//Highest performance and best performance per LUT
} div_type;
localparam div_type DIV_ALGORITHM = QUICK_CLZ;
//Enable Atomic extension (cache operations only)
localparam USE_AMO = 0;

View file

@ -64,10 +64,8 @@ core/icache.sv
core/div_algorithms/div_radix2.sv
core/clz.sv
core/div_algorithms/div_quick_clz.sv
core/div_algorithms/div_algorithm.sv
core/div_core.sv
core/div_unit.sv
core/lut_ram.sv