diff --git a/core/clz.sv b/core/clz.sv index 8d2f0e2..0dd9fba 100644 --- a/core/clz.sv +++ b/core/clz.sv @@ -39,26 +39,26 @@ module clz ////////////////////////////////////////// //31-28 index: 0, 3-0 index: 7 + const logic [1:0] clz_low_table [8] = '{2'd3, 2'd2, 2'd1, 2'd1, 2'd0, 2'd0, 2'd0, 2'd0}; always_comb begin for (int i=0; i<8; i++) begin - sub_clz[7-i] = ~|clz_input[(i*4)+:4]; - low_order_clz[7-i][1] = ~(clz_input[(i*4)+3] | clz_input[(i*4)+2]); - low_order_clz[7-i][0] = ~(clz_input[(i*4)+3] | (~clz_input[(i*4)+2] & clz_input[(i*4)+1])); + sub_clz[7-i] = ~|clz_input[(i*4) +: 4]; + low_order_clz[7-i] = clz_low_table[clz_input[(i*4) + 1 +: 3]]; end clz[4] = &sub_clz[3:0]; //upper 16 all zero - clz[3] = &sub_clz[1:0] & (~&sub_clz[3:2] | &sub_clz[5:4]); + clz[3] = clz[4] ? &sub_clz[5:4] : &sub_clz[1:0];//upper 24 zero, or first 8 zero clz[2] = (sub_clz[0] & ~sub_clz[1]) | (&sub_clz[2:0] & ~sub_clz[3]) | (&sub_clz[4:0] & ~sub_clz[5]) | (&sub_clz[6:0]); - for (int i=0; i<4; i++) begin - upper_lower[i] = (sub_clz[(2*i)]) ? low_order_clz[(2*i)+1] : low_order_clz[(2*i)]; + for (int i=0; i<8; i+=2) begin + upper_lower[i/2] = low_order_clz[{i[2:1], sub_clz[i]}]; end clz[1:0] = upper_lower[clz[4:3]]; end -endmodule \ No newline at end of file +endmodule diff --git a/core/div_algorithms/div_algorithm.sv b/core/div_algorithms/div_algorithm.sv deleted file mode 100644 index 08081c2..0000000 --- a/core/div_algorithms/div_algorithm.sv +++ /dev/null @@ -1,45 +0,0 @@ -/* - * Copyright © 2017-2020 Eric Matthews, Lesley Shannon - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Initial code developed under the supervision of Dr. Lesley Shannon, - * Reconfigurable Computing Lab, Simon Fraser University. - * - * Author(s): - * Eric Matthews - * Alec Lu - */ - -import taiga_config::*; -import taiga_types::*; - -module div_algorithm - ( - input logic clk, - input logic rst, - unsigned_division_interface.divider div - ); - - generate - case(DIV_ALGORITHM) - RADIX_2 : div_radix2 #(.DIV_WIDTH(32)) div_block (.clk(clk), .rst(rst), .div(div)); - QUICK_CLZ : div_quick_clz #(.DIV_WIDTH(32)) div_block (.clk(clk), .rst(rst), .div(div)); - endcase - endgenerate - -endmodule - - - - diff --git a/core/div_algorithms/div_quick_clz.sv b/core/div_algorithms/div_quick_clz.sv deleted file mode 100644 index 1e7c4a0..0000000 --- a/core/div_algorithms/div_quick_clz.sv +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright © 2017-2020 Eric Matthews, Lesley Shannon - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Initial code developed under the supervision of Dr. Lesley Shannon, - * Reconfigurable Computing Lab, Simon Fraser University. - * - * Author(s): - * Eric Matthews - Alec Lu - */ - - -module div_quick_clz - #( - parameter DIV_WIDTH = 32 - ) - ( - input logic clk, - input logic rst, - unsigned_division_interface.divider div - ); - - logic running; - logic terminate; - logic [DIV_WIDTH-1:0] divisor_r; - - logic [DIV_WIDTH-1:0] normalized_divisor; - - logic overflow; - logic [DIV_WIDTH-1:0] subtraction1; - logic [DIV_WIDTH-1:0] subtraction2; - - logic [DIV_WIDTH-1:0] new_remainder; - logic [DIV_WIDTH-1:0] new_quotient; - - logic [DIV_WIDTH-1:0] new_Q_bit1; - logic [DIV_WIDTH-1:0] new_Q_bit2; - - logic [DIV_WIDTH-1:0] test_multiple1; - logic [DIV_WIDTH-1:0] test_multiple2; - - localparam CLZ_W = $clog2(DIV_WIDTH); - logic [CLZ_W-1:0] remainder_CLZ; - logic [CLZ_W-1:0] divisor_CLZ; - logic [CLZ_W-1:0] divisor_CLZ_r; - logic [CLZ_W-1:0] CLZ_delta; - logic divisor_is_zero_first_cycle; - //////////////////////////////////////////////////// - //Implementation - clz remainder_clz_block (.clz_input(div.remainder), .clz(remainder_CLZ)); - clz divisor_clz_block (.clz_input(div.divisor), .clz(divisor_CLZ)); - - //////////////////////////////////////////////////// - //Control Signals - assign divisor_is_zero_first_cycle = (&divisor_CLZ) & ~div.divisor[0]; - always @ (posedge clk) begin - if (div.start) - div.divisor_is_zero <= divisor_is_zero_first_cycle; - end - - always_ff @ (posedge clk) begin - if (rst) - running <= 0; - else if (div.start & ~divisor_is_zero_first_cycle) - running <= 1; - else if (terminate) - running <= 0; - end - - always_ff @ (posedge clk) begin - div.done <= (running & terminate) | (div.start & divisor_is_zero_first_cycle); - end - - assign terminate = div.remainder < divisor_r; - - //////////////////////////////////////////////////// - //Divisor Pre-processing - always_ff @ (posedge clk) begin - if (div.start) begin - divisor_r <= div.divisor; - divisor_CLZ_r <= divisor_CLZ; - normalized_divisor <= div.divisor << divisor_CLZ; - end - end - - //////////////////////////////////////////////////// - //Remainder Determination - assign test_multiple1 = normalized_divisor >> remainder_CLZ; - assign {overflow, subtraction1} = div.remainder - test_multiple1; - - assign test_multiple2 = test_multiple1 >> 1; - assign subtraction2 = div.remainder - test_multiple2; - - assign new_remainder = overflow ? subtraction2 : subtraction1; - - initial begin - div.remainder = 0; - end - always @ (posedge clk) begin - if (div.start) - div.remainder <= div.dividend; - else if (~terminate & running) - div.remainder <= new_remainder; - end - - //////////////////////////////////////////////////// - //Quotient Determination - assign CLZ_delta = divisor_CLZ_r - remainder_CLZ; - always_comb begin - new_Q_bit1 = 0; - new_Q_bit1[CLZ_delta] = 1; - end - assign new_Q_bit2 = new_Q_bit1 >> 1; - assign new_quotient = div.quotient | (overflow ? new_Q_bit2 : new_Q_bit1); - - always_ff @ (posedge clk) begin - if (div.start) - div.quotient <= '0; - else if (~terminate & running) - div.quotient <= new_quotient; - end - //////////////////////////////////////////////////// - //End of Implementation - //////////////////////////////////////////////////// - - //////////////////////////////////////////////////// - //Assertions -endmodule diff --git a/core/div_algorithms/div_radix2.sv b/core/div_algorithms/div_radix2.sv deleted file mode 100644 index 82eafe0..0000000 --- a/core/div_algorithms/div_radix2.sv +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright © 2017-2020 Eric Matthews, Lesley Shannon - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * Initial code developed under the supervision of Dr. Lesley Shannon, - * Reconfigurable Computing Lab, Simon Fraser University. - * - * Author(s): - * Eric Matthews - * Alec Lu - */ - - -module div_radix2 - #( - parameter DIV_WIDTH = 32 - ) - ( - input logic clk, - input logic rst, - unsigned_division_interface.divider div - ); - - logic terminate; - - logic [DIV_WIDTH-1:0] divisor_r; - logic [DIV_WIDTH:0] new_PR; - logic [DIV_WIDTH:0] PR; - logic [DIV_WIDTH-1:0] shift_count; - logic negative_sub_rst; - - //implementation - //////////////////////////////////////////////////// - assign new_PR = PR - {1'b0, divisor_r}; - assign negative_sub_rst = new_PR[DIV_WIDTH]; - - //Shift reg for - always_ff @ (posedge clk) begin - shift_count <= {shift_count[DIV_WIDTH-2:0], div.start}; - end - - always_ff @ (posedge clk) begin - if (div.start) begin - divisor_r <= div.divisor; - PR <= {(DIV_WIDTH)'(1'b0), div.dividend[DIV_WIDTH-1]}; - div.quotient <= {div.dividend[DIV_WIDTH-2:0], 1'b0}; - end - else if (~terminate) begin - PR <= negative_sub_rst ? {PR[DIV_WIDTH-1:0], div.quotient[DIV_WIDTH-1]} : {new_PR[DIV_WIDTH-1:0], div.quotient[DIV_WIDTH-1]}; - div.quotient <= {div.quotient[DIV_WIDTH-2:0], ~negative_sub_rst}; - end - end - - assign div.remainder = PR[DIV_WIDTH:1]; - - always_ff @ (posedge clk) begin - if (div.start) - div.divisor_is_zero <= ~div.divisor[0]; - else if (~terminate) - div.divisor_is_zero <= div.divisor_is_zero & ~negative_sub_rst; - end - - always_ff @ (posedge clk) begin - if (rst) - terminate <= 0; - else begin - if (div.start) - terminate <= 0; - if (shift_count[DIV_WIDTH-1]) - terminate <= 1; - end - end - - always_ff @ (posedge clk) begin - if (rst) - div.done <= 0; - else begin - if (shift_count[DIV_WIDTH-1]) - div.done <= 1; - else if (div.done) - div.done <= 0; - end - end - -endmodule diff --git a/core/div_core.sv b/core/div_core.sv new file mode 100644 index 0000000..57f690d --- /dev/null +++ b/core/div_core.sv @@ -0,0 +1,122 @@ +/* + * Copyright © 2021 Eric Matthews, Lesley Shannon + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Initial code developed under the supervision of Dr. Lesley Shannon, + * Reconfigurable Computing Lab, Simon Fraser University. + * + * Author(s): + * Eric Matthews + * + */ +module div_core + #( + parameter DIV_WIDTH = 32 + ) + ( + input logic clk, + input logic rst, + unsigned_division_interface.divider div + ); + + localparam CLZ_W = $clog2(DIV_WIDTH); + logic [CLZ_W-1:0] CLZ_delta; + + logic divisor_greater_than_dividend; + logic first_cycle_abort; + + logic [DIV_WIDTH-1:0] shifted_divisor; + + logic [1:0] new_quotient_bits; + logic [DIV_WIDTH-1:0] sub_1x; + logic [DIV_WIDTH-1:0] sub_2x; + logic sub_1x_overflow; + logic sub_2x_overflow; + + logic [CLZ_W-2:0] cycles_remaining; + logic [CLZ_W-2:0] cycles_remaining_next; + + logic running; + logic terminate; + //////////////////////////////////////////////////// + //Implementation + //First cycle + assign {divisor_greater_than_dividend, CLZ_delta} = div.divisor_CLZ - div.dividend_CLZ; + + always_ff @ (posedge clk) begin + if (running) + shifted_divisor <= {2'b0, shifted_divisor[DIV_WIDTH-1:2]}; + else + shifted_divisor <= div.divisor << {CLZ_delta[CLZ_W-1:1], 1'b0};//Rounding down when CLZ_delta is odd + end + + //Subtractions + logic sub2x_toss; + assign {sub_2x_overflow, sub2x_toss, sub_2x} = {1'b0, div.remainder} - {shifted_divisor, 1'b0}; + assign {sub_1x_overflow, sub_1x} = {1'b0, (sub_2x_overflow ? div.remainder : sub_2x)} - {1'b0, shifted_divisor}; + + assign new_quotient_bits[1] = ~sub_2x_overflow; + assign new_quotient_bits[0] = ~sub_1x_overflow; + + always_ff @ (posedge clk) begin + if (div.start) + div.quotient <= '0; + else if (running) + div.quotient <= {div.quotient[(DIV_WIDTH-3):0], new_quotient_bits}; + end + + //Remainder mux, when quotient bits are zero value is held + always_ff @ (posedge clk) begin + if (div.start | (running & |new_quotient_bits)) begin //enable: on div.start for init and so long as we are in the running state and the quotient pair is not zero + case ({div.start, sub_1x_overflow}) + 0 : div.remainder <= sub_1x; + 1 : div.remainder <= sub_2x; + default : div.remainder <= div.dividend;//Overloading the quotient zero case to fit the initial loading of the dividend in + endcase + end + end + + //////////////////////////////////////////////////// + //Control Signals + assign first_cycle_abort = divisor_greater_than_dividend | div.divisor_is_zero; + + assign {terminate, cycles_remaining_next} = cycles_remaining - 1; + always_ff @ (posedge clk) begin + cycles_remaining <= running ? cycles_remaining_next : CLZ_delta[CLZ_W-1:1]; + end + + always_ff @ (posedge clk) begin + if (rst) + running <= 0; + else if (div.start) + running <= ~first_cycle_abort; + else if (terminate) + running <= 0; + end + + always_ff @ (posedge clk) begin + if (rst) + div.done <= 0; + else + div.done <= (running & terminate) | (div.start & first_cycle_abort); + end + + //////////////////////////////////////////////////// + //End of Implementation + //////////////////////////////////////////////////// + + //////////////////////////////////////////////////// + //Assertions + +endmodule diff --git a/core/div_unit.sv b/core/div_unit.sv index 40adeac..55843e9 100755 --- a/core/div_unit.sv +++ b/core/div_unit.sv @@ -41,15 +41,19 @@ module div_unit logic negate_remainder; logic negate_dividend; logic negate_divisor; + logic remainder_op; logic [31:0] unsigned_dividend; logic [31:0] unsigned_divisor; - logic remainder_op; + logic [$clog2(32)-1:0] dividend_CLZ; + logic [$clog2(32)-1:0] divisor_CLZ; + + logic divisor_is_zero; typedef struct packed{ logic remainder_op; - logic negate_quotient; - logic negate_remainder; + logic negate_result; + logic divisor_is_zero; logic reuse_result; id_t id; } div_attributes_t; @@ -57,6 +61,8 @@ module div_unit typedef struct packed{ logic [XLEN-1:0] unsigned_dividend; logic [XLEN-1:0] unsigned_divisor; + logic [$clog2(32)-1:0] dividend_CLZ; + logic [$clog2(32)-1:0] divisor_CLZ; div_attributes_t attr; } div_fifo_inputs_t; @@ -64,11 +70,10 @@ module div_unit div_fifo_inputs_t div_op; div_attributes_t in_progress_attr; - unsigned_division_interface #(.DATA_WIDTH(32)) div_core(); + unsigned_division_interface #(.DATA_WIDTH(32)) div(); logic in_progress; logic div_done; - logic negate_result; fifo_interface #(.DATA_WIDTH($bits(div_fifo_inputs_t))) input_fifo(); fifo_interface #(.DATA_WIDTH(XLEN)) wb_fifo(); @@ -93,39 +98,55 @@ module div_unit assign unsigned_dividend = negate_if (div_inputs.rs1, negate_dividend); assign unsigned_divisor = negate_if (div_inputs.rs2, negate_divisor); + //Note: If this becomes the critical path, we can use the one's complemented input instead. + //It will potentially overestimate (only when the input is a negative power-of-two), and + //the divisor width will need to be increased by one to safely handle the case where the divisor CLZ is overestimated + clz dividend_clz_block (.clz_input(unsigned_dividend), .clz(dividend_CLZ)); + clz divisor_clz_block (.clz_input(unsigned_divisor), .clz(divisor_CLZ)); + assign divisor_is_zero = (&divisor_CLZ) & ~div_inputs.rs2[0]; + assign fifo_inputs.unsigned_dividend = unsigned_dividend; assign fifo_inputs.unsigned_divisor = unsigned_divisor; + assign fifo_inputs.dividend_CLZ = dividend_CLZ; + assign fifo_inputs.divisor_CLZ = divisor_CLZ; + assign fifo_inputs.attr.remainder_op = div_inputs.op[1]; - assign fifo_inputs.attr.negate_quotient = negate_quotient; - assign fifo_inputs.attr.negate_remainder = negate_remainder; + assign fifo_inputs.attr.negate_result = div_inputs.op[1] ? negate_remainder : (~divisor_is_zero & negate_quotient); + assign fifo_inputs.attr.divisor_is_zero = divisor_is_zero; assign fifo_inputs.attr.reuse_result = div_inputs.reuse_result; assign fifo_inputs.attr.id = issue.id; //////////////////////////////////////////////////// //Input FIFO + //Currently just a register (DEPTH=1). As one div instruction can be in-progress + //and one in this input "fifo," we can support two in-flight div ops. taiga_fifo #(.DATA_WIDTH($bits(div_fifo_inputs_t)), .FIFO_DEPTH(1)) - div_input_fifo ( - .clk (clk), - .rst (rst), - .fifo (input_fifo) - ); + div_input_fifo ( + .clk (clk), + .rst (rst), + .fifo (input_fifo) + ); + + logic div_ready; + assign div_ready = (~in_progress) | wb.ack; assign input_fifo.data_in = fifo_inputs; assign input_fifo.push = issue.new_request; assign input_fifo.potential_push = issue.possible_issue; - assign issue.ready = ~input_fifo.full | input_fifo.pop; //As FIFO depth is the same as MAX_INFLIGHT_COUNT - assign input_fifo.pop = input_fifo.valid & (~in_progress);//wb.done & wb.ack; + assign issue.ready = ~input_fifo.full | (~in_progress); + assign input_fifo.pop = input_fifo.valid & div_ready; assign div_op = input_fifo.data_out; //////////////////////////////////////////////////// //Control Signals - assign div_core.start = input_fifo.valid & (~in_progress) & ~div_op.attr.reuse_result; - assign div_done = div_core.done | (in_progress & in_progress_attr.reuse_result); + assign div.start = input_fifo.valid & div_ready & ~div_op.attr.reuse_result; + assign div_done = div.done | (in_progress & in_progress_attr.reuse_result); //If more than one cycle, set in_progress so that multiple div.start signals are not sent to the div unit. - set_clr_reg_with_rst #(.SET_OVER_CLR(1), .WIDTH(1), .RST_VALUE('0)) in_progress_m ( + set_clr_reg_with_rst #(.SET_OVER_CLR(1), .WIDTH(1), .RST_VALUE('0)) + in_progress_m ( .clk, .rst, - .set(input_fifo.valid & (~in_progress)), + .set(input_fifo.pop), .clr(wb.ack), .result(in_progress) ); @@ -136,25 +157,30 @@ module div_unit //////////////////////////////////////////////////// //Div core - assign div_core.dividend = div_op.unsigned_dividend; - assign div_core.divisor = div_op.unsigned_divisor; - div_algorithm divider_block ( - .clk (clk), - .rst (rst), - .div (div_core) + assign div.dividend = div_op.unsigned_dividend; + assign div.divisor = div_op.unsigned_divisor; + assign div.dividend_CLZ = div_op.dividend_CLZ; + assign div.divisor_CLZ = div_op.divisor_CLZ; + + assign div.divisor_is_zero = div_op.attr.divisor_is_zero; + + div_core #(.DIV_WIDTH(32)) + divider_block ( + .clk(clk), + .rst(rst), + .div(div) ); //////////////////////////////////////////////////// //Output logic done_r; - assign negate_result = in_progress_attr.remainder_op ? in_progress_attr.negate_remainder : (~div_core.divisor_is_zero & in_progress_attr.negate_quotient); - assign wb.rd = negate_if (in_progress_attr.remainder_op ? div_core.remainder : ({32{div_core.divisor_is_zero}} | div_core.quotient), negate_result); + assign wb.rd = negate_if (in_progress_attr.remainder_op ? div.remainder : ({32{in_progress_attr.divisor_is_zero}} | div.quotient), in_progress_attr.negate_result); always_ff @ (posedge clk) begin - if (wb.ack) + if (rst) done_r <= 0; - else if (div_done) - done_r <= 1; + else + done_r <= (div_done | done_r) & ~wb.ack; end assign wb.done = div_done | done_r; assign wb.id = in_progress_attr.id; diff --git a/core/interfaces.sv b/core/interfaces.sv index 6c4aeb9..e72e761 100755 --- a/core/interfaces.sv +++ b/core/interfaces.sv @@ -262,13 +262,15 @@ endinterface interface unsigned_division_interface #(parameter DATA_WIDTH = 32); logic start; logic [DATA_WIDTH-1:0] dividend; + logic [$clog2(DATA_WIDTH)-1:0] dividend_CLZ; logic [DATA_WIDTH-1:0] divisor; + logic [$clog2(DATA_WIDTH)-1:0] divisor_CLZ; logic [DATA_WIDTH-1:0] remainder; logic [DATA_WIDTH-1:0] quotient; logic done; logic divisor_is_zero; - modport requester (input remainder, quotient, done, divisor_is_zero, output dividend, divisor, start); - modport divider (output remainder, quotient, done, divisor_is_zero, input dividend, divisor, start); + modport requester (input remainder, quotient, done, output dividend, dividend_CLZ, divisor, divisor_CLZ, divisor_is_zero, start); + modport divider (output remainder, quotient, done, input dividend, dividend_CLZ, divisor, divisor_CLZ, divisor_is_zero, start); endinterface interface renamer_interface; diff --git a/core/taiga_config.sv b/core/taiga_config.sv index 2d15ae4..9078f61 100755 --- a/core/taiga_config.sv +++ b/core/taiga_config.sv @@ -48,13 +48,6 @@ package taiga_config; localparam USE_MUL = 1; localparam USE_DIV = 1; - //Division algorithm selection - typedef enum { - RADIX_2,//Smallest - QUICK_CLZ//Highest performance and best performance per LUT - } div_type; - localparam div_type DIV_ALGORITHM = QUICK_CLZ; - //Enable Atomic extension (cache operations only) localparam USE_AMO = 0; diff --git a/tools/taiga_compile_order b/tools/taiga_compile_order index 48df009..421353c 100644 --- a/tools/taiga_compile_order +++ b/tools/taiga_compile_order @@ -64,10 +64,8 @@ core/icache.sv -core/div_algorithms/div_radix2.sv core/clz.sv -core/div_algorithms/div_quick_clz.sv -core/div_algorithms/div_algorithm.sv +core/div_core.sv core/div_unit.sv core/lut_ram.sv