timing optimization

2025-04-23 21:39:10 -04:00 · 2024-08-24 01:56:14 -07:00 · 2024-08-24 01:56:14 -07:00 · bcf7d9f960
commit bcf7d9f960
parent ade6b2c985
7 changed files with 80 additions and 76 deletions
--- a/hw/rtl/fpu/VX_fcvt_unit.sv
+++ b/hw/rtl/fpu/VX_fcvt_unit.sv
@@ -1,17 +1,17 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// Modified port of cast module from fpnew Libray 
+// Modified port of cast module from fpnew Library
 // reference: https://github.com/pulp-platform/fpnew

 `include "VX_fpu_define.vh"
@@ -22,7 +22,8 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
    parameter LATENCY   = 1,
    parameter INT_WIDTH = 32,
    parameter MAN_BITS  = 23,
-    parameter EXP_BITS  = 8    
+    parameter EXP_BITS  = 8,
+    parameter OUT_REG   = 0
 ) (
    input wire clk,
    input wire reset,
@@ -35,10 +36,10 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
    input wire is_signed,

    input wire [31:0]  dataa,
-    output wire [31:0] result, 
+    output wire [31:0] result,

    output wire [`FP_FLAGS_BITS-1:0] fflags
-);   
+);
    // Constants
    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;

@@ -55,11 +56,11 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
    localparam FMT_SHIFT_COMPENSATION = S_MAN_WIDTH - 1 - MAN_BITS;
    localparam NUM_FP_STICKY  = 2 * S_MAN_WIDTH - MAN_BITS - 1;   // removed mantissa, 1. and R
    localparam NUM_INT_STICKY = 2 * S_MAN_WIDTH - INT_WIDTH;  // removed int and R
-    
+
    // Input processing
-    
-    fclass_t fclass;      
-    VX_fp_classifier #( 
+
+    fclass_t fclass;
+    VX_fp_classifier #(
        .EXP_BITS (EXP_BITS),
        .MAN_BITS (MAN_BITS)
    ) fp_classifier (
@@ -69,9 +70,9 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
    );

    wire [S_MAN_WIDTH-1:0] input_mant;
-    wire [S_EXP_WIDTH-1:0] input_exp;    
+    wire [S_EXP_WIDTH-1:0] input_exp;
    wire                   input_sign;
-    
+
    wire i2f_sign = dataa[INT_WIDTH-1];
    wire f2i_sign = dataa[INT_WIDTH-1] && is_signed;
    wire [S_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa) : dataa;
@@ -81,7 +82,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
    assign input_sign = is_itof ? f2i_sign : i2f_sign;

    // Pipeline stage0
-    
+
    wire                   is_itof_s0;
    wire                   is_signed_s0;
    wire [2:0]             rnd_mode_s0;
@@ -92,7 +93,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(

    VX_pipe_register #(
        .DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + S_EXP_WIDTH + S_MAN_WIDTH),
-        .DEPTH (LATENCY > 2)
+        .DEPTH (LATENCY > 1)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
@@ -100,7 +101,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
        .data_in  ({is_itof,    is_signed,    frm,         fclass,    input_sign,    input_exp,       input_mant}),
        .data_out ({is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
    );
-    
+
    // Normalization

    wire [LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; // renormalization shift amount
@@ -113,12 +114,12 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
        .data_out  (renorm_shamt_s0),
        .valid_out (mant_is_nonzero_s0)
    );
-    
+
    wire mant_is_zero_s0 = ~mant_is_nonzero_s0;

-    wire [S_MAN_WIDTH-1:0] input_mant_n_s0;    // normalized input mantissa    
+    wire [S_MAN_WIDTH-1:0] input_mant_n_s0;    // normalized input mantissa
    wire [S_EXP_WIDTH-1:0] input_exp_n_s0;     // unbiased true exponent
-    
+
    // Realign input mantissa, append zeroes if destination is wider
    assign input_mant_n_s0 = encoded_mant_s0 << renorm_shamt_s0;

@@ -140,7 +141,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(

    VX_pipe_register #(
        .DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + 1 + S_MAN_WIDTH + S_EXP_WIDTH),
-        .DEPTH (LATENCY > 1)
+        .DEPTH (LATENCY > 2)
    ) pipe_reg1 (
        .clk      (clk),
        .reset    (reset),
@@ -169,30 +170,30 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
    wire of_before_round_s1 = overflow;

    // Pipeline stage2
-    
+
    wire                   is_itof_s2;
    wire                   is_signed_s2;
    wire [2:0]             rnd_mode_s2;
-    fclass_t               fclass_s2;   
+    fclass_t               fclass_s2;
    wire                   mant_is_zero_s2;
    wire                   input_sign_s2;
    wire [2*S_MAN_WIDTH:0] destination_mant_s2;
    wire [EXP_BITS-1:0]    final_exp_s2;
    wire                   of_before_round_s2;
-    
+
    VX_pipe_register #(
        .DATAW (1 + 1 + `INST_FRM_BITS + $bits(fclass_t) + 1 + 1 + (2*S_MAN_WIDTH+1) + EXP_BITS + 1),
-        .DEPTH (LATENCY > 3)
+        .DEPTH (LATENCY > 0)
    ) pipe_reg2 (
        .clk      (clk),
        .reset    (reset),
        .enable   (enable),
        .data_in  ({is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
        .data_out ({is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
-    );   
-    
+    );
+
    // Rouding and classification
-   
+
    wire [MAN_BITS-1:0]  final_mant_s2;  // mantissa after adjustments
    wire [INT_WIDTH-1:0] final_int_s2;   // integer shifted in position
    wire [1:0]           f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
@@ -237,20 +238,20 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(

    wire                 is_itof_s3;
    wire                 is_signed_s3;
-    fclass_t             fclass_s3;   
+    fclass_t             fclass_s3;
    wire                 mant_is_zero_s3;
    wire                 input_sign_s3;
    wire                 rounded_sign_s3;
    wire [INT_WIDTH-1:0] rounded_abs_s3;
-    wire                 of_before_round_s3;   
+    wire                 of_before_round_s3;
    wire                 f2i_round_has_sticky_s3;
    wire                 i2f_round_has_sticky_s3;

-    `UNUSED_VAR (fclass_s3) 
+    `UNUSED_VAR (fclass_s3)

    VX_pipe_register #(
        .DATAW (1 + 1 + $bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1),
-        .DEPTH (LATENCY > 4)
+        .DEPTH (LATENCY > 3)
    ) pipe_reg3 (
        .clk      (clk),
        .reset    (reset),
@@ -258,7 +259,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
        .data_in  ({is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
        .data_out ({is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
    );
-     
+
    // Assemble regular result, nan box short ones. Int zeroes need to be detected
    wire [INT_WIDTH-1:0] fmt_result_s3 = mant_is_zero_s3 ? 0 : {rounded_sign_s3, rounded_abs_s3[EXP_BITS+MAN_BITS-1:0]};

@@ -278,18 +279,18 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
            f2i_special_result_s3[INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1;   // alone yields 2**(31)-1
            f2i_special_result_s3[INT_WIDTH-1]   = ~is_signed_s3;   // for unsigned casts yields 2**31
        end
-    end            
+    end

    // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
-    wire f2i_result_is_special_s3 = fclass_s3.is_nan 
+    wire f2i_result_is_special_s3 = fclass_s3.is_nan
                                  | fclass_s3.is_inf
                                  | of_before_round_s3
                                  | (input_sign_s3 & ~is_signed_s3 & ~rounded_int_res_zero_s3);
-                                    
+
    fflags_t f2i_special_status_s3;
    fflags_t i2f_status_s3, f2i_status_s3;
    fflags_t tmp_fflags_s3;
-    
+
    // All integer special cases are invalid
    assign f2i_special_status_s3 = {1'b1, 4'h0};

@@ -306,7 +307,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(

    VX_pipe_register #(
        .DATAW (32 + `FP_FLAGS_BITS),
-        .DEPTH (LATENCY > 0)
+        .DEPTH (OUT_REG)
    ) pipe_reg4 (
        .clk      (clk),
        .reset    (reset),
--- a/hw/rtl/fpu/VX_fncp_unit.sv
+++ b/hw/rtl/fpu/VX_fncp_unit.sv
@@ -1,17 +1,17 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// Modified port of noncomp module from fpnew Libray 
+// Modified port of noncomp module from fpnew Library
 // reference: https://github.com/pulp-platform/fpnew

 `include "VX_fpu_define.vh"
@@ -19,9 +19,10 @@
 `ifdef FPU_DSP

 module VX_fncp_unit import VX_fpu_pkg::*; #(
-    parameter LATENCY  = 2,
+    parameter LATENCY  = 1,
    parameter EXP_BITS = 8,
-    parameter MAN_BITS = 23
+    parameter MAN_BITS = 23,
+    parameter OUT_REG  = 0
 ) (
    input wire clk,
    input wire reset,
@@ -33,10 +34,10 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(

    input wire [31:0]  dataa,
    input wire [31:0]  datab,
-    output wire [31:0] result, 
+    output wire [31:0] result,

    output wire [`FP_FLAGS_BITS-1:0] fflags
-);       
+);
    localparam  NEG_INF     = 32'h00000001,
                NEG_NORM    = 32'h00000002,
                NEG_SUBNORM = 32'h00000004,
@@ -55,15 +56,15 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
    wire        a_smaller, ab_equal;

    // Setup
-    assign     a_sign = dataa[31]; 
+    assign     a_sign = dataa[31];
    assign a_exponent = dataa[30:23];
    assign a_mantissa = dataa[22:0];

-    assign     b_sign = datab[31]; 
+    assign     b_sign = datab[31];
    assign b_exponent = datab[30:23];
    assign b_mantissa = datab[22:0];

-    VX_fp_classifier #( 
+    VX_fp_classifier #(
        .EXP_BITS (EXP_BITS),
        .MAN_BITS (MAN_BITS)
    ) fp_class_a (
@@ -72,7 +73,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
        .clss_o (a_fclass)
    );

-    VX_fp_classifier #( 
+    VX_fp_classifier #(
        .EXP_BITS (EXP_BITS),
        .MAN_BITS (MAN_BITS)
    ) fp_class_b (
@@ -82,7 +83,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
    );

    assign a_smaller = (dataa < datab) ^ (a_sign || b_sign);
-    assign ab_equal  = (dataa == datab) 
+    assign ab_equal  = (dataa == datab)
                    || (a_fclass.is_zero && b_fclass.is_zero); // +0 == -0

    // Pipeline stage0
@@ -101,54 +102,54 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(

    VX_pipe_register #(
        .DATAW (4 + 2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1),
-        .DEPTH (LATENCY > 1)
+        .DEPTH (LATENCY > 0)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
        .enable   (enable),
        .data_in  ({op_mod,    dataa,    datab,    a_sign,    b_sign,    a_exponent,    a_mantissa,    a_fclass,    b_fclass,    a_smaller,    ab_equal}),
        .data_out ({op_mod_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_fclass_s0, b_fclass_s0, a_smaller_s0, ab_equal_s0})
-    ); 
+    );

    // FCLASS
    reg [31:0] fclass_mask_s0;  // generate a 10-bit mask for integer reg
-    always @(*) begin 
+    always @(*) begin
        if (a_fclass_s0.is_normal) begin
            fclass_mask_s0 = a_sign_s0 ? NEG_NORM : POS_NORM;
-        end 
+        end
        else if (a_fclass_s0.is_inf) begin
            fclass_mask_s0 = a_sign_s0 ? NEG_INF : POS_INF;
-        end 
+        end
        else if (a_fclass_s0.is_zero) begin
            fclass_mask_s0 = a_sign_s0 ? NEG_ZERO : POS_ZERO;
-        end 
+        end
        else if (a_fclass_s0.is_subnormal) begin
            fclass_mask_s0 = a_sign_s0 ? NEG_SUBNORM : POS_SUBNORM;
-        end 
+        end
        else if (a_fclass_s0.is_nan) begin
            fclass_mask_s0 = {22'h0, a_fclass_s0.is_quiet, a_fclass_s0.is_signaling, 8'h0};
-        end 
-        else begin                     
+        end
+        else begin
            fclass_mask_s0 = QUT_NAN;
        end
    end

-    // Min/Max    
+    // Min/Max
    reg [31:0] fminmax_res_s0;
    always @(*) begin
        if (a_fclass_s0.is_nan && b_fclass_s0.is_nan)
            fminmax_res_s0 = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
-        else if (a_fclass_s0.is_nan) 
+        else if (a_fclass_s0.is_nan)
            fminmax_res_s0 = datab_s0;
-        else if (b_fclass_s0.is_nan) 
+        else if (b_fclass_s0.is_nan)
            fminmax_res_s0 = dataa_s0;
-        else begin 
+        else begin
            // FMIN, FMAX
            fminmax_res_s0 = (op_mod_s0[0] ^ a_smaller_s0) ? dataa_s0 : datab_s0;
        end
    end

-    // Sign injection    
+    // Sign injection
    reg [31:0] fsgnj_res_s0;    // result of sign injection
    always @(*) begin
        case (op_mod_s0[1:0])
@@ -158,12 +159,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
        endcase
    end

-    // Comparison    
+    // Comparison
    reg fcmp_res_s0;        // result of comparison
    reg fcmp_fflags_NV_s0;  // comparison fflags
    always @(*) begin
        case (op_mod_s0[1:0])
-            0: begin // LE                    
+            0: begin // LE
                if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
                    fcmp_res_s0       = 0;
                    fcmp_fflags_NV_s0 = 1;
@@ -179,12 +180,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
                end else begin
                    fcmp_res_s0       = (a_smaller_s0 & ~ab_equal_s0);
                    fcmp_fflags_NV_s0 = 0;
-                end                    
+                end
            end
            2: begin // EQ
                if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
                    fcmp_res_s0       = 0;
-                    fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling; 
+                    fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling;
                end else begin
                    fcmp_res_s0       = ab_equal_s0;
                    fcmp_fflags_NV_s0 = 0;
@@ -192,7 +193,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
            end
            default: begin
                fcmp_res_s0       = 'x;
-                fcmp_fflags_NV_s0 = 'x;                        
+                fcmp_fflags_NV_s0 = 'x;
            end
        endcase
    end
@@ -216,7 +217,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
                // FMV
                result_s0 = dataa_s0;
                fflags_NV_s0 = 0;
-            end                
+            end
            6,7: begin
                // MIN/MAX
                result_s0 = fminmax_res_s0;
@@ -229,7 +230,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(

    VX_pipe_register #(
        .DATAW (32 + 1),
-        .DEPTH (LATENCY > 0)
+        .DEPTH (OUT_REG)
    ) pipe_reg1 (
        .clk      (clk),
        .reset    (reset),
--- a/hw/rtl/fpu/VX_fpu_cvt.sv
+++ b/hw/rtl/fpu/VX_fpu_cvt.sv
@@ -64,7 +64,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
        .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
        .TAG_WIDTH  (NUM_LANES + TAG_WIDTH),
        .PE_REG     (0),
-        .OUT_BUF    (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
+        .OUT_BUF    (2)
    ) pe_serializer (
        .clk        (clk),
        .reset      (reset),
@@ -88,7 +88,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(

    for (genvar i = 0; i < NUM_PES; ++i) begin
        VX_fcvt_unit #(
-            .LATENCY (`LATENCY_FCVT)
+            .LATENCY (`LATENCY_FCVT),
+            .OUT_REG (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
        ) fcvt_unit (
            .clk        (clk),
            .reset      (reset),
--- a/hw/rtl/fpu/VX_fpu_div.sv
+++ b/hw/rtl/fpu/VX_fpu_div.sv
@@ -68,7 +68,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
        .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
        .TAG_WIDTH  (NUM_LANES + TAG_WIDTH),
        .PE_REG     (0),
-        .OUT_BUF    (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
+        .OUT_BUF    (((NUM_LANES / NUM_PES) > 2) ? 2 : 0)
    ) pe_serializer (
        .clk        (clk),
        .reset      (reset),
--- a/hw/rtl/fpu/VX_fpu_fma.sv
+++ b/hw/rtl/fpu/VX_fpu_fma.sv
@@ -99,7 +99,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
        .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
        .TAG_WIDTH  (NUM_LANES + TAG_WIDTH),
        .PE_REG     ((NUM_LANES != NUM_PES) ? 1 : 0), // must be registered for DSPs
-        .OUT_BUF    (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
+        .OUT_BUF    (((NUM_LANES / NUM_PES) > 2) ? 2 : 0)
    ) pe_serializer (
        .clk        (clk),
        .reset      (reset),
--- a/hw/rtl/fpu/VX_fpu_ncp.sv
+++ b/hw/rtl/fpu/VX_fpu_ncp.sv
@@ -69,7 +69,7 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
        .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
        .TAG_WIDTH  (NUM_LANES + TAG_WIDTH),
        .PE_REG     (0),
-        .OUT_BUF    (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
+        .OUT_BUF    (2)
    ) pe_serializer (
        .clk        (clk),
        .reset      (reset),
@@ -93,7 +93,8 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(

    for (genvar i = 0; i < NUM_PES; ++i) begin
        VX_fncp_unit #(
-            .LATENCY (`LATENCY_FNCP)
+            .LATENCY (`LATENCY_FNCP),
+            .OUT_REG (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
        ) fncp_unit (
            .clk        (clk),
            .reset      (reset),
--- a/hw/rtl/fpu/VX_fpu_sqrt.sv
+++ b/hw/rtl/fpu/VX_fpu_sqrt.sv
@@ -62,7 +62,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
        .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
        .TAG_WIDTH  (NUM_LANES + TAG_WIDTH),
        .PE_REG     (0),
-        .OUT_BUF    (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
+        .OUT_BUF    (((NUM_LANES / NUM_PES) > 2) ? 2 : 0)
    ) pe_serializer (
        .clk        (clk),
        .reset      (reset),