RV64F DSP FPU hardware fixes

2025-04-23 21:39:10 -04:00 · 2023-06-18 03:20:48 -04:00 · 2023-06-18 03:20:48 -04:00 · 34290f7e95
commit 34290f7e95
parent bd5a52ff9c
12 changed files with 258 additions and 201 deletions
--- a/hw/dpi/float_dpi.cpp
+++ b/hw/dpi/float_dpi.cpp
@ -41,22 +41,21 @@ extern "C" {
 }

 inline uint64_t nan_box(uint32_t value) {
-  uint64_t mask = 0xffffffff00000000;
-  return value | mask;
+  return value | 0xffffffff00000000;
 }

-inline bool is_nan_boxed(uint64_t value) {
-#if (XLEN == 64)
+inline bool is_nan_boxed(uint64_t value) {  
  return (uint32_t(value >> 32) == 0xffffffff);
-#else
-  return true;
-#endif
 }

 inline int64_t check_boxing(int64_t a) {  
-  if (is_nan_boxed(a))
-    return a;
-  return nan_box(0x7fc00000); // NaN
+#if (FLEN == 64)
+  // Only compiled when FLEN is 64 (single and double precision both enabled): an operand whose upper 32 bits are not all ones is not NaN-boxed.
+  if (!is_nan_boxed(a)) {
+    return nan_box(0x7fc00000); // substitute the NaN-boxed single-precision quiet NaN
+  }
+#endif
+  return a; // properly boxed (or check compiled out): pass the operand through unchanged
 }

 void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -155,14 +155,8 @@
 `define FPU_DPI
 `endif

-`ifdef SYNTHESIS
-`ifndef FPU_DSP
-`ifndef FPU_FPNEW
-`define FPU_FPNEW
-`endif
-`endif
-`else
 `ifndef FPU_DPI
+`ifndef FPU_DSP
 `ifndef FPU_FPNEW
 `define FPU_FPNEW
 `endif
@ -329,6 +323,9 @@
 `ifdef VIVADO
 `define LATENCY_FMA 16    
 `endif
+`ifndef LATENCY_FMA
+`define LATENCY_FMA 4    
+`endif
 `endif
 `endif

@ -346,6 +343,9 @@
 `ifdef VIVADO
 `define LATENCY_FDIV 28    
 `endif
+`ifndef LATENCY_FDIV
+`define LATENCY_FDIV 16
+`endif
 `endif
 `endif

@ -363,6 +363,9 @@
 `ifdef VIVADO
 `define LATENCY_FSQRT 28    
 `endif
+`ifndef LATENCY_FSQRT
+`define LATENCY_FSQRT 16    
+`endif
 `endif
 `endif

--- a/hw/rtl/VX_define.vh
+++ b/hw/rtl/VX_define.vh
@ -196,9 +196,9 @@
 `define INST_FPU_MUL         4'b0010 
 `define INST_FPU_DIV         4'b0011
 `define INST_FPU_SQRT        4'b0100
-`define INST_FPU_CMP         4'b0101
+`define INST_FPU_CMP         4'b0101 // mod: LE=0, LT=1, EQ=2
 `define INST_FPU_F2F         4'b0110
-`define INST_FPU_MISC        4'b0111  // SGNJ, SGNJN, SGNJX, CLASS, MVXW, MVWX, FMIN, FMAX
+`define INST_FPU_MISC        4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
 `define INST_FPU_F2I         4'b1000
 `define INST_FPU_F2U         4'b1001
 `define INST_FPU_I2F         4'b1010
@ -208,6 +208,8 @@
 `define INST_FPU_NMSUB       4'b1110   
 `define INST_FPU_NMADD       4'b1111
 `define INST_FPU_IS_W(mod)   (mod[4])
+`define INST_FPU_IS_CLASS(op, mod) (((op) == `INST_FPU_MISC) && ((mod) == 3)) // MISC op with mod=3 (CLASS)
+`define INST_FPU_IS_MVXW(op, mod) (((op) == `INST_FPU_MISC) && ((mod) == 4)) // MISC op with mod=4 (MVXW)

 `define INST_FPU_BITS        4

--- a/hw/rtl/fpu/VX_fpu_cvt.sv
+++ b/hw/rtl/fpu/VX_fpu_cvt.sv
@ -35,7 +35,10 @@ module VX_fpu_cvt #(
 
    localparam MAN_BITS = 23;
    localparam EXP_BITS = 8;
-    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
+    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;    
+
+    localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
+    localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
    
    // Use 32-bit integer
    localparam MAX_INT_WIDTH = 32;
@ -122,15 +125,15 @@ module VX_fpu_cvt #(
    wire [NUM_LANES-1:0] mant_is_zero_s0;                       // for integer zeroes

    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        wire mant_is_nonzero;
+        wire mant_is_nonzero_s0;
        VX_lzc #(
            .N (INT_MAN_WIDTH)
        ) lzc (
            .data_in   (encoded_mant_s0[i]),
            .data_out  (renorm_shamt_s0[i]),
-            .valid_out (mant_is_nonzero)
+            .valid_out (mant_is_nonzero_s0)
        );
-        assign mant_is_zero_s0[i] = ~mant_is_nonzero;  
+        assign mant_is_zero_s0[i] = ~mant_is_nonzero_s0;  
    end

    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s0;      // normalized input mantissa    
@ -142,10 +145,10 @@ module VX_fpu_cvt #(
        assign input_mant_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];

        // Unbias exponent and compensate for shift
-        wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - {1'b0, renorm_shamt_s0[i]};                                 
-        wire [INT_EXP_WIDTH-1:0] int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
+        wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - {1'b0, renorm_shamt_s0[i]};                                 
+        wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};

-        assign input_exp_s0[i] = is_itof_s0 ? int_input_exp : fp_input_exp;
+        assign input_exp_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
    `IGNORE_WARNINGS_END
    end

@ -180,54 +183,54 @@ module VX_fpu_cvt #(
    wire [NUM_LANES-1:0]                    of_before_round_s1;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        reg [2*INT_MAN_WIDTH:0] preshift_mant;      // mantissa before final shift                
-        reg [SHAMT_BITS-1:0]    denorm_shamt;       // shift amount for denormalization
-        reg [INT_EXP_WIDTH-1:0] final_exp;          // after eventual adjustments
-        reg                     of_before_round;
+        reg [2*INT_MAN_WIDTH:0] preshift_mant_s1;      // mantissa before final shift                
+        reg [SHAMT_BITS-1:0]    denorm_shamt_s1;       // shift amount for denormalization
+        reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1;          // after eventual adjustments
+        reg                     of_before_round_tmp_s1;

        always @(*) begin           
        `IGNORE_WARNINGS_BEGIN     
            // Default assignment
-            final_exp       = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
-            preshift_mant   = {input_mant_s1[i], 33'b0};  // Place mantissa to the left of the shifter
-            denorm_shamt    = '0;      // right of mantissa
-            of_before_round = 1'b0;
+            final_exp_tmp_s1       = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
+            preshift_mant_s1       = {input_mant_s1[i], 33'b0};  // Place mantissa to the left of the shifter
+            denorm_shamt_s1        = '0;      // right of mantissa
+            of_before_round_tmp_s1 = 1'b0;

            // Handle INT casts
            if (is_itof_s1) begin                   
                if ($signed(input_exp_s1[i]) >= $signed(2**EXP_BITS-1-EXP_BIAS)) begin
                    // Overflow or infinities (for proper rounding)
-                    final_exp     = (2**EXP_BITS-2); // largest normal value
-                    preshift_mant = ~0;  // largest normal value and RS bits set
-                    of_before_round = 1'b1;
+                    final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
+                    preshift_mant_s1 = ~0;  // largest normal value and RS bits set
+                    of_before_round_tmp_s1 = 1'b1;
                end else if ($signed(input_exp_s1[i]) < $signed(-MAN_BITS-EXP_BIAS)) begin
                    // Limit the shift to retain sticky bits
-                    final_exp     = '0; // denormal result
-                    denorm_shamt  = (2 + MAN_BITS); // to sticky                
+                    final_exp_tmp_s1 = '0; // denormal result
+                    denorm_shamt_s1  = (2 + MAN_BITS); // to sticky                
                end else if ($signed(input_exp_s1[i]) < $signed(1-EXP_BIAS)) begin
                    // Denormalize underflowing values
-                    final_exp     = '0; // denormal result
-                    denorm_shamt  = SHAMT_BITS'(1-EXP_BIAS - input_exp_s1[i]); // adjust right shifting               
+                    final_exp_tmp_s1 = '0; // denormal result
+                    denorm_shamt_s1  = SHAMT_BITS'(1-EXP_BIAS - input_exp_s1[i]); // adjust right shifting               
                end
            end else begin                                
                if ($signed(input_exp_s1[i]) >= $signed((MAX_INT_WIDTH-1) + unsigned_s1)) begin
                    // overflow: when converting to unsigned the range is larger by one
-                    denorm_shamt = SHAMT_BITS'(0); // prevent shifting
-                    of_before_round = 1'b1;                
+                    denorm_shamt_s1 = SHAMT_BITS'(0); // prevent shifting
+                    of_before_round_tmp_s1 = 1'b1;                
                end else if ($signed(input_exp_s1[i]) < $signed(-1)) begin
                    // underflow
-                    denorm_shamt = MAX_INT_WIDTH+1; // all bits go to the sticky
+                    denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
                end else begin
                    // By default right shift mantissa to be an integer
-                    denorm_shamt = SHAMT_BITS'((MAX_INT_WIDTH-1) - input_exp_s1[i]);
+                    denorm_shamt_s1 = SHAMT_BITS'((MAX_INT_WIDTH-1) - input_exp_s1[i]);
                end              
            end     
        `IGNORE_WARNINGS_END  
        end

-        assign destination_mant_s1[i] = preshift_mant >> denorm_shamt;
-        assign final_exp_s1[i]        = final_exp;
-        assign of_before_round_s1[i]  = of_before_round;
+        assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
+        assign final_exp_s1[i]        = final_exp_tmp_s1;
+        assign of_before_round_s1[i]  = of_before_round_tmp_s1;
    end

    // Pipeline stage2
@ -242,7 +245,7 @@ module VX_fpu_cvt #(
    wire [NUM_LANES-1:0]    input_sign_s2;
    wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
-    wire [NUM_LANES-1:0]        of_before_round_s2;
+    wire [NUM_LANES-1:0]    of_before_round_s2;
    
    VX_pipe_register #(
        .DATAW  (1 + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
@ -255,47 +258,51 @@ module VX_fpu_cvt #(
        .data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
    );

-    wire [NUM_LANES-1:0]       rounded_sign;
-    wire [NUM_LANES-1:0][31:0] rounded_abs;     // absolute value of result after rounding
-    wire [NUM_LANES-1:0][1:0]  fp_round_sticky_bits, int_round_sticky_bits;
+    wire [NUM_LANES-1:0]       rounded_sign_s2;
+    wire [NUM_LANES-1:0][31:0] rounded_abs_s2;      // absolute value of result after rounding
+    wire [NUM_LANES-1:0]       int_round_has_sticky_s2;
+    wire [NUM_LANES-1:0]       fp_round_has_sticky_s2;
    
    // Rouding and classification
   
    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        wire [MAN_BITS-1:0]      final_mant;        // mantissa after adjustments
-        wire [MAX_INT_WIDTH-1:0] final_int;         // integer shifted in position
-        wire [1:0]               round_sticky_bits;
-        wire [31:0]              fmt_pre_round_abs;
-        wire [31:0]              pre_round_abs;
+        wire [MAN_BITS-1:0]      final_mant_s2;        // mantissa after adjustments
+        wire [MAX_INT_WIDTH-1:0] final_int_s2;         // integer shifted in position
+        wire [1:0]               round_sticky_bits_s2;
+        wire [31:0]              fmt_pre_round_abs_s2;
+        wire [31:0]              pre_round_abs_s2;
+        wire [1:0]               int_round_sticky_bits_s2, fp_round_sticky_bits_s2;

        // Extract final mantissa and round bit, discard the normal bit (for FP)
-        assign {final_mant, fp_round_sticky_bits[i][1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
-        assign {final_int, int_round_sticky_bits[i][1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (MAX_INT_WIDTH+1) + 1];
+        assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
+        assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (MAX_INT_WIDTH+1) + 1];

        // Collapse sticky bits
-        assign fp_round_sticky_bits[i][0]  = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
-        assign int_round_sticky_bits[i][0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
+        assign fp_round_sticky_bits_s2[0]  = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
+        assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
+        assign fp_round_has_sticky_s2[i]  = (| fp_round_sticky_bits_s2);
+        assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2);

        // select RS bits for destination operation
-        assign round_sticky_bits = is_itof_s2 ? fp_round_sticky_bits[i] : int_round_sticky_bits[i];
+        assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;

        // Pack exponent and mantissa into proper rounding form
-        assign fmt_pre_round_abs = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant[MAN_BITS-1:0]};
+        assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};

        // Select output with destination format and operation
-        assign pre_round_abs = is_itof_s2 ? fmt_pre_round_abs : final_int;
+        assign pre_round_abs_s2 = is_itof_s2 ? fmt_pre_round_abs_s2 : final_int_s2;

        // Perform the rounding
        VX_fpu_rounding #(
            .DAT_WIDTH (32)
        ) fp_rounding (
-            .abs_value_i    (pre_round_abs),
+            .abs_value_i    (pre_round_abs_s2),
            .sign_i         (input_sign_s2[i]),
-            .round_sticky_bits_i(round_sticky_bits),
+            .round_sticky_bits_i(round_sticky_bits_s2),
            .rnd_mode_i     (rnd_mode_s2),
            .effective_subtraction_i(1'b0),
-            .abs_rounded_o  (rounded_abs[i]),
-            .sign_o         (rounded_sign[i]),
+            .abs_rounded_o  (rounded_abs_s2[i]),
+            .sign_o         (rounded_sign_s2[i]),
            `UNUSED_PIN (exact_zero_o)
        );
    end
@ -311,117 +318,116 @@ module VX_fpu_cvt #(
    wire [NUM_LANES-1:0]    input_sign_s3;
    wire [NUM_LANES-1:0]    rounded_sign_s3;
    wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
-    wire [NUM_LANES-1:0]    of_before_round_s3;
+    wire [NUM_LANES-1:0]    of_before_round_s3;   
+    wire [NUM_LANES-1:0]    int_round_has_sticky_s3;
+    wire [NUM_LANES-1:0]    fp_round_has_sticky_s3; 

    VX_pipe_register #(
-        .DATAW  (1 + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1)),
+        .DATAW  (1 + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
        .RESETW (1)
    ) pipe_reg3 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs,    rounded_sign,    of_before_round_s2}),
-        .data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3})
+        .data_in  ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
+        .data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
    );
     
-    wire [NUM_LANES-1:0] of_after_round;
-    wire [NUM_LANES-1:0] uf_after_round;
-    wire [NUM_LANES-1:0][31:0] fmt_result;
-    wire [NUM_LANES-1:0][31:0] rounded_int_res; // after possible inversion
-    wire [NUM_LANES-1:0] rounded_int_res_zero;  // after rounding
+    wire [NUM_LANES-1:0] of_after_round_s3;
+    wire [NUM_LANES-1:0] uf_after_round_s3;
+    wire [NUM_LANES-1:0][31:0] fmt_result_s3;
+    wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
+    wire [NUM_LANES-1:0] rounded_int_res_zero_s3;  // after rounding

    for (genvar i = 0; i < NUM_LANES; ++i) begin
        // Assemble regular result, nan box short ones. Int zeroes need to be detected
-        assign fmt_result[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
+        assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};

        // Classification after rounding select by destination format
-        assign uf_after_round[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
-        assign of_after_round[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
+        assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
+        assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.

        // Negative integer result needs to be brought into two's complement
-        assign rounded_int_res[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
-        assign rounded_int_res_zero[i] = (rounded_int_res[i] == 0);
+        assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
+        assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
    end

    // FP Special case handling

-    wire [NUM_LANES-1:0][31:0]  fp_special_result;
-    fflags_t [NUM_LANES-1:0]    fp_special_status;
-    wire [NUM_LANES-1:0]        fp_result_is_special;
-
-    localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
-    localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
+    wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
+    fflags_t [NUM_LANES-1:0]   fp_special_status_s3;
+    wire [NUM_LANES-1:0]       fp_result_is_special_s3;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
        // Detect special case from source format, I2F casts don't produce a special result
-        assign fp_result_is_special[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
+        assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);

        // Signalling input NaNs raise invalid flag, otherwise no flags set
-        assign fp_special_status[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0;   // invalid operation
+        assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0;   // invalid operation

        // Assemble result according to destination format
-        assign fp_special_result[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
-                                                            : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
+        assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3[i]) << 31) // signed zero, using this lane's own sign bit
+                                                              : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
    end

    // INT Special case handling

-    reg [NUM_LANES-1:0][31:0]   int_special_result;
-    fflags_t [NUM_LANES-1:0]    int_special_status;
-    wire [NUM_LANES-1:0]        int_result_is_special;
+    reg [NUM_LANES-1:0][31:0] int_special_result_s3;
+    fflags_t [NUM_LANES-1:0]  int_special_status_s3;
+    wire [NUM_LANES-1:0]      int_result_is_special_s3;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
         // Assemble result according to destination format
        always @(*) begin
            if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
-                int_special_result[i][30:0] = '0;              // alone yields 2**(31)-1
-                int_special_result[i][31]   = ~unsigned_s3;    // for unsigned casts yields 2**31
+                int_special_result_s3[i][30:0] = '0;              // negative non-NaN input: lower 31 bits cleared
+                int_special_result_s3[i][31]   = ~unsigned_s3;    // MSB set for signed casts (-2**31), clear for unsigned casts (0)
            end else begin
-                int_special_result[i][30:0] = 2**(31) - 1;     // alone yields 2**(31)-1
-                int_special_result[i][31]   = unsigned_s3;     // for unsigned casts yields 2**31
+                int_special_result_s3[i][30:0] = 2**(31) - 1;     // otherwise: lower 31 bits all ones, alone yields 2**(31)-1
+                int_special_result_s3[i][31]   = unsigned_s3;     // MSB set for unsigned casts (2**32-1), clear for signed casts
            end
        end            

        // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
-        assign int_result_is_special[i] = fclass_s3[i].is_nan 
-                                        | fclass_s3[i].is_inf 
-                                        | of_before_round_s3[i] 
-                                        | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero[i]);
+        assign int_result_is_special_s3[i] = fclass_s3[i].is_nan 
+                                           | fclass_s3[i].is_inf
+                                           | of_before_round_s3[i]
+                                           | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
                                        
        // All integer special cases are invalid
-        assign int_special_status[i] = {1'b1, 4'h0};
+        assign int_special_status_s3[i] = {1'b1, 4'h0};
    end

    // Result selection and Output handshake

-    fflags_t [NUM_LANES-1:0] tmp_fflags;    
-    wire [NUM_LANES-1:0][31:0] tmp_result;
+    fflags_t [NUM_LANES-1:0] tmp_fflags_s3;    
+    wire [NUM_LANES-1:0][31:0] tmp_result_s3;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        fflags_t    fp_regular_status, int_regular_status;
-        fflags_t    fp_status, int_status;    
-        wire [31:0] fp_result, int_result;
+        fflags_t    fp_regular_status_s3, int_regular_status_s3;
+        fflags_t    fp_status_s3, int_status_s3;    
+        wire [31:0] fp_result_s3, int_result_s3;

-        wire inexact = is_itof_s3 ? (| fp_round_sticky_bits[i]) // overflow is invalid in i2f;        
-                                  : (| fp_round_sticky_bits[i]) | (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i]));
+        wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;        
+                                     : (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
                                  
-        assign fp_regular_status.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round[i]); // overflow is invalid for I2F casts
-        assign fp_regular_status.DZ = 1'b0; // no divisions
-        assign fp_regular_status.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i])); // inf casts no OF
-        assign fp_regular_status.UF = uf_after_round[i] & inexact;
-        assign fp_regular_status.NX = inexact;
+        assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
+        assign fp_regular_status_s3.DZ = 1'b0; // no divisions
+        assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
+        assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
+        assign fp_regular_status_s3.NX = inexact_s3;

-        assign int_regular_status = (| int_round_sticky_bits[i]) ? {4'h0, 1'b1} : 5'h0;
+        assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;

-        assign fp_result  = fp_result_is_special[i]  ? fp_special_result[i]  : fmt_result[i];        
-        assign int_result = int_result_is_special[i] ? int_special_result[i] : rounded_int_res[i];
+        assign fp_result_s3  = fp_result_is_special_s3[i]  ? fp_special_result_s3[i]  : fmt_result_s3[i];        
+        assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];

-        assign fp_status  = fp_result_is_special[i]  ? fp_special_status[i]  : fp_regular_status;
-        assign int_status = int_result_is_special[i] ? int_special_status[i] : int_regular_status;
+        assign fp_status_s3  = fp_result_is_special_s3[i]  ? fp_special_status_s3[i]  : fp_regular_status_s3;
+        assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;

        // Select output depending on special case detection
-        assign tmp_result[i] = is_itof_s3 ? fp_result : int_result;
-        assign tmp_fflags[i] = is_itof_s3 ? fp_status : int_status;
+        assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
+        assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
    end

    assign stall = ~ready_out && valid_out;
@ -433,8 +439,8 @@ module VX_fpu_cvt #(
        .clk      (clk),
        .reset    (reset),
        .enable   (!stall),
-        .data_in  ({valid_in_s3, tag_in_s3, tmp_result, tmp_fflags}),
-        .data_out ({valid_out,   tag_out,   result,     fflags})
+        .data_in  ({valid_in_s3, tag_in_s3, tmp_result_s3, tmp_fflags_s3}),
+        .data_out ({valid_out,   tag_out,   result,        fflags})
    );

    assign ready_in = ~stall;
--- a/hw/rtl/fpu/VX_fpu_define.vh
+++ b/hw/rtl/fpu/VX_fpu_define.vh
@ -13,4 +13,10 @@
 import VX_fpu_types::*;
 `IGNORE_WARNINGS_END

+`ifdef XLEN_64
+`ifdef FLEN_32
+    `define FPU_RV64_F
+`endif
+`endif
+
 `endif // VX_FPU_DEFINE_VH
--- a/hw/rtl/fpu/VX_fpu_div.sv
+++ b/hw/rtl/fpu/VX_fpu_div.sv
@ -59,7 +59,7 @@ module VX_fpu_div #(
    end    
    
    assign has_fflags = 0;
-    assign fflags = '0;
+    assign fflags = 'x; // don't-care: has_fflags is 0 for this implementation

 `elsif VIVADO

@ -86,29 +86,28 @@ module VX_fpu_div #(
 `else    

    for (genvar i = 0; i < NUM_LANES; ++i) begin       
-        reg [`XLEN-1:0] r;
+        reg [63:0] r;
+        `UNUSED_VAR (r)
        
        fflags_t f;
-        `UNUSED_VAR (f)

        always @(*) begin        
-            dpi_fdiv (enable && valid_in, dataa[i], datab[i], frm, r, f);
+            dpi_fdiv (enable && valid_in, int'(0), 64'(dataa[i]), 64'(datab[i]), frm, r, f);
        end

        VX_shift_register #(
-            .DATAW  (`XLEN),
+            .DATAW  (32 + $bits(fflags_t)),
            .DEPTH  (`LATENCY_FDIV)
        ) shift_req_dpi (
            .clk      (clk),
            `UNUSED_PIN (reset),
            .enable   (enable),
-            .data_in  (r),
-            .data_out (result[i])
+            .data_in  ({r[31:0],   f}),
+            .data_out ({result[i], fflags[i]})
        );
    end

-    assign has_fflags = 0;
-    assign fflags = '0;
+    assign has_fflags = 1;

 `endif

--- a/hw/rtl/fpu/VX_fpu_dpi.sv
+++ b/hw/rtl/fpu/VX_fpu_dpi.sv
@ -2,12 +2,6 @@

 `ifdef FPU_DPI

-`ifdef XLEN_64
-`ifdef FLEN_32
-    `define ISA_RV64F
-`endif
-`endif
-
 module VX_fpu_dpi #( 
    parameter NUM_LANES = 1,
    parameter TAGW      = 1
@ -69,14 +63,6 @@ module VX_fpu_dpi #(
            operands[0][i] = 64'(dataa[i]);
            operands[1][i] = 64'(datab[i]);
            operands[2][i] = 64'(datac[i]);
-        `ifdef ISA_RV64F
-            // apply nan-boxing to floating-point operands
-            if (op_type != `INST_FPU_I2F && op_type != `INST_FPU_U2F) begin
-                operands[0][i] |= 64'hffffffff00000000;
-            end
-            operands[1][i] |= 64'hffffffff00000000;
-            operands[2][i] |= 64'hffffffff00000000;
-        `endif
        end
    end

@ -118,7 +104,7 @@ module VX_fpu_dpi #(
            `INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
            `INST_FPU_DIV:   begin core_select = FPU_DIV; end
            `INST_FPU_SQRT:  begin core_select = FPU_SQRT; end
-            `INST_FPU_CMP:   begin core_select = FPU_NCP; is_fcmp = 1; end            
+            `INST_FPU_CMP:   begin core_select = FPU_NCP; is_fcmp = 1; end
            `INST_FPU_F2I:   begin core_select = FPU_CVT; is_ftoi = 1; end
            `INST_FPU_F2U:   begin core_select = FPU_CVT; is_ftou = 1; end
            `INST_FPU_I2F:   begin core_select = FPU_CVT; is_itof = 1; end
--- a/hw/rtl/fpu/VX_fpu_dsp.sv
+++ b/hw/rtl/fpu/VX_fpu_dsp.sv
@ -39,7 +39,9 @@ module VX_fpu_dsp #(
    localparam NUM_FPC  = 5;
    localparam FPC_BITS = `LOG2UP(NUM_FPC);

-    localparam RSP_ARB_DATAW = (NUM_LANES * `XLEN) + 1 + (NUM_LANES * $bits(fflags_t)) + TAGW;
+    localparam RSP_ARB_DATAW = (NUM_LANES * 32) + 1 + (NUM_LANES * $bits(fflags_t)) + TAGW;
+
+    `UNUSED_VAR (fmt)
    
    wire [NUM_FPC-1:0] per_core_ready_in;
    wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
@ -53,8 +55,6 @@ module VX_fpu_dsp #(
    reg [FPC_BITS-1:0] core_select;
    reg do_madd, do_sub, do_neg, is_itof, is_signed;

-    wire [`INST_FRM_BITS-1:0] frm = `INST_FRM_BITS'(frm);
-
    always @(*) begin
        do_madd   = 0;
        do_sub    = 0;        
@ -85,6 +85,20 @@ module VX_fpu_dsp #(
    `RESET_RELAY (cvt_reset, reset);
    `RESET_RELAY (ncp_reset, reset);

+    wire [NUM_LANES-1:0][31:0] dataa_s;
+    wire [NUM_LANES-1:0][31:0] datab_s;
+    wire [NUM_LANES-1:0][31:0] datac_s;
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
+        assign dataa_s[i] = dataa[i][31:0];
+        assign datab_s[i] = datab[i][31:0];
+        assign datac_s[i] = datac[i][31:0];
+    end
+
+    `UNUSED_VAR (dataa)
+    `UNUSED_VAR (datab)
+    `UNUSED_VAR (datac)
+
    VX_fpu_fma #(
        .NUM_LANES (NUM_LANES),
        .TAGW      (TAGW)
@ -98,9 +112,9 @@ module VX_fpu_dsp #(
        .do_madd    (do_madd),
        .do_sub     (do_sub),
        .do_neg     (do_neg),
-        .dataa      (dataa), 
-        .datab      (datab),    
-        .datac      (datac),   
+        .dataa      (dataa_s), 
+        .datab      (datab_s),    
+        .datac      (datac_s),   
        .has_fflags (per_core_has_fflags[FPU_FMA]),
        .fflags     (per_core_fflags[FPU_FMA]),
        .result     (per_core_result[FPU_FMA]),
@ -119,8 +133,8 @@ module VX_fpu_dsp #(
        .ready_in   (per_core_ready_in[FPU_DIV]),    
        .tag_in     (tag_in),
        .frm        (frm),  
-        .dataa      (dataa), 
-        .datab      (datab),   
+        .dataa      (dataa_s), 
+        .datab      (datab_s),   
        .has_fflags (per_core_has_fflags[FPU_DIV]),
        .fflags     (per_core_fflags[FPU_DIV]),   
        .result     (per_core_result[FPU_DIV]),
@ -139,7 +153,7 @@ module VX_fpu_dsp #(
        .ready_in   (per_core_ready_in[FPU_SQRT]),    
        .tag_in     (tag_in),
        .frm        (frm),    
-        .dataa      (dataa), 
+        .dataa      (dataa_s), 
        .has_fflags (per_core_has_fflags[FPU_SQRT]),
        .fflags     (per_core_fflags[FPU_SQRT]),
        .result     (per_core_result[FPU_SQRT]),
@ -148,59 +162,78 @@ module VX_fpu_dsp #(
        .ready_out  (per_core_ready_out[FPU_SQRT])
    );

+    wire cvt_rt_int_in = ~is_itof; // 1 when the request is not an int-to-float conversion
+    wire cvt_rt_int_out; // same flag as it comes back on the tag_out port
+
    VX_fpu_cvt #(
        .NUM_LANES (NUM_LANES),
-        .TAGW      (TAGW)
+        .TAGW      (TAGW+1) // one extra tag bit for cvt_rt_int_in
    ) fp_cvt (
        .clk        (clk), 
        .reset      (cvt_reset),   
        .valid_in   (valid_in && (core_select == FPU_CVT)),
        .ready_in   (per_core_ready_in[FPU_CVT]),    
-        .tag_in     (tag_in), 
+        .tag_in     ({cvt_rt_int_in, tag_in}), // flag rides in the tag MSB; presumably returned unchanged - confirm in VX_fpu_cvt
        .frm        (frm),
        .is_itof    (is_itof),   
        .is_signed  (is_signed),        
-        .dataa      (dataa),  
+        .dataa      (dataa_s),  // 32-bit operand view
        .has_fflags (per_core_has_fflags[FPU_CVT]),
        .fflags     (per_core_fflags[FPU_CVT]),
        .result     (per_core_result[FPU_CVT]),
-        .tag_out    (per_core_tag_out[FPU_CVT]),
+        .tag_out    ({cvt_rt_int_out, per_core_tag_out[FPU_CVT]}), // split the flag bit back off the tag
        .valid_out  (per_core_valid_out[FPU_CVT]),
        .ready_out  (per_core_ready_out[FPU_CVT])
    );

+    wire ncp_rt_int_in = (op_type == `INST_FPU_CMP) // flag set by the CMP, IS_CLASS and IS_MVXW checks
+                      || `INST_FPU_IS_CLASS(op_type, frm) 
+                      || `INST_FPU_IS_MVXW(op_type, frm);
+    wire ncp_rt_int_out; // flag as it comes back on the tag_out port
+
+    wire ncp_rt_sext_in = `INST_FPU_IS_MVXW(op_type, frm); // second flag, set only by the IS_MVXW check
+    wire ncp_rt_sext_out;
+    
    VX_fpu_ncomp #(
        .NUM_LANES (NUM_LANES),
-        .TAGW      (TAGW)
+        .TAGW      (TAGW+2) // two extra tag bits for the sext/int flags
    ) fp_ncomp (
        .clk        (clk),
        .reset      (ncp_reset),   
        .valid_in   (valid_in && (core_select == FPU_NCP)),
        .ready_in   (per_core_ready_in[FPU_NCP]),        
-        .tag_in     (tag_in),
+        .tag_in     ({ncp_rt_sext_in, ncp_rt_int_in, tag_in}), // flags ride in the two tag MSBs
        .op_type    (op_type),
        .frm        (frm),
-        .dataa      (dataa),
-        .datab      (datab),        
+        .dataa      (dataa_s),
+        .datab      (datab_s),        
        .result     (per_core_result[FPU_NCP]), 
        .has_fflags (per_core_has_fflags[FPU_NCP]),
        .fflags     (per_core_fflags[FPU_NCP]),
-        .tag_out    (per_core_tag_out[FPU_NCP]),
+        .tag_out    ({ncp_rt_sext_out, ncp_rt_int_out, per_core_tag_out[FPU_NCP]}), // same bit order as tag_in
        .valid_out  (per_core_valid_out[FPU_NCP]),
        .ready_out  (per_core_ready_out[FPU_NCP])
    );

    ///////////////////////////////////////////////////////////////////////////

-    wire [NUM_FPC-1:0][RSP_ARB_DATAW-1:0] per_core_data_out;
-
-    for (genvar i = 0; i < NUM_FPC; ++i) begin
-        assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
+    reg [NUM_FPC-1:0][RSP_ARB_DATAW+2-1:0] per_core_data_out; // response word per unit, widened by two low flag bits
+    
+    always @(*) begin
+        for (integer i = 0; i < NUM_FPC; ++i) begin
+            per_core_data_out[i][RSP_ARB_DATAW+1:2] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
+            per_core_data_out[i][1:0] = '0; // default: both flag bits clear
+        end        
+        per_core_data_out[FPU_CVT][1:0] = {1'b1, cvt_rt_int_out}; // later assignment overrides the default for the CVT unit
+        per_core_data_out[FPU_NCP][1:0] = {ncp_rt_sext_out, ncp_rt_int_out}; // and for the NCP unit
    end

+    wire [NUM_LANES-1:0][31:0] result_s;
+    wire [1:0] op_rt_int_out;
+
    VX_stream_arb #(
        .NUM_INPUTS (NUM_FPC),
-        .DATAW      (RSP_ARB_DATAW),        
+        .DATAW      (RSP_ARB_DATAW + 2),        
        .ARBITER    ("R"),
        .BUFFERED   (2)
    ) rsp_arb (
@ -209,11 +242,31 @@ module VX_fpu_dsp #(
        .valid_in  (per_core_valid_out),        
        .ready_in  (per_core_ready_out),
        .data_in   (per_core_data_out),
-        .data_out  ({result, has_fflags, fflags, tag_out}),
+        .data_out  ({result_s, has_fflags, fflags, tag_out, op_rt_int_out}),
        .valid_out (valid_out),
        .ready_out (ready_out)
    );

+`ifndef FPU_RV64_F
+    `UNUSED_VAR (op_rt_int_out) // the flag bits are only consumed in the FPU_RV64_F branch below
+`endif
+
+    for (genvar i = 0; i < NUM_LANES; ++i) begin        
+    `ifdef FPU_RV64_F
+        reg [`XLEN-1:0] result_r;
+        always @(*) begin
+            case (op_rt_int_out) // two flag bits delivered alongside the arbitrated response
+            2'b11:   result_r = `XLEN'($signed(result_s[i])); // sign-extend the 32-bit result to XLEN
+            2'b01:   result_r = {32'h00000000, result_s[i]}; // upper 32 bits zero
+            default: result_r = {32'hffffffff, result_s[i]}; // upper 32 bits all ones (same pattern as nan_box() in float_dpi.cpp)
+            endcase
+        end
+        assign result[i] = result_r;
+    `else
+        assign result[i] = result_s[i]; // no widening: pass the 32-bit result through
+    `endif
+    end
+
    // can accept new request?
    assign ready_in = per_core_ready_in[core_select];

--- a/hw/rtl/fpu/VX_fpu_fma.sv
+++ b/hw/rtl/fpu/VX_fpu_fma.sv
@ -88,6 +88,9 @@ module VX_fpu_fma #(
            .q      (result[i])
        );
    end
+    
+    assign has_fflags = 0; // this branch reports no exception flags
+    assign fflags = 'x; // don't-care because has_fflags is 0; the missing ';' here was a syntax error

 `elsif VIVADO

@ -111,31 +114,33 @@ module VX_fpu_fma #(
        assign fflags[i] = {tuser[2], 1'b0, tuser[1], tuser[0], 1'b0};
    end

+    assign has_fflags = 1;
+
 `else

    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        reg [`XLEN-1:0] r;
+        reg [63:0] r; // 64-bit DPI result; only r[31:0] is forwarded below
+        `UNUSED_VAR (r)
+
        fflags_t f;
-        `UNUSED_VAR (f)

        always @(*) begin        
-            dpi_fmadd (enable && valid_in, a[i], b[i], c[i], frm, r, f);
+            dpi_fmadd (enable && valid_in, int'(0), 64'(a[i]), 64'(b[i]), 64'(c[i]), frm, r, f); // NOTE(review): int'(0) is presumably dst_fmt, as in dpi_fadd - confirm
        end        

        VX_shift_register #(
-            .DATAW  (`XLEN),
+            .DATAW  (32 + $bits(fflags_t)), // 32-bit result plus the flags
            .DEPTH  (`LATENCY_FMA)
        ) shift_req_dpi (
            .clk      (clk),
            `UNUSED_PIN (reset),
            .enable   (enable),
-            .data_in  (r),
-            .data_out (result[i])
+            .data_in  ({r[31:0],   f}), // result and flags go through the same shift register
+            .data_out ({result[i], fflags[i]})
        );
    end

-    assign has_fflags = 1'b0;
-    assign fflags = '0;
+    assign has_fflags = 1; // fflags is now driven from the DPI call instead of tied to zero

 `endif

--- a/hw/rtl/fpu/VX_fpu_fpnew.sv
+++ b/hw/rtl/fpu/VX_fpu_fpnew.sv
@ -6,12 +6,6 @@
 `include "fpnew_pkg.sv"
 `include "defs_div_sqrt_mvp.sv"

-`ifdef XLEN_64
-`ifdef FLEN_32
-    `define ISA_RV64F
-`endif
-`endif
-
 module VX_fpu_fpnew #(      
    parameter NUM_LANES = 1,
    parameter TAGW      = 1
@ -147,7 +141,7 @@ module VX_fpu_fpnew #(
            default:;
        endcase

-    `ifdef ISA_RV64F
+    `ifdef FPU_RV64_F
        // apply nan-boxing to floating-point operands
        for (integer i = 0; i < NUM_LANES; ++i) begin                    
            if (op_type != `INST_FPU_I2F && op_type != `INST_FPU_U2F) begin
--- a/hw/rtl/fpu/VX_fpu_ncomp.sv
+++ b/hw/rtl/fpu/VX_fpu_ncomp.sv
@ -86,7 +86,7 @@ module VX_fpu_ncomp #(

    wire                        valid_in_s0;
    wire [TAGW-1:0]             tag_in_s0;
-    wire [4:0]                  op_mod_s0;
+    wire [3:0]                  op_mod_s0;
    wire [NUM_LANES-1:0][31:0]  dataa_s0, datab_s0;
    wire [NUM_LANES-1:0]        a_sign_s0, b_sign_s0;
    wire [NUM_LANES-1:0][7:0]   a_exponent_s0;
@ -96,7 +96,7 @@ module VX_fpu_ncomp #(

    wire stall;

-    wire [4:0] op_mod = {(op_type == INST_FPU_CMP), frm};
+    wire [3:0] op_mod = {(op_type == `INST_FPU_CMP), frm};

    VX_pipe_register #(
        .DATAW  (1 + TAGW + 4 + NUM_LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1)),
@ -169,7 +169,7 @@ module VX_fpu_ncomp #(
    reg [NUM_LANES-1:0] fcmp_fflags_NV;  // comparison fflags
    for (genvar i = 0; i < NUM_LANES; ++i) begin
        always @(*) begin
-            case (op_mod_s0[1:0])
+            case (op_mod_s0[2:0])
                `INST_FRM_RNE: begin // LE                    
                    if (a_fclass_s0[i].is_nan || b_fclass_s0[i].is_nan) begin
                        fcmp_res[i]       = 32'h0;
--- a/hw/rtl/fpu/VX_fpu_sqrt.sv
+++ b/hw/rtl/fpu/VX_fpu_sqrt.sv
@ -57,6 +57,9 @@ module VX_fpu_sqrt #(
        );
    end

+    assign has_fflags = 0;
+    assign fflags = 'x;
+
 `elsif VIVADO

    for (genvar i = 0; i < NUM_LANES; ++i) begin
@ -66,41 +69,42 @@ module VX_fpu_sqrt #(
            .aclk                (clk),
            .aclken              (enable),
            .s_axis_a_tvalid     (1'b1),
-            .s_axis_a_tdata      (dataa[i]),
+            .s_axis_a_tdata      (dataa[i][31:0]),
            `UNUSED_PIN (m_axis_result_tvalid),
-            .m_axis_result_tdata (result[i]),
+            .m_axis_result_tdata (result[i][31:0]),
            .m_axis_result_tuser (tuser)
        );
                        // NV,  DZ,   OF,   UF,   NX
        assign fflags[i] = {tuser, 1'b0, 1'b0, 1'b0, 1'b0};
    end

+    assign has_fflags = 1;
+
 `else

    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        reg [`XLEN-1:0] r;
+        reg [63:0] r; // 64-bit DPI result; only r[31:0] is forwarded below
+        `UNUSED_VAR (r)

        fflags_t f;
-        `UNUSED_VAR (f)

        always @(*) begin        
-            dpi_fsqrt (enable && valid_in, dataa[i], frm, r, f);
+            dpi_fsqrt (enable && valid_in, int'(0), 64'(dataa[i]), frm, r, f); // NOTE(review): int'(0) is presumably dst_fmt, as in dpi_fadd - confirm
        end
        
        VX_shift_register #(
-            .DATAW  (`XLEN),
+            .DATAW  (32 + $bits(fflags_t)), // 32-bit result plus the flags
            .DEPTH  (`LATENCY_FSQRT)
        ) shift_req_dpi (
            .clk      (clk),
            `UNUSED_PIN (reset),
            .enable   (enable),
-            .data_in  (r),
-            .data_out (result[i])
+            .data_in  ({r[31:0],   f}), // result and flags go through the same shift register
+            .data_out ({result[i], fflags[i]})
        );
    end

-    assign has_fflags = 1'b0;
-    assign fflags = '0;
+    assign has_fflags = 1; // fflags is now driven from the DPI call instead of tied to zero

 `endif