minor update

2025-04-23 21:39:10 -04:00 · 2023-11-12 23:40:59 -08:00 · 2023-11-12 23:40:59 -08:00 · a08d3ebd42
commit a08d3ebd42
parent 62cdd8e993
6 changed files with 81 additions and 136 deletions
--- a/hw/rtl/afu/xrt/VX_afu_wrap.sv
+++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv
@ -262,7 +262,7 @@ module VX_afu_wrap #(
 		.m_axi_awready	(m_axi_mem_awready_a),
 		.m_axi_awaddr	(m_axi_mem_awaddr_w),
 		.m_axi_awid		(m_axi_mem_awid_a),
-		`UNUSED_PIN (m_axi_awlen),
+		.m_axi_awlen    (m_axi_mem_awlen_a),
 		`UNUSED_PIN (m_axi_awsize),
 		`UNUSED_PIN (m_axi_awburst),
 		`UNUSED_PIN (m_axi_awlock),
--- a/hw/rtl/core/VX_fpu_unit.sv
+++ b/hw/rtl/core/VX_fpu_unit.sv
--- a/hw/rtl/core/VX_muldiv_unit.sv
+++ b/hw/rtl/core/VX_muldiv_unit.sv
@ -220,8 +220,13 @@ module VX_muldiv_unit #(
    wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
+    `ifdef XLEN_64
        assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
        assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
+    `else
+        assign div_in1[i] = execute_if.data.rs1_data[i];
+        assign div_in2[i] = execute_if.data.rs2_data[i];
+    `endif        
    end

 `ifdef IDIV_DPI    
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@ -107,6 +107,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
            .ready_out (scoreboard_if[i].ready)
        );

+    `ifdef SIMULATION
        reg [31:0] timeout_ctr;
    
        always @(posedge clk) begin
@ -134,6 +135,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
        `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
            ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
                $time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
+    `endif
+    
    end    

 endmodule
--- a/hw/rtl/fpu/VX_fpu_cvt.sv
+++ b/hw/rtl/fpu/VX_fpu_cvt.sv
@ -52,30 +52,27 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
 
    localparam MAN_BITS = 23;
    localparam EXP_BITS = 8;
-    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;    
-
-    localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
-    localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
+    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
    
    // Use 32-bit integer
-    localparam MAX_INT_WIDTH = 32;
+    localparam INT_WIDTH = 32;

    // The internal mantissa includes normal bit or an entire integer
-    localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
+    localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);

    // The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
    localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);

    // The internal exponent must be able to represent the smallest denormal input value as signed
    // or the number of bits in an integer
-    localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
+    localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;

    // shift amount for denormalization
    localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);

    localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
    localparam NUM_FP_STICKY  = 2 * INT_MAN_WIDTH - MAN_BITS - 1;   // removed mantissa, 1. and R
-    localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH;  // removed int and R
+    localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH;  // removed int and R
    
    // Input processing
    
@ -86,8 +83,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
            .EXP_BITS (EXP_BITS),
            .MAN_BITS (MAN_BITS)
        ) fp_class (
-            .exp_i  (dataa[i][30:23]),
-            .man_i  (dataa[i][22:0]),
+            .exp_i  (dataa[i][INT_WIDTH-2:MAN_BITS]),
+            .man_i  (dataa[i][MAN_BITS-1:0]),
            .clss_o (fclass[i])
        );
    end
@ -97,15 +94,13 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
    wire [NUM_LANES-1:0]                    input_sign;
    
    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        wire [INT_MAN_WIDTH-1:0] int_mantissa;
-        wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
-        wire fmt_sign        = dataa[i][31];
-        wire int_sign        = dataa[i][31] && is_signed;
-        assign int_mantissa  = int_sign ? (-dataa[i]) : dataa[i];
-        assign fmt_mantissa  = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
+        wire i2f_sign = dataa[i][INT_WIDTH-1];
+        wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
+        wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
+        wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
        assign input_exp[i]  = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
-        assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
-        assign input_sign[i] = is_itof ? int_sign : fmt_sign;
+        assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
+        assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
    end

    // Pipeline stage0
@ -159,9 +154,9 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
        assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];

        // Unbias exponent and compensate for shift
-        wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
-        wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
-        assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
+        wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
    end

    // Pipeline stage1
@ -193,51 +188,32 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(

    wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
-    wire [NUM_LANES-1:0]                    of_before_round_s1;
+    wire [NUM_LANES-1:0] of_before_round_s1;

-    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        reg [2*INT_MAN_WIDTH:0] preshift_mant_s1;   // mantissa before final shift                
-        reg [SHAMT_BITS-1:0]    denorm_shamt_s1;    // shift amount for denormalization
-        reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1;   // after eventual adjustments
-        reg                     of_before_round_tmp_s1;
+    for (genvar i = 0; i < NUM_LANES; ++i) begin           
+        reg [SHAMT_BITS-1:0] denorm_shamt_s1;    // shift amount for denormalization
+        reg of_before_round_tmp_s1;

        always @(*) begin
-            final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
-            preshift_mant_s1 = {input_mant_s1[i], 33'b0};
            denorm_shamt_s1  = '0;
            of_before_round_tmp_s1 = 1'b0;

-            if (is_itof_s1) begin                   
-                if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
-                    // Overflow or infinities (for proper rounding)
-                    final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
-                    preshift_mant_s1 = ~0;  // largest normal value and RS bits set
-                    of_before_round_tmp_s1 = 1'b1;
-                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
-                    // Limit the shift to retain sticky bits
-                    final_exp_tmp_s1 = '0; // denormal result
-                    denorm_shamt_s1  = (2 + MAN_BITS); // to sticky                
-                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
-                    // Denormalize underflowing values
-                    final_exp_tmp_s1 = '0; // denormal result
-                    denorm_shamt_s1  = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting               
-                end
-            end else begin
-                if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
-                    // overflow: when converting to unsigned the range is larger by one
+            if (!is_itof_s1) begin
+                if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
+                    // overflow
                    of_before_round_tmp_s1 = 1'b1;                
                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
                    // underflow
-                    denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
+                    denorm_shamt_s1 = INT_WIDTH+1; // all bits go to the sticky
                end else begin
                    // By default right shift mantissa to be an integer
-                    denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
+                    denorm_shamt_s1 = SHAMT_BITS'(INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
                end              
            end
        end

-        assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
-        assign final_exp_s1[i]        = final_exp_tmp_s1;
+        assign destination_mant_s1[i] = {input_mant_s1[i], 33'b0} >> denorm_shamt_s1;
+        assign final_exp_s1[i]        = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
        assign of_before_round_s1[i]  = of_before_round_tmp_s1;
    end

@ -267,33 +243,33 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
        .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
    );

-    wire [NUM_LANES-1:0]       rounded_sign_s2;
-    wire [NUM_LANES-1:0][31:0] rounded_abs_s2;      // absolute value of result after rounding
-    wire [NUM_LANES-1:0]       int_round_has_sticky_s2;
-    wire [NUM_LANES-1:0]       fp_round_has_sticky_s2;
+    wire [NUM_LANES-1:0]    rounded_sign_s2;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
+    wire [NUM_LANES-1:0]    f2i_round_has_sticky_s2;
+    wire [NUM_LANES-1:0]    i2f_round_has_sticky_s2;
    
    // Rouding and classification
   
    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        wire [MAN_BITS-1:0]      final_mant_s2;        // mantissa after adjustments
-        wire [MAX_INT_WIDTH-1:0] final_int_s2;         // integer shifted in position
-        wire [1:0]               round_sticky_bits_s2;
-        wire [31:0]              fmt_pre_round_abs_s2;
-        wire [31:0]              pre_round_abs_s2;
-        wire [1:0]               int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
+        wire [MAN_BITS-1:0]     final_mant_s2;  // mantissa after adjustments
+        wire [INT_WIDTH-1:0]    final_int_s2;   // integer shifted in position
+        wire [1:0]              round_sticky_bits_s2;
+        wire [INT_WIDTH-1:0]    fmt_pre_round_abs_s2;
+        wire [INT_WIDTH-1:0]    pre_round_abs_s2;
+        wire [1:0]              f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;

        // Extract final mantissa and round bit, discard the normal bit (for FP)
-        assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
-        assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (MAX_INT_WIDTH+1) + 1];
+        assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
+        assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (INT_WIDTH+1) + 1];

        // Collapse sticky bits
-        assign fp_round_sticky_bits_s2[0]  = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
-        assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
-        assign fp_round_has_sticky_s2[i]   = (| fp_round_sticky_bits_s2);
-        assign int_round_has_sticky_s2[i]  = (| int_round_sticky_bits_s2);
+        assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
+        assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
+        assign i2f_round_has_sticky_s2[i]  = (| i2f_round_sticky_bits_s2);
+        assign f2i_round_has_sticky_s2[i]  = (| f2i_round_sticky_bits_s2);

        // select RS bits for destination operation
-        assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
+        assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;

        // Pack exponent and mantissa into proper rounding form
        assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
@ -327,10 +303,10 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
    wire [NUM_LANES-1:0] mant_is_zero_s3;
    wire [NUM_LANES-1:0] input_sign_s3;
    wire [NUM_LANES-1:0] rounded_sign_s3;
-    wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
    wire [NUM_LANES-1:0] of_before_round_s3;   
-    wire [NUM_LANES-1:0] int_round_has_sticky_s3;
-    wire [NUM_LANES-1:0] fp_round_has_sticky_s3; 
+    wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
+    wire [NUM_LANES-1:0] i2f_round_has_sticky_s3; 

    VX_pipe_register #(
        .DATAW  (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
@ -339,105 +315,68 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
-        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
+        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
+        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
    );
     
-    wire [NUM_LANES-1:0] of_after_round_s3;
-    wire [NUM_LANES-1:0] uf_after_round_s3;
-    wire [NUM_LANES-1:0][31:0] fmt_result_s3;
-    wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
    wire [NUM_LANES-1:0] rounded_int_res_zero_s3;  // after rounding

    for (genvar i = 0; i < NUM_LANES; ++i) begin
        // Assemble regular result, nan box short ones. Int zeroes need to be detected
-        assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
-
-        // Classification after rounding select by destination format
-        assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0);  // denormal
-        assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
+        assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};

        // Negative integer result needs to be brought into two's complement
        assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
        assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
    end

-    // FP Special case handling
+    // F2I Special case handling

-    wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
-    fflags_t [NUM_LANES-1:0]   fp_special_status_s3;
-    wire [NUM_LANES-1:0]       fp_result_is_special_s3;
-
-    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        // Detect special case from source format, I2F casts don't produce a special result
-        assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
-
-        // Signalling input NaNs raise invalid flag, otherwise no flags set
-        assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
-
-        // Assemble result according to destination format
-        assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
-                                                              : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
-    end
-
-    // INT Special case handling
-
-    reg [NUM_LANES-1:0][31:0] int_special_result_s3;
-    fflags_t [NUM_LANES-1:0]  int_special_status_s3;
-    wire [NUM_LANES-1:0]      int_result_is_special_s3;
+    reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
+    fflags_t [NUM_LANES-1:0]  f2i_special_status_s3;
+    wire [NUM_LANES-1:0]      f2i_result_is_special_s3;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
         // Assemble result according to destination format
        always @(*) begin
            if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
-                int_special_result_s3[i][30:0] = '0;            // alone yields 2**(31)-1
-                int_special_result_s3[i][31]   = ~unsigned_s3;  // for unsigned casts yields 2**31
+                f2i_special_result_s3[i][INT_WIDTH-2:0] = '0;            // alone yields 2**(31)-1
+                f2i_special_result_s3[i][INT_WIDTH-1]   = ~unsigned_s3;  // for unsigned casts yields 2**31
            end else begin
-                int_special_result_s3[i][30:0] = 2**(31) - 1;   // alone yields 2**(31)-1
-                int_special_result_s3[i][31]   = unsigned_s3;   // for unsigned casts yields 2**31
+                f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1;   // alone yields 2**(31)-1
+                f2i_special_result_s3[i][INT_WIDTH-1]   = unsigned_s3;   // for unsigned casts yields 2**31
            end
        end            

        // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
-        assign int_result_is_special_s3[i] = fclass_s3[i].is_nan 
+        assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan 
                                           | fclass_s3[i].is_inf
                                           | of_before_round_s3[i]
                                           | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
                                        
        // All integer special cases are invalid
-        assign int_special_status_s3[i] = {1'b1, 4'h0};
+        assign f2i_special_status_s3[i] = {1'b1, 4'h0};
    end

    // Result selection and Output handshake

    fflags_t [NUM_LANES-1:0] tmp_fflags_s3;    
-    wire [NUM_LANES-1:0][31:0] tmp_result_s3;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;

-    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        fflags_t fp_regular_status_s3, int_regular_status_s3;
-        fflags_t fp_status_s3, int_status_s3;    
-        wire [31:0] fp_result_s3, int_result_s3;
+    for (genvar i = 0; i < NUM_LANES; ++i) begin                                  
+        fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
+        fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0;

-        wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;        
-                                     : (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
-                                  
-        assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
-        assign fp_regular_status_s3.DZ = 1'b0; // no divisions
-        assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
-        assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
-        assign fp_regular_status_s3.NX = inexact_s3;
+        fflags_t i2f_status_s3 = i2f_regular_status_s3;
+        fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;

-        assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
+        wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
+        wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];

-        assign fp_result_s3  = fp_result_is_special_s3[i]  ? fp_special_result_s3[i]  : fmt_result_s3[i];        
-        assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
-
-        assign fp_status_s3  = fp_result_is_special_s3[i]  ? fp_special_status_s3[i]  : fp_regular_status_s3;
-        assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
-
-        // Select output depending on special case detection
-        assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
-        assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
+        assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
+        assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
    end

    assign stall = ~ready_out && valid_out;
@ -457,7 +396,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
    );

    assign ready_in = ~stall;
-
    assign has_fflags = 1'b1;

 endmodule
--- a/hw/rtl/fpu/VX_fpu_rounding.sv
+++ b/hw/rtl/fpu/VX_fpu_rounding.sv
@ -54,7 +54,6 @@ module VX_fpu_rounding #(
                      2'b01: round_up = 1'b0;            // < ulp/2 away, round down
                      2'b10: round_up = abs_value_i[0];  // = ulp/2 away, round towards even result
                      2'b11: round_up = 1'b1;            // > ulp/2 away, round up
-                    default: round_up = 1'bx;
                endcase
            `INST_FRM_RTZ: round_up = 1'b0; // always round down
            `INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i;  // to 0 if +, away if -