Fixed cvtint bug by adding 2 bits to convert width; initial implementation of fround passes basic regression but fails some nightly regression cases

2025-06-28 09:36:01 -04:00 · 2024-05-11 22:32:51 -07:00 · 2024-05-11 22:32:51 -07:00 · 009d251433
commit 009d251433
parent c0743a1fcf
15 changed files with 399 additions and 263 deletions
--- a/bin/regression-wally
+++ b/bin/regression-wally
@ -125,19 +125,19 @@ derivconfigtests = [
        ["div_4_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
        ["div_4_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
        ["div_4_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_4_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]], 
        # fpu permutations
-        ["f_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma"]],
+        ["f_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfaf"]],
-        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32zfaf"]],
-        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
+        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32zfaf", "arch32zfad"]],
-        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32i"]],
+        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32i", "arch32zfaf", "arch32zfad"]],
-        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32i"]],
+        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32i", "arch32zfaf", "arch32zfad"]],
-        ["f_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma"]],
+        ["f_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfaf"]],
-        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfh", "arch64zfh_divsqrt"]], 
+        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64zfaf"]], 
-        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
+        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64zfaf", "arch64zfad"]],
-        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64i"]],
+        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64i", "arch64zfaf", "arch64zfad"]],
-        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64i", "wally64q"]],
+        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64i", "wally64q", "arch64zfaf", "arch64zfad"]],
    ]
 bpredtests = [
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@ -75,6 +75,7 @@ localparam NE   = Q_SUPPORTED ? Q_NE   : D_SUPPORTED ? D_NE   : S_NE;
 localparam NF   = Q_SUPPORTED ? Q_NF   : D_SUPPORTED ? D_NF   : S_NF;
 localparam FMT  = Q_SUPPORTED ? 2'd3   : D_SUPPORTED ? 2'd1   : 2'd0;
 localparam BIAS = Q_SUPPORTED ? Q_BIAS : D_SUPPORTED ? D_BIAS : S_BIAS;
 localparam LOGFLEN = $clog2(FLEN);
 // Floating point constants needed for FPU paramerterization
 // LEN1/NE1/NF1/FNT1 is the size of the second longest supported format
@ -124,7 +125,8 @@ localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1));
 //     because NORMSHIFTSZ becomes limited by convert rather than divider
 //     Figure out why extra two bits are needed for convert (and only in testbench_fp, not Wally)
 //     Might be a testbench_fp issue
-localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1+2), (DIVb + 1 + NF + 1)), (3*NF+6));
+//localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1+2), (DIVb + 1 + NF + 1)), (3*NF+6));
 localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (3*NF+6));
 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));                  // log_2(NORMSHIFTSZ)
 localparam CORRSHIFTSZ = NORMSHIFTSZ-2;                             // Drop leading 2 integer bits
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@ -173,6 +173,7 @@ localparam cvw_t P = '{
  H_BIAS : H_BIAS,
  H_FMT : H_FMT,
  FLEN : FLEN,
  LOGFLEN : LOGFLEN,
  NE   : NE  ,
  NF   : NF  ,
  FMT  : FMT ,
--- a/src/cvw.sv
+++ b/src/cvw.sv
@ -260,7 +260,8 @@ typedef struct packed {
  logic [1:0] H_FMT;
 // Floating point length FLEN and number of exponent (NE) and fraction (NF) bits
-  int FLEN;
+  int         FLEN;
  int         LOGFLEN;
  int         NE  ;
  int         NF  ;
  logic [1:0] FMT ;
--- a/src/fpu/fctrl.sv
+++ b/src/fpu/fctrl.sv
@ -48,7 +48,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  output logic                 XEnE, YEnE, ZEnE,                   // enable inputs
  // operation mux selections                                    
  output logic                 FCvtIntE, FCvtIntW,                 // convert to integer operation
-  output logic [2:0]           FrmM,                               // FP rounding mode
+  output logic [2:0]           FrmE, FrmM,                          // FP rounding mode
  output logic [P.FMTBITS-1:0] FmtE, FmtM,                         // FP format
  output logic [2:0]           OpCtrlE, OpCtrlM,                   // Select which operation to do in each component
  output logic                 FpLoadStoreM,                       // FP load or store instruction
@ -56,6 +56,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  output logic [1:0]           FResSelE, FResSelM, FResSelW,       // Select one of the results that finish in the memory stage
  output logic                 FPUActiveE,                         // FP instruction being executed
  output logic                 ZfaE, ZfaM,                         // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod)
  output logic                 ZfaFRoundNXE,                       // Zfa froundnx instruction
  // register control signals
  output logic                 FRegWriteE, FRegWriteM, FRegWriteW, // FP register write enable
  output logic                 FWriteIntE, FWriteIntM,             // Write to integer register
@ -66,7 +67,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  output logic                 FDivStartE, IDivStartE              // Start division or squareroot
  );
-  `define FCTRLW 13
+  `define FCTRLW 14
  logic [`FCTRLW-1:0]          ControlsD;                          // control signals
  logic                        FRegWriteD;                         // FP register write enable
@ -75,13 +76,14 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  logic [2:0]                  OpCtrlD;                            // Select which operation to do in each component
  logic [1:0]                  PostProcSelD;                       // select result in the post processing unit
  logic [1:0]                  FResSelD;                           // Select one of the results that finish in the memory stage
-  logic [2:0]                  FrmD, FrmE;                         // FP rounding mode
+  logic [2:0]                  FrmD;                               // FP rounding mode
  logic [P.FMTBITS-1:0]        FmtD;                               // FP format
  logic [1:0]                  Fmt, Fmt2;                          // format - before possible reduction
  logic                        SupportedFmt;                       // is the format supported
  logic                        SupportedFmt2;                      // is the source format supported for fp -> fp
  logic                        FCvtIntD, FCvtIntM;                 // convert to integer operation
  logic                        ZfaD;                               // Zfa variants of instructions
  logic                        ZfaFRoundNXD;                       // Zfa froundnx instruction
  // FPU Instruction Decoder
  assign Fmt = Funct7D[1:0];
@ -93,156 +95,156 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                         (Fmt2 == 2'b10 & P.ZFH_SUPPORTED) | (Fmt2 == 2'b11 & P.Q_SUPPORTED));
  // decode the instruction                       
-  // FRegWrite_FWriteInt_FResSel_PostProcSel_FOpCtrl_FDivStart_IllegalFPUInstr_FCvtInt_Zfa
+  // FRegWrite_FWriteInt_FResSel_PostProcSel_FOpCtrl_FDivStart_IllegalFPUInstr_FCvtInt_Zfa_FroundNX
  always_comb
    if (STATUS_FS == 2'b00) // FPU instructions are illegal when FPU is disabled
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0;
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0_0;
    else if (OpD != 7'b0000111 & OpD != 7'b0100111 & ~SupportedFmt) 
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0; // for anything other than loads and stores, check for supported format
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0_0; // for anything other than loads and stores, check for supported format
    else begin 
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0; // default: non-implemented instruction
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0_0; // default: non-implemented instruction
      /* verilator lint_off CASEINCOMPLETE */   // default value above has priority so no other default needed
      case(OpD)
        7'b0000111: case(Funct3D)
-                      3'b010:                       ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flw
+                      3'b010:                       ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0_0; // flw
-                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // fld
+                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0_0; // fld
-                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flq
+                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0_0; // flq
-                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flh
+                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0_0; // flh
                    endcase
        7'b0100111: case(Funct3D)
-                      3'b010:                       ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsw
+                      3'b010:                       ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0_0; // fsw
-                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsd
+                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0_0; // fsd
-                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsq
+                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0_0; // fsq
-                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsh
+                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0_0; // fsh
                    endcase
-        7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0_0_0; // fmadd
+        7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0_0_0_0; // fmadd
-        7'b1000111:   ControlsD = `FCTRLW'b1_0_01_10_001_0_0_0_0; // fmsub
+        7'b1000111:   ControlsD = `FCTRLW'b1_0_01_10_001_0_0_0_0_0; // fmsub
-        7'b1001011:   ControlsD = `FCTRLW'b1_0_01_10_010_0_0_0_0; // fnmsub
+        7'b1001011:   ControlsD = `FCTRLW'b1_0_01_10_010_0_0_0_0_0; // fnmsub
-        7'b1001111:   ControlsD = `FCTRLW'b1_0_01_10_011_0_0_0_0; // fnmadd
+        7'b1001111:   ControlsD = `FCTRLW'b1_0_01_10_011_0_0_0_0_0; // fnmadd
        7'b1010011: casez(Funct7D)
-                      7'b00000??: ControlsD = `FCTRLW'b1_0_01_10_110_0_0_0_0; // fadd
+                      7'b00000??: ControlsD = `FCTRLW'b1_0_01_10_110_0_0_0_0_0; // fadd
-                      7'b00001??: ControlsD = `FCTRLW'b1_0_01_10_111_0_0_0_0; // fsub
+                      7'b00001??: ControlsD = `FCTRLW'b1_0_01_10_111_0_0_0_0_0; // fsub
-                      7'b00010??: ControlsD = `FCTRLW'b1_0_01_10_100_0_0_0_0; // fmul
+                      7'b00010??: ControlsD = `FCTRLW'b1_0_01_10_100_0_0_0_0_0; // fmul
-                      7'b00011??: ControlsD = `FCTRLW'b1_0_01_01_xx0_1_0_0_0; // fdiv
+                      7'b00011??: ControlsD = `FCTRLW'b1_0_01_01_xx0_1_0_0_0_0; // fdiv
-                      7'b01011??: if (Rs2D == 5'b0000) ControlsD = `FCTRLW'b1_0_01_01_xx1_1_0_0_0; // fsqrt
+                      7'b01011??: if (Rs2D == 5'b0000) ControlsD = `FCTRLW'b1_0_01_01_xx1_1_0_0_0_0; // fsqrt
                      7'b00100??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_000_0_0_0_0; // fsgnj
+                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_000_0_0_0_0_0; // fsgnj
-                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_001_0_0_0_0; // fsgnjn
+                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_001_0_0_0_0_0; // fsgnjn
-                                    3'b010:  ControlsD = `FCTRLW'b1_0_00_00_010_0_0_0_0; // fsgnjx
+                                    3'b010:  ControlsD = `FCTRLW'b1_0_00_00_010_0_0_0_0_0; // fsgnjx
                                  endcase
                      7'b00101??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_0; // fmin
+                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_0_0; // fmin
-                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_0; // fmax
+                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_0_0; // fmax
-                                    3'b010:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_1; // fminm  (Zfa)
+                                    3'b010:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_1_0; // fminm  (Zfa)
-                                    3'b011:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_1; // fmaxm  (Zfa)
+                                    3'b011:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_1_0; // fmaxm  (Zfa)
                                  endcase
                      7'b10100??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_0; // fle
+                                    3'b000:  ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_0_0; // fle
-                                    3'b001:  ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_0; // flt
+                                    3'b001:  ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_0_0; // flt
-                                    3'b010:  ControlsD = `FCTRLW'b0_1_00_00_010_0_0_0_0; // feq
+                                    3'b010:  ControlsD = `FCTRLW'b0_1_00_00_010_0_0_0_0_0; // feq
-                                    3'b100:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_1; // fleq  (Zfa)
+                                    3'b100:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_1_0; // fleq  (Zfa)
-                                    3'b101:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_1; // fltq  (Zfa)
+                                    3'b101:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_1_0; // fltq  (Zfa)
                                  endcase
                      7'b11100??: if (Funct3D == 3'b001 & Rs2D == 5'b00000)          
-                                                ControlsD = `FCTRLW'b0_1_10_00_000_0_0_0_0; // fclass
+                                                ControlsD = `FCTRLW'b0_1_10_00_000_0_0_0_0_0; // fclass
                                  else if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
-                                                ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_0; // fmv.x.w/d/h/q  fp to int register
+                                                ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_0_0; // fmv.x.w/d/h/q  fp to int register
                                  else if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct7D[1:0] == 2'b01 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
-                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.d  (Zfa) 
+                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1_0; // fmvh.x.d  (Zfa) 
                                  //  Q not supported in RV64GC
                                  // coverage off   
                                  else if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct7D[1:0] == 2'b11 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
-                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.q  (Zfa)
+                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1_0; // fmvh.x.q  (Zfa)
                                  // coverage on
                      7'b11110??: if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
-                                                ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0_0; // fmv.w/d/h/q.x  int to fp reg
+                                                ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0_0_0; // fmv.w/d/h/q.x  int to fp reg
                                  else if (P.ZFA_SUPPORTED & Funct3D == 3'b000 & Rs2D == 5'b00001) 
-                                                ControlsD = `FCTRLW'b1_0_00_00_111_0_0_0_1; // fli  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_111_0_0_0_1_0; // fli  (Zfa)
                      7'b0100000: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b00)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_0; // fcvt.s.(d/q/h)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_0_0; // fcvt.s.(d/q/h)
                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.s  (Zfa) *** needs ctrl for all rounds
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // fround.s  (Zfa) 
                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.s  (Zfa) *** needs ctrl for all rounds
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // froundnx.s  (Zfa) 
                      7'b0100001: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b01)
-                                                ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0_0; // fcvt.d.(s/h/q)
+                                                ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0_0_0; // fcvt.d.(s/h/q)
                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.d  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // fround.d  (Zfa)
                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.d  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // froundnx.d  (Zfa)
                      7'b0100010: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b10)
-                                                ControlsD = `FCTRLW'b1_0_01_00_010_0_0_0_0; // fcvt.h.(s/d/q)
+                                                ControlsD = `FCTRLW'b1_0_01_00_010_0_0_0_0_0; // fcvt.h.(s/d/q)
                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.h  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // fround.h  (Zfa)
                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.h  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // froundnx.h  (Zfa)
                      // coverage off
                      // Not covered in testing because rv64gc does not support quad precision
                      7'b0100011: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b11)
-                                                ControlsD = `FCTRLW'b1_0_01_00_011_0_0_0_0; // fcvt.q.(s/h/d)
+                                                ControlsD = `FCTRLW'b1_0_01_00_011_0_0_0_0_0; // fcvt.q.(s/h/d)
                                  else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.q  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // fround.q  (Zfa)
                                  else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.q  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // froundnx.q  (Zfa)
                      // coverage on
                      7'b1101000: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.s.w   w->s
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fcvt.s.w   w->s
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.s.wu wu->s
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0_0; // fcvt.s.wu wu->s
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.s.l   l->s
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0_0; // fcvt.s.l   l->s
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.s.lu lu->s
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0_0; // fcvt.s.lu lu->s
                                  endcase
                      7'b1100000: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.s   s->w
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0_0; // fcvt.w.s   s->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.s  s->wu
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0_0; // fcvt.wu.s  s->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.s   s->l
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0_0; // fcvt.l.s   s->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.s  s->lu
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0_0; // fcvt.lu.s  s->lu
                                  endcase
                      7'b1101001: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.d.w   w->d
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fcvt.d.w   w->d
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.d.wu wu->d
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0_0; // fcvt.d.wu wu->d
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.d.l   l->d
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0_0; // fcvt.d.l   l->d
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.d.lu lu->d
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0_0; // fcvt.d.lu lu->d
                                  endcase
                      7'b1100001: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.d   d->w
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0_0; // fcvt.w.d   d->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.d  d->wu
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0_0; // fcvt.wu.d  d->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.d   d->l
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0_0; // fcvt.l.d   d->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.d  d->lu
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0_0; // fcvt.lu.d  d->lu
                                    5'b01000: if (P.ZFA_SUPPORTED & P.D_SUPPORTED & Funct3D == 3'b001) 
-                                                 ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_1; // fcvtmod.w.d (Zfa)
+                                                 ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_1_0; // fcvtmod.w.d (Zfa)
                                  endcase
                      7'b1101010: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.h.w   w->h
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fcvt.h.w   w->h
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.h.wu wu->h
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0_0; // fcvt.h.wu wu->h
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.h.l   l->h
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0_0; // fcvt.h.l   l->h
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.h.lu lu->h
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0_0; // fcvt.h.lu lu->h
                                  endcase
                      7'b1100010: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.h   h->w
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0_0; // fcvt.w.h   h->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.h  h->wu
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0_0; // fcvt.wu.h  h->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.h   h->l
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0_0; // fcvt.l.h   h->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.h  h->lu
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0_0; // fcvt.lu.h  h->lu
                                  endcase
                      // Not covered in testing because rv64gc does not support quad precision
                      // coverage off
                      7'b1101011: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.q.w   w->q
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fcvt.q.w   w->q
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.q.wu wu->q
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0_0; // fcvt.q.wu wu->q
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.q.l   l->q
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0_0; // fcvt.q.l   l->q
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.q.lu lu->q
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0_0; // fcvt.q.lu lu->q
                                  endcase
                      7'b1100011: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.q   q->w
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0_0; // fcvt.w.q   q->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.q  q->wu
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0_0; // fcvt.wu.q  q->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.q   q->l
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0_0; // fcvt.l.q   q->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.q  q->lu
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0_0; // fcvt.lu.q  q->lu
                                  endcase
                      // coverage off
                      // Not covered in testing because rv64gc is not RV64Q or RV32D
                      7'b1011001: if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct3D == 3'b000) 
-                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fmvp.d.x  (Zfa) *** untested, controls could be wrong
+                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fmvp.d.x  (Zfa) *** untested, controls could be wrong
                      7'b1011011: if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct3D == 3'b000) 
-                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fmvp.q.x  (Zfa)
+                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fmvp.q.x  (Zfa)
                      // coverage on
                   endcase
      endcase
@ -250,7 +252,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
    /* verilator lint_on CASEINCOMPLETE */
  // unswizzle control bits
-  assign {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, FDivStartD, IllegalFPUInstrD, FCvtIntD, ZfaD} = ControlsD;
+  assign {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, FDivStartD, IllegalFPUInstrD, FCvtIntD, ZfaD, ZfaFRoundNXD} = ControlsD;
  // rounding modes:
  //    000 - round to nearest, ties to even
@ -259,7 +261,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  //    011 - round up - round twords positive infinity
  //    100 - round to nearest, ties to max magnitude - round to nearest, ties away from zero
  //    111 - dynamic - choose FRM_REGW as rounding mode
-  assign FrmD = &Funct3D ? FRM_REGW : Funct3D;
+  assign FrmD = (Funct3D == 3'b111) ? FRM_REGW : Funct3D;
  // Precision
  //    00 - single
@ -269,7 +271,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
    if (P.FPSIZES == 1)
      assign FmtD = 1'b0;
-    else if (P.FPSIZES == 2)begin
+    else if (P.FPSIZES == 2) begin
      logic [1:0] FmtTmp;
      assign FmtTmp = ((Funct7D[6:3] == 4'b0100)&OpD[4]) ? Rs2D[1:0] : (~OpD[6]&(&OpD[2:0])) ? {~Funct3D[1], ~(Funct3D[1]^Funct3D[0])} : Funct7D[1:0];
      assign FmtD = (P.FMT == FmtTmp);
@ -313,6 +315,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  //        011 - mv to fp        01
  //        110 - min             10
  //        101 - max             10
  //        100 - fround          11
  //        111 - fli             11
  //  OpCtrl:
@ -350,9 +353,9 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  assign Adr3D = InstrD[31:27];
  // D/E pipleine register
-  flopenrc #(15+P.FMTBITS) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+  flopenrc #(`FCTRLW+2+P.FMTBITS) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-              {FRegWriteD, PostProcSelD, FResSelD, FrmD, FmtD, OpCtrlD, FWriteIntD, FCvtIntD, ZfaD, ~IllegalFPUInstrD},
+              {FRegWriteD, PostProcSelD, FResSelD, FrmD, FmtD, OpCtrlD, FWriteIntD, FCvtIntD, ZfaD, ZfaFRoundNXD, ~IllegalFPUInstrD},
-              {FRegWriteE, PostProcSelE, FResSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE, FPUActiveE});
+              {FRegWriteE, PostProcSelE, FResSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE, ZfaFRoundNXE, FPUActiveE});
  flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {Adr1D, Adr2D, Adr3D}, {Adr1E, Adr2E, Adr3E});
  flopenrc #(1) DEFDivStartReg(clk, reset, FlushE, ~StallE|FDivBusyE, FDivStartD, FDivStartE);
  flopenrc #(3) DEEnReg(clk, reset, FlushE, ~StallE, {XEnD, YEnD, ZEnD}, {XEnE, YEnE, ZEnE});
@ -365,7 +368,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  flopenrc #(14+int'(P.FMTBITS)) EMCtrlReg (clk, reset, FlushM, ~StallM,
              {FRegWriteE, FResSelE, PostProcSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE},
              {FRegWriteM, FResSelM, PostProcSelM, FrmM, FmtM, OpCtrlM, FWriteIntM, FCvtIntM, ZfaM});
-  
+
  // renameing for readability
  assign FpLoadStoreM = FResSelM[1];
@ -373,5 +376,5 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
  flopenrc #(4)  MWCtrlReg(clk, reset, FlushW, ~StallW,
          {FRegWriteM, FResSelM, FCvtIntM},
          {FRegWriteW, FResSelW, FCvtIntW});
- 
+
 endmodule
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@ -37,6 +37,8 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
  input  logic                 XInfE, YInfE, 
  input  logic                 XZeroE, YZeroE, 
  input  logic                 XNaNE, YNaNE, 
  input  logic [P.NE-2:0]      BiasE,                               // Bias of exponent
  input  logic [P.LOGFLEN-1:0] NfE,                          // Number of fractional bits in selected format
  input  logic                 FDivStartE, IDivStartE,
  input  logic                 StallM,
  input  logic                 FlushE,
@ -75,7 +77,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
  fdivsqrtpreproc #(P) fdivsqrtpreproc(                          // Preprocessor
    .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE),
-    .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
+    .FmtE, .Bias(BiasE), .Nf(NfE), .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
    // Int-specific 
    .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
    .BZeroM, .IntNormShiftM, .AM, 
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@ -29,39 +29,14 @@
 module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.FMTBITS-1:0] FmtE,
  input  logic [P.LOGFLEN-1:0] Nf,          // Number of fractional bits in selected format
  input  logic                 SqrtE,
  input  logic                 IntDivE,
  input  logic [P.DIVBLEN-1:0] IntResultBitsE,    
  output logic [P.DURLEN-1:0]  CyclesE
 );
-  logic [P.DIVBLEN-1:0] Nf, FPResultBitsE, ResultBitsE; // number of fractional (result) bits
+  logic [P.DIVBLEN-1:0] FPResultBitsE, ResultBitsE; // number of fractional (result) bits
  /* verilator lint_off WIDTH */
  if (P.FPSIZES == 1)
    assign Nf = P.NF;
  else if (P.FPSIZES == 2)
    always_comb
      case (FmtE)
        1'b0: Nf = P.NF1;
        1'b1: Nf = P.NF;
      endcase
  else if (P.FPSIZES == 3)
    always_comb
      case (FmtE)
        P.FMT:   Nf = P.NF;
        P.FMT1:  Nf = P.NF1;
        P.FMT2:  Nf = P.NF2; 
        default: Nf = 'x; // shouldn't happen
      endcase
  else if (P.FPSIZES == 4)  
    always_comb
      case(FmtE)
        P.S_FMT: Nf = P.S_NF;
        P.D_FMT: Nf = P.D_NF;
        P.H_FMT: Nf = P.H_NF;
        P.Q_FMT: Nf = P.Q_NF;
      endcase 
  // Cycle logic
  // P.DIVCOPIES = k. P.LOGR = log(R) = r.  P.RK = rk.  
@ -70,6 +45,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
  // FP Sqrt needs at least Nf fractional bits and 2 guard/round bits.  The integer bit is always initialized to 1 and does not need a cycle.
  // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
  /* verilator lint_off WIDTH */
  always_comb begin 
    FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard; integer bit implicit because starting at n=1
--- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@ -28,49 +28,21 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.FMTBITS-1:0] Fmt,
+  input  logic [P.NE-2:0]      Bias,      // Bias of exponent
  input  logic [P.NE-1:0]      Xe, Ye,    // input exponents
  input  logic                 Sqrt,
  input  logic [P.DIVBLEN-1:0] ell, m,    // number of leading 0s in Xe and Ye
  output logic [P.NE+1:0]      Ue         // result exponent
  );
-  
+
  logic [P.NE-2:0] Bias;
  logic [P.NE+1:0] SXExp;
  logic [P.NE+1:0] SExp;
  logic [P.NE+1:0] DExp;
  // Determine exponent bias according to the format
  if (P.FPSIZES == 1) begin
    assign Bias = (P.NE-1)'(P.BIAS); 
  end else if (P.FPSIZES == 2) begin
    assign Bias = Fmt ? (P.NE-1)'(P.BIAS) : (P.NE-1)'(P.BIAS1); 
  end else if (P.FPSIZES == 3) begin
    always_comb
      case (Fmt)
        P.FMT: Bias  =  (P.NE-1)'(P.BIAS);
        P.FMT1: Bias = (P.NE-1)'(P.BIAS1);
        P.FMT2: Bias = (P.NE-1)'(P.BIAS2);
        default: Bias = 'x;
      endcase
  end else if (P.FPSIZES == 4) begin        
  always_comb
    case (Fmt)
      2'h3: Bias =  (P.NE-1)'(P.Q_BIAS);
      2'h1: Bias =  (P.NE-1)'(P.D_BIAS);
      2'h0: Bias =  (P.NE-1)'(P.S_BIAS);
      2'h2: Bias =  (P.NE-1)'(P.H_BIAS);
    endcase
  end
  // Square root exponent = (Xe - l - bias) / 2 + bias; l accounts for subnorms
  assign SXExp = {2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - (P.NE+2)'(P.BIAS);
  assign SExp  = {SXExp[P.NE+1], SXExp[P.NE+1:1]} + {2'b0, Bias};
-  
+
  // division exponent = (Xe-l) - (Ye-m) + bias; l and m account for subnorms
  assign DExp  = ({2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - {2'b0, Ye} + {{(P.NE+1-P.DIVBLEN){1'b0}}, m} + {3'b0, Bias}); 
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@ -33,6 +33,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.NF:0]        Xm, Ym,      // Floating-point significands
  input  logic [P.NE-1:0]      Xe, Ye,      // Floating-point exponents
  input  logic [P.FMTBITS-1:0] FmtE,
  input  logic [P.NE-2:0]      Bias,                               // Bias of exponent
  input  logic [P.LOGFLEN-1:0] Nf,          // Number of fractional bits in selected format
  input  logic                 SqrtE,
  input  logic                 XZeroE,
  input  logic [2:0]           Funct3E,
@ -209,11 +211,11 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dnorm}, D);
  // Floating-point exponent
-  fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .ell, .m(mE), .Ue(UeE));
+  fdivsqrtexpcalc #(P) expcalc(.Bias, .Xe, .Ye, .Sqrt(SqrtE), .ell, .m(mE), .Ue(UeE));
  flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
  // Number of FSM cycles (to FSM)
-  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
+  fdivsqrtcycles #(P) cyclecalc(.FmtE, .Nf, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
  if (P.IDIV_ON_FPU) begin:intpipelineregs
    logic [P.DIVBLEN-1:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
--- a/src/fpu/fmtparams.sv
+++ b/src/fpu/fmtparams.sv
@ -0,0 +1,86 @@
 ///////////////////////////////////////////
 // fmtparams.sv
 //
 // Written: David_Harris@hmc.edu
 // Modified: 5/11/24
 //
 // Purpose: Look up bias of exponent and number of fractional bits for the selected format
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
 //
 // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
 //
 // Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
 // https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
 // either express or implied. See the License for the specific language governing permissions 
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 module fmtparams import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.FMTBITS-1:0] Fmt,
  output logic [P.NE-2:0]      Bias,
  output logic [P.LOGFLEN-1:0] Nf
 );
  if (P.FPSIZES == 1) begin
    assign Bias = (P.NE-1)'(P.BIAS); 
  end else if (P.FPSIZES == 2) begin
    assign Bias = Fmt ? (P.NE-1)'(P.BIAS) : (P.NE-1)'(P.BIAS1); 
  end else if (P.FPSIZES == 3) begin
    always_comb
      case (Fmt)
        P.FMT:  Bias =  (P.NE-1)'(P.BIAS);
        P.FMT1: Bias = (P.NE-1)'(P.BIAS1);
        P.FMT2: Bias = (P.NE-1)'(P.BIAS2);
        default: Bias = 'x;
      endcase
  end else if (P.FPSIZES == 4) begin        
    always_comb
      case (Fmt)
        2'h3: Bias =  (P.NE-1)'(P.Q_BIAS);
        2'h1: Bias =  (P.NE-1)'(P.D_BIAS);
        2'h0: Bias =  (P.NE-1)'(P.S_BIAS);
        2'h2: Bias =  (P.NE-1)'(P.H_BIAS);
      endcase
  end
  /* verilator lint_off WIDTH */
  if (P.FPSIZES == 1)
    assign Nf = P.NF;
  else if (P.FPSIZES == 2)
    always_comb
      case (Fmt)
        1'b0: Nf = P.NF1;
        1'b1: Nf = P.NF;
      endcase
  else if (P.FPSIZES == 3)
    always_comb
      case (Fmt)
        P.FMT:   Nf = P.NF;
        P.FMT1:  Nf = P.NF1;
        P.FMT2:  Nf = P.NF2; 
        default: Nf = 'x; // shouldn't happen
      endcase
  else if (P.FPSIZES == 4)  
    always_comb
      case(Fmt)
        P.S_FMT: Nf = P.S_NF;
        P.D_FMT: Nf = P.D_NF;
        P.H_FMT: Nf = P.H_NF;
        P.Q_FMT: Nf = P.Q_NF;
      endcase 
  /* verilator lint_on WIDTH */
 endmodule
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@ -70,7 +70,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  // control signals
  logic                        FRegWriteW;                         // FP register write enable
-  logic [2:0]                  FrmM;                               // FP rounding mode
+  logic [2:0]                  FrmE, FrmM;                         // FP rounding mode
  logic [P.FMTBITS-1:0]        FmtE, FmtM;                         // FP precision 0-single 1-double
  logic                        FDivStartE, IDivStartE;             // Start division or squareroot
  logic                        FWriteIntM;                         // Write to integer register
@ -85,6 +85,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  logic                        FRegWriteE;                         // Write floating-point register
  logic                        FPUActiveE;                         // FP instruction being executed
  logic                        ZfaE, ZfaM;                         // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod.w.d)
  logic                        ZfaFRoundNXE;                       // Zfa froundnx variant
  // regfile signals
  logic [P.FLEN-1:0]           FRD1D, FRD2D, FRD3D;                // Read Data from FP register - decode stage
@ -112,6 +113,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  logic                        XInfM, YInfM, ZInfM;                // is the input infinity - memory stage
  logic                        XExpMaxE;                           // is the exponent all ones (max value)
  logic [P.FLEN-1:0]           XPostBoxE;                          // X after fixing bad NaN box.  Needed for 1-input operations
  logic [P.NE-2:0]             BiasE;                              // Bias of exponent
  logic [P.LOGFLEN-1:0]        NfE;                                // Number of fractional bits
  // Fma Signals
  logic                        FmaAddSubE;                         // Multiply by 1.0 when adding or subtracting
@ -150,7 +153,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  logic [P.XLEN-1:0]           FIntResE;                           // FPU to IEU E-stage result (classify, compare, move)
  logic [P.FLEN-1:0]           PostProcResM;                       // Postprocessor output
  logic [4:0]                  PostProcFlgM;                       // Postprocessor flags
-  logic                        PreNVE, PreNVM;                     // selected flag that is ready in the memory stage     
+  logic                        PreNVE, PreNVM;                     // selected invalid flag that is ready in the memory stage     
  logic                        PreNXE, PreNXM;                     // selected inexact flag that is ready in the memory stage     
  logic [P.FLEN-1:0]           FpResM, FpResW;                     // FPU preliminary result
  logic [P.FLEN-1:0]           PreFpResE, PreFpResM;               // selected result that is ready in the memory stage
  logic [P.FLEN-1:0]           FResultW;                           // final FP result being written to the FP register   
@ -162,9 +166,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  logic                        StallUnpackedM;                     // Stall unpacker outputs during multicycle fdivsqrt
  logic [P.FLEN-1:0]           SgnExtXE;                           // Sign-extended X input for move to integer
  logic                        mvsgn;                              // sign bit for extending move
-  logic [P.FLEN-1:0]           FliResE;                            // Zfa Floating-point load immediate value
+  logic [P.FLEN-1:0]           ZfaResE;                            // Result of Zfa fli or fround instruction
-  logic [P.FLEN-1:0]           FRoundE;                            // Zfa fround output
+  logic                        FRoundNVE, FRoundNXE;               // Zfa fround invalid and inexact flags
  logic [4:0]                  FRoundFlagsE;                       // Zfa fround flags
  //////////////////////////////////////////////////////////////////////////////////////////
  // Decode Stage: fctrl decoder, read register file
@ -174,7 +177,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  fctrl #(P) fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
              .IntDivE, .InstrD,
              .StallE, .StallM, .StallW, .FlushE, .FlushM, .FlushW, .FRM_REGW, .STATUS_FS, .FDivBusyE,
-              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .ZfaE, .ZfaM, .FrmM, .FmtE, .FmtM,
+              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .ZfaE, .ZfaM, .ZfaFRoundNXE, .FrmE, .FrmM, .FmtE, .FmtM,
              .FDivStartE, .IDivStartE, .FWriteIntE, .FCvtIntE, .FWriteIntM, .OpCtrlE, .OpCtrlM, .FpLoadStoreM,
              .IllegalFPUInstrD, .XEnD, .YEnD, .ZEnD, .XEnE, .YEnE, .ZEnE,
              .FResSelE, .FResSelM, .FResSelW, .FPUActiveE, .PostProcSelE, .PostProcSelM, .FCvtIntW, 
@ -237,7 +240,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
    .XNaN(XNaNE), .YNaN(YNaNE), .ZNaN(ZNaNE), .XSNaN(XSNaNE), .XEn(XEnE), 
    .YSNaN(YSNaNE), .ZSNaN(ZSNaNE), .XSubnorm(XSubnormE), 
    .XZero(XZeroE), .YZero(YZeroE), .ZZero(ZZeroE), .XInf(XInfE), .YInf(YInfE), 
-    .ZEn(ZEnE), .ZInf(ZInfE), .XExpMax(XExpMaxE), .XPostBox(XPostBoxE));
+    .ZEn(ZEnE), .ZInf(ZInfE), .XExpMax(XExpMaxE), .XPostBox(XPostBoxE), .Bias(BiasE), .Nf(NfE));
  // fused multiply add: fadd/sub, fmul, fmadd/fnmadd/fmsub/fnmsub
  fma #(P) fma (.Xs(XsE), .Ys(YsE), .Zs(ZsE), .Xe(XeE), .Ye(YeE), .Ze(ZeE), .Xm(XmE), .Ym(YmE), .Zm(ZmE), 
@ -246,7 +249,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  // divide and square root: fdiv, fsqrt, optionally integer division
  fdivsqrt #(P) fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
-    .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .FDivStartE, .IDivStartE, .XsE,
+    .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .BiasE, .NfE, .FDivStartE, .IDivStartE, .XsE,
    .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .IntDivE, .W64E,
    .StallM, .FlushE, .DivStickyM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .UeM, 
    .UmM, .FIntDivResultM);
@ -270,23 +273,26 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
    .ResSubnormUf(CvtResSubnormUfE), .Cs(CsE), .IntZero(IntZeroE), .LzcIn(CvtLzcInE));
  // ZFA: fround and floating-point load immediate fli
-  if (P.ZFA_SUPPORTED) begin
+  if (P.ZFA_SUPPORTED) begin:Zfa
    logic [4:0] Rs1E;
    logic [1:0] Fmt2E; // Two-bit format field from instruction
    logic [P.FLEN-1:0]           FRoundE;                            // Zfa fround output
    logic [P.FLEN-1:0]           FliResE;                            // Zfa Floating-point load immediate value
    // fround
-    fround #(P) fround(.Xs(XsE), .Xe(XeE), .Xm(XmE), 
+    fround #(P) fround(.X(XE), .Xs(XsE), .Xe(XeE), .Xm(XmE), 
-                       .XNaN(XNaNE), .XSNaN(XSNaNE), .XZero(XZeroE), .Fmt(FmtE), 
+                       .XNaN(XNaNE), .XSNaN(XSNaNE), .XZero(XZeroE), .Fmt(FmtE), .Frm(FrmE), .Nf(NfE), 
-                       .FRound(FRoundE), .FRoundFlags(FRoundFlagsE));
+                       .ZfaFRoundNX(ZfaFRoundNXE),
                       .FRound(FRoundE), .FRoundNV(FRoundNVE), .FRoundNX(FRoundNXE));
    // fli
    flopenrc #(5) Rs1EReg(clk, reset, FlushE, ~StallE, InstrD[19:15], Rs1E);
    flopenrc #(2) Fmt2EReg(clk, reset, FlushE, ~StallE, InstrD[26:25], Fmt2E);
    fli #(P) fli(.Rs1(Rs1E), .Fmt(Fmt2E), .Imm(FliResE)); 
    mux2  #(P.FLEN) ZfaResMux(FRoundE, FliResE, OpCtrlE[0], ZfaResE);
  end else begin
-    assign FRoundE = '0; 
+    assign {FRoundNXE, FRoundNVE} = '0;
-    assign FRoundFlagsE = '0;
+    assign ZfaResE = 'x;
    assign FliResE = '0;
  end
  // fmv.*.x: NaN Box SrcA to extend integer to requested FP size 
@ -311,8 +317,9 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  else assign IntSrcE = PreIntSrcE;
  // select a result that may be written to the FP register
-  mux4  #(P.FLEN) FResMux(SgnResE, IntSrcE, CmpFpResE, FliResE, {OpCtrlE[2], &OpCtrlE[1:0]}, PreFpResE);
+  mux4  #(P.FLEN) FResMux(SgnResE, IntSrcE, CmpFpResE, ZfaResE, {OpCtrlE[2], &OpCtrlE[1:0] | (OpCtrlE == 3'b100) & ZfaE}, PreFpResE);
-  assign PreNVE = CmpNVE&(OpCtrlE[2]|FWriteIntE);
+  assign PreNVE = CmpNVE&(OpCtrlE[2]|FWriteIntE) | FRoundNVE & (OpCtrlE == 3'b100) & ZfaE;
  assign PreNXE = FRoundNXE & (OpCtrlE == 3'b100);
  // fmv.x.*: select the result that may be written to the integer register
  if(P.FPSIZES == 1) begin
@ -350,7 +357,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
  flopenr #(13) EMFpReg5 (clk, reset, ~StallUnpackedM, 
    {XsE, YsE, XZeroE, YZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
    {XsM, YsM, XZeroM, YZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
-  flopenrc #(1)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, PreNVE, PreNVM);      
+  flopenrc #(2)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, {PreNVE, PreNXE}, {PreNVM, PreNXM});      
  flopenrc #(3*P.NF+4) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM);
  flopenrc #($clog2(3*P.NF+5)+7+P.NE) EMRegFma4(clk, reset, FlushM, ~StallM,
    {FmaAStickyE, InvAE, SCntE, AsE, PsE, SsE, SeE},
@ -373,8 +380,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
    .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));
  // FPU flag selection - to privileged
-  //mux2  #(5)       FPUFlgMux({PreNVM&~FResSelM[1], 4'b0}, PostProcFlgM, ~FResSelM[1]&FResSelM[0], SetFflagsM);
+  mux2  #(5)       FPUFlgMux({PreNVM, 3'b0, PreNXM}, PostProcFlgM, (FResSelM == 2'b01), SetFflagsM);
  mux2  #(5)       FPUFlgMux({PreNVM, 4'b0}, PostProcFlgM, (FResSelM == 2'b01), SetFflagsM);
  mux2  #(P.FLEN)  FPUResMux(PreFpResM, PostProcResM, FResSelM[0], FpResM);
  // M/W pipe registers
--- a/src/fpu/fround.sv
+++ b/src/fpu/fround.sv
@ -28,60 +28,34 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 module fround import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.FLEN-1:0]       X,            // input before unpacking
  input  logic                    Xs,           // input's sign
  input  logic [P.NE-1:0]         Xe,           // input's exponent
-  input  logic [P.NF:0]           Xm,           // input's fraction
+  input  logic [P.NF:0]           Xm,           // input's fraction with leading integer bit (U1.NF)
  input  logic                    XNaN,         // X is NaN
  input  logic                    XSNaN,        // X is Signalling NaN
  input  logic                    XZero,        // X is Zero
  input  logic [P.FMTBITS-1:0]    Fmt,          // the input's precision (11=quad 01=double 00=single 10=half)
  input  logic [2:0]              Frm,          // rounding mode
  input  logic [P.LOGFLEN-1:0]    Nf,           // Number of fractional bits in selected format
  input  logic                    ZfaFRoundNX,  // froundnx instruction can set inexact flag
  output logic [P.FLEN-1:0]       FRound,       // Rounded result
-  output logic [4:0]              FRoundFlags   // Rounder flags
+  output logic                    FRoundNV,     // fround invalid
  output logic                    FRoundNX      // fround inexact
 );
-  logic [P.NE-2:0] Bias;
+  logic [P.NE-1:0] E, Xep1, EminusNf;
-  logic [P.NE-1:0] E;
+  logic [P.NF:0] IMask, Tmasknonneg, Tmaskneg, Tmask, HotE, HotEP1, Trunc, Rnd;
-  logic [P.NF:0] Imask, Tmasknonneg, Tmaskneg, Tmask, HotE, HotEP1, Trunc, Rnd;
+  logic [P.FLEN-1:0] W, PackedW;
-  logic Lnonneg, Lp, Rnonneg, Rp, Tp;
+  logic Elt0, Eeqm1, Lnonneg, Lp, Rnonneg, Rp, Tp, RoundUp, Two, EgeNf, Exact;
  //////////////////////////////////////////
  // Determine exponent bias according to the format
  //////////////////////////////////////////
  // *** replicated from fdivsqrt; find a way to share
  if (P.FPSIZES == 1) begin
    assign Bias = (P.NE-1)'(P.BIAS); 
  end else if (P.FPSIZES == 2) begin
    assign Bias = Fmt ? (P.NE-1)'(P.BIAS) : (P.NE-1)'(P.BIAS1); 
  end else if (P.FPSIZES == 3) begin
    always_comb
      case (Fmt)
        P.FMT: Bias  =  (P.NE-1)'(P.BIAS);
        P.FMT1: Bias = (P.NE-1)'(P.BIAS1);
        P.FMT2: Bias = (P.NE-1)'(P.BIAS2);
        default: Bias = 'x;
      endcase
  end else if (P.FPSIZES == 4) begin        
  always_comb
    case (Fmt)
      2'h3: Bias =  (P.NE-1)'(P.Q_BIAS);
      2'h1: Bias =  (P.NE-1)'(P.D_BIAS);
      2'h0: Bias =  (P.NE-1)'(P.S_BIAS);
      2'h2: Bias =  (P.NE-1)'(P.H_BIAS);
    endcase
  end
 /*
  // Unbiased exponent
-  assign E = Xe - Bias;
+  assign E = Xe - P.BIAS[P.NE-1:0];
  assign Xep1 = Xe + 1;
  //////////////////////////////////////////
  // Compute LSB L', rounding bit R' and Sticky bit T'
-  //      if (E < 0)					// negative exponents round to 0 or 1.  
+  //      if (E < 0)					// negative exponents round to 0 or 1.
  //              L' = 0      // LSB = 0
  //              if (E = -1) R' = 1, TMask = 0.1111...111	// if (E = -1) 0.5  X < 1.  Round bit is 1
  //              else R' = 0; TMask = 1.1111...111  	// if (E < -1), X < 0.5.  Round bit is 0
@ -100,19 +74,19 @@ module fround import cvw::*;  #(parameter cvw_t P) (
  //////////////////////////////////////////
  // Check if exponent is negative and -1
-  assign Elt0 = (E < 0);
+  assign Elt0 = E[P.NE-1]; // (E < 0);
-  assign Eeqm1 = (E == -1);
+  assign Eeqm1 = ($signed(E) == -1);
  // Logic for nonnegative mask and rounding bits
-  assign Imask = {1'b1, {P.NF{1'b0}}} >>> E;
+  assign IMask = {1'b1, {P.NF{1'b0}}} >>> E;
  assign Tmasknonneg = ~(IMask >>> 1'b1);
-  assign HotE = IMask & !(IMask << 1'b1);
+  assign HotE = IMask & ~(IMask << 1'b1);
  assign HotEP1 = HotE >> 1'b1;
  assign Lnonneg = |(Xm & HotE);
  assign Rnonneg = |(Xm & HotEP1);
-  assign Trunc = Xm & Imask;
+  assign Trunc = Xm & IMask;
-  assign Rnd = Trunc + HotE;
+  assign {Two, Rnd} = Trunc + HotE; // Two means result is 10.000000 = 2.0
-   
+
  // mux and AND-OR logic to select final rounding bits
  mux2 #(1) Lmux(Lnonneg, 1'b0, Elt0, Lp);
  mux2 #(1) Rmux(Rnonneg, Eeqm1, Elt0, Rp);
@ -120,7 +94,6 @@ module fround import cvw::*;  #(parameter cvw_t P) (
  mux2 #(P.NF+1) Tmaskmux(Tmasknonneg, Tmaskneg, Elt0, Tmask);
  assign Tp = |(Xm & Tmask);
  ///////////////////////////
  // Rounding, flags, special Cases 
  //      Flags = 0						// unless overridden later
@ -144,11 +117,15 @@ module fround import cvw::*;  #(parameter cvw_t P) (
  ///////////////////////////
  // Exact logic
-  assign Exact = (E >= Nf | XZero); // result will be exact; no need to round
+  /* verilator lint_off WIDTH */
  assign EminusNf = E - Nf;
  /* verilator lint_on WIDTH */
  assign EgeNf = ~EminusNf[P.NE-1] & (~E[P.NE-1] | E[P.NE-2:0] == '0); // E >= Nf if MSB of E-Nf is 0 and E was positive 
  assign Exact = (EgeNf | XZero) & ~XNaN; // result will be exact; no need to round
  // Rounding logic: determine whether to round up in magnitude
-  always_comb
+  always_comb begin
-    case (Rm) // *** make sure this includes dynamic
+    case (Frm) // Frm is either specified in the instruction or is the dynamic rounding mode
      3'b000:  RoundUp = Rp & (Lp | Tp);  // RNE
      3'b001:  RoundUp = 0;               // RZ
      3'b010:  RoundUp = Xs & (Rp | Tp);  // RN
@ -157,22 +134,23 @@ module fround import cvw::*;  #(parameter cvw_t P) (
      default: RoundUp = 0;               // should never happen
    endcase
-    // output logic
+    // If result is not exact, select output in unpacked FLEN format initially
-    if (XNaN) W = CanonicalNan; // ***
+    if (XNaN) W = {1'b0, {P.NE{1'b1}}, 1'b1, {(P.NF-1){1'b0}}}; // Canonical NaN
-    else if (Exact) W = X;
+    else if (Elt0) // 0 <= |X| < 1 rounds to 0 or 1
-    else if (Elt0) 
+      if (RoundUp) W = {Xs, P.BIAS[P.NE-1:0], {P.NF{1'b0}}}; // round to +/- 1
-      if (RoundUp) W = {Xs, bias, {P.NF}} // *** format conversions
+      else         W = {Xs, {(P.FLEN-1){1'b0}}}; // round to +/- 0
    else begin // |X| > 1 rounds to an integer
      if (RoundUp & Two) W = {Xs, Xep1, {(P.NF){1'b0}}}; // Round up to 2.0
      else if (RoundUp)  W = {Xs, Xe, Rnd[P.NF-1:0]};      // Round up to Rnd
      else               W = {Xs, Xe, Trunc[P.NF-1:0]};    // Round down to Trunc
    end
  end
-      *** may not need to round to infinity; update docs and pseudocode above
+  packoutput #(P) packoutput(W, Fmt, PackedW); // pack and NaN-box based on selected format.
-
+  mux2 #(P.FLEN) resultmux(PackedW, X, Exact, FRound);
  always_comb
  // Flags
-  assign Invalid = XSNaN;
+  assign FRoundNV = XSNaN;                                        // invalid if input is signaling NaN
-  assign Inexact = FRoundNX & ~(XNaN | Exact) & (Rp | T'); 
+  assign FRoundNX = ZfaFRoundNX & ~(XNaN | Exact) & (Rp | Tp);    // Inexact if Round or Sticky bit set for FRoundNX instruction
 */
  assign FRound = '0;
  assign FRoundFlags = '0;
 endmodule
--- a/src/fpu/packoutput.sv
+++ b/src/fpu/packoutput.sv
@ -0,0 +1,101 @@
 ///////////////////////////////////////////
 // packoutput.sv
 //
 // Written: David_Harris@hmc.edu
 // Modified: 5/11/24
 //
 // Purpose: Pack the output of the FPU
 // 
 // Documentation: RISC-V System on Chip Design Chapter 13
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
 // 
 // Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
 //
 // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
 //
 // Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
 // except in compliance with the License, or, at your option, the Apache License version 2.0. You 
 // may obtain a copy of the License at
 //
 // https://solderpad.org/licenses/SHL-2.1/
 //
 // Unless required by applicable law or agreed to in writing, any work distributed under the 
 // License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
 // either express or implied. See the License for the specific language governing permissions 
 // and limitations under the License.
 ////////////////////////////////////////////////////////////////////////////////////////////////
 module packoutput import cvw::*;  #(parameter cvw_t P) (
  input  logic [P.FLEN-1:0]       Unpacked,
  input  logic [P.FMTBITS-1:0]    Fmt,
  output logic [P.FLEN-1:0]       Packed
 );
  logic             Sign; 
  logic [P.NE1-1:0] Exp1;
  logic [P.NF1-1:0] Fract1;
  logic [P.NE2-1:0] Exp2;
  logic [P.NF2-1:0] Fract2;
  logic [P.H_NE-1:0] Exp3;
  logic [P.H_NF-1:0] Fract3;
  // Pack exponent and fraction, with NaN-boxing to full FLEN
  assign Sign = Unpacked[P.FLEN-1];
  if (P.FPSIZES == 1) begin
    assign Packed = Unpacked;
  end else if (P.FPSIZES == 2) begin
    int NF = P.NF;
    int NE1 = P.NE1;
    int top = P.NF + P.NE1-2;
    int bot = P.NF - P.NF1;
    always_comb
      case (Fmt)
        1'b1: Packed = Unpacked;
        1'b0: begin
                Exp1 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE1-2:P.NF]};
                Fract1 = Unpacked[P.NF-1:P.NF-P.NF1];
                Packed = {{(P.FLEN-P.LEN1){1'b1}}, Sign, Exp1, Fract1}; 
              end
      endcase
  end else if (P.FPSIZES == 3) begin
    always_comb
      case (Fmt)
        P.FMT: Packed = Unpacked;
        P.FMT1: begin
                Exp1 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE1-2:P.NF]};
                Fract1 = Unpacked[P.NF-1:P.NF-P.NF1];
                Packed = {{(P.FLEN-P.LEN1){1'b1}}, Sign, Exp1, Fract1}; 
              end
        P.FMT2: begin
                Exp2 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE2-2:P.NF]};
                Fract2 = Unpacked[P.NF-1:P.NF-P.NF2];
                Packed = {{(P.FLEN-P.LEN2){1'b1}}, Sign, Exp2, Fract2}; 
              end
        default: Packed = 'x;
      endcase
  end else if (P.FPSIZES == 4) begin        
    always_comb
      case (Fmt)
        2'h3: Packed = Unpacked;  // Quad
        2'h1: begin // double
                Exp1 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE1-2:P.NF]};
                Fract1 = Unpacked[P.NF-1:P.NF-P.NF1];
                Packed = {{(P.FLEN-P.LEN1){1'b1}}, Sign, Exp1, Fract1}; 
              end
        2'h0: begin // float
                Exp2 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE2-2:P.NF]};
                Fract2 = Unpacked[P.NF-1:P.NF-P.NF2];
                Packed = {{(P.FLEN-P.LEN2){1'b1}}, Sign, Exp2, Fract2}; 
              end
        2'h2: begin // half
                Exp3 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.H_NE-2:P.NF]};
                Fract3 = Unpacked[P.NF-1:P.NF-P.H_NF];
                Packed = {{(P.FLEN-P.H_LEN){1'b1}}, Sign, Exp3, Fract3}; 
              end
      endcase
  end
 endmodule
--- a/src/fpu/unpack.sv
+++ b/src/fpu/unpack.sv
@ -41,13 +41,15 @@ module unpack import cvw::*;  #(parameter cvw_t P) (
  output logic                    XZero, YZero, ZZero,  // is XYZ zero
  output logic                    XInf, YInf, ZInf,     // is XYZ infinity
  output logic                    XExpMax,              // does X have the maximum exponent (NaN or Inf)
-  output logic [P.FLEN-1:0]       XPostBox              // X after being properly NaN-boxed
+  output logic [P.FLEN-1:0]       XPostBox,             // X after being properly NaN-boxed
  output logic [P.NE-2:0]         Bias,                 // Exponent bias
  output logic [P.LOGFLEN-1:0]    Nf                    // Number of fractional bits
 );
  logic XExpNonZero, YExpNonZero, ZExpNonZero;          // is the exponent of XYZ non-zero
  logic XFracZero, YFracZero, ZFracZero;                // is the fraction zero
  logic YExpMax, ZExpMax;                               // is the exponent all 1s
-  
+
  unpackinput #(P) unpackinputX (.A(X), .Fmt, .Sgn(Xs), .Exp(Xe), .Man(Xm), .En(XEn), .FPUActive,
                          .NaN(XNaN), .SNaN(XSNaN), .ExpNonZero(XExpNonZero),
                          .Zero(XZero), .Inf(XInf), .ExpMax(XExpMax), .FracZero(XFracZero), 
@ -63,4 +65,7 @@ module unpack import cvw::*;  #(parameter cvw_t P) (
                          .Zero(ZZero), .Inf(ZInf), .ExpMax(ZExpMax), .FracZero(ZFracZero), 
                          .Subnorm(), .PostBox());
  // look up bias and fractional bits for the given format
  fmtparams #(P) fmtparams(Fmt, Bias, Nf);
 endmodule
--- a/testbench/tests.vh
+++ b/testbench/tests.vh
@ -1641,7 +1641,7 @@ string imperas32f[] = '{
  string arch64d[] = '{
    `RISCVARCHTEST,
    // for speed
-   "rv64i_m/D/src/fadd.d_b10-01.S",
+    "rv64i_m/D/src/fadd.d_b10-01.S",
    "rv64i_m/D/src/fadd.d_b1-01.S",
    "rv64i_m/D/src/fadd.d_b11-01.S",
    "rv64i_m/D/src/fadd.d_b12-01.S",
@ -2278,6 +2278,7 @@ string arch64zknh[] = '{
  string arch32zfaf[] = '{
    //`RISCVARCHTEST,
    `WALLYTEST,
    "rv32i_m/F_Zfa/src/fround_b1-01.S",
    "rv32i_m/F_Zfa/src/fleq_b1-01.S",
    "rv32i_m/F_Zfa/src/fleq_b19-01.S", 
    "rv32i_m/F_Zfa/src/fli.s-01.S",
@ -2289,12 +2290,12 @@ string arch64zknh[] = '{
    "rv32i_m/F_Zfa/src/fminm_b19-01.S",
    "rv32i_m/F_Zfa/src/fmaxm_b1-01.S",
    "rv32i_m/F_Zfa/src/fmaxm_b19-01.S"
 /*    "rv32i_m/F_Zfa/src/fround_b1-01.S" */
  };
  string arch32zfad[] = '{
    //`RISCVARCHTEST,
    `WALLYTEST,
    "rv32i_m/D_Zfa/src/fround_b1-01.S",
    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b1-01.S",
    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b22-01.S",
    "rv32i_m/D_Zfa/src/fcvtmod.w.d_b23-01.S",
@ -2326,12 +2327,12 @@ string arch64zknh[] = '{
    "rv32i_m/D_Zfa/src/fmvh.x.d_b27-01.S",
    "rv32i_m/D_Zfa/src/fmvh.x.d_b28-01.S",
    "rv32i_m/D_Zfa/src/fmvh.x.d_b29-01.S"
 /*    "rv32i_m/D_Zfa/src/fround_b1-01.S" */
  };
  string arch64zfaf[] = '{
    //`RISCVARCHTEST,
    `WALLYTEST,
    "rv64i_m/F_Zfa/src/fround_b1-01.S",
    "rv64i_m/F_Zfa/src/fleq_b1-01.S",
    "rv64i_m/F_Zfa/src/fleq_b19-01.S", 
    "rv64i_m/F_Zfa/src/fli.s-01.S",
@ -2341,12 +2342,12 @@ string arch64zknh[] = '{
    "rv64i_m/F_Zfa/src/fminm_b19-01.S",
    "rv64i_m/F_Zfa/src/fmaxm_b1-01.S",
    "rv64i_m/F_Zfa/src/fmaxm_b19-01.S"
 /*    "rv64i_m/F_Zfa/src/fround_b1-01.S" */
  };
  string arch64zfad[] = '{
    //`RISCVARCHTEST,
    `WALLYTEST,
     "rv64i_m/D_Zfa/src/fround_b1-01.S",
    "rv64i_m/D_Zfa/src/fcvtmod.w.d_b1-01.S",
    "rv64i_m/D_Zfa/src/fcvtmod.w.d_b22-01.S", 
    "rv64i_m/D_Zfa/src/fcvtmod.w.d_b23-01.S",
@ -2363,7 +2364,7 @@ string arch64zknh[] = '{
    "rv64i_m/D_Zfa/src/fminm_b19-01.S",
    "rv64i_m/D_Zfa/src/fmaxm_b1-01.S",
    "rv64i_m/D_Zfa/src/fmaxm_b19-01.S"
-/*     "rv64i_m/D_Zfa/src/fround_b1-01.S" */
+
  };
  string arch32d_fma[] = '{