diff --git a/bin/regression-wally b/bin/regression-wally
index fbfc6eece..4e72fae66 100755
--- a/bin/regression-wally
+++ b/bin/regression-wally
@@ -125,19 +125,19 @@ derivconfigtests = [
         ["div_4_2_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
         ["div_4_2i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
         ["div_4_4_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
-        ["div_4_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]],
+        ["div_4_4i_rv64gc", ["arch64f_divsqrt", "arch64d_divsqrt", "arch64m"]], 
 
         # fpu permutations
-        ["f_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma"]],
-        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfh", "arch32zfh_divsqrt"]],
-        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt"]],
-        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32i"]],
-        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32i"]],
-        ["f_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma"]],
-        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfh", "arch64zfh_divsqrt"]], 
-        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt"]],
-        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64i"]],
-        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64i", "wally64q"]],
+        ["f_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfaf"]],
+        ["fh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32zfaf"]],
+        ["fdh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32zfaf", "arch32zfad"]],
+        ["fdq_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32i", "arch32zfaf", "arch32zfad"]],
+        ["fdqh_rv32gc", ["arch32f", "arch32f_divsqrt", "arch32f_fma", "arch32d", "arch32d_divsqrt", "arch32d_fma", "arch32zfh", "arch32zfh_divsqrt", "arch32i", "arch32zfaf", "arch32zfad"]],
+        ["f_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfaf"]],
+        ["fh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64zfaf"]], 
+        ["fdh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64zfaf", "arch64zfad"]],
+        ["fdq_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64i", "arch64zfaf", "arch64zfad"]],
+        ["fdqh_rv64gc", ["arch64f", "arch64f_divsqrt", "arch64f_fma", "arch64d", "arch64d_divsqrt", "arch64d_fma", "arch64zfh", "arch64zfh_divsqrt", "arch64i", "wally64q", "arch64zfaf", "arch64zfad"]],
     ]
 
 bpredtests = [
diff --git a/config/shared/config-shared.vh b/config/shared/config-shared.vh
index 5cf6049c8..bf42b9506 100644
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@@ -75,6 +75,7 @@ localparam NE   = Q_SUPPORTED ? Q_NE   : D_SUPPORTED ? D_NE   : S_NE;
 localparam NF   = Q_SUPPORTED ? Q_NF   : D_SUPPORTED ? D_NF   : S_NF;
 localparam FMT  = Q_SUPPORTED ? 2'd3   : D_SUPPORTED ? 2'd1   : 2'd0;
 localparam BIAS = Q_SUPPORTED ? Q_BIAS : D_SUPPORTED ? D_BIAS : S_BIAS;
+localparam LOGFLEN = $clog2(FLEN);
 
 // Floating point constants needed for FPU paramerterization
 // LEN1/NE1/NF1/FNT1 is the size of the second longest supported format
@@ -124,7 +125,8 @@ localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1));
 //     because NORMSHIFTSZ becomes limited by convert rather than divider
 //     Figure out why extra two bits are needed for convert (and only in testbench_fp, not Wally)
 //     Might be a testbench_fp issue
-localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1+2), (DIVb + 1 + NF + 1)), (3*NF+6));
+// Previous definition, with 2 extra bits on the convert term: localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1+2), (DIVb + 1 + NF + 1)), (3*NF+6));
+localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (3*NF+6));
 
 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));                  // log_2(NORMSHIFTSZ)
 localparam CORRSHIFTSZ = NORMSHIFTSZ-2;                             // Drop leading 2 integer bits
diff --git a/config/shared/parameter-defs.vh b/config/shared/parameter-defs.vh
index 1aa6da5d3..96440490c 100644
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@@ -173,6 +173,7 @@ localparam cvw_t P = '{
   H_BIAS : H_BIAS,
   H_FMT : H_FMT,
   FLEN : FLEN,
+  LOGFLEN : LOGFLEN,
   NE   : NE  ,
   NF   : NF  ,
   FMT  : FMT ,
diff --git a/src/cvw.sv b/src/cvw.sv
index cba95c0fa..1f8e0a1c1 100644
--- a/src/cvw.sv
+++ b/src/cvw.sv
@@ -260,7 +260,8 @@ typedef struct packed {
   logic [1:0] H_FMT;
 
 // Floating point length FLEN and number of exponent (NE) and fraction (NF) bits
-  int FLEN;
+  int         FLEN;
+  int         LOGFLEN;
   int         NE  ;
   int         NF  ;
   logic [1:0] FMT ;
diff --git a/src/fpu/fctrl.sv b/src/fpu/fctrl.sv
index e9efc8e76..2d456aeee 100755
--- a/src/fpu/fctrl.sv
+++ b/src/fpu/fctrl.sv
@@ -48,7 +48,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   output logic                 XEnE, YEnE, ZEnE,                   // enable inputs
   // operation mux selections                                    
   output logic                 FCvtIntE, FCvtIntW,                 // convert to integer operation
-  output logic [2:0]           FrmM,                               // FP rounding mode
+  output logic [2:0]           FrmE, FrmM,                          // FP rounding mode
   output logic [P.FMTBITS-1:0] FmtE, FmtM,                         // FP format
   output logic [2:0]           OpCtrlE, OpCtrlM,                   // Select which operation to do in each component
   output logic                 FpLoadStoreM,                       // FP load or store instruction
@@ -56,6 +56,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   output logic [1:0]           FResSelE, FResSelM, FResSelW,       // Select one of the results that finish in the memory stage
   output logic                 FPUActiveE,                         // FP instruction being executed
   output logic                 ZfaE, ZfaM,                         // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod)
+  output logic                 ZfaFRoundNXE,                       // Zfa froundnx instruction
   // register control signals
   output logic                 FRegWriteE, FRegWriteM, FRegWriteW, // FP register write enable
   output logic                 FWriteIntE, FWriteIntM,             // Write to integer register
@@ -66,7 +67,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   output logic                 FDivStartE, IDivStartE              // Start division or squareroot
   );
 
-  `define FCTRLW 13
+  `define FCTRLW 14
 
   logic [`FCTRLW-1:0]          ControlsD;                          // control signals
   logic                        FRegWriteD;                         // FP register write enable
@@ -75,13 +76,14 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   logic [2:0]                  OpCtrlD;                            // Select which operation to do in each component
   logic [1:0]                  PostProcSelD;                       // select result in the post processing unit
   logic [1:0]                  FResSelD;                           // Select one of the results that finish in the memory stage
-  logic [2:0]                  FrmD, FrmE;                         // FP rounding mode
+  logic [2:0]                  FrmD;                               // FP rounding mode
   logic [P.FMTBITS-1:0]        FmtD;                               // FP format
   logic [1:0]                  Fmt, Fmt2;                          // format - before possible reduction
   logic                        SupportedFmt;                       // is the format supported
   logic                        SupportedFmt2;                      // is the source format supported for fp -> fp
   logic                        FCvtIntD, FCvtIntM;                 // convert to integer operation
   logic                        ZfaD;                               // Zfa variants of instructions
+  logic                        ZfaFRoundNXD;                       // Zfa froundnx instruction
 
   // FPU Instruction Decoder
   assign Fmt = Funct7D[1:0];
@@ -93,156 +95,156 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
                          (Fmt2 == 2'b10 & P.ZFH_SUPPORTED) | (Fmt2 == 2'b11 & P.Q_SUPPORTED));
 
   // decode the instruction                       
-  // FRegWrite_FWriteInt_FResSel_PostProcSel_FOpCtrl_FDivStart_IllegalFPUInstr_FCvtInt_Zfa
+  // FRegWrite_FWriteInt_FResSel_PostProcSel_FOpCtrl_FDivStart_IllegalFPUInstr_FCvtInt_Zfa_FroundNX
   always_comb
     if (STATUS_FS == 2'b00) // FPU instructions are illegal when FPU is disabled
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0;
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0_0;
     else if (OpD != 7'b0000111 & OpD != 7'b0100111 & ~SupportedFmt) 
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0; // for anything other than loads and stores, check for supported format
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0_0; // for anything other than loads and stores, check for supported format
     else begin 
-      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0; // default: non-implemented instruction
+      ControlsD = `FCTRLW'b0_0_00_00_000_0_1_0_0_0; // default: non-implemented instruction
       /* verilator lint_off CASEINCOMPLETE */   // default value above has priority so no other default needed
       case(OpD)
         7'b0000111: case(Funct3D)
-                      3'b010:                       ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flw
-                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // fld
-                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flq
-                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0; // flh
+                      3'b010:                       ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0_0; // flw
+                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0_0; // fld
+                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0_0; // flq
+                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b1_0_10_00_0xx_0_0_0_0_0; // flh
                     endcase
         7'b0100111: case(Funct3D)
-                      3'b010:                       ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsw
-                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsd
-                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsq
-                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0; // fsh
+                      3'b010:                       ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0_0; // fsw
+                      3'b011:  if (P.D_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0_0; // fsd
+                      3'b100:  if (P.Q_SUPPORTED)   ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0_0; // fsq
+                      3'b001:  if (P.ZFH_SUPPORTED) ControlsD = `FCTRLW'b0_0_10_00_0xx_0_0_0_0_0; // fsh
                     endcase
-        7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0_0_0; // fmadd
-        7'b1000111:   ControlsD = `FCTRLW'b1_0_01_10_001_0_0_0_0; // fmsub
-        7'b1001011:   ControlsD = `FCTRLW'b1_0_01_10_010_0_0_0_0; // fnmsub
-        7'b1001111:   ControlsD = `FCTRLW'b1_0_01_10_011_0_0_0_0; // fnmadd
+        7'b1000011:   ControlsD = `FCTRLW'b1_0_01_10_000_0_0_0_0_0; // fmadd
+        7'b1000111:   ControlsD = `FCTRLW'b1_0_01_10_001_0_0_0_0_0; // fmsub
+        7'b1001011:   ControlsD = `FCTRLW'b1_0_01_10_010_0_0_0_0_0; // fnmsub
+        7'b1001111:   ControlsD = `FCTRLW'b1_0_01_10_011_0_0_0_0_0; // fnmadd
         7'b1010011: casez(Funct7D)
-                      7'b00000??: ControlsD = `FCTRLW'b1_0_01_10_110_0_0_0_0; // fadd
-                      7'b00001??: ControlsD = `FCTRLW'b1_0_01_10_111_0_0_0_0; // fsub
-                      7'b00010??: ControlsD = `FCTRLW'b1_0_01_10_100_0_0_0_0; // fmul
-                      7'b00011??: ControlsD = `FCTRLW'b1_0_01_01_xx0_1_0_0_0; // fdiv
-                      7'b01011??: if (Rs2D == 5'b0000) ControlsD = `FCTRLW'b1_0_01_01_xx1_1_0_0_0; // fsqrt
+                      7'b00000??: ControlsD = `FCTRLW'b1_0_01_10_110_0_0_0_0_0; // fadd
+                      7'b00001??: ControlsD = `FCTRLW'b1_0_01_10_111_0_0_0_0_0; // fsub
+                      7'b00010??: ControlsD = `FCTRLW'b1_0_01_10_100_0_0_0_0_0; // fmul
+                      7'b00011??: ControlsD = `FCTRLW'b1_0_01_01_xx0_1_0_0_0_0; // fdiv
+                      7'b01011??: if (Rs2D == 5'b0000) ControlsD = `FCTRLW'b1_0_01_01_xx1_1_0_0_0_0; // fsqrt
                       7'b00100??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_000_0_0_0_0; // fsgnj
-                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_001_0_0_0_0; // fsgnjn
-                                    3'b010:  ControlsD = `FCTRLW'b1_0_00_00_010_0_0_0_0; // fsgnjx
+                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_000_0_0_0_0_0; // fsgnj
+                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_001_0_0_0_0_0; // fsgnjn
+                                    3'b010:  ControlsD = `FCTRLW'b1_0_00_00_010_0_0_0_0_0; // fsgnjx
                                   endcase
                       7'b00101??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_0; // fmin
-                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_0; // fmax
-                                    3'b010:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_1; // fminm  (Zfa)
-                                    3'b011:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_1; // fmaxm  (Zfa)
+                                    3'b000:  ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_0_0; // fmin
+                                    3'b001:  ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_0_0; // fmax
+                                    3'b010:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_110_0_0_0_1_0; // fminm  (Zfa)
+                                    3'b011:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b1_0_00_00_101_0_0_0_1_0; // fmaxm  (Zfa)
                                   endcase
                       7'b10100??: case(Funct3D)
-                                    3'b000:  ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_0; // fle
-                                    3'b001:  ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_0; // flt
-                                    3'b010:  ControlsD = `FCTRLW'b0_1_00_00_010_0_0_0_0; // feq
-                                    3'b100:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_1; // fleq  (Zfa)
-                                    3'b101:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_1; // fltq  (Zfa)
+                                    3'b000:  ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_0_0; // fle
+                                    3'b001:  ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_0_0; // flt
+                                    3'b010:  ControlsD = `FCTRLW'b0_1_00_00_010_0_0_0_0_0; // feq
+                                    3'b100:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_011_0_0_0_1_0; // fleq  (Zfa)
+                                    3'b101:  if (P.ZFA_SUPPORTED) ControlsD = `FCTRLW'b0_1_00_00_001_0_0_0_1_0; // fltq  (Zfa)
                                   endcase
                       7'b11100??: if (Funct3D == 3'b001 & Rs2D == 5'b00000)          
-                                                ControlsD = `FCTRLW'b0_1_10_00_000_0_0_0_0; // fclass
+                                                ControlsD = `FCTRLW'b0_1_10_00_000_0_0_0_0_0; // fclass
                                   else if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
-                                                ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_0; // fmv.x.w/d/h/q  fp to int register
+                                                ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_0_0; // fmv.x.w/d/h/q  fp to int register
                                   else if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct7D[1:0] == 2'b01 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
-                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.d  (Zfa) 
+                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1_0; // fmvh.x.d  (Zfa) 
                                   //  Q not supported in RV64GC
                                   // coverage off   
                                   else if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct7D[1:0] == 2'b11 & Funct3D == 3'b000 & Rs2D == 5'b00001) 
-                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1; // fmvh.x.q  (Zfa)
+                                                  ControlsD = `FCTRLW'b0_1_11_00_000_0_0_0_1_0; // fmvh.x.q  (Zfa)
                                   // coverage on
                       7'b11110??: if (Funct3D == 3'b000 & Rs2D == 5'b00000) 
-                                                ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0_0; // fmv.w/d/h/q.x  int to fp reg
+                                                ControlsD = `FCTRLW'b1_0_00_00_011_0_0_0_0_0; // fmv.w/d/h/q.x  int to fp reg
                                   else if (P.ZFA_SUPPORTED & Funct3D == 3'b000 & Rs2D == 5'b00001) 
-                                                ControlsD = `FCTRLW'b1_0_00_00_111_0_0_0_1; // fli  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_111_0_0_0_1_0; // fli  (Zfa)
                       7'b0100000: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b00)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_0; // fcvt.s.(d/q/h)
+                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_0_0; // fcvt.s.(d/q/h)
                                   else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.s  (Zfa) *** needs ctrl for all rounds
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // fround.s  (Zfa)
                                   else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.s  (Zfa) *** needs ctrl for all rounds
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_1; // froundnx.s  (Zfa): FroundNX bit set to distinguish from fround
                       7'b0100001: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b01)
-                                                ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0_0; // fcvt.d.(s/h/q)
+                                                ControlsD = `FCTRLW'b1_0_01_00_001_0_0_0_0_0; // fcvt.d.(s/h/q)
                                   else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.d  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // fround.d  (Zfa)
                                   else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.d  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_1; // froundnx.d  (Zfa): FroundNX bit set to distinguish from fround
                       7'b0100010: if (Rs2D[4:2] == 3'b000 & SupportedFmt2 & Rs2D[1:0] != 2'b10)
-                                                ControlsD = `FCTRLW'b1_0_01_00_010_0_0_0_0; // fcvt.h.(s/d/q)
+                                                ControlsD = `FCTRLW'b1_0_01_00_010_0_0_0_0_0; // fcvt.h.(s/d/q)
                                   else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.h  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // fround.h  (Zfa)
                                   else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.h  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_1; // froundnx.h  (Zfa): FroundNX bit set to distinguish from fround
                       // coverage off
                       // Not covered in testing because rv64gc does not support quad precision
                       7'b0100011: if (Rs2D[4:2] == 3'b000  & SupportedFmt2 & Rs2D[1:0] != 2'b11)
-                                                ControlsD = `FCTRLW'b1_0_01_00_011_0_0_0_0; // fcvt.q.(s/h/d)
+                                                ControlsD = `FCTRLW'b1_0_01_00_011_0_0_0_0_0; // fcvt.q.(s/h/d)
                                   else if (Rs2D == 5'b00100 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // fround.q  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_0; // fround.q  (Zfa)
                                   else if (Rs2D == 5'b00101 & P.ZFA_SUPPORTED)
-                                                ControlsD = `FCTRLW'b1_0_01_00_000_0_0_0_1; // froundnx.q  (Zfa)
+                                                ControlsD = `FCTRLW'b1_0_00_00_100_0_0_0_1_1; // froundnx.q  (Zfa): FroundNX bit set to distinguish from fround
                       // coverage on
                       7'b1101000: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.s.w   w->s
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.s.wu wu->s
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.s.l   l->s
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.s.lu lu->s
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fcvt.s.w   w->s
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0_0; // fcvt.s.wu wu->s
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0_0; // fcvt.s.l   l->s
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0_0; // fcvt.s.lu lu->s
                                   endcase
                       7'b1100000: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.s   s->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.s  s->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.s   s->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.s  s->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0_0; // fcvt.w.s   s->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0_0; // fcvt.wu.s  s->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0_0; // fcvt.l.s   s->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0_0; // fcvt.lu.s  s->lu
                                   endcase
                       7'b1101001: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.d.w   w->d
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.d.wu wu->d
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.d.l   l->d
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.d.lu lu->d
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fcvt.d.w   w->d
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0_0; // fcvt.d.wu wu->d
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0_0; // fcvt.d.l   l->d
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0_0; // fcvt.d.lu lu->d
                                   endcase
                       7'b1100001: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.d   d->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.d  d->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.d   d->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.d  d->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0_0; // fcvt.w.d   d->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0_0; // fcvt.wu.d  d->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0_0; // fcvt.l.d   d->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0_0; // fcvt.lu.d  d->lu
                                     5'b01000: if (P.ZFA_SUPPORTED & P.D_SUPPORTED & Funct3D == 3'b001) 
-                                                 ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_1; // fcvtmod.w.d (Zfa)
+                                                 ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_1_0; // fcvtmod.w.d (Zfa)
                                   endcase
                       7'b1101010: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.h.w   w->h
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.h.wu wu->h
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.h.l   l->h
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.h.lu lu->h
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fcvt.h.w   w->h
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0_0; // fcvt.h.wu wu->h
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0_0; // fcvt.h.l   l->h
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0_0; // fcvt.h.lu lu->h
                                   endcase
                       7'b1100010: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.h   h->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.h  h->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.h   h->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.h  h->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0_0; // fcvt.w.h   h->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0_0; // fcvt.wu.h  h->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0_0; // fcvt.l.h   h->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0_0; // fcvt.lu.h  h->lu
                                   endcase
                       // Not covered in testing because rv64gc does not support quad precision
                       // coverage off
                       7'b1101011: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fcvt.q.w   w->q
-                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0; // fcvt.q.wu wu->q
-                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0; // fcvt.q.l   l->q
-                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0; // fcvt.q.lu lu->q
+                                    5'b00000:    ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fcvt.q.w   w->q
+                                    5'b00001:    ControlsD = `FCTRLW'b1_0_01_00_100_0_0_0_0_0; // fcvt.q.wu wu->q
+                                    5'b00010:    ControlsD = `FCTRLW'b1_0_01_00_111_0_0_0_0_0; // fcvt.q.l   l->q
+                                    5'b00011:    ControlsD = `FCTRLW'b1_0_01_00_110_0_0_0_0_0; // fcvt.q.lu lu->q
                                   endcase
                       7'b1100011: case(Rs2D)
-                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0; // fcvt.w.q   q->w
-                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0; // fcvt.wu.q  q->wu
-                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0; // fcvt.l.q   q->l
-                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0; // fcvt.lu.q  q->lu
+                                    5'b00000:    ControlsD = `FCTRLW'b0_1_01_00_001_0_0_1_0_0; // fcvt.w.q   q->w
+                                    5'b00001:    ControlsD = `FCTRLW'b0_1_01_00_000_0_0_1_0_0; // fcvt.wu.q  q->wu
+                                    5'b00010:    ControlsD = `FCTRLW'b0_1_01_00_011_0_0_1_0_0; // fcvt.l.q   q->l
+                                    5'b00011:    ControlsD = `FCTRLW'b0_1_01_00_010_0_0_1_0_0; // fcvt.lu.q  q->lu
                                   endcase
                       // coverage off
                       // Not covered in testing because rv64gc is not RV64Q or RV32D
                       7'b1011001: if (P.ZFA_SUPPORTED & P.XLEN == 32 & P.D_SUPPORTED & Funct3D == 3'b000) 
-                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fmvp.d.x  (Zfa) *** untested, controls could be wrong
+                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fmvp.d.x  (Zfa) *** untested, controls could be wrong
                       7'b1011011: if (P.ZFA_SUPPORTED & P.XLEN == 64 & P.Q_SUPPORTED & Funct3D == 3'b000) 
-                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0; // fmvp.q.x  (Zfa)
+                                                  ControlsD = `FCTRLW'b1_0_01_00_101_0_0_0_0_0; // fmvp.q.x  (Zfa)
                       // coverage on
                    endcase
       endcase
@@ -250,7 +252,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
     /* verilator lint_on CASEINCOMPLETE */
 
   // unswizzle control bits
-  assign {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, FDivStartD, IllegalFPUInstrD, FCvtIntD, ZfaD} = ControlsD;
+  assign {FRegWriteD, FWriteIntD, FResSelD, PostProcSelD, OpCtrlD, FDivStartD, IllegalFPUInstrD, FCvtIntD, ZfaD, ZfaFRoundNXD} = ControlsD;
   
   // rounding modes:
   //    000 - round to nearest, ties to even
@@ -259,7 +261,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   //    011 - round up - round twords positive infinity
   //    100 - round to nearest, ties to max magnitude - round to nearest, ties away from zero
   //    111 - dynamic - choose FRM_REGW as rounding mode
-  assign FrmD = &Funct3D ? FRM_REGW : Funct3D;
+  assign FrmD = (Funct3D == 3'b111) ? FRM_REGW : Funct3D;
 
   // Precision
   //    00 - single
@@ -269,7 +271,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   
     if (P.FPSIZES == 1)
       assign FmtD = 1'b0;
-    else if (P.FPSIZES == 2)begin
+    else if (P.FPSIZES == 2) begin
       logic [1:0] FmtTmp;
       assign FmtTmp = ((Funct7D[6:3] == 4'b0100)&OpD[4]) ? Rs2D[1:0] : (~OpD[6]&(&OpD[2:0])) ? {~Funct3D[1], ~(Funct3D[1]^Funct3D[0])} : Funct7D[1:0];
       assign FmtD = (P.FMT == FmtTmp);
@@ -313,6 +315,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   //        011 - mv to fp        01
   //        110 - min             10
   //        101 - max             10
+  //        100 - fround          11
   //        111 - fli             11
 
   //  OpCtrl:
@@ -350,9 +353,9 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   assign Adr3D = InstrD[31:27];
  
   // D/E pipleine register
-  flopenrc #(15+P.FMTBITS) DECtrlReg3(clk, reset, FlushE, ~StallE, 
-              {FRegWriteD, PostProcSelD, FResSelD, FrmD, FmtD, OpCtrlD, FWriteIntD, FCvtIntD, ZfaD, ~IllegalFPUInstrD},
-              {FRegWriteE, PostProcSelE, FResSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE, FPUActiveE});
+  flopenrc #(`FCTRLW+2+P.FMTBITS) DECtrlReg3(clk, reset, FlushE, ~StallE, 
+              {FRegWriteD, PostProcSelD, FResSelD, FrmD, FmtD, OpCtrlD, FWriteIntD, FCvtIntD, ZfaD, ZfaFRoundNXD, ~IllegalFPUInstrD},
+              {FRegWriteE, PostProcSelE, FResSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE, ZfaFRoundNXE, FPUActiveE});
   flopenrc #(15) DEAdrReg(clk, reset, FlushE, ~StallE, {Adr1D, Adr2D, Adr3D}, {Adr1E, Adr2E, Adr3E});
   flopenrc #(1) DEFDivStartReg(clk, reset, FlushE, ~StallE|FDivBusyE, FDivStartD, FDivStartE);
   flopenrc #(3) DEEnReg(clk, reset, FlushE, ~StallE, {XEnD, YEnD, ZEnD}, {XEnE, YEnE, ZEnE});
@@ -365,7 +368,7 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   flopenrc #(14+int'(P.FMTBITS)) EMCtrlReg (clk, reset, FlushM, ~StallM,
               {FRegWriteE, FResSelE, PostProcSelE, FrmE, FmtE, OpCtrlE, FWriteIntE, FCvtIntE, ZfaE},
               {FRegWriteM, FResSelM, PostProcSelM, FrmM, FmtM, OpCtrlM, FWriteIntM, FCvtIntM, ZfaM});
-  
+
   // renameing for readability
   assign FpLoadStoreM = FResSelM[1];
 
@@ -373,5 +376,5 @@ module fctrl import cvw::*;  #(parameter cvw_t P) (
   flopenrc #(4)  MWCtrlReg(clk, reset, FlushW, ~StallW,
           {FRegWriteM, FResSelM, FCvtIntM},
           {FRegWriteW, FResSelW, FCvtIntW});
- 
+
 endmodule
diff --git a/src/fpu/fdivsqrt/fdivsqrt.sv b/src/fpu/fdivsqrt/fdivsqrt.sv
index 1d44cef5d..dba69267a 100644
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@@ -37,6 +37,8 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
   input  logic                 XInfE, YInfE, 
   input  logic                 XZeroE, YZeroE, 
   input  logic                 XNaNE, YNaNE, 
+  input  logic [P.NE-2:0]      BiasE,                               // Bias of exponent
+  input  logic [P.LOGFLEN-1:0] NfE,                          // Number of fractional bits in selected format
   input  logic                 FDivStartE, IDivStartE,
   input  logic                 StallM,
   input  logic                 FlushE,
@@ -75,7 +77,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
 
   fdivsqrtpreproc #(P) fdivsqrtpreproc(                          // Preprocessor
     .clk, .IFDivStartE, .Xm(XmE), .Ym(YmE), .Xe(XeE), .Ye(YeE),
-    .FmtE, .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
+    .FmtE, .Bias(BiasE), .Nf(NfE), .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
     // Int-specific 
     .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
     .BZeroM, .IntNormShiftM, .AM, 
diff --git a/src/fpu/fdivsqrt/fdivsqrtcycles.sv b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
index 72fe04249..9e2489eb3 100644
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@@ -29,39 +29,14 @@
 
 module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.FMTBITS-1:0] FmtE,
+  input  logic [P.LOGFLEN-1:0] Nf,          // Number of fractional bits in selected format
   input  logic                 SqrtE,
   input  logic                 IntDivE,
   input  logic [P.DIVBLEN-1:0] IntResultBitsE,    
   output logic [P.DURLEN-1:0]  CyclesE
 );
 
-  logic [P.DIVBLEN-1:0] Nf, FPResultBitsE, ResultBitsE; // number of fractional (result) bits
-
-  /* verilator lint_off WIDTH */
-  if (P.FPSIZES == 1)
-    assign Nf = P.NF;
-  else if (P.FPSIZES == 2)
-    always_comb
-      case (FmtE)
-        1'b0: Nf = P.NF1;
-        1'b1: Nf = P.NF;
-      endcase
-  else if (P.FPSIZES == 3)
-    always_comb
-      case (FmtE)
-        P.FMT:   Nf = P.NF;
-        P.FMT1:  Nf = P.NF1;
-        P.FMT2:  Nf = P.NF2; 
-        default: Nf = 'x; // shouldn't happen
-      endcase
-  else if (P.FPSIZES == 4)  
-    always_comb
-      case(FmtE)
-        P.S_FMT: Nf = P.S_NF;
-        P.D_FMT: Nf = P.D_NF;
-        P.H_FMT: Nf = P.H_NF;
-        P.Q_FMT: Nf = P.Q_NF;
-      endcase 
+  logic [P.DIVBLEN-1:0] FPResultBitsE, ResultBitsE; // number of fractional (result) bits
 
   // Cycle logic
   // P.DIVCOPIES = k. P.LOGR = log(R) = r.  P.RK = rk.  
@@ -70,6 +45,7 @@ module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
   // FP Sqrt needs at least Nf fractional bits and 2 guard/round bits.  The integer bit is always initialized to 1 and does not need a cycle.
   // The datapath produces rk bits per cycle, so Cycles = ceil (ResultBitsE / rk)
 
+  /* verilator lint_off WIDTH */
   always_comb begin 
     FPResultBitsE = Nf + 2 + P.LOGR; // Nf + two fractional bits for round/guard; integer bit implicit because starting at n=1
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
index a20d1871d..03d144263 100644
--- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@@ -28,49 +28,21 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module fdivsqrtexpcalc import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.FMTBITS-1:0] Fmt,
+  input  logic [P.NE-2:0]      Bias,      // Bias of exponent
   input  logic [P.NE-1:0]      Xe, Ye,    // input exponents
   input  logic                 Sqrt,
   input  logic [P.DIVBLEN-1:0] ell, m,    // number of leading 0s in Xe and Ye
   output logic [P.NE+1:0]      Ue         // result exponent
   );
-  
-  logic [P.NE-2:0] Bias;
+
   logic [P.NE+1:0] SXExp;
   logic [P.NE+1:0] SExp;
   logic [P.NE+1:0] DExp;
 
-  // Determine exponent bias according to the format
-  
-  if (P.FPSIZES == 1) begin
-    assign Bias = (P.NE-1)'(P.BIAS); 
-
-  end else if (P.FPSIZES == 2) begin
-    assign Bias = Fmt ? (P.NE-1)'(P.BIAS) : (P.NE-1)'(P.BIAS1); 
-
-  end else if (P.FPSIZES == 3) begin
-    always_comb
-      case (Fmt)
-        P.FMT: Bias  =  (P.NE-1)'(P.BIAS);
-        P.FMT1: Bias = (P.NE-1)'(P.BIAS1);
-        P.FMT2: Bias = (P.NE-1)'(P.BIAS2);
-        default: Bias = 'x;
-      endcase
-
-  end else if (P.FPSIZES == 4) begin        
-  always_comb
-    case (Fmt)
-      2'h3: Bias =  (P.NE-1)'(P.Q_BIAS);
-      2'h1: Bias =  (P.NE-1)'(P.D_BIAS);
-      2'h0: Bias =  (P.NE-1)'(P.S_BIAS);
-      2'h2: Bias =  (P.NE-1)'(P.H_BIAS);
-    endcase
-  end
-
   // Square root exponent = (Xe - l - bias) / 2 + bias; l accounts for subnorms
   assign SXExp = {2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - (P.NE+2)'(P.BIAS);
   assign SExp  = {SXExp[P.NE+1], SXExp[P.NE+1:1]} + {2'b0, Bias};
-  
+
   // division exponent = (Xe-l) - (Ye-m) + bias; l and m account for subnorms
   assign DExp  = ({2'b0, Xe} - {{(P.NE+1-P.DIVBLEN){1'b0}}, ell} - {2'b0, Ye} + {{(P.NE+1-P.DIVBLEN){1'b0}}, m} + {3'b0, Bias}); 
 
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 45d50dac3..802ac92dc 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -33,6 +33,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   input  logic [P.NF:0]        Xm, Ym,      // Floating-point significands
   input  logic [P.NE-1:0]      Xe, Ye,      // Floating-point exponents
   input  logic [P.FMTBITS-1:0] FmtE,
+  input  logic [P.NE-2:0]      Bias,                               // Bias of exponent
+  input  logic [P.LOGFLEN-1:0] Nf,          // Number of fractional bits in selected format
   input  logic                 SqrtE,
   input  logic                 XZeroE,
   input  logic [2:0]           Funct3E,
@@ -209,11 +211,11 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
   flopen #(P.DIVb+4) dreg(clk, IFDivStartE, {3'b000, Dnorm}, D);
  
   // Floating-point exponent
-  fdivsqrtexpcalc #(P) expcalc(.Fmt(FmtE), .Xe, .Ye, .Sqrt(SqrtE), .ell, .m(mE), .Ue(UeE));
+  fdivsqrtexpcalc #(P) expcalc(.Bias, .Xe, .Ye, .Sqrt(SqrtE), .ell, .m(mE), .Ue(UeE));
   flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);
 
   // Number of FSM cycles (to FSM)
-  fdivsqrtcycles #(P) cyclecalc(.FmtE, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
+  fdivsqrtcycles #(P) cyclecalc(.FmtE, .Nf, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
 
   if (P.IDIV_ON_FPU) begin:intpipelineregs
     logic [P.DIVBLEN-1:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
diff --git a/src/fpu/fmtparams.sv b/src/fpu/fmtparams.sv
new file mode 100644
index 000000000..d83dfd782
--- /dev/null
+++ b/src/fpu/fmtparams.sv
@@ -0,0 +1,86 @@
+
+///////////////////////////////////////////
+// fmtparams.sv
+//
+// Written: David_Harris@hmc.edu
+// Modified: 5/11/24
+//
+// Purpose: Look up bias of exponent and number of fractional bits for the selected format
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module fmtparams import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.FMTBITS-1:0] Fmt,   // format select for the current operation
+  output logic [P.NE-2:0]      Bias,  // exponent bias of the selected format, sized to NE-1 bits
+  output logic [P.LOGFLEN-1:0] Nf     // number of fractional bits in the selected format
+);
+  // Bias lookup: the generate-if picks one implementation according to P.FPSIZES (compared against 1..4)
+  if (P.FPSIZES == 1) begin
+    assign Bias = (P.NE-1)'(P.BIAS); // Fmt is not consulted in this configuration; bias is the constant P.BIAS
+  end else if (P.FPSIZES == 2) begin
+    assign Bias = Fmt ? (P.NE-1)'(P.BIAS) : (P.NE-1)'(P.BIAS1); // nonzero Fmt selects P.BIAS, zero selects P.BIAS1
+  end else if (P.FPSIZES == 3) begin
+    always_comb
+      case (Fmt) // match Fmt against the three configured format encodings
+        P.FMT:  Bias =  (P.NE-1)'(P.BIAS);
+        P.FMT1: Bias = (P.NE-1)'(P.BIAS1);
+        P.FMT2: Bias = (P.NE-1)'(P.BIAS2);
+        default: Bias = 'x; // any other encoding drives x
+      endcase
+  end else if (P.FPSIZES == 4) begin        
+    always_comb
+      case (Fmt) // literal encodings here; presumably equal to P.Q_FMT/P.D_FMT/P.S_FMT/P.H_FMT used in the Nf lookup below -- confirm
+        2'h3: Bias =  (P.NE-1)'(P.Q_BIAS); // quad
+        2'h1: Bias =  (P.NE-1)'(P.D_BIAS); // double
+        2'h0: Bias =  (P.NE-1)'(P.S_BIAS); // single
+        2'h2: Bias =  (P.NE-1)'(P.H_BIAS); // half
+      endcase
+  end
+  // Nf lookup: same P.FPSIZES structure as the Bias lookup; WIDTH lint is disabled around it (presumably because the P.*NF parameters are wider than Nf -- confirm)
+  /* verilator lint_off WIDTH */
+  if (P.FPSIZES == 1)
+    assign Nf = P.NF; // Fmt is not consulted in this configuration
+  else if (P.FPSIZES == 2)
+    always_comb
+      case (Fmt) // same pairing as the Bias lookup: 1 -> P.NF, 0 -> P.NF1
+        1'b0: Nf = P.NF1;
+        1'b1: Nf = P.NF;
+      endcase
+  else if (P.FPSIZES == 3)
+    always_comb
+      case (Fmt)
+        P.FMT:   Nf = P.NF;
+        P.FMT1:  Nf = P.NF1;
+        P.FMT2:  Nf = P.NF2; 
+        default: Nf = 'x; // shouldn't happen
+      endcase
+  else if (P.FPSIZES == 4)  
+    always_comb
+      case(Fmt) // symbolic format encodings from the configuration struct
+        P.S_FMT: Nf = P.S_NF;
+        P.D_FMT: Nf = P.D_NF;
+        P.H_FMT: Nf = P.H_NF;
+        P.Q_FMT: Nf = P.Q_NF;
+      endcase 
+  /* verilator lint_on WIDTH */
+
+endmodule
diff --git a/src/fpu/fpu.sv b/src/fpu/fpu.sv
index 22c650ed8..dc3d353fb 100755
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@@ -70,7 +70,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
 
   // control signals
   logic                        FRegWriteW;                         // FP register write enable
-  logic [2:0]                  FrmM;                               // FP rounding mode
+  logic [2:0]                  FrmE, FrmM;                         // FP rounding mode
   logic [P.FMTBITS-1:0]        FmtE, FmtM;                         // FP precision 0-single 1-double
   logic                        FDivStartE, IDivStartE;             // Start division or squareroot
   logic                        FWriteIntM;                         // Write to integer register
@@ -85,6 +85,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic                        FRegWriteE;                         // Write floating-point register
   logic                        FPUActiveE;                         // FP instruction being executed
   logic                        ZfaE, ZfaM;                         // Zfa variants of instructions (fli, fminm, fmaxm, fround, froundnx, fleq, fltq, fmvh, fmvp, fcvtmod.w.d)
+  logic                        ZfaFRoundNXE;                       // Zfa froundnx variant
 
   // regfile signals
   logic [P.FLEN-1:0]           FRD1D, FRD2D, FRD3D;                // Read Data from FP register - decode stage
@@ -112,6 +113,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic                        XInfM, YInfM, ZInfM;                // is the input infinity - memory stage
   logic                        XExpMaxE;                           // is the exponent all ones (max value)
   logic [P.FLEN-1:0]           XPostBoxE;                          // X after fixing bad NaN box.  Needed for 1-input operations
+  logic [P.NE-2:0]             BiasE;                              // Bias of exponent
+  logic [P.LOGFLEN-1:0]        NfE;                                // Number of fractional bits
 
   // Fma Signals
   logic                        FmaAddSubE;                         // Multiply by 1.0 when adding or subtracting
@@ -150,7 +153,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic [P.XLEN-1:0]           FIntResE;                           // FPU to IEU E-stage result (classify, compare, move)
   logic [P.FLEN-1:0]           PostProcResM;                       // Postprocessor output
   logic [4:0]                  PostProcFlgM;                       // Postprocessor flags
-  logic                        PreNVE, PreNVM;                     // selected flag that is ready in the memory stage     
+  logic                        PreNVE, PreNVM;                     // selected invalid flag that is ready in the memory stage     
+  logic                        PreNXE, PreNXM;                     // selected inexact flag that is ready in the memory stage     
   logic [P.FLEN-1:0]           FpResM, FpResW;                     // FPU preliminary result
   logic [P.FLEN-1:0]           PreFpResE, PreFpResM;               // selected result that is ready in the memory stage
   logic [P.FLEN-1:0]           FResultW;                           // final FP result being written to the FP register   
@@ -162,9 +166,8 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   logic                        StallUnpackedM;                     // Stall unpacker outputs during multicycle fdivsqrt
   logic [P.FLEN-1:0]           SgnExtXE;                           // Sign-extended X input for move to integer
   logic                        mvsgn;                              // sign bit for extending move
-  logic [P.FLEN-1:0]           FliResE;                            // Zfa Floating-point load immediate value
-  logic [P.FLEN-1:0]           FRoundE;                            // Zfa fround output
-  logic [4:0]                  FRoundFlagsE;                       // Zfa fround flags
+  logic [P.FLEN-1:0]           ZfaResE;                            // Result of Zfa fli or fround instruction
+  logic                        FRoundNVE, FRoundNXE;               // Zfa fround invalid and inexact flags
 
   //////////////////////////////////////////////////////////////////////////////////////////
   // Decode Stage: fctrl decoder, read register file
@@ -174,7 +177,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   fctrl #(P) fctrl (.Funct7D(InstrD[31:25]), .OpD(InstrD[6:0]), .Rs2D(InstrD[24:20]), .Funct3D(InstrD[14:12]), 
               .IntDivE, .InstrD,
               .StallE, .StallM, .StallW, .FlushE, .FlushM, .FlushW, .FRM_REGW, .STATUS_FS, .FDivBusyE,
-              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .ZfaE, .ZfaM, .FrmM, .FmtE, .FmtM,
+              .reset, .clk, .FRegWriteE, .FRegWriteM, .FRegWriteW, .ZfaE, .ZfaM, .ZfaFRoundNXE, .FrmE, .FrmM, .FmtE, .FmtM,
               .FDivStartE, .IDivStartE, .FWriteIntE, .FCvtIntE, .FWriteIntM, .OpCtrlE, .OpCtrlM, .FpLoadStoreM,
               .IllegalFPUInstrD, .XEnD, .YEnD, .ZEnD, .XEnE, .YEnE, .ZEnE,
               .FResSelE, .FResSelM, .FResSelW, .FPUActiveE, .PostProcSelE, .PostProcSelM, .FCvtIntW, 
@@ -237,7 +240,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
     .XNaN(XNaNE), .YNaN(YNaNE), .ZNaN(ZNaNE), .XSNaN(XSNaNE), .XEn(XEnE), 
     .YSNaN(YSNaNE), .ZSNaN(ZSNaNE), .XSubnorm(XSubnormE), 
     .XZero(XZeroE), .YZero(YZeroE), .ZZero(ZZeroE), .XInf(XInfE), .YInf(YInfE), 
-    .ZEn(ZEnE), .ZInf(ZInfE), .XExpMax(XExpMaxE), .XPostBox(XPostBoxE));
+    .ZEn(ZEnE), .ZInf(ZInfE), .XExpMax(XExpMaxE), .XPostBox(XPostBoxE), .Bias(BiasE), .Nf(NfE));
   
   // fused multiply add: fadd/sub, fmul, fmadd/fnmadd/fmsub/fnmsub
   fma #(P) fma (.Xs(XsE), .Ys(YsE), .Zs(ZsE), .Xe(XeE), .Ye(YeE), .Ze(ZeE), .Xm(XmE), .Ym(YmE), .Zm(ZmE), 
@@ -246,7 +249,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
 
   // divide and square root: fdiv, fsqrt, optionally integer division
   fdivsqrt #(P) fdivsqrt(.clk, .reset, .FmtE, .XmE, .YmE, .XeE, .YeE, .SqrtE(OpCtrlE[0]), .SqrtM(OpCtrlM[0]),
-    .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .FDivStartE, .IDivStartE, .XsE,
+    .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, .BiasE, .NfE, .FDivStartE, .IDivStartE, .XsE,
     .ForwardedSrcAE, .ForwardedSrcBE, .Funct3E, .Funct3M, .IntDivE, .W64E,
     .StallM, .FlushE, .DivStickyM, .FDivBusyE, .IFDivStartE, .FDivDoneE, .UeM, 
     .UmM, .FIntDivResultM);
@@ -270,23 +273,26 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
     .ResSubnormUf(CvtResSubnormUfE), .Cs(CsE), .IntZero(IntZeroE), .LzcIn(CvtLzcInE));
 
   // ZFA: fround and floating-point load immediate fli
-  if (P.ZFA_SUPPORTED) begin
+  if (P.ZFA_SUPPORTED) begin:Zfa
     logic [4:0] Rs1E;
     logic [1:0] Fmt2E; // Two-bit format field from instruction
+    logic [P.FLEN-1:0]           FRoundE;                            // Zfa fround output
+    logic [P.FLEN-1:0]           FliResE;                            // Zfa Floating-point load immediate value
 
     // fround
-    fround #(P) fround(.Xs(XsE), .Xe(XeE), .Xm(XmE), 
-                       .XNaN(XNaNE), .XSNaN(XSNaNE), .XZero(XZeroE), .Fmt(FmtE), 
-                       .FRound(FRoundE), .FRoundFlags(FRoundFlagsE));
+    fround #(P) fround(.X(XE), .Xs(XsE), .Xe(XeE), .Xm(XmE), 
+                       .XNaN(XNaNE), .XSNaN(XSNaNE), .XZero(XZeroE), .Fmt(FmtE), .Frm(FrmE), .Nf(NfE), 
+                       .ZfaFRoundNX(ZfaFRoundNXE),
+                       .FRound(FRoundE), .FRoundNV(FRoundNVE), .FRoundNX(FRoundNXE));
 
     // fli
     flopenrc #(5) Rs1EReg(clk, reset, FlushE, ~StallE, InstrD[19:15], Rs1E);
     flopenrc #(2) Fmt2EReg(clk, reset, FlushE, ~StallE, InstrD[26:25], Fmt2E);
     fli #(P) fli(.Rs1(Rs1E), .Fmt(Fmt2E), .Imm(FliResE)); 
+    mux2  #(P.FLEN) ZfaResMux(FRoundE, FliResE, OpCtrlE[0], ZfaResE);
   end else begin
-    assign FRoundE = '0; 
-    assign FRoundFlagsE = '0;
-    assign FliResE = '0;
+    assign {FRoundNXE, FRoundNVE} = '0;
+    assign ZfaResE = 'x;
   end
 
   // fmv.*.x: NaN Box SrcA to extend integer to requested FP size 
@@ -311,8 +317,9 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   else assign IntSrcE = PreIntSrcE;
 
   // select a result that may be written to the FP register
-  mux4  #(P.FLEN) FResMux(SgnResE, IntSrcE, CmpFpResE, FliResE, {OpCtrlE[2], &OpCtrlE[1:0]}, PreFpResE);
-  assign PreNVE = CmpNVE&(OpCtrlE[2]|FWriteIntE);
+  mux4  #(P.FLEN) FResMux(SgnResE, IntSrcE, CmpFpResE, ZfaResE, {OpCtrlE[2], &OpCtrlE[1:0] | (OpCtrlE == 3'b100) & ZfaE}, PreFpResE);
+  assign PreNVE = CmpNVE&(OpCtrlE[2]|FWriteIntE) | FRoundNVE & (OpCtrlE == 3'b100) & ZfaE;
+  assign PreNXE = FRoundNXE & (OpCtrlE == 3'b100);
 
   // fmv.x.*: select the result that may be written to the integer register
   if(P.FPSIZES == 1) begin
@@ -350,7 +357,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
   flopenr #(13) EMFpReg5 (clk, reset, ~StallUnpackedM, 
     {XsE, YsE, XZeroE, YZeroE, XInfE, YInfE, ZInfE, XNaNE, YNaNE, ZNaNE, XSNaNE, YSNaNE, ZSNaNE},
     {XsM, YsM, XZeroM, YZeroM, XInfM, YInfM, ZInfM, XNaNM, YNaNM, ZNaNM, XSNaNM, YSNaNM, ZSNaNM});     
-  flopenrc #(1)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, PreNVE, PreNVM);      
+  flopenrc #(2)  EMRegCmpFlg (clk, reset, FlushM, ~StallM, {PreNVE, PreNXE}, {PreNVM, PreNXM});      
   flopenrc #(3*P.NF+4) EMRegFma2(clk, reset, FlushM, ~StallM, SmE, SmM);
   flopenrc #($clog2(3*P.NF+5)+7+P.NE) EMRegFma4(clk, reset, FlushM, ~StallM,
     {FmaAStickyE, InvAE, SCntE, AsE, PsE, SsE, SeE},
@@ -373,8 +380,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
     .PostProcSel(PostProcSelM), .PostProcRes(PostProcResM), .PostProcFlg(PostProcFlgM), .FCvtIntRes(FCvtIntResM));
 
   // FPU flag selection - to privileged
-  //mux2  #(5)       FPUFlgMux({PreNVM&~FResSelM[1], 4'b0}, PostProcFlgM, ~FResSelM[1]&FResSelM[0], SetFflagsM);
-  mux2  #(5)       FPUFlgMux({PreNVM, 4'b0}, PostProcFlgM, (FResSelM == 2'b01), SetFflagsM);
+  mux2  #(5)       FPUFlgMux({PreNVM, 3'b0, PreNXM}, PostProcFlgM, (FResSelM == 2'b01), SetFflagsM);
   mux2  #(P.FLEN)  FPUResMux(PreFpResM, PostProcResM, FResSelM[0], FpResM);
 
   // M/W pipe registers
diff --git a/src/fpu/fround.sv b/src/fpu/fround.sv
index 180f99605..fb4911253 100644
--- a/src/fpu/fround.sv
+++ b/src/fpu/fround.sv
@@ -28,60 +28,34 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////
 
 module fround import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.FLEN-1:0]       X,            // input before unpacking
   input  logic                    Xs,           // input's sign
   input  logic [P.NE-1:0]         Xe,           // input's exponent
-  input  logic [P.NF:0]           Xm,           // input's fraction
+  input  logic [P.NF:0]           Xm,           // input's fraction with leading integer bit (U1.NF)
   input  logic                    XNaN,         // X is NaN
   input  logic                    XSNaN,        // X is Signalling NaN
   input  logic                    XZero,        // X is Zero
   input  logic [P.FMTBITS-1:0]    Fmt,          // the input's precision (11=quad 01=double 00=single 10=half)
+  input  logic [2:0]              Frm,          // rounding mode
+  input  logic [P.LOGFLEN-1:0]    Nf,           // Number of fractional bits in selected format
+  input  logic                    ZfaFRoundNX,  // froundnx instruction can set inexact flag
   output logic [P.FLEN-1:0]       FRound,       // Rounded result
-  output logic [4:0]              FRoundFlags   // Rounder flags
+  output logic                    FRoundNV,     // fround invalid
+  output logic                    FRoundNX      // fround inexact
 );
 
-  logic [P.NE-2:0] Bias;
-  logic [P.NE-1:0] E;
-  logic [P.NF:0] Imask, Tmasknonneg, Tmaskneg, Tmask, HotE, HotEP1, Trunc, Rnd;
-  logic Lnonneg, Lp, Rnonneg, Rp, Tp;
-
-  //////////////////////////////////////////
-  // Determine exponent bias according to the format
-  //////////////////////////////////////////
-  // *** replicated from fdivsqrt; find a way to share
-  
-  if (P.FPSIZES == 1) begin
-    assign Bias = (P.NE-1)'(P.BIAS); 
-
-  end else if (P.FPSIZES == 2) begin
-    assign Bias = Fmt ? (P.NE-1)'(P.BIAS) : (P.NE-1)'(P.BIAS1); 
-
-  end else if (P.FPSIZES == 3) begin
-    always_comb
-      case (Fmt)
-        P.FMT: Bias  =  (P.NE-1)'(P.BIAS);
-        P.FMT1: Bias = (P.NE-1)'(P.BIAS1);
-        P.FMT2: Bias = (P.NE-1)'(P.BIAS2);
-        default: Bias = 'x;
-      endcase
-
-  end else if (P.FPSIZES == 4) begin        
-  always_comb
-    case (Fmt)
-      2'h3: Bias =  (P.NE-1)'(P.Q_BIAS);
-      2'h1: Bias =  (P.NE-1)'(P.D_BIAS);
-      2'h0: Bias =  (P.NE-1)'(P.S_BIAS);
-      2'h2: Bias =  (P.NE-1)'(P.H_BIAS);
-    endcase
-  end
-
-/*
+  logic [P.NE-1:0] E, Xep1, EminusNf;
+  logic [P.NF:0] IMask, Tmasknonneg, Tmaskneg, Tmask, HotE, HotEP1, Trunc, Rnd;
+  logic [P.FLEN-1:0] W, PackedW;
+  logic Elt0, Eeqm1, Lnonneg, Lp, Rnonneg, Rp, Tp, RoundUp, Two, EgeNf, Exact;
 
   // Unbiased exponent
-  assign E = Xe - Bias;
+  assign E = Xe - P.BIAS[P.NE-1:0];
+  assign Xep1 = Xe + 1;
 
   //////////////////////////////////////////
   // Compute LSB L', rounding bit R' and Sticky bit T'
-  //      if (E < 0)					// negative exponents round to 0 or 1.  
+  //      if (E < 0)					// negative exponents round to 0 or 1.
   //              L' = 0      // LSB = 0
   //              if (E = -1) R' = 1, TMask = 0.1111...111	// if (E = -1) 0.5  X < 1.  Round bit is 1
   //              else R' = 0; TMask = 1.1111...111  	// if (E < -1), X < 0.5.  Round bit is 0
@@ -100,19 +74,19 @@ module fround import cvw::*;  #(parameter cvw_t P) (
   //////////////////////////////////////////
 
   // Check if exponent is negative and -1
-  assign Elt0 = (E < 0);
-  assign Eeqm1 = (E == -1);
+  assign Elt0 = E[P.NE-1]; // (E < 0);
+  assign Eeqm1 = ($signed(E) == -1);
 
   // Logic for nonnegative mask and rounding bits
-  assign Imask = {1'b1, {P.NF{1'b0}}} >>> E;
+  assign IMask = ~({1'b0, {P.NF{1'b1}}} >> E); // ones over the integer bit and top E fraction bits; logical shift used because >>> zero-fills an unsigned concatenation
-  assign Tmasknonneg = ~(IMask >>> 1'b1);
+  assign Tmasknonneg = ~IMask >> 1'b1; // sticky mask = bits below the round bit; must never include the integer bit
-  assign HotE = IMask & !(IMask << 1'b1);
+  assign HotE = IMask & ~(IMask << 1'b1);
   assign HotEP1 = HotE >> 1'b1;
   assign Lnonneg = |(Xm & HotE);
   assign Rnonneg = |(Xm & HotEP1);
-  assign Trunc = Xm & Imask;
-  assign Rnd = Trunc + HotE;
-   
+  assign Trunc = Xm & IMask;
+  assign {Two, Rnd} = Trunc + HotE; // Two means result is 10.000000 = 2.0
+
   // mux and AND-OR logic to select final rounding bits
   mux2 #(1) Lmux(Lnonneg, 1'b0, Elt0, Lp);
   mux2 #(1) Rmux(Rnonneg, Eeqm1, Elt0, Rp);
@@ -120,7 +94,6 @@ module fround import cvw::*;  #(parameter cvw_t P) (
   mux2 #(P.NF+1) Tmaskmux(Tmasknonneg, Tmaskneg, Elt0, Tmask);
   assign Tp = |(Xm & Tmask);
 
-
   ///////////////////////////
   // Rounding, flags, special Cases 
   //      Flags = 0						// unless overridden later
@@ -144,11 +117,15 @@ module fround import cvw::*;  #(parameter cvw_t P) (
   ///////////////////////////
 
   // Exact logic
-  assign Exact = (E >= Nf | XZero); // result will be exact; no need to round
+  /* verilator lint_off WIDTH */
+  assign EminusNf = E - Nf;
+  /* verilator lint_on WIDTH */
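+  assign EgeNf = ~EminusNf[P.NE-1] & (~E[P.NE-1] | E[P.NE-2:0] == '0); // E >= Nf if MSB of E-Nf is 0 and E is nonnegative; E == 100...0 is also accepted (presumably the wrapped exponent of an all-ones Xe, i.e. Inf/NaN -- confirm)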
+  assign Exact = (EgeNf | XZero) & ~XNaN; // result will be exact; no need to round
 
   // Rounding logic: determine whether to round up in magnitude
-  always_comb
-    case (Rm) // *** make sure this includes dynamic
+  always_comb begin
+    case (Frm) // Frm is either specified in the instruction or is the dynamic rounding mode
       3'b000:  RoundUp = Rp & (Lp | Tp);  // RNE
       3'b001:  RoundUp = 0;               // RZ
       3'b010:  RoundUp = Xs & (Rp | Tp);  // RN
@@ -157,22 +134,23 @@ module fround import cvw::*;  #(parameter cvw_t P) (
       default: RoundUp = 0;               // should never happen
     endcase
 
-    // output logic
-    if (XNaN) W = CanonicalNan; // ***
-    else if (Exact) W = X;
-    else if (Elt0) 
-      if (RoundUp) W = {Xs, bias, {P.NF}} // *** format conversions
+    // If result is not exact, select output in unpacked FLEN format initially
+    if (XNaN) W = {1'b0, {P.NE{1'b1}}, 1'b1, {(P.NF-1){1'b0}}}; // Canonical NaN
+    else if (Elt0) // 0 <= |X| < 1 rounds to 0 or 1
+      if (RoundUp) W = {Xs, P.BIAS[P.NE-1:0], {P.NF{1'b0}}}; // round to +/- 1
+      else         W = {Xs, {(P.FLEN-1){1'b0}}}; // round to +/- 0
+    else begin // |X| >= 1 rounds to an integer
+      if (RoundUp & Two) W = {Xs, Xep1, {(P.NF){1'b0}}}; // Round up to 2.0
+      else if (RoundUp)  W = {Xs, Xe, Rnd[P.NF-1:0]};      // Round up to Rnd
+      else               W = {Xs, Xe, Trunc[P.NF-1:0]};    // Round down to Trunc
+    end
+  end
 
-      *** may not need to round to infinity; update docs and pseudocode above
-
-  always_comb
+  packoutput #(P) packoutput(W, Fmt, PackedW); // pack and NaN-box based on selected format.
+  mux2 #(P.FLEN) resultmux(PackedW, X, Exact, FRound);
 
   // Flags
-  assign Invalid = XSNaN;
-  assign Inexact = FRoundNX & ~(XNaN | Exact) & (Rp | T'); 
- */
-
-  assign FRound = '0;
-  assign FRoundFlags = '0;
+  assign FRoundNV = XSNaN;                                        // invalid if input is signaling NaN
+  assign FRoundNX = ZfaFRoundNX & ~(XNaN | Exact) & (Rp | Tp);    // Inexact if Round or Sticky bit set for FRoundNX instruction
 
 endmodule
diff --git a/src/fpu/packoutput.sv b/src/fpu/packoutput.sv
new file mode 100644
index 000000000..332c0ed52
--- /dev/null
+++ b/src/fpu/packoutput.sv
@@ -0,0 +1,101 @@
+
+///////////////////////////////////////////
+// packoutput.sv
+//
+// Written: David_Harris@hmc.edu
+// Modified: 5/11/24
+//
+// Purpose: Pack an unpacked FLEN-wide floating-point value into the format selected by Fmt, NaN-boxed to FLEN
+// 
+// Documentation: RISC-V System on Chip Design Chapter 13
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+module packoutput import cvw::*;  #(parameter cvw_t P) (
+  input  logic [P.FLEN-1:0]       Unpacked,  // value laid out as {sign, NE-bit exponent, NF-bit fraction}
+  input  logic [P.FMTBITS-1:0]    Fmt,       // selects the format to pack into
+  output logic [P.FLEN-1:0]       Packed     // packed value; unused upper bits are filled with 1s (NaN-boxed)
+);
+
+  logic              Sign;    // sign bit, copied unchanged into every format
+  logic [P.NE1-1:0]  Exp1;    // exponent narrowed to NE1 bits: exponent MSB followed by its low NE1-1 bits
+  logic [P.NF1-1:0]  Fract1;  // most significant NF1 bits of the fraction
+  logic [P.NE2-1:0]  Exp2;    // same narrowing for the NE2/NF2 format
+  logic [P.NF2-1:0]  Fract2;
+  logic [P.H_NE-1:0] Exp3;    // same narrowing for the half-precision (H_NE/H_NF) format
+  logic [P.H_NF-1:0] Fract3;
+  // Pack exponent and fraction, with NaN-boxing to full FLEN; only the branch matching P.FPSIZES is elaborated
+  assign Sign = Unpacked[P.FLEN-1];
+  if (P.FPSIZES == 1) begin
+    assign Packed = Unpacked; // single supported format: nothing to narrow or box
+  end else if (P.FPSIZES == 2) begin
+    always_comb begin
+      {Exp1, Fract1} = '0; // defaults so arms that do not use them cannot infer latches
+      case (Fmt)
+        1'b1: Packed = Unpacked;
+        1'b0: begin
+                Exp1 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE1-2:P.NF]};
+                Fract1 = Unpacked[P.NF-1:P.NF-P.NF1];
+                Packed = {{(P.FLEN-P.LEN1){1'b1}}, Sign, Exp1, Fract1};
+              end
+      endcase
+    end
+  end else if (P.FPSIZES == 3) begin
+    always_comb begin
+      {Exp1, Fract1, Exp2, Fract2} = '0; // defaults so arms that do not use them cannot infer latches
+      case (Fmt)
+        P.FMT: Packed = Unpacked;
+        P.FMT1: begin
+                Exp1 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE1-2:P.NF]};
+                Fract1 = Unpacked[P.NF-1:P.NF-P.NF1];
+                Packed = {{(P.FLEN-P.LEN1){1'b1}}, Sign, Exp1, Fract1};
+              end
+        P.FMT2: begin
+                Exp2 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE2-2:P.NF]};
+                Fract2 = Unpacked[P.NF-1:P.NF-P.NF2];
+                Packed = {{(P.FLEN-P.LEN2){1'b1}}, Sign, Exp2, Fract2};
+              end
+        default: Packed = 'x; // Fmt encoding not used in this configuration
+      endcase
+    end
+  end else if (P.FPSIZES == 4) begin
+    always_comb begin
+      {Exp1, Fract1, Exp2, Fract2, Exp3, Fract3} = '0; // defaults so arms that do not use them cannot infer latches
+      case (Fmt)
+        2'h3: Packed = Unpacked;  // Quad
+        2'h1: begin // double
+                Exp1 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE1-2:P.NF]};
+                Fract1 = Unpacked[P.NF-1:P.NF-P.NF1];
+                Packed = {{(P.FLEN-P.LEN1){1'b1}}, Sign, Exp1, Fract1};
+              end
+        2'h0: begin // float
+                Exp2 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.NE2-2:P.NF]};
+                Fract2 = Unpacked[P.NF-1:P.NF-P.NF2];
+                Packed = {{(P.FLEN-P.LEN2){1'b1}}, Sign, Exp2, Fract2};
+              end
+        2'h2: begin // half
+                Exp3 = {Unpacked[P.FLEN-2], Unpacked[P.NF+P.H_NE-2:P.NF]};
+                Fract3 = Unpacked[P.NF-1:P.NF-P.H_NF];
+                Packed = {{(P.FLEN-P.H_LEN){1'b1}}, Sign, Exp3, Fract3};
+              end
+      endcase
+    end
+  end
+endmodule
\ No newline at end of file
diff --git a/src/fpu/unpack.sv b/src/fpu/unpack.sv
index eab224dd9..2e87d17fc 100644
--- a/src/fpu/unpack.sv
+++ b/src/fpu/unpack.sv
@@ -41,13 +41,15 @@ module unpack import cvw::*;  #(parameter cvw_t P) (
   output logic                    XZero, YZero, ZZero,  // is XYZ zero
   output logic                    XInf, YInf, ZInf,     // is XYZ infinity
   output logic                    XExpMax,              // does X have the maximum exponent (NaN or Inf)
-  output logic [P.FLEN-1:0]       XPostBox              // X after being properly NaN-boxed
+  output logic [P.FLEN-1:0]       XPostBox,             // X after being properly NaN-boxed
+  output logic [P.NE-2:0]         Bias,                 // Exponent bias
+  output logic [P.LOGFLEN-1:0]    Nf                    // Number of fractional bits
 );
 
   logic XExpNonZero, YExpNonZero, ZExpNonZero;          // is the exponent of XYZ non-zero
   logic XFracZero, YFracZero, ZFracZero;                // is the fraction zero
   logic YExpMax, ZExpMax;                               // is the exponent all 1s
-  
+
   unpackinput #(P) unpackinputX (.A(X), .Fmt, .Sgn(Xs), .Exp(Xe), .Man(Xm), .En(XEn), .FPUActive,
                           .NaN(XNaN), .SNaN(XSNaN), .ExpNonZero(XExpNonZero),
                           .Zero(XZero), .Inf(XInf), .ExpMax(XExpMax), .FracZero(XFracZero), 
@@ -63,4 +65,7 @@ module unpack import cvw::*;  #(parameter cvw_t P) (
                           .Zero(ZZero), .Inf(ZInf), .ExpMax(ZExpMax), .FracZero(ZFracZero), 
                           .Subnorm(), .PostBox());
  
+  // look up bias and fractional bits for the given format
+  fmtparams #(P) fmtparams(Fmt, Bias, Nf);
+
  endmodule
diff --git a/testbench/tests.vh b/testbench/tests.vh
index 49a454c43..0386dba6e 100644
--- a/testbench/tests.vh
+++ b/testbench/tests.vh
@@ -1641,7 +1641,7 @@ string imperas32f[] = '{
   string arch64d[] = '{
     `RISCVARCHTEST,
     // for speed
-   "rv64i_m/D/src/fadd.d_b10-01.S",
+    "rv64i_m/D/src/fadd.d_b10-01.S",
     "rv64i_m/D/src/fadd.d_b1-01.S",
     "rv64i_m/D/src/fadd.d_b11-01.S",
     "rv64i_m/D/src/fadd.d_b12-01.S",
@@ -2278,6 +2278,7 @@ string arch64zknh[] = '{
   string arch32zfaf[] = '{
     //`RISCVARCHTEST,
     `WALLYTEST,
+    "rv32i_m/F_Zfa/src/fround_b1-01.S",
     "rv32i_m/F_Zfa/src/fleq_b1-01.S",
     "rv32i_m/F_Zfa/src/fleq_b19-01.S", 
     "rv32i_m/F_Zfa/src/fli.s-01.S",
@@ -2289,12 +2290,12 @@ string arch64zknh[] = '{
     "rv32i_m/F_Zfa/src/fminm_b19-01.S",
     "rv32i_m/F_Zfa/src/fmaxm_b1-01.S",
     "rv32i_m/F_Zfa/src/fmaxm_b19-01.S"
-/*    "rv32i_m/F_Zfa/src/fround_b1-01.S" */
   };
 
   string arch32zfad[] = '{
     //`RISCVARCHTEST,
     `WALLYTEST,
+    "rv32i_m/D_Zfa/src/fround_b1-01.S",
     "rv32i_m/D_Zfa/src/fcvtmod.w.d_b1-01.S",
     "rv32i_m/D_Zfa/src/fcvtmod.w.d_b22-01.S",
     "rv32i_m/D_Zfa/src/fcvtmod.w.d_b23-01.S",
@@ -2326,12 +2327,12 @@ string arch64zknh[] = '{
     "rv32i_m/D_Zfa/src/fmvh.x.d_b27-01.S",
     "rv32i_m/D_Zfa/src/fmvh.x.d_b28-01.S",
     "rv32i_m/D_Zfa/src/fmvh.x.d_b29-01.S"
-/*    "rv32i_m/D_Zfa/src/fround_b1-01.S" */
   };
 
   string arch64zfaf[] = '{
     //`RISCVARCHTEST,
     `WALLYTEST,
+    "rv64i_m/F_Zfa/src/fround_b1-01.S",
     "rv64i_m/F_Zfa/src/fleq_b1-01.S",
     "rv64i_m/F_Zfa/src/fleq_b19-01.S", 
     "rv64i_m/F_Zfa/src/fli.s-01.S",
@@ -2341,12 +2342,12 @@ string arch64zknh[] = '{
     "rv64i_m/F_Zfa/src/fminm_b19-01.S",
     "rv64i_m/F_Zfa/src/fmaxm_b1-01.S",
     "rv64i_m/F_Zfa/src/fmaxm_b19-01.S"
-/*    "rv64i_m/F_Zfa/src/fround_b1-01.S" */
   };
 
   string arch64zfad[] = '{
     //`RISCVARCHTEST,
     `WALLYTEST,
+     "rv64i_m/D_Zfa/src/fround_b1-01.S",
     "rv64i_m/D_Zfa/src/fcvtmod.w.d_b1-01.S",
     "rv64i_m/D_Zfa/src/fcvtmod.w.d_b22-01.S", 
     "rv64i_m/D_Zfa/src/fcvtmod.w.d_b23-01.S",
@@ -2363,7 +2364,7 @@ string arch64zknh[] = '{
     "rv64i_m/D_Zfa/src/fminm_b19-01.S",
     "rv64i_m/D_Zfa/src/fmaxm_b1-01.S",
     "rv64i_m/D_Zfa/src/fmaxm_b19-01.S"
-/*     "rv64i_m/D_Zfa/src/fround_b1-01.S" */
+
   };
 
   string arch32d_fma[] = '{