64-bit rtl fix

2025-04-23 21:39:10 -04:00 · 2024-06-13 06:26:45 -07:00 · 2024-06-13 06:26:45 -07:00 · 5bcf24ed55
commit 5bcf24ed55
parent 78b6e0638c
7 changed files with 166 additions and 167 deletions
--- a/ci/regression.sh
+++ b/ci/regression.sh
@@ -74,16 +74,20 @@ isa()

    if [ "$XLEN" == "64" ]
    then
-        make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
-        make -C tests/riscv/isa run-rtlsim-64f

-        make -C sim/rtlsim clean && CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim > /dev/null
-        make -C tests/riscv/isa run-rtlsim-64d || true
+        make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
+        make -C tests/riscv/isa run-rtlsim-64d

        make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
+        make -C tests/riscv/isa run-rtlsim-64d
+
+        make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
        make -C tests/riscv/isa run-rtlsim-64f

-        make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
+        make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
+        make -C tests/riscv/isa run-rtlsim-64f
+
+        make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
        make -C tests/riscv/isa run-rtlsim-64fx
    fi

--- a/ci/trace_csv.py
+++ b/ci/trace_csv.py
@@ -225,7 +225,7 @@ def write_csv(log_filename, csv_filename, log_type):

    # write to CSV
    with open(csv_filename, 'w', newline='') as csv_file:
-        fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "operands", "destination"]
+        fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "destination", "operands"]
        writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
        writer.writeheader()
        for entry in entries:
--- a/hw/dpi/float_dpi.cpp
+++ b/hw/dpi/float_dpi.cpp
@@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -40,7 +40,7 @@ extern "C" {
  void dpi_itof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
  void dpi_utof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
  void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result);
-  
+
  void dpi_fclss(bool enable, int dst_fmt, int64_t a, int64_t* result);
  void dpi_fsgnj(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
  void dpi_fsgnjn(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
@@ -54,31 +54,21 @@ extern "C" {
 }

 inline uint64_t nan_box(uint32_t value) {
-#ifdef FPU_RV64F
  return value | 0xffffffff00000000;
-#else 
-  return value;
-#endif
 }

 inline bool is_nan_boxed(uint64_t value) {
-#ifdef FPU_RV64F
  return (uint32_t(value >> 32) == 0xffffffff);
-#else
-  __unused (value);
-  return true;
-#endif
 }

-inline int64_t check_boxing(int64_t a) {  
-  if (!is_nan_boxed(a)) {
-    return nan_box(0x7fc00000); // NaN
-  }
-  return a;
+inline int64_t check_boxing(int64_t a) {
+  if (is_nan_boxed(a))
+    return a;
+  return nan_box(0x7fc00000); // NaN
 }

 void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
  if (dst_fmt) {
    *result = rv_fadd_d(a, b, (*frm & 0x7), fflags);
@@ -88,7 +78,7 @@ void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal*
 }

 void dpi_fsub(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
  if (dst_fmt) {
    *result = rv_fsub_d(a, b, (*frm & 0x7), fflags);
@@ -98,19 +88,19 @@ void dpi_fsub(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal*
 }

 void dpi_fmul(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fmul_d(a, b, (*frm & 0x7), fflags); 
+  if (dst_fmt) {
+    *result = rv_fmul_d(a, b, (*frm & 0x7), fflags);
  } else {
    *result = nan_box(rv_fmul_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
  }
 }

 void dpi_fmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
+  if (dst_fmt) {
    *result = rv_fmadd_d(a, b, c, (*frm & 0x7), fflags);
  } else {
    *result = nan_box(rv_fmadd_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
@@ -118,9 +108,9 @@ void dpi_fmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const
 }

 void dpi_fmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
+  if (dst_fmt) {
    *result = rv_fmsub_d(a, b, c, (*frm & 0x7), fflags);
  } else {
    *result = nan_box(rv_fmsub_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
@@ -128,9 +118,9 @@ void dpi_fmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const
 }

 void dpi_fnmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
+  if (dst_fmt) {
    *result = rv_fnmadd_d(a, b, c, (*frm & 0x7), fflags);
  } else {
    *result = nan_box(rv_fnmadd_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
@@ -138,9 +128,9 @@ void dpi_fnmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const
 }

 void dpi_fnmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
+  if (dst_fmt) {
    *result = rv_fnmsub_d(a, b, c, (*frm & 0x7), fflags);
  } else {
    *result = nan_box(rv_fnmsub_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
@@ -148,36 +138,36 @@ void dpi_fnmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const
 }

 void dpi_fdiv(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fdiv_d(a, b, (*frm & 0x7), fflags); 
+  if (dst_fmt) {
+    *result = rv_fdiv_d(a, b, (*frm & 0x7), fflags);
  } else {
    *result = nan_box(rv_fdiv_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
  }
 }

 void dpi_fsqrt(bool enable, int dst_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fsqrt_d(a, (*frm & 0x7), fflags); 
+  if (dst_fmt) {
+    *result = rv_fsqrt_d(a, (*frm & 0x7), fflags);
  } else {
    *result = nan_box(rv_fsqrt_s(check_boxing(a), (*frm & 0x7), fflags));
  }
 }

 void dpi_ftoi(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
  if (dst_fmt) {
-    if (src_fmt) { 
+    if (src_fmt) {
      *result = rv_ftol_d(a, (*frm & 0x7), fflags);
    } else {
      *result = rv_ftol_s(check_boxing(a), (*frm & 0x7), fflags);
    }
-  } else {    
-    if (src_fmt) { 
+  } else {
+    if (src_fmt) {
      *result = sext<uint64_t>(rv_ftoi_d(a, (*frm & 0x7), fflags), 32);
    } else {
      *result = sext<uint64_t>(rv_ftoi_s(check_boxing(a), (*frm & 0x7), fflags), 32);
@@ -186,61 +176,61 @@ void dpi_ftoi(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVa
 }

 void dpi_ftou(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
  if (dst_fmt) {
-    if (src_fmt) { 
+    if (src_fmt) {
      *result = rv_ftolu_d(a, (*frm & 0x7), fflags);
    } else {
      *result = rv_ftolu_s(check_boxing(a), (*frm & 0x7), fflags);
    }
-  } else {    
-    if (src_fmt) { 
+  } else {
+    if (src_fmt) {
      *result = sext<uint64_t>(rv_ftou_d(a, (*frm & 0x7), fflags), 32);
    } else {
-      *result = sext<uint64_t>(rv_ftou_s(check_boxing(a), (*frm & 0x7), fflags), 32); 
+      *result = sext<uint64_t>(rv_ftou_s(check_boxing(a), (*frm & 0x7), fflags), 32);
    }
  }
 }

 void dpi_itof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
  if (dst_fmt) {
-    if (src_fmt) { 
+    if (src_fmt) {
      *result = rv_ltof_d(a, (*frm & 0x7), fflags);
-    } else { 
+    } else {
      *result = rv_itof_d(a, (*frm & 0x7), fflags);
    }
  } else {
-    if (src_fmt) { 
-      *result = nan_box(rv_ltof_s(a, (*frm & 0x7), fflags)); 
-    } else { 
-      *result = nan_box(rv_itof_s(a, (*frm & 0x7), fflags)); 
+    if (src_fmt) {
+      *result = nan_box(rv_ltof_s(a, (*frm & 0x7), fflags));
+    } else {
+      *result = nan_box(rv_itof_s(a, (*frm & 0x7), fflags));
    }
  }
 }

 void dpi_utof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
  if (dst_fmt) {
-    if (src_fmt) { 
+    if (src_fmt) {
      *result = rv_lutof_d(a, (*frm & 0x7), fflags);
-    } else { 
+    } else {
      *result = rv_utof_d(a, (*frm & 0x7), fflags);
    }
  } else {
-    if (src_fmt) { 
+    if (src_fmt) {
      *result = nan_box(rv_lutof_s(a, (*frm & 0x7), fflags));
-    } else { 
+    } else {
      *result = nan_box(rv_utof_s(a, (*frm & 0x7), fflags));
    }
  }
 }

 void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result) {
-  if (!enable) 
+  if (!enable)
    return;
  if (dst_fmt) {
    *result = rv_ftod((int32_t)check_boxing(a));
@@ -250,90 +240,90 @@ void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result) {
 }

 void dpi_fclss(bool enable, int dst_fmt, int64_t a, int64_t* result) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fclss_d(a); 
-  } else { 
-    *result = rv_fclss_s(check_boxing(a)); 
+  if (dst_fmt) {
+    *result = rv_fclss_d(a);
+  } else {
+    *result = rv_fclss_s(check_boxing(a));
  }
 }

 void dpi_fsgnj(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fsgnj_d(a, b); 
+  if (dst_fmt) {
+    *result = rv_fsgnj_d(a, b);
  } else {
    *result = nan_box(rv_fsgnj_s(check_boxing(a), check_boxing(b)));
  }
 }

 void dpi_fsgnjn(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fsgnjn_d(a, b); 
+  if (dst_fmt) {
+    *result = rv_fsgnjn_d(a, b);
  } else {
    *result = nan_box(rv_fsgnjn_s(check_boxing(a), check_boxing(b)));
  }
 }

 void dpi_fsgnjx(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fsgnjx_d(a, b); 
+  if (dst_fmt) {
+    *result = rv_fsgnjx_d(a, b);
  } else {
    *result = nan_box(rv_fsgnjx_s(check_boxing(a), check_boxing(b)));
  }
 }

 void dpi_flt(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
  if (dst_fmt) {
-    *result = rv_flt_d(a, b, fflags); 
+    *result = rv_flt_d(a, b, fflags);
  } else {
    *result = rv_flt_s(check_boxing(a), check_boxing(b), fflags);
  }
 }

 void dpi_fle(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fle_d(a, b, fflags); 
+  if (dst_fmt) {
+    *result = rv_fle_d(a, b, fflags);
  } else {
    *result = rv_fle_s(check_boxing(a), check_boxing(b), fflags);
  }
 }

 void dpi_feq(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_feq_d(a, b, fflags); 
+  if (dst_fmt) {
+    *result = rv_feq_d(a, b, fflags);
  } else {
    *result = rv_feq_s(check_boxing(a), check_boxing(b), fflags);
  }
 }

 void dpi_fmin(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fmin_d(a, b, fflags); 
+  if (dst_fmt) {
+    *result = rv_fmin_d(a, b, fflags);
  } else {
    *result = nan_box(rv_fmin_s(check_boxing(a), check_boxing(b), fflags));
  }
 }

 void dpi_fmax(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
-  if (!enable) 
+  if (!enable)
    return;
-  if (dst_fmt) { 
-    *result = rv_fmax_d(a, b, fflags); 
+  if (dst_fmt) {
+    *result = rv_fmax_d(a, b, fflags);
  } else {
    *result = nan_box(rv_fmax_s(check_boxing(a), check_boxing(b), fflags));
  }
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -40,6 +40,14 @@
 `define EXT_F_ENABLE
 `endif

+`ifdef XLEN_64
+`ifndef FPU_DSP
+`ifndef EXT_D_DISABLE
+`define EXT_D_ENABLE
+`endif
+`endif
+`endif
+
 `ifndef EXT_ZICOND_DISABLE
 `define EXT_ZICOND_ENABLE
 `endif
--- a/hw/rtl/VX_define.vh
+++ b/hw/rtl/VX_define.vh
@@ -230,9 +230,9 @@
 `define INST_FPU_MUL         4'b0010
 `define INST_FPU_DIV         4'b0011
 `define INST_FPU_SQRT        4'b0100
-`define INST_FPU_CMP         4'b0101 // mod: LE=0, LT=1, EQ=2
+`define INST_FPU_CMP         4'b0101 // frm: LE=0, LT=1, EQ=2
 `define INST_FPU_F2F         4'b0110
-`define INST_FPU_MISC        4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
+`define INST_FPU_MISC        4'b0111 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
 `define INST_FPU_F2I         4'b1000
 `define INST_FPU_F2U         4'b1001
 `define INST_FPU_I2F         4'b1010
--- a/hw/rtl/fpu/VX_fpu_dsp.sv
+++ b/hw/rtl/fpu/VX_fpu_dsp.sv
@@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -16,7 +16,7 @@
 `ifdef FPU_DSP

 module VX_fpu_dsp import VX_fpu_pkg::*; #(
-    parameter NUM_LANES = 4, 
+    parameter NUM_LANES = 4,
    parameter TAG_WIDTH = 4,
    parameter OUT_BUF   = 0
 ) (
@@ -29,7 +29,7 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
    input wire [NUM_LANES-1:0] mask_in,

    input wire [TAG_WIDTH-1:0] tag_in,
-    
+
    input wire [`INST_FPU_BITS-1:0] op_type,
    input wire [`INST_FMT_BITS-1:0] fmt,
    input wire [`INST_FRM_BITS-1:0] frm,
@@ -37,7 +37,7 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
    input wire [NUM_LANES-1:0][`XLEN-1:0]  dataa,
    input wire [NUM_LANES-1:0][`XLEN-1:0]  datab,
    input wire [NUM_LANES-1:0][`XLEN-1:0]  datac,
-    output wire [NUM_LANES-1:0][`XLEN-1:0] result, 
+    output wire [NUM_LANES-1:0][`XLEN-1:0] result,

    output wire has_fflags,
    output wire [`FP_FLAGS_BITS-1:0] fflags,
@@ -56,22 +56,22 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(

    localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAG_WIDTH;

-    `UNUSED_VAR (fmt)    
+    `UNUSED_VAR (fmt)

    wire [NUM_FPC-1:0] per_core_ready_in;
    wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
    wire [NUM_FPC-1:0][TAG_WIDTH-1:0] per_core_tag_out;
    wire [NUM_FPC-1:0] per_core_ready_out;
-    wire [NUM_FPC-1:0] per_core_valid_out;    
-    wire [NUM_FPC-1:0] per_core_has_fflags;  
+    wire [NUM_FPC-1:0] per_core_valid_out;
+    wire [NUM_FPC-1:0] per_core_has_fflags;
    fflags_t [NUM_FPC-1:0] per_core_fflags;

    wire div_ready_in, sqrt_ready_in;
    wire [NUM_LANES-1:0][31:0] div_result, sqrt_result;
    wire [TAG_WIDTH-1:0] div_tag_out, sqrt_tag_out;
    wire div_ready_out, sqrt_ready_out;
-    wire div_valid_out, sqrt_valid_out;    
-    wire div_has_fflags, sqrt_has_fflags;  
+    wire div_valid_out, sqrt_valid_out;
+    wire div_has_fflags, sqrt_has_fflags;
    fflags_t div_fflags, sqrt_fflags;

    reg [FPC_BITS-1:0] core_select;
@@ -79,7 +79,7 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(

    always @(*) begin
        is_madd   = 0;
-        is_sub    = 0;        
+        is_sub    = 0;
        is_neg    = 0;
        is_div    = 0;
        is_itof   = 0;
@@ -126,19 +126,19 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
        .NUM_LANES (NUM_LANES),
        .TAG_WIDTH (TAG_WIDTH)
    ) fpu_fma (
-        .clk        (clk), 
-        .reset      (fma_reset), 
+        .clk        (clk),
+        .reset      (fma_reset),
        .valid_in   (valid_in && (core_select == FPU_FMA)),
        .ready_in   (per_core_ready_in[FPU_FMA]),
        .mask_in    (mask_in),
-        .tag_in     (tag_in), 
+        .tag_in     (tag_in),
        .frm        (frm),
        .is_madd    (is_madd),
        .is_sub     (is_sub),
        .is_neg     (is_neg),
-        .dataa      (dataa_s), 
-        .datab      (datab_s), 
-        .datac      (datac_s), 
+        .dataa      (dataa_s),
+        .datab      (datab_s),
+        .datac      (datac_s),
        .has_fflags (per_core_has_fflags[FPU_FMA]),
        .fflags     (per_core_fflags[FPU_FMA]),
        .result     (per_core_result[FPU_FMA]),
@@ -151,17 +151,17 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
        .NUM_LANES (NUM_LANES),
        .TAG_WIDTH (TAG_WIDTH)
    ) fpu_div (
-        .clk        (clk), 
-        .reset      (div_reset), 
+        .clk        (clk),
+        .reset      (div_reset),
        .valid_in   (valid_in && (core_select == FPU_DIVSQRT) && is_div),
        .ready_in   (div_ready_in),
        .mask_in    (mask_in),
        .tag_in     (tag_in),
-        .frm        (frm), 
-        .dataa      (dataa_s), 
-        .datab      (datab_s), 
+        .frm        (frm),
+        .dataa      (dataa_s),
+        .datab      (datab_s),
        .has_fflags (div_has_fflags),
-        .fflags     (div_fflags), 
+        .fflags     (div_fflags),
        .result     (div_result),
        .tag_out    (div_tag_out),
        .valid_out  (div_valid_out),
@@ -172,14 +172,14 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
        .NUM_LANES (NUM_LANES),
        .TAG_WIDTH (TAG_WIDTH)
    ) fpu_sqrt (
-        .clk        (clk), 
-        .reset      (sqrt_reset), 
+        .clk        (clk),
+        .reset      (sqrt_reset),
        .valid_in   (valid_in && (core_select == FPU_DIVSQRT) && ~is_div),
        .ready_in   (sqrt_ready_in),
        .mask_in    (mask_in),
        .tag_in     (tag_in),
-        .frm        (frm), 
-        .dataa      (dataa_s), 
+        .frm        (frm),
+        .dataa      (dataa_s),
        .has_fflags (sqrt_has_fflags),
        .fflags     (sqrt_fflags),
        .result     (sqrt_result),
@@ -188,57 +188,57 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
        .ready_out  (sqrt_ready_out)
    );

-    wire cvt_rt_int_in = ~is_itof;
-    wire cvt_rt_int_out;
+    wire cvt_ret_int_in = ~is_itof;
+    wire cvt_ret_int_out;

    VX_fpu_cvt #(
        .NUM_LANES (NUM_LANES),
        .TAG_WIDTH (TAG_WIDTH+1)
    ) fpu_cvt (
-        .clk        (clk), 
-        .reset      (cvt_reset), 
+        .clk        (clk),
+        .reset      (cvt_reset),
        .valid_in   (valid_in && (core_select == FPU_CVT)),
        .ready_in   (per_core_ready_in[FPU_CVT]),
        .mask_in    (mask_in),
-        .tag_in     ({cvt_rt_int_in, tag_in}), 
+        .tag_in     ({cvt_ret_int_in, tag_in}),
        .frm        (frm),
-        .is_itof    (is_itof), 
-        .is_signed  (is_signed), 
-        .dataa      (dataa_s), 
+        .is_itof    (is_itof),
+        .is_signed  (is_signed),
+        .dataa      (dataa_s),
        .has_fflags (per_core_has_fflags[FPU_CVT]),
        .fflags     (per_core_fflags[FPU_CVT]),
        .result     (per_core_result[FPU_CVT]),
-        .tag_out    ({cvt_rt_int_out, per_core_tag_out[FPU_CVT]}),
+        .tag_out    ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}),
        .valid_out  (per_core_valid_out[FPU_CVT]),
        .ready_out  (per_core_ready_out[FPU_CVT])
    );

-    wire ncp_rt_int_in = (op_type == `INST_FPU_CMP)
-                      || `INST_FPU_IS_CLASS(op_type, frm) 
+    wire ncp_ret_int_in = (op_type == `INST_FPU_CMP)
+                      || `INST_FPU_IS_CLASS(op_type, frm)
                      || `INST_FPU_IS_MVXW(op_type, frm);
-    wire ncp_rt_int_out;
+    wire ncp_ret_int_out;
+
+    wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
+    wire ncp_ret_sext_out;

-    wire ncp_rt_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
-    wire ncp_rt_sext_out;
-    
    VX_fpu_ncp #(
        .NUM_LANES (NUM_LANES),
        .TAG_WIDTH (TAG_WIDTH+2)
    ) fpu_ncp (
        .clk        (clk),
-        .reset      (ncp_reset), 
+        .reset      (ncp_reset),
        .valid_in   (valid_in && (core_select == FPU_NCP)),
        .ready_in   (per_core_ready_in[FPU_NCP]),
        .mask_in    (mask_in),
-        .tag_in     ({ncp_rt_sext_in, ncp_rt_int_in, tag_in}),
+        .tag_in     ({ncp_ret_sext_in, ncp_ret_int_in, tag_in}),
        .op_type    (op_type),
        .frm        (frm),
        .dataa      (dataa_s),
-        .datab      (datab_s), 
-        .result     (per_core_result[FPU_NCP]), 
+        .datab      (datab_s),
+        .result     (per_core_result[FPU_NCP]),
        .has_fflags (per_core_has_fflags[FPU_NCP]),
        .fflags     (per_core_fflags[FPU_NCP]),
-        .tag_out    ({ncp_rt_sext_out, ncp_rt_int_out, per_core_tag_out[FPU_NCP]}),
+        .tag_out    ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}),
        .valid_out  (per_core_valid_out[FPU_NCP]),
        .ready_out  (per_core_ready_out[FPU_NCP])
    );
@@ -249,20 +249,20 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(

    VX_stream_arb #(
        .NUM_INPUTS (2),
-        .DATAW      (RSP_DATAW), 
+        .DATAW      (RSP_DATAW),
        .ARBITER    ("R"),
        .OUT_BUF    (0)
    ) div_sqrt_arb (
        .clk       (clk),
        .reset     (reset),
-        .valid_in  ({sqrt_valid_out, div_valid_out}), 
+        .valid_in  ({sqrt_valid_out, div_valid_out}),
        .ready_in  ({sqrt_ready_out, div_ready_out}),
-        .data_in   ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out}, 
+        .data_in   ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out},
                     {div_result, div_has_fflags, div_fflags, div_tag_out}}),
        .data_out  ({
-            per_core_result[FPU_DIVSQRT], 
-            per_core_has_fflags[FPU_DIVSQRT], 
-            per_core_fflags[FPU_DIVSQRT], 
+            per_core_result[FPU_DIVSQRT],
+            per_core_has_fflags[FPU_DIVSQRT],
+            per_core_fflags[FPU_DIVSQRT],
            per_core_tag_out[FPU_DIVSQRT]
        }),
        .valid_out (per_core_valid_out[FPU_DIVSQRT]),
@@ -273,50 +273,48 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
    ///////////////////////////////////////////////////////////////////////////

    reg [NUM_FPC-1:0][RSP_DATAW+2-1:0] per_core_data_out;
-    
+
    always @(*) begin
        for (integer i = 0; i < NUM_FPC; ++i) begin
            per_core_data_out[i][RSP_DATAW+1:2] = {
-                per_core_result[i], 
-                per_core_has_fflags[i], 
-                per_core_fflags[i], 
+                per_core_result[i],
+                per_core_has_fflags[i],
+                per_core_fflags[i],
                per_core_tag_out[i]
            };
            per_core_data_out[i][1:0] = '0;
-        end        
-        per_core_data_out[FPU_CVT][1:0] = {1'b1, cvt_rt_int_out};
-        per_core_data_out[FPU_NCP][1:0] = {ncp_rt_sext_out, ncp_rt_int_out};
+        end
+        per_core_data_out[FPU_CVT][1:0] = {1'b1, cvt_ret_int_out};
+        per_core_data_out[FPU_NCP][1:0] = {ncp_ret_sext_out, ncp_ret_int_out};
    end

    wire [NUM_LANES-1:0][31:0] result_s;
-    wire [1:0] op_rt_int_out;
+    
+    wire [1:0] op_ret_int_out;
+    `UNUSED_VAR (op_ret_int_out)

    VX_stream_arb #(
        .NUM_INPUTS (NUM_FPC),
-        .DATAW      (RSP_DATAW + 2), 
+        .DATAW      (RSP_DATAW + 2),
        .ARBITER    ("R"),
        .OUT_BUF    (OUT_BUF)
    ) rsp_arb (
        .clk       (clk),
        .reset     (reset),
-        .valid_in  (per_core_valid_out), 
+        .valid_in  (per_core_valid_out),
        .ready_in  (per_core_ready_out),
        .data_in   (per_core_data_out),
-        .data_out  ({result_s, has_fflags, fflags, tag_out, op_rt_int_out}),
+        .data_out  ({result_s, has_fflags, fflags, tag_out, op_ret_int_out}),
        .valid_out (valid_out),
        .ready_out (ready_out),
        `UNUSED_PIN (sel_out)
    );

-`ifndef FPU_RV64F
-    `UNUSED_VAR (op_rt_int_out)
-`endif
-
-    for (genvar i = 0; i < NUM_LANES; ++i) begin        
+    for (genvar i = 0; i < NUM_LANES; ++i) begin
    `ifdef FPU_RV64F
        reg [`XLEN-1:0] result_r;
        always @(*) begin
-            case (op_rt_int_out)
+            case (op_ret_int_out)
            2'b11:   result_r = `XLEN'($signed(result_s[i]));
            2'b01:   result_r = {32'h00000000, result_s[i]};
            default: result_r = {32'hffffffff, result_s[i]};
@@ -333,4 +331,4 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(

 endmodule

-`endif 
+`endif
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@@ -41,8 +41,7 @@ union reg_data_t {
 };

 inline uint64_t nan_box(uint32_t value) {
-  uint64_t mask = 0xffffffff00000000;
-  return value | mask;
+  return value | 0xffffffff00000000;
 }

 inline bool is_nan_boxed(uint64_t value) {