Fix ALU changes, enable 64-bit in FPU

- Move FPU DPI to support 64-bit
2025-04-24 05:47:35 -04:00 · 2023-03-12 21:17:32 -04:00 · 2023-03-12 21:17:32 -04:00 · 5bdff46810
commit 5bdff46810
parent 8da207c5bf
13 changed files with 147 additions and 147 deletions
--- a/hw/dpi/float_dpi.cpp
+++ b/hw/dpi/float_dpi.cpp
@ -10,161 +10,161 @@
 #include "VX_config.h"

 extern "C" {
-  void dpi_fadd(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_fsub(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_fmul(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_fmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_fmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_fnmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_fnmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
+  void dpi_fadd(bool enable, long long a, long long b, const svBitVecVal* frm, long long* result, svBitVecVal* fflags); // SV `longint` maps to C `long long` in DPI-C
+  void dpi_fsub(bool enable, long long a, long long b, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);
+  void dpi_fmul(bool enable, long long a, long long b, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);
+  void dpi_fmadd(bool enable, long long a, long long b, long long c, const svBitVecVal* frm, long long* result, svBitVecVal* fflags); // c is `longint` in float_dpi.vh, so it is 64-bit here as well
+  void dpi_fmsub(bool enable, long long a, long long b, long long c, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);
+  void dpi_fnmadd(bool enable, long long a, long long b, long long c, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);
+  void dpi_fnmsub(bool enable, long long a, long long b, long long c, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);

-  void dpi_fdiv(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_fsqrt(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
+  void dpi_fdiv(bool enable, long long a, long long b, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);
+  void dpi_fsqrt(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags); // a is `longint` in float_dpi.vh
  
-  void dpi_ftoi(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_ftou(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_itof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
-  void dpi_utof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
+  void dpi_ftoi(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);
+  void dpi_ftou(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);
+  void dpi_itof(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);
+  void dpi_utof(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags);

-  void dpi_fclss(bool enable, int a, int* result);
-  void dpi_fsgnj(bool enable, int a, int b, int* result);
-  void dpi_fsgnjn(bool enable, int a, int b, int* result);
-  void dpi_fsgnjx(bool enable, int a, int b, int* result);
+  void dpi_fclss(bool enable, long long a, long long* result);
+  void dpi_fsgnj(bool enable, long long a, long long b, long long* result);
+  void dpi_fsgnjn(bool enable, long long a, long long b, long long* result);
+  void dpi_fsgnjx(bool enable, long long a, long long b, long long* result);

-  void dpi_flt(bool enable, int a, int b, int* result, svBitVecVal* fflags);
-  void dpi_fle(bool enable, int a, int b, int* result, svBitVecVal* fflags);
-  void dpi_feq(bool enable, int a, int b, int* result, svBitVecVal* fflags);
-  void dpi_fmin(bool enable, int a, int b, int* result, svBitVecVal* fflags);
-  void dpi_fmax(bool enable, int a, int b, int* result, svBitVecVal* fflags);
+  void dpi_flt(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags);
+  void dpi_fle(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags);
+  void dpi_feq(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags);
+  void dpi_fmin(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags);
+  void dpi_fmax(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags);
 }

-void dpi_fadd(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_fadd(bool enable, long long a, long long b, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) { // no-op unless enabled; frm is masked to its low 3 bits; NOTE(review): rv_*_s presumably consume only the low 32 bits of each operand - confirm
  if (!enable) 
    return;
  *result = rv_fadd_s(a, b, (*frm & 0x7), fflags);
 }

-void dpi_fsub(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_fsub(bool enable, long long a, long long b, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_fsub_s(a, b, (*frm & 0x7), fflags);
 }

-void dpi_fmul(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_fmul(bool enable, long long a, long long b, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_fmul_s(a, b, (*frm & 0x7), fflags);
 }

-void dpi_fmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_fmadd(bool enable, long long a, long long b, long long c, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) { // c widened: the SV import passes a 64-bit longint for all three operands
  if (!enable) 
    return;
  *result = rv_fmadd_s(a, b, c, (*frm & 0x7), fflags);
 }

-void dpi_fmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_fmsub(bool enable, long long a, long long b, long long c, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_fmsub_s(a, b, c, (*frm & 0x7), fflags);
 }

-void dpi_fnmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_fnmadd(bool enable, long long a, long long b, long long c, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_fnmadd_s(a, b, c, (*frm & 0x7), fflags);
 }

-void dpi_fnmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_fnmsub(bool enable, long long a, long long b, long long c, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_fnmsub_s(a, b, c, (*frm & 0x7), fflags);
 }

-void dpi_fdiv(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_fdiv(bool enable, long long a, long long b, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_fdiv_s(a, b, (*frm & 0x7), fflags);
 }

-void dpi_fsqrt(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_fsqrt(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) { // a widened to match the longint SV import
  if (!enable) 
    return;
  *result = rv_fsqrt_s(a, (*frm & 0x7), fflags);
 }

-void dpi_ftoi(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_ftoi(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_ftoi_s(a, (*frm & 0x7), fflags);
 }

-void dpi_ftou(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_ftou(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_ftou_s(a, (*frm & 0x7), fflags);
 }

-void dpi_itof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_itof(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_itof_s(a, (*frm & 0x7), fflags);
 }

-void dpi_utof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
+void dpi_utof(bool enable, long long a, const svBitVecVal* frm, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_utof_s(a, (*frm & 0x7), fflags);
 }

-void dpi_flt(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
+void dpi_flt(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags) { // comparison/min/max wrappers take no rounding mode
  if (!enable) 
    return;
  *result = rv_flt_s(a, b, fflags);
 }

-void dpi_fle(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
+void dpi_fle(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_fle_s(a, b, fflags);
 }

-void dpi_feq(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
+void dpi_feq(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_feq_s(a, b, fflags);
 }

-void dpi_fmin(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
+void dpi_fmin(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_fmin_s(a, b, fflags);
 }

-void dpi_fmax(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
+void dpi_fmax(bool enable, long long a, long long b, long long* result, svBitVecVal* fflags) {
  if (!enable) 
    return;
  *result = rv_fmax_s(a, b, fflags);
 }

-void dpi_fclss(bool enable, int a, int* result) {
+void dpi_fclss(bool enable, long long a, long long* result) { // classify/sign-inject wrappers neither read frm nor write fflags
  if (!enable) 
    return;
  *result = rv_fclss_s(a);
 }

-void dpi_fsgnj(bool enable, int a, int b, int* result) {
+void dpi_fsgnj(bool enable, long long a, long long b, long long* result) {
  if (!enable) 
    return;
  *result = rv_fsgnj_s(a, b);
 }

-void dpi_fsgnjn(bool enable, int a, int b, int* result) {
+void dpi_fsgnjn(bool enable, long long a, long long b, long long* result) {
  if (!enable) 
    return;
  *result = rv_fsgnjn_s(a, b);
 }

-void dpi_fsgnjx(bool enable, int a, int b, int* result) {
+void dpi_fsgnjx(bool enable, long long a, long long b, long long* result) {
  if (!enable) 
    return;
  *result = rv_fsgnjx_s(a, b);
--- a/hw/dpi/float_dpi.vh
+++ b/hw/dpi/float_dpi.vh
@ -1,31 +1,31 @@
 `ifndef FLOAT_DPI
 `define FLOAT_DPI

-import "DPI-C" function void dpi_fadd(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fsub(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fmul(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fmadd(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fmsub(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fnmadd(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fnmsub(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_fadd(input logic enable, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags); // operands and result are 64-bit longint; the C prototype must use a 64-bit integer type for each
+import "DPI-C" function void dpi_fsub(input logic enable, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_fmul(input logic enable, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_fmadd(input logic enable, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags); // c is longint as well, so the C side must not declare it as a 32-bit int
+import "DPI-C" function void dpi_fmsub(input logic enable, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_fnmadd(input logic enable, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_fnmsub(input logic enable, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);

-import "DPI-C" function void dpi_fdiv(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fsqrt(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_fdiv(input logic enable, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_fsqrt(input logic enable, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags); // single-operand imports also pass a as longint; keep the C parameter 64-bit to match

-import "DPI-C" function void dpi_ftoi(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_ftou(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_itof(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_utof(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_ftoi(input logic enable, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_ftou(input logic enable, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_itof(input logic enable, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_utof(input logic enable, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);

-import "DPI-C" function void dpi_fclss(input logic enable, input int a, output int result);
-import "DPI-C" function void dpi_fsgnj(input logic enable, input int a, input int b, output int result);
-import "DPI-C" function void dpi_fsgnjn(input logic enable, input int a, input int b, output int result);
-import "DPI-C" function void dpi_fsgnjx(input logic enable, input int a, input int b, output int result);
+import "DPI-C" function void dpi_fclss(input logic enable, input longint a, output longint result); // classify/sign-inject imports carry no frm input and no fflags output
+import "DPI-C" function void dpi_fsgnj(input logic enable, input longint a, input longint b, output longint result);
+import "DPI-C" function void dpi_fsgnjn(input logic enable, input longint a, input longint b, output longint result);
+import "DPI-C" function void dpi_fsgnjx(input logic enable, input longint a, input longint b, output longint result);

-import "DPI-C" function void dpi_flt(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fle(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_feq(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fmin(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
-import "DPI-C" function void dpi_fmax(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_flt(input logic enable, input longint a, input longint b, output longint result, output bit[4:0] fflags); // compare/min/max imports report fflags but take no rounding mode
+import "DPI-C" function void dpi_fle(input logic enable, input longint a, input longint b, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_feq(input logic enable, input longint a, input longint b, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_fmin(input logic enable, input longint a, input longint b, output longint result, output bit[4:0] fflags);
+import "DPI-C" function void dpi_fmax(input logic enable, input longint a, input longint b, output longint result, output bit[4:0] fflags);

 `endif
--- a/hw/dpi/util_dpi.cpp
+++ b/hw/dpi/util_dpi.cpp
@ -13,8 +13,8 @@
 #endif

 extern "C" {
-  void dpi_imul(bool enable, int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth);
-  void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* remainder);
+  void dpi_imul(bool enable, long int a, long int b, bool is_signed_a, bool is_signed_b, long int* resultl, long int* resulth);
+  void dpi_idiv(bool enable, long int a, long int b, bool is_signed, long int* quotient, long int* remainder);

  int dpi_register();
  void dpi_assert(int inst, bool cond, int delay);
@ -99,12 +99,12 @@ void dpi_assert(int inst, bool cond, int delay) {

 ///////////////////////////////////////////////////////////////////////////////

-void dpi_imul(bool enable, int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth) {
+void dpi_imul(bool enable, long int a, long int b, bool is_signed_a, bool is_signed_b, long int* resultl, long int* resulth) {
  if (!enable)
    return;
    
-  uint64_t first  = *(uint32_t*)&a;
-  uint64_t second = *(uint32_t*)&b;
+  uint64_t first  = *(long int*)&a;
+  uint64_t second = *(long int*)&b;
    
  if (is_signed_a && (first & 0x80000000)) {
    first |= 0xFFFFFFFF00000000;
@ -125,12 +125,12 @@ void dpi_imul(bool enable, int a, int b, bool is_signed_a, bool is_signed_b, int
  *resulth = (result >> 32) & 0xFFFFFFFF;
 }

-void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* remainder) {
+void dpi_idiv(bool enable, long int a, long int b, bool is_signed, long int* quotient, long int* remainder) {
  if (!enable)
    return;

-  uint32_t dividen = *(uint32_t*)&a;
-  uint32_t divisor = *(uint32_t*)&b;
+  uint32_t dividen = *(long int*)&a;
+  uint32_t divisor = *(long int*)&b;

  if (is_signed) {
    if (b == 0) {
--- a/hw/dpi/util_dpi.vh
+++ b/hw/dpi/util_dpi.vh
@ -1,8 +1,8 @@
 `ifndef UTIL_DPI
 `define UTIL_DPI

-import "DPI-C" function void dpi_imul(input logic enable, input int a, input int b, input logic is_signed_a, input logic is_signed_b, output int resultl, output int resulth);
-import "DPI-C" function void dpi_idiv(input logic enable, input int a, input int b, input logic is_signed, output int quotient, output int remainder);
+import "DPI-C" function void dpi_imul(input logic enable, input longint a, input longint b, input logic is_signed_a, input logic is_signed_b, output longint resultl, output longint resulth);
+import "DPI-C" function void dpi_idiv(input logic enable, input longint a, input longint b, input logic is_signed, output longint quotient, output longint remainder);

 import "DPI-C" function int dpi_register();
 import "DPI-C" function void dpi_assert(int inst, input logic cond, input int delay);
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -28,8 +28,8 @@
 `endif

 // Disable MULDIV, FPU, and TEX units since irrelevant to RV64I instructions
-`define EXT_M_DISABLE 1
-`define EXT_F_DISABLE 1
+// `define EXT_M_DISABLE 1  (left commented out: MULDIV is now enabled)
+// `define EXT_F_DISABLE 1  (left commented out: FPU is now enabled)
 `define EXT_TEX_DISABLE 1

 `ifndef NUM_CLUSTERS
--- a/hw/rtl/core/VX_alu_unit.sv
+++ b/hw/rtl/core/VX_alu_unit.sv
@ -43,26 +43,26 @@ module VX_alu_unit #(
    wire [`NUM_THREADS-1:0][`XLEN-1:0] alu_in1 = alu_req_if.rs1_data;
    wire [`NUM_THREADS-1:0][`XLEN-1:0] alu_in2 = alu_req_if.rs2_data;

-    wire [`NUM_THREADS-1:0][31:0] trunc_alu_in1, trunc_alu_result;
+    wire [`NUM_THREADS-1:0][31:0] trunc_alu_in1;

    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
        // PC operations should only be for 32 bits
        assign trunc_alu_in1[i] = alu_in1[i][31:0];
-        assign trunc_alu_result[i] = alu_result[i][31:0];
+        // assign trunc_alu_result[i] = alu_result[i][`XLEN-1:0];
    end

    // PC operations should only be for 32 bits
-    wire [`NUM_THREADS-1:0][31:0] alu_in1_PC   = alu_req_if.use_PC ? {`NUM_THREADS{alu_req_if.PC}} : trunc_alu_in1;
-    wire [`NUM_THREADS-1:0][`XLEN-1:0] alu_in2_imm  = alu_req_if.use_imm ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] alu_A  = alu_req_if.use_PC ? {`NUM_THREADS{`XLEN'(alu_req_if.PC)}} : alu_in1;
+    wire [`NUM_THREADS-1:0][31:0] alu_A_trunc = alu_req_if.use_PC ? {`NUM_THREADS{alu_req_if.PC}} : trunc_alu_in1;
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] alu_B  = alu_req_if.use_imm ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
    wire [`NUM_THREADS-1:0][`XLEN-1:0] alu_in2_less = (alu_req_if.use_imm && ~is_br_op) ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;

    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
-        wire [`XLEN-1:0] temp_add_result = {{`XLEN-32{1'b0}}, alu_in1_PC[i]} + alu_in2_imm[i];
        always @(*) begin
            case(alu_op)
-                `INST_ALU_ADD: add_result[i] = temp_add_result;
-                `INST_ALU_LUI, `INST_ALU_AUIPC, `INST_ALU_ADD_W: add_result[i] = `XLEN'($signed(temp_add_result[31:0])); //{{`XLEN-32{add_result[31]}}, temp_add_result[31:0]}; 
-                default: add_result[i] = temp_add_result;
+                `INST_ALU_ADD, `INST_ALU_AUIPC, `INST_ALU_LUI: add_result[i] = alu_A[i] + alu_B[i];
+                `INST_ALU_ADD_W: add_result[i] = `XLEN'($signed(alu_A_trunc[i] + alu_B[i][31:0]));
+                default: add_result[i] = alu_A[i] + alu_B[i];
            endcase
        end
    end
@ -83,8 +83,8 @@ module VX_alu_unit #(

    for (genvar i = 0; i < `NUM_THREADS; ++i) begin    
        wire [`XLEN:0] shr_in1 = {alu_signed & alu_in1[i][`XLEN-1], alu_in1[i]};
-        wire [`XLEN-1:0] temp_shr_result = `XLEN'($signed(shr_in1) >>> alu_in2_imm[i][SHIFT_IMM_BITS:0]);
-        wire [31:0] temp_shr_result_w = 32'($signed(shr_in1) >>> alu_in2_imm[i][4:0]);
+        wire [`XLEN-1:0] temp_shr_result = `XLEN'($signed(shr_in1) >>> alu_B[i][SHIFT_IMM_BITS:0]);
+        wire [31:0] temp_shr_result_w = 32'($signed(shr_in1) >>> alu_B[i][4:0]);

        always @(*) begin
            case(alu_op)
@ -97,14 +97,14 @@ module VX_alu_unit #(
    end

    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
-        wire [31:0] temp_shift_result = alu_in1[i][31:0] << alu_in2_imm[i][4:0]; // only used for SLLW
+        wire [31:0] temp_shift_result = alu_in1[i][31:0] << alu_B[i][4:0]; // only used for SLLW
        always @(*) begin
            case (alu_op)
-                `INST_ALU_AND: msc_result[i] = alu_in1[i] & alu_in2_imm[i];
-                `INST_ALU_OR:  msc_result[i] = alu_in1[i] | alu_in2_imm[i];
-                `INST_ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i];
-                // `INST_ALU_SLL: msc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0];
-                `INST_ALU_SLL: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS:0]; // TODO: CHANGED: adjust this to shift using 6 bits for 64 bit
+                `INST_ALU_AND: msc_result[i] = alu_in1[i] & alu_B[i];
+                `INST_ALU_OR:  msc_result[i] = alu_in1[i] | alu_B[i];
+                `INST_ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_B[i];
+                // `INST_ALU_SLL: msc_result[i] = alu_in1[i] << alu_B[i][4:0];
+                `INST_ALU_SLL: msc_result[i] = alu_in1[i] << alu_B[i][SHIFT_IMM_BITS:0]; // TODO: CHANGED: adjust this to shift using 6 bits for 64 bit
                `INST_ALU_SLL_W: msc_result[i] = `XLEN'($signed(temp_shift_result[31:0])); // TODO: CHANGED: adjust this to shift using 6 bits for 32 signed bit 
                default:       msc_result[i] = 'x;
            endcase
@ -127,7 +127,7 @@ module VX_alu_unit #(
    // branch
    
    wire is_jal = is_br_op && (br_op == `INST_BR_JAL || br_op == `INST_BR_JALR);
-    wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : trunc_alu_result; 
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] alu_jal_result = is_jal ? {`NUM_THREADS{`XLEN'(alu_req_if.next_PC)}} : alu_result; 

    wire [`XLEN-1:0] br_dest    = add_result[alu_req_if.tid][`XLEN-1:0];
    wire [32:0] cmp_result = sub_result[alu_req_if.tid][32:0];
@ -147,11 +147,11 @@ module VX_alu_unit #(
    wire [31:0]                   alu_PC;
    wire [`NR_BITS-1:0]           alu_rd;   
    wire                          alu_wb; 
-    wire [`NUM_THREADS-1:0][31:0] alu_data;
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] alu_data;

    wire [`NUM_THREADS-1:0][`XLEN-1:0] full_alu_data;
    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
-        assign full_alu_data[i] = {{`XLEN-31{alu_data[i][31]}},alu_data[i][30:0]};
+        assign full_alu_data[i] =alu_data[i];
    end

    wire [`INST_BR_BITS-1:0] br_op_r;
@ -163,7 +163,7 @@ module VX_alu_unit #(
    assign alu_ready_in = alu_ready_out || ~alu_valid_out;

    VX_pipe_register #(
-        .DATAW  (1 + UUID_WIDTH + NW_WIDTH + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + `XLEN),
+        .DATAW  (1 + UUID_WIDTH + NW_WIDTH + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * `XLEN) + 1 + `INST_BR_BITS + 1 + 1 + `XLEN),
        .RESETW (1)
    ) pipe_reg (
        .clk      (clk),
--- a/hw/rtl/core/VX_dispatch.sv
+++ b/hw/rtl/core/VX_dispatch.sv
@ -121,7 +121,7 @@ module VX_dispatch (
    wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(dispatch_if.op_type);
        
    VX_skid_buffer #(
-        .DATAW   (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + (3 * `NUM_THREADS * 32)),
+        .DATAW   (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + (3 * `NUM_THREADS * `XLEN)),
        .OUT_REG (1)
    ) fpu_buffer (
        .clk       (clk),
--- a/hw/rtl/fpu_unit/VX_fpu_agent.sv
+++ b/hw/rtl/fpu_unit/VX_fpu_agent.sv
@ -73,7 +73,7 @@ module VX_fpu_agent #(
    assign fpu_agent_if.ready = ready_in && mdata_and_csr_ready;    

    VX_skid_buffer #(
-        .DATAW   (`INST_FPU_BITS + `INST_FRM_BITS + `NUM_THREADS * 3 * 32 + `FPU_REQ_TAG_WIDTH),
+        .DATAW   (`INST_FPU_BITS + `INST_FRM_BITS + `NUM_THREADS * 3 * `XLEN + `FPU_REQ_TAG_WIDTH),
        .OUT_REG (1)
    ) req_sbuf (
        .clk       (clk),
@ -109,7 +109,7 @@ module VX_fpu_agent #(
    // commit

    VX_skid_buffer #(
-        .DATAW (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + 32 + `NR_BITS + (`NUM_THREADS * 32))
+        .DATAW (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + 32 + `NR_BITS + (`NUM_THREADS * `XLEN))
    ) rsp_sbuf (
        .clk       (clk),
        .reset     (reset),
--- a/hw/rtl/fpu_unit/VX_fpu_agent_if.sv
+++ b/hw/rtl/fpu_unit/VX_fpu_agent_if.sv
@ -9,9 +9,9 @@ interface VX_fpu_agent_if ();
    wire [31:0]                     PC;
    wire [`INST_FPU_BITS-1:0]       op_type;
    wire [`INST_MOD_BITS-1:0]       op_mod;
-    wire [`NUM_THREADS-1:0][31:0]   rs1_data;
-    wire [`NUM_THREADS-1:0][31:0]   rs2_data;
-    wire [`NUM_THREADS-1:0][31:0]   rs3_data;
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data;
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data;
+    wire [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data;
    wire [`NR_BITS-1:0]             rd;   
    wire                            ready;

--- a/hw/rtl/fpu_unit/VX_fpu_arb.sv
+++ b/hw/rtl/fpu_unit/VX_fpu_arb.sv
@ -29,8 +29,8 @@ module VX_fpu_arb #(
    localparam LOG_NUM_REQS  = `ARB_SEL_BITS(NUM_INPUTS, NUM_OUTPUTS);
    localparam NUM_REQS      = 1 << LOG_NUM_REQS;
    localparam TAG_OUT_WIDTH = TAG_WIDTH + LOG_NUM_REQS;
-    localparam REQ_DATAW     = TAG_OUT_WIDTH + `INST_FPU_BITS + `INST_FRM_BITS + NUM_LANES * 3 * 32;
-    localparam RSP_DATAW     = TAG_WIDTH + NUM_LANES * (32 + `FP_FLAGS_BITS) + 1;
+    localparam REQ_DATAW     = TAG_OUT_WIDTH + `INST_FPU_BITS + `INST_FRM_BITS + NUM_LANES * 3 * `XLEN;
+    localparam RSP_DATAW     = TAG_WIDTH + NUM_LANES * (`XLEN + `FFLAGS_BITS) + 1;
    
    ///////////////////////////////////////////////////////////////////////

--- a/hw/rtl/fpu_unit/VX_fpu_dpi.sv
+++ b/hw/rtl/fpu_unit/VX_fpu_dpi.sv
@ -16,10 +16,10 @@ module VX_fpu_dpi #(
    input wire [`INST_FPU_BITS-1:0] op_type,
    input wire [`INST_MOD_BITS-1:0] frm,

-    input wire [NUM_LANES-1:0][31:0]  dataa,
-    input wire [NUM_LANES-1:0][31:0]  datab,
-    input wire [NUM_LANES-1:0][31:0]  datac,
-    output wire [NUM_LANES-1:0][31:0] result, 
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  dataa,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datab,
+    input wire [NUM_LANES-1:0][`XLEN-1:0]  datac,
+    output wire [NUM_LANES-1:0][`XLEN-1:0] result, 

    output wire has_fflags,
    output wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags,
@ -37,10 +37,10 @@ module VX_fpu_dpi #(
    localparam NUM_FPC  = 5;
    localparam FPC_BITS = `LOG2UP(NUM_FPC);

-    localparam RSP_ARB_DATAW = (NUM_LANES * 32) + 1 + (NUM_LANES * $bits(fflags_t)) + TAGW;
+    localparam RSP_ARB_DATAW = (NUM_LANES * `XLEN) + 1 + (NUM_LANES * $bits(fflags_t)) + TAGW;
    
    wire [NUM_FPC-1:0] per_core_ready_in;
-    wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
+    wire [NUM_FPC-1:0][NUM_LANES-1:0][`XLEN-1:0] per_core_result;
    wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
    reg [NUM_FPC-1:0] per_core_ready_out;
    wire [NUM_FPC-1:0] per_core_valid_out;
@ -109,14 +109,14 @@ module VX_fpu_dpi #(
    generate 
    begin : fma
        
-        wire [NUM_LANES-1:0][31:0] result_fma;
-        wire [NUM_LANES-1:0][31:0] result_fadd;
-        wire [NUM_LANES-1:0][31:0] result_fsub;
-        wire [NUM_LANES-1:0][31:0] result_fmul;
-        wire [NUM_LANES-1:0][31:0] result_fmadd;
-        wire [NUM_LANES-1:0][31:0] result_fmsub;
-        wire [NUM_LANES-1:0][31:0] result_fnmadd;
-        wire [NUM_LANES-1:0][31:0] result_fnmsub;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fma;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fadd;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fsub;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fmul;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fmadd;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fmsub;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fnmadd;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fnmsub;
        
        fflags_t [NUM_LANES-1:0] fflags_fma;
        fflags_t [NUM_LANES-1:0] fflags_fadd;
@ -163,7 +163,7 @@ module VX_fpu_dpi #(
                                        0;                

        VX_shift_register #(
-            .DATAW  (1 + TAGW + NUM_LANES * (32 + $bits(fflags_t))),
+            .DATAW  (1 + TAGW + NUM_LANES * (`XLEN + $bits(fflags_t))),
            .DEPTH  (`LATENCY_FMA),
            .RESETW (1)
        ) shift_reg (
@ -183,7 +183,7 @@ module VX_fpu_dpi #(
    generate 
    begin : fdiv

-        wire [NUM_LANES-1:0][31:0] result_fdiv;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fdiv;
        fflags_t [NUM_LANES-1:0] fflags_fdiv;

        wire fdiv_valid = (valid_in && core_select == FPU_DIV);
@ -198,7 +198,7 @@ module VX_fpu_dpi #(
        end

        VX_shift_register #(
-            .DATAW  (1 + TAGW + NUM_LANES * (32 + $bits(fflags_t))),
+            .DATAW  (1 + TAGW + NUM_LANES * (`XLEN + $bits(fflags_t))),
            .DEPTH  (`LATENCY_FDIV),
            .RESETW (1)
        ) shift_reg (
@ -218,7 +218,7 @@ module VX_fpu_dpi #(
    generate 
    begin : fsqrt

-        wire [NUM_LANES-1:0][31:0] result_fsqrt;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt;
        fflags_t [NUM_LANES-1:0] fflags_fsqrt;

        wire fsqrt_valid = (valid_in && core_select == FPU_SQRT);
@ -233,7 +233,7 @@ module VX_fpu_dpi #(
        end

        VX_shift_register #(
-            .DATAW  (1 + TAGW + NUM_LANES * (32 + $bits(fflags_t))),
+            .DATAW  (1 + TAGW + NUM_LANES * (`XLEN + $bits(fflags_t))),
            .DEPTH  (`LATENCY_FSQRT),
            .RESETW (1)
        ) shift_reg (
@ -253,11 +253,11 @@ module VX_fpu_dpi #(
    generate
    begin : fcvt

-        wire [NUM_LANES-1:0][31:0] result_fcvt;
-        wire [NUM_LANES-1:0][31:0] result_itof;
-        wire [NUM_LANES-1:0][31:0] result_utof;
-        wire [NUM_LANES-1:0][31:0] result_ftoi;
-        wire [NUM_LANES-1:0][31:0] result_ftou;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fcvt;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_itof;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_utof;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_ftoi;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_ftou;
        
        fflags_t [NUM_LANES-1:0] fflags_fcvt;
        fflags_t [NUM_LANES-1:0] fflags_itof;
@ -292,7 +292,7 @@ module VX_fpu_dpi #(
                                       0;

        VX_shift_register #(
-            .DATAW  (1 + TAGW + NUM_LANES * (32 + $bits(fflags_t))),
+            .DATAW  (1 + TAGW + NUM_LANES * (`XLEN + $bits(fflags_t))),
            .DEPTH  (`LATENCY_FCVT),
            .RESETW (1)
        ) shift_reg (
@ -312,17 +312,17 @@ module VX_fpu_dpi #(
    generate 
    begin : fncp

-        wire [NUM_LANES-1:0][31:0] result_fncp;
-        wire [NUM_LANES-1:0][31:0] result_fclss;
-        wire [NUM_LANES-1:0][31:0] result_flt;
-        wire [NUM_LANES-1:0][31:0] result_fle;
-        wire [NUM_LANES-1:0][31:0] result_feq;
-        wire [NUM_LANES-1:0][31:0] result_fmin;
-        wire [NUM_LANES-1:0][31:0] result_fmax;
-        wire [NUM_LANES-1:0][31:0] result_fsgnj;
-        wire [NUM_LANES-1:0][31:0] result_fsgnjn;
-        wire [NUM_LANES-1:0][31:0] result_fsgnjx;
-        reg [NUM_LANES-1:0][31:0] result_fmv;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fncp;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fclss;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_flt;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fle;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_feq;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fmin;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fmax;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fsgnj;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fsgnjn;
+        wire [NUM_LANES-1:0][`XLEN-1:0] result_fsgnjx;
+        reg  [NUM_LANES-1:0][`XLEN-1:0] result_fmv;

        fflags_t [NUM_LANES-1:0] fflags_fncp;
        fflags_t [NUM_LANES-1:0] fflags_flt;
@ -372,7 +372,7 @@ module VX_fpu_dpi #(
                                       0;

        VX_shift_register #(
-            .DATAW  (1 + TAGW + 1 + NUM_LANES * (32 + $bits(fflags_t))),
+            .DATAW  (1 + TAGW + 1 + NUM_LANES * (`XLEN + $bits(fflags_t))),
            .DEPTH  (`LATENCY_FNCP),
            .RESETW (1)
        ) shift_reg (
--- a/hw/rtl/fpu_unit/VX_fpu_req_if.sv
+++ b/hw/rtl/fpu_unit/VX_fpu_req_if.sv
@ -9,9 +9,9 @@ interface VX_fpu_req_if #(
    wire                         valid;
    wire [`INST_FPU_BITS-1:0]    op_type;
    wire [`INST_FRM_BITS-1:0]    frm;
-    wire [NUM_LANES-1:0][31:0]   dataa;
-    wire [NUM_LANES-1:0][31:0]   datab;
-    wire [NUM_LANES-1:0][31:0]   datac;
+    wire [NUM_LANES-1:0][`XLEN-1:0] dataa;
+    wire [NUM_LANES-1:0][`XLEN-1:0] datab;
+    wire [NUM_LANES-1:0][`XLEN-1:0] datac;
    wire [TAG_WIDTH-1:0]         tag; 
    wire                         ready;

--- a/hw/rtl/fpu_unit/VX_fpu_rsp_if.sv
+++ b/hw/rtl/fpu_unit/VX_fpu_rsp_if.sv
@ -11,7 +11,7 @@ interface VX_fpu_rsp_if #(
 ) ();

    wire                        valid;
-    wire [NUM_LANES-1:0][31:0]  result; 
+    wire [NUM_LANES-1:0][`XLEN-1:0] result; 
    fflags_t [NUM_LANES-1:0]    fflags;
    wire                        has_fflags;       
    wire [TAG_WIDTH-1:0]        tag;