FPU DPI fallback

2025-04-24 05:47:35 -04:00 · 2020-08-31 09:19:55 -04:00 · 2020-08-31 09:19:55 -04:00 · df711986bc
commit df711986bc
parent 0a0b28aac0
29 changed files with 1147 additions and 1329 deletions
--- a/driver/rtlsim/Makefile
+++ b/driver/rtlsim/Makefile
@ -34,9 +34,10 @@ LDFLAGS += -shared -pthread
 TOP = Vortex

 SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
+SRCS += ../../hw/rtl/fp_cores/svdpi/float_dpi.cpp

-FPU_INCLUDE = -I../../hw/rtl/fp_cores/fpnew/src/common_cells/include -I../../hw/rtl/fp_cores/fpnew/src/common_cells/src -I../../hw/rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../hw/rtl/fp_cores/fpnew/src 
-RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/fp_cores $(FPU_INCLUDE)
+FPU_INCLUDE = -I../../hw/rtl/fp_cores -I../../hw/rtl/fp_cores/svdpi -I../../hw/rtl/fp_cores/fpnew/src/common_cells/include -I../../hw/rtl/fp_cores/fpnew/src/common_cells/src -I../../hw/rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../hw/rtl/fp_cores/fpnew/src 
+RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache $(FPU_INCLUDE)

 VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
 VL_FLAGS += -Wno-DECLFILENAME
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -51,15 +51,55 @@
 `define L3_ENABLE (`NUM_CLUSTERS > 1)
 `endif

+`ifndef EXT_M_DISABLE
 `define EXT_M_ENABLE
+`endif

+`ifndef EXT_F_DISABLE
 `define EXT_F_ENABLE
+`endif
+
+`define FPNEW_ENABLE

 // Device identification
 `define VENDOR_ID           0
 `define ARCHITECTURE_ID     0
 `define IMPLEMENTATION_ID   0

+///////////////////////////////////////////////////////////////////////////////
+
+`ifndef LATENCY_IMUL
+`define LATENCY_IMUL 3
+`endif
+
+`ifndef LATENCY_FNONCOMP
+`define LATENCY_FNONCOMP 1
+`endif
+
+`ifndef LATENCY_FMADD
+`define LATENCY_FMADD 1
+`endif
+
+`ifndef LATENCY_FNMADD
+`define LATENCY_FNMADD 2
+`endif
+
+`ifndef LATENCY_FDIV
+`define LATENCY_FDIV 15
+`endif
+
+`ifndef LATENCY_FSQRT
+`define LATENCY_FSQRT 9
+`endif
+
+`ifndef LATENCY_ITOF
+`define LATENCY_ITOF 7
+`endif
+
+`ifndef LATENCY_FTOI
+`define LATENCY_FTOI 3
+`endif
+
 // CSR Addresses //////////////////////////////////////////////////////////////

 `define CSR_FFLAGS      12'h001
--- a/hw/rtl/VX_define.vh
+++ b/hw/rtl/VX_define.vh
@ -35,22 +35,6 @@

 ///////////////////////////////////////////////////////////////////////////////

-`define LATENCY_IMUL    3
-
-`define LATENCY_FDIV     16
-`define LATENCY_FSQRT    10
-`define LATENCY_FTOI     5
-`define LATENCY_FTOU     4
-`define LATENCY_ITOF     8
-`define LATENCY_UTOF     7
-
-`define LATENCY_FMULADD  2
-`define LATENCY_FDIVSQRT 2
-`define LATENCY_FCONV    2
-`define LATENCY_FNONCOMP 1
-
-///////////////////////////////////////////////////////////////////////////////
-
 `define INST_LUI    7'b0110111
 `define INST_AUIPC  7'b0010111
 `define INST_JAL    7'b1101111
--- a/hw/rtl/VX_fpu_unit.v
+++ b/hw/rtl/VX_fpu_unit.v
@ -56,7 +56,7 @@ module VX_fpu_unit #(
    // can accept new request?
    assign fpu_req_if.ready = ready_in && ~fpuq_full;

-`ifdef SYNTHESIS
+`ifndef FPNEW_ENABLE

    VX_fp_fpga #(
        .TAGW (FPUQ_BITS)
--- a/hw/rtl/fp_cores/altera/VX_fp_div.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_div.v
@ -1,5 +1,9 @@
 `include "VX_define.vh"

+`ifndef SYNTHESIS
+`include "float_dpi.vh"
+`endif
+
 module VX_fp_div #( 
    parameter TAGW = 1,
    parameter LANES = 1
@ -21,19 +25,23 @@ module VX_fp_div #(
    input wire  ready_out,
    output wire valid_out
 );    
-    wire stall  = ~ready_out && valid_out;
-    wire enable = ~stall;
-    assign ready_in = enable;
-
+    wire stall = ~ready_out && valid_out;
+    
    for (genvar i = 0; i < LANES; i++) begin
+    `ifdef QUARTUS
        acl_fp_div fdiv (
            .clk    (clk),
            .areset (1'b0),
-            .en     (enable),
+            .en     (~stall),
            .a      (dataa[i]),
            .b      (datab[i]),
            .q      (result[i])
        );
+    `else 
+        always @(posedge clk) begin
+           dpi_fdiv(clk, ~stall, dataa[i], datab[i], result[i]);
+        end
+    `endif
    end

    VX_shift_register #(
@ -42,9 +50,11 @@ module VX_fp_div #(
    ) shift_reg (
        .clk(clk),
        .reset(reset),
-        .enable(enable),
+        .enable(~stall),
        .in ({tag_in,  valid_in}),
        .out({tag_out, valid_out})
    );

+    assign ready_in = ~stall;
+
 endmodule
--- a/hw/rtl/fp_cores/VX_fp_fpga.v
+++ b/hw/rtl/fp_cores/VX_fp_fpga.v
@ -1,5 +1,4 @@
 `include "VX_define.vh"
-`include "dspba_library_ver.sv"

 module VX_fp_fpga #( 
    parameter TAGW = 1
@ -28,7 +27,7 @@ module VX_fp_fpga #(
    input wire  ready_out,
    output wire valid_out
 );
-    localparam NUM_FPC  = 12;
+    localparam NUM_FPC  = 7;
    localparam FPC_BITS = `LOG2UP(NUM_FPC);
    
    wire [NUM_FPC-1:0] per_core_ready_in;
@ -41,26 +40,30 @@ module VX_fp_fpga #(
    fflags_t [`NUM_THREADS-1:0] fpnew_fflags;  

    reg [FPC_BITS-1:0] core_select;
-    reg fmadd_negate;
+    reg do_add, do_sub, do_mul;
+    reg is_signed;

    always @(*) begin
-        core_select  = 0;
-        fmadd_negate = 0;
+        core_select = 'x;
+        do_add      = 'x;
+        do_sub      = 'x;
+        do_mul      = 'x;
+        is_signed   = 'x;
        case (op_type)
-            `FPU_ADD:    core_select = 1;
-            `FPU_SUB:    core_select = 2;
-            `FPU_MUL:    core_select = 3;
-            `FPU_MADD:   core_select = 4;
-            `FPU_MSUB:   core_select = 5;
-            `FPU_NMSUB:  begin core_select = 4; fmadd_negate = 1; end
-            `FPU_NMADD:  begin core_select = 5; fmadd_negate = 1; end           
-            `FPU_DIV:    core_select = 6;
-            `FPU_SQRT:   core_select = 7;
-            `FPU_CVTWS:  core_select = 8;
-            `FPU_CVTWUS: core_select = 9;
-            `FPU_CVTSW:  core_select = 10;
-            `FPU_CVTSWU: core_select = 11;
-            default:;
+            `FPU_ADD:    begin core_select = 1; do_mul = 0; do_add = 1; do_sub = 0; end
+            `FPU_SUB:    begin core_select = 1; do_mul = 0; do_add = 0; do_sub = 1; end
+            `FPU_MUL:    begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 0; end
+            `FPU_MADD:   begin core_select = 1; do_mul = 1; do_add = 1; do_sub = 0; end
+            `FPU_MSUB:   begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 1; end
+            `FPU_NMSUB:  begin core_select = 2; do_sub = 1; end
+            `FPU_NMADD:  begin core_select = 2; do_sub = 0; end           
+            `FPU_DIV:    begin core_select = 3; end
+            `FPU_SQRT:   begin core_select = 4; end
+            `FPU_CVTWS:  begin core_select = 5; is_signed = 1; end
+            `FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
+            `FPU_CVTSW:  begin core_select = 6; is_signed = 1; end
+            `FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
+            default:     begin core_select = 0; end
        endcase
    end

@ -76,7 +79,7 @@ module VX_fp_fpga #(
        .op_type    (op_type),
        .frm        (frm),
        .dataa      (dataa),
-        .datab      (datab),
+        .datab      (datab),        
        .result     (per_core_result[0]), 
        .has_fflags (fpnew_has_fflags),
        .fflags     (fpnew_fflags),
@ -85,44 +88,50 @@ module VX_fp_fpga #(
        .valid_out  (per_core_valid_out[0])
    );
    
-    VX_fp_add #(
+    VX_fp_madd #(
        .TAGW (TAGW),
        .LANES(`NUM_THREADS)
-    ) fp_add (
+    ) fp_madd (
        .clk        (clk), 
        .reset      (reset),   
        .valid_in   (valid_in && (core_select == 1)),
        .ready_in   (per_core_ready_in[1]),    
        .tag_in     (tag_in),    
+        .do_add     (do_add),
+        .do_sub     (do_sub),
+        .do_mul     (do_mul),
        .dataa      (dataa), 
-        .datab      (datab),         
+        .datab      (datab),      
+        .datac      (datac),   
        .result     (per_core_result[1]),
        .tag_out    (per_core_tag_out[1]),
        .ready_out  (per_core_ready_out[1]),
        .valid_out  (per_core_valid_out[1])
    );

-    VX_fp_sub #(
+    VX_fp_nmadd #(
        .TAGW (TAGW),
        .LANES(`NUM_THREADS)
-    ) fp_sub (
+    ) fp_nmadd (
        .clk        (clk), 
        .reset      (reset),   
        .valid_in   (valid_in && (core_select == 2)),
        .ready_in   (per_core_ready_in[2]),    
-        .tag_in     (tag_in),    
+        .tag_in     (tag_in),  
+        .do_sub     (do_sub),
        .dataa      (dataa), 
-        .datab      (datab),         
+        .datab      (datab),   
+        .datac      (datac),              
        .result     (per_core_result[2]),
        .tag_out    (per_core_tag_out[2]),
        .ready_out  (per_core_ready_out[2]),
        .valid_out  (per_core_valid_out[2])
    );

-    VX_fp_mul #(
+    VX_fp_div #(
        .TAGW (TAGW),
        .LANES(`NUM_THREADS)
-    ) fp_mul (
+    ) fp_div (
        .clk        (clk), 
        .reset      (reset),   
        .valid_in   (valid_in && (core_select == 3)),
@ -136,75 +145,20 @@ module VX_fp_fpga #(
        .valid_out  (per_core_valid_out[3])
    );

-    VX_fp_madd #(
-        .TAGW (TAGW),
-        .LANES(`NUM_THREADS)
-    ) fp_madd (
-        .clk        (clk), 
-        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 4)),
-        .ready_in   (per_core_ready_in[4]),    
-        .tag_in     (tag_in),    
-        .negate     (fmadd_negate),
-        .dataa      (dataa), 
-        .datab      (datab),         
-        .datac      (datac),        
-        .result     (per_core_result[4]),
-        .tag_out    (per_core_tag_out[4]),
-        .ready_out  (per_core_ready_out[4]),
-        .valid_out  (per_core_valid_out[4])
-    );
-
-    VX_fp_msub #(
-        .TAGW (TAGW),
-        .LANES(`NUM_THREADS)
-    ) fp_msub (
-        .clk        (clk), 
-        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 5)),
-        .ready_in   (per_core_ready_in[5]),    
-        .tag_in     (tag_in),    
-        .negate     (fmadd_negate),
-        .dataa      (dataa), 
-        .datab      (datab),   
-        .datac      (datac),              
-        .result     (per_core_result[5]),
-        .tag_out    (per_core_tag_out[5]),
-        .ready_out  (per_core_ready_out[5]),
-        .valid_out  (per_core_valid_out[5])
-    );
-
-    VX_fp_div #(
-        .TAGW (TAGW),
-        .LANES(`NUM_THREADS)
-    ) fp_div (
-        .clk        (clk), 
-        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 6)),
-        .ready_in   (per_core_ready_in[6]),    
-        .tag_in     (tag_in),    
-        .dataa      (dataa), 
-        .datab      (datab),         
-        .result     (per_core_result[6]),
-        .tag_out    (per_core_tag_out[6]),
-        .ready_out  (per_core_ready_out[6]),
-        .valid_out  (per_core_valid_out[6])
-    );
-
    VX_fp_sqrt #(
        .TAGW (TAGW),
        .LANES(`NUM_THREADS)
    ) fp_sqrt (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 7)),
-        .ready_in   (per_core_ready_in[7]),    
+        .valid_in   (valid_in && (core_select == 4)),
+        .ready_in   (per_core_ready_in[4]),    
        .tag_in     (tag_in),    
        .dataa      (dataa),  
-        .result     (per_core_result[7]),
-        .tag_out    (per_core_tag_out[7]),
-        .ready_out  (per_core_ready_out[7]),
-        .valid_out  (per_core_valid_out[7])
+        .result     (per_core_result[4]),
+        .tag_out    (per_core_tag_out[4]),
+        .ready_out  (per_core_ready_out[4]),
+        .valid_out  (per_core_valid_out[4])
    );

    VX_fp_ftoi #(
@ -213,30 +167,15 @@ module VX_fp_fpga #(
    ) fp_ftoi (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 8)),
-        .ready_in   (per_core_ready_in[8]),    
-        .tag_in     (tag_in),    
+        .valid_in   (valid_in && (core_select == 5)),
+        .ready_in   (per_core_ready_in[5]),    
+        .tag_in     (tag_in), 
+        .is_signed  (is_signed),   
        .dataa      (dataa),  
-        .result     (per_core_result[8]),
-        .tag_out    (per_core_tag_out[8]),
-        .ready_out  (per_core_ready_out[8]),
-        .valid_out  (per_core_valid_out[8])
-    );
-
-    VX_fp_ftou #(
-        .TAGW (TAGW),
-        .LANES(`NUM_THREADS)
-    ) fp_ftou (
-        .clk        (clk), 
-        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 9)),
-        .ready_in   (per_core_ready_in[9]),    
-        .tag_in     (tag_in),    
-        .dataa      (dataa),  
-        .result     (per_core_result[9]),
-        .tag_out    (per_core_tag_out[9]),
-        .ready_out  (per_core_ready_out[9]),
-        .valid_out  (per_core_valid_out[9])
+        .result     (per_core_result[5]),
+        .tag_out    (per_core_tag_out[5]),
+        .ready_out  (per_core_ready_out[5]),
+        .valid_out  (per_core_valid_out[5])
    );

    VX_fp_itof #(
@ -245,60 +184,45 @@ module VX_fp_fpga #(
    ) fp_itof (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 10)),
-        .ready_in   (per_core_ready_in[10]),    
-        .tag_in     (tag_in),    
+        .valid_in   (valid_in && (core_select == 6)),
+        .ready_in   (per_core_ready_in[6]),    
+        .tag_in     (tag_in), 
+        .is_signed  (is_signed),      
        .dataa      (dataa),  
-        .result     (per_core_result[10]),
-        .tag_out    (per_core_tag_out[10]),
-        .ready_out  (per_core_ready_out[10]),
-        .valid_out  (per_core_valid_out[10])
+        .result     (per_core_result[6]),
+        .tag_out    (per_core_tag_out[6]),
+        .ready_out  (per_core_ready_out[6]),
+        .valid_out  (per_core_valid_out[6])
    );

-    VX_fp_utof #(
-        .TAGW (TAGW),
-        .LANES(`NUM_THREADS)
-    ) fp_utof (
-        .clk        (clk), 
-        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 11)),
-        .ready_in   (per_core_ready_in[11]),    
-        .tag_in     (tag_in),    
-        .dataa      (dataa),  
-        .result     (per_core_result[11]),
-        .tag_out    (per_core_tag_out[11]),
-        .ready_out  (per_core_ready_out[11]),
-        .valid_out  (per_core_valid_out[11])
-    );
-
-    reg valid_out_r;
-    reg has_fflags_r;
-    reg [`NUM_THREADS-1:0][31:0] result_r;
-    reg [TAGW-1:0] tag_out_r;
+    reg valid_out_n;
+    reg has_fflags_n;
+    reg [`NUM_THREADS-1:0][31:0] result_n;
+    reg [TAGW-1:0] tag_out_n;

    always @(*) begin
        per_core_ready_out = 0;
-        valid_out_r        = 0;
-        has_fflags_r       = 'x;
-        result_r           = 'x;
-        tag_out_r          = 'x;
+        valid_out_n        = 0;
+        has_fflags_n       = 'x;
+        result_n           = 'x;
+        tag_out_n          = 'x;
        for (integer i = 0; i < NUM_FPC; i++) begin
            if (per_core_valid_out[i]) begin
                per_core_ready_out[i] = ready_out;
-                valid_out_r  = 1;
-                has_fflags_r = fpnew_has_fflags && (i == 0);
-                result_r     = per_core_result[i];
-                tag_out_r    = per_core_tag_out[i];
+                valid_out_n  = 1;
+                has_fflags_n = fpnew_has_fflags && (i == 0);
+                result_n     = per_core_result[i];
+                tag_out_n    = per_core_tag_out[i];
                break;
            end
        end
    end

    assign ready_in   = (& per_core_ready_in);
-    assign valid_out  = valid_out_r;
-    assign has_fflags = has_fflags_r;
-    assign tag_out    = tag_out_r;
-    assign result     = result_r;    
+    assign valid_out  = valid_out_n;
+    assign has_fflags = has_fflags_n;
+    assign tag_out    = tag_out_n;
+    assign result     = result_n;    
    assign fflags     = fpnew_fflags;

 endmodule
--- a/hw/rtl/fp_cores/VX_fp_ftoi.v
+++ b/hw/rtl/fp_cores/VX_fp_ftoi.v
@ -0,0 +1,77 @@
+`include "VX_define.vh"
+
+`ifndef SYNTHESIS
+`include "float_dpi.vh"
+`endif
+
+module VX_fp_ftoi #( // wraps the ftoi/ftou cores (QUARTUS) or their DPI stand-ins; is_signed picks which result is returned
+    parameter TAGW = 1,   // width of the tag carried alongside each request
+    parameter LANES = 1   // number of 32-bit lanes processed in parallel
+) (
+    input wire clk,
+    input wire reset,   
+
+    output wire ready_in,
+    input wire  valid_in,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire  is_signed,
+
+    input wire [LANES-1:0][31:0]  dataa,
+    output wire [LANES-1:0][31:0] result, 
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);    
+    wire stall = ~ready_out && valid_out; // a valid result is pending but the consumer is not ready
+
+    reg is_signed_r; // is_signed as it comes out of shift_reg, next to tag_out/valid_out
+    
+    for (genvar i = 0; i < LANES; i++) begin
+        // Both variants are computed for every lane; one is selected below.
+        wire [31:0] result_s;
+        wire [31:0] result_u;
+
+    `ifdef QUARTUS       
+        acl_fp_ftoi ftoi (
+            .clk    (clk),
+            .areset (1'b0),
+            .en     (~stall),
+            .a      (dataa[i]),
+            .q      (result_s)
+        );
+
+        acl_fp_ftou ftou (
+            .clk    (clk),
+            .areset (1'b0),
+            .en     (~stall),
+            .a      (dataa[i]),
+            .q      (result_u)
+        );        
+    `else
+        // NOTE(review): result_s/result_u are wires written through DPI output arguments in a procedural block - confirm the simulator accepts this.
+        always @(posedge clk) begin
+           dpi_ftoi(clk, ~stall, dataa[i], result_s);
+           dpi_ftou(clk, ~stall, dataa[i], result_u);
+        end
+    `endif
+
+        assign result[i] = is_signed_r ? result_s : result_u;
+    end
+
+    VX_shift_register #(
+        .DATAW(TAGW + 1 + 1),    // tag + valid + is_signed
+        .DEPTH(`LATENCY_FTOI)
+    ) shift_reg (
+        .clk(clk),
+        .reset(reset),
+        .enable(~stall),
+        .in ({tag_in,  valid_in,  is_signed}),
+        .out({tag_out, valid_out, is_signed_r})
+    );
+
+    assign ready_in = ~stall; // accept a new request whenever the output is not stalled
+
+endmodule
--- a/hw/rtl/fp_cores/VX_fp_itof.v
+++ b/hw/rtl/fp_cores/VX_fp_itof.v
@ -0,0 +1,77 @@
+`include "VX_define.vh"
+
+`ifndef SYNTHESIS
+`include "float_dpi.vh"
+`endif
+
+module VX_fp_itof #( // wraps the itof/utof cores (QUARTUS) or their DPI stand-ins; is_signed picks which result is returned
+    parameter TAGW = 1,   // width of the tag carried alongside each request
+    parameter LANES = 1   // number of 32-bit lanes processed in parallel
+) (
+    input wire clk,
+    input wire reset,   
+
+    output wire ready_in,
+    input wire  valid_in,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire  is_signed,
+
+    input wire [LANES-1:0][31:0]  dataa,
+    output wire [LANES-1:0][31:0] result, 
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);    
+    wire stall = ~ready_out && valid_out; // a valid result is pending but the consumer is not ready
+
+    reg is_signed_r; // is_signed as it comes out of shift_reg, next to tag_out/valid_out
+
+    for (genvar i = 0; i < LANES; i++) begin
+        // Both variants are computed for every lane; one is selected below.
+        wire [31:0] result_s;
+        wire [31:0] result_u;
+
+    `ifdef QUARTUS
+        acl_fp_itof itof (
+            .clk    (clk),
+            .areset (1'b0),
+            .en     (~stall),
+            .a      (dataa[i]),
+            .q      (result_s)
+        );
+
+        acl_fp_utof utof (
+            .clk    (clk),
+            .areset (1'b0),
+            .en     (~stall),
+            .a      (dataa[i]),
+            .q      (result_u)
+        );
+    `else
+        always @(posedge clk) begin
+           dpi_itof(clk, ~stall, dataa[i], result_s);
+           dpi_utof(clk, ~stall, dataa[i], result_u);
+        end
+    `endif
+
+        assign result[i] = is_signed_r ? result_s : result_u;
+    end
+
+    // The depth must use this unit's own latency macro (LATENCY_ITOF), not the float-to-int one.
+    VX_shift_register #(
+        .DATAW(TAGW + 1 + 1),    // tag + valid + is_signed
+        .DEPTH(`LATENCY_ITOF)
+    ) shift_reg (
+        .clk(clk),
+        .reset(reset),
+        .enable(~stall),
+        .in ({tag_in,  valid_in,  is_signed}),
+        .out({tag_out, valid_out, is_signed_r})
+    );
+
+    assign ready_in = ~stall; // accept a new request whenever the output is not stalled
+
+endmodule
--- a/hw/rtl/fp_cores/VX_fp_madd.v
+++ b/hw/rtl/fp_cores/VX_fp_madd.v
@ -0,0 +1,291 @@
+`include "VX_define.vh"
+
+`ifndef SYNTHESIS
+`include "float_dpi.vh"
+`endif
+
+module VX_fp_madd #( // per lane, computes add/sub/mul/madd/msub candidates and forwards the one chosen by do_add/do_sub/do_mul
+    parameter TAGW = 1,   // width of the tag carried alongside each request
+    parameter LANES = 1   // number of 32-bit lanes processed in parallel
+) (
+    input wire clk,
+    input wire reset,   
+
+    output wire ready_in,
+    input wire  valid_in,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire  do_add,
+    input wire  do_sub,
+    input wire  do_mul,    
+
+    input wire [LANES-1:0][31:0]  dataa,
+    input wire [LANES-1:0][31:0]  datab,
+    input wire [LANES-1:0][31:0]  datac,
+    output wire [LANES-1:0][31:0] result, 
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);    
+    // Stall while a valid result is presented but the consumer is not ready.
+    wire stall = ~ready_out && valid_out;
+    // Operation flags as they come out of shift_reg1, next to tag_out/valid_out.
+    reg do_add_r, do_sub_r, do_mul_r;
+
+    for (genvar i = 0; i < LANES; i++) begin
+        // Candidate results for this lane; exactly one is forwarded by the mux below.
+        wire [31:0] result_add;
+        wire [31:0] result_sub;
+        wire [31:0] result_mul;
+        wire [31:0] result_madd;
+        wire [31:0] result_msub;
+        // NOTE(review): mac_fp_add/mac_fp_sub/mac_fp_mul leave .ax() open and drive only .ay/.az - confirm against the twentynm_fp_mac operand roles for "sp_add"/"sp_mult".
+    `ifdef QUARTUS
+        twentynm_fp_mac mac_fp_add (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(),
+            .ay(datab[i]),
+            .az(dataa[i]),
+            .clk({2'b00,clk}),
+            .ena({2'b11,~stall}),
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_add),
+            .chainout()
+        );
+        defparam mac_fp_add.operation_mode = "sp_add"; 
+        defparam mac_fp_add.use_chainin = "false"; 
+        defparam mac_fp_add.adder_subtract = "false"; 
+        defparam mac_fp_add.ax_clock = "0"; 
+        defparam mac_fp_add.ay_clock = "0"; 
+        defparam mac_fp_add.az_clock = "0"; 
+        defparam mac_fp_add.output_clock = "0"; 
+        defparam mac_fp_add.accumulate_clock = "none"; 
+        defparam mac_fp_add.ax_chainin_pl_clock = "0"; 
+        defparam mac_fp_add.accum_pipeline_clock = "none"; 
+        defparam mac_fp_add.mult_pipeline_clock = "0"; 
+        defparam mac_fp_add.adder_input_clock = "0"; 
+        defparam mac_fp_add.accum_adder_clock = "none"; 
+
+        twentynm_fp_mac mac_fp_sub (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(),
+            .ay(datab[i]),
+            .az(dataa[i]),
+            .clk({2'b00,clk}),
+            .ena({2'b11,~stall}),
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_sub),
+            .chainout()
+        );
+        defparam mac_fp_sub.operation_mode = "sp_add"; 
+        defparam mac_fp_sub.use_chainin = "false"; 
+        defparam mac_fp_sub.adder_subtract = "true"; 
+        defparam mac_fp_sub.ax_clock = "0"; 
+        defparam mac_fp_sub.ay_clock = "0"; 
+        defparam mac_fp_sub.az_clock = "none"; 
+        defparam mac_fp_sub.output_clock = "0"; 
+        defparam mac_fp_sub.accumulate_clock = "none"; 
+        defparam mac_fp_sub.ax_chainin_pl_clock = "none"; 
+        defparam mac_fp_sub.accum_pipeline_clock = "none"; 
+        defparam mac_fp_sub.mult_pipeline_clock = "none"; 
+        defparam mac_fp_sub.adder_input_clock = "0"; 
+        defparam mac_fp_sub.accum_adder_clock = "none";
+
+        twentynm_fp_mac mac_fp_mul (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(),
+            .ay(datab[i]),
+            .az(dataa[i]),
+            .clk({2'b00,clk}),
+            .ena({2'b11,~stall}),
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_mul),
+            .chainout()
+        );
+        defparam mac_fp_mul.operation_mode = "sp_mult"; 
+        defparam mac_fp_mul.use_chainin = "false"; 
+        defparam mac_fp_mul.adder_subtract = "false"; 
+        defparam mac_fp_mul.ax_clock = "none"; 
+        defparam mac_fp_mul.ay_clock = "0"; 
+        defparam mac_fp_mul.az_clock = "0"; 
+        defparam mac_fp_mul.output_clock = "0"; 
+        defparam mac_fp_mul.accumulate_clock = "none"; 
+        defparam mac_fp_mul.ax_chainin_pl_clock = "none"; 
+        defparam mac_fp_mul.accum_pipeline_clock = "none"; 
+        defparam mac_fp_mul.mult_pipeline_clock = "0"; 
+        defparam mac_fp_mul.adder_input_clock = "none"; 
+        defparam mac_fp_mul.accum_adder_clock = "none"; 
+
+        twentynm_fp_mac mac_fp_madd (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(datac[i]),
+            .ay(datab[i]),
+            .az(dataa[i]),
+            .clk({2'b00,clk}),
+            .ena({2'b11,~stall}),
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_madd),
+            .chainout()
+        );
+        defparam mac_fp_madd.operation_mode = "sp_mult_add"; 
+        defparam mac_fp_madd.use_chainin = "false"; 
+        defparam mac_fp_madd.adder_subtract = "false"; 
+        defparam mac_fp_madd.ax_clock = "0"; 
+        defparam mac_fp_madd.ay_clock = "0"; 
+        defparam mac_fp_madd.az_clock = "0"; 
+        defparam mac_fp_madd.output_clock = "0"; 
+        defparam mac_fp_madd.accumulate_clock = "none"; 
+        defparam mac_fp_madd.ax_chainin_pl_clock = "0"; 
+        defparam mac_fp_madd.accum_pipeline_clock = "none"; 
+        defparam mac_fp_madd.mult_pipeline_clock = "0"; 
+        defparam mac_fp_madd.adder_input_clock = "0"; 
+        defparam mac_fp_madd.accum_adder_clock = "none"; 
+
+        twentynm_fp_mac mac_fp_msub (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(datac[i]),
+            .ay(datab[i]),
+            .az(dataa[i]),
+            .clk({2'b00,clk}),
+            .ena({2'b11,~stall}),
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_msub),
+            .chainout()
+        );
+        defparam mac_fp_msub.operation_mode = "sp_mult_add"; 
+        defparam mac_fp_msub.use_chainin = "false"; 
+        defparam mac_fp_msub.adder_subtract = "true"; 
+        defparam mac_fp_msub.ax_clock = "0"; 
+        defparam mac_fp_msub.ay_clock = "0"; 
+        defparam mac_fp_msub.az_clock = "0"; 
+        defparam mac_fp_msub.output_clock = "0"; 
+        defparam mac_fp_msub.accumulate_clock = "none"; 
+        defparam mac_fp_msub.ax_chainin_pl_clock = "0"; 
+        defparam mac_fp_msub.accum_pipeline_clock = "none"; 
+        defparam mac_fp_msub.mult_pipeline_clock = "0"; 
+        defparam mac_fp_msub.adder_input_clock = "0"; 
+        defparam mac_fp_msub.accum_adder_clock = "none";
+    `else
+        always @(posedge clk) begin
+           dpi_fadd(clk, ~stall, dataa[i], datab[i], result_add);
+           dpi_fsub(clk, ~stall, dataa[i], datab[i], result_sub);
+           dpi_fmul(clk, ~stall, dataa[i], datab[i], result_mul);
+           dpi_fmadd(clk, ~stall, dataa[i], datab[i], datac[i], result_madd);
+           dpi_fmsub(clk, ~stall, dataa[i], datab[i], datac[i], result_msub);
+        end
+    `endif
+        // Pick the candidate matching the delayed flags; result_r keeps 'x if no branch assigns it.
+        reg [31:0] result_r;        
+
+        always @(*) begin
+            result_r = 'x;
+            if (do_mul_r) begin
+                if (do_add_r)
+                    result_r = result_madd;
+                else if (do_sub_r)
+                    result_r = result_msub;
+                else
+                    result_r = result_mul;
+            end else begin
+                if (do_add_r)
+                    result_r = result_add;
+                else if (do_sub_r)
+                    result_r = result_sub;
+            end            
+        end
+
+        assign result[i] = result_r;
+    end
+    // Carries tag, valid and the three operation flags; DEPTH is `LATENCY_FMADD and it advances only when not stalled.
+    VX_shift_register #(
+        .DATAW(TAGW + 1 + 1 + 1 + 1),
+        .DEPTH(`LATENCY_FMADD)
+    ) shift_reg1 (
+        .clk(clk),
+        .reset(reset),
+        .enable(~stall),
+        .in({tag_in,   valid_in,  do_add,   do_sub,   do_mul}),
+        .out({tag_out, valid_out, do_add_r, do_sub_r, do_mul_r})
+    );
+
+    assign ready_in  = ~stall; // accept a new request whenever the output is not stalled
+
+endmodule
--- a/hw/rtl/fp_cores/VX_fp_nmadd.v
+++ b/hw/rtl/fp_cores/VX_fp_nmadd.v
@ -0,0 +1,191 @@
+`include "VX_define.vh"
+
+`ifndef SYNTHESIS
+`include "float_dpi.vh"
+`endif
+// Per-lane two-stage unit: stage 0 evaluates fmadd and fmsub, stage 1 subtracts the selected one against a zero operand.
+module VX_fp_nmadd #(
+    parameter TAGW = 1,                     // width of the tag carried alongside each request
+    parameter LANES = 1                     // number of independent 32-bit lanes
+) (
+    input wire clk,
+    input wire reset,
+
+    output wire ready_in,                   // asserted whenever the pipeline is not stalled
+    input wire  valid_in,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire  do_sub,                     // once registered, selects the fmsub result over the fmadd result
+
+    input wire [LANES-1:0][31:0]  dataa,
+    input wire [LANES-1:0][31:0]  datab,
+    input wire [LANES-1:0][31:0]  datac,
+    output wire [LANES-1:0][31:0] result,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+
+    wire stall = ~ready_out && valid_out;   // a valid output is pending but the consumer is not ready
+
+    reg do_sub_r;                           // do_sub captured on every non-stalled clock edge
+
+    for (genvar i = 0; i < LANES; i++) begin
+
+        wire [31:0] result_madd;
+        wire [31:0] result_msub;
+
+        wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd; // stage-0 value forwarded to the final stage
+
+    `ifdef QUARTUS
+        twentynm_fp_mac mac_fp_madd (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(datac[i]),
+            .ay(datab[i]),
+            .az(dataa[i]),
+            .clk({2'b00,clk}),
+            .ena({2'b11,~stall}),           // same enable as the DPI fallback and shift_reg1
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_madd),
+            .chainout()
+        );
+        defparam mac_fp_madd.operation_mode = "sp_mult_add";
+        defparam mac_fp_madd.use_chainin = "false";
+        defparam mac_fp_madd.adder_subtract = "false";
+        defparam mac_fp_madd.ax_clock = "0";
+        defparam mac_fp_madd.ay_clock = "0";
+        defparam mac_fp_madd.az_clock = "0";
+        defparam mac_fp_madd.output_clock = "0";
+        defparam mac_fp_madd.accumulate_clock = "none";
+        defparam mac_fp_madd.ax_chainin_pl_clock = "0";
+        defparam mac_fp_madd.accum_pipeline_clock = "none";
+        defparam mac_fp_madd.mult_pipeline_clock = "0";
+        defparam mac_fp_madd.adder_input_clock = "0";
+        defparam mac_fp_madd.accum_adder_clock = "none";
+
+        twentynm_fp_mac mac_fp_msub (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(datac[i]),
+            .ay(datab[i]),
+            .az(dataa[i]),
+            .clk({2'b00,clk}),
+            .ena({2'b11,~stall}),           // was the undeclared net enable0
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_msub),
+            .chainout()
+        );
+        defparam mac_fp_msub.operation_mode = "sp_mult_add";
+        defparam mac_fp_msub.use_chainin = "false";
+        defparam mac_fp_msub.adder_subtract = "true";
+        defparam mac_fp_msub.ax_clock = "0";
+        defparam mac_fp_msub.ay_clock = "0";
+        defparam mac_fp_msub.az_clock = "0";
+        defparam mac_fp_msub.output_clock = "0";
+        defparam mac_fp_msub.accumulate_clock = "none";
+        defparam mac_fp_msub.ax_chainin_pl_clock = "0";
+        defparam mac_fp_msub.accum_pipeline_clock = "none";
+        defparam mac_fp_msub.mult_pipeline_clock = "0";
+        defparam mac_fp_msub.adder_input_clock = "0";
+        defparam mac_fp_msub.accum_adder_clock = "none";
+
+        twentynm_fp_mac mac_fp_neg (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(32'h0),
+            .ay(result_st0),
+            .az(),
+            .clk({2'b00,clk}),
+            .ena({2'b11,~stall}),           // was the undeclared net enable1
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result[i]),
+            .chainout()
+        );
+        defparam mac_fp_neg.operation_mode = "sp_add";
+        defparam mac_fp_neg.use_chainin = "false";
+        defparam mac_fp_neg.adder_subtract = "true";
+        defparam mac_fp_neg.ax_clock = "0";
+        defparam mac_fp_neg.ay_clock = "0";
+        defparam mac_fp_neg.az_clock = "none";
+        defparam mac_fp_neg.output_clock = "0";
+        defparam mac_fp_neg.accumulate_clock = "none";
+        defparam mac_fp_neg.ax_chainin_pl_clock = "none";
+        defparam mac_fp_neg.accum_pipeline_clock = "none";
+        defparam mac_fp_neg.mult_pipeline_clock = "none";
+        defparam mac_fp_neg.adder_input_clock = "0";
+        defparam mac_fp_neg.accum_adder_clock = "none";
+    `else
+        always @(posedge clk) begin
+           dpi_fmadd(clk, ~stall, dataa[i], datab[i], datac[i], result_madd);
+           dpi_fmsub(clk, ~stall, dataa[i], datab[i], datac[i], result_msub);
+           dpi_fsub(clk, ~stall, 32'b0, result_st0, result[i]); // NOTE(review): presumably 0 - result_st0, i.e. negation - confirm in float_dpi
+        end
+    `endif
+    end
+
+    always @(posedge clk) begin
+        if (~stall) begin
+            do_sub_r <= do_sub;
+        end
+    end
+
+    VX_shift_register #(
+        .DATAW(TAGW + 1),
+        .DEPTH(`LATENCY_FNMADD)             // tag/valid delay; presumably matches the two compute stages - confirm
+    ) shift_reg1 (
+        .clk(clk),
+        .reset(reset),
+        .enable(~stall),
+        .in({tag_in,   valid_in}),
+        .out({tag_out, valid_out})
+    );
+
+    assign ready_in  = ~stall;
+
+endmodule
--- a/hw/rtl/fp_cores/VX_fp_noncomp.v
+++ b/hw/rtl/fp_cores/VX_fp_noncomp.v
@ -45,8 +45,8 @@ module VX_fp_noncomp #(
    reg [LANES-1:0][31:0]  datab_r;

    reg [LANES-1:0]       a_sign, b_sign;
-    reg [LANES-1:0][7:0]  a_exponent, b_exponent;
-    reg [LANES-1:0][22:0] a_mantissa, b_mantissa;
+    reg [LANES-1:0][7:0]  a_exponent;
+    reg [LANES-1:0][22:0] a_mantissa;
    fp_type_t [LANES-1:0]  a_type, b_type;
    reg [LANES-1:0] a_smaller, ab_equal;

@ -60,12 +60,12 @@ module VX_fp_noncomp #(

    // Setup
    for (genvar i = 0; i < LANES; i++) begin
-        wire tmp_a_sign            = dataa[i][31]; 
-        wire [7:0] tmp_a_exponent  = dataa[i][30:23];
+        wire            tmp_a_sign = dataa[i][31]; 
+        wire [7:0]  tmp_a_exponent = dataa[i][30:23];
        wire [22:0] tmp_a_mantissa = dataa[i][22:0];

-        wire tmp_b_sign            = datab[i][31]; 
-        wire [7:0] tmp_b_exponent  = datab[i][30:23];
+        wire            tmp_b_sign = datab[i][31]; 
+        wire [7:0]  tmp_b_exponent = datab[i][30:23];
        wire [22:0] tmp_b_mantissa = datab[i][22:0];

        fp_type_t tmp_a_type, tmp_b_type;
@ -86,14 +86,14 @@ module VX_fp_noncomp #(
        wire tmp_ab_equal  = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]);

        VX_generic_register #(
-            .N(1 + 1 + 8 + 8 + 23 + 23 + $bits(fp_type_t) + $bits(fp_type_t) + 1 + 1)
+            .N(1 + 1 + 8 + 23 + $bits(fp_type_t) + $bits(fp_type_t) + 1 + 1)
        ) fnc1_reg (
            .clk   (clk),
            .reset (reset),
            .stall (stall),
            .flush (1'b0),
-            .in    ({tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_b_exponent, tmp_a_mantissa, tmp_b_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}),
-            .out   ({a_sign[i],  b_sign[i],  a_exponent[i],  b_exponent[i],  a_mantissa[i],  b_mantissa[i],  a_type[i],  b_type[i],  a_smaller[i],  ab_equal[i]})
+            .in    ({tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_a_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}),
+            .out   ({a_sign[i],  b_sign[i],  a_exponent[i],  a_mantissa[i],  a_type[i],  b_type[i],  a_smaller[i],  ab_equal[i]})
        );
    end  

@ -213,8 +213,6 @@ module VX_fp_noncomp #(

    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin
-            tmp_result[i] = 32'hdeadbeaf;
-            {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0;
            case (op_type_r)
                `FPU_CLASS: begin
                    tmp_result[i] = fclass_mask[i];
@ -224,7 +222,8 @@ module VX_fp_noncomp #(
                    tmp_result[i] = fcmp_res[i];
                    {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = fcmp_excp[i];
                end      
-                `FPU_MISC: begin
+                //`FPU_MISC:
+                default: begin
                    case (frm)
                        0,1,2:  begin
                            tmp_result[i] = fsgnj_res[i];
@ -234,7 +233,8 @@ module VX_fp_noncomp #(
                            tmp_result[i] = fminmax_res[i];
                            {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = {a_type[i][0] | b_type[i][0], 4'h0};    
                        end
-                        5,6: begin
+                        //5,6,7: 
+                        default: begin
                            tmp_result[i] = dataa[i];
                            {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0;    
                        end
--- a/hw/rtl/fp_cores/altera/VX_fp_sqrt.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_sqrt.v
@ -1,5 +1,9 @@
 `include "VX_define.vh"

+`ifndef SYNTHESIS
+`include "float_dpi.vh"
+`endif
+
 module VX_fp_sqrt #( 
    parameter TAGW = 1,
    parameter LANES = 1
@ -20,18 +24,22 @@ module VX_fp_sqrt #(
    input wire  ready_out,
    output wire valid_out
 );    
-    wire stall  = ~ready_out && valid_out;
-    wire enable = ~stall;
-    assign ready_in = enable;
-
+    wire stall = ~ready_out && valid_out;
+    
    for (genvar i = 0; i < LANES; i++) begin
+    `ifdef QUARTUS
        acl_fp_sqrt fsqrt (
            .clk    (clk),
            .areset (1'b0),
-            .en     (enable),
+            .en     (~stall),
            .a      (dataa[i]),
            .q      (result[i])
        );
+    `else
+        always @(posedge clk) begin
+           dpi_fsqrt(clk, ~stall, dataa[i], result[i]);
+        end
+    `endif
    end

    VX_shift_register #(
@ -40,9 +48,11 @@ module VX_fp_sqrt #(
    ) shift_reg (
        .clk(clk),
        .reset(reset),
-        .enable(enable),
+        .enable(~stall),
        .in ({tag_in,  valid_in}),
        .out({tag_out, valid_out})
    );

+    assign ready_in = ~stall;
+
 endmodule
--- a/hw/rtl/fp_cores/VX_fp_type.v
+++ b/hw/rtl/fp_cores/VX_fp_type.v
@ -8,12 +8,20 @@ module VX_fp_type (
    // outputs
    output fp_type_t o_type
 );
-    assign o_type.is_normal    = (exponent != 8'd0) && (exponent != 8'hff);
-    assign o_type.is_zero      = (exponent == 8'd0) && (mantissa == 23'd0);
-    assign o_type.is_subnormal = (exponent == 8'd0) && !o_type.is_zero;
-    assign o_type.is_inf       = ((exponent == 8'hff) && (mantissa == 23'd0));
-    assign o_type.is_nan       = ((exponent == 8'hff) && (mantissa != 23'd0));
-    assign o_type.is_signaling = o_type.is_nan && (mantissa[22] == 1'b0);
-    assign o_type.is_quiet     = o_type.is_nan && !o_type.is_signaling;
+    wire is_normal    = (exponent != 8'd0) && (exponent != 8'hff);
+    wire is_zero      = (exponent == 8'd0) && (mantissa == 23'd0);
+    wire is_subnormal = (exponent == 8'd0) && !is_zero;
+    wire is_inf       = (exponent == 8'hff) && (mantissa == 23'd0); 
+    wire is_nan       = (exponent == 8'hff) && (mantissa != 23'd0);
+    wire is_signaling = is_nan && (mantissa[22] == 1'b0);
+    wire is_quiet     = is_nan && !is_signaling;
+
+    assign o_type.is_normal    = is_normal;
+    assign o_type.is_zero      = is_zero;
+    assign o_type.is_subnormal = is_subnormal;
+    assign o_type.is_inf       = is_inf;
+    assign o_type.is_nan       = is_nan;
+    assign o_type.is_signaling = is_signaling;
+    assign o_type.is_quiet     = is_quiet;

 endmodule
--- a/hw/rtl/fp_cores/VX_fpnew.v
+++ b/hw/rtl/fp_cores/VX_fpnew.v
@ -53,10 +53,10 @@ module VX_fpnew #(
    };

    localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
-      PipeRegs:'{'{`LATENCY_FMULADD, 0, 0, 0, 0},   // ADDMUL
-                 '{default: `LATENCY_FDIVSQRT},     // DIVSQRT
-                 '{default: `LATENCY_FNONCOMP},     // NONCOMP
-                 '{default: `LATENCY_FCONV}},       // CONV
+      PipeRegs:'{'{`LATENCY_FMADD, 0, 0, 0, 0}, // ADDMUL
+                 '{default: `LATENCY_FDIV},     // DIVSQRT
+                 '{default: `LATENCY_FNONCOMP}, // NONCOMP
+                 '{default: `LATENCY_ITOF}},    // CONV
      UnitTypes:'{'{default: UNIT_FMULADD},     // ADDMUL
                  '{default: UNIT_FDIVSQRT},    // DIVSQRT
                  '{default: UNIT_FNONCOMP},    // NONCOMP
--- a/hw/rtl/fp_cores/altera/VX_fp_add.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_add.v
@ -1,81 +0,0 @@
-`include "VX_define.vh"
-
-module VX_fp_add #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    input wire [LANES-1:0][31:0]  datab,
-    output wire [LANES-1:0][31:0] result, 
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-);    
-    wire stall  = ~ready_out && valid_out;
-    wire enable = ~stall;
-    assign ready_in = enable;
-
-    for (genvar i = 0; i < LANES; i++) begin
-        twentynm_fp_mac mac_fp_wys (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(dataa[i]),
-            .ay(datab[i]),
-            .az(),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result[i]),
-            .chainout()
-        );
-        defparam mac_fp_wys.operation_mode = "sp_add"; 
-        defparam mac_fp_wys.use_chainin = "false"; 
-        defparam mac_fp_wys.adder_subtract = "false"; 
-        defparam mac_fp_wys.ax_clock = "0"; 
-        defparam mac_fp_wys.ay_clock = "0"; 
-        defparam mac_fp_wys.az_clock = "none"; 
-        defparam mac_fp_wys.output_clock = "0"; 
-        defparam mac_fp_wys.accumulate_clock = "none"; 
-        defparam mac_fp_wys.ax_chainin_pl_clock = "none"; 
-        defparam mac_fp_wys.accum_pipeline_clock = "none"; 
-        defparam mac_fp_wys.mult_pipeline_clock = "none"; 
-        defparam mac_fp_wys.adder_input_clock = "0"; 
-        defparam mac_fp_wys.accum_adder_clock = "none"; 
-    end
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(1)
-    ) shift_reg (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable),
-        .in ({tag_in,  valid_in}),
-        .out({tag_out, valid_out})
-    );
-
-endmodule
--- a/hw/rtl/fp_cores/altera/VX_fp_ftoi.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_ftoi.v
@ -1,48 +0,0 @@
-`include "VX_define.vh"
-
-module VX_fp_ftoi #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    output wire [LANES-1:0][31:0] result, 
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-);    
-    wire stall  = ~ready_out && valid_out;
-    wire enable = ~stall;
-    assign ready_in = enable;
-
-    for (genvar i = 0; i < LANES; i++) begin
-        acl_fp_ftoi ftoi (
-            .clk    (clk),
-            .areset (1'b0),
-            .en     (enable),
-            .a      (dataa[i]),
-            .q      (result[i])
-        );
-    end
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(`LATENCY_FTOI)
-    ) shift_reg (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable),
-        .in ({tag_in,  valid_in}),
-        .out({tag_out, valid_out})
-    );
-
-endmodule
--- a/hw/rtl/fp_cores/altera/VX_fp_ftou.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_ftou.v
@ -1,48 +0,0 @@
-`include "VX_define.vh"
-
-module VX_fp_ftou #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    output wire [LANES-1:0][31:0] result, 
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-);    
-    wire stall  = ~ready_out && valid_out;
-    wire enable = ~stall;
-    assign ready_in = enable;
-
-    for (genvar i = 0; i < LANES; i++) begin
-        acl_fp_ftou ftou (
-            .clk    (clk),
-            .areset (1'b0),
-            .en     (enable),
-            .a      (dataa[i]),
-            .q      (result[i])
-        );
-    end
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(`LATENCY_FTOU)
-    ) shift_reg (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable),
-        .in ({tag_in,  valid_in}),
-        .out({tag_out, valid_out})
-    );
-
-endmodule
--- a/hw/rtl/fp_cores/altera/VX_fp_itof.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_itof.v
@ -1,48 +0,0 @@
-`include "VX_define.vh"
-
-module VX_fp_itof #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    output wire [LANES-1:0][31:0] result, 
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-);    
-    wire stall  = ~ready_out && valid_out;
-    wire enable = ~stall;
-    assign ready_in = enable;
-
-    for (genvar i = 0; i < LANES; i++) begin
-        acl_fp_itof itof (
-            .clk    (clk),
-            .areset (1'b0),
-            .en     (enable),
-            .a      (dataa[i]),
-            .q      (result[i])
-        );
-    end
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(`LATENCY_ITOF)
-    ) shift_reg (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable),
-        .in ({tag_in,  valid_in}),
-        .out({tag_out, valid_out})
-    );
-
-endmodule
--- a/hw/rtl/fp_cores/altera/VX_fp_madd.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_madd.v
@ -1,146 +0,0 @@
-`include "VX_define.vh"
-
-module VX_fp_madd #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    input wire [LANES-1:0][31:0]  datab,
-    input wire [LANES-1:0][31:0]  datac,
-    output wire [LANES-1:0][31:0] result, 
-
-    input wire  negate,
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-);    
-    wire enable0, enable1;
-    assign ready_in = enable0 && enable1;
-
-    wire [LANES-1:0][31:0] result_st0, result_st1;
-    wire [TAGW-1:0] out_tag_st0, out_tag_st1;
-    wire in_valid_st0, out_valid_st0, out_valid_st1;
-
-    for (genvar i = 0; i < LANES; i++) begin
-        twentynm_fp_mac mac_fp_wys0 (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(datac[i]),
-            .ay(datab[i]),
-            .az(dataa[i]),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable0}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result_st0[i]),
-            .chainout()
-        );
-        defparam mac_fp_wys0.operation_mode = "sp_mult_add"; 
-        defparam mac_fp_wys0.use_chainin = "false"; 
-        defparam mac_fp_wys0.adder_subtract = "false"; 
-        defparam mac_fp_wys0.ax_clock = "0"; 
-        defparam mac_fp_wys0.ay_clock = "0"; 
-        defparam mac_fp_wys0.az_clock = "0"; 
-        defparam mac_fp_wys0.output_clock = "0"; 
-        defparam mac_fp_wys0.accumulate_clock = "none"; 
-        defparam mac_fp_wys0.ax_chainin_pl_clock = "0"; 
-        defparam mac_fp_wys0.accum_pipeline_clock = "none"; 
-        defparam mac_fp_wys0.mult_pipeline_clock = "0"; 
-        defparam mac_fp_wys0.adder_input_clock = "0"; 
-        defparam mac_fp_wys0.accum_adder_clock = "none"; 
-
-        twentynm_fp_mac mac_fp_wys1 (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(32'h0),
-            .ay(result_st0[i]),
-            .az(),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable1}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result_st1[i]),
-            .chainout()
-        );
-        defparam mac_fp_wys1.operation_mode = "sp_add"; 
-        defparam mac_fp_wys1.use_chainin = "false"; 
-        defparam mac_fp_wys1.adder_subtract = "true"; 
-        defparam mac_fp_wys1.ax_clock = "0"; 
-        defparam mac_fp_wys1.ay_clock = "0"; 
-        defparam mac_fp_wys1.az_clock = "none"; 
-        defparam mac_fp_wys1.output_clock = "0"; 
-        defparam mac_fp_wys1.accumulate_clock = "none"; 
-        defparam mac_fp_wys1.ax_chainin_pl_clock = "none"; 
-        defparam mac_fp_wys1.accum_pipeline_clock = "none"; 
-        defparam mac_fp_wys1.mult_pipeline_clock = "none"; 
-        defparam mac_fp_wys1.adder_input_clock = "0"; 
-        defparam mac_fp_wys1.accum_adder_clock = "none";
-    end
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1 + 1),
-        .DEPTH(1)
-    ) shift_reg0 (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable0),
-        .in ({tag_in,      (valid_in && ~negate), (valid_in && negate)}),
-        .out({out_tag_st0, out_valid_st0,         in_valid_st0})
-    );
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(1)
-    ) shift_reg1 (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable1),
-        .in({out_tag_st0,  in_valid_st0}),
-        .out({out_tag_st1, out_valid_st1})
-    );
-
-    wire out_stall = ~ready_out && valid_out;
-    assign enable0 = ~out_stall;
-    assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1); // stall the negate stage if dual outputs
-
-    assign result    = out_valid_st0 ? result_st0  : result_st1;
-    assign tag_out   = out_valid_st0 ? out_tag_st0 : out_tag_st1;
-    assign valid_out = out_valid_st0 || out_valid_st1; 
-
-endmodule
--- a/hw/rtl/fp_cores/altera/VX_fp_msub.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_msub.v
@ -1,146 +0,0 @@
-`include "VX_define.vh"
-
-module VX_fp_msub #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    input wire [LANES-1:0][31:0]  datab,
-    input wire [LANES-1:0][31:0]  datac,
-    output wire [LANES-1:0][31:0] result, 
-
-    input wire  negate,
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-);    
-    wire enable0, enable1;
-    assign ready_in = enable0 && enable1;
-
-    wire [LANES-1:0][31:0] result_st0, result_st1;
-    wire [TAGW-1:0] out_tag_st0, out_tag_st1;
-    wire in_valid_st0, out_valid_st0, out_valid_st1;
-
-    for (genvar i = 0; i < LANES; i++) begin
-        twentynm_fp_mac mac_fp_wys0 (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(datac[i]),
-            .ay(datab[i]),
-            .az(dataa[i]),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable0}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result_st0[i]),
-            .chainout()
-        );
-        defparam mac_fp_wys0.operation_mode = "sp_mult_add"; 
-        defparam mac_fp_wys0.use_chainin = "false"; 
-        defparam mac_fp_wys0.adder_subtract = "true"; 
-        defparam mac_fp_wys0.ax_clock = "0"; 
-        defparam mac_fp_wys0.ay_clock = "0"; 
-        defparam mac_fp_wys0.az_clock = "0"; 
-        defparam mac_fp_wys0.output_clock = "0"; 
-        defparam mac_fp_wys0.accumulate_clock = "none"; 
-        defparam mac_fp_wys0.ax_chainin_pl_clock = "0"; 
-        defparam mac_fp_wys0.accum_pipeline_clock = "none"; 
-        defparam mac_fp_wys0.mult_pipeline_clock = "0"; 
-        defparam mac_fp_wys0.adder_input_clock = "0"; 
-        defparam mac_fp_wys0.accum_adder_clock = "none"; 
-
-        twentynm_fp_mac mac_fp_wys1 (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(32'h0),
-            .ay(result_st0[i]),
-            .az(),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable1}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result_st1[i]),
-            .chainout()
-        );
-        defparam mac_fp_wys1.operation_mode = "sp_add"; 
-        defparam mac_fp_wys1.use_chainin = "false"; 
-        defparam mac_fp_wys1.adder_subtract = "true"; 
-        defparam mac_fp_wys1.ax_clock = "0"; 
-        defparam mac_fp_wys1.ay_clock = "0"; 
-        defparam mac_fp_wys1.az_clock = "none"; 
-        defparam mac_fp_wys1.output_clock = "0"; 
-        defparam mac_fp_wys1.accumulate_clock = "none"; 
-        defparam mac_fp_wys1.ax_chainin_pl_clock = "none"; 
-        defparam mac_fp_wys1.accum_pipeline_clock = "none"; 
-        defparam mac_fp_wys1.mult_pipeline_clock = "none"; 
-        defparam mac_fp_wys1.adder_input_clock = "0"; 
-        defparam mac_fp_wys1.accum_adder_clock = "none";
-    end
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1 + 1),
-        .DEPTH(1)
-    ) shift_reg0 (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable0),
-        .in ({tag_in,      (valid_in && ~negate), (valid_in && negate)}),
-        .out({out_tag_st0, out_valid_st0,         in_valid_st0})
-    );
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(1)
-    ) shift_reg1 (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable1),
-        .in({out_tag_st0, in_valid_st0}),
-        .out({out_tag_st1, out_valid_st1})
-    );
-
-    wire out_stall = ~ready_out && valid_out;
-    assign enable0 = ~out_stall;
-    assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1); // stall the negate stage if dual outputs
-
-    assign result    = out_valid_st0 ? result_st0  : result_st1;
-    assign tag_out   = out_valid_st0 ? out_tag_st0 : out_tag_st1;
-    assign valid_out = out_valid_st0 || out_valid_st1; 
-
-endmodule
--- a/hw/rtl/fp_cores/altera/VX_fp_mul.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_mul.v
@ -1,81 +0,0 @@
-`include "VX_define.vh"
-
-module VX_fp_mul #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    input wire [LANES-1:0][31:0]  datab,
-    output wire [LANES-1:0][31:0] result, 
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-);    
-    wire stall  = ~ready_out && valid_out;
-    wire enable = ~stall;
-    assign ready_in = enable;
-
-    for (genvar i = 0; i < LANES; i++) begin
-        twentynm_fp_mac mac_fp_wys (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(),
-            .ay(datab[i]),
-            .az(dataa[i]),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result[i]),
-            .chainout()
-        );
-        defparam mac_fp_wys.operation_mode = "sp_mult"; 
-        defparam mac_fp_wys.use_chainin = "false"; 
-        defparam mac_fp_wys.adder_subtract = "false"; 
-        defparam mac_fp_wys.ax_clock = "none"; 
-        defparam mac_fp_wys.ay_clock = "0"; 
-        defparam mac_fp_wys.az_clock = "0"; 
-        defparam mac_fp_wys.output_clock = "0"; 
-        defparam mac_fp_wys.accumulate_clock = "none"; 
-        defparam mac_fp_wys.ax_chainin_pl_clock = "none"; 
-        defparam mac_fp_wys.accum_pipeline_clock = "none"; 
-        defparam mac_fp_wys.mult_pipeline_clock = "0"; 
-        defparam mac_fp_wys.adder_input_clock = "none"; 
-        defparam mac_fp_wys.accum_adder_clock = "none"; 
-    end
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(1)
-    ) shift_reg (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable),
-        .in ({tag_in,  valid_in}),
-        .out({tag_out, valid_out})
-    );
-
-endmodule
--- a/hw/rtl/fp_cores/altera/VX_fp_sub.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_sub.v
@ -1,81 +0,0 @@
-`include "VX_define.vh"
-
-module VX_fp_sub #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    input wire [LANES-1:0][31:0]  datab,
-    output wire [LANES-1:0][31:0] result, 
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-);
-    wire stall  = ~ready_out && valid_out;
-    wire enable = ~stall;
-    assign ready_in = enable;
-
-    for (genvar i = 0; i < LANES; i++) begin
-        twentynm_fp_mac mac_fp_wys (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(dataa[i]),
-            .ay(datab[i]),
-            .az(),
-            .clk({2'b00,clk}),
-            .ena({2'b11,enable}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result[i]),
-            .chainout()
-        );
-        defparam mac_fp_wys.operation_mode = "sp_add"; 
-        defparam mac_fp_wys.use_chainin = "false"; 
-        defparam mac_fp_wys.adder_subtract = "true"; 
-        defparam mac_fp_wys.ax_clock = "0"; 
-        defparam mac_fp_wys.ay_clock = "0"; 
-        defparam mac_fp_wys.az_clock = "none"; 
-        defparam mac_fp_wys.output_clock = "0"; 
-        defparam mac_fp_wys.accumulate_clock = "none"; 
-        defparam mac_fp_wys.ax_chainin_pl_clock = "none"; 
-        defparam mac_fp_wys.accum_pipeline_clock = "none"; 
-        defparam mac_fp_wys.mult_pipeline_clock = "none"; 
-        defparam mac_fp_wys.adder_input_clock = "0"; 
-        defparam mac_fp_wys.accum_adder_clock = "none";
-    end
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(1)
-    ) shift_reg (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable),
-        .in ({tag_in,  valid_in}),
-        .out({tag_out, valid_out})
-    );    
-
-endmodule
--- a/hw/rtl/fp_cores/altera/VX_fp_utof.v
+++ b/hw/rtl/fp_cores/altera/VX_fp_utof.v
@ -1,48 +0,0 @@
-`include "VX_define.vh"
-
-module VX_fp_utof #( 
-    parameter TAGW = 1,
-    parameter LANES = 1
-) (
-    input wire clk,
-    input wire reset,   
-
-    output wire ready_in,
-    input wire  valid_in,
-
-    input wire [TAGW-1:0] tag_in,
-
-    input wire [LANES-1:0][31:0]  dataa,
-    output wire [LANES-1:0][31:0] result, 
-
-    output wire [TAGW-1:0] tag_out,
-
-    input wire  ready_out,
-    output wire valid_out
-);    
-    wire stall  = ~ready_out && valid_out;
-    wire enable = ~stall;
-    assign ready_in = enable;
-
-    for (genvar i = 0; i < LANES; i++) begin
-        acl_fp_utof utof (
-            .clk    (clk),
-            .areset (1'b0),
-            .en     (enable),
-            .a      (dataa[i]),
-            .q      (result[i])
-        );
-    end
-
-    VX_shift_register #(
-        .DATAW(TAGW + 1),
-        .DEPTH(`LATENCY_UTOF)
-    ) shift_reg (
-        .clk(clk),
-        .reset(reset),
-        .enable(enable),
-        .in ({tag_in,  valid_in}),
-        .out({tag_out, valid_out})
-    );
-
-endmodule
--- a/hw/rtl/fp_cores/altera/dspba_delay_ver.sv
+++ b/hw/rtl/fp_cores/altera/dspba_delay_ver.sv
@ -0,0 +1,95 @@
+// Legal Notice: Copyright 2017 Intel Corporation.  All rights reserved.
+// Your use of  Intel  Corporation's design tools,  logic functions and other
+// software and tools,  and its AMPP  partner logic functions, and  any output
+// files  any of the  foregoing  device programming or simulation files),  and
+// any associated  documentation or information are expressly subject  to  the
+// terms and conditions  of the Intel FPGA Software License Agreement,
+// Intel  MegaCore  Function  License  Agreement, or other applicable license
+// agreement,  including,  without limitation,  that your use  is for the sole
+// purpose of  programming  logic  devices  manufactured by Intel and sold by
+// Intel or its authorized  distributors.  Please  refer  to  the  applicable
+// agreement for further details.
+
+module dspba_delay_ver // Delay line: xout is xin passed through `depth` clock-enabled register stages.
+#(
+    parameter width = 8, // bit width of xin, xout and every stage
+    parameter depth = 1, // number of register stages; depth <= 0 wires xin straight to xout
+    parameter reset_high = 1'b1, // aclr polarity: stages are cleared while aclr == reset_high
+    parameter reset_kind = "ASYNC" // "ASYNC", "SYNC" or "NONE"; any other string generates no stage logic at all
+) (
+    input clk,
+    input aclr, // clear request, polarity selected by reset_high
+    input ena, // stages only capture/shift on clock edges where ena is high
+    input [width-1:0] xin,
+    output [width-1:0] xout
+);
+
+    wire reset; // internal ACTIVE-LOW clear (0 = clear the stages)
+    reg [width-1:0] delays [depth-1:0]; // delays[0] is the newest sample, delays[depth-1] the oldest
+
+    assign reset = aclr ^ reset_high; // evaluates to 0 exactly when aclr equals reset_high
+    
+    generate
+        if (depth > 0)
+        begin
+            genvar i;
+            for (i = 0; i < depth; ++i) // one always block per stage; exactly one reset_kind branch below is elaborated
+            begin : delay_block
+                if (reset_kind == "ASYNC") 
+                begin : sync_reset // NOTE(review): label says "sync" but this branch is the asynchronous clear (reset is in the sensitivity list)
+                always @ (posedge clk or negedge reset)
+                    begin: a
+                        if (!reset) begin
+                            delays[i] <= 0;
+                        end else begin
+                            if (ena) begin
+                                if (i > 0) begin
+                                    delays[i] <= delays[i - 1]; // shift from the previous stage
+                                end else begin
+                                    delays[i] <= xin; // stage 0 samples the input
+                                end
+                            end
+                        end
+                    end
+                end
+
+                if (reset_kind == "SYNC") 
+                begin : async_reset // NOTE(review): label says "async" but this branch clears synchronously on posedge clk
+                always @ (posedge clk)
+                    begin: a
+                        if (!reset) begin // the clear takes priority over ena
+                            delays[i] <= 0;
+                        end else begin
+                            if (ena) begin
+                                if (i > 0) begin
+                                    delays[i] <= delays[i - 1];
+                                end else begin
+                                    delays[i] <= xin;
+                                end
+                            end
+                        end
+                    end
+                end
+
+                if (reset_kind == "NONE") 
+                begin : no_reset // aclr is ignored in this variant; stages are never cleared
+                always @ (posedge clk)
+                    begin: a
+                        if (ena) begin
+                            if (i > 0) begin
+                                delays[i] <= delays[i - 1];
+                            end else begin
+                                delays[i] <= xin;
+                            end
+                        end
+                    end
+                end
+            end
+
+            assign xout = delays[depth - 1]; // output taps the last (oldest) stage
+        end else begin
+            assign xout = xin; // zero-depth configuration: purely combinational pass-through
+        end
+    endgenerate
+    
+endmodule
--- a/hw/rtl/fp_cores/altera/dspba_library_ver.sv
+++ b/hw/rtl/fp_cores/altera/dspba_library_ver.sv
@ -1,392 +0,0 @@
-// Legal Notice: Copyright 2017 Intel Corporation.  All rights reserved.
-// Your use of  Intel  Corporation's design tools,  logic functions and other
-// software and tools,  and its AMPP  partner logic functions, and  any output
-// files  any of the  foregoing  device programming or simulation files),  and
-// any associated  documentation or information are expressly subject  to  the
-// terms and conditions  of the Intel FPGA Software License Agreement,
-// Intel  MegaCore  Function  License  Agreement, or other applicable license
-// agreement,  including,  without limitation,  that your use  is for the sole
-// purpose of  programming  logic  devices  manufactured by Intel and sold by
-// Intel or its authorized  distributors.  Please  refer  to  the  applicable
-// agreement for further details.
-
-
-module dspba_delay_ver
-#(
-    parameter width = 8,
-    parameter depth = 1,
-    parameter reset_high = 1'b1,
-    parameter reset_kind = "ASYNC" 
-) (
-    input clk,
-    input aclr,
-    input ena,
-    input [width-1:0] xin,
-    output [width-1:0] xout
-);
-
-    wire reset;
-    reg [width-1:0] delays [depth-1:0];
-
-    assign reset = aclr ^ reset_high;
-    
-    generate
-        if (depth > 0)
-        begin
-            genvar i;
-            for (i = 0; i < depth; ++i)
-            begin : delay_block
-                if (reset_kind == "ASYNC") 
-                begin : sync_reset
-                always @ (posedge clk or negedge reset)
-                    begin: a
-                        if (!reset) begin
-                            delays[i] <= 0;
-                        end else begin
-                            if (ena) begin
-                                if (i > 0) begin
-                                    delays[i] <= delays[i - 1];
-                                end else begin
-                                    delays[i] <= xin;
-                                end
-                            end
-                        end
-                    end
-                end
-
-                if (reset_kind == "SYNC") 
-                begin : async_reset
-                always @ (posedge clk)
-                    begin: a
-                        if (!reset) begin
-                            delays[i] <= 0;
-                        end else begin
-                            if (ena) begin
-                                if (i > 0) begin
-                                    delays[i] <= delays[i - 1];
-                                end else begin
-                                    delays[i] <= xin;
-                                end
-                            end
-                        end
-                    end
-                end
-
-                if (reset_kind == "NONE") 
-                begin : no_reset
-                always @ (posedge clk)
-                    begin: a
-                        if (ena) begin
-                            if (i > 0) begin
-                                delays[i] <= delays[i - 1];
-                            end else begin
-                                delays[i] <= xin;
-                            end
-                        end
-                    end
-                end
-            end
-
-            assign xout = delays[depth - 1];
-        end else begin
-            assign xout = xin;
-        end
-    endgenerate
-    
-endmodule
-
-//------------------------------------------------------------------------------
-
-module dspba_sync_reg_ver
-#(
-    parameter width1 = 8,
-    parameter width2 = 8,
-    parameter depth = 2,
-    parameter pulse_multiplier = 1,
-    parameter counter_width = 8,
-    parameter init_value = 0,
-    parameter reset1_high = 1'b1,
-    parameter reset2_high = 1'b1,
-    parameter reset_kind = "ASYNC" 
-) (
-    input clk1,
-    input aclr1,
-    input [0 : 0] ena,
-    input [width1-1 : 0] xin,
-    output [width1-1 : 0] xout,
-    input clk2,
-    input aclr2,
-    output [width2-1 : 0] sxout
-);
-wire [width1-1 : 0] init_value_internal;
-
-wire reset1;
-wire reset2;
-
-reg iclk_enable;
-reg [width1-1 : 0] iclk_data;
-reg [width2-1 : 0] oclk_data;
-
-// For Synthesis this means: preserve this registers and do not merge any other flip-flops with synchronizer flip-flops 
-// For TimeQuest this means: identify these flip-flops as synchronizer to enable automatic MTBF analysis
-(* altera_attribute = {"-name ADV_NETLIST_OPT_ALLOWED NEVER_ALLOW; -name SYNCHRONIZER_IDENTIFICATION FORCED; -name DONT_MERGE_REGISTER ON; -name PRESERVE_REGISTER ON"} *) reg [depth-1 : 0] sync_regs;
-
-wire oclk_enable;
-
-wire ena_internal;
-reg [counter_width-1 : 0] counter;
-
-assign init_value_internal = init_value;
-
-assign reset1 = aclr1 ^ reset1_high;
-assign reset2 = aclr2 ^ reset2_high;
-
-generate
-    if (pulse_multiplier == 1)
-    begin: no_multiplication
-        assign ena_internal = ena[0];
-    end
-endgenerate
-
-generate 
-    if (pulse_multiplier > 1)
-    begin: multiplu_ena_pulse
-        if (reset_kind == "ASYNC")
-        begin: async_reset
-            always @ (posedge clk1 or negedge reset1)
-            begin
-                if (reset1 == 1'b0) begin
-                    counter <= 0;
-                end else begin
-                    if (counter > 0) begin
-                        if (counter == pulse_multiplier - 1) begin
-                            counter <= 0;
-                        end else begin
-                            counter <= counter + 2'd1;
-                        end
-                    end else begin
-                        if (ena[0] == 1'b1) begin
-                            counter <= 1;
-                        end
-                    end
-                end
-            end
-        end
-        if (reset_kind == "SYNC")
-        begin: sync_reset
-            always @ (posedge clk1)
-            begin
-                if (reset1 == 1'b0) begin
-                    counter <= 0;
-                end else begin
-                    if (counter > 0) begin
-                        if (counter == pulse_multiplier - 1) begin
-                            counter <= 0;
-                        end else begin
-                            counter <= counter + 2'd1;
-                        end
-                    end else begin
-                        if (ena[0] == 1'b1) begin
-                            counter <= 1;
-                        end
-                    end
-                end
-            end
-        end
-        if (reset_kind == "NONE")
-        begin: no_reset
-            always @ (posedge clk1)
-            begin
-                if (counter > 0) begin
-                    if (counter == pulse_multiplier - 1) begin
-                        counter <= 0;
-                    end else begin
-                        counter <= counter + 2'd1;
-                    end
-                end else begin
-                    if (ena[0] == 1'b1) begin
-                        counter <= 1;
-                    end
-                end
-            end
-        end
-        
-        assign ena_internal = counter > 0 ? 1'b1 : ena[0];
-    end
-endgenerate
-
-assign oclk_enable = sync_regs[depth - 1];
-
-generate
-    if (reset_kind == "ASYNC")
-    begin: iclk_async_reset 
-        always @ (posedge clk1 or negedge reset1) 
-        begin
-           if (reset1 == 1'b0) begin
-               iclk_data <= init_value_internal;
-               iclk_enable <= 1'b0;
-           end else begin
-               iclk_enable <= ena_internal;
-               if (ena[0] == 1'b1) begin
-                   iclk_data <= xin;
-               end
-           end
-        end
-    end
-    if (reset_kind == "SYNC")
-    begin: iclk_sync_reset 
-        always @ (posedge clk1) 
-        begin
-           if (reset1 == 1'b0) begin
-               iclk_data <= init_value_internal;
-               iclk_enable <= 1'b0;
-           end else begin
-               iclk_enable <= ena_internal;
-               if (ena[0] == 1'b1) begin
-                   iclk_data <= xin;
-               end
-           end
-        end
-    end
-    if (reset_kind == "NONE")
-    begin: iclk_no_reset 
-        always @ (posedge clk1) 
-        begin
-           iclk_enable <= ena_internal;
-           if (ena[0] == 1'b1) begin
-               iclk_data <= xin;
-           end
-        end
-    end
-endgenerate
-
-generate
-    genvar i;
-    for (i = 0; i < depth; ++i)
-    begin: sync_regs_block
-        if (reset_kind == "ASYNC") 
-        begin: sync_reg_async_reset
-            always @ (posedge clk2 or negedge reset2) begin
-                if (reset2 == 1'b0) begin
-                    sync_regs[i] <= 1'b0;
-                end else begin
-                    if (i > 0) begin
-                        sync_regs[i] <= sync_regs[i - 1];
-                    end else begin
-                        sync_regs[i] <= iclk_enable;
-                    end
-                end
-            end
-        end
-        if (reset_kind == "SYNC") 
-        begin: sync_reg_sync_reset
-            always @ (posedge clk2) begin
-                if (reset2 == 1'b0) begin
-                    sync_regs[i] <= 1'b0;
-                end else begin
-                    if (i > 0) begin
-                        sync_regs[i] <= sync_regs[i - 1];
-                    end else begin
-                        sync_regs[i] <= iclk_enable;
-                    end
-                end
-            end
-        end
-        if (reset_kind == "NONE") 
-        begin: sync_reg_no_reset
-            always @ (posedge clk2) begin
-                if (i > 0) begin
-                    sync_regs[i] <= sync_regs[i - 1];
-                end else begin
-                    sync_regs[i] <= iclk_enable;
-                end
-            end
-        end
-    end
-endgenerate
-
-generate
-    if (reset_kind == "ASYNC")
-    begin: oclk_async_reset
-        always @ (posedge clk2 or negedge reset2)
-        begin
-            if (reset2 == 1'b0) begin
-                oclk_data <= init_value_internal[width2-1 : 0];
-            end else begin
-                if (oclk_enable == 1'b1) begin
-                    oclk_data <= iclk_data[width2-1 : 0];
-                end
-            end
-        end
-    end
-    if (reset_kind == "SYNC")
-    begin: oclk_sync_reset
-        always @ (posedge clk2)
-        begin
-            if (reset2 == 1'b0) begin
-                oclk_data <= init_value_internal[width2-1 : 0];
-            end else begin
-                if (oclk_enable == 1'b1) begin
-                    oclk_data <= iclk_data[width2-1 : 0];
-                end
-            end
-        end
-    end
-    if (reset_kind == "NONE")
-    begin: oclk_no_reset
-        always @ (posedge clk2)
-        begin
-            if (oclk_enable == 1'b1) begin
-                oclk_data <= iclk_data[width2-1 : 0];
-            end
-        end
-    end
-endgenerate
-
-assign xout = iclk_data;
-assign sxout = oclk_data;
-
-endmodule
-
-//------------------------------------------------------------------------------
-
-module dspba_pipe
-#(
-    parameter num_bits   = 8,
-    parameter num_stages = 0,
-    parameter init_value = 1'bx
-) (
-    input  clk,
-    input  [num_bits-1:0] d,
-    output [num_bits-1:0] q
-);
-    logic [num_bits-1:0] init_stage = { num_bits { init_value } };
-
-    generate
-        if (num_stages > 0)
-        begin
-            reg [num_bits-1:0] stage_array[num_stages-1:0];
-
-            genvar i;
-            for (i = 0; i < num_stages; ++i)
-            begin : g_pipe
-                always @ (posedge clk) begin
-                    if (i>0) begin
-                        stage_array[i] <= stage_array[i-1];
-                    end else begin
-                        stage_array[i] <= d;
-                    end
-                end
-            end
-            initial begin
-                stage_array = '{ num_stages { init_stage } };
-            end
-
-            assign q = stage_array[num_stages-1];
-
-        end else begin
-            assign q = d;
-        end
-    endgenerate
-
-endmodule
--- a/hw/rtl/fp_cores/svdpi/float_dpi.cpp
+++ b/hw/rtl/fp_cores/svdpi/float_dpi.cpp
@ -0,0 +1,210 @@
+#include <limits.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <mutex>
+#include <unordered_map>
+#include <vector>
+#include "svdpi.h"
+#include "VX_config.h"
+
+extern "C" { // C linkage: these are the symbols named by the "DPI-C" imports in float_dpi.vh
+  void dpi_fadd(bool clk, bool enable, int a, int b, int* result); // a + b; a/b/result hold raw 32-bit float bit patterns
+  void dpi_fsub(bool clk, bool enable, int a, int b, int* result); // a - b
+  void dpi_fmul(bool clk, bool enable, int a, int b, int* result); // a * b
+  void dpi_fmadd(bool clk, bool enable, int a, int b, int c, int* result); // a * b + c
+  void dpi_fmsub(bool clk, bool enable, int a, int b, int c, int* result); // a * b - c
+  void dpi_fdiv(bool clk, bool enable, int a, int b, int* result); // a / b
+  void dpi_fsqrt(bool clk, bool enable, int a, int* result); // sqrt(a)
+  void dpi_ftoi(bool clk, bool enable, int a, int* result); // float bits -> signed integer (C++ cast)
+  void dpi_ftou(bool clk, bool enable, int a, int* result); // float bits -> unsigned integer (C++ cast)
+  void dpi_itof(bool clk, bool enable, int a, int* result); // signed integer -> float bits
+  void dpi_utof(bool clk, bool enable, int a, int* result); // a reinterpreted as unsigned -> float bits
+} // every function writes *result from a per-scope ShiftRegister that shifts only when clk is low and enable is high
+
+// Fixed-depth delay line of ints used to model the latency of an FPU
+// operation: a value pushed now becomes visible through top() after `depth`
+// accepted pushes. Slots start out as 0.
+class ShiftRegister {
+public:
+  // Initializer order follows the member declaration order below
+  // (avoids -Wreorder).
+  ShiftRegister() : depth_(0), init_(false) {}
+
+  // Sizes the register on the first call; later calls are no-ops, so the
+  // depth passed first wins.
+  // A non-positive depth is clamped to 1 so the register always owns at
+  // least one slot. Without the clamp the unsigned `depth_-1` used by push()
+  // wraps around to UINT_MAX and indexes far outside the buffer.
+  void ensure_init(int depth) {
+    if (init_)
+      return;
+    depth_ = (depth > 0) ? static_cast<unsigned>(depth) : 1u;
+    buffer_.assign(depth_, 0);
+    init_ = true;
+  }
+
+  // Shifts every slot one step toward the head and stores `value` in the
+  // tail slot. Nothing happens while clk is high, while enable is low, or
+  // before ensure_init() has sized the buffer.
+  void push(int value, bool clk, bool enable) {
+    if (clk || !enable || buffer_.empty())
+      return;
+    // `i + 1 < depth_` cannot underflow, unlike `i < depth_ - 1`.
+    for (unsigned i = 0; i + 1 < depth_; ++i) {
+      buffer_[i] = buffer_[i+1];
+    }
+    buffer_[depth_-1] = value;
+  }
+
+  // Oldest value still held (the head slot); 0 if the register has not been
+  // initialized yet.
+  int top() const {
+    return buffer_.empty() ? 0 : buffer_[0];
+  }
+
+private:
+
+  std::vector<int> buffer_;  // slot 0 is the head (oldest), slot depth_-1 the tail
+  unsigned depth_;           // number of slots; 0 until ensure_init() runs
+  bool init_;                // true once ensure_init() has sized buffer_
+};
+
+// Registry mapping each SystemVerilog scope handle to its own ShiftRegister,
+// so every calling scope gets private delay-line state.
+class Instances {
+public:
+  // Returns the ShiftRegister stored for `scope`, default-constructing it on
+  // first use.
+  // The mutex only serializes access to the map itself; the returned
+  // reference is used by the caller after the lock has been released.
+  ShiftRegister& get(svScope scope) {
+    // lock_guard unlocks on every exit path, including an exception thrown
+    // while operator[] inserts a new entry (manual lock()/unlock() would
+    // leave the mutex held in that case).
+    std::lock_guard<std::mutex> guard(mutex_);
+    return instances_[scope];
+  }
+
+private:
+  std::unordered_map<svScope, ShiftRegister> instances_;
+  std::mutex mutex_;
+};
+
+// Single process-wide registry shared by all dpi_* entry points.
+Instances instances;
+
+// DPI model of a single-precision add with LATENCY_FMADD stages of delay.
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a, b        : operands as raw 32-bit float bit patterns.
+//   result      : receives the bit pattern at the head of the delay line.
+void dpi_fadd(bool clk, bool enable, int a, int b, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  // memcpy reinterprets the bits without violating strict aliasing, which
+  // dereferencing a casted `float*`/`int*` does.
+  float fa, fb;
+  memcpy(&fa, &a, sizeof(fa));
+  memcpy(&fb, &b, sizeof(fb));
+  const float fr = fa + fb;
+  int bits;
+  memcpy(&bits, &fr, sizeof(bits));
+
+  inst.ensure_init(LATENCY_FMADD);
+  inst.push(bits, clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of a single-precision subtract (a - b) with LATENCY_FMADD stages
+// of delay.
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a, b        : operands as raw 32-bit float bit patterns.
+//   result      : receives the bit pattern at the head of the delay line.
+void dpi_fsub(bool clk, bool enable, int a, int b, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  // memcpy reinterprets the bits without violating strict aliasing, which
+  // dereferencing a casted `float*`/`int*` does.
+  float fa, fb;
+  memcpy(&fa, &a, sizeof(fa));
+  memcpy(&fb, &b, sizeof(fb));
+  const float fr = fa - fb;
+  int bits;
+  memcpy(&bits, &fr, sizeof(bits));
+
+  inst.ensure_init(LATENCY_FMADD);
+  inst.push(bits, clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of a single-precision multiply (a * b) with LATENCY_FMADD stages
+// of delay.
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a, b        : operands as raw 32-bit float bit patterns.
+//   result      : receives the bit pattern at the head of the delay line.
+void dpi_fmul(bool clk, bool enable, int a, int b, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  // memcpy reinterprets the bits without violating strict aliasing, which
+  // dereferencing a casted `float*`/`int*` does.
+  float fa, fb;
+  memcpy(&fa, &a, sizeof(fa));
+  memcpy(&fb, &b, sizeof(fb));
+  const float fr = fa * fb;
+  int bits;
+  memcpy(&bits, &fr, sizeof(bits));
+
+  inst.ensure_init(LATENCY_FMADD);
+  inst.push(bits, clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of a single-precision multiply-add (a * b + c) with
+// LATENCY_FMADD stages of delay.
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a, b, c     : operands as raw 32-bit float bit patterns.
+//   result      : receives the bit pattern at the head of the delay line.
+void dpi_fmadd(bool clk, bool enable, int a, int b, int c, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  // memcpy reinterprets the bits without violating strict aliasing, which
+  // dereferencing a casted `float*`/`int*` does.
+  float fa, fb, fc;
+  memcpy(&fa, &a, sizeof(fa));
+  memcpy(&fb, &b, sizeof(fb));
+  memcpy(&fc, &c, sizeof(fc));
+  // NOTE(review): written as separate multiply and add; if the hardware being
+  // modeled is a fused multiply-add, fmaf() would match its single rounding
+  // - confirm before changing.
+  const float fr = fa * fb + fc;
+  int bits;
+  memcpy(&bits, &fr, sizeof(bits));
+
+  inst.ensure_init(LATENCY_FMADD);
+  inst.push(bits, clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of a single-precision multiply-subtract (a * b - c) with
+// LATENCY_FMADD stages of delay.
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a, b, c     : operands as raw 32-bit float bit patterns.
+//   result      : receives the bit pattern at the head of the delay line.
+void dpi_fmsub(bool clk, bool enable, int a, int b, int c, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  // memcpy reinterprets the bits without violating strict aliasing, which
+  // dereferencing a casted `float*`/`int*` does.
+  float fa, fb, fc;
+  memcpy(&fa, &a, sizeof(fa));
+  memcpy(&fb, &b, sizeof(fb));
+  memcpy(&fc, &c, sizeof(fc));
+  // NOTE(review): written as separate multiply and subtract; if the hardware
+  // being modeled is fused, fmaf(fa, fb, -fc) would match its single
+  // rounding - confirm before changing.
+  const float fr = fa * fb - fc;
+  int bits;
+  memcpy(&bits, &fr, sizeof(bits));
+
+  inst.ensure_init(LATENCY_FMADD);
+  inst.push(bits, clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of a single-precision divide (a / b) with LATENCY_FDIV stages of
+// delay.
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a, b        : dividend and divisor as raw 32-bit float bit patterns.
+//   result      : receives the bit pattern at the head of the delay line.
+void dpi_fdiv(bool clk, bool enable, int a, int b, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  // memcpy reinterprets the bits without violating strict aliasing, which
+  // dereferencing a casted `float*`/`int*` does.
+  float fa, fb;
+  memcpy(&fa, &a, sizeof(fa));
+  memcpy(&fb, &b, sizeof(fb));
+  const float fr = fa / fb;
+  int bits;
+  memcpy(&bits, &fr, sizeof(bits));
+
+  inst.ensure_init(LATENCY_FDIV);
+  inst.push(bits, clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of a single-precision square root with LATENCY_FSQRT stages of
+// delay.
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a           : operand as a raw 32-bit float bit pattern.
+//   result      : receives the bit pattern at the head of the delay line.
+void dpi_fsqrt(bool clk, bool enable, int a, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  // memcpy reinterprets the bits without violating strict aliasing, which
+  // dereferencing a casted `float*`/`int*` does.
+  float fa;
+  memcpy(&fa, &a, sizeof(fa));
+  const float fr = sqrt(fa);
+  int bits;
+  memcpy(&bits, &fr, sizeof(bits));
+
+  inst.ensure_init(LATENCY_FSQRT);
+  inst.push(bits, clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of a float -> signed 32-bit integer conversion with LATENCY_FTOI
+// stages of delay. In-range values are truncated toward zero (C++ cast).
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a           : operand as a raw 32-bit float bit pattern.
+//   result      : receives the integer at the head of the delay line.
+void dpi_ftoi(bool clk, bool enable, int a, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  // memcpy reinterprets the bits without violating strict aliasing.
+  float fa;
+  memcpy(&fa, &a, sizeof(fa));
+
+  // Casting NaN or an out-of-range float to int is undefined behavior in
+  // C++, so those inputs are handled explicitly.
+  // NOTE(review): the saturation targets mirror the RISC-V FCVT.W.S
+  // convention (NaN and positive overflow -> INT_MAX, negative overflow ->
+  // INT_MIN) - confirm this is what the RTL consumers expect.
+  int ir;
+  if (isnan(fa) || fa >= 2147483648.0f) {
+    ir = INT_MAX;
+  } else if (fa < -2147483648.0f) {
+    ir = INT_MIN;
+  } else {
+    ir = int(fa);
+  }
+
+  inst.ensure_init(LATENCY_FTOI);
+  inst.push(ir, clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of a float -> unsigned 32-bit integer conversion with
+// LATENCY_FTOI stages of delay. In-range values are truncated toward zero
+// (C++ cast).
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a           : operand as a raw 32-bit float bit pattern.
+//   result      : receives the unsigned result, stored in an int, from the
+//                 head of the delay line.
+void dpi_ftou(bool clk, bool enable, int a, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  // memcpy reinterprets the bits without violating strict aliasing.
+  float fa;
+  memcpy(&fa, &a, sizeof(fa));
+
+  // Casting NaN, a value <= -1, or a value >= 2^32 to unsigned is undefined
+  // behavior in C++, so those inputs are handled explicitly. Values in
+  // (-1, 0) truncate to 0 and are fine as-is.
+  // NOTE(review): the saturation targets mirror the RISC-V FCVT.WU.S
+  // convention (NaN and overflow -> UINT_MAX, negative -> 0) - confirm this
+  // is what the RTL consumers expect.
+  unsigned ur;
+  if (isnan(fa) || fa >= 4294967296.0f) {
+    ur = UINT_MAX;
+  } else if (fa <= -1.0f) {
+    ur = 0;
+  } else {
+    ur = unsigned(fa);
+  }
+
+  inst.ensure_init(LATENCY_FTOI);
+  // The delay line stores ints; the unsigned value is converted on the way in.
+  inst.push(static_cast<int>(ur), clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of a signed 32-bit integer -> float conversion with LATENCY_ITOF
+// stages of delay.
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a           : signed integer operand.
+//   result      : receives the float bit pattern at the head of the delay
+//                 line.
+void dpi_itof(bool clk, bool enable, int a, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  const float fr = float(a);
+  // memcpy extracts the float's bits without violating strict aliasing,
+  // which dereferencing a casted `int*` does.
+  int bits;
+  memcpy(&bits, &fr, sizeof(bits));
+
+  inst.ensure_init(LATENCY_ITOF);
+  inst.push(bits, clk, enable);
+  *result = inst.top();
+}
+
+// DPI model of an unsigned 32-bit integer -> float conversion with
+// LATENCY_ITOF stages of delay.
+//   clk, enable : forwarded to ShiftRegister::push (shifts only when clk is
+//                 low and enable is high).
+//   a           : operand; its 32 bits are interpreted as an unsigned value.
+//   result      : receives the float bit pattern at the head of the delay
+//                 line.
+void dpi_utof(bool clk, bool enable, int a, int* result) {
+  // Delay-line state is kept per calling SystemVerilog scope.
+  ShiftRegister& inst = instances.get(svGetScope());
+
+  const unsigned ua = static_cast<unsigned>(a);
+  const float fr = float(ua);
+  // memcpy extracts the float's bits without violating strict aliasing,
+  // which dereferencing a casted `int*` does.
+  int bits;
+  memcpy(&bits, &fr, sizeof(bits));
+
+  inst.ensure_init(LATENCY_ITOF);
+  inst.push(bits, clk, enable);
+  *result = inst.top();
+}
--- a/hw/rtl/fp_cores/svdpi/float_dpi.vh
+++ b/hw/rtl/fp_cores/svdpi/float_dpi.vh
@ -0,0 +1,16 @@
+`ifndef FLOAT_DPI
+`define FLOAT_DPI
+
+// DPI-C entry points of the software floating-point model implemented in
+// float_dpi.cpp. They are `context` imports because the C++ side calls
+// svGetScope() to keep separate delay-line state per calling scope.
+// clk/enable are declared as 2-state `bit`: the C++ functions take them as
+// `bool`, while a 4-state `logic` argument is passed as svLogic, whose X/Z
+// encodings are not valid bool values.
+// a/b/c/result are 32-bit values; for the floating-point operands they carry
+// the raw bit pattern of a single-precision float.
+import "DPI-C" context function void dpi_fadd(input bit clk, input bit enable, input int a, input int b, output int result);
+import "DPI-C" context function void dpi_fsub(input bit clk, input bit enable, input int a, input int b, output int result);
+import "DPI-C" context function void dpi_fmul(input bit clk, input bit enable, input int a, input int b, output int result);
+import "DPI-C" context function void dpi_fmadd(input bit clk, input bit enable, input int a, input int b, input int c, output int result);
+import "DPI-C" context function void dpi_fmsub(input bit clk, input bit enable, input int a, input int b, input int c, output int result);
+import "DPI-C" context function void dpi_fdiv(input bit clk, input bit enable, input int a, input int b, output int result);
+import "DPI-C" context function void dpi_fsqrt(input bit clk, input bit enable, input int a, output int result);
+import "DPI-C" context function void dpi_ftoi(input bit clk, input bit enable, input int a, output int result);
+import "DPI-C" context function void dpi_ftou(input bit clk, input bit enable, input int a, output int result);
+import "DPI-C" context function void dpi_itof(input bit clk, input bit enable, input int a, output int result);
+import "DPI-C" context function void dpi_utof(input bit clk, input bit enable, input int a, output int result);
+
+`endif
--- a/hw/scripts/gen_config.py
+++ b/hw/scripts/gen_config.py
@ -94,6 +94,7 @@ if args.outc != 'none':
 // Translated from VX_config.vh:
 '''[1:].format(date=datetime.now()), file=f)
        with open(path.join(script_dir, '../rtl/VX_config.vh'), 'r') as r:
+            lineno = 0
            for line in r:
                if in_expansion:
                    f.write(post_process_line(line))
@ -107,7 +108,8 @@ if args.outc != 'none':
                            f.write(post_process_line(pat.sub(repl, line)))
                            break
                    else:
-                        raise ValueError('failed to find rule for: ' + line)
+                        raise ValueError('failed to find rule for: "' + line + '" (' + str(lineno) + ')')
+                lineno = lineno + 1

        print('''
 // Misc
--- a/hw/simulate/Makefile
+++ b/hw/simulate/Makefile
@ -17,10 +17,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
 DBG_FLAGS += $(DBG_PRINT_FLAGS)
 DBG_FLAGS += -DDBG_CORE_REQ_INFO

-FPU_INCLUDE = -I../rtl/fp_cores/fpnew/src/common_cells/include -I../rtl/fp_cores/fpnew/src/common_cells/src -I../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../rtl/fp_cores/fpnew/src 
-INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/fp_cores -I../rtl/simulate $(FPU_INCLUDE)
+FPU_INCLUDE = -I../rtl/fp_cores -I../rtl/fp_cores/svdpi -I../rtl/fp_cores/fpnew/src/common_cells/include -I../rtl/fp_cores/fpnew/src/common_cells/src -I../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../rtl/fp_cores/fpnew/src 
+INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/simulate $(FPU_INCLUDE)

 SRCS = simulator.cpp testbench.cpp
+SRCS += ../rtl/fp_cores/svdpi/float_dpi.cpp

 all: build-s