fixed lpm_mult parameters, ram init filepath

2025-04-23 21:39:10 -04:00 · 2020-09-04 07:51:46 -07:00 · 2020-09-04 07:51:46 -07:00 · 42e3b6c45d
commit 42e3b6c45d
parent dccea80b68
36 changed files with 738 additions and 495 deletions
--- a/driver/tests/dogfood/testcases.h
+++ b/driver/tests/dogfood/testcases.h
@ -5,12 +5,12 @@
 #include <limits>

 union Float_t {    
-    float   f;
-    int32_t i;
+    float f;
+    int   i;
    struct {
-        uint32_t mantissa : 23;
-        uint32_t exponent : 8;
-        uint32_t sign     : 1;
+        uint32_t man  : 23;
+        uint32_t exp  : 8;
+        uint32_t sign : 1;
    } parts;
 };

--- a/hw/opae/Makefile
+++ b/hw/opae/Makefile
@ -4,19 +4,22 @@ FPGA_BUILD_DIR=build_fpga

 all: ase-1c

-sources.txt:
+sources.txt:	
 	./gen_sources.sh > sources.txt

 gen_sources: sources.txt

 ase-1c: setup-ase-1c gen_sources
 	make -C $(ASE_BUILD_DIR)_1c
+	cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_1c/work

 ase-2c: setup-ase-2c gen_sources
 	make -C $(ASE_BUILD_DIR)_2c
+	cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_2c/work

 ase-4c: setup-ase-4c gen_sources
 	make -C $(ASE_BUILD_DIR)_4c
+	cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_4c/work

 setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile

@ -35,12 +38,15 @@ $(ASE_BUILD_DIR)_4c/Makefile: sources.txt

 fpga-1c: setup-fpga-1c gen_sources
 	cd $(FPGA_BUILD_DIR)_1c && qsub-synth
+	cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_1c

 fpga-2c: setup-fpga-2c gen_sources
 	cd $(FPGA_BUILD_DIR)_2c && qsub-synth
+	cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_2c

 fpga-4c: setup-fpga-4c gen_sources
 	cd $(FPGA_BUILD_DIR)_4c && qsub-synth
+	cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_4c
 	
 setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf

--- a/hw/opae/README
+++ b/hw/opae/README
@ -62,7 +62,7 @@ make ase
 # tests
 ./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n 256
 ./run_ase.sh build_ase_1c ../../driver/tests/demo/demo -n 16
-./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n 1 -s4 -e4
+./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n1 -s4 -e4
 ./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd

 # modify "vsim_run.tcl" to dump VCD trace
--- a/hw/opae/sources_1c.txt
+++ b/hw/opae/sources_1c.txt
@ -13,6 +13,7 @@
 #+define+DBG_PRINT_DRAM
 #+define+DBG_PRINT_PIPELINE
 #+define+DBG_PRINT_OPAE
+#+define+DBG_CORE_REQ_INFO
 #+define+DBG_PRINT_SCOPE

 vortex_afu.json
--- a/hw/rtl/VX_commit.v
+++ b/hw/rtl/VX_commit.v
@ -62,7 +62,7 @@ module VX_commit #(
        fflags_r      <= fflags;
        has_fflags_r  <= fpu_commit_if.valid && fpu_commit_if.has_fflags;
        wid_r         <= fpu_commit_if.wid;
-        num_commits_r <= num_commits;
+        num_commits_r <= (num_commits << $clog2(`NUM_THREADS));
    end

    assign cmt_to_csr_if.valid       = csr_update_r;            
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -59,6 +59,8 @@
 `define EXT_F_ENABLE
 `endif

+`define FPU_FAST
+
 // Device identification
 `define VENDOR_ID           0
 `define ARCHITECTURE_ID     0
@ -74,12 +76,12 @@
 `define LATENCY_FNONCOMP 1
 `endif

-`ifndef LATENCY_FMADD
-`define LATENCY_FMADD 1
+`ifndef LATENCY_FADDMUL
+`define LATENCY_FADDMUL 3
 `endif

-`ifndef LATENCY_FNMADD
-`define LATENCY_FNMADD 2
+`ifndef LATENCY_FMADD
+`define LATENCY_FMADD 4
 `endif

 `ifndef LATENCY_FDIV
@ -98,16 +100,12 @@
 `define LATENCY_FTOI 3
 `endif

-`ifndef LATENCY_FADDMUL
-`define LATENCY_FADDMUL 2
-`endif
-
 `ifndef LATENCY_FDIVSQRT
-`define LATENCY_FDIVSQRT 2
+`define LATENCY_FDIVSQRT 10
 `endif

 `ifndef LATENCY_FCONV
-`define LATENCY_FCONV 2
+`define LATENCY_FCONV 3
 `endif

 // CSR Addresses //////////////////////////////////////////////////////////////
--- a/hw/rtl/VX_define.vh
+++ b/hw/rtl/VX_define.vh
@ -385,7 +385,7 @@
 `define VX_CORE_TAG_WIDTH       `L3CORE_TAG_WIDTH 
 `define VX_CSR_ID_WIDTH         `LOG2UP(`NUM_CLUSTERS * `NUM_CORES)

-`define DRAM_TO_BYTE_ADDR(x)     {x, (32-$bits(x))'(0)}
+`define DRAM_TO_BYTE_ADDR(x)    {x, (32-$bits(x))'(0)}

 `include "VX_types.vh"

--- a/hw/rtl/VX_mul_unit.v
+++ b/hw/rtl/VX_mul_unit.v
@ -51,32 +51,33 @@ module VX_mul_unit #(
    ///////////////////////////////////////////////////////////////////////////

    wire [`NUM_THREADS-1:0][31:0] mul_result;
-    wire is_mulw = (alu_op == `MUL_MUL);    
-    wire is_mulw_out;
+    wire is_mul_in = (alu_op == `MUL_MUL);    
+    wire is_mul_out;
    wire stall_mul;

    for (genvar i = 0; i < `NUM_THREADS; i++) begin    

        wire [32:0] mul_in1 = {(alu_op != `MUL_MULHU)                          & alu_in1[i][31], alu_in1[i]};
        wire [32:0] mul_in2 = {(alu_op != `MUL_MULHU && alu_op != `MUL_MULHSU) & alu_in2[i][31], alu_in2[i]};
-        wire [63:0] mul_result_tmp;
+    `IGNORE_WARNINGS_BEGIN
+        wire [65:0] mul_result_tmp;
+    `IGNORE_WARNINGS_END

        VX_multiplier #(
            .WIDTHA(33),
            .WIDTHB(33),
-            .WIDTHP(64),
+            .WIDTHP(66),
            .SIGNED(1),
-            .PIPELINE(`LATENCY_IMUL)
+            .LATENCY(`LATENCY_IMUL)
        ) multiplier (
            .clk(clk),
-            .reset(reset),
-            .clk_en(~stall_mul),
+            .enable(~stall_mul),
            .dataa(mul_in1),
            .datab(mul_in2),
            .result(mul_result_tmp)
        );

-        assign mul_result[i] = is_mulw_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];            
+        assign mul_result[i] = is_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];            
    end

    wire [MULQ_BITS-1:0] mul_tag;
@ -91,17 +92,17 @@ module VX_mul_unit #(
        .clk(clk),
        .reset(reset),
        .enable(~stall_mul),
-        .in({mul_fire,       tag_in,  is_mulw}),
-        .out({mul_valid_out, mul_tag, is_mulw_out})
+        .in({mul_fire,       tag_in,  is_mul_in}),
+        .out({mul_valid_out, mul_tag, is_mul_out})
    );

    ///////////////////////////////////////////////////////////////////////////

    wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;

-    wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU);
+    wire is_div_only   = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU);
    wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM);     
-    wire div_valid_in = mul_req_if.valid && is_div_op;   
+    wire div_valid_in  = mul_req_if.valid && is_div_op;   
    wire div_ready_in;
    wire div_ready_out;
    wire div_valid_out;
--- a/hw/rtl/cache/VX_cache_config.vh
+++ b/hw/rtl/cache/VX_cache_config.vh
@ -4,6 +4,10 @@
 `include "VX_platform.vh"
 `include "VX_scope.vh"

+`ifdef DBG_CORE_REQ_INFO
+`include "VX_define.vh"
+`endif
+
 `define REQ_TAG_WIDTH           `MAX(CORE_TAG_WIDTH, SNP_REQ_TAG_WIDTH)

 `define REQS_BITS               `LOG2UP(NUM_REQUESTS)
@ -77,4 +81,6 @@

 `define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))}

+`define DRAM_TO_BYTE_ADDR(x)    {x, (32-$bits(x))'(0)}
+
 `endif
--- a/hw/rtl/fp_cores/VX_fp_addmul.v
+++ b/hw/rtl/fp_cores/VX_fp_addmul.v
@ -0,0 +1,187 @@
+`include "VX_define.vh"
+
+`ifndef SYNTHESIS
+`include "float_dpi.vh"
+`endif
+
+module VX_fp_addmul #( 
+    parameter TAGW = 1,
+    parameter LANES = 1
+) (
+    input wire clk,
+    input wire reset,   
+
+    output wire ready_in,
+    input wire  valid_in,
+
+    input wire [TAGW-1:0] tag_in,
+
+    input wire  do_sub,
+    input wire  do_mul,    
+
+    input wire [LANES-1:0][31:0]  dataa,
+    input wire [LANES-1:0][31:0]  datab,
+    output wire [LANES-1:0][31:0] result, 
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);    
+    
+    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;
+
+    reg do_sub_r, do_mul_r;
+
+    for (genvar i = 0; i < LANES; i++) begin
+        
+        wire [31:0] result_add;
+        wire [31:0] result_sub;
+        wire [31:0] result_mul;
+
+    `ifdef QUARTUS
+        twentynm_fp_mac mac_fp_add (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(dataa[i]),
+            .ay(datab[i]),
+            .az(),
+            .clk({2'b00,clk}),
+            .ena({2'b11,enable}),
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_add),
+            .chainout()
+        );
+        defparam mac_fp_add.operation_mode = "sp_add"; 
+        defparam mac_fp_add.use_chainin = "false"; 
+        defparam mac_fp_add.adder_subtract = "false"; 
+        defparam mac_fp_add.ax_clock = "0"; 
+        defparam mac_fp_add.ay_clock = "0"; 
+        defparam mac_fp_add.az_clock = "none"; 
+        defparam mac_fp_add.output_clock = "0"; 
+        defparam mac_fp_add.accumulate_clock = "none"; 
+        defparam mac_fp_add.ax_chainin_pl_clock = "none"; 
+        defparam mac_fp_add.accum_pipeline_clock = "none"; 
+        defparam mac_fp_add.mult_pipeline_clock = "none"; 
+        defparam mac_fp_add.adder_input_clock = "0"; 
+        defparam mac_fp_add.accum_adder_clock = "none"; 
+
+        twentynm_fp_mac mac_fp_sub (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(dataa[i]),
+            .ay(datab[i]),
+            .az(),
+            .clk({2'b00,clk}),
+            .ena({2'b11,enable}),
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_sub),
+            .chainout()
+        );
+        defparam mac_fp_sub.operation_mode = "sp_add"; 
+        defparam mac_fp_sub.use_chainin = "false"; 
+        defparam mac_fp_sub.adder_subtract = "true"; 
+        defparam mac_fp_sub.ax_clock = "0"; 
+        defparam mac_fp_sub.ay_clock = "0"; 
+        defparam mac_fp_sub.az_clock = "none"; 
+        defparam mac_fp_sub.output_clock = "0"; 
+        defparam mac_fp_sub.accumulate_clock = "none"; 
+        defparam mac_fp_sub.ax_chainin_pl_clock = "none"; 
+        defparam mac_fp_sub.accum_pipeline_clock = "none"; 
+        defparam mac_fp_sub.mult_pipeline_clock = "none"; 
+        defparam mac_fp_sub.adder_input_clock = "0"; 
+        defparam mac_fp_sub.accum_adder_clock = "none";
+
+        twentynm_fp_mac mac_fp_mul (
+            // inputs
+            .accumulate(),
+            .chainin_overflow(),
+            .chainin_invalid(),
+            .chainin_underflow(),
+            .chainin_inexact(),
+            .ax(),
+            .ay(datab[i]),
+            .az(dataa[i]),
+            .clk({2'b00,clk}),
+            .ena({2'b11,enable}),
+            .aclr(2'b00),
+            .chainin(),
+            // outputs
+            .overflow(),
+            .invalid(),
+            .underflow(),
+            .inexact(),
+            .chainout_overflow(),
+            .chainout_invalid(),
+            .chainout_underflow(),
+            .chainout_inexact(),
+            .resulta(result_mul),
+            .chainout()
+        );
+        defparam mac_fp_mul.operation_mode = "sp_mult"; 
+        defparam mac_fp_mul.use_chainin = "false"; 
+        defparam mac_fp_mul.adder_subtract = "false"; 
+        defparam mac_fp_mul.ax_clock = "none"; 
+        defparam mac_fp_mul.ay_clock = "0"; 
+        defparam mac_fp_mul.az_clock = "0"; 
+        defparam mac_fp_mul.output_clock = "0"; 
+        defparam mac_fp_mul.accumulate_clock = "none"; 
+        defparam mac_fp_mul.ax_chainin_pl_clock = "none"; 
+        defparam mac_fp_mul.accum_pipeline_clock = "none"; 
+        defparam mac_fp_mul.mult_pipeline_clock = "0"; 
+        defparam mac_fp_mul.adder_input_clock = "none"; 
+        defparam mac_fp_mul.accum_adder_clock = "none";
+    `else
+        always @(posedge clk) begin
+           dpi_fadd(0*LANES+i, enable, dataa[i], datab[i], result_add);
+           dpi_fsub(1*LANES+i, enable, dataa[i], datab[i], result_sub);
+           dpi_fmul(2*LANES+i, enable, dataa[i], datab[i], result_mul);
+        end
+    `endif
+
+        assign result[i] = do_mul_r ? result_mul : (do_sub_r ? result_sub : result_add);
+    end
+    
+    VX_shift_register #(
+        .DATAW(TAGW + 1 + 1 + 1),
+        .DEPTH(`LATENCY_FADDMUL)
+    ) shift_reg (
+        .clk(clk),
+        .reset(reset),
+        .enable(enable),
+        .in({tag_in,   valid_in,  do_sub,   do_mul}),
+        .out({tag_out, valid_out, do_sub_r, do_mul_r})
+    );
+
+    assign ready_in = enable;
+
+endmodule
--- a/hw/rtl/fp_cores/VX_fp_div.v
+++ b/hw/rtl/fp_cores/VX_fp_div.v
@ -26,20 +26,21 @@ module VX_fp_div #(
    output wire valid_out
 );    
    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;
    
    for (genvar i = 0; i < LANES; i++) begin
    `ifdef QUARTUS
-        acl_fp_div fdiv (
+        acl_fdiv fdiv (
            .clk    (clk),
            .areset (1'b0),
-            .en     (~stall),
+            .en     (enable),
            .a      (dataa[i]),
            .b      (datab[i]),
            .q      (result[i])
        );
    `else 
        always @(posedge clk) begin
-           dpi_fdiv(8*LANES+i, ~stall, valid_in, dataa[i], datab[i], result[i]);
+           dpi_fdiv(8*LANES+i, enable, dataa[i], datab[i], result[i]);
        end
    `endif
    end
@ -50,11 +51,11 @@ module VX_fp_div #(
    ) shift_reg (
        .clk(clk),
        .reset(reset),
-        .enable(~stall),
+        .enable(enable),
        .in ({tag_in,  valid_in}),
        .out({tag_out, valid_out})
    );

-    assign ready_in = ~stall;
+    assign ready_in = enable;

 endmodule
--- a/hw/rtl/fp_cores/VX_fp_fpga.v
+++ b/hw/rtl/fp_cores/VX_fp_fpga.v
@ -27,7 +27,7 @@ module VX_fp_fpga #(
    input wire  ready_out,
    output wire valid_out
 );
-    localparam NUM_FPC  = 7;
+    localparam NUM_FPC  = 8;
    localparam FPC_BITS = `LOG2UP(NUM_FPC);
    
    wire [NUM_FPC-1:0] per_core_ready_in;
@ -40,29 +40,28 @@ module VX_fp_fpga #(
    fflags_t [`NUM_THREADS-1:0] fpnew_fflags;  

    reg [FPC_BITS-1:0] core_select;
-    reg do_add, do_sub, do_mul;
+    reg do_sub, do_mul;
    reg is_signed;

    always @(*) begin
        core_select = 'x;
-        do_add      = 'x;
        do_sub      = 'x;
        do_mul      = 'x;
        is_signed   = 'x;
        case (op_type)
-            `FPU_ADD:    begin core_select = 1; do_mul = 0; do_add = 1; do_sub = 0; end
-            `FPU_SUB:    begin core_select = 1; do_mul = 0; do_add = 0; do_sub = 1; end
-            `FPU_MUL:    begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 0; end
-            `FPU_MADD:   begin core_select = 1; do_mul = 1; do_add = 1; do_sub = 0; end
-            `FPU_MSUB:   begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 1; end
-            `FPU_NMSUB:  begin core_select = 2; do_sub = 1; end
-            `FPU_NMADD:  begin core_select = 2; do_sub = 0; end           
-            `FPU_DIV:    begin core_select = 3; end
-            `FPU_SQRT:   begin core_select = 4; end
-            `FPU_CVTWS:  begin core_select = 5; is_signed = 1; end
-            `FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
-            `FPU_CVTSW:  begin core_select = 6; is_signed = 1; end
-            `FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
+            `FPU_ADD:    begin core_select = 1; do_mul = 0; do_sub = 0; end
+            `FPU_SUB:    begin core_select = 1; do_mul = 0; do_sub = 1; end
+            `FPU_MUL:    begin core_select = 1; do_mul = 1; do_sub = 0; end
+            `FPU_MADD:   begin core_select = 2; do_sub = 0; end
+            `FPU_MSUB:   begin core_select = 2; do_sub = 1; end
+            `FPU_NMADD:  begin core_select = 3; do_sub = 0; end
+            `FPU_NMSUB:  begin core_select = 3; do_sub = 1; end
+            `FPU_DIV:    begin core_select = 4; end
+            `FPU_SQRT:   begin core_select = 5; end
+            `FPU_CVTWS:  begin core_select = 6; is_signed = 1; end
+            `FPU_CVTWUS: begin core_select = 6; is_signed = 0; end
+            `FPU_CVTSW:  begin core_select = 7; is_signed = 1; end
+            `FPU_CVTSWU: begin core_select = 7; is_signed = 0; end
            default:     begin core_select = 0; end
        endcase
    end
@ -88,25 +87,42 @@ module VX_fp_fpga #(
        .valid_out  (per_core_valid_out[0])
    );
    
+    VX_fp_addmul #(
+        .TAGW (TAGW),
+        .LANES(`NUM_THREADS)
+    ) fp_addmul (
+        .clk        (clk), 
+        .reset      (reset),   
+        .valid_in   (valid_in && (core_select == 1)),
+        .ready_in   (per_core_ready_in[1]),    
+        .tag_in     (tag_in),   
+        .do_sub     (do_sub),
+        .do_mul     (do_mul),
+        .dataa      (dataa), 
+        .datab      (datab),    
+        .result     (per_core_result[1]),
+        .tag_out    (per_core_tag_out[1]),
+        .ready_out  (per_core_ready_out[1]),
+        .valid_out  (per_core_valid_out[1])
+    );
+
    VX_fp_madd #(
        .TAGW (TAGW),
        .LANES(`NUM_THREADS)
    ) fp_madd (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 1)),
-        .ready_in   (per_core_ready_in[1]),    
-        .tag_in     (tag_in),    
-        .do_add     (do_add),
+        .valid_in   (valid_in && (core_select == 2)),
+        .ready_in   (per_core_ready_in[2]),    
+        .tag_in     (tag_in),  
        .do_sub     (do_sub),
-        .do_mul     (do_mul),
        .dataa      (dataa), 
        .datab      (datab),      
        .datac      (datac),   
-        .result     (per_core_result[1]),
-        .tag_out    (per_core_tag_out[1]),
-        .ready_out  (per_core_ready_out[1]),
-        .valid_out  (per_core_valid_out[1])
+        .result     (per_core_result[2]),
+        .tag_out    (per_core_tag_out[2]),
+        .ready_out  (per_core_ready_out[2]),
+        .valid_out  (per_core_valid_out[2])
    );

    VX_fp_nmadd #(
@ -115,17 +131,17 @@ module VX_fp_fpga #(
    ) fp_nmadd (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 2)),
-        .ready_in   (per_core_ready_in[2]),    
+        .valid_in   (valid_in && (core_select == 3)),
+        .ready_in   (per_core_ready_in[3]),    
        .tag_in     (tag_in),  
        .do_sub     (do_sub),
        .dataa      (dataa), 
        .datab      (datab),   
        .datac      (datac),              
-        .result     (per_core_result[2]),
-        .tag_out    (per_core_tag_out[2]),
-        .ready_out  (per_core_ready_out[2]),
-        .valid_out  (per_core_valid_out[2])
+        .result     (per_core_result[3]),
+        .tag_out    (per_core_tag_out[3]),
+        .ready_out  (per_core_ready_out[3]),
+        .valid_out  (per_core_valid_out[3])
    );

    VX_fp_div #(
@ -134,15 +150,15 @@ module VX_fp_fpga #(
    ) fp_div (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 3)),
-        .ready_in   (per_core_ready_in[3]),    
+        .valid_in   (valid_in && (core_select == 4)),
+        .ready_in   (per_core_ready_in[4]),    
        .tag_in     (tag_in),    
        .dataa      (dataa), 
        .datab      (datab),         
-        .result     (per_core_result[3]),
-        .tag_out    (per_core_tag_out[3]),
-        .ready_out  (per_core_ready_out[3]),
-        .valid_out  (per_core_valid_out[3])
+        .result     (per_core_result[4]),
+        .tag_out    (per_core_tag_out[4]),
+        .ready_out  (per_core_ready_out[4]),
+        .valid_out  (per_core_valid_out[4])
    );

    VX_fp_sqrt #(
@ -151,14 +167,14 @@ module VX_fp_fpga #(
    ) fp_sqrt (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 4)),
-        .ready_in   (per_core_ready_in[4]),    
+        .valid_in   (valid_in && (core_select == 5)),
+        .ready_in   (per_core_ready_in[5]),    
        .tag_in     (tag_in),    
        .dataa      (dataa),  
-        .result     (per_core_result[4]),
-        .tag_out    (per_core_tag_out[4]),
-        .ready_out  (per_core_ready_out[4]),
-        .valid_out  (per_core_valid_out[4])
+        .result     (per_core_result[5]),
+        .tag_out    (per_core_tag_out[5]),
+        .ready_out  (per_core_ready_out[5]),
+        .valid_out  (per_core_valid_out[5])
    );

    VX_fp_ftoi #(
@ -167,15 +183,15 @@ module VX_fp_fpga #(
    ) fp_ftoi (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 5)),
-        .ready_in   (per_core_ready_in[5]),    
+        .valid_in   (valid_in && (core_select == 6)),
+        .ready_in   (per_core_ready_in[6]),    
        .tag_in     (tag_in), 
        .is_signed  (is_signed),   
        .dataa      (dataa),  
-        .result     (per_core_result[5]),
-        .tag_out    (per_core_tag_out[5]),
-        .ready_out  (per_core_ready_out[5]),
-        .valid_out  (per_core_valid_out[5])
+        .result     (per_core_result[6]),
+        .tag_out    (per_core_tag_out[6]),
+        .ready_out  (per_core_ready_out[6]),
+        .valid_out  (per_core_valid_out[6])
    );

    VX_fp_itof #(
@ -184,15 +200,15 @@ module VX_fp_fpga #(
    ) fp_itof (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 6)),
-        .ready_in   (per_core_ready_in[6]),    
+        .valid_in   (valid_in && (core_select == 7)),
+        .ready_in   (per_core_ready_in[7]),    
        .tag_in     (tag_in), 
        .is_signed  (is_signed),      
        .dataa      (dataa),  
-        .result     (per_core_result[6]),
-        .tag_out    (per_core_tag_out[6]),
-        .ready_out  (per_core_ready_out[6]),
-        .valid_out  (per_core_valid_out[6])
+        .result     (per_core_result[7]),
+        .tag_out    (per_core_tag_out[7]),
+        .ready_out  (per_core_ready_out[7]),
+        .valid_out  (per_core_valid_out[7])
    );

    reg valid_out_n;
--- a/hw/rtl/fp_cores/VX_fp_ftoi.v
+++ b/hw/rtl/fp_cores/VX_fp_ftoi.v
@ -27,6 +27,7 @@ module VX_fp_ftoi #(
    output wire valid_out
 );    
    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;

    reg is_signed_r;
    
@ -36,25 +37,25 @@ module VX_fp_ftoi #(
        wire [31:0] result_u;

    `ifdef QUARTUS       
-        acl_fp_ftoi ftoi (
+        acl_ftoi ftoi (
            .clk    (clk),
            .areset (1'b0),
-            .en     (~stall),
+            .en     (enable),
            .a      (dataa[i]),
            .q      (result_s)
        );

-        acl_fp_ftou ftou (
+        acl_ftou ftou (
            .clk    (clk),
            .areset (1'b0),
-            .en     (~stall),
+            .en     (enable),
            .a      (dataa[i]),
            .q      (result_u)
        );        
    `else
        always @(posedge clk) begin
-           dpi_ftoi(10*LANES+i, ~stall, valid_in, dataa[i], result_s);
-           dpi_ftou(11*LANES+i, ~stall, valid_in, dataa[i], result_u);
+           dpi_ftoi(10*LANES+i, enable, dataa[i], result_s);
+           dpi_ftou(11*LANES+i, enable, dataa[i], result_u);
        end
    `endif

@ -67,11 +68,11 @@ module VX_fp_ftoi #(
    ) shift_reg (
        .clk(clk),
        .reset(reset),
-        .enable(~stall),
+        .enable(enable),
        .in ({tag_in,  valid_in,  is_signed}),
        .out({tag_out, valid_out, is_signed_r})
    );

-    assign ready_in = ~stall;
+    assign ready_in = enable;

 endmodule
--- a/hw/rtl/fp_cores/VX_fp_itof.v
+++ b/hw/rtl/fp_cores/VX_fp_itof.v
@ -27,6 +27,7 @@ module VX_fp_itof #(
    output wire valid_out
 );    
    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;

    reg is_signed_r;

@ -36,25 +37,25 @@ module VX_fp_itof #(
        wire [31:0] result_u;

    `ifdef QUARTUS
-        acl_fp_itof itof (
+        acl_itof itof (
            .clk    (clk),
            .areset (1'b0),
-            .en     (~stall),
+            .en     (enable),
            .a      (dataa[i]),
            .q      (result_s)
        );

-        acl_fp_utof utof (
+        acl_utof utof (
            .clk    (clk),
            .areset (1'b0),
-            .en     (~stall),
+            .en     (enable),
            .a      (dataa[i]),
            .q      (result_u)
        );
    `else
        always @(posedge clk) begin
-           dpi_itof(12*LANES+i, ~stall, valid_in, dataa[i], result_s);
-           dpi_utof(13*LANES+i, ~stall, valid_in, dataa[i], result_u);
+           dpi_itof(12*LANES+i, enable, dataa[i], result_s);
+           dpi_utof(13*LANES+i, enable, dataa[i], result_u);
        end
    `endif

@ -67,11 +68,11 @@ module VX_fp_itof #(
    ) shift_reg (
        .clk(clk),
        .reset(reset),
-        .enable(~stall),
+        .enable(enable),
        .in ({tag_in,  valid_in,  is_signed}),
        .out({tag_out, valid_out, is_signed_r})
    );

-    assign ready_in = ~stall;
+    assign ready_in = enable;

 endmodule
--- a/hw/rtl/fp_cores/VX_fp_madd.v
+++ b/hw/rtl/fp_cores/VX_fp_madd.v
@ -16,9 +16,7 @@ module VX_fp_madd #(

    input wire [TAGW-1:0] tag_in,

-    input wire  do_add,
-    input wire  do_sub,
-    input wire  do_mul,    
+    input wire  do_sub,  

    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
@ -32,138 +30,16 @@ module VX_fp_madd #(
 );    
    
    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;

-    reg do_add_r, do_sub_r, do_mul_r;
+    reg do_sub_r;

    for (genvar i = 0; i < LANES; i++) begin
        
-        wire [31:0] result_add;
-        wire [31:0] result_sub;
-        wire [31:0] result_mul;
        wire [31:0] result_madd;
        wire [31:0] result_msub;

    `ifdef QUARTUS
-        twentynm_fp_mac mac_fp_add (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(dataa[i]),
-            .ay(datab[i]),
-            .az(),
-            .clk({2'b00,clk}),
-            .ena({2'b11,~stall}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result_add),
-            .chainout()
-        );
-        defparam mac_fp_add.operation_mode = "sp_add"; 
-        defparam mac_fp_add.use_chainin = "false"; 
-        defparam mac_fp_add.adder_subtract = "false"; 
-        defparam mac_fp_add.ax_clock = "0"; 
-        defparam mac_fp_add.ay_clock = "0"; 
-        defparam mac_fp_add.az_clock = "none"; 
-        defparam mac_fp_add.output_clock = "0"; 
-        defparam mac_fp_add.accumulate_clock = "none"; 
-        defparam mac_fp_add.ax_chainin_pl_clock = "none"; 
-        defparam mac_fp_add.accum_pipeline_clock = "none"; 
-        defparam mac_fp_add.mult_pipeline_clock = "none"; 
-        defparam mac_fp_add.adder_input_clock = "0"; 
-        defparam mac_fp_add.accum_adder_clock = "none"; 
-
-        twentynm_fp_mac mac_fp_sub (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(dataa[i]),
-            .ay(datab[i]),
-            .az(),
-            .clk({2'b00,clk}),
-            .ena({2'b11,~stall}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result_sub),
-            .chainout()
-        );
-        defparam mac_fp_sub.operation_mode = "sp_add"; 
-        defparam mac_fp_sub.use_chainin = "false"; 
-        defparam mac_fp_sub.adder_subtract = "true"; 
-        defparam mac_fp_sub.ax_clock = "0"; 
-        defparam mac_fp_sub.ay_clock = "0"; 
-        defparam mac_fp_sub.az_clock = "none"; 
-        defparam mac_fp_sub.output_clock = "0"; 
-        defparam mac_fp_sub.accumulate_clock = "none"; 
-        defparam mac_fp_sub.ax_chainin_pl_clock = "none"; 
-        defparam mac_fp_sub.accum_pipeline_clock = "none"; 
-        defparam mac_fp_sub.mult_pipeline_clock = "none"; 
-        defparam mac_fp_sub.adder_input_clock = "0"; 
-        defparam mac_fp_sub.accum_adder_clock = "none";
-
-        twentynm_fp_mac mac_fp_mul (
-            // inputs
-            .accumulate(),
-            .chainin_overflow(),
-            .chainin_invalid(),
-            .chainin_underflow(),
-            .chainin_inexact(),
-            .ax(),
-            .ay(datab[i]),
-            .az(dataa[i]),
-            .clk({2'b00,clk}),
-            .ena({2'b11,~stall}),
-            .aclr(2'b00),
-            .chainin(),
-            // outputs
-            .overflow(),
-            .invalid(),
-            .underflow(),
-            .inexact(),
-            .chainout_overflow(),
-            .chainout_invalid(),
-            .chainout_underflow(),
-            .chainout_inexact(),
-            .resulta(result_mul),
-            .chainout()
-        );
-        defparam mac_fp_mul.operation_mode = "sp_mult"; 
-        defparam mac_fp_mul.use_chainin = "false"; 
-        defparam mac_fp_mul.adder_subtract = "false"; 
-        defparam mac_fp_mul.ax_clock = "none"; 
-        defparam mac_fp_mul.ay_clock = "0"; 
-        defparam mac_fp_mul.az_clock = "0"; 
-        defparam mac_fp_mul.output_clock = "0"; 
-        defparam mac_fp_mul.accumulate_clock = "none"; 
-        defparam mac_fp_mul.ax_chainin_pl_clock = "none"; 
-        defparam mac_fp_mul.accum_pipeline_clock = "none"; 
-        defparam mac_fp_mul.mult_pipeline_clock = "0"; 
-        defparam mac_fp_mul.adder_input_clock = "none"; 
-        defparam mac_fp_mul.accum_adder_clock = "none"; 
-
        twentynm_fp_mac mac_fp_madd (
            // inputs
            .accumulate(),
@ -175,7 +51,7 @@ module VX_fp_madd #(
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
-            .ena({2'b11,~stall}),
+            .ena({2'b11,enable}),
            .aclr(2'b00),
            .chainin(),
            // outputs
@ -215,7 +91,7 @@ module VX_fp_madd #(
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
-            .ena({2'b11,~stall}),
+            .ena({2'b11,enable}),
            .aclr(2'b00),
            .chainin(),
            // outputs
@ -245,47 +121,25 @@ module VX_fp_madd #(
        defparam mac_fp_msub.accum_adder_clock = "none";
    `else
        always @(posedge clk) begin
-           dpi_fadd(0*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_add);
-           dpi_fsub(1*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_sub);
-           dpi_fmul(2*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_mul);
-           dpi_fmadd(3*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_madd);
-           dpi_fmsub(4*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_msub);
+           dpi_fmadd(3*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
+           dpi_fmsub(4*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
        end
    `endif

-        reg [31:0] result_r;        
-
-        always @(*) begin
-            result_r = 'x;
-            if (do_mul_r) begin
-                if (do_add_r)
-                    result_r = result_madd;
-                else if (do_sub_r)
-                    result_r = result_msub;
-                else
-                    result_r = result_mul;
-            end else begin
-                if (do_add_r)
-                    result_r = result_add;
-                else if (do_sub_r)
-                    result_r = result_sub;
-            end            
-        end
-
-        assign result[i] = result_r;
+        assign result[i] = do_sub_r ? result_msub : result_madd;
    end
    
    VX_shift_register #(
-        .DATAW(TAGW + 1 + 1 + 1 + 1),
+        .DATAW(TAGW + 1 + 1),
        .DEPTH(`LATENCY_FMADD)
-    ) shift_reg1 (
+    ) shift_reg (
        .clk(clk),
        .reset(reset),
-        .enable(~stall),
-        .in({tag_in,   valid_in,  do_add,   do_sub,   do_mul}),
-        .out({tag_out, valid_out, do_add_r, do_sub_r, do_mul_r})
+        .enable(enable),
+        .in({tag_in,   valid_in,  do_sub}),
+        .out({tag_out, valid_out, do_sub_r})
    );

-    assign ready_in  = ~stall;
+    assign ready_in = enable;

 endmodule
--- a/hw/rtl/fp_cores/VX_fp_nmadd.v
+++ b/hw/rtl/fp_cores/VX_fp_nmadd.v
@ -30,13 +30,14 @@ module VX_fp_nmadd #(
 ); 

    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;

    reg do_sub_r;

    for (genvar i = 0; i < LANES; i++) begin

        wire [31:0] result_madd;
-        wire [31:0] result_msub; 
+        wire [31:0] result_msub;     

        wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;

@ -52,7 +53,7 @@ module VX_fp_nmadd #(
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
-            .ena({2'b11,~stall}),
+            .ena({2'b11,enable}),
            .aclr(2'b00),
            .chainin(),
            // outputs
@ -161,33 +162,36 @@ module VX_fp_nmadd #(
        defparam mac_fp_neg.adder_input_clock = "0"; 
        defparam mac_fp_neg.accum_adder_clock = "none";
    `else
-        reg valid_in_st0;
        always @(posedge clk) begin
-           valid_in_st0 <= reset ? 0 : valid_in; 
-           dpi_fmadd(5*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_madd);
-           dpi_fmsub(6*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_msub);
-           dpi_fsub(7*LANES+i, ~stall, valid_in_st0, 32'b0, result_st0, result[i]);
+           dpi_fmadd(5*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
+           dpi_fmsub(6*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
+           dpi_fsub(7*LANES+i, enable, 32'b0, result_st0, result[i]);
        end
    `endif
-    end    
-
-    always @(posedge clk) begin
-        if (~stall) begin
-            do_sub_r <= do_sub;
-        end
    end

+    VX_shift_register #(
+        .DATAW(1),
+        .DEPTH(`LATENCY_FMADD)
+    ) shift_reg0 (
+        .clk(clk),
+        .reset(reset),
+        .enable(enable),
+        .in({do_sub}),
+        .out({do_sub_r})
+    );
+
    VX_shift_register #(
        .DATAW(TAGW + 1),
-        .DEPTH(`LATENCY_FNMADD)
+        .DEPTH(`LATENCY_FMADD + `LATENCY_FADDMUL)
    ) shift_reg1 (
        .clk(clk),
        .reset(reset),
-        .enable(~stall),
+        .enable(enable),
        .in({tag_in,   valid_in}),
        .out({tag_out, valid_out})
    );

-    assign ready_in  = ~stall;
+    assign ready_in = enable;

 endmodule
--- a/hw/rtl/fp_cores/VX_fp_sqrt.v
+++ b/hw/rtl/fp_cores/VX_fp_sqrt.v
@ -25,19 +25,20 @@ module VX_fp_sqrt #(
    output wire valid_out
 );    
    wire stall = ~ready_out && valid_out;
+    wire enable = ~stall;
    
    for (genvar i = 0; i < LANES; i++) begin
    `ifdef QUARTUS
-        acl_fp_sqrt fsqrt (
+        acl_fsqrt fsqrt (
            .clk    (clk),
            .areset (1'b0),
-            .en     (~stall),
+            .en     (enable),
            .a      (dataa[i]),
            .q      (result[i])
        );
    `else
        always @(posedge clk) begin
-           dpi_fsqrt(9*LANES+i, ~stall, valid_in, dataa[i], result[i]);
+           dpi_fsqrt(9*LANES+i, enable, dataa[i], result[i]);
        end
    `endif
    end
@ -48,11 +49,11 @@ module VX_fp_sqrt #(
    ) shift_reg (
        .clk(clk),
        .reset(reset),
-        .enable(~stall),
+        .enable(enable),
        .in ({tag_in,  valid_in}),
        .out({tag_out, valid_out})
    );

-    assign ready_in = ~stall;
+    assign ready_in = enable;

 endmodule
--- a/hw/rtl/fp_cores/altera/acl_fp_div.sv
+++ b/hw/rtl/fp_cores/altera/acl_fp_div.sv
@ -15,12 +15,12 @@
 // applicable agreement for further details.
 // ---------------------------------------------------------------------------

-// SystemVerilog created from acl_fp_div
-// SystemVerilog created on Mon Aug 31 06:15:17 2020
+// SystemVerilog created from acl_fdiv
+// SystemVerilog created on Wed Sep  2 07:11:09 2020


 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
-module acl_fp_div (
+module acl_fdiv (
    input wire [31:0] a,
    input wire [31:0] b,
    input wire [0:0] en,
@ -623,7 +623,7 @@ module acl_fp_div (
        .outdata_aclr_a("CLEAR0"),
        .clock_enable_input_a("NORMAL"),
        .power_up_uninitialized("FALSE"),
-        .init_file("acl_fp_div_memoryC2_uid118_invTables_lutmem.hex"),
+        .init_file("acl_fdiv_memoryC2_uid118_invTables_lutmem.hex"),
        .init_file_layout("PORT_A"),
        .intended_device_family("Arria 10")
    ) memoryC2_uid118_invTables_lutmem_dmem (
@ -755,7 +755,7 @@ module acl_fp_div (
        .outdata_aclr_a("CLEAR0"),
        .clock_enable_input_a("NORMAL"),
        .power_up_uninitialized("FALSE"),
-        .init_file("acl_fp_div_memoryC1_uid115_invTables_lutmem.hex"),
+        .init_file("acl_fdiv_memoryC1_uid115_invTables_lutmem.hex"),
        .init_file_layout("PORT_A"),
        .intended_device_family("Arria 10")
    ) memoryC1_uid115_invTables_lutmem_dmem (
@ -1060,7 +1060,7 @@ module acl_fp_div (
        .outdata_aclr_a("CLEAR0"),
        .clock_enable_input_a("NORMAL"),
        .power_up_uninitialized("FALSE"),
-        .init_file("acl_fp_div_memoryC0_uid112_invTables_lutmem.hex"),
+        .init_file("acl_fdiv_memoryC0_uid112_invTables_lutmem.hex"),
        .init_file_layout("PORT_A"),
        .intended_device_family("Arria 10")
    ) memoryC0_uid112_invTables_lutmem_dmem (
--- a/hw/rtl/fp_cores/altera/acl_fp_div_memoryC0_uid112_invTables_lutmem.hex
+++ b/hw/rtl/fp_cores/altera/acl_fp_div_memoryC0_uid112_invTables_lutmem.hex
--- a/hw/rtl/fp_cores/altera/acl_fp_div_memoryC1_uid115_invTables_lutmem.hex
+++ b/hw/rtl/fp_cores/altera/acl_fp_div_memoryC1_uid115_invTables_lutmem.hex
--- a/hw/rtl/fp_cores/altera/acl_fp_div_memoryC2_uid118_invTables_lutmem.hex
+++ b/hw/rtl/fp_cores/altera/acl_fp_div_memoryC2_uid118_invTables_lutmem.hex
--- a/hw/rtl/fp_cores/altera/acl_fp_sqrt.sv
+++ b/hw/rtl/fp_cores/altera/acl_fp_sqrt.sv
@ -15,12 +15,12 @@
 // applicable agreement for further details.
 // ---------------------------------------------------------------------------

-// SystemVerilog created from acl_fp_sqrt
-// SystemVerilog created on Mon Aug 31 06:15:18 2020
+// SystemVerilog created from acl_fsqrt
+// SystemVerilog created on Wed Sep  2 07:11:09 2020


 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
-module acl_fp_sqrt (
+module acl_fsqrt (
    input wire [31:0] a,
    input wire [0:0] en,
    output wire [31:0] q,
@ -279,7 +279,7 @@ module acl_fp_sqrt (
        .outdata_aclr_a("CLEAR0"),
        .clock_enable_input_a("NORMAL"),
        .power_up_uninitialized("FALSE"),
-        .init_file("acl_fp_sqrt_memoryC2_uid68_sqrtTables_lutmem.hex"),
+        .init_file("acl_fsqrt_memoryC2_uid68_sqrtTables_lutmem.hex"),
        .init_file_layout("PORT_A"),
        .intended_device_family("Arria 10")
    ) memoryC2_uid68_sqrtTables_lutmem_dmem (
@ -412,7 +412,7 @@ module acl_fp_sqrt (
        .outdata_aclr_a("CLEAR0"),
        .clock_enable_input_a("NORMAL"),
        .power_up_uninitialized("FALSE"),
-        .init_file("acl_fp_sqrt_memoryC1_uid65_sqrtTables_lutmem.hex"),
+        .init_file("acl_fsqrt_memoryC1_uid65_sqrtTables_lutmem.hex"),
        .init_file_layout("PORT_A"),
        .intended_device_family("Arria 10")
    ) memoryC1_uid65_sqrtTables_lutmem_dmem (
@ -723,7 +723,7 @@ module acl_fp_sqrt (
        .outdata_aclr_a("CLEAR0"),
        .clock_enable_input_a("NORMAL"),
        .power_up_uninitialized("FALSE"),
-        .init_file("acl_fp_sqrt_memoryC0_uid62_sqrtTables_lutmem.hex"),
+        .init_file("acl_fsqrt_memoryC0_uid62_sqrtTables_lutmem.hex"),
        .init_file_layout("PORT_A"),
        .intended_device_family("Arria 10")
    ) memoryC0_uid62_sqrtTables_lutmem_dmem (
--- a/hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC0_uid62_sqrtTables_lutmem.hex
+++ b/hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC0_uid62_sqrtTables_lutmem.hex
--- a/hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC1_uid65_sqrtTables_lutmem.hex
+++ b/hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC1_uid65_sqrtTables_lutmem.hex
--- a/hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC2_uid68_sqrtTables_lutmem.hex
+++ b/hw/rtl/fp_cores/altera/acl_fp_sqrt_memoryC2_uid68_sqrtTables_lutmem.hex
--- a/hw/rtl/fp_cores/altera/acl_fp_ftoi.sv
+++ b/hw/rtl/fp_cores/altera/acl_fp_ftoi.sv
@ -15,12 +15,12 @@
 // applicable agreement for further details.
 // ---------------------------------------------------------------------------

-// SystemVerilog created from acl_fp_ftoi
-// SystemVerilog created on Mon Aug 31 06:15:18 2020
+// SystemVerilog created from acl_ftoi
+// SystemVerilog created on Wed Sep  2 07:11:09 2020


 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
-module acl_fp_ftoi (
+module acl_ftoi (
    input wire [31:0] a,
    input wire [0:0] en,
    output wire [31:0] q,
--- a/hw/rtl/fp_cores/altera/acl_fp_ftou.sv
+++ b/hw/rtl/fp_cores/altera/acl_fp_ftou.sv
@ -15,12 +15,12 @@
 // applicable agreement for further details.
 // ---------------------------------------------------------------------------

-// SystemVerilog created from acl_fp_ftou
-// SystemVerilog created on Mon Aug 31 06:15:18 2020
+// SystemVerilog created from acl_ftou
+// SystemVerilog created on Wed Sep  2 07:11:09 2020


 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
-module acl_fp_ftou (
+module acl_ftou (
    input wire [31:0] a,
    input wire [0:0] en,
    output wire [31:0] q,
--- a/hw/rtl/fp_cores/altera/acl_gen.log
+++ b/hw/rtl/fp_cores/altera/acl_gen.log
@ -0,0 +1,169 @@
+starting execution ... 
+build model options ... 
+argc=21
+Generation context:
+    HardFP is enabled enabling set to true 
+    Faithful rounding constraint detected 
+        Will not generate valid and channel signals 
+    The new component name is acl_fdiv
+    Frequency 250MHz 
+    Deployment FPGA Arria10 
+Estimated resources LUTs 539, DSPs 5, RAMBits 32768, RAMBlocks 3 
+The pipeline depth of the block is 15 cycle(s)
+@@start
+@name FPDiv@
+@latency 15@
+@LUT 539@
+@DSP 5@
+@RAMBits 32768@
+@RAMBlockUsage 3@
+@enable 1@
+@subnormals 0@
+@error 1.00@
+@rounding NA@
+@method polynomial approximation@
+@inPort 0 fpieee 8 23@
+@inPort 1 fpieee 8 23@
+@outPort 0 fpieee 8 23@
+@nochanvalid 1@
+@@end
+starting execution ... 
+build model options ... 
+argc=20
+Generation context:
+    HardFP is enabled enabling set to true 
+    Faithful rounding constraint detected 
+        Will not generate valid and channel signals 
+    The new component name is acl_fsqrt
+    Frequency 250MHz 
+    Deployment FPGA Arria10 
+Estimated resources LUTs 271, DSPs 3, RAMBits 15872, RAMBlocks 3 
+The pipeline depth of the block is 10 cycle(s)
+@@start
+@name FPSqrt@
+@latency 10@
+@LUT 271@
+@DSP 3@
+@RAMBits 15872@
+@RAMBlockUsage 3@
+@enable 1@
+@subnormals 0@
+@error 1.00@
+@rounding NA@
+@method polynomial approximation@
+@inPort 0 fpieee 8 23@
+@outPort 0 fpieee 8 23@
+@nochanvalid 1@
+@@end
+starting execution ... 
+build model options ... 
+argc=23
+Generation context:
+    HardFP is enabled enabling set to true 
+    Faithful rounding constraint detected 
+        Will not generate valid and channel signals 
+    The new component name is acl_ftoi
+    Frequency 250MHz 
+    Deployment FPGA Arria10 
+Estimated resources LUTs 327, DSPs 0, RAMBits 0, RAMBlocks 0 
+The pipeline depth of the block is 3 cycle(s)
+@@start
+@name FPToFXP@
+@latency 3@
+@LUT 327@
+@DSP 0@
+@RAMBits 0@
+@RAMBlockUsage 0@
+@enable 1@
+@subnormals 0@
+@error 1.00@
+@rounding NA@
+@method default@
+@inPort 0 fpieee 8 23@
+@outPort 0 fxp 32 0 1@
+@nochanvalid 1@
+@@end
+starting execution ... 
+build model options ... 
+argc=23
+Generation context:
+    HardFP is enabled enabling set to true 
+    Faithful rounding constraint detected 
+        Will not generate valid and channel signals 
+    The new component name is acl_ftou
+    Frequency 250MHz 
+    Deployment FPGA Arria10 
+Estimated resources LUTs 287, DSPs 0, RAMBits 0, RAMBlocks 0 
+The pipeline depth of the block is 3 cycle(s)
+@@start
+@name FPToFXP@
+@latency 3@
+@LUT 287@
+@DSP 0@
+@RAMBits 0@
+@RAMBlockUsage 0@
+@enable 1@
+@subnormals 0@
+@error 1.00@
+@rounding NA@
+@method default@
+@inPort 0 fpieee 8 23@
+@outPort 0 fxp 32 0 0@
+@nochanvalid 1@
+@@end
+starting execution ... 
+build model options ... 
+argc=23
+Generation context:
+    HardFP is enabled enabling set to true 
+    Faithful rounding constraint detected 
+        Will not generate valid and channel signals 
+    The new component name is acl_itof
+    Frequency 250MHz 
+    Deployment FPGA Arria10 
+Estimated resources LUTs 397, DSPs 0, RAMBits 0, RAMBlocks 0 
+The pipeline depth of the block is 7 cycle(s)
+@@start
+@name FXPToFP@
+@latency 7@
+@LUT 397@
+@DSP 0@
+@RAMBits 0@
+@RAMBlockUsage 0@
+@enable 1@
+@subnormals 0@
+@error 1.00@
+@rounding NA@
+@method default@
+@inPort 0 fxp 32 0 1@
+@outPort 0 fpieee 8 23@
+@nochanvalid 1@
+@@end
+starting execution ... 
+build model options ... 
+argc=23
+Generation context:
+    HardFP is enabled enabling set to true 
+    Faithful rounding constraint detected 
+        Will not generate valid and channel signals 
+    The new component name is acl_utof
+    Frequency 300MHz 
+    Deployment FPGA Arria10 
+Estimated resources LUTs 363, DSPs 0, RAMBits 0, RAMBlocks 0 
+The pipeline depth of the block is 7 cycle(s)
+@@start
+@name FXPToFP@
+@latency 7@
+@LUT 363@
+@DSP 0@
+@RAMBits 0@
+@RAMBlockUsage 0@
+@enable 1@
+@subnormals 0@
+@error 1.00@
+@rounding NA@
+@method default@
+@inPort 0 fxp 32 0 0@
+@outPort 0 fpieee 8 23@
+@nochanvalid 1@
+@@end
--- a/hw/rtl/fp_cores/altera/acl_gen.sh
+++ b/hw/rtl/fp_cores/altera/acl_gen.sh
@ -0,0 +1,25 @@
+#!/bin/bash
+
+# Generates the single-precision Arria 10 IP cores (acl_fdiv, acl_fsqrt,
+# acl_ftoi, acl_ftou, acl_itof, acl_utof) by invoking cmdPolyEval from the
+# Quartus DSP Builder backend. All generator output (stdout and stderr) is
+# captured in acl_gen.log in the current directory.
+
+# Abort with a clear message when the Quartus location is not exported;
+# otherwise the tool path below silently becomes /dspba/backend/linux64.
+: "${QUARTUS_HOME:?QUARTUS_HOME must point to the Quartus installation}"
+
+CMD_POLY_EVAL_PATH=$QUARTUS_HOME/dspba/backend/linux64
+
+# Fail before truncating acl_gen.log if the generator is not available.
+if [ ! -x "$CMD_POLY_EVAL_PATH/cmdPolyEval" ]; then
+    echo "error: cmdPolyEval not found in $CMD_POLY_EVAL_PATH" >&2
+    exit 1
+fi
+
+OPTIONS="-target Arria10 -lang verilog -enableHardFP 1 -printMachineReadable -faithfulRounding  -noChanValid -enable -speedgrade 2"
+
+# Append the previous value only when it is non-empty: a trailing ':' would
+# add the current directory to the loader search path.
+export LD_LIBRARY_PATH=$CMD_POLY_EVAL_PATH${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
+
+# $CMD is expanded unquoted below on purpose so that $OPTIONS is split into
+# separate arguments.
+CMD="$CMD_POLY_EVAL_PATH/cmdPolyEval $OPTIONS"
+
+EXP_BITS=8
+MAN_BITS=23
+FBITS="f$(($EXP_BITS + $MAN_BITS + 1))"
+
+echo Generating IP cores for $FBITS
+{
+    $CMD -name acl_fdiv  -frequency 250 FPDiv   $EXP_BITS $MAN_BITS 0
+    $CMD -name acl_fsqrt -frequency 250 FPSqrt  $EXP_BITS $MAN_BITS
+    $CMD -name acl_ftoi  -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 1
+    $CMD -name acl_ftou  -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 0
+    $CMD -name acl_itof  -frequency 250 FXPToFP 32 0 1 $EXP_BITS $MAN_BITS
+    # NOTE(review): acl_utof is the only core requested at 300 MHz; the
+    # others use 250 - confirm this is intentional.
+    $CMD -name acl_utof  -frequency 300 FXPToFP 32 0 0 $EXP_BITS $MAN_BITS
+} > acl_gen.log 2>&1
+
+#cp $QUARTUS_HOME/dspba/backend/Libraries/sv/base/dspba_library_ver.sv .
--- a/hw/rtl/fp_cores/altera/acl_fp_itof.sv
+++ b/hw/rtl/fp_cores/altera/acl_fp_itof.sv
@ -15,12 +15,12 @@
 // applicable agreement for further details.
 // ---------------------------------------------------------------------------

-// SystemVerilog created from acl_fp_itof
-// SystemVerilog created on Mon Aug 31 06:15:18 2020
+// SystemVerilog created from acl_itof
+// SystemVerilog created on Wed Sep  2 07:11:09 2020


 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
-module acl_fp_itof (
+module acl_itof (
    input wire [31:0] a,
    input wire [0:0] en,
    output wire [31:0] q,
--- a/hw/rtl/fp_cores/altera/acl_fp_utof.sv
+++ b/hw/rtl/fp_cores/altera/acl_fp_utof.sv
@ -15,12 +15,12 @@
 // applicable agreement for further details.
 // ---------------------------------------------------------------------------

-// SystemVerilog created from acl_fp_utof
-// SystemVerilog created on Mon Aug 31 06:15:18 2020
+// SystemVerilog created from acl_utof
+// SystemVerilog created on Wed Sep  2 07:11:09 2020


 (* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
-module acl_fp_utof (
+module acl_utof (
    input wire [31:0] a,
    input wire [0:0] en,
    output wire [31:0] q,
--- a/hw/rtl/fp_cores/altera/generate.sh
+++ b/hw/rtl/fp_cores/altera/generate.sh
@ -1,25 +0,0 @@
-#!/bin/bash
-
-CMD_POLY_EVAL_PATH=$QUARTUS_HOME/dspba/backend/linux64
-
-OPTIONS="-target Arria10 -lang verilog -enableHardFP 1 -printMachineReadable -faithfulRounding  -noChanValid -enable -speedgrade 2"
-
-export LD_LIBRARY_PATH=$CMD_POLY_EVAL_PATH:$LD_LIBRARY_PATH
-
-CMD="$CMD_POLY_EVAL_PATH/cmdPolyEval $OPTIONS"
-
-EXP_BITS=8
-MAN_BITS=23
-FBITS="f$(($EXP_BITS + $MAN_BITS + 1))"
-
-echo Generating IP cores for $FBITS
-{
-    $CMD -name acl_fp_div  -frequency 250 FPDiv  $EXP_BITS $MAN_BITS 0
-    $CMD -name acl_fp_sqrt -frequency 250 FPSqrt $EXP_BITS $MAN_BITS
-    $CMD -name acl_fp_ftoi -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 1
-    $CMD -name acl_fp_ftou -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 0
-    $CMD -name acl_fp_itof -frequency 250 FXPToFP 32 0 1 $EXP_BITS $MAN_BITS
-    $CMD -name acl_fp_utof -frequency 300 FXPToFP 32 0 0 $EXP_BITS $MAN_BITS
-} > log.txt 2>&1
-
-cp $QUARTUS_HOME/dspba/backend/Libraries/sv/base/dspba_library_ver.sv .
--- a/hw/rtl/fp_cores/svdpi/float_dpi.cpp
+++ b/hw/rtl/fp_cores/svdpi/float_dpi.cpp
@ -8,21 +8,19 @@
 #include "VX_config.h"

 extern "C" {
-  void dpi_fadd(int inst, bool enable, bool valid, int a, int b, int* result);
-  void dpi_fsub(int inst, bool enable, bool valid, int a, int b, int* result);
-  void dpi_fmul(int inst, bool enable, bool valid, int a, int b, int* result);
-  void dpi_fmadd(int inst, bool enable, bool valid, int a, int b, int c, int* result);
-  void dpi_fmsub(int inst, bool enable, bool valid, int a, int b, int c, int* result);
-  void dpi_fdiv(int inst, bool enable, bool valid, int a, int b, int* result);
-  void dpi_fsqrt(int inst, bool enable, bool valid, int a, int* result);
-  void dpi_ftoi(int inst, bool enable, bool valid, int a, int* result);
-  void dpi_ftou(int inst, bool enable, bool valid, int a, int* result);
-  void dpi_itof(int inst, bool enable, bool valid, int a, int* result);
-  void dpi_utof(int inst, bool enable, bool valid, int a, int* result);
+  void dpi_fadd(int inst, bool enable, int a, int b, int* result);
+  void dpi_fsub(int inst, bool enable, int a, int b, int* result);
+  void dpi_fmul(int inst, bool enable, int a, int b, int* result);
+  void dpi_fmadd(int inst, bool enable, int a, int b, int c, int* result);
+  void dpi_fmsub(int inst, bool enable, int a, int b, int c, int* result);
+  void dpi_fdiv(int inst, bool enable, int a, int b, int* result);
+  void dpi_fsqrt(int inst, bool enable, int a, int* result);
+  void dpi_ftoi(int inst, bool enable, int a, int* result);
+  void dpi_ftou(int inst, bool enable, int a, int* result);
+  void dpi_itof(int inst, bool enable, int a, int* result);
+  void dpi_utof(int inst, bool enable, int a, int* result);
 }

-extern double sc_time_stamp();
-
 class ShiftRegister {
 public:
  ShiftRegister() : init_(false), depth_(0) {}
@ -35,37 +33,36 @@ public:
    }
  }

-  void push(int value, bool enable, bool valid) {
+  // Advances the register by one entry when `enable` is true: each slot
+  // takes the value of its successor (the entry at index 0 is dropped) and
+  // `value` is stored in the last slot. Leaves the buffer untouched when
+  // `enable` is false.
+  // NOTE(review): depth_ is unsigned, so depth_-1 wraps while depth_ is
+  // still 0 - presumably ensure_init() always runs first with a non-zero
+  // depth; confirm.
+  void push(int value, bool enable) {
    if (!enable)
      return;      
    for (unsigned i = 0; i < depth_-1; ++i) {
      buffer_[i] = buffer_[i+1];
    }
-    buffer_[depth_-1].value = value;
-    buffer_[depth_-1].valid = valid;
+    buffer_[depth_-1] = value;
  }

  int top() const {
-    return buffer_[0].value;
-  }
-
-  bool valid() const { 
-    return buffer_[0].valid;
+    // Slot 0 is the front of the register: push() moves entries toward it,
+    // so this is the oldest value still held.
+    return buffer_[0];
  }

 private:

-  struct entry_t {
-    int value;
-    bool valid;
-  };
-
-  std::vector<entry_t> buffer_;
-  int top_;
-  unsigned depth_;
+  std::vector<int> buffer_;
  bool init_;
+  unsigned depth_;  
 };

+// Overlay of a float with an int holding the same storage. The dpi_*
+// functions use it to move between the `int` DPI arguments and `float`
+// arithmetic (write `.i`, read `.f`, or the reverse).
+// NOTE(review): reading a union member other than the one last written and
+// the ordering of the bit-fields in `parts` are compiler-dependent -
+// confirm for the simulators' toolchains.
+union Float_t {    
+    float f;                    // value as a float
+    int   i;                    // same bits as an integer
+    struct {
+        uint32_t man  : 23;     // 23-bit field (mantissa)
+        uint32_t exp  : 8;      // 8-bit field (exponent)
+        uint32_t sign : 1;      // 1-bit field (sign)
+    } parts;
+}; 
+
 class Instances {
 public:
  ShiftRegister& get(int inst) {
@ -82,130 +79,152 @@ private:

 Instances instances;

-void dpi_fadd(int inst, bool enable, bool valid, int a, int b, int* result) {
+// DPI-C float addition. `a` and `b` carry raw float bits; their sum is
+// pushed into the shift register selected by `inst`, which is set up with
+// LATENCY_FADDMUL through ensure_init (presumably sizing it on first use).
+// `*result` receives the register's front entry on every call; the
+// register only advances when `enable` is true.
+void dpi_fadd(int inst, bool enable, int a, int b, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fa = *(float*)&a;
-  float fb = *(float*)&b;
-  float fr = fa + fb;   
+  Float_t fa, fb, fr;

-  sr.ensure_init(LATENCY_FMADD);
-  sr.push(*(int*)&fr, enable, valid);
+  fa.i = a;
+  fb.i = b;
+  fr.f = fa.f + fb.f;
+
+  sr.ensure_init(LATENCY_FADDMUL);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_fsub(int inst, bool enable, bool valid, int a, int b, int* result) {
+// DPI-C float subtraction (a - b on the float views of the operands). The
+// difference's raw bits go through the shift register selected by `inst`,
+// set up with LATENCY_FADDMUL; `*result` is the register's front entry and
+// the register only advances when `enable` is true.
+void dpi_fsub(int inst, bool enable, int a, int b, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fa = *(float*)&a;
-  float fb = *(float*)&b;
-  float fr = fa - fb;   
+  Float_t fa, fb, fr;

-  sr.ensure_init(LATENCY_FMADD);
-  sr.push(*(int*)&fr, enable, valid);
+  fa.i = a;
+  fb.i = b;
+  fr.f = fa.f - fb.f;
+
+  sr.ensure_init(LATENCY_FADDMUL);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_fmul(int inst, bool enable, bool valid, int a, int b, int* result) {
+// DPI-C float multiplication (a * b on the float views of the operands).
+// The product's raw bits go through the shift register selected by `inst`,
+// set up with LATENCY_FADDMUL; `*result` is the register's front entry and
+// the register only advances when `enable` is true.
+void dpi_fmul(int inst, bool enable, int a, int b, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fa = *(float*)&a;
-  float fb = *(float*)&b;
-  float fr = fa * fb;   
+  Float_t fa, fb, fr;

-  sr.ensure_init(LATENCY_FMADD);
-  sr.push(*(int*)&fr, enable, valid);
+  fa.i = a;
+  fb.i = b;
+  fr.f = fa.f * fb.f;
+
+  sr.ensure_init(LATENCY_FADDMUL);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_fmadd(int inst, bool enable, bool valid, int a, int b, int c, int* result) {
+// DPI-C multiply-add: computes a * b + c on the float views of the
+// operands, written as a separate multiply and add in C++. The raw bits of
+// the result go through the shift register selected by `inst`, set up with
+// LATENCY_FMADD; `*result` is the register's front entry and the register
+// only advances when `enable` is true.
+void dpi_fmadd(int inst, bool enable, int a, int b, int c, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fa = *(float*)&a;
-  float fb = *(float*)&b;
-  float fc = *(float*)&c;
-  float fr = fa * fb + fc;   
+  Float_t fa, fb, fc, fr;
+
+  fa.i = a;
+  fb.i = b;
+  fc.i = c;
+  fr.f = fa.f * fb.f + fc.f;

  sr.ensure_init(LATENCY_FMADD);
-  sr.push(*(int*)&fr, enable, valid);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_fmsub(int inst, bool enable, bool valid, int a, int b, int c, int* result) {
+// DPI-C multiply-subtract: computes a * b - c on the float views of the
+// operands. The raw bits of the result go through the shift register
+// selected by `inst`, set up with LATENCY_FMADD; `*result` is the
+// register's front entry and the register only advances when `enable` is
+// true.
+void dpi_fmsub(int inst, bool enable, int a, int b, int c, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fa = *(float*)&a;
-  float fb = *(float*)&b;
-  float fc = *(float*)&c;
-  float fr = fa * fb - fc;   
+  Float_t fa, fb, fc, fr;
+
+  fa.i = a;
+  fb.i = b;
+  fc.i = c;
+  fr.f = fa.f * fb.f - fc.f;

  sr.ensure_init(LATENCY_FMADD);
-  sr.push(*(int*)&fr, enable, valid);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_fdiv(int inst, bool enable, bool valid, int a, int b, int* result) {
+// DPI-C float division (a / b on the float views of the operands). The
+// quotient's raw bits go through the shift register selected by `inst`,
+// set up with LATENCY_FDIV; `*result` is the register's front entry and
+// the register only advances when `enable` is true.
+void dpi_fdiv(int inst, bool enable, int a, int b, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fa = *(float*)&a;
-  float fb = *(float*)&b;
-  float fr = fa / fb;   
+  Float_t fa, fb, fr;
+
+  fa.i = a;
+  fb.i = b;
+  fr.f = fa.f / fb.f;

  sr.ensure_init(LATENCY_FDIV);
-  sr.push(*(int*)&fr, enable, valid);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_fsqrt(int inst, bool enable, bool valid, int a, int* result) {
+// DPI-C float square root: applies sqrtf to the float view of `a`. The raw
+// bits of the root go through the shift register selected by `inst`, set
+// up with LATENCY_FSQRT; `*result` is the register's front entry and the
+// register only advances when `enable` is true.
+void dpi_fsqrt(int inst, bool enable, int a, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fa = *(float*)&a;
-  float fr = sqrtf(fa);
+  Float_t fa, fr;
+
+  fa.i = a;
+  fr.f = sqrtf(fa.f);

  sr.ensure_init(LATENCY_FSQRT);
-  sr.push(*(int*)&fr, enable, valid);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_ftoi(int inst, bool enable, bool valid, int a, int* result) {
+// DPI-C float-to-signed-int conversion: casts the float view of `a` to int
+// (a C++ cast, i.e. truncation toward zero). The integer goes through the
+// shift register selected by `inst`, set up with LATENCY_FTOI; `*result`
+// is the register's front entry and the register only advances when
+// `enable` is true.
+// NOTE(review): the cast is undefined for NaN and for values outside the
+// range of int - confirm what the hardware core returns in those cases.
+void dpi_ftoi(int inst, bool enable, int a, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fa = *(float*)&a;
-  int ir = int(fa);   
+  Float_t fa, fr;
+
+  fa.i = a;
+  fr.i = int(fa.f);   

  sr.ensure_init(LATENCY_FTOI);
-  sr.push(ir, enable, valid);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_ftou(int inst, bool enable, bool valid, int a, int* result) {
+// DPI-C float-to-unsigned conversion: casts the float view of `a` to
+// unsigned and stores the result in the int member of a Float_t. The value
+// goes through the shift register selected by `inst`, set up with
+// LATENCY_FTOI (the same constant dpi_ftoi uses); `*result` is the
+// register's front entry and the register only advances when `enable` is
+// true.
+// NOTE(review): the cast is undefined for NaN, negative values and values
+// beyond the range of unsigned - confirm the intended behaviour.
+void dpi_ftou(int inst, bool enable, int a, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fa = *(float*)&a;
-  unsigned ir = unsigned(fa);   
+  Float_t fa, fr;
+
+  fa.i = a;
+  fr.i = unsigned(fa.f);   

  sr.ensure_init(LATENCY_FTOI);
-  sr.push(ir, enable, valid);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_itof(int inst, bool enable, bool valid, int a, int* result) {
+// DPI-C signed-int-to-float conversion: `a` is used as a plain integer and
+// cast to float. The raw bits of the float go through the shift register
+// selected by `inst`, set up with LATENCY_ITOF; `*result` is the
+// register's front entry and the register only advances when `enable` is
+// true.
+void dpi_itof(int inst, bool enable, int a, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  float fr = (float)a;   
+  // Only the result needs the int/float overlay; the input is already an
+  // integer value, so no second Float_t is declared.
+  Float_t fr;
+
+  fr.f = (float)a;

  sr.ensure_init(LATENCY_ITOF);
-  sr.push(*(int*)&fr, enable, valid);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }

-void dpi_utof(int inst, bool enable, bool valid, int a, int* result) {
+// DPI-C unsigned-int-to-float conversion: `a` is converted to unsigned and
+// then cast to float. The raw bits of the float go through the shift
+// register selected by `inst`, set up with LATENCY_ITOF (the same constant
+// dpi_itof uses); `*result` is the register's front entry and the register
+// only advances when `enable` is true.
+void dpi_utof(int inst, bool enable, int a, int* result) {
  ShiftRegister& sr = instances.get(inst);

-  unsigned ua = *(unsigned*)&a;
-  float fr = (float)ua;   
+  // Only the result needs the int/float overlay, so no second Float_t is
+  // declared for the input.
+  Float_t fr;
+
+  // int -> unsigned conversion keeps the bit pattern (modulo 2^32).
+  unsigned ua = a;
+  fr.f = (float)ua;

  sr.ensure_init(LATENCY_ITOF);
-  sr.push(*(int*)&fr, enable, valid);
+  sr.push(fr.i, enable);
  *result = sr.top();
 }
--- a/hw/rtl/fp_cores/svdpi/float_dpi.vh
+++ b/hw/rtl/fp_cores/svdpi/float_dpi.vh
@ -1,16 +1,16 @@
 `ifndef FLOAT_DPI
 `define FLOAT_DPI

-import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input logic valid, input int a, input int b, output int result);
-import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input logic valid, input int a, input int b, output int result);
-import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input logic valid, input int a, input int b, output int result);
-import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input logic valid, input int a, input int b, input int c, output int result);
-import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input logic valid, input int a, input int b, input int c, output int result);
-import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input logic valid, input int a, input int b, output int result);
-import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input logic valid, input int a, output int result);
-import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input logic valid, input int a, output int result);
-import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input logic valid, input int a, output int result);
-import "DPI-C" context function void dpi_itof(int inst, input logic enable, input logic valid, input int a, output int result);
-import "DPI-C" context function void dpi_utof(int inst, input logic enable, input logic valid, input int a, output int result);
+import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input int a, input int b, output int result);
+import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input int a, input int b, output int result);
+import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input int a, input int b, output int result);
+import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input int a, input int b, input int c, output int result);
+import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input int a, input int b, input int c, output int result);
+import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input int a, input int b, output int result);
+import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input int a, output int result);
+import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input int a, output int result);
+import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input int a, output int result);
+import "DPI-C" context function void dpi_itof(int inst, input logic enable, input int a, output int result);
+import "DPI-C" context function void dpi_utof(int inst, input logic enable, input int a, output int result);

 `endif
--- a/hw/rtl/libs/VX_divide.v
+++ b/hw/rtl/libs/VX_divide.v
@ -1,21 +1,18 @@
 `include "VX_platform.vh"

 module VX_divide #(
-    parameter WIDTHN = 1,
-    parameter WIDTHD = 1,
-    parameter WIDTHQ = 1,
-    parameter WIDTHR = 1,
+    parameter WIDTHN  = 1,
+    parameter WIDTHD  = 1,
+    parameter WIDTHQ  = 1,
+    parameter WIDTHR  = 1,
    parameter NSIGNED = 0,
    parameter DSIGNED = 0,
-    parameter PIPELINE = 0
+    parameter LATENCY = 0
 ) (
    input wire clk,
-    input wire reset,
-
-    input wire clk_en,
+    input wire enable,
    input wire [WIDTHN-1:0] numer,
    input wire [WIDTHD-1:0] denom,
-
    output wire [WIDTHQ-1:0] quotient,
    output wire [WIDTHR-1:0] remainder
 );
@ -27,11 +24,11 @@ module VX_divide #(

    lpm_divide divide (
        .clock    (clk),        
+        .clken    (enable),
        .numer    (numer),
        .denom    (denom),
        .quotient (quotient_unqual),
-        .remain   (remainder_unqual),
-        .clken    (clk_en)
+        .remain   (remainder_unqual)
    );

    defparam
@ -41,7 +38,7 @@ module VX_divide #(
 		divide.lpm_nrepresentation = NSIGNED ? "SIGNED" : "UNSIGNED",
        divide.lpm_drepresentation = DSIGNED ? "SIGNED" : "UNSIGNED",
 		divide.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE",
-		divide.lpm_pipeline = PIPELINE;
+		divide.lpm_pipeline = LATENCY;

    assign quotient  = quotient_unqual [WIDTHQ-1:0];
    assign remainder = remainder_unqual [WIDTHR-1:0];
@ -72,34 +69,24 @@ module VX_divide #(
        end
    end

-    if (PIPELINE == 0) begin
+    if (LATENCY == 0) begin
        assign quotient  = quotient_unqual [WIDTHQ-1:0];
        assign remainder = remainder_unqual [WIDTHR-1:0];
    end else begin
-        reg [WIDTHN-1:0] quotient_pipe [0:PIPELINE-1];
-        reg [WIDTHD-1:0] remainder_pipe [0:PIPELINE-1];
+        reg [WIDTHN-1:0] quotient_pipe [0:LATENCY-1];
+        reg [WIDTHD-1:0] remainder_pipe [0:LATENCY-1];

-        for (genvar i = 0; i < PIPELINE; i++) begin
-            always @(posedge clk) begin
-                if (reset) begin
-                    quotient_pipe[i]  <= 0;
-                    remainder_pipe[i] <= 0;
-                end else begin
-                    if (clk_en) begin
-                        if (i == 0) begin
-                            quotient_pipe[i]  <= quotient_unqual;
-                            remainder_pipe[i] <= remainder_unqual;
-                        end else begin
-                            quotient_pipe[i]  <= quotient_pipe[i-1];
-                            remainder_pipe[i] <= remainder_pipe[i-1];
-                        end                    
-                    end
+        for (genvar i = 0; i < LATENCY; i++) begin
+            always @(posedge clk) begin                
+                if (enable) begin
+                    quotient_pipe[i]  <= (0 == i) ? quotient_unqual  : quotient_pipe[i-1];
+                    remainder_pipe[i] <= (0 == i) ? remainder_unqual : remainder_pipe[i-1];
                end
            end
        end

-        assign quotient  = quotient_pipe[PIPELINE-1][WIDTHQ-1:0];
-        assign remainder = remainder_pipe[PIPELINE-1][WIDTHR-1:0];
+        assign quotient  = quotient_pipe[LATENCY-1][WIDTHQ-1:0];
+        assign remainder = remainder_pipe[LATENCY-1][WIDTHR-1:0];
    end    

 `endif
--- a/hw/rtl/libs/VX_multiplier.v
+++ b/hw/rtl/libs/VX_multiplier.v
@ -1,16 +1,14 @@
 `include "VX_platform.vh"

 module VX_multiplier #(
-    parameter WIDTHA = 1,
-    parameter WIDTHB = 1,
-    parameter WIDTHP = 1,
-    parameter SIGNED = 0,
-    parameter PIPELINE = 0
+    parameter WIDTHA  = 1,
+    parameter WIDTHB  = 1,
+    parameter WIDTHP  = 1,
+    parameter SIGNED  = 0,
+    parameter LATENCY = 0
 ) (
-    input wire clk,
-    input wire reset,
-
-    input wire clk_en,
+    input wire clk,    
+    input wire enable,
    input wire [WIDTHA-1:0]  dataa,
    input wire [WIDTHB-1:0]  datab,
    output wire [WIDTHP-1:0] result
@ -20,20 +18,22 @@ module VX_multiplier #(

    lpm_mult mult (
        .clock  (clk),
+        .clken  (enable),
        .dataa  (dataa),
        .datab  (datab),
-        .result (result),
-        .clken  (clk_en),
+        .result (result),        
+        .aclr   (1'b0),
+        .sclr   (1'b0),
        .sum    (1'b0)
    );

-    defparam mult.lpm_type = "LPM_MULT",
+    defparam mult.lpm_type   = "LPM_MULT",
             mult.lpm_widtha = WIDTHA,
             mult.lpm_widthb = WIDTHB,
             mult.lpm_widthp = WIDTHP,
             mult.lpm_representation = SIGNED ? "SIGNED" : "UNSIGNED",
-             mult.lpm_pipeline = PIPELINE,
-             mult.lpm_hint = "MAXIMIZE_SPEED=9,DEDICATED_MULTIPLIER_CIRCUITRY=YES";
+             mult.lpm_pipeline = LATENCY,
+             mult.lpm_hint   = "DEDICATED_MULTIPLIER_CIRCUITRY=YES,MAXIMIZE_SPEED=9";
 `else

    wire [WIDTHP-1:0] result_unqual;
@ -44,29 +44,20 @@ module VX_multiplier #(
        assign result_unqual = dataa * datab;
    end
    
-    if (PIPELINE == 0) begin
+    if (LATENCY == 0) begin
        assign result = result_unqual;
-    end else begin
-        
-        reg [WIDTHP-1:0] result_pipe [0:PIPELINE-1];
+    end else begin        
+        reg [WIDTHP-1:0] result_pipe [0:LATENCY-1];

-        for (genvar i = 0; i < PIPELINE; i++) begin
+        for (genvar i = 0; i < LATENCY; i++) begin
            always @(posedge clk) begin
-                if (reset) begin
-                    result_pipe[i] <= 0;
-                end else begin
-                    if (clk_en) begin
-                        if (i == 0) begin
-                            result_pipe[i] <= result_unqual;
-                        end else begin
-                            result_pipe[i] <= result_pipe[i-1];
-                        end                    
-                    end
+                if (enable) begin
+                    result_pipe[i] <= (0 == i) ? result_unqual : result_pipe[i-1];
                end
            end
-        end
-        
-        assign result = result_pipe[PIPELINE-1]; 
+        end        
+
+        assign result = result_pipe[LATENCY-1]; 
    end

 `endif