speeding up simulation using dedicated full dpi-based FPU core

2025-04-23 13:27:29 -04:00 · 2021-01-06 18:44:06 -08:00 · 2021-01-06 18:44:06 -08:00 · 2b8435471a
commit 2b8435471a
parent 2058718f0f
26 changed files with 990 additions and 430 deletions
--- a/driver/opae/vlsim/Makefile
+++ b/driver/opae/vlsim/Makefile
@ -39,11 +39,12 @@ LDFLAGS += -shared -pthread
 TOP = vortex_afu_shim

 RTL_DIR=../../../hw/rtl
+DPI_DIR=../../../hw/dpi

 SRCS = fpga.cpp opae_sim.cpp
-SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
+SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp

-FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
+FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
 RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
 RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip

--- a/driver/opae/vlsim/opae_sim.cpp
+++ b/driver/opae/vlsim/opae_sim.cpp
@ -8,7 +8,7 @@
 #define CCI_RQ_SIZE 16
 #define CCI_WQ_SIZE 16

-#define RESET_DELAY 2
+#define RESET_DELAY 4

 #define ENABLE_DRAM_STALLS
 #define DRAM_LATENCY 24
@ -135,19 +135,14 @@ void opae_sim::reset() {

  vortex_afu_->reset = 1;

-  vortex_afu_->clk = 0;
-  this->eval();
-  vortex_afu_->clk = 1;
-  this->eval();
-
-  vortex_afu_->reset = 0;
-
  for (int i = 0; i < RESET_DELAY; ++i) {
    vortex_afu_->clk = 0;
    this->eval();
    vortex_afu_->clk = 1;
    this->eval();
  }  
+
+  vortex_afu_->reset = 0;
  
  // Turn on assertion after reset
  Verilated::assertOn(true);
--- a/driver/rtlsim/Makefile
+++ b/driver/rtlsim/Makefile
@ -39,11 +39,12 @@ LDFLAGS += -shared -pthread
 TOP = Vortex

 RTL_DIR = ../../hw/rtl
+DPI_DIR = ../../hw/dpi

 SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
-SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
+SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp

-FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src 
+FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src 
 RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)

 VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
--- a/hw/dpi/float_dpi.cpp
+++ b/hw/dpi/float_dpi.cpp
@ -0,0 +1,264 @@
+#include <stdio.h>
+#include <math.h>
+#include <unordered_map>
+#include <vector>
+#include <mutex>
+#include <iostream>
+#include "svdpi.h"
+#include "verilated_vpi.h"
+#include "VX_config.h"
+
+// C-linkage prototypes for the floating-point DPI routines defined in this
+// file. Operands and results travel as 32-bit ints; the definitions below
+// reinterpret those bits as a float through Float_t.
+// NOTE(review): float_dpi.vh imports `frm` as bit[2:0] and `fflags` as
+// bit[4:0], while these prototypes take int / int* -- confirm that the DPI
+// argument mapping used by the simulator agrees with these C types.
+extern "C" {
+  // Add / subtract / multiply and the three-operand multiply-add variants.
+  void dpi_fadd(int a, int b, int frm, int* result, int* fflags);
+  void dpi_fsub(int a, int b, int frm, int* result, int* fflags);
+  void dpi_fmul(int a, int b, int frm, int* result, int* fflags);
+  void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags);
+  void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags);
+  void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags);
+  void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags);
+
+  // Division and square root.
+  void dpi_fdiv(int a, int b, int frm, int* result, int* fflags);
+  void dpi_fsqrt(int a, int frm, int* result, int* fflags);
+  
+  // Conversions between float and signed/unsigned 32-bit integers.
+  void dpi_ftoi(int a, int frm, int* result, int* fflags);
+  void dpi_ftou(int a, int frm, int* result, int* fflags);
+  void dpi_itof(int a, int frm, int* result, int* fflags);
+  void dpi_utof(int a, int frm, int* result, int* fflags);
+
+  // Classification and sign-injection; these take no frm and report no flags.
+  void dpi_fclss(int a, int* result);
+  void dpi_fsgnj(int a, int* result);
+  void dpi_fsgnjn(int a, int* result);
+  void dpi_fsgnjx(int a, int* result);
+
+  // Comparisons and min/max; these take no frm.
+  void dpi_flt(int a, int b, int* result, int* fflags);
+  void dpi_fle(int a, int b, int* result, int* fflags);
+  void dpi_feq(int a, int b, int* result, int* fflags);
+  void dpi_fmin(int a, int b, int* result, int* fflags);
+  void dpi_fmax(int a, int b, int* result, int* fflags);
+}
+
+// Overlay of one 32-bit value as a float (`f`), as a raw int (`i`), or as
+// three bit-fields of width 23 / 8 / 1 (`parts`). The DPI routines in this
+// file write `i` and read `f` (or the reverse) to move between the bit
+// pattern exchanged with the simulator and a host float.
+// NOTE(review): the bit-field order presumably assumes an ABI that allocates
+// bit-fields from the least significant bit -- confirm for the host compiler.
+union Float_t {    
+    float f;   // value as a host float
+    int   i;   // same storage as a raw 32-bit pattern
+    struct {
+        uint32_t man  : 23;
+        uint32_t exp  : 8;
+        uint32_t sign : 1;
+    } parts;
+};
+
+// Adds two floats supplied as raw 32-bit patterns and writes the bit pattern
+// of the sum to *result. `frm` is accepted but not consulted, and *fflags is
+// always written as 0.
+void dpi_fadd(int a, int b, int frm, int* result, int* fflags) {
+  Float_t lhs;
+  Float_t rhs;
+  lhs.i = a;
+  rhs.i = b;
+
+  Float_t sum;
+  sum.f = lhs.f + rhs.f;
+
+  *result = sum.i;
+  *fflags = 0;
+}
+
+// Subtracts b from a (both raw 32-bit float patterns) and writes the bit
+// pattern of the difference to *result. `frm` is accepted but not consulted,
+// and *fflags is always written as 0.
+void dpi_fsub(int a, int b, int frm, int* result, int* fflags) {
+  Float_t minuend;
+  Float_t subtrahend;
+  minuend.i    = a;
+  subtrahend.i = b;
+
+  Float_t difference;
+  difference.f = minuend.f - subtrahend.f;
+
+  *result = difference.i;
+  *fflags = 0;
+}
+
+// Multiplies two floats supplied as raw 32-bit patterns and writes the bit
+// pattern of the product to *result. `frm` is accepted but not consulted,
+// and *fflags is always written as 0.
+void dpi_fmul(int a, int b, int frm, int* result, int* fflags) {
+  Float_t lhs;
+  Float_t rhs;
+  lhs.i = a;
+  rhs.i = b;
+
+  Float_t product;
+  product.f = lhs.f * rhs.f;
+
+  *result = product.i;
+  *fflags = 0;
+}
+
+// Fused multiply-add: writes the bit pattern of (a * b) + c to *result.
+//   a, b, c : operands as raw 32-bit float patterns
+//   frm     : rounding mode; accepted but not consulted (host rounding is used)
+//   result  : receives the raw 32-bit pattern of the result
+//   fflags  : always written as 0 (exception flags are not modelled)
+void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags) {
+  Float_t fa, fb, fc, fr;
+
+  fa.i = a;
+  fb.i = b;
+  fc.i = c;
+  // fmaf() rounds exactly once. The expression `fa.f * fb.f + fc.f` rounds
+  // twice unless the compiler happens to contract it, so its result could
+  // differ between hosts and from a fused hardware unit.
+  fr.f = fmaf(fa.f, fb.f, fc.f);
+
+  *result = fr.i;
+  *fflags = 0;
+}
+
+// Fused multiply-subtract: writes the bit pattern of (a * b) - c to *result.
+//   a, b, c : operands as raw 32-bit float patterns
+//   frm     : rounding mode; accepted but not consulted (host rounding is used)
+//   result  : receives the raw 32-bit pattern of the result
+//   fflags  : always written as 0 (exception flags are not modelled)
+void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags) {
+  Float_t fa, fb, fc, fr;
+
+  fa.i = a;
+  fb.i = b;
+  fc.i = c;
+  // Single rounding via fmaf(); subtracting c is adding its negation.
+  fr.f = fmaf(fa.f, fb.f, -fc.f);
+
+  *result = fr.i;
+  *fflags = 0;
+}
+
+// Negated fused multiply-add: writes the bit pattern of -(a * b) - c.
+//   a, b, c : operands as raw 32-bit float patterns
+//   frm     : rounding mode; accepted but not consulted (host rounding is used)
+//   result  : receives the raw 32-bit pattern of the result
+//   fflags  : always written as 0 (exception flags are not modelled)
+void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags) {
+  Float_t fa, fb, fc, fr;
+
+  fa.i = a;
+  fb.i = b;
+  fc.i = c;
+  // Negate the product and the addend rather than the finished sum: when the
+  // exact result is zero, negating the sum flips its sign (-(+0) == -0),
+  // whereas (-a*b) + (-c) keeps the sign the addition itself produces.
+  // fmaf() also guarantees a single rounding step.
+  fr.f = fmaf(-fa.f, fb.f, -fc.f);
+
+  *result = fr.i;
+  *fflags = 0;
+}
+
+// Negated fused multiply-subtract: writes the bit pattern of -(a * b) + c.
+//   a, b, c : operands as raw 32-bit float patterns
+//   frm     : rounding mode; accepted but not consulted (host rounding is used)
+//   result  : receives the raw 32-bit pattern of the result
+//   fflags  : always written as 0 (exception flags are not modelled)
+void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags) {
+  Float_t fa, fb, fc, fr;
+
+  fa.i = a;
+  fb.i = b;
+  fc.i = c;
+  // Negate the product instead of the finished difference so that an exact
+  // zero result keeps the sign the addition produces; fmaf() rounds once.
+  fr.f = fmaf(-fa.f, fb.f, fc.f);
+
+  *result = fr.i;
+  *fflags = 0;
+}
+
+// Divides a by b (both raw 32-bit float patterns) and writes the bit pattern
+// of the quotient to *result. `frm` is accepted but not consulted, and
+// *fflags is always written as 0.
+void dpi_fdiv(int a, int b, int frm, int* result, int* fflags) {
+  Float_t dividend;
+  Float_t divisor;
+  dividend.i = a;
+  divisor.i  = b;
+
+  Float_t quotient;
+  quotient.f = dividend.f / divisor.f;
+
+  *result = quotient.i;
+  *fflags = 0;
+}
+
+// Takes the square root (sqrtf) of a float supplied as a raw 32-bit pattern
+// and writes the bit pattern of the root to *result. `frm` is accepted but
+// not consulted, and *fflags is always written as 0.
+void dpi_fsqrt(int a, int frm, int* result, int* fflags) {
+  Float_t operand;
+  operand.i = a;
+
+  Float_t root;
+  root.f = sqrtf(operand.f);
+
+  *result = root.i;
+  *fflags = 0;
+}
+
+// Converts a float (raw 32-bit pattern) to a signed 32-bit integer.
+//   a      : operand as a raw 32-bit float pattern
+//   frm    : rounding mode; accepted but not consulted (in-range values are
+//            truncated toward zero by the cast)
+//   result : receives the integer; NaN and values >= 2^31 give 0x7fffffff,
+//            values below -2^31 give the most negative int
+//   fflags : always written as 0 (exception flags are not modelled)
+void dpi_ftoi(int a, int frm, int* result, int* fflags) {
+  Float_t fa;
+  fa.i = a;
+
+  const int kIntMax = 0x7fffffff;
+  const int kIntMin = -0x7fffffff - 1;
+
+  // A float-to-int cast is undefined when the truncated value does not fit
+  // (including NaN and infinities), so those inputs are saturated explicitly.
+  int converted;
+  if (!(fa.f < 2147483648.0f)) {
+    converted = kIntMax;  // NaN (comparison is false) or too large
+  } else if (fa.f < -2147483648.0f) {
+    converted = kIntMin;  // too negative
+  } else {
+    converted = static_cast<int>(fa.f);
+  }
+
+  *result = converted;
+  *fflags = 0;
+}
+
+// Converts a float (raw 32-bit pattern) to an unsigned 32-bit integer.
+//   a      : operand as a raw 32-bit float pattern
+//   frm    : rounding mode; accepted but not consulted (in-range values are
+//            truncated toward zero by the cast)
+//   result : receives the unsigned value's bits; NaN and values >= 2^32 give
+//            0xffffffff, values <= -1 give 0
+//   fflags : always written as 0 (exception flags are not modelled)
+void dpi_ftou(int a, int frm, int* result, int* fflags) {
+  Float_t fa;
+  fa.i = a;
+
+  // A float-to-unsigned cast is undefined when the truncated value is
+  // negative or does not fit, so those inputs are saturated explicitly.
+  unsigned converted;
+  if (!(fa.f < 4294967296.0f)) {
+    converted = 0xffffffffu;  // NaN (comparison is false) or too large
+  } else if (fa.f <= -1.0f) {
+    converted = 0u;           // negative beyond the (-1, 0) truncation range
+  } else {
+    converted = static_cast<unsigned>(fa.f);
+  }
+
+  *result = static_cast<int>(converted);
+  *fflags = 0;
+}
+
+// Converts a signed 32-bit integer to float.
+//   a      : integer operand
+//   frm    : rounding mode; accepted but not consulted (host rounding is used)
+//   result : receives the raw 32-bit pattern of the float
+//   fflags : always written as 0 (exception flags are not modelled)
+void dpi_itof(int a, int frm, int* result, int* fflags) {
+  Float_t fr;
+
+  fr.f = static_cast<float>(a);
+
+  *result = fr.i;
+  *fflags = 0;
+}
+
+// Converts an unsigned 32-bit integer (passed in an int) to float.
+//   a      : operand; its bits are reinterpreted as unsigned
+//   frm    : rounding mode; accepted but not consulted (host rounding is used)
+//   result : receives the raw 32-bit pattern of the float
+//   fflags : always written as 0 (exception flags are not modelled)
+void dpi_utof(int a, int frm, int* result, int* fflags) {
+  Float_t fr;
+
+  const unsigned ua = static_cast<unsigned>(a);
+  fr.f = static_cast<float>(ua);
+
+  *result = fr.i;
+  *fflags = 0;
+}
+
+// Less-than comparison of two floats supplied as raw 32-bit patterns.
+//   a, b   : operands as raw 32-bit float patterns
+//   result : receives integer 1 when a < b, otherwise 0 (0 if either is NaN)
+//   fflags : always written as 0 (exception flags are not modelled)
+void dpi_flt(int a, int b, int* result, int* fflags) {
+  Float_t fa, fb;
+
+  fa.i = a;
+  fb.i = b;
+
+  // The outcome is an integer. Storing the bool through the float member, as
+  // before, returned the bit pattern of 1.0f (0x3f800000) for "true".
+  *result = (fa.f < fb.f) ? 1 : 0;
+  *fflags = 0;
+}
+
+// Less-than-or-equal comparison of two floats supplied as raw 32-bit patterns.
+//   a, b   : operands as raw 32-bit float patterns
+//   result : receives integer 1 when a <= b, otherwise 0 (0 if either is NaN)
+//   fflags : always written as 0 (exception flags are not modelled)
+void dpi_fle(int a, int b, int* result, int* fflags) {
+  Float_t fa, fb;
+
+  fa.i = a;
+  fb.i = b;
+
+  // The outcome is an integer. Storing the bool through the float member, as
+  // before, returned the bit pattern of 1.0f (0x3f800000) for "true".
+  *result = (fa.f <= fb.f) ? 1 : 0;
+  *fflags = 0;
+}
+
+// Equality comparison of two floats supplied as raw 32-bit patterns.
+//   a, b   : operands as raw 32-bit float patterns
+//   result : receives integer 1 when a == b, otherwise 0 (0 if either is NaN)
+//   fflags : always written as 0 (exception flags are not modelled)
+void dpi_feq(int a, int b, int* result, int* fflags) {
+  Float_t fa, fb;
+
+  fa.i = a;
+  fb.i = b;
+
+  // The outcome is an integer. Storing the bool through the float member, as
+  // before, returned the bit pattern of 1.0f (0x3f800000) for "true".
+  *result = (fa.f == fb.f) ? 1 : 0;
+  *fflags = 0;
+}
+
+// Minimum of two floats supplied as raw 32-bit patterns.
+//   a, b   : operands as raw 32-bit float patterns
+//   result : receives the bit pattern of the smaller operand. If exactly one
+//            operand is NaN the other operand is returned; if both are NaN
+//            the quiet NaN 0x7fc00000 is returned; -0.0 counts as smaller
+//            than +0.0.
+//   fflags : always written as 0 (exception flags are not modelled)
+void dpi_fmin(int a, int b, int* result, int* fflags) {
+  Float_t fa, fb;
+
+  fa.i = a;
+  fb.i = b;
+
+  // x != x is true only for NaN.
+  const bool a_is_nan = (fa.f != fa.f);
+  const bool b_is_nan = (fb.f != fb.f);
+
+  int selected;
+  if (a_is_nan && b_is_nan) {
+    selected = 0x7fc00000;
+  } else if (a_is_nan) {
+    selected = b;
+  } else if (b_is_nan) {
+    selected = a;
+  } else if (fa.f == 0.0f && fb.f == 0.0f) {
+    // Both are zeros that compare equal; OR-ing keeps a sign bit if either
+    // operand is -0.0.
+    selected = a | b;
+  } else {
+    selected = (fb.f < fa.f) ? b : a;
+  }
+
+  *result = selected;
+  *fflags = 0;
+}
+
+// Maximum of two floats supplied as raw 32-bit patterns.
+//   a, b   : operands as raw 32-bit float patterns
+//   result : receives the bit pattern of the larger operand. If exactly one
+//            operand is NaN the other operand is returned; if both are NaN
+//            the quiet NaN 0x7fc00000 is returned; +0.0 counts as larger
+//            than -0.0.
+//   fflags : always written as 0 (exception flags are not modelled)
+void dpi_fmax(int a, int b, int* result, int* fflags) {
+  Float_t fa, fb;
+
+  fa.i = a;
+  fb.i = b;
+
+  // x != x is true only for NaN.
+  const bool a_is_nan = (fa.f != fa.f);
+  const bool b_is_nan = (fb.f != fb.f);
+
+  int selected;
+  if (a_is_nan && b_is_nan) {
+    selected = 0x7fc00000;
+  } else if (a_is_nan) {
+    selected = b;
+  } else if (b_is_nan) {
+    selected = a;
+  } else if (fa.f == 0.0f && fb.f == 0.0f) {
+    // Both are zeros that compare equal; AND-ing keeps the sign bit only
+    // when both operands are -0.0.
+    selected = a & b;
+  } else {
+    selected = (fa.f < fb.f) ? b : a;
+  }
+
+  *result = selected;
+  *fflags = 0;
+}
+
+// Classifies a float supplied as a raw 32-bit pattern and writes a one-hot
+// mask to *result:
+//   bit 0: -infinity          bit 5: positive subnormal
+//   bit 1: negative normal    bit 6: positive normal
+//   bit 2: negative subnormal bit 7: +infinity
+//   bit 3: -0                 bit 8: signaling NaN
+//   bit 4: +0                 bit 9: quiet NaN
+void dpi_fclss(int a, int* result) {
+  const unsigned bits     = static_cast<unsigned>(a);
+  const bool     negative = (bits >> 31) != 0;
+  const unsigned exponent = (bits >> 23) & 0xffu;
+  const unsigned mantissa = bits & 0x7fffffu;
+
+  int mask;
+  if (exponent == 0xffu) {
+    if (mantissa == 0) {
+      mask = negative ? (1 << 0) : (1 << 7);
+    } else {
+      // The top mantissa bit distinguishes quiet from signaling NaNs.
+      mask = (mantissa & 0x400000u) ? (1 << 9) : (1 << 8);
+    }
+  } else if (exponent == 0) {
+    if (mantissa == 0) {
+      mask = negative ? (1 << 3) : (1 << 4);
+    } else {
+      mask = negative ? (1 << 2) : (1 << 5);
+    }
+  } else {
+    mask = negative ? (1 << 1) : (1 << 6);
+  }
+
+  *result = mask;
+}
+
+// Placeholder for sign injection: currently ignores `a` and writes 0 to
+// *result for every input.
+// NOTE(review): only one operand is passed in; sign injection presumably
+// needs a second source for the sign bit, so the interface may need extending.
+void dpi_fsgnj(int a, int* result) {
+  // TODO: not implemented yet
+  *result = 0;
+}
+
+// Placeholder for negated sign injection: currently ignores `a` and writes 0
+// to *result for every input.
+// NOTE(review): only one operand is passed in; sign injection presumably
+// needs a second source for the sign bit, so the interface may need extending.
+void dpi_fsgnjn(int a, int* result) {
+  // TODO: not implemented yet
+  *result = 0;
+}
+
+// Placeholder for XOR sign injection: currently ignores `a` and writes 0 to
+// *result for every input.
+// NOTE(review): only one operand is passed in; sign injection presumably
+// needs a second source for the sign bit, so the interface may need extending.
+void dpi_fsgnjx(int a, int* result) {
+  // TODO: not implemented yet
+  *result = 0;
+}
--- a/hw/dpi/float_dpi.vh
+++ b/hw/dpi/float_dpi.vh
@ -0,0 +1,31 @@
+// DPI-C import declarations for the floating-point helper routines.
+// Guarded so the file can be included more than once.
+// NOTE(review): `frm` is declared bit[2:0] and `fflags` bit[4:0] here, while
+// the C prototypes in float_dpi.cpp take `int` / `int*` -- confirm that the
+// simulator's DPI argument mapping agrees with those C types.
+`ifndef FLOAT_DPI
+`define FLOAT_DPI
+
+// Add / subtract / multiply and the three-operand multiply-add variants.
+import "DPI-C" context function void dpi_fadd(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fsub(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fmul(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fnmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fnmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+
+// Division and square root.
+import "DPI-C" context function void dpi_fdiv(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fsqrt(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+
+// Conversions between float and signed/unsigned integers.
+import "DPI-C" context function void dpi_ftoi(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_ftou(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_itof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_utof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
+
+// Classification and sign injection: single operand, no frm, no fflags.
+import "DPI-C" context function void dpi_fclss(input int a, output int result);
+import "DPI-C" context function void dpi_fsgnj(input int a, output int result);
+import "DPI-C" context function void dpi_fsgnjn(input int a, output int result);
+import "DPI-C" context function void dpi_fsgnjx(input int a, output int result);
+
+// Comparisons and min/max: two operands, no frm.
+import "DPI-C" context function void dpi_flt(input int a, input int b, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fle(input int a, input int b, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_feq(input int a, input int b, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fmin(input int a, input int b, output int result, output bit[4:0] fflags);
+import "DPI-C" context function void dpi_fmax(input int a, input int b, output int result, output bit[4:0] fflags);
+
+`endif
--- a/hw/dpi/util_dpi.cpp
+++ b/hw/dpi/util_dpi.cpp
@ -0,0 +1,84 @@
+#include <stdio.h>
+#include <math.h>
+#include <unordered_map>
+#include <vector>
+#include <mutex>
+#include <iostream>
+#include "svdpi.h"
+#include "verilated_vpi.h"
+#include "VX_config.h"
+
+// C-linkage prototypes for the utility DPI routines defined in this file.
+extern "C" {
+  // Allocates a new delayed-assertion slot and returns its index.
+  int dpi_register();
+  // Feeds `cond` into slot `inst`; aborts once a failed condition reaches
+  // the front of that slot's `delay`-deep shift register.
+  void dpi_assert(int inst, bool cond, int delay);
+}
+
+// Fixed-depth shift register of ints: each enabled push() drops the oldest
+// entry and appends the new value, and top() reads the oldest entry. A value
+// therefore reaches top() on the push that makes it the oldest of `depth`
+// entries. All entries start at 0.
+class ShiftRegister {
+public:
+  ShiftRegister() {}
+
+  // Sizes the register on the first call; later calls are ignored.
+  // A depth below 1 is treated as 1 (a pushed value is visible immediately):
+  // zero made the old unsigned loop bound wrap around and index out of
+  // range, and a negative depth turned into a huge resize request.
+  void ensure_init(int depth) {
+    if (!buffer_.empty())
+      return;
+    const std::vector<int>::size_type stages =
+        (depth > 0) ? static_cast<std::vector<int>::size_type>(depth) : 1;
+    buffer_.resize(stages);
+  }
+
+  // Shifts `value` in when `enable` is set. Does nothing when disabled or
+  // when the register has not been sized yet.
+  void push(int value, bool enable) {
+    if (!enable || buffer_.empty())
+      return;
+    // Size is unchanged overall, so the push_back never reallocates.
+    buffer_.erase(buffer_.begin());
+    buffer_.push_back(value);
+  }
+
+  // Returns the oldest entry, or 0 if the register has not been sized yet.
+  int top() const {
+    return buffer_.empty() ? 0 : buffer_.front();
+  }
+
+private:
+
+  std::vector<int> buffer_;  // front() is the oldest entry; empty until sized
+};
+
+// Growable table of ShiftRegister slots addressed by integer handle.
+class Instances {
+public:
+  // Returns the slot for handle `inst`; throws std::out_of_range (from
+  // vector::at) for a handle that was never allocated.
+  // NOTE(review): this lookup is not synchronized and the returned reference
+  // is invalidated if allocate() grows the vector -- presumably all handles
+  // are registered before any lookup; confirm against the callers.
+  ShiftRegister& get(int inst) {
+    return instances_.at(inst);
+  }
+
+  // Appends a fresh slot and returns its handle (its index in the table).
+  int allocate() {
+    // RAII lock: the mutex is released even if growing the vector throws,
+    // which the manual lock()/unlock() pair did not guarantee.
+    std::lock_guard<std::mutex> guard(mutex_);
+    const int inst = static_cast<int>(instances_.size());
+    instances_.emplace_back();
+    return inst;
+  }
+
+private:
+  std::vector<ShiftRegister> instances_;
+  std::mutex mutex_;  // serializes allocate()
+};
+
+// Table of delayed-assertion slots shared by dpi_register() and dpi_assert().
+Instances instances;
+
+// DPI entry point: allocates a new slot in `instances` and returns its
+// handle, which the caller passes back as `inst` to dpi_assert().
+int dpi_register() {
+  return instances.allocate();
+}
+
+// DPI entry point for a delayed assertion.
+//   inst  : handle returned by dpi_register()
+//   cond  : condition sampled on this call; false records a failure
+//   delay : depth of the slot's shift register, applied on the first call
+//           for this handle only
+// Each call shifts the negated condition into the slot. When a recorded
+// failure reaches the front of the register, the calling scope's name is
+// printed and the process is aborted.
+void dpi_assert(int inst, bool cond, int delay) {
+  ShiftRegister& sr = instances.get(inst);
+
+  sr.ensure_init(delay);
+  sr.push(!cond, true);
+
+  if (sr.top()) {
+    // Guard against a null name so "%s" is never handed a null pointer.
+    const char* scope_name = svGetNameFromScope(svGetScope());
+    printf("delayed assertion at %s!\n", scope_name ? scope_name : "<unknown scope>");
+    // abort() does not flush stdio buffers; without this the message can be
+    // lost when stdout is redirected to a file or pipe.
+    fflush(stdout);
+    std::abort();
+  }
+}
--- a/hw/dpi/util_dpi.vh
+++ b/hw/dpi/util_dpi.vh
@ -0,0 +1,7 @@
+// DPI-C import declarations for the utility routines implemented in
+// util_dpi.cpp. Guarded so the file can be included more than once.
+`ifndef UTIL_DPI
+`define UTIL_DPI
+
+// Returns a handle to pass as `inst` to dpi_assert.
+import "DPI-C" context function int dpi_register();
+// Delayed assertion on `cond` for handle `inst`; `delay` sets the depth of
+// the shift register behind that handle.
+import "DPI-C" context function void dpi_assert(int inst, input logic cond, input int delay);
+
+`endif
--- a/hw/rtl/VX_cluster.v
+++ b/hw/rtl/VX_cluster.v
@ -40,7 +40,8 @@ module VX_cluster #(
    // Status
    output wire                             busy, 
    output wire                             ebreak
-);    
+); 
+
    wire [`NUM_CORES-1:0]                        per_core_dram_req_valid;
    wire [`NUM_CORES-1:0]                        per_core_dram_req_rw;    
    wire [`NUM_CORES-1:0][`DDRAM_BYTEEN_WIDTH-1:0] per_core_dram_req_byteen;    
@ -70,15 +71,13 @@ module VX_cluster #(
    for (genvar i = 0; i < `NUM_CORES; i++) begin
        
        wire core_reset;
-        if (`NUM_CORES > 1) begin
-            reg core_reset_r;
-            always @(posedge clk) begin
-                core_reset_r <= reset;
-            end
-            assign core_reset = core_reset_r;
-        end else begin
-            assign core_reset = reset;
-        end
+        VX_reset_relay #(
+            .PASSTHRU (`NUM_CORES == 1)
+        ) reset_relay (
+            .clk       (clk),
+            .reset     (reset),
+            .reset_out (core_reset)
+        );

        VX_core #(
            .CORE_ID(i + (CLUSTER_ID * `NUM_CORES))
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -92,8 +92,8 @@
 `define LATENCY_IMUL 3
 `endif

-`ifndef LATENCY_FNCOMP
-`define LATENCY_FNCOMP 2
+`ifndef LATENCY_FNCP
+`define LATENCY_FNCP 2
 `endif

 `ifndef LATENCY_FMA
@ -128,8 +128,8 @@
 `define LATENCY_FDIVSQRT 32
 `endif

-`ifndef LATENCY_FCONV
-`define LATENCY_FCONV 4
+`ifndef LATENCY_FCVT
+`define LATENCY_FCVT 4
 `endif

 // CSR Addresses //////////////////////////////////////////////////////////////
--- a/hw/rtl/VX_fpu_unit.v
+++ b/hw/rtl/VX_fpu_unit.v
@ -63,11 +63,11 @@ module VX_fpu_unit #(

    // resolve dynamic FRM from CSR   
    assign fpu_to_csr_if.read_wid = fpu_req_if.wid;
-    wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;   
+    wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;

 `ifdef FPU_FAST

-    VX_fp_fpga #(
+    VX_fp_dpi #(
        .TAGW (FPUQ_BITS)
    ) fp_core (
        .clk        (clk),
@ -91,21 +91,51 @@ module VX_fpu_unit #(

        .tag_out    (tag_out),

+        .ready_out  (ready_out),
+        .valid_out  (valid_out)
+    );
+
+`elsif FPU_FPNEW
+
+    VX_fpnew #(
+        .FMULADD  (1),
+        .FDIVSQRT (1),
+        .FNONCOMP (1),
+        .FCONV    (1),
+        .TAGW     (FPUQ_BITS)
+    ) fp_core (
+        .clk        (clk),
+        .reset      (reset),   
+
+        .valid_in   (valid_in),
+        .ready_in   (ready_in),        
+
+        .tag_in     (tag_in),
+        
+        .op_type    (fpu_req_if.op_type),
+        .frm        (fpu_frm),
+
+        .dataa      (fpu_req_if.rs1_data),
+        .datab      (fpu_req_if.rs2_data),
+        .datac      (fpu_req_if.rs3_data),
+        .result     (result), 
+
+        .has_fflags (has_fflags),
+        .fflags     (fflags),
+
+        .tag_out    (tag_out),
+
        .ready_out  (ready_out),
        .valid_out  (valid_out)
    );   

 `else

-    VX_fpnew #(
-        .FMULADD  (1),
-        .FDIVSQRT (1),
-        .FNONCOMP (1),
-        .FCONV    (1),
-        .TAGW     (FPUQ_BITS)
+    VX_fp_fpga #(
+        .TAGW (FPUQ_BITS)
    ) fp_core (
        .clk        (clk),
-        .reset      (reset),   
+        .reset      (fpu_reset),   

        .valid_in   (valid_in),
        .ready_in   (ready_in),        
--- a/hw/rtl/VX_lsu_unit.v
+++ b/hw/rtl/VX_lsu_unit.v
@ -64,7 +64,7 @@ module VX_lsu_unit #(
        assign mem_req_addr[i]   = full_address[i][31:2];        
        assign mem_req_offset[i] = full_address[i][1:0];
        assign mem_req_byteen[i] = wmask << full_address[i][1:0];
-        assign mem_req_data[i]   = lsu_req_if.store_data[i] << {mem_req_offset[i], 3'b0};
+        assign mem_req_data[i]   = lsu_req_if.store_data[i] << {full_address[i][1:0], 3'b0};
    end

 `IGNORE_WARNINGS_BEGIN
--- a/hw/rtl/VX_mem_unit.v
+++ b/hw/rtl/VX_mem_unit.v
@ -79,7 +79,17 @@ module VX_mem_unit # (
        .cache_rsp_if (dcache_rsp_if),
        .smem_rsp_if  (smem_rsp_if),
        .core_rsp_if  (core_dcache_rsp_if)
-    );    
+    ); 
+
+    wire icache_reset, dcache_reset;   
+
+    VX_reset_relay #(
+        .NUM_NODES (2)
+    ) reset_relay (
+        .clk       (clk),
+        .reset     (reset),
+        .reset_out ({dcache_reset, icache_reset})
+    );

    VX_cache #(
        .CACHE_ID           (`ICACHE_ID),
@ -102,7 +112,7 @@ module VX_mem_unit # (
        `SCOPE_BIND_VX_mem_unit_icache

        .clk                (clk),
-        .reset              (reset),
+        .reset              (icache_reset),

        // Core request
        .core_req_valid     (core_icache_req_if.valid),
@ -160,7 +170,7 @@ module VX_mem_unit # (
        `SCOPE_BIND_VX_mem_unit_dcache
        
        .clk                (clk),
-        .reset              (reset),
+        .reset              (dcache_reset),

        // Core req
        .core_req_valid     (dcache_req_if.valid),
@ -199,6 +209,14 @@ module VX_mem_unit # (

    if (`SM_ENABLE) begin

+        wire scache_reset;   
+
+        VX_reset_relay reset_relay (
+            .clk       (clk),
+            .reset     (reset),
+            .reset_out (scache_reset)
+        );
+
        VX_cache #(
            .CACHE_ID           (`SCACHE_ID),
            .CACHE_SIZE         (`SMEM_SIZE),
@ -220,7 +238,7 @@ module VX_mem_unit # (
            `SCOPE_BIND_VX_mem_unit_smem
            
            .clk                (clk),
-            .reset              (reset),
+            .reset              (scache_reset),

            // Core request
            .core_req_valid     (smem_req_if.valid),
--- a/hw/rtl/Vortex.v
+++ b/hw/rtl/Vortex.v
@ -72,15 +72,13 @@ module Vortex (
    for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin

        wire cluster_reset;
-        if (`NUM_CLUSTERS > 1) begin
-            reg cluster_reset_r;
-            always @(posedge clk) begin
-                cluster_reset_r <= reset;
-            end
-            assign cluster_reset = cluster_reset_r;
-        end else begin
-            assign cluster_reset = reset;
-        end
+        VX_reset_relay #(
+            .PASSTHRU (`NUM_CLUSTERS == 1)
+        ) reset_relay (
+            .clk       (clk),
+            .reset     (reset),
+            .reset_out (cluster_reset)
+        );

        VX_cluster #(
            .CLUSTER_ID(i)
--- a/hw/rtl/afu/vortex_afu.sv
+++ b/hw/rtl/afu/vortex_afu.sv
@ -37,7 +37,7 @@ module vortex_afu #(
  output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select
 );

-localparam RESET_DELAY        = 2;  
+localparam RESET_DELAY        = 3;  

 localparam DRAM_ADDR_WIDTH    = $bits(t_local_mem_addr);
 localparam DRAM_LINE_WIDTH    = $bits(t_local_mem_data);
--- a/hw/rtl/fp_cores/VX_fp_div.v
+++ b/hw/rtl/fp_cores/VX_fp_div.v
@ -1,9 +1,5 @@
 `include "VX_define.vh"

-`ifndef SYNTHESIS
-`include "float_dpi.vh"
-`endif
-
 module VX_fp_div #( 
    parameter TAGW = 1,
    parameter LANES = 1
@ -32,26 +28,24 @@ module VX_fp_div #(
 );    
    wire stall = ~ready_out && valid_out;
    wire enable = ~stall;
+
+    wire _reset;   
+
+    VX_reset_relay reset_relay (
+        .clk       (clk),
+        .reset     (reset),
+        .reset_out (_reset)
+    );
    
    for (genvar i = 0; i < LANES; i++) begin
-    `ifdef QUARTUS
        acl_fdiv fdiv (
            .clk    (clk),
-            .areset (reset),
+            .areset (_reset),
            .en     (enable),
            .a      (dataa[i]),
            .b      (datab[i]),
            .q      (result[i])
        );
-    `else 
-        integer fdiv_h;
-        initial begin
-            fdiv_h = dpi_register();
-        end
-        always @(posedge clk) begin
-           dpi_fdiv (fdiv_h, enable, dataa[i], datab[i], `LATENCY_FDIV, result[i]);
-        end
-    `endif
    end

    VX_shift_register #(
--- a/hw/rtl/fp_cores/VX_fp_dpi.v
+++ b/hw/rtl/fp_cores/VX_fp_dpi.v
@ -0,0 +1,415 @@
+`ifndef SYNTHESIS
+
+`include "VX_define.vh"
+`include "float_dpi.vh"
+
+module VX_fp_dpi #( 
+    parameter TAGW = 1
+) (
+    input wire clk,
+    input wire reset,
+
+    input wire  valid_in,
+    output wire ready_in,
+
+    input wire [TAGW-1:0] tag_in,
+    
+    input wire [`FPU_BITS-1:0] op_type,
+    input wire [`MOD_BITS-1:0] frm,
+
+    input wire [`NUM_THREADS-1:0][31:0]  dataa,
+    input wire [`NUM_THREADS-1:0][31:0]  datab,
+    input wire [`NUM_THREADS-1:0][31:0]  datac,
+    output wire [`NUM_THREADS-1:0][31:0] result, 
+
+    output wire has_fflags,
+    output fflags_t [`NUM_THREADS-1:0] fflags,
+
+    output wire [TAGW-1:0] tag_out,
+
+    input wire  ready_out,
+    output wire valid_out
+);
+    localparam FPU_FMA  = 0;
+    localparam FPU_DIV  = 1;
+    localparam FPU_SQRT = 2;
+    localparam FPU_CVT  = 3;
+    localparam FPU_NCP  = 4;
+    localparam NUM_FPC  = 5;
+    localparam FPC_BITS = `LOG2UP(NUM_FPC);
+    
+    wire [NUM_FPC-1:0] per_core_ready_in;
+    wire [NUM_FPC-1:0][`NUM_THREADS-1:0][31:0] per_core_result;
+    wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
+    reg [NUM_FPC-1:0] per_core_ready_out;
+    wire [NUM_FPC-1:0] per_core_valid_out;
+    
+    wire [NUM_FPC-1:0] per_core_has_fflags;  
+    fflags_t [NUM_FPC-1:0][`NUM_THREADS-1:0] per_core_fflags;  
+
+    reg [FPC_BITS-1:0] core_select;
+
+    reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
+    reg is_itof, is_utof, is_ftoi, is_ftou;
+    reg is_fclss, is_flt, is_fle, is_feq, is_fmin, is_fmax, is_fsgnj, is_fsgnjn, is_fsgnjx;
+
+    always @(*) begin
+        is_fadd   = 0;
+        is_fsub   = 0;        
+        is_fmul   = 0;        
+        is_fmadd  = 0;
+        is_fmsub  = 0;
+        is_fnmadd = 0;           
+        is_fnmsub = 0;        
+        is_itof   = 0;
+        is_utof   = 0;
+        is_ftoi   = 0;
+        is_ftou   = 0;
+        is_fclss  = 0;
+        is_flt    = 0;
+        is_fle    = 0;
+        is_feq    = 0;
+        is_fmin   = 0;
+        is_fmax   = 0;
+        is_fsgnj  = 0;
+        is_fsgnjn = 0;
+        is_fsgnjx = 0;
+
+        case (op_type)
+            `FPU_ADD:   begin core_select = FPU_FMA; is_fadd = 1; end
+            `FPU_SUB:   begin core_select = FPU_FMA; is_fsub = 1; end
+            `FPU_MUL:   begin core_select = FPU_FMA; is_fmul = 1; end
+            `FPU_MADD:  begin core_select = FPU_FMA; is_fmadd = 1; end
+            `FPU_MSUB:  begin core_select = FPU_FMA; is_fmsub = 1; end
+            `FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
+            `FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
+            `FPU_DIV:   begin core_select = FPU_DIV; end
+            `FPU_SQRT:  begin core_select = FPU_SQRT; end
+            `FPU_CVTWS: begin core_select = FPU_CVT; is_ftoi = 1; end
+            `FPU_CVTWUS:begin core_select = FPU_CVT; is_ftou = 1; end
+            `FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; end
+            `FPU_CVTSWU:begin core_select = FPU_CVT; is_utof = 1; end
+            `FPU_CLASS: begin core_select = FPU_NCP; is_fclss = 1; end  
+            `FPU_CMP:   begin core_select = FPU_NCP; 
+                            is_fle = (frm == 0); 
+                            is_flt = (frm == 1); 
+                            is_feq = (frm == 2); 
+                         end  
+            default:   begin core_select = FPU_NCP; 
+                            is_fsgnj  = (frm == 0);
+                            is_fsgnjn = (frm == 1);
+                            is_fsgnjx = (frm == 2);
+                            is_fmin   = (frm == 3);
+                            is_fmax   = (frm == 4);
+                        end
+        endcase
+    end
+
+    generate 
+    begin : fma
+        
+        wire [`NUM_THREADS-1:0][31:0] result_fma;
+        wire [`NUM_THREADS-1:0][31:0] result_fadd;
+        wire [`NUM_THREADS-1:0][31:0] result_fsub;
+        wire [`NUM_THREADS-1:0][31:0] result_fmul;
+        wire [`NUM_THREADS-1:0][31:0] result_fmadd;
+        wire [`NUM_THREADS-1:0][31:0] result_fmsub;
+        wire [`NUM_THREADS-1:0][31:0] result_fnmadd;
+        wire [`NUM_THREADS-1:0][31:0] result_fnmsub;
+        
+        fflags_t [`NUM_THREADS-1:0] fflags_fma;
+        fflags_t [`NUM_THREADS-1:0] fflags_fadd;
+        fflags_t [`NUM_THREADS-1:0] fflags_fsub;
+        fflags_t [`NUM_THREADS-1:0] fflags_fmul;
+        fflags_t [`NUM_THREADS-1:0] fflags_fmadd;
+        fflags_t [`NUM_THREADS-1:0] fflags_fmsub;
+        fflags_t [`NUM_THREADS-1:0] fflags_fnmadd;
+        fflags_t [`NUM_THREADS-1:0] fflags_fnmsub;
+
+        always @(*) begin        
+            for (integer i = 0; i < `NUM_THREADS; i++) begin
+                dpi_fadd   (dataa[i], datab[i], frm, result_fadd[i], fflags_fadd[i]);
+                dpi_fsub   (dataa[i], datab[i], frm, result_fsub[i], fflags_fsub[i]);
+                dpi_fmul   (dataa[i], datab[i], frm, result_fmul[i], fflags_fmul[i]);
+                dpi_fmadd  (dataa[i], datab[i], datac[i], frm, result_fmadd[i], fflags_fmadd[i]);
+                dpi_fmsub  (dataa[i], datab[i], datac[i], frm, result_fmsub[i], fflags_fmsub[i]);
+                dpi_fnmadd (dataa[i], datab[i], datac[i], frm, result_fnmadd[i], fflags_fnmadd[i]);
+                dpi_fnmsub (dataa[i], datab[i], datac[i], frm, result_fnmsub[i], fflags_fnmsub[i]);
+            end
+        end
+
+        assign result_fma = is_fadd   ? result_fadd :
+                            is_fsub   ? result_fsub :
+                            is_fmul   ? result_fmul :
+                            is_fmadd  ? result_fmadd :               
+                            is_fmsub  ? result_fmsub :
+                            is_fnmadd ? result_fnmadd :               
+                            is_fnmsub ? result_fnmsub :
+                                        0;
+
+        assign fflags_fma = is_fadd   ? fflags_fadd :
+                            is_fsub   ? fflags_fsub :
+                            is_fmul   ? fflags_fmul :
+                            is_fmadd  ? fflags_fmadd :               
+                            is_fmsub  ? fflags_fmsub :
+                            is_fnmadd ? fflags_fnmadd :               
+                            is_fnmsub ? fflags_fnmsub : 
+                                        0;
+
+        wire enable = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA];
+        wire valid  = (valid_in && core_select == FPU_FMA);
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
+            .DEPTH  (`LATENCY_FMA),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (enable),
+            .data_in  ({valid,                       tag_in,                    result_fma,               fflags_fma}),
+            .data_out ({per_core_valid_out[FPU_FMA], per_core_tag_out[FPU_FMA], per_core_result[FPU_FMA], per_core_fflags[FPU_FMA]})
+        );
+
+        assign per_core_has_fflags[FPU_FMA] = 1;
+        assign per_core_ready_in[FPU_FMA] = enable;
+
+    end
+    endgenerate
+
+    generate 
+    begin : fdiv
+
+        wire [`NUM_THREADS-1:0][31:0] result_fdiv;
+        fflags_t [`NUM_THREADS-1:0] fflags_fdiv;
+        
+        always @(*) begin        
+            for (integer i = 0; i < `NUM_THREADS; i++) begin
+                dpi_fdiv (dataa[i], datab[i], frm, result_fdiv[i], fflags_fdiv[i]);
+            end
+        end
+
+        wire enable = per_core_ready_out[FPU_DIV] || ~per_core_valid_out[FPU_DIV];
+        wire valid  = (valid_in && core_select == FPU_DIV);
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
+            .DEPTH  (`LATENCY_FDIV),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (enable),
+            .data_in  ({valid,                       tag_in,                    result_fdiv,               fflags_fdiv}),
+            .data_out ({per_core_valid_out[FPU_DIV], per_core_tag_out[FPU_DIV], per_core_result[FPU_DIV], per_core_fflags[FPU_DIV]})
+        );
+
+        assign per_core_has_fflags[FPU_DIV] = 1;
+        assign per_core_ready_in[FPU_DIV] = enable;
+
+    end
+    endgenerate
+
+    generate 
+    begin : fsqrt
+
+        wire [`NUM_THREADS-1:0][31:0] result_fsqrt;
+        fflags_t [`NUM_THREADS-1:0] fflags_fsqrt;
+        
+        always @(*) begin        
+            for (integer i = 0; i < `NUM_THREADS; i++) begin
+                dpi_fsqrt (dataa[i], frm, result_fsqrt[i], fflags_fsqrt[i]);
+            end
+        end
+
+        wire enable = per_core_ready_out[FPU_SQRT] || ~per_core_valid_out[FPU_SQRT];
+        wire valid  = (valid_in && core_select == FPU_SQRT);
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
+            .DEPTH  (`LATENCY_FSQRT),
+            .RESETW (1)
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (enable),
+            .data_in  ({valid,                        tag_in,                     result_fsqrt,              fflags_fsqrt}),
+            .data_out ({per_core_valid_out[FPU_SQRT], per_core_tag_out[FPU_SQRT], per_core_result[FPU_SQRT], per_core_fflags[FPU_SQRT]})
+        );
+
+        assign per_core_has_fflags[FPU_SQRT] = 1;
+        assign per_core_ready_in[FPU_SQRT] = enable;
+
+    end
+    endgenerate
+
+    generate
+    begin : fcvt // FPU_CVT core model: per-thread results come from the dpi_itof/utof/ftoi/ftou calls below
+
+        wire [`NUM_THREADS-1:0][31:0] result_fcvt; // selected result fed into the shift register
+        wire [`NUM_THREADS-1:0][31:0] result_itof;
+        wire [`NUM_THREADS-1:0][31:0] result_utof;
+        wire [`NUM_THREADS-1:0][31:0] result_ftoi;
+        wire [`NUM_THREADS-1:0][31:0] result_ftou;
+        
+        fflags_t [`NUM_THREADS-1:0] fflags_fcvt; // selected fflags fed into the shift register
+        fflags_t [`NUM_THREADS-1:0] fflags_itof;
+        fflags_t [`NUM_THREADS-1:0] fflags_utof;
+        fflags_t [`NUM_THREADS-1:0] fflags_ftoi;
+        fflags_t [`NUM_THREADS-1:0] fflags_ftou;
+        
+        always @(*) begin // all four DPI calls run for every thread; the is_* muxes below pick one
+            for (integer i = 0; i < `NUM_THREADS; i++) begin
+                dpi_itof (dataa[i], frm, result_itof[i], fflags_itof[i]);
+                dpi_utof (dataa[i], frm, result_utof[i], fflags_utof[i]);
+                dpi_ftoi (dataa[i], frm, result_ftoi[i], fflags_ftoi[i]);
+                dpi_ftou (dataa[i], frm, result_ftou[i], fflags_ftou[i]);
+            end
+        end
+
+        assign result_fcvt = is_itof ? result_itof : // priority mux on the is_* operation flags
+                             is_utof ? result_utof :
+                             is_ftoi ? result_ftoi :
+                             is_ftou ? result_ftou : 
+                                       0; // no conversion flag set: drive zeros
+
+        assign fflags_fcvt = is_itof ? fflags_itof : // same selection order as result_fcvt
+                             is_utof ? fflags_utof :
+                             is_ftoi ? fflags_ftoi :
+                             is_ftou ? fflags_ftou : 
+                                       0;
+
+        wire enable = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT]; // stall only while a valid output is not being accepted
+        wire valid  = (valid_in && core_select == FPU_CVT); // request is valid for this core only when it targets FPU_CVT
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))), // valid bit + tag + per-thread (32-bit result + fflags)
+            .DEPTH  (`LATENCY_FCVT),
+            .RESETW (1) // NOTE(review): presumably only the top (valid) bit is reset; confirm in VX_shift_register
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (enable),
+            .data_in  ({valid,                       tag_in,                    result_fcvt,              fflags_fcvt}),
+            .data_out ({per_core_valid_out[FPU_CVT], per_core_tag_out[FPU_CVT], per_core_result[FPU_CVT], per_core_fflags[FPU_CVT]})
+        );
+
+        assign per_core_has_fflags[FPU_CVT] = 1; // this core always reports fflags
+        assign per_core_ready_in[FPU_CVT] = enable; // accept a new request whenever the shift register can advance
+
+    end
+    endgenerate
+
+    generate 
+    begin : fncp // FPU_NCP core model: per-thread results come from the DPI calls below, plus a plain move
+
+        wire [`NUM_THREADS-1:0][31:0] result_fncp; // selected result fed into the shift register
+        wire [`NUM_THREADS-1:0][31:0] result_fclss;
+        wire [`NUM_THREADS-1:0][31:0] result_flt;
+        wire [`NUM_THREADS-1:0][31:0] result_fle;
+        wire [`NUM_THREADS-1:0][31:0] result_feq;
+        wire [`NUM_THREADS-1:0][31:0] result_fmin;
+        wire [`NUM_THREADS-1:0][31:0] result_fmax;
+        wire [`NUM_THREADS-1:0][31:0] result_fsgnj;
+        wire [`NUM_THREADS-1:0][31:0] result_fsgnjn;
+        wire [`NUM_THREADS-1:0][31:0] result_fsgnjx;
+        reg [`NUM_THREADS-1:0][31:0] result_fmv; // assigned directly in the always block below, hence reg
+
+        fflags_t [`NUM_THREADS-1:0] fflags_fncp; // selected fflags fed into the shift register
+        fflags_t [`NUM_THREADS-1:0] fflags_flt;
+        fflags_t [`NUM_THREADS-1:0] fflags_fle;
+        fflags_t [`NUM_THREADS-1:0] fflags_feq;
+        fflags_t [`NUM_THREADS-1:0] fflags_fmin;
+        fflags_t [`NUM_THREADS-1:0] fflags_fmax;
+        
+        always @(*) begin // every operation is evaluated for every thread; the is_* muxes below pick one
+            for (integer i = 0; i < `NUM_THREADS; i++) begin
+                dpi_fclss  (dataa[i], result_fclss[i]);
+                dpi_flt    (dataa[i], datab[i], result_flt[i], fflags_flt[i]);
+                dpi_fle    (dataa[i], datab[i], result_fle[i], fflags_fle[i]);
+                dpi_feq    (dataa[i], datab[i], result_feq[i], fflags_feq[i]);
+                dpi_fmin   (dataa[i], datab[i], result_fmin[i], fflags_fmin[i]);
+                dpi_fmax   (dataa[i], datab[i], result_fmax[i], fflags_fmax[i]);            
+                dpi_fsgnj  (dataa[i], result_fsgnj[i]); // NOTE(review): only dataa is passed; confirm the DPI signature does not also need datab
+                dpi_fsgnjn (dataa[i], result_fsgnjn[i]); // NOTE(review): same question as dpi_fsgnj
+                dpi_fsgnjx (dataa[i], result_fsgnjx[i]); // NOTE(review): same question as dpi_fsgnj
+                result_fmv[i] = dataa[i]; // move: operand a passes through unchanged
+            end
+        end
+
+        assign result_fncp = is_fclss  ? result_fclss : // priority mux on the is_* operation flags
+                             is_flt    ? result_flt :
+                             is_fle    ? result_fle :
+                             is_feq    ? result_feq :
+                             is_fmin   ? result_fmin :
+                             is_fmax   ? result_fmax :
+                             is_fsgnj  ? result_fsgnj :
+                             is_fsgnjn ? result_fsgnjn :
+                             is_fsgnjx ? result_fsgnjx :
+                                         result_fmv; // default when none of the flags above is set
+
+        wire has_fflags_fncp = (is_flt || is_fle || is_feq || is_fmin || is_fmax); // only compare and min/max ops report fflags
+
+        assign fflags_fncp = is_flt  ? fflags_flt :
+                             is_fle  ? fflags_fle :
+                             is_feq  ? fflags_feq :
+                             is_fmin ? fflags_fmin :
+                             is_fmax ? fflags_fmax : 
+                                       0; // ops without fflags drive zeros
+
+        wire enable = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP]; // stall only while a valid output is not being accepted
+        wire valid  = (valid_in && core_select == FPU_NCP); // request is valid for this core only when it targets FPU_NCP
+
+        VX_shift_register #(
+            .DATAW  (1 + TAGW + 1 + `NUM_THREADS * (32 + $bits(fflags_t))), // valid + tag + has_fflags + per-thread (32-bit result + fflags)
+            .DEPTH  (`LATENCY_FNCP),
+            .RESETW (1) // NOTE(review): presumably only the top (valid) bit is reset; confirm in VX_shift_register
+        ) shift_reg (
+            .clk      (clk),
+            .reset    (reset),
+            .enable   (enable),
+            .data_in  ({valid,                       tag_in,                    has_fflags_fncp,              result_fncp,              fflags_fncp}),
+            .data_out ({per_core_valid_out[FPU_NCP], per_core_tag_out[FPU_NCP], per_core_has_fflags[FPU_NCP], per_core_result[FPU_NCP], per_core_fflags[FPU_NCP]})
+        );
+        
+        assign per_core_ready_in[FPU_NCP] = enable; // accept a new request whenever the shift register can advance
+
+    end
+    endgenerate
+
+    ///////////////////////////////////////////////////////////////////////////
+
+    reg has_fflags_n; // *_n signals hold the fields of the core selected by the always block below
+    fflags_t [`NUM_THREADS-1:0] fflags_n;
+    reg [`NUM_THREADS-1:0][31:0] result_n;
+    reg [TAGW-1:0] tag_out_n;
+
+    always @(*) begin // forward the output of the lowest-indexed core that currently has a valid result
+        per_core_ready_out = 0; // by default no core is drained this cycle
+        has_fflags_n       = 'x; // outputs are don't-care when no core is valid
+        fflags_n           = 'x;
+        result_n           = 'x;
+        tag_out_n          = 'x;
+        for (integer i = 0; i < NUM_FPC; i++) begin
+            if (per_core_valid_out[i]) begin                
+                has_fflags_n = per_core_has_fflags[i];
+                fflags_n     = per_core_fflags[i];
+                result_n     = per_core_result[i];
+                tag_out_n    = per_core_tag_out[i];
+                per_core_ready_out[i] = ready_out; // only the selected core sees the downstream ready
+                break; // lower index wins; other valid cores keep ready_out = 0 and wait
+            end
+        end
+    end
+
+    assign valid_out  = (| per_core_valid_out); // valid whenever any core has a result
+    assign has_fflags = has_fflags_n;
+    assign tag_out    = tag_out_n;
+    assign result     = result_n;    
+    assign fflags     = fflags_n;
+
+    assign ready_in = per_core_ready_in[core_select]; // input ready comes from the core the current op targets
+
+endmodule
+
+`endif
--- a/hw/rtl/fp_cores/VX_fp_fma.v
+++ b/hw/rtl/fp_cores/VX_fp_fma.v
@ -1,9 +1,5 @@
 `include "VX_define.vh"

-`ifndef SYNTHESIS
-`include "float_dpi.vh"
-`endif
-
 module VX_fp_fma #( 
    parameter TAGW = 1,
    parameter LANES = 1
@ -63,7 +59,6 @@ module VX_fp_fma #(
            end    
        end

-    `ifdef QUARTUS
        acl_fmadd fmadd (
            .clk    (clk),
            .areset (reset),
@ -73,15 +68,6 @@ module VX_fp_fma #(
            .c      (c),
            .q      (result[i])
        );
-    `else
-        integer fmadd_h;
-        initial begin
-            fmadd_h = dpi_register();
-        end
-        always @(posedge clk) begin
-           dpi_fmadd (fmadd_h, enable, a, b, c, `LATENCY_FMA, result[i]);
-        end
-    `endif
    end
    
    VX_shift_register #(
--- a/hw/rtl/fp_cores/VX_fp_fpga.v
+++ b/hw/rtl/fp_cores/VX_fp_fpga.v
@ -27,6 +27,11 @@ module VX_fp_fpga #(
    input wire  ready_out,
    output wire valid_out
 );
+    localparam FPU_FMA  = 0; // core_select value / per_core_* index of the fp_fma core
+    localparam FPU_DIV  = 1; // index of fp_div (plain identifier; not the FPU_DIV op_type macro used in the case below)
+    localparam FPU_SQRT = 2; // index of fp_sqrt (plain identifier; not the FPU_SQRT op_type macro)
+    localparam FPU_CVT  = 3; // index of fp_cvt
+    localparam FPU_NCP  = 4; // index of fp_ncomp; also the default core_select
    localparam NUM_FPC  = 5;
    localparam FPC_BITS = `LOG2UP(NUM_FPC);
    
@ -49,20 +54,20 @@ module VX_fp_fpga #(
        is_itof   = 'x;
        is_signed = 'x;
        case (op_type)
-            `FPU_ADD:    begin core_select = 0; do_madd = 0; do_sub = 0; do_neg = 0; end
-            `FPU_SUB:    begin core_select = 0; do_madd = 0; do_sub = 1; do_neg = 0; end
-            `FPU_MUL:    begin core_select = 0; do_madd = 0; do_sub = 0; do_neg = 1; end
-            `FPU_MADD:   begin core_select = 0; do_madd = 1; do_sub = 0; do_neg = 0; end
-            `FPU_MSUB:   begin core_select = 0; do_madd = 1; do_sub = 1; do_neg = 0; end
-            `FPU_NMADD:  begin core_select = 0; do_madd = 1; do_sub = 0; do_neg = 1; end
-            `FPU_NMSUB:  begin core_select = 0; do_madd = 1; do_sub = 1; do_neg = 1; end
-            `FPU_DIV:    begin core_select = 1; end
-            `FPU_SQRT:   begin core_select = 2; end
-            `FPU_CVTWS:  begin core_select = 3; is_itof = 0; is_signed = 1; end
-            `FPU_CVTWUS: begin core_select = 3; is_itof = 0; is_signed = 0; end
-            `FPU_CVTSW:  begin core_select = 3; is_itof = 1; is_signed = 1; end
-            `FPU_CVTSWU: begin core_select = 3; is_itof = 1; is_signed = 0; end
-            default:     begin core_select = 4; end
+            `FPU_ADD:    begin core_select = FPU_FMA; do_madd = 0; do_sub = 0; do_neg = 0; end
+            `FPU_SUB:    begin core_select = FPU_FMA; do_madd = 0; do_sub = 1; do_neg = 0; end
+            `FPU_MUL:    begin core_select = FPU_FMA; do_madd = 0; do_sub = 0; do_neg = 1; end
+            `FPU_MADD:   begin core_select = FPU_FMA; do_madd = 1; do_sub = 0; do_neg = 0; end
+            `FPU_MSUB:   begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 0; end
+            `FPU_NMADD:  begin core_select = FPU_FMA; do_madd = 1; do_sub = 0; do_neg = 1; end
+            `FPU_NMSUB:  begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 1; end
+            `FPU_DIV:    begin core_select = FPU_DIV; end
+            `FPU_SQRT:   begin core_select = FPU_SQRT; end
+            `FPU_CVTWS:  begin core_select = FPU_CVT; is_itof = 0; is_signed = 1; end
+            `FPU_CVTWUS: begin core_select = FPU_CVT; is_itof = 0; is_signed = 0; end
+            `FPU_CVTSW:  begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
+            `FPU_CVTSWU: begin core_select = FPU_CVT; is_itof = 1; is_signed = 0; end
+            default:     begin core_select = FPU_NCP; end
        endcase
    end

@ -72,8 +77,8 @@ module VX_fp_fpga #(
    ) fp_fma (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 0)),
-        .ready_in   (per_core_ready_in[0]),    
+        .valid_in   (valid_in && (core_select == FPU_FMA)),
+        .ready_in   (per_core_ready_in[FPU_FMA]),    
        .tag_in     (tag_in),  
        .frm        (frm),
        .do_madd    (do_madd),
@ -82,12 +87,12 @@ module VX_fp_fpga #(
        .dataa      (dataa), 
        .datab      (datab),    
        .datac      (datac),   
-        .has_fflags (per_core_has_fflags[0]),
-        .fflags     (per_core_fflags[0]),
-        .result     (per_core_result[0]),
-        .tag_out    (per_core_tag_out[0]),
-        .ready_out  (per_core_ready_out[0]),
-        .valid_out  (per_core_valid_out[0])
+        .has_fflags (per_core_has_fflags[FPU_FMA]),
+        .fflags     (per_core_fflags[FPU_FMA]),
+        .result     (per_core_result[FPU_FMA]),
+        .tag_out    (per_core_tag_out[FPU_FMA]),
+        .ready_out  (per_core_ready_out[FPU_FMA]),
+        .valid_out  (per_core_valid_out[FPU_FMA])
    );

    VX_fp_div #(
@ -96,18 +101,18 @@ module VX_fp_fpga #(
    ) fp_div (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 1)),
-        .ready_in   (per_core_ready_in[1]),    
+        .valid_in   (valid_in && (core_select == FPU_DIV)),
+        .ready_in   (per_core_ready_in[FPU_DIV]),    
        .tag_in     (tag_in),
        .frm        (frm),  
        .dataa      (dataa), 
        .datab      (datab),   
-        .has_fflags (per_core_has_fflags[1]),
-        .fflags     (per_core_fflags[1]),   
-        .result     (per_core_result[1]),
-        .tag_out    (per_core_tag_out[1]),
-        .ready_out  (per_core_ready_out[1]),
-        .valid_out  (per_core_valid_out[1])
+        .has_fflags (per_core_has_fflags[FPU_DIV]),
+        .fflags     (per_core_fflags[FPU_DIV]),   
+        .result     (per_core_result[FPU_DIV]),
+        .tag_out    (per_core_tag_out[FPU_DIV]),
+        .ready_out  (per_core_ready_out[FPU_DIV]),
+        .valid_out  (per_core_valid_out[FPU_DIV])
    );

    VX_fp_sqrt #(
@ -116,17 +121,17 @@ module VX_fp_fpga #(
    ) fp_sqrt (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 2)),
-        .ready_in   (per_core_ready_in[2]),    
+        .valid_in   (valid_in && (core_select == FPU_SQRT)),
+        .ready_in   (per_core_ready_in[FPU_SQRT]),    
        .tag_in     (tag_in),
        .frm        (frm),    
        .dataa      (dataa), 
-        .has_fflags (per_core_has_fflags[2]),
-        .fflags     (per_core_fflags[2]),
-        .result     (per_core_result[2]),
-        .tag_out    (per_core_tag_out[2]),
-        .ready_out  (per_core_ready_out[2]),
-        .valid_out  (per_core_valid_out[2])
+        .has_fflags (per_core_has_fflags[FPU_SQRT]),
+        .fflags     (per_core_fflags[FPU_SQRT]),
+        .result     (per_core_result[FPU_SQRT]),
+        .tag_out    (per_core_tag_out[FPU_SQRT]),
+        .ready_out  (per_core_ready_out[FPU_SQRT]),
+        .valid_out  (per_core_valid_out[FPU_SQRT])
    );

    VX_fp_cvt #(
@ -135,19 +140,19 @@ module VX_fp_fpga #(
    ) fp_cvt (
        .clk        (clk), 
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 3)),
-        .ready_in   (per_core_ready_in[3]),    
+        .valid_in   (valid_in && (core_select == FPU_CVT)),
+        .ready_in   (per_core_ready_in[FPU_CVT]),    
        .tag_in     (tag_in), 
        .frm        (frm),
        .is_itof    (is_itof),   
        .is_signed  (is_signed),        
        .dataa      (dataa),  
-        .has_fflags (per_core_has_fflags[3]),
-        .fflags     (per_core_fflags[3]),
-        .result     (per_core_result[3]),
-        .tag_out    (per_core_tag_out[3]),
-        .ready_out  (per_core_ready_out[3]),
-        .valid_out  (per_core_valid_out[3])
+        .has_fflags (per_core_has_fflags[FPU_CVT]),
+        .fflags     (per_core_fflags[FPU_CVT]),
+        .result     (per_core_result[FPU_CVT]),
+        .tag_out    (per_core_tag_out[FPU_CVT]),
+        .ready_out  (per_core_ready_out[FPU_CVT]),
+        .valid_out  (per_core_valid_out[FPU_CVT])
    );

    VX_fp_ncomp #(
@ -156,19 +161,19 @@ module VX_fp_fpga #(
    ) fp_ncomp (
        .clk        (clk),
        .reset      (reset),   
-        .valid_in   (valid_in && (core_select == 4)),
-        .ready_in   (per_core_ready_in[4]),        
+        .valid_in   (valid_in && (core_select == FPU_NCP)),
+        .ready_in   (per_core_ready_in[FPU_NCP]),        
        .tag_in     (tag_in),        
        .op_type    (op_type),
        .frm        (frm),
        .dataa      (dataa),
        .datab      (datab),        
-        .result     (per_core_result[4]), 
-        .has_fflags (per_core_has_fflags[4]),
-        .fflags     (per_core_fflags[4]),
-        .tag_out    (per_core_tag_out[4]),
-        .ready_out  (per_core_ready_out[4]),
-        .valid_out  (per_core_valid_out[4])
+        .result     (per_core_result[FPU_NCP]), 
+        .has_fflags (per_core_has_fflags[FPU_NCP]),
+        .fflags     (per_core_fflags[FPU_NCP]),
+        .tag_out    (per_core_tag_out[FPU_NCP]),
+        .ready_out  (per_core_ready_out[FPU_NCP]),
+        .valid_out  (per_core_valid_out[FPU_NCP])
    );

    reg has_fflags_n;
--- a/hw/rtl/fp_cores/VX_fp_ncomp.v
+++ b/hw/rtl/fp_cores/VX_fp_ncomp.v
@ -160,7 +160,7 @@ module VX_fp_ncomp #(
    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin
            case (frm_s0)
-                `FRM_RNE: begin
+                `FRM_RNE: begin // LE
                    fcmp_fflags[i] = 5'h0;
                    if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
                        fcmp_res[i]       = 32'h0;
@ -169,7 +169,7 @@ module VX_fp_ncomp #(
                        fcmp_res[i] = {31'h0, (a_smaller_s0[i] | ab_equal_s0[i])};
                    end
                end
-                `FRM_RTZ: begin
+                `FRM_RTZ: begin // LS
                    fcmp_fflags[i] = 5'h0;
                    if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
                        fcmp_res[i]       = 32'h0;
@ -178,7 +178,7 @@ module VX_fp_ncomp #(
                        fcmp_res[i] = {31'h0, (a_smaller_s0[i] & ~ab_equal_s0[i])};
                    end                    
                end
-                `FRM_RDN: begin
+                `FRM_RDN: begin // EQ
                    fcmp_fflags[i] = 5'h0;
                    if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
                        fcmp_res[i]       = 32'h0;
@ -223,7 +223,7 @@ module VX_fp_ncomp #(
                            tmp_fflags[i] = 0;
                            tmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling;
                        end
-                        //5,6,7: 
+                        //5,6,7: MOVE
                        default: begin
                            tmp_result[i] = dataa[i];
                            tmp_fflags[i] = 'x;
--- a/hw/rtl/fp_cores/VX_fp_sqrt.v
+++ b/hw/rtl/fp_cores/VX_fp_sqrt.v
@ -1,9 +1,5 @@
 `include "VX_define.vh"

-`ifndef SYNTHESIS
-`include "float_dpi.vh"
-`endif
-
 module VX_fp_sqrt #( 
    parameter TAGW = 1,
    parameter LANES = 1
@ -30,26 +26,24 @@ module VX_fp_sqrt #(
    output wire valid_out
 );    
    wire stall = ~ready_out && valid_out;
-    wire enable = ~stall;
+    wire enable = ~stall;     
+
+    wire _reset;   // output of reset_relay below; drives acl_fsqrt's areset instead of the raw reset
+
+    VX_reset_relay reset_relay ( // NOTE(review): presumably re-registers reset before fan-out to the lanes; confirm in VX_reset_relay
+        .clk       (clk),
+        .reset     (reset),
+        .reset_out (_reset)
+    );  
    
    for (genvar i = 0; i < LANES; i++) begin
-    `ifdef QUARTUS
        acl_fsqrt fsqrt (
            .clk    (clk),
-            .areset (reset),
+            .areset (_reset),
            .en     (enable),
            .a      (dataa[i]),
            .q      (result[i])
        );
-    `else
-        integer fsqrt_h;
-        initial begin
-            fsqrt_h = dpi_register();
-        end
-        always @(posedge clk) begin
-           dpi_fsqrt (fsqrt_h, enable, dataa[i], `LATENCY_FSQRT, result[i]);
-        end
-    `endif
    end

    VX_shift_register #(
--- a/hw/rtl/fp_cores/VX_fpnew.v
+++ b/hw/rtl/fp_cores/VX_fpnew.v
@ -56,13 +56,13 @@ module VX_fpnew

    localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
      PipeRegs:'{'{`LATENCY_FMA, 0, 0, 0, 0},   // ADDMUL
-                 '{default: `LATENCY_FDIVSQRT},   // DIVSQRT
-                 '{default: `LATENCY_FNCOMP},   // NONCOMP
-                 '{default: `LATENCY_FCONV}},     // CONV
-      UnitTypes:'{'{default: UNIT_FMULADD},       // ADDMUL
-                  '{default: UNIT_FDIVSQRT},      // DIVSQRT
-                  '{default: UNIT_FNONCOMP},      // NONCOMP
-                  '{default: UNIT_FCONV}},        // CONV
+                 '{default: `LATENCY_FDIVSQRT}, // DIVSQRT
+                 '{default: `LATENCY_FNCP},     // NONCOMP
+                 '{default: `LATENCY_FCVT}},    // CONV
+      UnitTypes:'{'{default: UNIT_FMULADD},     // ADDMUL
+                  '{default: UNIT_FDIVSQRT},    // DIVSQRT
+                  '{default: UNIT_FNONCOMP},    // NONCOMP
+                  '{default: UNIT_FCONV}},      // CONV
      PipeConfig: fpnew_pkg::DISTRIBUTED
    };
    
--- a/hw/rtl/fp_cores/svdpi/float_dpi.cpp
+++ b/hw/rtl/fp_cores/svdpi/float_dpi.cpp
@ -1,239 +0,0 @@
-#include <stdio.h>
-#include <math.h>
-#include <unordered_map>
-#include <vector>
-#include <mutex>
-#include <iostream>
-#include "svdpi.h"
-#include "verilated_vpi.h"
-#include "VX_config.h"
-
-extern "C" {
-  int dpi_register();
-  void dpi_fadd(int inst, bool enable, int a, int b, int delay, int* result);
-  void dpi_fsub(int inst, bool enable, int a, int b, int delay, int* result);
-  void dpi_fmul(int inst, bool enable, int a, int b, int delay, int* result);
-  void dpi_fmadd(int inst, bool enable, int a, int b, int c, int delay, int* result);
-  void dpi_fdiv(int inst, bool enable, int a, int b, int delay, int* result);
-  void dpi_fsqrt(int inst, bool enable, int a, int delay, int* result);
-  void dpi_ftoi(int inst, bool enable, int a, int delay, int* result);
-  void dpi_ftou(int inst, bool enable, int a, int delay, int* result);
-  void dpi_itof(int inst, bool enable, int a, int delay, int* result);
-  void dpi_utof(int inst, bool enable, int a, int delay, int* result);
-  void dpi_assert(int inst, bool cond, int delay);
-}
-
-class ShiftRegister {
-public:
-  ShiftRegister() : init_(false), depth_(0) {}
-
-  void ensure_init(int depth) {
-    if (!init_) {
-      buffer_.resize(depth);
-      init_  = true;
-      depth_ = depth;
-    }
-  }
-
-  void push(int value, bool enable) {
-    if (!enable)
-      return;      
-    for (unsigned i = 0; i < depth_-1; ++i) {
-      buffer_[i] = buffer_[i+1];
-    }
-    buffer_[depth_-1] = value;
-  }
-
-  int top() const {
-    return buffer_[0];
-  }
-
-private:
-
-  std::vector<int> buffer_;
-  bool init_;
-  unsigned depth_;  
-};
-
-union Float_t {    
-    float f;
-    int   i;
-    struct {
-        uint32_t man  : 23;
-        uint32_t exp  : 8;
-        uint32_t sign : 1;
-    } parts;
-}; 
-
-class Instances {
-public:
-  ShiftRegister& get(int inst) {
-    return instances_.at(inst);
-  }
-
-  int allocate() {
-    mutex_.lock();   
-    int inst = instances_.size();
-    instances_.resize(inst + 1); 
-    mutex_.unlock();
-    return inst;
-  }
-
-private:
-  std::vector<ShiftRegister> instances_;
-  std::mutex mutex_;
-};
-
-Instances instances;
-
-int dpi_register() {
-  return instances.allocate();
-}
-
-void dpi_fadd(int inst, bool enable, int a, int b, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fb, fr;
-
-  fa.i = a;
-  fb.i = b;
-  fr.f = fa.f + fb.f;
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_fsub(int inst, bool enable, int a, int b, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fb, fr;
-
-  fa.i = a;
-  fb.i = b;
-  fr.f = fa.f - fb.f;
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_fmul(int inst, bool enable, int a, int b, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fb, fr;
-
-  fa.i = a;
-  fb.i = b;
-  fr.f = fa.f * fb.f;
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_fmadd(int inst, bool enable, int a, int b, int c, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fb, fc, fr;
-
-  fa.i = a;
-  fb.i = b;
-  fc.i = c;
-  fr.f = fa.f * fb.f + fc.f;
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_fdiv(int inst, bool enable, int a, int b, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fb, fr;
-
-  fa.i = a;
-  fb.i = b;
-  fr.f = fa.f / fb.f;
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_fsqrt(int inst, bool enable, int a, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fr;
-
-  fa.i = a;
-  fr.f = sqrtf(fa.f);
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_ftoi(int inst, bool enable, int a, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fr;
-
-  fa.i = a;
-  fr.i = int(fa.f);   
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_ftou(int inst, bool enable, int a, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fr;
-
-  fa.i = a;
-  fr.i = unsigned(fa.f);   
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_itof(int inst, bool enable, int a, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fr;
-
-  fr.f = (float)a;   
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_utof(int inst, bool enable, int a, int delay, int* result) {
-  ShiftRegister& sr = instances.get(inst);
-
-  Float_t fa, fr;
-
-  unsigned ua = a;
-  fr.f = (float)ua;   
-
-  sr.ensure_init(delay);
-  sr.push(fr.i, enable);
-  *result = sr.top();
-}
-
-void dpi_assert(int inst, bool cond, int delay) {
-  ShiftRegister& sr = instances.get(inst);
-
-  sr.ensure_init(delay);
-  sr.push(!cond, 1);
-
-  auto status = sr.top();
-  if (status) {
-    printf("delayed assertion at %s!\n", svGetNameFromScope(svGetScope()));
-    std::abort();
-  }
-}
--- a/hw/rtl/fp_cores/svdpi/float_dpi.vh
+++ b/hw/rtl/fp_cores/svdpi/float_dpi.vh
@ -1,20 +0,0 @@
-`ifndef FLOAT_DPI
-`define FLOAT_DPI
-
-import "DPI-C" context function int dpi_register();
-
-import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input int a, input int b, input int delay, output int result);
-import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input int a, input int b, input int delay, output int result);
-import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input int a, input int b, input int delay, output int result);
-import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input int a, input int b, input int c, input int delay, output int result);
-import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input int a, input int b, input int c, input int delay, output int result);
-import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input int a, input int b, input int delay, output int result);
-import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input int a, input int delay, output int result);
-import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input int a, input int delay, output int result);
-import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input int a, input int delay, output int result);
-import "DPI-C" context function void dpi_itof(int inst, input logic enable, input int a, input int delay, output int result);
-import "DPI-C" context function void dpi_utof(int inst, input logic enable, input int a, input int delay, output int result);
-
-import "DPI-C" context function void dpi_assert(int inst, input logic cond, input int delay);
-
-`endif
--- a/hw/rtl/libs/VX_lzc.v
+++ b/hw/rtl/libs/VX_lzc.v
@ -1,7 +1,7 @@
 `include "VX_platform.vh"

 module VX_lzc #(
-    parameter DATAW  = 1,
+    parameter DATAW  = 32,
    parameter LDATAW = `LOG2UP(DATAW)
 ) (
    input wire  [DATAW-1:0]  data_in,
--- a/hw/simulate/Makefile
+++ b/hw/simulate/Makefile
@ -31,12 +31,13 @@ MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
 TOP = Vortex

 RTL_DIR=../rtl
+DPI_DIR=../dpi

-FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src 
+FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src 
 RTL_INCLUDE = -I$(RTL_DIR)/ -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE)

 SRCS = simulator.cpp testbench.cpp
-SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
+SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp

 VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
 VL_FLAGS += -Wno-DECLFILENAME
--- a/hw/simulate/simulator.cpp
+++ b/hw/simulate/simulator.cpp
@ -3,7 +3,7 @@
 #include <fstream>
 #include <iomanip>

-#define RESET_DELAY 2
+#define RESET_DELAY 4

 #define ENABLE_DRAM_STALLS
 #define DRAM_LATENCY 24
@ -75,13 +75,6 @@ void Simulator::reset() {
  vortex_->csr_rsp_ready  = 0;

  vortex_->reset = 1;
-  
-  vortex_->clk = 0;
-  this->eval();
-  vortex_->clk = 1;
-  this->eval();
-
-  vortex_->reset = 0;

  for (int i = 0; i < RESET_DELAY; ++i) {
    vortex_->clk = 0;
@ -89,8 +82,11 @@ void Simulator::reset() {
    vortex_->clk = 1;
    this->eval();
  }  
+
+  vortex_->reset = 0; // release reset only after the RESET_DELAY clock cycles above
  
  // Turn on assertion after reset
+  printf("*** enabling assertion at tick: %ld\n", timestamp); // newline added so the message is not glued to the next output line
  Verilated::assertOn(true);
 }