runtime refactoring

2025-04-23 21:39:10 -04:00 · 2024-05-27 15:59:41 -07:00 · 2024-05-27 15:59:41 -07:00 · c1000f6a3b
commit c1000f6a3b
parent 405d6b468f
13 changed files with 317 additions and 484 deletions
--- a/runtime/common/common.h
+++ b/runtime/common/common.h
@ -0,0 +1,77 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vortex.h>
+#include <VX_config.h>
+#include <VX_types.h>
+#include <callbacks.h>
+#include <malloc.h>
+
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <unordered_map>
+
+// Cache block granularity (presumably in bytes — confirm against the hardware config).
+#define CACHE_BLOCK_SIZE    64
+
+// RAM page granularity (presumably in bytes — confirm against the memory model).
+#define RAM_PAGE_SIZE       4096
+
+// Base address for allocations; defined as exactly one cache block.
+#define ALLOC_BASE_ADDR     CACHE_BLOCK_SIZE
+
+// Global memory size, selected at preprocessing time from XLEN.
+#if (XLEN == 64)
+#define GLOBAL_MEM_SIZE      0x200000000  // 8 GB
+#else
+#define GLOBAL_MEM_SIZE      0x100000000  // 4 GB
+#endif
+
+// Debug trace macro. When NDEBUG is not defined it forwards to printf with a
+// "[VXDRV] " prefix prepended to the format string; when NDEBUG is defined it
+// expands to a no-op expression so call sites still parse as statements.
+// `##__VA_ARGS__` drops the trailing comma when no variadic arguments are given.
+#ifndef NDEBUG
+#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
+#else
+#define DBGPRINT(format, ...) ((void)0)
+#endif
+
+// Evaluates `_expr` once and stores the result in a local named `err`.
+// If `err` is 0 nothing else happens. Otherwise the stringified expression and
+// its value (cast to int) are printed with a "[VXDRV] Error:" prefix, and the
+// `_cleanup` statements run; `_cleanup` may reference `err` (e.g. `return err;`).
+// The body is wrapped in do { } while (false), so a bare `break` inside
+// `_cleanup` leaves the macro itself, not an enclosing loop at the call site.
+#define CHECK_ERR(_expr, _cleanup)              \
+    do {                                        \
+        auto err = _expr;                       \
+        if (err == 0)                           \
+            break;                              \
+        printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
+        _cleanup                                \
+    } while (false)
+
+// In-memory table of 32-bit values keyed by 32-bit address.
+class DeviceConfig {
+public:
+    // Stores `value` under `addr`, replacing any value written there earlier.
+    void write(uint32_t addr, uint32_t value) {
+        store_[addr] = value;
+    }
+
+    // Looks up `addr`. On a hit, copies the stored value into `*value` and
+    // returns 0. On a miss, returns -1 and leaves `*value` untouched.
+    int read(uint32_t addr, uint32_t* value) const {
+        const auto entry = store_.find(addr);
+        if (entry != store_.end()) {
+            *value = entry->second;
+            return 0;
+        }
+        return -1;
+    }
+
+private:
+    std::unordered_map<uint32_t, uint32_t> store_;
+};
+
+// Rounds `size` up to the nearest multiple of `alignment` and returns it.
+// `alignment` must be a non-zero power of two: the mask arithmetic below is
+// only valid under that precondition. Zero is rejected explicitly because it
+// passes the plain `a & (a - 1)` power-of-two test yet would turn the mask
+// into all-ones and collapse every size to 0.
+inline uint64_t aligned_size(uint64_t size, uint64_t alignment) {
+  assert(alignment != 0 && 0 == (alignment & (alignment - 1)));
+  const uint64_t mask = alignment - 1;
+  return (size + mask) & ~mask;
+}
+
+// Returns true when `addr` is a multiple of `alignment`.
+// `alignment` must be a non-zero power of two. Zero is rejected explicitly
+// because it passes the plain `a & (a - 1)` power-of-two test, after which
+// only address 0 would ever be reported as aligned.
+inline bool is_aligned(uint64_t addr, uint64_t alignment) {
+  assert(alignment != 0 && 0 == (alignment & (alignment - 1)));
+  const uint64_t mask = alignment - 1;
+  return 0 == (addr & mask);
+}
--- a/runtime/common/utils.h
+++ b/runtime/common/utils.h
@ -1,50 +0,0 @@
-// Copyright © 2019-2023
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vortex.h>
-#include <cstdint>
-#include <unordered_map>
-#include <VX_config.h>
-#include <VX_types.h>
-
-class DeviceConfig {
-public:
-    void write(uint32_t addr, uint32_t value);
-    int read(uint32_t addr, uint32_t* value) const;
-private:
-     std::unordered_map<uint32_t, uint32_t> store_;
-};
-
-int dcr_initialize(vx_device_h device);
-
-uint64_t aligned_size(uint64_t size, uint64_t alignment);
-
-bool is_aligned(uint64_t addr, uint64_t alignment);
-
-int profiling_add(vx_device_h device);
-
-void profiling_remove(int id);
-
-void profiling_begin(int id);
-
-void profiling_end(int id);
-
-#define CACHE_BLOCK_SIZE    64
-#define ALLOC_BASE_ADDR     CACHE_BLOCK_SIZE
-#if (XLEN == 64)
-#define GLOBAL_MEM_SIZE      0x200000000  // 8 GB
-#else
-#define GLOBAL_MEM_SIZE      0x100000000  // 4 GB
-#endif
--- a/runtime/opae/Makefile
+++ b/runtime/opae/Makefile
@ -20,7 +20,7 @@ CXXFLAGS += $(CONFIGS)

 LDFLAGS += -shared -luuid -ldl -pthread

-SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp $(COMMON_DIR)/utils.cpp
+SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp

 # set up target types
 ifeq ($(TARGET), opaesim)
@ -49,11 +49,6 @@ ifdef SCOPE
 	SRCS += $(COMMON_DIR)/scope.cpp
 endif

-# Enable perf counters
-ifdef PERF
-	CXXFLAGS += -DPERF_ENABLE
-endif
-
 PROJECT := libvortex-opae.so

 all: $(DESTDIR)/$(PROJECT)
--- a/runtime/opae/vortex.cpp
+++ b/runtime/opae/vortex.cpp
@ -11,9 +11,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <utils.h>
-#include <malloc.h>
+#include <common.h>
+#include <vortex_afu.h>
+
 #include "driver.h"
+#ifdef SCOPE
+#include "scope.h"
+#endif
+
 #include <iostream>
 #include <stdio.h>
 #include <stdlib.h>
@ -29,16 +34,6 @@
 #include <memory>
 #include <list>

-#include <VX_config.h>
-#include <VX_types.h>
-#include <vortex_afu.h>
-
-#ifdef SCOPE
-#include "scope.h"
-#endif
-
-#include <callbacks.h>
-
 using namespace vortex;

 #define CMD_MEM_READ        AFU_IMAGE_CMD_MEM_READ
@ -58,14 +53,6 @@ using namespace vortex;

 #define STATUS_STATE_BITS   8

-#define RAM_PAGE_SIZE       4096
-
-#ifndef NDEBUG
-#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
-#else
-#define DBGPRINT(format, ...) ((void)0)
-#endif
-
 #define CHECK_HANDLE(handle, _expr, _cleanup)   \
    auto handle = _expr;                        \
    if (handle == nullptr) {                    \
@ -82,15 +69,6 @@ using namespace vortex;
        _cleanup                                \
    } while (false)

-#define CHECK_ERR(_expr, _cleanup)              \
-    do {                                        \
-        auto err = _expr;                       \
-        if (err == 0)                           \
-            break;                              \
-        printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
-        _cleanup                                \
-    } while (false)
-
 ///////////////////////////////////////////////////////////////////////////////

 class vx_device {
@ -113,8 +91,6 @@ public:
            }
            api_.fpgaClose(fpga_);
        }
-
-        profiling_remove(profiling_id_);
    }

    int init() {
@ -211,12 +187,6 @@ public:
        }
    #endif

-        CHECK_ERR(dcr_initialize(this), {
-            return err;
-        });
-
-        profiling_id_ = profiling_add(this);
-
        return 0;
    }

@ -406,8 +376,6 @@ public:
            return err;
        });

-        profiling_begin(profiling_id_);
-
        // start execution
        CHECK_FPGA_ERR(api_.fpgaWriteMMIO64(fpga_, 0, MMIO_CMD_TYPE, CMD_RUN), {
            return -1;
@ -475,8 +443,6 @@ public:
            timeout -= sleep_time_ms;
        };

-        profiling_end(profiling_id_);
-
        return 0;
    }

@ -553,7 +519,6 @@ private:
    uint8_t* staging_ptr_;
    uint64_t staging_size_;
    std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
-    int profiling_id_;
 };

 struct vx_buffer {
--- a/runtime/rtlsim/Makefile
+++ b/runtime/rtlsim/Makefile
@ -17,7 +17,7 @@ CXXFLAGS += $(CONFIGS)
 LDFLAGS += -shared -pthread
 LDFLAGS += -L$(DESTDIR) -lrtlsim

-SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
+SRCS := $(SRC_DIR)/vortex.cpp

 # Debugigng
 ifdef DEBUG
@ -26,11 +26,6 @@ else
 	CXXFLAGS += -O2 -DNDEBUG
 endif

-# Enable perf counters
-ifdef PERF
-	CXXFLAGS += -DPERF_ENABLE
-endif
-
 PROJECT := libvortex-rtlsim.so

 all: $(DESTDIR)/$(PROJECT)
--- a/runtime/rtlsim/vortex.cpp
+++ b/runtime/rtlsim/vortex.cpp
@ -11,6 +11,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <common.h>
+
+#include <mem.h>
+#include <util.h>
+#include <processor.h>
+
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@ -20,39 +26,8 @@
 #include <list>
 #include <chrono>

-#include <vortex.h>
-#include <malloc.h>
-#include <utils.h>
-#include <VX_config.h>
-#include <VX_types.h>
-
-#include <mem.h>
-#include <util.h>
-#include <processor.h>
-
-#include <callbacks.h>
-
 using namespace vortex;

-#define RAM_PAGE_SIZE 4096
-
-#ifndef NDEBUG
-#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
-#else
-#define DBGPRINT(format, ...) ((void)0)
-#endif
-
-#define CHECK_ERR(_expr, _cleanup)              \
-    do {                                        \
-        auto err = _expr;                       \
-        if (err == 0)                           \
-            break;                              \
-        printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
-        _cleanup                                \
-    } while (false)
-
-///////////////////////////////////////////////////////////////////////////////
-
 class vx_device {
 public:
    vx_device()
@ -66,14 +41,9 @@ public:
        if (future_.valid()) {
            future_.wait();
        }
-        profiling_remove(profiling_id_);
    }

    int init() {
-        CHECK_ERR(dcr_initialize(this), {
-            return err;
-        });
-        profiling_id_ = profiling_add(this);
        return 0;
    }

@ -219,8 +189,6 @@ public:
        this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
        this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);

-        profiling_begin(profiling_id_);
-
        // start new run
        future_ = std::async(std::launch::async, [&]{
            processor_.run();
@ -245,7 +213,6 @@ public:
            if (0 == timeout_sec--)
                return -1;
        }
-        profiling_end(profiling_id_);
        return 0;
    }

@ -284,7 +251,6 @@ private:
    DeviceConfig        dcrs_;
    std::future<void>   future_;
    std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
-    int                 profiling_id_;
 };

 struct vx_buffer {
--- a/runtime/simx/Makefile
+++ b/runtime/simx/Makefile
@ -13,7 +13,7 @@ CXXFLAGS += -DXLEN_$(XLEN)
 LDFLAGS += -shared -pthread
 LDFLAGS += -L$(DESTDIR) -lsimx

-SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
+SRCS := $(SRC_DIR)/vortex.cpp

 # Debugigng
 ifdef DEBUG
--- a/runtime/simx/vortex.cpp
+++ b/runtime/simx/vortex.cpp
@ -11,6 +11,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include <common.h>
+
+#include <util.h>
+#include <processor.h>
+#include <arch.h>
+#include <mem.h>
+#include <constants.h>
+
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
@ -19,40 +27,8 @@
 #include <future>
 #include <chrono>

-#include <utils.h>
-#include <malloc.h>
-
-#include <VX_config.h>
-#include <VX_types.h>
-
-#include <util.h>
-
-#include <processor.h>
-#include <arch.h>
-#include <mem.h>
-#include <constants.h>
-
-#include <callbacks.h>
-
 using namespace vortex;

-#ifndef NDEBUG
-#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
-#else
-#define DBGPRINT(format, ...) ((void)0)
-#endif
-
-#define CHECK_ERR(_expr, _cleanup)              \
-    do {                                        \
-        auto err = _expr;                       \
-        if (err == 0)                           \
-            break;                              \
-        printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
-        _cleanup                                \
-    } while (false)
-
-///////////////////////////////////////////////////////////////////////////////
-
 class vx_device {
 public:
    vx_device()
@ -69,14 +45,9 @@ public:
        if (future_.valid()) {
            future_.wait();
        }
-        profiling_remove(profiling_id_);
    }

    int init() {
-        CHECK_ERR(dcr_initialize(this), {
-            return err;
-        });
-        profiling_id_ = profiling_add(this);
        return 0;
    }

@ -213,8 +184,6 @@ public:
        this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
        this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);

-        profiling_begin(profiling_id_);
-
        // start new run
        future_ = std::async(std::launch::async, [&]{
            processor_.run();
@ -239,7 +208,6 @@ public:
            if (0 == timeout_sec--)
                return -1;
        }
-        profiling_end(profiling_id_);
        return 0;
    }

@ -278,7 +246,6 @@ private:
    DeviceConfig        dcrs_;
    std::future<void>   future_;
    std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
-    int profiling_id_;
 };

 struct vx_buffer {
--- a/runtime/stub/Makefile
+++ b/runtime/stub/Makefile
@ -4,13 +4,20 @@ DESTDIR ?= $(CURDIR)/..

 SRC_DIR := $(VORTEX_HOME)/runtime/stub

-CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
+CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
 CXXFLAGS += -I$(INC_DIR) -I$(COMMON_DIR) -I$(ROOT_DIR)/hw -I$(SIM_DIR)/common
 CXXFLAGS += -fPIC

 LDFLAGS += -shared -pthread -ldl

-SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
+SRCS := $(SRC_DIR)/vortex.cpp $(SRC_DIR)/utils.cpp
+
+# Debugging
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else
+	CXXFLAGS += -O2 -DNDEBUG
+endif

 PROJECT := libvortex.so

--- a/runtime/common/utils.cpp
+++ b/runtime/common/utils.cpp
@ -11,7 +11,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include "utils.h"
+#include <common.h>
+
 #include <iostream>
 #include <fstream>
 #include <list>
@ -21,129 +22,30 @@
 #include <vortex.h>
 #include <assert.h>

-#define RT_CHECK(_expr, _cleanup)                               \
-  do {                                                         \
-    int _ret = _expr;                                          \
-    if (0 == _ret)                                             \
-      break;                                                   \
-    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
-    _cleanup                                                   \
-  } while (false)
-
-uint64_t aligned_size(uint64_t size, uint64_t alignment) {
-  assert(0 == (alignment & (alignment - 1)));
-  return (size + alignment - 1) & ~(alignment - 1);
-}
-
-bool is_aligned(uint64_t addr, uint64_t alignment) {
-  assert(0 == (alignment & (alignment - 1)));
-  return 0 == (addr & (alignment - 1));
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-class AutoPerfDump {
+// Holds the profiling class selected through the VORTEX_PROFILING environment
+// variable. The value is read once, in the constructor, and exposed read-only.
+class ProfilingMode {
 public:
-  AutoPerfDump() : perf_class_(0) {
+  // Defaults the class to 0; if VORTEX_PROFILING is set, its value is parsed
+  // with std::atoi and replaces the default.
+  ProfilingMode() : perf_class_(0) {
    auto profiling_s = getenv("VORTEX_PROFILING");
    if (profiling_s) {
      perf_class_ = std::atoi(profiling_s);
    }
  }

-  ~AutoPerfDump() {}
-
-  int add(vx_device_h hdevice) {
-    int ret = devices_.size();
-    devices_[ret] = hdevice;
-    return ret;
-  }
-
-  void remove(int id) {
-    devices_.erase(id);
-  }
-
-  void begin(int id) {
-    auto device = devices_.at(id);
-    vx_dcr_write(device, VX_DCR_BASE_MPM_CLASS, perf_class_);
-  }
-
-  void end(int id) {
-    auto device = devices_.at(id);
-    vx_dump_perf(device, stdout);
-  }
+  ~ProfilingMode() {}

+  // Returns the profiling class captured at construction time.
  int perf_class() const {
    return perf_class_;
  }

 private:
-  std::unordered_map<int, vx_device_h> devices_;
  int perf_class_;
 };

-static AutoPerfDump gAutoPerfDump;
-
-int profiling_add(vx_device_h hdevice) {
-  return gAutoPerfDump.add(hdevice);
+// Returns the profiling class taken from VORTEX_PROFILING (0 when unset).
+// The ProfilingMode instance is a function-local static, so the environment
+// variable is read on the first call only and later calls reuse that value.
+int get_profiling_mode() {
+  static ProfilingMode gProfilingMode;
+  return gProfilingMode.perf_class();
 }

-void profiling_remove(int id) {
-  gAutoPerfDump.remove(id);
-}
-
-void profiling_begin(int id) {
-  gAutoPerfDump.begin(id);
-}
-
-void profiling_end(int id) {
-  gAutoPerfDump.end(id);
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-void DeviceConfig::write(uint32_t addr, uint32_t value) {
-  store_[addr] = value;
-}
-
-int DeviceConfig::read(uint32_t addr, uint32_t* value) const {
-  auto it = store_.find(addr);
-  if (it == store_.end())
-    return -1;
-  *value = it->second;
-  return 0;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-int dcr_initialize(vx_device_h hdevice) {
-  const uint64_t startup_addr(STARTUP_ADDR);
-
-  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
-    return _ret;
-  });
-
-  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
-    return _ret;
-  });
-
-  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, 0), {
-    return _ret;
-  });
-
-  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, 0), {
-    return _ret;
-  });
-
-  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
-    return _ret;
-  });
-
-  return 0;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
 extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer) {
  if (nullptr == hdevice || nullptr == content || size <= 8 || nullptr == hbuffer)
    return -1;
@ -157,30 +59,30 @@ extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint

  vx_buffer_h _hbuffer;
 #ifndef NDEBUG
-  RT_CHECK(vx_mem_reserve(hdevice, min_vma, runtime_size, 0, &_hbuffer), {
-    return _ret;
+  CHECK_ERR(vx_mem_reserve(hdevice, min_vma, runtime_size, 0, &_hbuffer), {
+    return err;
  });
 #else
-  RT_CHECK(vx_mem_alloc(hdevice, runtime_size, 0, &_hbuffer), {
-    return _ret;
+  CHECK_ERR(vx_mem_alloc(hdevice, runtime_size, 0, &_hbuffer), {
+    return err;
  });
 #endif

  // mask binary region as read-only
-  RT_CHECK(vx_mem_access(_hbuffer, 0, bin_size, VX_MEM_READ), {
+  CHECK_ERR(vx_mem_access(_hbuffer, 0, bin_size, VX_MEM_READ), {
    vx_mem_free(_hbuffer);
-    return _ret;
+    return err;
  });

  // mark global variables region as read-write
-  RT_CHECK(vx_mem_access(_hbuffer, bin_size, runtime_size - bin_size, VX_MEM_READ_WRITE), {
+  CHECK_ERR(vx_mem_access(_hbuffer, bin_size, runtime_size - bin_size, VX_MEM_READ_WRITE), {
    vx_mem_free(_hbuffer);
-    return _ret;
+    return err;
  });

-  RT_CHECK(vx_copy_to_dev(_hbuffer, bytes, 0, bin_size), {
+  CHECK_ERR(vx_copy_to_dev(_hbuffer, bytes, 0, bin_size), {
    vx_mem_free(_hbuffer);
-    return _ret;
+    return err;
  });

  *hbuffer = _hbuffer;
@ -206,8 +108,8 @@ extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, vx_b
  ifs.read(content.data(), size);

  // upload buffer
-  RT_CHECK(vx_upload_kernel_bytes(hdevice, content.data(), size, hbuffer), {
-    return _ret;
+  CHECK_ERR(vx_upload_kernel_bytes(hdevice, content.data(), size, hbuffer), {
+    return err;
  });

  return 0;
@ -219,13 +121,13 @@ extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t si

  vx_buffer_h _hbuffer;

-  RT_CHECK(vx_mem_alloc(hdevice, size, VX_MEM_READ, &_hbuffer), {
-    return _ret;
+  CHECK_ERR(vx_mem_alloc(hdevice, size, VX_MEM_READ, &_hbuffer), {
+    return err;
  });

-  RT_CHECK(vx_copy_to_dev(_hbuffer, content, 0, size), {
+  CHECK_ERR(vx_copy_to_dev(_hbuffer, content, 0, size), {
    vx_mem_free(_hbuffer);
-    return _ret;
+    return err;
  });

  *hbuffer = _hbuffer;
@ -251,8 +153,8 @@ extern int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h
  ifs.read(content.data(), size);

  // upload buffer
-  RT_CHECK(vx_upload_bytes(hdevice, content.data(), size, hbuffer), {
-    return _ret;
+  CHECK_ERR(vx_upload_bytes(hdevice, content.data(), size, hbuffer), {
+    return err;
  });

  return 0;
@ -265,8 +167,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
  uint64_t total_cycles = 0;
  uint64_t max_cycles = 0;

-#ifdef PERF_ENABLE
-
  auto calcRatio = [&](uint64_t part, uint64_t total)->int {
    if (total == 0)
      return 0;
@ -283,8 +183,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
    return int(caclAverage(part, total) * 100);
  };

-  auto perf_class = gAutoPerfDump.perf_class();
-
  // PERF: pipeline stalls
  uint64_t sched_idles = 0;
  uint64_t sched_stalls = 0;
@ -319,45 +217,44 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
  uint64_t mem_reads = 0;
  uint64_t mem_writes = 0;
  uint64_t mem_lat = 0;
-#endif

  uint64_t num_cores;
-  RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
-    return _ret;
+  CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
+    return err;
  });

-#ifdef PERF_ENABLE
  uint64_t isa_flags;
-  RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
-    return _ret;
+  CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
+    return err;
  });
+
  bool icache_enable  = isa_flags & VX_ISA_EXT_ICACHE;
  bool dcache_enable  = isa_flags & VX_ISA_EXT_DCACHE;
  bool l2cache_enable = isa_flags & VX_ISA_EXT_L2CACHE;
  bool l3cache_enable = isa_flags & VX_ISA_EXT_L3CACHE;
  bool lmem_enable    = isa_flags & VX_ISA_EXT_LMEM;
-#endif
+
+  auto perf_class = get_profiling_mode();

  for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
    uint64_t cycles_per_core;
-    RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MCYCLE, core_id, &cycles_per_core), {
-      return _ret;
+    CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MCYCLE, core_id, &cycles_per_core), {
+      return err;
    });

    uint64_t instrs_per_core;
-    RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MINSTRET, core_id, &instrs_per_core), {
-      return _ret;
+    CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MINSTRET, core_id, &instrs_per_core), {
+      return err;
    });

-  #ifdef PERF_ENABLE
    switch (perf_class) {
    case VX_DCR_MPM_CLASS_CORE: {
      // PERF: pipeline
      // scheduler idles
      {
        uint64_t sched_idles_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ID, core_id, &sched_idles_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ID, core_id, &sched_idles_per_core), {
+          return err;
        });
        if (num_cores > 1) {
          int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
@ -368,8 +265,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      // scheduler stalls
      {
        uint64_t sched_stalls_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ST, core_id, &sched_stalls_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ST, core_id, &sched_stalls_per_core), {
+          return err;
        });
        if (num_cores > 1) {
          int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
@ -380,8 +277,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      // ibuffer_stalls
      {
        uint64_t ibuffer_stalls_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IBUF_ST, core_id, &ibuffer_stalls_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IBUF_ST, core_id, &ibuffer_stalls_per_core), {
+          return err;
        });
        if (num_cores > 1) {
          int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
@ -392,24 +289,24 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      // issue_stalls
      {
        uint64_t scrb_stalls_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
+          return err;
        });
        uint64_t scrb_alu_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ALU, core_id, &scrb_alu_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ALU, core_id, &scrb_alu_per_core), {
+          return err;
        });
        uint64_t scrb_fpu_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_FPU, core_id, &scrb_fpu_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_FPU, core_id, &scrb_fpu_per_core), {
+          return err;
        });
        uint64_t scrb_lsu_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
+          return err;
        });
        uint64_t scrb_sfu_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
+          return err;
        });
        scrb_alu += scrb_alu_per_core;
        scrb_fpu += scrb_fpu_per_core;
@ -428,16 +325,16 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      // sfu_stalls
      {
        uint64_t scrb_sfu_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
+          return err;
        });
        uint64_t scrb_wctl_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
+          return err;
        });
        uint64_t scrb_csrs_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
+          return err;
        });
        if (num_cores > 1) {
          uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core;
@ -455,15 +352,15 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      // ifetches
      {
        uint64_t ifetches_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCHES, core_id, &ifetches_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCHES, core_id, &ifetches_per_core), {
+          return err;
        });
        if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
        ifetches += ifetches_per_core;

        uint64_t ifetch_lat_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCH_LT, core_id, &ifetch_lat_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCH_LT, core_id, &ifetch_lat_per_core), {
+          return err;
        });
        if (num_cores > 1) {
          int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
@ -474,15 +371,15 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      // loads
      {
        uint64_t loads_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOADS, core_id, &loads_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LOADS, core_id, &loads_per_core), {
+          return err;
        });
        if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
        loads += loads_per_core;

        uint64_t load_lat_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOAD_LT, core_id, &load_lat_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LOAD_LT, core_id, &load_lat_per_core), {
+          return err;
        });
        if (num_cores > 1) {
          int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
@ -493,8 +390,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      // stores
      {
        uint64_t stores_per_core;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_STORES, core_id, &stores_per_core), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_STORES, core_id, &stores_per_core), {
+          return err;
        });
        if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
        stores += stores_per_core;
@ -504,16 +401,16 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      if (lmem_enable) {
        // PERF: lmem
        uint64_t lmem_reads;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_READS, core_id, &lmem_reads), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_READS, core_id, &lmem_reads), {
+          return err;
        });
        uint64_t lmem_writes;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_WRITES, core_id, &lmem_writes), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_WRITES, core_id, &lmem_writes), {
+          return err;
        });
        uint64_t lmem_bank_stalls;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_BANK_ST, core_id, &lmem_bank_stalls), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_BANK_ST, core_id, &lmem_bank_stalls), {
+          return err;
        });
        int lmem_bank_utilization = calcAvgPercent(lmem_reads + lmem_writes, lmem_reads + lmem_writes + lmem_bank_stalls);
        fprintf(stream, "PERF: core%d: lmem reads=%ld\n", core_id, lmem_reads);
@ -524,16 +421,16 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      if (icache_enable) {
        // PERF: Icache
        uint64_t icache_reads;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_READS, core_id, &icache_reads), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_READS, core_id, &icache_reads), {
+          return err;
        });
        uint64_t icache_read_misses;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MISS_R, core_id, &icache_read_misses), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MISS_R, core_id, &icache_read_misses), {
+          return err;
        });
        uint64_t icache_mshr_stalls;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MSHR_ST, core_id, &icache_mshr_stalls), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MSHR_ST, core_id, &icache_mshr_stalls), {
+          return err;
        });
        int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
        int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls);
@ -545,28 +442,28 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      if (dcache_enable) {
        // PERF: Dcache
        uint64_t dcache_reads;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_READS, core_id, &dcache_reads), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_READS, core_id, &dcache_reads), {
+          return err;
        });
        uint64_t dcache_writes;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
+          return err;
        });
        uint64_t dcache_read_misses;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
+          return err;
        });
        uint64_t dcache_write_misses;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_W, core_id, &dcache_write_misses), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_W, core_id, &dcache_write_misses), {
+          return err;
        });
        uint64_t dcache_bank_stalls;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_BANK_ST, core_id, &dcache_bank_stalls), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_BANK_ST, core_id, &dcache_bank_stalls), {
+          return err;
        });
        uint64_t dcache_mshr_stalls;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MSHR_ST, core_id, &dcache_mshr_stalls), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MSHR_ST, core_id, &dcache_mshr_stalls), {
+          return err;
        });
        int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
        int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
@ -583,74 +480,73 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
      if (l2cache_enable) {
        // PERF: L2cache
        uint64_t tmp;
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_READS, core_id, &tmp), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_READS, core_id, &tmp), {
+          return err;
        });
        l2cache_reads += tmp;

-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_WRITES, core_id, &tmp), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_WRITES, core_id, &tmp), {
+          return err;
        });
        l2cache_writes += tmp;

-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_R, core_id, &tmp), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_R, core_id, &tmp), {
+          return err;
        });
        l2cache_read_misses += tmp;

-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_W, core_id, &tmp), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_W, core_id, &tmp), {
+          return err;
        });
        l2cache_write_misses += tmp;

-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_BANK_ST, core_id, &tmp), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_BANK_ST, core_id, &tmp), {
+          return err;
        });
        l2cache_bank_stalls += tmp;

-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MSHR_ST, core_id, &tmp), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MSHR_ST, core_id, &tmp), {
+          return err;
        });
        l2cache_mshr_stalls += tmp;
      }
      if (0 == core_id) {
        if (l3cache_enable) {
          // PERF: L3cache
-          RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_READS, core_id, &l3cache_reads), {
-            return _ret;
+          CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_READS, core_id, &l3cache_reads), {
+            return err;
          });
-          RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_WRITES, core_id, &l3cache_writes), {
-            return _ret;
+          CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_WRITES, core_id, &l3cache_writes), {
+            return err;
          });
-          RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_R, core_id, &l3cache_read_misses), {
-            return _ret;
+          CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_R, core_id, &l3cache_read_misses), {
+            return err;
          });
-          RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_W, core_id, &l3cache_write_misses), {
-            return _ret;
+          CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_W, core_id, &l3cache_write_misses), {
+            return err;
          });
-          RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_BANK_ST, core_id, &l3cache_bank_stalls), {
-            return _ret;
+          CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_BANK_ST, core_id, &l3cache_bank_stalls), {
+            return err;
          });
-          RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MSHR_ST, core_id, &l3cache_mshr_stalls), {
-            return _ret;
+          CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MSHR_ST, core_id, &l3cache_mshr_stalls), {
+            return err;
          });
        }
        // PERF: memory
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_READS, core_id, &mem_reads), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_READS, core_id, &mem_reads), {
+          return err;
        });
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_WRITES, core_id, &mem_writes), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_WRITES, core_id, &mem_writes), {
+          return err;
        });
-        RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), {
-          return _ret;
+        CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), {
+          return err;
        });
      }
    } break;
    default:
      break;
    }
-  #endif

    float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
    if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
@ -659,7 +555,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
    max_cycles = std::max<uint64_t>(cycles_per_core, max_cycles);
  }

-#ifdef PERF_ENABLE
  switch (perf_class) {
  case VX_DCR_MPM_CLASS_CORE: {
    int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
@ -728,7 +623,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
  default:
    break;
  }
-#endif

  float IPC = (float)(double(total_instrs) / double(max_cycles));
  fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
@ -741,11 +635,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
 int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_barriers, uint32_t* max_localmem) {
   // check group size
  uint64_t warps_per_core, threads_per_warp;
-  RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_WARPS, &warps_per_core), {
-    return _ret;
+  CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_WARPS, &warps_per_core), {
+    return err;
  });
-  RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
-    return _ret;
+  CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
+    return err;
  });
  uint32_t threads_per_core = warps_per_core * threads_per_warp;
  if (group_size > threads_per_core) {
@ -760,8 +654,8 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
  // check barriers capacity
  if (max_barriers) {
    uint64_t num_barriers;
-    RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_BARRIERS, &num_barriers), {
-      return _ret;
+    CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_BARRIERS, &num_barriers), {
+      return err;
    });
    if (warps_per_group < 2) {
      *max_barriers = -1;
@ -773,8 +667,8 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
  // check local memory capacity
  if (max_localmem) {
    uint64_t local_mem_size;
-    RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size), {
-      return _ret;
+    CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size), {
+      return err;
    });
    *max_localmem = local_mem_size / groups_per_core;
  }
--- a/runtime/stub/vortex.cpp
+++ b/runtime/stub/vortex.cpp
@ -11,7 +11,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <callbacks.h>
+#include <common.h>
+
 #include <unistd.h>
 #include <string.h>
 #include <string>
@ -19,12 +20,42 @@
 #include <dlfcn.h>
 #include <iostream>

+// Forward declaration only; no definition is visible in this diff.
+// vx_start() calls it and, when the result is non-zero, writes that value
+// to the VX_DCR_BASE_MPM_CLASS register before launching the kernel.
+int get_profiling_mode();
+
+// Writes initial values to the device configuration registers (DCRs) of
+// `hdevice`: the startup address, both startup arguments, and the MPM class.
+//
+// Returns 0 when every write succeeds. On the first vx_dcr_write() that
+// returns non-zero, CHECK_ERR prints the failing expression and this
+// function returns that error code; the remaining registers are not written.
+static int dcr_initialize(vx_device_h hdevice) {
+  const uint64_t startup_addr(STARTUP_ADDR);
+
+  // vx_dcr_write() takes a 32-bit value, so the 64-bit startup address is
+  // split: low 32 bits go to ADDR0 ...
+  CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
+    return err;
+  });
+
+  // ... and the high 32 bits go to ADDR1.
+  CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
+    return err;
+  });
+
+  // Both startup-argument registers are cleared to 0.
+  CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, 0), {
+    return err;
+  });
+
+  CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, 0), {
+    return err;
+  });
+
+  // MPM class starts at 0; vx_start() overwrites it when
+  // get_profiling_mode() returns a non-zero value.
+  CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
+    return err;
+  });
+
+  return 0;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
 static callbacks_t g_callbacks;
 static void* g_drv_handle = nullptr;

 typedef int (*vx_dev_init_t)(callbacks_t*);

-int vx_dev_open(vx_device_h* hdevice) {
+extern int vx_dev_open(vx_device_h* hdevice) {
    {
        const char* driverName = getenv("VORTEX_DRIVER");
        if (driverName == nullptr) {
@ -50,67 +81,86 @@ int vx_dev_open(vx_device_h* hdevice) {
        g_drv_handle = handle;
    }

-    return (g_callbacks.dev_open)(hdevice);
+    vx_device_h _hdevice;
+
+    CHECK_ERR((g_callbacks.dev_open)(&_hdevice), {
+        return err;
+    });
+
+    CHECK_ERR(dcr_initialize(_hdevice), {
+        return err;
+    });
+
+    *hdevice = _hdevice;
+
+    return 0;
 }

-int vx_dev_close(vx_device_h hdevice) {
+// Closes `hdevice` and unloads the driver library.
+//
+// Order of operations:
+//   1. vx_dump_perf() prints the performance report to stdout (its return
+//      value is not checked).
+//   2. The `dev_close` entry of g_callbacks is invoked on the device.
+//   3. dlclose() is called on g_drv_handle.
+// Returns the result of the `dev_close` callback.
+//
+// NOTE(review): g_drv_handle is left unchanged after dlclose(); confirm that
+// nothing reuses it before the next vx_dev_open() assigns a new handle.
+extern int vx_dev_close(vx_device_h hdevice) {
+    vx_dump_perf(hdevice, stdout);
    int ret = (g_callbacks.dev_close)(hdevice);
    dlclose(g_drv_handle);
    return ret;
 }

-int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t* value) {
+// Thin dispatcher: forwards all arguments to the `dev_caps` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t* value) {
    return (g_callbacks.dev_caps)(hdevice, caps_id, value);
 }

-int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) {
+// Thin dispatcher: forwards all arguments to the `mem_alloc` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) {
    return (g_callbacks.mem_alloc)(hdevice, size, flags, hbuffer);
 }

-int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) {
+// Thin dispatcher: forwards all arguments to the `mem_reserve` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) {
    return (g_callbacks.mem_reserve)(hdevice, address, size, flags, hbuffer);
 }

-int vx_mem_free(vx_buffer_h hbuffer) {
+// Thin dispatcher: forwards `hbuffer` to the `mem_free` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_mem_free(vx_buffer_h hbuffer) {
    return (g_callbacks.mem_free)(hbuffer);
 }

-int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) {
+// Thin dispatcher: forwards all arguments to the `mem_access` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) {
    return (g_callbacks.mem_access)(hbuffer, offset, size, flags);
 }

-int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address) {
+// Thin dispatcher: forwards all arguments to the `mem_address` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address) {
    return (g_callbacks.mem_address)(hbuffer, address);
 }

-int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) {
+// Thin dispatcher: forwards all arguments to the `mem_info` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) {
    return (g_callbacks.mem_info)(hdevice, mem_free, mem_used);
 }

-int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) {
+// Thin dispatcher: forwards all arguments to the `copy_to_dev` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) {
    return (g_callbacks.copy_to_dev)(hbuffer, host_ptr, dst_offset, size);
 }

-int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
+// Thin dispatcher: forwards all arguments to the `copy_from_dev` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
    return (g_callbacks.copy_from_dev)(host_ptr, hbuffer, src_offset, size);
 }

-int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
+// Starts execution through the `start` entry of g_callbacks.
+//
+// Before dispatching, the profiling mode is queried; when it is non-zero it
+// is written to the VX_DCR_BASE_MPM_CLASS register. If that write fails,
+// CHECK_ERR prints the failing expression and the error code is returned
+// without invoking the `start` callback. Otherwise the callback's result is
+// returned unchanged.
+extern int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
+    int profiling_mode = get_profiling_mode();
+    // A mode of 0 leaves the register untouched.
+    if (profiling_mode != 0) {
+        CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, profiling_mode), {
+            return err;
+        });
+    }
    return (g_callbacks.start)(hdevice, hkernel, harguments);
 }

-int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
+// Thin dispatcher: forwards all arguments to the `ready_wait` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
    return (g_callbacks.ready_wait)(hdevice, timeout);
 }

-int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
+// Thin dispatcher: forwards all arguments to the `dcr_read` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
    return (g_callbacks.dcr_read)(hdevice, addr, value);
 }

-int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
+// Thin dispatcher: forwards all arguments to the `dcr_write` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
    return (g_callbacks.dcr_write)(hdevice, addr, value);
 }

-int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
+// Thin dispatcher: forwards all arguments to the `mpm_query` entry of
+// g_callbacks and returns its result unchanged.
+extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
    return (g_callbacks.mpm_query)(hdevice, addr, core_id, value);
 }
--- a/runtime/xrt/Makefile
+++ b/runtime/xrt/Makefile
@ -13,7 +13,7 @@ CXXFLAGS += -fPIC
 LDFLAGS += -shared -pthread
 LDFLAGS += -L$(XILINX_XRT)/lib

-SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp $(SIM_DIR)/common/util.cpp
+SRCS := $(SRC_DIR)/vortex.cpp $(SIM_DIR)/common/util.cpp

 # set up target types
 ifeq ($(TARGET), xrtsim)
--- a/runtime/xrt/vortex.cpp
+++ b/runtime/xrt/vortex.cpp
@ -11,16 +11,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-#include <malloc.h>
-#include <utils.h>
-#include <VX_config.h>
-#include <VX_types.h>
-#include <stdarg.h>
-#include <util.h>
-#include <limits>
-#include <vector>
-#include <string>
-#include <unordered_map>
+#include <common.h>

 #ifdef SCOPE
 #include "scope.h"
@ -38,7 +29,12 @@
 #include <fpga.h>
 #endif

-#include <callbacks.h>
+#include <stdarg.h>
+#include <util.h>
+#include <limits>
+#include <vector>
+#include <string>
+#include <unordered_map>

 using namespace vortex;

@ -91,20 +87,12 @@ static const platform_info_t g_platforms [] = {

 #endif

-#define RAM_PAGE_SIZE 4096
-
 #define DEFAULT_DEVICE_INDEX 0

 #define DEFAULT_XCLBIN_PATH "vortex_afu.xclbin"

 #define KERNEL_NAME "vortex_afu"

-#ifndef NDEBUG
-#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
-#else
-#define DBGPRINT(format, ...) ((void)0)
-#endif
-
 #define CHECK_HANDLE(handle, _expr, _cleanup)   \
    auto handle = _expr;                        \
    if (handle == nullptr) {                    \
@ -112,15 +100,6 @@ static const platform_info_t g_platforms [] = {
        _cleanup                                \
    }

-#define CHECK_ERR(_expr, _cleanup)              \
-    do {                                        \
-        auto err = _expr;                       \
-        if (err == 0)                           \
-            break;                              \
-        printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
-        _cleanup                                \
-    } while (false)
-
 #ifndef CPP_API

 static void dump_xrt_error(xrtDeviceHandle xrtDevice, xrtErrorCode err) {
@ -164,7 +143,6 @@ public:
 #ifndef CPP_API

    ~vx_device() {
-        profiling_remove(profiling_id_);
        for (auto& entry : xrtBuffers_) {
        #ifdef BANK_INTERLEAVE
            xrtBOFree(entry);
@ -238,12 +216,6 @@ public:
        }
    #endif

-        CHECK_ERR(dcr_initialize(this), {
-            return err;
-        });
-
-        profiling_id_ = profiling_add(this);
-
        return 0;
    }

@ -522,8 +494,6 @@ public:
            return err;
        });

-        profiling_begin(profiling_id_);
-
        // start execution
        CHECK_ERR(this->write_register(MMIO_CTL_ADDR, CTL_AP_START), {
            return err;
@ -563,8 +533,6 @@ public:
            timeout -= sleep_time_ms;
        };

-        profiling_end(profiling_id_);
-
        return 0;
    }

@ -608,7 +576,6 @@ private:
    uint64_t global_mem_size_;
    DeviceConfig dcrs_;
    std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
-    int profiling_id_;

 #ifdef BANK_INTERLEAVE