using ramulator dram simulator

2025-04-24 05:47:35 -04:00 · 2021-12-06 01:22:45 -05:00 · 2021-12-06 01:22:45 -05:00 · b741807f8c
commit b741807f8c
parent 59232642c4
33 changed files with 1473 additions and 1344 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -7,3 +7,6 @@
 [submodule "third_party/cocogfx"]
 	path = third_party/cocogfx
 	url = https://github.com/gtcasl/cocogfx.git
+[submodule "third_party/ramulator"]
+	path = third_party/ramulator
+	url = https://github.com/CMU-SAFARI/ramulator.git
--- a/ci/regression.sh
+++ b/ci/regression.sh
@ -102,7 +102,7 @@ FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
 AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo

 # adjust l1 block size to match l2
-CONFIGS="-DMEM_BLOCK_SIZE=16 -DL1_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
+CONFIGS="-DL1_BLOCK_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"

 # test cache banking
 CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
@ -119,18 +119,12 @@ CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=simx --cores
 # test 128-bit MEM block
 CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo

-# test 128-bit MEM and DRAM block
-CONFIGS="-DMEM_BLOCK_SIZE=16 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
+# test single-bank DRAM
+CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo

 # test 27-bit DRAM address
 CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo

-# test 128-bit DRAM block
-CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
-
-# test long memory latency
-CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
-
 echo "configuration tests done!"
 }

--- a/driver/rtlsim/Makefile
+++ b/driver/rtlsim/Makefile
@ -5,8 +5,6 @@ CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors

 CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common

-LDFLAGS += $(RTLSIM_DIR)/librtlsim.a
-
 # Position independent code
 CXXFLAGS += -fPIC

@ -17,6 +15,7 @@ CXXFLAGS += $(CONFIGS)
 CXXFLAGS += -DDUMP_PERF_STATS

 LDFLAGS += -shared -pthread
+LDFLAGS += -L. -lrtlsim

 SRCS = vortex.cpp ../common/vx_utils.cpp

@ -30,9 +29,9 @@ PROJECT = libvortex.so
 all: $(PROJECT)
 	
 $(PROJECT): $(SRCS)
-	$(MAKE) -C $(RTLSIM_DIR) static
+	DESTDIR=../../driver/rtlsim $(MAKE) -C $(RTLSIM_DIR) ../../driver/rtlsim/librtlsim.so
 	$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT)

 clean:
-	$(MAKE) -C $(RTLSIM_DIR) clean-static
+	DESTDIR=../../driver/rtlsim $(MAKE) -C $(RTLSIM_DIR) clean
 	rm -rf $(PROJECT) *.o
--- a/driver/rtlsim/vortex.cpp
+++ b/driver/rtlsim/vortex.cpp
@ -4,6 +4,7 @@
 #include <assert.h>
 #include <iostream>
 #include <future>
+#include <list>
 #include <chrono>

 #include <vortex.h>
@ -11,7 +12,7 @@
 #include <VX_config.h>
 #include <mem.h>
 #include <util.h>
-#include <simulator.h>
+#include <processor.h>

 #define RAM_PAGE_SIZE 4096

@ -60,7 +61,9 @@ public:
    vx_device() 
        : ram_(RAM_PAGE_SIZE)
        , mem_allocation_(ALLOC_BASE_ADDR) 
-    {}
+    {
+        processor_.attach_ram(&ram_);
+    }

    ~vx_device() {    
        if (future_.valid()) {
@ -121,12 +124,9 @@ public:
            future_.wait();
        }
        // start new run
-        simulator_.attach_ram(&ram_);
        future_ = std::async(std::launch::async, [&]{             
-            simulator_.reset();        
-            while (simulator_.is_busy()) {
-                simulator_.step();
-            }
+            processor_.reset();            
+            processor_.run();
        });
        return 0;
    }
@ -149,7 +149,7 @@ public:
 private:

    RAM ram_;
-    Simulator simulator_;
+    Processor processor_;
    uint64_t mem_allocation_;     
    std::future<void> future_;
 };
--- a/driver/simx/Makefile
+++ b/driver/simx/Makefile
@ -9,7 +9,7 @@ CXXFLAGS += $(CONFIGS)
 CXXFLAGS += -DDUMP_PERF_STATS

 LDFLAGS += -shared -pthread
-LDFLAGS += $(SIMX_DIR)/libsimx.a
+LDFLAGS += -L. -lsimx

 SRCS = vortex.cpp ../common/vx_utils.cpp 

@ -18,9 +18,9 @@ PROJECT = libvortex.so
 all: $(PROJECT)

 $(PROJECT): $(SRCS)
-	$(MAKE) -C $(SIMX_DIR) static
+	DESTDIR=../../driver/simx $(MAKE) -C $(SIMX_DIR) ../../driver/simx/libsimx.so
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

 clean:
-	$(MAKE) -C $(SIMX_DIR) clean-static
-	rm -rf $(PROJECT) *.o
+	DESTDIR=../../driver/simx $(MAKE) -C $(SIMX_DIR) clean
+	rm -rf libsimx.so $(PROJECT) *.o
--- a/driver/simx/vortex.cpp
+++ b/driver/simx/vortex.cpp
@ -60,7 +60,13 @@ public:
        : arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS)
        , ram_(RAM_PAGE_SIZE)
        , mem_allocation_(ALLOC_BASE_ADDR)
-    {}
+    {
+        // setup memory simulator
+        memsim_ = MemSim::Create(MemSim::Config{
+            DRAM_CHANNELS,
+            arch_.num_cores()
+        });
+    }

    ~vx_device() {
        if (future_.valid()) {
@ -113,13 +119,33 @@ public:
        if (future_.valid()) {
            future_.wait();
        }
+        
        // start new run
-        SimPlatform::instance().flush();
-        processor_ = std::make_shared<Processor>(arch_);
-        processor_->attach_ram(&ram_);
        future_ = std::async(std::launch::async, [&]{
-            processor_->run();
+            if (processor_) {                
+                // release current processor instance
+                processor_->MemReqPort.unbind();
+                memsim_->MemRspPort.unbind();
+                SimPlatform::instance().release_object(processor_);
+            }
+
+            // create new processor instance
+            processor_ = Processor::Create(arch_);
+            processor_->MemReqPort.bind(&memsim_->MemReqPort);
+            memsim_->MemRspPort.bind(&processor_->MemRspPort);
+
+            // attach memory object
+            processor_->attach_ram(&ram_);
+
+            // run simulation
+            int exitcode;   
+            for (;;) {
+                SimPlatform::instance().step();
+                if (processor_->check_exit(&exitcode))
+                    break;
+            };
        });
+        
        return 0;
    }

@ -141,6 +167,7 @@ public:
 private:
    ArchDef arch_;
    RAM ram_;
+    MemSim::Ptr memsim_;
    Processor::Ptr processor_;
    uint64_t mem_allocation_;        
    std::future<void> future_;
--- a/driver/vlsim/Makefile
+++ b/driver/vlsim/Makefile
@ -9,8 +9,6 @@ CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors

 CXXFLAGS += -I. -I../include -I../../hw -I$(VLSIM_DIR)

-LDFLAGS += $(VLSIM_DIR)/libopae-c-vlsim.a
-
 # Position independent code
 CXXFLAGS += -fPIC

@ -21,6 +19,7 @@ CXXFLAGS += $(CONFIGS)
 CXXFLAGS += -DDUMP_PERF_STATS

 LDFLAGS += -shared -pthread
+LDFLAGS += -L. -lopae-c-vlsim

 SRCS = ../common/opae.cpp ../common/vx_utils.cpp

@ -47,9 +46,9 @@ scope-defs.h: $(SCRIPT_DIR)/scope.json
 scope: scope-defs.h

 $(PROJECT): $(SRCS) $(SCOPE_H)
-	$(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static
+	DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) ../../driver/vlsim/libopae-c-vlsim.so
 	$(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT)

 clean:
-	$(MAKE) -C $(VLSIM_DIR) clean-static
-	rm -rf $(PROJECT) *.o scope-defs.h
+	DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) clean
+	rm -rf libopae-c-vlsim.so $(PROJECT) *.o scope-defs.h
--- a/sim/common/simobject.h
+++ b/sim/common/simobject.h
@ -51,8 +51,7 @@ public:
    peer_ = peer;
  }

-  void unbind() {    
-    assert(peer_ == nullptr);
+  void unbind() {
    peer_ = nullptr;
  }

@ -292,12 +291,16 @@ public:
  }

  template <typename Impl, typename... Args>
-  typename SimObject<Impl>::Ptr CreateObject(Args&&... args) {
+  typename SimObject<Impl>::Ptr create_object(Args&&... args) {
    auto obj = std::make_shared<Impl>(SimContext{}, std::forward<Args>(args)...);
    objects_.push_back(obj);
    return obj;
  }

+  void release_object(const SimObjectBase::Ptr& object) {
+    objects_.remove(object);
+  }
+
  template <typename Pkt>
  void schedule(const typename SimCallEvent<Pkt>::Func& callback,
                const Pkt& pkt, 
@ -352,7 +355,7 @@ private:
    events_.emplace_back(evt);
  }

-  std::vector<SimObjectBase::Ptr> objects_;
+  std::list<SimObjectBase::Ptr> objects_;
  std::list<SimEventBase::Ptr> events_;
  uint64_t cycles_;

@ -369,7 +372,7 @@ inline SimObjectBase::SimObjectBase(const SimContext&, const char* name)
 template <typename Impl>
 template <typename... Args>
 typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
-  return SimPlatform::instance().CreateObject<Impl>(std::forward<Args>(args)...);
+  return SimPlatform::instance().create_object<Impl>(std::forward<Args>(args)...);
 }

 template <typename Pkt>
--- a/sim/common/texturing.h
+++ b/sim/common/texturing.h
@ -1,7 +1,7 @@
 #pragma once

 #include <cstdint>
-#include <fixed.h>
+#include <cocogfx/include/fixed.h>
 #include <bitmanip.h>

 using namespace cocogfx;
--- a/sim/common/util.h
+++ b/sim/common/util.h
@ -11,4 +11,42 @@ void unused(Args&&...) {}
 #define __unused(...) unused(__VA_ARGS__)

 // return file extension
-const char* fileExtension(const char* filepath);
+const char* fileExtension(const char* filepath);
+
+// Portable helpers for locally silencing compiler warnings.
+// Intended use: DISABLE_WARNING_PUSH, one or more DISABLE_WARNING_<kind>
+// macros, the offending code, then DISABLE_WARNING_POP.
+// Every branch below defines the same set of macros.
+#if defined(_MSC_VER)
+#define DISABLE_WARNING_PUSH __pragma(warning(push))
+#define DISABLE_WARNING_POP __pragma(warning(pop))
+#define DISABLE_WARNING_UNUSED_PARAMETER \
+  __pragma(warning(disable : 4100))
+#define DISABLE_WARNING_UNREFERENCED_FUNCTION __pragma(warning(disable : 4505))
+#define DISABLE_WARNING_ANONYMOUS_STRUCT __pragma(warning(disable : 4201))
+#define DISABLE_WARNING_UNUSED_VARIABLE __pragma(warning(disable : 4189))
+#elif defined(__clang__)
+// clang also defines __GNUC__, so it has to be tested before the GCC branch;
+// otherwise this branch can never be selected.
+#define DISABLE_WARNING_PUSH _Pragma("clang diagnostic push")
+#define DISABLE_WARNING_POP _Pragma("clang diagnostic pop")
+#define DISABLE_WARNING_UNUSED_PARAMETER \
+  _Pragma("clang diagnostic ignored \"-Wunused-parameter\"")
+#define DISABLE_WARNING_UNREFERENCED_FUNCTION \
+  _Pragma("clang diagnostic ignored \"-Wunused-function\"")
+#define DISABLE_WARNING_ANONYMOUS_STRUCT \
+  _Pragma("clang diagnostic ignored \"-Wgnu-anonymous-struct\"")
+#define DISABLE_WARNING_UNUSED_VARIABLE \
+  _Pragma("clang diagnostic ignored \"-Wunused-but-set-variable\"")
+#elif defined(__GNUC__)
+#define DISABLE_WARNING_PUSH _Pragma("GCC diagnostic push")
+#define DISABLE_WARNING_POP _Pragma("GCC diagnostic pop")
+#define DISABLE_WARNING_UNUSED_PARAMETER \
+  _Pragma("GCC diagnostic ignored \"-Wunused-parameter\"")
+#define DISABLE_WARNING_UNREFERENCED_FUNCTION \
+  _Pragma("GCC diagnostic ignored \"-Wunused-function\"")
+#define DISABLE_WARNING_ANONYMOUS_STRUCT \
+  _Pragma("GCC diagnostic ignored \"-Wpedantic\"")
+#define DISABLE_WARNING_UNUSED_VARIABLE \
+  _Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"")
+#else
+// unknown compiler: every macro expands to nothing
+#define DISABLE_WARNING_PUSH
+#define DISABLE_WARNING_POP
+#define DISABLE_WARNING_UNUSED_PARAMETER
+#define DISABLE_WARNING_UNREFERENCED_FUNCTION
+#define DISABLE_WARNING_ANONYMOUS_STRUCT
+#define DISABLE_WARNING_UNUSED_VARIABLE
+#endif
--- a/sim/rtlsim/Makefile
+++ b/sim/rtlsim/Makefile
@ -1,3 +1,4 @@
+DESTDIR ?= .
 RTL_DIR = ../../hw/rtl
 DPI_DIR = ../../hw/dpi
 THIRD_PARTY_DIR = ../../third_party
@ -6,8 +7,10 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
 CXXFLAGS += -fPIC -Wno-maybe-uninitialized
 CXXFLAGS += -I../../../hw -I../../common
 CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
+CXXFLAGS += -I../$(THIRD_PARTY_DIR)

 LDFLAGS += ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
+LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator

 # control RTL debug tracing states
 DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE  
@ -31,7 +34,7 @@ RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interface

 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
 SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
-SRCS += main.cpp simulator.cpp
+SRCS += processor.cpp

 ifdef AXI_BUS
 	TOP = Vortex_axi
@ -86,15 +89,11 @@ PROJECT = rtlsim

 all: $(PROJECT)

-$(PROJECT): $(SRCS)
-	verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
+# standalone simulator executable.
+# $^ already expands to every prerequisite ($(SRCS) plus main.cpp), so the
+# sources must not be listed a second time on the verilator command line.
+$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
+	verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$@
 	
-static: $(SRCS)
-	verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)'
-	$(AR) rcs lib$(PROJECT).a obj_dir/*.o $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o
+$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
+	verilator --build $(VL_FLAGS) $^ -CFLAGS '$(CXXFLAGS)' -LDFLAGS '-shared $(LDFLAGS)' -o ../$@

-clean-static:
-	rm -rf lib$(PROJECT).a obj_dir
-
-clean: clean-static
-	rm -rf $(PROJECT)
+clean:
+	rm -rf obj_dir $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
--- a/sim/rtlsim/main.cpp
+++ b/sim/rtlsim/main.cpp
@ -5,7 +5,8 @@
 #include <unistd.h>
 #include <util.h>
 #include <mem.h>
-#include "simulator.h"
+#include <VX_config.h>
+#include "processor.h"

 #define RAM_PAGE_SIZE 4096

@ -52,8 +53,8 @@ int main(int argc, char **argv) {
 		std::cout << "Running " << program << "..." << std::endl;

 		vortex::RAM ram(RAM_PAGE_SIZE);
-		vortex::Simulator simulator;
-		simulator.attach_ram(&ram);
+		vortex::Processor processor;
+		processor.attach_ram(&ram);

 		std::string program_ext(fileExtension(program));
 		if (program_ext == "bin") {
@ -65,7 +66,7 @@ int main(int argc, char **argv) {
 			return -1;
 		}

-		exitcode = simulator.run();
+		exitcode = processor.run();
 		
 		if (riscv_test) {
 			if (1 == exitcode) {
--- a/sim/rtlsim/processor.cpp
+++ b/sim/rtlsim/processor.cpp
@ -0,0 +1,599 @@
+#include "processor.h"
+
+#include <verilated.h>
+
+#ifdef AXI_BUS
+#include "VVortex_axi.h"
+#include "VVortex_axi__Syms.h"
+#else
+#include "VVortex.h"
+#include "VVortex__Syms.h"
+#endif
+
+#ifdef VCD_OUTPUT
+#include <verilated_vcd_c.h>
+#endif
+
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+#include <mem.h>
+
+#include <VX_config.h>
+#include <ostream>
+#include <list>
+#include <vector>
+#include <sstream> 
+#include <unordered_map>
+
+#define RAMULATOR
+#include <ramulator/src/Gem5Wrapper.h>
+#include <ramulator/src/Request.h>
+#include <ramulator/src/Statistics.h>
+
+#ifndef MEMORY_BANKS
+  #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
+    #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS
+  #else
+    #define MEMORY_BANKS 2
+  #endif
+#endif
+
+#define ENABLE_MEM_STALLS
+
+#ifndef TRACE_START_TIME
+#define TRACE_START_TIME 0ull
+#endif
+
+#ifndef TRACE_STOP_TIME
+#define TRACE_STOP_TIME -1ull
+#endif
+
+#ifndef VERILATOR_RESET_VALUE
+#define VERILATOR_RESET_VALUE 2
+#endif
+
+#define VL_WDATA_GETW(lwp, i, n, w) \
+  VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w)
+
+using namespace vortex;
+
+static uint64_t timestamp = 0;
+
+double sc_time_stamp() { 
+  return timestamp;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+static bool trace_enabled = false;
+static uint64_t trace_start_time = TRACE_START_TIME;
+static uint64_t trace_stop_time = TRACE_STOP_TIME;
+
+// Reports whether tracing is active: always inside the
+// [trace_start_time, trace_stop_time) window, otherwise whatever was last
+// requested through sim_trace_enable().
+bool sim_trace_enabled() {
+  const bool inside_window = (timestamp >= trace_start_time)
+                          && (timestamp < trace_stop_time);
+  return inside_window || trace_enabled;
+}
+
+void sim_trace_enable(bool enable) {
+  trace_enabled = enable;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+class Processor::Impl {
+public:
+  Impl() {
+    // force random values for uninitialized signals
+    Verilated::randReset(VERILATOR_RESET_VALUE);
+    Verilated::randSeed(50);
+
+    // turn off assertion before reset
+    Verilated::assertOn(false);
+
+    // create RTL module instance
+  #ifdef AXI_BUS
+    device_ = new VVortex_axi();
+  #else
+    device_ = new VVortex();
+  #endif
+
+  #ifdef VCD_OUTPUT
+    Verilated::traceEverOn(true);
+    trace_ = new VerilatedVcdC();
+    device_->trace(trace_, 99);
+    trace_->open("trace.vcd");
+  #endif
+
+    ram_ = nullptr;
+    
+    // initialize dram simulator
+    ramulator::Config ram_config;
+    ram_config.add("standard", "DDR4");
+    ram_config.add("channels", std::to_string(MEMORY_BANKS));
+    ram_config.add("ranks", "1");
+    ram_config.add("speed", "DDR4_2400R");
+    ram_config.add("org", "DDR4_4Gb_x8");
+    ram_config.add("mapping", "defaultmapping");
+    ram_config.set_core_num(1);
+    dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE);
+    Stats::statlist.output("ramulator.ddr4.log");
+
+    // reset the device
+    this->reset();
+  }
+
+  // Flushes pending console output, closes the VCD trace (when enabled) and
+  // releases the RTL model, the DRAM simulator and any memory requests that
+  // are still queued.
+  ~Impl() {
+    // print console text that never received its terminating newline
+    for (auto& buf : print_bufs_) {
+      auto str = buf.second.str();
+      if (!str.empty()) {
+        std::cout << "#" << buf.first << ": " << str << std::endl;
+      }
+    }
+
+  #ifdef VCD_OUTPUT
+    trace_->close();
+    delete trace_;
+  #endif
+    
+    delete device_;
+    
+    if (dram_) {
+      dram_->finish();
+      Stats::statlist.printall();
+      delete dram_;
+    }
+
+    // requests are allocated with new when the device issues them and are
+    // only freed once their response is sent; free the ones left in the queue.
+    // Done after the DRAM simulator is destroyed so that its read-completion
+    // callbacks, which hold these pointers, can no longer run.
+    for (auto mem_req : pending_mem_reqs_) {
+      delete mem_req;
+    }
+    pending_mem_reqs_.clear();
+  }
+
+  void attach_ram(RAM* ram) {
+    ram_ = ram;
+  }
+
+  void reset() { 
+    print_bufs_.clear();
+
+    pending_mem_reqs_.clear();
+    
+    mem_rd_rsp_active_ = false;
+    mem_wr_rsp_active_ = false;
+
+  #ifdef AXI_BUS
+    this->reset_axi_bus();
+  #else
+    this->reset_avs_bus();
+  #endif
+
+    device_->reset = 1;
+
+    for (int i = 0; i < RESET_DELAY; ++i) {
+      device_->clk = 0;
+      this->eval();
+      device_->clk = 1;
+      this->eval();
+    }  
+
+    device_->reset = 0;
+    
+    // Turn on assertion after reset
+    Verilated::assertOn(true);
+  }
+
+  int run() {
+    int exitcode = 0;
+
+  #ifndef NDEBUG
+    std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
+  #endif
+
+    // execute program
+    while (device_->busy) {
+      if (get_ebreak()) {
+        exitcode = get_last_wb_value(3);
+        break;  
+      }
+      this->step();
+    }
+
+    // wait 5 cycles to flush the pipeline
+    this->wait(5);  
+
+    return exitcode;
+  }
+
+private:
+
+  void step() {
+
+    device_->clk = 0;
+    this->eval();
+
+  #ifdef AXI_BUS
+    this->eval_axi_bus(0);
+  #else
+    this->eval_avs_bus(0);
+  #endif
+
+    device_->clk = 1;
+    this->eval();
+      
+  #ifdef AXI_BUS
+    this->eval_axi_bus(1);
+  #else
+    this->eval_avs_bus(1);
+  #endif
+
+    dram_->tick();
+
+  #ifndef NDEBUG
+    fflush(stdout);
+  #endif
+  }
+
+  void eval() {
+    device_->eval();
+  #ifdef VCD_OUTPUT
+    if (sim_trace_enabled()) {
+      trace_->dump(timestamp);
+    }
+  #endif
+    ++timestamp;
+  }
+
+#ifdef AXI_BUS
+
+  void reset_axi_bus() {    
+    device_->m_axi_wready  = 0;
+    device_->m_axi_awready = 0;
+    device_->m_axi_arready = 0;  
+    device_->m_axi_rvalid  = 0;
+    device_->m_axi_bvalid  = 0;
+  }
+    
+  void eval_axi_bus(bool clk) {
+    if (!clk) {
+      mem_rd_rsp_ready_ = device_->m_axi_rready;
+      mem_wr_rsp_ready_ = device_->m_axi_bready;
+      return;
+    }
+
+    if (ram_ == nullptr) {
+      device_->m_axi_wready  = 0;
+      device_->m_axi_awready = 0;
+      device_->m_axi_arready = 0;  
+      return;
+    }
+
+    // process memory responses
+    if (mem_rd_rsp_active_
+    && device_->m_axi_rvalid && mem_rd_rsp_ready_) {
+      mem_rd_rsp_active_ = false;
+    }    
+    if (!mem_rd_rsp_active_) {      
+      if (!pending_mem_reqs_.empty()
+       && (*pending_mem_reqs_.begin())->ready 
+       && !(*pending_mem_reqs_.begin())->write) {      
+        auto mem_rsp_it = pending_mem_reqs_.begin();
+        auto mem_req = *mem_rsp_it;
+        /*
+          printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
+          }
+          printf("\n");
+        */      
+        device_->m_axi_rvalid = 1;
+        device_->m_axi_rid    = mem_req->tag;   
+        device_->m_axi_rresp  = 0;
+        device_->m_axi_rlast  = 1;
+        memcpy((uint8_t*)device_->m_axi_rdata, mem_req->block.data(), MEM_BLOCK_SIZE);
+        pending_mem_reqs_.erase(mem_rsp_it);
+        mem_rd_rsp_active_ = true;
+        delete mem_req;
+      } else {
+        device_->m_axi_rvalid = 0;
+      }
+    }
+
+    // send memory write response  
+    if (mem_wr_rsp_active_
+    && device_->m_axi_bvalid && mem_wr_rsp_ready_) {
+      mem_wr_rsp_active_ = false;
+    }
+    if (!mem_wr_rsp_active_) {
+      if (!pending_mem_reqs_.empty()
+       && (*pending_mem_reqs_.begin())->ready 
+       && (*pending_mem_reqs_.begin())->write) {
+        auto mem_rsp_it = pending_mem_reqs_.begin();
+        auto mem_req = *mem_rsp_it;
+        /*
+          printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_req->addr);        
+        */
+        device_->m_axi_bvalid = 1;      
+        device_->m_axi_bid    = mem_req->tag;
+        device_->m_axi_bresp  = 0;
+        pending_mem_reqs_.erase(mem_rsp_it);        
+        mem_wr_rsp_active_ = true;
+        delete mem_req;
+      } else {
+        device_->m_axi_bvalid = 0;
+      }      
+    }
+
+    // select the memory bank
+    uint32_t req_addr = device_->m_axi_wvalid ? device_->m_axi_awaddr : device_->m_axi_araddr;
+    
+    // process memory requests
+    if (device_->m_axi_wvalid || device_->m_axi_arvalid) {
+      if (device_->m_axi_wvalid) {        
+        uint64_t byteen = device_->m_axi_wstrb;
+        unsigned base_addr = device_->m_axi_awaddr;
+        uint8_t* data = (uint8_t*)(device_->m_axi_wdata);
+
+        // check console output
+        if (base_addr >= IO_COUT_ADDR 
+        && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {          
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            if ((byteen >> i) & 0x1) {            
+              auto& ss_buf = print_bufs_[i];
+              char c = data[i];
+              ss_buf << c;
+              if (c == '\n') {
+                std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
+                ss_buf.str("");
+              }
+            }
+          }   
+        } else {
+          /*
+            printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen);
+            for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+              printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
+            }
+            printf("\n");
+          */
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            if ((byteen >> i) & 0x1) {            
+              (*ram_)[base_addr + i] = data[i];
+            }
+          }  
+
+          auto mem_req = new mem_req_t();
+          mem_req->tag   = device_->m_axi_awid;
+          mem_req->addr  = device_->m_axi_awaddr;        
+          mem_req->write = true;
+          mem_req->ready = true;
+          pending_mem_reqs_.emplace_back(mem_req);
+
+          // send dram request
+          ramulator::Request dram_req( 
+            device_->m_axi_awaddr,
+            ramulator::Request::Type::WRITE,
+            0
+          );
+          dram_->send(dram_req);
+        }        
+      } else {
+        // process reads
+        auto mem_req = new mem_req_t();
+        mem_req->tag  = device_->m_axi_arid;   
+        mem_req->addr = device_->m_axi_araddr;
+        ram_->read(mem_req->block.data(), device_->m_axi_araddr, MEM_BLOCK_SIZE);
+        mem_req->write = false;
+        mem_req->ready = false;
+        pending_mem_reqs_.emplace_back(mem_req);
+
+        // send dram request
+        ramulator::Request dram_req( 
+          device_->m_axi_araddr,
+          ramulator::Request::Type::READ,
+          std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) {
+              mem_req->ready = true;
+            }, placeholders::_1, mem_req),
+          0
+        );
+        dram_->send(dram_req);
+      } 
+    } 
+
+    device_->m_axi_wready  = 1;
+    device_->m_axi_awready = 1;
+    device_->m_axi_arready = 1;     
+  }
+
+#else
+
+  void reset_avs_bus() {
+    device_->mem_req_ready = 0;
+    device_->mem_rsp_valid = 0;
+  }
+
+  // Services the device's memory interface for one clock phase.
+  //   clk == 0: only samples the device's mem_rsp_ready for use in the next
+  //             clk == 1 phase.
+  //   clk == 1: completes/starts at most one read response taken from the
+  //             head of pending_mem_reqs_, then handles the request the
+  //             device is currently presenting (if any).
+  void eval_avs_bus(bool clk) {
+    if (!clk) {
+      mem_rd_rsp_ready_ = device_->mem_rsp_ready;
+      return;
+    }
+
+    // no RAM attached: do not accept requests
+    if (ram_ == nullptr) {
+      device_->mem_req_ready = 0;
+      return;
+    }
+
+    // process memory responses
+    // the response in flight is done once the device saw it valid and ready
+    if (mem_rd_rsp_active_
+    && device_->mem_rsp_valid && mem_rd_rsp_ready_) {
+      mem_rd_rsp_active_ = false;
+    }
+    if (!mem_rd_rsp_active_) {
+      // responses are returned in request order: only the head is considered
+      if (!pending_mem_reqs_.empty()
+       && (*pending_mem_reqs_.begin())->ready) {
+        device_->mem_rsp_valid = 1;
+        auto mem_rsp_it = pending_mem_reqs_.begin();
+        auto mem_req = *mem_rsp_it;
+        /*
+          printf("%0ld: [sim] MEM Rd: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_req->addr);
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            printf("%02x", mem_req->block[(MEM_BLOCK_SIZE-1)-i]);
+          }
+          printf("\n");
+        */
+        memcpy((uint8_t*)device_->mem_rsp_data, mem_req->block.data(), MEM_BLOCK_SIZE);
+        device_->mem_rsp_tag = mem_req->tag;
+        pending_mem_reqs_.erase(mem_rsp_it);
+        mem_rd_rsp_active_ = true;
+        delete mem_req;
+      } else {
+        device_->mem_rsp_valid = 0;
+      }
+    }
+
+    // process memory requests
+    if (device_->mem_req_valid) {
+      // the device addresses memory in blocks; convert to a byte address
+      uint32_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
+      if (device_->mem_req_rw) {
+        // process writes
+        uint64_t byteen = device_->mem_req_byteen;
+        uint8_t* data = (uint8_t*)(device_->mem_req_data);
+
+        // check console output
+        if (byte_addr >= IO_COUT_ADDR 
+        && byte_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {
+          // byteen and data describe a single memory block, so walk
+          // MEM_BLOCK_SIZE bytes (as the AXI path does) rather than IO_COUT_SIZE;
+          // each byte lane has its own line buffer keyed by its index
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            if ((byteen >> i) & 0x1) {
+              auto& ss_buf = print_bufs_[i];
+              char c = data[i];
+              ss_buf << c;
+              if (c == '\n') {
+                std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
+                ss_buf.str("");
+              }
+            }
+          }
+        } else {
+          /*
+            printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, byte_addr, byteen);
+            for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+              printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
+            }
+            printf("\n");
+          */
+          // the write is applied to RAM immediately, byte lane by byte lane
+          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
+            if ((byteen >> i) & 0x1) {
+              (*ram_)[byte_addr + i] = data[i];
+            }
+          }
+
+          // send dram request
+          ramulator::Request dram_req( 
+            byte_addr,
+            ramulator::Request::Type::WRITE,
+            0
+          );
+          dram_->send(dram_req);
+        }
+      } else {
+        // process reads
+        // data is captured from RAM now; the response is held back until the
+        // DRAM simulator's callback marks the request ready
+        auto mem_req = new mem_req_t();
+        mem_req->tag   = device_->mem_req_tag;
+        mem_req->addr  = byte_addr;
+        mem_req->write = false;
+        mem_req->ready = false;
+        ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE);
+        pending_mem_reqs_.emplace_back(mem_req);
+
+        // send dram request
+        ramulator::Request dram_req( 
+          byte_addr,
+          ramulator::Request::Type::READ,
+          std::bind([](ramulator::Request& dram_req, mem_req_t* mem_req) {
+              mem_req->ready = true;
+            }, placeholders::_1, mem_req),
+          0
+        );
+        dram_->send(dram_req);
+      }
+    }
+
+    device_->mem_req_ready = 1;
+  }
+
+#endif
+
+  // Advances the simulation by `cycles` calls to step().
+  void wait(uint32_t cycles) {
+    // index uses the parameter's unsigned type to avoid a signed/unsigned
+    // comparison
+    for (uint32_t i = 0; i < cycles; ++i) {
+      this->step();
+    }
+  }
+
+  bool get_ebreak() const {
+  #ifdef AXI_BUS
+    return (bool)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
+  #else
+    return (bool)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
+  #endif
+  }
+
+  int get_last_wb_value(int reg) const {
+  #ifdef AXI_BUS
+    return (int)device_->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
+  #else
+    return (int)device_->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
+  #endif
+  }
+
+private:
+
+  typedef struct {    
+    bool ready;  
+    std::array<uint8_t, MEM_BLOCK_SIZE> block;
+    uint64_t addr;
+    uint64_t tag;
+    bool write;
+  } mem_req_t;
+
+#ifdef AXI_BUS
+  VVortex_axi *device_;
+#else
+  VVortex *device_;
+#endif
+#ifdef VCD_OUTPUT
+  VerilatedVcdC *trace_;
+#endif
+
+  std::unordered_map<int, std::stringstream> print_bufs_;
+
+  std::list<mem_req_t*> pending_mem_reqs_;
+
+  bool mem_rd_rsp_active_;
+  bool mem_rd_rsp_ready_;
+
+  bool mem_wr_rsp_active_;
+  bool mem_wr_rsp_ready_;
+
+  RAM *ram_;
+
+  ramulator::Gem5Wrapper* dram_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+Processor::Processor() 
+  : impl_(new Impl())
+{}
+
+Processor::~Processor() {
+  delete impl_;
+}
+
+void Processor::attach_ram(RAM* mem) {
+  impl_->attach_ram(mem);
+}
+
+void Processor::reset() {
+  impl_->reset();
+}
+
+int Processor::run() {
+  return impl_->run();
+}
--- a/sim/rtlsim/processor.h
+++ b/sim/rtlsim/processor.h
@ -0,0 +1,25 @@
+#pragma once
+
+namespace vortex {
+
+class RAM;
+
+// Public handle to the RTL-simulated processor. All state lives in a private
+// Impl object so that users of this header do not need the Verilator or DRAM
+// simulator headers.
+class Processor {
+public:
+
+  Processor();
+  virtual ~Processor();
+
+  // impl_ is an owning raw pointer released by the destructor; a copy would
+  // release the same object twice, so copying is disabled.
+  Processor(const Processor&) = delete;
+  Processor& operator=(const Processor&) = delete;
+
+  // sets the RAM object used to service the device's memory requests;
+  // the pointer is stored, not copied
+  void attach_ram(RAM* ram);
+
+  // resets the device and clears pending memory/console state
+  void reset();
+
+  // runs the device until it is no longer busy or an ebreak is detected;
+  // returns the exit code (0 when no ebreak was seen)
+  int run();
+
+private:
+
+  class Impl;
+  Impl* impl_;
+};
+
+}
--- a/sim/rtlsim/simulator.cpp
+++ b/sim/rtlsim/simulator.cpp
@ -1,579 +0,0 @@
-#include "simulator.h"
-
-#include <verilated.h>
-
-#ifdef AXI_BUS
-#include "VVortex_axi.h"
-#include "VVortex_axi__Syms.h"
-#else
-#include "VVortex.h"
-#include "VVortex__Syms.h"
-#endif
-
-#ifdef VCD_OUTPUT
-#include <verilated_vcd_c.h>
-#endif
-
-#include <iostream>
-#include <fstream>
-#include <iomanip>
-#include <mem.h>
-
-#define ENABLE_MEM_STALLS
-
-#ifndef TRACE_START_TIME
-#define TRACE_START_TIME 0ull
-#endif
-
-#ifndef TRACE_STOP_TIME
-#define TRACE_STOP_TIME -1ull
-#endif
-
-#ifndef MEM_LATENCY
-#define MEM_LATENCY 24
-#endif
-
-#ifndef MEM_RQ_SIZE
-#define MEM_RQ_SIZE 16
-#endif
-
-#ifndef MEM_STALLS_MODULO
-#define MEM_STALLS_MODULO 16
-#endif
-
-#ifndef VERILATOR_RESET_VALUE
-#define VERILATOR_RESET_VALUE 2
-#endif
-
-#define VL_WDATA_GETW(lwp, i, n, w) \
-  VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w)
-
-using namespace vortex;
-
-static uint64_t timestamp = 0;
-
-double sc_time_stamp() { 
-  return timestamp;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-static bool trace_enabled = false;
-static uint64_t trace_start_time = TRACE_START_TIME;
-static uint64_t trace_stop_time = TRACE_STOP_TIME;
-
-bool sim_trace_enabled() {
-  if (timestamp >= trace_start_time 
-   && timestamp < trace_stop_time)
-    return true;
-  return trace_enabled;
-}
-
-void sim_trace_enable(bool enable) {
-  trace_enabled = enable;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-namespace vortex {
-class VL_OBJ {
-public:
-#ifdef AXI_BUS
-  VVortex_axi *device;
-#else
-  VVortex *device;
-#endif
-#ifdef VCD_OUTPUT
-  VerilatedVcdC *trace;
-#endif
-
-  VL_OBJ() {
-    // force random values for unitialized signals  
-    Verilated::randReset(VERILATOR_RESET_VALUE);
-    Verilated::randSeed(50);
-
-    // Turn off assertion before reset
-    Verilated::assertOn(false);
-
-  #ifdef AXI_BUS
-    this->device = new VVortex_axi();
-  #else
-    this->device = new VVortex();
-  #endif
-
-  #ifdef VCD_OUTPUT
-    Verilated::traceEverOn(true);
-    this->trace = new VerilatedVcdC();
-    this->device->trace(this->trace, 99);
-    this->trace->open("trace.vcd");
-  #endif
-  }
-
-  ~VL_OBJ() {
-  #ifdef VCD_OUTPUT
-    this->trace->close();
-    delete this->trace;
-  #endif
-    delete this->device;
-  }
-};
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-Simulator::Simulator() {
-  vl_obj_ = new VL_OBJ();
-  ram_ = nullptr;
-  // reset the device
-  this->reset();
-}
-
-Simulator::~Simulator() {
-  for (auto& buf : print_bufs_) {
-    auto str = buf.second.str();
-    if (!str.empty()) {
-      std::cout << "#" << buf.first << ": " << str << std::endl;
-    }
-  }
-  delete vl_obj_;
-}
-
-void Simulator::attach_ram(RAM* ram) {
-  ram_ = ram;
-  for (int b = 0; b < MEMORY_BANKS; ++b) {
-    mem_rsp_vec_[b].clear();
-  }
-  last_mem_rsp_bank_ = 0;
-}
-
-void Simulator::reset() { 
-  print_bufs_.clear();
-
-  for (int b = 0; b < MEMORY_BANKS; ++b) {
-    mem_rsp_vec_[b].clear();
-  }
-  last_mem_rsp_bank_ = 0;
-  mem_rd_rsp_active_ = false;
-  mem_wr_rsp_active_ = false;
-
-#ifdef AXI_BUS
-  this->reset_axi_bus();
-#else
-  this->reset_mem_bus();
-#endif
-
-  vl_obj_->device->reset = 1;
-
-  for (int i = 0; i < RESET_DELAY; ++i) {
-    vl_obj_->device->clk = 0;
-    this->eval();
-    vl_obj_->device->clk = 1;
-    this->eval();
-  }  
-
-  vl_obj_->device->reset = 0;
-  
-  // Turn on assertion after reset
-  Verilated::assertOn(true);
-}
-
-void Simulator::step() {
-
-  vl_obj_->device->clk = 0;
-  this->eval();
-
-#ifdef AXI_BUS
-  this->eval_axi_bus(0);
-#else
-  this->eval_mem_bus(0);
-#endif
-
-  vl_obj_->device->clk = 1;
-  this->eval();
-    
-#ifdef AXI_BUS
-  this->eval_axi_bus(1);
-#else
-  this->eval_mem_bus(1);
-#endif
-
-#ifndef NDEBUG
-  fflush(stdout);
-#endif
-}
-
-void Simulator::eval() {
-  vl_obj_->device->eval();
-#ifdef VCD_OUTPUT
-  if (sim_trace_enabled()) {
-    vl_obj_->trace->dump(timestamp);
-  }
-#endif
-  ++timestamp;
-}
-
-#ifdef AXI_BUS
-
-void Simulator::reset_axi_bus() {
-  vl_obj_->device->m_axi_wready  = 0;
-  vl_obj_->device->m_axi_awready = 0;
-  vl_obj_->device->m_axi_arready = 0;  
-  vl_obj_->device->m_axi_rvalid  = 0;
-  vl_obj_->device->m_axi_bvalid  = 0;
-}
-  
-void Simulator::eval_axi_bus(bool clk) {
-  if (!clk) {
-    mem_rd_rsp_ready_ = vl_obj_->device->m_axi_rready;
-    mem_wr_rsp_ready_ = vl_obj_->device->m_axi_bready;
-    return;
-  }
-
-  if (ram_ == nullptr) {
-    vl_obj_->device->m_axi_wready  = 0;
-    vl_obj_->device->m_axi_awready = 0;
-    vl_obj_->device->m_axi_arready = 0;  
-    return;
-  }
-
-  // update memory responses schedule
-  for (int b = 0; b < MEMORY_BANKS; ++b) {    
-    for (auto& rsp : mem_rsp_vec_[b]) {
-      if (rsp.cycles_left > 0)
-        rsp.cycles_left -= 1;
-    }
-  }
-
-  bool has_rd_response = false;
-  bool has_wr_response = false;
-
-  // schedule memory responses that are ready
-  for (int i = 0; i < MEMORY_BANKS; ++i) {
-    uint32_t b = (i + last_mem_rsp_bank_ + 1) % MEMORY_BANKS;
-    if (!mem_rsp_vec_[b].empty()) {
-      auto mem_rsp_it = mem_rsp_vec_[b].begin();
-      if (mem_rsp_it->cycles_left <= 0) {
-          has_rd_response = !mem_rsp_it->write;
-          has_wr_response = mem_rsp_it->write;
-          last_mem_rsp_bank_ = b;
-          break;
-      }
-    }
-  }
-
-  // send memory read response  
-  if (mem_rd_rsp_active_
-  && vl_obj_->device->m_axi_rvalid && mem_rd_rsp_ready_) {
-    mem_rd_rsp_active_ = false;
-  }
-  if (!mem_rd_rsp_active_) {
-    if (has_rd_response) {      
-      auto mem_rsp_it = mem_rsp_vec_[last_mem_rsp_bank_].begin();
-      /*
-        printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp_it->addr);
-        for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
-          printf("%02x", mem_rsp_it->block[(MEM_BLOCK_SIZE-1)-i]);
-        }
-        printf("\n");
-      */      
-      vl_obj_->device->m_axi_rvalid = 1;
-      vl_obj_->device->m_axi_rid    = mem_rsp_it->tag;   
-      vl_obj_->device->m_axi_rresp  = 0;
-      vl_obj_->device->m_axi_rlast  = 1;
-      memcpy((uint8_t*)vl_obj_->device->m_axi_rdata, mem_rsp_it->block.data(), MEM_BLOCK_SIZE);
-      mem_rsp_vec_[last_mem_rsp_bank_].erase(mem_rsp_it);
-      mem_rd_rsp_active_ = true;
-    } else {
-      vl_obj_->device->m_axi_rvalid = 0;
-    }
-  }
-
-  // send memory write response  
-  if (mem_wr_rsp_active_
-  && vl_obj_->device->m_axi_bvalid && mem_wr_rsp_ready_) {
-    mem_wr_rsp_active_ = false;
-  }
-  if (!mem_wr_rsp_active_) {
-    if (has_wr_response) {
-      auto mem_rsp_it = mem_rsp_vec_[last_mem_rsp_bank_].begin();
-      /*
-        printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp_it->addr);        
-      */
-      vl_obj_->device->m_axi_bvalid = 1;      
-      vl_obj_->device->m_axi_bid    = mem_rsp_it->tag;
-      vl_obj_->device->m_axi_bresp  = 0;
-      mem_rsp_vec_[last_mem_rsp_bank_].erase(mem_rsp_it);
-      mem_wr_rsp_active_ = true;
-    } else {
-      vl_obj_->device->m_axi_bvalid = 0;
-    }
-  }
-
-  // select the memory bank
-  uint32_t req_addr = vl_obj_->device->m_axi_wvalid ? vl_obj_->device->m_axi_awaddr : vl_obj_->device->m_axi_araddr;
-  uint32_t req_bank = (MEMORY_BANKS >= 2) ? ((req_addr / MEM_BLOCK_SIZE) % MEMORY_BANKS) : 0;
-
-  // handle memory stalls
-  bool mem_stalled = false;
-#ifdef ENABLE_MEM_STALLS
-  if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { 
-    mem_stalled = true;
-  } else
-  if (mem_rsp_vec_[req_bank].size() >= MEM_RQ_SIZE) {
-    mem_stalled = true;
-  }
-#endif
-
-  // process memory requests
-  if (!mem_stalled) {
-    if (vl_obj_->device->m_axi_wvalid || vl_obj_->device->m_axi_arvalid) {
-      if (vl_obj_->device->m_axi_wvalid) {        
-        uint64_t byteen = vl_obj_->device->m_axi_wstrb;
-        unsigned base_addr = vl_obj_->device->m_axi_awaddr;
-        uint8_t* data = (uint8_t*)(vl_obj_->device->m_axi_wdata);
-
-        // detect stdout write
-        if (base_addr >= IO_COUT_ADDR 
-         && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {          
-          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
-            if ((byteen >> i) & 0x1) {            
-              auto& ss_buf = print_bufs_[i];
-              char c = data[i];
-              ss_buf << c;
-              if (c == '\n') {
-                std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
-                ss_buf.str("");
-              }
-            }
-          }   
-        } else {
-          /*
-            printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen);
-            for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
-              printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
-            }
-            printf("\n");
-          */
-          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
-            if ((byteen >> i) & 0x1) {            
-              (*ram_)[base_addr + i] = data[i];
-            }
-          }
-          mem_req_t mem_req;
-          mem_req.tag  = vl_obj_->device->m_axi_arid;
-          mem_req.addr = vl_obj_->device->m_axi_araddr;        
-          mem_req.cycles_left = 0;
-          mem_req.write = 1;
-          mem_rsp_vec_[req_bank].emplace_back(mem_req);
-        }        
-      } else {
-        mem_req_t mem_req;        
-        mem_req.tag  = vl_obj_->device->m_axi_arid;   
-        mem_req.addr = vl_obj_->device->m_axi_araddr;
-        ram_->read(mem_req.block.data(), vl_obj_->device->m_axi_araddr, MEM_BLOCK_SIZE);
-        mem_req.cycles_left = MEM_LATENCY;
-        mem_req.write = 0;
-        for (auto& rsp : mem_rsp_vec_[req_bank]) {
-          if (mem_req.addr == rsp.addr) {
-            // duplicate requests receive the same cycle delay
-            mem_req.cycles_left = rsp.cycles_left;
-            break;
-          }
-        }     
-        mem_rsp_vec_[req_bank].emplace_back(mem_req);
-      } 
-    }    
-  }
-
-  vl_obj_->device->m_axi_wready  = !mem_stalled;
-  vl_obj_->device->m_axi_awready = !mem_stalled;
-  vl_obj_->device->m_axi_arready = !mem_stalled;
-}
-
-#else
-
-void Simulator::reset_mem_bus() {
-  vl_obj_->device->mem_req_ready = 0;
-  vl_obj_->device->mem_rsp_valid = 0;
-}
-
-void Simulator::eval_mem_bus(bool clk) {
-  if (!clk) {
-    mem_rd_rsp_ready_ = vl_obj_->device->mem_rsp_ready;
-    return;
-  }
-
-  if (ram_ == nullptr) {
-    vl_obj_->device->mem_req_ready = 0;
-    return;
-  }
-
-  // update memory responses schedule
-  for (int b = 0; b < MEMORY_BANKS; ++b) {    
-    for (auto& rsp : mem_rsp_vec_[b]) {
-      if (rsp.cycles_left > 0)
-        rsp.cycles_left -= 1;
-    }
-  }
-
-  bool has_response = false;
-
-  // schedule memory responses that are ready
-  for (int i = 0; i < MEMORY_BANKS; ++i) {
-    uint32_t b = (i + last_mem_rsp_bank_ + 1) % MEMORY_BANKS;
-    if (!mem_rsp_vec_[b].empty()
-    && (mem_rsp_vec_[b].begin()->cycles_left) <= 0) {
-        has_response = true;
-        last_mem_rsp_bank_ = b;
-        break;
-    }
-  }
-
-  // send memory response  
-  if (mem_rd_rsp_active_
-  && vl_obj_->device->mem_rsp_valid && mem_rd_rsp_ready_) {
-    mem_rd_rsp_active_ = false;
-  }
-  if (!mem_rd_rsp_active_) {
-    if (has_response) {
-      vl_obj_->device->mem_rsp_valid = 1;      
-      auto mem_rsp_it = mem_rsp_vec_[last_mem_rsp_bank_].begin();      
-      /*
-        printf("%0ld: [sim] MEM Rd: bank=%d, addr=%0lx, data=", timestamp, last_mem_rsp_bank_, mem_rsp_it->addr);
-        for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
-          printf("%02x", mem_rsp_it->block[(MEM_BLOCK_SIZE-1)-i]);
-        }
-        printf("\n");
-      */
-      memcpy((uint8_t*)vl_obj_->device->mem_rsp_data, mem_rsp_it->block.data(), MEM_BLOCK_SIZE);
-      vl_obj_->device->mem_rsp_tag = mem_rsp_it->tag;   
-      mem_rsp_vec_[last_mem_rsp_bank_].erase(mem_rsp_it);
-      mem_rd_rsp_active_ = true;
-    } else {
-      vl_obj_->device->mem_rsp_valid = 0;
-    }
-  }
-
-  // select the memory bank
-  uint32_t req_bank = (MEMORY_BANKS >= 2) ? (vl_obj_->device->mem_req_addr % MEMORY_BANKS) : 0;
-
-  // handle memory stalls
-  bool mem_stalled = false;
-#ifdef ENABLE_MEM_STALLS
-  if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { 
-    mem_stalled = true;
-  } else
-  if (mem_rsp_vec_[req_bank].size() >= MEM_RQ_SIZE) {
-    mem_stalled = true;
-  }
-#endif
-
-  // process memory requests
-  if (!mem_stalled) {
-    if (vl_obj_->device->mem_req_valid) {
-      if (vl_obj_->device->mem_req_rw) {        
-        uint64_t byteen = vl_obj_->device->mem_req_byteen;
-        unsigned base_addr = (vl_obj_->device->mem_req_addr * MEM_BLOCK_SIZE);
-        uint8_t* data = (uint8_t*)(vl_obj_->device->mem_req_data);
-        if (base_addr >= IO_COUT_ADDR 
-         && base_addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {          
-          for (int i = 0; i < IO_COUT_SIZE; i++) {
-            if ((byteen >> i) & 0x1) {            
-              auto& ss_buf = print_bufs_[i];
-              char c = data[i];
-              ss_buf << c;
-              if (c == '\n') {
-                std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush;
-                ss_buf.str("");
-              }
-            }
-          }   
-        } else {
-          /*
-            printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen);
-            for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
-              printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
-            }
-            printf("\n");
-          */
-          for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
-            if ((byteen >> i) & 0x1) {            
-              (*ram_)[base_addr + i] = data[i];
-            }
-          }
-        }
-      } else {
-        mem_req_t mem_req;        
-        mem_req.tag  = vl_obj_->device->mem_req_tag;   
-        mem_req.addr = (vl_obj_->device->mem_req_addr * MEM_BLOCK_SIZE);
-        ram_->read(mem_req.block.data(), vl_obj_->device->mem_req_addr * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE);
-        mem_req.cycles_left = MEM_LATENCY;
-        for (auto& rsp : mem_rsp_vec_[req_bank]) {
-          if (mem_req.addr == rsp.addr) {
-            // duplicate requests receive the same cycle delay
-            mem_req.cycles_left = rsp.cycles_left;
-            break;
-          }
-        }     
-        mem_rsp_vec_[req_bank].emplace_back(mem_req);
-      } 
-    }    
-  }
-
-  vl_obj_->device->mem_req_ready = !mem_stalled;
-}
-
-#endif
-
-void Simulator::wait(uint32_t cycles) {
-  for (int i = 0; i < cycles; ++i) {
-    this->step();
-  }
-}
-
-bool Simulator::is_busy() const {
-  return vl_obj_->device->busy;
-}
-
-int Simulator::run() {
-  int exitcode = 0;
-
-#ifndef NDEBUG
-  std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
-#endif
-
-  // execute program
-  while (vl_obj_->device->busy) {
-    if (get_ebreak()) {
-      exitcode = get_last_wb_value(3);
-      break;  
-    }
-    this->step();
-  }
-
-  // wait 5 cycles to flush the pipeline
-  this->wait(5);  
-
-  return exitcode;
-}
-
-bool Simulator::get_ebreak() const {
-#ifdef AXI_BUS
-  return (int)vl_obj_->device->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
-#else
-  return (int)vl_obj_->device->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->execute->ebreak;
-#endif
-}
-
-int Simulator::get_last_wb_value(int reg) const {
-#ifdef AXI_BUS
-  return (int)vl_obj_->device->Vortex_axi->vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
-#else
-  return (int)vl_obj_->device->Vortex->genblk2__BRA__0__KET____DOT__cluster->genblk2__BRA__0__KET____DOT__core->pipeline->commit->writeback->last_wb_value[reg];
-#endif
-}
-
-void Simulator::print_stats(std::ostream& out) {
-  out << std::left;
-  out << std::setw(24) << "# of total cycles:" << std::dec << timestamp/2 << std::endl;
-}
--- a/sim/rtlsim/simulator.h
+++ b/sim/rtlsim/simulator.h
@ -1,81 +0,0 @@
-#pragma once
-
-#include <VX_config.h>
-#include <ostream>
-#include <list>
-#include <vector>
-#include <sstream> 
-#include <unordered_map>
-
-#ifndef MEMORY_BANKS
-  #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
-    #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS
-  #else
-    #define MEMORY_BANKS 2
-  #endif
-#endif
-
-namespace vortex {
-
-class VL_OBJ;
-class RAM;
-
-class Simulator {
-public:
-  
-  Simulator();
-  virtual ~Simulator();
-
-  void attach_ram(RAM* ram);
-
-  bool is_busy() const;
-
-  void reset();
-  void step();
-  void wait(uint32_t cycles);
-
-  int run();
-
-  void print_stats(std::ostream& out);
-
-private:  
-
-  typedef struct {    
-    int cycles_left;  
-    std::array<uint8_t, MEM_BLOCK_SIZE> block;
-    uint64_t addr;
-    uint64_t tag;
-    bool write;
-  } mem_req_t;
-
-  std::unordered_map<int, std::stringstream> print_bufs_;
-
-  void eval();  
-  
-#ifdef AXI_BUS
-  void reset_axi_bus();  
-  void eval_axi_bus(bool clk); 
-#else
-  void reset_mem_bus();  
-  void eval_mem_bus(bool clk);
-#endif
-
-  int get_last_wb_value(int reg) const;  
-  
-  bool get_ebreak() const;
-
-  std::list<mem_req_t> mem_rsp_vec_ [MEMORY_BANKS];
-  uint32_t last_mem_rsp_bank_;
-
-  bool mem_rd_rsp_active_;
-  bool mem_rd_rsp_ready_;
-
-  bool mem_wr_rsp_active_;
-  bool mem_wr_rsp_ready_;
-
-  RAM *ram_;
-
-  VL_OBJ* vl_obj_;
-};
-
-}
--- a/sim/simx/Makefile
+++ b/sim/simx/Makefile
@ -1,3 +1,4 @@
+DESTDIR ?= .
 RTL_DIR = ../hw/rtl
 THIRD_PARTY_DIR = ../../third_party

@ -5,15 +6,17 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
 CXXFLAGS += -fPIC -Wno-maybe-uninitialized
 CXXFLAGS += -I. -I../common -I../../hw
 CXXFLAGS += -I$(THIRD_PARTY_DIR)/softfloat/source/include
-CXXFLAGS += -I$(THIRD_PARTY_DIR)/cocogfx/include
+CXXFLAGS += -I$(THIRD_PARTY_DIR)
 CXXFLAGS += $(CONFIGS)

-LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx
+LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a 
+LDFLAGS += -L$(THIRD_PARTY_DIR)/cocogfx -lcocogfx 
+LDFLAGS += -L$(THIRD_PARTY_DIR)/ramulator -lramulator

 TOP = vx_cache_sim

-SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp 
-SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp main.cpp
+SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
+SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp tex_unit.cpp processor.cpp

 OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
 VPATH := $(sort $(dir $(SRCS)))
@ -30,23 +33,16 @@ endif

 PROJECT = simx

-all: $(PROJECT)
-
-$(PROJECT): $(SRCS)
+all: $(DESTDIR)/$(PROJECT)
+	
+$(DESTDIR)/$(PROJECT): $(SRCS) main.cpp
 	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

-obj_dir/%.o: %.cpp
-	mkdir -p obj_dir
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-static: $(OBJS)
-	$(AR) rcs lib$(PROJECT).a $(OBJS) $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o
+$(DESTDIR)/lib$(PROJECT).so: $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ -shared $(LDFLAGS) -o $@

 .depend: $(SRCS)
 	$(CXX) $(CXXFLAGS) -MM $^ > .depend;

-clean-static:
-	rm -rf lib$(PROJECT).a obj_dir .depend
-
-clean: clean-static
-	rm -rf $(PROJECT)
+clean: # removes build outputs, including the generated .depend file that the previous clean-static target used to delete
+	rm -rf obj_dir .depend $(DESTDIR)/$(PROJECT) $(DESTDIR)/lib$(PROJECT).so
--- a/sim/simx/cache.cpp
+++ b/sim/simx/cache.cpp
@ -116,6 +116,7 @@ struct bank_req_t {
    bool mshr_replay;
    uint64_t tag;
    uint32_t set_id;
+    uint32_t core_id;
    std::vector<bank_req_info_t> infos;

    bank_req_t(uint32_t size) 
@ -124,6 +125,7 @@ struct bank_req_t {
        , mshr_replay(false)
        , tag(0)
        , set_id(0)
+        , core_id(0)
        , infos(size)
    {}
 };
@ -292,7 +294,7 @@ public:
            auto& mem_rsp = bypass_port.front();
            uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);                
            uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
-            MemRsp core_rsp(tag);
+            MemRsp core_rsp{tag, mem_rsp.core_id};
            simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
            bypass_port.pop();
        }
@ -327,7 +329,7 @@ public:
            auto& core_req = core_req_port.front();

            // check cache bypassing
-            if (core_req.is_io) {
+            if (core_req.non_cacheable) {
                // send IO request
                this->processIORequest(core_req, req_id);

@ -348,6 +350,7 @@ public:
            bank_req.mshr_replay = false;
            bank_req.tag = tag;            
            bank_req.set_id = set_id;       
+            bank_req.core_id = core_req.core_id;
            bank_req.infos.at(port_id) = {true, req_id, core_req.tag};

            auto& bank = banks_.at(bank_id);            
@ -439,7 +442,8 @@ public:
            if (pipeline_req.mshr_replay) {
                // send core response
                for (auto& info : pipeline_req.infos) {
-                    simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);           
+                    MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
+                    simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);           
                }
            } else {        
                bool hit = false;
@ -480,6 +484,7 @@ public:
                            MemReq mem_req;
                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
                            mem_req.write = true;
+                            mem_req.core_id = pipeline_req.core_id;
                            mem_req_ports_.at(bank_id).send(mem_req, 1);
                        } else {
                            // mark block as dirty
@ -488,8 +493,9 @@ public:
                    }
                    // send core response
                    if (!pipeline_req.write || config_.write_reponse) {
-                        for (auto& info : pipeline_req.infos) {          
-                            simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
+                        for (auto& info : pipeline_req.infos) {     
+                            MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
+                            simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
                        }
                    }
                } else {     
@ -508,6 +514,7 @@ public:
                            MemReq mem_req;
                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
                            mem_req.write = true;
+                            mem_req.core_id = pipeline_req.core_id;
                            mem_req_ports_.at(bank_id).send(mem_req, 1);
                            ++perf_stats_.evictions;
                        }
@ -519,12 +526,14 @@ public:
                            MemReq mem_req;
                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
                            mem_req.write = true;
+                            mem_req.core_id = pipeline_req.core_id;
                            mem_req_ports_.at(bank_id).send(mem_req, 1);
                        }
                        // send core response
                        if (config_.write_reponse) {
-                            for (auto& info : pipeline_req.infos) {            
-                                simobject_->CoreRspPorts.at(info.req_id).send(MemRsp{info.req_tag}, config_.latency);
+                            for (auto& info : pipeline_req.infos) {         
+                                MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
+                                simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
                            }
                        }
                    } else {
@ -540,6 +549,7 @@ public:
                            mem_req.addr  = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
                            mem_req.write = false;
                            mem_req.tag   = mshr_id;
+                            mem_req.core_id = pipeline_req.core_id;
                            mem_req_ports_.at(bank_id).send(mem_req, 1);
                            ++pending_fill_reqs_;
                        }
--- a/sim/simx/constants.h
+++ b/sim/simx/constants.h
@ -2,12 +2,10 @@

 #include "types.h"

-#ifndef MEM_LATENCY
-#define MEM_LATENCY 24
-#endif
-
 #define RAM_PAGE_SIZE 4096

+#define DRAM_CHANNELS 2
+
 namespace vortex {

 enum Constants {
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@ -87,12 +87,12 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
  }

  // register execute units
-  exe_units_.at((int)ExeType::NOP) = SimPlatform::instance().CreateObject<NopUnit>(this);
-  exe_units_.at((int)ExeType::ALU) = SimPlatform::instance().CreateObject<AluUnit>(this);
-  exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().CreateObject<LsuUnit>(this);
-  exe_units_.at((int)ExeType::CSR) = SimPlatform::instance().CreateObject<CsrUnit>(this);
-  exe_units_.at((int)ExeType::FPU) = SimPlatform::instance().CreateObject<FpuUnit>(this);  
-  exe_units_.at((int)ExeType::GPU) = SimPlatform::instance().CreateObject<GpuUnit>(this);
+  exe_units_.at((int)ExeType::NOP) = SimPlatform::instance().create_object<NopUnit>(this);
+  exe_units_.at((int)ExeType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
+  exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
+  exe_units_.at((int)ExeType::CSR) = SimPlatform::instance().create_object<CsrUnit>(this);
+  exe_units_.at((int)ExeType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);  
+  exe_units_.at((int)ExeType::GPU) = SimPlatform::instance().create_object<GpuUnit>(this);

  // connect l1 switch
  icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
@ -216,6 +216,7 @@ void Core::fetch(uint64_t cycle) {
    mem_req.addr  = trace->PC;
    mem_req.write = false;
    mem_req.tag   = pending_icache_.allocate(trace);    
+    mem_req.core_id = id_;
    icache_->CoreReqPorts.at(0).send(mem_req, 1);
    DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
    fetch_latch_.pop();
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@ -403,7 +403,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
    break;
  case JALR_INST:
    trace->exe_type = ExeType::ALU;    
-    trace->alu.type = AluType::BRANCH;    
+    trace->alu.type = AluType::BRANCH;
    trace->used_iregs.set(rsrc0);
    for (int t = 0; t < num_threads; ++t) {
      if (!tmask_.test(t))
@ -535,6 +535,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
      Word csr_value;
      if (func3 == 0) {
        trace->exe_type = ExeType::ALU;
+        trace->alu.type = AluType::SYSCALL;
        trace->fetch_stall = true;
        switch (csr_addr) {
        case 0: // ECALL
--- a/sim/simx/exeunit.cpp
+++ b/sim/simx/exeunit.cpp
@ -143,8 +143,9 @@ void LsuUnit::step(uint64_t cycle) {
        MemReq mem_req;
        mem_req.addr  = mem_addr.addr;
        mem_req.write = is_write;
+        mem_req.non_cacheable = (type == AddrType::IO); 
        mem_req.tag   = tag;
-        mem_req.is_io = (type == AddrType::IO); 
+        mem_req.core_id = core_->id();
        
        if (type == AddrType::Shared) {
            core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
@ -153,7 +154,7 @@ void LsuUnit::step(uint64_t cycle) {
        } else {            
            dcache_req_port.send(mem_req, 2);
            DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
-                << ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << mem_req.is_io << ", " << *trace);
+                << ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
        }        
        
        if (is_dup)
@ -182,6 +183,7 @@ void AluUnit::step(uint64_t cycle) {
    switch (trace->alu.type) {
    case AluType::ARITH:        
    case AluType::BRANCH:
+    case AluType::SYSCALL:
    case AluType::CMOV:
        Output.send(trace, 1);
        break;
@ -359,6 +361,7 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
            mem_req.addr  = mem_addr.addr;
            mem_req.write = (trace->lsu.type == LsuType::STORE);
            mem_req.tag   = tag;
+            mem_req.core_id = core_->id();
            dcache_req_port.send(mem_req, 3);
            DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag 
                << ", tid=" << t << ", "<< trace);
--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@ -13,7 +13,7 @@
 using namespace vortex;

 int main(int argc, char **argv) {
-  int exitcode;
+  int exitcode = 0;

  std::string archStr("rv32imf");
  std::string imgFileName;
@ -54,12 +54,7 @@ int main(int argc, char **argv) {
    return -1;

  {
-    ArchDef arch(archStr, num_cores, num_warps, num_threads);
-
-    Processor processor(arch);
-
    RAM ram(RAM_PAGE_SIZE);
-
    {
      std::string program_ext(fileExtension(imgFileName.c_str()));
      if (program_ext == "bin") {
@ -72,25 +67,40 @@ int main(int argc, char **argv) {
      }
    }

-    processor.attach_ram(&ram);
+    ArchDef arch(archStr, num_cores, num_warps, num_threads);
+    auto processor = Processor::Create(arch);
+    processor->attach_ram(&ram);

-    exitcode = processor.run();
+    // setup memory simulator
+    auto memsim = MemSim::Create(MemSim::Config{
+      DRAM_CHANNELS,
+      arch.num_cores()
+    });    
+    processor->MemReqPort.bind(&memsim->MemReqPort);
+    memsim->MemRspPort.bind(&processor->MemRspPort);

-    if (riscv_test) {
-      if (1 == exitcode) {
-        std::cout << "Passed." << std::endl;
-        exitcode = 0;
-      } else {
-        std::cout << "Failed." << std::endl;
-      }
-    } else {
-      if (exitcode != 0) {
-        std::cout << "*** error: exitcode=" << exitcode << std::endl;
-      }
-    }
-  }  
+    // run simulation
+    for (;;) {
+      SimPlatform::instance().step();
+      if (processor->check_exit(&exitcode))
+          break;
+    };    
+  }

  SimPlatform::instance().finalize();

+  if (riscv_test) {
+    if (1 == exitcode) {
+      std::cout << "Passed." << std::endl;
+      exitcode = 0;
+    } else {
+      std::cout << "Failed." << std::endl;
+    }
+  } else {
+    if (exitcode != 0) {
+      std::cout << "*** error: exitcode=" << exitcode << std::endl;
+    }
+  }  
+
  return exitcode;
 }
--- a/sim/simx/memsim.cpp
+++ b/sim/simx/memsim.cpp
@ -1,56 +1,99 @@
 #include "memsim.h"
 #include <vector>
 #include <queue>
+#include <stdlib.h>
+
+DISABLE_WARNING_PUSH
+DISABLE_WARNING_UNUSED_PARAMETER
+#define RAMULATOR
+#include <ramulator/src/Gem5Wrapper.h>
+#include <ramulator/src/Request.h>
+#include <ramulator/src/Statistics.h>
+DISABLE_WARNING_POP
+
 #include "constants.h"
+#include "types.h"

 using namespace vortex;

 class MemSim::Impl {
 private:
    MemSim* simobject_;
-    uint32_t num_banks_;
-    uint32_t latency_;
+    Config config_;
    PerfStats perf_stats_;
+    // DRAM timing model; allocated in the constructor and released in ~Impl().
+    ramulator::Gem5Wrapper* dram_;

 public:
-    Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) 
+
+    // Configures ramulator as single-rank DDR4 (speed "DDR4_2400R", org
+    // "DDR4_4Gb_x8") with config.channels channels and config.num_cores
+    // cores, and passes "ramulator.ddr4.log" to the statistics list.
+    Impl(MemSim* simobject, const Config& config) 
        : simobject_(simobject)
-        , num_banks_(num_banks)
-        , latency_(latency)  
-    {}
+        , config_(config)
+    {
+        ramulator::Config ram_config;
+        ram_config.add("standard", "DDR4");
+        ram_config.add("channels", std::to_string(config.channels));
+        ram_config.add("ranks", "1");
+        ram_config.add("speed", "DDR4_2400R");
+        ram_config.add("org", "DDR4_4Gb_x8");
+        ram_config.add("mapping", "defaultmapping");
+        ram_config.set_core_num(config.num_cores);
+        dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE);
+        Stats::statlist.output("ramulator.ddr4.log");
+    }
+
+    // Finishes the DRAM model and emits its statistics before freeing it.
+    ~Impl() {
+        dram_->finish();
+        Stats::statlist.printall();
+        delete dram_;
+    }

    const PerfStats& perf_stats() const {
        return perf_stats_;
    }

+    // Invoked by ramulator when a read completes: forwards a response that
+    // carries the original request tag and the requesting core id.
+    void dram_callback(ramulator::Request& req, uint32_t tag) {
+        MemRsp mem_rsp{tag, static_cast<uint32_t>(req.coreid)};
+        simobject_->MemRspPort.send(mem_rsp, 1);
+    }
+
    void step(uint64_t /*cycle*/) {
-        for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
-            auto& mem_req_port = simobject_->MemReqPorts.at(i); 
-            if (mem_req_port.empty())
-                continue;
-            auto& mem_req = mem_req_port.front();
-            if (!mem_req.write) {
-                MemRsp mem_rsp;
-                mem_rsp.tag = mem_req.tag;
-                simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
-                ++perf_stats_.reads;
-            } else {
-                ++perf_stats_.writes;
-            }
-            mem_req_port.pop();
+        // advance the DRAM model every cycle, even when no request is pending
+        dram_->tick();
+
+        if (simobject_->MemReqPort.empty())
+            return;
+
+        auto& mem_req = simobject_->MemReqPort.front();
+
+        if (mem_req.write) {
+            ramulator::Request dram_req(
+                mem_req.addr,
+                ramulator::Request::Type::WRITE,
+                mem_req.core_id
+            );
+            // send() reports whether the request was accepted; when it was
+            // not, leave the request at the head of the port and retry on the
+            // next cycle instead of dropping it
+            if (!dram_->send(dram_req))
+                return;
+            ++perf_stats_.writes;
+        } else {
+            ramulator::Request dram_req(
+                mem_req.addr,
+                ramulator::Request::Type::READ,
+                std::bind(&Impl::dram_callback, this, std::placeholders::_1, mem_req.tag),
+                mem_req.core_id
+            );
+            // a dropped read would never produce a response, so only consume
+            // the request once the DRAM model has accepted it
+            if (!dram_->send(dram_req))
+                return;
+            ++perf_stats_.reads;
        }
+
+        simobject_->MemReqPort.pop();
    }
 };

 ///////////////////////////////////////////////////////////////////////////////

-MemSim::MemSim(const SimContext& ctx, 
-               uint32_t num_banks,
-               uint32_t latency) 
+MemSim::MemSim(const SimContext& ctx, const Config& config) // config is forwarded unchanged to Impl
    : SimObject<MemSim>(ctx, "MemSim")
-    , MemReqPorts(num_banks, this) 
-    , MemRspPorts(num_banks, this)
-    , impl_(new Impl(this, num_banks, latency))
+    , MemReqPort(this) // single request port; replaces the per-bank MemReqPorts vector
+    , MemRspPort(this) // single response port; replaces the per-bank MemRspPorts vector
+    , impl_(new Impl(this, config)) // Impl builds the ramulator DRAM model from config
 {}

 MemSim::~MemSim() {
--- a/sim/simx/memsim.h
+++ b/sim/simx/memsim.h
@ -8,6 +8,11 @@ namespace vortex {

 class MemSim : public SimObject<MemSim>{
 public:
+    struct Config {        // construction parameters consumed by MemSim::Impl
+        uint32_t channels;      // written to ramulator's "channels" setting
+        uint32_t num_cores; // passed to ramulator::Config::set_core_num
+    };
+
    struct PerfStats {
        uint64_t reads;
        uint64_t writes;
@ -18,10 +23,10 @@ public:
        {}
    };

-    std::vector<SimPort<MemReq>> MemReqPorts;
-    std::vector<SimPort<MemRsp>> MemRspPorts;
+    SimPort<MemReq> MemReqPort;
+    SimPort<MemRsp> MemRspPort;

-    MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency);
+    MemSim(const SimContext& ctx, const Config& config);
    ~MemSim();

    void step(uint64_t cycle);
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@ -3,147 +3,173 @@

 using namespace vortex;

-Processor::Processor(const ArchDef& arch) 
-  : cores_(arch.num_cores())
-  , l2caches_(NUM_CLUSTERS)
-  , l2_mem_switches_(NUM_CLUSTERS)
-{
-  uint32_t num_cores = arch.num_cores();
-  uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; 
+class Processor::Impl { // private implementation: builds the cores and the L2/L3 cache or switch hierarchy
+private:
+  Processor* simobject_; // owning Processor; its MemReqPort/MemRspPort form the top of the hierarchy
+  std::vector<Core::Ptr> cores_;
+  std::vector<Cache::Ptr> l2caches_; // one slot per cluster, filled only when L2_ENABLE
+  std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_; // one slot per cluster, filled only when L2 is disabled
+  Cache::Ptr l3cache_; // created only when L3_ENABLE
+  Switch<MemReq, MemRsp>::Ptr l3_mem_switch_; // created only when L3 is disabled and NUM_CLUSTERS > 1

-  // create cores
-  for (uint32_t i = 0; i < num_cores; ++i) {
-      cores_.at(i) = Core::Create(arch, i);
-  }
+public:
+  Impl(Processor* simobject, const ArchDef& arch) 
+    : simobject_(simobject)
+    , cores_(arch.num_cores())
+    , l2caches_(NUM_CLUSTERS)
+    , l2_mem_switches_(NUM_CLUSTERS)
+  {
+    uint32_t num_cores = arch.num_cores();
+    uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; // integer division; NOTE(review): presumably num_cores is a multiple of NUM_CLUSTERS - confirm

-  // connect memory sub-systen
-  memsim_ = MemSim::Create(1, MEM_LATENCY);
-  std::vector<SimPort<MemReq>*> mem_req_ports(1); 
-  std::vector<SimPort<MemRsp>*> mem_rsp_ports(1);
-
-  mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
-  mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
-
-  if (L3_ENABLE) {
-    l3cache_ = Cache::Create("l3cache", Cache::Config{
-      log2ceil(L3_CACHE_SIZE),  // C
-      log2ceil(MEM_BLOCK_SIZE), // B
-      2,                      // W
-      0,                      // A
-      32,                    // address bits    
-      L3_NUM_BANKS,           // number of banks
-      L3_NUM_PORTS,           // number of ports
-      NUM_CLUSTERS,           // request size   
-      true,                   // write-through
-      false,                  // write response
-      0,                      // victim size
-      L3_MSHR_SIZE,           // mshr
-      2,                      // pipeline latency
-      }
-    );
-      
-    mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
-    l3cache_->MemReqPort.bind(mem_req_ports.at(0));
-
-    mem_req_ports.resize(NUM_CLUSTERS);
-    mem_rsp_ports.resize(NUM_CLUSTERS);
-
-    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-      mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
-      mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
+    // create cores
+    for (uint32_t i = 0; i < num_cores; ++i) {
+        cores_.at(i) = Core::Create(arch, i);
    }
-  } else if (NUM_CLUSTERS > 1) {
-    l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
-    mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
-    l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));      
+    
+    std::vector<SimPort<MemReq>*> mem_req_ports(1); // downstream request ports seen by the current level
+    std::vector<SimPort<MemRsp>*> mem_rsp_ports(1); // matching response ports

-    mem_req_ports.resize(NUM_CLUSTERS);
-    mem_rsp_ports.resize(NUM_CLUSTERS);
+    mem_req_ports.at(0) = &simobject_->MemReqPort; // start from the Processor's external ports
+    mem_rsp_ports.at(0) = &simobject_->MemRspPort;

-    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-      mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
-      mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
-    }
-  }
-
-  for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {  
-    std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster); 
-    std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
-
-    if (L2_ENABLE) {
-      auto& l2cache = l2caches_.at(i);
-      l2cache = Cache::Create("l2cache", Cache::Config{
-        log2ceil(L2_CACHE_SIZE),  // C
+    if (L3_ENABLE) {
+      l3cache_ = Cache::Create("l3cache", Cache::Config{
+        log2ceil(L3_CACHE_SIZE),  // C
        log2ceil(MEM_BLOCK_SIZE), // B
        2,                      // W
        0,                      // A
-        32,                     // address bits    
-        L2_NUM_BANKS,           // number of banks
-        L2_NUM_PORTS,           // number of ports
-        (uint8_t)cores_per_cluster, // request size   
+        32,                    // address bits  
+        L3_NUM_BANKS,           // number of banks
+        L3_NUM_PORTS,           // number of ports
+        NUM_CLUSTERS,           // request size 
        true,                   // write-through
        false,                  // write response
        0,                      // victim size
-        L2_MSHR_SIZE,           // mshr
+        L3_MSHR_SIZE,           // mshr
        2,                      // pipeline latency
-      });
+        }
+      );        
+      l3cache_->MemReqPort.bind(mem_req_ports.at(0));
+      mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);

-      mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
-      l2cache->MemReqPort.bind(mem_req_ports.at(i));
+      mem_req_ports.resize(NUM_CLUSTERS); // from here on there is one downstream port per cluster
+      mem_rsp_ports.resize(NUM_CLUSTERS);

-      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-        cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
-        cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
+      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+        mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
+        mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
      }
-    } else {
-      auto& l2_mem_switch = l2_mem_switches_.at(i);
-      l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
+    } else if (NUM_CLUSTERS > 1) {
+      l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
+      l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));      
+      mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);

-      mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
-      l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
+      mem_req_ports.resize(NUM_CLUSTERS);
+      mem_rsp_ports.resize(NUM_CLUSTERS);

-      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-        cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
-        cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
+      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+        mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
+        mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
      }
    }

-    for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-      auto& core = cores_.at((i * cores_per_cluster) + j);        
-      cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
-      core->MemReqPort.bind(cluster_mem_req_ports.at(j));
+    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+      std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster); 
+      std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
+
+      if (L2_ENABLE) {
+        auto& l2cache = l2caches_.at(i);
+        l2cache = Cache::Create("l2cache", Cache::Config{
+          log2ceil(L2_CACHE_SIZE),  // C
+          log2ceil(MEM_BLOCK_SIZE), // B
+          2,                      // W
+          0,                      // A
+          32,                     // address bits  
+          L2_NUM_BANKS,           // number of banks
+          L2_NUM_PORTS,           // number of ports
+          (uint8_t)cores_per_cluster, // request size 
+          true,                   // write-through
+          false,                  // write response
+          0,                      // victim size
+          L2_MSHR_SIZE,           // mshr
+          2,                      // pipeline latency
+        });
+        l2cache->MemReqPort.bind(mem_req_ports.at(i));
+        mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
+
+        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+          cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
+          cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
+        }
+      } else {
+        auto& l2_mem_switch = l2_mem_switches_.at(i);
+        l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
+        l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
+        mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
+
+        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+          cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
+          cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
+        }
+      }
+
+      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+        auto& core = cores_.at((i * cores_per_cluster) + j); // cores are assigned to clusters in contiguous index ranges
+        core->MemReqPort.bind(cluster_mem_req_ports.at(j));
+        cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
+      }
    }
  }
-}

-void Processor::attach_ram(RAM* ram) {
-  for (auto core : cores_) {
-    core->attach_ram(ram);
+  ~Impl() {}
+
+  void step(uint64_t cycle) { // no per-cycle work is done at this level
+    __unused (cycle);
  }
-}

-Processor::~Processor() {}
+  void attach_ram(RAM* ram) { // hands the same RAM pointer to every core
+    for (auto core : cores_) {
+      core->attach_ram(ram);
+    }
+  }

-int Processor::run() {
-  bool running;
-  int exitcode = 0;
-  do {
-    SimPlatform::instance().step();
-    
-    running = false;
+  bool check_exit(int* exitcode) { // true when the simulation should stop; *exitcode is written only on a core-reported exit
+    bool running = false;
    for (auto& core : cores_) {
      if (core->running()) {
        running = true;
      }
      if (core->check_exit()) {
-        exitcode = core->getIRegValue(3);
-        running = false;
-        break;
+        *exitcode = core->getIRegValue(3); // exit code is read from integer register 3 of the exiting core
+        return true;
      }
    }
-  } while (running);
+    return !running; // also stop once no core reports running
+  }
+};

-  std::cout << std::flush;
+///////////////////////////////////////////////////////////////////////////////

-  return exitcode;
+Processor::Processor(const SimContext& ctx, const ArchDef& arch) // registers the simulation object under the name "Vortex"
+  : SimObject<Processor>(ctx, "Vortex")
+  , MemReqPort(this) 
+  , MemRspPort(this)
+  , impl_(new Impl(this, arch)) // Impl's constructor takes the addresses of MemReqPort/MemRspPort through `this`
+{}
+
+Processor::~Processor() { // releases the Impl allocated in the constructor
+  delete impl_;
+}
+
+void Processor::attach_ram(RAM* mem) { // forwards to Impl, which attaches mem to every core
+  impl_->attach_ram(mem);
+}
+
+bool Processor::check_exit(int* exitcode) { // true when a core reported exit (its code is stored in *exitcode) or no core is running
+  return impl_->check_exit(exitcode);
+}
+
+void Processor::step(uint64_t cycle) { // forwards to Impl::step, which does not use cycle
+  impl_->step(cycle);
 }
--- a/sim/simx/processor.h
+++ b/sim/simx/processor.h
@ -4,24 +4,23 @@

 namespace vortex {

-class Processor {
+class Processor : public SimObject<Processor> { // now a simulation object with its own external memory ports
 public:
-  typedef std::shared_ptr<Processor> Ptr;
+  SimPort<MemReq> MemReqPort; // request port at the top of the processor's memory hierarchy
+  SimPort<MemRsp> MemRspPort; // response port paired with MemReqPort
  
-  Processor(const ArchDef& arch);
+  Processor(const SimContext& ctx, const ArchDef& arch);
  ~Processor();

  void attach_ram(RAM* mem);

-  int run();
+  bool check_exit(int* exitcode); // replaces run(): the caller steps the platform and polls this
+
+  void step(uint64_t cycle);

 private:
-  std::vector<Core::Ptr> cores_;  
-  std::vector<Cache::Ptr> l2caches_;  
-  std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
-  Cache::Ptr l3cache_;
-  Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
-  MemSim::Ptr memsim_;
+  class Impl; // defined in processor.cpp
+  Impl* impl_; // allocated in the constructor, deleted in the destructor
 };

 }
--- a/sim/simx/sharedmem.h
+++ b/sim/simx/sharedmem.h
@ -65,8 +65,7 @@ public:

            if (!core_req.write || config_.write_reponse) {
                // send response
-                MemRsp core_rsp;
-                core_rsp.tag = core_req.tag;
+                MemRsp core_rsp{core_req.tag, core_req.core_id};
                this->Outputs.at(req_id).send(core_rsp, 1);
            }

--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@ -70,6 +70,7 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
 enum class AluType {
  ARITH,
  BRANCH,
+  SYSCALL,
  IMUL,
  IDIV,    
  CMOV,
@ -77,11 +78,12 @@ enum class AluType {

 inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
  switch (type) {
-  case AluType::ARITH:  os << "ARITH"; break;
-  case AluType::BRANCH: os << "BRANCH"; break;
-  case AluType::IMUL:   os << "IMUL"; break;
-  case AluType::IDIV:   os << "IDIV"; break;
-  case AluType::CMOV:   os << "CMOV"; break;
+  case AluType::ARITH:   os << "ARITH"; break;
+  case AluType::BRANCH:  os << "BRANCH"; break;
+  case AluType::SYSCALL: os << "SYSCALL"; break; // printer entry for the SYSCALL enumerator added to AluType
+  case AluType::IMUL:    os << "IMUL"; break;
+  case AluType::IDIV:    os << "IDIV"; break;
+  case AluType::CMOV:    os << "CMOV"; break;
  }
  return os;
 }
@ -207,24 +209,31 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {

 struct MemReq {
    uint64_t addr;
-    uint32_t tag;
    bool write;
-    bool is_io;
+    bool non_cacheable; // takes the place of the former is_io flag
+    uint32_t tag; // NOTE(review): the constructor accepts _tag as uint64_t, so it is narrowed to 32 bits here
+    uint32_t core_id;    // id of the requesting core; MemSim hands it to ramulator as the request's core id

    MemReq(uint64_t _addr = 0, 
+           bool _write = false, // parameter order changed: write and non_cacheable now precede the tag
+           bool _non_cacheable = false,
           uint64_t _tag = 0, 
-           bool _write = false, 
-           bool _is_io = false
+           uint32_t _core_id = 0
    )   : addr(_addr)
-        , tag(_tag)
        , write(_write)
-        , is_io(_is_io) 
+        , non_cacheable(_non_cacheable)
+        , tag(_tag)
+        , core_id(_core_id)
    {}
 };

 struct MemRsp {
    uint64_t tag;    
-    MemRsp(uint64_t _tag = 0) : tag (_tag) {}
+    uint32_t core_id; // id of the core the response belongs to, copied from the originating request
+    MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0) // both arguments default to 0
+      : tag (_tag) 
+      , core_id(_core_id)
+    {}
 };

 ///////////////////////////////////////////////////////////////////////////////
--- a/sim/vlsim/Makefile
+++ b/sim/vlsim/Makefile
@ -1,3 +1,4 @@
+DESTDIR ?= .
 RTL_DIR = ../../hw/rtl
 DPI_DIR = ../../hw/dpi
 SCRIPT_DIR = ../../hw/scripts
@ -7,8 +8,10 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
 CXXFLAGS += -fPIC -Wno-maybe-uninitialized
 CXXFLAGS += -I.. -I../../../hw -I../../common
 CXXFLAGS += -I../$(THIRD_PARTY_DIR)/softfloat/source/include
+CXXFLAGS += -I../$(THIRD_PARTY_DIR)

 LDFLAGS += -shared ../$(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a
+LDFLAGS += -L../$(THIRD_PARTY_DIR)/ramulator -lramulator

 # control RTL debug tracing states
 DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE  
@ -87,22 +90,15 @@ VL_FLAGS += -DIDIV_DPI
 FPU_CORE ?= FPU_DPI
 VL_FLAGS += -D$(FPU_CORE)

-PROJECT = libopae-c-vlsim
+PROJECT = libopae-c-vlsim.so
 
-all: $(PROJECT).so
+# Build the library at its final location. Depending on the same path as the
+# build rule below keeps `make all` working when DESTDIR is overridden.
+all: $(DESTDIR)/$(PROJECT)
 
 vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
 	$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h
 
-$(PROJECT).so: $(SRCS) vortex_afu.h
-	verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT).so
+# NOTE(review): the output path is prefixed with ../, so DESTDIR is presumably
+# expected to be relative to this directory - confirm before passing an
+# absolute path.
+$(DESTDIR)/$(PROJECT): $(SRCS) vortex_afu.h
+	verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(DESTDIR)/$(PROJECT)
 
-static: $(SRCS) vortex_afu.h
-	verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)'
-	$(AR) rcs $(PROJECT).a obj_dir/*.o $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/*.o
-
-clean-static:
-	rm -rf $(PROJECT).a obj_dir vortex_afu.h
-
-clean: clean-static
-	rm -rf $(PROJECT).so
+# Remove the verilator build directory and the built library.
+clean:
+	rm -rf obj_dir $(DESTDIR)/$(PROJECT)
--- a/sim/vlsim/opae_sim.cpp
+++ b/sim/vlsim/opae_sim.cpp
@ -13,6 +13,31 @@
 #include <iomanip>
 #include <mem.h>

+#define RAMULATOR
+#include <ramulator/src/Gem5Wrapper.h>
+#include <ramulator/src/Request.h>
+#include <ramulator/src/Statistics.h>
+
+#include <VX_config.h>
+#include <vortex_afu.h>
+
+#include <future>
+#include <list>
+#include <unordered_map>
+
+#ifndef MEMORY_BANKS 
+  #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
+    #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS
+  #else
+    #define MEMORY_BANKS 2
+  #endif
+#endif
+
+#undef MEM_BLOCK_SIZE
+#define MEM_BLOCK_SIZE    (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
+
+#define CACHE_BLOCK_SIZE  64
+
 #define CCI_LATENCY  8
 #define CCI_RAND_MOD 8
 #define CCI_RQ_SIZE 16
@ -28,18 +53,6 @@
 #define TRACE_STOP_TIME -1ull
 #endif

-#ifndef MEM_LATENCY
-#define MEM_LATENCY 24
-#endif
-
-#ifndef MEM_RQ_SIZE
-#define MEM_RQ_SIZE 16
-#endif
-
-#ifndef MEM_STALLS_MODULO
-#define MEM_STALLS_MODULO 16
-#endif
-
 #ifndef VERILATOR_RESET_VALUE
 #define VERILATOR_RESET_VALUE 2
 #endif
@ -88,357 +101,417 @@ void sim_trace_enable(bool enable) {

 ///////////////////////////////////////////////////////////////////////////////

-namespace vortex {
-class VL_OBJ {
+class opae_sim::Impl {
 public:
-#ifdef AXI_BUS
-  VVortex_axi *device;
-#else
-  Vvortex_afu_shim *device;
-#endif
-#ifdef VCD_OUTPUT
-  VerilatedVcdC *trace;
-#endif
-
-  VL_OBJ() {
+  // Creates the RTL device, optional VCD trace, backing RAM and the ramulator
+  // DRAM model, resets the device, then starts the background thread that
+  // keeps stepping the simulation until stop_ is set.
+  Impl()
+  : stop_(false)
+  , host_buffer_ids_(0) {
    // force random values for unitialized signals  
    Verilated::randReset(VERILATOR_RESET_VALUE);
    Verilated::randSeed(50);

-    // Turn off assertion before reset
+    // turn off assertion before reset
    Verilated::assertOn(false);

-  #ifdef AXI_BUS
-    this->device = new Vvortex_afu_shim();
-  #else
-    this->device = new Vvortex_afu_shim();
-  #endif
+    // create RTL module instance
+    device_ = new Vvortex_afu_shim();

  #ifdef VCD_OUTPUT
    Verilated::traceEverOn(true);
-    this->trace = new VerilatedVcdC();
-    this->device->trace(this->trace, 99);
-    this->trace->open("trace.vcd");
+    trace_ = new VerilatedVcdC();
+    // attach the tracer through the renamed trace_ member
+    device_->trace(trace_, 99);
+    trace_->open("trace.vcd");
  #endif
+
+    ram_ = new RAM(RAM_PAGE_SIZE);
+
+    // initialize dram simulator
+    ramulator::Config ram_config;
+    ram_config.add("standard", "DDR4");
+    ram_config.add("channels", std::to_string(MEMORY_BANKS));
+    ram_config.add("ranks", "1");
+    ram_config.add("speed", "DDR4_2400R");
+    ram_config.add("org", "DDR4_4Gb_x8");
+    ram_config.add("mapping", "defaultmapping");
+    ram_config.set_core_num(1);
+    dram_ = new ramulator::Gem5Wrapper(ram_config, MEM_BLOCK_SIZE);
+    Stats::statlist.output("ramulator.ddr4.log");
+
+    // reset the device
+    this->reset();
+
+    // launch execution thread; each iteration holds mutex_ so the MMIO
+    // accessors, which take the same mutex, never step concurrently with it
+    future_ = std::async(std::launch::async, [&]{                 
+        while (!stop_) {
+            std::lock_guard<std::mutex> guard(mutex_);
+            this->step();
+        }
+    }); 
  }

-  ~VL_OBJ() {
+  ~Impl() {  // stops the execution thread, then releases host buffers, trace, device, RAM and the DRAM model
+    stop_ = true; // polled by the loop launched in the constructor
+    if (future_.valid()) {
+      future_.wait(); // block until that loop has returned
+    } 
+    for (auto& buffer : host_buffers_) { // free buffers the client never released
+      __aligned_free(buffer.second.data);
+    }   
  #ifdef VCD_OUTPUT
-    this->trace->close();
-    delete this->trace;
+    trace_->close();
+    delete trace_;
  #endif
-    delete this->device;
-  }
-};
-}
+    delete device_;
+    
+    delete ram_;

-///////////////////////////////////////////////////////////////////////////////
-
-opae_sim::opae_sim() 
-  : stop_(false)
-  , host_buffer_ids_(0) {  
-  vl_obj_ = new VL_OBJ();
-  ram_ = new RAM(RAM_PAGE_SIZE);
-
-  // reset the device
-  this->reset();
-
-  // launch execution thread
-  future_ = std::async(std::launch::async, [&]{                   
-      while (!stop_) {
-          std::lock_guard<std::mutex> guard(mutex_);
-          this->step();
-      }
-  }); 
-}
-
-opae_sim::~opae_sim() {  
-  stop_ = true;
-  if (future_.valid()) {
-    future_.wait();
-  } 
-  for (auto& buffer : host_buffers_) {
-    __aligned_free(buffer.second.data);
-  }   
-  delete vl_obj_;
-  delete ram_;
-}
-
-int opae_sim::prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
-  auto alloc = __aligned_malloc(CACHE_BLOCK_SIZE, len);
-  if (alloc == NULL)
-    return -1;
-  host_buffer_t buffer;
-  buffer.data   = (uint64_t*)alloc;
-  buffer.size   = len;
-  buffer.ioaddr = uintptr_t(alloc); 
-  auto buffer_id = host_buffer_ids_++;
-  host_buffers_.emplace(buffer_id, buffer);
-  *buf_addr = alloc;
-  *wsid = buffer_id;
-  return 0;
-}
-
-void opae_sim::release_buffer(uint64_t wsid) {
-  auto it = host_buffers_.find(wsid);
-  if (it != host_buffers_.end()) {
-    __aligned_free(it->second.data);
-    host_buffers_.erase(it);
-  }
-}
-
-void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) {
-  *ioaddr = host_buffers_[wsid].ioaddr;
-}
-
-void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
-  std::lock_guard<std::mutex> guard(mutex_);
-
-  vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid = 1;
-  vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
-  vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
-  vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
-  this->step();  
-  vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid = 0;
-  assert(vl_obj_->device->af2cp_sTxPort_c2_mmioRdValid);  
-  *value = vl_obj_->device->af2cp_sTxPort_c2_data;
-}
-
-void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
-  std::lock_guard<std::mutex> guard(mutex_);
-  
-  vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid = 1;  
-  vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
-  vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
-  vl_obj_->device->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
-  memcpy(vl_obj_->device->vcp2af_sRxPort_c0_data, &value, 8);
-  this->step();
-  vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid = 0;
-}
-
-///////////////////////////////////////////////////////////////////////////////
-
-void opae_sim::reset() {  
-  cci_reads_.clear();
-  cci_writes_.clear();
-  vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid = 0;
-  vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid = 0;
-  vl_obj_->device->vcp2af_sRxPort_c0_rspValid = 0;  
-  vl_obj_->device->vcp2af_sRxPort_c1_rspValid = 0;  
-  vl_obj_->device->vcp2af_sRxPort_c0_TxAlmFull = 0;
-  vl_obj_->device->vcp2af_sRxPort_c1_TxAlmFull = 0;
-
-  for (int b = 0; b < MEMORY_BANKS; ++b) {
-    mem_reads_[b].clear();
-    vl_obj_->device->avs_readdatavalid[b] = 0;  
-    vl_obj_->device->avs_waitrequest[b] = 0;
+    if (dram_) {
+      dram_->finish(); // finalize the DRAM model before its statistics are emitted
+      Stats::statlist.printall();
+      delete dram_;
+    }
  }

-  vl_obj_->device->reset = 1;
+  // Allocates a CACHE_BLOCK_SIZE-aligned host buffer of len bytes and
+  // registers it under a fresh workspace id. On success stores the buffer
+  // pointer in *buf_addr and the id in *wsid and returns 0; returns -1 when
+  // the allocation fails. The flags argument is not used.
+  int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
+    auto mem = __aligned_malloc(CACHE_BLOCK_SIZE, len);
+    if (!mem) {
+      return -1;
+    }
+
+    // the simulated I/O address is simply the host pointer value
+    host_buffer_t entry;
+    entry.data   = (uint64_t*)mem;
+    entry.size   = len;
+    entry.ioaddr = uintptr_t(mem);
+
+    auto id = host_buffer_ids_++;
+    host_buffers_.emplace(id, entry);
+
+    *buf_addr = mem;
+    *wsid = id;
+    return 0;
+  }

-  for (int i = 0; i < RESET_DELAY; ++i) {
-    vl_obj_->device->clk = 0;
+  // Frees and unregisters the host buffer stored under wsid; unknown ids
+  // are ignored.
+  void release_buffer(uint64_t wsid) {
+    auto entry = host_buffers_.find(wsid);
+    if (entry == host_buffers_.end())
+      return;
+    __aligned_free(entry->second.data);
+    host_buffers_.erase(entry);
+  }
+
+  // Stores the I/O address of the host buffer registered under wsid in
+  // *ioaddr. An unknown id yields 0; it is looked up with find() so that a
+  // miss does not insert a placeholder entry into host_buffers_, as
+  // operator[] would.
+  void get_io_address(uint64_t wsid, uint64_t *ioaddr) {
+    auto it = host_buffers_.find(wsid);
+    *ioaddr = (it != host_buffers_.end()) ? it->second.ioaddr : 0;
+  }
+
+  // Performs one MMIO read: drives the request header for a single step,
+  // then stores the device's response data in *value. mmio_num is not used.
+  // Holds mutex_ for the whole transaction.
+  void read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
+    std::lock_guard<std::mutex> lock(mutex_);
+
+    // request header (the address field is offset / 4)
+    device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
+    device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
+    device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
+
+    // assert the read strobe for exactly one step
+    device_->vcp2af_sRxPort_c0_mmioRdValid = 1;
+    this->step();
+    device_->vcp2af_sRxPort_c0_mmioRdValid = 0;
+
+    // the response is expected to be valid right after that step
+    assert(device_->af2cp_sTxPort_c2_mmioRdValid);
+    *value = device_->af2cp_sTxPort_c2_data;
+  }
+
+  // Performs one MMIO write: drives the request header and the 8 bytes of
+  // value for a single step. mmio_num is not used. Holds mutex_ for the
+  // whole transaction.
+  void write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
+    std::lock_guard<std::mutex> lock(mutex_);
+
+    // request header (the address field is offset / 4) and payload
+    device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
+    device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
+    device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
+    memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8);
+
+    // assert the write strobe for exactly one step
+    device_->vcp2af_sRxPort_c0_mmioWrValid = 1;
+    this->step();
+    device_->vcp2af_sRxPort_c0_mmioWrValid = 0;
+  }
+
+private:
+
+  // Clears all pending CCI and memory requests, drives the host-side inputs
+  // low, and holds the device in reset for RESET_DELAY clock cycles.
+  void reset() {
+    cci_reads_.clear();
+    cci_writes_.clear();
+
+    // MMIO strobes
+    device_->vcp2af_sRxPort_c0_mmioRdValid = 0;
+    device_->vcp2af_sRxPort_c0_mmioWrValid = 0;
+    // CCI response strobes
+    device_->vcp2af_sRxPort_c0_rspValid = 0;
+    device_->vcp2af_sRxPort_c1_rspValid = 0;
+    // CCI almost-full flags
+    device_->vcp2af_sRxPort_c0_TxAlmFull = 0;
+    device_->vcp2af_sRxPort_c1_TxAlmFull = 0;
+
+    // per-bank memory interface
+    for (int bank = 0; bank < MEMORY_BANKS; ++bank) {
+      pending_mem_reqs_[bank].clear();
+      device_->avs_readdatavalid[bank] = 0;
+      device_->avs_waitrequest[bank] = 0;
+    }
+
+    device_->reset = 1;
+
+    // toggle the clock with reset asserted
+    for (int cycle = 0; cycle < RESET_DELAY; ++cycle) {
+      device_->clk = 0;
+      this->eval();
+      device_->clk = 1;
+      this->eval();
+    }
+
+    device_->reset = 0;
+
+    // Turn on assertion after reset
+    Verilated::assertOn(true);
+  }
+
+  void step() { // one simulation cycle: service the bus models, clock the device, then tick the DRAM model
+    this->sRxPort_bus();
+    this->sTxPort_bus();
+    this->avs_bus();
+        
+    device_->clk = 0; // falling edge
    this->eval();
-    vl_obj_->device->clk = 1;
+    device_->clk = 1; // rising edge
    this->eval();
-  }  

-  vl_obj_->device->reset = 0;
-  
-  // Turn on assertion after reset
-  Verilated::assertOn(true);
-}
+    dram_->tick(); // the DRAM model advances once per device clock cycle

-void opae_sim::step() {
-  this->sRxPort_bus();
-  this->sTxPort_bus();
-  this->avs_bus();
-  
-  vl_obj_->device->clk = 0;
-  this->eval();
-  vl_obj_->device->clk = 1;
-  this->eval();
-
-#ifndef NDEBUG
-  fflush(stdout);
-#endif
-}
-
-void opae_sim::eval() {  
-  vl_obj_->device->eval();
-#ifdef VCD_OUTPUT
-  if (sim_trace_enabled()) {
-    vl_obj_->trace->dump(timestamp);
-  }
-#endif
-  ++timestamp;
-}
-
-void opae_sim::sRxPort_bus() {      
-  // check mmio request
-  bool mmio_req_enabled = vl_obj_->device->vcp2af_sRxPort_c0_mmioRdValid
-                       || vl_obj_->device->vcp2af_sRxPort_c0_mmioWrValid;
-
-  // schedule CCI read responses
-  std::list<cci_rd_req_t>::iterator cci_rd_it(cci_reads_.end());
-  for (auto it = cci_reads_.begin(), ie = cci_reads_.end(); it != ie; ++it) {
-    if (it->cycles_left > 0)
-      it->cycles_left -= 1;
-    if ((cci_rd_it == ie) && (it->cycles_left == 0)) {
-      cci_rd_it = it;
-    }
+  #ifndef NDEBUG
+    fflush(stdout); // flush stdout every step in debug builds
+  #endif
  }

-  // schedule CCI write responses
-  std::list<cci_wr_req_t>::iterator cci_wr_it(cci_writes_.end());
-  for (auto it = cci_writes_.begin(), ie = cci_writes_.end(); it != ie; ++it) {
-    if (it->cycles_left > 0)
-      it->cycles_left -= 1;
-    if ((cci_wr_it == ie) && (it->cycles_left == 0)) {
-      cci_wr_it = it;
-    }
-  }
-
-  // send CCI write response  
-  vl_obj_->device->vcp2af_sRxPort_c1_rspValid = 0;  
-  if (cci_wr_it != cci_writes_.end()) {
-    vl_obj_->device->vcp2af_sRxPort_c1_rspValid = 1;
-    vl_obj_->device->vcp2af_sRxPort_c1_hdr_resp_type = 0;
-    vl_obj_->device->vcp2af_sRxPort_c1_hdr_mdata = cci_wr_it->mdata;
-    cci_writes_.erase(cci_wr_it);
-  }
-
-  // send CCI read response (ensure mmio disabled) 
-  vl_obj_->device->vcp2af_sRxPort_c0_rspValid = 0;  
-  if (!mmio_req_enabled 
-   && (cci_rd_it != cci_reads_.end())) {
-    vl_obj_->device->vcp2af_sRxPort_c0_rspValid = 1;
-    vl_obj_->device->vcp2af_sRxPort_c0_hdr_resp_type = 0;
-    memcpy(vl_obj_->device->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE);
-    vl_obj_->device->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata;    
-    /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata);
-    for (int i = 0; i < CACHE_BLOCK_SIZE; ++i)
-      printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]);
-    printf("\n");*/
-    cci_reads_.erase(cci_rd_it);
-  }
-}
-  
-void opae_sim::sTxPort_bus() {
-  // process read requests
-  if (vl_obj_->device->af2cp_sTxPort_c0_valid) {
-    assert(!vl_obj_->device->vcp2af_sRxPort_c0_TxAlmFull);
-    cci_rd_req_t cci_req;
-    cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD);     
-    cci_req.addr = vl_obj_->device->af2cp_sTxPort_c0_hdr_address;
-    cci_req.mdata = vl_obj_->device->af2cp_sTxPort_c0_hdr_mdata;
-    auto host_ptr = (uint64_t*)(vl_obj_->device->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
-    memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE);
-    //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vl_obj_->device->af2cp_sTxPort_c0_hdr_address, cci_req.mdata);
-    cci_reads_.emplace_back(cci_req);    
-  }
-
-  // process write requests
-  if (vl_obj_->device->af2cp_sTxPort_c1_valid) {
-    assert(!vl_obj_->device->vcp2af_sRxPort_c1_TxAlmFull);
-    cci_wr_req_t cci_req;
-    cci_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD);
-    cci_req.mdata = vl_obj_->device->af2cp_sTxPort_c1_hdr_mdata;
-    auto host_ptr = (uint64_t*)(vl_obj_->device->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE);
-    memcpy(host_ptr, vl_obj_->device->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE);
-    cci_writes_.emplace_back(cci_req);
-  } 
-
-  // check queues overflow
-  vl_obj_->device->vcp2af_sRxPort_c0_TxAlmFull = (cci_reads_.size() >= (CCI_RQ_SIZE-1));
-  vl_obj_->device->vcp2af_sRxPort_c1_TxAlmFull = (cci_writes_.size() >= (CCI_WQ_SIZE-1));
-}
-  
-void opae_sim::avs_bus() {
-  for (int b = 0; b < MEMORY_BANKS; ++b) {
-    // update memory responses schedule
-    for (auto& rsp : mem_reads_[b]) {
-      if (rsp.cycles_left > 0)
-        rsp.cycles_left -= 1;
-    }
-
-    // schedule memory responses in FIFO order
-    std::list<mem_rd_req_t>::iterator mem_rd_it(mem_reads_[b].end());
-    if (!mem_reads_[b].empty() 
-    && (0 == mem_reads_[b].begin()->cycles_left)) {
-        mem_rd_it = mem_reads_[b].begin();
-    }
-
-    // send memory response  
-    vl_obj_->device->avs_readdatavalid[b] = 0;  
-    if (mem_rd_it != mem_reads_[b].end()) {
-      vl_obj_->device->avs_readdatavalid[b] = 1;
-      memcpy(vl_obj_->device->avs_readdata[b], mem_rd_it->data.data(), MEM_BLOCK_SIZE);
-      uint32_t addr = mem_rd_it->addr;
-      mem_reads_[b].erase(mem_rd_it);
-      /*printf("%0ld: [sim] MEM Rd Rsp: bank=%d, addr=%x, pending={", timestamp, b, addr * MEM_BLOCK_SIZE);
-      for (auto& req : mem_reads_[b]) {
-        if (req.cycles_left != 0) 
-          printf(" !%0x", req.addr * MEM_BLOCK_SIZE);
-        else
-          printf(" %0x", req.addr * MEM_BLOCK_SIZE);
-      }
-      printf("}\n");*/
-    }
-
-    // handle memory stalls
-    bool mem_stalled = false;
-  #ifdef ENABLE_MEM_STALLS
-    if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { 
-      mem_stalled = true;
-    } else
-    if (mem_reads_[b].size() >= MEM_RQ_SIZE) {
-      mem_stalled = true;
+  // Evaluates the Verilated device model once, dumps a VCD sample when
+  // tracing is compiled in (VCD_OUTPUT) and currently enabled, then advances
+  // the simulation timestamp by one.
+  void eval() {  
+    device_->eval();
+  #ifdef VCD_OUTPUT
+    if (sim_trace_enabled()) {
+      trace_->dump(timestamp);
    }
  #endif
+    ++timestamp;
+  }

-    // process memory requests
-    if (!mem_stalled) {
-      assert(!vl_obj_->device->avs_read[b] || !vl_obj_->device->avs_write[b]);
-      if (vl_obj_->device->avs_write[b]) {           
-        uint64_t byteen = vl_obj_->device->avs_byteenable[b];
-        unsigned base_addr = vl_obj_->device->avs_address[b] * MEM_BLOCK_SIZE;
-        uint8_t* data = (uint8_t*)(vl_obj_->device->avs_writedata[b]);
+  // Drives the CCI receive port for one cycle: ages every queued CCI request
+  // and returns at most one write response and one read response. A read
+  // response is held back while an MMIO read/write request is being presented.
+  void sRxPort_bus() {
+    // an active MMIO request blocks the read response below
+    const bool mmio_active = device_->vcp2af_sRxPort_c0_mmioRdValid
+                          || device_->vcp2af_sRxPort_c0_mmioWrValid;
+
+    // age pending reads; remember the first one whose delay has elapsed
+    auto rd_ready = cci_reads_.end();
+    for (auto rd = cci_reads_.begin(); rd != cci_reads_.end(); ++rd) {
+      if (rd->cycles_left > 0) {
+        --rd->cycles_left;
+      }
+      if ((0 == rd->cycles_left) && (rd_ready == cci_reads_.end())) {
+        rd_ready = rd;
+      }
+    }
+
+    // age pending writes; remember the first one whose delay has elapsed
+    auto wr_ready = cci_writes_.end();
+    for (auto wr = cci_writes_.begin(); wr != cci_writes_.end(); ++wr) {
+      if (wr->cycles_left > 0) {
+        --wr->cycles_left;
+      }
+      if ((0 == wr->cycles_left) && (wr_ready == cci_writes_.end())) {
+        wr_ready = wr;
+      }
+    }
+
+    // write response channel (c1)
+    const bool has_wr_rsp = (wr_ready != cci_writes_.end());
+    device_->vcp2af_sRxPort_c1_rspValid = has_wr_rsp ? 1 : 0;
+    if (has_wr_rsp) {
+      device_->vcp2af_sRxPort_c1_hdr_resp_type = 0;
+      device_->vcp2af_sRxPort_c1_hdr_mdata = wr_ready->mdata;
+      cci_writes_.erase(wr_ready);
+    }
+
+    // read response channel (c0), only when no MMIO request is active
+    const bool has_rd_rsp = !mmio_active && (rd_ready != cci_reads_.end());
+    device_->vcp2af_sRxPort_c0_rspValid = has_rd_rsp ? 1 : 0;
+    if (has_rd_rsp) {
+      device_->vcp2af_sRxPort_c0_hdr_resp_type = 0;
+      memcpy(device_->vcp2af_sRxPort_c0_data, rd_ready->data.data(), CACHE_BLOCK_SIZE);
+      device_->vcp2af_sRxPort_c0_hdr_mdata = rd_ready->mdata;
+      /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, rd_ready->addr, rd_ready->mdata);
+      for (int i = 0; i < CACHE_BLOCK_SIZE; ++i)
+        printf("%02x", rd_ready->data[CACHE_BLOCK_SIZE-1-i]);
+      printf("\n");*/
+      cci_reads_.erase(rd_ready);
+    }
+  }
+    
+  // Samples the CCI transmit port for one cycle: queues the read and/or write
+  // request presented by the device and refreshes the almost-full flags.
+  // The request address is a cache-line index that is scaled by
+  // CACHE_BLOCK_SIZE and used directly as a host pointer.
+  void sTxPort_bus() {
+    // read channel (c0): capture the host line now, respond after a delay
+    if (device_->af2cp_sTxPort_c0_valid) {
+      assert(!device_->vcp2af_sRxPort_c0_TxAlmFull);
+      cci_rd_req_t rd_req;
+      rd_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD);
+      rd_req.addr  = device_->af2cp_sTxPort_c0_hdr_address;
+      rd_req.mdata = device_->af2cp_sTxPort_c0_hdr_mdata;
+      auto src = reinterpret_cast<uint64_t*>(device_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
+      memcpy(rd_req.data.data(), src, CACHE_BLOCK_SIZE);
+      //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, rd_req.mdata);
+      cci_reads_.push_back(rd_req);
+    }
+
+    // write channel (c1): commit to host memory now, acknowledge after a delay
+    if (device_->af2cp_sTxPort_c1_valid) {
+      assert(!device_->vcp2af_sRxPort_c1_TxAlmFull);
+      cci_wr_req_t wr_req;
+      wr_req.cycles_left = CCI_LATENCY + (timestamp % CCI_RAND_MOD);
+      wr_req.mdata = device_->af2cp_sTxPort_c1_hdr_mdata;
+      auto dst = reinterpret_cast<uint64_t*>(device_->af2cp_sTxPort_c1_hdr_address * CACHE_BLOCK_SIZE);
+      memcpy(dst, device_->af2cp_sTxPort_c1_data, CACHE_BLOCK_SIZE);
+      cci_writes_.push_back(wr_req);
+    }
+
+    // raise almost-full one entry before each queue reaches its capacity
+    const bool rd_queue_full = (cci_reads_.size() >= (CCI_RQ_SIZE-1));
+    const bool wr_queue_full = (cci_writes_.size() >= (CCI_WQ_SIZE-1));
+    device_->vcp2af_sRxPort_c0_TxAlmFull = rd_queue_full;
+    device_->vcp2af_sRxPort_c1_TxAlmFull = wr_queue_full;
+  }
+    
+  // Services the Avalon (avs_*) local-memory interface of every bank for one
+  // cycle:
+  //   1. returns the oldest pending read of the bank once it has been marked
+  //      ready, so responses leave strictly in request order;
+  //   2. applies a write to the backing RAM (honoring the byte enables) or
+  //      queues a read, and forwards the access to the DRAM model for timing.
+  // Read data is captured from RAM at request time; the DRAM model only
+  // decides when the response becomes visible. waitrequest is never asserted.
+  void avs_bus() {
+    for (int b = 0; b < MEMORY_BANKS; ++b) {
+      // process memory responses
+      device_->avs_readdatavalid[b] = 0;  
+      if (!pending_mem_reqs_[b].empty() 
+       && (*pending_mem_reqs_[b].begin())->ready) {
+        auto mem_rd_it = pending_mem_reqs_[b].begin();
+        auto mem_req = *mem_rd_it;
+        device_->avs_readdatavalid[b] = 1;
+        memcpy(device_->avs_readdata[b], mem_req->data.data(), MEM_BLOCK_SIZE);
+        pending_mem_reqs_[b].erase(mem_rd_it);
+        delete mem_req;
+      }
+
+      // process memory requests
+      assert(!device_->avs_read[b] || !device_->avs_write[b]);
+      unsigned byte_addr = device_->avs_address[b] * MEM_BLOCK_SIZE;
+      if (device_->avs_write[b]) {           
+        uint64_t byteen = device_->avs_byteenable[b];        
+        uint8_t* data = (uint8_t*)(device_->avs_writedata[b]);
        for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
          if ((byteen >> i) & 0x1) {            
-            (*ram_)[base_addr + i] = data[i];
+            (*ram_)[byte_addr + i] = data[i];
          }
        }
-        /*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, base_addr);
+
+        /*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, byte_addr);
        for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
          printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
        }
        printf("\n");*/
+
+        // send dram request; the data is already committed to ram_ above, so
+        // a rejected write only loses its contribution to the timing model
+        ramulator::Request dram_req( 
+          byte_addr,
+          ramulator::Request::Type::WRITE,
+          0
+        );
+        dram_->send(dram_req);
      }
-      if (vl_obj_->device->avs_read[b]) {
-        mem_rd_req_t mem_req;      
-        mem_req.addr = vl_obj_->device->avs_address[b];
-        ram_->read(mem_req.data.data(), vl_obj_->device->avs_address[b] * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE);      
-        mem_req.cycles_left = MEM_LATENCY;
-        for (auto& rsp : mem_reads_[b]) {
-          if (mem_req.addr == rsp.addr) {
-            // duplicate requests receive the same cycle delay
-            mem_req.cycles_left = rsp.cycles_left;
-            break;
-          }
-        }
-        mem_reads_[b].emplace_back(mem_req);
+
+      if (device_->avs_read[b]) {
+        auto mem_req = new mem_rd_req_t();
+        mem_req->addr = device_->avs_address[b];
+        ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE);      
+        mem_req->ready = false;
+        pending_mem_reqs_[b].emplace_back(mem_req);
+
        /*printf("%0ld: [sim] MEM Rd Req: bank=%d, addr=%x, pending={", timestamp, b, mem_req.addr * MEM_BLOCK_SIZE);
-        for (auto& req : mem_reads_[b]) {
+        for (auto& req : pending_mem_reqs_[b]) {
          if (req.cycles_left != 0) 
            printf(" !%0x", req.addr * MEM_BLOCK_SIZE);
          else
            printf(" %0x", req.addr * MEM_BLOCK_SIZE);
        }
        printf("}\n");*/
-      }
-    }

-    vl_obj_->device->avs_waitrequest[b] = mem_stalled;
+        // send dram request; the completion callback releases the response
+        ramulator::Request dram_req( 
+          byte_addr,
+          ramulator::Request::Type::READ,
+          [mem_req](ramulator::Request& /*dram_req*/) {
+            mem_req->ready = true;
+          },
+          0
+        );
+        if (!dram_->send(dram_req)) {
+          // The DRAM model rejected the request, so the callback will never
+          // run. Release the response right away: otherwise this entry would
+          // block the bank's in-order read queue forever.
+          mem_req->ready = true;
+        }
+      }
+
+      device_->avs_waitrequest[b] = false;
+    }
  }
+
+  // Local-memory read awaiting completion by the DRAM model.
+  struct mem_rd_req_t {
+    bool ready;                                 // set once the response may be returned
+    std::array<uint8_t, MEM_BLOCK_SIZE> data;   // block captured when the request was issued
+    uint32_t addr;                              // address as presented on avs_address
+  };
+
+  // CCI read request waiting for its response delay to elapse.
+  struct cci_rd_req_t {
+    int cycles_left;                            // remaining delay in cycles
+    std::array<uint8_t, CACHE_BLOCK_SIZE> data; // host cache line captured at request time
+    uint64_t addr;                              // request header address
+    uint32_t mdata;                             // request tag echoed in the response
+  };
+
+  // CCI write request waiting for its acknowledgement delay to elapse.
+  struct cci_wr_req_t {
+    int cycles_left;                            // remaining delay in cycles
+    uint32_t mdata;                             // request tag echoed in the response
+  };
+
+  // Host buffer record: storage pointer, size and I/O address.
+  struct host_buffer_t {
+    uint64_t* data;
+    size_t    size;
+    uint64_t  ioaddr;
+  };
+
+  // NOTE(review): presumably the handle and stop flag of a background
+  // simulation task; their use is outside this excerpt - confirm.
+  std::future<void> future_;
+  bool stop_;
+
+  // NOTE(review): presumably host buffers keyed by workspace id plus the
+  // next id to hand out - confirm against prepare_buffer().
+  std::unordered_map<int64_t, host_buffer_t> host_buffers_;
+  int64_t host_buffer_ids_;
+
+  // Per-bank queue of outstanding local-memory reads, in request order;
+  // entries are heap-allocated in avs_bus() and deleted when answered.
+  std::list<mem_rd_req_t*> pending_mem_reqs_[MEMORY_BANKS];
+
+  // CCI read requests waiting for their response delay to elapse.
+  std::list<cci_rd_req_t> cci_reads_;
+
+  // CCI write requests waiting for their acknowledgement delay to elapse.
+  std::list<cci_wr_req_t> cci_writes_;
+
+  // NOTE(review): what this mutex guards is not visible in this excerpt.
+  std::mutex mutex_;
+
+  // Backing store for local-memory contents (read and written by avs_bus()).
+  RAM *ram_;
+
+  // DRAM model that receives every local-memory access made in avs_bus().
+  ramulator::Gem5Wrapper* dram_;
+
+  // Verilated device model whose ports the *_bus() methods drive and sample.
+  Vvortex_afu_shim *device_;
+#ifdef VCD_OUTPUT
+  // VCD trace writer used by eval().
+  VerilatedVcdC *trace_;
+#endif
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Creates the simulator; all state lives in the heap-allocated Impl.
+// NOTE(review): impl_ is a raw owning pointer, so copying an opae_sim would
+// delete Impl twice unless copy operations are disabled in the header - confirm.
+opae_sim::opae_sim() 
+  : impl_(new Impl())
+{}
+
+// Destroys the Impl created by the constructor.
+opae_sim::~opae_sim() {
+  delete impl_;
+}
+
+// Forwards to Impl::prepare_buffer and returns its result unchanged.
+int opae_sim::prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) {
+  return impl_->prepare_buffer(len, buf_addr, wsid, flags);
+}
+
+// Forwards to Impl::release_buffer.
+void opae_sim::release_buffer(uint64_t wsid) {
+  impl_->release_buffer(wsid);
+}
+
+// Forwards to Impl::get_io_address; the result is written through ioaddr.
+void opae_sim::get_io_address(uint64_t wsid, uint64_t *ioaddr) {
+  impl_->get_io_address(wsid, ioaddr);
+}
+
+// Forwards to Impl::write_mmio64.
+void opae_sim::write_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t value) {
+  impl_->write_mmio64(mmio_num, offset, value);
+}
+
+// Forwards to Impl::read_mmio64; the result is written through value.
+void opae_sim::read_mmio64(uint32_t mmio_num, uint64_t offset, uint64_t *value) {
+  impl_->read_mmio64(mmio_num, offset, value);
 }
--- a/sim/vlsim/opae_sim.h
+++ b/sim/vlsim/opae_sim.h
@ -1,29 +1,8 @@
 #pragma once

-#include <VX_config.h>
-#include <vortex_afu.h>
-
-#include <ostream>
-#include <future>
-#include <list>
-#include <unordered_map>
-
-#ifndef MEMORY_BANKS 
-  #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
-    #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS
-  #else
-    #define MEMORY_BANKS 2
-  #endif
-#endif
-
-#undef MEM_BLOCK_SIZE
-#define MEM_BLOCK_SIZE    (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
-
-#define CACHE_BLOCK_SIZE  64
-
+#include <stdint.h>
 namespace vortex {

-class VL_OBJ;
 class RAM;

 class opae_sim {
@ -44,57 +23,8 @@ public:

 private: 

-  typedef struct {
-    int cycles_left;  
-    std::array<uint8_t, MEM_BLOCK_SIZE> data;
-    uint32_t addr;
-  } mem_rd_req_t;
-
-  typedef struct {
-    int cycles_left;  
-    std::array<uint8_t, CACHE_BLOCK_SIZE> data;
-    uint64_t addr;
-    uint32_t mdata;
-  } cci_rd_req_t;
-
-  typedef struct {
-    int cycles_left;  
-    uint32_t mdata;
-  } cci_wr_req_t;
-
-  typedef struct {    
-    uint64_t* data;
-    size_t    size;
-    uint64_t  ioaddr;  
-  } host_buffer_t;
-
-  void reset();
-
-  void eval();
-
-  void step();
-
-  void sRxPort_bus();
-  void sTxPort_bus();
-  void avs_bus();
-
-  std::future<void> future_;
-  bool stop_;
-
-  std::unordered_map<int64_t, host_buffer_t> host_buffers_;
-  int64_t host_buffer_ids_;
-
-  std::list<mem_rd_req_t> mem_reads_ [MEMORY_BANKS];
-
-  std::list<cci_rd_req_t> cci_reads_;
-
-  std::list<cci_wr_req_t> cci_writes_;
-
-  std::mutex mutex_;
-
-  RAM *ram_;
-
-  VL_OBJ* vl_obj_;
+  class Impl;
+  Impl* impl_;  
 };

 }
--- a/third_party/Makefile
+++ b/third_party/Makefile
@ -1,4 +1,4 @@
-all: fpnew cocogfx softfloat
+all: fpnew cocogfx softfloat ramulator

 fpnew:

@ -8,8 +8,11 @@ cocogfx:
 softfloat:
 	SPECIALIZE_TYPE=RISCV SOFTFLOAT_OPTS="-fPIC -DSOFTFLOAT_ROUND_ODD -DINLINE_LEVEL=5 -DSOFTFLOAT_FAST_DIV32TO16 -DSOFTFLOAT_FAST_DIV64TO32" $(MAKE) -C softfloat/build/Linux-x86_64-GCC

+# Build ramulator as a static library (libramulator.a).
+ramulator:
+	$(MAKE) -C ramulator libramulator.a
+
 clean:
 	$(MAKE) clean -C cocogfx
 	$(MAKE) -C softfloat/build/Linux-x86_64-GCC clean

-.PHONY: all fpnew cocogfx softfloat
+.PHONY: all fpnew cocogfx softfloat ramulator