DRAM simulator fix

Blaise Tine 2021-12-07 22:44:06 -05:00
parent a9ec1c08a7
commit 5825b7c15a
30 changed files with 702 additions and 499 deletions

View file

@ -124,8 +124,7 @@ public:
future_.wait();
}
// start new run
future_ = std::async(std::launch::async, [&]{
processor_.reset();
future_ = std::async(std::launch::async, [&]{
processor_.run();
});
return 0;

View file

@ -8,11 +8,17 @@
#include <vortex.h>
#include <vx_utils.h>
#include <processor.h>
#include <constants.h>
#include <VX_config.h>
#include <util.h>
#include <processor.h>
#include <archdef.h>
#include <mem.h>
#include <constants.h>
using namespace vortex;
///////////////////////////////////////////////////////////////////////////////
@ -59,13 +65,11 @@ public:
vx_device()
: arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS)
, ram_(RAM_PAGE_SIZE)
, processor_(arch_)
, mem_allocation_(ALLOC_BASE_ADDR)
{
// setup memory simulator
memsim_ = MemSim::Create(MemSim::Config{
DRAM_CHANNELS,
arch_.num_cores()
});
// attach memory module
processor_.attach_ram(&ram_);
}
~vx_device() {
@ -122,28 +126,7 @@ public:
// start new run
future_ = std::async(std::launch::async, [&]{
if (processor_) {
// release current processor instance
processor_->MemReqPort.unbind();
memsim_->MemRspPort.unbind();
SimPlatform::instance().release_object(processor_);
}
// create new processor instance
processor_ = Processor::Create(arch_);
processor_->MemReqPort.bind(&memsim_->MemReqPort);
memsim_->MemRspPort.bind(&processor_->MemRspPort);
// attach memory object
processor_->attach_ram(&ram_);
// run simulation
int exitcode;
for (;;) {
SimPlatform::instance().step();
if (processor_->check_exit(&exitcode))
break;
};
processor_.run();
});
return 0;
@ -167,8 +150,7 @@ public:
private:
ArchDef arch_;
RAM ram_;
MemSim::Ptr memsim_;
Processor::Ptr processor_;
Processor processor_;
uint64_t mem_allocation_;
std::future<void> future_;
};
@ -207,9 +189,6 @@ extern int vx_dev_open(vx_device_h* hdevice) {
if (nullptr == hdevice)
return -1;
if (!SimPlatform::instance().initialize())
return -1;
*hdevice = new vx_device();
#ifdef DUMP_PERF_STATS
@ -232,8 +211,6 @@ extern int vx_dev_close(vx_device_h hdevice) {
delete device;
SimPlatform::instance().finalize();
return 0;
}
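For reference, here is a standalone sketch (not vortex.cpp itself) of the start/wait pattern used above, with a made-up FakeProcessor and Device standing in for vortex::Processor and vx_device: any previous asynchronous run is waited on before processor_.run() is launched on a worker thread.

#include <future>

struct FakeProcessor { void run() {} };   // stand-in for vortex::Processor

class Device {
public:
  int start() {
    // wait for the previous run to complete, if any
    if (future_.valid())
      future_.wait();
    // start a new run on a worker thread
    future_ = std::async(std::launch::async, [&]{ processor_.run(); });
    return 0;
  }
  void wait_done() {
    if (future_.valid())
      future_.wait();
  }
private:
  FakeProcessor processor_;
  std::future<void> future_;
};

int main() {
  Device d;
  d.start();
  d.wait_done();
  return 0;
}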

View file

@ -127,7 +127,7 @@ public:
virtual ~SimEventBase() {}
virtual void fire() const = 0;
virtual void fire() const = 0;
uint64_t time() const {
return time_;
@ -219,15 +219,21 @@ public:
const std::string& name() const {
return name_;
}
virtual void step(uint64_t cycle) = 0;
}
protected:
SimObjectBase(const SimContext& ctx, const char* name);
private:
virtual void do_reset() = 0;
virtual void do_tick() = 0;
std::string name_;
friend class SimPlatform;
};
///////////////////////////////////////////////////////////////////////////////
@ -246,18 +252,22 @@ protected:
: SimObjectBase(ctx, name)
{}
void step(uint64_t cycle) override {
this->impl().step(cycle);
}
private:
const Impl& impl() const {
return static_cast<const Impl&>(*this);
const Impl* impl() const {
return static_cast<const Impl*>(this);
}
Impl& impl() {
return static_cast<Impl&>(*this);
Impl* impl() {
return static_cast<Impl*>(this);
}
void do_reset() override {
this->impl()->reset();
}
void do_tick() override {
this->impl()->tick();
}
};
@ -282,10 +292,6 @@ public:
return true;
}
void flush() {
instance().clear();
}
void finalize() {
instance().clear();
}
@ -310,7 +316,15 @@ public:
events_.emplace_back(evt);
}
void step() {
void reset() {
events_.clear();
for (auto& object : objects_) {
object->do_reset();
}
cycles_ = 0;
}
void tick() {
// evaluate events
auto evt_it = events_.begin();
auto evt_it_end = events_.end();
@ -325,7 +339,7 @@ public:
}
// evaluate components
for (auto& object : objects_) {
object->step(cycles_);
object->do_tick();
}
// advance clock
++cycles_;
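As a side note, a minimal standalone sketch of the CRTP dispatch introduced in SimObject above (MiniPlatform and Counter are invented for illustration and are not part of the Vortex sources): the platform drives components only through the private do_reset()/do_tick() virtuals, while each component simply provides reset() and tick().

#include <cstdint>
#include <iostream>

class SimObjectBase {
public:
  virtual ~SimObjectBase() {}
private:
  virtual void do_reset() = 0;
  virtual void do_tick() = 0;
  friend class MiniPlatform;
};

template <typename Impl>
class SimObject : public SimObjectBase {
private:
  Impl* impl() { return static_cast<Impl*>(this); }
  void do_reset() override { this->impl()->reset(); } // forwards to Impl::reset()
  void do_tick() override { this->impl()->tick(); }   // forwards to Impl::tick()
};

class Counter : public SimObject<Counter> {
public:
  void reset() { value = 0; }
  void tick() { ++value; }
  uint64_t value = 0;
};

class MiniPlatform {
public:
  void run(SimObjectBase* obj, int cycles) {
    obj->do_reset();
    for (int i = 0; i < cycles; ++i) {
      obj->do_tick();
    }
  }
};

int main() {
  Counter counter;
  MiniPlatform().run(&counter, 5);
  std::cout << counter.value << std::endl; // prints 5
}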

View file

@ -49,12 +49,12 @@ int main(int argc, char **argv) {
parse_args(argc, argv);
for (auto program : programs) {
std::cout << "Running " << program << "..." << std::endl;
vortex::RAM ram(RAM_PAGE_SIZE);
vortex::Processor processor;
processor.attach_ram(&ram);
vortex::RAM ram(RAM_PAGE_SIZE);
vortex::Processor processor;
processor.attach_ram(&ram);
for (auto program : programs) {
std::cout << "Running " << program << "..." << std::endl;
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {

View file

@ -22,6 +22,7 @@
#include <VX_config.h>
#include <ostream>
#include <list>
#include <queue>
#include <vector>
#include <sstream>
#include <unordered_map>
@ -39,7 +40,9 @@
#endif
#endif
#define ENABLE_MEM_STALLS
#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1
#endif
#ifndef TRACE_START_TIME
#define TRACE_START_TIME 0ull
@ -126,12 +129,7 @@ public:
}
~Impl() {
for (auto& buf : print_bufs_) {
auto str = buf.second.str();
if (!str.empty()) {
std::cout << "#" << buf.first << ": " << str << std::endl;
}
}
this->cout_flush();
#ifdef VCD_OUTPUT
trace_->close();
@ -147,10 +145,46 @@ public:
}
}
void cout_flush() {
for (auto& buf : print_bufs_) {
auto str = buf.second.str();
if (!str.empty()) {
std::cout << "#" << buf.first << ": " << str << std::endl;
}
}
}
void attach_ram(RAM* ram) {
ram_ = ram;
}
int run() {
int exitcode = 0;
#ifndef NDEBUG
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
#endif
// reset device
this->reset();
// execute program
while (device_->busy) {
if (get_ebreak()) {
exitcode = get_last_wb_value(3);
break;
}
this->tick();
}
// wait 5 cycles to flush the pipeline
this->wait(5);
return exitcode;
}
private:
void reset() {
print_bufs_.clear();
@ -178,33 +212,11 @@ public:
// Turn on assertion after reset
Verilated::assertOn(true);
this->cout_flush();
}
int run() {
int exitcode = 0;
#ifndef NDEBUG
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;
#endif
// execute program
while (device_->busy) {
if (get_ebreak()) {
exitcode = get_last_wb_value(3);
break;
}
this->step();
}
// wait 5 cycles to flush the pipeline
this->wait(5);
return exitcode;
}
private:
void step() {
void tick() {
device_->clk = 0;
this->eval();
@ -224,7 +236,19 @@ private:
this->eval_avs_bus(1);
#endif
dram_->tick();
if (MEM_CYCLE_RATIO > 0) {
auto cycle = timestamp / 2;
if ((cycle % MEM_CYCLE_RATIO) == 0)
dram_->tick();
} else {
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
dram_->tick();
}
if (!dram_queue_.empty()) {
if (dram_->send(dram_queue_.front()))
dram_queue_.pop();
}
#ifndef NDEBUG
fflush(stdout);
@ -372,7 +396,7 @@ private:
ramulator::Request::Type::WRITE,
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
} else {
// process reads
@ -393,7 +417,7 @@ private:
}, placeholders::_1, mem_req),
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
}
@ -490,7 +514,7 @@ private:
ramulator::Request::Type::WRITE,
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
} else {
// process reads
@ -511,7 +535,7 @@ private:
}, placeholders::_1, mem_req),
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
}
@ -522,7 +546,7 @@ private:
void wait(uint32_t cycles) {
for (int i = 0; i < cycles; ++i) {
this->step();
this->tick();
}
}
@ -574,6 +598,8 @@ private:
RAM *ram_;
ramulator::Gem5Wrapper* dram_;
std::queue<ramulator::Request> dram_queue_;
};
///////////////////////////////////////////////////////////////////////////////
@ -590,10 +616,6 @@ void Processor::attach_ram(RAM* mem) {
impl_->attach_ram(mem);
}
void Processor::reset() {
impl_->reset();
}
int Processor::run() {
return impl_->run();
}
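To summarize the two mechanisms added above: a positive MEM_CYCLE_RATIO ticks the DRAM model once every that-many core cycles, a zero or negative value ticks it (1 - MEM_CYCLE_RATIO) times per core cycle, and requests are parked in dram_queue_ and re-offered every tick until send() accepts them. A standalone sketch of the same logic, with DramModel and Request as stand-ins for the ramulator types:

#include <cstdint>
#include <queue>

struct Request { uint64_t addr; };   // stand-in for ramulator::Request

struct DramModel {                   // stand-in for ramulator::Gem5Wrapper
  bool send(const Request&) { return true; }  // may return false while busy
  void tick() {}
};

#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1   // <= 0: DRAM ticks (1 - ratio) times per core cycle
#endif                       //  > 0: DRAM ticks once every 'ratio' core cycles

void tick_memory(uint64_t cycle, DramModel& dram, std::queue<Request>& dram_queue) {
  if (MEM_CYCLE_RATIO > 0) {
    if ((cycle % MEM_CYCLE_RATIO) == 0)
      dram.tick();
  } else {
    for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
      dram.tick();
  }
  // retry the oldest pending request until the DRAM model accepts it
  if (!dram_queue.empty()) {
    if (dram.send(dram_queue.front()))
      dram_queue.pop();
  }
}

int main() {
  DramModel dram;
  std::queue<Request> pending;
  pending.push({0x1000});
  for (uint64_t cycle = 0; cycle < 4; ++cycle) {
    tick_memory(cycle, dram, pending);
  }
  return 0;
}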

View file

@ -8,12 +8,10 @@ class Processor {
public:
Processor();
virtual ~Processor();
~Processor();
void attach_ram(RAM* ram);
void reset();
int run();
private:

View file

@ -102,6 +102,12 @@ struct block_t {
struct set_t {
std::vector<block_t> blocks;
set_t(uint32_t size) : blocks(size) {}
void clear() {
for (auto& block : blocks) {
block.valid = false;
}
}
};
struct bank_req_info_t {
@ -117,6 +123,7 @@ struct bank_req_t {
uint64_t tag;
uint32_t set_id;
uint32_t core_id;
uint64_t uuid;
std::vector<bank_req_info_t> infos;
bank_req_t(uint32_t size)
@ -126,6 +133,7 @@ struct bank_req_t {
, tag(0)
, set_id(0)
, core_id(0)
, uuid(0)
, infos(size)
{}
};
@ -142,20 +150,20 @@ struct mshr_entry_t : public bank_req_t {
class MSHR {
private:
std::vector<mshr_entry_t> entries_;
uint32_t capacity_;
uint32_t size_;
public:
MSHR(uint32_t size)
: entries_(size)
, capacity_(0)
, size_(0)
{}
bool empty() const {
return (0 == capacity_);
return (0 == size_);
}
bool full() const {
return (capacity_ == entries_.size());
return (size_ == entries_.size());
}
int lookup(const bank_req_t& bank_req) {
@ -178,7 +186,7 @@ public:
entry.valid = true;
entry.mshr_replay = false;
entry.block_id = block_id;
++capacity_;
++size_;
return i;
}
}
@ -204,12 +212,21 @@ public:
if (entry.valid && entry.mshr_replay) {
*out = entry;
entry.valid = false;
--capacity_;
--size_;
return true;
}
}
return false;
}
void clear() {
for (auto& entry : entries_) {
if (entry.valid && entry.mshr_replay) {
entry.valid = false;
}
}
size_ = 0;
}
};
struct bank_t {
@ -221,6 +238,13 @@ struct bank_t {
: sets(params.sets_per_bank, params.blocks_per_set)
, mshr(config.mshr_size)
{}
void clear() {
mshr.clear();
for (auto& set : sets) {
set.clear();
}
}
};
///////////////////////////////////////////////////////////////////////////////
@ -235,11 +259,11 @@ private:
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
uint32_t flush_cycles_;
PerfStats perf_stats_;
uint64_t pending_read_reqs_;
uint64_t pending_write_reqs_;
uint64_t pending_fill_reqs_;
uint32_t flush_cycles_;
uint64_t pending_fill_reqs_;
public:
Impl(Cache* simobject, const Config& config)
@ -249,9 +273,6 @@ public:
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
, pending_read_reqs_(0)
, pending_write_reqs_(0)
, pending_fill_reqs_(0)
{
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
@ -272,19 +293,28 @@ public:
// calculate tag flush cycles
flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
void step(uint64_t cycle) {
void reset() {
for (auto& bank : banks_) {
bank.clear();
}
perf_stats_ = PerfStats();
pending_read_reqs_ = 0;
pending_write_reqs_ = 0;
pending_fill_reqs_ = 0;
}
void tick() {
// wait on flush cycles
if (flush_cycles_ != 0) {
--flush_cycles_;
return;
}
// per-bank pipeline request
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
// calculate memory latency
perf_stats_.mem_latency += pending_fill_reqs_;
@ -294,12 +324,11 @@ public:
auto& mem_rsp = bypass_port.front();
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
MemRsp core_rsp{tag, mem_rsp.core_id};
MemRsp core_rsp{tag, mem_rsp.core_id, mem_rsp.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
bypass_port.pop();
}
std::vector<bank_req_t> pipeline_reqs(config_.num_banks, config_.ports_per_bank);
}
// handle MSHR replay
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
@ -351,6 +380,7 @@ public:
bank_req.tag = tag;
bank_req.set_id = set_id;
bank_req.core_id = core_req.core_id;
bank_req.uuid = core_req.uuid;
bank_req.infos.at(port_id) = {true, req_id, core_req.tag};
auto& bank = banks_.at(bank_id);
@ -400,22 +430,31 @@ public:
// remove request
auto time = core_req_port.pop();
perf_stats_.pipeline_stalls += (cycle - time);
perf_stats_.pipeline_stalls += (SimPlatform::instance().cycles() - time);
}
// process active request
this->processBankRequest(pipeline_reqs);
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
private:
void processIORequest(const MemReq& core_req, uint32_t req_id) {
{
MemReq mem_req(core_req);
mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
bypass_switch_->ReqIn.at(1).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
if (core_req.write && config_.write_reponse) {
simobject_->CoreRspPorts.at(req_id).send(MemRsp{core_req.tag}, 1);
MemRsp core_rsp{core_req.tag, core_req.core_id, core_req.uuid};
simobject_->CoreRspPorts.at(req_id).send(core_rsp, 1);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
@ -442,8 +481,9 @@ public:
if (pipeline_req.mshr_replay) {
// send core response
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
} else {
bool hit = false;
@ -485,7 +525,9 @@ public:
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_block.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
} else {
// mark block as dirty
hit_block.dirty = true;
@ -494,8 +536,9 @@ public:
// send core response
if (!pipeline_req.write || config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
@ -516,6 +559,7 @@ public:
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++perf_stats_.evictions;
}
}
@ -527,13 +571,16 @@ public:
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
}
// send core response
if (config_.write_reponse) {
for (auto& info : pipeline_req.infos) {
MemRsp core_rsp{info.req_tag, pipeline_req.core_id};
MemRsp core_rsp{info.req_tag, pipeline_req.core_id, pipeline_req.uuid};
simobject_->CoreRspPorts.at(info.req_id).send(core_rsp, config_.latency);
DT(3, simobject_->name() << "-" << core_rsp);
}
}
} else {
@ -550,7 +597,9 @@ public:
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.core_id = pipeline_req.core_id;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-" << mem_req);
++pending_fill_reqs_;
}
}
@ -575,8 +624,12 @@ Cache::~Cache() {
delete impl_;
}
void Cache::step(uint64_t cycle) {
impl_->step(cycle);
void Cache::reset() {
impl_->reset();
}
void Cache::tick() {
impl_->tick();
}
const Cache::PerfStats& Cache::perf_stats() const {

View file

@ -22,6 +22,7 @@ public:
uint16_t mshr_size; // MSHR buffer size
uint8_t latency; // pipeline latency
};
struct PerfStats {
uint64_t reads;
uint64_t writes;
@ -54,7 +55,9 @@ public:
Cache(const SimContext& ctx, const char* name, const Config& config);
~Cache();
void step(uint64_t cycle);
void reset();
void tick();
const PerfStats& perf_stats() const;

View file

@ -1,10 +1,16 @@
#pragma once
#include "types.h"
#ifndef RAM_PAGE_SIZE
#define RAM_PAGE_SIZE 4096
#endif
#define DRAM_CHANNELS 2
#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1
#endif
#ifndef MEMORY_BANKS
#define MEMORY_BANKS 2
#endif
namespace vortex {

View file

@ -30,7 +30,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, exe_units_((int)ExeType::MAX)
, icache_(Cache::Create("Icache", Cache::Config{
, icache_(Cache::Create("icache", Cache::Config{
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_BLOCK_SIZE),// B
2, // W
@ -45,7 +45,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
NUM_WARPS, // mshr
2, // pipeline latency
}))
, dcache_(Cache::Create("Dcache", Cache::Config{
, dcache_(Cache::Create("dcache", Cache::Config{
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_BLOCK_SIZE),// B
2, // W
@ -72,15 +72,6 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
, fetch_latch_("fetch")
, decode_latch_("decode")
, pending_icache_(arch_.num_warps())
, active_warps_(1)
, stalled_warps_(0)
, last_schedule_wid_(0)
, issued_instrs_(0)
, committed_instrs_(0)
, csr_tex_unit_(0)
, ecall_(false)
, ebreak_(false)
, perf_mem_pending_reads_(0)
{
for (int i = 0; i < arch_.num_warps(); ++i) {
warps_.at(i) = std::make_shared<Warp>(this, i);
@ -112,10 +103,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
#endif
sw->ReqOut.bind(&dcache_->CoreReqPorts.at(i));
dcache_->CoreRspPorts.at(i).bind(&sw->RspIn);
}
// activate warp0
warps_.at(0)->setTmask(0, true);
}
// memory perf callbacks
MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
@ -128,9 +116,62 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
__unused (cycle);
--perf_mem_pending_reads_;
});
this->reset();
}
Core::~Core() {
this->cout_flush();
}
void Core::reset() {
for (auto& warp : warps_) {
warp->clear();
}
warps_.at(0)->setTmask(0, true);
active_warps_ = 1;
for (auto& tex_unit : tex_units_) {
tex_unit.clear();
}
for ( auto& barrier : barriers_) {
barrier.reset();
}
for (auto& csr : csrs_) {
csr = 0;
}
for (auto& fcsr : fcsrs_) {
fcsr = 0;
}
for (auto& ibuf : ibuffers_) {
ibuf.clear();
}
scoreboard_.clear();
fetch_latch_.clear();
decode_latch_.clear();
pending_icache_.clear();
stalled_warps_.reset();
last_schedule_wid_ = 0;
issued_instrs_ = 0;
committed_instrs_ = 0;
csr_tex_unit_ = 0;
ecall_ = false;
ebreak_ = false;
perf_mem_pending_reads_ = 0;
perf_stats_ = PerfStats();
}
void Core::attach_ram(RAM* ram) {
// bind RAM to memory unit
mmu_.attach(*ram, 0, 0xFFFFFFFF);
}
void Core::cout_flush() {
for (auto& buf : print_bufs_) {
auto str = buf.second.str();
if (!str.empty()) {
@ -139,17 +180,12 @@ Core::~Core() {
}
}
void Core::attach_ram(RAM* ram) {
// bind RAM to memory unit
mmu_.attach(*ram, 0, 0xFFFFFFFF);
}
void Core::step(uint64_t cycle) {
this->commit(cycle);
this->execute(cycle);
this->decode(cycle);
this->fetch(cycle);
this->schedule(cycle);
void Core::tick() {
this->commit();
this->execute();
this->decode();
this->fetch();
this->schedule();
// update perf counter
perf_stats_.mem_latency += perf_mem_pending_reads_;
@ -157,9 +193,7 @@ void Core::step(uint64_t cycle) {
DPN(2, std::flush);
}
void Core::schedule(uint64_t cycle) {
__unused (cycle);
void Core::schedule() {
bool foundSchedule = false;
int scheduled_warp = last_schedule_wid_;
@ -181,30 +215,27 @@ void Core::schedule(uint64_t cycle) {
// suspend warp until decode
stalled_warps_.set(scheduled_warp);
auto& warp = warps_.at(scheduled_warp);
uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_;
auto trace = new pipeline_trace_t(uuid, arch_);
auto& warp = warps_.at(scheduled_warp);
warp->eval(trace);
DT(3, cycle, "pipeline-schedule: " << *trace);
DT(3, "pipeline-schedule: " << *trace);
// advance to fetch stage
fetch_latch_.push(trace);
}
void Core::fetch(uint64_t cycle) {
__unused (cycle);
void Core::fetch() {
// handle icache reponse
auto& icache_rsp_port = icache_->CoreRspPorts.at(0);
if (!icache_rsp_port.empty()){
auto& mem_rsp = icache_rsp_port.front();
auto trace = pending_icache_.at(mem_rsp.tag);
decode_latch_.push(trace);
DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
DT(3, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
pending_icache_.release(mem_rsp.tag);
icache_rsp_port.pop();
}
@ -216,16 +247,15 @@ void Core::fetch(uint64_t cycle) {
mem_req.addr = trace->PC;
mem_req.write = false;
mem_req.tag = pending_icache_.allocate(trace);
mem_req.core_id = id_;
icache_->CoreReqPorts.at(0).send(mem_req, 1);
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
mem_req.core_id = trace->cid;
mem_req.uuid = trace->uuid;
icache_->CoreReqPorts.at(0).send(mem_req, 1);
DT(3, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop();
}
}
void Core::decode(uint64_t cycle) {
__unused (cycle);
void Core::decode() {
if (decode_latch_.empty())
return;
@ -235,7 +265,7 @@ void Core::decode(uint64_t cycle) {
auto& ibuffer = ibuffers_.at(trace->wid);
if (ibuffer.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** ibuffer-stall: " << *trace);
DT(3, "*** ibuffer-stall: " << *trace);
}
++perf_stats_.ibuf_stalls;
return;
@ -257,7 +287,7 @@ void Core::decode(uint64_t cycle) {
if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH)
perf_stats_.branches += active_threads;
DT(3, cycle, "pipeline-decode: " << *trace);
DT(3, "pipeline-decode: " << *trace);
// insert to ibuffer
ibuffer.push(trace);
@ -265,9 +295,7 @@ void Core::decode(uint64_t cycle) {
decode_latch_.pop();
}
void Core::execute(uint64_t cycle) {
__unused (cycle);
void Core::execute() {
// issue ibuffer instructions
for (auto& ibuffer : ibuffers_) {
if (ibuffer.empty())
@ -278,7 +306,7 @@ void Core::execute(uint64_t cycle) {
// check scoreboard
if (scoreboard_.in_use(trace)) {
if (!trace->suspend()) {
DTH(3, cycle, "*** scoreboard-stall: dependents={");
DTH(3, "*** scoreboard-stall: dependents={");
auto uses = scoreboard_.get_uses(trace);
for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
auto& use = uses.at(i);
@ -297,7 +325,7 @@ void Core::execute(uint64_t cycle) {
// update scoreboard
scoreboard_.reserve(trace);
DT(3, cycle, "pipeline-issue: " << *trace);
DT(3, "pipeline-issue: " << *trace);
// push to execute units
auto& exe_unit = exe_units_.at((int)trace->exe_type);
@ -308,9 +336,7 @@ void Core::execute(uint64_t cycle) {
}
}
void Core::commit(uint64_t cycle) {
__unused (cycle);
void Core::commit() {
// commit completed instructions
bool wb = false;
for (auto& exe_unit : exe_units_) {
@ -323,7 +349,7 @@ void Core::commit(uint64_t cycle) {
wb |= trace->wb;
// advance to commit stage
DT(3, cycle, "pipeline-commit: " << *trace);
DT(3, "pipeline-commit: " << *trace);
// update scoreboard
scoreboard_.release(trace);

View file

@ -75,16 +75,14 @@ public:
bool running() const;
void step(uint64_t cycle);
void reset();
void tick();
Word id() const {
return id_;
}
Warp& warp(int i) {
return *warps_.at(i);
}
const Decoder& decoder() {
return decoder_;
}
@ -125,14 +123,16 @@ public:
private:
void schedule(uint64_t cycle);
void fetch(uint64_t cycle);
void decode(uint64_t cycle);
void execute(uint64_t cycle);
void commit(uint64_t cycle);
void schedule();
void fetch();
void decode();
void execute();
void commit();
void writeToStdOut(Addr addr, Word data);
void cout_flush();
Word id_;
const ArchDef arch_;
const Decoder decoder_;

View file

@ -33,15 +33,15 @@
} \
} while(0)
#define DT(lvl, t, x) do { \
#define DT(lvl, x) do { \
if ((lvl) <= DEBUG_LEVEL) { \
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl; \
std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x << std::endl; \
} \
} while(0)
#define DTH(lvl, t, x) do { \
#define DTH(lvl, x) do { \
if ((lvl) <= DEBUG_LEVEL) { \
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x; \
std::cout TRACE_HEADER << std::setw(10) << std::dec << SimPlatform::instance().cycles() << std::setw(0) << ": " << x; \
} \
} while(0)
@ -58,8 +58,8 @@
#define DPH(lvl, x) do {} while(0)
#define DPN(lvl, x) do {} while(0)
#define DT(lvl, t, x) do {} while(0)
#define DTH(lvl, t, x) do {} while(0)
#define DT(lvl, x) do {} while(0)
#define DTH(lvl, x) do {} while(0)
#define DTN(lvl, x) do {} while(0)
#endif
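Call sites throughout the pipeline (core.cpp above, the execute units below) accordingly drop the explicit cycle argument. A standalone sketch of the same pattern, with a made-up MiniSim singleton and TRACE macro standing in for SimPlatform and DT():

#include <cstdint>
#include <iostream>

class MiniSim {                       // stand-in for vortex::SimPlatform
public:
  static MiniSim& instance() { static MiniSim sim; return sim; }
  uint64_t cycles() const { return cycles_; }
  void tick() { ++cycles_; }
private:
  uint64_t cycles_ = 0;
};

// the macro reads the current cycle from the singleton, so call sites
// only pass a verbosity level and a message expression
#define TRACE(lvl, x) do { \
    if ((lvl) <= 3) { \
      std::cout << std::dec << MiniSim::instance().cycles() \
                << ": " << x << std::endl; \
    } \
  } while (0)

int main() {
  MiniSim::instance().tick();
  TRACE(3, "icache-req: addr=0x" << std::hex << 0x80000000u); // "1: icache-req: addr=0x80000000"
  return 0;
}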

View file

@ -87,7 +87,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DPN(2, "-");
continue;
}
rsdata[t][i] = iRegFile_.at(t)[reg];
rsdata[t][i] = ireg_file_.at(t)[reg];
DPN(2, std::hex << rsdata[t][i]);
}
DPN(2, "}" << std::endl);
@ -100,7 +100,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DPN(2, "-");
continue;
}
rsdata[t][i] = fRegFile_.at(t)[reg];
rsdata[t][i] = freg_file_.at(t)[reg];
DPN(2, std::hex << rsdata[t][i]);
}
DPN(2, "}" << std::endl);
@ -460,7 +460,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DP(4, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
DP(4, "dest: v" << rdest);
DP(4, "width" << instr.getVlsWidth());
auto &vd = vRegFile_.at(rdest);
auto &vd = vreg_file_.at(rdest);
switch (instr.getVlsWidth()) {
case 6: {
// load word and unit strided (not checking for unit stride)
@ -517,7 +517,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
switch (instr.getVlsWidth()) {
case 6: {
// store word and unit strided (not checking for unit stride)
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
uint32_t value = *(uint32_t *)(vreg_file_.at(instr.getVs3()).data() + i);
core_->dcache_write(memAddr, value, 4);
DP(4, "store: " << memAddr << " value:" << value);
} break;
@ -784,7 +784,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
// predicate mode
ThreadMask pred;
for (int i = 0; i < num_threads; ++i) {
pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
pred[i] = tmask_.test(i) ? (ireg_file_.at(i).at(rsrc0) != 0) : 0;
}
if (pred.any()) {
tmask_ &= pred;
@ -819,15 +819,15 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->gpu.type = GpuType::SPLIT;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
ThreadMask tmask;
for (int i = 0; i < num_threads; ++i) {
tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
tmask[i] = tmask_.test(i) && !ireg_file_.at(i).at(rsrc0);
}
DomStackEntry e(tmask, nextPC);
domStack_.push(tmask_);
domStack_.push(e);
dom_stack_.push(tmask_);
dom_stack_.push(e);
for (size_t i = 0; i < e.tmask.size(); ++i) {
tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
}
@ -842,7 +842,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DP(3, "*** Unanimous pred");
DomStackEntry e(tmask_);
e.unanimous = true;
domStack_.push(e);
dom_stack_.push(e);
}
} break;
case 3: {
@ -850,25 +850,25 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->exe_type = ExeType::GPU;
trace->gpu.type = GpuType::JOIN;
trace->fetch_stall = true;
if (!domStack_.empty() && domStack_.top().unanimous) {
if (!dom_stack_.empty() && dom_stack_.top().unanimous) {
DP(3, "*** Uninimous branch at join");
tmask_ = domStack_.top().tmask;
tmask_ = dom_stack_.top().tmask;
active_ = tmask_.any();
domStack_.pop();
dom_stack_.pop();
} else {
if (!domStack_.top().fallThrough) {
nextPC = domStack_.top().PC;
if (!dom_stack_.top().fallThrough) {
nextPC = dom_stack_.top().PC;
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
}
tmask_ = domStack_.top().tmask;
tmask_ = dom_stack_.top().tmask;
active_ = tmask_.any();
DPH(3, "*** Join: New TM=");
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
DPN(3, "\n");
domStack_.pop();
dom_stack_.pop();
}
} break;
case 4: {
@ -946,10 +946,10 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
case 0: // vector-vector
switch (func6) {
case 0: {
auto& vr1 = vRegFile_.at(rsrc0);
auto& vr2 = vRegFile_.at(rsrc1);
auto& vd = vRegFile_.at(rdest);
auto& mask = vRegFile_.at(0);
auto& vr1 = vreg_file_.at(rsrc0);
auto& vr2 = vreg_file_.at(rsrc1);
auto& vd = vreg_file_.at(rdest);
auto& mask = vreg_file_.at(0);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t emask = *(uint8_t *)(mask.data() + i);
@ -990,9 +990,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 24: {
// vmseq
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1021,9 +1021,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 25: {
// vmsne
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1052,9 +1052,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 26: {
// vmsltu
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1083,9 +1083,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 27: {
// vmslt
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
int8_t first = *(int8_t *)(vr1.data() + i);
@ -1114,9 +1114,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 28: {
// vmsleu
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1145,9 +1145,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 29: {
// vmsle
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
int8_t first = *(int8_t *)(vr1.data() + i);
@ -1176,9 +1176,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 30: {
// vmsgtu
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1207,9 +1207,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 31: {
// vmsgt
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
int8_t first = *(int8_t *)(vr1.data() + i);
@ -1242,9 +1242,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
switch (func6) {
case 24: {
// vmandnot
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1288,9 +1288,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 25: {
// vmand
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1334,9 +1334,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 26: {
// vmor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1380,9 +1380,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 27: {
// vmxor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1426,9 +1426,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 28: {
// vmornot
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1472,9 +1472,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 29: {
// vmnand
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1518,9 +1518,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 30: {
// vmnor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1564,9 +1564,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 31: {
// vmxnor
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1610,9 +1610,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 37: {
// vmul
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1650,9 +1650,9 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 45: {
// vmacc
auto &vr1 = vRegFile_.at(rsrc0);
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr1 = vreg_file_.at(rsrc0);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t first = *(uint8_t *)(vr1.data() + i);
@ -1693,8 +1693,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
case 6: {
switch (func6) {
case 0: {
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t second = *(uint8_t *)(vr2.data() + i);
@ -1729,8 +1729,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
} break;
case 37: {
// vmul.vx
auto &vr2 = vRegFile_.at(rsrc1);
auto &vd = vRegFile_.at(rdest);
auto &vr2 = vreg_file_.at(rsrc1);
auto &vd = vreg_file_.at(rdest);
if (vtype_.vsew == 8) {
for (int i = 0; i < vl_; i++) {
uint8_t second = *(uint8_t *)(vr2.data() + i);
@ -1805,7 +1805,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DPN(2, "-");
continue;
}
iRegFile_.at(t)[rdest] = rddata[t];
ireg_file_.at(t)[rdest] = rddata[t];
DPN(2, "0x" << std::hex << rddata[t]);
}
DPN(2, "}" << std::endl);
@ -1820,7 +1820,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
DPN(2, "-");
continue;
}
fRegFile_.at(t)[rdest] = rddata[t];
freg_file_.at(t)[rdest] = rddata[t];
DPN(2, "0x" << std::hex << rddata[t]);
}
DPN(2, "}" << std::endl);

View file

@ -12,7 +12,7 @@ using namespace vortex;
NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
void NopUnit::step(uint64_t /*cycle*/) {
void NopUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
@ -25,26 +25,31 @@ void NopUnit::step(uint64_t /*cycle*/) {
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: ExeUnit(ctx, core, "LSU")
, num_threads_(core->arch().num_threads())
, pending_dcache_(LSUQ_SIZE)
, pending_rd_reqs_(LSUQ_SIZE)
, fence_lock_(false)
{}
void LsuUnit::step(uint64_t cycle) {
void LsuUnit::reset() {
pending_rd_reqs_.clear();
fence_lock_ = false;
}
void LsuUnit::tick() {
// handle dcache response
for (uint32_t t = 0; t < num_threads_; ++t) {
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_dcache_.at(mem_rsp.tag);
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_dcache_.release(mem_rsp.tag);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
}
@ -55,26 +60,26 @@ void LsuUnit::step(uint64_t cycle) {
if (smem_rsp_port.empty())
continue;
auto& mem_rsp = smem_rsp_port.front();
auto& entry = pending_dcache_.at(mem_rsp.tag);
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
DT(3, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
<< ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
Output.send(trace, 1);
pending_dcache_.release(mem_rsp.tag);
pending_rd_reqs_.release(mem_rsp.tag);
}
smem_rsp_port.pop();
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_dcache_.empty())
if (!pending_rd_reqs_.empty())
return;
Output.send(fence_state_, 1);
fence_lock_ = false;
DT(3, cycle, "fence-unlock: " << fence_state_);
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
@ -87,17 +92,17 @@ void LsuUnit::step(uint64_t cycle) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, cycle, "fence-lock: " << *trace);
DT(3, "fence-lock: " << *trace);
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (cycle - time);
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
return;
}
// check pending queue capacity
if (pending_dcache_.full()) {
if (pending_rd_reqs_.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
DT(3, "*** lsu-queue-stall: " << *trace);
}
return;
} else {
@ -130,7 +135,7 @@ void LsuUnit::step(uint64_t cycle) {
}
}
auto tag = pending_dcache_.allocate({trace, valid_addrs});
auto tag = pending_rd_reqs_.allocate({trace, valid_addrs});
for (uint32_t t = 0; t < num_threads_; ++t) {
if (!trace->tmask.test(t))
@ -145,15 +150,16 @@ void LsuUnit::step(uint64_t cycle) {
mem_req.write = is_write;
mem_req.non_cacheable = (type == AddrType::IO);
mem_req.tag = tag;
mem_req.core_id = core_->id();
mem_req.core_id = trace->cid;
mem_req.uuid = trace->uuid;
if (type == AddrType::Shared) {
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
DT(3, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
} else {
dcache_req_port.send(mem_req, 2);
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
DT(3, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", type=" << trace->lsu.type << ", tid=" << t << ", nc=" << mem_req.non_cacheable << ", " << *trace);
}
@ -163,20 +169,20 @@ void LsuUnit::step(uint64_t cycle) {
// do not wait on writes
if (is_write) {
pending_dcache_.release(tag);
pending_rd_reqs_.release(tag);
Output.send(trace, 1);
}
// remove input
auto time = Input.pop();
core_->perf_stats_.lsu_stalls += (cycle - time);
core_->perf_stats_.lsu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
void AluUnit::step(uint64_t cycle) {
void AluUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
@ -196,33 +202,33 @@ void AluUnit::step(uint64_t cycle) {
default:
std::abort();
}
DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
DT(3, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.alu_stalls += (cycle - time);
core_->perf_stats_.alu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
void CsrUnit::step(uint64_t cycle) {
void CsrUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
Output.send(trace, 1);
auto time = Input.pop();
core_->perf_stats_.csr_stalls += (cycle - time);
DT(3, cycle, "pipeline-execute: op=CSR, " << *trace);
core_->perf_stats_.csr_stalls += (SimPlatform::instance().cycles() - time);
DT(3, "pipeline-execute: op=CSR, " << *trace);
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
void FpuUnit::step(uint64_t cycle) {
void FpuUnit::tick() {
if (Input.empty())
return;
auto trace = Input.front();
@ -245,9 +251,9 @@ void FpuUnit::step(uint64_t cycle) {
default:
std::abort();
}
DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
DT(3, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (cycle - time);
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
///////////////////////////////////////////////////////////////////////////////
@ -257,8 +263,12 @@ GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
, num_threads_(core->arch().num_threads())
, pending_tex_reqs_(TEXQ_SIZE)
{}
void GpuUnit::reset() {
pending_tex_reqs_.clear();
}
void GpuUnit::step(uint64_t cycle) {
void GpuUnit::tick() {
#ifdef EXT_TEX_ENABLE
// handle memory response
for (uint32_t t = 0; t < num_threads_; ++t) {
@ -268,7 +278,7 @@ void GpuUnit::step(uint64_t cycle) {
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
auto trace = entry.first;
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
DT(3, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
assert(entry.second);
--entry.second; // track remaining blocks
if (0 == entry.second) {
@ -312,7 +322,7 @@ void GpuUnit::step(uint64_t cycle) {
issued = true;
break;
case GpuType::TEX:
if (this->processTexRequest(cycle, trace))
if (this->processTexRequest(trace))
issued = true;
break;
default:
@ -320,22 +330,20 @@ void GpuUnit::step(uint64_t cycle) {
}
if (issued) {
DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
DT(3, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
if (trace->fetch_stall) {
core_->stalled_warps_.reset(trace->wid);
}
auto time = Input.pop();
core_->perf_stats_.fpu_stalls += (cycle - time);
core_->perf_stats_.fpu_stalls += (SimPlatform::instance().cycles() - time);
}
}
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
__unused (cycle);
bool GpuUnit::processTexRequest(pipeline_trace_t* trace) {
// check pending queue capacity
if (pending_tex_reqs_.full()) {
if (!trace->suspend()) {
DT(3, cycle, "*** tex-queue-stall: " << *trace);
DT(3, "*** tex-queue-stall: " << *trace);
}
return false;
} else {
@ -356,14 +364,15 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
continue;
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
for (auto mem_addr : trace->mem_addrs.at(t)) {
for (auto& mem_addr : trace->mem_addrs.at(t)) {
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = (trace->lsu.type == LsuType::STORE);
mem_req.tag = tag;
mem_req.core_id = core_->id();
mem_req.uuid = trace->uuid;
dcache_req_port.send(mem_req, 3);
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
DT(3, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
<< ", tid=" << t << ", "<< trace);
++ core_->perf_stats_.tex_reads;
++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();

View file

@ -18,10 +18,14 @@ public:
, Input(this)
, Output(this)
, core_(core)
{}
{}
virtual ~ExeUnit() {}
virtual void reset() {}
virtual void tick() = 0;
protected:
Core* core_;
};
@ -32,7 +36,7 @@ class NopUnit : public ExeUnit {
public:
NopUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -40,14 +44,16 @@ public:
class LsuUnit : public ExeUnit {
private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_dcache_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_rd_reqs_;
pipeline_trace_t* fence_state_;
bool fence_lock_;
public:
LsuUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void reset();
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -56,7 +62,7 @@ class AluUnit : public ExeUnit {
public:
AluUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -65,7 +71,7 @@ class CsrUnit : public ExeUnit {
public:
CsrUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -74,7 +80,7 @@ class FpuUnit : public ExeUnit {
public:
FpuUnit(const SimContext& ctx, Core*);
void step(uint64_t cycle);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
@ -84,12 +90,14 @@ private:
uint32_t num_threads_;
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace);
bool processTexRequest(pipeline_trace_t* trace);
public:
GpuUnit(const SimContext& ctx, Core*);
void reset();
void step(uint64_t cycle);
void tick();
};
}

View file

@ -34,6 +34,11 @@ public:
void pop() {
return entries_.pop();
}
void clear() {
std::queue<pipeline_trace_t*> empty;
std::swap(entries_, empty );
}
};
}
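A brief aside on the clear() added above (the same idiom reappears in PipelineLatch further down): std::queue has no clear() member, so the conventional approach is to swap with a freshly constructed empty queue, which also discards the old underlying storage. A self-contained sketch:

#include <queue>
#include <cassert>

template <typename T>
void clear_queue(std::queue<T>& q) {
  std::queue<T> empty;
  std::swap(q, empty);   // q is now empty; old contents are destroyed with 'empty'
}

int main() {
  std::queue<int> q;
  q.push(1);
  q.push(2);
  clear_queue(q);
  assert(q.empty());
  return 0;
}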

View file

@ -6,6 +6,8 @@
#include <stdlib.h>
#include <sys/stat.h>
#include "processor.h"
#include "archdef.h"
#include "mem.h"
#include "constants.h"
#include <util.h>
#include "args.h"
@ -50,11 +52,14 @@ int main(int argc, char **argv) {
std::cout << "Running " << imgFileName << "..." << std::endl;
if (!SimPlatform::instance().initialize())
return -1;
{
// create processor configuation
ArchDef arch(archStr, num_cores, num_warps, num_threads);
// create memory module
RAM ram(RAM_PAGE_SIZE);
// load program
{
std::string program_ext(fileExtension(imgFileName.c_str()));
if (program_ext == "bin") {
@ -67,27 +72,15 @@ int main(int argc, char **argv) {
}
}
ArchDef arch(archStr, num_cores, num_warps, num_threads);
auto processor = Processor::Create(arch);
processor->attach_ram(&ram);
// setup memory simulator
auto memsim = MemSim::Create(MemSim::Config{
DRAM_CHANNELS,
arch.num_cores()
});
processor->MemReqPort.bind(&memsim->MemReqPort);
memsim->MemRspPort.bind(&processor->MemRspPort);
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
// run simulation
for (;;) {
SimPlatform::instance().step();
if (processor->check_exit(&exitcode))
break;
};
}
SimPlatform::instance().finalize();
processor.run();
}
if (riscv_test) {
if (1 == exitcode) {

View file

@ -13,6 +13,7 @@ DISABLE_WARNING_POP
#include "constants.h"
#include "types.h"
#include "debug.h"
using namespace vortex;
@ -51,37 +52,50 @@ public:
return perf_stats_;
}
void dram_callback(ramulator::Request& req, uint32_t tag) {
MemRsp mem_rsp{tag, (uint32_t)req.coreid};
void dram_callback(ramulator::Request& req, uint32_t tag, uint64_t uuid) {
if (req.type == ramulator::Request::Type::WRITE)
return;
MemRsp mem_rsp{tag, (uint32_t)req.coreid, uuid};
simobject_->MemRspPort.send(mem_rsp, 1);
DT(3, simobject_->name() << "-" << mem_rsp);
}
void step(uint64_t /*cycle*/) {
dram_->tick();
void reset() {
perf_stats_ = PerfStats();
}
void tick() {
if (MEM_CYCLE_RATIO > 0) {
auto cycle = SimPlatform::instance().cycles();
if ((cycle % MEM_CYCLE_RATIO) == 0)
dram_->tick();
} else {
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
dram_->tick();
}
if (simobject_->MemReqPort.empty())
return;
auto& mem_req = simobject_->MemReqPort.front();
if (mem_req.write) {
ramulator::Request dram_req(
mem_req.addr,
ramulator::Request::Type::WRITE,
mem_req.core_id
);
dram_->send(dram_req);
ramulator::Request dram_req(
mem_req.addr,
mem_req.write ? ramulator::Request::Type::WRITE : ramulator::Request::Type::READ,
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag, mem_req.uuid),
mem_req.core_id
);
if (!dram_->send(dram_req))
return;
if (mem_req.write) {
++perf_stats_.writes;
} else {
ramulator::Request dram_req(
mem_req.addr,
ramulator::Request::Type::READ,
std::bind(&Impl::dram_callback, this, placeholders::_1, mem_req.tag),
mem_req.core_id
);
dram_->send(dram_req);
++perf_stats_.reads;
}
DT(3, simobject_->name() << "-" << mem_req);
simobject_->MemReqPort.pop();
}
@ -89,8 +103,8 @@ public:
///////////////////////////////////////////////////////////////////////////////
MemSim::MemSim(const SimContext& ctx, const Config& config)
: SimObject<MemSim>(ctx, "MemSim")
MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config)
: SimObject<MemSim>(ctx, name)
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, config))
@ -100,6 +114,10 @@ MemSim::~MemSim() {
delete impl_;
}
void MemSim::step(uint64_t cycle) {
impl_->step(cycle);
void MemSim::reset() {
impl_->reset();
}
void MemSim::tick() {
impl_->tick();
}

View file

@ -26,10 +26,12 @@ public:
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
MemSim(const SimContext& ctx, const Config& config);
MemSim(const SimContext& ctx, const char* name, const Config& config);
~MemSim();
void step(uint64_t cycle);
void reset();
void tick();
const PerfStats& perf_stats() const;

View file

@ -98,14 +98,40 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state)
return os;
}
class PipelineLatch : public Queue<pipeline_trace_t*> {
class PipelineLatch {
protected:
const char* name_;
std::queue<pipeline_trace_t*> queue_;
public:
PipelineLatch(const char* name = nullptr)
: name_(name)
{}
bool empty() const {
return queue_.empty();
}
pipeline_trace_t* front() {
return queue_.front();
}
pipeline_trace_t* back() {
return queue_.back();
}
void push(pipeline_trace_t* value) {
queue_.push(value);
}
void pop() {
queue_.pop();
}
void clear() {
std::queue<pipeline_trace_t*> empty;
std::swap(queue_, empty );
}
};
}

View file

@ -1,11 +1,11 @@
#include "processor.h"
#include "core.h"
#include "constants.h"
using namespace vortex;
class Processor::Impl {
private:
Processor* simobject_;
std::vector<Core::Ptr> cores_;
std::vector<Cache::Ptr> l2caches_;
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
@ -13,12 +13,13 @@ private:
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
public:
Impl(Processor* simobject, const ArchDef& arch)
: simobject_(simobject)
, cores_(arch.num_cores())
Impl(const ArchDef& arch)
: cores_(arch.num_cores())
, l2caches_(NUM_CLUSTERS)
, l2_mem_switches_(NUM_CLUSTERS)
{
SimPlatform::instance().initialize();
uint32_t num_cores = arch.num_cores();
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
@ -26,12 +27,15 @@ public:
for (uint32_t i = 0; i < num_cores; ++i) {
cores_.at(i) = Core::Create(arch, i);
}
std::vector<SimPort<MemReq>*> mem_req_ports(1);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1);
mem_req_ports.at(0) = &simobject_->MemReqPort;
mem_rsp_ports.at(0) = &simobject_->MemRspPort;
// setup memory simulator
auto memsim = MemSim::Create("dram", MemSim::Config{
MEMORY_BANKS,
arch.num_cores()
});
std::vector<SimPort<MemReq>*> mem_req_ports(1, &memsim->MemReqPort);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1, &memsim->MemRspPort);
if (L3_ENABLE) {
l3cache_ = Cache::Create("l3cache", Cache::Config{
@ -39,7 +43,7 @@ public:
log2ceil(MEM_BLOCK_SIZE), // B
2, // W
0, // A
32, // address bits
32, // address bits
L3_NUM_BANKS, // number of banks
L3_NUM_PORTS, // number of ports
NUM_CLUSTERS, // request size
@ -122,10 +126,8 @@ public:
}
}
~Impl() {}
void step(uint64_t cycle) {
__unused (cycle);
~Impl() {
SimPlatform::instance().finalize();
}
void attach_ram(RAM* ram) {
@ -134,28 +136,33 @@ public:
}
}
bool check_exit(int* exitcode) {
bool running = false;
for (auto& core : cores_) {
if (core->running()) {
running = true;
int run() {
SimPlatform::instance().reset();
bool running;
int exitcode = 0;
do {
SimPlatform::instance().tick();
running = false;
for (auto& core : cores_) {
if (core->running()) {
running = true;
}
if (core->check_exit()) {
exitcode = core->getIRegValue(3);
running = false;
break;
}
}
if (core->check_exit()) {
*exitcode = core->getIRegValue(3);
return true;
}
}
return !running;
} while (running);
return exitcode;
}
};
///////////////////////////////////////////////////////////////////////////////
Processor::Processor(const SimContext& ctx, const ArchDef& arch)
: SimObject<Processor>(ctx, "Vortex")
, MemReqPort(this)
, MemRspPort(this)
, impl_(new Impl(this, arch))
Processor::Processor(const ArchDef& arch)
: impl_(new Impl(arch))
{}
Processor::~Processor() {
@ -166,10 +173,6 @@ void Processor::attach_ram(RAM* mem) {
impl_->attach_ram(mem);
}
bool Processor::check_exit(int* exitcode) {
return impl_->check_exit(exitcode);
}
void Processor::step(uint64_t cycle) {
impl_->step(cycle);
int Processor::run() {
return impl_->run();
}
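The net effect on the host side is a much smaller driver: the Processor's Impl now initializes the SimPlatform in its constructor and finalizes it in its destructor, and run() resets the platform, ticks it until a core signals exit, and returns the exit code. A hedged usage sketch, mirroring the simx main.cpp above (program loading into RAM is elided):

#include "processor.h"   // vortex::Processor
#include "archdef.h"     // vortex::ArchDef
#include "mem.h"         // vortex::RAM
#include "constants.h"   // RAM_PAGE_SIZE

int simulate() {
  // create processor configuration
  vortex::ArchDef arch("rv32i", 1 /*cores*/, 4 /*warps*/, 4 /*threads*/);
  // create memory module (program loading into 'ram' elided)
  vortex::RAM ram(RAM_PAGE_SIZE);
  // create processor and attach memory
  vortex::Processor processor(arch);
  processor.attach_ram(&ram);
  // run simulation: resets the platform, ticks until exit, returns exit code
  return processor.run();
}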

View file

@ -1,22 +1,18 @@
#pragma once
#include "core.h"
namespace vortex {
class Processor : public SimObject<Processor> {
class ArchDef;
class RAM;
class Processor {
public:
SimPort<MemReq> MemReqPort;
SimPort<MemRsp> MemRspPort;
Processor(const SimContext& ctx, const ArchDef& arch);
Processor(const ArchDef& arch);
~Processor();
void attach_ram(RAM* mem);
bool check_exit(int* exitcode);
void step(uint64_t cycle);
int run();
private:
class Impl;

View file

@ -24,11 +24,16 @@ public:
, in_use_fregs_(arch.num_warps())
, in_use_vregs_(arch.num_warps())
{
for (int w = 0; w < arch.num_warps(); ++w) {
in_use_iregs_.at(w).reset();
in_use_fregs_.at(w).reset();
in_use_vregs_.at(w).reset();
this->clear();
}
void clear() {
for (int i = 0, n = in_use_iregs_.size(); i < n; ++i) {
in_use_iregs_.at(i).reset();
in_use_fregs_.at(i).reset();
in_use_vregs_.at(i).reset();
}
owners_.clear();
}
bool in_use(pipeline_trace_t* state) const {

View file

@ -45,7 +45,11 @@ public:
virtual ~SharedMem() {}
void step(uint64_t /*cycle*/) {
void reset() {
perf_stats_ = PerfStats();
}
void tick() {
std::vector<bool> in_used_banks(config_.num_banks);
for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
auto& core_req_port = this->Inputs.at(req_id);

View file

@ -16,6 +16,12 @@ TexUnit::TexUnit(Core* core) : core_(core) {}
TexUnit::~TexUnit() {}
void TexUnit::clear() {
for (auto& state : states_) {
state = 0;
}
}
uint32_t TexUnit::get_state(uint32_t state) {
return states_.at(state);
}

View file

@ -11,6 +11,8 @@ public:
TexUnit(Core* core);
~TexUnit();
void clear();
uint32_t get_state(uint32_t state);
void set_state(uint32_t state, uint32_t value);

View file

@ -213,67 +213,48 @@ struct MemReq {
bool non_cacheable;
uint32_t tag;
uint32_t core_id;
uint64_t uuid;
MemReq(uint64_t _addr = 0,
bool _write = false,
bool _non_cacheable = false,
uint64_t _tag = 0,
uint32_t _core_id = 0
uint32_t _core_id = 0,
uint64_t _uuid = 0
) : addr(_addr)
, write(_write)
, non_cacheable(_non_cacheable)
, tag(_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
};
inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
os << "mem-" << (req.write ? "wr" : "rd") << ": ";
os << "addr=" << req.addr << ", tag=" << req.tag << ", core_id=" << req.core_id;
os << " (#" << std::dec << req.uuid << ")";
return os;
}
///////////////////////////////////////////////////////////////////////////////
struct MemRsp {
uint64_t tag;
uint32_t core_id;
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0)
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _core_id = 0, uint64_t _uuid = 0)
: tag (_tag)
, core_id(_core_id)
, uuid(_uuid)
{}
};
///////////////////////////////////////////////////////////////////////////////
template <typename T>
class Queue {
protected:
std::queue<T> queue_;
public:
Queue() {}
bool empty() const {
return queue_.empty();
}
const T& front() const {
return queue_.front();
}
T& front() {
return queue_.front();
}
const T& back() const {
return queue_.back();
}
T& back() {
return queue_.back();
}
void push(const T& value) {
queue_.push(value);
}
void pop() {
queue_.pop();
}
};
inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
os << "mem-rsp: tag=" << rsp.tag << ", core_id=" << rsp.core_id;
os << " (#" << std::dec << rsp.uuid << ")";
return os;
}
///////////////////////////////////////////////////////////////////////////////
@ -337,6 +318,14 @@ public:
entry.first = false;
--size_;
}
void clear() {
for (uint32_t i = 0, n = entries_.size(); i < n; ++i) {
auto& entry = entries_.at(i);
entry.first = false;
}
size_ = 0;
}
};
///////////////////////////////////////////////////////////////////////////////
@ -376,7 +365,11 @@ public:
}
}
void step(uint64_t /*cycle*/) {
void reset() {
cursor_ = 0;
}
void tick() {
if (ReqIn.size() == 1)
return;
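A hedged usage example of the extended MemReq/MemRsp declared above: the uuid of the originating instruction travels with the request and is copied into the response, so the DT() traces printed by operator<< on both sides can be correlated (the numeric values here are invented):

#include <iostream>
#include "types.h"   // vortex::MemReq, vortex::MemRsp

void trace_example() {
  vortex::MemReq req(0x80000000 /*addr*/, false /*write*/, false /*non_cacheable*/,
                     7 /*tag*/, 0 /*core_id*/, 1234 /*uuid*/);
  std::cout << req << std::endl;   // mem-rd: addr=..., tag=7, core_id=0 (#1234)
  vortex::MemRsp rsp(req.tag, req.core_id, req.uuid);   // echo tag, core_id and uuid back
  std::cout << rsp << std::endl;   // mem-rsp: tag=7, core_id=0 (#1234)
}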

View file

@ -13,12 +13,28 @@ using namespace vortex;
Warp::Warp(Core *core, Word id)
: id_(id)
, core_(core)
, active_(false)
, PC_(STARTUP_ADDR)
, tmask_(0) {
iRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
fRegFile_.resize(core_->arch().num_threads(), std::vector<Word>(core_->arch().num_regs(), 0));
vRegFile_.resize(core_->arch().num_regs(), std::vector<Byte>(core_->arch().vsize(), 0));
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
{
this->clear();
}
void Warp::clear() {
active_ = false;
PC_ = STARTUP_ADDR;
tmask_.reset();
for (int i = 0, n = core_->arch().num_threads(); i < n; ++i) {
for (auto& reg : ireg_file_.at(i)) {
reg = 0;
}
for (auto& reg : freg_file_.at(i)) {
reg = 0;
}
for (auto& reg : vreg_file_.at(i)) {
reg = 0;
}
}
}
void Warp::eval(pipeline_trace_t *trace) {
@ -55,7 +71,7 @@ void Warp::eval(pipeline_trace_t *trace) {
for (int i = 0; i < core_->arch().num_regs(); ++i) {
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
for (int j = 0; j < core_->arch().num_threads(); ++j) {
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << iRegFile_.at(j).at(i) << std::setfill(' ') << ' ');
DPN(4, ' ' << std::setfill('0') << std::setw(8) << std::hex << ireg_file_.at(j).at(i) << std::setfill(' ') << ' ');
}
DPN(4, std::endl);
}

View file

@ -41,6 +41,8 @@ struct vtype {
class Warp {
public:
Warp(Core *core, Word id);
void clear();
bool active() const {
return active_;
@ -84,7 +86,7 @@ public:
}
Word getIRegValue(int reg) const {
return iRegFile_.at(0).at(reg);
return ireg_file_.at(0).at(reg);
}
void eval(pipeline_trace_t *);
@ -100,10 +102,10 @@ private:
Word PC_;
ThreadMask tmask_;
std::vector<std::vector<Word>> iRegFile_;
std::vector<std::vector<Word>> fRegFile_;
std::vector<std::vector<Byte>> vRegFile_;
std::stack<DomStackEntry> domStack_;
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<Word>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
std::stack<DomStackEntry> dom_stack_;
struct vtype vtype_;
int vl_;

View file

@ -23,6 +23,7 @@
#include <future>
#include <list>
#include <queue>
#include <unordered_map>
#ifndef MEMORY_BANKS
@ -33,8 +34,12 @@
#endif
#endif
#ifndef MEM_CYCLE_RATIO
#define MEM_CYCLE_RATIO -1
#endif
#undef MEM_BLOCK_SIZE
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
#define CACHE_BLOCK_SIZE 64
@ -43,8 +48,6 @@
#define CCI_RQ_SIZE 16
#define CCI_WQ_SIZE 16
#define ENABLE_MEM_STALLS
#ifndef TRACE_START_TIME
#define TRACE_START_TIME 0ull
#endif
@ -144,7 +147,7 @@ public:
future_ = std::async(std::launch::async, [&]{
while (!stop_) {
std::lock_guard<std::mutex> guard(mutex_);
this->step();
this->tick();
}
});
}
@ -206,7 +209,7 @@ public:
device_->vcp2af_sRxPort_c0_ReqMmioHdr_address = offset / 4;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
this->step();
this->tick();
device_->vcp2af_sRxPort_c0_mmioRdValid = 0;
assert(device_->af2cp_sTxPort_c2_mmioRdValid);
*value = device_->af2cp_sTxPort_c2_data;
@ -220,7 +223,7 @@ public:
device_->vcp2af_sRxPort_c0_ReqMmioHdr_length = 1;
device_->vcp2af_sRxPort_c0_ReqMmioHdr_tid = 0;
memcpy(device_->vcp2af_sRxPort_c0_data, &value, 8);
this->step();
this->tick();
device_->vcp2af_sRxPort_c0_mmioWrValid = 0;
}
@ -257,17 +260,29 @@ private:
Verilated::assertOn(true);
}
void step() {
void tick() {
this->sRxPort_bus();
this->sTxPort_bus();
this->avs_bus();
if (!dram_queue_.empty()) {
if (dram_->send(dram_queue_.front()))
dram_queue_.pop();
}
device_->clk = 0;
this->eval();
device_->clk = 1;
this->eval();
dram_->tick();
if (MEM_CYCLE_RATIO > 0) {
auto cycle = timestamp / 2;
if ((cycle % MEM_CYCLE_RATIO) == 0)
dram_->tick();
} else {
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
dram_->tick();
}
#ifndef NDEBUG
fflush(stdout);
@ -403,7 +418,7 @@ private:
ramulator::Request::Type::WRITE,
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
if (device_->avs_read[b]) {
@ -431,7 +446,7 @@ private:
}, placeholders::_1, mem_req),
0
);
dram_->send(dram_req);
dram_queue_.push(dram_req);
}
device_->avs_waitrequest[b] = false;
@ -480,6 +495,8 @@ private:
ramulator::Gem5Wrapper* dram_;
std::queue<ramulator::Request> dram_queue_;
Vvortex_afu_shim *device_;
#ifdef VCD_OUTPUT
VerilatedVcdC *trace_;