simx timing simulation refactoring

2025-04-23 21:39:10 -04:00 · 2021-11-14 08:52:34 -05:00 · 2021-11-14 08:52:34 -05:00 · 808bddb586
commit 808bddb586
parent 9656779d48
22 changed files with 1123 additions and 903 deletions
--- a/sim/common/simobject.h
+++ b/sim/common/simobject.h
@ -11,6 +11,128 @@ namespace vortex {

 class SimObjectBase;

+///////////////////////////////////////////////////////////////////////////////
+
+/// Non-templated base shared by all simulation ports.
+///
+/// Holds the module pointer given at construction and at most one peer
+/// port. connect()/disconnect() are protected; the SlavePort and MasterPort
+/// templates are friends and expose them as bind()/unbind().
+class SimPortBase {
+public:
+  virtual ~SimPortBase() {}
+
+  /// Returns the module pointer passed to the constructor.
+  SimObjectBase* module() const {
+    return module_;
+  }
+
+  /// Returns the attached peer, or nullptr when no peer is attached.
+  SimPortBase* peer() const {
+    return peer_;
+  }
+
+  /// Returns true when a peer is currently attached.
+  bool connected() const {
+    return (peer_ != nullptr);
+  }
+
+protected:
+  SimPortBase(SimObjectBase* module)
+    : module_(module)
+    , peer_(nullptr)
+  {}
+
+  /// Attaches `peer`. The port must not already have a peer.
+  void connect(SimPortBase* peer) {
+    assert(peer_ == nullptr);
+    peer_ = peer;
+  }
+
+  /// Detaches the current peer. The port must currently have a peer.
+  void disconnect() {
+    // Precondition is "a peer is attached": detaching an unconnected port
+    // indicates a binding bug, while detaching a connected one is the
+    // normal use and must not trip the assertion.
+    assert(peer_ != nullptr);
+    peer_ = nullptr;
+  }
+
+  // Copy assignment is disabled.
+  SimPortBase& operator=(const SimPortBase&) = delete;
+
+  SimObjectBase* module_;
+  SimPortBase*   peer_;
+
+  template <typename U> friend class SlavePort;
+  template <typename U> friend class MasterPort;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Typed port holding a single-entry packet latch.
+///
+/// write() (reachable by the SimPortEvent friend) stores one packet and
+/// marks it valid; read() hands that packet out and clears the valid flag.
+template <typename Pkt>
+class SimPort : public SimPortBase {
+public:
+  /// Sends `pkt` with the given delay. Defined out of line.
+  void send(const Pkt& pkt, uint64_t delay) const;
+
+  /// Copies the latched packet into `*out` and clears the latch.
+  /// Returns false, leaving `*out` untouched, when no packet is latched.
+  bool read(Pkt* out) {
+    const bool has_packet = valid_;
+    if (has_packet) {
+      *out = data_;
+      valid_ = false;
+    }
+    return has_packet;
+  }
+
+protected:
+  SimPort(SimObjectBase* module) : SimPortBase(module), valid_(false) {}
+
+  /// Latches `data`. The previous packet must already have been read.
+  void write(const Pkt& data) {
+    assert(!valid_);
+    data_  = data;
+    valid_ = true;
+  }
+
+  // Copy assignment is disabled.
+  SimPort& operator=(const SimPort&) = delete;
+
+  Pkt  data_;   // latched packet, meaningful only while valid_ is true
+  bool valid_;  // true between a write() and the following read()
+
+  template <typename U> friend class SimPortEvent;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Slave-side port for packets of type Pkt.
+///
+/// A slave port can only be bound to another slave port.
+template <typename Pkt>
+class SlavePort : public SimPort<Pkt> {
+public:
+  SlavePort(SimObjectBase* module)
+    : SimPort<Pkt>(module)
+  {}
+
+  /// Attaches `peer` as this port's peer.
+  void bind(SlavePort<Pkt>* peer) {
+    SimPortBase* const target = peer;
+    this->connect(target);
+  }
+
+  /// Detaches the current peer.
+  void unbind() {
+    this->disconnect();
+  }
+
+protected:
+  // Copy assignment is disabled.
+  SlavePort& operator=(const SlavePort&) = delete;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+/// Master-side port for packets of type Pkt.
+///
+/// A master port can be bound either to a slave port or to another
+/// master port.
+template <typename Pkt>
+class MasterPort : public SimPort<Pkt> {
+public:
+  MasterPort(SimObjectBase* module)
+    : SimPort<Pkt>(module)
+  {}
+
+  /// Attaches a slave port as this port's peer.
+  void bind(SlavePort<Pkt>* peer) {
+    SimPortBase* const target = peer;
+    this->connect(target);
+  }
+
+  /// Attaches another master port as this port's peer.
+  void bind(MasterPort<Pkt>* peer) {
+    SimPortBase* const target = peer;
+    this->connect(target);
+  }
+
+  /// Detaches the current peer.
+  void unbind() {
+    this->disconnect();
+  }
+
+protected:
+  // Copy assignment is disabled.
+  MasterPort& operator=(const MasterPort&) = delete;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
 class SimEventBase {
 public:
  typedef std::shared_ptr<SimEventBase> Ptr;
@ -32,16 +154,16 @@ protected:
 ///////////////////////////////////////////////////////////////////////////////

 template <typename Pkt>
-class SimSimpleEvent : public SimEventBase {
+class SimCallEvent : public SimEventBase {
 public:
  typedef std::function<void (const Pkt&)> Func;

  template <typename... Args>
  static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) {
-    return std::make_shared<SimSimpleEvent>(func, pkt, delay);
+    return std::make_shared<SimCallEvent>(func, pkt, delay);
  }   

-  SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay) 
+  SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay) 
    : SimEventBase(delay)
    , func_(func)
    , pkt_(pkt)
@ -61,167 +183,23 @@ protected:
 template <typename Pkt>
 class SimPortEvent : public SimEventBase {
 public:
-  typedef std::function<void (const Pkt&, uint32_t)> Func;
-
-  template <typename... Args>
-  static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) {
-    return std::make_shared<SimPortEvent>(func, pkt, port_id, delay);
+  static Ptr Create(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) {
+    return std::make_shared<SimPortEvent>(port, pkt, delay);
  }

-  SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) 
+  SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) 
    : SimEventBase(delay) 
-    , func_(func)
+    , port_(port)
    , pkt_(pkt)
-    , port_id_(port_id)
  {}
  
  void fire() const override {
-    func_(pkt_, port_id_);
+    const_cast<SimPort<Pkt>*>(port_)->write(pkt_);
  }

 private:  
-  Func     func_;
-  Pkt      pkt_;  
-  uint32_t port_id_;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-class SimPortBase {
-public:
-  typedef std::shared_ptr<SimPortBase> Ptr;  
-
-  virtual ~SimPortBase() {}
-  
-  SimObjectBase* module() const {
-    return module_;
-  }
-  
-  uint32_t port_id() const {
-    return port_id_;
-  }
-
-  SimPortBase* peer() const {
-    return peer_;
-  }
-
-  bool connected() const {
-    return (peer_ != nullptr);
-  }
-
-  bool is_slave() const {
-    return is_slave_;
-  }
-
-protected:
-
-  SimPortBase(SimObjectBase* module, bool is_slave);
-
-  void connect(SimPortBase* peer) {
-    assert(peer_ == nullptr);
-    peer_ = peer;
-  }
-
-  void disconnect() { 
-    assert(peer_ == nullptr);  
-    peer_ = nullptr;
-  }
-
-  SimObjectBase* module_;
-  uint32_t       port_id_;
-  bool           is_slave_;
-  SimPortBase*   peer_;
-
-  template <typename Pkt> friend class MasterPort;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <typename Pkt>
-class SlavePort : public SimPortBase {
-public:
-  typedef std::shared_ptr<SlavePort<Ptr>> Ptr;
-  typedef std::function<void (const Pkt&, uint32_t)> Func;
-
-  static Ptr Create(SimObjectBase* module, const Func& func) {
-    return std::make_shared<SlavePort<Pkt>>(module, func);
-  }
-
-  template <typename T>
-  static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) {
-    return std::make_shared<SlavePort<Pkt>>(module, obj, entry);
-  } 
-
-  SlavePort(SimObjectBase* module, const Func& func)
-    : SimPortBase(module, true)
-    , func_(func)
-  {}
-
-  template <typename T>
-  SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t))
-    : SimPortBase(module, true)
-    , func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2))
-  {}
-
-  SlavePort(SimObjectBase* module, SlavePort* peer) 
-    : SimPortBase(module, false) 
-  {
-    this->connect(peer);
-  }
-
-  void send(const Pkt& pkt, uint64_t delay) const;
-
-  const Func& func() const {
-    return func_;
-  }
-
-protected:
-  SlavePort& operator=(const SlavePort&);
-  Func func_;
-};
-
-///////////////////////////////////////////////////////////////////////////////
-
-template <typename Pkt>
-class MasterPort : public SimPortBase {
-public:
-  typedef std::shared_ptr<MasterPort<Ptr>> Ptr;
-  typedef std::function<void (const Pkt&, uint32_t)> Func;
-
-  static Ptr Create() {
-    return std::make_shared<MasterPort<Ptr>>(module);
-  }  
-
-  MasterPort(SimObjectBase* module) : SimPortBase(module, false) {}
-
-  MasterPort(SimObjectBase* module, MasterPort* peer) 
-    : SimPortBase(module, false) 
-  {
-    peer->connect(this);
-  }
-
-  void bind(SlavePort<Pkt>* peer) {
-    this->connect(peer);
-  }
-
-  void unbind() {    
-    peer_->disconnect();
-    this->disconnect();
-  }
-
-  void send(const Pkt& pkt, uint64_t delay) const {
-    assert(peer_ != nullptr);
-    if (peer_->is_slave()) {
-      auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
-      slave->send(pkt, delay);
-    } else {
-      auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
-      master->send(pkt, delay);
-    }  
-  }
-
-private:
-  MasterPort& operator=(const MasterPort&);
+  const SimPort<Pkt>* port_; 
+  Pkt pkt_;
 };

 ///////////////////////////////////////////////////////////////////////////////
@ -237,25 +215,18 @@ public:
  template <typename T, typename Pkt>
  void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay);

-  virtual void step(uint64_t cycle) = 0;
-
  const std::string& name() const {
    return name_;
  }

 protected:

-  SimObjectBase(const SimContext& ctx, const char* name);
+  virtual void step(uint64_t cycle) = 0;

-  uint32_t allocate_port(SimPortBase* port) {
-      uint32_t id = ports_.size();
-      ports_.push_back(port);
-      return id;
-  }
+  SimObjectBase(const SimContext& ctx, const char* name);

 private:
  std::string name_;
-  std::vector<SimPortBase*> ports_;

  friend class SimPlatform;
  friend class SimPortBase;
@ -320,20 +291,19 @@ public:
  }

  template <typename Pkt>
-  void schedule(const typename SimSimpleEvent<Pkt>::Func& callback, 
+  void schedule(const typename SimCallEvent<Pkt>::Func& callback, 
                const Pkt& pkt, 
                uint64_t delay) {    
-    auto evt = SimSimpleEvent<Pkt>::Create(callback, pkt, delay);
+    auto evt = SimCallEvent<Pkt>::Create(callback, pkt, delay);
    assert(delay != 0);
    events_.emplace_back(evt);
  }

  template <typename Pkt>
-  void schedule(const typename SimPortEvent<Pkt>::Func& callback, 
+  void schedule(const SimPort<Pkt>* port, 
                const Pkt& pkt, 
-                uint32_t port_id, 
                uint64_t delay) {
-    auto evt = SimPortEvent<Pkt>::Create(callback, pkt, port_id, delay);
+    auto evt = SimPortEvent<Pkt>::Create(port, pkt, delay);
    assert(delay != 0);
    events_.emplace_back(evt);
  }
@ -383,13 +353,6 @@ private:

 ///////////////////////////////////////////////////////////////////////////////

-inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave) 
-  : module_(module)  
-  , port_id_(module->allocate_port(this))
-  , is_slave_(is_slave)
-  , peer_(nullptr) 
-{}
-
 inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) 
  : name_(name) 
 {}
@ -403,18 +366,11 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
 }

 template <typename Pkt>
-void SlavePort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
-  if (func_) {
-    SimPlatform::instance().schedule(func_, pkt, port_id_, delay);
+void SimPort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
+  if (peer_) {
+    reinterpret_cast<const SimPort<Pkt>*>(peer_)->send(pkt, delay);    
  } else {
-    assert(peer_ != nullptr);
-    if (peer_->is_slave()) {
-      auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
-      slave->send(pkt, delay);
-    } else {
-      auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
-      master->send(pkt, delay);
-    }
+    SimPlatform::instance().schedule(this, pkt, delay);
  }  
 }

--- a/sim/simX/Makefile
+++ b/sim/simX/Makefile
@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
 TOP = vx_cache_sim

 SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp 
-SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp
+SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp

 OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
 VPATH := $(sort $(dir $(SRCS)))
--- a/sim/simX/cache.cpp
+++ b/sim/simX/cache.cpp
@ -1,5 +1,6 @@
 #include "cache.h"
 #include "debug.h"
+#include "types.h"
 #include <util.h>
 #include <unordered_map>
 #include <vector>
@ -30,8 +31,7 @@ struct params_t {
        uint32_t offset_bits = config.B - config.W;
        uint32_t log2_bank_size  = config.C - bank_bits;
        uint32_t index_bits  = log2_bank_size - (config.B << config.A);        
-        assert(log2_bank_size >= config.B);
-        
+        assert(log2_bank_size >= config.B);        
        
        this->words_per_block = 1 << offset_bits;
        this->blocks_per_set  = 1 << config.A;
@ -229,9 +229,10 @@ private:
    CacheConfig config_;
    params_t params_;
    std::vector<bank_t> banks_;
-    std::vector<std::pair<bool, MemReq>> core_reqs_;
-    std::pair<bool, MemRsp> mem_rsp_;
    std::vector<std::queue<uint32_t>> core_rsps_;
+    Switch<MemReq, MemRsp>::Ptr mem_switch_;
+    std::vector<MasterPort<MemReq>> mem_req_ports_;
+    std::vector<SlavePort<MemRsp>>  mem_rsp_ports_;

 public:
    Impl(Cache* simobject, const CacheConfig& config) 
@ -239,16 +240,22 @@ public:
        , config_(config)
        , params_(config)
        , banks_(config.num_banks, {config, params_})
-        , core_reqs_(config.num_inputs)
        , core_rsps_(config.num_inputs)
-    {}    
-
-    void handleMemResponse(const MemRsp& response, uint32_t) {        
-        mem_rsp_ = {true, response};
-    }
-
-    void handleCoreRequest(const MemReq& request, uint32_t port_id) {
-        core_reqs_.at(port_id) = {true, request};
+        , mem_req_ports_(config.num_banks, simobject)
+        , mem_rsp_ports_(config.num_banks, simobject)
+    {
+        if (config.num_banks > 1) {
+            mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
+            for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
+                mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
+                mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
+            }    
+            mem_switch_->ReqOut.bind(&simobject->MemReqPort);
+            simobject->MemRspPort.bind(&mem_switch_->RspIn);
+        } else {
+            mem_req_ports_.at(0).bind(&simobject->MemReqPort);
+            simobject->MemRspPort.bind(&mem_rsp_ports_.at(0));
+        }
    }

    void step(uint64_t /*cycle*/) {
@ -269,31 +276,29 @@ public:
                bank.mshr.try_pop(&active_req);
            }

-            // try schedule stall replay
+            // try schedule stall queue if MSHR has space
            if (!active_req.valid 
-             && !bank.stall_buffer.empty()) {            
+             && !bank.stall_buffer.empty()
+             && !bank.mshr.full()) {            
                active_req = bank.stall_buffer.front();
                bank.stall_buffer.pop();
            }
        }

        // handle memory fills
-        if (mem_rsp_.first) {
-            mem_rsp_.first = false;
-            auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15);
-            auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31);
-            this->processMemoryFill(bank_id, mshr_id);        
+        for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) {
+            MemRsp mem_rsp;
+            if (mem_rsp_ports_.at(i).read(&mem_rsp)) {
+                this->processMemoryFill(i, mem_rsp.tag);
+            }
        }
        
        // handle incoming core requests
-        for (uint32_t i = 0, n = core_reqs_.size(); i < n; ++i) {
-            auto& entry = core_reqs_.at(i);
-            if (!entry.first)
+        for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) {
+            MemReq core_req;
+            if (!simobject_->CoreReqPorts.at(i).read(&core_req))
                continue;
-                
-            entry.first = false;

-            auto& core_req = entry.second;
            auto bank_id   = params_.addr_bank_id(core_req.addr);
            auto set_id    = params_.addr_set_id(core_req.addr);
            auto tag       = params_.addr_tag(core_req.addr);
@ -417,7 +422,7 @@ public:
                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag);
                        mem_req.write = true;
                        mem_req.tag   = 0;
-                        simobject_->MemReqPort.send(mem_req, 1);
+                        mem_req_ports_.at(bank_id).send(mem_req, 1);
                    } else {
                        // mark block as dirty
                        hit_block.dirty = true;
@ -438,7 +443,8 @@ public:
                        MemReq mem_req;
                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag);
                        mem_req.write = true;
-                        simobject_->MemReqPort.send(mem_req, 1);
+                        mem_req.tag   = 0;
+                        mem_req_ports_.at(bank_id).send(mem_req, 1);
                    }
                }

@ -449,7 +455,7 @@ public:
                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
                        mem_req.write = true;
                        mem_req.tag   = 0;
-                        simobject_->MemReqPort.send(mem_req, 1);
+                        mem_req_ports_.at(bank_id).send(mem_req, 1);
                    }
                    // send core response
                    for (auto& info : active_req.infos) {
@ -467,9 +473,8 @@ public:
                        MemReq mem_req;
                        mem_req.addr  = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
                        mem_req.write = active_req.write;
-                        mem_req.tag = bit_setw(0,            0, 15, bank_id);
-                        mem_req.tag = bit_setw(mem_req.tag, 16, 31, mshr_id);
-                        simobject_->MemReqPort.send(mem_req, 1);
+                        mem_req.tag   = mshr_id;
+                        mem_req_ports_.at(bank_id).send(mem_req, 1);
                    }
                }
            }
@ -480,12 +485,12 @@ public:
 ///////////////////////////////////////////////////////////////////////////////

 Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config) 
-    : SimObject<Cache>(ctx, name)
-    , impl_(new Impl(this, config))
-    , CoreReqPorts(config.num_inputs, {this, impl_, &Cache::Impl::handleCoreRequest})
+    : SimObject<Cache>(ctx, name)    
+    , CoreReqPorts(config.num_inputs, this)
    , CoreRspPorts(config.num_inputs, this)
    , MemReqPort(this)
-    , MemRspPort(this, impl_, &Impl::handleMemResponse)
+    , MemRspPort(this)
+    , impl_(new Impl(this, config))
 {}

 Cache::~Cache() {
--- a/sim/simX/cache.h
+++ b/sim/simX/cache.h
@ -20,11 +20,7 @@ struct CacheConfig {
    uint8_t latency;        // pipeline latency 
 };

-class Cache : public SimObject<Cache> {
-private:
-    class Impl;
-    Impl* impl_;
-    
+class Cache : public SimObject<Cache> {  
 public:
    Cache(const SimContext& ctx, const char* name, const CacheConfig& config);
    ~Cache();
@ -35,6 +31,10 @@ public:
    std::vector<MasterPort<MemRsp>> CoreRspPorts;
    MasterPort<MemReq>              MemReqPort;
    SlavePort<MemRsp>               MemRspPort;
+    
+private:
+    class Impl;
+    Impl* impl_;
 };

 }
--- a/sim/simX/core.cpp
+++ b/sim/simX/core.cpp
@ -12,13 +12,13 @@

 using namespace vortex;

-Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
+Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
    : SimObject(ctx, "Core")
    , id_(id)
    , arch_(arch)
-    , decoder_(decoder)
-    , mem_(mem)
-    , shared_mem_(1, SMEM_SIZE)
+    , decoder_(arch)
+    , mmu_(0, arch.wsize(), true)
+    , shared_mem_(4096)
    , warps_(arch.num_warps())
    , barriers_(arch.num_barriers(), 0)
    , csrs_(arch.num_csrs(), 0)
@ -54,9 +54,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU
        DCACHE_MSHR_SIZE,       // mshr
        2,                      // pipeline latency
      }))
-    , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2)) 
-    , icache_rsp_port_(this, this, &Core::icache_handleCacheReponse)
-    , dcache_rsp_port_(arch.num_threads(), {this, reinterpret_cast<LsuUnit*>(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse})
+    , l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
    , fetch_stage_("fetch")
    , decode_stage_("decode")
    , issue_stage_("issue")
@ -65,36 +63,34 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU
    , pending_icache_(arch_.num_warps())
    , stalled_warps_(0)
    , last_schedule_wid_(0)
-    , pending_instrs_(0)
+    , issued_instrs_(0)
+    , committed_instrs_(0)
    , ebreak_(false)   
    , stats_insts_(0)
    , stats_loads_(0)
    , stats_stores_(0)
-    , MemRspPort(this, &l1_mem_switch_->RspIn)
-    , MemReqPort(this, &l1_mem_switch_->ReqOut)    
+    , MemRspPort(this)
+    , MemReqPort(this)    
 {  
  for (int i = 0; i < arch_.num_warps(); ++i) {
    warps_.at(i) = std::make_shared<Warp>(this, i);
  }

  // register execute units
+  exe_units_.at((int)ExeType::NOP) = std::make_shared<NopUnit>(this);
  exe_units_.at((int)ExeType::ALU) = std::make_shared<AluUnit>(this);
  exe_units_.at((int)ExeType::LSU) = std::make_shared<LsuUnit>(this);
  exe_units_.at((int)ExeType::CSR) = std::make_shared<CsrUnit>(this);
  exe_units_.at((int)ExeType::FPU) = std::make_shared<FpuUnit>(this);  
  exe_units_.at((int)ExeType::GPU) = std::make_shared<GpuUnit>(this);

-  // connect l1 caches
-  icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_);
-  for (int i = 0; i < arch_.num_threads(); ++i) {
-    dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i));
-  }
-
  // connect l1 switch
  icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
  dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]);
  l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort);  
  l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort);
+  this->MemRspPort.bind(&l1_mem_switch_->RspIn);
+  l1_mem_switch_->ReqOut.bind(&this->MemReqPort);

  // activate warp0
  warps_.at(0)->setTmask(0, true);
@ -109,31 +105,24 @@ Core::~Core() {
  }
 }

-void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) {
-  // advance to decode stage
-  uint32_t wid = response.tag;
-  pipeline_state_t state;
-  pending_icache_.remove(wid, &state);
-  auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
-  state.icache_latency = latency;
-  decode_stage_.push(state);
+void Core::attach_ram(RAM* ram) {
+  // bind RAM to memory unit
+  mmu_.attach(*ram, 0, 0xFFFFFFFF);    
 }

 void Core::step(uint64_t cycle) {
-    __unused (cycle);
-  D(2, "###########################################################");
-  D(2, std::dec << "Core" << id_ << ": cycle: " << cycle);
-
-  this->commit();
-  this->execute();
-  this->issue();
-  this->decode();
-  this->fetch();
+  this->commit(cycle);
+  this->execute(cycle);
+  this->issue(cycle);
+  this->decode(cycle);
+  this->fetch(cycle);

  DPN(2, std::flush);
 }

-void Core::warp_scheduler() {
+void Core::warp_scheduler(uint64_t cycle) {
+  __unused (cycle);
+
  bool foundSchedule = false;
  int scheduled_warp = last_schedule_wid_;

@ -159,53 +148,77 @@ void Core::warp_scheduler() {
  stats_insts_ += warp->getActiveThreads();
  
  pipeline_state_t state;
+  state.clear();
+  state.id = (issued_instrs_++ * arch_.num_cores()) + id_;
+
  warp->eval(&state);

-  D(4, state);  
+  DT(3, cycle, "pipeline-schedule: " << state);

-  // advance to fetch stage
-  ++pending_instrs_;
+  // advance to fetch stage  
  fetch_stage_.push(state);
 }

-void Core::fetch() {
-  // schedule icache request
-  pipeline_state_t state;
-  if (fetch_stage_.try_pop(&state)) {
-    state.icache_latency = SimPlatform::instance().cycles();
-    MemReq mem_req;
-    mem_req.addr  = state.PC;
-    mem_req.write = false;
-    mem_req.tag   = pending_icache_.allocate(state);    
-    icache_->CoreReqPorts.at(0).send(mem_req, 1);
+void Core::fetch(uint64_t cycle) {
+  // handle icache response
+  {
+    MemRsp mem_rsp;
+    if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){
+      pipeline_state_t state;
+      pending_icache_.remove(mem_rsp.tag, &state);
+      auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
+      state.icache_latency = latency;
+      decode_stage_.push(state);
+      DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state);
+    }
+  }
+
+  // send icache request
+  {
+    pipeline_state_t state;
+    if (fetch_stage_.try_pop(&state)) {
+      state.icache_latency = SimPlatform::instance().cycles();
+      MemReq mem_req;
+      mem_req.addr  = state.PC;
+      mem_req.write = false;
+      mem_req.tag   = pending_icache_.allocate(state);    
+      icache_->CoreReqPorts.at(0).send(mem_req, 1);
+      DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state);
+    }
  }  

  // schedule next warp
-  this->warp_scheduler();  
+  this->warp_scheduler(cycle);  
 }

-void Core::decode() {
+void Core::decode(uint64_t cycle) {
+  __unused (cycle);
+
  pipeline_state_t state;
  if (!decode_stage_.try_pop(&state))
    return;    
  
-  if (state.stall_warp) {
-    D(3, "*** warp#" << state.wid << " fetch stalled");
-  } else {
-    // release warp
+  // release warp
+  if (!state.stall_warp) {
    stalled_warps_.reset(state.wid);
  }
+
+  DT(3, cycle, "pipeline-decode: " << state);
  
  // advance to issue stage
  issue_stage_.push(state);
 }

-void Core::issue() {
+void Core::issue(uint64_t cycle) {
+  __unused (cycle);
+
  if (!issue_stage_.empty()) {
    // insert to ibuffer 
    auto& state = issue_stage_.top();
    auto& ibuffer = ibuffers_.at(state.wid);
-    if (!ibuffer.full()) {
+    if (ibuffer.full()) {
+      DT(3, cycle, "*** ibuffer-stall: " << state);
+    } else {
      ibuffer.push(state);
      issue_stage_.pop();
    }
@ -219,8 +232,18 @@ void Core::issue() {
    auto& state = ibuffer.top();

    // check scoreboard
-    if (scoreboard_.in_use(state))
+    if (scoreboard_.in_use(state)) {
+      DTH(3, cycle, "*** scoreboard-stall: dependents={");
+      auto owners = scoreboard_.owners(state);
+      for (uint32_t i = 0, n = owners.size(); i < n; ++i) {
+        if (i) DTN(3, ", ");
+        DTN(3, "#" << owners.at(i));  
+      }
+      DTN(3, "}, " << state << std::endl);
      continue;
+    }
+
+    DT(3, cycle, "pipeline-issue: " << state);

    // update scoreboard
    scoreboard_.reserve(state);
@ -233,18 +256,19 @@ void Core::issue() {
  }
 }

-void Core::execute() {
+void Core::execute(uint64_t cycle) {
  // process stage inputs
  if (!execute_stage_.empty()) {
    auto& state = execute_stage_.top();
    auto& exe_unit = exe_units_.at((int)state.exe_type);
    exe_unit->push_input(state);
    execute_stage_.pop();
+    DT(3, cycle, "pipeline-execute: " << state);
  }

  // advance execute units
  for (auto& exe_unit : exe_units_) {
-    exe_unit->step();
+    exe_unit->step(cycle);
  }  
  
  // commit completed instructions
@ -255,18 +279,29 @@ void Core::execute() {
        stalled_warps_.reset(state.wid);
      }
      // advance to commit stage
-      commit_stage_.push(state);      
+      commit_stage_.push(state);   
    }
  }
 }

-void Core::commit() {
+void Core::commit(uint64_t cycle) {
+  __unused (cycle);
+  
  pipeline_state_t state;
  if (!commit_stage_.try_pop(&state))
    return;

+  DT(3, cycle, "pipeline-commit: " << state);
+
  // update scoreboard
  scoreboard_.release(state);
+
+  assert(committed_instrs_ <= issued_instrs_);
+  ++committed_instrs_;
+}
+
+bool Core::running() const {
+  return (committed_instrs_ != issued_instrs_);
 }

 Word Core::get_csr(Addr addr, int tid, int wid) {
@ -349,9 +384,9 @@ void Core::barrier(int bar_id, int count, int warp_id) {
  barrier.reset();
 }

-Word Core::icache_fetch(Addr addr) {
+Word Core::icache_read(Addr addr, Size size) {
  Word data;
-  mem_.read(&data, addr, sizeof(Word), 0);
+  mmu_.read(&data, addr, size, 0);
  return data;
 }

@ -365,7 +400,7 @@ Word Core::dcache_read(Addr addr, Size size) {
     return data;
  }
 #endif
-  mem_.read(&data, addr, size, 0);
+  mmu_.read(&data, addr, size, 0);
  return data;
 }

@ -383,11 +418,7 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
     this->writeToStdOut(addr, data);
     return;
  }
-  mem_.write(&data, addr, size, 0);
-}
-
-bool Core::running() const {
-  return pending_instrs_;
+  mmu_.write(&data, addr, size, 0);
 }

 void Core::printStats() const {
@ -399,7 +430,7 @@ void Core::printStats() const {

 void Core::writeToStdOut(Addr addr, Word data) {
  uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
-  auto& ss_buf = print_bufs_.at(tid);
+  auto& ss_buf = print_bufs_[tid];
  char c = (char)data;
  ss_buf << c;
  if (c == '\n') {
--- a/sim/simX/core.h
+++ b/sim/simX/core.h
@ -25,9 +25,11 @@ namespace vortex {

 class Core : public SimObject<Core> {
 public:
-  Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id);
+  Core(const SimContext& ctx, const ArchDef &arch, Word id);
  ~Core();

+  void attach_ram(RAM* ram);
+
  bool running() const;

  void step(uint64_t cycle);
@ -64,7 +66,7 @@ public:

  void barrier(int bar_id, int count, int warp_id);

-  Word icache_fetch(Addr);
+  Word icache_read(Addr, Size);

  Word dcache_read(Addr, Size);

@ -76,22 +78,21 @@ public:

 private:

-  void fetch();
-  void decode();
-  void issue();
-  void execute();
-  void commit();
+  void fetch(uint64_t cycle);
+  void decode(uint64_t cycle);
+  void issue(uint64_t cycle);
+  void execute(uint64_t cycle);
+  void commit(uint64_t cycle);

-  void warp_scheduler();
-
-  void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id);
+  void warp_scheduler(uint64_t cycle);

  void writeToStdOut(Addr addr, Word data);

  Word id_;
-  const ArchDef& arch_;
-  const Decoder& decoder_;
-  MemoryUnit& mem_;
+  const ArchDef arch_;
+  const Decoder decoder_;
+  MemoryUnit mmu_;
+
 #ifdef SM_ENABLE
  RAM shared_mem_;
 #endif 
@ -106,8 +107,6 @@ private:
  Cache::Ptr icache_;
  Cache::Ptr dcache_;
  Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
-  SlavePort<MemRsp> icache_rsp_port_;
-  std::vector<SlavePort<MemRsp>> dcache_rsp_port_;

  PipelineStage fetch_stage_;
  PipelineStage decode_stage_;
@ -118,10 +117,12 @@ private:
  HashTable<pipeline_state_t> pending_icache_;
  WarpMask stalled_warps_;  
  uint32_t last_schedule_wid_;
-  uint32_t pending_instrs_;
+  uint32_t issued_instrs_;
+  uint32_t committed_instrs_;
  bool ebreak_;

  std::unordered_map<int, std::stringstream> print_bufs_;
+  
  uint64_t stats_insts_;
  uint64_t stats_loads_;
  uint64_t stats_stores_;
--- a/sim/simX/debug.h
+++ b/sim/simX/debug.h
@ -7,14 +7,15 @@
 #define DEBUG_HEADER << "DEBUG "
 //#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": "

+#define TRACE_HEADER << "TRACE "
+//#define TRACE_HEADER << "TRACE " << __FILE__ << ':' << std::dec << __LINE__ << ": "
+
 #ifndef NDEBUG

 #include <iostream>
 #include <iomanip>

-#define DX(x) x
-
-#define D(lvl, x) do { \
+#define DP(lvl, x) do { \
  if ((lvl) <= DEBUG_LEVEL) { \
    std::cout DEBUG_HEADER << x << std::endl; \
  } \
@ -32,12 +33,33 @@
  } \
 } while(0)

+// Timestamped trace line: when lvl <= DEBUG_LEVEL, prints TRACE_HEADER,
+// then `t` right-aligned in a 10-character decimal field, then ": ", the
+// streamed expression `x`, and std::endl.
+// `x` is deliberately left unparenthesized so callers can pass a stream
+// chain such as `"a=" << a`; `t` is parenthesized so that any expression
+// (e.g. a conditional) is streamed as a single value.
+#define DT(lvl, t, x) do { \
+  if ((lvl) <= DEBUG_LEVEL) { \
+    std::cout TRACE_HEADER << std::setw(10) << std::dec << (t) << std::setw(0) << ": " << x << std::endl; \
+  } \
+} while(0)
+
+// Same as DT but without the trailing std::endl, so the line can be
+// continued with DTN.
+#define DTH(lvl, t, x) do { \
+  if ((lvl) <= DEBUG_LEVEL) { \
+    std::cout TRACE_HEADER << std::setw(10) << std::dec << (t) << std::setw(0) << ": " << x; \
+  } \
+} while(0)
+
+// Trace continuation: streams `x` with no header, timestamp, or newline.
+#define DTN(lvl, x) do { \
+  if ((lvl) <= DEBUG_LEVEL) { \
+    std::cout << x; \
+  } \
+} while(0)
+
+
 #else

-#define DX(x)
-#define D(lvl, x) do {} while(0)
+#define DP(lvl, x) do {} while(0)
 #define DPH(lvl, x) do {} while(0)
 #define DPN(lvl, x) do {} while(0)
-#define D_RAW(x) do {} while(0)
+
+#define DT(lvl, t, x) do {} while(0)
+#define DTH(lvl, t, x) do {} while(0)
+#define DTN(lvl, x) do {} while(0)

 #endif
--- a/sim/simX/decode.cpp
+++ b/sim/simX/decode.cpp
@ -194,47 +194,26 @@ static const char* op_string(const Instr &instr) {
 namespace vortex {
 std::ostream &operator<<(std::ostream &os, const Instr &instr) {
  os << op_string(instr) << ": ";
-  auto opcode = instr.getOpcode();
-    
-  auto rd_to_string = [&]() {
-    int rdt = instr.getRDType();
-    int rd = instr.getRDest();
-    switch (rdt) {
-    case 1: os << "r" << std::dec << rd << " <- "; break;
-    case 2: os << "fr" << std::dec << rd << " <- "; break;
-    case 3: os << "vr" << std::dec << rd << " <- "; break;
-    default: break;
-    }
-  };
-
-  auto rs_to_string = [&](int i) {
-    int rst = instr.getRSType(i);
-    int rs = instr.getRSrc(i);    
-    switch (rst) {
-    case 1: os << "r" << std::dec << rs; break;
-    case 2: os << "fr" << std::dec << rs; break;
-    case 3: os << "vr" << std::dec << rs; break;
-    default: break;
-    }
-  };
-
+  auto opcode = instr.getOpcode();    
  if (opcode == S_INST 
   || opcode == FS
   || opcode == VS) {     
     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
-     rs_to_string(1);
+     os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
  } else 
  if (opcode == L_INST 
   || opcode == FL
   || opcode == VL) {     
-     rd_to_string();
+     os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
     os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
  } else {
-    rd_to_string();
+    if (instr.getRDType() != RegType::None) {
+      os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
+    }
    int i = 0;
    for (; i < instr.getNRSrc(); ++i) {    
      if (i) os << ", ";
-      rs_to_string(i);
+      os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
    }    
    if (instr.hasImm()) {
      if (i) os << ", ";
@ -281,7 +260,7 @@ Decoder::Decoder(const ArchDef &arch) {
  v_imm_mask_  = 0x7ff;  
 }

-std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {  
+std::shared_ptr<Instr> Decoder::decode(Word code) const {  
  auto instr = std::make_shared<Instr>();
  Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
  instr->setOpcode(op);
@ -297,8 +276,8 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {

  auto op_it = sc_instTable.find(op);
  if (op_it == sc_instTable.end()) {
-    std::cout << std::hex << "invalid opcode: 0x" << op << ", instruction=0x" << code << ", PC=" << PC << std::endl;
-    std::abort();
+    std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::endl;
+    return nullptr;
  }

  auto iType = op_it->second.iType;
@ -459,7 +438,5 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
    std::abort();
  }

-  D(2, "Instr 0x" << std::hex << code << ": " << *instr << std::flush);
-
  return instr;
 }
--- a/sim/simX/decode.h
+++ b/sim/simX/decode.h
@ -13,7 +13,7 @@ class Decoder {
 public:
  Decoder(const ArchDef &);    
  
-  std::shared_ptr<Instr> decode(Word code, Word PC) const;
+  std::shared_ptr<Instr> decode(Word code) const;

 private:

--- a/sim/simX/execute.cpp
+++ b/sim/simX/execute.cpp
@ -75,11 +75,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
  if (num_rsrcs) {              
    for (int i = 0; i < num_rsrcs; ++i) {    
      DPH(2, "Src Reg [" << std::dec << i << "]: ");
-      int type = instr.getRSType(i);
+      auto type = instr.getRSType(i);
      int reg = instr.getRSrc(i);        
      switch (type) {
-      case 1: 
-        DPH(2, "r" << std::dec << reg << "={");
+      case RegType::Integer: 
+        DPN(2, "r" << std::dec << reg << "={");
        for (int t = 0; t < num_threads; ++t) {
          if (t) DPN(2, ", ");
          if (!tmask_.test(t)) {
@ -91,8 +91,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        }
        DPN(2, "}" << std::endl);
        break;
-      case 2: 
-        DPH(2, "fr" << std::dec << reg << "={");
+      case RegType::Float: 
+        DPN(2, "fr" << std::dec << reg << "={");
        for (int t = 0; t < num_threads; ++t) {
          if (t) DPN(2, ", ");
          if (!tmask_.test(t)) {
@ -105,6 +105,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        DPN(2, "}" << std::endl);
        break;
      default: 
+        std::abort();
        break;
      }      
    }
@ -415,7 +416,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    break;
  case L_INST:
    pipeline_state->exe_type = ExeType::LSU;    
-    pipeline_state->lsu.load = 0;    
+    pipeline_state->lsu.type = LsuType::LOAD;
    pipeline_state->used_iregs[rsrc0] = 1;
    pipeline_state->mem_addrs.resize(num_threads);
    for (int t = 0; t < num_threads; ++t) {
@ -425,7 +426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
      Word shift_by  = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
      Word data_read = core_->dcache_read(memAddr, 4);
      pipeline_state->mem_addrs.at(t) = memAddr;
-      D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
+      DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
      switch (func3) {
      case 0:
        // LBI
@ -455,7 +456,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    break;
  case S_INST:     
    pipeline_state->exe_type = ExeType::LSU;    
-    pipeline_state->lsu.store = 1;    
+    pipeline_state->lsu.type = LsuType::STORE;
    pipeline_state->used_iregs[rsrc0] = 1;
    pipeline_state->used_iregs[rsrc1] = 1;
    pipeline_state->mem_addrs.resize(num_threads);
@ -464,7 +465,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        continue;
      Word memAddr = rsdata[t][0] + immsrc;
      pipeline_state->mem_addrs.at(t) = memAddr;
-      D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+      DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
      switch (func3) {
      case 0:
        // SB
@ -543,12 +544,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    break;
  case FENCE:
    pipeline_state->exe_type = ExeType::LSU;    
-    pipeline_state->lsu.fence = 1;
+    pipeline_state->lsu.type = LsuType::FENCE;
    pipeline_state->stall_warp = true;
    break;
  case (FL | VL):
    pipeline_state->exe_type = ExeType::LSU;       
-    pipeline_state->lsu.load = 1;
+    pipeline_state->lsu.type = LsuType::LOAD;
    pipeline_state->used_iregs[rsrc0] = 1;    
    if (func3 == 0x2) {
      pipeline_state->mem_addrs.resize(num_threads);
@ -558,14 +559,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        Word memAddr = rsdata[t][0] + immsrc;
        pipeline_state->mem_addrs.at(t) = memAddr;
        Word data_read = core_->dcache_read(memAddr, 4);        
-        D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
+        DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
        rddata[t] = data_read;
      }
    } else {  
-      D(3, "Executing vector load");      
-      D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
-      D(3, "dest: v" << rdest);
-      D(3, "width" << instr.getVlsWidth());
+      DP(3, "Executing vector load");      
+      DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
+      DP(3, "dest: v" << rdest);
+      DP(3, "width" << instr.getVlsWidth());
      pipeline_state->mem_addrs.resize(vl_);
      auto &vd = vRegFile_.at(rdest);
      switch (instr.getVlsWidth()) {
@ -574,9 +575,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        for (int i = 0; i < vl_; i++) {
          Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
          pipeline_state->mem_addrs.at(i) = memAddr;
-          D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+          DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
          Word data_read = core_->dcache_read(memAddr, 4);
-          D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
+          DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
          int *result_ptr = (int *)(vd.data() + i);
          *result_ptr = data_read;            
        }
@ -590,7 +591,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    break;
  case (FS | VS):
    pipeline_state->exe_type = ExeType::LSU;       
-    pipeline_state->lsu.store = 1;
+    pipeline_state->lsu.type = LsuType::STORE;
    pipeline_state->used_iregs[rsrc0] = 1;
    pipeline_state->used_iregs[rsrc1] = 1;    
    if (func3 == 0x2) {
@ -601,20 +602,20 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        Word memAddr = rsdata[t][0] + immsrc;
        pipeline_state->mem_addrs.at(t) = memAddr;
        core_->dcache_write(memAddr, rsdata[t][1], 4);
-        D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+        DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
      }
    } else {      
      pipeline_state->mem_addrs.resize(vl_);
      for (int i = 0; i < vl_; i++) {
        Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
        pipeline_state->mem_addrs.at(i) = memAddr;
-        D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
+        DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
        switch (instr.getVlsWidth()) {
        case 6: {
          //store word and unit strided (not checking for unit stride)          
          uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
          core_->dcache_write(memAddr, value, 4);
-          D(3, "store: " << memAddr << " value:" << value);
+          DP(3, "store: " << memAddr << " value:" << value);
        } break;
        default:
          std::abort();
@ -705,9 +706,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
        } else {          
          // FMV.X.W
          rddata[t] = rsdata[t][0];
-          pipeline_state->fpu.type = FpuType::FNCP;
-          pipeline_state->used_fregs[rsrc0] = 1;
-        } 
+        }        
+        pipeline_state->fpu.type = FpuType::FNCP;
+        pipeline_state->used_fregs[rsrc0] = 1;
        break;
      case 0x50:             
        switch(func3) {              
@ -783,132 +784,138 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
    }
    rd_write = true;
    break;
-  case GPGPU:
-    pipeline_state->exe_type = ExeType::GPU; 
+  case GPGPU: {
+    pipeline_state->exe_type = ExeType::GPU;
+    int ts = 0;
    for (int t = 0; t < num_threads; ++t) {
-      if (!tmask_.test(t))
-        continue;
-      switch (func3) {
-      case 0: {
-        // TMC        
-        pipeline_state->gpu.type = GpuType::TMC;
-        pipeline_state->used_iregs[rsrc0] = 1;
-        pipeline_state->stall_warp = true;
-        if (rsrc1) {
-          // predicate mode
-          ThreadMask pred;
-          for (int i = 0; i < num_threads; ++i) {
-            pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
-          }
-          if (pred.any()) {
-            tmask_ &= pred;
-          }
-        } else {
-          tmask_.reset();
-          for (int i = 0; i < num_threads; ++i) {
-            tmask_.set(i, rsdata.at(t)[0] & (1 << i));
-          }
-        }
-        D(3, "*** TMC " << tmask_);
-        active_ = tmask_.any();
-        break; // runOnce
-      } break;
-      case 1: {
-        // WSPAWN
-        pipeline_state->gpu.type = GpuType::WSPAWN;
-        pipeline_state->used_iregs[rsrc0] = 1;
-        pipeline_state->used_iregs[rsrc1] = 1;
-        pipeline_state->stall_warp = true;
-        int active_warps = std::min<int>(rsdata.at(t)[0], core_->arch().num_warps());
-        D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(t)[1]);
-        for (int i = 1; i < active_warps; ++i) {
-          Warp &newWarp = core_->warp(i);
-          newWarp.setPC(rsdata[t][1]);
-          newWarp.setTmask(0, true);
-        }
-        break; // runOnce
-      } break;
-      case 2: {
-        // SPLIT    
-        pipeline_state->gpu.type = GpuType::SPLIT;
-        pipeline_state->used_iregs[rsrc0] = 1;
-        pipeline_state->stall_warp = true;
-        if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {          
-          ThreadMask tmask;
-          for (int i = 0; i < num_threads; ++i) {
-            tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
-          }
-
-          DomStackEntry e(tmask, nextPC);
-          domStack_.push(tmask_);
-          domStack_.push(e);
-          for (size_t i = 0; i < e.tmask.size(); ++i) {
-            tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
-          }
-          active_ = tmask_.any();
-
-          DPH(3, "*** Split: New TM=");
-          for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
-          DPN(3, ", Pushed TM=");
-          for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1));
-          DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
-        } else {
-          D(3, "*** Unanimous pred");
-          DomStackEntry e(tmask_);
-          e.unanimous = true;
-          domStack_.push(e);
-        }        
-        break; // runOnce
-      } break;
-      case 3: {
-        // JOIN
-        pipeline_state->gpu.type = GpuType::JOIN;        
-        pipeline_state->stall_warp = true;        
-        if (!domStack_.empty() && domStack_.top().unanimous) {
-          D(3, "*** Uninimous branch at join");
-          tmask_ = domStack_.top().tmask;
-          active_ = tmask_.any();
-          domStack_.pop();
-        } else {
-          if (!domStack_.top().fallThrough) {
-            nextPC = domStack_.top().PC;
-            D(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
-          }
-
-          tmask_ = domStack_.top().tmask;
-          active_ = tmask_.any();
-
-          DPH(3, "*** Join: New TM=");
-          for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
-          DPN(3, "\n");
-
-          domStack_.pop();
-        }        
-        break; // runOnce
-      } break;
-      case 4: {
-        // BAR
-        pipeline_state->gpu.type = GpuType::BAR;
-        pipeline_state->used_iregs[rsrc0] = 1;
-        pipeline_state->used_iregs[rsrc1] = 1;
-        pipeline_state->stall_warp = true; 
-        active_ = false;
-        core_->barrier(rsdata[t][0], rsdata[t][1], id_);        
-        break; // runOnce
-      } break;
-      case 6: {
-        // PREFETCH
-        pipeline_state->exe_type = ExeType::LSU; 
-        pipeline_state->lsu.prefetch = 1; 
-        pipeline_state->used_iregs[rsrc0] = 1;
-        int addr = rsdata[t][0];
-        printf("*** PREFETCHED %d ***\n", addr);
-      } break;
-      default:
-        std::abort();
+      if (tmask_.test(t)) {
+        ts = t;
+        break;
      }
    }
-    break;
+    switch (func3) {
+    case 0: {
+      // TMC        
+      pipeline_state->gpu.type = GpuType::TMC;
+      pipeline_state->used_iregs[rsrc0] = 1;
+      pipeline_state->stall_warp = true;
+      if (rsrc1) {
+        // predicate mode
+        ThreadMask pred;
+        for (int i = 0; i < num_threads; ++i) {
+          pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
+        }
+        if (pred.any()) {
+          tmask_ &= pred;
+        }
+      } else {
+        tmask_.reset();
+        for (int i = 0; i < num_threads; ++i) {
+          tmask_.set(i, rsdata.at(ts)[0] & (1 << i));
+        }
+      }
+      DPH(3, "*** New TMC: ");
+      for (int i = 0; i < num_threads; ++i)
+        DPN(3, tmask_.test(num_threads-i-1));
+      DPN(3, std::endl);
+
+      active_ = tmask_.any();
+    } break;
+    case 1: {
+      // WSPAWN
+      pipeline_state->gpu.type = GpuType::WSPAWN;
+      pipeline_state->used_iregs[rsrc0] = 1;
+      pipeline_state->used_iregs[rsrc1] = 1;
+      pipeline_state->stall_warp = true;
+      int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
+      DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
+      for (int i = 1; i < active_warps; ++i) {
+        Warp &newWarp = core_->warp(i);
+        newWarp.setPC(rsdata[ts][1]);
+        newWarp.setTmask(0, true);
+      }
+    } break;
+    case 2: {
+      // SPLIT    
+      pipeline_state->gpu.type = GpuType::SPLIT;
+      pipeline_state->used_iregs[rsrc0] = 1;
+      pipeline_state->stall_warp = true;
+      if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {          
+        ThreadMask tmask;
+        for (int i = 0; i < num_threads; ++i) {
+          tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
+        }
+
+        DomStackEntry e(tmask, nextPC);
+        domStack_.push(tmask_);
+        domStack_.push(e);
+        for (size_t i = 0; i < e.tmask.size(); ++i) {
+          tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
+        }
+        active_ = tmask_.any();
+
+        DPH(3, "*** Split: New TM=");
+        for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
+        DPN(3, ", Pushed TM=");
+        for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1));
+        DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
+      } else {
+        DP(3, "*** Unanimous pred");
+        DomStackEntry e(tmask_);
+        e.unanimous = true;
+        domStack_.push(e);
+      }        
+    } break;
+    case 3: {
+      // JOIN
+      pipeline_state->gpu.type = GpuType::JOIN;        
+      pipeline_state->stall_warp = true;        
+      if (!domStack_.empty() && domStack_.top().unanimous) {
+        DP(3, "*** Uninimous branch at join");
+        tmask_ = domStack_.top().tmask;
+        active_ = tmask_.any();
+        domStack_.pop();
+      } else {
+        if (!domStack_.top().fallThrough) {
+          nextPC = domStack_.top().PC;
+          DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
+        }
+
+        tmask_ = domStack_.top().tmask;
+        active_ = tmask_.any();
+
+        DPH(3, "*** Join: New TM=");
+        for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
+        DPN(3, "\n");
+
+        domStack_.pop();
+      }        
+    } break;
+    case 4: {
+      // BAR
+      pipeline_state->gpu.type = GpuType::BAR;
+      pipeline_state->used_iregs[rsrc0] = 1;
+      pipeline_state->used_iregs[rsrc1] = 1;
+      pipeline_state->stall_warp = true; 
+      active_ = false;
+      core_->barrier(rsdata[ts][0], rsdata[ts][1], id_); 
+    } break;
+    case 6: {
+      // PREFETCH
+      pipeline_state->exe_type = ExeType::LSU; 
+      pipeline_state->lsu.type = LsuType::PREFETCH; 
+      pipeline_state->used_iregs[rsrc0] = 1;
+      for (int t = 0; t < num_threads; ++t) {
+        if (!tmask_.test(t))
+          continue;
+        int addr = rsdata[t][0];
+        printf("*** PREFETCHED %d ***\n", addr);
+      }
+    } break;
+    default:
+      std::abort();
+    }
+    }  break;
  case VSET: {
    int VLEN = core_->arch().vsize() * 8;
    int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
@ -928,7 +935,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
              uint8_t first  = *(uint8_t *)(vr1.data() + i);
              uint8_t second = *(uint8_t *)(vr2.data() + i);
              uint8_t result = first + second;
-              D(3, "Adding " << first << " + " << second << " = " << result);
+              DP(3, "Adding " << first << " + " << second << " = " << result);
              *(uint8_t *)(vd.data() + i) = result;
            }
          }
@ -940,7 +947,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
              uint16_t first  = *(uint16_t *)(vr1.data() + i);
              uint16_t second = *(uint16_t *)(vr2.data() + i);
              uint16_t result = first + second;
-              D(3, "Adding " << first << " + " << second << " = " << result);
+              DP(3, "Adding " << first << " + " << second << " = " << result);
              *(uint16_t *)(vd.data() + i) = result;
            }
          }
@ -952,7 +959,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
              uint32_t first  = *(uint32_t *)(vr1.data() + i);
              uint32_t second = *(uint32_t *)(vr2.data() + i);
              uint32_t result = first + second;
-              D(3, "Adding " << first << " + " << second << " = " << result);
+              DP(3, "Adding " << first << " + " << second << " = " << result);
              *(uint32_t *)(vd.data() + i) = result;
            }
          }
@ -968,7 +975,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first  = *(uint8_t *)(vr1.data() + i);
            uint8_t second = *(uint8_t *)(vr2.data() + i);
            uint8_t result = (first == second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 16) {
@ -976,7 +983,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first  = *(uint16_t *)(vr1.data() + i);
            uint16_t second = *(uint16_t *)(vr2.data() + i);
            uint16_t result = (first == second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 32) {
@ -984,7 +991,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first  = *(uint32_t *)(vr1.data() + i);
            uint32_t second = *(uint32_t *)(vr2.data() + i);
            uint32_t result = (first == second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
        }
@ -999,7 +1006,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first  = *(uint8_t *)(vr1.data() + i);
            uint8_t second = *(uint8_t *)(vr2.data() + i);
            uint8_t result = (first != second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 16) {
@ -1007,7 +1014,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first  = *(uint16_t *)(vr1.data() + i);
            uint16_t second = *(uint16_t *)(vr2.data() + i);
            uint16_t result = (first != second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 32) {
@ -1015,7 +1022,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first  = *(uint32_t *)(vr1.data() + i);
            uint32_t second = *(uint32_t *)(vr2.data() + i);
            uint32_t result = (first != second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
        }
@ -1030,7 +1037,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first  = *(uint8_t *)(vr1.data() + i);
            uint8_t second = *(uint8_t *)(vr2.data() + i);
            uint8_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 16) {
@ -1038,7 +1045,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first  = *(uint16_t *)(vr1.data() + i);
            uint16_t second = *(uint16_t *)(vr2.data() + i);
            uint16_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 32) {
@ -1046,7 +1053,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first  = *(uint32_t *)(vr1.data() + i);
            uint32_t second = *(uint32_t *)(vr2.data() + i);
            uint32_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
        }
@ -1061,7 +1068,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            int8_t first  = *(int8_t *)(vr1.data() + i);
            int8_t second = *(int8_t *)(vr2.data() + i);
            int8_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 16) {
@ -1069,7 +1076,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            int16_t first  = *(int16_t *)(vr1.data() + i);
            int16_t second = *(int16_t *)(vr2.data() + i);
            int16_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(int16_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 32) {
@ -1077,7 +1084,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            int32_t first  = *(int32_t *)(vr1.data() + i);
            int32_t second = *(int32_t *)(vr2.data() + i);
            int32_t result = (first < second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(int32_t *)(vd.data() + i) = result;
          }
        }
@ -1092,7 +1099,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first  = *(uint8_t *)(vr1.data() + i);
            uint8_t second = *(uint8_t *)(vr2.data() + i);
            uint8_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 16) {
@ -1100,7 +1107,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first  = *(uint16_t *)(vr1.data() + i);
            uint16_t second = *(uint16_t *)(vr2.data() + i);
            uint16_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 32) {
@ -1108,7 +1115,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first  = *(uint32_t *)(vr1.data() + i);
            uint32_t second = *(uint32_t *)(vr2.data() + i);
            uint32_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
        }
@ -1123,7 +1130,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            int8_t first  = *(int8_t *)(vr1.data() + i);
            int8_t second = *(int8_t *)(vr2.data() + i);
            int8_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 16) {
@ -1131,7 +1138,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            int16_t first  = *(int16_t *)(vr1.data() + i);
            int16_t second = *(int16_t *)(vr2.data() + i);
            int16_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(int16_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 32) {
@ -1139,7 +1146,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            int32_t first  = *(int32_t *)(vr1.data() + i);
            int32_t second = *(int32_t *)(vr2.data() + i);
            int32_t result = (first <= second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(int32_t *)(vd.data() + i) = result;
          }
        }
@ -1154,7 +1161,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first  = *(uint8_t *)(vr1.data() + i);
            uint8_t second = *(uint8_t *)(vr2.data() + i);
            uint8_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 16) {
@ -1162,7 +1169,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first  = *(uint16_t *)(vr1.data() + i);
            uint16_t second = *(uint16_t *)(vr2.data() + i);
            uint16_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 32) {
@ -1170,7 +1177,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first  = *(uint32_t *)(vr1.data() + i);
            uint32_t second = *(uint32_t *)(vr2.data() + i);
            uint32_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
        }
@ -1185,7 +1192,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            int8_t first  = *(int8_t *)(vr1.data() + i);
            int8_t second = *(int8_t *)(vr2.data() + i);
            int8_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 16) {
@ -1193,7 +1200,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            int16_t first  = *(int16_t *)(vr1.data() + i);
            int16_t second = *(int16_t *)(vr2.data() + i);
            int16_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(int16_t *)(vd.data() + i) = result;
          }
        } else if (vtype_.vsew == 32) {
@ -1201,7 +1208,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            int32_t first  = *(int32_t *)(vr1.data() + i);
            int32_t second = *(int32_t *)(vr2.data() + i);
            int32_t result = (first > second) ? 1 : 0;
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(int32_t *)(vd.data() + i) = result;
          }
        }
@ -1222,7 +1229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first_value  = (first & 0x1);
            uint8_t second_value = (second & 0x1);
            uint8_t result = (first_value & !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }            
          for (int i = vl_; i < VLMAX; i++) {
@ -1235,7 +1242,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first_value  = (first & 0x1);
            uint16_t second_value = (second & 0x1);
            uint16_t result = (first_value & !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1248,7 +1255,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first_value  = (first & 0x1);
            uint32_t second_value = (second & 0x1);
            uint32_t result = (first_value & !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1268,7 +1275,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first_value  = (first & 0x1);
            uint8_t second_value = (second & 0x1);
            uint8_t result = (first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1281,7 +1288,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first_value  = (first & 0x1);
            uint16_t second_value = (second & 0x1);
            uint16_t result = (first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1294,7 +1301,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first_value  = (first & 0x1);
            uint32_t second_value = (second & 0x1);
            uint32_t result = (first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1314,7 +1321,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first_value  = (first & 0x1);
            uint8_t second_value = (second & 0x1);
            uint8_t result = (first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1327,7 +1334,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first_value  = (first & 0x1);
            uint16_t second_value = (second & 0x1);
            uint16_t result = (first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1340,7 +1347,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first_value  = (first & 0x1);
            uint32_t second_value = (second & 0x1);
            uint32_t result = (first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1360,7 +1367,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first_value  = (first & 0x1);
            uint8_t second_value = (second & 0x1);
            uint8_t result = (first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1373,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first_value  = (first & 0x1);
            uint16_t second_value = (second & 0x1);
            uint16_t result = (first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1386,7 +1393,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first_value  = (first & 0x1);
            uint32_t second_value = (second & 0x1);
            uint32_t result = (first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1406,7 +1413,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first_value  = (first & 0x1);
            uint8_t second_value = (second & 0x1);
            uint8_t result = (first_value | !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1419,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first_value  = (first & 0x1);
            uint16_t second_value = (second & 0x1);
            uint16_t result = (first_value | !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1432,7 +1439,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first_value  = (first & 0x1);
            uint32_t second_value = (second & 0x1);
            uint32_t result = (first_value | !second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1452,7 +1459,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first_value  = (first & 0x1);
            uint8_t second_value = (second & 0x1);
            uint8_t result = !(first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1465,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first_value  = (first & 0x1);
            uint16_t second_value = (second & 0x1);
            uint16_t result = !(first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1478,7 +1485,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first_value  = (first & 0x1);
            uint32_t second_value = (second & 0x1);
            uint32_t result = !(first_value & second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1498,7 +1505,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first_value  = (first & 0x1);
            uint8_t second_value = (second & 0x1);
            uint8_t result = !(first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1511,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first_value  = (first & 0x1);
            uint16_t second_value = (second & 0x1);
            uint16_t result = !(first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1524,7 +1531,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first_value  = (first & 0x1);
            uint32_t second_value = (second & 0x1);
            uint32_t result = !(first_value | second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1544,7 +1551,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first_value  = (first & 0x1);
            uint8_t second_value = (second & 0x1);
            uint8_t result = !(first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1557,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first_value  = (first & 0x1);
            uint16_t second_value = (second & 0x1);
            uint16_t result = !(first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1570,7 +1577,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first_value  = (first & 0x1);
            uint32_t second_value = (second & 0x1);
            uint32_t result = !(first_value ^ second_value);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1588,7 +1595,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first  = *(uint8_t *)(vr1.data() + i);
            uint8_t second = *(uint8_t *)(vr2.data() + i);
            uint8_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1599,7 +1606,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first  = *(uint16_t *)(vr1.data() + i);
            uint16_t second = *(uint16_t *)(vr2.data() + i);
            uint16_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1610,7 +1617,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first  = *(uint32_t *)(vr1.data() + i);
            uint32_t second = *(uint32_t *)(vr2.data() + i);
            uint32_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1628,7 +1635,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint8_t first  = *(uint8_t *)(vr1.data() + i);
            uint8_t second = *(uint8_t *)(vr2.data() + i);
            uint8_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) += result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1639,7 +1646,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint16_t first  = *(uint16_t *)(vr1.data() + i);
            uint16_t second = *(uint16_t *)(vr2.data() + i);
            uint16_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) += result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1650,7 +1657,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
            uint32_t first  = *(uint32_t *)(vr1.data() + i);
            uint32_t second = *(uint32_t *)(vr2.data() + i);
            uint32_t result = (first * second);
-            D(3, "Comparing " << first << " + " << second << " = " << result);
+            DP(3, "Comparing " << first << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) += result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1669,7 +1676,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          for (int i = 0; i < vl_; i++) {
            uint8_t second = *(uint8_t *)(vr2.data() + i);
            uint8_t result = (rsdata[i][0] + second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1679,7 +1686,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          for (int i = 0; i < vl_; i++) {
            uint16_t second = *(uint16_t *)(vr2.data() + i);
            uint16_t result = (rsdata[i][0] + second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1689,7 +1696,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          for (int i = 0; i < vl_; i++) {
            uint32_t second = *(uint32_t *)(vr2.data() + i);
            uint32_t result = (rsdata[i][0] + second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1705,7 +1712,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          for (int i = 0; i < vl_; i++) {
            uint8_t second = *(uint8_t *)(vr2.data() + i);
            uint8_t result = (rsdata[i][0] * second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
            *(uint8_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1715,7 +1722,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          for (int i = 0; i < vl_; i++) {
            uint16_t second = *(uint16_t *)(vr2.data() + i);
            uint16_t result = (rsdata[i][0] * second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
            *(uint16_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1725,7 +1732,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
          for (int i = 0; i < vl_; i++) {
            uint32_t second = *(uint32_t *)(vr2.data() + i);
            uint32_t result = (rsdata[i][0] * second);
-            D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
+            DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
            *(uint32_t *)(vd.data() + i) = result;
          }
          for (int i = vl_; i < VLMAX; i++) {
@ -1741,7 +1748,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
      vtype_.vsew  = instr.getVsew();
      vtype_.vlmul = instr.getVlmul();

-      D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew  << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX);
+      DP(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew  << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX);

      int s0 = rsdata[0][0];
      if (s0 <= VLMAX) {
@ -1762,46 +1769,49 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
  }

  if (rd_write) {
+    pipeline_state->wb = true;
    DPH(2, "Dest Reg: ");
-    int rdt = instr.getRDType();    
+    auto rdt = instr.getRDType();    
    switch (rdt) {
-    case 1:      
+    case RegType::Integer:      
      if (rdest) {    
-        DPH(2, "r" << std::dec << rdest << "={");    
+        DPN(2, "r" << std::dec << rdest << "={");    
        for (int t = 0; t < num_threads; ++t) {
-          if (!tmask_.test(t))
-            continue;
-          iRegFile_.at(t)[rdest] = rddata[t];  
          if (t) DPN(2, ", ");
+          if (!tmask_.test(t)) {
+            DPN(2, "-");
+            continue;            
+          }
+          iRegFile_.at(t)[rdest] = rddata[t];
          DPN(2, "0x" << std::hex << rddata[t]);         
        }
        DPN(2, "}" << std::endl);
        pipeline_state->used_iregs[rdest] = 1;
      }
      break;
-    case 2:
-      DPH(2, "fr" << std::dec << rdest << "={");
+    case RegType::Float:
+      DPN(2, "fr" << std::dec << rdest << "={");
      for (int t = 0; t < num_threads; ++t) {
-        if (!tmask_.test(t))
-          continue;
-        fRegFile_.at(t)[rdest] = rddata[t];        
        if (t) DPN(2, ", ");
+        if (!tmask_.test(t)) {
+          DPN(2, "-");
+          continue;            
+        }
+        fRegFile_.at(t)[rdest] = rddata[t];        
        DPN(2, "0x" << std::hex << rddata[t]);         
      }
      DPN(2, "}" << std::endl);
      pipeline_state->used_fregs[rdest] = 1;
      break;
-    case 3:
-      pipeline_state->used_vregs[rdest] = 1;
-      break;
    default:
+      std::abort();
      break;
    }
  }

  PC_ += core_->arch().wsize();
  if (PC_ != nextPC) {
-    D(3, "*** Next PC: " << std::hex << nextPC << std::dec);
+    DP(3, "*** Next PC: " << std::hex << nextPC << std::dec);
    PC_ = nextPC;
  }
 }
--- a/sim/simX/exeunit.cpp
+++ b/sim/simX/exeunit.cpp
@ -9,6 +9,17 @@

 using namespace vortex;

+NopUnit::NopUnit(Core*) : ExeUnit("NOP") {} // the Core argument is ignored; "NOP" is handed to the ExeUnit base constructor
+    
+void NopUnit::step(uint64_t /*cycle*/) { // per-step handler: forwards at most one queued pipeline state, cycle is unused
+    pipeline_state_t state;
+    if (!inputs_.try_pop(&state)) // nothing was popped from inputs_: no work for this call
+        return;
+    this->schedule_output(state, 1); // pass the state through unmodified; the 1 is presumably a delay in cycles — TODO confirm against ExeUnit::schedule_output
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
 LsuUnit::LsuUnit(Core* core) 
    : ExeUnit("LSU")
    , core_(core)
@ -17,61 +28,77 @@ LsuUnit::LsuUnit(Core* core)
    , fence_lock_(false)
 {}

-void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) {
-    auto entry = pending_dcache_.at(response.tag);    
-    entry.second.reset(port_id); // track remaining blocks
-    if (!entry.second.any()) {        
-        auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
-        entry.first.dcache_latency = latency;
-        this->schedule_output(entry.first, 1);
-        pending_dcache_.release(response.tag);
-    }
-}
+void LsuUnit::step(uint64_t cycle) { // one step: drain dcache responses, service a pending fence, then issue at most one queued request
+    __unused (cycle); // cycle is only consumed by the DT trace macro below; presumably DT can expand to nothing — TODO confirm
+
+    // handle dcache responses: poll every thread's response port once per step
+    for (uint32_t t = 0; t < num_threads_; ++t) {
+        MemRsp mem_rsp;
+        if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp)) // read() returned false: no response on this thread's port
+            continue;
+        auto& entry = pending_dcache_.at(mem_rsp.tag);  // taken by reference so the reset below updates the stored entry, not a copy
+        DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first);  
+        assert(entry.second.test(t)); // a response must match a thread bit that is still outstanding
+        entry.second.reset(t); // track remaining blocks: clear this thread's outstanding bit
+        if (!entry.second.any()) {        // no outstanding bits left: the whole request has been answered
+            auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency); // dcache_latency held the issue-time cycle count (set where the request is sent below)
+            entry.first.dcache_latency = latency; // field now carries the elapsed cycle count instead
+            this->schedule_output(entry.first, 1);
+            pending_dcache_.release(mem_rsp.tag); // hand the tag back to pending_dcache_
+        }   
+    }

-void LsuUnit::step() {
    if (fence_lock_) {
        // wait for all pending memory operations to complete
        if (!pending_dcache_.empty())
            return;
        this->schedule_output(fence_state_, 1);
        fence_lock_ = false;
+        DT(3, cycle, "fence-unlock: " << fence_state_); // trace only; execution continues to the input-queue check below
    }

+    // check input queue: nothing to issue when it is empty
    if (inputs_.empty())
        return;

    auto state = inputs_.top();

-    if (state.lsu.fence) {
+    if (state.lsu.type == LsuType::FENCE) { // fences are held in fence_state_ and released by the fence_lock_ branch above
        // schedule fence lock
        fence_state_ = state;
        fence_lock_ = true;
        inputs_.pop();
+        DT(3, cycle, "fence-lock: " << state);
        return;
    }

-    // send dcache requests
-    if (!pending_dcache_.full()) {   
-        state.dcache_latency = SimPlatform::instance().cycles();
-        auto tag = pending_dcache_.allocate({state, state.tmask});         
-        for (uint32_t t = 0; t < num_threads_; ++t) {
-            if (!state.tmask.test(t))
-                continue;
-            MemReq mem_req;
-            mem_req.addr  = state.mem_addrs.at(t);
-            mem_req.write = state.lsu.store;
-            mem_req.tag   = tag;
-            core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
-        }            
-        inputs_.pop();
+    // check pending queue capacity: when full, leave the request at the head of inputs_ and return
+    if (pending_dcache_.full()) {
+        DT(3, cycle, "*** lsu-queue-stall: " << state);
+        return;
    }
+
+    // send dcache request: one MemReq per thread set in tmask, all sharing one tag
+    state.dcache_latency = SimPlatform::instance().cycles(); // record the issue-time cycle count; converted to an elapsed count in the response loop above
+    auto tag = pending_dcache_.allocate({state, state.tmask});         // entry = (state copy, mask of threads still awaiting a response)
+    for (uint32_t t = 0; t < num_threads_; ++t) {
+        if (!state.tmask.test(t)) // thread not set in the mask: no request for it
+            continue;
+        MemReq mem_req;
+        mem_req.addr  = state.mem_addrs.at(t);
+        mem_req.write = (state.lsu.type == LsuType::STORE);
+        mem_req.tag   = tag;
+        core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1); // second argument is the send delay (see SimPort::send(pkt, delay))
+        DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state);
+    }            
+    inputs_.pop(); // request issued: remove it from the input queue
 }

 ///////////////////////////////////////////////////////////////////////////////

 AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
    
-void AluUnit::step() {
+void AluUnit::step(uint64_t /*cycle*/) {
    pipeline_state_t state;
    if (!inputs_.try_pop(&state))
        return;
@ -95,7 +122,7 @@ void AluUnit::step() {

 CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
    
-void CsrUnit::step() {
+void CsrUnit::step(uint64_t /*cycle*/) {
    pipeline_state_t state;
    if (!inputs_.try_pop(&state))
        return;
@ -106,7 +133,7 @@ void CsrUnit::step() {

 FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
    
-void FpuUnit::step() {
+void FpuUnit::step(uint64_t /*cycle*/) {
    pipeline_state_t state;
    if (!inputs_.try_pop(&state))
        return;
@ -133,7 +160,7 @@ void FpuUnit::step() {

 GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {}
    
-void GpuUnit::step() {
+void GpuUnit::step(uint64_t /*cycle*/) {
    pipeline_state_t state;
    if (!inputs_.try_pop(&state))
        return;
--- a/sim/simX/exeunit.h
+++ b/sim/simX/exeunit.h
@ -43,7 +43,16 @@ public:
        return outputs_.try_pop(state);
    }

-    virtual void step() = 0;
+    virtual void step(uint64_t cycle) = 0;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+class NopUnit : public ExeUnit { // execution unit added alongside LSU/ALU/CSR/FPU/GPU; its behavior lives in exeunit.cpp
+public:
+    NopUnit(Core*); // takes a Core pointer like the other units' constructors; parameter is unnamed here
+    
+    void step(uint64_t cycle); // implements ExeUnit's pure virtual step(uint64_t cycle)
+};

 ///////////////////////////////////////////////////////////////////////////////
@ -59,9 +68,7 @@ private:
 public:
    LsuUnit(Core*);

-    void handleCacheReponse(const MemRsp& response, uint32_t port_id);
-
-    void step();
+    void step(uint64_t cycle);
 };

 ///////////////////////////////////////////////////////////////////////////////
@ -70,7 +77,7 @@ class AluUnit : public ExeUnit {
 public:
    AluUnit(Core*);
    
-    void step();
+    void step(uint64_t cycle);
 };

 ///////////////////////////////////////////////////////////////////////////////
@ -79,7 +86,7 @@ class CsrUnit : public ExeUnit {
 public:
    CsrUnit(Core*);
    
-    void step();
+    void step(uint64_t cycle);
 };

 ///////////////////////////////////////////////////////////////////////////////
@ -88,7 +95,7 @@ class FpuUnit : public ExeUnit {
 public:
    FpuUnit(Core*);
    
-    void step();
+    void step(uint64_t cycle);
 };

 ///////////////////////////////////////////////////////////////////////////////
@ -97,7 +104,7 @@ class GpuUnit : public ExeUnit {
 public:
    GpuUnit(Core*);
    
-    void step();
+    void step(uint64_t cycle);
 };

 }
--- a/sim/simX/instr.h
+++ b/sim/simX/instr.h
@ -53,22 +53,23 @@ public:
    : opcode_(Opcode::NOP)
    , num_rsrcs_(0)
    , has_imm_(false)
+    , rdest_type_(RegType::None)
    , rdest_(0)
    , func3_(0)
    , func7_(0) {
    for (int i = 0; i < MAX_REG_SOURCES; ++i) {
-       rsrc_type_[i] = 0;
+       rsrc_type_[i] = RegType::None;
    }
  }

  /* Setters used to "craft" the instruction. */
  void setOpcode(Opcode opcode)  { opcode_ = opcode; }
-  void setDestReg(int destReg) { rdest_type_ = 1; rdest_ = destReg; }
-  void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = 1; rsrc_[num_rsrcs_++] = srcReg; }
-  void setDestFReg(int destReg) { rdest_type_ = 2; rdest_ = destReg; }
-  void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = 2; rsrc_[num_rsrcs_++] = srcReg;  }
-  void setDestVReg(int destReg) { rdest_type_ = 3; rdest_ = destReg; }
-  void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = 3; rsrc_[num_rsrcs_++] = srcReg;  }
+  void setDestReg(int destReg) { rdest_type_ = RegType::Integer; rdest_ = destReg; }
+  void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Integer; rsrc_[num_rsrcs_++] = srcReg; }
+  void setDestFReg(int destReg) { rdest_type_ = RegType::Float; rdest_ = destReg; }
+  void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg;  }
+  void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
+  void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg;  }
  void setFunc3(Word func3) { func3_ = func3; }
  void setFunc7(Word func7) { func7_ = func7; }
  void setImm(Word imm) { has_imm_ = true; imm_ = imm; }
@ -89,9 +90,9 @@ public:
  Word getFunc7() const { return func7_; }
  int getNRSrc() const { return num_rsrcs_; }
  int getRSrc(int i) const { return rsrc_[i]; }
-  int getRSType(int i) const { return rsrc_type_[i]; }
+  RegType getRSType(int i) const { return rsrc_type_[i]; }
  int getRDest() const { return rdest_; }  
-  int getRDType() const { return rdest_type_; }  
+  RegType getRDType() const { return rdest_type_; }  
  bool hasImm() const { return has_imm_; }
  Word getImm() const { return imm_; }
  Word getVlsWidth() const { return vlsWidth_; }
@ -112,15 +113,15 @@ private:
  Opcode opcode_;
  int num_rsrcs_;
  bool has_imm_;
-  int rdest_type_;
+  RegType rdest_type_;
  Word imm_;
-  int rsrc_type_[MAX_REG_SOURCES];
+  RegType rsrc_type_[MAX_REG_SOURCES];
  int rsrc_[MAX_REG_SOURCES];  
  int rdest_;
  Word func3_;
  Word func6_;

-  //Vector
+  // Vector
  Word vmask_;
  Word vlsWidth_;
  Word vMop_;
--- a/sim/simX/main.cpp
+++ b/sim/simX/main.cpp
@ -6,12 +6,15 @@
 #include <stdlib.h>
 #include <sys/stat.h>
 #include "processor.h"
+#include <util.h>
 #include "args.h"

+#define RAM_PAGE_SIZE 4096
+
 using namespace vortex;

 int main(int argc, char **argv) {
-  int ret;
+  int exitcode;

  std::string archStr("rv32imf");
  std::string imgFileName;
@ -53,11 +56,42 @@ int main(int argc, char **argv) {

  {
    ArchDef arch(archStr, num_cores, num_warps, num_threads);
+
    Processor processor(arch);
-    ret = processor.run(imgFileName, riscv_test, showStats);
+
+    RAM ram(RAM_PAGE_SIZE);
+
+    {
+      std::string program_ext(fileExtension(imgFileName.c_str()));
+      if (program_ext == "bin") {
+        ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
+      } else if (program_ext == "hex") {
+        ram.loadHexImage(imgFileName.c_str());
+      } else {
+        std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
+        return -1;
+      }
+    }
+
+    processor.attach_ram(&ram);
+
+    exitcode = processor.run();
+
+    if (riscv_test) {
+      if (1 == exitcode) {
+        std::cout << "Passed." << std::endl;
+        exitcode = 0;
+      } else {
+        std::cout << "Failed." << std::endl;
+      }
+    } else {
+      if (exitcode != 0) {
+        std::cout << "*** error: exitcode=" << exitcode << std::endl;
+      }
+    }
  }  

  SimPlatform::instance().finalize();

-  return ret;
+  return exitcode;
 }
--- a/sim/simX/memsim.cpp
+++ b/sim/simX/memsim.cpp
@ -8,32 +8,26 @@ using namespace vortex;
 class MemSim::Impl {
 private:
    MemSim* simobject_;
-    std::vector<std::queue<MemReq>> inputs_;
+    uint32_t num_banks_;
    uint32_t latency_;

 public:
    Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency) 
        : simobject_(simobject)
-        , inputs_(num_banks)
+        , num_banks_(num_banks)
        , latency_(latency)  
    {}

-    void handleMemRequest(const MemReq& mem_req, uint32_t port_id) {
-        inputs_.at(port_id).push(mem_req);        
-    }
-
    void step(uint64_t /*cycle*/) {
-        for (uint32_t i = 0, n = inputs_.size(); i < n; ++i) {
-            auto& queue = inputs_.at(i);            
-            if (queue.empty())
+        for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
+            MemReq mem_req;     
+            if (!simobject_->MemReqPorts.at(i).read(&mem_req))
                continue;
-            auto& entry = queue.front();
-            if (!entry.write) {
+            if (!mem_req.write) {
                MemRsp mem_rsp;
-                mem_rsp.tag = entry.tag;
+                mem_rsp.tag = mem_req.tag;
                simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
            }
-            queue.pop();
        }
    }
 };
@ -45,7 +39,7 @@ MemSim::MemSim(const SimContext& ctx,
               uint32_t latency) 
    : SimObject<MemSim>(ctx, "MemSim")
    , impl_(new Impl(this, num_banks, latency))
-    , MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest}) 
+    , MemReqPorts(num_banks, this) 
    , MemRspPorts(num_banks, this)
 {}

--- a/sim/simX/pipeline.h
+++ b/sim/simX/pipeline.h
@ -10,14 +10,19 @@
 namespace vortex {

 struct pipeline_state_t {
-  //--    
+  //--
+  uint64_t    id;
+  
+  //--
+  int         cid;
  int         wid;  
  ThreadMask  tmask;
  Word        PC;

  //--
  bool        stall_warp;
-  int         rdest_type;
+  bool        wb;  
+  RegType     rdest_type;
  int         rdest;
  RegMask     used_iregs;
  RegMask     used_fregs;
@ -30,10 +35,7 @@ struct pipeline_state_t {
  //--
  union {
    struct {        
-      uint8_t load : 1;
-      uint8_t store: 1;
-      uint8_t fence : 1;
-      uint8_t prefetch: 1;
+      LsuType type;
    } lsu;
    struct {
      AluType type;
@ -49,8 +51,37 @@ struct pipeline_state_t {
  // stats
  uint64_t icache_latency;
  uint64_t dcache_latency;
+
+  // Resets the per-instruction fields to their idle defaults.
+  // The `id` field and the per-unit union are not assigned here.
+  void clear() {
+    // issue identity
+    cid = wid = 0;
+    PC = 0;
+    tmask.reset();
+
+    // hazard / write-back tracking
+    stall_warp = wb = false;
+    rdest_type = RegType::None;
+    rdest = 0;
+    used_iregs.reset();
+    used_fregs.reset();
+    used_vregs.reset();
+
+    // execution
+    exe_type = ExeType::NOP;
+    mem_addrs.clear();
+
+    // stats
+    icache_latency = dcache_latency = 0;
+  }
 };

+// Streams a one-line trace of a pipeline entry: core id, warp id, PC (hex),
+// write-back flag, destination register (only when wb is set), execution
+// unit and the entry id (decimal).
+inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
+  os << "coreid=" << state.cid;
+  os << ", wid=" << state.wid;
+  os << ", PC=" << std::hex << state.PC;
+  os << ", wb=" << state.wb;
+  if (state.wb)
+    os << ", rd=" << state.rdest_type << std::dec << state.rdest;
+  // std::dec is applied before the id whether or not the branch above ran
+  return os << ", ex=" << state.exe_type << " (#" << std::dec << state.id << ")";
+}
+
 class PipelineStage : public Queue<pipeline_state_t> {
 protected:
  const char* name_;
@ -62,15 +93,4 @@ public:
  {}
 };

-inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
-  os << "stall_warp="   << state.stall_warp;
-  os << ", wid="        << state.wid;
-  os << ", PC="         << std::hex << state.PC;
-  os << ", used_iregs=" << state.used_iregs;
-  os << ", used_fregs=" << state.used_fregs;
-  os << ", used_vregs=" << state.used_vregs;
-  os << std::endl;
-  return os;
-}
-
 }
--- a/sim/simX/processor.cpp
+++ b/sim/simX/processor.cpp
@ -0,0 +1,141 @@
+#include "processor.h"
+#include "constants.h"
+
+using namespace vortex;
+
+// Creates the cores and wires the memory hierarchy between them and MemSim:
+//   core -> (per-cluster L2 cache | L2 arbiter | direct)
+//        -> (L3 cache | L3 arbiter | direct) -> MemSim.
+// At each level the lower component's response port is bound to the upper
+// component's MemRspPort/RspIn, and the upper component's MemReqPort/ReqOut is
+// bound to the lower component's request port.
+Processor::Processor(const ArchDef& arch) 
+  : cores_(arch.num_cores())
+  , l2caches_(NUM_CLUSTERS)
+  , l2_mem_switches_(NUM_CLUSTERS)
+{
+  uint32_t num_cores = arch.num_cores();
+  uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS; 
+
+  // create cores
+  for (uint32_t i = 0; i < num_cores; ++i) {
+      cores_.at(i) = Core::Create(arch, i);
+  }
+
+  // connect memory sub-system
+  memsim_ = MemSim::Create(1, MEM_LATENCY);
+
+  // Ports that the next level up has to connect to. They start as the single
+  // MemSim port pair and end up holding one pair per cluster.
+  std::vector<SlavePort<MemReq>*>  mem_req_ports(1); 
+  std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
+  mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
+  mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
+
+  if (L3_ENABLE) {
+    l3cache_ = Cache::Create("l3cache", CacheConfig{
+      log2ceil(L3_CACHE_SIZE),  // C
+      log2ceil(MEM_BLOCK_SIZE), // B
+      2,                      // W
+      0,                      // A
+      32,                     // address bits    
+      L3_NUM_BANKS,           // number of banks
+      L3_NUM_PORTS,           // number of ports
+      NUM_CLUSTERS,           // request size   
+      true,                   // write-through
+      0,                      // victim size
+      L3_MSHR_SIZE,           // mshr
+      2,                      // pipeline latency
+    });
+      
+    mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
+    l3cache_->MemReqPort.bind(mem_req_ports.at(0));
+
+    mem_req_ports.resize(NUM_CLUSTERS);
+    mem_rsp_ports.resize(NUM_CLUSTERS);
+    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+      mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
+      mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
+    }
+  } else if (NUM_CLUSTERS > 1) {
+    l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
+    mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
+    l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));      
+
+    mem_req_ports.resize(NUM_CLUSTERS);
+    mem_rsp_ports.resize(NUM_CLUSTERS);
+    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
+      mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
+      mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
+    }
+  }
+
+  for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {      
+    // Core-facing ports of cluster i. They are kept in their own vectors so
+    // that the cluster-level entries of mem_req_ports/mem_rsp_ports remain
+    // valid for the clusters that are wired after this one.
+    std::vector<SlavePort<MemReq>*>  cluster_req_ports(1, mem_req_ports.at(i));
+    std::vector<MasterPort<MemRsp>*> cluster_rsp_ports(1, mem_rsp_ports.at(i));
+
+    if (L2_ENABLE) {
+      auto& l2cache = l2caches_.at(i);
+      l2cache = Cache::Create("l2cache", CacheConfig{
+        log2ceil(L2_CACHE_SIZE),  // C
+        log2ceil(MEM_BLOCK_SIZE), // B
+        2,                      // W
+        0,                      // A
+        32,                     // address bits    
+        L2_NUM_BANKS,           // number of banks
+        L2_NUM_PORTS,           // number of ports
+        NUM_CORES,              // request size   
+        true,                   // write-through
+        0,                      // victim size
+        L2_MSHR_SIZE,           // mshr
+        2,                      // pipeline latency
+      });
+      cluster_rsp_ports.at(0)->bind(&l2cache->MemRspPort);
+      l2cache->MemReqPort.bind(cluster_req_ports.at(0));
+
+      cluster_req_ports.resize(cores_per_cluster);
+      cluster_rsp_ports.resize(cores_per_cluster);
+      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+        cluster_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
+        cluster_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
+      }
+    } else if (cores_per_cluster > 1) {
+      auto& l2_mem_switch = l2_mem_switches_.at(i);
+      l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
+      cluster_rsp_ports.at(0)->bind(&l2_mem_switch->RspIn);
+      l2_mem_switch->ReqOut.bind(cluster_req_ports.at(0));  
+
+      cluster_req_ports.resize(cores_per_cluster);
+      cluster_rsp_ports.resize(cores_per_cluster);
+      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+        cluster_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
+        cluster_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
+      }
+    }
+
+    // Cores are laid out cluster by cluster, so core j of cluster i sits at
+    // i * cores_per_cluster + j and uses the j-th port pair of its cluster.
+    for (uint32_t j = 0; j < cores_per_cluster; ++j) {
+      auto& core = cores_.at((i * cores_per_cluster) + j);        
+      cluster_rsp_ports.at(j)->bind(&core->MemRspPort);
+      core->MemReqPort.bind(cluster_req_ports.at(j));
+    }
+  }
+}
+
+// Passes the same RAM pointer to every core through Core::attach_ram().
+void Processor::attach_ram(RAM* ram) {
+  for (const auto& core : cores_)
+    core->attach_ram(ram);
+}
+
+// Nothing is released explicitly here.
+Processor::~Processor() = default;
+
+// Steps the simulation platform until either no core reports running() or a
+// core reports an ebreak. Returns integer register 3 of the core that hit the
+// ebreak, or 0 when the cores simply stopped running.
+int Processor::run() {
+  for (;;) {
+    SimPlatform::instance().step();
+
+    bool any_running = false;
+    for (auto& core : cores_) {
+      // running() is queried first for every visited core, then the ebreak check
+      any_running = core->running() || any_running;
+      if (core->check_ebreak()) {
+        // an ebreak ends the whole run immediately
+        return core->getIRegValue(3);
+      }
+    }
+
+    if (!any_running) {
+      return 0;
+    }
+  }
+}
--- a/sim/simX/processor.h
+++ b/sim/simX/processor.h
@ -1,189 +1,27 @@
 #pragma once

-#include "constants.h"
-#include "debug.h"
-#include "types.h"
 #include "core.h"

 namespace vortex {

 class Processor {
+public:
+  typedef std::shared_ptr<Processor> Ptr;
+  
+  Processor(const ArchDef& arch);
+  ~Processor();
+
+  void attach_ram(RAM* mem);
+
+  int run();
+
 private:
-  ArchDef arch_; 
-  Decoder decoder_;
-  MemoryUnit mu_;
-  RAM ram_;
  std::vector<Core::Ptr> cores_;  
  std::vector<Cache::Ptr> l2caches_;  
  std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
  Cache::Ptr l3cache_;
  Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
  MemSim::Ptr memsim_;
-
-public:
-  Processor(const ArchDef& arch) 
-    : arch_(arch)
-    , decoder_(arch)
-    , mu_(0, arch.wsize(), true)
-    , ram_((1<<12), (1<<20)) 
-    , cores_(arch.num_cores())
-    , l2caches_(NUM_CLUSTERS)
-    , l2_mem_switches_(NUM_CLUSTERS)
-  {
-    uint32_t num_cores = arch.num_cores();
-    uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
-    
-    // bind RAM to memory unit
-    mu_.attach(ram_, 0, 0xFFFFFFFF);    
-
-    // create cores
-    for (uint32_t i = 0; i < num_cores; ++i) {
-      cores_.at(i) = Core::Create(arch, decoder_, mu_, i);
-    }
-    
-    // connect memory sub-systen
-    memsim_ = MemSim::Create(1, MEM_LATENCY);
-    std::vector<SlavePort<MemReq>*>  mem_req_ports(1); 
-    std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
-    mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
-    mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
-
-    if (L3_ENABLE) {
-      l3cache_ = Cache::Create("l3cache", CacheConfig{
-        log2ceil(L3_CACHE_SIZE),  // C
-        log2ceil(MEM_BLOCK_SIZE), // B
-        2,                      // W
-        0,                      // A
-        32,                    // address bits    
-        L3_NUM_BANKS,           // number of banks
-        L3_NUM_PORTS,           // number of ports
-        NUM_CLUSTERS,           // request size   
-        true,                   // write-throught
-        0,                      // victim size
-        L3_MSHR_SIZE,           // mshr
-        2,                      // pipeline latency
-      });
-      mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
-      l3cache_->MemReqPort.bind(mem_req_ports.at(0));
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
-        mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
-      }
-    } else if (NUM_CLUSTERS > 1) {
-      l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
-      mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
-      l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));      
-
-      mem_req_ports.resize(NUM_CLUSTERS);
-      mem_rsp_ports.resize(NUM_CLUSTERS);
-      for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
-        mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
-        mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
-      }
-    }
-
-    for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {      
-      if (L2_ENABLE) {
-        auto& l2cache = l2caches_.at(i);
-        l2cache = Cache::Create("l2cache", CacheConfig{
-          log2ceil(L2_CACHE_SIZE),  // C
-          log2ceil(MEM_BLOCK_SIZE), // B
-          2,                      // W
-          0,                      // A
-          32,                     // address bits    
-          L2_NUM_BANKS,           // number of banks
-          L2_NUM_PORTS,           // number of ports
-          NUM_CORES,              // request size   
-          true,                   // write-throught
-          0,                      // victim size
-          L2_MSHR_SIZE,           // mshr
-          2,                      // pipeline latency
-        });
-        mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
-        l2cache->MemReqPort.bind(mem_req_ports.at(i));
-
-        mem_req_ports.resize(cores_per_cluster);
-        mem_rsp_ports.resize(cores_per_cluster);
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
-          mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
-        }
-      } else if (cores_per_cluster > 1) {
-        auto& l2_mem_switch = l2_mem_switches_.at(i);
-        l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
-        mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
-        l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));  
-
-        mem_req_ports.resize(cores_per_cluster);
-        mem_rsp_ports.resize(cores_per_cluster);
-        for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-          mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
-          mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
-        }
-      }
-
-      for (uint32_t j = 0; j < cores_per_cluster; ++j) {
-        auto& core = cores_.at((i * NUM_CLUSTERS) + j);        
-        mem_rsp_ports.at(i)->bind(&core->MemRspPort);
-        core->MemReqPort.bind(mem_req_ports.at(j));
-      }
-    }
-  }
-
-  ~Processor() {}
-
-  int run(const std::string& program, bool riscv_test, bool /*showStats*/) {
-    {
-      std::string program_ext(fileExtension(program.c_str()));
-      if (program_ext == "bin") {
-        ram_.loadBinImage(program.c_str(), STARTUP_ADDR);
-      } else if (program_ext == "hex") {
-        ram_.loadHexImage(program.c_str());
-      } else {
-        std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
-        return -1;
-      }
-    }
-
-    bool running;
-    int exitcode = 0;
-    do {
-      SimPlatform::instance().step();
-      
-      running = false;
-      for (auto& core : cores_) {
-        if (core->running()) {
-          running = true;
-        }
-        if (core->check_ebreak()) {
-          exitcode = core->getIRegValue(3);
-          running = false;
-          break;
-        }
-      }
-    } while (running);
-
-    // get error status
-
-    if (riscv_test) {
-      if (1 == exitcode) {
-        std::cout << "Passed." << std::endl;
-        exitcode = 0;
-      } else {
-        std::cout << "Failed." << std::endl;
-      }
-    } else {
-      if (exitcode != 0) {
-        std::cout << "*** error: exitcode=" << exitcode << std::endl;
-      }
-    }
-
-    return exitcode;
-  }
-
 };

 }
--- a/sim/simX/scoreboard.h
+++ b/sim/simX/scoreboard.h
@ -10,6 +10,7 @@ private:
    std::vector<RegMask> in_use_iregs_;
    std::vector<RegMask> in_use_fregs_;
    std::vector<RegMask> in_use_vregs_;
+    std::unordered_map<uint32_t, uint64_t> owners_; 

 public:    
    Scoreboard(const ArchDef &arch) 
@ -29,42 +30,87 @@ public:
            || (state.used_fregs & in_use_fregs_.at(state.wid)) != 0
            || (state.used_vregs & in_use_vregs_.at(state.wid)) != 0;
    }
+
+    // Returns the ids recorded in owners_ for every register that `state`
+    // uses and that is currently marked in-use for its warp, in the order
+    // integer, float, vector and by ascending register index within each.
+    // Throws std::out_of_range (from at()) if an in-use register has no
+    // recorded owner or state.wid is out of range.
+    std::vector<uint64_t> owners(const pipeline_state_t& state) const {
+        std::vector<uint64_t> out;
+
+        // Scans one register file: walks the set bits of (used & in_use) and
+        // looks up the owner under the same tag layout that reserve() writes:
+        // (reg << 16) | (wid << 4) | type.
+        auto collect = [&](const RegMask& used, const RegMask& in_use, RegType type) {
+            auto pending = used & in_use;
+            for (uint32_t r = 0; pending.any(); ++r, pending >>= 1) {
+                if (!pending.test(0))
+                    continue;
+                uint32_t tag = (r << 16) | (state.wid << 4) | (int)type;
+                out.push_back(owners_.at(tag));
+            }
+        };
+
+        collect(state.used_iregs, in_use_iregs_.at(state.wid), RegType::Integer);
+        collect(state.used_fregs, in_use_fregs_.at(state.wid), RegType::Float);
+        collect(state.used_vregs, in_use_vregs_.at(state.wid), RegType::Vector);
+
+        // plain return: lets the compiler elide the copy (std::move would block it)
+        return out;
+    }
    
    void reserve(const pipeline_state_t& state) {
-        if (!state.rdest)
-            return;
-        
+        if (!state.wb)
+            return;  
        switch (state.rdest_type) {
-        case 1:            
+        case RegType::Integer:            
            in_use_iregs_.at(state.wid).set(state.rdest);
            break;
-        case 2:
+        case RegType::Float:
            in_use_fregs_.at(state.wid).set(state.rdest);
            break;
-        case 3:
+        case RegType::Vector:
            in_use_vregs_.at(state.wid).set(state.rdest);
            break;
        default:  
            break;
-        }
+        }      
+        uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
+        assert(owners_.count(tag) == 0);
+        owners_[tag] = state.id;
    }

    void release(const pipeline_state_t& state) {
-        if (!state.rdest)
-            return;
+        if (!state.wb)
+            return;       
        switch (state.rdest_type) {
-        case 1:
+        case RegType::Integer:
            in_use_iregs_.at(state.wid).reset(state.rdest);
            break;
-        case 2:
+        case RegType::Float:
            in_use_fregs_.at(state.wid).reset(state.rdest);
            break;
-        case 3:
+        case RegType::Vector:
            in_use_vregs_.at(state.wid).reset(state.rdest);
            break;
        default:  
            break;
        }      
+        uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
+        owners_.erase(tag);
    }
 };

--- a/sim/simX/types.h
+++ b/sim/simX/types.h
@ -4,6 +4,7 @@
 #include <bitset>
 #include <queue>
 #include <unordered_map>
+#include <util.h>
 #include <VX_config.h>
 #include <simobject.h>

@ -20,7 +21,25 @@ typedef std::bitset<32> RegMask;
 typedef std::bitset<32> ThreadMask;
 typedef std::bitset<32> WarpMask;

+enum class RegType {  // register file an instruction operand belongs to
+  None,     // value set by pipeline_state_t::clear(); streamed as nothing
+  Integer,  // set by Instr::setDestReg/setSrcReg; streamed as "r"
+  Float,    // set by Instr::setDestFReg/setSrcFReg; streamed as "fr"
+  Vector    // set by Instr::setDestVReg/setSrcVReg; streamed as "vr"
+};
+
+// Streams the register-name prefix for a register type; RegType::None (and
+// any other value) writes nothing.
+inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
+  if (type == RegType::Integer) {
+    os << "r";
+  } else if (type == RegType::Float) {
+    os << "fr";
+  } else if (type == RegType::Vector) {
+    os << "vr";
+  }
+  return os;
+}
+
 enum class ExeType {
+  NOP,
  ALU,
  LSU,
  CSR,
@ -29,6 +48,19 @@ enum class ExeType {
  MAX,
 };

+// Streams the upper-case name of an execution unit type; ExeType::MAX (and
+// any unlisted value) writes nothing.
+inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
+  const char* name = nullptr;
+  switch (type) {
+  case ExeType::NOP: name = "NOP"; break;
+  case ExeType::ALU: name = "ALU"; break;
+  case ExeType::LSU: name = "LSU"; break;
+  case ExeType::CSR: name = "CSR"; break;
+  case ExeType::FPU: name = "FPU"; break;
+  case ExeType::GPU: name = "GPU"; break;
+  case ExeType::MAX: break;
+  }
+  if (name != nullptr) {
+    os << name;
+  }
+  return os;
+}
+
 enum class AluType {
  ARITH,
  BRANCH,
@ -36,6 +68,33 @@ enum class AluType {
  IDIV,    
 };

+// Streams the upper-case name of an ALU operation type; unlisted values
+// write nothing.
+inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
+  switch (type) {
+  case AluType::ARITH:  return os << "ARITH";
+  case AluType::BRANCH: return os << "BRANCH";
+  case AluType::IMUL:   return os << "IMUL";
+  case AluType::IDIV:   return os << "IDIV";
+  }
+  return os;
+}
+
+enum class LsuType {  // kind of LSU operation, stored in pipeline_state_t::lsu.type
+  LOAD,      // streamed as "LOAD"
+  STORE,     // streamed as "STORE"
+  FENCE,     // streamed as "FENCE"
+  PREFETCH,  // streamed as "PREFETCH"
+};
+
+// Streams the upper-case name of an LSU operation type; unlisted values
+// write nothing.
+inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
+  const char* name = nullptr;
+  switch (type) {
+  case LsuType::LOAD:     name = "LOAD"; break;
+  case LsuType::STORE:    name = "STORE"; break;
+  case LsuType::FENCE:    name = "FENCE"; break;
+  case LsuType::PREFETCH: name = "PREFETCH"; break;
+  }
+  if (name != nullptr) {
+    os << name;
+  }
+  return os;
+}
+
 enum class FpuType {
  FNCP,
  FMA,
@ -44,6 +103,17 @@ enum class FpuType {
  FCVT,
 };

+// Streams the upper-case name of an FPU operation type; unlisted values
+// write nothing.
+inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
+  switch (type) {
+  case FpuType::FNCP:  return os << "FNCP";
+  case FpuType::FMA:   return os << "FMA";
+  case FpuType::FDIV:  return os << "FDIV";
+  case FpuType::FSQRT: return os << "FSQRT";
+  case FpuType::FCVT:  return os << "FCVT";
+  }
+  return os;
+}
+
 enum class GpuType {
  TMC,
  WSPAWN,
@ -53,11 +123,31 @@ enum class GpuType {
  TEX,
 };

+// Streams the upper-case name of a GPU operation type; unlisted values
+// write nothing.
+inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
+  const char* name = nullptr;
+  switch (type) {
+  case GpuType::TMC:    name = "TMC"; break;
+  case GpuType::WSPAWN: name = "WSPAWN"; break;
+  case GpuType::SPLIT:  name = "SPLIT"; break;
+  case GpuType::JOIN:   name = "JOIN"; break;
+  case GpuType::BAR:    name = "BAR"; break;
+  case GpuType::TEX:    name = "TEX"; break;
+  }
+  if (name != nullptr) {
+    os << name;
+  }
+  return os;
+}
+
 enum class ArbiterType {
  Priority,
  RoundRobin
 };

+// Streams the name of an arbitration policy; unlisted values write nothing.
+inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
+  if (type == ArbiterType::Priority) {
+    os << "Priority";
+  } else if (type == ArbiterType::RoundRobin) {
+    os << "RoundRobin";
+  }
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////

 template <typename T>
@ -65,6 +155,8 @@ class Queue {
 protected:
  std::queue<T> queue_;

+  uint32_t count;
+
 public:
  Queue() {}

@ -77,6 +169,7 @@ public:
  }

  void push(const T& value) {
+    ++count;
    queue_.push(value);
  }

@ -141,6 +234,7 @@ public:
        return i;
      }
    }
+    assert(false);
    return -1;
  }

@ -148,6 +242,7 @@ public:
    auto& entry = entries_.at(index);
    assert(entry.first);
    entry.first = false;
+    --capacity_;
  }

  void remove(uint32_t index, T* value) {
@ -155,6 +250,7 @@ public:
    assert(entry.first);
    *value = entry.second;
    entry.first = false;
+    --capacity_;
  }
 };

@ -163,29 +259,21 @@ public:
 template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
 class Switch : public SimObject<Switch<Req, Rsp>> {
 private:
-  struct req_t {  
+  struct req_batch_t {  
    std::vector<Req>       data;
    std::bitset<MaxInputs> valid;
-    req_t() {} 
-    req_t(uint32_t size) : data(size) {} 
+    req_batch_t() {} 
+    req_batch_t(uint32_t size) 
+      : data(size)
+      , valid(0)
+    {} 
  };

-  void handleIncomingRequest(const Req& req, uint32_t port_id) {
-    cur_req_.data.at(port_id) = req;
-    cur_req_.valid.set(port_id);
-  }
-
-  void handleIncomingResponse(const Rsp& rsp, uint32_t) {
-    rsps_.push(rsp);
-  }
-
  ArbiterType type_;
-  std::queue<req_t> reqs_;
-  std::queue<Rsp> rsps_;
-  req_t cur_req_; 
+  std::queue<req_batch_t> reqq_;
  uint32_t delay_;  
  uint32_t cursor_;
-  std::unordered_map<uint32_t, uint32_t> addr_table_;
+  uint32_t tag_shift_;

 public:
  Switch(
@ -197,12 +285,12 @@ public:
  ) 
    : SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)    
    , type_(type)
-    , cur_req_(num_inputs)
    , delay_(delay)
    , cursor_(0)
-    , ReqIn(num_inputs, {this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingRequest})
+    , tag_shift_(log2ceil(num_inputs))
+    , ReqIn(num_inputs, this)
    , ReqOut(this)
-    , RspIn(this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingResponse)    
+    , RspIn(this)    
    , RspOut(num_inputs, this)
  {
    assert(delay_ != 0);
@ -210,36 +298,52 @@ public:
  }

  void step(uint64_t /*cycle*/) {    
-    if (cur_req_.valid.any()) {
-      reqs_.push(cur_req_);      
-      cur_req_.valid.reset();
-    }
-
-    while (!reqs_.empty()) {
-      auto& entry = reqs_.front();
-      bool found = false;
-      for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) {
-        auto j = (cursor_ + i) % n;        
-        if (entry.valid.test(j)) {
-          auto& req = entry.data.at(j);
-          addr_table_[req.tag] = j;
-          ReqOut.send(req, delay_);
-          entry.valid.reset(j);
-          this->update_cursor(j);
-          found = true;
-          break;
+    // process incoming requests
+    {
+      req_batch_t req_batch(ReqIn.size());
+      for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
+        Req req;
+        if (ReqIn.at(i).read(&req)) {
+          req_batch.data.at(i) = req;
+          req_batch.valid.set(i);
        }
      }
-      if (found)
-        break;
-      reqs_.pop();
+      if (req_batch.valid.any()) {
+        reqq_.push(req_batch);
+      }
+    }
+
+    // apply arbitration
+    if (!reqq_.empty()) {
+      auto& req_batch = reqq_.front();
+      for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) {
+        auto j = (cursor_ + i) % n;        
+        if (req_batch.valid.test(j)) {
+          auto& req = req_batch.data.at(j);
+          if (tag_shift_) {
+            req.tag = (req.tag << tag_shift_) | j;
+          }
+          ReqOut.send(req, delay_);
+          req_batch.valid.reset(j);
+          this->update_cursor(j);
+          if (!req_batch.valid.any())
+            reqq_.pop(); // pop when empty
+          break;
+        }
+      }      
    } 

-    if (!rsps_.empty()) {
-      auto& rsp = rsps_.front();
-      auto port_id = addr_table_.at(rsp.tag);
-      RspOut.at(port_id).send(rsp, 1);
-      rsps_.pop();
+    // process incoming responses
+    {
+      Rsp rsp;
+      if (RspIn.read(&rsp)) {    
+        uint32_t port_id = 0;
+        if (tag_shift_) {
+          port_id = rsp.tag & ((1 << tag_shift_)-1);
+          rsp.tag >>= tag_shift_;
+        }      
+        RspOut.at(port_id).send(rsp, 1);
+      }
    }
  }

--- a/sim/simX/warp.cpp
+++ b/sim/simX/warp.cpp
@ -24,30 +24,34 @@ Warp::Warp(Core *core, Word id)
 void Warp::eval(pipeline_state_t *pipeline_state) {
  assert(tmask_.any());

-  DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask=");
+  DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
  for (int i = 0, n = core_->arch().num_threads(); i < n; ++i)
    DPN(2, tmask_.test(n-i-1));
-  DPN(2, "\n");
+  DPN(2, ", PC=0x" << std::hex << PC_ << std::endl);

  /* Fetch and decode. */    

-  Word fetched = core_->icache_fetch(PC_);
-  auto instr = core_->decoder().decode(fetched, PC_);
+  Word instr_code = core_->icache_read(PC_, sizeof(Word));
+  auto instr = core_->decoder().decode(instr_code);
+  if (!instr) {
+    std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
+    std::abort();
+  }  
+
+  DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);

  // Update state
+  pipeline_state->cid   = core_->id();
  pipeline_state->wid   = id_;
  pipeline_state->PC    = PC_;
  pipeline_state->tmask = tmask_;
  pipeline_state->rdest = instr->getRDest();
  pipeline_state->rdest_type = instr->getRDType();
-  pipeline_state->used_iregs.reset();
-  pipeline_state->used_fregs.reset();
-  pipeline_state->used_vregs.reset();
-  
+    
  // Execute
  this->execute(*instr, pipeline_state);

-  D(4, "Register state:");
+  DP(4, "Register state:");
  for (int i = 0; i < core_->arch().num_regs(); ++i) {
    DPN(4, "  %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
    for (int j = 0; j < core_->arch().num_threads(); ++j) {
--- a/sim/vlsim/opae_sim.cpp
+++ b/sim/vlsim/opae_sim.cpp
@ -44,6 +44,8 @@
 #define VERILATOR_RESET_VALUE 2
 #endif

+#define RAM_PAGE_SIZE 4096
+
 using namespace vortex;

 static uint64_t timestamp = 0;
@ -136,7 +138,7 @@ opae_sim::opae_sim()
  : stop_(false)
  , host_buffer_ids_(0) {  
  vl_obj_ = new VL_OBJ();
-  ram_ = new RAM((1<<12), (1<<20));
+  ram_ = new RAM(RAM_PAGE_SIZE);

  // reset the device
  this->reset();