minor updates

2025-04-23 21:39:10 -04:00 · 2024-12-04 06:00:19 -08:00 · 2024-12-04 06:00:19 -08:00 · 3ace9bbeda
commit 3ace9bbeda
parent 30b0daf050
18 changed files with 476 additions and 178 deletions
--- a/ci/regression.sh.in
+++ b/ci/regression.sh.in
@ -105,7 +105,7 @@ regression()
    ./ci/blackbox.sh --driver=simx --app=vecadd --rebuild=3

    # test for matmul
-    CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1" 
+    CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1"

    echo "regression tests done!"
 }
@ -322,6 +322,10 @@ config2()
    CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=1" ./ci/blackbox.sh --driver=opae --app=mstress
    CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress

+    # test memory ports
+    CONFIGS="-DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=demo
+    CONFIGS="-DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=demo --threads=32
+
    echo "configuration-2 tests done!"
 }

--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -648,9 +648,9 @@
 // Number of Memory Ports
 `ifndef L1_MEM_PORTS
 `ifdef L1_DISABLE
-`define L1_MEM_PORTS `L2_MEM_PORTS
+`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_BANKS)
 `else
-`define L1_MEM_PORTS `MIN(`L2_MEM_PORTS, `DCACHE_NUM_BANKS)
+`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_BANKS)
 `endif
 `endif

@ -727,9 +727,9 @@
 // Number of Memory Ports
 `ifndef L2_MEM_PORTS
 `ifdef L2_ENABLE
-`define L2_MEM_PORTS `MIN(`L3_MEM_PORTS, `L2_NUM_BANKS)
+`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_BANKS)
 `else
-`define L2_MEM_PORTS `L3_MEM_PORTS
+`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_BANKS)
 `endif
 `endif

@ -788,9 +788,9 @@
 // Number of Memory Ports
 `ifndef L3_MEM_PORTS
 `ifdef L3_ENABLE
-`define L3_MEM_PORTS `MIN(`PLATFORM_MEMORY_BANKS, `L3_NUM_BANKS)
+`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_BANKS)
 `else
-`define L3_MEM_PORTS `PLATFORM_MEMORY_BANKS
+`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_BANKS)
 `endif
 `endif

--- a/sim/common/simobject.h
+++ b/sim/common/simobject.h
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -27,9 +27,9 @@ class SimObjectBase;
 ///////////////////////////////////////////////////////////////////////////////

 class SimPortBase {
-public:  
+public:
  virtual ~SimPortBase() {}
-  
+
  SimObjectBase* module() const {
    return module_;
  }
@ -92,7 +92,7 @@ public:
    auto cycles = queue_.front().cycles;
    queue_.pop();
    return cycles;
-  }  
+  }

  void tx_callback(const TxCallback& callback) {
    tx_cb_ = callback;
@ -137,7 +137,7 @@ public:
  typedef std::shared_ptr<SimEventBase> Ptr;

  virtual ~SimEventBase() {}
-  
+
  virtual void fire() const = 0;

  uint64_t cycles() const {
@ -161,7 +161,7 @@ public:

  typedef std::function<void (const Pkt&)> Func;

-  SimCallEvent(const Func& func, const Pkt& pkt, uint64_t cycles) 
+  SimCallEvent(const Func& func, const Pkt& pkt, uint64_t cycles)
    : SimEventBase(cycles)
    , func_(func)
    , pkt_(pkt)
@ -194,8 +194,8 @@ public:
    const_cast<SimPort<Pkt>*>(port_)->transfer(pkt_, cycles_);
  }

-  SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t cycles) 
-    : SimEventBase(cycles) 
+  SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t cycles)
+    : SimEventBase(cycles)
    , port_(port)
    , pkt_(pkt)
  {}
@ -209,7 +209,7 @@ public:
  }

 protected:
-  const SimPort<Pkt>* port_; 
+  const SimPort<Pkt>* port_;
  Pkt pkt_;

  static MemoryPool<SimPortEvent<Pkt>> allocator_;
@ -230,11 +230,11 @@ public:

  const std::string& name() const {
    return name_;
-  } 
+  }

 protected:

-  SimObjectBase(const SimContext& ctx, const char* name); 
+  SimObjectBase(const SimContext& ctx, const std::string& name);

 private:

@ -259,8 +259,8 @@ public:

 protected:

-  SimObject(const SimContext& ctx, const char* name) 
-    : SimObjectBase(ctx, name) 
+  SimObject(const SimContext& ctx, const std::string& name)
+    : SimObjectBase(ctx, name)
  {}

 private:
@ -283,9 +283,9 @@ private:
 };

 class SimContext {
-private:    
+private:
  SimContext() {}
-  
+
  friend class SimPlatform;
 };

@ -320,10 +320,10 @@ public:

  template <typename Pkt>
  void schedule(const typename SimCallEvent<Pkt>::Func& callback,
-                const Pkt& pkt, 
-                uint64_t delay) {    
+                const Pkt& pkt,
+                uint64_t delay) {
    assert(delay != 0);
-    auto evt = std::make_shared<SimCallEvent<Pkt>>(callback, pkt, cycles_ + delay);    
+    auto evt = std::make_shared<SimCallEvent<Pkt>>(callback, pkt, cycles_ + delay);
    events_.emplace_back(evt);
  }

@ -341,10 +341,10 @@ public:
    auto evt_it_end = events_.end();
    while (evt_it != evt_it_end) {
      auto& event = *evt_it;
-      if (cycles_ >= event->cycles()) {        
+      if (cycles_ >= event->cycles()) {
        event->fire();
        evt_it = events_.erase(evt_it);
-      } else {        
+      } else {
        ++evt_it;
      }
    }
@ -352,7 +352,7 @@ public:
    for (auto& object : objects_) {
      object->do_tick();
    }
-    // advance clock    
+    // advance clock
    ++cycles_;
  }

@ -390,8 +390,8 @@ private:

 ///////////////////////////////////////////////////////////////////////////////

-inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) 
-  : name_(name) 
+inline SimObjectBase::SimObjectBase(const SimContext&, const std::string& name)
+  : name_(name)
 {}

 template <typename Impl>
@ -403,8 +403,8 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
 template <typename Pkt>
 void SimPort<Pkt>::push(const Pkt& pkt, uint64_t delay) const {
  if (peer_ && !tx_cb_) {
-    reinterpret_cast<const SimPort<Pkt>*>(peer_)->push(pkt, delay);    
+    reinterpret_cast<const SimPort<Pkt>*>(peer_)->push(pkt, delay);
  } else {
    SimPlatform::instance().schedule(this, pkt, delay);
-  } 
+  }
 }
--- a/sim/common/stringutil.h
+++ b/sim/common/stringutil.h
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -47,7 +47,7 @@ public:
    , indent_(indent, ' ')
    , owner_(nullptr)
  {}
-  
+
  explicit IndentStream(std::ostream& dest, int indent = 4)
    : dest_(dest.rdbuf())
    , isBeginLine_(true)
@ -76,3 +76,14 @@ private:
  std::string     indent_;
  std::ostream*   owner_;
 };
+
+// Builds a std::string from a printf-style format string.
+//
+// fmt:  printf-style format string; it must agree with the types of `args`,
+//       because the values are handed to std::snprintf's C varargs unchecked.
+// args: values consumed by the conversion specifiers in `fmt`.
+//
+// Returns the formatted text, without the trailing NUL.
+// Throws std::runtime_error when std::snprintf reports a failure.
+template <typename... Args>
+std::string StrFormat(const std::string& fmt, Args... args) {
+  // First pass: measure the output length (terminator not included).
+  const int len = std::snprintf(nullptr, 0, fmt.c_str(), args...);
+  // Test the raw result before doing any arithmetic on it, so a negative
+  // error code (or INT_MAX) never goes through a signed "+ 1".
+  if (len < 0) {
+    throw std::runtime_error("Error during formatting.");
+  }
+  const auto text_len = static_cast<std::string::size_type>(len);
+  // Second pass: format straight into the result's storage. The extra
+  // element receives the NUL that std::snprintf writes, then is dropped.
+  std::string out(text_len + 1, '\0');
+  std::snprintf(&out[0], out.size(), fmt.c_str(), args...);
+  out.resize(text_len);
+  return out;
+}
--- a/sim/simx/cache_sim.cpp
+++ b/sim/simx/cache_sim.cpp
@ -430,7 +430,7 @@ public:
 				continue;

 			auto& mem_rsp = mem_rsp_port.front();
-			DT(3, simobject_->name() << "-bank" << bank_id << " fill-rsp: " << mem_rsp);
+			DT(3, simobject_->name() << "-bank" << bank_id << "-fill-rsp: " << mem_rsp);
 			pipeline_req.type = bank_req_t::Fill;
 			pipeline_req.tag = mem_rsp.tag;
 			mem_rsp_port.pop();
@ -495,7 +495,7 @@ public:
 				bank_req.type  = bank_req_t::Core;
 				bank_req.write = core_req.write;
 				pipeline_req   = bank_req;
-				DT(3, simobject_->name() << " core-req: " << core_req);
+				DT(3, simobject_->name() << "-core-req: " << core_req);
 			}

 			if (core_req.write)
@ -523,7 +523,7 @@ private:
 		uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
 		MemRsp core_rsp{tag, mem_rsp.cid, mem_rsp.uuid};
 		simobject_->CoreRspPorts.at(req_id).push(core_rsp, config_.latency);
-		DT(3, simobject_->name() << " bypass-core-rsp: " << core_rsp);
+		DT(3, simobject_->name() << "-bypass-core-rsp: " << core_rsp);
 	}

 	void processBypassRequest(const MemReq& core_req, uint32_t req_id) {
@ -532,13 +532,13 @@ private:
 			mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id;
 			uint32_t mem_port = req_id % config_.mem_ports;
 			nc_arbs_.at(mem_port)->ReqIn.at(1).push(mem_req, 1);
-			DT(3, simobject_->name() << " bypass-dram-req: " << mem_req);
+			DT(3, simobject_->name() << "-bypass-dram-req: " << mem_req);
 		}

 		if (core_req.write && config_.write_reponse) {
 			MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
 			simobject_->CoreRspPorts.at(req_id).push(core_rsp, 1);
-			DT(3, simobject_->name() << " bypass-core-rsp: " << core_rsp);
+			DT(3, simobject_->name() << "-bypass-core-rsp: " << core_rsp);
 		}
 	}

@ -568,7 +568,7 @@ private:
 							continue;
 						MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
 						simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency);
-						DT(3, simobject_->name() << "-bank" << bank_id << " replay: " << core_rsp);
+						DT(3, simobject_->name() << "-bank" << bank_id << "-replay: " << core_rsp);
 					}
 				}
 			} break;
@ -612,7 +612,7 @@ private:
 							mem_req.cid   = pipeline_req.cid;
 							mem_req.uuid  = pipeline_req.uuid;
 							mem_req_ports_.at(bank_id).push(mem_req, 1);
-							DT(3, simobject_->name() << "-bank" << bank_id << " writethrough: " << mem_req);
+							DT(3, simobject_->name() << "-bank" << bank_id << "-writethrough: " << mem_req);
 						} else {
 							// mark line as dirty
 							hit_line.dirty = true;
@ -625,7 +625,7 @@ private:
 								continue;
 							MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
 							simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency);
-							DT(3, simobject_->name() << "-bank" << bank_id << " core-rsp: " << core_rsp);
+							DT(3, simobject_->name() << "-bank" << bank_id << "-core-rsp: " << core_rsp);
 						}
 					}
 				} else {
@ -644,7 +644,7 @@ private:
 							mem_req.write = true;
 							mem_req.cid   = pipeline_req.cid;
 							mem_req_ports_.at(bank_id).push(mem_req, 1);
-							DT(3, simobject_->name() << "-bank" << bank_id << " writeback: " << mem_req);
+							DT(3, simobject_->name() << "-bank" << bank_id << "-writeback: " << mem_req);
 							++perf_stats_.evictions;
 						}
 					}
@ -658,7 +658,7 @@ private:
 							mem_req.cid   = pipeline_req.cid;
 							mem_req.uuid  = pipeline_req.uuid;
 							mem_req_ports_.at(bank_id).push(mem_req, 1);
-							DT(3, simobject_->name() << "-bank" << bank_id << " writethrough: " << mem_req);
+							DT(3, simobject_->name() << "-bank" << bank_id << "-writethrough: " << mem_req);
 						}
 						// send core response
 						if (config_.write_reponse) {
@ -667,7 +667,7 @@ private:
 									continue;
 								MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
 								simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency);
-								DT(3, simobject_->name() << "-bank" << bank_id << " core-rsp: " << core_rsp);
+								DT(3, simobject_->name() << "-bank" << bank_id << "-core-rsp: " << core_rsp);
 							}
 						}
 					} else {
@ -676,7 +676,7 @@ private:

 						// allocate MSHR
 						auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id);
-						DT(3, simobject_->name() << "-bank" << bank_id << " mshr-enqueue: " << pipeline_req);
+						DT(3, simobject_->name() << "-bank" << bank_id << "-mshr-enqueue: " << pipeline_req);

 						// send fill request
 						if (!mshr_pending) {
@ -687,7 +687,7 @@ private:
 							mem_req.cid   = pipeline_req.cid;
 							mem_req.uuid  = pipeline_req.uuid;
 							mem_req_ports_.at(bank_id).push(mem_req, 1);
-							DT(3, simobject_->name() << "-bank" << bank_id << " fill: " << mem_req);
+							DT(3, simobject_->name() << "-bank" << bank_id << "-fill: " << mem_req);
 							++pending_fill_reqs_;
 						}
 					}
--- a/sim/simx/cluster.cpp
+++ b/sim/simx/cluster.cpp
@ -20,7 +20,7 @@ Cluster::Cluster(const SimContext& ctx,
                 ProcessorImpl* processor,
                 const Arch &arch,
                 const DCRS &dcrs)
-  : SimObject(ctx, "cluster")
+  : SimObject(ctx, StrFormat("cluster%d", cluster_id))
  , mem_req_ports(L2_MEM_PORTS, this)
  , mem_rsp_ports(L2_MEM_PORTS, this)
  , cluster_id_(cluster_id)
@ -42,7 +42,7 @@ Cluster::Cluster(const SimContext& ctx,

  // Create l2cache

-  snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
+  snprintf(sname, 100, "%s-l2cache", this->name().c_str());
  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
    !L2_ENABLED,
    log2ceil(L2_CACHE_SIZE),// C
--- a/sim/simx/constants.h
+++ b/sim/simx/constants.h
@ -34,8 +34,8 @@ inline constexpr int DCACHE_NUM_REQS	= (NUM_LSU_BLOCKS * DCACHE_CHANNELS);

 inline constexpr int NUM_SOCKETS      = UP(NUM_CORES / SOCKET_SIZE);

-inline constexpr int L2_NUM_REQS      = 2;
+inline constexpr int L2_NUM_REQS      = NUM_SOCKETS * L1_MEM_PORTS;

-inline constexpr int L3_NUM_REQS      = NUM_CLUSTERS;
+inline constexpr int L3_NUM_REQS      = NUM_CLUSTERS * L2_MEM_PORTS;

 inline constexpr int PER_ISSUE_WARPS  = NUM_WARPS / ISSUE_WIDTH;
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@ -30,7 +30,7 @@ Core::Core(const SimContext& ctx,
           Socket* socket,
           const Arch &arch,
           const DCRS &dcrs)
-  : SimObject(ctx, "core")
+  : SimObject(ctx, StrFormat("core%d", core_id))
  , icache_req_ports(1, this)
  , icache_rsp_ports(1, this)
  , dcache_req_ports(DCACHE_NUM_REQS, this)
@ -59,12 +59,12 @@ Core::Core(const SimContext& ctx,

  // create the memory coalescer
  for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
-    snprintf(sname, 100, "core%d-coalescer%d", core_id, i);
+    snprintf(sname, 100, "%s-coalescer%d", this->name().c_str(), i);
    mem_coalescers_.at(i) = MemCoalescer::Create(sname, LSU_CHANNELS, DCACHE_CHANNELS, DCACHE_WORD_SIZE, LSUQ_OUT_SIZE, 1);
  }

  // create local memory
-  snprintf(sname, 100, "core%d-local_mem", core_id);
+  snprintf(sname, 100, "%s-local_mem", this->name().c_str());
  local_mem_ = LocalMem::Create(sname, LocalMem::Config{
    (1 << LMEM_LOG_SIZE),
    LSU_WORD_SIZE,
@ -75,19 +75,19 @@ Core::Core(const SimContext& ctx,

  // create lsu demux
  for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
-    snprintf(sname, 100, "core%d-lsu_demux%d", core_id, i);
+    snprintf(sname, 100, "%s-lsu_demux%d", this->name().c_str(), i);
    lsu_demux_.at(i) = LocalMemSwitch::Create(sname, 1);
  }

  // create lsu dcache adapter
  for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
-    snprintf(sname, 100, "core%d-lsu_dcache_adapter%d", core_id, i);
+    snprintf(sname, 100, "%s-lsu_dcache_adapter%d", this->name().c_str(), i);
    lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
  }

  // create lsu lmem adapter
  for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
-    snprintf(sname, 100, "core%d-lsu_lmem_adapter%d", core_id, i);
+    snprintf(sname, 100, "%s-lsu_lmem_adapter%d", this->name().c_str(), i);
    lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
  }

@ -140,7 +140,7 @@ Core::Core(const SimContext& ctx,

  // bind commit arbiters
  for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
-    snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
+    snprintf(sname, 100, "%s-commit-arb%d", this->name().c_str(), i);
    auto arbiter = TraceArbiter::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1);
    for (uint32_t j = 0; j < (uint32_t)FUType::Count; ++j) {
      func_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@ -103,7 +103,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
      auto reg = instr.getRSrc(i);
      switch (type) {
      case RegType::Integer:
-        DPH(2, "Src" << i << " Reg: " << type << reg << "={");
+        DPH(2, "Src" << i << "-Reg: " << type << reg << "={");
        for (uint32_t t = 0; t < num_threads; ++t) {
          if (t) DPN(2, ", ");
          if (!warp.tmask.test(t)) {
@ -116,7 +116,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
        DPN(2, "}" << std::endl);
        break;
      case RegType::Float:
-        DPH(2, "Src" << i << " Reg: " << type << reg << "={");
+        DPH(2, "Src" << i << "-Reg: " << type << reg << "={");
        for (uint32_t t = 0; t < num_threads; ++t) {
          if (t) DPN(2, ", ");
          if (!warp.tmask.test(t)) {
@ -1421,7 +1421,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
      std::abort();
    }
  } break;
-  case Opcode::TCU: 
+  case Opcode::TCU:
  { //TODO - make it data-type flexible
    uint32_t mem_bytes = 1;
    DP(3, "mem_bytes=" << mem_bytes << std::endl);
@ -1443,7 +1443,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {

    //LOAD
    if(num_threads > tc_size*tc_size*n_tiles*TC_per_warp)
-    { 
+    {
      num_threads_actv = tc_size*tc_size*n_tiles*TC_per_warp;
      num_data_per_thread = 1;
    }
@ -1456,7 +1456,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {

    //STORE
    if(num_threads > tc_size*tc_size*TC_per_warp)
-    { 
+    {
      num_threads_actv_st = tc_size*tc_size*TC_per_warp;
      num_data_per_thread_st = 1;
    }
@ -1466,30 +1466,30 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
      num_data_per_thread_st = (tc_size*tc_size)/num_threads_per_tc;
    }
    data_bytes_store = mem_bytes*num_data_per_thread_st;
-    
+
    DP(3, "Num Tiles=" << n_tiles << std::endl);
-    
+
    switch (func3) {
-      case 0: 
-      { //Matrix Load  
+      case 0:
+      { //Matrix Load

        DP (4, "TCU LOAD");
        trace->fu_type = FUType::LSU;
        trace->lsu_type = LsuType::TCU_LOAD;
-        
+
        trace->src_regs[0] = {RegType::Integer, rsrc0};
        auto trace_data = std::make_shared<LsuTraceData>(num_threads);
        trace->data = trace_data;
-        
-        for (uint32_t t = thread_start; t < num_threads_actv; ++t) 
+
+        for (uint32_t t = thread_start; t < num_threads_actv; ++t)
        {
          if (!warp.tmask.test(t))
            continue;
-          DP(3, "Thread ID" << t); 
+          DP(3, "Thread ID" << t);

          uint32_t base_addr = rsdata[t][0].i ;
          trace_data->mem_addrs.at(t) = {base_addr, data_bytes_load};
-          
+
          //Load A or B (depends on immsrc)
          int loop_offset = 0;
          DP(3, "n_tiles = " << n_tiles << "; num_data_per_thread = " << num_data_per_thread <<std::endl);
@ -1502,10 +1502,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
              DP(3, "Scratchpad Index: " << loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n << ", Value: " << scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n]);
            }
        }
-        rd_write = true;  
+        rd_write = true;
      } break;
-      case 1: 
-      { 
+      case 1:
+      {
        DP(4, "TCU STORE");
        trace->fu_type = FUType::LSU;
        trace->lsu_type = LsuType::TCU_STORE;
@ -1513,12 +1513,12 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
        auto trace_data = std::make_shared<LsuTraceData>(num_threads);
        trace->data = trace_data;

-        for (uint32_t t = thread_start; t < num_threads_actv_st; ++t) 
+        for (uint32_t t = thread_start; t < num_threads_actv_st; ++t)
        {
          if (!warp.tmask.test(t))
            continue;

-          DP(3, "Thread ID" << t); 
+          DP(3, "Thread ID" << t);
          uint32_t base_addr = rsdata[t][0].i ;

          trace_data->mem_addrs.at(t) = {base_addr, data_bytes_store};
@ -1529,7 +1529,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
            Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
            *temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n];

-            this->dcache_write(temp_ref, base_addr+(n*mem_bytes), mem_bytes);  
+            this->dcache_write(temp_ref, base_addr+(n*mem_bytes), mem_bytes);
          }
        }
        //Clear the scratchpad
@ -1539,18 +1539,18 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
        }
      }
      break;
-      case 2: 
+      case 2:
      { //Matrix Multiply
        DP(4, "TCU MULTIPLY MAT");
        trace->fu_type = FUType::TCU;
        trace->tcu_type = TCUType::TCU_MUL;
        uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp);
-        for (uint32_t t = thread_start; t < num_threads_actv; ++t) 
+        for (uint32_t t = thread_start; t < num_threads_actv; ++t)
        {
          if (!warp.tmask.test(t))
            continue;
-         
-          DP(3, "Thread ID" << t); 
+
+          DP(3, "Thread ID" << t);
          //TC operation [only 1 thread in 1 warp needs to do this]
          if (t%threads_per_tc == 0)
          {
@ -1563,7 +1563,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
            int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
            uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
            for(int tiles = 0 ; tiles < n_tiles ; tiles++)  //What's the HW implication of this?? A counter implementation?
-            { 
+            {
              for (int i = 0; i < tc_size; i++) { //ROW-1
                for (int j = 0; j < tc_size; j++) { //COL-2
                  int sum = 0;
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@ -121,7 +121,7 @@ void LsuUnit::tick() {
 			continue;
 		auto& state = states_.at(b);
 		auto& lsu_rsp = lsu_rsp_port.front();
-		DT(3, this->name() << " mem-rsp: " << lsu_rsp);
+		DT(3, this->name() << "-mem-rsp: " << lsu_rsp);
 		auto& entry = state.pending_rd_reqs.at(lsu_rsp.tag);
 		auto trace = entry.trace;
 		assert(!entry.mask.none());
@ -146,7 +146,7 @@ void LsuUnit::tick() {
 				continue;
 			Outputs.at(iw).push(state.fence_trace, 1);
 			state.fence_lock = false;
-			DT(3, this->name() << " fence-unlock: " << state.fence_trace);
+			DT(3, this->name() << "-fence-unlock: " << state.fence_trace);
 		}

 		// check input queue
@ -160,7 +160,7 @@ void LsuUnit::tick() {
 			// schedule fence lock
 			state.fence_trace = trace;
 			state.fence_lock = true;
-			DT(3, this->name() << " fence-lock: " << *trace);
+			DT(3, this->name() << "-fence-lock: " << *trace);
 			// remove input
 			input.pop();
 			continue;
@ -171,7 +171,7 @@ void LsuUnit::tick() {
 		// check pending queue capacity
 		if (!is_write && state.pending_rd_reqs.full()) {
 			if (!trace->log_once(true)) {
-				DT(4, "*** " << this->name() << " queue-full: " << *trace);
+				DT(4, "*** " << this->name() << "-queue-full: " << *trace);
 			}
 			continue;
 		} else {
@ -202,7 +202,7 @@ void LsuUnit::tick() {

 		// send memory request
 		core_->lsu_demux_.at(block_idx)->ReqIn.push(lsu_req);
-		DT(3, this->name() << " mem-req: " << lsu_req);
+		DT(3, this->name() << "-mem-req: " << lsu_req);

 		// update stats
 		auto num_addrs = lsu_req.mask.count();
@ -237,7 +237,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
 	{
 		req_per_thread= (1>(trace_data->mem_addrs.at(0).size)/4)? 1: ((trace_data->mem_addrs.at(0).size)/4);
 	}
-	
+
 	auto t0 = trace->pid * NUM_LSU_LANES;

 	for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
@ -250,7 +250,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {

 		auto mem_addr = trace_data->mem_addrs.at(t);
 		auto type = get_addr_type(mem_addr.addr);
-		// DT(3, "addr_type = " << type << ", " << *trace);		
+		// DT(3, "addr_type = " << type << ", " << *trace);
 		uint32_t mem_bytes = 1;
 		for (int i = 0; i < req_per_thread; i++)
 		{
@ -261,7 +261,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
 			mem_req.tag   = tag;
 			mem_req.cid   = trace->cid;
 			mem_req.uuid  = trace->uuid;
-		
+
 			dcache_req_port.push(mem_req, 1);
 			DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
 				<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
@ -272,7 +272,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
 				++core_->perf_stats_.loads;
 				++pending_loads_;
 			}
-		
+
 			++count;
 		}
 	}
@ -282,7 +282,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {

 ///////////////////////////////////////////////////////////////////////////////

-TcuUnit::TcuUnit(const SimContext& ctx, Core* core) 
+TcuUnit::TcuUnit(const SimContext& ctx, Core* core)
    : FuncUnit(ctx, core, "TCU")
    {}

@ -290,7 +290,7 @@ void TcuUnit::tick() {

 	for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
        auto& input = Inputs.at(i);
-        if (input.empty()) 
+        if (input.empty())
            continue;
        auto& output = Outputs.at(i);
        auto trace = input.front();
@ -307,7 +307,7 @@ void TcuUnit::tick() {
            }
            default:
                std::abort();
-        }    
+        }
        DT(3, "pipeline-execute: op=" << trace->tcu_type << ", " << *trace);
        input.pop();
    }
--- a/sim/simx/local_mem.cpp
+++ b/sim/simx/local_mem.cpp
@ -24,8 +24,7 @@ protected:
 	LocalMem* simobject_;
 	Config    config_;
 	RAM       ram_;
-	int32_t   bank_sel_addr_start_;
-  int32_t   bank_sel_addr_end_;
+	MemCrossBar::Ptr mem_xbar_;
 	PerfStats perf_stats_;

 	uint64_t to_local_addr(uint64_t addr) {
@ -40,9 +39,15 @@ public:
 		: simobject_(simobject)
 		, config_(config)
 		, ram_(config.capacity)
-		, bank_sel_addr_start_(0)
-		, bank_sel_addr_end_(config.B-1)
-	{}
+	{
+		char sname[100];
+		snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
+		mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_reqs, (1 << config.B));
+		for (uint32_t i = 0; i < config.num_reqs; ++i) {
+			simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i));
+			mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i));
+		}
+	}

 	virtual ~Impl() {}

@ -82,7 +87,7 @@ public:
 				continue;
 			}

-			DT(4, simobject_->name() << " mem-req" << req_id << ": "<< core_req);
+			DT(4, simobject_->name() << "-mem-req" << req_id << ": "<< core_req);

 			in_used_banks.at(bank_id) = true;

--- a/sim/simx/mem_coalescer.cpp
+++ b/sim/simx/mem_coalescer.cpp
@ -42,10 +42,10 @@ void MemCoalescer::reset() {
 }

 void MemCoalescer::tick() {
-  // process incoming responses
+  // process outgoing responses
  if (!RspOut.empty()) {
    auto& out_rsp = RspOut.front();
-    DT(4, this->name() << " mem-rsp: " << out_rsp);
+    DT(4, this->name() << "-mem-rsp: " << out_rsp);
    auto& entry = pending_rd_reqs_.at(out_rsp.tag);

    BitVector<> rsp_mask(input_size_);
@ -89,7 +89,7 @@ void MemCoalescer::tick() {

  // ensure we can allocate a response tag
  if (pending_rd_reqs_.full()) {
-    DT(4, "*** " << this->name() << " queue-full: " << in_req);
+    DT(4, "*** " << this->name() << "-queue-full: " << in_req);
    return;
  }

@ -145,7 +145,7 @@ void MemCoalescer::tick() {

  // send memory request
  ReqOut.push(out_req, delay_);
-  DT(4, this->name() << " mem-req: coalesced=" << cur_mask.count() << ", " << out_req);
+  DT(4, this->name() << "-mem-req: coalesced=" << cur_mask.count() << ", " << out_req);

  // update sent mask
  sent_mask_ |= cur_mask;
--- a/sim/simx/mem_sim.cpp
+++ b/sim/simx/mem_sim.cpp
@ -27,13 +27,14 @@ class MemSim::Impl {
 private:
 	MemSim*   simobject_;
 	Config    config_;
+	MemCrossBar::Ptr mem_xbar_;
 	DramSim   dram_sim_;
 	PerfStats perf_stats_;

 	struct DramCallbackArgs {
-		MemSim* simobject;
-		MemReq  request;
-		uint32_t i;
+		MemSim::Impl* memsim;
+		MemReq request;
+		uint32_t bank_id;
 	};

 public:
@ -41,7 +42,15 @@ public:
 		: simobject_(simobject)
 		, config_(config)
 		, dram_sim_(MEM_CLOCK_RATIO)
-	{}
+	{
+		char sname[100];
+		snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
+		mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_ports, config.num_banks);
+		for (uint32_t i = 0; i < config.num_ports; ++i) {
+			simobject->MemReqPorts.at(i).bind(&mem_xbar_->ReqIn.at(i));
+			mem_xbar_->RspIn.at(i).bind(&simobject->MemRspPorts.at(i));
+		}
+	}

 	~Impl() {
 		//--
@ -59,14 +68,14 @@ public:
 		dram_sim_.tick();
 		uint32_t counter = 0;

-		for (uint32_t i = 0; i < config_.channels; ++i) {
-			if (simobject_->MemReqPorts.at(i).empty())
+		for (uint32_t i = 0; i < config_.num_banks; ++i) {
+			if (mem_xbar_->ReqOut.at(i).empty())
 				continue;

-			auto& mem_req = simobject_->MemReqPorts.at(i).front();
+			auto& mem_req = mem_xbar_->ReqOut.at(i).front();

 			// try to enqueue the request to the memory system
-			auto req_args = new DramCallbackArgs{simobject_, mem_req, i};
+			auto req_args = new DramCallbackArgs{this, mem_req, i};
 			auto enqueue_success = dram_sim_.send_request(
 				mem_req.write,
 				mem_req.addr,
@ -76,8 +85,8 @@ public:
 					// only send a response for read requests
 					if (!rsp_args->request.write) {
 						MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid};
-						rsp_args->simobject->MemRspPorts.at(rsp_args->i).push(mem_rsp, 1);
-						DT(3, rsp_args->simobject->name() << " mem-rsp: bank=" << rsp_args->i << ", " << mem_rsp);
+						rsp_args->memsim->mem_xbar_->RspOut.at(rsp_args->bank_id).push(mem_rsp, 1);
+						DT(3, rsp_args->memsim->simobject_->name() << "-mem-rsp: bank=" << rsp_args->bank_id << ", " << mem_rsp);
 					}
 					delete rsp_args;
 				},
@ -90,9 +99,9 @@ public:
 				continue;
 			}

-			DT(3, simobject_->name() << " mem-req: bank=" << i << ", " << mem_req);
+			DT(3, simobject_->name() << "-mem-req: bank=" << i << ", " << mem_req);

-			simobject_->MemReqPorts.at(i).pop();
+			mem_xbar_->ReqOut.at(i).pop();
 			counter++;
 		}

@ -107,8 +116,8 @@ public:

 MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config)
 	: SimObject<MemSim>(ctx, name)
-	, MemReqPorts(config.channels, this)
-	, MemRspPorts(config.channels, this)
+	, MemReqPorts(config.num_ports, this)
+	, MemRspPorts(config.num_ports, this)
 	, impl_(new Impl(this, config))
 {}

--- a/sim/simx/mem_sim.h
+++ b/sim/simx/mem_sim.h
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,15 +21,15 @@ namespace vortex {
 class MemSim : public SimObject<MemSim>{
 public:
 	struct Config {
-		uint32_t channels;
-		uint32_t num_cores;
+		uint32_t num_banks;
+		uint32_t num_ports;
 	};

 	struct PerfStats {
 		uint64_t counter;
 		uint64_t ticks;

-		PerfStats() 
+		PerfStats()
 			: counter(0)
 			, ticks(0)
 		{}
@ -52,7 +52,7 @@ public:
 	void tick();

 	const PerfStats& perf_stats() const;
-	
+
 private:
 	class Impl;
 	Impl* impl_;
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@ -25,7 +25,7 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
  // create memory simulator
  memsim_ = MemSim::Create("dram", MemSim::Config{
    PLATFORM_MEMORY_BANKS,
-    uint32_t(arch.num_cores()) * arch.num_clusters()
+    L3_MEM_PORTS
  });

  // create clusters
--- a/sim/simx/socket.cpp
+++ b/sim/simx/socket.cpp
@ -21,7 +21,7 @@ Socket::Socket(const SimContext& ctx,
                Cluster* cluster,
                const Arch &arch,
                const DCRS &dcrs)
-  : SimObject(ctx, "socket")
+  : SimObject(ctx, StrFormat("socket%d", socket_id))
  , mem_req_ports(L1_MEM_PORTS, this)
  , mem_rsp_ports(L1_MEM_PORTS, this)
  , socket_id_(socket_id)
@ -31,7 +31,7 @@ Socket::Socket(const SimContext& ctx,
  auto cores_per_socket = cores_.size();

  char sname[100];
-  snprintf(sname, 100, "socket%d-icaches", socket_id);
+  snprintf(sname, 100, "%s-icaches", this->name().c_str());
  icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, CacheSim::Config{
    !ICACHE_ENABLED,
    log2ceil(ICACHE_SIZE),  // C
@ -49,7 +49,7 @@ Socket::Socket(const SimContext& ctx,
    2,                      // pipeline latency
  });

-  snprintf(sname, 100, "socket%d-dcaches", socket_id);
+  snprintf(sname, 100, "%s-dcaches", this->name().c_str());
  dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, CacheSim::Config{
    !DCACHE_ENABLED,
    log2ceil(DCACHE_SIZE),  // C
@ -70,7 +70,7 @@ Socket::Socket(const SimContext& ctx,
  // connect l1 caches to outgoing memory interfaces
  for (uint32_t i = 0; i < L1_MEM_PORTS; ++i) {
    if (i == 0) {
-      snprintf(sname, 100, "socket%d-l1_arb%d", socket_id, i);
+      snprintf(sname, 100, "%s-l1_arb%d", this->name().c_str(), i);
      auto l1_arb = MemArbiter::Create(sname, ArbiterType::RoundRobin, 2, 1);

      icaches_->MemReqPorts.at(0).bind(&l1_arb->ReqIn.at(1));
@ -82,8 +82,8 @@ Socket::Socket(const SimContext& ctx,
      l1_arb->ReqOut.at(0).bind(&this->mem_req_ports.at(0));
      this->mem_rsp_ports.at(0).bind(&l1_arb->RspOut.at(0));
    } else {
-      this->mem_req_ports.at(i).bind(&dcaches_->MemReqPorts.at(i));
-      dcaches_->MemRspPorts.at(i).bind(&this->mem_rsp_ports.at(i));
+      dcaches_->MemReqPorts.at(i).bind(&this->mem_req_ports.at(i));
+      this->mem_rsp_ports.at(i).bind(&dcaches_->MemRspPorts.at(i));
    }
  }

--- a/sim/simx/types.cpp
+++ b/sim/simx/types.cpp
@ -32,16 +32,16 @@ LocalMemSwitch::LocalMemSwitch(
 void LocalMemSwitch::reset() {}

 void LocalMemSwitch::tick() {
-  // process incoming responses
+  // process outgoing responses
  if (!RspLmem.empty()) {
    auto& out_rsp = RspLmem.front();
-    DT(4, this->name() << " lmem-rsp: " << out_rsp);
+    DT(4, this->name() << "-lmem-rsp: " << out_rsp);
    RspIn.push(out_rsp, 1);
    RspLmem.pop();
  }
  if (!RspDC.empty()) {
    auto& out_rsp = RspDC.front();
-    DT(4, this->name() << " dc-rsp: " << out_rsp);
+    DT(4, this->name() << "-dc-rsp: " << out_rsp);
    RspIn.push(out_rsp, 1);
    RspDC.pop();
  }
@ -73,12 +73,12 @@ void LocalMemSwitch::tick() {

    if (!out_dc_req.mask.none()) {
      ReqDC.push(out_dc_req, delay_);
-      DT(4, this->name() << " dc-req: " << out_dc_req);
+      DT(4, this->name() << "-dc-req: " << out_dc_req);
    }

    if (!out_lmem_req.mask.none()) {
      ReqLmem.push(out_lmem_req, delay_);
-      DT(4, this->name() << " lmem-req: " << out_lmem_req);
+      DT(4, this->name() << "-lmem-req: " << out_lmem_req);
    }
    ReqIn.pop();
  }
@ -104,12 +104,12 @@ void LsuMemAdapter::reset() {}
 void LsuMemAdapter::tick() {
  uint32_t input_size = ReqOut.size();

-  // process incoming responses
+  // process outgoing responses
  for (uint32_t i = 0; i < input_size; ++i) {
    if (RspOut.at(i).empty())
      continue;
    auto& out_rsp = RspOut.at(i).front();
-    DT(4, this->name() << " rsp" << i << ": " << out_rsp);
+    DT(4, this->name() << "-rsp" << i << ": " << out_rsp);

    // build memory response
    LsuRsp in_rsp(input_size);
@ -155,7 +155,7 @@ void LsuMemAdapter::tick() {

        // send memory request
        ReqOut.at(i).push(out_req, delay_);
-        DT(4, this->name() << " req" << i << ": " << out_req);
+        DT(4, this->name() << "-req" << i << ": " << out_req);
      }
    }
    ReqIn.pop();
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@ -483,12 +483,12 @@ public:
    , Outputs(num_outputs, this)
    , type_(type)
    , delay_(delay)
-    , cursors_(num_outputs, 0)
+    , grants_(num_outputs, 0)
    , num_reqs_(log2ceil(num_inputs / num_outputs))
  {
    assert(delay != 0);
-    assert(num_inputs <= 32);
-    assert(num_outputs <= 32);
+    assert(num_inputs <= 64);
+    assert(num_outputs <= 64);
    assert(num_inputs >= num_outputs);

    // bypass mode
@ -500,8 +500,8 @@ public:
  }

  void reset() {
-    for (auto& cursor : cursors_) {
-      cursor = 0;
+    for (auto& grant : grants_) {
+      grant = 0;
    }
  }

@ -517,8 +517,8 @@ public:
    // process inputs
    for (uint32_t o = 0; o < O; ++o) {
      for (uint32_t r = 0; r < R; ++r) {
-        uint32_t i = (cursors_.at(o) + r) & (R-1);
-        uint32_t j = o * R + i;
+        uint32_t g = (grants_.at(o) + r) & (R-1);
+        uint32_t j = o * R + g;
        if (j >= I)
          continue;

@ -527,29 +527,132 @@ public:
          auto& req = req_in.front();
          Outputs.at(o).push(req, delay_);
          req_in.pop();
-          this->update_cursor(o, i);
+          this->update_grant(o, g);
          break;
        }
      }
    }
  }

-private:
+protected:

-  void update_cursor(uint32_t index, uint32_t grant) {
+  void update_grant(uint32_t index, uint32_t grant) {
    if (type_ == ArbiterType::RoundRobin) {
-      cursors_.at(index) = grant + 1;
+      grants_.at(index) = grant + 1;
    }
  }

  ArbiterType type_;
  uint32_t delay_;
-  std::vector<uint32_t> cursors_;
+  std::vector<uint32_t> grants_;
  uint32_t num_reqs_;
 };

 ///////////////////////////////////////////////////////////////////////////////

+/// Request-only crossbar: forwards each input request to the output selected
+/// by a bit-field of the request's `addr`, arbitrating among inputs that
+/// target the same output.
+///
+/// At most one request is forwarded per output per tick. Any further ready
+/// input aimed at the same output is counted as a collision and is left in its
+/// input queue. When there is more than one input, the index of the granted
+/// input is appended to the low bits of the request's `tag`.
+template <typename Type>
+class CrossBar : public SimObject<CrossBar<Type>> {
+public:
+  std::vector<SimPort<Type>> Inputs;   // one request port per requester
+  std::vector<SimPort<Type>> Outputs;  // one request port per destination
+
+  /// @param ctx         simulation context forwarded to SimObject
+  /// @param name        instance name (also used as the trace prefix)
+  /// @param type        arbitration policy; only RoundRobin advances the grants
+  /// @param num_inputs  number of input ports (at most 64)
+  /// @param num_outputs number of output ports (power of two, at most 64)
+  /// @param addr_start  lowest address bit of the output-select field
+  /// @param delay       latency used when pushing to an output (must be non-zero)
+  CrossBar(
+    const SimContext& ctx,
+    const char* name,
+    ArbiterType type,
+    uint32_t num_inputs,
+    uint32_t num_outputs = 1,
+    uint32_t addr_start = 0,
+    uint32_t delay = 1
+  )
+    : SimObject<CrossBar<Type>>(ctx, name)
+    , Inputs(num_inputs, this)
+    , Outputs(num_outputs, this)
+    , type_(type)
+    , delay_(delay)
+    , grants_(num_outputs, 0)
+    , lg_num_reqs_(log2ceil(num_inputs))
+    , addr_start_(addr_start)
+    // Last bit of the output-select field: log2(num_outputs) bits starting at
+    // addr_start. The previous value (num_outputs-1) was a port count, not a
+    // bit position, and ignored addr_start. The field is unused with a single
+    // output, so it collapses to addr_start in that case.
+    // NOTE(review): assumes bit_getw() treats its last argument as an
+    // inclusive bit index - confirm against its definition.
+    , addr_end_(addr_start + ((num_outputs > 1) ? (log2ceil(num_outputs) - 1) : 0))
+    , collisions_(0) {
+    assert(delay != 0);
+    assert(num_inputs <= 64);
+    assert(num_outputs <= 64);
+    assert(ispow2(num_outputs));
+  }
+
+  /// Restarts every output's arbitration from input 0.
+  void reset() {
+    for (auto& grant : grants_) {
+      grant = 0;
+    }
+  }
+
+  /// Performs one arbitration round: for every output, grants at most one
+  /// pending input request that targets it and forwards that request.
+  void tick() {
+    uint32_t I = Inputs.size();
+    uint32_t O = Outputs.size();
+    uint32_t R = 1 << lg_num_reqs_; // input count rounded up to a power of two
+
+    // process incoming requests
+    for (uint32_t o = 0; o < O; ++o) {
+      int32_t input_idx = -1; // granted input for this output, -1 if none
+      for (uint32_t r = 0; r < R; ++r) {
+        // scan inputs starting from this output's grant pointer
+        uint32_t i = (grants_.at(o) + r) & (R-1);
+        if (i >= I)
+          continue; // padding slot when the input count is not a power of two
+        auto& req_in = Inputs.at(i);
+        if (!req_in.empty()) {
+          auto& req = req_in.front();
+          // skip if input is not going to this output
+          uint32_t output_idx = 0;
+          if (O != 1) {
+            output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_end_);
+          }
+          if (output_idx != o)
+            continue;
+          // an earlier input in scan order already won this output
+          if (input_idx != -1) {
+            ++collisions_;
+            continue;
+          }
+          input_idx = i;
+        }
+      }
+      if (input_idx != -1) {
+        auto& req_in = Inputs.at(input_idx);
+        auto& req = req_in.front();
+        // record the source input in the low tag bits
+        if (lg_num_reqs_ != 0) {
+          req.tag = (req.tag << lg_num_reqs_) | input_idx;
+        }
+        DT(4, this->name() << "-req" << input_idx << ": " << req);
+        Outputs.at(o).push(req, delay_);
+        req_in.pop();
+        this->update_grant(o, input_idx);
+      }
+    }
+  }
+
+  /// Number of ready requests that lost arbitration since construction.
+  uint64_t collisions() const {
+    return collisions_;
+  }
+
+protected:
+
+  /// Under RoundRobin, makes the next scan for output `index` start right
+  /// after the input that was just granted; other policies keep the pointer.
+  void update_grant(uint32_t index, uint32_t grant) {
+    if (type_ == ArbiterType::RoundRobin) {
+      grants_.at(index) = grant + 1;
+    }
+  }
+
+  ArbiterType type_;
+  uint32_t delay_;
+  std::vector<uint32_t> grants_; // per-output scan start (masked on use)
+  uint32_t lg_num_reqs_;         // log2ceil(num_inputs)
+  uint32_t addr_start_;          // first bit of the output-select field
+  uint32_t addr_end_;            // last bit of the output-select field
+  uint64_t collisions_;          // requests that lost arbitration
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
 template <typename Req, typename Rsp>
 class TxArbiter : public SimObject<TxArbiter<Req, Rsp>> {
 public:
@ -574,12 +677,12 @@ public:
    , RspOut(num_outputs, this)
    , type_(type)
    , delay_(delay)
-    , cursors_(num_outputs, 0)
+    , grants_(num_outputs, 0)
    , lg_num_reqs_(log2ceil(num_inputs / num_outputs))
  {
    assert(delay != 0);
-    assert(num_inputs <= 32);
-    assert(num_outputs <= 32);
+    assert(num_inputs <= 64);
+    assert(num_outputs <= 64);
    assert(num_inputs >= num_outputs);

    // bypass mode
@ -592,8 +695,8 @@ public:
  }

  void reset() {
-    for (auto& cursor : cursors_) {
-      cursor = 0;
+    for (auto& grant : grants_) {
+      grant = 0;
    }
  }

@ -606,25 +709,28 @@ public:
    if (I == O)
      return;

+    // process outgoing responses
    for (uint32_t o = 0; o < O; ++o) {
-      // process incoming responses
-      if (!RspOut.at(o).empty()) {
-        auto& rsp = RspOut.at(o).front();
-        uint32_t i = 0;
+      auto& rsp_out = RspOut.at(o);
+      if (!rsp_out.empty()) {
+        auto& rsp = rsp_out.front();
+        uint32_t g = 0;
        if (lg_num_reqs_ != 0) {
-          i = rsp.tag & (R-1);
+          g = rsp.tag & (R-1);
          rsp.tag >>= lg_num_reqs_;
        }
-        DT(4, this->name() << " rsp" << o << ": " << rsp);
-        uint32_t j = o * R + i;
+        DT(4, this->name() << "-rsp" << o << ": " << rsp);
+        uint32_t j = o * R + g;
        RspIn.at(j).push(rsp, 1);
-        RspOut.at(o).pop();
+        rsp_out.pop();
      }
+    }

-      // process incoming requests
+    // process incoming requests
+    for (uint32_t o = 0; o < O; ++o) {
      for (uint32_t r = 0; r < R; ++r) {
-        uint32_t i = (cursors_.at(o) + r) & (R-1);
-        uint32_t j = o * R + i;
+        uint32_t g = (grants_.at(o) + r) & (R-1);
+        uint32_t j = o * R + g;
        if (j >= I)
          continue;

@ -632,32 +738,193 @@ public:
        if (!req_in.empty()) {
          auto& req = req_in.front();
          if (lg_num_reqs_ != 0) {
-            req.tag = (req.tag << lg_num_reqs_) | i;
+            req.tag = (req.tag << lg_num_reqs_) | g;
          }
-          DT(4, this->name() << " req" << j << ": " << req);
+          DT(4, this->name() << "-req" << j << ": " << req);
          ReqOut.at(o).push(req, delay_);
          req_in.pop();
-          this->update_cursor(o, i);
+          this->update_grant(o, g);
          break;
        }
      }
    }
  }

-  void update_cursor(uint32_t index, uint32_t grant) {
+protected:
+
+  void update_grant(uint32_t index, uint32_t grant) {
    if (type_ == ArbiterType::RoundRobin) {
-      cursors_.at(index) = grant + 1;
+      grants_.at(index) = grant + 1;
    }
  }

-private:
  ArbiterType type_;
  uint32_t delay_;
-  std::vector<uint32_t> cursors_;
+  std::vector<uint32_t> grants_;
  uint32_t lg_num_reqs_;
 };

-using MemArbiter = TxArbiter<MemReq, MemRsp>;
+///////////////////////////////////////////////////////////////////////////////
+
+/// Transactional (request/response) crossbar.
+///
+/// Requests entering on `ReqIn` are routed to the `ReqOut` port selected by a
+/// bit-field of the request's `addr`; the index of the granted input is
+/// appended to the low bits of the request's `tag`. Responses arriving on
+/// `RspOut` are routed back to the `RspIn` port named by those low tag bits,
+/// which are stripped before the response is forwarded.
+///
+/// Per tick, at most one request is forwarded per output and at most one
+/// response per input; every other ready candidate for the same destination
+/// is counted as a collision and is left in its queue.
+template <typename Req, typename Rsp>
+class TxCrossBar : public SimObject<TxCrossBar<Req, Rsp>> {
+public:
+  std::vector<SimPort<Req>> ReqIn;   // requester side: incoming requests
+  std::vector<SimPort<Rsp>> RspIn;   // requester side: responses delivered here
+
+  std::vector<SimPort<Req>> ReqOut;  // destination side: forwarded requests
+  std::vector<SimPort<Rsp>> RspOut;  // destination side: responses read here
+
+  /// @param ctx         simulation context forwarded to SimObject
+  /// @param name        instance name (also used as the trace prefix)
+  /// @param type        arbitration policy; only RoundRobin advances the grants
+  /// @param num_inputs  number of requester ports (power of two, at most 64)
+  /// @param num_outputs number of destination ports (power of two, at most 64)
+  /// @param addr_start  lowest address bit of the output-select field
+  /// @param delay       latency used when pushing a request (must be non-zero)
+  TxCrossBar(
+    const SimContext& ctx,
+    const char* name,
+    ArbiterType type,
+    uint32_t num_inputs,
+    uint32_t num_outputs = 1,
+    uint32_t addr_start = 0,
+    uint32_t delay = 1
+  )
+    : SimObject<TxCrossBar<Req, Rsp>>(ctx, name)
+    , ReqIn(num_inputs, this)
+    , RspIn(num_inputs, this)
+    , ReqOut(num_outputs, this)
+    , RspOut(num_outputs, this)
+    , type_(type)
+    , delay_(delay)
+    , req_grants_(num_outputs, 0)
+    , rsp_grants_(num_inputs, 0)
+    , lg_num_reqs_(log2ceil(num_inputs))
+    , lg_num_rsps_(log2ceil(num_outputs))
+    , addr_start_(addr_start)
+    // Last bit of the output-select field: log2(num_outputs) bits starting at
+    // addr_start. The previous value (num_outputs-1) was a port count, not a
+    // bit position, and ignored addr_start. The field is unused with a single
+    // output, so it collapses to addr_start in that case.
+    // NOTE(review): assumes bit_getw() treats its last argument as an
+    // inclusive bit index - confirm against its definition.
+    , addr_end_(addr_start + ((num_outputs > 1) ? (log2ceil(num_outputs) - 1) : 0))
+    , collisions_(0) {
+    assert(delay != 0);
+    assert(num_inputs <= 64);
+    assert(num_outputs <= 64);
+    assert(ispow2(num_inputs));
+    assert(ispow2(num_outputs));
+  }
+
+  /// Restarts request and response arbitration from index 0 everywhere.
+  void reset() {
+    for (auto& grant : req_grants_) {
+      grant = 0;
+    }
+    for (auto& grant : rsp_grants_) {
+      grant = 0;
+    }
+  }
+
+  /// Performs one arbitration round: responses are routed back to their
+  /// requesters first, then pending requests are routed to their outputs.
+  void tick() {
+    uint32_t I = ReqIn.size();
+    uint32_t O = ReqOut.size();
+    uint32_t R = 1 << lg_num_reqs_; // input count rounded up to a power of two
+    uint32_t T = 1 << lg_num_rsps_; // output count rounded up to a power of two
+
+    // process outgoing responses
+    for (uint32_t i = 0; i < I; ++i) {
+      int32_t output_idx = -1; // granted response source for input i, -1 if none
+      for (uint32_t t = 0; t < T; ++t) {
+        // scan outputs starting from this input's grant pointer
+        uint32_t o = (rsp_grants_.at(i) + t) & (T-1);
+        if (o >= O)
+          continue;
+        auto& rsp_out = RspOut.at(o);
+        if (!rsp_out.empty()) {
+          auto& rsp = rsp_out.front();
+          // skip if response is not going to current input
+          uint32_t input_idx = 0;
+          if (lg_num_reqs_ != 0) {
+            input_idx = rsp.tag & (R-1);
+          }
+          if (input_idx != i)
+            continue;
+          // an earlier output in scan order already won this input
+          if (output_idx != -1) {
+            ++collisions_;
+            continue;
+          }
+          output_idx = o;
+        }
+      }
+      if (output_idx != -1) {
+        auto& rsp_out = RspOut.at(output_idx);
+        auto& rsp = rsp_out.front();
+        uint32_t input_idx = 0;
+        // recover the requester index and restore the requester's own tag
+        if (lg_num_reqs_ != 0) {
+          input_idx = rsp.tag & (R-1);
+          rsp.tag >>= lg_num_reqs_;
+        }
+        DT(4, this->name() << "-rsp" << output_idx << ": " << rsp);
+        RspIn.at(input_idx).push(rsp, 1);
+        rsp_out.pop();
+        this->update_rsp_grant(i, output_idx);
+      }
+    }
+
+    // process incoming requests
+    for (uint32_t o = 0; o < O; ++o) {
+      int32_t input_idx = -1; // granted input for this output, -1 if none
+      for (uint32_t r = 0; r < R; ++r) {
+        // scan inputs starting from this output's grant pointer
+        uint32_t i = (req_grants_.at(o) + r) & (R-1);
+        if (i >= I)
+          continue;
+        auto& req_in = ReqIn.at(i);
+        if (!req_in.empty()) {
+          auto& req = req_in.front();
+          // skip if request is not going to current output
+          uint32_t output_idx = 0;
+          if (O != 1) {
+            output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_end_);
+          }
+          if (output_idx != o)
+            continue;
+          // an earlier input in scan order already won this output
+          if (input_idx != -1) {
+            ++collisions_;
+            continue;
+          }
+          input_idx = i;
+        }
+      }
+      if (input_idx != -1) {
+        auto& req_in = ReqIn.at(input_idx);
+        auto& req = req_in.front();
+        // record the source input in the low tag bits so the response can
+        // be routed back by the response loop above
+        if (lg_num_reqs_ != 0) {
+          req.tag = (req.tag << lg_num_reqs_) | input_idx;
+        }
+        DT(4, this->name() << "-req" << input_idx << ": " << req);
+        ReqOut.at(o).push(req, delay_);
+        req_in.pop();
+        this->update_req_grant(o, input_idx);
+      }
+    }
+  }
+
+  /// Number of ready requests/responses that lost arbitration since construction.
+  uint64_t collisions() const {
+    return collisions_;
+  }
+
+protected:
+
+  /// Under RoundRobin, makes the next request scan for output `index` start
+  /// right after the input that was just granted.
+  void update_req_grant(uint32_t index, uint32_t grant) {
+    if (type_ == ArbiterType::RoundRobin) {
+      req_grants_.at(index) = grant + 1;
+    }
+  }
+
+  /// Under RoundRobin, makes the next response scan for input `index` start
+  /// right after the output that was just granted.
+  void update_rsp_grant(uint32_t index, uint32_t grant) {
+    if (type_ == ArbiterType::RoundRobin) {
+      rsp_grants_.at(index) = grant + 1;
+    }
+  }
+
+  ArbiterType type_;
+  uint32_t delay_;
+  std::vector<uint32_t> req_grants_; // per-output request scan start (masked on use)
+  std::vector<uint32_t> rsp_grants_; // per-input response scan start (masked on use)
+  uint32_t lg_num_reqs_;             // log2ceil(num_inputs)
+  uint32_t lg_num_rsps_;             // log2ceil(num_outputs)
+  uint32_t addr_start_;              // first bit of the output-select field
+  uint32_t addr_end_;                // last bit of the output-select field
+  uint64_t collisions_;              // candidates that lost arbitration
+};

 ///////////////////////////////////////////////////////////////////////////////

@ -711,4 +978,6 @@ private:
  uint32_t delay_;
 };

+// Memory request/response instantiations of the transactional arbiter and crossbar.
+using MemArbiter = TxArbiter<MemReq, MemRsp>;
+using MemCrossBar = TxCrossBar<MemReq, MemRsp>;
 }