simx memory coalescing support

Blaise Tine 2024-03-14 12:20:39 -07:00
parent 07c063031f
commit f1522e68f8
11 changed files with 433 additions and 383 deletions

View file

@@ -252,9 +252,9 @@ module VX_core import VX_gpu_pkg::*; #(
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
assign perf_dcache_rd_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && ~dcache_lmem_bus_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i] = dcache_lmem_bus_if[i].req_valid && dcache_lmem_bus_if[i].req_ready && dcache_lmem_bus_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_lmem_bus_if[i].rsp_valid && dcache_lmem_bus_if[i].rsp_ready;
end
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);

View file

@@ -26,42 +26,42 @@ public:
CacheCluster(const SimContext& ctx,
const char* name,
uint32_t num_units,
uint32_t num_inputs,
uint32_t num_caches,
uint32_t num_requests,
const CacheSim::Config& config)
const CacheSim::Config& cache_config)
: SimObject(ctx, name)
, CoreReqPorts(num_units, std::vector<SimPort<MemReq>>(num_requests, this))
, CoreRspPorts(num_units, std::vector<SimPort<MemRsp>>(num_requests, this))
, CoreReqPorts(num_inputs, std::vector<SimPort<MemReq>>(num_requests, this))
, CoreRspPorts(num_inputs, std::vector<SimPort<MemRsp>>(num_requests, this))
, MemReqPort(this)
, MemRspPort(this)
, caches_(MAX(num_caches, 0x1)) {
CacheSim::Config config2(config);
CacheSim::Config cache_config2(cache_config);
if (0 == num_caches) {
num_caches = 1;
config2.bypass = true;
cache_config2.bypass = true;
}
char sname[100];
std::vector<MemSwitch::Ptr> unit_arbs(num_units);
for (uint32_t u = 0; u < num_units; ++u) {
snprintf(sname, 100, "%s-unit-arb-%d", name, u);
unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
std::vector<MemSwitch::Ptr> input_arbs(num_inputs);
for (uint32_t j = 0; j < num_inputs; ++j) {
snprintf(sname, 100, "%s-input-arb%d", name, j);
input_arbs.at(j) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, cache_config.num_inputs);
for (uint32_t i = 0; i < num_requests; ++i) {
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
this->CoreReqPorts.at(j).at(i).bind(&input_arbs.at(j)->ReqIn.at(i));
input_arbs.at(j)->RspIn.at(i).bind(&this->CoreRspPorts.at(j).at(i));
}
}
std::vector<MemSwitch::Ptr> mem_arbs(config.num_inputs);
for (uint32_t i = 0; i < config.num_inputs; ++i) {
snprintf(sname, 100, "%s-mem-arb-%d", name, i);
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
for (uint32_t u = 0; u < num_units; ++u) {
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
std::vector<MemSwitch::Ptr> mem_arbs(cache_config.num_inputs);
for (uint32_t i = 0; i < cache_config.num_inputs; ++i) {
snprintf(sname, 100, "%s-mem-arb%d", name, i);
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_inputs, num_caches);
for (uint32_t j = 0; j < num_inputs; ++j) {
input_arbs.at(j)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(j));
mem_arbs.at(i)->RspIn.at(j).bind(&input_arbs.at(j)->RspOut.at(i));
}
}
@@ -70,9 +70,9 @@ public:
for (uint32_t i = 0; i < num_caches; ++i) {
snprintf(sname, 100, "%s-cache%d", name, i);
caches_.at(i) = CacheSim::Create(sname, config2);
caches_.at(i) = CacheSim::Create(sname, cache_config2);
for (uint32_t j = 0; j < config.num_inputs; ++j) {
for (uint32_t j = 0; j < cache_config.num_inputs; ++j) {
mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j));
caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
}
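For orientation, here is a minimal usage sketch of the renamed interface (the core count, cache count, and the dcache_config variable are illustrative assumptions, not values taken from this commit): a socket feeding 4 cores, each with DCACHE_NUM_REQS request ports, into 2 shared data caches.

// hypothetical instantiation mirroring the socket.cpp change further below
auto dcaches = CacheCluster::Create("socket0-dcaches",
                                    4,                // num_inputs: cores feeding the cluster
                                    2,                // num_caches
                                    DCACHE_NUM_REQS,  // num_requests: request ports per core
                                    dcache_config);   // CacheSim::Config assumed built elsewhere

Each input gets its own round-robin input arbiter that funnels its num_requests ports down to cache_config.num_inputs lanes; a second layer of memory arbiters (one per cache input port) then selects among the inputs before reaching the caches.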

View file

@@ -72,7 +72,7 @@ Cluster::Cluster(const SimContext& ctx,
2, // request size
true, // write-through
false, // write response
L2_MSHR_SIZE, // mshr
L2_MSHR_SIZE, // mshr size
2, // pipeline latency
});

View file

@@ -23,4 +23,8 @@
#ifndef MEMORY_BANKS
#define MEMORY_BANKS 2
#endif
#endif
#define DCACHE_WORD_SIZE LSU_LINE_SIZE
#define DCACHE_CHANNELS UP((NUM_LSU_LANES * (XLEN / 8)) / DCACHE_WORD_SIZE)
#define DCACHE_NUM_REQS (NUM_LSU_BLOCKS * DCACHE_CHANNELS)
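A worked example of the arithmetic above, assuming NUM_LSU_BLOCKS = 1, NUM_LSU_LANES = 4, and XLEN = 32 (illustrative values only): each LSU block produces 4 lanes x 4 bytes = 16 bytes per cycle, so a 16-byte LSU_LINE_SIZE yields a single coalesced channel, while a 4-byte line size falls back to one channel per lane.

// illustrative sanity checks of the macro arithmetic (not part of the commit)
static_assert((4 * (32 / 8)) / 16 == 1, "LSU_LINE_SIZE=16 -> 1 coalesced channel, DCACHE_NUM_REQS = 1");
static_assert((4 * (32 / 8)) / 4  == 4, "LSU_LINE_SIZE=4  -> 4 channels, DCACHE_NUM_REQS = 4");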

View file

@@ -30,23 +30,23 @@ Core::Core(const SimContext& ctx,
Socket* socket,
const Arch &arch,
const DCRS &dcrs)
: SimObject(ctx, "core")
, icache_req_ports(1, this)
, icache_rsp_ports(1, this)
, dcache_req_ports(NUM_LSU_LANES, this)
, dcache_rsp_ports(NUM_LSU_LANES, this)
, core_id_(core_id)
, socket_(socket)
, arch_(arch)
, emulator_(arch, dcrs, this)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)FUType::Count)
, func_units_((uint32_t)FUType::Count)
, lmem_demuxs_(NUM_LSU_LANES)
, pending_icache_(arch_.num_warps())
, commit_arbs_(ISSUE_WIDTH)
: SimObject(ctx, "core")
, icache_req_ports(1, this)
, icache_rsp_ports(1, this)
, dcache_req_ports(DCACHE_NUM_REQS, this)
, dcache_rsp_ports(DCACHE_NUM_REQS, this)
, core_id_(core_id)
, socket_(socket)
, arch_(arch)
, emulator_(arch, dcrs, this)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)FUType::Count)
, func_units_((uint32_t)FUType::Count)
, lsu_demux_(DCACHE_NUM_REQS)
, pending_icache_(arch_.num_warps())
, commit_arbs_(ISSUE_WIDTH)
{
char sname[100];
@@ -58,30 +58,30 @@ Core::Core(const SimContext& ctx,
snprintf(sname, 100, "core%d-local_mem", core_id);
local_mem_ = LocalMem::Create(sname, LocalMem::Config{
(1 << LMEM_LOG_SIZE),
sizeof(Word),
NUM_LSU_LANES,
NUM_LSU_LANES,
DCACHE_WORD_SIZE,
DCACHE_NUM_REQS,
LMEM_NUM_BANKS,
false
});
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
snprintf(sname, 100, "core%d-lmem_demux%d", core_id, i);
for (uint32_t i = 0; i < DCACHE_NUM_REQS; ++i) {
snprintf(sname, 100, "core%d-lsu_demux%d", core_id, i);
auto lmem_demux = LocalMemDemux::Create(sname);
lmem_demux->ReqDC.bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&lmem_demux->RspDC);
lmem_demux->ReqSM.bind(&local_mem_->Inputs.at(i));
local_mem_->Outputs.at(i).bind(&lmem_demux->RspSM);
lmem_demuxs_.at(i) = lmem_demux;
lsu_demux_.at(i) = lmem_demux;
}
// initialize dispatchers
dispatchers_.at((int)FUType::ALU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_ALU_BLOCKS, NUM_ALU_LANES);
dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, 1, NUM_LSU_LANES);
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, 1, NUM_SFU_LANES);
dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES);
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES);
// initialize execute units
func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
@@ -89,7 +89,7 @@ Core::Core(const SimContext& ctx,
func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
// bind commit arbiters
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
snprintf(sname, 100, "core%d-commit-arb%d", core_id, i);
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1);
for (uint32_t j = 0; j < (uint32_t)FUType::Count; ++j) {
@@ -116,7 +116,7 @@ void Core::reset() {
for (auto& commit_arb : commit_arbs_) {
commit_arb->reset();
}
for (auto& ibuf : ibuffers_) {
ibuf.clear();
}
@@ -125,11 +125,11 @@ void Core::reset() {
fetch_latch_.clear();
decode_latch_.clear();
pending_icache_.clear();
ibuffer_idx_ = 0;
pending_instrs_ = 0;
pending_instrs_ = 0;
pending_ifetches_ = 0;
perf_stats_ = PerfStats();
}
@@ -142,7 +142,7 @@ void Core::tick() {
this->schedule();
++perf_stats_.cycles;
DPN(2, std::flush);
DPN(2, std::flush);
}
void Core::schedule() {
@@ -184,11 +184,11 @@ void Core::fetch() {
MemReq mem_req;
mem_req.addr = trace->PC;
mem_req.write = false;
mem_req.tag = pending_icache_.allocate(trace);
mem_req.tag = pending_icache_.allocate(trace);
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
icache_req_ports.at(0).push(mem_req, 2);
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
icache_req_ports.at(0).push(mem_req, 2);
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop();
++perf_stats_.ifetches;
++pending_ifetches_;
@@ -211,9 +211,9 @@ void Core::decode() {
} else {
trace->log_once(false);
}
// release warp
if (!trace->fetch_stall) {
if (!trace->fetch_stall) {
emulator_.resume(trace->wid);
}
@@ -225,10 +225,10 @@ void Core::decode() {
decode_latch_.pop();
}
void Core::issue() {
void Core::issue() {
// operands to dispatchers
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& operand = operands_.at(i);
auto& operand = operands_.at(i);
if (operand->Output.empty())
continue;
auto trace = operand->Output.front();
@@ -255,7 +255,7 @@ void Core::issue() {
if (scoreboard_.in_use(trace)) {
auto uses = scoreboard_.get_uses(trace);
if (!trace->log_once(true)) {
DTH(3, "*** scoreboard-stall: dependents={");
DTH(3, "*** scoreboard-stall: dependents={");
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
__unused (use);
@@ -266,10 +266,10 @@ void Core::issue() {
}
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
switch (use.fu_type) {
switch (use.fu_type) {
case FUType::ALU: ++perf_stats_.scrb_alu; break;
case FUType::FPU: ++perf_stats_.scrb_fpu; break;
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
case FUType::SFU: {
++perf_stats_.scrb_sfu;
switch (use.sfu_type) {
@@ -286,7 +286,7 @@ void Core::issue() {
}
} break;
default: assert(false);
}
}
}
++perf_stats_.scrb_stalls;
continue;

View file

@@ -145,7 +145,7 @@ private:
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<FuncUnit::Ptr> func_units_;
LocalMem::Ptr local_mem_;
std::vector<LocalMemDemux::Ptr> lmem_demuxs_;
std::vector<LocalMemDemux::Ptr> lsu_demux_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;

View file

@@ -25,301 +25,337 @@
using namespace vortex;
AluUnit::AluUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "ALU") {}
void AluUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->alu_type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
case AluType::IMUL:
output.push(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
output.push(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
core_->resume(trace->wid);
}
input.pop();
}
void AluUnit::tick() {
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
switch (trace->alu_type) {
case AluType::ARITH:
case AluType::BRANCH:
case AluType::SYSCALL:
case AluType::IMUL:
output.push(trace, LATENCY_IMUL+1);
break;
case AluType::IDIV:
output.push(trace, XLEN+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
if (trace->eop && trace->fetch_stall) {
core_->resume(trace->wid);
}
input.pop();
}
}
///////////////////////////////////////////////////////////////////////////////
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "FPU") {}
void FpuUnit::tick() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& input = Inputs.at(i);
if (input.empty())
continue;
auto& output = Outputs.at(i);
auto trace = input.front();
switch (trace->fpu_type) {
case FpuType::FNCP:
output.push(trace, 2);
break;
case FpuType::FMA:
output.push(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
output.push(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
output.push(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
output.push(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
input.pop();
}
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
switch (trace->fpu_type) {
case FpuType::FNCP:
output.push(trace, 2);
break;
case FpuType::FMA:
output.push(trace, LATENCY_FMA+1);
break;
case FpuType::FDIV:
output.push(trace, LATENCY_FDIV+1);
break;
case FpuType::FSQRT:
output.push(trace, LATENCY_FSQRT+1);
break;
case FpuType::FCVT:
output.push(trace, LATENCY_FCVT+1);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
input.pop();
}
}
///////////////////////////////////////////////////////////////////////////////
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
: FuncUnit(ctx, core, "LSU")
, pending_rd_reqs_(LSUQ_IN_SIZE)
, num_lanes_(NUM_LSU_LANES)
, pending_loads_(0)
, fence_lock_(false)
, input_idx_(0)
: FuncUnit(ctx, core, "LSU")
, pending_loads_(0)
{}
LsuUnit::~LsuUnit()
{}
void LsuUnit::reset() {
pending_rd_reqs_.clear();
pending_loads_ = 0;
fence_lock_ = false;
for (auto& state : states_) {
state.clear();
}
pending_loads_ = 0;
}
void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
void LsuUnit::tick() {
core_->perf_stats_.load_latency += pending_loads_;
// handle dcache response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& dcache_rsp_port = core_->lmem_demuxs_.at(t)->RspIn;
if (dcache_rsp_port.empty())
continue;
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type
<< ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.push(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
--pending_loads_;
}
// handle dcache responses
for (uint32_t r = 0; r < DCACHE_NUM_REQS; ++r) {
auto& dcache_rsp_port = core_->lsu_demux_.at(r)->RspIn;
if (dcache_rsp_port.empty())
continue;
uint32_t block_idx = r / DCACHE_CHANNELS;
auto& state = states_.at(block_idx);
auto& mem_rsp = dcache_rsp_port.front();
auto& entry = state.pending_rd_reqs.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", rid=" << r << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
Outputs.at(iw).push(trace, 1);
state.pending_rd_reqs.release(mem_rsp.tag);
}
dcache_rsp_port.pop();
--pending_loads_;
}
// handle local memory response
for (uint32_t t = 0; t < num_lanes_; ++t) {
auto& lmem_rsp_port = core_->local_mem_->Outputs.at(t);
if (lmem_rsp_port.empty())
continue;
auto& mem_rsp = lmem_rsp_port.front();
auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
auto trace = entry.trace;
DT(3, "lmem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", tid=" << t << ", " << *trace);
assert(entry.count);
--entry.count; // track remaining addresses
if (0 == entry.count) {
int iw = trace->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.push(trace, 1);
pending_rd_reqs_.release(mem_rsp.tag);
}
lmem_rsp_port.pop();
--pending_loads_;
}
// handle LSU requests
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
uint32_t block_idx = iw % NUM_LSU_BLOCKS;
auto& state = states_.at(block_idx);
if (state.fence_lock) {
// wait for all pending memory operations to complete
if (!state.pending_rd_reqs.empty())
continue;
Outputs.at(iw).push(state.fence_trace, 1);
state.fence_lock = false;
DT(3, "fence-unlock: " << state.fence_trace);
}
if (fence_lock_) {
// wait for all pending memory operations to complete
if (!pending_rd_reqs_.empty())
return;
int iw = fence_state_->wid % ISSUE_WIDTH;
auto& output = Outputs.at(iw);
output.push(fence_state_, 1);
fence_lock_ = false;
DT(3, "fence-unlock: " << fence_state_);
}
// check input queue
auto& input = Inputs.at(iw);
if (input.empty())
continue;
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
auto& output = Outputs.at(iw);
auto trace = input.front();
auto t0 = trace->pid * num_lanes_;
if (trace->lsu_type == LsuType::FENCE) {
// schedule fence lock
state.fence_trace = trace;
state.fence_lock = true;
DT(3, "fence-lock: " << *trace);
// remove input
input.pop();
continue;
}
if (trace->lsu_type == LsuType::FENCE) {
// schedule fence lock
fence_state_ = trace;
fence_lock_ = true;
DT(3, "fence-lock: " << *trace);
// remove input
input.pop();
break;
}
// check pending queue capacity
if (state.pending_rd_reqs.full()) {
if (!trace->log_once(true)) {
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
}
continue;
} else {
trace->log_once(false);
}
// check pending queue capacity
if (pending_rd_reqs_.full()) {
if (!trace->log_once(true)) {
DT(3, "*** " << this->name() << "-lsu-queue-stall: " << *trace);
}
break;
} else {
trace->log_once(false);
}
bool is_write = (trace->lsu_type == LsuType::STORE);
uint32_t num_reqs;
auto tag = state.pending_rd_reqs.allocate({trace, 0});
if (DCACHE_WORD_SIZE != (XLEN/8)) {
num_reqs = this->send_coalesced_requests(trace, block_idx, tag);
} else {
num_reqs = this->send_requests(trace, block_idx, tag);
}
state.pending_rd_reqs.at(tag).count = num_reqs;
// duplicates detection
bool is_dup = false;
if (trace->tmask.test(t0)) {
uint64_t addr_mask = sizeof(uint32_t)-1;
uint32_t addr0 = trace_data->mem_addrs.at(0).addr & ~addr_mask;
uint32_t matches = 1;
for (uint32_t t = 1; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto mem_addr = trace_data->mem_addrs.at(t + t0).addr & ~addr_mask;
matches += (addr0 == mem_addr);
}
#ifdef LSU_DUP_ENABLE
is_dup = (matches == trace->tmask.count());
#endif
}
// do not wait on writes
bool is_write = (trace->lsu_type == LsuType::STORE);
if (is_write) {
state.pending_rd_reqs.release(tag);
output.push(trace, 1);
}
uint32_t addr_count;
if (is_dup) {
addr_count = 1;
} else {
addr_count = trace->tmask.count();
}
// remove input
input.pop();
}
}
auto tag = pending_rd_reqs_.allocate({trace, addr_count});
int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
int count = 0;
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
bool is_write = (trace->lsu_type == LsuType::STORE);
auto t0 = trace->pid * NUM_LSU_LANES;
for (uint32_t t = 0; t < num_lanes_; ++t) {
if (!trace->tmask.test(t0 + t))
continue;
auto& dcache_req_port = core_->lmem_demuxs_.at(t)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t + t0);
auto type = get_addr_type(mem_addr.addr);
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
uint32_t t = t0 + i;
if (!trace->tmask.test(t))
continue;
int req_idx = block_idx * DCACHE_CHANNELS + (i % DCACHE_CHANNELS);
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
auto mem_addr = trace_data->mem_addrs.at(t);
auto type = get_addr_type(mem_addr.addr);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.push(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", tid=" << t << ", addr_type=" << mem_req.type << ", " << *trace);
MemReq mem_req;
mem_req.addr = mem_addr.addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.push(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
if (is_dup)
break;
}
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
// do not wait on writes
if (is_write) {
pending_rd_reqs_.release(tag);
output.push(trace, 1);
}
++count;
}
return count;
}
// remove input
input.pop();
int LsuUnit::send_coalesced_requests(instr_trace_t* trace, int block_idx, int tag) {
int count = 0;
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
bool is_write = (trace->lsu_type == LsuType::STORE);
auto t0 = trace->pid * NUM_LSU_LANES;
break; // single block
}
++input_idx_;
auto addr_mask = ~uint64_t(LSU_LINE_SIZE-1);
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
std::bitset<NUM_LSU_LANES / DCACHE_CHANNELS> mask(0);
for (uint32_t i = 0; i < mask.size(); ++i) {
mask.set(i, trace->tmask.test(t0 + i));
}
int req_idx = block_idx * DCACHE_CHANNELS + c;
auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
while (mask.any()) {
// calculate seed index
int seed_idx = 0;
for (uint32_t i = 0; i < mask.size(); ++i) {
if (mask.test(i)) {
seed_idx = i;
break;
}
}
uint32_t seed_addr = trace_data->mem_addrs.at(t0 + seed_idx).addr & addr_mask;
auto type = get_addr_type(seed_addr);
// coalesce addresses matching the seed
uint32_t coelescing_size = 0;
for (uint32_t i = seed_idx; i < mask.size(); ++i) {
auto mem_addr = trace_data->mem_addrs.at(t0 + i).addr & addr_mask;
if (mem_addr == seed_addr) {
mask.set(i, 0);
++coelescing_size;
}
}
MemReq mem_req;
mem_req.addr = seed_addr;
mem_req.write = is_write;
mem_req.type = type;
mem_req.tag = tag;
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
dcache_req_port.push(mem_req, 1);
DT(3, "dcache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
if (coelescing_size > 1) {
DT(3, "*** coalescing: size=" << coelescing_size << ", " << *trace);
}
if (is_write) {
++core_->perf_stats_.stores;
} else {
++core_->perf_stats_.loads;
++pending_loads_;
}
++count;
}
t0 += mask.size();
}
return count;
}
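The loop above is the heart of the new coalescer: for each channel it picks the first still-pending lane as the seed, retires every lane whose line-aligned address matches the seed, and issues one memory request per distinct cache line. Below is a self-contained sketch of that inner loop, assuming a 4-lane channel and a 16-byte line (the function name and fixed sizes are assumptions for illustration, not part of the commit):

#include <array>
#include <bitset>
#include <cstdint>
#include <vector>

// Returns the line-aligned addresses a single channel would request for the given
// per-lane byte addresses; `active` marks which lanes are enabled by the thread mask.
static std::vector<uint64_t> coalesce_channel(const std::array<uint64_t, 4>& lane_addrs,
                                              std::bitset<4> active) {
  constexpr uint64_t line_size = 16;
  const uint64_t addr_mask = ~uint64_t(line_size - 1);
  std::vector<uint64_t> requests;
  while (active.any()) {
    // seed = first lane that still needs a request
    uint32_t seed = 0;
    while (!active.test(seed)) ++seed;
    const uint64_t seed_addr = lane_addrs[seed] & addr_mask;
    // retire every active lane that falls into the seed's cache line
    for (uint32_t i = seed; i < active.size(); ++i) {
      if (active.test(i) && ((lane_addrs[i] & addr_mask) == seed_addr))
        active.reset(i);
    }
    requests.push_back(seed_addr);  // one memory request per distinct line
  }
  return requests;
}

// e.g. lane addresses {0x1000, 0x1004, 0x1008, 0x1040} with all lanes active coalesce
// into two requests: 0x1000 (lanes 0-2 share a line) and 0x1040.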
///////////////////////////////////////////////////////////////////////////////
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
: FuncUnit(ctx, core, "SFU")
, input_idx_(0)
: FuncUnit(ctx, core, "SFU")
{}
void SfuUnit::tick() {
// check input queue
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
int iw = (input_idx_ + i) % ISSUE_WIDTH;
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto sfu_type = trace->sfu_type;
bool release_warp = trace->fetch_stall;
// check input queue
for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
auto& input = Inputs.at(iw);
if (input.empty())
continue;
auto& output = Outputs.at(iw);
auto trace = input.front();
auto sfu_type = trace->sfu_type;
bool release_warp = trace->fetch_stall;
switch (sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::PRED:
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC:
output.push(trace, 1);
break;
case SfuType::BAR: {
output.push(trace, 1);
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (trace->eop) {
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
}
release_warp = false;
} break;
case SfuType::CMOV:
output.push(trace, 3);
break;
default:
std::abort();
}
switch (sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::PRED:
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC:
output.push(trace, 1);
break;
case SfuType::BAR: {
output.push(trace, 1);
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (trace->eop) {
core_->barrier(trace_data->bar.id, trace_data->bar.count, trace->wid);
}
release_warp = false;
} break;
case SfuType::CMOV:
output.push(trace, 3);
break;
default:
std::abort();
}
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
core_->resume(trace->wid);
}
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
core_->resume(trace->wid);
}
input.pop();
break; // single block
}
++input_idx_;
input.pop();
}
}

View file

@@ -14,6 +14,7 @@
#pragma once
#include <simobject.h>
#include <array>
#include "instr_trace.h"
namespace vortex {
@@ -22,77 +23,89 @@ class Core;
class FuncUnit : public SimObject<FuncUnit> {
public:
std::vector<SimPort<instr_trace_t*>> Inputs;
std::vector<SimPort<instr_trace_t*>> Outputs;
std::vector<SimPort<instr_trace_t*>> Inputs;
std::vector<SimPort<instr_trace_t*>> Outputs;
FuncUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<FuncUnit>(ctx, name)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
virtual ~FuncUnit() {}
FuncUnit(const SimContext& ctx, Core* core, const char* name)
: SimObject<FuncUnit>(ctx, name)
, Inputs(ISSUE_WIDTH, this)
, Outputs(ISSUE_WIDTH, this)
, core_(core)
{}
virtual ~FuncUnit() {}
virtual void reset() {}
virtual void reset() {}
virtual void tick() = 0;
virtual void tick() = 0;
protected:
Core* core_;
Core* core_;
};
///////////////////////////////////////////////////////////////////////////////
class AluUnit : public FuncUnit {
public:
AluUnit(const SimContext& ctx, Core*);
void tick();
AluUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class FpuUnit : public FuncUnit {
public:
FpuUnit(const SimContext& ctx, Core*);
void tick();
FpuUnit(const SimContext& ctx, Core*);
void tick();
};
///////////////////////////////////////////////////////////////////////////////
class LsuUnit : public FuncUnit {
public:
LsuUnit(const SimContext& ctx, Core*);
LsuUnit(const SimContext& ctx, Core*);
~LsuUnit();
void reset();
void reset();
void tick();
void tick();
private:
private:
struct pending_req_t {
instr_trace_t* trace;
uint32_t count;
};
HashTable<pending_req_t> pending_rd_reqs_;
uint32_t num_lanes_;
instr_trace_t* fence_state_;
uint64_t pending_loads_;
bool fence_lock_;
uint32_t input_idx_;
int send_requests(instr_trace_t* trace, int block_idx, int tag);
int send_coalesced_requests(instr_trace_t* trace, int block_idx, int tag);
struct pending_req_t {
instr_trace_t* trace;
uint32_t count;
};
struct lsu_state_t {
HashTable<pending_req_t> pending_rd_reqs;
instr_trace_t* fence_trace;
bool fence_lock;
lsu_state_t() : pending_rd_reqs(LSUQ_IN_SIZE) {}
void clear() {
this->pending_rd_reqs.clear();
this->fence_trace = nullptr;
this->fence_lock = false;
}
};
std::array<lsu_state_t, NUM_LSU_BLOCKS> states_;
uint64_t pending_loads_;
};
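To make the new indexing explicit, here is a sketch of how issue slots, LSU blocks, and data-cache request ports relate in this commit (the helper functions are hypothetical; the formulas mirror the func_unit.cpp changes above):

// issue slot -> LSU block holding its pending-request state
inline uint32_t lsu_block_index(uint32_t issue_slot) {
  return issue_slot % NUM_LSU_BLOCKS;
}
// (block, lane) -> data-cache request port driven through lsu_demux_
inline uint32_t dcache_request_index(uint32_t block_idx, uint32_t lane) {
  return block_idx * DCACHE_CHANNELS + (lane % DCACHE_CHANNELS);
}
// on the response side the block is recovered as: block_idx = rsp_port_index / DCACHE_CHANNELS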
///////////////////////////////////////////////////////////////////////////////
class SfuUnit : public FuncUnit {
public:
SfuUnit(const SimContext& ctx, Core*);
void tick();
private:
uint32_t input_idx_;
SfuUnit(const SimContext& ctx, Core*);
void tick();
};
}

View file

@@ -41,7 +41,7 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
uint8_t(arch.num_clusters()), // request size
true, // write-through
false, // write response
L3_MSHR_SIZE, // mshr
L3_MSHR_SIZE, // mshr size
2, // pipeline latency
}
);

View file

@@ -28,10 +28,10 @@ Socket::Socket(const SimContext& ctx,
, dcache_mem_rsp_port(this)
, socket_id_(socket_id)
, cluster_(cluster)
, cores_(arch.socket_size())
, cores_(arch.socket_size())
{
auto cores_per_socket = cores_.size();
char sname[100];
snprintf(sname, 100, "socket%d-icaches", socket_id);
icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{
@@ -46,7 +46,7 @@ Socket::Socket(const SimContext& ctx,
1, // number of inputs
false, // write-through
false, // write response
(uint8_t)arch.num_warps(), // mshr
(uint8_t)arch.num_warps(), // mshr size
2, // pipeline latency
});
@@ -54,19 +54,19 @@ Socket::Socket(const SimContext& ctx,
icache_mem_rsp_port.bind(&icaches_->MemRspPort);
snprintf(sname, 100, "socket%d-dcaches", socket_id);
dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, DCACHE_NUM_REQS, CacheSim::Config{
!DCACHE_ENABLED,
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // L
log2ceil(sizeof(Word)), // W
log2ceil(DCACHE_WORD_SIZE), // W
log2ceil(DCACHE_NUM_WAYS),// A
log2ceil(DCACHE_NUM_BANKS), // B
XLEN, // address bits
1, // number of ports
DCACHE_NUM_BANKS, // number of inputs
DCACHE_NUM_REQS, // number of inputs
true, // write-through
false, // write response
DCACHE_MSHR_SIZE, // mshr
DCACHE_MSHR_SIZE, // mshr size
2, // pipeline latency
});
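For reference, a worked example of the geometry encoding above (values are illustrative assumptions, not taken from the commit):

// DCACHE_SIZE = 16 KB, L1_LINE_SIZE = 64, DCACHE_WORD_SIZE = 16,
// DCACHE_NUM_WAYS = 2, DCACHE_NUM_BANKS = 2 would give
//   C = 14, L = 6, W = 4, A = 1, B = 1
// with DCACHE_NUM_REQS core-side input ports replacing the old one-port-per-bank wiring.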
@@ -75,17 +75,14 @@ Socket::Socket(const SimContext& ctx,
// create cores
for (uint32_t i = 0; i < cores_per_socket; ++i) {
for (uint32_t i = 0; i < cores_per_socket; ++i) {
uint32_t core_id = socket_id * cores_per_socket + i;
cores_.at(i) = Core::Create(core_id,
this,
arch,
dcrs);
cores_.at(i) = Core::Create(core_id, this, arch, dcrs);
cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));
for (uint32_t j = 0; j < NUM_LSU_LANES; ++j) {
for (uint32_t j = 0; j < DCACHE_NUM_REQS; ++j) {
cores_.at(i)->dcache_req_ports.at(j).bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&cores_.at(i)->dcache_rsp_ports.at(j));
}
@@ -96,7 +93,7 @@ Socket::~Socket() {
//--
}
void Socket::reset() {
void Socket::reset() {
//--
}
@@ -137,6 +134,6 @@ void Socket::resume(uint32_t core_index) {
Socket::PerfStats Socket::perf_stats() const {
PerfStats perf_stats;
perf_stats.icache = icaches_->perf_stats();
perf_stats.dcache = dcaches_->perf_stats();
perf_stats.dcache = dcaches_->perf_stats();
return perf_stats;
}

View file

@@ -244,7 +244,7 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
struct MemReq {
uint64_t addr;
bool write;
bool write;
AddrType type;
uint32_t tag;
uint32_t cid;