Merge branch 'master' into stencil3d

2025-04-23 21:39:10 -04:00 · 2024-07-23 21:01:10 -07:00 · 2024-07-23 21:01:10 -07:00 · e42c7c6a82
commit e42c7c6a82
parent b489cc7abd 8df962d6b4
24 changed files with 845 additions and 375 deletions
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -102,7 +102,8 @@ jobs:
          cd build${{ matrix.xlen }}
          ../configure --tooldir=$TOOLDIR --xlen=${{ matrix.xlen }}
          source ci/toolchain_env.sh
-          make build -s > /dev/null
+          make software -s > /dev/null
+          make tests -s > /dev/null

      - name: Upload Build Artifact
        uses: actions/upload-artifact@v2
--- a/Makefile.in
+++ b/Makefile.in
@ -1,5 +1,7 @@
 include config.mk

+.PHONY: build software tests
+
 all:
 	$(MAKE) -C $(VORTEX_HOME)/third_party
 	$(MAKE) -C hw
@ -15,6 +17,14 @@ build:
 	$(MAKE) -C runtime
 	$(MAKE) -C tests

+# Runs the hw, kernel and runtime/stub sub-makes, in that order. Nothing
+# under sim/ and no other runtime back-end is built by this target.
+# NOTE(review): presumably hw goes first because it generates files the
+# later steps consume -- confirm against hw/Makefile.
+software:
+	$(MAKE) -C hw
+	$(MAKE) -C kernel
+	$(MAKE) -C runtime/stub
+
+# Runs the sub-make in tests/. `tests` is also the name of that directory,
+# so this target only fires because it is listed in .PHONY.
+tests:
+	$(MAKE) -C tests
+
 clean:
 	$(MAKE) -C hw clean
 	$(MAKE) -C sim clean
--- a/README.md
+++ b/README.md
@ -56,7 +56,6 @@ More detailed build instructions can be found [here](docs/install_vortex.md).
    $ git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
    $ cd Vortex
 ### Configure your build folder
-    # By default, the toolchain default install location is the /opt folder and can be overridden by setting --tooldir.
    $ mkdir build
    $ cd build
    $ ../configure --xlen=32 --tooldir=$HOME/tools
--- a/ci/regression.sh.in
+++ b/ci/regression.sh.in
@ -23,37 +23,6 @@ XLEN=${XLEN:=@XLEN@}

 echo "Vortex Regression Test: XLEN=$XLEN"

-split_file() {
-    if [[ $# -ne 2 ]]; then
-        echo "Usage: $0 <filename> <start_with>"
-        return 1
-    fi
-    input_file="$1"
-    start_with="$2"
-    if [[ ! -r "$input_file" ]]; then
-        echo "Error: File '$input_file' is not readable or does not exist."
-        return 1
-    fi
-    count=0
-    output_file=""
-    while IFS= read -r line; do
-        if [[ $line == $start_with* ]]; then
-            count=$((count + 1))
-            output_file="$input_file.part$count"
-            > "$output_file" # ensure empty
-        fi
-        if [[ -n "$output_file" ]]; then
-            echo "$line" >> "$output_file"
-        fi
-    done < "$input_file"
-
-    if [[ $count -eq 0 ]]; then
-        echo "No lines starting with '$start_with' were found in '$input_file'."
-    fi
-}
-
-###############################################################################
-
 unittest()
 {
    make -C tests/unittest run
@ -64,6 +33,9 @@ isa()
 {
    echo "begin isa tests..."

+    make -C sim/simx
+    make -C sim/rtlsim
+
    make -C tests/riscv/isa run-simx
    make -C tests/riscv/isa run-rtlsim

@ -94,8 +66,8 @@ isa()
        make -C tests/riscv/isa run-rtlsim-64fx
    fi

-    # restore default prebuilt configuration
-    make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
+    # clean build
+    make -C sim/rtlsim clean

    echo "isa tests done!"
 }
@ -104,6 +76,9 @@ kernel()
 {
    echo "begin kernel tests..."

+    make -C sim/simx
+    make -C sim/rtlsim
+
    make -C tests/kernel run-simx
    make -C tests/kernel run-rtlsim

@ -114,6 +89,9 @@ regression()
 {
    echo "begin regression tests..."

+    make -C runtime/simx
+    make -C runtime/rtlsim
+
    make -C tests/regression run-simx
    make -C tests/regression run-rtlsim

@ -132,6 +110,9 @@ opencl()
 {
    echo "begin opencl tests..."

+    make -C runtime/simx
+    make -C runtime/rtlsim
+
    make -C tests/opencl run-simx
    make -C tests/opencl run-rtlsim

@ -148,29 +129,20 @@ test_csv_trace()
    make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
    make -C tests/riscv/isa run-simx-32im > run_simx.log
    make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
-    split_file run_simx.log "Running "
-    split_file run_rtlsim.log "Running "
-    for file in ./run_simx.log.part*; do
-        if [[ -f "$file" ]]; then
-            file2="${file//simx/rtlsim}"
-            if [[ -f "$file2" ]]; then
-                ./ci/trace_csv.py -tsimx $file -otrace_simx.csv
-                ./ci/trace_csv.py -trtlsim $file2 -otrace_rtlsim.csv
-                diff trace_rtlsim.csv trace_simx.csv
-            else
-                echo "File $file2 not found."
-            fi
-        fi
-    done
-    # restore default prebuilt configuration
-    make -C sim/simx clean && make -C sim/simx > /dev/null
-    make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
+    ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
+    ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
+    diff trace_rtlsim.csv trace_simx.csv
+    # clean build
+    make -C sim/simx clean
+    make -C sim/rtlsim clean
 }

 debug()
 {
    echo "begin debugging tests..."
+
    test_csv_trace
+
    ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
    ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
    ./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
@ -250,11 +222,12 @@ config2()
    STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
    ./ci/blackbox.sh --driver=simx --app=dogfood
    ./ci/blackbox.sh --driver=rtlsim --app=dogfood
+    make -C tests/regression/dogfood clean-kernel

    # disabling M & F extensions
    make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null
    make -C tests/riscv/isa run-rtlsim-32i
-    make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
+    make -C sim/rtlsim clean

    # disabling ZICOND extension
    CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo
--- a/4
+++ b/4
@ -111,7 +111,7 @@ copy_files() {

 # default configuration parameters
 default_xlen=32
-default_tooldir=/opt
+default_tooldir=$HOME/tools
 default_osversion=$(detect_osversion)
 default_prefix=$CURRENT_DIR

@ -140,7 +140,7 @@ PREFIX=${PREFIX:=$default_prefix}
 usage() {
    echo "Usage: $0 [--xlen=<value>] [--tooldir=<path>] [--osversion=<version>]"
    echo "  --xlen=<value>       Set the XLEN value (default: 32)"
-    echo "  --tooldir=<path>     Set the TOOLDIR path (default: /opt)"
+    echo "  --tooldir=<path>     Set the TOOLDIR path (default: $HOME/tools)"
    echo "  --osversion=<version> Set the OS Version (default: $(detect_osversion))"
    echo "  --prefix=<path>      Set installation directory"
    exit 1
--- a/hw/rtl/libs/VX_mem_coalescer.sv
+++ b/hw/rtl/libs/VX_mem_coalescer.sv
@ -374,7 +374,7 @@ module VX_mem_coalescer #(
            `TRACE_ARRAY1D(1, "%0d", out_req_offset, NUM_REQS);
            `TRACE(1, (", pmask=%b, tag=0x%0h (#%0d)\n", out_req_pmask, out_req_tag, out_req_uuid));
            if ($countones(out_req_pmask) > 1) begin
-                `TRACE(1, ("%t: *** %s: coalescing=%b (#%0d)\n", $time, INSTANCE_ID, out_req_pmask, out_req_uuid));
+                `TRACE(1, ("%t: *** %s: coalesced=%d (#%0d)\n", $time, INSTANCE_ID, $countones(out_req_pmask), out_req_uuid));
            end
        end
        if (out_rsp_fire) begin
--- a/hw/rtl/libs/VX_stream_pack.sv
+++ b/hw/rtl/libs/VX_stream_pack.sv
@ -38,11 +38,9 @@ module VX_stream_pack #(
    output wire [TAG_WIDTH-1:0]         tag_out,
    input wire                          ready_out
 );
-    localparam LOG_NUM_REQS = `CLOG2(NUM_REQS);
-
    if (NUM_REQS > 1) begin

-        wire [LOG_NUM_REQS-1:0] grant_index;
+        wire [NUM_REQS-1:0] grant_onehot;
        wire grant_valid;
        wire grant_ready;

@ -54,29 +52,33 @@ module VX_stream_pack #(
            .reset       (reset),
            .requests    (valid_in),
            .grant_valid (grant_valid),
-            .grant_index (grant_index),
-            `UNUSED_PIN  (grant_onehot),
+            `UNUSED_PIN  (grant_index),
+            .grant_onehot(grant_onehot),
            .grant_ready (grant_ready)
        );

-        reg [NUM_REQS-1:0] valid_sel;
-        reg [NUM_REQS-1:0] ready_sel;
-        wire ready_unqual;
+        wire [TAG_WIDTH-1:0] tag_sel;

-        wire [TAG_WIDTH-1:0] tag_sel = tag_in[grant_index];
+        VX_onehot_mux #(
+            .DATAW (TAG_WIDTH),
+            .N     (NUM_REQS)
+        ) onehot_mux (
+            .data_in  (tag_in),
+            .sel_in   (grant_onehot),
+            .data_out (tag_sel)
+        );

-        always @(*) begin
-            valid_sel = '0;
-            ready_sel = '0;
-            for (integer i = 0; i < NUM_REQS; ++i) begin
-                if (tag_in[i][TAG_SEL_BITS-1:0] == tag_sel[TAG_SEL_BITS-1:0]) begin
-                    valid_sel[i] = valid_in[i];
-                    ready_sel[i] = ready_unqual;
-                end
-            end
+        wire [NUM_REQS-1:0] tag_matches;
+
+        for (genvar i = 0; i < NUM_REQS; ++i) begin
+            assign tag_matches[i] = (tag_in[i][TAG_SEL_BITS-1:0] == tag_sel[TAG_SEL_BITS-1:0]);
        end

-        assign grant_ready = ready_unqual;
+        for (genvar i = 0; i < NUM_REQS; ++i) begin
+            assign ready_in[i] = grant_ready & tag_matches[i];
+        end
+
+        wire [NUM_REQS-1:0] mask_sel = valid_in & tag_matches;

        VX_elastic_buffer #(
            .DATAW   (NUM_REQS + TAG_WIDTH + (NUM_REQS * DATA_WIDTH)),
@ -86,15 +88,13 @@ module VX_stream_pack #(
            .clk       (clk),
            .reset     (reset),
            .valid_in  (grant_valid),
-            .data_in   ({valid_sel, tag_sel, data_in}),
-            .ready_in  (ready_unqual),
+            .data_in   ({mask_sel, tag_sel, data_in}),
+            .ready_in  (grant_ready),
            .valid_out (valid_out),
            .data_out  ({mask_out, tag_out, data_out}),
            .ready_out (ready_out)
        );

-        assign ready_in = ready_sel;
-
    end else begin

        `UNUSED_VAR (clk)
--- a/ramulator_config.yaml
+++ b/ramulator_config.yaml
@ -1,28 +0,0 @@
-Frontend:
-  impl: GEM5
-
-MemorySystem:
-  impl: GenericDRAM
-  clock_ratio: 1
-
-  DRAM:
-    impl: HBM2
-    org:
-      preset: HBM2_8Gb
-      density: 8192
-    timing:
-      preset: HBM2_2Gbps
-
-  Controller:
-    impl: Generic
-    Scheduler:
-      impl: FRFCFS
-    RefreshManager:
-      impl: AllBank
-    RowPolicy:
-      impl: OpenRowPolicy
-      cap: 1
-    plugins:
-
-  AddrMapper:
-    impl: RoBaRaCoCh
--- a/sim/common/bitvector.h
+++ b/sim/common/bitvector.h
@ -0,0 +1,314 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <algorithm>
+#include <bitset>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <ostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace vortex {
+
+// Dynamically sized bit set stored as a vector of words of type T.
+//
+// Bit `pos` lives in word `pos / BITS_PER_WORD` at offset `pos % BITS_PER_WORD`.
+// Invariant: the storage bits of the last word that lie at or beyond size()
+// are always zero, so count(), none()/any() and operator== only observe
+// addressable bits. Every mutator that could set those bits (flip, operator~,
+// operator<<=, resize) re-establishes the invariant before returning.
+template <typename T = uint32_t>
+class BitVector {
+private:
+  static constexpr size_t BITS_PER_WORD = sizeof(T) * 8;
+  std::vector<T> bits_;
+  size_t size_;
+  bool all_zero_; // cached "no bit is set" flag, kept in sync by every mutator
+
+  // Number of words needed to hold `size` bits.
+  static size_t wordCount(size_t size) {
+    return (size + (BITS_PER_WORD - 1)) / BITS_PER_WORD;
+  }
+
+  size_t wordIndex(size_t pos) const {
+    return pos / BITS_PER_WORD;
+  }
+
+  T bitMask(size_t pos) const {
+    return T(1) << (pos % BITS_PER_WORD);
+  }
+
+  // Zeroes the bits of the last word that lie beyond size_ (the padding).
+  void clearUnusedBits() {
+    size_t used = size_ % BITS_PER_WORD;
+    if (used != 0 && !bits_.empty()) {
+      bits_.back() &= T((T(1) << used) - 1);
+    }
+  }
+
+  // Recomputes the cached all-zero flag from the storage words.
+  void updateAllZero() {
+    all_zero_ = std::all_of(bits_.begin(), bits_.end(), [](T word) { return word == 0; });
+  }
+
+public:
+  // Creates a vector of `size` bits, all cleared.
+  explicit BitVector(size_t size = 0)
+    : bits_(wordCount(size))
+    , size_(size)
+    , all_zero_(true)
+  {}
+
+  // Sets bit `pos`. Throws std::out_of_range if pos >= size().
+  void set(size_t pos) {
+    if (pos >= size_) throw std::out_of_range("Index out of range");
+    bits_[this->wordIndex(pos)] |= this->bitMask(pos);
+    all_zero_ = false;
+  }
+
+  // Sets or clears bit `pos` according to `value`.
+  void set(size_t pos, bool value) {
+    if (value) {
+      this->set(pos);
+    } else {
+      this->reset(pos);
+    }
+  }
+
+  // Clears every bit.
+  void reset() {
+    std::fill(bits_.begin(), bits_.end(), T(0));
+    all_zero_ = true;
+  }
+
+  // Clears bit `pos`. Throws std::out_of_range if pos >= size().
+  void reset(size_t pos) {
+    if (pos >= size_) throw std::out_of_range("Index out of range");
+    bits_[this->wordIndex(pos)] &= ~this->bitMask(pos);
+    this->updateAllZero();
+  }
+
+  // Returns the value of bit `pos`. Throws std::out_of_range if pos >= size().
+  bool test(size_t pos) const {
+    if (pos >= size_) throw std::out_of_range("Index out of range");
+    return (bits_[this->wordIndex(pos)] & this->bitMask(pos)) != 0;
+  }
+
+  size_t size() const {
+    return size_;
+  }
+
+  // Changes the number of bits. Bits below min(old, new) keep their value;
+  // newly exposed bits read as zero, including after a shrink followed by a
+  // grow (stale bits in the last word are discarded on shrink).
+  void resize(size_t new_size) {
+    size_ = new_size;
+    bits_.resize(wordCount(new_size), T(0));
+    this->clearUnusedBits();
+    this->updateAllZero();
+  }
+
+  bool operator==(const BitVector& other) const {
+    return (size_ == other.size_) && (bits_ == other.bits_);
+  }
+
+  bool operator!=(const BitVector& other) const {
+    return !(*this == other);
+  }
+
+  // Read-only element access; bounds-checked like test().
+  bool operator[](size_t pos) const {
+    return test(pos);
+  }
+
+  // In-place bitwise operators. Throw std::invalid_argument on size mismatch.
+  BitVector& operator&=(const BitVector& other) {
+    if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
+    for (size_t i = 0; i < bits_.size(); ++i) {
+      bits_[i] &= other.bits_[i];
+    }
+    this->updateAllZero();
+    return *this;
+  }
+
+  BitVector& operator|=(const BitVector& other) {
+    if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
+    for (size_t i = 0; i < bits_.size(); ++i) {
+      bits_[i] |= other.bits_[i];
+    }
+    this->updateAllZero();
+    return *this;
+  }
+
+  BitVector& operator^=(const BitVector& other) {
+    if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
+    for (size_t i = 0; i < bits_.size(); ++i) {
+      bits_[i] ^= other.bits_[i];
+    }
+    this->updateAllZero();
+    return *this;
+  }
+
+  // Returns a copy with every addressable bit inverted.
+  BitVector operator~() const {
+    BitVector result(*this);
+    result.flip();
+    return result;
+  }
+
+  // Inverts every addressable bit in place.
+  void flip() {
+    for (auto &word : bits_) {
+      word = ~word;
+    }
+    // the word-wise inversion also set the padding bits; drop them again
+    this->clearUnusedBits();
+    this->updateAllZero();
+  }
+
+  // Number of bits that are set.
+  size_t count() const {
+    size_t total = 0;
+    for (const auto &word : bits_) {
+      total += std::bitset<BITS_PER_WORD>(word).count();
+    }
+    return total;
+  }
+
+  bool none() const {
+    return all_zero_;
+  }
+
+  bool any() const {
+    return !all_zero_;
+  }
+
+  // True when every addressable bit is set (vacuously true for size() == 0).
+  bool all() const {
+    size_t full_bits = size_ / BITS_PER_WORD;
+    size_t remaining_bits = size_ % BITS_PER_WORD;
+    T full_mask = ~T(0);
+    for (size_t i = 0; i < full_bits; ++i) {
+      if (bits_[i] != full_mask)
+        return false;
+    }
+    if (remaining_bits > 0) {
+      T partial_mask = (T(1) << remaining_bits) - 1;
+      if ((bits_[full_bits] & partial_mask) != partial_mask)
+        return false;
+    }
+    return true;
+  }
+
+  // Moves every bit `pos` positions towards higher indices; bits shifted past
+  // size() are discarded and vacated low positions become zero.
+  BitVector& operator<<=(size_t pos) {
+    if (pos >= size_) {
+      reset();
+      return *this;
+    }
+
+    size_t word_shift = pos / BITS_PER_WORD;
+    size_t bit_shift = pos % BITS_PER_WORD;
+
+    if (word_shift > 0) {
+      // word_shift >= 1, so the unsigned index stops before it could wrap
+      for (size_t i = bits_.size() - 1; i >= word_shift; --i) {
+        bits_[i] = bits_[i - word_shift];
+      }
+      std::fill(bits_.begin(), bits_.begin() + word_shift, T(0));
+    }
+
+    if (bit_shift > 0) {
+      for (size_t i = bits_.size() - 1; i > 0; --i) {
+        bits_[i] = (bits_[i] << bit_shift) | (bits_[i - 1] >> (BITS_PER_WORD - bit_shift));
+      }
+      bits_[0] <<= bit_shift;
+    }
+
+    // bits pushed beyond size() landed in the padding of the last word
+    this->clearUnusedBits();
+    this->updateAllZero();
+    return *this;
+  }
+
+  // Moves every bit `pos` positions towards lower indices; vacated high
+  // positions become zero.
+  BitVector& operator>>=(size_t pos) {
+    if (pos >= size_) {
+      reset();
+      return *this;
+    }
+
+    size_t word_shift = pos / BITS_PER_WORD;
+    size_t bit_shift = pos % BITS_PER_WORD;
+
+    if (word_shift > 0) {
+      for (size_t i = 0; i < bits_.size() - word_shift; ++i) {
+        bits_[i] = bits_[i + word_shift];
+      }
+      std::fill(bits_.end() - word_shift, bits_.end(), T(0));
+    }
+
+    if (bit_shift > 0) {
+      for (size_t i = 0; i < bits_.size() - 1; ++i) {
+        bits_[i] = (bits_[i] >> bit_shift) | (bits_[i + 1] << (BITS_PER_WORD - bit_shift));
+      }
+      bits_.back() >>= bit_shift;
+    }
+
+    this->updateAllZero();
+    return *this;
+  }
+
+  // Text form with bit 0 first (note: std::bitset prints the highest bit first).
+  std::string to_string() const {
+    std::string result;
+    result.reserve(size_);
+    for (size_t i = 0; i < size_; ++i) {
+      result.push_back(test(i) ? '1' : '0');
+    }
+    return result;
+  }
+
+  // Packs the bits into an integer, bit 0 being the least significant.
+  // Throws std::overflow_error when size() exceeds the width of the result.
+  unsigned long to_ulong() const {
+    if (size_ > sizeof(unsigned long) * 8) {
+      throw std::overflow_error("BitVector size exceeds unsigned long capacity");
+    }
+
+    unsigned long result = 0;
+    for (size_t i = 0; i < size_; ++i) {
+      if (test(i)) {
+        result |= (1UL << i);
+      }
+    }
+    return result;
+  }
+
+  unsigned long long to_ullong() const {
+    if (size_ > sizeof(unsigned long long) * 8) {
+      throw std::overflow_error("BitVector size exceeds unsigned long long capacity");
+    }
+
+    unsigned long long result = 0;
+    for (size_t i = 0; i < size_; ++i) {
+      if (test(i)) {
+        result |= (1ULL << i);
+      }
+    }
+    return result;
+  }
+
+  // Streams the same bit-0-first text as to_string(). Characters are written
+  // explicitly so the output does not depend on the stream's boolalpha flag.
+  friend std::ostream& operator<<(std::ostream& os, const BitVector& bv) {
+    for (size_t i = 0; i < bv.size_; ++i) {
+      os << (bv.test(i) ? '1' : '0');
+    }
+    return os;
+  }
+
+  friend BitVector operator&(const BitVector& lhs, const BitVector& rhs) {
+    BitVector result(lhs);
+    result &= rhs;
+    return result;
+  }
+
+  friend BitVector operator|(const BitVector& lhs, const BitVector& rhs) {
+    BitVector result(lhs);
+    result |= rhs;
+    return result;
+  }
+
+  friend BitVector operator^(const BitVector& lhs, const BitVector& rhs) {
+    BitVector result(lhs);
+    result ^= rhs;
+    return result;
+  }
+
+  friend BitVector operator<<(const BitVector& lhs, size_t pos) {
+    BitVector result(lhs);
+    result <<= pos;
+    return result;
+  }
+
+  friend BitVector operator>>(const BitVector& lhs, size_t pos) {
+    BitVector result(lhs);
+    result >>= pos;
+    return result;
+  }
+};
+
+}
+
+// Makes vortex::BitVector usable as a key in unordered containers.
+// The hash is that of the vector's to_string() text, so two vectors that
+// compare equal always hash to the same value.
+namespace std {
+
+template <typename T>
+struct hash<vortex::BitVector<T>> {
+  size_t operator()(const vortex::BitVector<T>& bv) const {
+    const std::string text = bv.to_string();
+    hash<std::string> string_hasher;
+    return string_hasher(text);
+  }
+};
+
+}
--- a/sim/rtlsim/main.cpp
+++ b/sim/rtlsim/main.cpp
@ -89,7 +89,9 @@ int main(int argc, char **argv) {
 			return -1;
 		}
 	}
-
+#ifndef NDEBUG
+	std::cout << "[VXDRV] START: program=" << program << std::endl;
+#endif
 	// run simulation
 	processor.run();

--- a/sim/simx/cache_sim.cpp
+++ b/sim/simx/cache_sim.cpp
@ -539,7 +539,7 @@ private:
 							continue;
 						MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid};
 						simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency);
-						DT(3, simobject_->name() << "-core-" << core_rsp);
+						DT(3, simobject_->name() << "-replay-" << core_rsp);
 					}
 				}
 			} break;
@ -583,7 +583,7 @@ private:
 							mem_req.cid   = pipeline_req.cid;
 							mem_req.uuid  = pipeline_req.uuid;
 							mem_req_ports_.at(bank_id).push(mem_req, 1);
-							DT(3, simobject_->name() << "-dram-" << mem_req);
+							DT(3, simobject_->name() << "-writethrough-" << mem_req);
 						} else {
 							// mark line as dirty
 							hit_line.dirty = true;
@ -615,7 +615,7 @@ private:
 							mem_req.write = true;
 							mem_req.cid   = pipeline_req.cid;
 							mem_req_ports_.at(bank_id).push(mem_req, 1);
-							DT(3, simobject_->name() << "-dram-" << mem_req);
+							DT(3, simobject_->name() << "-writeback-" << mem_req);
 							++perf_stats_.evictions;
 						}
 					}
@ -629,7 +629,7 @@ private:
 							mem_req.cid   = pipeline_req.cid;
 							mem_req.uuid  = pipeline_req.uuid;
 							mem_req_ports_.at(bank_id).push(mem_req, 1);
-							DT(3, simobject_->name() << "-dram-" << mem_req);
+							DT(3, simobject_->name() << "-writethrough-" << mem_req);
 						}
 						// send core response
 						if (config_.write_reponse) {
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@ -44,8 +44,10 @@ Core::Core(const SimContext& ctx,
  , operands_(ISSUE_WIDTH)
  , dispatchers_((uint32_t)FUType::Count)
  , func_units_((uint32_t)FUType::Count)
-  , lsu_demux_(LSU_NUM_REQS)
+  , lsu_demux_(NUM_LSU_BLOCKS)
  , mem_coalescers_(NUM_LSU_BLOCKS)
+  , lsu_dcache_adapter_(NUM_LSU_BLOCKS)
+  , lsu_lmem_adapter_(NUM_LSU_BLOCKS)
  , pending_icache_(arch_.num_warps())
  , commit_arbs_(ISSUE_WIDTH)
 {
@ -72,31 +74,53 @@ Core::Core(const SimContext& ctx,
  });

  // create lsu demux
-  for (uint32_t i = 0; i < LSU_NUM_REQS; ++i) {
+  for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
    snprintf(sname, 100, "core%d-lsu_demux%d", core_id, i);
    lsu_demux_.at(i) = LocalMemDemux::Create(sname, 1);
  }

-  // connect dcache-coalescer
-  for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
-    for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
-      uint32_t i = b * DCACHE_CHANNELS + c;
-      mem_coalescers_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
-      dcache_rsp_ports.at(i).bind(&mem_coalescers_.at(b)->RspOut.at(c));
-    }
+  // create lsu dcache adapter
+  for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
+    snprintf(sname, 100, "core%d-lsu_dcache_adapter%d", core_id, i);
+    lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
+  }
+
+  // create lsu lmem adapter
+  for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
+    snprintf(sname, 100, "core%d-lsu_lmem_adapter%d", core_id, i);
+    lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
  }

  // connect lsu demux
+  for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
+    lsu_demux_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
+    mem_coalescers_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspDC);
+
+    lsu_demux_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
+    lsu_lmem_adapter_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspLmem);
+  }
+
+  // connect coalescer-adapter
+  for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
+    mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter_.at(b)->ReqIn);
+    lsu_dcache_adapter_.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
+  }
+
+  // connect adapter-dcache
+  for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
+    for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
+      uint32_t i = b * DCACHE_CHANNELS + c;
+      lsu_dcache_adapter_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
+      dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter_.at(b)->RspOut.at(c));
+    }
+  }
+
+  // connect adapter-lmem
  for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
    for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
      uint32_t i = b * LSU_CHANNELS + c;
-      auto lmem_demux = lsu_demux_.at(i);
-
-      lmem_demux->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn.at(c));
-      mem_coalescers_.at(b)->RspIn.at(c).bind(&lmem_demux->RspDC);
-
-      lmem_demux->ReqSM.bind(&local_mem_->Inputs.at(i));
-      local_mem_->Outputs.at(i).bind(&lmem_demux->RspSM);
+      lsu_lmem_adapter_.at(b)->ReqOut.at(c).bind(&local_mem_->Inputs.at(i));
+      local_mem_->Outputs.at(i).bind(&lsu_lmem_adapter_.at(b)->RspOut.at(c));
    }
  }

--- a/sim/simx/core.h
+++ b/sim/simx/core.h
@ -152,6 +152,8 @@ private:
  LocalMem::Ptr local_mem_;
  std::vector<LocalMemDemux::Ptr> lsu_demux_;
  std::vector<MemCoalescer::Ptr> mem_coalescers_;
+  std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter_;
+  std::vector<LsuMemAdapter::Ptr> lsu_lmem_adapter_;

  PipelineLatch fetch_latch_;
  PipelineLatch decode_latch_;
--- a/sim/simx/func_unit.cpp
+++ b/sim/simx/func_unit.cpp
@ -24,7 +24,7 @@

 using namespace vortex;

-AluUnit::AluUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "ALU") {}
+AluUnit::AluUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "alu-unit") {}

 void AluUnit::tick() {
  for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
@ -49,7 +49,7 @@ void AluUnit::tick() {
 		default:
 			std::abort();
 		}
-		DT(3, "pipeline-execute: op=" << trace->alu_type << ", " << *trace);
+		// keep the "op=" key so ALU trace lines parse like the FPU/SFU ones
+		DT(3, this->name() << ": op=" << trace->alu_type << ", " << *trace);
 		if (trace->eop && trace->fetch_stall) {
 			core_->resume(trace->wid);
 		}
@ -59,7 +59,7 @@ void AluUnit::tick() {

 ///////////////////////////////////////////////////////////////////////////////

-FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "FPU") {}
+FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "fpu-unit") {}

 void FpuUnit::tick() {
 	for (uint32_t iw = 0; iw < ISSUE_WIDTH; ++iw) {
@ -88,7 +88,7 @@ void FpuUnit::tick() {
 		default:
 			std::abort();
 		}
-		DT(3, "pipeline-execute: op=" << trace->fpu_type << ", " << *trace);
+		DT(3,this->name() << ": op=" << trace->fpu_type << ", " << *trace);
 		input.pop();
 	}
 }
@ -96,7 +96,7 @@ void FpuUnit::tick() {
 ///////////////////////////////////////////////////////////////////////////////

 LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
-	: FuncUnit(ctx, core, "LSU")
+	: FuncUnit(ctx, core, "lsu-unit")
 	, pending_loads_(0)
 {}

@ -114,25 +114,25 @@ void LsuUnit::tick() {
 	core_->perf_stats_.load_latency += pending_loads_;

 	// handle memory responses
-	for (uint32_t r = 0; r < LSU_NUM_REQS; ++r) {
-		auto& dcache_rsp_port = core_->lsu_demux_.at(r)->RspIn;
-		if (dcache_rsp_port.empty())
+	for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
+		auto& lsu_rsp_port = core_->lsu_demux_.at(b)->RspIn;
+		if (lsu_rsp_port.empty())
 			continue;
-		uint32_t block_idx = r / LSU_CHANNELS;
-		auto& state = states_.at(block_idx);
-		auto& mem_rsp = dcache_rsp_port.front();
-		auto& entry = state.pending_rd_reqs.at(mem_rsp.tag);
+		auto& state = states_.at(b);
+		auto& lsu_rsp = lsu_rsp_port.front();
+		DT(3, this->name() << "-" << lsu_rsp);
+		auto& entry = state.pending_rd_reqs.at(lsu_rsp.tag);
 		auto trace = entry.trace;
-		DT(3, "mem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu_type << ", rid=" << r << ", " << *trace);
-		assert(entry.count);
-		--entry.count; // track remaining addresses
-		if (0 == entry.count) {
+		assert(!entry.mask.none());
+		entry.mask &= ~lsu_rsp.mask; // track remaining
+		if (entry.mask.none()) {
+			// whole response received, release trace
 			int iw = trace->wid % ISSUE_WIDTH;
 			Outputs.at(iw).push(trace, 1);
-			state.pending_rd_reqs.release(mem_rsp.tag);
+			state.pending_rd_reqs.release(lsu_rsp.tag);
 		}
-		dcache_rsp_port.pop();
-		--pending_loads_;
+		pending_loads_ -= lsu_rsp.mask.count();
+		lsu_rsp_port.pop();
 	}

 	// handle LSU requests
@ -145,7 +145,7 @@ void LsuUnit::tick() {
 				continue;
 			Outputs.at(iw).push(state.fence_trace, 1);
 			state.fence_lock = false;
-			DT(3, "fence-unlock: " << state.fence_trace);
+			DT(3, this->name() << "-fence-unlock: " << state.fence_trace);
 		}

 		// check input queue
@ -153,14 +153,13 @@ void LsuUnit::tick() {
 		if (input.empty())
 			continue;

-		auto& output = Outputs.at(iw);
 		auto trace = input.front();

 		if (trace->lsu_type == LsuType::FENCE) {
 			// schedule fence lock
 			state.fence_trace = trace;
 			state.fence_lock = true;
-			DT(3, "fence-lock: " << *trace);
+			DT(3, this->name() << "-fence-lock: " << *trace);
 			// remove input
 			input.pop();
 			continue;
@ -178,21 +177,43 @@ void LsuUnit::tick() {
 			trace->log_once(false);
 		}

+		// build memory request
+		LsuReq lsu_req(NUM_LSU_LANES);
+		lsu_req.write = is_write;
+		{
+			auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
+			auto t0 = trace->pid * NUM_LSU_LANES;
+			for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
+				if (trace->tmask.test(t0 + i)) {
+					lsu_req.mask.set(i);
+					lsu_req.addrs.at(i) = trace_data->mem_addrs.at(t0 + i).addr;
+				}
+			}
+		}
 		uint32_t tag = 0;
 		if (!is_write) {
-			tag = state.pending_rd_reqs.allocate({trace, 0});
+			tag = state.pending_rd_reqs.allocate({trace, lsu_req.mask});
 		}
+		lsu_req.tag  = tag;
+		lsu_req.cid  = trace->cid;
+		lsu_req.uuid = trace->uuid;

 		// send memory request
-		auto num_reqs = this->send_requests(trace, block_idx, tag);
+		core_->lsu_demux_.at(block_idx)->ReqIn.push(lsu_req);
+		DT(3, this->name() << "-" << lsu_req);

-		if (!is_write) {
-			state.pending_rd_reqs.at(tag).count = num_reqs;
+		// update stats
+		auto num_addrs = lsu_req.mask.count();
+		if (is_write) {
+			core_->perf_stats_.stores += num_addrs;
+		} else {
+			core_->perf_stats_.loads += num_addrs;
+			pending_loads_ += num_addrs;
 		}

 		// do not wait on writes
 		if (is_write) {
-			output.push(trace, 1);
+			Outputs.at(iw).push(trace, 1);
 		}

 		// remove input
@ -200,52 +221,10 @@ void LsuUnit::tick() {
 	}
 }

-int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
-	int count = 0;
-
-	auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
-	bool is_write = (trace->lsu_type == LsuType::STORE);
-	auto t0 = trace->pid * NUM_LSU_LANES;
-
-	for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
-		uint32_t t = t0 + i;
-		if (!trace->tmask.test(t))
-			continue;
-
-		int req_idx = block_idx * LSU_CHANNELS + (i % LSU_CHANNELS);
-		auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn;
-
-		auto mem_addr = trace_data->mem_addrs.at(t);
-		auto type = get_addr_type(mem_addr.addr);
-
-		MemReq mem_req;
-		mem_req.addr  = mem_addr.addr;
-		mem_req.write = is_write;
-		mem_req.type  = type;
-		mem_req.tag   = tag;
-		mem_req.cid   = trace->cid;
-		mem_req.uuid  = trace->uuid;
-
-		dcache_req_port.push(mem_req, 1);
-		DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
-			<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
-
-		if (is_write) {
-			++core_->perf_stats_.stores;
-		} else {
-			++core_->perf_stats_.loads;
-			++pending_loads_;
-		}
-
-		++count;
-	}
-	return count;
-}
-
 ///////////////////////////////////////////////////////////////////////////////

 SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
-	: FuncUnit(ctx, core, "SFU")
+	: FuncUnit(ctx, core, "sfu-unit")
 {}

 void SfuUnit::tick() {
@ -287,7 +266,7 @@ void SfuUnit::tick() {
 			std::abort();
 		}

-		DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
+		DT(3, this->name() << ": op=" << trace->sfu_type << ", " << *trace);
 		if (trace->eop && release_warp)  {
 			core_->resume(trace->wid);
 		}
--- a/sim/simx/func_unit.h
+++ b/sim/simx/func_unit.h
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -26,13 +26,13 @@ public:
 	std::vector<SimPort<instr_trace_t*>> Inputs;
 	std::vector<SimPort<instr_trace_t*>> Outputs;

-	FuncUnit(const SimContext& ctx, Core* core, const char* name) 
-		: SimObject<FuncUnit>(ctx, name) 
+	FuncUnit(const SimContext& ctx, Core* core, const char* name)
+		: SimObject<FuncUnit>(ctx, name)
 		, Inputs(ISSUE_WIDTH, this)
 		, Outputs(ISSUE_WIDTH, this)
 		, core_(core)
 	{}
-	
+
 	virtual ~FuncUnit() {}

 	virtual void reset() {}
@ -73,28 +73,26 @@ public:

 private:

-	int send_requests(instr_trace_t* trace, int block_idx, int tag);
-
-	struct pending_req_t {
+ 	struct pending_req_t {
 		instr_trace_t* trace;
-		uint32_t count;
+		BitVector<> mask;
 	};

-	struct lsu_state_t {		
+	struct lsu_state_t {
 		HashTable<pending_req_t> pending_rd_reqs;
-		instr_trace_t* fence_trace;	
+		instr_trace_t* fence_trace;
 		bool fence_lock;

 		lsu_state_t() : pending_rd_reqs(LSUQ_IN_SIZE) {}
-		
+
 		void clear() {
 			this->pending_rd_reqs.clear();
 			this->fence_trace = nullptr;
 			this->fence_lock = false;
 		}
 	};
-	
-	std::array<lsu_state_t, NUM_LSU_BLOCKS> states_;	
+
+	std::array<lsu_state_t, NUM_LSU_BLOCKS> states_;
 	uint64_t pending_loads_;
 };

@ -103,7 +101,7 @@ private:
 class SfuUnit : public FuncUnit {
 public:
 	SfuUnit(const SimContext& ctx, Core*);
-	
+
 	void tick();
 };

--- a/sim/simx/local_mem.cpp
+++ b/sim/simx/local_mem.cpp
@ -82,11 +82,13 @@ public:
 				continue;
 			}

+			DT(4, simobject_->name() << "-" << core_req);
+
 			in_used_banks.at(bank_id) = true;

 			if (!core_req.write || config_.write_reponse) {
 				// send response
-				MemRsp core_rsp{core_req.tag, core_req.cid};
+				MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid};
 				simobject_->Outputs.at(req_id).push(core_rsp, 1);
 			}

--- a/sim/simx/main.cpp
+++ b/sim/simx/main.cpp
@ -112,7 +112,9 @@ int main(int argc, char **argv) {
        return -1;
      }
    }
-
+#ifndef NDEBUG
+    std::cout << "[VXDRV] START: program=" << program << std::endl;
+#endif
    // run simulation
    processor.run();

--- a/sim/simx/mem_coalescer.cpp
+++ b/sim/simx/mem_coalescer.cpp
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,100 +16,141 @@
 using namespace vortex;

 MemCoalescer::MemCoalescer(
-  const SimContext& ctx, 
-  const char* name, 
+  const SimContext& ctx,
+  const char* name,
  uint32_t input_size,
  uint32_t output_size,
  uint32_t line_size,
  uint32_t queue_size,
  uint32_t delay
-) : SimObject<MemCoalescer>(ctx, name)    
-  , ReqIn(input_size, this)
-  , RspIn(input_size, this)
-  , ReqOut(output_size, this)
-  , RspOut(output_size, this)
+) : SimObject<MemCoalescer>(ctx, name)
+  , ReqIn(this)
+  , RspIn(this)
+  , ReqOut(this)
+  , RspOut(this)
+  , input_size_(input_size)
+  , output_size_(output_size)
+  , output_ratio_(input_size / output_size)
  , pending_rd_reqs_(queue_size)
+  , sent_mask_(input_size)
  , line_size_(line_size)
  , delay_(delay)
 {}

 void MemCoalescer::reset() {
-  last_index_ =  0;
  sent_mask_.reset();
 }

-void MemCoalescer::tick() {    
-  uint32_t I = ReqIn.size();
-  uint32_t O = ReqOut.size();
-
+void MemCoalescer::tick() {
  // process incoming responses
-  for (uint32_t o = 0; o < O; ++o) {
-    if (RspOut.at(o).empty())
-      continue;
-    auto& mem_rsp = RspOut.at(o).front();
-    DT(3, this->name() << "-" << mem_rsp);
-    auto& entry = pending_rd_reqs_.at(mem_rsp.tag);
-    for (uint32_t i = 0; i < I; ++i) {
-      if (entry.mask.test(i)) {
-        MemRsp rsp(mem_rsp);
-        rsp.tag = entry.tag;
-        RspIn.at(i).push(rsp, 1);
+  if (!RspOut.empty()) {
+    auto& out_rsp = RspOut.front();
+    DT(4, this->name() << "-" << out_rsp);
+    auto& entry = pending_rd_reqs_.at(out_rsp.tag);
+
+    BitVector<> rsp_mask(input_size_);
+    for (uint32_t o = 0; o < output_size_; ++o) {
+      if (!out_rsp.mask.test(o))
+        continue;
+      for (uint32_t r = 0; r < output_ratio_; ++r) {
+        uint32_t i = o * output_ratio_ + r;
+        if (entry.mask.test(i))
+          rsp_mask.set(i);
      }
    }
-    pending_rd_reqs_.release(mem_rsp.tag);
-    RspOut.at(o).pop();
+
+    // build memory response
+    LsuRsp in_rsp(input_size_);
+    in_rsp.mask = rsp_mask;
+    in_rsp.tag = entry.tag;
+    in_rsp.cid = out_rsp.cid;
+    in_rsp.uuid = out_rsp.uuid;
+
+    // send memory response
+    RspIn.push(in_rsp, 1);
+
+    // track remaining responses
+    assert(!entry.mask.none());
+		entry.mask &= ~rsp_mask;
+		if (entry.mask.none()) {
+      // whole response received, release tag
+			pending_rd_reqs_.release(out_rsp.tag);
+		}
+    RspOut.pop();
  }

  // process incoming requests
-  uint64_t addr_mask = ~uint64_t(line_size_-1);
-  bool completed = true;
-  for (uint32_t i = last_index_; i < I; ++i) {
-    if (sent_mask_.test(i) || ReqIn.at(i).empty())
-      continue;
+  if (ReqIn.empty())
+    return;

-    auto& seed = ReqIn.at(i).front();
+  auto& in_req = ReqIn.front();
+  assert(in_req.mask.size() == input_size_);
+  assert(!in_req.mask.none());

-    // ensure we can allocate a response tag      
-    if (!seed.write && pending_rd_reqs_.full()) {
-      DT(4, "*** " << this->name() << "-queue-full: " << seed);
-      last_index_ = i;
-      completed = false;
-      break;
-    }
-
-    std::bitset<64> mask(0);      
-    mask.set(i);      
-
-    // coalesce matching requests      
-    uint64_t seed_addr = seed.addr & addr_mask;
-    for (uint32_t j = i + 1; j < I; ++j) {
-      if (sent_mask_.test(j) || ReqIn.at(j).empty())
-        continue;
-      auto& match = ReqIn.at(j).front();
-      uint64_t match_addr = match.addr & addr_mask;
-      if (match_addr == seed_addr) {
-        mask.set(j);
-        ReqIn.at(j).pop();   
-      }
-    }
-
-    uint32_t tag = 0;
-    if (!seed.write) {
-      tag = pending_rd_reqs_.allocate(pending_req_t{seed.tag, mask});
-    }
-
-    MemReq mem_req{seed};
-    mem_req.tag = tag;      
-    DT(3, this->name() << "-" << mem_req << ", coalesced=" << mask.count());        
-    uint32_t c = i % O;
-    ReqOut.at(c).push(mem_req, delay_);
-    ReqIn.at(i).pop();
-
-    sent_mask_ |= mask;     
+  // ensure we can allocate a response tag
+  if (pending_rd_reqs_.full()) {
+    DT(4, "*** " << this->name() << "-queue-full: " << in_req);
+    return;
  }

-  if (completed) {
-    last_index_ = 0;
+  uint64_t addr_mask = ~uint64_t(line_size_-1);
+
+  BitVector<> out_mask(output_size_);
+  std::vector<uint64_t> out_addrs(output_size_);
+
+  BitVector<> cur_mask(input_size_);
+
+  for (uint32_t o = 0; o < output_size_; ++o) {
+    for (uint32_t r = 0; r < output_ratio_; ++r) {
+      uint32_t i = o * output_ratio_ + r;
+      if (sent_mask_.test(i) || !in_req.mask.test(i))
+        continue;
+
+      uint64_t seed_addr = in_req.addrs.at(i) & addr_mask;
+      cur_mask.set(i);
+
+      // coalesce matching requests
+      for (uint32_t s = r + 1; s < output_ratio_; ++s) {
+        uint32_t j = o * output_ratio_ + s;
+        if (sent_mask_.test(j) || !in_req.mask.test(j))
+          continue;
+        uint64_t match_addr = in_req.addrs.at(j) & addr_mask;
+        if (match_addr == seed_addr) {
+          cur_mask.set(j);
+        }
+      }
+
+      out_mask.set(o);
+      out_addrs.at(o) = seed_addr;
+      break;
+    }
+  }
+
+  assert(!out_mask.none());
+
+  uint32_t tag = 0;
+  if (!in_req.write) {
+    // allocate a response tag for read requests
+    tag = pending_rd_reqs_.allocate(pending_req_t{in_req.tag, cur_mask});
+  }
+
+  // build memory request
+  LsuReq out_req{output_size_};
+  out_req.mask = out_mask;
+  out_req.tag = tag;
+  out_req.write = in_req.write;
+  out_req.addrs = out_addrs;
+  out_req.cid = in_req.cid;
+  out_req.uuid = in_req.uuid;
+
+  // send memory request
+  ReqOut.push(out_req, delay_);
+  DT(4, this->name() << "-" << out_req << ", coalesced=" << cur_mask.count());
+
+  // update sent mask
+  sent_mask_ |= cur_mask;
+  if (sent_mask_ == in_req.mask) {
+    ReqIn.pop();
    sent_mask_.reset();
  }
 }
--- a/sim/simx/mem_coalescer.h
+++ b/sim/simx/mem_coalescer.h
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,15 +17,15 @@ namespace vortex {

 class MemCoalescer : public SimObject<MemCoalescer> {
 public:
-  std::vector<SimPort<MemReq>> ReqIn;
-  std::vector<SimPort<MemRsp>> RspIn;
+  SimPort<LsuReq> ReqIn;
+  SimPort<LsuRsp> RspIn;

-  std::vector<SimPort<MemReq>> ReqOut;
-  std::vector<SimPort<MemRsp>> RspOut;
+  SimPort<LsuReq> ReqOut;
+  SimPort<LsuRsp> RspOut;

  MemCoalescer(
-    const SimContext& ctx, 
-    const char* name, 
+    const SimContext& ctx,
+    const char* name,
    uint32_t input_size,
    uint32_t output_size,
    uint32_t line_size,
@ -41,14 +41,17 @@ private:

  struct pending_req_t {
    uint32_t tag;
-    std::bitset<64> mask;
+    BitVector<> mask;
  };

+  uint32_t input_size_;
+  uint32_t output_size_;
+  uint32_t output_ratio_;
+
  HashTable<pending_req_t> pending_rd_reqs_;
+  BitVector<> sent_mask_;
  uint32_t line_size_;
  uint32_t delay_;
-  uint32_t last_index_;
-  std::bitset<64> sent_mask_;
 };

 }
--- a/sim/simx/types.cpp
+++ b/sim/simx/types.cpp
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,14 +16,14 @@
 using namespace vortex;

 LocalMemDemux::LocalMemDemux(
-  const SimContext& ctx, 
-  const char* name, 
+  const SimContext& ctx,
+  const char* name,
  uint32_t delay
-) : SimObject<LocalMemDemux>(ctx, name)    
+) : SimObject<LocalMemDemux>(ctx, name)
  , ReqIn(this)
  , RspIn(this)
-  , ReqSM(this)
-  , RspSM(this)
+  , ReqLmem(this)
+  , RspLmem(this)
  , ReqDC(this)
  , RspDC(this)
  , delay_(delay)
@ -31,30 +31,133 @@ LocalMemDemux::LocalMemDemux(

 void LocalMemDemux::reset() {}

-void LocalMemDemux::tick() {      
+void LocalMemDemux::tick() {
  // process incoming responses
-  if (!RspSM.empty()) {
-    auto& rsp = RspSM.front();
-    DT(4, this->name() << "-" << rsp);
-    RspIn.push(rsp, 1);
-    RspSM.pop();
+  if (!RspLmem.empty()) {
+    auto& out_rsp = RspLmem.front();
+    DT(4, this->name() << "-" << out_rsp);
+    RspIn.push(out_rsp, 1);
+    RspLmem.pop();
  }
  if (!RspDC.empty()) {
-    auto& rsp = RspDC.front();
-    DT(4, this->name() << "-" << rsp);
-    RspIn.push(rsp, 1);
-    RspDC
-    .pop();
+    auto& out_rsp = RspDC.front();
+    DT(4, this->name() << "-" << out_rsp);
+    RspIn.push(out_rsp, 1);
+    RspDC.pop();
  }
-  // process incoming requests  
+
+  // process incoming requests
  if (!ReqIn.empty()) {
-    auto& req = ReqIn.front();
-    DT(4, this->name() << "-" << req);
-    if (req.type == AddrType::Shared) {
-      ReqSM.push(req, delay_);
-    } else {
-      ReqDC.push(req, delay_);
+    auto& in_req = ReqIn.front();
+
+    LsuReq out_dc_req(in_req.mask.size());
+    out_dc_req.write = in_req.write;
+    out_dc_req.tag   = in_req.tag;
+    out_dc_req.cid   = in_req.cid;
+    out_dc_req.uuid  = in_req.uuid;
+
+    LsuReq out_lmem_req(out_dc_req);
+
+    for (uint32_t i = 0; i < in_req.mask.size(); ++i) {
+      if (in_req.mask.test(i)) {
+        auto type = get_addr_type(in_req.addrs.at(i));
+        if (type == AddrType::Shared) {
+          out_lmem_req.mask.set(i);
+          out_lmem_req.addrs.at(i) = in_req.addrs.at(i);
+        } else {
+          out_dc_req.mask.set(i);
+          out_dc_req.addrs.at(i) = in_req.addrs.at(i);
+        }
+      }
+    }
+
+    if (!out_dc_req.mask.none()) {
+      ReqDC.push(out_dc_req, delay_);
+      DT(4, this->name() << "-" << out_dc_req);
+    }
+
+    if (!out_lmem_req.mask.none()) {
+      ReqLmem.push(out_lmem_req, delay_);
+      DT(4, this->name() << "-" << out_lmem_req);
    }
    ReqIn.pop();
-  }   
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+LsuMemAdapter::LsuMemAdapter( // builds the adapter: one LsuReq/LsuRsp port pair plus num_inputs MemReq/MemRsp port pairs
+  const SimContext& ctx,
+  const char* name,
+  uint32_t num_inputs, // number of ports created in ReqOut and in RspOut
+  uint32_t delay // saved in delay_; tick() passes it to every ReqOut push
+) : SimObject<LsuMemAdapter>(ctx, name)
+  , ReqIn(this)
+  , RspIn(this)
+  , ReqOut(num_inputs, this)
+  , RspOut(num_inputs, this)
+  , delay_(delay)
+{}
+
+void LsuMemAdapter::reset() {} // intentionally empty: nothing is reset here
+
+void LsuMemAdapter::tick() {
+  const uint32_t num_lanes = ReqOut.size();
+
+  // Response path: find the lowest-numbered port with a response waiting.
+  uint32_t first = 0;
+  while (first < num_lanes && RspOut.at(first).empty()) {
+    ++first;
+  }
+
+  // At most one merged LsuRsp is produced per tick.
+  if (first < num_lanes) {
+    auto& lead_rsp = RspOut.at(first).front();
+    DT(4, this->name() << "-" << lead_rsp);
+
+    // tag/cid/uuid of the merged response come from the leading response
+    LsuRsp merged(num_lanes);
+    merged.mask.set(first);
+    merged.tag  = lead_rsp.tag;
+    merged.cid  = lead_rsp.cid;
+    merged.uuid = lead_rsp.uuid;
+
+    // fold in (and consume) responses on higher ports carrying the same tag
+    for (uint32_t k = first + 1; k < num_lanes; ++k) {
+      auto& port = RspOut.at(k);
+      if (!port.empty() && port.front().tag == lead_rsp.tag) {
+        merged.mask.set(k);
+        port.pop();
+      }
+    }
+
+    RspIn.push(merged, 1);
+    RspOut.at(first).pop();
+  }
+
+  // Request path: nothing to do without a pending LSU request.
+  if (ReqIn.empty()) {
+    return;
+  }
+
+  auto& lsu_req = ReqIn.front();
+  assert(lsu_req.mask.size() == num_lanes);
+
+  // fan out: every set mask bit becomes one MemReq on the same-index port
+  for (uint32_t lane = 0; lane < num_lanes; ++lane) {
+    if (!lsu_req.mask.test(lane)) {
+      continue;
+    }
+    const auto addr = lsu_req.addrs.at(lane);
+    MemReq mem_req;
+    mem_req.write = lsu_req.write;
+    mem_req.addr  = addr;
+    mem_req.type  = get_addr_type(addr);
+    mem_req.tag   = lsu_req.tag;
+    mem_req.cid   = lsu_req.cid;
+    mem_req.uuid  = lsu_req.uuid;
+
+    ReqOut.at(lane).push(mem_req, delay_);
+    DT(4, this->name() << "-" << mem_req);
+  }
+  ReqIn.pop();
+}
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@ -23,6 +23,7 @@
 #include <VX_config.h>
 #include <VX_types.h>
 #include <simobject.h>
+#include <bitvector.h>
 #include "debug.h"

 namespace vortex {
@ -238,6 +239,62 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
  default: assert(false);
  }
  return os;
+}///////////////////////////////////////////////////////////////////////////////
+
+// Multi-lane LSU memory request: bit i of `mask` marks `addrs[i]` as valid.
+struct LsuReq {
+  BitVector<> mask;
+  std::vector<uint64_t> addrs;
+  bool     write = false; // true for a store (printed as "rw=" when traced)
+  uint32_t tag   = 0;
+  uint32_t cid   = 0;
+  uint64_t uuid  = 0;
+
+  // Builds an empty request with `size` entries: the mask is constructed
+  // with `size`, every address slot is zero and the scalar fields keep
+  // their zero/false defaults.
+  LsuReq(uint32_t size)
+    : mask(size)
+    , addrs(size, 0)
+  {}
+};
+
+// Trace helper: writes rw flag, mask, one "addrN=" field per mask bit ("-" when the bit is clear), then tag, cid and uuid.
+inline std::ostream &operator<<(std::ostream &os, const LsuReq& req) {
+  os << "lsu-req: rw=" << req.write << ", mask=" << req.mask << ", ";
+  for (size_t i = 0; i < req.mask.size(); ++i) {
+    // std::hex is sticky: restore decimal so the entry index never prints in hex
+    os << "addr" << std::dec << i << "=";
+    if (req.mask.test(i)) {
+      os << "0x" << std::hex << req.addrs.at(i) << ", ";
+    } else {
+      os << "-, ";
+    }
+  }
+  os << std::dec << "tag=" << req.tag << ", cid=" << req.cid << " (#" << req.uuid << ")";
+  return os;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+// Multi-lane LSU memory response: `mask` marks the entries being answered.
+struct LsuRsp {
+  BitVector<> mask;
+  uint64_t tag  = 0;
+  uint32_t cid  = 0;
+  uint64_t uuid = 0;
+
+  // Builds a response whose mask is constructed with `size`; tag, cid and
+  // uuid start at zero.
+  LsuRsp(uint32_t size)
+    : mask(size)
+  {}
+};
+
+// Trace helper: writes the response mask, tag, cid and (in decimal) uuid.
+inline std::ostream &operator<<(std::ostream &os, const LsuRsp& rsp) {
+  return os << "lsu-rsp: mask=" << rsp.mask << ", tag=" << rsp.tag
+            << ", cid=" << rsp.cid << " (#" << std::dec << rsp.uuid << ")";
 }

 ///////////////////////////////////////////////////////////////////////////////
@ -266,7 +323,7 @@ struct MemReq {
 };

 inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
-  os << "mem-" << (req.write ? "wr" : "rd") << ": ";
+  os << "mem-req: rw=" << req.write << ", ";
  os << "addr=0x" << std::hex << req.addr << ", type=" << req.type;
  os << std::dec << ", tag=" << req.tag << ", cid=" << req.cid;
  os << " (#" << std::dec << req.uuid << ")";
@ -427,7 +484,6 @@ public:
        auto& req_in = Inputs.at(j);
        if (!req_in.empty()) {
          auto& req = req_in.front();
-          DT(4, this->name() << "-" << req);
          Outputs.at(o).push(req, delay_);
          req_in.pop();
          this->update_cursor(o, i);
@ -566,14 +622,14 @@ using MemSwitch = Switch<MemReq, MemRsp>;

 class LocalMemDemux : public SimObject<LocalMemDemux> {
 public:
-  SimPort<MemReq> ReqIn;
-  SimPort<MemRsp> RspIn;
+  SimPort<LsuReq> ReqIn;
+  SimPort<LsuRsp> RspIn;

-  SimPort<MemReq> ReqSM;
-  SimPort<MemRsp> RspSM;
+  SimPort<LsuReq> ReqLmem;
+  SimPort<LsuRsp> RspLmem;

-  SimPort<MemReq> ReqDC;
-  SimPort<MemRsp> RspDC;
+  SimPort<LsuReq> ReqDC;
+  SimPort<LsuRsp> RspDC;

  LocalMemDemux(
    const SimContext& ctx,
@ -589,4 +645,29 @@ private:
  uint32_t delay_;
 };

+///////////////////////////////////////////////////////////////////////////////
+
+class LsuMemAdapter : public SimObject<LsuMemAdapter> { // adapts one LsuReq/LsuRsp port pair to a vector of MemReq/MemRsp port pairs
+public:
+  SimPort<LsuReq> ReqIn; // multi-lane request input; tick() splits it into one MemReq per set mask bit
+  SimPort<LsuRsp> RspIn; // multi-lane response port; tick() pushes merged responses here
+
+  std::vector<SimPort<MemReq>> ReqOut; // num_inputs request ports; entry i of a request goes to port i
+  std::vector<SimPort<MemRsp>> RspOut; // num_inputs response ports polled by tick()
+
+  LsuMemAdapter(
+    const SimContext& ctx,
+    const char* name,
+    uint32_t num_inputs, // number of ports in ReqOut and in RspOut
+    uint32_t delay // stored in delay_
+  );
+
+  void reset();
+
+  void tick();
+
+private:
+  uint32_t delay_; // delay argument used for every ReqOut push
+};
+
 }
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@ -65,27 +65,6 @@ run-rtlsim:
 	$(MAKE) -C sgemm3 run-rtlsim
 	$(MAKE) -C psum run-rtlsim

-run-opae:
-	$(MAKE) -C vecadd run-opae
-	$(MAKE) -C sgemm run-opae
-	$(MAKE) -C conv3 run-opae
-	$(MAKE) -C psort run-opae
-	$(MAKE) -C saxpy run-opae
-	$(MAKE) -C sfilter run-opae
-	$(MAKE) -C oclprintf run-opae
-	$(MAKE) -C dotproduct run-opae
-	$(MAKE) -C transpose run-opae
-	$(MAKE) -C spmv run-opae
-	$(MAKE) -C stencil run-opae
-	$(MAKE) -C nearn run-opae
-	$(MAKE) -C guassian run-opae
-	$(MAKE) -C kmeans run-opae
-	$(MAKE) -C blackscholes run-opae
-	$(MAKE) -C bfs run-opae
-	$(MAKE) -C sgemm2 run-opae
-	$(MAKE) -C sgemm3 run-opae
-	$(MAKE) -C psum run-opae
-
 clean:
 	$(MAKE) -C vecadd clean
 	$(MAKE) -C sgemm clean
--- a/tests/opencl/lbm/main.cc
+++ b/tests/opencl/lbm/main.cc
@ -46,7 +46,7 @@ static float* read_output_file(const char* filename, int size) {
        return NULL;
    }
    // Read the float data
-    if (fread(floats, sizeof(float), size, file) != size) {
+    if (fread(floats, sizeof(float), size, file) != (size_t)size) {
        fclose(file);
        free(floats);
        perror("Error reading floats from file");
@ -128,6 +128,7 @@ int main(int nArgs, char *arg[]) {
  MAIN_initialize(&param, &prm);

  for (t = 1; t <= param.nTimeSteps; t++) {
+    
    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
    OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
@ -198,9 +199,9 @@ void MAIN_printInfo(const MAIN_Param *param) {
         "\tsimulation type: %s\n"
         "\tobstacle file  : %s\n\n",
         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
-         param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
-         (param->obstacleFilename == NULL) ? "<none>"
-                                           : param->obstacleFilename);
+         param->nTimeSteps, ((param->resultFilename == NULL) ? "<none>" : param->resultFilename), "store", "lid-driven cavity",
+         ((param->obstacleFilename == NULL) ? "<none>" : param->obstacleFilename)
+  );
 }

 /*############################################################################*/
@ -316,7 +317,7 @@ void OpenCL_initialize(struct pb_Parameters *p, OpenCL_Param *prm) {
  // read kernel binary from file
  uint8_t *kernel_bin = NULL;
  size_t kernel_size;
-  cl_int binary_status = 0;
+  //cl_int binary_status = 0;

  clStatus = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
  CHECK_ERROR("read_kernel_file")
--- a/tests/regression/Makefile
+++ b/tests/regression/Makefile
@ -49,22 +49,6 @@ run-rtlsim:
 	$(MAKE) -C sgemm2x run-rtlsim
 	$(MAKE) -C stencil3d run-rtlsim

-run-opae:
-	$(MAKE) -C basic run-opae
-	$(MAKE) -C demo run-opae
-	$(MAKE) -C dogfood run-opae
-	$(MAKE) -C mstress run-opae
-	$(MAKE) -C io_addr run-opae
-	$(MAKE) -C printf run-opae
-	$(MAKE) -C diverge run-opae
-	$(MAKE) -C sort run-opae
-	$(MAKE) -C fence run-opae
-	$(MAKE) -C vecaddx run-opae
-	$(MAKE) -C sgemmx run-opae
-	$(MAKE) -C conv3x run-opae
-	$(MAKE) -C sgemm2x run-opae
-	$(MAKE) -C stencil3d run-opae
-
 clean:
 	$(MAKE) -C basic clean
 	$(MAKE) -C demo clean