global barrier fixes

2025-04-22 21:09:15 -04:00 · 2023-06-21 02:23:50 -04:00 · 2023-06-21 02:23:50 -04:00 · 24471d11c1
commit 24471d11c1
parent ececadd77b
6 changed files with 33 additions and 39 deletions
--- a/hw/rtl/core/VX_warp_sched.sv
+++ b/hw/rtl/core/VX_warp_sched.sv
@ -215,7 +215,7 @@ module VX_warp_sched #(
    assign gbar_if.req_valid   = gbar_req_valid;
    assign gbar_if.req_id      = gbar_req_id;
    assign gbar_if.req_size_m1 = gbar_req_size_m1;
-    assign gbar_if.req_core_id = NC_WIDTH'(CORE_ID);
+    assign gbar_if.req_core_id = NC_WIDTH'(CORE_ID % `NUM_CORES);

    // split/join stack management    

--- a/sim/simx/cluster.cpp
+++ b/sim/simx/cluster.cpp
@ -12,7 +12,8 @@ Cluster::Cluster(const SimContext& ctx,
  , mem_req_port(this)
  , mem_rsp_port(this)
  , cluster_id_(cluster_id)
-  , cores_(num_cores)
+  , cores_(num_cores)  
+  , barriers_(arch.num_barriers(), 0)
  , raster_units_(NUM_RASTER_UNITS)
  , rop_units_(NUM_ROP_UNITS)
  , tex_units_(NUM_TEX_UNITS)
@ -20,7 +21,6 @@ Cluster::Cluster(const SimContext& ctx,
  , processor_(processor)
 {
  char sname[100];
-
  snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
    !L2_ENABLED,
@ -257,8 +257,10 @@ Cluster::~Cluster() {
  //--
 }

-void Cluster::reset() {
-  //--
+void Cluster::reset() {  // resets every entry of barriers_ (one CoreMask per barrier id)
+  for (auto& barrier : barriers_) {
+    barrier.reset(); // presumably clears all recorded core arrivals - confirm against CoreMask
+  }
 }

 void Cluster::tick() {
@ -303,6 +305,26 @@ bool Cluster::check_exit(Word* exitcode, int reg) const {
  return done;
 }

+// Cluster-level barrier: records that core `core_id` has reached barrier
+// `bar_id` and, once `count` cores of this cluster have been recorded,
+// resumes every recorded core and resets the barrier so it can be reused.
+//
+//   bar_id  - index into barriers_; barriers_.at() throws std::out_of_range
+//             when it is not a valid barrier index
+//   count   - number of recorded cores required to release the barrier
+//   core_id - id of the arriving core; reduced modulo cores_.size() to obtain
+//             its slot within this cluster
+void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
+  auto& barrier = barriers_.at(bar_id);
+
+  // The mask is indexed by cluster-local slot, not by the incoming core id.
+  const uint32_t local_core_id = core_id % cores_.size();
+  barrier.set(local_core_id);
+
+  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
+
+  if (barrier.count() != static_cast<size_t>(count)) {
+    return; // not every expected core has arrived yet
+  }
+
+  // Offset that maps a local slot back into the id space used by `core_id`,
+  // so the resume trace names cores the same way the suspend trace does.
+  const uint32_t core_id_base = core_id - local_core_id;
+
+  // resume all suspended cores
+  for (uint32_t i = 0; i < cores_.size(); ++i) {
+    if (barrier.test(i)) {
+      DP(3, "*** Resume core #" << (core_id_base + i) << " at barrier #" << bar_id);
+      cores_.at(i)->resume();
+    }
+  }
+  barrier.reset();
+}
+
 ProcessorImpl* Cluster::processor() const {
  return processor_;
 }
--- a/sim/simx/cluster.h
+++ b/sim/simx/cluster.h
@ -64,19 +64,18 @@ public:

  bool running() const;

-  bool check_exit(Word* exitcode, int reg) const;
+  bool check_exit(Word* exitcode, int reg) const;  
+
+  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); // cluster-scope barrier: records core_id at bar_id, resumes the recorded cores once count of them have arrived

  ProcessorImpl* processor() const;

  Cluster::PerfStats perf_stats() const;
-
-  auto& core(uint32_t index) {
-    return cores_.at(index);
-  }
  
 private:
  uint32_t                     cluster_id_;  
-  std::vector<Core::Ptr>       cores_;
+  std::vector<Core::Ptr>       cores_;  
+  std::vector<CoreMask>        barriers_;
  std::vector<RasterUnit::Ptr> raster_units_;
  std::vector<RopUnit::Ptr>    rop_units_;
  std::vector<TexUnit::Ptr>    tex_units_;
--- a/sim/simx/core.cpp
+++ b/sim/simx/core.cpp
@ -326,7 +326,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
  if (is_global) {
    // global barrier handling
    if (barrier.count() == active_warps_.count()) {
-      cluster_->processor()->barrier(bar_idx, count, core_id_);
+      cluster_->barrier(bar_idx, count, core_id_);
      barrier.reset();
    }    
  } else {
--- a/sim/simx/processor.cpp
+++ b/sim/simx/processor.cpp
@ -6,7 +6,6 @@ using namespace vortex;
 ProcessorImpl::ProcessorImpl(const Arch& arch) 
  : arch_(arch)
  , clusters_(NUM_CLUSTERS)
-  , barriers_(arch.num_barriers(), 0)
 {
  SimPlatform::instance().initialize();

@ -100,35 +99,12 @@ int ProcessorImpl::run() {
 }
 
 void ProcessorImpl::reset() {
-  for (auto& barrier : barriers_) {
-    barrier.reset();
-  }
  perf_mem_reads_ = 0;
  perf_mem_writes_ = 0;
  perf_mem_latency_ = 0;
  perf_mem_pending_reads_ = 0;
 }

-void ProcessorImpl::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
-  auto& barrier = barriers_.at(bar_id);
-  barrier.set(core_id);
-  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
-
-  if (barrier.count() == (size_t)count) {
-      // resume suspended cores
-      uint32_t cores_per_cluster = arch_.num_cores() / NUM_CLUSTERS;
-      for (uint32_t i = 0; i < arch_.num_cores(); ++i) {
-        if (barrier.test(i)) {
-          DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
-          uint32_t core_idx = i % cores_per_cluster;
-          uint32_t cluster_idx = i / cores_per_cluster;
-          clusters_.at(cluster_idx)->core(core_idx)->resume();
-        }
-      }
-      barrier.reset();
-    }
-}
-
 void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
  dcrs_.write(addr, value);
 }
--- a/sim/simx/processor_impl.h
+++ b/sim/simx/processor_impl.h
@ -31,8 +31,6 @@ public:

  int run();

-  void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
-
  void write_dcr(uint32_t addr, uint32_t value);

  ProcessorImpl::PerfStats perf_stats() const;
@ -46,7 +44,6 @@ private:
  DCRS dcrs_;
  MemSim::Ptr   memsim_;
  CacheSim::Ptr l3cache_;
-  std::vector<CoreMask> barriers_;
  uint64_t perf_mem_reads_;
  uint64_t perf_mem_writes_;
  uint64_t perf_mem_latency_;