global barrier fixes

This commit is contained in:
Blaise Tine 2023-06-21 02:23:50 -04:00
parent ececadd77b
commit 24471d11c1
6 changed files with 33 additions and 39 deletions

View file

@ -215,7 +215,7 @@ module VX_warp_sched #(
assign gbar_if.req_valid = gbar_req_valid;
assign gbar_if.req_id = gbar_req_id;
assign gbar_if.req_size_m1 = gbar_req_size_m1;
assign gbar_if.req_core_id = NC_WIDTH'(CORE_ID);
assign gbar_if.req_core_id = NC_WIDTH'(CORE_ID % `NUM_CORES);
// split/join stack management

View file

@ -12,7 +12,8 @@ Cluster::Cluster(const SimContext& ctx,
, mem_req_port(this)
, mem_rsp_port(this)
, cluster_id_(cluster_id)
, cores_(num_cores)
, cores_(num_cores)
, barriers_(arch.num_barriers(), 0)
, raster_units_(NUM_RASTER_UNITS)
, rop_units_(NUM_ROP_UNITS)
, tex_units_(NUM_TEX_UNITS)
@ -20,7 +21,6 @@ Cluster::Cluster(const SimContext& ctx,
, processor_(processor)
{
char sname[100];
snprintf(sname, 100, "cluster%d-l2cache", cluster_id);
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
!L2_ENABLED,
@ -257,8 +257,10 @@ Cluster::~Cluster() {
//--
}
void Cluster::reset() {
//--
void Cluster::reset() {
for (auto& barrier : barriers_) {
barrier.reset();
}
}
void Cluster::tick() {
@ -303,6 +305,26 @@ bool Cluster::check_exit(Word* exitcode, int reg) const {
return done;
}
// Records the arrival of a core at cluster barrier `bar_id` and, once `count`
// cores have arrived, resumes every core recorded in the barrier mask and
// clears the mask.
//   bar_id  - index into barriers_ (bounds-checked by at()).
//   count   - number of arrivals that releases the barrier.
//   core_id - id of the arriving core; it is reduced modulo the number of
//             cores in this cluster to obtain its slot in the mask.
void Cluster::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
  auto& barrier = barriers_.at(bar_id);

  uint32_t local_core_id = core_id % cores_.size();
  barrier.set(local_core_id);

  DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);

  if (barrier.count() == static_cast<size_t>(count)) {
    // resume all suspended cores
    for (uint32_t i = 0; i < cores_.size(); ++i) {
      if (barrier.test(i)) {
        // Report the resumed core in the same id space as the "Suspend"
        // trace above: (core_id - local_core_id) is the id of slot 0.
        // NOTE(review): relies on ids within a cluster being contiguous,
        // the same assumption the modulo above already makes.
        DP(3, "*** Resume core #" << (core_id - local_core_id + i) << " at barrier #" << bar_id);
        cores_.at(i)->resume();
      }
    }
    barrier.reset();
  }
}
// Returns the ProcessorImpl pointer stored in this cluster.
ProcessorImpl* Cluster::processor() const {
  ProcessorImpl* const owner = processor_;
  return owner;
}

View file

@ -64,19 +64,18 @@ public:
bool running() const;
bool check_exit(Word* exitcode, int reg) const;
bool check_exit(Word* exitcode, int reg) const;
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
ProcessorImpl* processor() const;
Cluster::PerfStats perf_stats() const;
// Bounds-checked access to the entry of cores_ at `index`.
auto& core(uint32_t index) {
  auto& selected = cores_.at(index);
  return selected;
}
private:
uint32_t cluster_id_;
std::vector<Core::Ptr> cores_;
std::vector<Core::Ptr> cores_;
std::vector<CoreMask> barriers_;
std::vector<RasterUnit::Ptr> raster_units_;
std::vector<RopUnit::Ptr> rop_units_;
std::vector<TexUnit::Ptr> tex_units_;

View file

@ -326,7 +326,7 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t warp_id) {
if (is_global) {
// global barrier handling
if (barrier.count() == active_warps_.count()) {
cluster_->processor()->barrier(bar_idx, count, core_id_);
cluster_->barrier(bar_idx, count, core_id_);
barrier.reset();
}
} else {

View file

@ -6,7 +6,6 @@ using namespace vortex;
ProcessorImpl::ProcessorImpl(const Arch& arch)
: arch_(arch)
, clusters_(NUM_CLUSTERS)
, barriers_(arch.num_barriers(), 0)
{
SimPlatform::instance().initialize();
@ -100,35 +99,12 @@ int ProcessorImpl::run() {
}
void ProcessorImpl::reset() {
for (auto& barrier : barriers_) {
barrier.reset();
}
perf_mem_reads_ = 0;
perf_mem_writes_ = 0;
perf_mem_latency_ = 0;
perf_mem_pending_reads_ = 0;
}
void ProcessorImpl::barrier(uint32_t bar_id, uint32_t count, uint32_t core_id) {
auto& barrier = barriers_.at(bar_id);
barrier.set(core_id);
DP(3, "*** Suspend core #" << core_id << " at barrier #" << bar_id);
if (barrier.count() == (size_t)count) {
// resume suspended cores
uint32_t cores_per_cluster = arch_.num_cores() / NUM_CLUSTERS;
for (uint32_t i = 0; i < arch_.num_cores(); ++i) {
if (barrier.test(i)) {
DP(3, "*** Resume core #" << i << " at barrier #" << bar_id);
uint32_t core_idx = i % cores_per_cluster;
uint32_t cluster_idx = i / cores_per_cluster;
clusters_.at(cluster_idx)->core(core_idx)->resume();
}
}
barrier.reset();
}
}
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
dcrs_.write(addr, value);
}

View file

@ -31,8 +31,6 @@ public:
int run();
void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id);
void write_dcr(uint32_t addr, uint32_t value);
ProcessorImpl::PerfStats perf_stats() const;
@ -46,7 +44,6 @@ private:
DCRS dcrs_;
MemSim::Ptr memsim_;
CacheSim::Ptr l3cache_;
std::vector<CoreMask> barriers_;
uint64_t perf_mem_reads_;
uint64_t perf_mem_writes_;
uint64_t perf_mem_latency_;