minor update: defer wspawn activation to Emulator::step() and wait for spawned warps after task launch

This commit is contained in:
Blaise Tine 2024-05-01 00:02:52 -07:00
parent 5ea10fd872
commit e84f978502
8 changed files with 67 additions and 35 deletions

View file

@ -155,6 +155,9 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
// back to single-threaded
vx_tmc_one();
}
// wait for spawned tasks to complete
vx_wspawn(1, 0);
}
///////////////////////////////////////////////////////////////////////////////
@ -315,6 +318,9 @@ void vx_spawn_pocl_kernel(pocl_kernel_context_t * ctx, pocl_kernel_cb callback,
// back to single-threaded
vx_tmc_one();
}
// wait for spawned tasks to complete
vx_wspawn(1, 0);
}
#ifdef __cplusplus

View file

@ -393,6 +393,10 @@ void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
emulator_.barrier(bar_id, count, wid);
}
// Forwards a warp-spawn request (warp count and start PC) to the emulator.
void Core::wspawn(uint32_t num_warps, Word nextPC) {
emulator_.wspawn(num_warps, nextPC);
}
// Forwards the given RAM pointer to the emulator.
void Core::attach_ram(RAM* ram) {
emulator_.attach_ram(ram);
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -56,7 +56,7 @@ public:
uint64_t ifetch_latency;
uint64_t load_latency;
PerfStats()
PerfStats()
: cycles(0)
, instrs(0)
, sched_idle(0)
@ -83,10 +83,10 @@ public:
std::vector<SimPort<MemReq>> dcache_req_ports;
std::vector<SimPort<MemRsp>> dcache_rsp_ports;
Core(const SimContext& ctx,
uint32_t core_id,
Core(const SimContext& ctx,
uint32_t core_id,
Socket* socket,
const Arch &arch,
const Arch &arch,
const DCRS &dcrs);
~Core();
@ -103,6 +103,8 @@ public:
void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
void wspawn(uint32_t num_warps, Word nextPC);
uint32_t id() const {
return core_id_;
}
@ -139,26 +141,26 @@ private:
const Arch& arch_;
Emulator emulator_;
std::vector<IBuffer> ibuffers_;
Scoreboard scoreboard_;
std::vector<Operand::Ptr> operands_;
std::vector<Dispatcher::Ptr> dispatchers_;
std::vector<FuncUnit::Ptr> func_units_;
std::vector<FuncUnit::Ptr> func_units_;
LocalMem::Ptr local_mem_;
std::vector<LocalMemDemux::Ptr> lsu_demux_;
std::vector<MemCoalescer::Ptr> mem_coalescers_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;
HashTable<instr_trace_t*> pending_icache_;
uint64_t pending_instrs_;
uint64_t pending_ifetches_;
PerfStats perf_stats_;
std::vector<TraceSwitch::Ptr> commit_arbs_;
uint32_t commit_exe_;

View file

@ -108,6 +108,7 @@ void Emulator::clear() {
// activate first warp and thread
active_warps_.set(0);
warps_[0].tmask.set(0);
wspawn_.valid = false;
}
void Emulator::attach_ram(RAM* ram) {
@ -122,6 +123,19 @@ void Emulator::attach_ram(RAM* ram) {
instr_trace_t* Emulator::step() {
int scheduled_warp = -1;
// process pending wspawn
if (wspawn_.valid && active_warps_.count() == 1) {
DP(3, "*** Activate " << (wspawn_.num_warps-1) << " warps at PC: " << std::hex << wspawn_.nextPC);
for (uint32_t i = 1; i < wspawn_.num_warps; ++i) {
auto& warp = warps_.at(i);
warp.PC = wspawn_.nextPC;
warp.tmask.set(0);
active_warps_.set(i);
}
wspawn_.valid = false;
stalled_warps_.reset(0);
}
// find next ready warp
for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) {
bool warp_active = active_warps_.test(wid);
@ -210,19 +224,10 @@ void Emulator::resume(uint32_t wid) {
}
}
bool Emulator::wspawn(uint32_t num_warps, Word nextPC) {
// wait for single warp
if (active_warps_.count() != 1)
return false;
uint32_t active_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC);
for (uint32_t i = 1; i < active_warps; ++i) {
auto& warp = warps_.at(i);
warp.PC = nextPC;
warp.tmask.set(0);
active_warps_.set(i);
}
return true;
// Records a pending warp-spawn request instead of activating warps directly.
// The request is stored in wspawn_ and flagged valid; the requested warp
// count is capped at the number of warps the architecture provides.
void Emulator::wspawn(uint32_t num_warps, Word nextPC) {
  const uint32_t warp_limit = arch_.num_warps();
  wspawn_.nextPC    = nextPC;
  wspawn_.num_warps = (warp_limit < num_warps) ? warp_limit : num_warps;
  wspawn_.valid     = true;
}
void Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {

View file

@ -50,7 +50,7 @@ public:
void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
bool wspawn(uint32_t num_warps, Word nextPC);
void wspawn(uint32_t num_warps, Word nextPC);
int get_exitcode() const;
@ -78,6 +78,12 @@ private:
UUIDGenerator uui_gen;
};
// Pending warp-spawn request. Every field has a default so a freshly
// constructed instance never holds indeterminate values.
struct wspawn_t {
  bool     valid{false};  // true while a request is recorded and not yet applied
  uint32_t num_warps{0};  // requested warp count
  Word     nextPC{};      // program counter to assign to the spawned warps
};
std::shared_ptr<Instr> decode(uint32_t code) const;
void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace);
@ -117,7 +123,8 @@ private:
std::vector<WarpMask> barriers_;
std::unordered_map<int, std::stringstream> print_bufs_;
MemoryUnit mmu_;
Word csr_mscratch_;
Word csr_mscratch_;
wspawn_t wspawn_;
};
}

View file

@ -257,12 +257,12 @@ void SfuUnit::tick() {
switch (sfu_type) {
case SfuType::WSPAWN:
output.push(trace, 1);
if (trace->eop) {
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (!core_->emulator_.wspawn(trace_data->arg1, trace_data->arg2))
return;
core_->wspawn(trace_data->arg1, trace_data->arg2);
release_warp = false;
}
output.push(trace, 1);
break;
case SfuType::TMC:
case SfuType::SPLIT:
@ -275,11 +275,11 @@ void SfuUnit::tick() {
break;
case SfuType::BAR: {
output.push(trace, 1);
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
if (trace->eop) {
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
core_->barrier(trace_data->arg1, trace_data->arg2, trace->wid);
release_warp = false;
}
release_warp = false;
} break;
case SfuType::CMOV:
output.push(trace, 3);

View file

@ -332,9 +332,14 @@ void kernel_trigo(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
dst_ptr[offset+i] = sinf(a * b);
uint32_t j = offset + i;
auto a = src0_ptr[j];
auto b = src1_ptr[j];
auto c = a * b;
if ((j % 4) == 0) {
c = sinf(c);
}
dst_ptr[j] = c;
}
}

View file

@ -752,7 +752,10 @@ public:
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = sinf(a[i] * b[i]);
auto ref = a[i] * b[i];
if ((i % 4) == 0) {
ref = sinf(ref);
}
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;