mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
simx timing simulation refactoring
This commit is contained in:
parent
9656779d48
commit
808bddb586
22 changed files with 1123 additions and 903 deletions
|
@ -11,6 +11,128 @@ namespace vortex {
|
|||
|
||||
class SimObjectBase;

///////////////////////////////////////////////////////////////////////////////

// Untyped base of every simulation port.
//
// A port remembers the module pointer it was constructed with and at most one
// peer port. Only the typed SlavePort/MasterPort templates (declared friends
// below) and derived classes may change the peer link.
class SimPortBase {
public:
  virtual ~SimPortBase() {}

  // Returns the module pointer supplied at construction.
  SimObjectBase* module() const {
    return module_;
  }

  // Returns the attached peer, or nullptr when the port is unconnected.
  SimPortBase* peer() const {
    return peer_;
  }

  // True while a peer is attached.
  bool connected() const {
    return (peer_ != nullptr);
  }

protected:
  // Creates an unconnected port that records `module`.
  SimPortBase(SimObjectBase* module)
    : module_(module)
    , peer_(nullptr)
  {}

  // Attaches `peer`. Precondition: the port is currently unconnected.
  void connect(SimPortBase* peer) {
    assert(peer_ == nullptr);
    peer_ = peer;
  }

  // Detaches the current peer. Precondition: the port is currently connected.
  // (The check used to be `peer_ == nullptr`, which rejected every connected
  // port and only accepted calls that had nothing to detach.)
  void disconnect() {
    assert(peer_ != nullptr);
    peer_ = nullptr;
  }

  // Only copy-assignment is disabled; copy-construction stays available.
  SimPortBase& operator=(const SimPortBase&) = delete;

  SimObjectBase* module_;
  SimPortBase* peer_;

  template <typename U> friend class SlavePort;
  template <typename U> friend class MasterPort;
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Pkt>
|
||||
class SimPort : public SimPortBase {
|
||||
public:
|
||||
void send(const Pkt& pkt, uint64_t delay) const;
|
||||
|
||||
bool read(Pkt* out) {
|
||||
if (!valid_)
|
||||
return false;
|
||||
*out = data_;
|
||||
valid_ = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected:
|
||||
SimPort(SimObjectBase* module)
|
||||
: SimPortBase(module)
|
||||
, valid_(false)
|
||||
{}
|
||||
|
||||
void write(const Pkt& data) {
|
||||
assert(!valid_);
|
||||
data_ = data;
|
||||
valid_ = true;
|
||||
}
|
||||
|
||||
SimPort& operator=(const SimPort&) = delete;
|
||||
|
||||
Pkt data_;
|
||||
bool valid_;
|
||||
|
||||
template <typename U> friend class SimPortEvent;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Pkt>
|
||||
class SlavePort : public SimPort<Pkt> {
|
||||
public:
|
||||
SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {}
|
||||
|
||||
void bind(SlavePort<Pkt>* peer) {
|
||||
this->connect(peer);
|
||||
}
|
||||
|
||||
void unbind() {
|
||||
this->disconnect();
|
||||
}
|
||||
|
||||
protected:
|
||||
SlavePort& operator=(const SlavePort&) = delete;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Pkt>
|
||||
class MasterPort : public SimPort<Pkt> {
|
||||
public:
|
||||
MasterPort(SimObjectBase* module) : SimPort<Pkt>(module) {}
|
||||
|
||||
void bind(SlavePort<Pkt>* peer) {
|
||||
this->connect(peer);
|
||||
}
|
||||
|
||||
void bind(MasterPort<Pkt>* peer) {
|
||||
this->connect(peer);
|
||||
}
|
||||
|
||||
void unbind() {
|
||||
this->disconnect();
|
||||
}
|
||||
|
||||
protected:
|
||||
MasterPort& operator=(const MasterPort&) = delete;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class SimEventBase {
|
||||
public:
|
||||
typedef std::shared_ptr<SimEventBase> Ptr;
|
||||
|
@ -32,16 +154,16 @@ protected:
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Pkt>
|
||||
class SimSimpleEvent : public SimEventBase {
|
||||
class SimCallEvent : public SimEventBase {
|
||||
public:
|
||||
typedef std::function<void (const Pkt&)> Func;
|
||||
|
||||
template <typename... Args>
|
||||
static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) {
|
||||
return std::make_shared<SimSimpleEvent>(func, pkt, delay);
|
||||
return std::make_shared<SimCallEvent>(func, pkt, delay);
|
||||
}
|
||||
|
||||
SimSimpleEvent(const Func& func, const Pkt& pkt, uint64_t delay)
|
||||
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay)
|
||||
: SimEventBase(delay)
|
||||
, func_(func)
|
||||
, pkt_(pkt)
|
||||
|
@ -61,167 +183,23 @@ protected:
|
|||
template <typename Pkt>
|
||||
class SimPortEvent : public SimEventBase {
|
||||
public:
|
||||
typedef std::function<void (const Pkt&, uint32_t)> Func;
|
||||
|
||||
template <typename... Args>
|
||||
static Ptr Create(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay) {
|
||||
return std::make_shared<SimPortEvent>(func, pkt, port_id, delay);
|
||||
static Ptr Create(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) {
|
||||
return std::make_shared<SimPortEvent>(port, pkt, delay);
|
||||
}
|
||||
|
||||
SimPortEvent(const Func& func, const Pkt& pkt, uint32_t port_id, uint64_t delay)
|
||||
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay)
|
||||
: SimEventBase(delay)
|
||||
, func_(func)
|
||||
, port_(port)
|
||||
, pkt_(pkt)
|
||||
, port_id_(port_id)
|
||||
{}
|
||||
|
||||
void fire() const override {
|
||||
func_(pkt_, port_id_);
|
||||
const_cast<SimPort<Pkt>*>(port_)->write(pkt_);
|
||||
}
|
||||
|
||||
private:
|
||||
Func func_;
|
||||
Pkt pkt_;
|
||||
uint32_t port_id_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class SimPortBase {
|
||||
public:
|
||||
typedef std::shared_ptr<SimPortBase> Ptr;
|
||||
|
||||
virtual ~SimPortBase() {}
|
||||
|
||||
SimObjectBase* module() const {
|
||||
return module_;
|
||||
}
|
||||
|
||||
uint32_t port_id() const {
|
||||
return port_id_;
|
||||
}
|
||||
|
||||
SimPortBase* peer() const {
|
||||
return peer_;
|
||||
}
|
||||
|
||||
bool connected() const {
|
||||
return (peer_ != nullptr);
|
||||
}
|
||||
|
||||
bool is_slave() const {
|
||||
return is_slave_;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
SimPortBase(SimObjectBase* module, bool is_slave);
|
||||
|
||||
void connect(SimPortBase* peer) {
|
||||
assert(peer_ == nullptr);
|
||||
peer_ = peer;
|
||||
}
|
||||
|
||||
void disconnect() {
|
||||
assert(peer_ == nullptr);
|
||||
peer_ = nullptr;
|
||||
}
|
||||
|
||||
SimObjectBase* module_;
|
||||
uint32_t port_id_;
|
||||
bool is_slave_;
|
||||
SimPortBase* peer_;
|
||||
|
||||
template <typename Pkt> friend class MasterPort;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Pkt>
|
||||
class SlavePort : public SimPortBase {
|
||||
public:
|
||||
typedef std::shared_ptr<SlavePort<Ptr>> Ptr;
|
||||
typedef std::function<void (const Pkt&, uint32_t)> Func;
|
||||
|
||||
static Ptr Create(SimObjectBase* module, const Func& func) {
|
||||
return std::make_shared<SlavePort<Pkt>>(module, func);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
static Ptr Create(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t)) {
|
||||
return std::make_shared<SlavePort<Pkt>>(module, obj, entry);
|
||||
}
|
||||
|
||||
SlavePort(SimObjectBase* module, const Func& func)
|
||||
: SimPortBase(module, true)
|
||||
, func_(func)
|
||||
{}
|
||||
|
||||
template <typename T>
|
||||
SlavePort(SimObjectBase* module, T *obj, void (T::*entry)(const Pkt&, uint32_t))
|
||||
: SimPortBase(module, true)
|
||||
, func_(std::bind(entry, obj, std::placeholders::_1, std::placeholders::_2))
|
||||
{}
|
||||
|
||||
SlavePort(SimObjectBase* module, SlavePort* peer)
|
||||
: SimPortBase(module, false)
|
||||
{
|
||||
this->connect(peer);
|
||||
}
|
||||
|
||||
void send(const Pkt& pkt, uint64_t delay) const;
|
||||
|
||||
const Func& func() const {
|
||||
return func_;
|
||||
}
|
||||
|
||||
protected:
|
||||
SlavePort& operator=(const SlavePort&);
|
||||
Func func_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Pkt>
|
||||
class MasterPort : public SimPortBase {
|
||||
public:
|
||||
typedef std::shared_ptr<MasterPort<Ptr>> Ptr;
|
||||
typedef std::function<void (const Pkt&, uint32_t)> Func;
|
||||
|
||||
static Ptr Create() {
|
||||
return std::make_shared<MasterPort<Ptr>>(module);
|
||||
}
|
||||
|
||||
MasterPort(SimObjectBase* module) : SimPortBase(module, false) {}
|
||||
|
||||
MasterPort(SimObjectBase* module, MasterPort* peer)
|
||||
: SimPortBase(module, false)
|
||||
{
|
||||
peer->connect(this);
|
||||
}
|
||||
|
||||
void bind(SlavePort<Pkt>* peer) {
|
||||
this->connect(peer);
|
||||
}
|
||||
|
||||
void unbind() {
|
||||
peer_->disconnect();
|
||||
this->disconnect();
|
||||
}
|
||||
|
||||
void send(const Pkt& pkt, uint64_t delay) const {
|
||||
assert(peer_ != nullptr);
|
||||
if (peer_->is_slave()) {
|
||||
auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
|
||||
slave->send(pkt, delay);
|
||||
} else {
|
||||
auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
|
||||
master->send(pkt, delay);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
MasterPort& operator=(const MasterPort&);
|
||||
const SimPort<Pkt>* port_;
|
||||
Pkt pkt_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -237,25 +215,18 @@ public:
|
|||
template <typename T, typename Pkt>
|
||||
void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay);
|
||||
|
||||
virtual void step(uint64_t cycle) = 0;
|
||||
|
||||
const std::string& name() const {
|
||||
return name_;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
SimObjectBase(const SimContext& ctx, const char* name);
|
||||
virtual void step(uint64_t cycle) = 0;
|
||||
|
||||
uint32_t allocate_port(SimPortBase* port) {
|
||||
uint32_t id = ports_.size();
|
||||
ports_.push_back(port);
|
||||
return id;
|
||||
}
|
||||
SimObjectBase(const SimContext& ctx, const char* name);
|
||||
|
||||
private:
|
||||
std::string name_;
|
||||
std::vector<SimPortBase*> ports_;
|
||||
|
||||
friend class SimPlatform;
|
||||
friend class SimPortBase;
|
||||
|
@ -320,20 +291,19 @@ public:
|
|||
}
|
||||
|
||||
template <typename Pkt>
|
||||
void schedule(const typename SimSimpleEvent<Pkt>::Func& callback,
|
||||
void schedule(const typename SimCallEvent<Pkt>::Func& callback,
|
||||
const Pkt& pkt,
|
||||
uint64_t delay) {
|
||||
auto evt = SimSimpleEvent<Pkt>::Create(callback, pkt, delay);
|
||||
auto evt = SimCallEvent<Pkt>::Create(callback, pkt, delay);
|
||||
assert(delay != 0);
|
||||
events_.emplace_back(evt);
|
||||
}
|
||||
|
||||
template <typename Pkt>
|
||||
void schedule(const typename SimPortEvent<Pkt>::Func& callback,
|
||||
void schedule(const SimPort<Pkt>* port,
|
||||
const Pkt& pkt,
|
||||
uint32_t port_id,
|
||||
uint64_t delay) {
|
||||
auto evt = SimPortEvent<Pkt>::Create(callback, pkt, port_id, delay);
|
||||
auto evt = SimPortEvent<Pkt>::Create(port, pkt, delay);
|
||||
assert(delay != 0);
|
||||
events_.emplace_back(evt);
|
||||
}
|
||||
|
@ -383,13 +353,6 @@ private:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
inline SimPortBase::SimPortBase(SimObjectBase* module, bool is_slave)
|
||||
: module_(module)
|
||||
, port_id_(module->allocate_port(this))
|
||||
, is_slave_(is_slave)
|
||||
, peer_(nullptr)
|
||||
{}
|
||||
|
||||
inline SimObjectBase::SimObjectBase(const SimContext&, const char* name)
|
||||
: name_(name)
|
||||
{}
|
||||
|
@ -403,18 +366,11 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
|
|||
}
|
||||
|
||||
template <typename Pkt>
|
||||
void SlavePort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
|
||||
if (func_) {
|
||||
SimPlatform::instance().schedule(func_, pkt, port_id_, delay);
|
||||
void SimPort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
|
||||
if (peer_) {
|
||||
reinterpret_cast<const SimPort<Pkt>*>(peer_)->send(pkt, delay);
|
||||
} else {
|
||||
assert(peer_ != nullptr);
|
||||
if (peer_->is_slave()) {
|
||||
auto slave = reinterpret_cast<const SlavePort<Pkt>*>(peer_);
|
||||
slave->send(pkt, delay);
|
||||
} else {
|
||||
auto master = reinterpret_cast<const MasterPort<Pkt>*>(peer_);
|
||||
master->send(pkt, delay);
|
||||
}
|
||||
SimPlatform::instance().schedule(this, pkt, delay);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ LDFLAGS += ../common/softfloat/build/Linux-x86_64-GCC/softfloat.a
|
|||
TOP = vx_cache_sim
|
||||
|
||||
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
|
||||
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp main.cpp
|
||||
SRCS += args.cpp cache.cpp memsim.cpp warp.cpp core.cpp decode.cpp execute.cpp exeunit.cpp processor.cpp main.cpp
|
||||
|
||||
OBJS := $(patsubst %.cpp, obj_dir/%.o, $(notdir $(SRCS)))
|
||||
VPATH := $(sort $(dir $(SRCS)))
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#include "cache.h"
|
||||
#include "debug.h"
|
||||
#include "types.h"
|
||||
#include <util.h>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
@ -30,8 +31,7 @@ struct params_t {
|
|||
uint32_t offset_bits = config.B - config.W;
|
||||
uint32_t log2_bank_size = config.C - bank_bits;
|
||||
uint32_t index_bits = log2_bank_size - (config.B << config.A);
|
||||
assert(log2_bank_size >= config.B);
|
||||
|
||||
assert(log2_bank_size >= config.B);
|
||||
|
||||
this->words_per_block = 1 << offset_bits;
|
||||
this->blocks_per_set = 1 << config.A;
|
||||
|
@ -229,9 +229,10 @@ private:
|
|||
CacheConfig config_;
|
||||
params_t params_;
|
||||
std::vector<bank_t> banks_;
|
||||
std::vector<std::pair<bool, MemReq>> core_reqs_;
|
||||
std::pair<bool, MemRsp> mem_rsp_;
|
||||
std::vector<std::queue<uint32_t>> core_rsps_;
|
||||
Switch<MemReq, MemRsp>::Ptr mem_switch_;
|
||||
std::vector<MasterPort<MemReq>> mem_req_ports_;
|
||||
std::vector<SlavePort<MemRsp>> mem_rsp_ports_;
|
||||
|
||||
public:
|
||||
Impl(Cache* simobject, const CacheConfig& config)
|
||||
|
@ -239,16 +240,22 @@ public:
|
|||
, config_(config)
|
||||
, params_(config)
|
||||
, banks_(config.num_banks, {config, params_})
|
||||
, core_reqs_(config.num_inputs)
|
||||
, core_rsps_(config.num_inputs)
|
||||
{}
|
||||
|
||||
void handleMemResponse(const MemRsp& response, uint32_t) {
|
||||
mem_rsp_ = {true, response};
|
||||
}
|
||||
|
||||
void handleCoreRequest(const MemReq& request, uint32_t port_id) {
|
||||
core_reqs_.at(port_id) = {true, request};
|
||||
, mem_req_ports_(config.num_banks, simobject)
|
||||
, mem_rsp_ports_(config.num_banks, simobject)
|
||||
{
|
||||
if (config.num_banks > 1) {
|
||||
mem_switch_ = Switch<MemReq, MemRsp>::Create("mem_arb", ArbiterType::RoundRobin, config.num_banks);
|
||||
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
|
||||
mem_req_ports_.at(i).bind(&mem_switch_->ReqIn.at(i));
|
||||
mem_switch_->RspOut.at(i).bind(&mem_rsp_ports_.at(i));
|
||||
}
|
||||
mem_switch_->ReqOut.bind(&simobject->MemReqPort);
|
||||
simobject->MemRspPort.bind(&mem_switch_->RspIn);
|
||||
} else {
|
||||
mem_req_ports_.at(0).bind(&simobject->MemReqPort);
|
||||
simobject->MemRspPort.bind(&mem_rsp_ports_.at(0));
|
||||
}
|
||||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
|
@ -269,31 +276,29 @@ public:
|
|||
bank.mshr.try_pop(&active_req);
|
||||
}
|
||||
|
||||
// try schedule stall replay
|
||||
// try schedule stall queue if MSHR has space
|
||||
if (!active_req.valid
|
||||
&& !bank.stall_buffer.empty()) {
|
||||
&& !bank.stall_buffer.empty()
|
||||
&& !bank.mshr.full()) {
|
||||
active_req = bank.stall_buffer.front();
|
||||
bank.stall_buffer.pop();
|
||||
}
|
||||
}
|
||||
|
||||
// handle memory fills
|
||||
if (mem_rsp_.first) {
|
||||
mem_rsp_.first = false;
|
||||
auto bank_id = bit_getw(mem_rsp_.second.tag, 0, 15);
|
||||
auto mshr_id = bit_getw(mem_rsp_.second.tag, 16, 31);
|
||||
this->processMemoryFill(bank_id, mshr_id);
|
||||
for (uint32_t i = 0, n = config_.num_banks; i < n; ++i) {
|
||||
MemRsp mem_rsp;
|
||||
if (mem_rsp_ports_.at(i).read(&mem_rsp)) {
|
||||
this->processMemoryFill(i, mem_rsp.tag);
|
||||
}
|
||||
}
|
||||
|
||||
// handle incoming core requests
|
||||
for (uint32_t i = 0, n = core_reqs_.size(); i < n; ++i) {
|
||||
auto& entry = core_reqs_.at(i);
|
||||
if (!entry.first)
|
||||
for (uint32_t i = 0, n = config_.num_inputs; i < n; ++i) {
|
||||
MemReq core_req;
|
||||
if (!simobject_->CoreReqPorts.at(i).read(&core_req))
|
||||
continue;
|
||||
|
||||
entry.first = false;
|
||||
|
||||
auto& core_req = entry.second;
|
||||
auto bank_id = params_.addr_bank_id(core_req.addr);
|
||||
auto set_id = params_.addr_set_id(core_req.addr);
|
||||
auto tag = params_.addr_tag(core_req.addr);
|
||||
|
@ -417,7 +422,7 @@ public:
|
|||
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, hit_block.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.tag = 0;
|
||||
simobject_->MemReqPort.send(mem_req, 1);
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
} else {
|
||||
// mark block as dirty
|
||||
hit_block.dirty = true;
|
||||
|
@ -438,7 +443,8 @@ public:
|
|||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, repl_block.tag);
|
||||
mem_req.write = true;
|
||||
simobject_->MemReqPort.send(mem_req, 1);
|
||||
mem_req.tag = 0;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -449,7 +455,7 @@ public:
|
|||
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.tag = 0;
|
||||
simobject_->MemReqPort.send(mem_req, 1);
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
}
|
||||
// send core response
|
||||
for (auto& info : active_req.infos) {
|
||||
|
@ -467,9 +473,8 @@ public:
|
|||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, active_req.set_id, active_req.tag);
|
||||
mem_req.write = active_req.write;
|
||||
mem_req.tag = bit_setw(0, 0, 15, bank_id);
|
||||
mem_req.tag = bit_setw(mem_req.tag, 16, 31, mshr_id);
|
||||
simobject_->MemReqPort.send(mem_req, 1);
|
||||
mem_req.tag = mshr_id;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -480,12 +485,12 @@ public:
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config)
|
||||
: SimObject<Cache>(ctx, name)
|
||||
, impl_(new Impl(this, config))
|
||||
, CoreReqPorts(config.num_inputs, {this, impl_, &Cache::Impl::handleCoreRequest})
|
||||
: SimObject<Cache>(ctx, name)
|
||||
, CoreReqPorts(config.num_inputs, this)
|
||||
, CoreRspPorts(config.num_inputs, this)
|
||||
, MemReqPort(this)
|
||||
, MemRspPort(this, impl_, &Impl::handleMemResponse)
|
||||
, MemRspPort(this)
|
||||
, impl_(new Impl(this, config))
|
||||
{}
|
||||
|
||||
Cache::~Cache() {
|
||||
|
|
|
@ -20,11 +20,7 @@ struct CacheConfig {
|
|||
uint8_t latency; // pipeline latency
|
||||
};
|
||||
|
||||
class Cache : public SimObject<Cache> {
|
||||
private:
|
||||
class Impl;
|
||||
Impl* impl_;
|
||||
|
||||
class Cache : public SimObject<Cache> {
|
||||
public:
|
||||
Cache(const SimContext& ctx, const char* name, const CacheConfig& config);
|
||||
~Cache();
|
||||
|
@ -35,6 +31,10 @@ public:
|
|||
std::vector<MasterPort<MemRsp>> CoreRspPorts;
|
||||
MasterPort<MemReq> MemReqPort;
|
||||
SlavePort<MemRsp> MemRspPort;
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
Impl* impl_;
|
||||
};
|
||||
|
||||
}
|
|
@ -12,13 +12,13 @@
|
|||
|
||||
using namespace vortex;
|
||||
|
||||
Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id)
|
||||
Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
||||
: SimObject(ctx, "Core")
|
||||
, id_(id)
|
||||
, arch_(arch)
|
||||
, decoder_(decoder)
|
||||
, mem_(mem)
|
||||
, shared_mem_(1, SMEM_SIZE)
|
||||
, decoder_(arch)
|
||||
, mmu_(0, arch.wsize(), true)
|
||||
, shared_mem_(4096)
|
||||
, warps_(arch.num_warps())
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, csrs_(arch.num_csrs(), 0)
|
||||
|
@ -54,9 +54,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU
|
|||
DCACHE_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
}))
|
||||
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
|
||||
, icache_rsp_port_(this, this, &Core::icache_handleCacheReponse)
|
||||
, dcache_rsp_port_(arch.num_threads(), {this, reinterpret_cast<LsuUnit*>(exe_units_.at((int)ExeType::LSU).get()) , &LsuUnit::handleCacheReponse})
|
||||
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
|
||||
, fetch_stage_("fetch")
|
||||
, decode_stage_("decode")
|
||||
, issue_stage_("issue")
|
||||
|
@ -65,36 +63,34 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryU
|
|||
, pending_icache_(arch_.num_warps())
|
||||
, stalled_warps_(0)
|
||||
, last_schedule_wid_(0)
|
||||
, pending_instrs_(0)
|
||||
, issued_instrs_(0)
|
||||
, committed_instrs_(0)
|
||||
, ebreak_(false)
|
||||
, stats_insts_(0)
|
||||
, stats_loads_(0)
|
||||
, stats_stores_(0)
|
||||
, MemRspPort(this, &l1_mem_switch_->RspIn)
|
||||
, MemReqPort(this, &l1_mem_switch_->ReqOut)
|
||||
, MemRspPort(this)
|
||||
, MemReqPort(this)
|
||||
{
|
||||
for (int i = 0; i < arch_.num_warps(); ++i) {
|
||||
warps_.at(i) = std::make_shared<Warp>(this, i);
|
||||
}
|
||||
|
||||
// register execute units
|
||||
exe_units_.at((int)ExeType::NOP) = std::make_shared<NopUnit>(this);
|
||||
exe_units_.at((int)ExeType::ALU) = std::make_shared<AluUnit>(this);
|
||||
exe_units_.at((int)ExeType::LSU) = std::make_shared<LsuUnit>(this);
|
||||
exe_units_.at((int)ExeType::CSR) = std::make_shared<CsrUnit>(this);
|
||||
exe_units_.at((int)ExeType::FPU) = std::make_shared<FpuUnit>(this);
|
||||
exe_units_.at((int)ExeType::GPU) = std::make_shared<GpuUnit>(this);
|
||||
|
||||
// connect l1 caches
|
||||
icache_->CoreRspPorts.at(0).bind(&icache_rsp_port_);
|
||||
for (int i = 0; i < arch_.num_threads(); ++i) {
|
||||
dcache_->CoreRspPorts.at(i).bind(&dcache_rsp_port_.at(i));
|
||||
}
|
||||
|
||||
// connect l1 switch
|
||||
icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
|
||||
dcache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[1]);
|
||||
l1_mem_switch_->RspOut[0].bind(&icache_->MemRspPort);
|
||||
l1_mem_switch_->RspOut[1].bind(&dcache_->MemRspPort);
|
||||
this->MemRspPort.bind(&l1_mem_switch_->RspIn);
|
||||
l1_mem_switch_->ReqOut.bind(&this->MemReqPort);
|
||||
|
||||
// activate warp0
|
||||
warps_.at(0)->setTmask(0, true);
|
||||
|
@ -109,31 +105,24 @@ Core::~Core() {
|
|||
}
|
||||
}
|
||||
|
||||
void Core::icache_handleCacheReponse(const MemRsp& response, uint32_t /*port_id*/) {
|
||||
// advance to decode stage
|
||||
uint32_t wid = response.tag;
|
||||
pipeline_state_t state;
|
||||
pending_icache_.remove(wid, &state);
|
||||
auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
|
||||
state.icache_latency = latency;
|
||||
decode_stage_.push(state);
|
||||
void Core::attach_ram(RAM* ram) {
|
||||
// bind RAM to memory unit
|
||||
mmu_.attach(*ram, 0, 0xFFFFFFFF);
|
||||
}
|
||||
|
||||
void Core::step(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
D(2, "###########################################################");
|
||||
D(2, std::dec << "Core" << id_ << ": cycle: " << cycle);
|
||||
|
||||
this->commit();
|
||||
this->execute();
|
||||
this->issue();
|
||||
this->decode();
|
||||
this->fetch();
|
||||
this->commit(cycle);
|
||||
this->execute(cycle);
|
||||
this->issue(cycle);
|
||||
this->decode(cycle);
|
||||
this->fetch(cycle);
|
||||
|
||||
DPN(2, std::flush);
|
||||
}
|
||||
|
||||
void Core::warp_scheduler() {
|
||||
void Core::warp_scheduler(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
bool foundSchedule = false;
|
||||
int scheduled_warp = last_schedule_wid_;
|
||||
|
||||
|
@ -159,53 +148,77 @@ void Core::warp_scheduler() {
|
|||
stats_insts_ += warp->getActiveThreads();
|
||||
|
||||
pipeline_state_t state;
|
||||
state.clear();
|
||||
state.id = (issued_instrs_++ * arch_.num_cores()) + id_;
|
||||
|
||||
warp->eval(&state);
|
||||
|
||||
D(4, state);
|
||||
DT(3, cycle, "pipeline-schedule: " << state);
|
||||
|
||||
// advance to fetch stage
|
||||
++pending_instrs_;
|
||||
// advance to fetch stage
|
||||
fetch_stage_.push(state);
|
||||
}
|
||||
|
||||
void Core::fetch() {
|
||||
// schedule icache request
|
||||
pipeline_state_t state;
|
||||
if (fetch_stage_.try_pop(&state)) {
|
||||
state.icache_latency = SimPlatform::instance().cycles();
|
||||
MemReq mem_req;
|
||||
mem_req.addr = state.PC;
|
||||
mem_req.write = false;
|
||||
mem_req.tag = pending_icache_.allocate(state);
|
||||
icache_->CoreReqPorts.at(0).send(mem_req, 1);
|
||||
void Core::fetch(uint64_t cycle) {
|
||||
// handle icache response
|
||||
{
|
||||
MemRsp mem_rsp;
|
||||
if (icache_->CoreRspPorts.at(0).read(&mem_rsp)){
|
||||
pipeline_state_t state;
|
||||
pending_icache_.remove(mem_rsp.tag, &state);
|
||||
auto latency = (SimPlatform::instance().cycles() - state.icache_latency);
|
||||
state.icache_latency = latency;
|
||||
decode_stage_.push(state);
|
||||
DT(3, cycle, "icache-rsp: addr=" << std::hex << state.PC << ", tag=" << mem_rsp.tag << ", " << state);
|
||||
}
|
||||
}
|
||||
|
||||
// send icache request
|
||||
{
|
||||
pipeline_state_t state;
|
||||
if (fetch_stage_.try_pop(&state)) {
|
||||
state.icache_latency = SimPlatform::instance().cycles();
|
||||
MemReq mem_req;
|
||||
mem_req.addr = state.PC;
|
||||
mem_req.write = false;
|
||||
mem_req.tag = pending_icache_.allocate(state);
|
||||
icache_->CoreReqPorts.at(0).send(mem_req, 1);
|
||||
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << state);
|
||||
}
|
||||
}
|
||||
|
||||
// schedule next warp
|
||||
this->warp_scheduler();
|
||||
this->warp_scheduler(cycle);
|
||||
}
|
||||
|
||||
void Core::decode() {
|
||||
void Core::decode(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
pipeline_state_t state;
|
||||
if (!decode_stage_.try_pop(&state))
|
||||
return;
|
||||
|
||||
if (state.stall_warp) {
|
||||
D(3, "*** warp#" << state.wid << " fetch stalled");
|
||||
} else {
|
||||
// release warp
|
||||
// release warp
|
||||
if (!state.stall_warp) {
|
||||
stalled_warps_.reset(state.wid);
|
||||
}
|
||||
|
||||
DT(3, cycle, "pipeline-decode: " << state);
|
||||
|
||||
// advance to issue stage
|
||||
issue_stage_.push(state);
|
||||
}
|
||||
|
||||
void Core::issue() {
|
||||
void Core::issue(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
if (!issue_stage_.empty()) {
|
||||
// insert to ibuffer
|
||||
auto& state = issue_stage_.top();
|
||||
auto& ibuffer = ibuffers_.at(state.wid);
|
||||
if (!ibuffer.full()) {
|
||||
if (ibuffer.full()) {
|
||||
DT(3, cycle, "*** ibuffer-stall: " << state);
|
||||
} else {
|
||||
ibuffer.push(state);
|
||||
issue_stage_.pop();
|
||||
}
|
||||
|
@ -219,8 +232,18 @@ void Core::issue() {
|
|||
auto& state = ibuffer.top();
|
||||
|
||||
// check scoreboard
|
||||
if (scoreboard_.in_use(state))
|
||||
if (scoreboard_.in_use(state)) {
|
||||
DTH(3, cycle, "*** scoreboard-stall: dependents={");
|
||||
auto owners = scoreboard_.owners(state);
|
||||
for (uint32_t i = 0, n = owners.size(); i < n; ++i) {
|
||||
if (i) DTN(3, ", ");
|
||||
DTN(3, "#" << owners.at(i));
|
||||
}
|
||||
DTN(3, "}, " << state << std::endl);
|
||||
continue;
|
||||
}
|
||||
|
||||
DT(3, cycle, "pipeline-issue: " << state);
|
||||
|
||||
// update scoreboard
|
||||
scoreboard_.reserve(state);
|
||||
|
@ -233,18 +256,19 @@ void Core::issue() {
|
|||
}
|
||||
}
|
||||
|
||||
void Core::execute() {
|
||||
void Core::execute(uint64_t cycle) {
|
||||
// process stage inputs
|
||||
if (!execute_stage_.empty()) {
|
||||
auto& state = execute_stage_.top();
|
||||
auto& exe_unit = exe_units_.at((int)state.exe_type);
|
||||
exe_unit->push_input(state);
|
||||
execute_stage_.pop();
|
||||
DT(3, cycle, "pipeline-execute: " << state);
|
||||
}
|
||||
|
||||
// advance execute units
|
||||
for (auto& exe_unit : exe_units_) {
|
||||
exe_unit->step();
|
||||
exe_unit->step(cycle);
|
||||
}
|
||||
|
||||
// commit completed instructions
|
||||
|
@ -255,18 +279,29 @@ void Core::execute() {
|
|||
stalled_warps_.reset(state.wid);
|
||||
}
|
||||
// advance to commit stage
|
||||
commit_stage_.push(state);
|
||||
commit_stage_.push(state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Core::commit() {
|
||||
void Core::commit(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
pipeline_state_t state;
|
||||
if (!commit_stage_.try_pop(&state))
|
||||
return;
|
||||
|
||||
DT(3, cycle, "pipeline-commit: " << state);
|
||||
|
||||
// update scoreboard
|
||||
scoreboard_.release(state);
|
||||
|
||||
assert(committed_instrs_ <= issued_instrs_);
|
||||
++committed_instrs_;
|
||||
}
|
||||
|
||||
bool Core::running() const {
|
||||
return (committed_instrs_ != issued_instrs_);
|
||||
}
|
||||
|
||||
Word Core::get_csr(Addr addr, int tid, int wid) {
|
||||
|
@ -349,9 +384,9 @@ void Core::barrier(int bar_id, int count, int warp_id) {
|
|||
barrier.reset();
|
||||
}
|
||||
|
||||
Word Core::icache_fetch(Addr addr) {
|
||||
Word Core::icache_read(Addr addr, Size size) {
|
||||
Word data;
|
||||
mem_.read(&data, addr, sizeof(Word), 0);
|
||||
mmu_.read(&data, addr, size, 0);
|
||||
return data;
|
||||
}
|
||||
|
||||
|
@ -365,7 +400,7 @@ Word Core::dcache_read(Addr addr, Size size) {
|
|||
return data;
|
||||
}
|
||||
#endif
|
||||
mem_.read(&data, addr, size, 0);
|
||||
mmu_.read(&data, addr, size, 0);
|
||||
return data;
|
||||
}
|
||||
|
||||
|
@ -383,11 +418,7 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
|
|||
this->writeToStdOut(addr, data);
|
||||
return;
|
||||
}
|
||||
mem_.write(&data, addr, size, 0);
|
||||
}
|
||||
|
||||
bool Core::running() const {
|
||||
return pending_instrs_;
|
||||
mmu_.write(&data, addr, size, 0);
|
||||
}
|
||||
|
||||
void Core::printStats() const {
|
||||
|
@ -399,7 +430,7 @@ void Core::printStats() const {
|
|||
|
||||
void Core::writeToStdOut(Addr addr, Word data) {
|
||||
uint32_t tid = (addr - IO_COUT_ADDR) & (IO_COUT_SIZE-1);
|
||||
auto& ss_buf = print_bufs_.at(tid);
|
||||
auto& ss_buf = print_bufs_[tid];
|
||||
char c = (char)data;
|
||||
ss_buf << c;
|
||||
if (c == '\n') {
|
||||
|
|
|
@ -25,9 +25,11 @@ namespace vortex {
|
|||
|
||||
class Core : public SimObject<Core> {
|
||||
public:
|
||||
Core(const SimContext& ctx, const ArchDef &arch, Decoder &decoder, MemoryUnit &mem, Word id);
|
||||
Core(const SimContext& ctx, const ArchDef &arch, Word id);
|
||||
~Core();
|
||||
|
||||
void attach_ram(RAM* ram);
|
||||
|
||||
bool running() const;
|
||||
|
||||
void step(uint64_t cycle);
|
||||
|
@ -64,7 +66,7 @@ public:
|
|||
|
||||
void barrier(int bar_id, int count, int warp_id);
|
||||
|
||||
Word icache_fetch(Addr);
|
||||
Word icache_read(Addr, Size);
|
||||
|
||||
Word dcache_read(Addr, Size);
|
||||
|
||||
|
@ -76,22 +78,21 @@ public:
|
|||
|
||||
private:
|
||||
|
||||
void fetch();
|
||||
void decode();
|
||||
void issue();
|
||||
void execute();
|
||||
void commit();
|
||||
void fetch(uint64_t cycle);
|
||||
void decode(uint64_t cycle);
|
||||
void issue(uint64_t cycle);
|
||||
void execute(uint64_t cycle);
|
||||
void commit(uint64_t cycle);
|
||||
|
||||
void warp_scheduler();
|
||||
|
||||
void icache_handleCacheReponse(const MemRsp& response, uint32_t port_id);
|
||||
void warp_scheduler(uint64_t cycle);
|
||||
|
||||
void writeToStdOut(Addr addr, Word data);
|
||||
|
||||
Word id_;
|
||||
const ArchDef& arch_;
|
||||
const Decoder& decoder_;
|
||||
MemoryUnit& mem_;
|
||||
const ArchDef arch_;
|
||||
const Decoder decoder_;
|
||||
MemoryUnit mmu_;
|
||||
|
||||
#ifdef SM_ENABLE
|
||||
RAM shared_mem_;
|
||||
#endif
|
||||
|
@ -106,8 +107,6 @@ private:
|
|||
Cache::Ptr icache_;
|
||||
Cache::Ptr dcache_;
|
||||
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
|
||||
SlavePort<MemRsp> icache_rsp_port_;
|
||||
std::vector<SlavePort<MemRsp>> dcache_rsp_port_;
|
||||
|
||||
PipelineStage fetch_stage_;
|
||||
PipelineStage decode_stage_;
|
||||
|
@ -118,10 +117,12 @@ private:
|
|||
HashTable<pipeline_state_t> pending_icache_;
|
||||
WarpMask stalled_warps_;
|
||||
uint32_t last_schedule_wid_;
|
||||
uint32_t pending_instrs_;
|
||||
uint32_t issued_instrs_;
|
||||
uint32_t committed_instrs_;
|
||||
bool ebreak_;
|
||||
|
||||
std::unordered_map<int, std::stringstream> print_bufs_;
|
||||
|
||||
uint64_t stats_insts_;
|
||||
uint64_t stats_loads_;
|
||||
uint64_t stats_stores_;
|
||||
|
|
|
@ -7,14 +7,15 @@
|
|||
#define DEBUG_HEADER << "DEBUG "
|
||||
//#define DEBUG_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": "
|
||||
|
||||
#define TRACE_HEADER << "TRACE "
|
||||
//#define TRACE_HEADER << "TRACE " << __FILE__ << ':' << std::dec << __LINE__ << ": "
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
||||
#include <iostream>
|
||||
#include <iomanip>
|
||||
|
||||
#define DX(x) x
|
||||
|
||||
#define D(lvl, x) do { \
|
||||
#define DP(lvl, x) do { \
|
||||
if ((lvl) <= DEBUG_LEVEL) { \
|
||||
std::cout DEBUG_HEADER << x << std::endl; \
|
||||
} \
|
||||
|
@ -32,12 +33,33 @@
|
|||
} \
|
||||
} while(0)
|
||||
|
||||
// DT(lvl, t, x): trace line, emitted only when (lvl) <= DEBUG_LEVEL.
// Writes to std::cout: the TRACE_HEADER prefix, then `t` in decimal in a
// field of width 10, then ": ", then `x`, terminated by std::endl (newline
// plus flush). `x` is pasted directly after `<<`, so it may itself be a chain
// of `<<` insertions. `t` is presumably a simulation cycle/timestamp --
// confirm at the call sites. Wrapped in do { } while(0) so the macro behaves
// as a single statement.
#define DT(lvl, t, x) do { \
|
||||
if ((lvl) <= DEBUG_LEVEL) { \
|
||||
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x << std::endl; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
// DTH(lvl, t, x): same output as DT (TRACE_HEADER prefix, `t` in decimal in a
// field of width 10, ": ", then `x`) but WITHOUT the trailing std::endl, so
// the line is left open and unflushed. Emitted only when (lvl) <= DEBUG_LEVEL.
// Presumably meant to start a line that later DTN calls continue -- confirm
// at the call sites.
#define DTH(lvl, t, x) do { \
|
||||
if ((lvl) <= DEBUG_LEVEL) { \
|
||||
std::cout TRACE_HEADER << std::setw(10) << std::dec << t << std::setw(0) << ": " << x; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
// DTN(lvl, x): writes `x` to std::cout with no header, no timestamp and no
// newline, only when (lvl) <= DEBUG_LEVEL. `x` is pasted directly after `<<`,
// so it may be a chain of `<<` insertions (including std::endl if the caller
// wants to end the line).
#define DTN(lvl, x) do { \
|
||||
if ((lvl) <= DEBUG_LEVEL) { \
|
||||
std::cout << x; \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
|
||||
#else
|
||||
|
||||
// Release-build (NDEBUG defined) stubs for the debug/trace macros.
// DX(x) expands to nothing at all; every other macro expands to an empty
// do {} while(0) statement, so call sites still parse as one statement while
// their arguments are discarded by the preprocessor and never evaluated.
#define DX(x)
|
||||
#define D(lvl, x) do {} while(0)
|
||||
#define DP(lvl, x) do {} while(0)
|
||||
#define DPH(lvl, x) do {} while(0)
|
||||
#define DPN(lvl, x) do {} while(0)
|
||||
#define D_RAW(x) do {} while(0)
|
||||
|
||||
#define DT(lvl, t, x) do {} while(0)
|
||||
#define DTH(lvl, t, x) do {} while(0)
|
||||
#define DTN(lvl, x) do {} while(0)
|
||||
|
||||
#endif
|
|
@ -194,47 +194,26 @@ static const char* op_string(const Instr &instr) {
|
|||
namespace vortex {
|
||||
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
|
||||
os << op_string(instr) << ": ";
|
||||
auto opcode = instr.getOpcode();
|
||||
|
||||
auto rd_to_string = [&]() {
|
||||
int rdt = instr.getRDType();
|
||||
int rd = instr.getRDest();
|
||||
switch (rdt) {
|
||||
case 1: os << "r" << std::dec << rd << " <- "; break;
|
||||
case 2: os << "fr" << std::dec << rd << " <- "; break;
|
||||
case 3: os << "vr" << std::dec << rd << " <- "; break;
|
||||
default: break;
|
||||
}
|
||||
};
|
||||
|
||||
auto rs_to_string = [&](int i) {
|
||||
int rst = instr.getRSType(i);
|
||||
int rs = instr.getRSrc(i);
|
||||
switch (rst) {
|
||||
case 1: os << "r" << std::dec << rs; break;
|
||||
case 2: os << "fr" << std::dec << rs; break;
|
||||
case 3: os << "vr" << std::dec << rs; break;
|
||||
default: break;
|
||||
}
|
||||
};
|
||||
|
||||
auto opcode = instr.getOpcode();
|
||||
if (opcode == S_INST
|
||||
|| opcode == FS
|
||||
|| opcode == VS) {
|
||||
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "] <- ";
|
||||
rs_to_string(1);
|
||||
os << instr.getRSType(1) << std::dec << instr.getRSrc(1);
|
||||
} else
|
||||
if (opcode == L_INST
|
||||
|| opcode == FL
|
||||
|| opcode == VL) {
|
||||
rd_to_string();
|
||||
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
|
||||
os << "M[r" << std::dec << instr.getRSrc(0) << " + 0x" << std::hex << instr.getImm() << "]";
|
||||
} else {
|
||||
rd_to_string();
|
||||
if (instr.getRDType() != RegType::None) {
|
||||
os << instr.getRDType() << std::dec << instr.getRDest() << " <- ";
|
||||
}
|
||||
int i = 0;
|
||||
for (; i < instr.getNRSrc(); ++i) {
|
||||
if (i) os << ", ";
|
||||
rs_to_string(i);
|
||||
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
|
||||
}
|
||||
if (instr.hasImm()) {
|
||||
if (i) os << ", ";
|
||||
|
@ -281,7 +260,7 @@ Decoder::Decoder(const ArchDef &arch) {
|
|||
v_imm_mask_ = 0x7ff;
|
||||
}
|
||||
|
||||
std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
|
||||
std::shared_ptr<Instr> Decoder::decode(Word code) const {
|
||||
auto instr = std::make_shared<Instr>();
|
||||
Opcode op = (Opcode)((code >> shift_opcode_) & opcode_mask_);
|
||||
instr->setOpcode(op);
|
||||
|
@ -297,8 +276,8 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
|
|||
|
||||
auto op_it = sc_instTable.find(op);
|
||||
if (op_it == sc_instTable.end()) {
|
||||
std::cout << std::hex << "invalid opcode: 0x" << op << ", instruction=0x" << code << ", PC=" << PC << std::endl;
|
||||
std::abort();
|
||||
std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::endl;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto iType = op_it->second.iType;
|
||||
|
@ -459,7 +438,5 @@ std::shared_ptr<Instr> Decoder::decode(Word code, Word PC) const {
|
|||
std::abort();
|
||||
}
|
||||
|
||||
D(2, "Instr 0x" << std::hex << code << ": " << *instr << std::flush);
|
||||
|
||||
return instr;
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@ class Decoder {
|
|||
public:
|
||||
Decoder(const ArchDef &);
|
||||
|
||||
std::shared_ptr<Instr> decode(Word code, Word PC) const;
|
||||
std::shared_ptr<Instr> decode(Word code) const;
|
||||
|
||||
private:
|
||||
|
||||
|
|
|
@ -75,11 +75,11 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
if (num_rsrcs) {
|
||||
for (int i = 0; i < num_rsrcs; ++i) {
|
||||
DPH(2, "Src Reg [" << std::dec << i << "]: ");
|
||||
int type = instr.getRSType(i);
|
||||
auto type = instr.getRSType(i);
|
||||
int reg = instr.getRSrc(i);
|
||||
switch (type) {
|
||||
case 1:
|
||||
DPH(2, "r" << std::dec << reg << "={");
|
||||
case RegType::Integer:
|
||||
DPN(2, "r" << std::dec << reg << "={");
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (t) DPN(2, ", ");
|
||||
if (!tmask_.test(t)) {
|
||||
|
@ -91,8 +91,8 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
break;
|
||||
case 2:
|
||||
DPH(2, "fr" << std::dec << reg << "={");
|
||||
case RegType::Float:
|
||||
DPN(2, "fr" << std::dec << reg << "={");
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (t) DPN(2, ", ");
|
||||
if (!tmask_.test(t)) {
|
||||
|
@ -105,6 +105,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
DPN(2, "}" << std::endl);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -415,7 +416,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
break;
|
||||
case L_INST:
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.load = 0;
|
||||
pipeline_state->lsu.type = LsuType::LOAD;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->mem_addrs.resize(num_threads);
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
|
@ -425,7 +426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
|
||||
Word data_read = core_->dcache_read(memAddr, 4);
|
||||
pipeline_state->mem_addrs.at(t) = memAddr;
|
||||
D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
|
||||
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
// LBI
|
||||
|
@ -455,7 +456,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
break;
|
||||
case S_INST:
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.store = 1;
|
||||
pipeline_state->lsu.type = LsuType::STORE;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
pipeline_state->mem_addrs.resize(num_threads);
|
||||
|
@ -464,7 +465,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
continue;
|
||||
Word memAddr = rsdata[t][0] + immsrc;
|
||||
pipeline_state->mem_addrs.at(t) = memAddr;
|
||||
D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
// SB
|
||||
|
@ -543,12 +544,12 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
break;
|
||||
case FENCE:
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.fence = 1;
|
||||
pipeline_state->lsu.type = LsuType::FENCE;
|
||||
pipeline_state->stall_warp = true;
|
||||
break;
|
||||
case (FL | VL):
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.load = 1;
|
||||
pipeline_state->lsu.type = LsuType::LOAD;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
if (func3 == 0x2) {
|
||||
pipeline_state->mem_addrs.resize(num_threads);
|
||||
|
@ -558,14 +559,14 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
Word memAddr = rsdata[t][0] + immsrc;
|
||||
pipeline_state->mem_addrs.at(t) = memAddr;
|
||||
Word data_read = core_->dcache_read(memAddr, 4);
|
||||
D(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
|
||||
DP(3, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
|
||||
rddata[t] = data_read;
|
||||
}
|
||||
} else {
|
||||
D(3, "Executing vector load");
|
||||
D(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
|
||||
D(3, "dest: v" << rdest);
|
||||
D(3, "width" << instr.getVlsWidth());
|
||||
DP(3, "Executing vector load");
|
||||
DP(3, "lmul: " << vtype_.vlmul << " VLEN:" << (core_->arch().vsize() * 8) << "sew: " << vtype_.vsew);
|
||||
DP(3, "dest: v" << rdest);
|
||||
DP(3, "width" << instr.getVlsWidth());
|
||||
pipeline_state->mem_addrs.resize(vl_);
|
||||
auto &vd = vRegFile_.at(rdest);
|
||||
switch (instr.getVlsWidth()) {
|
||||
|
@ -574,9 +575,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
for (int i = 0; i < vl_; i++) {
|
||||
Word memAddr = ((rsdata[i][0]) & 0xFFFFFFFC) + (i * vtype_.vsew / 8);
|
||||
pipeline_state->mem_addrs.at(i) = memAddr;
|
||||
D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
Word data_read = core_->dcache_read(memAddr, 4);
|
||||
D(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
|
||||
DP(3, "Mem addr: " << std::hex << memAddr << " Data read " << data_read);
|
||||
int *result_ptr = (int *)(vd.data() + i);
|
||||
*result_ptr = data_read;
|
||||
}
|
||||
|
@ -590,7 +591,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
break;
|
||||
case (FS | VS):
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.store = 1;
|
||||
pipeline_state->lsu.type = LsuType::STORE;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
if (func3 == 0x2) {
|
||||
|
@ -601,20 +602,20 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
Word memAddr = rsdata[t][0] + immsrc;
|
||||
pipeline_state->mem_addrs.at(t) = memAddr;
|
||||
core_->dcache_write(memAddr, rsdata[t][1], 4);
|
||||
D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
}
|
||||
} else {
|
||||
pipeline_state->mem_addrs.resize(vl_);
|
||||
for (int i = 0; i < vl_; i++) {
|
||||
Word memAddr = rsdata[i][0] + (i * vtype_.vsew / 8);
|
||||
pipeline_state->mem_addrs.at(i) = memAddr;
|
||||
D(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
DP(3, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
switch (instr.getVlsWidth()) {
|
||||
case 6: {
|
||||
//store word and unit strided (not checking for unit stride)
|
||||
uint32_t value = *(uint32_t *)(vRegFile_.at(instr.getVs3()).data() + i);
|
||||
core_->dcache_write(memAddr, value, 4);
|
||||
D(3, "store: " << memAddr << " value:" << value);
|
||||
DP(3, "store: " << memAddr << " value:" << value);
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
|
@ -705,9 +706,9 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
} else {
|
||||
// FMV.X.W
|
||||
rddata[t] = rsdata[t][0];
|
||||
pipeline_state->fpu.type = FpuType::FNCP;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
}
|
||||
}
|
||||
pipeline_state->fpu.type = FpuType::FNCP;
|
||||
pipeline_state->used_fregs[rsrc0] = 1;
|
||||
break;
|
||||
case 0x50:
|
||||
switch(func3) {
|
||||
|
@ -783,132 +784,138 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
}
|
||||
rd_write = true;
|
||||
break;
|
||||
case GPGPU:
|
||||
pipeline_state->exe_type = ExeType::GPU;
|
||||
case GPGPU: {
|
||||
pipeline_state->exe_type = ExeType::GPU;
|
||||
int ts = 0;
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
// TMC
|
||||
pipeline_state->gpu.type = GpuType::TMC;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
if (rsrc1) {
|
||||
// predicate mode
|
||||
ThreadMask pred;
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
|
||||
}
|
||||
if (pred.any()) {
|
||||
tmask_ &= pred;
|
||||
}
|
||||
} else {
|
||||
tmask_.reset();
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
tmask_.set(i, rsdata.at(t)[0] & (1 << i));
|
||||
}
|
||||
}
|
||||
D(3, "*** TMC " << tmask_);
|
||||
active_ = tmask_.any();
|
||||
break; // runOnce
|
||||
} break;
|
||||
case 1: {
|
||||
// WSPAWN
|
||||
pipeline_state->gpu.type = GpuType::WSPAWN;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
int active_warps = std::min<int>(rsdata.at(t)[0], core_->arch().num_warps());
|
||||
D(3, "*** Spawning " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(t)[1]);
|
||||
for (int i = 1; i < active_warps; ++i) {
|
||||
Warp &newWarp = core_->warp(i);
|
||||
newWarp.setPC(rsdata[t][1]);
|
||||
newWarp.setTmask(0, true);
|
||||
}
|
||||
break; // runOnce
|
||||
} break;
|
||||
case 2: {
|
||||
// SPLIT
|
||||
pipeline_state->gpu.type = GpuType::SPLIT;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
|
||||
ThreadMask tmask;
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
|
||||
}
|
||||
|
||||
DomStackEntry e(tmask, nextPC);
|
||||
domStack_.push(tmask_);
|
||||
domStack_.push(e);
|
||||
for (size_t i = 0; i < e.tmask.size(); ++i) {
|
||||
tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
|
||||
}
|
||||
active_ = tmask_.any();
|
||||
|
||||
DPH(3, "*** Split: New TM=");
|
||||
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
|
||||
DPN(3, ", Pushed TM=");
|
||||
for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1));
|
||||
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
|
||||
} else {
|
||||
D(3, "*** Unanimous pred");
|
||||
DomStackEntry e(tmask_);
|
||||
e.unanimous = true;
|
||||
domStack_.push(e);
|
||||
}
|
||||
break; // runOnce
|
||||
} break;
|
||||
case 3: {
|
||||
// JOIN
|
||||
pipeline_state->gpu.type = GpuType::JOIN;
|
||||
pipeline_state->stall_warp = true;
|
||||
if (!domStack_.empty() && domStack_.top().unanimous) {
|
||||
D(3, "*** Uninimous branch at join");
|
||||
tmask_ = domStack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
domStack_.pop();
|
||||
} else {
|
||||
if (!domStack_.top().fallThrough) {
|
||||
nextPC = domStack_.top().PC;
|
||||
D(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
|
||||
}
|
||||
|
||||
tmask_ = domStack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
|
||||
DPH(3, "*** Join: New TM=");
|
||||
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
|
||||
DPN(3, "\n");
|
||||
|
||||
domStack_.pop();
|
||||
}
|
||||
break; // runOnce
|
||||
} break;
|
||||
case 4: {
|
||||
// BAR
|
||||
pipeline_state->gpu.type = GpuType::BAR;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
active_ = false;
|
||||
core_->barrier(rsdata[t][0], rsdata[t][1], id_);
|
||||
break; // runOnce
|
||||
} break;
|
||||
case 6: {
|
||||
// PREFETCH
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.prefetch = 1;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
int addr = rsdata[t][0];
|
||||
printf("*** PREFETCHED %d ***\n", addr);
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
if (tmask_.test(t)) {
|
||||
ts = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
// TMC
|
||||
pipeline_state->gpu.type = GpuType::TMC;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
if (rsrc1) {
|
||||
// predicate mode
|
||||
ThreadMask pred;
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
pred[i] = tmask_.test(i) ? (iRegFile_.at(i).at(rsrc0) != 0) : 0;
|
||||
}
|
||||
if (pred.any()) {
|
||||
tmask_ &= pred;
|
||||
}
|
||||
} else {
|
||||
tmask_.reset();
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
tmask_.set(i, rsdata.at(ts)[0] & (1 << i));
|
||||
}
|
||||
}
|
||||
DPH(3, "*** New TMC: ");
|
||||
for (int i = 0; i < num_threads; ++i)
|
||||
DPN(3, tmask_.test(num_threads-i-1));
|
||||
DPN(3, std::endl);
|
||||
|
||||
active_ = tmask_.any();
|
||||
} break;
|
||||
case 1: {
|
||||
// WSPAWN
|
||||
pipeline_state->gpu.type = GpuType::WSPAWN;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
|
||||
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
|
||||
for (int i = 1; i < active_warps; ++i) {
|
||||
Warp &newWarp = core_->warp(i);
|
||||
newWarp.setPC(rsdata[ts][1]);
|
||||
newWarp.setTmask(0, true);
|
||||
}
|
||||
} break;
|
||||
case 2: {
|
||||
// SPLIT
|
||||
pipeline_state->gpu.type = GpuType::SPLIT;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
if (HasDivergentThreads(tmask_, iRegFile_, rsrc0)) {
|
||||
ThreadMask tmask;
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
tmask[i] = tmask_.test(i) && !iRegFile_.at(i).at(rsrc0);
|
||||
}
|
||||
|
||||
DomStackEntry e(tmask, nextPC);
|
||||
domStack_.push(tmask_);
|
||||
domStack_.push(e);
|
||||
for (size_t i = 0; i < e.tmask.size(); ++i) {
|
||||
tmask_.set(i, !e.tmask.test(i) && tmask_.test(i));
|
||||
}
|
||||
active_ = tmask_.any();
|
||||
|
||||
DPH(3, "*** Split: New TM=");
|
||||
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
|
||||
DPN(3, ", Pushed TM=");
|
||||
for (int i = 0; i < num_threads; ++i) DPN(3, e.tmask.test(num_threads-i-1));
|
||||
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
|
||||
} else {
|
||||
DP(3, "*** Unanimous pred");
|
||||
DomStackEntry e(tmask_);
|
||||
e.unanimous = true;
|
||||
domStack_.push(e);
|
||||
}
|
||||
} break;
|
||||
case 3: {
|
||||
// JOIN
|
||||
pipeline_state->gpu.type = GpuType::JOIN;
|
||||
pipeline_state->stall_warp = true;
|
||||
if (!domStack_.empty() && domStack_.top().unanimous) {
|
||||
DP(3, "*** Uninimous branch at join");
|
||||
tmask_ = domStack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
domStack_.pop();
|
||||
} else {
|
||||
if (!domStack_.top().fallThrough) {
|
||||
nextPC = domStack_.top().PC;
|
||||
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
|
||||
}
|
||||
|
||||
tmask_ = domStack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
|
||||
DPH(3, "*** Join: New TM=");
|
||||
for (int i = 0; i < num_threads; ++i) DPN(3, tmask_.test(num_threads-i-1));
|
||||
DPN(3, "\n");
|
||||
|
||||
domStack_.pop();
|
||||
}
|
||||
} break;
|
||||
case 4: {
|
||||
// BAR
|
||||
pipeline_state->gpu.type = GpuType::BAR;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
pipeline_state->used_iregs[rsrc1] = 1;
|
||||
pipeline_state->stall_warp = true;
|
||||
active_ = false;
|
||||
core_->barrier(rsdata[ts][0], rsdata[ts][1], id_);
|
||||
} break;
|
||||
case 6: {
|
||||
// PREFETCH
|
||||
pipeline_state->exe_type = ExeType::LSU;
|
||||
pipeline_state->lsu.type = LsuType::PREFETCH;
|
||||
pipeline_state->used_iregs[rsrc0] = 1;
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
int addr = rsdata[t][0];
|
||||
printf("*** PREFETCHED %d ***\n", addr);
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
} break;
|
||||
case VSET: {
|
||||
int VLEN = core_->arch().vsize() * 8;
|
||||
int VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
|
||||
|
@ -928,7 +935,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = first + second;
|
||||
D(3, "Adding " << first << " + " << second << " = " << result);
|
||||
DP(3, "Adding " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -940,7 +947,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first = *(uint16_t *)(vr1.data() + i);
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = first + second;
|
||||
D(3, "Adding " << first << " + " << second << " = " << result);
|
||||
DP(3, "Adding " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -952,7 +959,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first = *(uint32_t *)(vr1.data() + i);
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = first + second;
|
||||
D(3, "Adding " << first << " + " << second << " = " << result);
|
||||
DP(3, "Adding " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -968,7 +975,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = (first == second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 16) {
|
||||
|
@ -976,7 +983,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first = *(uint16_t *)(vr1.data() + i);
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = (first == second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 32) {
|
||||
|
@ -984,7 +991,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first = *(uint32_t *)(vr1.data() + i);
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = (first == second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -999,7 +1006,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = (first != second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 16) {
|
||||
|
@ -1007,7 +1014,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first = *(uint16_t *)(vr1.data() + i);
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = (first != second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 32) {
|
||||
|
@ -1015,7 +1022,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first = *(uint32_t *)(vr1.data() + i);
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = (first != second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -1030,7 +1037,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = (first < second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 16) {
|
||||
|
@ -1038,7 +1045,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first = *(uint16_t *)(vr1.data() + i);
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = (first < second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 32) {
|
||||
|
@ -1046,7 +1053,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first = *(uint32_t *)(vr1.data() + i);
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = (first < second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -1061,7 +1068,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
int8_t first = *(int8_t *)(vr1.data() + i);
|
||||
int8_t second = *(int8_t *)(vr2.data() + i);
|
||||
int8_t result = (first < second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 16) {
|
||||
|
@ -1069,7 +1076,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
int16_t first = *(int16_t *)(vr1.data() + i);
|
||||
int16_t second = *(int16_t *)(vr2.data() + i);
|
||||
int16_t result = (first < second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(int16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 32) {
|
||||
|
@ -1077,7 +1084,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
int32_t first = *(int32_t *)(vr1.data() + i);
|
||||
int32_t second = *(int32_t *)(vr2.data() + i);
|
||||
int32_t result = (first < second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(int32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -1092,7 +1099,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = (first <= second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 16) {
|
||||
|
@ -1100,7 +1107,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first = *(uint16_t *)(vr1.data() + i);
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = (first <= second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 32) {
|
||||
|
@ -1108,7 +1115,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first = *(uint32_t *)(vr1.data() + i);
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = (first <= second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -1123,7 +1130,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
int8_t first = *(int8_t *)(vr1.data() + i);
|
||||
int8_t second = *(int8_t *)(vr2.data() + i);
|
||||
int8_t result = (first <= second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 16) {
|
||||
|
@ -1131,7 +1138,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
int16_t first = *(int16_t *)(vr1.data() + i);
|
||||
int16_t second = *(int16_t *)(vr2.data() + i);
|
||||
int16_t result = (first <= second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(int16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 32) {
|
||||
|
@ -1139,7 +1146,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
int32_t first = *(int32_t *)(vr1.data() + i);
|
||||
int32_t second = *(int32_t *)(vr2.data() + i);
|
||||
int32_t result = (first <= second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(int32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -1154,7 +1161,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = (first > second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 16) {
|
||||
|
@ -1162,7 +1169,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first = *(uint16_t *)(vr1.data() + i);
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = (first > second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 32) {
|
||||
|
@ -1170,7 +1177,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first = *(uint32_t *)(vr1.data() + i);
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = (first > second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -1185,7 +1192,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
int8_t first = *(int8_t *)(vr1.data() + i);
|
||||
int8_t second = *(int8_t *)(vr2.data() + i);
|
||||
int8_t result = (first > second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 16) {
|
||||
|
@ -1193,7 +1200,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
int16_t first = *(int16_t *)(vr1.data() + i);
|
||||
int16_t second = *(int16_t *)(vr2.data() + i);
|
||||
int16_t result = (first > second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(int16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
} else if (vtype_.vsew == 32) {
|
||||
|
@ -1201,7 +1208,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
int32_t first = *(int32_t *)(vr1.data() + i);
|
||||
int32_t second = *(int32_t *)(vr2.data() + i);
|
||||
int32_t result = (first > second) ? 1 : 0;
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(int32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
}
|
||||
|
@ -1222,7 +1229,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first_value = (first & 0x1);
|
||||
uint8_t second_value = (second & 0x1);
|
||||
uint8_t result = (first_value & !second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1235,7 +1242,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first_value = (first & 0x1);
|
||||
uint16_t second_value = (second & 0x1);
|
||||
uint16_t result = (first_value & !second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1248,7 +1255,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first_value = (first & 0x1);
|
||||
uint32_t second_value = (second & 0x1);
|
||||
uint32_t result = (first_value & !second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1268,7 +1275,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first_value = (first & 0x1);
|
||||
uint8_t second_value = (second & 0x1);
|
||||
uint8_t result = (first_value & second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1281,7 +1288,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first_value = (first & 0x1);
|
||||
uint16_t second_value = (second & 0x1);
|
||||
uint16_t result = (first_value & second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1294,7 +1301,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first_value = (first & 0x1);
|
||||
uint32_t second_value = (second & 0x1);
|
||||
uint32_t result = (first_value & second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1314,7 +1321,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first_value = (first & 0x1);
|
||||
uint8_t second_value = (second & 0x1);
|
||||
uint8_t result = (first_value | second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1327,7 +1334,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first_value = (first & 0x1);
|
||||
uint16_t second_value = (second & 0x1);
|
||||
uint16_t result = (first_value | second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1340,7 +1347,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first_value = (first & 0x1);
|
||||
uint32_t second_value = (second & 0x1);
|
||||
uint32_t result = (first_value | second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1360,7 +1367,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first_value = (first & 0x1);
|
||||
uint8_t second_value = (second & 0x1);
|
||||
uint8_t result = (first_value ^ second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1373,7 +1380,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first_value = (first & 0x1);
|
||||
uint16_t second_value = (second & 0x1);
|
||||
uint16_t result = (first_value ^ second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1386,7 +1393,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first_value = (first & 0x1);
|
||||
uint32_t second_value = (second & 0x1);
|
||||
uint32_t result = (first_value ^ second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1406,7 +1413,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first_value = (first & 0x1);
|
||||
uint8_t second_value = (second & 0x1);
|
||||
uint8_t result = (first_value | !second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1419,7 +1426,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first_value = (first & 0x1);
|
||||
uint16_t second_value = (second & 0x1);
|
||||
uint16_t result = (first_value | !second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1432,7 +1439,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first_value = (first & 0x1);
|
||||
uint32_t second_value = (second & 0x1);
|
||||
uint32_t result = (first_value | !second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1452,7 +1459,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first_value = (first & 0x1);
|
||||
uint8_t second_value = (second & 0x1);
|
||||
uint8_t result = !(first_value & second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1465,7 +1472,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first_value = (first & 0x1);
|
||||
uint16_t second_value = (second & 0x1);
|
||||
uint16_t result = !(first_value & second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1478,7 +1485,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first_value = (first & 0x1);
|
||||
uint32_t second_value = (second & 0x1);
|
||||
uint32_t result = !(first_value & second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1498,7 +1505,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first_value = (first & 0x1);
|
||||
uint8_t second_value = (second & 0x1);
|
||||
uint8_t result = !(first_value | second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1511,7 +1518,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first_value = (first & 0x1);
|
||||
uint16_t second_value = (second & 0x1);
|
||||
uint16_t result = !(first_value | second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1524,7 +1531,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first_value = (first & 0x1);
|
||||
uint32_t second_value = (second & 0x1);
|
||||
uint32_t result = !(first_value | second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1544,7 +1551,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first_value = (first & 0x1);
|
||||
uint8_t second_value = (second & 0x1);
|
||||
uint8_t result = !(first_value ^ second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1557,7 +1564,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first_value = (first & 0x1);
|
||||
uint16_t second_value = (second & 0x1);
|
||||
uint16_t result = !(first_value ^ second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1570,7 +1577,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first_value = (first & 0x1);
|
||||
uint32_t second_value = (second & 0x1);
|
||||
uint32_t result = !(first_value ^ second_value);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1588,7 +1595,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = (first * second);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1599,7 +1606,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first = *(uint16_t *)(vr1.data() + i);
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = (first * second);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1610,7 +1617,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first = *(uint32_t *)(vr1.data() + i);
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = (first * second);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1628,7 +1635,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint8_t first = *(uint8_t *)(vr1.data() + i);
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = (first * second);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) += result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1639,7 +1646,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint16_t first = *(uint16_t *)(vr1.data() + i);
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = (first * second);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) += result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1650,7 +1657,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
uint32_t first = *(uint32_t *)(vr1.data() + i);
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = (first * second);
|
||||
D(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << first << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) += result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1669,7 +1676,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = (rsdata[i][0] + second);
|
||||
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1679,7 +1686,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
for (int i = 0; i < vl_; i++) {
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = (rsdata[i][0] + second);
|
||||
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1689,7 +1696,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
for (int i = 0; i < vl_; i++) {
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = (rsdata[i][0] + second);
|
||||
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1705,7 +1712,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
for (int i = 0; i < vl_; i++) {
|
||||
uint8_t second = *(uint8_t *)(vr2.data() + i);
|
||||
uint8_t result = (rsdata[i][0] * second);
|
||||
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
*(uint8_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1715,7 +1722,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
for (int i = 0; i < vl_; i++) {
|
||||
uint16_t second = *(uint16_t *)(vr2.data() + i);
|
||||
uint16_t result = (rsdata[i][0] * second);
|
||||
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
*(uint16_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1725,7 +1732,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
for (int i = 0; i < vl_; i++) {
|
||||
uint32_t second = *(uint32_t *)(vr2.data() + i);
|
||||
uint32_t result = (rsdata[i][0] * second);
|
||||
D(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
DP(3, "Comparing " << rsdata[i][0] << " + " << second << " = " << result);
|
||||
*(uint32_t *)(vd.data() + i) = result;
|
||||
}
|
||||
for (int i = vl_; i < VLMAX; i++) {
|
||||
|
@ -1741,7 +1748,7 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
vtype_.vsew = instr.getVsew();
|
||||
vtype_.vlmul = instr.getVlmul();
|
||||
|
||||
D(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX);
|
||||
DP(3, "lmul:" << vtype_.vlmul << " sew:" << vtype_.vsew << " ediv: " << vtype_.vediv << "rsrc_" << rsdata[0][0] << "VLMAX" << VLMAX);
|
||||
|
||||
int s0 = rsdata[0][0];
|
||||
if (s0 <= VLMAX) {
|
||||
|
@ -1762,46 +1769,49 @@ void Warp::execute(const Instr &instr, pipeline_state_t *pipeline_state) {
|
|||
}
|
||||
|
||||
if (rd_write) {
|
||||
pipeline_state->wb = true;
|
||||
DPH(2, "Dest Reg: ");
|
||||
int rdt = instr.getRDType();
|
||||
auto rdt = instr.getRDType();
|
||||
switch (rdt) {
|
||||
case 1:
|
||||
case RegType::Integer:
|
||||
if (rdest) {
|
||||
DPH(2, "r" << std::dec << rdest << "={");
|
||||
DPN(2, "r" << std::dec << rdest << "={");
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
iRegFile_.at(t)[rdest] = rddata[t];
|
||||
if (t) DPN(2, ", ");
|
||||
if (!tmask_.test(t)) {
|
||||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
iRegFile_.at(t)[rdest] = rddata[t];
|
||||
DPN(2, "0x" << std::hex << rddata[t]);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
pipeline_state->used_iregs[rdest] = 1;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
DPH(2, "fr" << std::dec << rdest << "={");
|
||||
case RegType::Float:
|
||||
DPN(2, "fr" << std::dec << rdest << "={");
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
fRegFile_.at(t)[rdest] = rddata[t];
|
||||
if (t) DPN(2, ", ");
|
||||
if (!tmask_.test(t)) {
|
||||
DPN(2, "-");
|
||||
continue;
|
||||
}
|
||||
fRegFile_.at(t)[rdest] = rddata[t];
|
||||
DPN(2, "0x" << std::hex << rddata[t]);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
pipeline_state->used_fregs[rdest] = 1;
|
||||
break;
|
||||
case 3:
|
||||
pipeline_state->used_vregs[rdest] = 1;
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
PC_ += core_->arch().wsize();
|
||||
if (PC_ != nextPC) {
|
||||
D(3, "*** Next PC: " << std::hex << nextPC << std::dec);
|
||||
DP(3, "*** Next PC: " << std::hex << nextPC << std::dec);
|
||||
PC_ = nextPC;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,6 +9,17 @@
|
|||
|
||||
using namespace vortex;
|
||||
|
||||
// Constructs the NOP unit. The Core* argument is unnamed and therefore unused
// here; the ExeUnit base is initialized with the string "NOP".
NopUnit::NopUnit(Core*) : ExeUnit("NOP") {}
|
||||
|
||||
// Performs one step of the NOP unit: takes at most one pipeline state from
// inputs_ and hands it, unmodified, to schedule_output(). The cycle argument
// is ignored (its name is commented out).
void NopUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
// Nothing queued on the input side: nothing to do this step.
if (!inputs_.try_pop(&state))
|
||||
return;
|
||||
// NOTE(review): the second argument (1) is presumably a delay in cycles - confirm against ExeUnit::schedule_output.
this->schedule_output(state, 1);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
LsuUnit::LsuUnit(Core* core)
|
||||
: ExeUnit("LSU")
|
||||
, core_(core)
|
||||
|
@ -17,61 +28,77 @@ LsuUnit::LsuUnit(Core* core)
|
|||
, fence_lock_(false)
|
||||
{}
|
||||
|
||||
// Handles one dcache response for the pending request identified by
// response.tag: clears bit `port_id` in the entry's mask and, once no bits
// remain set, stores the elapsed cycle count in dcache_latency, schedules the
// pipeline state for output and releases the pending entry.
//
// NOTE(review): `entry` is declared with plain `auto`. If pending_dcache_.at()
// returns a reference, this is a copy and the reset() below would not persist
// in pending_dcache_ - confirm against the container's at() signature.
void LsuUnit::handleCacheReponse(const MemRsp& response, uint32_t port_id) {
|
||||
auto entry = pending_dcache_.at(response.tag);
|
||||
entry.second.reset(port_id); // track remaining blocks
|
||||
// No bits left set in the mask: every outstanding response has arrived.
if (!entry.second.any()) {
|
||||
// dcache_latency is read here as a start timestamp and overwritten below with the elapsed cycles.
auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
|
||||
entry.first.dcache_latency = latency;
|
||||
this->schedule_output(entry.first, 1);
|
||||
pending_dcache_.release(response.tag);
|
||||
}
|
||||
}
|
||||
void LsuUnit::step(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
// handle dcache response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
MemRsp mem_rsp;
|
||||
if (!core_->dcache_->CoreRspPorts.at(t).read(&mem_rsp))
|
||||
continue;
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
DT(3, cycle, "dcache-rsp: addr=" << std::hex << entry.first.mem_addrs.at(t) << ", tag=" << mem_rsp.tag << ", type=" << entry.first.lsu.type << ", tid=" << t << ", " << entry.first);
|
||||
assert(entry.second.test(t));
|
||||
entry.second.reset(t); // track remaining blocks
|
||||
if (!entry.second.any()) {
|
||||
auto latency = (SimPlatform::instance().cycles() - entry.first.dcache_latency);
|
||||
entry.first.dcache_latency = latency;
|
||||
this->schedule_output(entry.first, 1);
|
||||
pending_dcache_.release(mem_rsp.tag);
|
||||
}
|
||||
}
|
||||
|
||||
void LsuUnit::step() {
|
||||
if (fence_lock_) {
|
||||
// wait for all pending memory operations to complete
|
||||
if (!pending_dcache_.empty())
|
||||
return;
|
||||
this->schedule_output(fence_state_, 1);
|
||||
fence_lock_ = false;
|
||||
DT(3, cycle, "fence-unlock: " << fence_state_);
|
||||
}
|
||||
|
||||
// check input queue
|
||||
if (inputs_.empty())
|
||||
return;
|
||||
|
||||
auto state = inputs_.top();
|
||||
|
||||
if (state.lsu.fence) {
|
||||
if (state.lsu.type == LsuType::FENCE) {
|
||||
// schedule fence lock
|
||||
fence_state_ = state;
|
||||
fence_lock_ = true;
|
||||
inputs_.pop();
|
||||
DT(3, cycle, "fence-lock: " << state);
|
||||
return;
|
||||
}
|
||||
|
||||
// send dcache requests
|
||||
if (!pending_dcache_.full()) {
|
||||
state.dcache_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_dcache_.allocate({state, state.tmask});
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!state.tmask.test(t))
|
||||
continue;
|
||||
MemReq mem_req;
|
||||
mem_req.addr = state.mem_addrs.at(t);
|
||||
mem_req.write = state.lsu.store;
|
||||
mem_req.tag = tag;
|
||||
core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
|
||||
}
|
||||
inputs_.pop();
|
||||
// check pending queue capacity
|
||||
if (pending_dcache_.full()) {
|
||||
DT(3, cycle, "*** lsu-queue-stall: " << state);
|
||||
return;
|
||||
}
|
||||
|
||||
// send dcache request
|
||||
state.dcache_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_dcache_.allocate({state, state.tmask});
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!state.tmask.test(t))
|
||||
continue;
|
||||
MemReq mem_req;
|
||||
mem_req.addr = state.mem_addrs.at(t);
|
||||
mem_req.write = (state.lsu.type == LsuType::STORE);
|
||||
mem_req.tag = tag;
|
||||
core_->dcache_->CoreReqPorts.at(t).send(mem_req, 1);
|
||||
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", type=" << state.lsu.type << ", tid=" << t << ", " << state);
|
||||
}
|
||||
inputs_.pop();
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Constructs the ALU unit. The Core* argument is unnamed and therefore unused
// here; the ExeUnit base is initialized with the string "ALU".
AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
|
||||
|
||||
void AluUnit::step() {
|
||||
void AluUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
return;
|
||||
|
@ -95,7 +122,7 @@ void AluUnit::step() {
|
|||
|
||||
// Constructs the CSR unit. The Core* argument is unnamed and therefore unused
// here; the ExeUnit base is initialized with the string "CSR".
CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
|
||||
|
||||
void CsrUnit::step() {
|
||||
void CsrUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
return;
|
||||
|
@ -106,7 +133,7 @@ void CsrUnit::step() {
|
|||
|
||||
// Constructs the FPU unit. The Core* argument is unnamed and therefore unused
// here; the ExeUnit base is initialized with the string "FPU".
FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
|
||||
|
||||
void FpuUnit::step() {
|
||||
void FpuUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
return;
|
||||
|
@ -133,7 +160,7 @@ void FpuUnit::step() {
|
|||
|
||||
// Constructs the GPU unit. The Core* argument is unnamed and therefore unused
// here; the ExeUnit base is initialized with the string "GPU".
GpuUnit::GpuUnit(Core*) : ExeUnit("GPU") {}
|
||||
|
||||
void GpuUnit::step() {
|
||||
void GpuUnit::step(uint64_t /*cycle*/) {
|
||||
pipeline_state_t state;
|
||||
if (!inputs_.try_pop(&state))
|
||||
return;
|
||||
|
|
|
@ -43,7 +43,16 @@ public:
|
|||
return outputs_.try_pop(state);
|
||||
}
|
||||
|
||||
virtual void step() = 0;
|
||||
virtual void step(uint64_t cycle) = 0;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Execution unit for NOP instructions, publicly derived from ExeUnit.
// Declares only a constructor and a step() member; no data members of its own.
class NopUnit : public ExeUnit {
|
||||
public:
|
||||
// Takes a Core pointer; the parameter is unnamed in this declaration.
NopUnit(Core*);
|
||||
|
||||
// Per-step entry point, called with a cycle value.
// NOTE(review): presumably implements ExeUnit's pure-virtual step(uint64_t) - confirm, and consider marking it `override`.
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -59,9 +68,7 @@ private:
|
|||
public:
|
||||
LsuUnit(Core*);
|
||||
|
||||
void handleCacheReponse(const MemRsp& response, uint32_t port_id);
|
||||
|
||||
void step();
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -70,7 +77,7 @@ class AluUnit : public ExeUnit {
|
|||
public:
|
||||
AluUnit(Core*);
|
||||
|
||||
void step();
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -79,7 +86,7 @@ class CsrUnit : public ExeUnit {
|
|||
public:
|
||||
CsrUnit(Core*);
|
||||
|
||||
void step();
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -88,7 +95,7 @@ class FpuUnit : public ExeUnit {
|
|||
public:
|
||||
FpuUnit(Core*);
|
||||
|
||||
void step();
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -97,7 +104,7 @@ class GpuUnit : public ExeUnit {
|
|||
public:
|
||||
GpuUnit(Core*);
|
||||
|
||||
void step();
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
||||
}
|
|
@ -53,22 +53,23 @@ public:
|
|||
: opcode_(Opcode::NOP)
|
||||
, num_rsrcs_(0)
|
||||
, has_imm_(false)
|
||||
, rdest_type_(RegType::None)
|
||||
, rdest_(0)
|
||||
, func3_(0)
|
||||
, func7_(0) {
|
||||
for (int i = 0; i < MAX_REG_SOURCES; ++i) {
|
||||
rsrc_type_[i] = 0;
|
||||
rsrc_type_[i] = RegType::None;
|
||||
}
|
||||
}
|
||||
|
||||
/* Setters used to "craft" the instruction. */
|
||||
void setOpcode(Opcode opcode) { opcode_ = opcode; }
|
||||
void setDestReg(int destReg) { rdest_type_ = 1; rdest_ = destReg; }
|
||||
void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = 1; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setDestFReg(int destReg) { rdest_type_ = 2; rdest_ = destReg; }
|
||||
void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = 2; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setDestVReg(int destReg) { rdest_type_ = 3; rdest_ = destReg; }
|
||||
void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = 3; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setDestReg(int destReg) { rdest_type_ = RegType::Integer; rdest_ = destReg; }
|
||||
void setSrcReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Integer; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setDestFReg(int destReg) { rdest_type_ = RegType::Float; rdest_ = destReg; }
|
||||
void setSrcFReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Float; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setDestVReg(int destReg) { rdest_type_ = RegType::Vector; rdest_ = destReg; }
|
||||
void setSrcVReg(int srcReg) { rsrc_type_[num_rsrcs_] = RegType::Vector; rsrc_[num_rsrcs_++] = srcReg; }
|
||||
void setFunc3(Word func3) { func3_ = func3; }
|
||||
void setFunc7(Word func7) { func7_ = func7; }
|
||||
void setImm(Word imm) { has_imm_ = true; imm_ = imm; }
|
||||
|
@ -89,9 +90,9 @@ public:
|
|||
Word getFunc7() const { return func7_; }
|
||||
int getNRSrc() const { return num_rsrcs_; }
|
||||
int getRSrc(int i) const { return rsrc_[i]; }
|
||||
int getRSType(int i) const { return rsrc_type_[i]; }
|
||||
RegType getRSType(int i) const { return rsrc_type_[i]; }
|
||||
int getRDest() const { return rdest_; }
|
||||
int getRDType() const { return rdest_type_; }
|
||||
RegType getRDType() const { return rdest_type_; }
|
||||
bool hasImm() const { return has_imm_; }
|
||||
Word getImm() const { return imm_; }
|
||||
Word getVlsWidth() const { return vlsWidth_; }
|
||||
|
@ -112,15 +113,15 @@ private:
|
|||
Opcode opcode_;
|
||||
int num_rsrcs_;
|
||||
bool has_imm_;
|
||||
int rdest_type_;
|
||||
RegType rdest_type_;
|
||||
Word imm_;
|
||||
int rsrc_type_[MAX_REG_SOURCES];
|
||||
RegType rsrc_type_[MAX_REG_SOURCES];
|
||||
int rsrc_[MAX_REG_SOURCES];
|
||||
int rdest_;
|
||||
Word func3_;
|
||||
Word func6_;
|
||||
|
||||
//Vector
|
||||
// Vector
|
||||
Word vmask_;
|
||||
Word vlsWidth_;
|
||||
Word vMop_;
|
||||
|
|
|
@ -6,12 +6,15 @@
|
|||
#include <stdlib.h>
|
||||
#include <sys/stat.h>
|
||||
#include "processor.h"
|
||||
#include <util.h>
|
||||
#include "args.h"
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
int ret;
|
||||
int exitcode;
|
||||
|
||||
std::string archStr("rv32imf");
|
||||
std::string imgFileName;
|
||||
|
@ -53,11 +56,42 @@ int main(int argc, char **argv) {
|
|||
|
||||
{
|
||||
ArchDef arch(archStr, num_cores, num_warps, num_threads);
|
||||
|
||||
Processor processor(arch);
|
||||
ret = processor.run(imgFileName, riscv_test, showStats);
|
||||
|
||||
RAM ram(RAM_PAGE_SIZE);
|
||||
|
||||
{
|
||||
std::string program_ext(fileExtension(imgFileName.c_str()));
|
||||
if (program_ext == "bin") {
|
||||
ram.loadBinImage(imgFileName.c_str(), STARTUP_ADDR);
|
||||
} else if (program_ext == "hex") {
|
||||
ram.loadHexImage(imgFileName.c_str());
|
||||
} else {
|
||||
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
processor.attach_ram(&ram);
|
||||
|
||||
exitcode = processor.run();
|
||||
|
||||
if (riscv_test) {
|
||||
if (1 == exitcode) {
|
||||
std::cout << "Passed." << std::endl;
|
||||
exitcode = 0;
|
||||
} else {
|
||||
std::cout << "Failed." << std::endl;
|
||||
}
|
||||
} else {
|
||||
if (exitcode != 0) {
|
||||
std::cout << "*** error: exitcode=" << exitcode << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
SimPlatform::instance().finalize();
|
||||
|
||||
return ret;
|
||||
return exitcode;
|
||||
}
|
||||
|
|
|
@ -8,32 +8,26 @@ using namespace vortex;
|
|||
class MemSim::Impl {
|
||||
private:
|
||||
MemSim* simobject_;
|
||||
std::vector<std::queue<MemReq>> inputs_;
|
||||
uint32_t num_banks_;
|
||||
uint32_t latency_;
|
||||
|
||||
public:
|
||||
Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency)
|
||||
: simobject_(simobject)
|
||||
, inputs_(num_banks)
|
||||
, num_banks_(num_banks)
|
||||
, latency_(latency)
|
||||
{}
|
||||
|
||||
void handleMemRequest(const MemReq& mem_req, uint32_t port_id) {
|
||||
inputs_.at(port_id).push(mem_req);
|
||||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
for (uint32_t i = 0, n = inputs_.size(); i < n; ++i) {
|
||||
auto& queue = inputs_.at(i);
|
||||
if (queue.empty())
|
||||
for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
|
||||
MemReq mem_req;
|
||||
if (!simobject_->MemReqPorts.at(i).read(&mem_req))
|
||||
continue;
|
||||
auto& entry = queue.front();
|
||||
if (!entry.write) {
|
||||
if (!mem_req.write) {
|
||||
MemRsp mem_rsp;
|
||||
mem_rsp.tag = entry.tag;
|
||||
mem_rsp.tag = mem_req.tag;
|
||||
simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
|
||||
}
|
||||
queue.pop();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -45,7 +39,7 @@ MemSim::MemSim(const SimContext& ctx,
|
|||
uint32_t latency)
|
||||
: SimObject<MemSim>(ctx, "MemSim")
|
||||
, impl_(new Impl(this, num_banks, latency))
|
||||
, MemReqPorts(num_banks, {this, impl_, &Impl::handleMemRequest})
|
||||
, MemReqPorts(num_banks, this)
|
||||
, MemRspPorts(num_banks, this)
|
||||
{}
|
||||
|
||||
|
|
|
@ -10,14 +10,19 @@
|
|||
namespace vortex {
|
||||
|
||||
struct pipeline_state_t {
|
||||
//--
|
||||
//--
|
||||
uint64_t id;
|
||||
|
||||
//--
|
||||
int cid;
|
||||
int wid;
|
||||
ThreadMask tmask;
|
||||
Word PC;
|
||||
|
||||
//--
|
||||
bool stall_warp;
|
||||
int rdest_type;
|
||||
bool wb;
|
||||
RegType rdest_type;
|
||||
int rdest;
|
||||
RegMask used_iregs;
|
||||
RegMask used_fregs;
|
||||
|
@ -30,10 +35,7 @@ struct pipeline_state_t {
|
|||
//--
|
||||
union {
|
||||
struct {
|
||||
uint8_t load : 1;
|
||||
uint8_t store: 1;
|
||||
uint8_t fence : 1;
|
||||
uint8_t prefetch: 1;
|
||||
LsuType type;
|
||||
} lsu;
|
||||
struct {
|
||||
AluType type;
|
||||
|
@ -49,8 +51,37 @@ struct pipeline_state_t {
|
|||
// stats
|
||||
uint64_t icache_latency;
|
||||
uint64_t dcache_latency;
|
||||
|
||||
void clear() {
|
||||
cid = 0;
|
||||
wid = 0;
|
||||
tmask.reset();
|
||||
PC = 0;
|
||||
stall_warp = false;
|
||||
wb = false;
|
||||
rdest = 0;
|
||||
rdest_type = RegType::None;
|
||||
used_iregs.reset();
|
||||
used_fregs.reset();
|
||||
used_vregs.reset();
|
||||
exe_type = ExeType::NOP;
|
||||
mem_addrs.clear();
|
||||
icache_latency = 0;
|
||||
dcache_latency = 0;
|
||||
}
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
|
||||
os << "coreid=" << state.cid << ", wid=" << state.wid << ", PC=" << std::hex << state.PC;
|
||||
os << ", wb=" << state.wb;
|
||||
if (state.wb) {
|
||||
os << ", rd=" << state.rdest_type << std::dec << state.rdest;
|
||||
}
|
||||
os << ", ex=" << state.exe_type;
|
||||
os << " (#" << std::dec << state.id << ")";
|
||||
return os;
|
||||
}
|
||||
|
||||
class PipelineStage : public Queue<pipeline_state_t> {
|
||||
protected:
|
||||
const char* name_;
|
||||
|
@ -62,15 +93,4 @@ public:
|
|||
{}
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const pipeline_state_t& state) {
|
||||
os << "stall_warp=" << state.stall_warp;
|
||||
os << ", wid=" << state.wid;
|
||||
os << ", PC=" << std::hex << state.PC;
|
||||
os << ", used_iregs=" << state.used_iregs;
|
||||
os << ", used_fregs=" << state.used_fregs;
|
||||
os << ", used_vregs=" << state.used_vregs;
|
||||
os << std::endl;
|
||||
return os;
|
||||
}
|
||||
|
||||
}
|
141
sim/simX/processor.cpp
Normal file
141
sim/simX/processor.cpp
Normal file
|
@ -0,0 +1,141 @@
|
|||
#include "processor.h"
|
||||
#include "constants.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
Processor::Processor(const ArchDef& arch)
|
||||
: cores_(arch.num_cores())
|
||||
, l2caches_(NUM_CLUSTERS)
|
||||
, l2_mem_switches_(NUM_CLUSTERS)
|
||||
{
|
||||
uint32_t num_cores = arch.num_cores();
|
||||
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
|
||||
|
||||
// create cores
|
||||
for (uint32_t i = 0; i < num_cores; ++i) {
|
||||
cores_.at(i) = Core::Create(arch, i);
|
||||
}
|
||||
|
||||
// connect memory sub-system
|
||||
memsim_ = MemSim::Create(1, MEM_LATENCY);
|
||||
std::vector<SlavePort<MemReq>*> mem_req_ports(1);
|
||||
std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
|
||||
mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
|
||||
mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
|
||||
|
||||
if (L3_ENABLE) {
|
||||
l3cache_ = Cache::Create("l3cache", CacheConfig{
|
||||
log2ceil(L3_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
2, // W
|
||||
0, // A
|
||||
32, // address bits
|
||||
L3_NUM_BANKS, // number of banks
|
||||
L3_NUM_PORTS, // number of ports
|
||||
NUM_CLUSTERS, // request size
|
||||
true, // write-through
|
||||
0, // victim size
|
||||
L3_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
}
|
||||
);
|
||||
|
||||
mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
|
||||
l3cache_->MemReqPort.bind(mem_req_ports.at(0));
|
||||
|
||||
mem_req_ports.resize(NUM_CLUSTERS);
|
||||
mem_rsp_ports.resize(NUM_CLUSTERS);
|
||||
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
|
||||
mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
|
||||
mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
|
||||
}
|
||||
} else if (NUM_CLUSTERS > 1) {
|
||||
l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
|
||||
mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
|
||||
l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));
|
||||
|
||||
mem_req_ports.resize(NUM_CLUSTERS);
|
||||
mem_rsp_ports.resize(NUM_CLUSTERS);
|
||||
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
|
||||
mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
|
||||
mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
|
||||
if (L2_ENABLE) {
|
||||
auto& l2cache = l2caches_.at(i);
|
||||
l2cache = Cache::Create("l2cache", CacheConfig{
|
||||
log2ceil(L2_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
2, // W
|
||||
0, // A
|
||||
32, // address bits
|
||||
L2_NUM_BANKS, // number of banks
|
||||
L2_NUM_PORTS, // number of ports
|
||||
NUM_CORES, // request size
|
||||
true, // write-through
|
||||
0, // victim size
|
||||
L2_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
});
|
||||
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
|
||||
l2cache->MemReqPort.bind(mem_req_ports.at(i));
|
||||
|
||||
mem_req_ports.resize(cores_per_cluster);
|
||||
mem_rsp_ports.resize(cores_per_cluster);
|
||||
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
|
||||
mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
|
||||
mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
|
||||
}
|
||||
} else if (cores_per_cluster > 1) {
|
||||
auto& l2_mem_switch = l2_mem_switches_.at(i);
|
||||
l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
|
||||
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
|
||||
l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
|
||||
|
||||
mem_req_ports.resize(cores_per_cluster);
|
||||
mem_rsp_ports.resize(cores_per_cluster);
|
||||
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
|
||||
mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
|
||||
mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
|
||||
auto& core = cores_.at((i * NUM_CLUSTERS) + j);
|
||||
mem_rsp_ports.at(i)->bind(&core->MemRspPort);
|
||||
core->MemReqPort.bind(mem_req_ports.at(j));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Processor::attach_ram(RAM* ram) {
|
||||
for (auto core : cores_) {
|
||||
core->attach_ram(ram);
|
||||
}
|
||||
}
|
||||
|
||||
Processor::~Processor() {}
|
||||
|
||||
int Processor::run() {
|
||||
bool running;
|
||||
int exitcode = 0;
|
||||
do {
|
||||
SimPlatform::instance().step();
|
||||
|
||||
running = false;
|
||||
for (auto& core : cores_) {
|
||||
if (core->running()) {
|
||||
running = true;
|
||||
}
|
||||
if (core->check_ebreak()) {
|
||||
exitcode = core->getIRegValue(3);
|
||||
running = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (running);
|
||||
|
||||
return exitcode;
|
||||
}
|
|
@ -1,189 +1,27 @@
|
|||
#pragma once
|
||||
|
||||
#include "constants.h"
|
||||
#include "debug.h"
|
||||
#include "types.h"
|
||||
#include "core.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Processor {
|
||||
public:
|
||||
typedef std::shared_ptr<Processor> Ptr;
|
||||
|
||||
Processor(const ArchDef& arch);
|
||||
~Processor();
|
||||
|
||||
void attach_ram(RAM* mem);
|
||||
|
||||
int run();
|
||||
|
||||
private:
|
||||
ArchDef arch_;
|
||||
Decoder decoder_;
|
||||
MemoryUnit mu_;
|
||||
RAM ram_;
|
||||
std::vector<Core::Ptr> cores_;
|
||||
std::vector<Cache::Ptr> l2caches_;
|
||||
std::vector<Switch<MemReq, MemRsp>::Ptr> l2_mem_switches_;
|
||||
Cache::Ptr l3cache_;
|
||||
Switch<MemReq, MemRsp>::Ptr l3_mem_switch_;
|
||||
MemSim::Ptr memsim_;
|
||||
|
||||
public:
|
||||
Processor(const ArchDef& arch)
|
||||
: arch_(arch)
|
||||
, decoder_(arch)
|
||||
, mu_(0, arch.wsize(), true)
|
||||
, ram_((1<<12), (1<<20))
|
||||
, cores_(arch.num_cores())
|
||||
, l2caches_(NUM_CLUSTERS)
|
||||
, l2_mem_switches_(NUM_CLUSTERS)
|
||||
{
|
||||
uint32_t num_cores = arch.num_cores();
|
||||
uint32_t cores_per_cluster = num_cores / NUM_CLUSTERS;
|
||||
|
||||
// bind RAM to memory unit
|
||||
mu_.attach(ram_, 0, 0xFFFFFFFF);
|
||||
|
||||
// create cores
|
||||
for (uint32_t i = 0; i < num_cores; ++i) {
|
||||
cores_.at(i) = Core::Create(arch, decoder_, mu_, i);
|
||||
}
|
||||
|
||||
// connect memory sub-system
|
||||
memsim_ = MemSim::Create(1, MEM_LATENCY);
|
||||
std::vector<SlavePort<MemReq>*> mem_req_ports(1);
|
||||
std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
|
||||
mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
|
||||
mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
|
||||
|
||||
if (L3_ENABLE) {
|
||||
l3cache_ = Cache::Create("l3cache", CacheConfig{
|
||||
log2ceil(L3_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
2, // W
|
||||
0, // A
|
||||
32, // address bits
|
||||
L3_NUM_BANKS, // number of banks
|
||||
L3_NUM_PORTS, // number of ports
|
||||
NUM_CLUSTERS, // request size
|
||||
true, // write-through
|
||||
0, // victim size
|
||||
L3_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
});
|
||||
mem_rsp_ports.at(0)->bind(&l3cache_->MemRspPort);
|
||||
l3cache_->MemReqPort.bind(mem_req_ports.at(0));
|
||||
|
||||
mem_req_ports.resize(NUM_CLUSTERS);
|
||||
mem_rsp_ports.resize(NUM_CLUSTERS);
|
||||
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
|
||||
mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
|
||||
mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
|
||||
}
|
||||
} else if (NUM_CLUSTERS > 1) {
|
||||
l3_mem_switch_ = Switch<MemReq, MemRsp>::Create("l3_arb", ArbiterType::RoundRobin, NUM_CLUSTERS);
|
||||
mem_rsp_ports.at(0)->bind(&l3_mem_switch_->RspIn);
|
||||
l3_mem_switch_->ReqOut.bind(mem_req_ports.at(0));
|
||||
|
||||
mem_req_ports.resize(NUM_CLUSTERS);
|
||||
mem_rsp_ports.resize(NUM_CLUSTERS);
|
||||
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
|
||||
mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
|
||||
mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
|
||||
if (L2_ENABLE) {
|
||||
auto& l2cache = l2caches_.at(i);
|
||||
l2cache = Cache::Create("l2cache", CacheConfig{
|
||||
log2ceil(L2_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
2, // W
|
||||
0, // A
|
||||
32, // address bits
|
||||
L2_NUM_BANKS, // number of banks
|
||||
L2_NUM_PORTS, // number of ports
|
||||
NUM_CORES, // request size
|
||||
true, // write-through
|
||||
0, // victim size
|
||||
L2_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
});
|
||||
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
|
||||
l2cache->MemReqPort.bind(mem_req_ports.at(i));
|
||||
|
||||
mem_req_ports.resize(cores_per_cluster);
|
||||
mem_rsp_ports.resize(cores_per_cluster);
|
||||
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
|
||||
mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
|
||||
mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
|
||||
}
|
||||
} else if (cores_per_cluster > 1) {
|
||||
auto& l2_mem_switch = l2_mem_switches_.at(i);
|
||||
l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
|
||||
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
|
||||
l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
|
||||
|
||||
mem_req_ports.resize(cores_per_cluster);
|
||||
mem_rsp_ports.resize(cores_per_cluster);
|
||||
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
|
||||
mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
|
||||
mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
|
||||
auto& core = cores_.at((i * NUM_CLUSTERS) + j);
|
||||
mem_rsp_ports.at(i)->bind(&core->MemRspPort);
|
||||
core->MemReqPort.bind(mem_req_ports.at(j));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
~Processor() {}
|
||||
|
||||
int run(const std::string& program, bool riscv_test, bool /*showStats*/) {
|
||||
{
|
||||
std::string program_ext(fileExtension(program.c_str()));
|
||||
if (program_ext == "bin") {
|
||||
ram_.loadBinImage(program.c_str(), STARTUP_ADDR);
|
||||
} else if (program_ext == "hex") {
|
||||
ram_.loadHexImage(program.c_str());
|
||||
} else {
|
||||
std::cout << "*** error: only *.bin or *.hex images supported." << std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
bool running;
|
||||
int exitcode = 0;
|
||||
do {
|
||||
SimPlatform::instance().step();
|
||||
|
||||
running = false;
|
||||
for (auto& core : cores_) {
|
||||
if (core->running()) {
|
||||
running = true;
|
||||
}
|
||||
if (core->check_ebreak()) {
|
||||
exitcode = core->getIRegValue(3);
|
||||
running = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (running);
|
||||
|
||||
// get error status
|
||||
|
||||
if (riscv_test) {
|
||||
if (1 == exitcode) {
|
||||
std::cout << "Passed." << std::endl;
|
||||
exitcode = 0;
|
||||
} else {
|
||||
std::cout << "Failed." << std::endl;
|
||||
}
|
||||
} else {
|
||||
if (exitcode != 0) {
|
||||
std::cout << "*** error: exitcode=" << exitcode << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
|
@ -10,6 +10,7 @@ private:
|
|||
std::vector<RegMask> in_use_iregs_;
|
||||
std::vector<RegMask> in_use_fregs_;
|
||||
std::vector<RegMask> in_use_vregs_;
|
||||
std::unordered_map<uint32_t, uint64_t> owners_;
|
||||
|
||||
public:
|
||||
Scoreboard(const ArchDef &arch)
|
||||
|
@ -29,42 +30,87 @@ public:
|
|||
|| (state.used_fregs & in_use_fregs_.at(state.wid)) != 0
|
||||
|| (state.used_vregs & in_use_vregs_.at(state.wid)) != 0;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> owners(const pipeline_state_t& state) const {
|
||||
std::vector<uint64_t> out;
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_iregs = state.used_iregs & in_use_iregs_.at(state.wid);
|
||||
while (used_iregs.any()) {
|
||||
if (used_iregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Integer;
|
||||
out.push_back(owners_.at(tag));
|
||||
}
|
||||
used_iregs >>= 1;
|
||||
++r;
|
||||
}
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_fregs = state.used_fregs & in_use_fregs_.at(state.wid);
|
||||
while (used_fregs.any()) {
|
||||
if (used_fregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Float;
|
||||
out.push_back(owners_.at(tag));
|
||||
}
|
||||
used_fregs >>= 1;
|
||||
++r;
|
||||
}
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_vregs = state.used_vregs & in_use_vregs_.at(state.wid);
|
||||
while (used_vregs.any()) {
|
||||
if (used_vregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state.wid << 4) | (int)RegType::Vector;
|
||||
out.push_back(owners_.at(tag));
|
||||
}
|
||||
used_vregs >>= 1;
|
||||
++r;
|
||||
}
|
||||
}
|
||||
return std::move(out);
|
||||
}
|
||||
|
||||
void reserve(const pipeline_state_t& state) {
|
||||
if (!state.rdest)
|
||||
return;
|
||||
|
||||
if (!state.wb)
|
||||
return;
|
||||
switch (state.rdest_type) {
|
||||
case 1:
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state.wid).set(state.rdest);
|
||||
break;
|
||||
case 2:
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state.wid).set(state.rdest);
|
||||
break;
|
||||
case 3:
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state.wid).set(state.rdest);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
|
||||
assert(owners_.count(tag) == 0);
|
||||
owners_[tag] = state.id;
|
||||
}
|
||||
|
||||
void release(const pipeline_state_t& state) {
|
||||
if (!state.rdest)
|
||||
return;
|
||||
if (!state.wb)
|
||||
return;
|
||||
switch (state.rdest_type) {
|
||||
case 1:
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state.wid).reset(state.rdest);
|
||||
break;
|
||||
case 2:
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state.wid).reset(state.rdest);
|
||||
break;
|
||||
case 3:
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state.wid).reset(state.rdest);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
uint32_t tag = (state.rdest << 16) | (state.wid << 4) | (int)state.rdest_type;
|
||||
owners_.erase(tag);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
194
sim/simX/types.h
194
sim/simX/types.h
|
@ -4,6 +4,7 @@
|
|||
#include <bitset>
|
||||
#include <queue>
|
||||
#include <unordered_map>
|
||||
#include <util.h>
|
||||
#include <VX_config.h>
|
||||
#include <simobject.h>
|
||||
|
||||
|
@ -20,7 +21,25 @@ typedef std::bitset<32> RegMask;
|
|||
typedef std::bitset<32> ThreadMask;
|
||||
typedef std::bitset<32> WarpMask;
|
||||
|
||||
enum class RegType {
|
||||
None,
|
||||
Integer,
|
||||
Float,
|
||||
Vector
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
|
||||
switch (type) {
|
||||
case RegType::None: break;
|
||||
case RegType::Integer: os << "r"; break;
|
||||
case RegType::Float: os << "fr"; break;
|
||||
case RegType::Vector: os << "vr"; break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
enum class ExeType {
|
||||
NOP,
|
||||
ALU,
|
||||
LSU,
|
||||
CSR,
|
||||
|
@ -29,6 +48,19 @@ enum class ExeType {
|
|||
MAX,
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
|
||||
switch (type) {
|
||||
case ExeType::NOP: os << "NOP"; break;
|
||||
case ExeType::ALU: os << "ALU"; break;
|
||||
case ExeType::LSU: os << "LSU"; break;
|
||||
case ExeType::CSR: os << "CSR"; break;
|
||||
case ExeType::FPU: os << "FPU"; break;
|
||||
case ExeType::GPU: os << "GPU"; break;
|
||||
case ExeType::MAX: break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
enum class AluType {
|
||||
ARITH,
|
||||
BRANCH,
|
||||
|
@ -36,6 +68,33 @@ enum class AluType {
|
|||
IDIV,
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
|
||||
switch (type) {
|
||||
case AluType::ARITH: os << "ARITH"; break;
|
||||
case AluType::BRANCH: os << "BRANCH"; break;
|
||||
case AluType::IMUL: os << "IMUL"; break;
|
||||
case AluType::IDIV: os << "IDIV"; break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
enum class LsuType {
|
||||
LOAD,
|
||||
STORE,
|
||||
FENCE,
|
||||
PREFETCH,
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
|
||||
switch (type) {
|
||||
case LsuType::LOAD: os << "LOAD"; break;
|
||||
case LsuType::STORE: os << "STORE"; break;
|
||||
case LsuType::FENCE: os << "FENCE"; break;
|
||||
case LsuType::PREFETCH: os << "PREFETCH"; break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
enum class FpuType {
|
||||
FNCP,
|
||||
FMA,
|
||||
|
@ -44,6 +103,17 @@ enum class FpuType {
|
|||
FCVT,
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
|
||||
switch (type) {
|
||||
case FpuType::FNCP: os << "FNCP"; break;
|
||||
case FpuType::FMA: os << "FMA"; break;
|
||||
case FpuType::FDIV: os << "FDIV"; break;
|
||||
case FpuType::FSQRT: os << "FSQRT"; break;
|
||||
case FpuType::FCVT: os << "FCVT"; break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
enum class GpuType {
|
||||
TMC,
|
||||
WSPAWN,
|
||||
|
@ -53,11 +123,31 @@ enum class GpuType {
|
|||
TEX,
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
|
||||
switch (type) {
|
||||
case GpuType::TMC: os << "TMC"; break;
|
||||
case GpuType::WSPAWN: os << "WSPAWN"; break;
|
||||
case GpuType::SPLIT: os << "SPLIT"; break;
|
||||
case GpuType::JOIN: os << "JOIN"; break;
|
||||
case GpuType::BAR: os << "BAR"; break;
|
||||
case GpuType::TEX: os << "TEX"; break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
enum class ArbiterType {
|
||||
Priority,
|
||||
RoundRobin
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
|
||||
switch (type) {
|
||||
case ArbiterType::Priority: os << "Priority"; break;
|
||||
case ArbiterType::RoundRobin: os << "RoundRobin"; break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
|
@ -65,6 +155,8 @@ class Queue {
|
|||
protected:
|
||||
std::queue<T> queue_;
|
||||
|
||||
uint32_t count;
|
||||
|
||||
public:
|
||||
Queue() {}
|
||||
|
||||
|
@ -77,6 +169,7 @@ public:
|
|||
}
|
||||
|
||||
void push(const T& value) {
|
||||
++count;
|
||||
queue_.push(value);
|
||||
}
|
||||
|
||||
|
@ -141,6 +234,7 @@ public:
|
|||
return i;
|
||||
}
|
||||
}
|
||||
assert(false);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -148,6 +242,7 @@ public:
|
|||
auto& entry = entries_.at(index);
|
||||
assert(entry.first);
|
||||
entry.first = false;
|
||||
--capacity_;
|
||||
}
|
||||
|
||||
void remove(uint32_t index, T* value) {
|
||||
|
@ -155,6 +250,7 @@ public:
|
|||
assert(entry.first);
|
||||
*value = entry.second;
|
||||
entry.first = false;
|
||||
--capacity_;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -163,29 +259,21 @@ public:
|
|||
template <typename Req, typename Rsp, uint32_t MaxInputs = 32>
|
||||
class Switch : public SimObject<Switch<Req, Rsp>> {
|
||||
private:
|
||||
struct req_t {
|
||||
struct req_batch_t {
|
||||
std::vector<Req> data;
|
||||
std::bitset<MaxInputs> valid;
|
||||
req_t() {}
|
||||
req_t(uint32_t size) : data(size) {}
|
||||
req_batch_t() {}
|
||||
req_batch_t(uint32_t size)
|
||||
: data(size)
|
||||
, valid(0)
|
||||
{}
|
||||
};
|
||||
|
||||
void handleIncomingRequest(const Req& req, uint32_t port_id) {
|
||||
cur_req_.data.at(port_id) = req;
|
||||
cur_req_.valid.set(port_id);
|
||||
}
|
||||
|
||||
void handleIncomingResponse(const Rsp& rsp, uint32_t) {
|
||||
rsps_.push(rsp);
|
||||
}
|
||||
|
||||
ArbiterType type_;
|
||||
std::queue<req_t> reqs_;
|
||||
std::queue<Rsp> rsps_;
|
||||
req_t cur_req_;
|
||||
std::queue<req_batch_t> reqq_;
|
||||
uint32_t delay_;
|
||||
uint32_t cursor_;
|
||||
std::unordered_map<uint32_t, uint32_t> addr_table_;
|
||||
uint32_t tag_shift_;
|
||||
|
||||
public:
|
||||
Switch(
|
||||
|
@ -197,12 +285,12 @@ public:
|
|||
)
|
||||
: SimObject<Switch<Req, Rsp, MaxInputs>>(ctx, name)
|
||||
, type_(type)
|
||||
, cur_req_(num_inputs)
|
||||
, delay_(delay)
|
||||
, cursor_(0)
|
||||
, ReqIn(num_inputs, {this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingRequest})
|
||||
, tag_shift_(log2ceil(num_inputs))
|
||||
, ReqIn(num_inputs, this)
|
||||
, ReqOut(this)
|
||||
, RspIn(this, this, &Switch<Req, Rsp, MaxInputs>::handleIncomingResponse)
|
||||
, RspIn(this)
|
||||
, RspOut(num_inputs, this)
|
||||
{
|
||||
assert(delay_ != 0);
|
||||
|
@ -210,36 +298,52 @@ public:
|
|||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
if (cur_req_.valid.any()) {
|
||||
reqs_.push(cur_req_);
|
||||
cur_req_.valid.reset();
|
||||
}
|
||||
|
||||
while (!reqs_.empty()) {
|
||||
auto& entry = reqs_.front();
|
||||
bool found = false;
|
||||
for (uint32_t i = 0, n = entry.data.size(); i < n; ++i) {
|
||||
auto j = (cursor_ + i) % n;
|
||||
if (entry.valid.test(j)) {
|
||||
auto& req = entry.data.at(j);
|
||||
addr_table_[req.tag] = j;
|
||||
ReqOut.send(req, delay_);
|
||||
entry.valid.reset(j);
|
||||
this->update_cursor(j);
|
||||
found = true;
|
||||
break;
|
||||
// process incoming requests
|
||||
{
|
||||
req_batch_t req_batch(ReqIn.size());
|
||||
for (uint32_t i = 0, n = ReqIn.size(); i < n; ++i) {
|
||||
Req req;
|
||||
if (ReqIn.at(i).read(&req)) {
|
||||
req_batch.data.at(i) = req;
|
||||
req_batch.valid.set(i);
|
||||
}
|
||||
}
|
||||
if (found)
|
||||
break;
|
||||
reqs_.pop();
|
||||
if (req_batch.valid.any()) {
|
||||
reqq_.push(req_batch);
|
||||
}
|
||||
}
|
||||
|
||||
// apply arbitration
|
||||
if (!reqq_.empty()) {
|
||||
auto& req_batch = reqq_.front();
|
||||
for (uint32_t i = 0, n = req_batch.data.size(); i < n; ++i) {
|
||||
auto j = (cursor_ + i) % n;
|
||||
if (req_batch.valid.test(j)) {
|
||||
auto& req = req_batch.data.at(j);
|
||||
if (tag_shift_) {
|
||||
req.tag = (req.tag << tag_shift_) | j;
|
||||
}
|
||||
ReqOut.send(req, delay_);
|
||||
req_batch.valid.reset(j);
|
||||
this->update_cursor(j);
|
||||
if (!req_batch.valid.any())
|
||||
reqq_.pop(); // pop when empty
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!rsps_.empty()) {
|
||||
auto& rsp = rsps_.front();
|
||||
auto port_id = addr_table_.at(rsp.tag);
|
||||
RspOut.at(port_id).send(rsp, 1);
|
||||
rsps_.pop();
|
||||
// process incoming responses
|
||||
{
|
||||
Rsp rsp;
|
||||
if (RspIn.read(&rsp)) {
|
||||
uint32_t port_id = 0;
|
||||
if (tag_shift_) {
|
||||
port_id = rsp.tag & ((1 << tag_shift_)-1);
|
||||
rsp.tag >>= tag_shift_;
|
||||
}
|
||||
RspOut.at(port_id).send(rsp, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -24,30 +24,34 @@ Warp::Warp(Core *core, Word id)
|
|||
void Warp::eval(pipeline_state_t *pipeline_state) {
|
||||
assert(tmask_.any());
|
||||
|
||||
DPH(2, "Step: wid=" << id_ << ", PC=0x" << std::hex << PC_ << ", tmask=");
|
||||
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
|
||||
for (int i = 0, n = core_->arch().num_threads(); i < n; ++i)
|
||||
DPN(2, tmask_.test(n-i-1));
|
||||
DPN(2, "\n");
|
||||
DPN(2, ", PC=0x" << std::hex << PC_ << std::endl);
|
||||
|
||||
/* Fetch and decode. */
|
||||
|
||||
Word fetched = core_->icache_fetch(PC_);
|
||||
auto instr = core_->decoder().decode(fetched, PC_);
|
||||
Word instr_code = core_->icache_read(PC_, sizeof(Word));
|
||||
auto instr = core_->decoder().decode(instr_code);
|
||||
if (!instr) {
|
||||
std::cout << std::hex << "Error: invalid instruction 0x" << instr_code << ", at PC=" << PC_ << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
|
||||
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
|
||||
|
||||
// Update state
|
||||
pipeline_state->cid = core_->id();
|
||||
pipeline_state->wid = id_;
|
||||
pipeline_state->PC = PC_;
|
||||
pipeline_state->tmask = tmask_;
|
||||
pipeline_state->rdest = instr->getRDest();
|
||||
pipeline_state->rdest_type = instr->getRDType();
|
||||
pipeline_state->used_iregs.reset();
|
||||
pipeline_state->used_fregs.reset();
|
||||
pipeline_state->used_vregs.reset();
|
||||
|
||||
|
||||
// Execute
|
||||
this->execute(*instr, pipeline_state);
|
||||
|
||||
D(4, "Register state:");
|
||||
DP(4, "Register state:");
|
||||
for (int i = 0; i < core_->arch().num_regs(); ++i) {
|
||||
DPN(4, " %r" << std::setfill('0') << std::setw(2) << std::dec << i << ':');
|
||||
for (int j = 0; j < core_->arch().num_threads(); ++j) {
|
||||
|
|
|
@ -44,6 +44,8 @@
|
|||
#define VERILATOR_RESET_VALUE 2
|
||||
#endif
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
static uint64_t timestamp = 0;
|
||||
|
@ -136,7 +138,7 @@ opae_sim::opae_sim()
|
|||
: stop_(false)
|
||||
, host_buffer_ids_(0) {
|
||||
vl_obj_ = new VL_OBJ();
|
||||
ram_ = new RAM((1<<12), (1<<20));
|
||||
ram_ = new RAM(RAM_PAGE_SIZE);
|
||||
|
||||
// reset the device
|
||||
this->reset();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue