vortex/sim/simx/cluster.cpp
2022-08-02 18:53:15 -07:00

302 lines
No EOL
10 KiB
C++

#include "cluster.h"
using namespace vortex;
// Builds one cluster and wires its components together.
//
// The constructor creates, in this order: the L2 cache, five L1 cache
// clusters (icaches, dcaches, tcaches, ocaches, rcaches) whose memory side is
// bound to L2 core ports 0..4, the raster/ROP/texture units (bound to the
// rcaches/ocaches/tcaches respectively), one shared memory per core, and
// finally the cores with their icache/dcache/shared-memory port bindings.
//
// Parameters:
//   cluster_id        - index of this cluster; used in component names and to
//                       derive the global raster-unit and core indices.
//   cores_per_cluster - number of cores (and shared memories) to create.
//   processor         - owning processor; stored and returned by processor().
//   arch              - supplies num_warps()/num_threads() used for sizing.
//   dcrs              - forwarded to the cores; its raster_dcrs/rop_dcrs/
//                       tex_dcrs members are forwarded to the matching units.
//
// NOTE(review): cores_per_raster/rop/tex are integer divisions. If a
// NUM_*_UNITS constant is larger than cores_per_cluster the quotient is 0 and
// the per-core index computation below divides by zero; if it does not divide
// cores_per_cluster evenly the computed index can reach the unit count and
// the .at() lookup goes out of range. Callers must pass a compatible value.
Cluster::Cluster(uint32_t cluster_id, uint32_t cores_per_cluster, ProcessorImpl* processor, const Arch &arch, const DCRS &dcrs)
  : cluster_id_(cluster_id)
  , cores_(cores_per_cluster)
  , raster_units_(NUM_RASTER_UNITS)
  , rop_units_(NUM_ROP_UNITS)
  , tex_units_(NUM_TEX_UNITS)
  , sharedmems_(cores_per_cluster)
  , processor_(processor)
{
  // Scratch buffer reused for every component name. All formatted values are
  // uint32_t, hence the %u conversions.
  char sname[100];

  // ---- L2 cache --------------------------------------------------------
  // NOTE(review): unlike the L1 configs below, the 'W' slot receives the way
  // count and the 'A' slot receives 0 here — confirm this is intended.
  snprintf(sname, sizeof(sname), "cluster%u-l2cache", cluster_id);
  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
    !L2_ENABLED,              // negated enable flag
    log2ceil(L2_CACHE_SIZE),  // C
    log2ceil(MEM_BLOCK_SIZE), // B
    log2ceil(L2_NUM_WAYS),    // W
    0,                        // A
    32,                       // address bits
    L2_NUM_BANKS,             // number of banks
    L2_NUM_PORTS,             // number of ports
    5,                        // number of requests (core ports 0..4 are bound below)
    true,                     // write-through
    false,                    // write response
    0,                        // victim size
    L2_MSHR_SIZE,             // mshr
    2,                        // pipeline latency
  });

  // ---- instruction caches -> L2 core port 0 ------------------------------
  snprintf(sname, sizeof(sname), "cluster%u-icaches", cluster_id);
  icaches_ = CacheCluster::Create(sname, cores_per_cluster, NUM_ICACHES, CacheSim::Config{
    !ICACHE_ENABLED,                          // negated enable flag
    log2ceil(ICACHE_SIZE),                    // C
    log2ceil(L1_BLOCK_SIZE),                  // B
    log2ceil(sizeof(uint32_t)),               // W
    log2ceil(ICACHE_NUM_WAYS),                // A
    32,                                       // address bits
    1,                                        // number of banks
    1,                                        // number of ports
    1,                                        // number of requests
    true,                                     // write-through
    false,                                    // write response
    0,                                        // victim size
    static_cast<uint8_t>(arch.num_warps()),   // mshr
    2,                                        // pipeline latency
  });
  icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
  l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);

  // ---- data caches -> L2 core port 1 -------------------------------------
  snprintf(sname, sizeof(sname), "cluster%u-dcaches", cluster_id);
  dcaches_ = CacheCluster::Create(sname, cores_per_cluster, NUM_DCACHES, CacheSim::Config{
    !DCACHE_ENABLED,                          // negated enable flag
    log2ceil(DCACHE_SIZE),                    // C
    log2ceil(L1_BLOCK_SIZE),                  // B
    log2ceil(sizeof(Word)),                   // W
    log2ceil(DCACHE_NUM_WAYS),                // A
    32,                                       // address bits
    DCACHE_NUM_BANKS,                         // number of banks
    DCACHE_NUM_PORTS,                         // number of ports
    static_cast<uint8_t>(arch.num_threads()), // number of requests
    true,                                     // write-through
    false,                                    // write response
    0,                                        // victim size
    DCACHE_MSHR_SIZE,                         // mshr
    4,                                        // pipeline latency
  });
  dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
  l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);

  // ---- caches serving the texture units -> L2 core port 2 ----------------
  snprintf(sname, sizeof(sname), "cluster%u-tcaches", cluster_id);
  tcaches_ = CacheCluster::Create(sname, NUM_TEX_UNITS, NUM_TCACHES, CacheSim::Config{
    !TCACHE_ENABLED,                          // negated enable flag
    log2ceil(TCACHE_SIZE),                    // C
    log2ceil(L1_BLOCK_SIZE),                  // B
    log2ceil(sizeof(uint32_t)),               // W
    log2ceil(TCACHE_NUM_WAYS),                // A
    32,                                       // address bits
    TCACHE_NUM_BANKS,                         // number of banks
    TCACHE_NUM_PORTS,                         // number of ports
    static_cast<uint8_t>(arch.num_threads()), // number of requests
    false,                                    // write-through
    false,                                    // write response
    0,                                        // victim size
    TCACHE_MSHR_SIZE,                         // mshr
    4,                                        // pipeline latency
  });
  tcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(2));
  l2cache_->CoreRspPorts.at(2).bind(&tcaches_->MemRspPort);

  // ---- caches serving the ROP units -> L2 core port 3 --------------------
  snprintf(sname, sizeof(sname), "cluster%u-ocaches", cluster_id);
  ocaches_ = CacheCluster::Create(sname, NUM_ROP_UNITS, NUM_OCACHES, CacheSim::Config{
    !OCACHE_ENABLED,                          // negated enable flag
    log2ceil(OCACHE_SIZE),                    // C
    log2ceil(MEM_BLOCK_SIZE),                 // B
    log2ceil(sizeof(uint32_t)),               // W
    log2ceil(OCACHE_NUM_WAYS),                // A
    32,                                       // address bits
    OCACHE_NUM_BANKS,                         // number of banks
    OCACHE_NUM_PORTS,                         // number of ports
    static_cast<uint8_t>(arch.num_threads()), // number of requests
    false,                                    // write-through
    false,                                    // write response
    0,                                        // victim size
    OCACHE_MSHR_SIZE,                         // mshr
    4,                                        // pipeline latency
  });
  ocaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(3));
  l2cache_->CoreRspPorts.at(3).bind(&ocaches_->MemRspPort);

  // ---- caches serving the raster units -> L2 core port 4 -----------------
  snprintf(sname, sizeof(sname), "cluster%u-rcaches", cluster_id);
  rcaches_ = CacheCluster::Create(sname, NUM_RASTER_UNITS, NUM_RCACHES, CacheSim::Config{
    !RCACHE_ENABLED,                          // negated enable flag
    log2ceil(RCACHE_SIZE),                    // C
    log2ceil(MEM_BLOCK_SIZE),                 // B
    log2ceil(sizeof(uint32_t)),               // W
    log2ceil(RCACHE_NUM_WAYS),                // A
    32,                                       // address bits
    RCACHE_NUM_BANKS,                         // number of banks
    RCACHE_NUM_PORTS,                         // number of ports
    1,                                        // number of requests
    false,                                    // write-through
    false,                                    // write response
    0,                                        // victim size
    RCACHE_MSHR_SIZE,                         // mshr
    4,                                        // pipeline latency
  });
  rcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(4));
  l2cache_->CoreRspPorts.at(4).bind(&rcaches_->MemRspPort);

  ///////////////////////////////////////////////////////////////////////////

  // Number of cores that share each fixed-function unit (truncating division;
  // see the NOTE(review) in the header comment).
  uint32_t cores_per_raster = cores_per_cluster / NUM_RASTER_UNITS;
  uint32_t cores_per_rop    = cores_per_cluster / NUM_ROP_UNITS;
  uint32_t cores_per_tex    = cores_per_cluster / NUM_TEX_UNITS;

  // create raster units
  for (uint32_t i = 0; i < NUM_RASTER_UNITS; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-raster_unit%u", cluster_id, i);
    // Index of this raster unit across all clusters.
    uint32_t raster_idx = cluster_id * NUM_RASTER_UNITS + i;
    raster_units_.at(i) = RasterUnit::Create(sname, raster_idx, cores_per_raster, arch, dcrs.raster_dcrs, RasterUnit::Config{
      RASTER_TILE_LOGSIZE,
      RASTER_BLOCK_LOGSIZE
    });
    // Each raster unit uses a single request/response port pair (index 0).
    raster_units_.at(i)->MemReqs.bind(&rcaches_->CoreReqPorts.at(i).at(0));
    rcaches_->CoreRspPorts.at(i).at(0).bind(&raster_units_.at(i)->MemRsps);
  }

  // create rop units
  for (uint32_t i = 0; i < NUM_ROP_UNITS; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-rop_unit%u", cluster_id, i);
    rop_units_.at(i) = RopUnit::Create(sname, cores_per_rop, arch, dcrs.rop_dcrs);
    // One request/response port pair per thread.
    for (uint32_t j = 0; j < arch.num_threads(); ++j) {
      rop_units_.at(i)->MemReqs.at(j).bind(&ocaches_->CoreReqPorts.at(i).at(j));
      ocaches_->CoreRspPorts.at(i).at(j).bind(&rop_units_.at(i)->MemRsps.at(j));
    }
  }

  // create tex units
  for (uint32_t i = 0; i < NUM_TEX_UNITS; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-tex_unit%u", cluster_id, i);
    tex_units_.at(i) = TexUnit::Create(sname, cores_per_tex, arch, dcrs.tex_dcrs, TexUnit::Config{
      2, // address latency
      6, // sampler latency
    });
    // One request/response port pair per thread.
    for (uint32_t j = 0; j < arch.num_threads(); ++j) {
      tex_units_.at(i)->MemReqs.at(j).bind(&tcaches_->CoreReqPorts.at(i).at(j));
      tcaches_->CoreRspPorts.at(i).at(j).bind(&tex_units_.at(i)->MemRsps.at(j));
    }
  }

  // create shared memory blocks (one per core)
  for (uint32_t i = 0; i < cores_per_cluster; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-shared_mem%u", cluster_id, i);
    sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
      uint32_t(SMEM_LOCAL_SIZE) * arch.num_warps() * arch.num_threads(),
      SMEM_LOCAL_SIZE,
      arch.num_threads(),
      arch.num_threads(),
      log2ceil(STACK_SIZE),
      1,
      false
    });
  }

  // create cores
  for (uint32_t i = 0; i < cores_per_cluster; ++i) {
    // Pick the fixed-function units this core shares with its neighbours.
    uint32_t raster_idx = i / cores_per_raster;
    uint32_t rop_idx    = i / cores_per_rop;
    uint32_t tex_idx    = i / cores_per_tex;
    // Core index across all clusters.
    uint32_t core_id    = cluster_id * cores_per_cluster + i;

    cores_.at(i) = Core::Create(core_id,
                                this,
                                arch,
                                dcrs,
                                sharedmems_.at(i),
                                raster_units_.at(raster_idx),
                                rop_units_.at(rop_idx),
                                tex_units_.at(tex_idx));

    // Instruction fetch path: a single port pair per core.
    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));

    // Data path: each thread's dcache port goes through its own SMemDemux,
    // which is connected to both the data cache and the core's shared memory.
    for (uint32_t j = 0; j < arch.num_threads(); ++j) {
      snprintf(sname, sizeof(sname), "cluster%u-smem_demux%u_%u", cluster_id, i, j);
      auto smem_demux = SMemDemux::Create(sname);

      cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
      smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));

      smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
      dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);

      smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
      sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
    }
  }
}
// Nothing to release explicitly; members clean up through their own
// destructors.
Cluster::~Cluster() = default;
// Forwards the RAM pointer to every core and every raster, ROP and texture
// unit owned by this cluster by calling attach_ram(ram) on each of them.
//
// Elements are visited by const reference (matching running() and
// getIRegValue()) so the stored handles are not copied on each iteration.
void Cluster::attach_ram(RAM* ram) {
  for (const auto& core : cores_) {
    core->attach_ram(ram);
  }
  for (const auto& raster_unit : raster_units_) {
    raster_unit->attach_ram(ram);
  }
  for (const auto& rop_unit : rop_units_) {
    rop_unit->attach_ram(ram);
  }
  for (const auto& tex_unit : tex_units_) {
    tex_unit->attach_ram(ram);
  }
}
bool Cluster::running() const {
for (auto& core : cores_) {
if (core->running())
return true;
}
return false;
}
bool Cluster::getIRegValue(int* value, int reg) const {
for (auto& core : cores_) {
if (core->check_exit()) {
*value = core->getIRegValue(reg);
return true;
}
}
return false;
}
// Connects the L2 cache's memory-side ports to the given external ports:
// the L2 request port is bound to mem_req_port, and mem_rsp_port is bound to
// the L2 response port.
void Cluster::bind(SimPort<MemReq>* mem_req_port, SimPort<MemRsp>* mem_rsp_port) {
  auto& l2 = *l2cache_;
  l2.MemReqPort.bind(mem_req_port);
  mem_rsp_port->bind(&l2.MemRspPort);
}
// Returns the processor pointer that was passed to the constructor.
ProcessorImpl* Cluster::processor() const {
  return this->processor_;
}
// Collects performance statistics for the whole cluster.
//
// Cache statistics are taken directly from each cache object; shared-memory,
// raster, ROP and texture statistics are accumulated with += over every
// instance held by the cluster.
Cluster::PerfStats Cluster::perf_stats() const {
  Cluster::PerfStats stats;

  // Caches: one aggregate entry each.
  stats.icache  = icaches_->perf_stats();
  stats.dcache  = dcaches_->perf_stats();
  stats.tcache  = tcaches_->perf_stats();
  stats.ocache  = ocaches_->perf_stats();
  stats.rcache  = rcaches_->perf_stats();
  stats.l2cache = l2cache_->perf_stats();

  // Per-instance components: sum over all of them. The unit containers are
  // sized with NUM_RASTER_UNITS / NUM_ROP_UNITS / NUM_TEX_UNITS in the
  // constructor, so iterating the containers visits exactly those indices.
  for (const auto& smem : sharedmems_) {
    stats.sharedmem += smem->perf_stats();
  }
  for (const auto& raster : raster_units_) {
    stats.raster_unit += raster->perf_stats();
  }
  for (const auto& rop : rop_units_) {
    stats.rop_unit += rop->perf_stats();
  }
  for (const auto& tex : tex_units_) {
    stats.tex_unit += tex->perf_stats();
  }

  return stats;
}