#include "cluster.h"

using namespace vortex;

// Builds one cluster of the simulated machine and wires its components:
//   - a cluster-level L2 cache,
//   - five L1 cache clusters (instruction, data, texture, ROP output, raster),
//     each bound to its own L2 client port (indices 0..4),
//   - the raster / ROP / texture units, bound to their respective L1 caches,
//   - one shared-memory block per core,
//   - the cores themselves, whose data ports go through one SMemDemux per
//     thread lane that is connected to both the data cache and shared memory.
//
// cluster_id        : index of this cluster; used for component names and to
//                     derive global raster-unit and core ids.
// cores_per_cluster : number of cores (and shared-memory blocks) to create.
// processor         : stored as-is and returned by processor().
// arch              : supplies num_warps() / num_threads() for sizing.
// dcrs              : forwarded to the cores and (per sub-struct) to the
//                     raster, ROP and texture units.
Cluster::Cluster(uint32_t cluster_id, uint32_t cores_per_cluster, ProcessorImpl* processor, const Arch &arch, const DCRS &dcrs)
  : cluster_id_(cluster_id)
  , cores_(cores_per_cluster)
  , raster_units_(NUM_RASTER_UNITS)
  , rop_units_(NUM_ROP_UNITS)
  , tex_units_(NUM_TEX_UNITS)
  , sharedmems_(cores_per_cluster)
  , processor_(processor)
{
  // Scratch buffer for component names. All formatted values are uint32_t,
  // hence the unsigned conversion specifier throughout.
  char sname[100];

  // NOTE(review): the L1 configs below pass log2(word size) in the W slot and
  // log2(ways) in the A slot, whereas here W receives log2(L2_NUM_WAYS) and A
  // is 0 -- confirm this is intended.
  snprintf(sname, sizeof(sname), "cluster%u-l2cache", cluster_id);
  l2cache_ = CacheSim::Create(sname, CacheSim::Config{
    !L2_ENABLED,            // presumably a bypass flag -- confirm against CacheSim::Config
    log2ceil(L2_CACHE_SIZE),  // C
    log2ceil(MEM_BLOCK_SIZE), // B
    log2ceil(L2_NUM_WAYS),  // W
    0,                      // A
    32,                     // address bits
    L2_NUM_BANKS,           // number of banks
    L2_NUM_PORTS,           // number of ports
    5,                      // request size (five L1 clients are bound below)
    true,                   // write-through
    false,                  // write response
    0,                      // victim size
    L2_MSHR_SIZE,           // mshr
    2,                      // pipeline latency
  });

  // Instruction caches -> L2 client port 0.
  snprintf(sname, sizeof(sname), "cluster%u-icaches", cluster_id);
  icaches_ = CacheCluster::Create(sname, cores_per_cluster, NUM_ICACHES, CacheSim::Config{
    !ICACHE_ENABLED,
    log2ceil(ICACHE_SIZE),  // C
    log2ceil(L1_BLOCK_SIZE),// B
    log2ceil(sizeof(uint32_t)), // W
    log2ceil(ICACHE_NUM_WAYS),// A
    32,                     // address bits
    1,                      // number of banks
    1,                      // number of ports
    1,                      // number of requests
    true,                   // write-through
    false,                  // write response
    0,                      // victim size
    static_cast<uint8_t>(arch.num_warps()), // mshr
    2,                      // pipeline latency
  });

  icaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(0));
  l2cache_->CoreRspPorts.at(0).bind(&icaches_->MemRspPort);

  // Data caches -> L2 client port 1.
  snprintf(sname, sizeof(sname), "cluster%u-dcaches", cluster_id);
  dcaches_ = CacheCluster::Create(sname, cores_per_cluster, NUM_DCACHES, CacheSim::Config{
    !DCACHE_ENABLED,
    log2ceil(DCACHE_SIZE),  // C
    log2ceil(L1_BLOCK_SIZE),// B
    log2ceil(sizeof(Word)), // W
    log2ceil(DCACHE_NUM_WAYS),// A
    32,                     // address bits
    DCACHE_NUM_BANKS,       // number of banks
    DCACHE_NUM_PORTS,       // number of ports
    static_cast<uint8_t>(arch.num_threads()), // number of requests
    true,                   // write-through
    false,                  // write response
    0,                      // victim size
    DCACHE_MSHR_SIZE,       // mshr
    4,                      // pipeline latency
  });

  dcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(1));
  l2cache_->CoreRspPorts.at(1).bind(&dcaches_->MemRspPort);

  // Texture caches -> L2 client port 2.
  snprintf(sname, sizeof(sname), "cluster%u-tcaches", cluster_id);
  tcaches_ = CacheCluster::Create(sname, NUM_TEX_UNITS, NUM_TCACHES, CacheSim::Config{
    !TCACHE_ENABLED,
    log2ceil(TCACHE_SIZE),  // C
    log2ceil(L1_BLOCK_SIZE),// B
    log2ceil(sizeof(uint32_t)), // W
    log2ceil(TCACHE_NUM_WAYS),// A
    32,                     // address bits
    TCACHE_NUM_BANKS,       // number of banks
    TCACHE_NUM_PORTS,       // number of ports
    static_cast<uint8_t>(arch.num_threads()), // number of requests
    false,                  // write-through
    false,                  // write response
    0,                      // victim size
    TCACHE_MSHR_SIZE,       // mshr
    4,                      // pipeline latency
  });

  tcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(2));
  l2cache_->CoreRspPorts.at(2).bind(&tcaches_->MemRspPort);

  // ROP output caches -> L2 client port 3.
  snprintf(sname, sizeof(sname), "cluster%u-ocaches", cluster_id);
  ocaches_ = CacheCluster::Create(sname, NUM_ROP_UNITS, NUM_OCACHES, CacheSim::Config{
    !OCACHE_ENABLED,
    log2ceil(OCACHE_SIZE),  // C
    log2ceil(MEM_BLOCK_SIZE), // B
    log2ceil(sizeof(uint32_t)), // W
    log2ceil(OCACHE_NUM_WAYS), // A
    32,                     // address bits
    OCACHE_NUM_BANKS,       // number of banks
    OCACHE_NUM_PORTS,       // number of ports
    static_cast<uint8_t>(arch.num_threads()), // number of requests
    false,                  // write-through
    false,                  // write response
    0,                      // victim size
    OCACHE_MSHR_SIZE,       // mshr
    4,                      // pipeline latency
  });

  ocaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(3));
  l2cache_->CoreRspPorts.at(3).bind(&ocaches_->MemRspPort);

  // Raster caches -> L2 client port 4.
  snprintf(sname, sizeof(sname), "cluster%u-rcaches", cluster_id);
  rcaches_ = CacheCluster::Create(sname, NUM_RASTER_UNITS, NUM_RCACHES, CacheSim::Config{
    !RCACHE_ENABLED,
    log2ceil(RCACHE_SIZE),  // C
    log2ceil(MEM_BLOCK_SIZE), // B
    log2ceil(sizeof(uint32_t)), // W
    log2ceil(RCACHE_NUM_WAYS), // A
    32,                     // address bits
    RCACHE_NUM_BANKS,       // number of banks
    RCACHE_NUM_PORTS,       // number of ports
    1,                      // number of requests
    false,                  // write-through
    false,                  // write response
    0,                      // victim size
    RCACHE_MSHR_SIZE,       // mshr
    4,                      // pipeline latency
  });

  rcaches_->MemReqPort.bind(&l2cache_->CoreReqPorts.at(4));
  l2cache_->CoreRspPorts.at(4).bind(&rcaches_->MemRspPort);

  ///////////////////////////////////////////////////////////////////////////

  // Number of cores that share one fixed-function unit. The quotient is
  // clamped to at least 1: with fewer cores than units it would be 0 and the
  // core-to-unit index computation in the core loop would divide by zero.
  auto cores_per_unit = [cores_per_cluster](uint32_t num_units) -> uint32_t {
    const uint32_t quotient = cores_per_cluster / num_units;
    return (quotient != 0) ? quotient : 1;
  };
  const uint32_t cores_per_raster = cores_per_unit(NUM_RASTER_UNITS);
  const uint32_t cores_per_rop    = cores_per_unit(NUM_ROP_UNITS);
  const uint32_t cores_per_tex    = cores_per_unit(NUM_TEX_UNITS);

  // create raster units (a single memory port each)
  for (uint32_t i = 0; i < NUM_RASTER_UNITS; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-raster_unit%u", cluster_id, i);
    // Raster index that is unique across clusters.
    uint32_t raster_idx = cluster_id * NUM_RASTER_UNITS + i;
    raster_units_.at(i) = RasterUnit::Create(sname, raster_idx, cores_per_raster, arch, dcrs.raster_dcrs, RasterUnit::Config{
      RASTER_TILE_LOGSIZE,
      RASTER_BLOCK_LOGSIZE
    });
    raster_units_.at(i)->MemReqs.bind(&rcaches_->CoreReqPorts.at(i).at(0));
    rcaches_->CoreRspPorts.at(i).at(0).bind(&raster_units_.at(i)->MemRsps);
  }

  // create rop units (one memory port per thread lane)
  for (uint32_t i = 0; i < NUM_ROP_UNITS; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-rop_unit%u", cluster_id, i);
    rop_units_.at(i) = RopUnit::Create(sname, cores_per_rop, arch, dcrs.rop_dcrs);
    for (uint32_t j = 0; j < arch.num_threads(); ++j) {
      rop_units_.at(i)->MemReqs.at(j).bind(&ocaches_->CoreReqPorts.at(i).at(j));
      ocaches_->CoreRspPorts.at(i).at(j).bind(&rop_units_.at(i)->MemRsps.at(j));
    }
  }

  // create tex units (one memory port per thread lane)
  for (uint32_t i = 0; i < NUM_TEX_UNITS; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-tex_unit%u", cluster_id, i);
    tex_units_.at(i) = TexUnit::Create(sname, cores_per_tex, arch, dcrs.tex_dcrs, TexUnit::Config{
      2, // address latency
      6, // sampler latency
    });
    for (uint32_t j = 0; j < arch.num_threads(); ++j) {
      tex_units_.at(i)->MemReqs.at(j).bind(&tcaches_->CoreReqPorts.at(i).at(j));
      tcaches_->CoreRspPorts.at(i).at(j).bind(&tex_units_.at(i)->MemRsps.at(j));
    }
  }

  // create shared memory blocks (one per core)
  for (uint32_t i = 0; i < cores_per_cluster; ++i) {
    snprintf(sname, sizeof(sname), "cluster%u-shared_mem%u", cluster_id, i);
    sharedmems_.at(i) = SharedMem::Create(sname, SharedMem::Config{
      uint32_t(SMEM_LOCAL_SIZE) * arch.num_warps() * arch.num_threads(),
      SMEM_LOCAL_SIZE,
      arch.num_threads(),
      arch.num_threads(),
      log2ceil(STACK_SIZE),
      1,
      false
    });
  }

  // create cores
  for (uint32_t i = 0; i < cores_per_cluster; ++i) {
    // Consecutive cores share a unit.
    // NOTE(review): when cores_per_cluster is not a multiple of a unit count,
    // these quotients can exceed the last unit index for trailing cores; the
    // at() lookups below are then expected to reject the index -- confirm
    // whether such configurations need to be supported.
    uint32_t raster_idx = i / cores_per_raster;
    uint32_t rop_idx    = i / cores_per_rop;
    uint32_t tex_idx    = i / cores_per_tex;

    // Core id that is unique across clusters.
    uint32_t core_id = cluster_id * cores_per_cluster + i;

    cores_.at(i) = Core::Create(core_id,
                                this,
                                arch,
                                dcrs,
                                sharedmems_.at(i),
                                raster_units_.at(raster_idx),
                                rop_units_.at(rop_idx),
                                tex_units_.at(tex_idx));

    cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0));
    icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0));

    // One demux per thread lane sits between the core's data port and the
    // two memories it is connected to: the data cache and the shared memory.
    for (uint32_t j = 0; j < arch.num_threads(); ++j) {
      snprintf(sname, sizeof(sname), "cluster%u-smem_demux%u_%u", cluster_id, i, j);
      auto smem_demux = SMemDemux::Create(sname);

      cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
      smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));

      smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
      dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);

      smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
      sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
    }
  }
}

// Nothing to release explicitly; members clean up after themselves.
Cluster::~Cluster() = default;

// Forwards `ram` to every core and to every raster, ROP and texture unit
// owned by this cluster, in that order.
void Cluster::attach_ram(RAM* ram) {
  // Iterate by reference (as running() and getIRegValue() do) so that the
  // element handles are not copied on every iteration.
  for (const auto& core : cores_) {
    core->attach_ram(ram);
  }
  for (const auto& raster_unit : raster_units_) {
    raster_unit->attach_ram(ram);
  }
  for (const auto& rop_unit : rop_units_) {
    rop_unit->attach_ram(ram);
  }
  for (const auto& tex_unit : tex_units_) {
    tex_unit->attach_ram(ram);
  }
}

// Reports whether at least one core of this cluster is still running.
// Cores are queried in order and the scan stops at the first running one.
bool Cluster::running() const {
  bool any_active = false;
  for (const auto& core : cores_) {
    if (core->running()) {
      any_active = true;
      break;
    }
  }
  return any_active;
}

// Looks for the first core whose check_exit() is true and, if one exists,
// stores that core's integer register `reg` into `*value`.
// Returns true when such a core was found; otherwise returns false and
// leaves `*value` untouched.
bool Cluster::getIRegValue(int* value, int reg) const {
  bool found = false;
  for (const auto& core : cores_) {
    if (!core->check_exit())
      continue;
    *value = core->getIRegValue(reg);
    found = true;
    break;
  }
  return found;
}

// Connects the cluster's L2 cache to an external memory interface:
// L2 memory requests are sent to `mem_req_port`, and `mem_rsp_port`
// delivers its responses to the L2 memory-response port.
void Cluster::bind(SimPort<MemReq>* mem_req_port, SimPort<MemRsp>* mem_rsp_port) {
  auto* l2_mem_rsp = &l2cache_->MemRspPort;
  l2cache_->MemReqPort.bind(mem_req_port);
  mem_rsp_port->bind(l2_mem_rsp);
}

// Returns the processor pointer that was handed to the constructor.
ProcessorImpl* Cluster::processor() const {
  return this->processor_;
}

// Collects the performance counters of this cluster.
// The cache entries are taken directly from each cache (cluster); the
// shared-memory, raster, ROP and texture entries are the sums over all
// corresponding units owned by the cluster.
Cluster::PerfStats Cluster::perf_stats() const {
  Cluster::PerfStats perf;

  perf.icache  = icaches_->perf_stats();
  perf.dcache  = dcaches_->perf_stats();
  perf.tcache  = tcaches_->perf_stats();
  perf.ocache  = ocaches_->perf_stats();
  perf.rcache  = rcaches_->perf_stats();
  perf.l2cache = l2cache_->perf_stats();

  // Iterate by reference so the element handles are not copied, and walk the
  // containers directly (they are sized to the unit counts in the
  // constructor) rather than indexing them with the NUM_* macros.
  for (const auto& sharedmem : sharedmems_) {
    perf.sharedmem += sharedmem->perf_stats();
  }

  for (const auto& raster_unit : raster_units_) {
    perf.raster_unit += raster_unit->perf_stats();
  }

  for (const auto& rop_unit : rop_units_) {
    perf.rop_unit += rop_unit->perf_stats();
  }

  for (const auto& tex_unit : tex_units_) {
    perf.tex_unit += tex_unit->perf_stats();
  }

  return perf;
}