simx multicore fix

This commit is contained in:
Blaise Tine 2021-12-01 00:12:16 -05:00
parent 4477cbeed1
commit 092ff42ab4
7 changed files with 47 additions and 24 deletions

View file

@ -43,15 +43,20 @@ echo "begin clustering tests..."
# warp/threads configurations
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=demo
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=demo --args="-n1"
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=demo --args="-n1"
echo "clustering tests done!"
}
@ -101,12 +106,14 @@ CONFIGS="-DMEM_BLOCK_SIZE=16 -DL1_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsi
# test cache banking
CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=io_addr
# test cache multi-porting
CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --debug --args="-n1"
CONFIGS="-DL2_NUM_PORTS=2 -DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr
CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=io_addr
CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=io_addr
# test 128-bit MEM block
CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo

View file

@ -9,11 +9,10 @@
#include <vortex.h>
#include <vx_utils.h>
#include <processor.h>
#include <constants.h>
#include <VX_config.h>
#include <util.h>
#define RAM_PAGE_SIZE 4096
using namespace vortex;
///////////////////////////////////////////////////////////////////////////////
@ -58,7 +57,7 @@ private:
class vx_device {
public:
vx_device()
: arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS)
: arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS)
, ram_(RAM_PAGE_SIZE)
, mem_allocation_(ALLOC_BASE_ADDR)
{}

View file

@ -6,6 +6,8 @@
#define MEM_LATENCY 24
#endif
#define RAM_PAGE_SIZE 4096
namespace vortex {
enum Constants {

View file

@ -21,6 +21,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
, arch_(arch)
, decoder_(arch)
, mmu_(0, arch.wsize(), true)
, smem_(RAM_PAGE_SIZE)
, tex_units_(NUM_TEX_UNITS, this)
, warps_(arch.num_warps())
, barriers_(arch.num_barriers(), 0)
@ -380,7 +381,12 @@ Word Core::icache_read(Addr addr, Size size) {
Word Core::dcache_read(Addr addr, Size size) {
Word data;
mmu_.read(&data, addr, size, 0);
auto type = get_addr_type(addr, size);
if (type == AddrType::Shared) {
smem_.read(&data, addr & (SMEM_SIZE-1), size);
} else {
mmu_.read(&data, addr, size, 0);
}
return data;
}
@ -389,7 +395,12 @@ void Core::dcache_write(Addr addr, Word data, Size size) {
&& addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {
this->writeToStdOut(addr, data);
} else {
mmu_.write(&data, addr, size, 0);
auto type = get_addr_type(addr, size);
if (type == AddrType::Shared) {
smem_.write(&data, addr & (SMEM_SIZE-1), size);
} else {
mmu_.write(&data, addr, size, 0);
}
}
}

View file

@ -137,6 +137,7 @@ private:
const ArchDef arch_;
const Decoder decoder_;
MemoryUnit mmu_;
RAM smem_;
std::vector<TexUnit> tex_units_;
std::vector<std::shared_ptr<Warp>> warps_;

View file

@ -6,11 +6,10 @@
#include <stdlib.h>
#include <sys/stat.h>
#include "processor.h"
#include "constants.h"
#include <util.h>
#include "args.h"
#define RAM_PAGE_SIZE 4096
using namespace vortex;
int main(int argc, char **argv) {

View file

@ -18,8 +18,9 @@ Processor::Processor(const ArchDef& arch)
// connect memory sub-systen
memsim_ = MemSim::Create(1, MEM_LATENCY);
std::vector<SimPort<MemReq>*> mem_req_ports(1);
std::vector<SimPort<MemReq>*> mem_req_ports(1);
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1);
mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
@ -46,6 +47,7 @@ Processor::Processor(const ArchDef& arch)
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3cache_->CoreReqPorts.at(i);
mem_rsp_ports.at(i) = &l3cache_->CoreRspPorts.at(i);
@ -57,13 +59,17 @@ Processor::Processor(const ArchDef& arch)
mem_req_ports.resize(NUM_CLUSTERS);
mem_rsp_ports.resize(NUM_CLUSTERS);
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
mem_req_ports.at(i) = &l3_mem_switch_->ReqIn.at(i);
mem_rsp_ports.at(i) = &l3_mem_switch_->RspOut.at(i);
}
}
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
std::vector<SimPort<MemReq>*> cluster_mem_req_ports(cores_per_cluster);
std::vector<SimPort<MemRsp>*> cluster_mem_rsp_ports(cores_per_cluster);
if (L2_ENABLE) {
auto& l2cache = l2caches_.at(i);
l2cache = Cache::Create("l2cache", Cache::Config{
@ -74,40 +80,38 @@ Processor::Processor(const ArchDef& arch)
32, // address bits
L2_NUM_BANKS, // number of banks
L2_NUM_PORTS, // number of ports
NUM_CORES, // request size
(uint8_t)cores_per_cluster, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
mem_rsp_ports.at(i)->bind(&l2cache->MemRspPort);
l2cache->MemReqPort.bind(mem_req_ports.at(i));
mem_req_ports.resize(cores_per_cluster);
mem_rsp_ports.resize(cores_per_cluster);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
cluster_mem_req_ports.at(j) = &l2cache->CoreReqPorts.at(j);
cluster_mem_rsp_ports.at(j) = &l2cache->CoreRspPorts.at(j);
}
} else if (cores_per_cluster > 1) {
} else {
auto& l2_mem_switch = l2_mem_switches_.at(i);
l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, NUM_CORES);
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
l2_mem_switch = Switch<MemReq, MemRsp>::Create("l2_arb", ArbiterType::RoundRobin, cores_per_cluster);
mem_rsp_ports.at(i)->bind(&l2_mem_switch->RspIn);
l2_mem_switch->ReqOut.bind(mem_req_ports.at(i));
mem_req_ports.resize(cores_per_cluster);
mem_rsp_ports.resize(cores_per_cluster);
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
cluster_mem_req_ports.at(j) = &l2_mem_switch->ReqIn.at(j);
cluster_mem_rsp_ports.at(j) = &l2_mem_switch->RspOut.at(j);
}
}
for (uint32_t j = 0; j < cores_per_cluster; ++j) {
auto& core = cores_.at((i * NUM_CLUSTERS) + j);
mem_rsp_ports.at(i)->bind(&core->MemRspPort);
core->MemReqPort.bind(mem_req_ports.at(j));
cluster_mem_rsp_ports.at(j)->bind(&core->MemRspPort);
core->MemReqPort.bind(cluster_mem_req_ports.at(j));
}
}
}