Moved tc_num, tc_size param to makefile args

This commit is contained in:
Nayan Sivakumar Nair 2024-06-21 22:23:24 -04:00
parent 0e3badf723
commit a378aed67c
14 changed files with 70 additions and 42 deletions

View file

@ -114,14 +114,6 @@ case $i in
LOGFILE=${i#*=}
shift
;;
--tc_size=*)
TC_SIZE=${i#*=}
shift
;;
--tc_num=*)
TC_NUM=${i#*=}
shift
;;
--help)
show_help
exit 0
@ -190,7 +182,7 @@ then
fi
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
# CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
echo "CONFIGS=$CONFIGS"
if [ $REBUILD -ne 0 ]

View file

@ -124,6 +124,7 @@ regression()
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar"
echo "regression tests done!"
}

View file

@ -197,6 +197,9 @@
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
`define VX_MAT_MUL_SIZE 12'hFC4
`define VX_TC_NUM 12'hFC5
`define VX_TC_SIZE 12'hFC6
`endif // VX_TYPES_VH

View file

@ -32,7 +32,7 @@ using namespace vortex;
class vx_device {
public:
vx_device()
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, TC_SIZE, TC_NUM)
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
, ram_(0, RAM_PAGE_SIZE)
, processor_(arch_)
, global_mem_(ALLOC_BASE_ADDR,
@ -69,12 +69,12 @@ public:
case VX_CAPS_NUM_CORES:
_value = NUM_CORES * NUM_CLUSTERS;
break;
case VX_CAPS_TC_SIZE:
_value = TC_SIZE;
break;
case VX_CAPS_TC_NUM:
_value = TC_NUM;
break;
// case VX_CAPS_TC_SIZE:
// _value = TC_SIZE;
// break;
// case VX_CAPS_TC_NUM:
// _value = TC_NUM;
// break;
case VX_CAPS_CACHE_LINE_SIZE:
_value = CACHE_BLOCK_SIZE;
break;

View file

@ -35,11 +35,9 @@ private:
uint16_t num_barriers_;
uint16_t ipdom_size_;
uint64_t local_mem_base_;
uint16_t tc_size_;
uint16_t tc_num_;
public:
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint64_t tc_size, uint64_t tc_num)
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
: num_threads_(num_threads)
, num_warps_(num_warps)
, num_cores_(num_cores)
@ -51,8 +49,6 @@ public:
, num_barriers_(NUM_BARRIERS)
, ipdom_size_((num_threads-1) * 2)
, local_mem_base_(LMEM_BASE_ADDR)
, tc_size_ (tc_size)
, tc_num_ (tc_num)
{}
uint16_t vsize() const {
@ -98,14 +94,6 @@ public:
uint16_t socket_size() const {
return socket_size_;
}
uint16_t tc_size() const {
return tc_size_;
}
uint16_t tc_num() const {
return tc_num_;
}
};

View file

@ -74,7 +74,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
, core_(core)
, warps_(arch.num_warps(), arch)
, barriers_(arch.num_barriers(), 0)
, scratchpad(std::vector<Word>(core->arch().tc_size() * core->arch().tc_size() * 32768)) //Fix this
, scratchpad(std::vector<Word>(32 * 32 * 32768)) //Fix this : Max TC_SIZE = 32
{
this->clear();
}
@ -355,6 +355,11 @@ Word Emulator::get_tiles()
return mat_size;
}
// Accessor for the emulator's tc_size member, returned as a Word.
// The same member is what set_csr() assigns on a write to the VX_TC_SIZE CSR
// and what get_csr() returns for VX_TC_SIZE.
// NOTE(review): no initializer for tc_size is visible in this view — confirm
// it is given a value before the first read.
Word Emulator::get_tc_size()
{
return tc_size;
}
Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
auto core_perf = core_->perf_stats();
switch (addr) {
@ -387,6 +392,8 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base();
case VX_CSR_MSCRATCH: return csr_mscratch_;
case VX_MAT_MUL_SIZE: return mat_size;
case VX_TC_NUM: return tc_num;
case VX_TC_SIZE: return tc_size;
CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);
@ -500,6 +507,13 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
case VX_MAT_MUL_SIZE:
mat_size = value;
break;
case VX_TC_NUM:
tc_num = value;
break;
case VX_TC_SIZE:
tc_size = value;
break;
default: {
std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
std::abort();

View file

@ -55,6 +55,7 @@ public:
int get_exitcode() const;
Word get_tiles();
Word get_tc_size();
private:
@ -125,6 +126,8 @@ private:
wspawn_t wspawn_;
std::vector<Word> scratchpad;
uint32_t mat_size;
uint32_t tc_size;
uint32_t tc_num;
};
}

View file

@ -1419,8 +1419,11 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
{ //TODO - make it data-type flexible
uint32_t mem_bytes = 1;
DP(3, "mem_bytes=" << mem_bytes << std::endl);
uint16_t tc_size = core_->arch().tc_size();
uint32_t TC_per_warp = core_->arch().tc_num();
uint16_t tc_size = this->get_csr(VX_TC_SIZE, 0, wid);
uint32_t TC_per_warp = this->get_csr(VX_TC_NUM, 0, wid);
DP(3, "tc_size=" << tc_size << std::endl);
DP(3, "TC_per_warp=" << TC_per_warp << std::endl);
//Number of loads - dependent on the thread config
uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid); //CSR instruction before MLOAD will ensure that this csr has value

View file

@ -255,7 +255,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
TcuUnit::TcuUnit(const SimContext& ctx, Core* core)
: FuncUnit(ctx, core, "TCU")
, tc_size (core_->arch().tc_size())
// , tc_size (core_->arch().tc_size())
{}
void TcuUnit::tick() {
@ -267,6 +267,8 @@ void TcuUnit::tick() {
auto& output = Outputs.at(i);
auto trace = input.front();
uint32_t n_tiles = core_->emulator_.get_tiles();
uint32_t tc_size = core_->emulator_.get_tc_size();
switch (trace->tcu_type) {
case TCUType::TCU_MUL:
{ //mat size = n_tiles * tc_size

View file

@ -103,7 +103,7 @@ private:
class TcuUnit : public FuncUnit {
public:
TcuUnit(const SimContext& ctx, Core*);
uint64_t tc_size;
// uint64_t tc_size;
void tick();
};

View file

@ -83,7 +83,7 @@ int main(int argc, char **argv) {
{
// create processor configuration
Arch arch(num_threads, num_warps, num_cores, tc_size, tc_num);
Arch arch(num_threads, num_warps, num_cores);
// create memory module
RAM ram(0, RAM_PAGE_SIZE);

View file

@ -9,6 +9,6 @@ SRCS := $(SRC_DIR)/main.cpp
VX_SRCS := $(SRC_DIR)/kernel.cpp
OPTS ?= -n128 -d1
OPTS ?= -n512 -d1 -s4 -t4
include ../common.mk

View file

@ -13,7 +13,7 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
unsigned c_addr = reinterpret_cast<unsigned>(dst_ptr);
uint32_t tc_size = arg->tc_size;
int TC_per_warp = arg->TC_per_warp;
uint32_t TC_per_warp = arg->TC_per_warp;
unsigned num_threads = arg->num_threads;
int num_warps = arg->num_warps;
uint32_t matrix_size = arg->matrix_size;
@ -104,6 +104,9 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
unsigned b_addr_base = b_addr + offset*arg->data_size;
unsigned c_addr_base = c_addr + offset_c*arg->data_size;
csr_write(VX_MAT_MUL_SIZE,n_tiles);
csr_write(VX_TC_NUM,TC_per_warp);
csr_write(VX_TC_SIZE,tc_size);
mload (0, a_addr_base);
mload (1, b_addr_base);
//In case of multiple threads - sync load

View file

@ -21,6 +21,9 @@
const char* kernel_file = "kernel.vxbin";
uint32_t matrix_size = 0;
uint32_t tc_num = 4;
uint32_t TC_size = 8;
vx_device_h device = nullptr;
vx_buffer_h A_buffer = nullptr;
vx_buffer_h B_buffer = nullptr;
@ -38,7 +41,7 @@ static void show_usage() {
static void parse_args(int argc, char **argv, uint32_t &data_size) {
int c;
while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
while ((c = getopt(argc, argv, "n:k:d:t:s:h?")) != -1) {
switch (c) {
case 'n':
matrix_size = atoi(optarg);
@ -48,7 +51,13 @@ static void parse_args(int argc, char **argv, uint32_t &data_size) {
break;
case 'd':
data_size = atoi(optarg);
break;
break;
case 't':
tc_num = atoi(optarg);
break;
case 's':
TC_size = atoi(optarg);
break;
case 'h':
case '?': {
show_usage();
@ -141,12 +150,22 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads, tc_size, TC_per_warp;
uint64_t num_cores, num_warps, num_threads;
uint32_t tc_size, TC_per_warp;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
std::cout << "Debug :: tc_size (optarg) = " << TC_size << std::endl;
std::cout << "Debug :: tc_num (optarg) = " << tc_num << std::endl;
//Add assert/knob
tc_size = TC_size;
TC_per_warp = tc_num;
// RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
// RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
std::cout << "Debug :: tc_size = " << tc_size << std::endl;
std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;