Fixes for PR

This commit is contained in:
Nayan Sivakumar Nair 2024-06-25 03:18:50 -04:00
parent a378aed67c
commit 5b0fc8cbd4
18 changed files with 77 additions and 91 deletions

View file

@ -48,8 +48,6 @@ PERF_CLASS=0
REBUILD=2
TEMPBUILD=0
LOGFILE=run.log
TC_SIZE=567
TC_NUM=123
for i in "$@"
do
@ -182,7 +180,6 @@ then
fi
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
# CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
echo "CONFIGS=$CONFIGS"
if [ $REBUILD -ne 0 ]

View file

@ -124,7 +124,9 @@ regression()
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar"
# test for matmul
CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1"
echo "regression tests done!"
}

View file

@ -111,20 +111,20 @@
`endif
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
// Size of Tensor Core
`ifndef TC_SIZE
`define TC_SIZE 4
`define TC_SIZE 8
`endif
// Number of TCs per Warp
`ifndef TC_NUM
`define TC_NUM 1
`define TC_NUM 4
`endif
// Number of TCU lanes
`ifndef NUM_TCU_LANES
`define NUM_TCU_LANES `TC_NUM
`endif
// Number of TCU blocks
`ifndef NUM_TCU_BLOCKS
`define NUM_TCU_BLOCKS `ISSUE_WIDTH
`endif

View file

@ -196,7 +196,7 @@
`define VX_CSR_NUM_CORES 12'hFC2
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
`define VX_MAT_MUL_SIZE 12'hFC4
`define VX_MAT_MUL_SIZE 12'hFC4 // VX_MAT_MUL_SIZE = Matrix Size / TC Size
`define VX_TC_NUM 12'hFC5
`define VX_TC_SIZE 12'hFC6

View file

@ -222,21 +222,19 @@ inline void vx_fence() {
}
//Matrix load
//Converted instruction type because destination registers were not getting blocked otherwise
inline void mload(unsigned dest, unsigned addr)
// Emits the matrix-load instruction as a raw I-type `.insn`:
// opcode 0x7b, funct3 0, rd = x0, immediate = `dest`, rs1 = `addr`.
// `dest` is bound with the "i" constraint, so it has to resolve to a
// compile-time constant at the asm statement.
// NOTE(review): `dest` presumably selects which operand tile is loaded — confirm against the simulator.
inline void vx_matrix_load(unsigned dest, unsigned addr)
{
// No outputs are declared; `volatile` keeps the compiler from dropping the statement.
asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
}
//mat store
inline void ms(unsigned addr)
//Matrix Store
// Emits the matrix-store instruction as a raw I-type `.insn`:
// opcode 0x7b, funct3 1, rd = x0, immediate = 0, rs1 = `addr`.
inline void vx_matrix_store(unsigned addr)
{
// No outputs are declared; `volatile` keeps the compiler from dropping the statement.
asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
}
//mat mul
//num tiles along reduced K dimension of matmul as imm value (can use rd,rs field to expand range of n_tiles from 12 bits)
inline void mm()
//Matrix Mul
// Emits the matrix-multiply instruction as a raw I-type `.insn`:
// opcode 0x7b, funct3 2, with rd = x0, rs1 = x0 and a zero immediate,
// i.e. the instruction carries no register or immediate operands.
inline void vx_matrix_mul()
{
asm volatile (".insn i 0x7b, 2, x0, 0(x0)");
}

View file

@ -1,22 +0,0 @@
# Sweep driver: for every matrix size, TC size and TC count below, run the
# matmul app through ci/blackbox.sh and capture its output in a per-run log.
# Sweep values: matrix sizes, tensor-core sizes and tensor-core counts
matrix_sizes=(16 32 64 128 256 512)
tcsizes=(8 16 32)
tcnums=(4 8 16 32)
#lsulanes=(4 16)
#cores=(32)
# Loop through each (matrix size, tcsize, tcnum) combination
for size in "${matrix_sizes[@]}"; do
# Rewrite the "-n<size>" value of the OPTS default in the matmul Makefile.
# The same substitution is attempted on two relative paths.
# NOTE(review): presumably only one of them exists for a given working directory — confirm.
sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" ../tests/regression/matmul/Makefile
sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" tests/regression/matmul/Makefile
echo "Matrix size changed to ${size} in Makefile"
for tcsize in "${tcsizes[@]}"; do
for tcnum in "${tcnums[@]}"; do
# One log file per configuration.
# NOTE(review): the sim_final/mat${size} directory is not created by this script — it presumably must already exist.
log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
# The command is kept as a string so it can be echoed before being run with eval
# (the output redirection is part of the string, so eval is what applies it).
command="./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --tc_size=${tcsize} --tc_num=${tcnum} --rebuild=1 --perf=1 > ${log_name} 2>&1"
echo "$command"
eval "$command"
done
done
done

View file

@ -69,12 +69,12 @@ public:
case VX_CAPS_NUM_CORES:
_value = NUM_CORES * NUM_CLUSTERS;
break;
// case VX_CAPS_TC_SIZE:
// _value = TC_SIZE;
// break;
// case VX_CAPS_TC_NUM:
// _value = TC_NUM;
// break;
case VX_CAPS_TC_SIZE:
_value = TC_SIZE;
break;
case VX_CAPS_TC_NUM:
_value = TC_NUM;
break;
case VX_CAPS_CACHE_LINE_SIZE:
_value = CACHE_BLOCK_SIZE;
break;

View file

@ -410,9 +410,9 @@ static const char* op_string(const Instr &instr) {
case Opcode::TCU:
switch(func3)
{
case 0: return "ML"; //
case 1: return "MS"; //
case 2: return "MATMUL";
case 0: return "ML"; // Matrix Load
case 1: return "MS"; // Matrix Store
case 2: return "MATMUL"; // Matrix Multiply
default:
std::abort();
}

View file

@ -74,7 +74,10 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
, core_(core)
, warps_(arch.num_warps(), arch)
, barriers_(arch.num_barriers(), 0)
, scratchpad(std::vector<Word>(32 * 32 * 32768)) //Fix this : Max TC_SIZE = 32
// Currently, tradeoff between scratchpad size & performance has not been evaluated. Scratchpad is
// considered to be big enough to hold input tiles for one output tile.
// In future versions, scratchpad size should be fixed to an appropriate value.
, scratchpad(std::vector<Word>(32 * 32 * 32768))
{
this->clear();
}
@ -360,6 +363,11 @@ Word Emulator::get_tc_size()
return tc_size;
}
// Returns the current value of tc_num.
Word Emulator::get_tc_num()
{
return tc_num;
}
Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
auto core_perf = core_->perf_stats();
switch (addr) {

View file

@ -56,7 +56,8 @@ public:
Word get_tiles();
Word get_tc_size();
Word get_tc_num();
private:
struct ipdom_entry_t {

View file

@ -1429,8 +1429,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid); //CSR instruction before MLOAD will ensure that this csr has value
int num_data_per_thread;
int num_data_per_thread_st;
int num_threads_actv;
int num_threads_actv_st;
uint32_t num_threads_actv;
uint32_t num_threads_actv_st;
uint32_t data_bytes_load;
uint32_t data_bytes_store;
uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp);
@ -1506,7 +1506,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
for (uint32_t t = thread_start; t < num_threads_actv_st; ++t)
{
@ -1521,12 +1520,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
//Store C
for (int n=0; n<num_data_per_thread_st; n++)
{
uint64_t mem_addr = (base_addr+(n*mem_bytes));
uint32_t csr_index = (2*num_data_per_thread_st) + n;
uint32_t scratchpad_index = (tc_size*tc_size*2) + (t*num_data_per_thread) + n;
//scratchpad -> csr (TODO :: removed intermediate CSR stage ; incorporate limited scratchmad implementation)
//core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_);
Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
*temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n];
@ -1534,7 +1527,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
}
}
//Clear the scratchpad
for(int i =0 ; i < scratchpad.size(); i++)
for(long unsigned int i=0 ; i < scratchpad.size(); i++)
{
scratchpad[i] = 0;
}
@ -1545,7 +1538,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
DP(4, "TCU MULTIPLY MAT");
trace->fu_type = FUType::TCU;
trace->tcu_type = TCUType::TCU_MUL;
uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp);
for (uint32_t t = thread_start; t < num_threads_actv; ++t)
{
@ -1556,12 +1548,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
//TC operation [only 1 thread in 1 warp needs to do this]
if (t%threads_per_tc == 0)
{
//TODO : change to systolic array implementation
uint32_t thread_offset = t*(tc_size*tc_size);
int loop_offset = 0;
int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
/*
// TODO : Fix needed for functional correctness
// TODO : change to systolic array implementation
uint32_t thread_offset = t*(tc_size*tc_size);
int loop_offset = 0;
int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation?
{
for (int i = 0; i < tc_size; i++) { //ROW-1

View file

@ -255,7 +255,6 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
// Constructs the TCU functional unit by forwarding the simulation context and
// core to the FuncUnit base, passing "TCU" as the third argument
// (NOTE(review): presumably the unit's display name — confirm against FuncUnit).
TcuUnit::TcuUnit(const SimContext& ctx, Core* core)
: FuncUnit(ctx, core, "TCU")
// , tc_size (core_->arch().tc_size())
{}
void TcuUnit::tick() {

View file

@ -103,7 +103,6 @@ private:
// Functional unit for the TCU; derives from FuncUnit.
class TcuUnit : public FuncUnit {
public:
// Builds the unit from the simulation context and the owning core.
TcuUnit(const SimContext& ctx, Core*);
// uint64_t tc_size;
// NOTE(review): presumably advances the unit by one simulation step — confirm against FuncUnit.
void tick();
};

View file

@ -35,8 +35,6 @@ static void show_usage() {
uint32_t num_threads = NUM_THREADS;
uint32_t num_warps = NUM_WARPS;
uint32_t num_cores = NUM_CORES;
uint32_t tc_size = TC_SIZE;
uint32_t tc_num = TC_NUM;
bool showStats = false;
const char* program = nullptr;

View file

@ -9,6 +9,6 @@ SRCS := $(SRC_DIR)/main.cpp
VX_SRCS := $(SRC_DIR)/kernel.cpp
OPTS ?= -n512 -d1 -s4 -t4
OPTS ?= -n128 -d1
include ../common.mk

View file

@ -107,15 +107,15 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
csr_write(VX_TC_NUM,TC_per_warp);
csr_write(VX_TC_SIZE,tc_size);
mload (0, a_addr_base);
mload (1, b_addr_base);
vx_matrix_load (0, a_addr_base);
vx_matrix_load (1, b_addr_base);
//In case of multiple threads - sync load
vx_fence();
mm(); //Assuming padding to ensure matrix size is a multiple of tc_size
vx_matrix_mul(); //Assuming padding to ensure matrix size is a multiple of tc_size
vx_fence();
if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c)
ms(c_addr_base);
vx_matrix_store(c_addr_base);
//In case of multiple threads - sync store
vx_fence();
}

View file

@ -21,8 +21,6 @@
const char* kernel_file = "kernel.vxbin";
uint32_t matrix_size = 0;
uint32_t tc_num = 4;
uint32_t TC_size = 8;
vx_device_h device = nullptr;
vx_buffer_h A_buffer = nullptr;
@ -41,7 +39,7 @@ static void show_usage() {
static void parse_args(int argc, char **argv, uint32_t &data_size) {
int c;
while ((c = getopt(argc, argv, "n:k:d:t:s:h?")) != -1) {
while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
switch (c) {
case 'n':
matrix_size = atoi(optarg);
@ -52,12 +50,6 @@ static void parse_args(int argc, char **argv, uint32_t &data_size) {
case 'd':
data_size = atoi(optarg);
break;
case 't':
tc_num = atoi(optarg);
break;
case 's':
TC_size = atoi(optarg);
break;
case 'h':
case '?': {
show_usage();
@ -151,21 +143,15 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
uint32_t tc_size, TC_per_warp;
uint64_t tc_size, TC_per_warp;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
std::cout << "Debug :: tc_size (optarg) = " << TC_size << std::endl;
std::cout << "Debug :: tc_num (optarg) = " << tc_num << std::endl;
//Add assert/knob
tc_size = TC_size;
TC_per_warp = tc_num;
// RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
// RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
std::cout << "Debug :: tc_size = " << tc_size << std::endl;
std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;

View file

@ -0,0 +1,26 @@
#!/bin/bash
# README:
# This script launches a sweep of TC_SIZE, TC_NUM and MATRIX SIZES
# default values of NUM_WARPS=32, NUM_THREADS=32, NUM_CORES=4, DATA_SIZE=1
# Edit matrix_sizes, tcsizes & tcnums variables to vary the sweep limits
#
# Each configuration is run through ./ci/blackbox.sh (relative to the build
# directory) and its combined stdout/stderr is written to
# sim_final/mat<size>/tcsize<tcsize>_tcnum<tcnum>_32w32t.

# Define arrays for tc_size,tc_num and matrix sizes
matrix_sizes=(16 32 64 128 256 512)
tcsizes=(8 16 32)
tcnums=(4 8 16 32)

# Stop right away if the build directory is missing; otherwise the whole sweep
# would run (and create sim_final/) relative to the wrong directory.
cd ../../../build/ || { echo "error: cannot cd to ../../../build/" >&2; exit 1; }

# Loop through each combination of above configs
for size in "${matrix_sizes[@]}"; do
  # The log directory only depends on the matrix size, so create it once per size.
  out_dir="sim_final/mat${size}"
  mkdir -p "${out_dir}" || { echo "error: cannot create ${out_dir}" >&2; exit 1; }

  for tcsize in "${tcsizes[@]}"; do
    for tcnum in "${tcnums[@]}"; do
      log_name="${out_dir}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
      # The command is kept as a string so the exact invocation can be echoed
      # before it runs; the CONFIGS assignment and the redirection are applied by eval.
      cmd="CONFIGS=\"-DTC_NUM=${tcnum} -DTC_SIZE=${tcsize}\" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args=\"-n${size} -d1\" --rebuild=1 --perf=1 > ${log_name} 2>&1"
      # Quoted so the string is neither word-split nor glob-expanded.
      echo "$cmd"
      # A failing configuration is reported but does not abort the rest of the sweep.
      eval "$cmd" || echo "warning: run failed (size=${size} tcsize=${tcsize} tcnum=${tcnum}); see ${log_name}" >&2
    done
  done
done