mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Fixes for PR
This commit is contained in:
parent
a378aed67c
commit
5b0fc8cbd4
18 changed files with 77 additions and 91 deletions
|
@ -48,8 +48,6 @@ PERF_CLASS=0
|
|||
REBUILD=2
|
||||
TEMPBUILD=0
|
||||
LOGFILE=run.log
|
||||
TC_SIZE=567
|
||||
TC_NUM=123
|
||||
|
||||
for i in "$@"
|
||||
do
|
||||
|
@ -182,7 +180,6 @@ then
|
|||
fi
|
||||
|
||||
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
|
||||
# CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
|
||||
echo "CONFIGS=$CONFIGS"
|
||||
|
||||
if [ $REBUILD -ne 0 ]
|
||||
|
|
|
@ -124,7 +124,9 @@ regression()
|
|||
# test local barrier
|
||||
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
|
||||
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar"
|
||||
|
||||
|
||||
# test for matmul
|
||||
CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1"
|
||||
|
||||
echo "regression tests done!"
|
||||
}
|
||||
|
|
|
@ -111,20 +111,20 @@
|
|||
`endif
|
||||
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
|
||||
|
||||
// Size of Tensor Core
|
||||
`ifndef TC_SIZE
|
||||
`define TC_SIZE 4
|
||||
`define TC_SIZE 8
|
||||
`endif
|
||||
|
||||
// Number of TCs per Warp
|
||||
`ifndef TC_NUM
|
||||
`define TC_NUM 1
|
||||
`define TC_NUM 4
|
||||
`endif
|
||||
|
||||
// Number of TCU lanes
|
||||
`ifndef NUM_TCU_LANES
|
||||
`define NUM_TCU_LANES `TC_NUM
|
||||
`endif
|
||||
|
||||
// Number of TCU units
|
||||
`ifndef NUM_TCU_BLOCKS
|
||||
`define NUM_TCU_BLOCKS `ISSUE_WIDTH
|
||||
`endif
|
||||
|
|
|
@ -196,7 +196,7 @@
|
|||
`define VX_CSR_NUM_CORES 12'hFC2
|
||||
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
|
||||
|
||||
`define VX_MAT_MUL_SIZE 12'hFC4
|
||||
`define VX_MAT_MUL_SIZE 12'hFC4 // VX_MAT_MUL_SIZE = Matrix Size / TC Size
|
||||
`define VX_TC_NUM 12'hFC5
|
||||
`define VX_TC_SIZE 12'hFC6
|
||||
|
||||
|
|
|
@ -222,21 +222,19 @@ inline void vx_fence() {
|
|||
}
|
||||
|
||||
//Matrix load
|
||||
//Converted instruction type because destination registers were not getting blocked otherwise
|
||||
inline void mload(unsigned dest, unsigned addr)
|
||||
inline void vx_matrix_load(unsigned dest, unsigned addr)
|
||||
{
|
||||
asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
|
||||
}
|
||||
|
||||
//mat store
|
||||
inline void ms(unsigned addr)
|
||||
//Matrix Store
|
||||
inline void vx_matrix_store(unsigned addr)
|
||||
{
|
||||
asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
|
||||
}
|
||||
|
||||
//mat mul
|
||||
//num tiles along reduced K dimension of matmul as imm value (can use rd,rs field to expand range of n_tiles from 12 bits)
|
||||
inline void mm()
|
||||
//Matrix Mul
|
||||
inline void vx_matrix_mul()
|
||||
{
|
||||
asm volatile (".insn i 0x7b, 2, x0, 0(x0)");
|
||||
}
|
||||
|
|
22
run_final.sh
22
run_final.sh
|
@ -1,22 +0,0 @@
|
|||
# Define arrays for matrix sizes, TC sizes, and TC counts
|
||||
matrix_sizes=(16 32 64 128 256 512)
|
||||
tcsizes=(8 16 32)
|
||||
tcnums=(4 8 16 32)
|
||||
#lsulanes=(4 16)
|
||||
#cores=(32)
|
||||
|
||||
|
||||
# Loop through each combination of matrix size, TC size, and TC count
|
||||
for size in "${matrix_sizes[@]}"; do
|
||||
sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" ../tests/regression/matmul/Makefile
|
||||
sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" tests/regression/matmul/Makefile
|
||||
echo "Matrix size changed to ${size} in Makefile"
|
||||
for tcsize in "${tcsizes[@]}"; do
|
||||
for tcnum in "${tcnums[@]}"; do
|
||||
log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
|
||||
command="./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --tc_size=${tcsize} --tc_num=${tcnum} --rebuild=1 --perf=1 > ${log_name} 2>&1"
|
||||
echo "$command"
|
||||
eval "$command"
|
||||
done
|
||||
done
|
||||
done
|
|
@ -69,12 +69,12 @@ public:
|
|||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
// case VX_CAPS_TC_SIZE:
|
||||
// _value = TC_SIZE;
|
||||
// break;
|
||||
// case VX_CAPS_TC_NUM:
|
||||
// _value = TC_NUM;
|
||||
// break;
|
||||
case VX_CAPS_TC_SIZE:
|
||||
_value = TC_SIZE;
|
||||
break;
|
||||
case VX_CAPS_TC_NUM:
|
||||
_value = TC_NUM;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
|
|
|
@ -410,9 +410,9 @@ static const char* op_string(const Instr &instr) {
|
|||
case Opcode::TCU:
|
||||
switch(func3)
|
||||
{
|
||||
case 0: return "ML"; //
|
||||
case 1: return "MS"; //
|
||||
case 2: return "MATMUL";
|
||||
case 0: return "ML"; // Matrix Load
|
||||
case 1: return "MS"; // Matrix Store
|
||||
case 2: return "MATMUL"; // Matrix Multiply
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
|
|
@ -74,7 +74,10 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
|
|||
, core_(core)
|
||||
, warps_(arch.num_warps(), arch)
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, scratchpad(std::vector<Word>(32 * 32 * 32768)) //Fix this : Max TC_SIZE = 32
|
||||
// Currently, tradeoff between scratchpad size & performance has not been evaluated. Scratchpad is
|
||||
// considered to be big enough to hold input tiles for one output tile.
|
||||
// In future versions, scratchpad size should be fixed to an appropriate value.
|
||||
, scratchpad(std::vector<Word>(32 * 32 * 32768))
|
||||
{
|
||||
this->clear();
|
||||
}
|
||||
|
@ -360,6 +363,11 @@ Word Emulator::get_tc_size()
|
|||
return tc_size;
|
||||
}
|
||||
|
||||
// Accessor: returns the emulator's current tc_num value.
// NOTE(review): presumably the number of tensor cores per warp (cf. TC_NUM) —
// confirm against the code that assigns tc_num.
Word Emulator::get_tc_num() {
  return tc_num;
}
|
||||
|
||||
Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
auto core_perf = core_->perf_stats();
|
||||
switch (addr) {
|
||||
|
|
|
@ -56,7 +56,8 @@ public:
|
|||
|
||||
Word get_tiles();
|
||||
Word get_tc_size();
|
||||
|
||||
Word get_tc_num();
|
||||
|
||||
private:
|
||||
|
||||
struct ipdom_entry_t {
|
||||
|
|
|
@ -1429,8 +1429,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid); //CSR instruction before MLOAD will ensure that this csr has value
|
||||
int num_data_per_thread;
|
||||
int num_data_per_thread_st;
|
||||
int num_threads_actv;
|
||||
int num_threads_actv_st;
|
||||
uint32_t num_threads_actv;
|
||||
uint32_t num_threads_actv_st;
|
||||
uint32_t data_bytes_load;
|
||||
uint32_t data_bytes_store;
|
||||
uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp);
|
||||
|
@ -1506,7 +1506,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
|
||||
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
|
||||
trace->data = trace_data;
|
||||
uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
|
||||
|
||||
for (uint32_t t = thread_start; t < num_threads_actv_st; ++t)
|
||||
{
|
||||
|
@ -1521,12 +1520,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
//Store C
|
||||
for (int n=0; n<num_data_per_thread_st; n++)
|
||||
{
|
||||
uint64_t mem_addr = (base_addr+(n*mem_bytes));
|
||||
uint32_t csr_index = (2*num_data_per_thread_st) + n;
|
||||
uint32_t scratchpad_index = (tc_size*tc_size*2) + (t*num_data_per_thread) + n;
|
||||
|
||||
//scratchpad -> csr (TODO :: removed intermediate CSR stage ; incorporate limited scratchpad implementation)
|
||||
//core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_);
|
||||
Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
|
||||
*temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n];
|
||||
|
||||
|
@ -1534,7 +1527,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
}
|
||||
}
|
||||
//Clear the scratchpad
|
||||
for(int i =0 ; i < scratchpad.size(); i++)
|
||||
for(long unsigned int i=0 ; i < scratchpad.size(); i++)
|
||||
{
|
||||
scratchpad[i] = 0;
|
||||
}
|
||||
|
@ -1545,7 +1538,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
DP(4, "TCU MULTIPLY MAT");
|
||||
trace->fu_type = FUType::TCU;
|
||||
trace->tcu_type = TCUType::TCU_MUL;
|
||||
uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
|
||||
uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp);
|
||||
for (uint32_t t = thread_start; t < num_threads_actv; ++t)
|
||||
{
|
||||
|
@ -1556,12 +1548,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
//TC operation [only 1 thread in 1 warp needs to do this]
|
||||
if (t%threads_per_tc == 0)
|
||||
{
|
||||
//TODO : change to systolic array implementation
|
||||
uint32_t thread_offset = t*(tc_size*tc_size);
|
||||
int loop_offset = 0;
|
||||
int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
|
||||
/*
|
||||
// TODO : Fix needed for functional correctness
|
||||
// TODO : change to systolic array implementation
|
||||
uint32_t thread_offset = t*(tc_size*tc_size);
|
||||
|
||||
int loop_offset = 0;
|
||||
int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
|
||||
uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
|
||||
for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation?
|
||||
{
|
||||
for (int i = 0; i < tc_size; i++) { //ROW-1
|
||||
|
|
|
@ -255,7 +255,6 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
|
|||
|
||||
TcuUnit::TcuUnit(const SimContext& ctx, Core* core)
|
||||
: FuncUnit(ctx, core, "TCU")
|
||||
// , tc_size (core_->arch().tc_size())
|
||||
{}
|
||||
|
||||
void TcuUnit::tick() {
|
||||
|
|
|
@ -103,7 +103,6 @@ private:
|
|||
class TcuUnit : public FuncUnit {
|
||||
public:
|
||||
TcuUnit(const SimContext& ctx, Core*);
|
||||
// uint64_t tc_size;
|
||||
void tick();
|
||||
};
|
||||
|
||||
|
|
|
@ -35,8 +35,6 @@ static void show_usage() {
|
|||
uint32_t num_threads = NUM_THREADS;
|
||||
uint32_t num_warps = NUM_WARPS;
|
||||
uint32_t num_cores = NUM_CORES;
|
||||
uint32_t tc_size = TC_SIZE;
|
||||
uint32_t tc_num = TC_NUM;
|
||||
bool showStats = false;
|
||||
const char* program = nullptr;
|
||||
|
||||
|
|
|
@ -9,6 +9,6 @@ SRCS := $(SRC_DIR)/main.cpp
|
|||
|
||||
VX_SRCS := $(SRC_DIR)/kernel.cpp
|
||||
|
||||
OPTS ?= -n512 -d1 -s4 -t4
|
||||
OPTS ?= -n128 -d1
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -107,15 +107,15 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
|||
csr_write(VX_TC_NUM,TC_per_warp);
|
||||
csr_write(VX_TC_SIZE,tc_size);
|
||||
|
||||
mload (0, a_addr_base);
|
||||
mload (1, b_addr_base);
|
||||
vx_matrix_load (0, a_addr_base);
|
||||
vx_matrix_load (1, b_addr_base);
|
||||
//In case of multiple threads - sync load
|
||||
vx_fence();
|
||||
|
||||
mm(); //Assuming padding to ensure matrix size is a multiple of tc_size
|
||||
vx_matrix_mul(); //Assuming padding to ensure matrix size is a multiple of tc_size
|
||||
vx_fence();
|
||||
if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c)
|
||||
ms(c_addr_base);
|
||||
vx_matrix_store(c_addr_base);
|
||||
//In case of multiple threads - sync store
|
||||
vx_fence();
|
||||
}
|
||||
|
|
|
@ -21,8 +21,6 @@
|
|||
|
||||
const char* kernel_file = "kernel.vxbin";
|
||||
uint32_t matrix_size = 0;
|
||||
uint32_t tc_num = 4;
|
||||
uint32_t TC_size = 8;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
vx_buffer_h A_buffer = nullptr;
|
||||
|
@ -41,7 +39,7 @@ static void show_usage() {
|
|||
|
||||
static void parse_args(int argc, char **argv, uint32_t &data_size) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:k:d:t:s:h?")) != -1) {
|
||||
while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
matrix_size = atoi(optarg);
|
||||
|
@ -52,12 +50,6 @@ static void parse_args(int argc, char **argv, uint32_t &data_size) {
|
|||
case 'd':
|
||||
data_size = atoi(optarg);
|
||||
break;
|
||||
case 't':
|
||||
tc_num = atoi(optarg);
|
||||
break;
|
||||
case 's':
|
||||
TC_size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
|
@ -151,21 +143,15 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
uint32_t tc_size, TC_per_warp;
|
||||
uint64_t tc_size, TC_per_warp;
|
||||
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
std::cout << "Debug :: tc_size (optarg) = " << TC_size << std::endl;
|
||||
std::cout << "Debug :: tc_num (optarg) = " << tc_num << std::endl;
|
||||
|
||||
//Add assert/knob
|
||||
tc_size = TC_size;
|
||||
TC_per_warp = tc_num;
|
||||
|
||||
// RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
|
||||
// RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
|
||||
|
||||
std::cout << "Debug :: tc_size = " << tc_size << std::endl;
|
||||
std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;
|
||||
|
|
26
tests/regression/matmul/matmul_regression.sh
Executable file
26
tests/regression/matmul/matmul_regression.sh
Executable file
|
@ -0,0 +1,26 @@
|
|||
#!/bin/bash

# README:
# This script launches a sweep of the matmul regression test on the simx driver
# over every combination of TC_SIZE, TC_NUM and MATRIX SIZE.
# Fixed for every run: NUM_WARPS=32, NUM_THREADS=32, NUM_CORES=4, DATA_SIZE=1
# Edit the matrix_sizes, tcsizes & tcnums arrays to vary the sweep limits.
# The output of each run (stdout and stderr) is written to
#   sim_final/mat<size>/tcsize<tc_size>_tcnum<tc_num>_32w32t
# relative to the build directory entered below.

# Sweep limits: matrix sizes, tensor-core sizes and tensor-core counts
matrix_sizes=(16 32 64 128 256 512)
tcsizes=(8 16 32)
tcnums=(4 8 16 32)

# The build directory is resolved relative to the current working directory.
# Abort if it cannot be entered: otherwise the sweep would create log
# directories and look for ./ci/blackbox.sh in the wrong place.
cd ../../../build/ || { echo "error: cannot cd to ../../../build/" >&2; exit 1; }

# Loop through each combination of the above configs
for size in "${matrix_sizes[@]}"; do
  # The log directory depends only on the matrix size, so create it once here
  mkdir -p "sim_final/mat${size}"
  for tcsize in "${tcsizes[@]}"; do
    for tcnum in "${tcnums[@]}"; do
      log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
      # Built as a string so the exact command can be echoed before it is run
      cmd="CONFIGS=\"-DTC_NUM=${tcnum} -DTC_SIZE=${tcsize}\" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args=\"-n${size} -d1\" --rebuild=1 --perf=1 > ${log_name} 2>&1"
      echo "$cmd"
      eval "$cmd"
    done
  done
done
|
Loading…
Add table
Add a link
Reference in a new issue