mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Tensor cores in Vortex
This commit is contained in:
parent
abdea91120
commit
99c6a1af5a
23 changed files with 898 additions and 31 deletions
|
@ -48,6 +48,8 @@ PERF_CLASS=0
|
|||
REBUILD=2
|
||||
TEMPBUILD=0
|
||||
LOGFILE=run.log
|
||||
TC_SIZE=567
|
||||
TC_NUM=123
|
||||
|
||||
for i in "$@"
|
||||
do
|
||||
|
@ -112,6 +114,14 @@ case $i in
|
|||
LOGFILE=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--tc_size=*)
|
||||
TC_SIZE=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--tc_num=*)
|
||||
TC_NUM=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--help)
|
||||
show_help
|
||||
exit 0
|
||||
|
@ -180,7 +190,7 @@ then
|
|||
fi
|
||||
|
||||
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
|
||||
|
||||
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DTC_NUM=$TC_NUM -DTC_SIZE=$TC_SIZE $L2 $L3 $PERF_FLAG $CONFIGS"
|
||||
echo "CONFIGS=$CONFIGS"
|
||||
|
||||
if [ $REBUILD -ne 0 ]
|
||||
|
|
|
@ -111,6 +111,24 @@
|
|||
`endif
|
||||
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
|
||||
|
||||
`ifndef TC_SIZE
|
||||
`define TC_SIZE 4
|
||||
`endif
|
||||
|
||||
`ifndef TC_NUM
|
||||
`define TC_NUM 1
|
||||
`endif
|
||||
|
||||
// Number of TCU units
|
||||
`ifndef NUM_TCU_LANES
|
||||
`define NUM_TCU_LANES `TC_NUM
|
||||
`endif
|
||||
|
||||
// Number of TCU units
|
||||
`ifndef NUM_TCU_BLOCKS
|
||||
`define NUM_TCU_BLOCKS `ISSUE_WIDTH
|
||||
`endif
|
||||
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_ENABLED 1
|
||||
`else
|
||||
|
|
|
@ -196,4 +196,7 @@
|
|||
`define VX_CSR_NUM_CORES 12'hFC2
|
||||
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
|
||||
|
||||
`define VX_MAT_MUL_SIZE 12'hFC4
|
||||
|
||||
|
||||
`endif // VX_TYPES_VH
|
||||
|
|
|
@ -221,6 +221,26 @@ inline void vx_fence() {
|
|||
__asm__ volatile ("fence iorw, iorw");
|
||||
}
|
||||
|
||||
//Matrix load
|
||||
//Converted instruction type cause destination registers were not getiing blocked otherwise
|
||||
inline void mload(unsigned dest, unsigned addr)
|
||||
{
|
||||
asm volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
|
||||
}
|
||||
|
||||
//mat store
|
||||
inline void ms(unsigned addr)
|
||||
{
|
||||
asm volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
|
||||
}
|
||||
|
||||
//mat mul
|
||||
//num tiles along reduced K dimension of matmul as imm value (can use rd,rs field to expand range of n_tiles from 12 bits)
|
||||
inline void mm()
|
||||
{
|
||||
asm volatile (".insn i 0x7b, 2, x0, 0(x0)");
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -34,6 +34,8 @@ typedef void* vx_buffer_h;
|
|||
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
|
||||
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
|
||||
#define VX_CAPS_ISA_FLAGS 0x7
|
||||
#define VX_CAPS_TC_SIZE 0x8
|
||||
#define VX_CAPS_TC_NUM 0x9
|
||||
|
||||
// device isa flags
|
||||
#define VX_ISA_STD_A (1ull << 0)
|
||||
|
|
|
@ -32,7 +32,7 @@ using namespace vortex;
|
|||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
|
||||
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES, TC_SIZE, TC_NUM)
|
||||
, ram_(0, RAM_PAGE_SIZE)
|
||||
, processor_(arch_)
|
||||
, global_mem_(ALLOC_BASE_ADDR,
|
||||
|
@ -69,6 +69,12 @@ public:
|
|||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_TC_SIZE:
|
||||
_value = TC_SIZE;
|
||||
break;
|
||||
case VX_CAPS_TC_NUM:
|
||||
_value = TC_NUM;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
|
|
|
@ -35,9 +35,11 @@ private:
|
|||
uint16_t num_barriers_;
|
||||
uint16_t ipdom_size_;
|
||||
uint64_t local_mem_base_;
|
||||
uint16_t tc_size_;
|
||||
uint16_t tc_num_;
|
||||
|
||||
public:
|
||||
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores)
|
||||
Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores, uint64_t tc_size, uint64_t tc_num)
|
||||
: num_threads_(num_threads)
|
||||
, num_warps_(num_warps)
|
||||
, num_cores_(num_cores)
|
||||
|
@ -49,6 +51,8 @@ public:
|
|||
, num_barriers_(NUM_BARRIERS)
|
||||
, ipdom_size_((num_threads-1) * 2)
|
||||
, local_mem_base_(LMEM_BASE_ADDR)
|
||||
, tc_size_ (tc_size)
|
||||
, tc_num_ (tc_num)
|
||||
{}
|
||||
|
||||
uint16_t vsize() const {
|
||||
|
@ -94,6 +98,15 @@ public:
|
|||
uint16_t socket_size() const {
|
||||
return socket_size_;
|
||||
}
|
||||
|
||||
uint16_t tc_size() const {
|
||||
return tc_size_;
|
||||
}
|
||||
|
||||
uint16_t tc_num() const {
|
||||
return tc_num_;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
|
@ -105,12 +105,14 @@ Core::Core(const SimContext& ctx,
|
|||
dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES);
|
||||
dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES);
|
||||
dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES);
|
||||
|
||||
dispatchers_.at((int)FUType::TCU) = SimPlatform::instance().create_object<Dispatcher>(arch, 2, NUM_TCU_BLOCKS, NUM_TCU_LANES);
|
||||
|
||||
// initialize execute units
|
||||
func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object<AluUnit>(this);
|
||||
func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object<FpuUnit>(this);
|
||||
func_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
|
||||
func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
|
||||
func_units_.at((int)FUType::TCU) = SimPlatform::instance().create_object<TcuUnit>(this);
|
||||
|
||||
// bind commit arbiters
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
|
|
|
@ -170,6 +170,7 @@ private:
|
|||
friend class AluUnit;
|
||||
friend class FpuUnit;
|
||||
friend class SfuUnit;
|
||||
friend class TcuUnit;
|
||||
};
|
||||
|
||||
} // namespace vortex
|
||||
|
|
|
@ -51,6 +51,7 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
|
|||
{Opcode::EXT2, InstType::R4},
|
||||
{Opcode::R_W, InstType::R},
|
||||
{Opcode::I_W, InstType::I},
|
||||
{Opcode::TCU, InstType::I},
|
||||
};
|
||||
|
||||
enum Constants {
|
||||
|
@ -405,6 +406,16 @@ static const char* op_string(const Instr &instr) {
|
|||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
||||
case Opcode::TCU:
|
||||
switch(func3)
|
||||
{
|
||||
case 0: return "ML"; //
|
||||
case 1: return "MS"; //
|
||||
case 2: return "MATMUL";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
@ -543,6 +554,14 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
|
|||
|
||||
case InstType::I: {
|
||||
switch (op) {
|
||||
case Opcode::TCU: {
|
||||
instr->setDestReg(rs1, RegType::Integer);
|
||||
instr->addSrcReg(rs1, RegType::Integer);
|
||||
instr->setFunc3(func3);
|
||||
instr->setFunc7(func7);
|
||||
auto imm = code >> shift_rs2;
|
||||
instr->setImm(sext(imm, width_i_imm));
|
||||
} break;
|
||||
case Opcode::I:
|
||||
case Opcode::I_W:
|
||||
case Opcode::JALR:
|
||||
|
|
|
@ -74,6 +74,7 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core)
|
|||
, core_(core)
|
||||
, warps_(arch.num_warps(), arch)
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, scratchpad(std::vector<Word>(core->arch().tc_size() * core->arch().tc_size() * 32768)) //Fix this
|
||||
{
|
||||
this->clear();
|
||||
}
|
||||
|
@ -110,6 +111,11 @@ void Emulator::clear() {
|
|||
active_warps_.set(0);
|
||||
warps_[0].tmask.set(0);
|
||||
wspawn_.valid = false;
|
||||
|
||||
for (auto& reg : scratchpad)
|
||||
{
|
||||
reg = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void Emulator::attach_ram(RAM* ram) {
|
||||
|
@ -344,6 +350,11 @@ void Emulator::cout_flush() {
|
|||
case (addr + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((value >> 32) & 0xFFFFFFFF)
|
||||
#endif
|
||||
|
||||
Word Emulator::get_tiles()
|
||||
{
|
||||
return mat_size;
|
||||
}
|
||||
|
||||
Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
||||
auto core_perf = core_->perf_stats();
|
||||
switch (addr) {
|
||||
|
@ -375,6 +386,8 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_NUM_CORES: return uint32_t(arch_.num_cores()) * arch_.num_clusters();
|
||||
case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base();
|
||||
case VX_CSR_MSCRATCH: return csr_mscratch_;
|
||||
case VX_MAT_MUL_SIZE: return mat_size;
|
||||
|
||||
CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
|
||||
CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);
|
||||
default:
|
||||
|
@ -484,6 +497,9 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_MNSTATUS:
|
||||
case VX_CSR_MCAUSE:
|
||||
break;
|
||||
case VX_MAT_MUL_SIZE:
|
||||
mat_size = value;
|
||||
break;
|
||||
default: {
|
||||
std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
|
||||
std::abort();
|
||||
|
@ -500,4 +516,4 @@ void Emulator::update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid) {
|
|||
this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid);
|
||||
this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,6 +53,8 @@ public:
|
|||
bool wspawn(uint32_t num_warps, Word nextPC);
|
||||
|
||||
int get_exitcode() const;
|
||||
|
||||
Word get_tiles();
|
||||
|
||||
private:
|
||||
|
||||
|
@ -121,6 +123,8 @@ private:
|
|||
MemoryUnit mmu_;
|
||||
Word csr_mscratch_;
|
||||
wspawn_t wspawn_;
|
||||
std::vector<Word> scratchpad;
|
||||
uint32_t mat_size;
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include "emulator.h"
|
||||
#include "instr.h"
|
||||
#include "core.h"
|
||||
#include "VX_types.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
|
@ -1414,6 +1415,184 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
std::abort();
|
||||
}
|
||||
} break;
|
||||
case Opcode::TCU:
|
||||
{ //TODO - make it data-type flexible
|
||||
uint32_t mem_bytes = 1;
|
||||
DP(3, "mem_bytes=" << mem_bytes << std::endl);
|
||||
uint16_t tc_size = core_->arch().tc_size();
|
||||
uint32_t TC_per_warp = core_->arch().tc_num();
|
||||
|
||||
//Number of loads - dependant on the thread config
|
||||
uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid); //CSR instruction before MLOAD will ensure that this csr has value
|
||||
int num_data_per_thread;
|
||||
int num_data_per_thread_st;
|
||||
int num_threads_actv;
|
||||
int num_threads_actv_st;
|
||||
uint32_t data_bytes_load;
|
||||
uint32_t data_bytes_store;
|
||||
uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp);
|
||||
|
||||
//int num_warps = MIN()
|
||||
//int active_tcs = MIN (TC_per_warp, num_output_tiles/num_warps)
|
||||
//LOAD
|
||||
if(num_threads > tc_size*tc_size*n_tiles*TC_per_warp)
|
||||
{
|
||||
num_threads_actv = tc_size*tc_size*n_tiles*TC_per_warp;
|
||||
num_data_per_thread = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
num_threads_actv = num_threads;
|
||||
num_data_per_thread = (tc_size*tc_size*n_tiles)/num_threads_per_tc;
|
||||
}
|
||||
data_bytes_load = mem_bytes*num_data_per_thread;
|
||||
|
||||
//STORE
|
||||
|
||||
// DP(3, "DEBUG :: num_threads = " << num_threads);
|
||||
// DP(3, "DEBUG :: tc_size*tc_size = " << tc_size*tc_size);
|
||||
//DP(3, "imm = " << immsrc);
|
||||
|
||||
if(num_threads > tc_size*tc_size*TC_per_warp)
|
||||
{
|
||||
num_threads_actv_st = tc_size*tc_size*TC_per_warp;
|
||||
num_data_per_thread_st = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
num_threads_actv_st = num_threads;
|
||||
num_data_per_thread_st = (tc_size*tc_size)/num_threads_per_tc;
|
||||
}
|
||||
data_bytes_store = mem_bytes*num_data_per_thread_st;
|
||||
|
||||
DP(3, "Num Tiles=" << n_tiles << std::endl);
|
||||
|
||||
switch (func3) {
|
||||
case 0:
|
||||
{ //Matrix Load
|
||||
|
||||
DP (4, "TCU LOAD");
|
||||
trace->fu_type = FUType::LSU;
|
||||
trace->lsu_type = LsuType::TCU_LOAD;
|
||||
|
||||
trace->used_iregs.set(rsrc0);
|
||||
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
|
||||
trace->data = trace_data;
|
||||
|
||||
for (uint32_t t = thread_start; t < num_threads_actv; ++t)
|
||||
{
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
DP(3, "Thread ID" << t);
|
||||
|
||||
uint32_t base_addr = rsdata[t][0].i ;
|
||||
trace_data->mem_addrs.at(t) = {base_addr, data_bytes_load};
|
||||
|
||||
//Load A or B (depends on immsrc)
|
||||
int loop_offset = 0;
|
||||
DP(3, "n_tiles = " << n_tiles << "; num_data_per_thread = " << num_data_per_thread <<std::endl);
|
||||
for (int n=0; n<num_data_per_thread; n++)
|
||||
{
|
||||
Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
|
||||
this->dcache_read(temp_ref, (base_addr+(n*mem_bytes)+(loop_offset*mem_bytes)), mem_bytes);
|
||||
|
||||
scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n] = *temp_ref;
|
||||
DP(3, "Scratchpad Index: " << loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n << ", Value: " << scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n]);
|
||||
}
|
||||
//loop_offset += tc_size*tc_size;
|
||||
//}
|
||||
}
|
||||
rd_write = true;
|
||||
} break;
|
||||
case 1:
|
||||
{
|
||||
DP(4, "TCU STORE");
|
||||
trace->fu_type = FUType::LSU;
|
||||
trace->lsu_type = LsuType::TCU_STORE;
|
||||
|
||||
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
|
||||
trace->data = trace_data;
|
||||
uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
|
||||
|
||||
for (uint32_t t = thread_start; t < num_threads_actv_st; ++t)
|
||||
{
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
|
||||
DP(3, "Thread ID" << t);
|
||||
uint32_t base_addr = rsdata[t][0].i ;
|
||||
|
||||
trace_data->mem_addrs.at(t) = {base_addr, data_bytes_store};
|
||||
|
||||
//Store C
|
||||
for (int n=0; n<num_data_per_thread_st; n++)
|
||||
{
|
||||
uint64_t mem_addr = (base_addr+(n*mem_bytes));
|
||||
uint32_t csr_index = (2*num_data_per_thread_st) + n;
|
||||
uint32_t scratchpad_index = (tc_size*tc_size*2) + (t*num_data_per_thread) + n;
|
||||
|
||||
//scratchpad -> csr (TODO :: can intermediate step of moving to CSR be skipped?)
|
||||
//core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_);
|
||||
Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
|
||||
*temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n];
|
||||
|
||||
this->dcache_write(temp_ref, base_addr+(n*mem_bytes), mem_bytes);
|
||||
}
|
||||
}
|
||||
//Clear the scratchpad
|
||||
for(int i =0 ; i < scratchpad.size(); i++)
|
||||
{
|
||||
scratchpad[i] = 0;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
{ //Matrix Multiply
|
||||
DP(4, "TCU MULTIPLY MAT");
|
||||
trace->fu_type = FUType::TCU;
|
||||
trace->tcu_type = TCUType::TCU_MUL;
|
||||
uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2;
|
||||
uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp);
|
||||
for (uint32_t t = thread_start; t < num_threads_actv; ++t)
|
||||
{
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
|
||||
DP(3, "Thread ID" << t);
|
||||
//TC operation [only 1 thread in 1 warp needs to do this]
|
||||
if (t%threads_per_tc == 0)
|
||||
{
|
||||
//TODO - change to systolic array implementation
|
||||
uint32_t thread_offset = t*(tc_size*tc_size);
|
||||
int loop_offset = 0;
|
||||
int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
|
||||
// Loop over all tiles - output stationary
|
||||
//for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation?
|
||||
//{
|
||||
/*
|
||||
for (int i = 0; i < tc_size; i++) { //ROW-1
|
||||
for (int j = 0; j < tc_size; j++) { //COL-2
|
||||
int sum = 0;
|
||||
for (int k = 0; k < tc_size; k++)
|
||||
{ //COL-1
|
||||
sum = sum + scratchpad[loop_offset + thread_offset*n_tiles + i * tc_size + k] *scratchpad[loop_offset + thread_offset*n_tiles + offset_b + (k * tc_size + j)];
|
||||
}
|
||||
scratchpad[accu_offset + thread_offset +(i * tc_size + j)] += sum; //[i * col2 + j] = sum
|
||||
DP(3, "Scratchpad Index: " << accu_offset + (i * tc_size + j) << " , Value=" << scratchpad[accu_offset + (i * tc_size + j)]);
|
||||
|
||||
}
|
||||
}
|
||||
*/
|
||||
//loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment
|
||||
//}
|
||||
}
|
||||
}
|
||||
|
||||
}break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#include "core.h"
|
||||
#include "constants.h"
|
||||
#include "cache_sim.h"
|
||||
#include "VX_types.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
|
@ -162,7 +163,7 @@ void LsuUnit::tick() {
|
|||
continue;
|
||||
}
|
||||
|
||||
bool is_write = (trace->lsu_type == LsuType::STORE);
|
||||
bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE));
|
||||
|
||||
// check pending queue capacity
|
||||
if (!is_write && state.pending_rd_reqs.full()) {
|
||||
|
@ -175,13 +176,14 @@ void LsuUnit::tick() {
|
|||
}
|
||||
|
||||
uint32_t tag = 0;
|
||||
|
||||
if (!is_write) {
|
||||
tag = state.pending_rd_reqs.allocate({trace, 0});
|
||||
}
|
||||
|
||||
// send memory request
|
||||
auto num_reqs = this->send_requests(trace, block_idx, tag);
|
||||
|
||||
|
||||
if (!is_write) {
|
||||
state.pending_rd_reqs.at(tag).count = num_reqs;
|
||||
}
|
||||
|
@ -200,7 +202,14 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
|
|||
int count = 0;
|
||||
|
||||
auto trace_data = std::dynamic_pointer_cast<LsuTraceData>(trace->data);
|
||||
bool is_write = (trace->lsu_type == LsuType::STORE);
|
||||
bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE));
|
||||
|
||||
uint16_t req_per_thread = 1;
|
||||
if ((trace->lsu_type == LsuType::TCU_LOAD) || (trace->lsu_type == LsuType::TCU_STORE))
|
||||
{
|
||||
req_per_thread= (1>(trace_data->mem_addrs.at(0).size)/4)? 1: ((trace_data->mem_addrs.at(0).size)/4);
|
||||
}
|
||||
|
||||
auto t0 = trace->pid * NUM_LSU_LANES;
|
||||
|
||||
for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) {
|
||||
|
@ -213,33 +222,69 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) {
|
|||
|
||||
auto mem_addr = trace_data->mem_addrs.at(t);
|
||||
auto type = get_addr_type(mem_addr.addr);
|
||||
// DT(3, "addr_type = " << type << ", " << *trace);
|
||||
uint32_t mem_bytes = 1;
|
||||
for (int i = 0; i < req_per_thread; i++)
|
||||
{
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr + (i*mem_bytes);
|
||||
mem_req.write = is_write;
|
||||
mem_req.type = type;
|
||||
mem_req.tag = tag;
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
|
||||
dcache_req_port.push(mem_req, 1);
|
||||
DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
|
||||
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
|
||||
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr;
|
||||
mem_req.write = is_write;
|
||||
mem_req.type = type;
|
||||
mem_req.tag = tag;
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
|
||||
dcache_req_port.push(mem_req, 1);
|
||||
DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag
|
||||
<< ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace);
|
||||
|
||||
if (is_write) {
|
||||
++core_->perf_stats_.stores;
|
||||
} else {
|
||||
++core_->perf_stats_.loads;
|
||||
++pending_loads_;
|
||||
if (is_write) {
|
||||
++core_->perf_stats_.stores;
|
||||
} else {
|
||||
++core_->perf_stats_.loads;
|
||||
++pending_loads_;
|
||||
}
|
||||
|
||||
++count;
|
||||
}
|
||||
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TcuUnit::TcuUnit(const SimContext& ctx, Core* core)
|
||||
: FuncUnit(ctx, core, "TCU")
|
||||
, tc_size (core_->arch().tc_size())
|
||||
{}
|
||||
|
||||
void TcuUnit::tick() {
|
||||
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto& input = Inputs.at(i);
|
||||
if (input.empty())
|
||||
continue;
|
||||
auto& output = Outputs.at(i);
|
||||
auto trace = input.front();
|
||||
uint32_t n_tiles = core_->emulator_.get_tiles();
|
||||
switch (trace->tcu_type) {
|
||||
case TCUType::TCU_MUL:
|
||||
{ //mat size = n_tiles * tc_size
|
||||
int matmul_latency = (n_tiles * tc_size) + tc_size + tc_size;
|
||||
output.push(trace, matmul_latency);
|
||||
DT(3, "matmul_latency = " << matmul_latency << ", " << *trace);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, "pipeline-execute: op=" << trace->tcu_type << ", " << *trace);
|
||||
input.pop();
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
SfuUnit::SfuUnit(const SimContext& ctx, Core* core)
|
||||
: FuncUnit(ctx, core, "SFU")
|
||||
{}
|
||||
|
|
|
@ -100,6 +100,15 @@ private:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class TcuUnit : public FuncUnit {
|
||||
public:
|
||||
TcuUnit(const SimContext& ctx, Core*);
|
||||
uint64_t tc_size;
|
||||
void tick();
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class SfuUnit : public FuncUnit {
|
||||
public:
|
||||
SfuUnit(const SimContext& ctx, Core*);
|
||||
|
|
|
@ -46,7 +46,7 @@ enum class Opcode {
|
|||
EXT1 = 0x0b,
|
||||
EXT2 = 0x2b,
|
||||
EXT3 = 0x5b,
|
||||
EXT4 = 0x7b
|
||||
TCU = 0x7b
|
||||
};
|
||||
|
||||
enum class InstType {
|
||||
|
|
|
@ -75,6 +75,7 @@ public:
|
|||
AluType alu_type;
|
||||
FpuType fpu_type;
|
||||
SfuType sfu_type;
|
||||
TCUType tcu_type;
|
||||
};
|
||||
|
||||
ITraceData::Ptr data;
|
||||
|
|
|
@ -35,6 +35,8 @@ static void show_usage() {
|
|||
uint32_t num_threads = NUM_THREADS;
|
||||
uint32_t num_warps = NUM_WARPS;
|
||||
uint32_t num_cores = NUM_CORES;
|
||||
uint32_t tc_size = TC_SIZE;
|
||||
uint32_t tc_num = TC_NUM;
|
||||
bool showStats = false;
|
||||
const char* program = nullptr;
|
||||
|
||||
|
@ -81,7 +83,7 @@ int main(int argc, char **argv) {
|
|||
|
||||
{
|
||||
// create processor configuation
|
||||
Arch arch(num_threads, num_warps, num_cores);
|
||||
Arch arch(num_threads, num_warps, num_cores, tc_size, tc_num);
|
||||
|
||||
// create memory module
|
||||
RAM ram(0, RAM_PAGE_SIZE);
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#include <VX_config.h>
|
||||
#include <simobject.h>
|
||||
#include "debug.h"
|
||||
#include <iostream>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
|
@ -78,6 +79,7 @@ enum class FUType {
|
|||
LSU,
|
||||
FPU,
|
||||
SFU,
|
||||
TCU,
|
||||
Count
|
||||
};
|
||||
|
||||
|
@ -87,6 +89,7 @@ inline std::ostream &operator<<(std::ostream &os, const FUType& type) {
|
|||
case FUType::LSU: os << "LSU"; break;
|
||||
case FUType::FPU: os << "FPU"; break;
|
||||
case FUType::SFU: os << "SFU"; break;
|
||||
case FUType::TCU: os << "TCU"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
|
@ -118,14 +121,30 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
|
|||
|
||||
enum class LsuType {
|
||||
LOAD,
|
||||
TCU_LOAD,
|
||||
STORE,
|
||||
TCU_STORE,
|
||||
FENCE
|
||||
};
|
||||
|
||||
enum class TCUType {
|
||||
TCU_MUL
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const TCUType& type) {
|
||||
switch (type) {
|
||||
case TCUType::TCU_MUL: os << "TCU MUL"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
|
||||
switch (type) {
|
||||
case LsuType::LOAD: os << "LOAD"; break;
|
||||
case LsuType::TCU_LOAD: os << "TCU_LOAD"; break;
|
||||
case LsuType::STORE: os << "STORE"; break;
|
||||
case LsuType::TCU_STORE: os << "TCU_STORE"; break;
|
||||
case LsuType::FENCE: os << "FENCE"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
|
@ -383,7 +402,7 @@ public:
|
|||
, type_(type)
|
||||
, delay_(delay)
|
||||
, cursors_(num_outputs, 0)
|
||||
, num_reqs_(num_inputs / num_outputs)
|
||||
, num_reqs_(log2ceil(num_inputs / num_outputs))
|
||||
{
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 32);
|
||||
|
@ -407,7 +426,7 @@ public:
|
|||
void tick() {
|
||||
uint32_t I = Inputs.size();
|
||||
uint32_t O = Outputs.size();
|
||||
uint32_t R = num_reqs_;
|
||||
uint32_t R = 1 << num_reqs_;
|
||||
|
||||
// skip bypass mode
|
||||
if (I == O)
|
||||
|
|
14
tests/regression/matmul/Makefile
Normal file
14
tests/regression/matmul/Makefile
Normal file
|
@ -0,0 +1,14 @@
|
|||
ROOT_DIR := $(realpath ../../..)
|
||||
include $(ROOT_DIR)/config.mk
|
||||
|
||||
PROJECT := matmul
|
||||
|
||||
SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)
|
||||
|
||||
SRCS := $(SRC_DIR)/main.cpp
|
||||
|
||||
VX_SRCS := $(SRC_DIR)/kernel.cpp
|
||||
|
||||
OPTS ?= -n128 -d1
|
||||
|
||||
include ../common.mk
|
17
tests/regression/matmul/common.h
Normal file
17
tests/regression/matmul/common.h
Normal file
|
@ -0,0 +1,17 @@
|
|||
#ifndef _COMMON_H_
|
||||
#define _COMMON_H_
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_tasks;
|
||||
uint32_t num_warps;
|
||||
uint32_t num_threads;
|
||||
uint32_t TC_per_warp;
|
||||
uint32_t matrix_size;
|
||||
uint32_t data_size;
|
||||
uint64_t tc_size;
|
||||
uint64_t src0_addr;
|
||||
uint64_t src1_addr;
|
||||
uint64_t dst_addr;
|
||||
} kernel_arg_t;
|
||||
|
||||
#endif
|
124
tests/regression/matmul/kernel.cpp
Normal file
124
tests/regression/matmul/kernel.cpp
Normal file
|
@ -0,0 +1,124 @@
|
|||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
uint32_t task_id = blockIdx.x;
|
||||
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
|
||||
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
|
||||
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
|
||||
unsigned a_addr = reinterpret_cast<unsigned>(src0_ptr);
|
||||
unsigned b_addr = reinterpret_cast<unsigned>(src1_ptr);
|
||||
unsigned c_addr = reinterpret_cast<unsigned>(dst_ptr);
|
||||
|
||||
uint32_t tc_size = arg->tc_size;
|
||||
int TC_per_warp = arg->TC_per_warp;
|
||||
unsigned num_threads = arg->num_threads;
|
||||
int num_warps = arg->num_warps;
|
||||
uint32_t matrix_size = arg->matrix_size;
|
||||
|
||||
int n_tiles = matrix_size/tc_size;
|
||||
int num_output_tiles = (matrix_size*matrix_size)/(tc_size*tc_size);
|
||||
|
||||
int num_tasks = arg->num_tasks;
|
||||
|
||||
//Assuming matrix size always > tensor core size
|
||||
int warps_actual;
|
||||
if (TC_per_warp > num_output_tiles)
|
||||
warps_actual = 1;
|
||||
else
|
||||
warps_actual = num_output_tiles/TC_per_warp;
|
||||
|
||||
int num_warps_actual = (warps_actual < num_warps)? warps_actual: num_warps;
|
||||
int num_threads_per_tc = (1> num_threads/TC_per_warp)? 1: num_threads/TC_per_warp;
|
||||
|
||||
int num_tasks_per_thread = (1> (num_tasks/(num_threads*num_warps_actual)))? 1: (num_tasks/(num_threads*num_warps_actual));
|
||||
int num_tasks_per_warp = (1 > num_tasks/num_warps_actual)? 1:num_tasks/num_warps_actual;
|
||||
int task_id_first_warp = task_id%num_tasks_per_warp;
|
||||
|
||||
//A&B
|
||||
int num_data_per_op_tile = tc_size*tc_size*n_tiles;
|
||||
int num_data_per_warp = num_data_per_op_tile*((1> (num_output_tiles/num_warps_actual))?1:(num_output_tiles/num_warps_actual));
|
||||
|
||||
int addr_shift;
|
||||
if (((tc_size*tc_size*n_tiles)/(num_threads)) > 1)
|
||||
addr_shift = (tc_size*tc_size*n_tiles)/(num_threads);
|
||||
else
|
||||
addr_shift = 1;
|
||||
//Offset for 1st warp
|
||||
int offset = ((task_id_first_warp/num_tasks_per_thread)*addr_shift) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile);
|
||||
offset = offset + (num_data_per_warp*(task_id/num_tasks_per_warp));
|
||||
|
||||
//C
|
||||
int num_data_per_op_tile_c = tc_size*tc_size;
|
||||
int num_data_per_warp_c = num_data_per_warp/n_tiles;
|
||||
|
||||
int addr_shift_c;
|
||||
if (((tc_size*tc_size)/(num_threads)) > 1)
|
||||
addr_shift_c = tc_size;
|
||||
else
|
||||
addr_shift_c = 1;
|
||||
//Offset for 1st warp
|
||||
int offset_c = ((task_id_first_warp/num_tasks_per_thread)*addr_shift_c) + ((task_id_first_warp%num_tasks_per_thread)*num_data_per_op_tile_c);
|
||||
offset_c = offset_c + (num_data_per_warp_c*(task_id/num_tasks_per_warp));
|
||||
|
||||
int thread_limit = (num_threads < tc_size*tc_size*n_tiles*TC_per_warp)? num_threads : tc_size*tc_size*n_tiles*TC_per_warp;
|
||||
int thread_limit_c = (num_threads<tc_size*tc_size)? num_threads:tc_size*tc_size;
|
||||
|
||||
//OLD TASK DISTRIBUTION // For 8x8 matrix, 2x2 tc_size, 1 tc_num, 4threads, 2warps => 64 tasks => 32 tasks/warp => 8 tasks/thread
|
||||
/*task0->thread0, warp0
|
||||
task1->thread0 , warp0
|
||||
task2->thread0 , warp0
|
||||
.
|
||||
task7->thread0
|
||||
task8->thread1
|
||||
task9->thread1
|
||||
.
|
||||
.
|
||||
------
|
||||
task32 -> thread0, warp1
|
||||
task33 -> thread1, warp1
|
||||
.
|
||||
*/
|
||||
|
||||
//NEW TASK DISTRIBUTION // For 8x8 matrix, 2x2 tc_size, 1 tc_num, 4threads, 2warps => 64 tasks => 32 tasks/warp => 8 tasks/thread
|
||||
/*task0->thread0, warp0
|
||||
task1->thread1 , warp0
|
||||
task2->thread2 , warp0
|
||||
task3->thread3 ,...
|
||||
task4->thread0
|
||||
task5->thread1
|
||||
.
|
||||
.
|
||||
------
|
||||
task32 -> thread0, warp1
|
||||
task33 -> thread1, warp1
|
||||
.
|
||||
.*/
|
||||
|
||||
//TODO :: change this for new task->thread distribution
|
||||
if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit)
|
||||
{
|
||||
unsigned a_addr_base = a_addr + offset*arg->data_size;
|
||||
unsigned b_addr_base = b_addr + offset*arg->data_size;
|
||||
unsigned c_addr_base = c_addr + offset_c*arg->data_size;
|
||||
csr_write(VX_MAT_MUL_SIZE,n_tiles);
|
||||
mload (0, a_addr_base);
|
||||
mload (1, b_addr_base);
|
||||
//In case of multiple threads - sync load
|
||||
vx_fence();
|
||||
|
||||
mm(); //Assuming padding to ensure matrix size is a multiple of tc_size
|
||||
vx_fence();
|
||||
if (((task_id%num_tasks_per_warp)/num_tasks_per_thread) < thread_limit_c)
|
||||
ms(c_addr_base);
|
||||
//In case of multiple threads - sync store
|
||||
vx_fence();
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
343
tests/regression/matmul/main.cpp
Normal file
343
tests/regression/matmul/main.cpp
Normal file
|
@ -0,0 +1,343 @@
|
|||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <vortex.h>
|
||||
#include <cmath>
|
||||
#include "common.h"
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
do { \
|
||||
int _ret = _expr; \
|
||||
if (0 == _ret) \
|
||||
break; \
|
||||
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Host-side test state. File scope so RT_CHECK/cleanup() can release the
// device resources from any point in main().

// Path of the precompiled kernel image uploaded to the device.
const char* kernel_file = "kernel.vxbin";
// Square matrix dimension N (NxN matmul); 0 = unset, main() defaults it to 2.
uint32_t matrix_size = 0;
// Device handle plus device-memory buffers: operands A/B, result C,
// kernel image, and the kernel-argument block.
vx_device_h device = nullptr;
vx_buffer_h A_buffer = nullptr;
vx_buffer_h B_buffer = nullptr;
vx_buffer_h C_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;

// Scratch buffer; only referenced by the commented-out verification code.
std::vector<uint8_t> staging_buf;
// Argument block copied to the device before launch.
kernel_arg_t kernel_arg = {};
||||
|
||||
// Print a short usage banner for this host-side test driver.
static void show_usage() {
   std::cout << "Vortex Test." << std::endl
             << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
|
||||
|
||||
static void parse_args(int argc, char **argv, uint32_t &data_size) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:k:d:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
matrix_size = atoi(optarg);
|
||||
break;
|
||||
case 'k':
|
||||
kernel_file = optarg;
|
||||
break;
|
||||
case 'd':
|
||||
data_size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(A_buffer);
|
||||
vx_mem_free(B_buffer);
|
||||
vx_mem_free(C_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
// Host-side working set for one matmul run: random input matrices, a
// CPU-computed reference result, and the byte buffers that receive the
// tile-reordered operands before upload.
template<typename TYPE>
class mainVariables
{
  public:
    // Size every operand vector up front; each holds buf_size/data_size
    // elements (enough for the tile-expanded layout built in main()).
    mainVariables(uint32_t bufSize, uint32_t dataSize, uint32_t matrixSize)
        : buf_size(bufSize), data_size(dataSize), matrix_size(matrixSize)
    {
      const uint32_t num_elems = buf_size / data_size;
      src_A.resize(num_elems);
      src_B.resize(num_elems);
      refs.resize(num_elems);
    }

    // Fill A and B with pseudo-random values scaled into [0, matrix_size).
    void init_inputs ()
    {
      std::cout << "inside init" << std::endl;
      const uint32_t total = matrix_size * matrix_size;
      for (uint32_t i = 0; i < total; ++i)
      {
        float ra = static_cast<float>(std::rand()) / RAND_MAX;
        float rb = static_cast<float>(std::rand()) / RAND_MAX;
        src_A[i] = static_cast<TYPE>(ra * matrix_size);
        src_B[i] = static_cast<TYPE>(rb * matrix_size);
      }
    }

    // Naive O(n^3) row-major reference multiply: refs = src_A * src_B.
    void matmul_cpu()
    {
      for (uint32_t i = 0; i < matrix_size; ++i)
      {
        for (uint32_t j = 0; j < matrix_size; ++j)
        {
          TYPE acc(0);
          for (uint32_t k = 0; k < matrix_size; ++k)
          {
            acc += src_A[i * matrix_size + k] * src_B[k * matrix_size + j];
          }
          refs[i * matrix_size + j] = acc;
        }
      }
    }

    // Row-major source operands and the CPU reference result.
    std::vector<TYPE> src_A;
    std::vector<TYPE> src_B;
    std::vector<TYPE> refs;

    // Tile-reordered byte images of A and B, filled by main() and uploaded
    // to the device.
    std::vector<uint8_t> A_mat;
    std::vector<uint8_t> B_mat;

  private:
    uint32_t buf_size;    // total operand size in bytes
    uint32_t data_size;   // bytes per element
    uint32_t matrix_size; // N for an NxN matrix
};
|
||||
|
||||
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
uint32_t data_size = 0;
|
||||
parse_args(argc, argv, data_size);
|
||||
if (matrix_size == 0) {
|
||||
matrix_size = 2;
|
||||
}
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads, tc_size, TC_per_warp;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_SIZE, &tc_size));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_TC_NUM, &TC_per_warp));
|
||||
|
||||
std::cout << "Debug :: tc_size = " << tc_size << std::endl;
|
||||
std::cout << "Debug :: tc_num = " << TC_per_warp << std::endl;
|
||||
|
||||
int threads_per_tc;
|
||||
//TODO - can be changed
|
||||
//Number of output tiles * number of threads
|
||||
if (TC_per_warp > num_threads)
|
||||
threads_per_tc = 1;
|
||||
else
|
||||
threads_per_tc = num_threads/TC_per_warp;
|
||||
|
||||
uint32_t num_tasks = ((matrix_size*matrix_size)/(tc_size*tc_size))*threads_per_tc;
|
||||
|
||||
//size of each operand
|
||||
uint32_t buf_size = ((matrix_size*matrix_size)/(tc_size*tc_size))*(matrix_size/(tc_size))*(tc_size*tc_size)*data_size;
|
||||
|
||||
//256
|
||||
std::cout << "Debug :: buf_size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer));
|
||||
RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &B_buffer));
|
||||
RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &C_buffer));
|
||||
RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "A_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "B_addr=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "C_addr=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
mainVariables<int> variables (buf_size, data_size, matrix_size);
|
||||
variables.init_inputs();
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
// generate source data
|
||||
//////////////////////////////////////////////////
|
||||
variables.matmul_cpu();
|
||||
|
||||
uint32_t tc_size_f = tc_size*tc_size;
|
||||
uint32_t n_tiles = matrix_size/tc_size;
|
||||
|
||||
variables.A_mat.resize(buf_size);
|
||||
variables.B_mat.resize(buf_size);
|
||||
|
||||
//Demand matrix creation for A / traverse through the rows
|
||||
for(uint32_t k=0; k<n_tiles; k++)
|
||||
{
|
||||
//traverse through output tiles in a row
|
||||
for(uint32_t i=0; i<n_tiles; i++)
|
||||
{
|
||||
//traverse through tiles for one output tile
|
||||
for(uint32_t j=0; j< n_tiles; j++)
|
||||
{
|
||||
for(int t=0; t < tc_size*tc_size; t++)
|
||||
{
|
||||
variables.A_mat[n_tiles*n_tiles*tc_size_f*k + n_tiles*tc_size_f*i+tc_size_f*j + t] = variables.src_A[k*tc_size*matrix_size+ tc_size*j +(t/tc_size)*matrix_size + t%tc_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Demand matrix creation for B / traverse through the rows
|
||||
for(uint32_t k=0; k<n_tiles; k++)
|
||||
{
|
||||
//traverse through output tiles in a row
|
||||
for(uint32_t i=0; i<n_tiles; i++)
|
||||
{
|
||||
//traverse through tiles for one output tile
|
||||
for(uint32_t j=0; j< n_tiles; j++)
|
||||
{
|
||||
for(int t=0; t < tc_size*tc_size; t++)
|
||||
{
|
||||
variables.B_mat[n_tiles*n_tiles*tc_size_f*k + n_tiles*tc_size_f*i+tc_size_f*j + t] = variables.src_B[i*tc_size+ tc_size*matrix_size*j +(t/tc_size)*matrix_size + t%tc_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// upload matrix A buffer
|
||||
{
|
||||
std::cout << "upload matrix A buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(A_buffer, (int8_t*)variables.A_mat.data(), 0, buf_size));
|
||||
}
|
||||
|
||||
// upload matrix B buffer
|
||||
{
|
||||
std::cout << "upload matrix B buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(B_buffer, (int8_t*)variables.B_mat.data(), 0, buf_size));
|
||||
}
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
//Prep kernel arguments
|
||||
//////////////////////////////////////////////////
|
||||
//1
|
||||
std::cout << "Debug :: num_tasks = " << num_tasks << std::endl;
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.num_warps = num_warps;
|
||||
kernel_arg.num_threads = num_threads;
|
||||
kernel_arg.TC_per_warp = TC_per_warp;
|
||||
//1
|
||||
kernel_arg.matrix_size = matrix_size;
|
||||
kernel_arg.data_size = data_size;
|
||||
kernel_arg.tc_size = tc_size;
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
//////////////////////////////////////////////////
|
||||
//////////////////////////////////////////////////
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev((int8_t*)variables.B_mat.data(), C_buffer, 0, buf_size));
|
||||
|
||||
// verify result (TODO : needs to be fixed for for functional correctness)
|
||||
/*
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int8_t*)staging_buf.data();
|
||||
uint64_t tc_size = kernel_arg.tc_size;
|
||||
std::cout << "tc_size = " << tc_size << std::endl;
|
||||
int Result[matrix_size*matrix_size];
|
||||
int n_tiles = (matrix_size/tc_size);
|
||||
int tc_size_f = tc_size*tc_size;
|
||||
|
||||
//converting buf ptr (tile by tile) to CPU style linear (row by row)
|
||||
for(int k = 0; k < matrix_size/tc_size; k+= 1)
|
||||
{
|
||||
for(int j = 0; j < matrix_size; j+= tc_size)
|
||||
{
|
||||
for(int i =0; i < tc_size*tc_size; i++)
|
||||
{
|
||||
Result[ tc_size*matrix_size*k +j+ (i/tc_size)*matrix_size +i%(tc_size)] = buf_ptr[matrix_size*tc_size*k+tc_size*j+i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < matrix_size*matrix_size; ++i) {
|
||||
//int ref = i + i;
|
||||
int cur = Result[i];
|
||||
if (cur != refs[i]) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
std::cout << "CONDITIONALLY PASSED!" << std::endl;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue