diff --git a/kernel/include/vx_intrinsics.h b/kernel/include/vx_intrinsics.h
index f22819246..6000065e9 100644
--- a/kernel/include/vx_intrinsics.h
+++ b/kernel/include/vx_intrinsics.h
@@ -221,24 +221,6 @@ inline void vx_fence() {
     __asm__ volatile ("fence iorw, iorw");
 }
 
-//Matrix load
-inline void vx_matrix_load(unsigned dest, unsigned addr)
-{
-    __asm__ volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
-}
-
-//Matrix Store
-inline void vx_matrix_store(unsigned addr)
-{
-    __asm__ volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
-}
-
-//Matrix Mul
-inline void vx_matrix_mul()
-{
-    __asm__ volatile (".insn i 0x7b, 2, x0, 0(x0)");
-}
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/kernel/include/vx_tensor.h b/kernel/include/vx_tensor.h
new file mode 100644
index 000000000..f6edcbfba
--- /dev/null
+++ b/kernel/include/vx_tensor.h
@@ -0,0 +1,65 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef __VX_TENSOR_H__
+#define __VX_TENSOR_H__
+
+#include <cstdint>
+#include <cstddef>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+namespace tensor {
+
+enum frag_layout_t { row_major, col_major };
+enum mem_layout_t { mem_row_major, mem_col_major };
+
+template <typename T, frag_layout_t L>
+struct fragment {
+  typedef T DType;
+  static const frag_layout_t Layout = L;
+  typedef T VType __attribute__((vector_size(8 * sizeof(void*))));
+  VType data;
+};
+
+// Fill a fragment with a scalar value of the fragment's element type
+template <typename Frag>
+void fill_fragment(Frag &frag, typename Frag::DType value) {
+  // empty skeleton
+}
+
+template <typename Frag>
+void load_matrix_sync(Frag &frag, const void *ptr, size_t ld) {
+  // empty skeleton
+}
+
+// Perform the matrix multiply-accumulate: D = A * B + C
+template <typename FragD, typename FragA, typename FragB, typename FragC>
+void mma_sync(FragD &D, const FragA &A, const FragB &B, const FragC &C) {
+  // empty skeleton
+}
+
+// Store a fragment result back to global memory
+template <typename Frag>
+void store_matrix_sync(void *ptr, const Frag &frag, size_t ld, mem_layout_t layout) {
+  // empty skeleton
+}
+
+} // namespace tensor
+
+#endif // __VX_TENSOR_H__
\ No newline at end of file
diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp
index 088821fe7..4b78832d2 100644
--- a/sim/simx/emulator.cpp
+++ b/sim/simx/emulator.cpp
@@ -548,18 +548,6 @@ Word Emulator::get_csr(uint32_t addr, uint32_t wid, uint32_t tid) {
         CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
       }
     } break;
-  #ifdef EXT_V_ENABLE
-    case VX_DCR_MPM_CLASS_VEC: {
-      VecUnit::PerfStats vec_perf_stats;
-      vec_perf_stats += vec_unit_->perf_stats();
-      switch (addr) {
-        CSR_READ_64(VX_CSR_MPM_VEC_READS, vec_perf_stats.reads);
-        CSR_READ_64(VX_CSR_MPM_VEC_WRITES, vec_perf_stats.writes);
-        CSR_READ_64(VX_CSR_MPM_VEC_LAT, vec_perf_stats.latency);
-        CSR_READ_64(VX_CSR_MPM_VEC_ST, vec_perf_stats.stalls);
-      }
-    } break;
-  #endif
     default:
       std::cerr << "Error: invalid MPM CLASS: value=" << perf_class << std::endl;
       std::abort();
diff --git a/sim/simx/tensor_unit.cpp b/sim/simx/tensor_unit.cpp
new file mode 100644
index 000000000..b72016c9d
--- /dev/null
+++ b/sim/simx/tensor_unit.cpp
@@ -0,0 +1,97 @@
+
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "tensor_unit.h"
+
+using namespace vortex;
+
+template <typename T>
+class FMAD : public SimObject<FMAD<T>> {
+public:
+  SimPort<T> Input;
+  SimPort<T> Output;
+
+  FMAD(const SimContext &ctx, const char* name)
+    : SimObject<FMAD<T>>(ctx, name)
+    , Input(this)
+    , Output(this)
+  {}
+
+  virtual ~FMAD() {}
+
+  void reset() {
+    //--
+  }
+
+  void tick() {
+    //--
+  }
+};
+
+class TensorUnit::Impl {
+public:
+  Impl(TensorUnit* simobject, const Config& config, Core* core)
+    : simobject_(simobject)
+    , config_(config)
+    , core_(core)
+    , perf_stats_()
+  {}
+
+  ~Impl() {
+    // Destructor logic if needed
+  }
+
+  void reset() {
+    perf_stats_ = PerfStats();
+  }
+
+  void tick() {
+    // Implement the tick logic here
+  }
+
+  const PerfStats& perf_stats() const {
+    return perf_stats_;
+  }
+
+private:
+  TensorUnit* simobject_;
+  Config config_;
+  Core* core_;
+  PerfStats perf_stats_;
+};
+
+///////////////////////////////////////////////////////////////////////////////
+
+TensorUnit::TensorUnit(const SimContext &ctx, const char* name, const Config& config, Core* core)
+  : SimObject<TensorUnit>(ctx, name)
+  , Inputs(config.num_ports, this)
+  , Outputs(config.num_ports, this)
+  , impl_(new Impl(this, config, core))
+{}
+
+TensorUnit::~TensorUnit() {
+  delete impl_;
+}
+
+void TensorUnit::reset() {
+  impl_->reset();
+}
+
+void TensorUnit::tick() {
+  impl_->tick();
+}
+
+const TensorUnit::PerfStats &TensorUnit::perf_stats() const {
+  return impl_->perf_stats();
+}
\ No newline at end of file
diff --git a/sim/simx/tensor_unit.h b/sim/simx/tensor_unit.h
new file mode 100644
index 000000000..eaa84615f
--- /dev/null
+++ b/sim/simx/tensor_unit.h
@@ -0,0 +1,66 @@
+// Copyright © 2019-2023
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <simobject.h>
+#include "instr_trace.h"
+
+namespace vortex {
+
+class Core;
+
+class TensorUnit : public SimObject<TensorUnit> {
+public:
+  struct Config {
+    uint8_t num_ports;
+    uint8_t mac_latency;
+
+    Config()
+      : num_ports(0)
+      , mac_latency(0)
+    {}
+  };
+
+  struct PerfStats {
+    uint64_t latency;
+
+    PerfStats()
+      : latency(0)
+    {}
+
+    PerfStats& operator+=(const PerfStats& rhs) {
+      this->latency += rhs.latency;
+      return *this;
+    }
+  };
+
+  std::vector<SimPort<instr_trace_t*>> Inputs;
+  std::vector<SimPort<instr_trace_t*>> Outputs;
+
+  TensorUnit(const SimContext &ctx, const char* name, const Config& config, Core* core);
+
+  virtual ~TensorUnit();
+
+  virtual void reset();
+
+  virtual void tick();
+
+  const PerfStats& perf_stats() const;
+
+private:
+  class Impl;
+  Impl* impl_;
+};
+
+} // namespace vortex
diff --git a/tests/regression/sgemm_tpu/Makefile b/tests/regression/sgemm_tpu/Makefile
new file mode 100644
index 000000000..28ad15cfd
--- /dev/null
+++ b/tests/regression/sgemm_tpu/Makefile
@@ -0,0 +1,14 @@
+ROOT_DIR := $(realpath ../../..)
+include $(ROOT_DIR)/config.mk
+
+PROJECT := sgemm_tpu
+
+SRC_DIR := $(VORTEX_HOME)/tests/regression/$(PROJECT)
+
+SRCS := $(SRC_DIR)/main.cpp
+
+VX_SRCS := $(SRC_DIR)/kernel.cpp
+
+OPTS ?= -n32
+
+include ../common.mk
\ No newline at end of file
diff --git a/tests/regression/sgemm_tpu/common.h b/tests/regression/sgemm_tpu/common.h
new file mode 100644
index 000000000..b755970fc
--- /dev/null
+++ b/tests/regression/sgemm_tpu/common.h
@@ -0,0 +1,25 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#include <stdint.h>
+#include <stddef.h>
+
+#ifndef I_TYPE
+#define I_TYPE vortex::half_t
+#endif
+
+#ifndef O_TYPE
+#define O_TYPE float
+#endif
+
+typedef struct {
+  uint32_t grid_dim[2];
+  uint32_t block_dim[2];
+  uint32_t tileM, tileN, tileK;
+  uint32_t M, N, K;
+  uint64_t A_addr;
+  uint64_t B_addr;
+  uint64_t C_addr;
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/sgemm_tpu/kernel.cpp b/tests/regression/sgemm_tpu/kernel.cpp
new file mode 100644
index 000000000..fde56e0da
--- /dev/null
+++ b/tests/regression/sgemm_tpu/kernel.cpp
@@ -0,0 +1,46 @@
+#include <vx_spawn.h>
+#include <vx_tensor.h>
+#include "common.h"
+
+void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
+  auto A = reinterpret_cast<I_TYPE*>(arg->A_addr);
+  auto B = reinterpret_cast<I_TYPE*>(arg->B_addr);
+  auto C = reinterpret_cast<O_TYPE*>(arg->C_addr);
+
+  tensor::fragment<I_TYPE, tensor::row_major> fragA;
+  tensor::fragment<I_TYPE, tensor::row_major> fragB;
+  tensor::fragment<O_TYPE, tensor::row_major> fragC;
+
+  // calculate tile row & column based on block index
+  uint32_t tile_row = blockIdx.y * arg->tileM;
+  uint32_t tile_col = blockIdx.x * arg->tileN;
+
+  uint32_t N = arg->N;
+  uint32_t K = arg->K;
+  uint32_t tileK = arg->tileK;
+
+  // Initialize accumulator tile to zero
+  tensor::fill_fragment(fragC, 0.0f);
+
+  for (uint32_t i = 0; i < K; i += tileK) {
+    // Load A tile (row-major, leading dimension K)
+    auto tileA = A + (tile_row * K + i);
+    tensor::load_matrix_sync(fragA, tileA, K);
+
+    // Load B tile (row-major, leading dimension N)
+    auto tileB = B + (i * N + tile_col);
+    tensor::load_matrix_sync(fragB, tileB, N);
+
+    // Matrix multiply-accumulate: C += A * B
+    tensor::mma_sync(fragC, fragA, fragB, fragC);
+  }
+
+  // Store the computed C tile
+  auto tileC = C + (tile_row * N + tile_col);
+  tensor::store_matrix_sync(tileC, fragC, N, tensor::mem_row_major);
+}
+
+int main() {
+  kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
+  return vx_spawn_threads(2, arg->grid_dim, arg->block_dim, (vx_kernel_func_cb)kernel_body, arg);
+}
diff --git a/tests/regression/sgemm_tpu/main.cpp b/tests/regression/sgemm_tpu/main.cpp
new file mode 100644
index 000000000..a9356be62
--- /dev/null
+++ b/tests/regression/sgemm_tpu/main.cpp
@@ -0,0 +1,343 @@
+#include <iostream>
+#include <unistd.h>
+#include <cstdlib>
+#include <cmath>
+#include <algorithm>
+#include <vector>
+#include <chrono>
+#include "common.h"
+#include <vortex.h>
+
+#define FLOAT_ULP 6
+
+#define RT_CHECK(_expr)                                      \
+  do {                                                       \
+    int _ret = _expr;                                        \
+    if (0 == _ret)                                           \
+      break;                                                 \
+    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
+    cleanup();                                               \
+    exit(-1);                                                \
+  } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename Type>
+class Comparator {};
+
+template <>
+class Comparator<int8_t> {
+public:
+  static const char* type_str() {
+    return "int8";
+  }
+  static int8_t generate() {
+    return (int8_t)rand();
+  }
+  static bool compare(int a, int b, int index, int errors) {
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() {
+    return "int32";
+  }
+  static int generate() {
+    return (int)rand();
+  }
+  static bool compare(int a, int b, int index, int errors) {
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+template <>
+class Comparator<vortex::half_t> {
+public:
+  static const char* type_str() {
+    return "f16";
+  }
+  static vortex::half_t generate() {
+    return static_cast<vortex::half_t>(float(rand()) / RAND_MAX);
+  }
+  static bool compare(float a, float b, int index, int errors) {
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
+    auto d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() {
+    return "float";
+  }
+  static float generate() {
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) {
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
+    auto d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+static void matmul_cpu(O_TYPE* C, const I_TYPE* A, const I_TYPE* B, uint32_t M, uint32_t N, uint32_t K) {
+  for (uint32_t m = 0; m < M; ++m) {
+    for (uint32_t n = 0; n < N; ++n) {
+      O_TYPE sum(0);
+      for (uint32_t k = 0; k < K; ++k) {
+        sum += O_TYPE(A[m*K + k] * B[k*N + n]);
+      }
+      C[m*N + n] = sum;
+    }
+  }
+}
+
+const char* kernel_file = "kernel.vxbin";
+uint32_t M = 32;
+uint32_t N = 32;
+uint32_t K = 32;
+
+vx_device_h device = nullptr;
+vx_buffer_h A_buffer = nullptr;
+vx_buffer_h B_buffer = nullptr;
+vx_buffer_h C_buffer = nullptr;
+vx_buffer_h krnl_buffer = nullptr;
+vx_buffer_h args_buffer = nullptr;
+kernel_arg_t kernel_arg = {};
+
+static void show_usage() {
+  std::cout << "Vortex Test." << std::endl;
+  std::cout << "Usage: [-m M] [-n N] [-k K] [-h: help]" << std::endl;
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "m:n:k:h")) != -1) {
+    switch (c) {
+    case 'm':
+      M = atoi(optarg);
+      break;
+    case 'n':
+      N = atoi(optarg);
+      break;
+    case 'k':
+      K = atoi(optarg);
+      break;
+    case 'h':
+      show_usage();
+      exit(0);
+      break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+void cleanup() {
+  if (device) {
+    vx_mem_free(A_buffer);
+    vx_mem_free(B_buffer);
+    vx_mem_free(C_buffer);
+    vx_mem_free(krnl_buffer);
+    vx_mem_free(args_buffer);
+    vx_dev_close(device);
+  }
+}
+
+int main(int argc, char *argv[]) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  std::srand(50);
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  uint64_t NT;
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &NT));
+  std::cout << "GPU warp size: " << NT << std::endl;
+
+  uint64_t isa_flags;
+  RT_CHECK(vx_dev_caps(device, VX_CAPS_ISA_FLAGS, &isa_flags));
+  uint32_t XlenB = 4 * VX_ISA_ARCH(isa_flags);
+  std::cout << "GPU XLEN: " << 8 * XlenB << std::endl;
+
+  // tile format ratio
+  uint32_t o_ratio = XlenB / sizeof(O_TYPE);
+  uint32_t i_ratio = XlenB / sizeof(I_TYPE);
+
+  // determine tensor tile size
+  uint32_t logNT = log2(NT);
+  uint32_t tileM = 4 * (1 << (logNT / 2)) * o_ratio;
+  uint32_t tileN = (logNT % 2 == 0) ? tileM / 2 : tileM;
+  uint32_t tileK = std::min(tileM, tileN) * i_ratio;
+
+  std::cout << "GPU tensor tileM=" << tileM << ", tileN=" << tileN << ", tileK=" << tileK << std::endl;
+
+  if ((M & (tileM - 1)) != 0) {
+    std::cout << "Error: M must be a multiple of tensor tileM!" << std::endl;
+    return -1;
+  }
+
+  if ((N & (tileN - 1)) != 0) {
+    std::cout << "Error: N must be a multiple of tensor tileN!" << std::endl;
+    return -1;
+  }
+
+  if ((K & (tileK - 1)) != 0) {
+    std::cout << "Error: K must be a multiple of tensor tileK!" << std::endl;
+    return -1;
+  }
+
+  kernel_arg.tileM = tileM;
+  kernel_arg.tileN = tileN;
+  kernel_arg.tileK = tileK;
+
+  size_t sizeA = M * K;
+  size_t sizeB = K * N;
+  size_t sizeC = M * N;
+
+  std::cout << "input data type: " << Comparator<I_TYPE>::type_str() << std::endl;
+  std::cout << "output data type: " << Comparator<O_TYPE>::type_str() << std::endl;
+  std::cout << "matrix A: " << M << "x" << K << std::endl;
+  std::cout << "matrix B: " << K << "x" << N << std::endl;
+  std::cout << "matrix C: " << M << "x" << N << std::endl;
+
+  // set block size to warp size
+  kernel_arg.grid_dim[0] = N / tileN;
+  kernel_arg.grid_dim[1] = M / tileM;
+  kernel_arg.block_dim[0] = NT; // warp size
+  kernel_arg.block_dim[1] = 1;
+
+  // set matrix dimensions
+  kernel_arg.M = M;
+  kernel_arg.N = N;
+  kernel_arg.K = K;
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+  RT_CHECK(vx_mem_alloc(device, sizeA * sizeof(I_TYPE), VX_MEM_READ, &A_buffer));
+  RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr));
+  RT_CHECK(vx_mem_alloc(device, sizeB * sizeof(I_TYPE), VX_MEM_READ, &B_buffer));
+  RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.B_addr));
+  RT_CHECK(vx_mem_alloc(device, sizeC * sizeof(O_TYPE), VX_MEM_WRITE, &C_buffer));
+  RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.C_addr));
+
+  std::cout << "A_addr=0x" << std::hex << kernel_arg.A_addr << std::endl;
+  std::cout << "B_addr=0x" << std::hex << kernel_arg.B_addr << std::endl;
+  std::cout << "C_addr=0x" << std::hex << kernel_arg.C_addr << std::endl;
+
+  // generate source data
+  std::vector<I_TYPE> h_A(sizeA);
+  std::vector<I_TYPE> h_B(sizeB);
+  for (uint32_t i = 0; i < sizeA; ++i) {
+    h_A[i] = Comparator<I_TYPE>::generate();
+  }
+  for (uint32_t i = 0; i < sizeB; ++i) {
+    h_B[i] = Comparator<I_TYPE>::generate();
+  }
+
+  // upload matrix A buffer
+  {
+    std::cout << "upload matrix A buffer" << std::endl;
+    RT_CHECK(vx_copy_to_dev(A_buffer, h_A.data(), 0, sizeA * sizeof(I_TYPE)));
+  }
+
+  // upload matrix B buffer
+  {
+    std::cout << "upload matrix B buffer" << std::endl;
+    RT_CHECK(vx_copy_to_dev(B_buffer, h_B.data(), 0, sizeB * sizeof(I_TYPE)));
+  }
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
+
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
+
+  auto time_start = std::chrono::high_resolution_clock::now();
+
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  // download destination buffer
+  std::vector<O_TYPE> h_C(sizeC);
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(h_C.data(), C_buffer, 0, sizeC * sizeof(O_TYPE)));
+
+  // verify result
+  std::cout << "verify result" << std::endl;
+  int errors = 0;
+  {
+    std::vector<O_TYPE> h_ref(sizeC);
+    matmul_cpu(h_ref.data(), h_A.data(), h_B.data(), M, N, K);
+
+    for (uint32_t i = 0; i < h_ref.size(); ++i) {
+      if (!Comparator<O_TYPE>::compare(h_C[i], h_ref[i], i, errors)) {
+        ++errors;
+      }
+    }
+  }
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;
+  cleanup();
+
+  if (errors != 0) {
+    std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+    std::cout << "FAILED!" << std::endl;
+    return errors;
+  }
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}
\ No newline at end of file
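
Note (not part of the diff): the tensor::fill_fragment, load_matrix_sync, mma_sync, and store_matrix_sync entry points added in vx_tensor.h are empty skeletons, so the per-tile contract they are expected to satisfy is only implied by their comments and by how kernel.cpp calls them. A minimal host-side sketch of that contract, assuming row-major tiles and the explicit leading-dimension convention used above (the function name and signature are illustrative, not part of the PR):

#include <cstdint>

// Reference tile semantics: D = A * B + C, where A is tileM x tileK,
// B is tileK x tileN, and C/D are tileM x tileN, all row-major with
// explicit leading dimensions (lda, ldb, ldc).
template <typename TI, typename TO>
void mma_tile_reference(TO* D, const TI* A, const TI* B, const TO* C,
                        uint32_t tileM, uint32_t tileN, uint32_t tileK,
                        uint32_t lda, uint32_t ldb, uint32_t ldc) {
  for (uint32_t m = 0; m < tileM; ++m) {
    for (uint32_t n = 0; n < tileN; ++n) {
      TO acc = C[m * ldc + n]; // start from the accumulator tile
      for (uint32_t k = 0; k < tileK; ++k) {
        acc += TO(A[m * lda + k] * B[k * ldb + n]);
      }
      D[m * ldc + n] = acc; // write back the updated tile
    }
  }
}

This is the same accumulation matmul_cpu() performs in main.cpp, restricted to a single tileM x tileN output tile.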