// vortex/kernel/include/vx_intrinsics.h
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// The intrinsics implemented use RISC-V assembler pseudo-directives defined here:
// https://sourceware.org/binutils/docs/as/RISC_002dV_002dFormats.html
#ifndef __VX_INTRINSICS_H__
#define __VX_INTRINSICS_H__

#include <stddef.h>
#include <stdint.h>
#include <VX_types.h>

#if defined(__clang__)
#define __UNIFORM__ __attribute__((annotate("vortex.uniform")))
#else
#define __UNIFORM__
#endif

#ifdef __cplusplus
extern "C" {
#endif

#define RISCV_CUSTOM0 0x0B
#define RISCV_CUSTOM1 0x2B
#define RISCV_CUSTOM2 0x5B
#define RISCV_CUSTOM3 0x7B

// CSR accessor macros. The csr argument must be a compile-time constant CSR
// number. When the value is a compile-time constant below 32, the 5-bit
// immediate encoding is used; otherwise the register form is emitted.

// Read a CSR.
#define csr_read(csr) ({ \
    size_t __r; \
    __asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr) : "memory"); \
    __r; \
})

// Write a CSR.
#define csr_write(csr, val) ({ \
    size_t __v = (size_t)(val); \
    if (__builtin_constant_p(val) && __v < 32) \
        __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v) : "memory"); \
    else \
        __asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v) : "memory"); \
})

// Swap a CSR with a new value, returning the old value.
#define csr_swap(csr, val) ({ \
    size_t __r; \
    size_t __v = (size_t)(val); \
    if (__builtin_constant_p(val) && __v < 32) \
        __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v) : "memory"); \
    else \
        __asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v) : "memory"); \
    __r; \
})

// Set bits in a CSR, returning the old value.
#define csr_read_set(csr, val) ({ \
    size_t __r; \
    size_t __v = (size_t)(val); \
    if (__builtin_constant_p(val) && __v < 32) \
        __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v) : "memory"); \
    else \
        __asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v) : "memory"); \
    __r; \
})

// Set bits in a CSR.
#define csr_set(csr, val) ({ \
    size_t __v = (size_t)(val); \
    if (__builtin_constant_p(val) && __v < 32) \
        __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v) : "memory"); \
    else \
        __asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v) : "memory"); \
})

// Clear bits in a CSR, returning the old value.
#define csr_read_clear(csr, val) ({ \
    size_t __r; \
    size_t __v = (size_t)(val); \
    if (__builtin_constant_p(val) && __v < 32) \
        __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v) : "memory"); \
    else \
        __asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v) : "memory"); \
    __r; \
})

// Clear bits in a CSR.
#define csr_clear(csr, val) ({ \
    size_t __v = (size_t)(val); \
    if (__builtin_constant_p(val) && __v < 32) \
        __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v) : "memory"); \
    else \
        __asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v) : "memory"); \
})
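
// Usage sketch (illustrative, not part of this header): MY_CSR below is a
// hypothetical placeholder for a writable CSR number.
//
//   size_t tid = csr_read(VX_CSR_THREAD_ID); // same CSR vx_thread_id() reads
//   csr_write(MY_CSR, 1);   // constant < 32: assembles to the immediate form
//   csr_write(MY_CSR, tid); // runtime value: assembles to the register form
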
// Set thread mask
inline void vx_tmc(int thread_mask) {
    __asm__ volatile (".insn r %0, 0, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(thread_mask));
}

// Disable all threads in the current warp
inline void vx_tmc_zero() {
    __asm__ volatile (".insn r %0, 0, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0));
}

// Switch execution to single thread zero
inline void vx_tmc_one() {
    __asm__ volatile (
        "li a0, 1\n\t" // load immediate value 1 into the a0 (x10) register
        ".insn r %0, 0, 0, x0, a0, x0" :: "i"(RISCV_CUSTOM0) : "a0"
    );
}
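
// Usage sketch (illustrative): serialize a short region on thread 0, then
// restore the thread mask that was active on entry.
//
//   int mask = vx_active_threads(); // save the current thread mask
//   vx_tmc_one();                   // only thread 0 keeps executing
//   /* ... serial section ... */
//   vx_tmc(mask);                   // restore the saved mask
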
// Set thread predicate
inline void vx_pred(int condition, int thread_mask) {
    __asm__ volatile (".insn r %0, 5, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
}

// Set inverted thread predicate
inline void vx_pred_n(int condition, int thread_mask) {
    __asm__ volatile (".insn r %0, 5, 0, x1, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
}
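
// Usage sketch (illustrative): disable lanes whose condition is false for the
// predicated region; limit is a hypothetical variable. The reading of
// thread_mask as a fallback mask applied if the predicate disables every lane
// is an assumption, not something this header guarantees.
//
//   vx_pred(vx_thread_id() < limit, vx_active_threads());
//   /* ... executed only by lanes with id < limit ... */
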
// Spawn warps
typedef void (*vx_wspawn_pfn)();
inline void vx_wspawn(int num_warps, vx_wspawn_pfn func_ptr) {
    __asm__ volatile (".insn r %0, 1, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(num_warps), "r"(func_ptr));
}
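
// Usage sketch (illustrative): launch a hypothetical kernel_body() across the
// core's warps. That the issuing warp is not redirected and must call the
// function itself is an assumption about the wspawn semantics.
//
//   void kernel_body();                      // hypothetical entry point
//   vx_wspawn(vx_num_warps(), kernel_body);  // activate the other warps
//   kernel_body();                           // warp 0 joins in directly
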
// Split on a predicate
inline int vx_split(int predicate) {
    int ret;
    __asm__ volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
    return ret;
}

// Split on an inverted predicate
inline int vx_split_n(int predicate) {
    int ret;
    __asm__ volatile (".insn r %1, 2, 0, %0, %2, x1" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
    return ret;
}

// Join at the point recorded by a prior split
inline void vx_join(int stack_ptr) {
    __asm__ volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
}
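
// Usage sketch (illustrative): handle a divergent branch by hand. vx_split is
// assumed to return a token that the matching vx_join consumes to restore the
// pre-branch thread mask; pred is a hypothetical per-thread value.
//
//   int token = vx_split(pred);
//   if (pred) {
//     /* taken path */
//   } else {
//     /* not-taken path */
//   }
//   vx_join(token);
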
// Warp barrier
inline void vx_barrier(int barrier_id, int num_warps) {
    __asm__ volatile (".insn r %0, 4, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(barrier_id), "r"(num_warps));
}
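
// Usage sketch (illustrative): rendezvous every warp on the core at barrier 0.
//
//   vx_barrier(0, vx_num_warps());
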
// Return current thread identifier
inline int vx_thread_id() {
    int ret;
    __asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID));
    return ret;
}

// Return current warp identifier
inline int vx_warp_id() {
    int ret;
    __asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID));
    return ret;
}

// Return current core identifier
inline int vx_core_id() {
    int ret;
    __asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID));
    return ret;
}

// Return active threads mask
inline int vx_active_threads() {
    int ret;
    __asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_ACTIVE_THREADS));
    return ret;
}

// Return active warps mask
inline int vx_active_warps() {
    int ret;
    __asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_ACTIVE_WARPS));
    return ret;
}

// Return the number of threads per warp
inline int vx_num_threads() {
    int ret;
    __asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS));
    return ret;
}

// Return the number of warps per core
inline int vx_num_warps() {
    int ret;
    __asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
    return ret;
}

// Return the number of cores per cluster
inline int vx_num_cores() {
    int ret;
    __asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES));
    return ret;
}

// Return the hart identifier (thread id across the processor)
inline int vx_hart_id() {
    int ret;
    __asm__ volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID));
    return ret;
}
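
// Usage sketch (illustrative): build a processor-wide linear thread index from
// the identifier/count CSRs. That this matches vx_hart_id() is an assumption
// about the hart numbering scheme.
//
//   int linear_tid = (vx_core_id() * vx_num_warps() + vx_warp_id())
//                  * vx_num_threads() + vx_thread_id();
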
// Memory fence
inline void vx_fence() {
    __asm__ volatile ("fence iorw, iorw");
}

// Matrix load (opcode 0x7b = RISCV_CUSTOM3)
inline void vx_matrix_load(unsigned dest, unsigned addr) {
    __asm__ volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
}

// Matrix store
inline void vx_matrix_store(unsigned addr) {
    __asm__ volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr));
}

// Matrix multiply
inline void vx_matrix_mul() {
    __asm__ volatile (".insn i 0x7b, 2, x0, 0(x0)");
}
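
// Usage sketch (illustrative; the operand semantics below are assumptions
// inferred from the signatures above, not documented behavior): load two
// source tiles, multiply them, and store the result.
//
//   vx_matrix_load(0, (unsigned)(uintptr_t)a); // tile slot 0 <- matrix a
//   vx_matrix_load(1, (unsigned)(uintptr_t)b); // tile slot 1 <- matrix b
//   vx_matrix_mul();                           // multiply the loaded tiles
//   vx_matrix_store((unsigned)(uintptr_t)c);   // result -> matrix c
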
#ifdef __cplusplus
}
#endif
#endif // __VX_INTRINSICS_H__