mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 22:07:41 -04:00
Migrating all tests to new kernel launch API
This commit is contained in:
parent
8b63305201
commit
250a5741f7
77 changed files with 961 additions and 1108 deletions
|
@ -232,11 +232,11 @@ then
|
|||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
else
|
||||
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
fi
|
||||
|
||||
|
@ -257,11 +257,11 @@ then
|
|||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
else
|
||||
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
fi
|
||||
fi
|
||||
|
|
|
@ -133,6 +133,9 @@ opencl()
|
|||
make -C tests/opencl run-simx
|
||||
make -C tests/opencl run-rtlsim
|
||||
|
||||
./ci/blackbox.sh --driver=simx --app=lbm --warps=8
|
||||
./ci/blackbox.sh --driver=rtlsim --app=lbm --warps=8
|
||||
|
||||
echo "opencl tests done!"
|
||||
}
|
||||
|
||||
|
|
|
@ -96,9 +96,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
|
||||
wire [127:0] afu_id = `AFU_ACCEL_UUID;
|
||||
|
||||
wire [63:0] dev_caps = {8'b0,
|
||||
wire [63:0] dev_caps = {16'b0,
|
||||
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
|
||||
8'(`NUM_BARRIERS),
|
||||
16'(`NUM_CORES * `NUM_CLUSTERS),
|
||||
8'(`NUM_WARPS),
|
||||
8'(`NUM_THREADS),
|
||||
|
|
|
@ -142,9 +142,8 @@ module VX_afu_ctrl #(
|
|||
RSTATE_DATA = 2'd1;
|
||||
|
||||
// device caps
|
||||
wire [63:0] dev_caps = {8'b0,
|
||||
wire [63:0] dev_caps = {16'b0,
|
||||
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
|
||||
8'(`NUM_BARRIERS),
|
||||
16'(`NUM_CORES * `NUM_CLUSTERS),
|
||||
8'(`NUM_WARPS),
|
||||
8'(`NUM_THREADS),
|
||||
|
|
|
@ -17,9 +17,17 @@
|
|||
#ifndef __VX_INTRINSICS_H__
|
||||
#define __VX_INTRINSICS_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
#if __riscv_xlen == 64
|
||||
typedef unsigned long size_t; // 64-bit RISC-V
|
||||
#elif __riscv_xlen == 32
|
||||
typedef unsigned int size_t; // 32-bit RISC-V
|
||||
#else
|
||||
#error "Unknown RISC-V architecture"
|
||||
#endif
|
||||
|
||||
#if defined(__clang__)
|
||||
#define __UNIFORM__ __attribute__((annotate("vortex.uniform")))
|
||||
#else
|
||||
|
|
|
@ -21,16 +21,6 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef void (*vx_spawn_tasks_cb)(uint32_t task_id, void *arg);
|
||||
|
||||
typedef void (*vx_serial_cb)(void *arg);
|
||||
|
||||
void vx_spawn_tasks(uint32_t num_tasks, vx_spawn_tasks_cb callback, const void * arg);
|
||||
|
||||
void vx_serial(vx_serial_cb callback, const void * arg);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
typedef union {
|
||||
struct {
|
||||
uint32_t x;
|
||||
|
@ -46,23 +36,28 @@ extern dim3_t gridDim;
|
|||
extern dim3_t blockDim;
|
||||
|
||||
extern __thread uint32_t __local_group_id;
|
||||
extern uint32_t __groups_per_core;
|
||||
extern uint32_t __warps_per_group;
|
||||
|
||||
typedef void (*vx_kernel_func_cb)(void *arg);
|
||||
|
||||
typedef void (*vx_serial_cb)(void *arg);
|
||||
|
||||
#define __local_mem(size) \
|
||||
(void*)((int8_t*)csr_read(VX_CSR_LOCAL_MEM_BASE) + __local_group_id * size)
|
||||
|
||||
#define __syncthreads() \
|
||||
vx_barrier(__COUNTER__ * __groups_per_core + __local_group_id, __warps_per_group)
|
||||
vx_barrier(__local_group_id, __warps_per_group)
|
||||
|
||||
// launch a kernel function with a grid of blocks and block of threads
|
||||
int vx_spawn_threads(uint32_t dimension,
|
||||
const uint32_t* grid_dim,
|
||||
const uint32_t* block_dim,
|
||||
vx_kernel_func_cb kernel_func,
|
||||
const void* arg);
|
||||
|
||||
// function call serialization
|
||||
void vx_serial(vx_serial_cb callback, const void * arg);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -13,7 +13,6 @@
|
|||
|
||||
#include <vx_spawn.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <inttypes.h>
|
||||
#include <vx_print.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -24,142 +23,12 @@ extern "C" {
|
|||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
vx_spawn_tasks_cb callback;
|
||||
const void* arg;
|
||||
uint32_t all_tasks_offset;
|
||||
uint32_t remain_tasks_offset;
|
||||
uint32_t warp_batches;
|
||||
uint32_t remaining_warps;
|
||||
} wspawn_tasks_args_t;
|
||||
|
||||
static void __attribute__ ((noinline)) process_all_tasks() {
|
||||
wspawn_tasks_args_t* targs = (wspawn_tasks_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
|
||||
uint32_t threads_per_warp = vx_num_threads();
|
||||
uint32_t warp_id = vx_warp_id();
|
||||
uint32_t thread_id = vx_thread_id();
|
||||
|
||||
uint32_t start_warp = (warp_id * targs->warp_batches) + MIN(warp_id, targs->remaining_warps);
|
||||
uint32_t iterations = targs->warp_batches + (warp_id < targs->remaining_warps);
|
||||
|
||||
uint32_t start_task_id = targs->all_tasks_offset + (start_warp * threads_per_warp) + thread_id;
|
||||
uint32_t end_task_id = start_task_id + iterations * threads_per_warp;
|
||||
|
||||
vx_spawn_tasks_cb callback = targs->callback;
|
||||
const void* arg = targs->arg;
|
||||
for (uint32_t task_id = start_task_id; task_id < end_task_id; task_id += threads_per_warp) {
|
||||
callback(task_id, (void*)arg);
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) process_remaining_tasks() {
|
||||
wspawn_tasks_args_t* targs = (wspawn_tasks_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
|
||||
uint32_t thread_id = vx_thread_id();
|
||||
uint32_t task_id = targs->remain_tasks_offset + thread_id;
|
||||
|
||||
(targs->callback)(task_id, (void*)targs->arg);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) process_all_tasks_stub() {
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
// process all tasks
|
||||
process_all_tasks();
|
||||
|
||||
// disable warp
|
||||
vx_tmc_zero();
|
||||
}
|
||||
|
||||
void vx_spawn_tasks(uint32_t num_tasks, vx_spawn_tasks_cb callback , const void * arg) {
|
||||
// device specifications
|
||||
uint32_t num_cores = vx_num_cores();
|
||||
uint32_t warps_per_core = vx_num_warps();
|
||||
uint32_t threads_per_warp = vx_num_threads();
|
||||
uint32_t core_id = vx_core_id();
|
||||
|
||||
// calculate necessary active cores
|
||||
uint32_t threads_per_core = warps_per_core * threads_per_warp;
|
||||
uint32_t needed_cores = (num_tasks + threads_per_core - 1) / threads_per_core;
|
||||
uint32_t active_cores = MIN(needed_cores, num_cores);
|
||||
|
||||
// only active cores participate
|
||||
if (core_id >= active_cores)
|
||||
return;
|
||||
|
||||
// number of tasks per core
|
||||
uint32_t tasks_per_core = num_tasks / active_cores;
|
||||
uint32_t remaining_tasks_per_core = num_tasks - tasks_per_core * active_cores;
|
||||
if (core_id < remaining_tasks_per_core)
|
||||
tasks_per_core++;
|
||||
|
||||
// calculate number of warps to activate
|
||||
uint32_t total_warps_per_core = tasks_per_core / threads_per_warp;
|
||||
uint32_t remaining_tasks = tasks_per_core - total_warps_per_core * threads_per_warp;
|
||||
uint32_t active_warps = total_warps_per_core;
|
||||
uint32_t warp_batches = 1, remaining_warps = 0;
|
||||
if (active_warps > warps_per_core) {
|
||||
active_warps = warps_per_core;
|
||||
warp_batches = total_warps_per_core / active_warps;
|
||||
remaining_warps = total_warps_per_core - warp_batches * active_warps;
|
||||
}
|
||||
|
||||
// calculate offsets for task distribution
|
||||
uint32_t all_tasks_offset = core_id * tasks_per_core + MIN(core_id, remaining_tasks_per_core);
|
||||
uint32_t remain_tasks_offset = all_tasks_offset + (tasks_per_core - remaining_tasks);
|
||||
|
||||
// prepare scheduler arguments
|
||||
wspawn_tasks_args_t wspawn_args = {
|
||||
callback,
|
||||
arg,
|
||||
all_tasks_offset,
|
||||
remain_tasks_offset,
|
||||
warp_batches,
|
||||
remaining_warps
|
||||
};
|
||||
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
|
||||
|
||||
if (active_warps >= 1) {
|
||||
// execute callback on other warps
|
||||
vx_wspawn(active_warps, process_all_tasks_stub);
|
||||
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
// process all tasks
|
||||
process_all_tasks();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
|
||||
if (remaining_tasks != 0) {
|
||||
// activate remaining threads
|
||||
uint32_t tmask = (1 << remaining_tasks) - 1;
|
||||
vx_tmc(tmask);
|
||||
|
||||
// process remaining tasks
|
||||
process_remaining_tasks();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
|
||||
// wait for spawned tasks to complete
|
||||
vx_wspawn(1, 0);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
__thread dim3_t blockIdx;
|
||||
__thread dim3_t threadIdx;
|
||||
dim3_t gridDim;
|
||||
dim3_t blockDim;
|
||||
|
||||
__thread uint32_t __local_group_id;
|
||||
uint32_t __groups_per_core;
|
||||
uint32_t __warps_per_group;
|
||||
|
||||
typedef struct {
|
||||
|
@ -171,10 +40,68 @@ typedef struct {
|
|||
uint32_t warps_per_group;
|
||||
uint32_t groups_per_core;
|
||||
uint32_t remaining_mask;
|
||||
} wspawn_task_groups_args_t;
|
||||
} wspawn_groups_args_t;
|
||||
|
||||
static void __attribute__ ((noinline)) process_all_task_groups() {
|
||||
wspawn_task_groups_args_t* targs = (wspawn_task_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
typedef struct {
|
||||
vx_kernel_func_cb callback;
|
||||
const void* arg;
|
||||
uint32_t all_tasks_offset;
|
||||
uint32_t remain_tasks_offset;
|
||||
uint32_t warp_batches;
|
||||
uint32_t remaining_warps;
|
||||
} wspawn_threads_args_t;
|
||||
|
||||
static void __attribute__ ((noinline)) process_threads() {
|
||||
wspawn_threads_args_t* targs = (wspawn_threads_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
|
||||
uint32_t threads_per_warp = vx_num_threads();
|
||||
uint32_t warp_id = vx_warp_id();
|
||||
uint32_t thread_id = vx_thread_id();
|
||||
|
||||
uint32_t start_warp = (warp_id * targs->warp_batches) + MIN(warp_id, targs->remaining_warps);
|
||||
uint32_t iterations = targs->warp_batches + (warp_id < targs->remaining_warps);
|
||||
|
||||
uint32_t start_task_id = targs->all_tasks_offset + (start_warp * threads_per_warp) + thread_id;
|
||||
uint32_t end_task_id = start_task_id + iterations * threads_per_warp;
|
||||
|
||||
__local_group_id = 0;
|
||||
threadIdx.x = 0;
|
||||
threadIdx.y = 0;
|
||||
threadIdx.z = 0;
|
||||
|
||||
vx_kernel_func_cb callback = targs->callback;
|
||||
const void* arg = targs->arg;
|
||||
|
||||
for (uint32_t task_id = start_task_id; task_id < end_task_id; task_id += threads_per_warp) {
|
||||
blockIdx.x = task_id % gridDim.x;
|
||||
blockIdx.y = (task_id / gridDim.x) % gridDim.y;
|
||||
blockIdx.z = task_id / (gridDim.x * gridDim.y);
|
||||
callback((void*)arg);
|
||||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) process_remaining_threads() {
|
||||
wspawn_threads_args_t* targs = (wspawn_threads_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
|
||||
uint32_t thread_id = vx_thread_id();
|
||||
uint32_t task_id = targs->remain_tasks_offset + thread_id;
|
||||
|
||||
(targs->callback)((void*)targs->arg);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) process_threads_stub() {
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
// process all tasks
|
||||
process_threads();
|
||||
|
||||
// disable warp
|
||||
vx_tmc_zero();
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) process_thread_groups() {
|
||||
wspawn_groups_args_t* targs = (wspawn_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
|
||||
uint32_t threads_per_warp = vx_num_threads();
|
||||
uint32_t warp_id = vx_warp_id();
|
||||
|
@ -209,8 +136,8 @@ static void __attribute__ ((noinline)) process_all_task_groups() {
|
|||
}
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) process_all_task_groups_stub() {
|
||||
wspawn_task_groups_args_t* targs = (wspawn_task_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
static void __attribute__ ((noinline)) process_thread_groups_stub() {
|
||||
wspawn_groups_args_t* targs = (wspawn_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
uint32_t warps_per_group = targs->warps_per_group;
|
||||
uint32_t remaining_mask = targs->remaining_mask;
|
||||
uint32_t warp_id = vx_warp_id();
|
||||
|
@ -220,8 +147,8 @@ static void __attribute__ ((noinline)) process_all_task_groups_stub() {
|
|||
// activate threads
|
||||
vx_tmc(threads_mask);
|
||||
|
||||
// process all tasks
|
||||
process_all_task_groups();
|
||||
// process thread groups
|
||||
process_thread_groups();
|
||||
|
||||
// disable all warps except warp0
|
||||
vx_tmc(0 == vx_warp_id());
|
||||
|
@ -253,10 +180,12 @@ int vx_spawn_threads(uint32_t dimension,
|
|||
// check group size
|
||||
uint32_t threads_per_core = warps_per_core * threads_per_warp;
|
||||
if (threads_per_core < group_size) {
|
||||
vx_printf("error: group_size > threads_per_core (%d, %d)\n", group_size, threads_per_core);
|
||||
vx_printf("error: group_size > threads_per_core (%d,%d)\n", group_size, threads_per_core);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (group_size > 1) {
|
||||
// calculate number of warps per group
|
||||
uint32_t warps_per_group = group_size / threads_per_warp;
|
||||
uint32_t remaining_threads = group_size - warps_per_group * threads_per_warp;
|
||||
uint32_t remaining_mask = -1;
|
||||
|
@ -265,6 +194,7 @@ int vx_spawn_threads(uint32_t dimension,
|
|||
++warps_per_group;
|
||||
}
|
||||
|
||||
// calculate necessary active cores
|
||||
uint32_t needed_warps = num_groups * warps_per_group;
|
||||
uint32_t needed_cores = (needed_warps + warps_per_core-1) / warps_per_core;
|
||||
uint32_t active_cores = MIN(needed_cores, num_cores);
|
||||
|
@ -273,6 +203,7 @@ int vx_spawn_threads(uint32_t dimension,
|
|||
if (core_id >= active_cores)
|
||||
return 0;
|
||||
|
||||
// total number of groups per core
|
||||
uint32_t total_groups_per_core = num_groups / active_cores;
|
||||
uint32_t remaining_groups_per_core = num_groups - active_cores * total_groups_per_core;
|
||||
if (core_id < remaining_groups_per_core)
|
||||
|
@ -293,7 +224,7 @@ int vx_spawn_threads(uint32_t dimension,
|
|||
uint32_t group_offset = core_id * total_groups_per_core + MIN(core_id, remaining_groups_per_core);
|
||||
|
||||
// set scheduler arguments
|
||||
wspawn_task_groups_args_t wspawn_args = {
|
||||
wspawn_groups_args_t wspawn_args = {
|
||||
kernel_func,
|
||||
arg,
|
||||
group_offset,
|
||||
|
@ -306,16 +237,85 @@ int vx_spawn_threads(uint32_t dimension,
|
|||
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
|
||||
|
||||
// set global variables
|
||||
__groups_per_core = groups_per_core;
|
||||
__warps_per_group = warps_per_group;
|
||||
|
||||
// execute callback on other warps
|
||||
vx_wspawn(active_warps, process_all_task_groups_stub);
|
||||
vx_wspawn(active_warps, process_thread_groups_stub);
|
||||
|
||||
// execute callback on warp0
|
||||
process_all_task_groups_stub();
|
||||
process_thread_groups_stub();
|
||||
} else {
|
||||
uint32_t num_tasks = num_groups;
|
||||
__warps_per_group = 0;
|
||||
|
||||
// wait for spawned tasks to complete
|
||||
// calculate necessary active cores
|
||||
uint32_t needed_cores = (num_tasks + threads_per_core - 1) / threads_per_core;
|
||||
uint32_t active_cores = MIN(needed_cores, num_cores);
|
||||
|
||||
// only active cores participate
|
||||
if (core_id >= active_cores)
|
||||
return 0;
|
||||
|
||||
// number of tasks per core
|
||||
uint32_t tasks_per_core = num_tasks / active_cores;
|
||||
uint32_t remaining_tasks_per_core = num_tasks - tasks_per_core * active_cores;
|
||||
if (core_id < remaining_tasks_per_core)
|
||||
++tasks_per_core;
|
||||
|
||||
// calculate number of warps to activate
|
||||
uint32_t total_warps_per_core = tasks_per_core / threads_per_warp;
|
||||
uint32_t remaining_tasks = tasks_per_core - total_warps_per_core * threads_per_warp;
|
||||
uint32_t active_warps = total_warps_per_core;
|
||||
uint32_t warp_batches = 1, remaining_warps = 0;
|
||||
if (active_warps > warps_per_core) {
|
||||
active_warps = warps_per_core;
|
||||
warp_batches = total_warps_per_core / active_warps;
|
||||
remaining_warps = total_warps_per_core - warp_batches * active_warps;
|
||||
}
|
||||
|
||||
// calculate offsets for task distribution
|
||||
uint32_t all_tasks_offset = core_id * tasks_per_core + MIN(core_id, remaining_tasks_per_core);
|
||||
uint32_t remain_tasks_offset = all_tasks_offset + (tasks_per_core - remaining_tasks);
|
||||
|
||||
// prepare scheduler arguments
|
||||
wspawn_threads_args_t wspawn_args = {
|
||||
kernel_func,
|
||||
arg,
|
||||
all_tasks_offset,
|
||||
remain_tasks_offset,
|
||||
warp_batches,
|
||||
remaining_warps
|
||||
};
|
||||
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
|
||||
|
||||
if (active_warps >= 1) {
|
||||
// execute callback on other warps
|
||||
vx_wspawn(active_warps, process_threads_stub);
|
||||
|
||||
// activate all threads
|
||||
vx_tmc(-1);
|
||||
|
||||
// process threads
|
||||
process_threads();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
|
||||
if (remaining_tasks != 0) {
|
||||
// activate remaining threads
|
||||
uint32_t tmask = (1 << remaining_tasks) - 1;
|
||||
vx_tmc(tmask);
|
||||
|
||||
// process remaining threads
|
||||
process_remaining_threads();
|
||||
|
||||
// back to single-threaded
|
||||
vx_tmc_one();
|
||||
}
|
||||
}
|
||||
|
||||
// wait for spawned warps to complete
|
||||
vx_wspawn(1, 0);
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -34,7 +34,6 @@ typedef void* vx_buffer_h;
|
|||
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
|
||||
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
|
||||
#define VX_CAPS_ISA_FLAGS 0x7
|
||||
#define VX_CAPS_NUM_BARRIERS 0x8
|
||||
|
||||
// device isa flags
|
||||
#define VX_ISA_STD_A (1ull << 0)
|
||||
|
@ -126,7 +125,7 @@ int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_
|
|||
int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer);
|
||||
|
||||
// calculate cooperative threads array occupancy
|
||||
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_barriers, uint32_t* max_localmem);
|
||||
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_localmem);
|
||||
|
||||
// performance counters
|
||||
int vx_dump_perf(vx_device_h hdevice, FILE* stream);
|
||||
|
|
|
@ -218,9 +218,6 @@ public:
|
|||
case VX_CAPS_NUM_CORES:
|
||||
_value = (dev_caps_ >> 24) & 0xffff;
|
||||
break;
|
||||
case VX_CAPS_NUM_BARRIERS:
|
||||
_value = (dev_caps_ >> 40) & 0xff;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
|
|
|
@ -65,9 +65,6 @@ public:
|
|||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_NUM_BARRIERS:
|
||||
_value = NUM_BARRIERS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
|
|
|
@ -69,9 +69,6 @@ public:
|
|||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_NUM_BARRIERS:
|
||||
_value = NUM_BARRIERS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
|
|
|
@ -626,7 +626,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_barriers, uint32_t* max_localmem) {
|
||||
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_localmem) {
|
||||
// check group size
|
||||
uint64_t warps_per_core, threads_per_warp;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_WARPS, &warps_per_core), {
|
||||
|
@ -637,7 +637,7 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
|
|||
});
|
||||
uint32_t threads_per_core = warps_per_core * threads_per_warp;
|
||||
if (group_size > threads_per_core) {
|
||||
printf("Error: device cannot schedule group size > (%d)\n", threads_per_core);
|
||||
printf("Error: cannot schedule kernel with group_size > threads_per_core (%d,%d)\n", group_size, threads_per_core);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -645,19 +645,6 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
|
|||
int warps_per_group = (group_size + threads_per_warp-1) / threads_per_warp;
|
||||
int groups_per_core = warps_per_core / warps_per_group;
|
||||
|
||||
// check barriers capacity
|
||||
if (max_barriers) {
|
||||
uint64_t num_barriers;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_BARRIERS, &num_barriers), {
|
||||
return err;
|
||||
});
|
||||
if (warps_per_group < 2) {
|
||||
*max_barriers = -1;
|
||||
} else {
|
||||
*max_barriers = num_barriers / groups_per_core;
|
||||
}
|
||||
}
|
||||
|
||||
// check local memory capacity
|
||||
if (max_localmem) {
|
||||
uint64_t local_mem_size;
|
||||
|
|
|
@ -390,9 +390,6 @@ public:
|
|||
case VX_CAPS_NUM_CORES:
|
||||
_value = (dev_caps_ >> 24) & 0xffff;
|
||||
break;
|
||||
case VX_CAPS_NUM_BARRIERS:
|
||||
_value = (dev_caps_ >> 40) & 0xff;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
|
|
|
@ -204,8 +204,8 @@ typedef struct {
|
|||
int st_buffer_src[ST_BUF_SZ];
|
||||
int st_buffer_dst[ST_BUF_SZ];
|
||||
|
||||
void st_kernel(int task_id, const st_args_t * __UNIFORM__ arg) {
|
||||
arg->dst[task_id] = arg->src[task_id];
|
||||
void st_kernel(const st_args_t * __UNIFORM__ arg) {
|
||||
arg->dst[blockIdx.x] = arg->src[blockIdx.x];
|
||||
}
|
||||
|
||||
int test_spawn_tasks() {
|
||||
|
@ -219,7 +219,8 @@ int test_spawn_tasks() {
|
|||
st_buffer_src[i] = 65 + i;
|
||||
}
|
||||
|
||||
vx_spawn_tasks(ST_BUF_SZ, (vx_spawn_tasks_cb)st_kernel, &arg);
|
||||
uint32_t num_tasks(ST_BUF_SZ);
|
||||
vx_spawn_threads(1, &num_tasks, nullptr, (vx_kernel_func_cb)st_kernel, &arg);
|
||||
|
||||
return check_error(st_buffer_dst, 0, ST_BUF_SZ);
|
||||
}
|
||||
|
|
|
@ -35,7 +35,6 @@ run-simx:
|
|||
$(MAKE) -C transpose run-simx
|
||||
$(MAKE) -C spmv run-simx
|
||||
$(MAKE) -C stencil run-simx
|
||||
$(MAKE) -C lbm run-simx
|
||||
$(MAKE) -C nearn run-simx
|
||||
$(MAKE) -C guassian run-simx
|
||||
$(MAKE) -C kmeans run-simx
|
||||
|
@ -57,7 +56,6 @@ run-rtlsim:
|
|||
$(MAKE) -C transpose run-rtlsim
|
||||
$(MAKE) -C spmv run-rtlsim
|
||||
$(MAKE) -C stencil run-rtlsim
|
||||
$(MAKE) -C lbm run-rtlsim
|
||||
$(MAKE) -C nearn run-rtlsim
|
||||
$(MAKE) -C guassian run-rtlsim
|
||||
$(MAKE) -C kmeans run-rtlsim
|
||||
|
@ -79,7 +77,6 @@ run-opae:
|
|||
$(MAKE) -C transpose run-opae
|
||||
$(MAKE) -C spmv run-opae
|
||||
$(MAKE) -C stencil run-opae
|
||||
$(MAKE) -C lbm run-opae
|
||||
$(MAKE) -C nearn run-opae
|
||||
$(MAKE) -C guassian run-opae
|
||||
$(MAKE) -C kmeans run-opae
|
||||
|
|
|
@ -256,17 +256,10 @@ free(allPlatforms);*/
|
|||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
std::abort();
|
||||
oclHandles.program = clCreateProgramWithSource(
|
||||
oclHandles.context, 1, (const char**)&kernel_bin, &kernel_size, &resultCL);
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
std::abort();
|
||||
oclHandles.program = clCreateProgramWithBinary(
|
||||
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
|
||||
#endif
|
||||
free(kernel_bin);
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
|
||||
throw(string("InitCL()::Error: Loading Binary into cl_program. "
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?= $(SRC_DIR)/graph4096.txt
|
||||
|
||||
include ../common.mk
|
||||
|
|
Binary file not shown.
|
@ -15,7 +15,7 @@
|
|||
#include "CLHelper.h"
|
||||
#include "util.h"
|
||||
|
||||
#define MAX_THREADS_PER_BLOCK 256
|
||||
#define MAX_THREADS_PER_BLOCK 16
|
||||
|
||||
// Structure to hold a node information
|
||||
struct Node {
|
||||
|
|
|
@ -9,6 +9,11 @@ SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/oclUtils.cpp $(SRC_DIR)/shrUtils.cpp $(SRC
|
|||
|
||||
CXXFLAGS += -I$(SRC_DIR)
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?=
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -52,17 +52,12 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
|
|||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
cl_device_id device_id = oclGetFirstDev(cxGPUContext);
|
||||
#ifdef HOSTGPU
|
||||
|
||||
ciErrNum = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernel_bin, &kernel_size, &ciErrNum);
|
||||
#else
|
||||
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
cpBlackScholes = clCreateProgramWithBinary(
|
||||
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
|
||||
#endif
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("...building BlackScholes program\n");
|
||||
ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);
|
||||
|
||||
|
|
|
@ -6,17 +6,14 @@ XRT_SYN_DIR ?= $(VORTEX_HOME)/hw/syn/xilinx/xrt
|
|||
XRT_DEVICE_INDEX ?= 0
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
VX_LLCFLAGS += -target-feature +f -target-feature +d -target-abi lp64
|
||||
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
STARTUP_ADDR ?= 0x180000000
|
||||
else
|
||||
VX_LLCFLAGS += -target-feature +f -target-abi ilp32f
|
||||
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
STARTUP_ADDR ?= 0x80000000
|
||||
endif
|
||||
|
||||
POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
|
||||
POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
|
||||
POCL_PATH ?= $(TOOLDIR)/pocl
|
||||
|
||||
LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
|
||||
|
||||
|
@ -30,13 +27,10 @@ VX_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections
|
|||
VX_CFLAGS += -I$(ROOT_DIR)/hw -I$(VORTEX_KN_PATH)/include -DXLEN_$(XLEN) -DNDEBUG
|
||||
VX_CFLAGS += -Xclang -target-feature -Xclang +vortex
|
||||
VX_CFLAGS += -Xclang -target-feature -Xclang +zicond
|
||||
VX_CFLAGS += -mllvm -disable-loop-idiom-all # disable memset/memcpy loop idiom
|
||||
VX_CFLAGS += -mllvm -disable-loop-idiom-all
|
||||
#VX_CFLAGS += -mllvm -vortex-branch-divergence=0
|
||||
#VX_CFLAGS += -mllvm -print-after-all
|
||||
|
||||
VX_LLCFLAGS += -target-feature +m -target-feature +vortex
|
||||
#VX_LLCFLAGS += -mllvm -vortex-branch-divergence=0
|
||||
|
||||
VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/scripts/link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(ROOT_DIR)/kernel/libvortex.a $(VX_LIBS)
|
||||
|
||||
VX_BINTOOL += OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scripts/vxbin.py
|
||||
|
@ -44,17 +38,20 @@ VX_BINTOOL += OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scri
|
|||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
|
||||
CXXFLAGS += -pthread
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
CXXFLAGS += -I$(POCL_PATH)/include
|
||||
|
||||
POCL_CC_FLAGS = LLVM_PREFIX=$(LLVM_VORTEX) POCL_VORTEX_BINTOOL="$(VX_BINTOOL)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)"
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
POCL_CC_FLAGS += POCL_DEBUG=all
|
||||
POCL_RT_FLAGS += POCL_DEBUG=all
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
LDFLAGS += -Wl,-rpath,$(LLVM_VORTEX)/lib
|
||||
|
||||
ifeq ($(TARGET), fpga)
|
||||
OPAE_DRV_PATHS ?= libopae-c.so
|
||||
else
|
||||
|
@ -68,16 +65,8 @@ endif
|
|||
endif
|
||||
|
||||
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
|
||||
OBJS_HOST := $(addsuffix .host.o, $(notdir $(SRCS)))
|
||||
|
||||
.DEFAULT_GOAL := all
|
||||
all: $(PROJECT) kernel.pocl
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
kernel.pocl: $(SRC_DIR)/kernel.cl
|
||||
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) $(POCL_CC_FLAGS) POCL_VORTEX_BINTOOL="$(VX_BINTOOL)" POCL_VORTEX_LLCFLAGS="$(VX_LLCFLAGS)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<
|
||||
all: $(PROJECT)
|
||||
|
||||
%.cc.o: $(SRC_DIR)/%.cc
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
@ -88,49 +77,36 @@ kernel.pocl: $(SRC_DIR)/kernel.cl
|
|||
%.c.o: $(SRC_DIR)/%.c
|
||||
$(CC) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
%.cc.host.o: $(SRC_DIR)/%.cc
|
||||
$(CXX) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
|
||||
$(PROJECT): $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(OBJS) $(LDFLAGS) -L$(ROOT_DIR)/runtime -lvortex -L$(POCL_PATH)/lib -lOpenCL -o $@
|
||||
|
||||
%.cpp.host.o: $(SRC_DIR)/%.cpp
|
||||
$(CXX) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
|
||||
$(PROJECT).host: $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(OBJS) $(LDFLAGS) -lOpenCL -o $@
|
||||
|
||||
%.c.host.o: $(SRC_DIR)/%.c
|
||||
$(CC) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
|
||||
|
||||
ifndef USE_SETUP
|
||||
setup:
|
||||
endif
|
||||
|
||||
$(PROJECT): setup $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out setup, $^) $(LDFLAGS) -L$(ROOT_DIR)/runtime -lvortex -L$(POCL_RT_PATH)/lib -lOpenCL -o $@
|
||||
|
||||
$(PROJECT).host: setup $(OBJS_HOST)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out setup, $^) $(LDFLAGS) -lOpenCL -o $@
|
||||
|
||||
run-gpu: $(PROJECT).host kernel.cl
|
||||
run-gpu: $(PROJECT).host $(KERNEL_SRCS)
|
||||
./$(PROJECT).host $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=simx ./$(PROJECT) $(OPTS)
|
||||
run-simx: $(PROJECT) $(KERNEL_SRCS)
|
||||
LD_LIBRARY_PATH=$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=simx ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=rtlsim ./$(PROJECT) $(OPTS)
|
||||
run-rtlsim: $(PROJECT) $(KERNEL_SRCS)
|
||||
LD_LIBRARY_PATH=$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=rtlsim ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-opae: $(PROJECT) kernel.pocl
|
||||
SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=opae ./$(PROJECT) $(OPTS)
|
||||
run-opae: $(PROJECT) $(KERNEL_SRCS)
|
||||
SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=opae ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-xrt: $(PROJECT) kernel.pocl
|
||||
run-xrt: $(PROJECT) $(KERNEL_SRCS)
|
||||
ifeq ($(TARGET), hw)
|
||||
XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=xrt ./$(PROJECT) $(OPTS)
|
||||
XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=xrt ./$(PROJECT) $(OPTS)
|
||||
else
|
||||
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=xrt ./$(PROJECT) $(OPTS)
|
||||
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=xrt ./$(PROJECT) $(OPTS)
|
||||
endif
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean-kernel:
|
||||
rm -rf *.dump *.pocl
|
||||
rm -rf *.dump *.ll
|
||||
|
||||
clean-host:
|
||||
rm -rf $(PROJECT) $(PROJECT).host *.o *.log .depend
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?= -n32
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -166,17 +166,10 @@ int main (int argc, char **argv) {
|
|||
o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
return -1;
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/oclUtils.cpp $(SRC_DIR)/shrUtils.cpp $(SRC_DIR)/cmd_arg_reader.cpp
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?= -size=4096
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -30,11 +30,7 @@
|
|||
|
||||
// Name of the file with the source code for the computation kernel
|
||||
// *********************************************************************
|
||||
#ifdef HOSTGPU
|
||||
const char* cSourceFile = "kernel.cl";
|
||||
#else
|
||||
const char* cSourceFile = "kernel.pocl";
|
||||
#endif
|
||||
|
||||
// Host buffers for demo
|
||||
// *********************************************************************
|
||||
|
@ -171,12 +167,9 @@ int main(int argc, char **argv)
|
|||
shrLog("clCreateProgramWithSource...\n");
|
||||
cl_int binary_status;
|
||||
cl_program program;
|
||||
#ifdef HOSTGPU
|
||||
|
||||
program = clCreateProgramWithSource(
|
||||
cxGPUContext, 1, (const char**)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||
#else
|
||||
program = clCreateProgramWithBinary(cxGPUContext, 1, cdDevices, &szKernelLength, (const uint8_t**)&cSourceCL, &binary_status, &ciErrNum);
|
||||
#endif
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
/*// Build the program with 'mad' Optimization option
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/clutils.cpp $(SRC_DIR)/utils.cpp
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
# usage: [-f <input_file>] [-s <size>]
|
||||
OPTS ?= -q -s 32
|
||||
|
||||
|
|
|
@ -883,16 +883,9 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
|
|||
cl_program clProgramReturn;
|
||||
|
||||
// Create the program object
|
||||
#ifdef HOSTGPU
|
||||
int err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, &kernel_size, &status);
|
||||
#else
|
||||
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
clProgramReturn = clCreateProgramWithBinary(
|
||||
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
|
||||
#endif
|
||||
free(kernel_bin);
|
||||
cl_errChk(status, "Creating program", true);
|
||||
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/read_input.c $(SRC_DIR)/rmse.c $(SRC_DIR)/kmeans_clustering.c $(SRC_DIR)/cluster.c $(SRC_DIR)/getopt.c
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
# usage: -f<features> -p<points>
|
||||
OPTS ?= -f100 -p100
|
||||
|
||||
|
|
|
@ -201,7 +201,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
|||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
cl_program prog;
|
||||
#ifdef HOSTGPU
|
||||
|
||||
err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: read_kernel_file() => %d\n", err);
|
||||
|
@ -209,19 +209,6 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
|||
}
|
||||
prog = clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &err);
|
||||
#else
|
||||
err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: read_kernel_file() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
prog = clCreateProgramWithBinary(
|
||||
context, 1, device_list, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateProgramWithBinary() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
#endif
|
||||
|
||||
free(kernel_bin);
|
||||
|
||||
|
|
|
@ -15,8 +15,10 @@ lbm_macros.h: $(SRC_DIR)/lbm_macros.h
|
|||
layout_config.h: $(SRC_DIR)/layout_config.h
|
||||
cp $< $@
|
||||
|
||||
setup: lbm_macros.h layout_config.h
|
||||
USE_SETUP := yes
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl lbm_macros.h layout_config.h
|
||||
|
||||
# Usage: #iter [-i input_file] [-o output_file]
|
||||
OPTS ?= 1 -i $(SRC_DIR)/32_32_8_ldc.of
|
||||
|
|
|
@ -318,19 +318,11 @@ void OpenCL_initialize(struct pb_Parameters *p, OpenCL_Param *prm) {
|
|||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
|
||||
#ifdef HOSTGPU
|
||||
clStatus = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
CHECK_ERROR("read_kernel_file")
|
||||
prm->clProgram = clCreateProgramWithSource(
|
||||
prm->clContext, 1, (const char**)&kernel_bin, &kernel_size, &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithSource")
|
||||
#else
|
||||
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
CHECK_ERROR("read_kernel_file")
|
||||
prm->clProgram = clCreateProgramWithBinary(
|
||||
prm->clContext, 1, &prm->clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithBinary")
|
||||
#endif
|
||||
|
||||
//char clOptions[100];
|
||||
//sprintf(clOptions, "-I src/opencl_base");
|
||||
|
|
|
@ -7,12 +7,17 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/clutils.cpp $(SRC_DIR)/utils.cpp
|
||||
|
||||
OPTS ?= filelist.log
|
||||
|
||||
filelist.log:
|
||||
echo "$(SRC_DIR)/cane4_0.db\n$(SRC_DIR)/cane4_1.db" > filelist.log
|
||||
|
||||
setup: filelist.log
|
||||
USE_SETUP := yes
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl filelist.log
|
||||
|
||||
OPTS ?= filelist.log
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -882,16 +882,9 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
|
|||
cl_program clProgramReturn;
|
||||
|
||||
// Create the program object
|
||||
#ifdef HOSTGPU
|
||||
int err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, &kernel_size, &status);
|
||||
#else
|
||||
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
clProgramReturn = clCreateProgramWithBinary(
|
||||
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
|
||||
#endif
|
||||
free(kernel_bin);
|
||||
cl_errChk(status, "Creating program", true);
|
||||
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?= -n4
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -119,17 +119,10 @@ int main (int argc, char **argv) {
|
|||
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?= -f -n16
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -129,17 +129,10 @@ int main (int argc, char **argv) {
|
|||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?= -n16
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -158,17 +158,10 @@ int main (int argc, char **argv) {
|
|||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
return -1;
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?= -n1024
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -169,17 +169,10 @@ int main(int argc, char **argv) {
|
|||
cl_mem memObjects[2] = {0, 0};
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK_ERR(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK_ERR(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?= -n16
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -171,17 +171,10 @@ int main(int argc, char **argv) {
|
|||
cl_mem memObjects[2] = {0, 0};
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK_ERR(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK_ERR(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
|
|
@ -10,8 +10,10 @@ SRCS := $(SRC_DIR)/main.cc
|
|||
common.h: $(SRC_DIR)/common.h
|
||||
cp $< $@
|
||||
|
||||
setup: common.h
|
||||
USE_SETUP := yes
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl common.h
|
||||
|
||||
OPTS ?= -n32
|
||||
|
||||
|
|
|
@ -196,17 +196,10 @@ int main (int argc, char **argv) {
|
|||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
|
|
@ -10,8 +10,10 @@ SRCS := $(SRC_DIR)/main.cc
|
|||
common.h: $(SRC_DIR)/common.h
|
||||
cp $< $@
|
||||
|
||||
setup: common.h
|
||||
USE_SETUP := yes
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl common.h
|
||||
|
||||
OPTS ?= -n16
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#ifndef COMMON_H
|
||||
#define COMMON_H
|
||||
|
||||
#define TILE_SIZE 8
|
||||
#define TILE_SIZE 4
|
||||
|
||||
#ifndef TYPE
|
||||
#define TYPE float
|
||||
|
|
|
@ -197,17 +197,10 @@ int main (int argc, char **argv) {
|
|||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
return -1;
|
||||
|
|
|
@ -10,8 +10,10 @@ SRCS := $(SRC_DIR)/main.cc
|
|||
common.h: $(SRC_DIR)/common.h
|
||||
cp $< $@
|
||||
|
||||
setup: common.h
|
||||
USE_SETUP := yes
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl common.h
|
||||
|
||||
OPTS ?= -n16
|
||||
|
||||
|
|
|
@ -140,7 +140,7 @@ static void cleanup() {
|
|||
}
|
||||
|
||||
uint32_t size = 16;
|
||||
uint32_t tile_size = 8;
|
||||
uint32_t tile_size = 4;
|
||||
|
||||
static void show_usage() {
|
||||
printf("Usage: [-n size] [-t tile size] [-h: help]\n");
|
||||
|
@ -201,17 +201,10 @@ int main (int argc, char **argv) {
|
|||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
return -1;
|
||||
|
|
|
@ -9,6 +9,11 @@ SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/parboil_opencl.c $(SRC_DIR)/args.c $(SRC_D
|
|||
|
||||
CXXFLAGS += -I$(SRC_DIR)
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
# Usage: -i matrix_file,vector_file [-o output_file]
|
||||
OPTS ?= -i $(SRC_DIR)/1138_bus.mtx,$(SRC_DIR)/1138_bus.vec
|
||||
|
||||
|
|
|
@ -162,19 +162,11 @@ int main(int argc, char **argv) {
|
|||
cl_int binary_status = 0;
|
||||
cl_program clProgram;
|
||||
|
||||
#ifdef HOSTGPU
|
||||
clStatus = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
CHECK_ERROR("read_kernel_file")
|
||||
clProgram = clCreateProgramWithSource(
|
||||
clContext, 1, (const char**)&kernel_bin, &kernel_size, &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithSource")
|
||||
#else
|
||||
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
CHECK_ERROR("read_kernel_file")
|
||||
clProgram = clCreateProgramWithBinary(
|
||||
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithBinary")
|
||||
#endif
|
||||
|
||||
char clOptions[50];
|
||||
sprintf(clOptions, "");
|
||||
|
|
|
@ -9,6 +9,11 @@ SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/args.c $(SRC_DIR)/parboil_opencl.c $(SRC_D
|
|||
|
||||
CXXFLAGS += -I$(SRC_DIR)
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
# Usage: #nx #ny #nz #iter -i input_file [-o output_file]
|
||||
OPTS ?= 64 64 8 1 -i $(SRC_DIR)/64x64x8.bin
|
||||
|
||||
|
|
|
@ -211,19 +211,11 @@ int main(int argc, char** argv) {
|
|||
cl_int binary_status = 0;
|
||||
cl_program clProgram;
|
||||
|
||||
#ifdef HOSTGPU
|
||||
clStatus = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
CHECK_ERROR("read_kernel_file")
|
||||
clProgram = clCreateProgramWithSource(
|
||||
clContext, 1, (const char**)&kernel_bin, &kernel_size, &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithSource")
|
||||
#else
|
||||
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
CHECK_ERROR("read_kernel_file")
|
||||
clProgram = clCreateProgramWithBinary(
|
||||
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithBinary")
|
||||
#endif
|
||||
|
||||
char clOptions[50];
|
||||
sprintf(clOptions,"-I src/opencl_base");
|
||||
|
|
|
@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
|
|||
|
||||
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/oclUtils.cpp $(SRC_DIR)/shrUtils.cpp $(SRC_DIR)/cmd_arg_reader.cpp $(SRC_DIR)/transpose_gold.cpp
|
||||
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl
|
||||
|
||||
OPTS ?= -width=128 -height=128
|
||||
|
||||
#CXXFLAGS += -D_DEBUG
|
||||
|
|
|
@ -23,7 +23,7 @@
|
|||
#include "oclUtils.h"
|
||||
#include "shrQATest.h"
|
||||
|
||||
#define BLOCK_DIM 16
|
||||
#define BLOCK_DIM 4
|
||||
|
||||
// max GPU's to manage for multi-GPU parallel compute
|
||||
const unsigned int MAX_GPU_COUNT = 1;
|
||||
|
@ -319,7 +319,6 @@ int runTest( const int argc, const char** argv)
|
|||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
|
||||
#ifdef HOSTGPU
|
||||
ciErrNum = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
|
||||
if (ciErrNum != CL_SUCCESS) {
|
||||
shrLog(" Error %i in read_kernel_file call !!!\n\n", ciErrNum);
|
||||
|
@ -331,19 +330,6 @@ int runTest( const int argc, const char** argv)
|
|||
shrLog(" Error %i in clCreateProgramWithSource call !!!\n\n", ciErrNum);
|
||||
return ciErrNum;
|
||||
}
|
||||
#else
|
||||
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
if (ciErrNum != CL_SUCCESS) {
|
||||
shrLog(" Error %i in read_kernel_file call !!!\n\n", ciErrNum);
|
||||
return ciErrNum;
|
||||
}
|
||||
rv_program = clCreateProgramWithBinary(
|
||||
cxGPUContext, 1, cdDevices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
|
||||
if (ciErrNum != CL_SUCCESS) {
|
||||
shrLog(" Error %i in clCreateProgramWithBinary call !!!\n\n", ciErrNum);
|
||||
return ciErrNum;
|
||||
}
|
||||
#endif
|
||||
|
||||
// build the program
|
||||
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
|
||||
|
|
|
@ -10,8 +10,10 @@ SRCS := $(SRC_DIR)/main.cc
|
|||
common.h: $(SRC_DIR)/common.h
|
||||
cp $< $@
|
||||
|
||||
setup: common.h
|
||||
USE_SETUP := yes
|
||||
kernel.cl: $(SRC_DIR)/kernel.cl
|
||||
cp $< $@
|
||||
|
||||
KERNEL_SRCS := kernel.cl common.h
|
||||
|
||||
OPTS ?= -n64
|
||||
|
||||
|
|
|
@ -181,17 +181,10 @@ int main (int argc, char **argv) {
|
|||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
|
|
@ -6,13 +6,12 @@
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_tasks;
|
||||
uint32_t grid_dim[2];
|
||||
uint32_t width;
|
||||
uint32_t log2_width;
|
||||
uint64_t lmem_addr;
|
||||
uint64_t I_addr;
|
||||
uint64_t W_addr;
|
||||
uint64_t O_addr;
|
||||
bool use_lmem;
|
||||
} kernel_arg_t;
|
||||
|
||||
#endif
|
||||
|
|
|
@ -7,19 +7,14 @@ inline char is_log2(uint32_t x) {
|
|||
return ((x & (x-1)) == 0);
|
||||
}
|
||||
|
||||
void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto I = reinterpret_cast<TYPE*>(arg->I_addr);
|
||||
auto W = reinterpret_cast<TYPE*>((arg->lmem_addr != 0) ? arg->lmem_addr : arg->W_addr);
|
||||
auto W = reinterpret_cast<TYPE*>(arg->use_lmem ? __local_mem(0) : (void*)arg->W_addr);
|
||||
auto O = reinterpret_cast<TYPE*>(arg->O_addr);
|
||||
auto width = arg->width;
|
||||
|
||||
uint32_t row, col;
|
||||
if (is_log2(width)) {
|
||||
row = task_id >> arg->log2_width;
|
||||
col = task_id & (width-1);
|
||||
} else {
|
||||
row = task_id / width;
|
||||
}
|
||||
int col = blockIdx.x;
|
||||
int row = blockIdx.y;
|
||||
|
||||
// Adjust for padded borders
|
||||
int paddedWidth = width + 2;
|
||||
|
@ -46,14 +41,13 @@ void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
if (arg->lmem_addr != 0) {
|
||||
if (arg->use_lmem) {
|
||||
// populate local memory
|
||||
auto W = reinterpret_cast<TYPE*>(arg->W_addr);
|
||||
auto L = reinterpret_cast<TYPE*>(arg->lmem_addr);
|
||||
auto L = reinterpret_cast<TYPE*>(__local_mem(0));
|
||||
for (int i = 0; i < (3*3); ++i) {
|
||||
L[i] = W[i];
|
||||
}
|
||||
}
|
||||
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
|
@ -135,9 +135,7 @@ static void parse_args(int argc, char **argv) {
|
|||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(I_buffer);
|
||||
if (!use_lmem) {
|
||||
vx_mem_free(W_buffer);
|
||||
}
|
||||
vx_mem_free(O_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
|
@ -155,14 +153,13 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint32_t num_points = size * size;
|
||||
|
||||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "matrix size: " << size << "x" << size << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_points;
|
||||
kernel_arg.grid_dim[0] = size;
|
||||
kernel_arg.grid_dim[1] = size;
|
||||
kernel_arg.width = size;
|
||||
kernel_arg.log2_width = log2(size);
|
||||
kernel_arg.use_lmem = use_lmem;
|
||||
|
||||
uint32_t o_points = size * size;
|
||||
uint32_t i_points = (size+2) * (size+2);
|
||||
|
@ -188,8 +185,6 @@ int main(int argc, char *argv[]) {
|
|||
cleanup();
|
||||
exit(1);
|
||||
}
|
||||
} else {
|
||||
kernel_arg.lmem_addr = 0;
|
||||
}
|
||||
|
||||
std::cout << "dev_argI=0x" << std::hex << kernel_arg.I_addr << std::endl;
|
||||
|
|
|
@ -3,13 +3,13 @@
|
|||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
|
||||
auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
|
||||
auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
|
||||
|
||||
uint32_t count = arg->task_size;
|
||||
uint32_t offset = task_id * count;
|
||||
uint32_t offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
|
||||
|
@ -18,6 +18,5 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
|
@ -7,10 +7,12 @@
|
|||
|
||||
// Parallel Selection sort
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
int32_t* src_ptr = (int32_t*)arg->src_addr;
|
||||
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
|
||||
|
||||
uint32_t task_id = blockIdx.x;
|
||||
|
||||
int value = src_ptr[task_id];
|
||||
|
||||
// none taken
|
||||
|
@ -78,6 +80,5 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
|
@ -5,19 +5,19 @@
|
|||
#include <vx_print.h>
|
||||
#include "common.h"
|
||||
|
||||
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
|
||||
typedef void (*PFN_Kernel)(kernel_arg_t* __UNIFORM__ arg);
|
||||
|
||||
inline float __ieee754_sqrtf (float x) {
|
||||
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
|
||||
return x;
|
||||
}
|
||||
|
||||
void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_iadd(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
int32_t a = src0_ptr[offset+i];
|
||||
|
@ -27,12 +27,12 @@ void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_imul(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -42,12 +42,12 @@ void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_idiv(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -57,12 +57,12 @@ void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_idiv_mul(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -74,12 +74,12 @@ void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fadd(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
float a = src0_ptr[offset+i];
|
||||
|
@ -89,12 +89,12 @@ void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fsub(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -104,12 +104,12 @@ void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fmul(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -119,12 +119,12 @@ void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fmadd(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -134,12 +134,12 @@ void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fmsub(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -149,12 +149,12 @@ void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fnmadd(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -164,12 +164,12 @@ void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fnmsub(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -179,12 +179,12 @@ void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fnmadd_madd(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -196,12 +196,12 @@ void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fdiv(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -211,12 +211,12 @@ void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fdiv2(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -228,12 +228,12 @@ void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fsqrt(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -243,12 +243,12 @@ void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_ftoi(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -259,12 +259,12 @@ void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_ftou(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (uint32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -275,12 +275,12 @@ void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_itof(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -291,12 +291,12 @@ void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_utof(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -311,12 +311,12 @@ inline float fclamp(float a, float b, float c) {
|
|||
return fmin(fmax(a, b), c);
|
||||
}
|
||||
|
||||
void kernel_fclamp(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_fclamp(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -329,12 +329,12 @@ inline int iclamp(int a, int b, int c) {
|
|||
return std::min(std::max(a, b), c);
|
||||
}
|
||||
|
||||
void kernel_iclamp(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_iclamp(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int*)arg->src0_addr;
|
||||
auto src1_ptr = (int*)arg->src1_addr;
|
||||
auto dst_ptr = (int*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
|
@ -343,12 +343,12 @@ void kernel_iclamp(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_trigo(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_trigo(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
auto offset = blockIdx.x * count;
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
uint32_t j = offset + i;
|
||||
auto a = src0_ptr[j];
|
||||
|
@ -361,7 +361,7 @@ void kernel_trigo(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
}
|
||||
}
|
||||
|
||||
void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_bar(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto num_cores = vx_num_cores();
|
||||
auto num_warps = vx_num_warps();
|
||||
auto num_threads = vx_num_threads();
|
||||
|
@ -389,10 +389,10 @@ void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
vx_barrier(0, num_warps);
|
||||
|
||||
// update destination
|
||||
dst_ptr[task_id] += 1;
|
||||
dst_ptr[blockIdx.x] += 1;
|
||||
}
|
||||
|
||||
void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_gbar(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto num_cores = vx_num_cores();
|
||||
auto num_warps = vx_num_warps();
|
||||
auto num_threads = vx_num_threads();
|
||||
|
@ -418,7 +418,7 @@ void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
vx_barrier(0x80000000, num_cores);
|
||||
|
||||
// update destination
|
||||
dst_ptr[task_id] += 1;
|
||||
dst_ptr[blockIdx.x] += 1;
|
||||
}
|
||||
|
||||
static PFN_Kernel sc_tests[24];
|
||||
|
@ -452,6 +452,5 @@ void register_tests() {
|
|||
int main() {
|
||||
register_tests();
|
||||
auto arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)sc_tests[arg->testid], arg);
|
||||
}
|
|
@ -3,13 +3,13 @@
|
|||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
uint32_t count = arg->task_size;
|
||||
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
|
||||
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
|
||||
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
|
||||
|
||||
uint32_t offset = task_id * count;
|
||||
uint32_t offset = blockIdx.x * count;
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
|
||||
}
|
||||
|
@ -19,6 +19,5 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
|
@ -3,17 +3,16 @@
|
|||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
uint64_t* src_ptr = (uint64_t*)arg->src_addr;
|
||||
uint32_t* dst_ptr = (uint32_t*)arg->dst_addr;
|
||||
|
||||
int32_t* addr_ptr = (int32_t*)(src_ptr[task_id]);
|
||||
int32_t* addr_ptr = (int32_t*)(src_ptr[blockIdx.x]);
|
||||
|
||||
dst_ptr[task_id] = *addr_ptr;
|
||||
dst_ptr[blockIdx.x] = *addr_ptr;
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
|
@ -3,13 +3,13 @@
|
|||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
uint32_t stride = arg->stride;
|
||||
uint32_t* addr_ptr = (uint32_t*)arg->src0_addr;
|
||||
float* src_ptr = (float*)arg->src1_addr;
|
||||
float* dst_ptr = (float*)arg->dst_addr;
|
||||
|
||||
uint32_t offset = task_id * stride;
|
||||
uint32_t offset = blockIdx.x * stride;
|
||||
|
||||
for (uint32_t i = 0; i < stride; ++i) {
|
||||
float value = 0.0f;
|
||||
|
@ -24,6 +24,5 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
|
@ -4,15 +4,14 @@
|
|||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
int cid = vx_core_id();
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
uint32_t cid = vx_core_id();
|
||||
char* src_ptr = (char*)arg->src_addr;
|
||||
char value = 'A' + src_ptr[task_id];
|
||||
vx_printf("cid=%d: task=%d, value=%c\n", cid, task_id, value);
|
||||
char value = 'A' + src_ptr[blockIdx.x];
|
||||
vx_printf("cid=%d: task=%d, value=%c\n", cid, blockIdx.x, value);
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
|
@ -6,10 +6,10 @@
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t size;
|
||||
uint32_t tile_size;
|
||||
uint32_t grid_dim[2];
|
||||
uint32_t block_dim[2];
|
||||
uint32_t size;
|
||||
uint32_t tile_size;
|
||||
uint64_t A_addr;
|
||||
uint64_t B_addr;
|
||||
uint64_t C_addr;
|
||||
|
|
|
@ -170,10 +170,9 @@ int main(int argc, char *argv[]) {
|
|||
kernel_arg.tile_size = tile_size;
|
||||
|
||||
// check work group occupancy
|
||||
uint32_t max_barriers, max_localmem;
|
||||
RT_CHECK(vx_check_occupancy(device, group_size, &max_barriers, &max_localmem));
|
||||
std::cout << "occupancy: max_barriers=" << max_barriers << ", max_localmem=" << max_localmem << " bytes" << std::endl;
|
||||
RT_CHECK(max_barriers < 2);
|
||||
uint32_t max_localmem;
|
||||
RT_CHECK(vx_check_occupancy(device, group_size, &max_localmem));
|
||||
std::cout << "occupancy: max_localmem=" << max_localmem << " bytes" << std::endl;
|
||||
RT_CHECK(max_localmem < local_mem);
|
||||
|
||||
// allocate device memory
|
||||
|
|
|
@ -6,9 +6,8 @@
|
|||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_tasks;
|
||||
uint32_t grid_dim[2];
|
||||
uint32_t size;
|
||||
uint32_t log2_size;
|
||||
uint64_t A_addr;
|
||||
uint64_t B_addr;
|
||||
uint64_t C_addr;
|
||||
|
|
|
@ -7,31 +7,24 @@ inline char is_log2(uint32_t x) {
|
|||
return ((x & (x-1)) == 0);
|
||||
}
|
||||
|
||||
void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto A = reinterpret_cast<TYPE*>(arg->A_addr);
|
||||
auto B = reinterpret_cast<TYPE*>(arg->B_addr);
|
||||
auto C = reinterpret_cast<TYPE*>(arg->C_addr);
|
||||
auto size = arg->size;
|
||||
|
||||
uint32_t row, col;
|
||||
if (is_log2(size)) {
|
||||
row = task_id >> arg->log2_size;
|
||||
col = task_id & (size-1);
|
||||
} else {
|
||||
row = task_id / size;
|
||||
col = task_id % size;
|
||||
}
|
||||
int col = blockIdx.x;
|
||||
int row = blockIdx.y;
|
||||
|
||||
TYPE sum(0);
|
||||
for (int e = 0; e < size; ++e) {
|
||||
sum += A[row * size + e] * B[e * size + col];
|
||||
}
|
||||
|
||||
C[task_id] = sum;
|
||||
C[row * size + col] = sum;
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
|
@ -146,9 +146,9 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "matrix size: " << size << "x" << size << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = size_sq;
|
||||
kernel_arg.grid_dim[0] = size;
|
||||
kernel_arg.grid_dim[1] = size;
|
||||
kernel_arg.size = size;
|
||||
kernel_arg.log2_size = log2(size);
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
|
|
|
@ -3,23 +3,22 @@
|
|||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
uint32_t num_points = arg->num_points;
|
||||
auto src_ptr = (TYPE*)arg->src_addr;
|
||||
auto dst_ptr = (TYPE*)arg->dst_addr;
|
||||
|
||||
auto ref_value = src_ptr[task_id];
|
||||
auto ref_value = src_ptr[blockIdx.x];
|
||||
|
||||
uint32_t pos = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto cur_value = src_ptr[i];
|
||||
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
|
||||
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < blockIdx.x));
|
||||
}
|
||||
dst_ptr[pos] = ref_value;
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
|
@ -1,18 +1,15 @@
|
|||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
|
||||
auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
|
||||
auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
|
||||
|
||||
dst_ptr[task_id] = src0_ptr[task_id] + src1_ptr[task_id];
|
||||
dst_ptr[blockIdx.x] = src0_ptr[blockIdx.x] + src1_ptr[blockIdx.x];
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue