Migrating all tests to new kernel launch API

This commit is contained in:
Blaise Tine 2024-06-11 02:53:36 -07:00
parent 8b63305201
commit 250a5741f7
77 changed files with 961 additions and 1108 deletions

View file

@ -232,11 +232,11 @@ then
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
@ -257,11 +257,11 @@ then
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
fi

View file

@ -133,6 +133,9 @@ opencl()
make -C tests/opencl run-simx
make -C tests/opencl run-rtlsim
./ci/blackbox.sh --driver=simx --app=lbm --warps=8
./ci/blackbox.sh --driver=rtlsim --app=lbm --warps=8
echo "opencl tests done!"
}

View file

@ -96,9 +96,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire [127:0] afu_id = `AFU_ACCEL_UUID;
wire [63:0] dev_caps = {8'b0,
wire [63:0] dev_caps = {16'b0,
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
8'(`NUM_BARRIERS),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
8'(`NUM_THREADS),

View file

@ -142,9 +142,8 @@ module VX_afu_ctrl #(
RSTATE_DATA = 2'd1;
// device caps
wire [63:0] dev_caps = {8'b0,
wire [63:0] dev_caps = {16'b0,
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
8'(`NUM_BARRIERS),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
8'(`NUM_THREADS),

View file

@ -17,9 +17,17 @@
#ifndef __VX_INTRINSICS_H__
#define __VX_INTRINSICS_H__
#include <stddef.h>
#include <stdint.h>
#include <VX_types.h>
#if __riscv_xlen == 64
typedef unsigned long size_t; // 64-bit RISC-V
#elif __riscv_xlen == 32
typedef unsigned int size_t; // 32-bit RISC-V
#else
#error "Unknown RISC-V architecture"
#endif
#if defined(__clang__)
#define __UNIFORM__ __attribute__((annotate("vortex.uniform")))
#else

View file

@ -21,16 +21,6 @@
extern "C" {
#endif
typedef void (*vx_spawn_tasks_cb)(uint32_t task_id, void *arg);
typedef void (*vx_serial_cb)(void *arg);
void vx_spawn_tasks(uint32_t num_tasks, vx_spawn_tasks_cb callback, const void * arg);
void vx_serial(vx_serial_cb callback, const void * arg);
///////////////////////////////////////////////////////////////////////////////
typedef union {
struct {
uint32_t x;
@ -46,23 +36,28 @@ extern dim3_t gridDim;
extern dim3_t blockDim;
extern __thread uint32_t __local_group_id;
extern uint32_t __groups_per_core;
extern uint32_t __warps_per_group;
typedef void (*vx_kernel_func_cb)(void *arg);
typedef void (*vx_serial_cb)(void *arg);
#define __local_mem(size) \
(void*)((int8_t*)csr_read(VX_CSR_LOCAL_MEM_BASE) + __local_group_id * size)
#define __syncthreads() \
vx_barrier(__COUNTER__ * __groups_per_core + __local_group_id, __warps_per_group)
vx_barrier(__local_group_id, __warps_per_group)
// launch a kernel function with a grid of blocks and block of threads
int vx_spawn_threads(uint32_t dimension,
const uint32_t* grid_dim,
const uint32_t* block_dim,
vx_kernel_func_cb kernel_func,
const void* arg);
// function call serialization
void vx_serial(vx_serial_cb callback, const void * arg);
#ifdef __cplusplus
}
#endif

View file

@ -13,7 +13,6 @@
#include <vx_spawn.h>
#include <vx_intrinsics.h>
#include <inttypes.h>
#include <vx_print.h>
#ifdef __cplusplus
@ -24,142 +23,12 @@ extern "C" {
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
typedef struct {
vx_spawn_tasks_cb callback;
const void* arg;
uint32_t all_tasks_offset;
uint32_t remain_tasks_offset;
uint32_t warp_batches;
uint32_t remaining_warps;
} wspawn_tasks_args_t;
static void __attribute__ ((noinline)) process_all_tasks() {
wspawn_tasks_args_t* targs = (wspawn_tasks_args_t*)csr_read(VX_CSR_MSCRATCH);
uint32_t threads_per_warp = vx_num_threads();
uint32_t warp_id = vx_warp_id();
uint32_t thread_id = vx_thread_id();
uint32_t start_warp = (warp_id * targs->warp_batches) + MIN(warp_id, targs->remaining_warps);
uint32_t iterations = targs->warp_batches + (warp_id < targs->remaining_warps);
uint32_t start_task_id = targs->all_tasks_offset + (start_warp * threads_per_warp) + thread_id;
uint32_t end_task_id = start_task_id + iterations * threads_per_warp;
vx_spawn_tasks_cb callback = targs->callback;
const void* arg = targs->arg;
for (uint32_t task_id = start_task_id; task_id < end_task_id; task_id += threads_per_warp) {
callback(task_id, (void*)arg);
}
}
static void __attribute__ ((noinline)) process_remaining_tasks() {
wspawn_tasks_args_t* targs = (wspawn_tasks_args_t*)csr_read(VX_CSR_MSCRATCH);
uint32_t thread_id = vx_thread_id();
uint32_t task_id = targs->remain_tasks_offset + thread_id;
(targs->callback)(task_id, (void*)targs->arg);
}
static void __attribute__ ((noinline)) process_all_tasks_stub() {
// activate all threads
vx_tmc(-1);
// process all tasks
process_all_tasks();
// disable warp
vx_tmc_zero();
}
void vx_spawn_tasks(uint32_t num_tasks, vx_spawn_tasks_cb callback , const void * arg) {
// device specifications
uint32_t num_cores = vx_num_cores();
uint32_t warps_per_core = vx_num_warps();
uint32_t threads_per_warp = vx_num_threads();
uint32_t core_id = vx_core_id();
// calculate necessary active cores
uint32_t threads_per_core = warps_per_core * threads_per_warp;
uint32_t needed_cores = (num_tasks + threads_per_core - 1) / threads_per_core;
uint32_t active_cores = MIN(needed_cores, num_cores);
// only active cores participate
if (core_id >= active_cores)
return;
// number of tasks per core
uint32_t tasks_per_core = num_tasks / active_cores;
uint32_t remaining_tasks_per_core = num_tasks - tasks_per_core * active_cores;
if (core_id < remaining_tasks_per_core)
tasks_per_core++;
// calculate number of warps to activate
uint32_t total_warps_per_core = tasks_per_core / threads_per_warp;
uint32_t remaining_tasks = tasks_per_core - total_warps_per_core * threads_per_warp;
uint32_t active_warps = total_warps_per_core;
uint32_t warp_batches = 1, remaining_warps = 0;
if (active_warps > warps_per_core) {
active_warps = warps_per_core;
warp_batches = total_warps_per_core / active_warps;
remaining_warps = total_warps_per_core - warp_batches * active_warps;
}
// calculate offsets for task distribution
uint32_t all_tasks_offset = core_id * tasks_per_core + MIN(core_id, remaining_tasks_per_core);
uint32_t remain_tasks_offset = all_tasks_offset + (tasks_per_core - remaining_tasks);
// prepare scheduler arguments
wspawn_tasks_args_t wspawn_args = {
callback,
arg,
all_tasks_offset,
remain_tasks_offset,
warp_batches,
remaining_warps
};
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
if (active_warps >= 1) {
// execute callback on other warps
vx_wspawn(active_warps, process_all_tasks_stub);
// activate all threads
vx_tmc(-1);
// process all tasks
process_all_tasks();
// back to single-threaded
vx_tmc_one();
}
if (remaining_tasks != 0) {
// activate remaining threads
uint32_t tmask = (1 << remaining_tasks) - 1;
vx_tmc(tmask);
// process remaining tasks
process_remaining_tasks();
// back to single-threaded
vx_tmc_one();
}
// wait for spawned tasks to complete
vx_wspawn(1, 0);
}
///////////////////////////////////////////////////////////////////////////////
__thread dim3_t blockIdx;
__thread dim3_t threadIdx;
dim3_t gridDim;
dim3_t blockDim;
__thread uint32_t __local_group_id;
uint32_t __groups_per_core;
uint32_t __warps_per_group;
typedef struct {
@ -171,10 +40,68 @@ typedef struct {
uint32_t warps_per_group;
uint32_t groups_per_core;
uint32_t remaining_mask;
} wspawn_task_groups_args_t;
} wspawn_groups_args_t;
static void __attribute__ ((noinline)) process_all_task_groups() {
wspawn_task_groups_args_t* targs = (wspawn_task_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
typedef struct {
vx_kernel_func_cb callback;
const void* arg;
uint32_t all_tasks_offset;
uint32_t remain_tasks_offset;
uint32_t warp_batches;
uint32_t remaining_warps;
} wspawn_threads_args_t;
static void __attribute__ ((noinline)) process_threads() {
wspawn_threads_args_t* targs = (wspawn_threads_args_t*)csr_read(VX_CSR_MSCRATCH);
uint32_t threads_per_warp = vx_num_threads();
uint32_t warp_id = vx_warp_id();
uint32_t thread_id = vx_thread_id();
uint32_t start_warp = (warp_id * targs->warp_batches) + MIN(warp_id, targs->remaining_warps);
uint32_t iterations = targs->warp_batches + (warp_id < targs->remaining_warps);
uint32_t start_task_id = targs->all_tasks_offset + (start_warp * threads_per_warp) + thread_id;
uint32_t end_task_id = start_task_id + iterations * threads_per_warp;
__local_group_id = 0;
threadIdx.x = 0;
threadIdx.y = 0;
threadIdx.z = 0;
vx_kernel_func_cb callback = targs->callback;
const void* arg = targs->arg;
for (uint32_t task_id = start_task_id; task_id < end_task_id; task_id += threads_per_warp) {
blockIdx.x = task_id % gridDim.x;
blockIdx.y = (task_id / gridDim.x) % gridDim.y;
blockIdx.z = task_id / (gridDim.x * gridDim.y);
callback((void*)arg);
}
}
static void __attribute__ ((noinline)) process_remaining_threads() {
wspawn_threads_args_t* targs = (wspawn_threads_args_t*)csr_read(VX_CSR_MSCRATCH);
uint32_t thread_id = vx_thread_id();
uint32_t task_id = targs->remain_tasks_offset + thread_id;
(targs->callback)((void*)targs->arg);
}
static void __attribute__ ((noinline)) process_threads_stub() {
// activate all threads
vx_tmc(-1);
// process all tasks
process_threads();
// disable warp
vx_tmc_zero();
}
static void __attribute__ ((noinline)) process_thread_groups() {
wspawn_groups_args_t* targs = (wspawn_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
uint32_t threads_per_warp = vx_num_threads();
uint32_t warp_id = vx_warp_id();
@ -209,8 +136,8 @@ static void __attribute__ ((noinline)) process_all_task_groups() {
}
}
static void __attribute__ ((noinline)) process_all_task_groups_stub() {
wspawn_task_groups_args_t* targs = (wspawn_task_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
static void __attribute__ ((noinline)) process_thread_groups_stub() {
wspawn_groups_args_t* targs = (wspawn_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
uint32_t warps_per_group = targs->warps_per_group;
uint32_t remaining_mask = targs->remaining_mask;
uint32_t warp_id = vx_warp_id();
@ -220,8 +147,8 @@ static void __attribute__ ((noinline)) process_all_task_groups_stub() {
// activate threads
vx_tmc(threads_mask);
// process all tasks
process_all_task_groups();
// process thread groups
process_thread_groups();
// disable all warps except warp0
vx_tmc(0 == vx_warp_id());
@ -253,10 +180,12 @@ int vx_spawn_threads(uint32_t dimension,
// check group size
uint32_t threads_per_core = warps_per_core * threads_per_warp;
if (threads_per_core < group_size) {
vx_printf("error: group_size > threads_per_core (%d, %d)\n", group_size, threads_per_core);
vx_printf("error: group_size > threads_per_core (%d,%d)\n", group_size, threads_per_core);
return -1;
}
if (group_size > 1) {
// calculate number of warps per group
uint32_t warps_per_group = group_size / threads_per_warp;
uint32_t remaining_threads = group_size - warps_per_group * threads_per_warp;
uint32_t remaining_mask = -1;
@ -265,6 +194,7 @@ int vx_spawn_threads(uint32_t dimension,
++warps_per_group;
}
// calculate necessary active cores
uint32_t needed_warps = num_groups * warps_per_group;
uint32_t needed_cores = (needed_warps + warps_per_core-1) / warps_per_core;
uint32_t active_cores = MIN(needed_cores, num_cores);
@ -273,6 +203,7 @@ int vx_spawn_threads(uint32_t dimension,
if (core_id >= active_cores)
return 0;
// total number of groups per core
uint32_t total_groups_per_core = num_groups / active_cores;
uint32_t remaining_groups_per_core = num_groups - active_cores * total_groups_per_core;
if (core_id < remaining_groups_per_core)
@ -293,7 +224,7 @@ int vx_spawn_threads(uint32_t dimension,
uint32_t group_offset = core_id * total_groups_per_core + MIN(core_id, remaining_groups_per_core);
// set scheduler arguments
wspawn_task_groups_args_t wspawn_args = {
wspawn_groups_args_t wspawn_args = {
kernel_func,
arg,
group_offset,
@ -306,16 +237,85 @@ int vx_spawn_threads(uint32_t dimension,
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
// set global variables
__groups_per_core = groups_per_core;
__warps_per_group = warps_per_group;
// execute callback on other warps
vx_wspawn(active_warps, process_all_task_groups_stub);
vx_wspawn(active_warps, process_thread_groups_stub);
// execute callback on warp0
process_all_task_groups_stub();
process_thread_groups_stub();
} else {
uint32_t num_tasks = num_groups;
__warps_per_group = 0;
// wait for spawned tasks to complete
// calculate necessary active cores
uint32_t needed_cores = (num_tasks + threads_per_core - 1) / threads_per_core;
uint32_t active_cores = MIN(needed_cores, num_cores);
// only active cores participate
if (core_id >= active_cores)
return 0;
// number of tasks per core
uint32_t tasks_per_core = num_tasks / active_cores;
uint32_t remaining_tasks_per_core = num_tasks - tasks_per_core * active_cores;
if (core_id < remaining_tasks_per_core)
++tasks_per_core;
// calculate number of warps to activate
uint32_t total_warps_per_core = tasks_per_core / threads_per_warp;
uint32_t remaining_tasks = tasks_per_core - total_warps_per_core * threads_per_warp;
uint32_t active_warps = total_warps_per_core;
uint32_t warp_batches = 1, remaining_warps = 0;
if (active_warps > warps_per_core) {
active_warps = warps_per_core;
warp_batches = total_warps_per_core / active_warps;
remaining_warps = total_warps_per_core - warp_batches * active_warps;
}
// calculate offsets for task distribution
uint32_t all_tasks_offset = core_id * tasks_per_core + MIN(core_id, remaining_tasks_per_core);
uint32_t remain_tasks_offset = all_tasks_offset + (tasks_per_core - remaining_tasks);
// prepare scheduler arguments
wspawn_threads_args_t wspawn_args = {
kernel_func,
arg,
all_tasks_offset,
remain_tasks_offset,
warp_batches,
remaining_warps
};
csr_write(VX_CSR_MSCRATCH, &wspawn_args);
if (active_warps >= 1) {
// execute callback on other warps
vx_wspawn(active_warps, process_threads_stub);
// activate all threads
vx_tmc(-1);
// process threads
process_threads();
// back to single-threaded
vx_tmc_one();
}
if (remaining_tasks != 0) {
// activate remaining threads
uint32_t tmask = (1 << remaining_tasks) - 1;
vx_tmc(tmask);
// process remaining threads
process_remaining_threads();
// back to single-threaded
vx_tmc_one();
}
}
// wait for spawned warps to complete
vx_wspawn(1, 0);
return 0;

View file

@ -34,7 +34,6 @@ typedef void* vx_buffer_h;
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
#define VX_CAPS_ISA_FLAGS 0x7
#define VX_CAPS_NUM_BARRIERS 0x8
// device isa flags
#define VX_ISA_STD_A (1ull << 0)
@ -126,7 +125,7 @@ int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_
int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer);
// calculate cooperative threads array occupancy
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_barriers, uint32_t* max_localmem);
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_localmem);
// performance counters
int vx_dump_perf(vx_device_h hdevice, FILE* stream);

View file

@ -218,9 +218,6 @@ public:
case VX_CAPS_NUM_CORES:
_value = (dev_caps_ >> 24) & 0xffff;
break;
case VX_CAPS_NUM_BARRIERS:
_value = (dev_caps_ >> 40) & 0xff;
break;
case VX_CAPS_CACHE_LINE_SIZE:
_value = CACHE_BLOCK_SIZE;
break;

View file

@ -65,9 +65,6 @@ public:
case VX_CAPS_NUM_CORES:
_value = NUM_CORES * NUM_CLUSTERS;
break;
case VX_CAPS_NUM_BARRIERS:
_value = NUM_BARRIERS;
break;
case VX_CAPS_CACHE_LINE_SIZE:
_value = CACHE_BLOCK_SIZE;
break;

View file

@ -69,9 +69,6 @@ public:
case VX_CAPS_NUM_CORES:
_value = NUM_CORES * NUM_CLUSTERS;
break;
case VX_CAPS_NUM_BARRIERS:
_value = NUM_BARRIERS;
break;
case VX_CAPS_CACHE_LINE_SIZE:
_value = CACHE_BLOCK_SIZE;
break;

View file

@ -626,7 +626,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
return 0;
}
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_barriers, uint32_t* max_localmem) {
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_localmem) {
// check group size
uint64_t warps_per_core, threads_per_warp;
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_WARPS, &warps_per_core), {
@ -637,7 +637,7 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
});
uint32_t threads_per_core = warps_per_core * threads_per_warp;
if (group_size > threads_per_core) {
printf("Error: device cannot schedule group size > (%d)\n", threads_per_core);
printf("Error: cannot schedule kernel with group_size > threads_per_core (%d,%d)\n", group_size, threads_per_core);
return -1;
}
@ -645,19 +645,6 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
int warps_per_group = (group_size + threads_per_warp-1) / threads_per_warp;
int groups_per_core = warps_per_core / warps_per_group;
// check barriers capacity
if (max_barriers) {
uint64_t num_barriers;
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_BARRIERS, &num_barriers), {
return err;
});
if (warps_per_group < 2) {
*max_barriers = -1;
} else {
*max_barriers = num_barriers / groups_per_core;
}
}
// check local memory capacity
if (max_localmem) {
uint64_t local_mem_size;

View file

@ -390,9 +390,6 @@ public:
case VX_CAPS_NUM_CORES:
_value = (dev_caps_ >> 24) & 0xffff;
break;
case VX_CAPS_NUM_BARRIERS:
_value = (dev_caps_ >> 40) & 0xff;
break;
case VX_CAPS_CACHE_LINE_SIZE:
_value = CACHE_BLOCK_SIZE;
break;

View file

@ -204,8 +204,8 @@ typedef struct {
int st_buffer_src[ST_BUF_SZ];
int st_buffer_dst[ST_BUF_SZ];
void st_kernel(int task_id, const st_args_t * __UNIFORM__ arg) {
arg->dst[task_id] = arg->src[task_id];
void st_kernel(const st_args_t * __UNIFORM__ arg) {
arg->dst[blockIdx.x] = arg->src[blockIdx.x];
}
int test_spawn_tasks() {
@ -219,7 +219,8 @@ int test_spawn_tasks() {
st_buffer_src[i] = 65 + i;
}
vx_spawn_tasks(ST_BUF_SZ, (vx_spawn_tasks_cb)st_kernel, &arg);
uint32_t num_tasks(ST_BUF_SZ);
vx_spawn_threads(1, &num_tasks, nullptr, (vx_kernel_func_cb)st_kernel, &arg);
return check_error(st_buffer_dst, 0, ST_BUF_SZ);
}

View file

@ -35,7 +35,6 @@ run-simx:
$(MAKE) -C transpose run-simx
$(MAKE) -C spmv run-simx
$(MAKE) -C stencil run-simx
$(MAKE) -C lbm run-simx
$(MAKE) -C nearn run-simx
$(MAKE) -C guassian run-simx
$(MAKE) -C kmeans run-simx
@ -57,7 +56,6 @@ run-rtlsim:
$(MAKE) -C transpose run-rtlsim
$(MAKE) -C spmv run-rtlsim
$(MAKE) -C stencil run-rtlsim
$(MAKE) -C lbm run-rtlsim
$(MAKE) -C nearn run-rtlsim
$(MAKE) -C guassian run-rtlsim
$(MAKE) -C kmeans run-rtlsim
@ -79,7 +77,6 @@ run-opae:
$(MAKE) -C transpose run-opae
$(MAKE) -C spmv run-opae
$(MAKE) -C stencil run-opae
$(MAKE) -C lbm run-opae
$(MAKE) -C nearn run-opae
$(MAKE) -C guassian run-opae
$(MAKE) -C kmeans run-opae

View file

@ -256,17 +256,10 @@ free(allPlatforms);*/
size_t kernel_size;
cl_int binary_status = 0;
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
std::abort();
oclHandles.program = clCreateProgramWithSource(
oclHandles.context, 1, (const char**)&kernel_bin, &kernel_size, &resultCL);
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
std::abort();
oclHandles.program = clCreateProgramWithBinary(
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
#endif
free(kernel_bin);
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. "

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?= $(SRC_DIR)/graph4096.txt
include ../common.mk

Binary file not shown.

View file

@ -15,7 +15,7 @@
#include "CLHelper.h"
#include "util.h"
#define MAX_THREADS_PER_BLOCK 256
#define MAX_THREADS_PER_BLOCK 16
// Structure to hold a node information
struct Node {

View file

@ -9,6 +9,11 @@ SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/oclUtils.cpp $(SRC_DIR)/shrUtils.cpp $(SRC
CXXFLAGS += -I$(SRC_DIR)
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?=
include ../common.mk

View file

@ -52,17 +52,12 @@ extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqPar
size_t kernel_size;
cl_int binary_status = 0;
cl_device_id device_id = oclGetFirstDev(cxGPUContext);
#ifdef HOSTGPU
ciErrNum = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
shrCheckError(ciErrNum, CL_SUCCESS);
cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&kernel_bin, &kernel_size, &ciErrNum);
#else
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
shrCheckError(ciErrNum, CL_SUCCESS);
cpBlackScholes = clCreateProgramWithBinary(
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
#endif
shrCheckError(ciErrNum, CL_SUCCESS);
shrLog("...building BlackScholes program\n");
ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);

View file

@ -6,17 +6,14 @@ XRT_SYN_DIR ?= $(VORTEX_HOME)/hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
ifeq ($(XLEN),64)
VX_LLCFLAGS += -target-feature +f -target-feature +d -target-abi lp64
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
STARTUP_ADDR ?= 0x180000000
else
VX_LLCFLAGS += -target-feature +f -target-abi ilp32f
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif
POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
POCL_PATH ?= $(TOOLDIR)/pocl
LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
@ -30,13 +27,10 @@ VX_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections
VX_CFLAGS += -I$(ROOT_DIR)/hw -I$(VORTEX_KN_PATH)/include -DXLEN_$(XLEN) -DNDEBUG
VX_CFLAGS += -Xclang -target-feature -Xclang +vortex
VX_CFLAGS += -Xclang -target-feature -Xclang +zicond
VX_CFLAGS += -mllvm -disable-loop-idiom-all # disable memset/memcpy loop idiom
VX_CFLAGS += -mllvm -disable-loop-idiom-all
#VX_CFLAGS += -mllvm -vortex-branch-divergence=0
#VX_CFLAGS += -mllvm -print-after-all
VX_LLCFLAGS += -target-feature +m -target-feature +vortex
#VX_LLCFLAGS += -mllvm -vortex-branch-divergence=0
VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/scripts/link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(ROOT_DIR)/kernel/libvortex.a $(VX_LIBS)
VX_BINTOOL += OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scripts/vxbin.py
@ -44,17 +38,20 @@ VX_BINTOOL += OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scri
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
CXXFLAGS += -pthread
CXXFLAGS += -I$(POCL_RT_PATH)/include
CXXFLAGS += -I$(POCL_PATH)/include
POCL_CC_FLAGS = LLVM_PREFIX=$(LLVM_VORTEX) POCL_VORTEX_BINTOOL="$(VX_BINTOOL)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)"
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
POCL_CC_FLAGS += POCL_DEBUG=all
POCL_RT_FLAGS += POCL_DEBUG=all
else
CXXFLAGS += -O2 -DNDEBUG
endif
LDFLAGS += -Wl,-rpath,$(LLVM_VORTEX)/lib
ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else
@ -68,16 +65,8 @@ endif
endif
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
OBJS_HOST := $(addsuffix .host.o, $(notdir $(SRCS)))
.DEFAULT_GOAL := all
all: $(PROJECT) kernel.pocl
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
kernel.pocl: $(SRC_DIR)/kernel.cl
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) $(POCL_CC_FLAGS) POCL_VORTEX_BINTOOL="$(VX_BINTOOL)" POCL_VORTEX_LLCFLAGS="$(VX_LLCFLAGS)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<
all: $(PROJECT)
%.cc.o: $(SRC_DIR)/%.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
@ -88,49 +77,36 @@ kernel.pocl: $(SRC_DIR)/kernel.cl
%.c.o: $(SRC_DIR)/%.c
$(CC) $(CXXFLAGS) -c $< -o $@
%.cc.host.o: $(SRC_DIR)/%.cc
$(CXX) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
$(PROJECT): $(OBJS)
$(CXX) $(CXXFLAGS) $(OBJS) $(LDFLAGS) -L$(ROOT_DIR)/runtime -lvortex -L$(POCL_PATH)/lib -lOpenCL -o $@
%.cpp.host.o: $(SRC_DIR)/%.cpp
$(CXX) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
$(PROJECT).host: $(OBJS)
$(CXX) $(CXXFLAGS) $(OBJS) $(LDFLAGS) -lOpenCL -o $@
%.c.host.o: $(SRC_DIR)/%.c
$(CC) $(CXXFLAGS) -DHOSTGPU -c $< -o $@
ifndef USE_SETUP
setup:
endif
$(PROJECT): setup $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out setup, $^) $(LDFLAGS) -L$(ROOT_DIR)/runtime -lvortex -L$(POCL_RT_PATH)/lib -lOpenCL -o $@
$(PROJECT).host: setup $(OBJS_HOST)
$(CXX) $(CXXFLAGS) $(filter-out setup, $^) $(LDFLAGS) -lOpenCL -o $@
run-gpu: $(PROJECT).host kernel.cl
run-gpu: $(PROJECT).host $(KERNEL_SRCS)
./$(PROJECT).host $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=simx ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) $(KERNEL_SRCS)
LD_LIBRARY_PATH=$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=simx ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=rtlsim ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) $(KERNEL_SRCS)
LD_LIBRARY_PATH=$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=rtlsim ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) kernel.pocl
SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=opae ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) $(KERNEL_SRCS)
SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=opae ./$(PROJECT) $(OPTS)
run-xrt: $(PROJECT) kernel.pocl
run-xrt: $(PROJECT) $(KERNEL_SRCS)
ifeq ($(TARGET), hw)
XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=xrt ./$(PROJECT) $(OPTS)
XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=xrt ./$(PROJECT) $(OPTS)
else
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) VORTEX_DRIVER=xrt ./$(PROJECT) $(OPTS)
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_PATH)/lib:$(ROOT_DIR)/runtime:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) $(POCL_CC_FLAGS) VORTEX_DRIVER=xrt ./$(PROJECT) $(OPTS)
endif
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean-kernel:
rm -rf *.dump *.pocl
rm -rf *.dump *.ll
clean-host:
rm -rf $(PROJECT) $(PROJECT).host *.o *.log .depend

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?= -n32
include ../common.mk

View file

@ -166,17 +166,10 @@ int main (int argc, char **argv) {
o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
if (program == NULL) {
cleanup();
return -1;

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/oclUtils.cpp $(SRC_DIR)/shrUtils.cpp $(SRC_DIR)/cmd_arg_reader.cpp
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?= -size=4096
include ../common.mk

View file

@ -30,11 +30,7 @@
// Name of the file with the source code for the computation kernel
// *********************************************************************
#ifdef HOSTGPU
const char* cSourceFile = "kernel.cl";
#else
const char* cSourceFile = "kernel.pocl";
#endif
// Host buffers for demo
// *********************************************************************
@ -171,12 +167,9 @@ int main(int argc, char **argv)
shrLog("clCreateProgramWithSource...\n");
cl_int binary_status;
cl_program program;
#ifdef HOSTGPU
program = clCreateProgramWithSource(
cxGPUContext, 1, (const char**)&cSourceCL, &szKernelLength, &ciErrNum);
#else
program = clCreateProgramWithBinary(cxGPUContext, 1, cdDevices, &szKernelLength, (const uint8_t**)&cSourceCL, &binary_status, &ciErrNum);
#endif
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
/*// Build the program with 'mad' Optimization option

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/clutils.cpp $(SRC_DIR)/utils.cpp
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
# usage: [-f <input_file>] [-s <size>]
OPTS ?= -q -s 32

View file

@ -883,16 +883,9 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
cl_program clProgramReturn;
// Create the program object
#ifdef HOSTGPU
int err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, &kernel_size, &status);
#else
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
clProgramReturn = clCreateProgramWithBinary(
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
#endif
free(kernel_bin);
cl_errChk(status, "Creating program", true);

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/read_input.c $(SRC_DIR)/rmse.c $(SRC_DIR)/kmeans_clustering.c $(SRC_DIR)/cluster.c $(SRC_DIR)/getopt.c
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
# usage: -f<features> -p<points>
OPTS ?= -f100 -p100

View file

@ -201,7 +201,7 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
size_t kernel_size;
cl_int binary_status = 0;
cl_program prog;
#ifdef HOSTGPU
err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
if (err != CL_SUCCESS) {
printf("ERROR: read_kernel_file() => %d\n", err);
@ -209,19 +209,6 @@ int allocate(int n_points, int n_features, int n_clusters, float **feature) {
}
prog = clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &err);
#else
err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
if (err != CL_SUCCESS) {
printf("ERROR: read_kernel_file() => %d\n", err);
return -1;
}
prog = clCreateProgramWithBinary(
context, 1, device_list, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateProgramWithBinary() => %d\n", err);
return -1;
}
#endif
free(kernel_bin);

View file

@ -15,8 +15,10 @@ lbm_macros.h: $(SRC_DIR)/lbm_macros.h
layout_config.h: $(SRC_DIR)/layout_config.h
cp $< $@
setup: lbm_macros.h layout_config.h
USE_SETUP := yes
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl lbm_macros.h layout_config.h
# Usage: #iter [-i input_file] [-o output_file]
OPTS ?= 1 -i $(SRC_DIR)/32_32_8_ldc.of

View file

@ -318,19 +318,11 @@ void OpenCL_initialize(struct pb_Parameters *p, OpenCL_Param *prm) {
size_t kernel_size;
cl_int binary_status = 0;
#ifdef HOSTGPU
clStatus = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
prm->clProgram = clCreateProgramWithSource(
prm->clContext, 1, (const char**)&kernel_bin, &kernel_size, &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
#else
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
prm->clProgram = clCreateProgramWithBinary(
prm->clContext, 1, &prm->clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
CHECK_ERROR("clCreateProgramWithBinary")
#endif
//char clOptions[100];
//sprintf(clOptions, "-I src/opencl_base");

View file

@ -7,12 +7,17 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/clutils.cpp $(SRC_DIR)/utils.cpp
OPTS ?= filelist.log
filelist.log:
echo "$(SRC_DIR)/cane4_0.db\n$(SRC_DIR)/cane4_1.db" > filelist.log
setup: filelist.log
USE_SETUP := yes
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl filelist.log
OPTS ?= filelist.log
include ../common.mk

View file

@ -882,16 +882,9 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
cl_program clProgramReturn;
// Create the program object
#ifdef HOSTGPU
int err = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&kernel_bin, &kernel_size, &status);
#else
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
clProgramReturn = clCreateProgramWithBinary(
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
#endif
free(kernel_bin);
cl_errChk(status, "Creating program", true);

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?= -n4
include ../common.mk

View file

@ -119,17 +119,10 @@ int main (int argc, char **argv) {
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?= -f -n16
include ../common.mk

View file

@ -129,17 +129,10 @@ int main (int argc, char **argv) {
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?= -n16
include ../common.mk

View file

@ -158,17 +158,10 @@ int main (int argc, char **argv) {
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
if (program == NULL) {
cleanup();
return -1;

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?= -n1024
include ../common.mk

View file

@ -169,17 +169,10 @@ int main(int argc, char **argv) {
cl_mem memObjects[2] = {0, 0};
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK_ERR(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK_ERR(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?= -n16
include ../common.mk

View file

@ -171,17 +171,10 @@ int main(int argc, char **argv) {
cl_mem memObjects[2] = {0, 0};
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK_ERR(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK_ERR(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

View file

@ -10,8 +10,10 @@ SRCS := $(SRC_DIR)/main.cc
common.h: $(SRC_DIR)/common.h
cp $< $@
setup: common.h
USE_SETUP := yes
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl common.h
OPTS ?= -n32

View file

@ -196,17 +196,10 @@ int main (int argc, char **argv) {
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

View file

@ -10,8 +10,10 @@ SRCS := $(SRC_DIR)/main.cc
common.h: $(SRC_DIR)/common.h
cp $< $@
setup: common.h
USE_SETUP := yes
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl common.h
OPTS ?= -n16

View file

@ -1,7 +1,7 @@
#ifndef COMMON_H
#define COMMON_H
#define TILE_SIZE 8
#define TILE_SIZE 4
#ifndef TYPE
#define TYPE float

View file

@ -197,17 +197,10 @@ int main (int argc, char **argv) {
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
if (program == NULL) {
cleanup();
return -1;

View file

@ -10,8 +10,10 @@ SRCS := $(SRC_DIR)/main.cc
common.h: $(SRC_DIR)/common.h
cp $< $@
setup: common.h
USE_SETUP := yes
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl common.h
OPTS ?= -n16

View file

@ -140,7 +140,7 @@ static void cleanup() {
}
uint32_t size = 16;
uint32_t tile_size = 8;
uint32_t tile_size = 4;
static void show_usage() {
printf("Usage: [-n size] [-t tile size] [-h: help]\n");
@ -201,17 +201,10 @@ int main (int argc, char **argv) {
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
if (program == NULL) {
cleanup();
return -1;

View file

@ -9,6 +9,11 @@ SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/parboil_opencl.c $(SRC_DIR)/args.c $(SRC_D
CXXFLAGS += -I$(SRC_DIR)
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
# Usage: -i matrix_file,vector_file [-o output_file]
OPTS ?= -i $(SRC_DIR)/1138_bus.mtx,$(SRC_DIR)/1138_bus.vec

View file

@ -162,19 +162,11 @@ int main(int argc, char **argv) {
cl_int binary_status = 0;
cl_program clProgram;
#ifdef HOSTGPU
clStatus = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
clProgram = clCreateProgramWithSource(
clContext, 1, (const char**)&kernel_bin, &kernel_size, &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
#else
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
clProgram = clCreateProgramWithBinary(
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
CHECK_ERROR("clCreateProgramWithBinary")
#endif
char clOptions[50];
sprintf(clOptions, "");

View file

@ -9,6 +9,11 @@ SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/args.c $(SRC_DIR)/parboil_opencl.c $(SRC_D
CXXFLAGS += -I$(SRC_DIR)
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
# Usage: #nx #ny #nz #iter -i input_file [-o output_file]
OPTS ?= 64 64 8 1 -i $(SRC_DIR)/64x64x8.bin

View file

@ -211,19 +211,11 @@ int main(int argc, char** argv) {
cl_int binary_status = 0;
cl_program clProgram;
#ifdef HOSTGPU
clStatus = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
clProgram = clCreateProgramWithSource(
clContext, 1, (const char**)&kernel_bin, &kernel_size, &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
#else
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
clProgram = clCreateProgramWithBinary(
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
CHECK_ERROR("clCreateProgramWithBinary")
#endif
char clOptions[50];
sprintf(clOptions,"-I src/opencl_base");

View file

@ -7,6 +7,11 @@ SRC_DIR := $(VORTEX_HOME)/tests/opencl/$(PROJECT)
SRCS := $(SRC_DIR)/main.cc $(SRC_DIR)/oclUtils.cpp $(SRC_DIR)/shrUtils.cpp $(SRC_DIR)/cmd_arg_reader.cpp $(SRC_DIR)/transpose_gold.cpp
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl
OPTS ?= -width=128 -height=128
#CXXFLAGS += -D_DEBUG

View file

@ -23,7 +23,7 @@
#include "oclUtils.h"
#include "shrQATest.h"
#define BLOCK_DIM 16
#define BLOCK_DIM 4
// max GPU's to manage for multi-GPU parallel compute
const unsigned int MAX_GPU_COUNT = 1;
@ -319,7 +319,6 @@ int runTest( const int argc, const char** argv)
size_t kernel_size;
cl_int binary_status = 0;
#ifdef HOSTGPU
ciErrNum = read_kernel_file("kernel.cl", &kernel_bin, &kernel_size);
if (ciErrNum != CL_SUCCESS) {
shrLog(" Error %i in read_kernel_file call !!!\n\n", ciErrNum);
@ -331,19 +330,6 @@ int runTest( const int argc, const char** argv)
shrLog(" Error %i in clCreateProgramWithSource call !!!\n\n", ciErrNum);
return ciErrNum;
}
#else
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
if (ciErrNum != CL_SUCCESS) {
shrLog(" Error %i in read_kernel_file call !!!\n\n", ciErrNum);
return ciErrNum;
}
rv_program = clCreateProgramWithBinary(
cxGPUContext, 1, cdDevices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
if (ciErrNum != CL_SUCCESS) {
shrLog(" Error %i in clCreateProgramWithBinary call !!!\n\n", ciErrNum);
return ciErrNum;
}
#endif
// build the program
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);

View file

@ -10,8 +10,10 @@ SRCS := $(SRC_DIR)/main.cc
common.h: $(SRC_DIR)/common.h
cp $< $@
setup: common.h
USE_SETUP := yes
kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
KERNEL_SRCS := kernel.cl common.h
OPTS ?= -n64

View file

@ -181,17 +181,10 @@ int main (int argc, char **argv) {
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

View file

@ -6,13 +6,12 @@
#endif
typedef struct {
uint32_t num_tasks;
uint32_t grid_dim[2];
uint32_t width;
uint32_t log2_width;
uint64_t lmem_addr;
uint64_t I_addr;
uint64_t W_addr;
uint64_t O_addr;
bool use_lmem;
} kernel_arg_t;
#endif

View file

@ -7,19 +7,14 @@ inline char is_log2(uint32_t x) {
return ((x & (x-1)) == 0);
}
void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
auto I = reinterpret_cast<TYPE*>(arg->I_addr);
auto W = reinterpret_cast<TYPE*>((arg->lmem_addr != 0) ? arg->lmem_addr : arg->W_addr);
auto W = reinterpret_cast<TYPE*>(arg->use_lmem ? __local_mem(0) : (void*)arg->W_addr);
auto O = reinterpret_cast<TYPE*>(arg->O_addr);
auto width = arg->width;
uint32_t row, col;
if (is_log2(width)) {
row = task_id >> arg->log2_width;
col = task_id & (width-1);
} else {
row = task_id / width;
}
int col = blockIdx.x;
int row = blockIdx.y;
// Adjust for padded borders
int paddedWidth = width + 2;
@ -46,14 +41,13 @@ void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
if (arg->lmem_addr != 0) {
if (arg->use_lmem) {
// populate local memory
auto W = reinterpret_cast<TYPE*>(arg->W_addr);
auto L = reinterpret_cast<TYPE*>(arg->lmem_addr);
auto L = reinterpret_cast<TYPE*>(__local_mem(0));
for (int i = 0; i < (3*3); ++i) {
L[i] = W[i];
}
}
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}

View file

@ -135,9 +135,7 @@ static void parse_args(int argc, char **argv) {
void cleanup() {
if (device) {
vx_mem_free(I_buffer);
if (!use_lmem) {
vx_mem_free(W_buffer);
}
vx_mem_free(O_buffer);
vx_mem_free(krnl_buffer);
vx_mem_free(args_buffer);
@ -155,14 +153,13 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = size * size;
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "matrix size: " << size << "x" << size << std::endl;
kernel_arg.num_tasks = num_points;
kernel_arg.grid_dim[0] = size;
kernel_arg.grid_dim[1] = size;
kernel_arg.width = size;
kernel_arg.log2_width = log2(size);
kernel_arg.use_lmem = use_lmem;
uint32_t o_points = size * size;
uint32_t i_points = (size+2) * (size+2);
@ -188,8 +185,6 @@ int main(int argc, char *argv[]) {
cleanup();
exit(1);
}
} else {
kernel_arg.lmem_addr = 0;
}
std::cout << "dev_argI=0x" << std::hex << kernel_arg.I_addr << std::endl;

View file

@ -3,13 +3,13 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
uint32_t count = arg->task_size;
uint32_t offset = task_id * count;
uint32_t offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
@ -18,6 +18,5 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}

View file

@ -7,10 +7,12 @@
// Parallel Selection sort
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t task_id = blockIdx.x;
int value = src_ptr[task_id];
// none taken
@ -78,6 +80,5 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}

View file

@ -5,19 +5,19 @@
#include <vx_print.h>
#include "common.h"
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
typedef void (*PFN_Kernel)(kernel_arg_t* __UNIFORM__ arg);
inline float __ieee754_sqrtf (float x) {
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
return x;
}
void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_iadd(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
int32_t a = src0_ptr[offset+i];
@ -27,12 +27,12 @@ void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_imul(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -42,12 +42,12 @@ void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_idiv(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -57,12 +57,12 @@ void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_idiv_mul(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -74,12 +74,12 @@ void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fadd(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
@ -89,12 +89,12 @@ void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fsub(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -104,12 +104,12 @@ void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fmul(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -119,12 +119,12 @@ void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fmadd(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -134,12 +134,12 @@ void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fmsub(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -149,12 +149,12 @@ void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fnmadd(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -164,12 +164,12 @@ void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fnmsub(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -179,12 +179,12 @@ void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fnmadd_madd(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -196,12 +196,12 @@ void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fdiv(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -211,12 +211,12 @@ void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fdiv2(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -228,12 +228,12 @@ void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fsqrt(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -243,12 +243,12 @@ void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_ftoi(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -259,12 +259,12 @@ void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_ftou(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -275,12 +275,12 @@ void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_itof(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -291,12 +291,12 @@ void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_utof(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -311,12 +311,12 @@ inline float fclamp(float a, float b, float c) {
return fmin(fmax(a, b), c);
}
void kernel_fclamp(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_fclamp(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -329,12 +329,12 @@ inline int iclamp(int a, int b, int c) {
return std::min(std::max(a, b), c);
}
void kernel_iclamp(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_iclamp(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int*)arg->src0_addr;
auto src1_ptr = (int*)arg->src1_addr;
auto dst_ptr = (int*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
@ -343,12 +343,12 @@ void kernel_iclamp(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_trigo(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_trigo(kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
auto offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
uint32_t j = offset + i;
auto a = src0_ptr[j];
@ -361,7 +361,7 @@ void kernel_trigo(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
}
void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_bar(kernel_arg_t* __UNIFORM__ arg) {
auto num_cores = vx_num_cores();
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
@ -389,10 +389,10 @@ void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
vx_barrier(0, num_warps);
// update destination
dst_ptr[task_id] += 1;
dst_ptr[blockIdx.x] += 1;
}
void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_gbar(kernel_arg_t* __UNIFORM__ arg) {
auto num_cores = vx_num_cores();
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
@ -418,7 +418,7 @@ void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
vx_barrier(0x80000000, num_cores);
// update destination
dst_ptr[task_id] += 1;
dst_ptr[blockIdx.x] += 1;
}
static PFN_Kernel sc_tests[24];
@ -452,6 +452,5 @@ void register_tests() {
int main() {
register_tests();
auto arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
return 0;
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)sc_tests[arg->testid], arg);
}

View file

@ -3,13 +3,13 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
uint32_t offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
}
@ -19,6 +19,5 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}

View file

@ -3,17 +3,16 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
uint64_t* src_ptr = (uint64_t*)arg->src_addr;
uint32_t* dst_ptr = (uint32_t*)arg->dst_addr;
int32_t* addr_ptr = (int32_t*)(src_ptr[task_id]);
int32_t* addr_ptr = (int32_t*)(src_ptr[blockIdx.x]);
dst_ptr[task_id] = *addr_ptr;
dst_ptr[blockIdx.x] = *addr_ptr;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}

View file

@ -3,13 +3,13 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
uint32_t stride = arg->stride;
uint32_t* addr_ptr = (uint32_t*)arg->src0_addr;
float* src_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * stride;
uint32_t offset = blockIdx.x * stride;
for (uint32_t i = 0; i < stride; ++i) {
float value = 0.0f;
@ -24,6 +24,5 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(1, &arg->num_tasks, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}

View file

@ -4,15 +4,14 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int cid = vx_core_id();
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
uint32_t cid = vx_core_id();
char* src_ptr = (char*)arg->src_addr;
char value = 'A' + src_ptr[task_id];
vx_printf("cid=%d: task=%d, value=%c\n", cid, task_id, value);
char value = 'A' + src_ptr[blockIdx.x];
vx_printf("cid=%d: task=%d, value=%c\n", cid, blockIdx.x, value);
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}

View file

@ -6,10 +6,10 @@
#endif
typedef struct {
uint32_t size;
uint32_t tile_size;
uint32_t grid_dim[2];
uint32_t block_dim[2];
uint32_t size;
uint32_t tile_size;
uint64_t A_addr;
uint64_t B_addr;
uint64_t C_addr;

View file

@ -170,10 +170,9 @@ int main(int argc, char *argv[]) {
kernel_arg.tile_size = tile_size;
// check work group occupancy
uint32_t max_barriers, max_localmem;
RT_CHECK(vx_check_occupancy(device, group_size, &max_barriers, &max_localmem));
std::cout << "occupancy: max_barriers=" << max_barriers << ", max_localmem=" << max_localmem << " bytes" << std::endl;
RT_CHECK(max_barriers < 2);
uint32_t max_localmem;
RT_CHECK(vx_check_occupancy(device, group_size, &max_localmem));
std::cout << "occupancy: max_localmem=" << max_localmem << " bytes" << std::endl;
RT_CHECK(max_localmem < local_mem);
// allocate device memory

View file

@ -6,9 +6,8 @@
#endif
typedef struct {
uint32_t num_tasks;
uint32_t grid_dim[2];
uint32_t size;
uint32_t log2_size;
uint64_t A_addr;
uint64_t B_addr;
uint64_t C_addr;

View file

@ -7,31 +7,24 @@ inline char is_log2(uint32_t x) {
return ((x & (x-1)) == 0);
}
void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
auto A = reinterpret_cast<TYPE*>(arg->A_addr);
auto B = reinterpret_cast<TYPE*>(arg->B_addr);
auto C = reinterpret_cast<TYPE*>(arg->C_addr);
auto size = arg->size;
uint32_t row, col;
if (is_log2(size)) {
row = task_id >> arg->log2_size;
col = task_id & (size-1);
} else {
row = task_id / size;
col = task_id % size;
}
int col = blockIdx.x;
int row = blockIdx.y;
TYPE sum(0);
for (int e = 0; e < size; ++e) {
sum += A[row * size + e] * B[e * size + col];
}
C[task_id] = sum;
C[row * size + col] = sum;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(2, arg->grid_dim, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}

View file

@ -146,9 +146,9 @@ int main(int argc, char *argv[]) {
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "matrix size: " << size << "x" << size << std::endl;
kernel_arg.num_tasks = size_sq;
kernel_arg.grid_dim[0] = size;
kernel_arg.grid_dim[1] = size;
kernel_arg.size = size;
kernel_arg.log2_size = log2(size);
// allocate device memory
std::cout << "allocate device memory" << std::endl;

View file

@ -3,23 +3,22 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
uint32_t num_points = arg->num_points;
auto src_ptr = (TYPE*)arg->src_addr;
auto dst_ptr = (TYPE*)arg->dst_addr;
auto ref_value = src_ptr[task_id];
auto ref_value = src_ptr[blockIdx.x];
uint32_t pos = 0;
for (uint32_t i = 0; i < num_points; ++i) {
auto cur_value = src_ptr[i];
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < blockIdx.x));
}
dst_ptr[pos] = ref_value;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}

View file

@ -1,18 +1,15 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
dst_ptr[task_id] = src0_ptr[task_id] + src1_ptr[task_id];
dst_ptr[blockIdx.x] = src0_ptr[blockIdx.x] + src1_ptr[blockIdx.x];
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
return vx_spawn_threads(1, &arg->num_points, nullptr, (vx_kernel_func_cb)kernel_body, arg);
}