add benchmakrs

This commit is contained in:
Shinnung Jeong 2024-04-23 00:12:51 -04:00
parent 7ee20fca2b
commit ea26d69751
349 changed files with 190817 additions and 0 deletions

1
tests/opencl/b+tree/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*.txt

View file

@ -0,0 +1,7 @@
PROJECT = btree
SRCS = main.cc kernel_gpu_opencl_wrapper.cc kernel_gpu_opencl_wrapper_2.cc num.cc opencl.cc timer.cc timing.cpp
OPTS ?= file mil.txt command command.txt
include common.mk

View file

@ -0,0 +1,159 @@
include ../../common/make.config
#Can be changed by `make TYPE=CPU`
TYPE = GPU
#Library
ifeq ($(TYPE),GPU)
OPENCL_INC = $(NV_OPENCL_INC)
OPENCL_LIB = $(NV_OPENCL_LIB)
else
OPENCL_INC = $(INTEL_OPENCL_INC)
OPENCL_LIB = $(INTEL_OPENCL_LIB)
endif
C_C = gcc
OCL_LIB = -lOpenCL
# ====================================================================================================100
# NVCC
# ====================================================================================================100
# CUD_C = nvcc
# OMP_FLAG = -Xcompiler paste_one_here
# ====================================================================================================100
# PGCC
# ====================================================================================================100
# C_C = pgcc
# OMP_FLAG = -mp
# ACC_FLAG = -ta=nvidia \
# -Minfo \
# -Mbounds
# ====================================================================================================100
# END
# ====================================================================================================100
# ========================================================================================================================================================================================================200
# EXECUTABLES (LINK OBJECTS TOGETHER INTO BINARY)
# ========================================================================================================================================================================================================200
b+tree.out: ./main.o \
./kernel/kernel_gpu_opencl_wrapper.o \
./kernel/kernel_gpu_opencl_wrapper_2.o \
./util/timer/timer.o \
./util/num/num.o \
./util/opencl/opencl.o \
./timing.o
$(C_C) ./main.o \
./kernel/kernel_gpu_opencl_wrapper.o \
./kernel/kernel_gpu_opencl_wrapper_2.o \
./util/timer/timer.o \
./util/num/num.o \
./util/opencl/opencl.o \
./timing.o \
-lm \
-L$(OPENCL_LIB) \
-lOpenCL \
-o b+tree.out
run: b+tree.out
./b+tree.out file ../../data/b+tree/mil.txt command ../../data/b+tree/command.txt -p 0 -d 0
# ========================================================================================================================================================================================================200
# OBJECTS (COMPILE SOURCE FILES INTO OBJECTS)
# ========================================================================================================================================================================================================200
# ======================================================================================================================================================150
# MAIN FUNCTION
# ======================================================================================================================================================150
main.o: ./common.h \
./main.h \
./main.c
$(C_C) $(KERNEL_DIM) ./main.c \
-c \
-I$(OPENCL_INC) \
-o ./main.o \
-Wno-unused-result \
-O3
# ======================================================================================================================================================150
# KERNELS
# ======================================================================================================================================================150
./kernel/kernel_gpu_opencl_wrapper.o: ./common.h \
./kernel/kernel_gpu_opencl_wrapper.h \
./kernel/kernel_gpu_opencl_wrapper.c
$(C_C) $(KERNEL_DIM) ./kernel/kernel_gpu_opencl_wrapper.c \
-c \
-o ./kernel/kernel_gpu_opencl_wrapper.o \
-O3 \
-I../util \
-DTIMING \
-I$(OPENCL_INC)
./kernel/kernel_gpu_opencl_wrapper_2.o: ./common.h \
./kernel/kernel_gpu_opencl_wrapper_2.h \
./kernel/kernel_gpu_opencl_wrapper_2.c
$(C_C) $(KERNEL_DIM) ./kernel/kernel_gpu_opencl_wrapper_2.c \
-c \
-o ./kernel/kernel_gpu_opencl_wrapper_2.o \
-O3 \
-I../util \
-DTIMING \
-I$(OPENCL_INC)
timing.o: ../util/timing.h ../util/timing.c
$(C_C) ../util/timing.c -I../util -c -o timing.o -I$(OPENCL_INC)
# ======================================================================================================================================================150
# UTILITIES
# ======================================================================================================================================================150
./util/timer/timer.o: ./common.h \
./util/timer/timer.h \
./util/timer/timer.c
$(C_C) ./util/timer/timer.c \
-c \
-o ./util/timer/timer.o \
-O3
./util/num/num.o: ./common.h \
./util/num/num.h \
./util/num/num.c
$(C_C) ./util/num/num.c \
-c \
-o ./util/num/num.o \
-O3
./util/opencl/opencl.o: ./common.h \
./util/opencl/opencl.h \
./util/opencl/opencl.c
$(C_C) ./util/opencl/opencl.c \
-c \
-o ./util/opencl/opencl.o \
-O3 \
-I$(OPENCL_INC)
# ======================================================================================================================================================150
# END
# ======================================================================================================================================================150
# ========================================================================================================================================================================================================200
# DELETE
# ========================================================================================================================================================================================================200
clean:
rm *.o *.out \
./kernel/*.o \
./util/timer/*.o \
./util/num/*.o \
./util/opencl/*.o \
output.txt 2>/dev/null || true
# ========================================================================================================================================================================================================200
# END
# ========================================================================================================================================================================================================200

View file

@ -0,0 +1,7 @@
******Adjustable work group size*****
The kernel 1: RD_WG_SIZE_0_0 RD_WG_SIZE_0
The kernel 2: RD_WG_SIZE_1_0 RD_WG_SIZE_1
USAGE:
make clean
make KERNEL_DIM="-DRD_WG_SIZE_0=256 -DRD_WG_SIZE_1=256"

BIN
tests/opencl/b+tree/btree Executable file

Binary file not shown.

View file

@ -0,0 +1,476 @@
// # ifdef __cplusplus
// extern "C" {
// # endif
#ifndef LIST_H
# define LIST_H
//===============================================================================================================================================================================================================200
// DEFINE/INCLUDE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE (for some reason these are not recognized when defined in main file before this one is included)
//======================================================================================================================================================150
#include <stdint.h> // (in path known to compiler) needed by uint32_t
#include <stdbool.h> // (in path known to compiler) needed by true/false, bool
#include <stdlib.h> // (in path known to compiler) needed by malloc
#include <stdio.h>
//======================================================================================================================================================150
// DEFINE
//======================================================================================================================================================150
#define fp float
#define Version "1.5"
#ifdef WINDOWS
#define bool char
#define false 0
#define true 1
#endif
/* #define DEFAULT_ORDER 256 */
#ifdef RD_WG_SIZE_0_0
#define DEFAULT_ORDER RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define DEFAULT_ORDER RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define DEFAULT_ORDER RD_WG_SIZE
#else
#define DEFAULT_ORDER 256
#endif
#ifdef RD_WG_SIZE_1_0
#define DEFAULT_ORDER_2 RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define DEFAULT_ORDER_2 RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define DEFAULT_ORDER_2 RD_WG_SIZE
#else
#define DEFAULT_ORDER_2 256
#endif
#define malloc(size) ({ \
void *_tmp; \
\
if (!(_tmp = malloc(size))) { \
fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__); \
exit(-1); \
} \
\
_tmp; \
})
//======================================================================================================================================================150
// STRUCTURES
//======================================================================================================================================================150
// struct list_item;
typedef struct list_item list_item_t;
typedef struct list_t {
list_item_t *head, *tail;
uint32_t length;
int32_t (*compare)(const void *key, const void *with);
void (*datum_delete)(void *);
} list_t;
typedef list_item_t *list_iterator_t;
typedef list_item_t *list_reverse_iterator_t;
/* Type representing the record
* to which a given key refers.
* In a real B+ tree system, the
* record would hold data (in a database)
* or a file (in an operating system)
* or some other information.
* Users can rewrite this part of the code
* to change the type and content
* of the value field.
*/
typedef struct record {
int value;
} record;
/* Type representing a node in the B+ tree.
* This type is general enough to serve for both
* the leaf and the internal node.
* The heart of the node is the array
* of keys and the array of corresponding
* pointers. The relation between keys
* and pointers differs between leaves and
* internal nodes. In a leaf, the index
* of each key equals the index of its corresponding
* pointer, with a maximum of order - 1 key-pointer
* pairs. The last pointer points to the
* leaf to the right (or NULL in the case
* of the rightmost leaf).
* In an internal node, the first pointer
* refers to lower nodes with keys less than
* the smallest key in the keys array. Then,
* with indices i starting at 0, the pointer
* at i + 1 points to the subtree with keys
* greater than or equal to the key in this
* node at index i.
* The num_keys field is used to keep
* track of the number of valid keys.
* In an internal node, the number of valid
* pointers is always num_keys + 1.
* In a leaf, the number of valid pointers
* to data is always num_keys. The
* last leaf pointer points to the next leaf.
*/
typedef struct node {
void ** pointers;
int * keys;
struct node * parent;
bool is_leaf;
int num_keys;
struct node * next; // Used for queue.
} node;
//
typedef struct knode {
int location;
int indices [DEFAULT_ORDER + 1];
int keys [DEFAULT_ORDER + 1];
bool is_leaf;
int num_keys;
} knode;
struct list_item {
struct list_item *pred, *next;
void *datum;
};
//===============================================================================================================================================================================================================200
// PROTOTYPES
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// Other
//======================================================================================================================================================150
void
list_item_init( list_item_t *li,
void *datum);
void
list_item_delete( list_item_t *li,
void (*datum_delete)(void *datum));
void
list_insert_item_tail( list_t *l,
list_item_t *i);
void
list_insert_item_before(list_t *l,
list_item_t *next,
list_item_t *i);
void
list_insert_item_after( list_t *l,
list_item_t *pred,
list_item_t *i);
void
list_insert_item_sorted(list_t *l,
list_item_t *i);
//======================================================================================================================================================150
// ???
//======================================================================================================================================================150
void
list_init( list_t *l,
int32_t (*compare)(const void *key, const void *with),
void (*datum_delete)(void *datum));
void
list_delete(list_t *l);
void
list_reset(list_t *l);
void
list_insert_head( list_t *l,
void *v);
void
list_insert_tail( list_t *l,
void *v);
void
list_insert_before(list_t *l,
list_item_t *next,
void *v);
void
list_insert_after( list_t *l,
list_item_t *pred,
void *v);
void
list_insert_sorted( list_t *l,
void *v);
void
list_insert_item_head( list_t *l,
list_item_t *i);
void
list_remove_item( list_t *l,
list_item_t *i);
void
list_remove_head(list_t *l);
void
list_remove_tail(list_t *l);
list_item_t *
list_find_item( list_t *l,
void *datum);
list_item_t *
list_get_head_item(list_t *l);
list_item_t *
list_get_tail_item(list_t *l);
void *
list_find( list_t *l,
void *datum);
void *
list_get_head(list_t *l);
void *
list_get_tail(list_t *l);
uint32_t
list_get_length(list_t *l);
bool
list_is_empty(list_t *l);
bool
list_not_empty(list_t *l);
void
list_visit_items( list_t *l,
void (*visitor)(void *v));
void *
list_item_get_datum(list_item_t *li);
void
list_iterator_init( list_t *l,
list_iterator_t *li);
void
list_iterator_delete(list_iterator_t *li);
void
list_iterator_next(list_iterator_t *li);
void
list_iterator_prev(list_iterator_t *li);
void *
list_iterator_get_datum(list_iterator_t *li);
bool
list_iterator_is_valid(list_iterator_t *li);
void
list_reverse_iterator_init( list_t *l,
list_iterator_t *li);
void
list_reverse_iterator_delete(list_iterator_t *li);
void
list_reverse_iterator_next(list_iterator_t *li);
void
list_reverse_iterator_prev(list_iterator_t *li);
void *
list_reverse_iterator_get_datum(list_iterator_t *li);
bool
list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
//======================================================================================================================================================150
// Output and utility
//======================================================================================================================================================150
void *
kmalloc(int size);
long
transform_to_cuda( node *n,
bool verbose); //returns actual mem used in a long
void
usage_1( void );
void
usage_2( void );
void
enqueue( node * new_node );
node *
dequeue( void );
int
height( node * root );
int
path_to_root( node * root,
node * child );
void
print_leaves( node * root );
void
print_tree( node * root );
node *
find_leaf( node * root,
int key,
bool verbose );
record *
find( node * root,
int key,
bool verbose );
int
cut( int length );
//======================================================================================================================================================150
// Insertion
//======================================================================================================================================================150
record *
make_record(int value);
node *
make_node( void );
node *
make_leaf( void );
int
get_left_index( node * parent,
node * left);
node *
insert_into_leaf( node * leaf,
int key, record * pointer );
node *
insert_into_leaf_after_splitting( node * root,
node * leaf,
int key,
record * pointer);
node *
insert_into_node( node * root,
node * parent,
int left_index,
int key,
node * right);
node *
insert_into_node_after_splitting( node * root,
node * parent,
int left_index,
int key,
node * right);
node *
insert_into_parent( node * root,
node * left,
int key,
node * right);
node *
insert_into_new_root( node * left,
int key,
node * right);
node *
start_new_tree( int key,
record * pointer);
node *
insert( node * root,
int key,
int value );
//======================================================================================================================================================150
// Deletion
//======================================================================================================================================================150
int
get_neighbor_index(node * n );
node *
adjust_root(node * root);
node *
coalesce_nodes( node * root,
node * n,
node * neighbor,
int neighbor_index,
int k_prime);
node *
redistribute_nodes( node * root,
node * n,
node * neighbor,
int neighbor_index,
int k_prime_index,
int k_prime);
node *
delete_entry( node * root,
node * n,
int key,
void * pointer );
node *
deleteVal( node * root,
int key );
//===============================================================================================================================================================================================================200
// HEADER
//===============================================================================================================================================================================================================200
extern int platform_id_inuse;
extern int device_id_inuse;
extern cl_device_type device_type;
// int main( int argc,
// char *argv []);
//===============================================================================================================================================================================================================200
// END
//===============================================================================================================================================================================================================200
// #endif
// # ifdef __cplusplus
// }
# endif

View file

@ -0,0 +1,126 @@
XLEN ?= 32
TOOLDIR ?= /opt
TARGET ?= opaesim
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
K_CFLAGS += -march=rv64imafd -mabi=ilp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
K_CFLAGS += -I$(VORTEX_KN_PATH)/include -DNDEBUG -DLLVM_VOTEX
K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a -lm
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing -Wno-unused-result
CXXFLAGS += -pthread
CXXFLAGS += -I$(POCL_RT_PATH)/include
ifdef HOSTGPU
CXXFLAGS += -DHOSTGPU
LDFLAGS += -lOpenCL
else
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
endif
# Debugigng
#ifdef DEBUG
CXXFLAGS += -g -O0
#else
# CXXFLAGS += -O2 -DNDEBUG
#endif
ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else
ifeq ($(TARGET), asesim)
OPAE_DRV_PATHS ?= libopae-c-ase.so
else
ifeq ($(TARGET), opaesim)
OPAE_DRV_PATHS ?= libopae-c-sim.so
endif
endif
endif
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
all: $(PROJECT) kernel.pocl kernel2.pocl
kernel.pocl: kernel_gpu_opencl.cl
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel_gpu_opencl.cl
kernel2.pocl: kernel_gpu_opencl_2.cl
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel2.pocl kernel_gpu_opencl_2.cl
#backprop.o: backprop.c
# $(CXX) $(CXXFLAGS) backprop.c -c -Wno-unused-result
%.cc.o: %.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
%.cpp.o: %.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@
%.c.o: %.c
$(CC) $(CXXFLAGS) -c $< -o $@
$(PROJECT): $(OBJS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-hostgpu: $(PROJECT) kernel.pocl
./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) kernel.pocl
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-xrt: $(PROJECT) kernel.pocl
ifeq ($(TARGET), hw)
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
else
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
endif
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.dump *.pocl
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View file

@ -0,0 +1,109 @@
// #ifdef __cplusplus
// extern "C" {
// #endif
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// DEFINE
//======================================================================================================================================================150
// double precision support (switch between as needed for NVIDIA/AMD)
#ifdef AMDAPP
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
// clBuildProgram compiler cannot link this file for some reason, so had to redefine constants and structures below
//#include ../common.h // (in directory specified to compiler) main function header
//======================================================================================================================================================150
// DEFINE (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
//======================================================================================================================================================150
// change to double if double precision needed
#define fp float
#define DEFAULT_ORDER 256
//======================================================================================================================================================150
// STRUCTURES (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
//======================================================================================================================================================150
// Type representing the record to which a given key refers. In a real B+ tree system, the record would hold data (in a database) or a file (in an operating system) or some other information.
// Users can rewrite this part of the code to change the type and content of the value field.
typedef struct record {
int value;
} record;
// ???
typedef struct knode {
int location;
int indices [DEFAULT_ORDER + 1];
int keys [DEFAULT_ORDER + 1];
bool is_leaf;
int num_keys;
} knode;
//========================================================================================================================================================================================================200
// findK function
//========================================================================================================================================================================================================200
__kernel void
findK( long height,
__global knode *knodesD,
long knodes_elem,
__global record *recordsD,
__global long *currKnodeD,
__global long *offsetD,
__global int *keysD,
__global record *ansD)
{
// private thread IDs
int thid = get_local_id(0);
int bid = get_group_id(0);
// processtree levels
int i;
for(i = 0; i < height; i++){
// if value is between the two keys
if((knodesD[currKnodeD[bid]].keys[thid]) <= keysD[bid] && (knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid])){
// this conditional statement is inserted to avoid crush due to but in original code
// "offset[bid]" calculated below that addresses knodes[] in the next iteration goes outside of its bounds cause segmentation fault
// more specifically, values saved into knodes->indices in the main function are out of bounds of knodes that they address
if(knodesD[offsetD[bid]].indices[thid] < knodes_elem){
offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
}
}
//__syncthreads();
barrier(CLK_LOCAL_MEM_FENCE);
// set for next tree level
if(thid==0){
currKnodeD[bid] = offsetD[bid];
}
//__syncthreads();
barrier(CLK_LOCAL_MEM_FENCE);
}
//At this point, we have a candidate leaf node which may contain
//the target record. Check each key to hopefully find the record
if(knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]){
ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
}
}
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200
// #ifdef __cplusplus
// }
// #endif

View file

@ -0,0 +1,111 @@
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// DEFINE
//======================================================================================================================================================150
// double precision support (switch between as needed for NVIDIA/AMD)
#ifdef AMDAPP
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
// clBuildProgram compiler cannot link this file for some reason, so had to redefine constants and structures below
// #include ../common.h // (in directory specified to compiler) main function header
//======================================================================================================================================================150
// DEFINE (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
//======================================================================================================================================================150
// change to double if double precision needed
#define fp float
#define DEFAULT_ORDER_2 256
//======================================================================================================================================================150
// STRUCTURES (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
//======================================================================================================================================================150
// ???
typedef struct knode {
int location;
int indices [DEFAULT_ORDER_2 + 1];
int keys [DEFAULT_ORDER_2 + 1];
bool is_leaf;
int num_keys;
} knode;
//========================================================================================================================================================================================================200
// findRangeK function
//========================================================================================================================================================================================================200
__kernel void
findRangeK( long height,
__global knode *knodesD,
long knodes_elem,
__global long *currKnodeD,
__global long *offsetD,
__global long *lastKnodeD,
__global long *offset_2D,
__global int *startD,
__global int *endD,
__global int *RecstartD,
__global int *ReclenD)
{
// private thread IDs
int thid = get_local_id(0);
int bid = get_group_id(0);
// ???
int i;
for(i = 0; i < height; i++){
if((knodesD[currKnodeD[bid]].keys[thid] <= startD[bid]) && (knodesD[currKnodeD[bid]].keys[thid+1] > startD[bid])){
// this conditional statement is inserted to avoid crush due to but in original code
// "offset[bid]" calculated below that later addresses part of knodes goes outside of its bounds cause segmentation fault
// more specifically, values saved into knodes->indices in the main function are out of bounds of knodes that they address
if(knodesD[currKnodeD[bid]].indices[thid] < knodes_elem){
offsetD[bid] = knodesD[currKnodeD[bid]].indices[thid];
}
}
if((knodesD[lastKnodeD[bid]].keys[thid] <= endD[bid]) && (knodesD[lastKnodeD[bid]].keys[thid+1] > endD[bid])){
// this conditional statement is inserted to avoid crush due to but in original code
// "offset_2[bid]" calculated below that later addresses part of knodes goes outside of its bounds cause segmentation fault
// more specifically, values saved into knodes->indices in the main function are out of bounds of knodes that they address
if(knodesD[lastKnodeD[bid]].indices[thid] < knodes_elem){
offset_2D[bid] = knodesD[lastKnodeD[bid]].indices[thid];
}
}
//__syncthreads();
barrier(CLK_LOCAL_MEM_FENCE);
// set for next tree level
if(thid==0){
currKnodeD[bid] = offsetD[bid];
lastKnodeD[bid] = offset_2D[bid];
}
// __syncthreads();
barrier(CLK_LOCAL_MEM_FENCE);
}
// Find the index of the starting record
if(knodesD[currKnodeD[bid]].keys[thid] == startD[bid]){
RecstartD[bid] = knodesD[currKnodeD[bid]].indices[thid];
}
// __syncthreads();
barrier(CLK_LOCAL_MEM_FENCE);
// Find the index of the ending record
if(knodesD[lastKnodeD[bid]].keys[thid] == endD[bid]){
ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1;
}
}
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200

View file

@ -0,0 +1,753 @@
// #ifdef __cplusplus
// extern "C" {
// #endif
//========================================================================================================================================================================================================200
// DEFINE/INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// LIBRARIES
//======================================================================================================================================================150
#include <CL/cl.h> // (in directory provided to compiler) needed by OpenCL types and functions
#include <string.h> // (in directory known to compiler) needed by memset
//======================================================================================================================================================150
// COMMON
//======================================================================================================================================================150
#include "common.h" // (in directory provided here)
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "opencl.h" // (in directory provided here)
#include "timer.h" // (in directory provided here)
#ifdef TIMING
#include "timing.h"
#endif
//======================================================================================================================================================150
// HEADER
//======================================================================================================================================================150
#include "kernel_gpu_opencl_wrapper.h" // (in directory provided here)
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp_ = fopen(filename, "r");
if (NULL == fp_) {
fprintf(stderr, "Failed to load kernel. %s\n", filename);
return -1;
}
fseek(fp_ , 0 , SEEK_END);
long fsize = ftell(fp_);
rewind(fp_);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp_);
fclose(fp_);
return 0;
}
//========================================================================================================================================================================================================200
// KERNEL_GPU_CUDA_WRAPPER FUNCTION
//========================================================================================================================================================================================================200
void
kernel_gpu_opencl_wrapper( record *records,
long records_mem,
knode *knodes,
long knodes_elem,
long knodes_mem,
int order,
long maxheight,
int count,
long *currKnode,
long *offset,
int *keys,
record *ans)
{
cl_device_type device_type = CL_DEVICE_TYPE_GPU;
int platform_id_inuse = 0; // platform id in use (default: 0)
int device_id_inuse = 0; // platform id in use (default: 0)
//======================================================================================================================================================150
// CPU VARIABLES
//======================================================================================================================================================150
#ifdef TIMING
struct timeval tv;
struct timeval tv_total_start, tv_total_end;
struct timeval tv_init_end;
struct timeval tv_h2d_start, tv_h2d_end;
struct timeval tv_d2h_start, tv_d2h_end;
struct timeval tv_kernel_start, tv_kernel_end;
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
struct timeval tv_close_start, tv_close_end;
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time= 0,
d2h_time = 0, close_time = 0, total_time = 0;
#endif
#ifdef TIMING
gettimeofday(&tv_total_start, NULL);
#endif
//======================================================================================================================================================150
// GPU SETUP
//======================================================================================================================================================150
//====================================================================================================100
// INITIAL DRIVER OVERHEAD
//====================================================================================================100
// cudaThreadSynchronize();
//====================================================================================================100
// COMMON VARIABLES
//====================================================================================================100
// common variables
cl_int error;
//====================================================================================================100
// GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
//====================================================================================================100
// Get the number of available platforms
cl_uint num_platforms;
error = clGetPlatformIDs( 0,
NULL,
&num_platforms);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Get the list of available platforms
cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
error = clGetPlatformIDs( num_platforms,
platforms,
NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Select the 1st platform
cl_platform_id platform = platforms[platform_id_inuse];
// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
char pbuf[100];
error = clGetPlatformInfo( platform,
CL_PLATFORM_VENDOR,
sizeof(pbuf),
pbuf,
NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
printf("Platform: %s\n", pbuf);
//====================================================================================================100
// GET DEVICE INFORMATION
//====================================================================================================100
cl_uint devices_size;
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &devices_size);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
if (devices_size == 0) {
printf("There are no devices for Platform %d\n", platform_id_inuse);
exit(0);
}
printf("Device num: %u\n", devices_size);
// Get the list of devices (previousely selected for the context)
cl_device_id *devices = (cl_device_id *) malloc(devices_size);
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices_size,
devices, NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Select the device
cl_device_id device;
device = devices[device_id_inuse];
// Check device type
error = clGetDeviceInfo(device, CL_DEVICE_TYPE,
sizeof(device_type), (void *)&device_type, NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
if(device_type == CL_DEVICE_TYPE_GPU)
printf("Creating GPU Context\n");
else if (device_type == CL_DEVICE_TYPE_CPU)
printf("Creating CPU Context\n");
else
printf("This Context Type Not Supported\n");
// Get the name of the selected device
error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf),
pbuf, NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
printf("Device: %s\n", pbuf);
//====================================================================================================100
// CREATE CONTEXT FOR THE PLATFORM
//====================================================================================================100
// Create context properties for selected platform
cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
(cl_context_properties) platform,
0};
// Create context for selected platform being GPU
cl_context context;
context = clCreateContextFromType( context_properties,
device_type,
NULL,
NULL,
&error);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//====================================================================================================100
// CREATE COMMAND QUEUE FOR THE DEVICE
//====================================================================================================100
// Create a command queue
cl_command_queue command_queue;
#ifdef TIMING
command_queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &error);
#else
command_queue = clCreateCommandQueue(context, device, 0, &error);
#endif
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//====================================================================================================100
// CREATE PROGRAM, COMPILE IT
//====================================================================================================100
// Load kernel source code from file
/*
const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
size_t sourceSize = strlen(source);
// Create the program
cl_program program = clCreateProgramWithSource( context,
1,
&source,
&sourceSize,
&error);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
*/
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
std::abort();
cl_program program = clCreateProgramWithBinary(
context, 1, &devices[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &error);
free(kernel_bin);
char clOptions[110];
// sprintf(clOptions,"-I../../src");
sprintf(clOptions,"-I./../");
#ifdef DEFAULT_ORDER
sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER=%d", DEFAULT_ORDER);
#endif
/*
// Compile the program
error = clBuildProgram( program,
1,
&device,
clOptions,
NULL,
NULL);
*/
error = clBuildProgram(program, 1, &devices[device_id_inuse], NULL, NULL, NULL);
if(error != CL_SUCCESS) {
printf("ERROR: clBuildProgram() => %d\n", error); return; }
// Print warnings and errors from compilation
static char log[65536];
memset(log, 0, sizeof(log));
clGetProgramBuildInfo( program,
device,
CL_PROGRAM_BUILD_LOG,
sizeof(log)-1,
log,
NULL);
printf("-----OpenCL Compiler Output-----\n");
if (strstr(log,"warning:") || strstr(log, "error:"))
printf("<<<<\n%s\n>>>>\n", log);
printf("--------------------------------\n");
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Create kernel
cl_kernel kernel;
kernel = clCreateKernel(program,
"findK",
&error);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
gettimeofday(&tv_init_end, NULL);
tvsub(&tv_init_end, &tv_total_start, &tv);
init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// GPU MEMORY (MALLOC)
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN
//====================================================================================================100
//==================================================50
// recordsD
//==================================================50
cl_mem recordsD;
recordsD = clCreateBuffer( context,
CL_MEM_READ_WRITE,
records_mem,
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// knodesD
//==================================================50
cl_mem knodesD;
knodesD = clCreateBuffer( context,
CL_MEM_READ_WRITE,
knodes_mem,
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// currKnodeD
//==================================================50
cl_mem currKnodeD;
currKnodeD = clCreateBuffer( context,
CL_MEM_READ_WRITE,
count*sizeof(long),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// offsetD
//==================================================50
cl_mem offsetD;
offsetD = clCreateBuffer( context,
CL_MEM_READ_WRITE,
count*sizeof(long),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// keysD
//==================================================50
cl_mem keysD;
keysD = clCreateBuffer( context,
CL_MEM_READ_WRITE,
count*sizeof(long),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
gettimeofday(&tv_mem_alloc_end, NULL);
tvsub(&tv_mem_alloc_end, &tv_init_end, &tv);
mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
//==================================================50
// END
//==================================================50
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansD
//==================================================50
cl_mem ansD;
ansD = clCreateBuffer( context,
CL_MEM_READ_WRITE,
count*sizeof(record),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// END
//==================================================50
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// GPU MEMORY COPY
//======================================================================================================================================================150
//====================================================================================================100
// GPU MEMORY (MALLOC) COPY IN
//====================================================================================================100
//==================================================50
// recordsD
//==================================================50
cl_event event;
error = clEnqueueWriteBuffer( command_queue, // command queue
recordsD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
records_mem, // size to be copied
records, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// knodesD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
knodesD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
knodes_mem, // size to be copied
knodes, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// currKnodeD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
currKnodeD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(long), // size to be copied
currKnode, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// offsetD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
offsetD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(long), // size to be copied
offset, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// keysD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
keysD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(int), // size to be copied
keys, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// END
//==================================================50
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
ansD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(record), // size to be copied
ans, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// END
//==================================================50
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// findK kernel
//======================================================================================================================================================150
//====================================================================================================100
// Execution Parameters
//====================================================================================================100
size_t local_work_size[1];
local_work_size[0] = order < 1024 ? order : 1024;
size_t global_work_size[1];
global_work_size[0] = count * local_work_size[0];
printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
//====================================================================================================100
// Kernel Arguments
//====================================================================================================100
clSetKernelArg( kernel,
0,
sizeof(long),
(void *) &maxheight);
clSetKernelArg( kernel,
1,
sizeof(cl_mem),
(void *) &knodesD);
clSetKernelArg( kernel,
2,
sizeof(long),
(void *) &knodes_elem);
clSetKernelArg( kernel,
3,
sizeof(cl_mem),
(void *) &recordsD);
clSetKernelArg( kernel,
4,
sizeof(cl_mem),
(void *) &currKnodeD);
clSetKernelArg( kernel,
5,
sizeof(cl_mem),
(void *) &offsetD);
clSetKernelArg( kernel,
6,
sizeof(cl_mem),
(void *) &keysD);
clSetKernelArg( kernel,
7,
sizeof(cl_mem),
(void *) &ansD);
//====================================================================================================100
// Kernel
//====================================================================================================100
error = clEnqueueNDRangeKernel( command_queue,
kernel,
1,
NULL,
global_work_size,
local_work_size,
0,
NULL,
&event);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
#ifdef TIMING
kernel_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// GPU MEMORY COPY (CONTD.)
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansD
//==================================================50
error = clEnqueueReadBuffer(command_queue, // The command queue.
ansD, // The image on the device.
CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
0, // Offset. None in this case.
count*sizeof(record), // Size to copy.
ans, // The pointer to the image on the host.
0, // Number of events in wait list. Not used.
NULL, // Event wait list. Not used.
&event); // Event object for determining status. Not used.
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
d2h_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// END
//==================================================50
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// GPU MEMORY DEALLOCATION
//======================================================================================================================================================150
#ifdef TIMING
gettimeofday(&tv_close_start, NULL);
#endif
// Release kernels...
clReleaseKernel(kernel);
// Now the program...
clReleaseProgram(program);
// Clean up the device memory...
clReleaseMemObject(recordsD);
clReleaseMemObject(knodesD);
clReleaseMemObject(currKnodeD);
clReleaseMemObject(offsetD);
clReleaseMemObject(keysD);
clReleaseMemObject(ansD);
// Flush the queue
error = clFlush(command_queue);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// ...and finally, the queue and context.
clReleaseCommandQueue(command_queue);
// ???
clReleaseContext(context);
#ifdef TIMING
gettimeofday(&tv_close_end, NULL);
tvsub(&tv_close_end, &tv_close_start, &tv);
close_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
tvsub(&tv_close_end, &tv_total_start, &tv);
total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
#ifdef TIMING
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
printf("Init: %f\n", init_time);
printf("MemAlloc: %f\n", mem_alloc_time);
printf("HtoD: %f\n", h2d_time);
printf("Exec: %f\n", kernel_time);
printf("DtoH: %f\n", d2h_time);
printf("Close: %f\n", close_time);
printf("Total: %f\n", total_time);
#endif
//======================================================================================================================================================150
// END
//======================================================================================================================================================150
}
//========================================================================================================================================================================================================200
// END
//========================================================================================================================================================================================================200
// #ifdef __cplusplus
// }
// #endif

View file

@ -0,0 +1,31 @@
// #ifdef __cplusplus
// extern "C" {
// #endif
//========================================================================================================================================================================================================200
// KERNEL_GPU_OPENCL_WRAPPER HEADER
//========================================================================================================================================================================================================200
void
kernel_gpu_opencl_wrapper( record *records,
long records_mem,
knode *knodes,
long knodes_elem,
long knodes_mem,
int order,
long maxheight,
int count,
long *currKnode,
long *offset,
int *keys,
record *ans);
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200
// #ifdef __cplusplus
// }
// #endif

View file

@ -0,0 +1,886 @@
// #ifdef __cplusplus
// extern "C" {
// #endif
//========================================================================================================================================================================================================200
// INCLUDE
//========================================================================================================================================================================================================200
//======================================================================================================================================================150
// LIBRARIES
//======================================================================================================================================================150
#include <CL/cl.h> // (in directory provided to compiler) needed by OpenCL types and functions
#include <string.h> // (in directory known to compiler) needed by memset
#include <stdio.h> // (in directory known to compiler) needed by printf, stderr
//======================================================================================================================================================150
// COMMON
//======================================================================================================================================================150
#include "common.h" // (in directory provided here)
//======================================================================================================================================================150
// UTILITIES
//======================================================================================================================================================150
#include "timer.h" // (in directory provided here)
#include "opencl.h"
#ifdef TIMING
#include "timing.h"
#endif
//======================================================================================================================================================150
// HEADER
//======================================================================================================================================================150
#include "kernel_gpu_opencl_wrapper_2.h" // (in directory provided here)
//========================================================================================================================================================================================================200
// FUNCTION
//========================================================================================================================================================================================================200
int read_kernel_file2(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp_ = fopen(filename, "r");
if (NULL == fp_) {
fprintf(stderr, "Failed to load kernel.2 %s\n", filename);
return -1;
}
fseek(fp_ , 0 , SEEK_END);
long fsize = ftell(fp_);
rewind(fp_);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp_);
fclose(fp_);
return 0;
}
void
kernel_gpu_opencl_wrapper_2(knode *knodes,
long knodes_elem,
long knodes_mem,
int order,
long maxheight,
int count,
long *currKnode,
long *offset,
long *lastKnode,
long *offset_2,
int *start,
int *end,
int *recstart,
int *reclength)
{
cl_device_type device_type = CL_DEVICE_TYPE_GPU;
int platform_id_inuse = 0; // platform id in use (default: 0)
int device_id_inuse = 0; // platform id in use (default: 0)
//======================================================================================================================================================150
// CPU VARIABLES
//======================================================================================================================================================150
//Primitives for timing
#ifdef TIMING
struct timeval tv;
struct timeval tv_total_start, tv_total_end;
struct timeval tv_init_end;
struct timeval tv_h2d_start, tv_h2d_end;
struct timeval tv_d2h_start, tv_d2h_end;
struct timeval tv_kernel_start, tv_kernel_end;
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
struct timeval tv_close_start, tv_close_end;
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time= 0,
d2h_time = 0, close_time = 0, total_time = 0;
#endif
#ifdef TIMING
gettimeofday(&tv_total_start, NULL);
#endif
//======================================================================================================================================================150
// GPU SETUP
//======================================================================================================================================================150
//====================================================================================================100
// INITIAL DRIVER OVERHEAD
//====================================================================================================100
// cudaThreadSynchronize();
//====================================================================================================100
// COMMON VARIABLES
//====================================================================================================100
// common variables
cl_int error;
//====================================================================================================100
// GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
//====================================================================================================100
// Get the number of available platforms
cl_uint num_platforms;
error = clGetPlatformIDs( 0,
NULL,
&num_platforms);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Get the list of available platforms
cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
error = clGetPlatformIDs( num_platforms,
platforms,
NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Select the 1st platform
cl_platform_id platform = platforms[platform_id_inuse];
// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
char pbuf[100];
error = clGetPlatformInfo( platform,
CL_PLATFORM_VENDOR,
sizeof(pbuf),
pbuf,
NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
printf("Platform: %s\n", pbuf);
//====================================================================================================100
// GET DEVICE INFORMATION
//====================================================================================================100
cl_uint devices_size;
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &devices_size);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
if (devices_size == 0) {
printf("There are no devices for Platform %d\n", platform_id_inuse);
exit(0);
}
printf("Device num: %u\n", devices_size);
// Get the list of devices (previousely selected for the context)
cl_device_id *devices = (cl_device_id *) malloc(devices_size);
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices_size,
devices, NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Select the device
cl_device_id device;
device = devices[device_id_inuse];
// Check device type
error = clGetDeviceInfo(device, CL_DEVICE_TYPE,
sizeof(device_type), (void *)&device_type, NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
if(device_type == CL_DEVICE_TYPE_GPU)
printf("Creating GPU Context\n");
else if (device_type == CL_DEVICE_TYPE_CPU)
printf("Creating CPU Context\n");
else
printf("This Context Type Not Supported\n");
// Get the name of the selected device
error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf),
pbuf, NULL);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
printf("Device: %s\n", pbuf);
//====================================================================================================100
// CREATE CONTEXT FOR THE PLATFORM
//====================================================================================================100
// Create context properties for selected platform
cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
(cl_context_properties) platform,
0};
// Create context for selected platform being GPU
cl_context context;
context = clCreateContextFromType( context_properties,
device_type,
NULL,
NULL,
&error);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//====================================================================================================100
// CREATE COMMAND QUEUE FOR THE DEVICE
//====================================================================================================100
// Create a command queue
cl_command_queue command_queue;
#ifdef TIMING
command_queue = clCreateCommandQueue(context, device,
CL_QUEUE_PROFILING_ENABLE, &error);
#else
command_queue = clCreateCommandQueue(context, device, 0, &error);
#endif
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//====================================================================================================100
// CREATE PROGRAM, COMPILE IT
//====================================================================================================100
// Load kernel source code from file
/*
const char *source = load_kernel_source("./kernel/kernel_gpu_opencl_2.cl");
size_t sourceSize = strlen(source);
// Create the program
cl_program program = clCreateProgramWithSource( context,
1,
&source,
&sourceSize,
&error);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
*/
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
if (0 != read_kernel_file2("kernel2.pocl", &kernel_bin, &kernel_size))
std::abort();
cl_program program = clCreateProgramWithBinary(
context, 1, &devices[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &error);
free(kernel_bin);
char clOptions[110];
// sprintf(clOptions,"-I../../src");
sprintf(clOptions,"-I./../");
#ifdef DEFAULT_ORDER_2
sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER_2=%d", DEFAULT_ORDER_2);
#endif
error = clBuildProgram(program, 1, &devices[device_id_inuse], NULL, NULL, NULL);
if(error != CL_SUCCESS) {
printf("ERROR: clBuildProgram() => %d\n", error); return; }
// Compile the program
/*
error = clBuildProgram( program,
1,
&device,
clOptions,
NULL,
NULL);*/
// Print warnings and errors from compilation
static char log[65536];
memset(log, 0, sizeof(log));
clGetProgramBuildInfo( program,
device,
CL_PROGRAM_BUILD_LOG,
sizeof(log)-1,
log,
NULL);
printf("-----OpenCL Compiler Output-----\n");
if (strstr(log,"warning:") || strstr(log, "error:"))
printf("<<<<\n%s\n>>>>\n", log);
printf("--------------------------------\n");
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Create kernel
cl_kernel kernel;
kernel = clCreateKernel(program,
"findRangeK",
&error);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
gettimeofday(&tv_init_end, NULL);
tvsub(&tv_init_end, &tv_total_start, &tv);
init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// GPU MEMORY MALLOC
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN
//====================================================================================================100
//==================================================50
// knodesD
//==================================================50
cl_mem knodesD;
knodesD = clCreateBuffer( context,
CL_MEM_READ_WRITE,
knodes_mem,
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// currKnodeD
//==================================================50
cl_mem currKnodeD;
currKnodeD = clCreateBuffer(context,
CL_MEM_READ_WRITE,
count*sizeof(long),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// offsetD
//==================================================50
cl_mem offsetD;
offsetD = clCreateBuffer( context,
CL_MEM_READ_WRITE,
count*sizeof(long),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// lastKnodeD
//==================================================50
cl_mem lastKnodeD;
lastKnodeD = clCreateBuffer(context,
CL_MEM_READ_WRITE,
count*sizeof(long),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// offset_2D
//==================================================50
cl_mem offset_2D;
offset_2D = clCreateBuffer(context,
CL_MEM_READ_WRITE,
count*sizeof(long),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// startD
//==================================================50
cl_mem startD;
startD = clCreateBuffer(context,
CL_MEM_READ_WRITE,
count*sizeof(int),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// endD
//==================================================50
cl_mem endD;
endD = clCreateBuffer( context,
CL_MEM_READ_WRITE,
count*sizeof(int),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// END
//==================================================50
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
//==================================================50
cl_mem ansDStart;
ansDStart = clCreateBuffer( context,
CL_MEM_READ_WRITE,
count*sizeof(int),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
//==================================================50
// ansDLength
//==================================================50
cl_mem ansDLength;
ansDLength = clCreateBuffer( context,
CL_MEM_READ_WRITE,
count*sizeof(int),
NULL,
&error );
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
gettimeofday(&tv_mem_alloc_end, NULL);
tvsub(&tv_mem_alloc_end, &tv_init_end, &tv);
mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
//==================================================50
// END
//==================================================50
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// GPU MEMORY COPY
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN
//====================================================================================================100
//==================================================50
// knodesD
//==================================================50
cl_event event;
error = clEnqueueWriteBuffer( command_queue, // command queue
knodesD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
knodes_mem, // size to be copied
knodes, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// currKnodeD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
currKnodeD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(long), // size to be copied
currKnode, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// offsetD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
offsetD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(long), // size to be copied
offset, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// lastKnodeD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
lastKnodeD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(long), // size to be copied
lastKnode, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// offset_2D
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
offset_2D, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(long), // size to be copied
offset_2, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// startD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
startD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(int), // size to be copied
start, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// endD
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
endD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(int), // size to be copied
end, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// END
//==================================================50
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
endD, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(int), // size to be copied
end, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// ansDLength
//==================================================50
error = clEnqueueWriteBuffer( command_queue, // command queue
ansDLength, // destination
1, // block the source from access until this copy operation complates (1=yes, 0=no)
0, // offset in destination to write to
count*sizeof(int), // size to be copied
reclength, // source
0, // # of events in the list of events to wait for
NULL, // list of events to wait for
&event); // ID of this operation to be used by waiting operations
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
h2d_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// END
//==================================================50
//======================================================================================================================================================150
// KERNEL
//======================================================================================================================================================150
//====================================================================================================100
// Execution Parameters
//====================================================================================================100
size_t local_work_size[1];
local_work_size[0] = order < 1024 ? order : 1024;
size_t global_work_size[1];
global_work_size[0] = count * local_work_size[0];
printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
//====================================================================================================100
// Kernel Arguments
//====================================================================================================100
clSetKernelArg( kernel,
0,
sizeof(long),
(void *) &maxheight);
clSetKernelArg( kernel,
1,
sizeof(cl_mem),
(void *) &knodesD);
clSetKernelArg( kernel,
2,
sizeof(long),
(void *) &knodes_elem);
clSetKernelArg( kernel,
3,
sizeof(cl_mem),
(void *) &currKnodeD);
clSetKernelArg( kernel,
4,
sizeof(cl_mem),
(void *) &offsetD);
clSetKernelArg( kernel,
5,
sizeof(cl_mem),
(void *) &lastKnodeD);
clSetKernelArg( kernel,
6,
sizeof(cl_mem),
(void *) &offset_2D);
clSetKernelArg( kernel,
7,
sizeof(cl_mem),
(void *) &startD);
clSetKernelArg( kernel,
8,
sizeof(cl_mem),
(void *) &endD);
clSetKernelArg( kernel,
9,
sizeof(cl_mem),
(void *) &ansDStart);
clSetKernelArg( kernel,
10,
sizeof(cl_mem),
(void *) &ansDLength);
//====================================================================================================100
// Kernel
//====================================================================================================100
error = clEnqueueNDRangeKernel( command_queue,
kernel,
1,
NULL,
global_work_size,
local_work_size,
0,
NULL,
&event);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
#ifdef TIMING
kernel_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// GPU MEMORY COPY (CONTD.)
//======================================================================================================================================================150
//====================================================================================================100
// DEVICE IN/OUT
//====================================================================================================100
//==================================================50
// ansDStart
//==================================================50
error = clEnqueueReadBuffer(command_queue, // The command queue.
ansDStart, // The image on the device.
CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
0, // Offset. None in this case.
count*sizeof(int), // Size to copy.
recstart, // The pointer to the image on the host.
0, // Number of events in wait list. Not used.
NULL, // Event wait list. Not used.
&event); // Event object for determining status. Not used.
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
d2h_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// ansDLength
//==================================================50
error = clEnqueueReadBuffer(command_queue, // The command queue.
ansDLength, // The image on the device.
CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
0, // Offset. None in this case.
count*sizeof(int), // Size to copy.
reclength, // The pointer to the image on the host.
0, // Number of events in wait list. Not used.
NULL, // Event wait list. Not used.
&event); // Event object for determining status. Not used.
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
#ifdef TIMING
d2h_time += probe_event_time(event, command_queue);
#endif
clReleaseEvent(event);
//==================================================50
// END
//==================================================50
//====================================================================================================100
// END
//====================================================================================================100
//======================================================================================================================================================150
// GPU MEMORY DEALLOCATION
//======================================================================================================================================================150
#ifdef TIMING
gettimeofday(&tv_close_start, NULL);
#endif
// Release kernels...
clReleaseKernel(kernel);
// Now the program...
clReleaseProgram(program);
// Clean up the device memory...
clReleaseMemObject(knodesD);
clReleaseMemObject(currKnodeD);
clReleaseMemObject(offsetD);
clReleaseMemObject(lastKnodeD);
clReleaseMemObject(offset_2D);
clReleaseMemObject(startD);
clReleaseMemObject(endD);
clReleaseMemObject(ansDStart);
clReleaseMemObject(ansDLength);
// Flush the queue
error = clFlush(command_queue);
if (error != CL_SUCCESS)
fatal_CL(error, __LINE__);
// ...and finally, the queue and context.
clReleaseCommandQueue(command_queue);
// ???
clReleaseContext(context);
#ifdef TIMING
gettimeofday(&tv_close_end, NULL);
tvsub(&tv_close_end, &tv_close_start, &tv);
close_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
tvsub(&tv_close_end, &tv_total_start, &tv);
total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
//======================================================================================================================================================150
// DISPLAY TIMING
//======================================================================================================================================================150
#ifdef TIMING
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
printf("Init: %f\n", init_time);
printf("MemAlloc: %f\n", mem_alloc_time);
printf("HtoD: %f\n", h2d_time);
printf("Exec: %f\n", kernel_time);
printf("DtoH: %f\n", d2h_time);
printf("Close: %f\n", close_time);
printf("Total: %f\n", total_time);
#endif
//======================================================================================================================================================150
// END
//======================================================================================================================================================150
}
//========================================================================================================================================================================================================200
// END
//========================================================================================================================================================================================================200
// #ifdef __cplusplus
// }
// #endif

View file

@ -0,0 +1,33 @@
// #ifdef __cplusplus
// extern "C" {
// #endif
//========================================================================================================================================================================================================200
// KERNEL_GPU_OPENCL_WRAPPER HEADER
//========================================================================================================================================================================================================200
void
kernel_gpu_opencl_wrapper_2(knode *knodes,
long knodes_elem,
long knodes_mem,
int order,
long maxheight,
int count,
long *currKnode,
long *offset,
long *lastKnode,
long *offset_2,
int *start,
int *end,
int *recstart,
int *reclength);
//========================================================================================================================================================================================================200
// End
//========================================================================================================================================================================================================200
// #ifdef __cplusplus
// }
// #endif

2457
tests/opencl/b+tree/main.cc Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,19 @@
// # ifdef __cplusplus
// extern "C" {
// # endif
//===============================================================================================================================================================================================================200
// HEADER
//===============================================================================================================================================================================================================200
int
main( int argc,
char *argv []);
//===============================================================================================================================================================================================================200
// END
//===============================================================================================================================================================================================================200
// # ifdef __cplusplus
// }
// # endif

View file

@ -0,0 +1,53 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// DESCRIPTION
//===============================================================================================================================================================================================================200
// Returns: 0 if string does not represent integer
// 1 if string represents integer
//===============================================================================================================================================================================================================200
// NUM CODE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// ISINTEGER FUNCTION
//======================================================================================================================================================150
int isInteger(char *str){
//====================================================================================================100
// make sure it's not empty
//====================================================================================================100
if (*str == '\0'){
return 0;
}
//====================================================================================================100
// if any digit is not a number, return false
//====================================================================================================100
for(; *str != '\0'; str++){
if (*str < 48 || *str > 57){ // digit characters (need to include . if checking for float)
return 0;
}
}
//====================================================================================================100
// it got past all my checks so I think it's a number
//====================================================================================================100
return 1;
}
//===============================================================================================================================================================================================================200
// END NUM CODE
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

21
tests/opencl/b+tree/num.h Normal file
View file

@ -0,0 +1,21 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// FILE HEADER
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// ISINTEGER FUNCTION PROTOTYPE
//======================================================================================================================================================150
int isInteger(char *str);
//===============================================================================================================================================================================================================200
// END FILE HEADER
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View file

@ -0,0 +1,138 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// INCLUDE/DEFINE
//===============================================================================================================================================================================================================200
#include "opencl.h" // (in directory) function headers
#include <CL/cl.h>
//===============================================================================================================================================================================================================200
// LOAD KERNEL SOURCE CODE FUNCTION
//===============================================================================================================================================================================================================200
char *
load_kernel_source(const char *filename)
{
// Open the source file
FILE *file = fopen(filename, "r");
if (file == NULL){
fatal("Error opening kernel source file\n");
}
// Determine the size of the file
if (fseek(file, 0, SEEK_END)){
fatal("Error reading kernel source file\n");
}
size_t size = ftell(file);
// Allocate space for the source code (plus one for null-terminator)
char *source = (char *) malloc(size + 1);
// Read the source code into the string
fseek(file, 0, SEEK_SET);
// printf("Number of elements: %lu\nSize = %lu", fread(source, 1, size, file), size);
// exit(1);
if (fread(source, 1, size, file) != size){
fatal("Error reading kernel source file\n");
}
// Null-terminate the string
source[size] = '\0';
// Return the pointer to the string
return source;
}
//===============================================================================================================================================================================================================200
// PRINT ERROR FUNCTION
//===============================================================================================================================================================================================================200
void
fatal(const char *s)
{
fprintf(stderr, "Error: %s\n", s);
exit(1);
}
//===============================================================================================================================================================================================================200
// PRINT OPENCL ERROR FUNCTION
//===============================================================================================================================================================================================================200
void
fatal_CL(cl_int error, int line_no) {
printf("Error at line %d: ", line_no);
switch(error) {
case CL_SUCCESS: printf("CL_SUCCESS\n"); break;
case CL_DEVICE_NOT_FOUND: printf("CL_DEVICE_NOT_FOUND\n"); break;
case CL_DEVICE_NOT_AVAILABLE: printf("CL_DEVICE_NOT_AVAILABLE\n"); break;
case CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
case CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
case CL_PROFILING_INFO_NOT_AVAILABLE: printf("CL_PROFILING_INFO_NOT_AVAILABLE\n"); break;
case CL_MEM_COPY_OVERLAP: printf("CL_MEM_COPY_OVERLAP\n"); break;
case CL_IMAGE_FORMAT_MISMATCH: printf("CL_IMAGE_FORMAT_MISMATCH\n"); break;
case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("CL_IMAGE_FORMAT_NOT_SUPPORTED\n"); break;
case CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
case CL_MAP_FAILURE: printf("CL_MAP_FAILURE\n"); break;
case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
case CL_INVALID_DEVICE_TYPE: printf("CL_INVALID_DEVICE_TYPE\n"); break;
case CL_INVALID_PLATFORM: printf("CL_INVALID_PLATFORM\n"); break;
case CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
case CL_INVALID_QUEUE_PROPERTIES: printf("CL_INVALID_QUEUE_PROPERTIES\n"); break;
case CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
case CL_INVALID_HOST_PTR: printf("CL_INVALID_HOST_PTR\n"); break;
case CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n"); break;
case CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
case CL_INVALID_SAMPLER: printf("CL_INVALID_SAMPLER\n"); break;
case CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break;
case CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
case CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
case CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
case CL_INVALID_KERNEL_NAME: printf("CL_INVALID_KERNEL_NAME\n"); break;
case CL_INVALID_KERNEL_DEFINITION: printf("CL_INVALID_KERNEL_DEFINITION\n"); break;
case CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
case CL_INVALID_ARG_INDEX: printf("CL_INVALID_ARG_INDEX\n"); break;
case CL_INVALID_ARG_VALUE: printf("CL_INVALID_ARG_VALUE\n"); break;
case CL_INVALID_ARG_SIZE: printf("CL_INVALID_ARG_SIZE\n"); break;
case CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
case CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
case CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
case CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
case CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
case CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
case CL_INVALID_EVENT: printf("CL_INVALID_EVENT\n"); break;
case CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
case CL_INVALID_GL_OBJECT: printf("CL_INVALID_GL_OBJECT\n"); break;
case CL_INVALID_BUFFER_SIZE: printf("CL_INVALID_BUFFER_SIZE\n"); break;
case CL_INVALID_MIP_LEVEL: printf("CL_INVALID_MIP_LEVEL\n"); break;
case CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
#ifdef CL_VERSION_1_1
case CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
/* case CL_INVALID_PROPERTY: printf("CL_INVALID_PROPERTY\n"); break; */
#endif
default: printf("Invalid OpenCL error code\n");
}
exit(error);
}
//===============================================================================================================================================================================================================200
// END
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View file

@ -0,0 +1,40 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// INCLUDE/DEFINE
//===============================================================================================================================================================================================================200
#include <stdio.h> // (in path known to compiler) needed by printf
#include <CL/cl.h> // (in path specified to compiler) needed by OpenCL types
//===============================================================================================================================================================================================================200
// LOAD KERNEL SOURCE CODE FUNCTION HEADER
//===============================================================================================================================================================================================================200
char *
load_kernel_source(const char *filename);
//===============================================================================================================================================================================================================200
// PRINT ERROR FUNCTION HEADER
//===============================================================================================================================================================================================================200
void
fatal(const char *s);
//===============================================================================================================================================================================================================200
// PRINT OPENCL ERROR FUNCTION HEADER
//===============================================================================================================================================================================================================200
void
fatal_CL(cl_int error, int line_no);
//===============================================================================================================================================================================================================200
// END
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View file

@ -0,0 +1,37 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// TIMER CODE
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// INCLUDE/DEFINE
//======================================================================================================================================================150
#include <stdlib.h>
#include "timer.h"
//======================================================================================================================================================150
// FUNCTIONS
//======================================================================================================================================================150
//====================================================================================================100
// DISPLAY TIME
//====================================================================================================100
// Returns the current system time in microseconds
long long get_time() {
struct timeval tv;
gettimeofday(&tv, NULL);
return (tv.tv_sec * 1000000) + tv.tv_usec;
}
//===============================================================================================================================================================================================================200
// END TIMER CODE
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View file

@ -0,0 +1,23 @@
#ifdef __cplusplus
extern "C" {
#endif
//===============================================================================================================================================================================================================200
// TIMER HEADER
#include <sys/time.h> // (in directory known to compiler) needed by gettimeofday
//===============================================================================================================================================================================================================200
//======================================================================================================================================================150
// FUNCTION PROTOTYPES
//======================================================================================================================================================150
long long
get_time();
//===============================================================================================================================================================================================================200
// END TIMER HEADER
//===============================================================================================================================================================================================================200
#ifdef __cplusplus
}
#endif

View file

@ -0,0 +1,40 @@
#include <stdio.h>
#include "timing.h"
void time_measure_start(struct timeval *tv)
{
gettimeofday(tv, NULL);
}
void time_measure_end(struct timeval *tv)
{
struct timeval tv_now, tv_diff;
double d;
gettimeofday(&tv_now, NULL);
tvsub(&tv_now, tv, &tv_diff);
d = (double) tv_diff.tv_sec * 1000.0 + (double) tv_diff.tv_usec / 1000.0;
printf("Time (Memory Copy and Launch) = %f (ms)\n", d);
}
float probe_event_time(cl_event event, cl_command_queue command_queue) {
cl_int error=0;
cl_ulong eventStart,eventEnd;
clFinish(command_queue);
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),&eventStart,NULL);
if (error != CL_SUCCESS) {
printf("ERROR (%d) in event start profiling.\n", error);
return 0;
}
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),&eventEnd,NULL);
if (error != CL_SUCCESS) {
printf("ERROR (%d) in event end profiling.\n", error);
return 0;
}
return (float)((eventEnd-eventStart)/1000000.0);
}

View file

@ -0,0 +1,25 @@
#ifndef __TIMING_H__
#define __TIMING_H__
#include <sys/time.h>
#include <CL/cl.h>
void time_measure_start(struct timeval *tv);
void time_measure_end(struct timeval *tv);
/* tvsub: ret = x - y. */
static inline void tvsub(struct timeval *x,
struct timeval *y,
struct timeval *ret)
{
ret->tv_sec = x->tv_sec - y->tv_sec;
ret->tv_usec = x->tv_usec - y->tv_usec;
if (ret->tv_usec < 0) {
ret->tv_sec--;
ret->tv_usec += 1000000;
}
}
float probe_event_time(cl_event, cl_command_queue);
#endif

View file

@ -0,0 +1,8 @@
PROJECT = backprop
SRCS = backprop.cpp imagenet.cpp facetrain.cpp backprop_ocl.cpp timing.cpp
OPTS ?= -n 1024 -p 0 -d 0
#OPTS ?= -n 32768 -p 0 -d 0
include common.mk

View file

@ -0,0 +1,473 @@
/*
******************************************************************
* HISTORY
* 15-Oct-94 Jeff Shufelt (js), Carnegie Mellon University
* Prepared for 15-681, Fall 1994.
* Modified by Shuai Che
******************************************************************
*/
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include "backprop.h"
#include <math.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#define ABS(x) (((x) > 0.0) ? (x) : (-(x)))
#define fastcopy(to,from,len)\
{\
register char *_to,*_from;\
register int _i,_l;\
_to = (char *)(to);\
_from = (char *)(from);\
_l = (len);\
for (_i = 0; _i < _l; _i++) *_to++ = *_from++;\
}
/*** Return random number between 0.0 and 1.0 ***/
float drnd()
{
return ((float) rand() / (float) BIGRND);
}
/*** Return random number between -1.0 and 1.0 ***/
float dpn1()
{
return ((drnd() * 2.0) - 1.0);
}
/*** The squashing function. Currently, it's a sigmoid. ***/
float squash(float x)
{
float m;
//x = -x;
//m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
//return(1.0 / (1.0 + m));
return (1.0 / (1.0 + exp(-x)));
}
/*** Allocate 1d array of floats ***/
float *alloc_1d_dbl(int n)
{
float *new_t;
new_t = (float *) malloc ((unsigned) (n * sizeof (float)));
if (new_t == NULL) {
printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
return (NULL);
}
return (new_t);
}
/*** Allocate 2d array of floats ***/
float **alloc_2d_dbl(int m, int n)
{
int i;
float **new_t;
new_t = (float **) malloc ((unsigned) (m * sizeof (float *)));
if (new_t == NULL) {
printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
return (NULL);
}
for (i = 0; i < m; i++) {
new_t[i] = alloc_1d_dbl(n);
}
return (new_t);
}
void bpnn_randomize_weights(float **w, int m, int n)
{
int i, j;
for (i = 0; i <= m; i++) {
for (j = 0; j <= n; j++) {
w[i][j] = (float) rand()/RAND_MAX;
// w[i][j] = dpn1();
}
}
}
void bpnn_randomize_row(float *w, int m)
{
int i;
for (i = 0; i <= m; i++) {
//w[i] = (float) rand()/RAND_MAX;
w[i] = 0.1;
}
}
void bpnn_zero_weights(float **w, int m, int n)
{
int i, j;
for (i = 0; i <= m; i++) {
for (j = 0; j <= n; j++) {
w[i][j] = 0.0;
}
}
}
void bpnn_initialize(int seed)
{
printf("Random number generator seed: %d\n", seed);
srand(seed);
}
BPNN *bpnn_internal_create(int n_in, int n_hidden, int n_out)
{
BPNN *newnet;
newnet = (BPNN *) malloc (sizeof (BPNN));
if (newnet == NULL) {
printf("BPNN_CREATE: Couldn't allocate neural network\n");
return (NULL);
}
newnet->input_n = n_in;
newnet->hidden_n = n_hidden;
newnet->output_n = n_out;
newnet->input_units = alloc_1d_dbl(n_in + 1);
newnet->hidden_units = alloc_1d_dbl(n_hidden + 1);
newnet->output_units = alloc_1d_dbl(n_out + 1);
newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1);
newnet->output_delta = alloc_1d_dbl(n_out + 1);
newnet->target = alloc_1d_dbl(n_out + 1);
newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
return (newnet);
}
void bpnn_free(BPNN *net)
{
int n1, n2, i;
n1 = net->input_n;
n2 = net->hidden_n;
free((char *) net->input_units);
free((char *) net->hidden_units);
free((char *) net->output_units);
free((char *) net->hidden_delta);
free((char *) net->output_delta);
free((char *) net->target);
for (i = 0; i <= n1; i++) {
free((char *) net->input_weights[i]);
free((char *) net->input_prev_weights[i]);
}
free((char *) net->input_weights);
free((char *) net->input_prev_weights);
for (i = 0; i <= n2; i++) {
free((char *) net->hidden_weights[i]);
free((char *) net->hidden_prev_weights[i]);
}
free((char *) net->hidden_weights);
free((char *) net->hidden_prev_weights);
free((char *) net);
}
/*** Creates a new fully-connected network from scratch,
with the given numbers of input, hidden, and output units.
Threshold units are automatically included. All weights are
randomly initialized.
Space is also allocated for temporary storage (momentum weights,
error computations, etc).
***/
BPNN *bpnn_create(int n_in, int n_hidden, int n_out)
{
BPNN *newnet;
newnet = bpnn_internal_create(n_in, n_hidden, n_out);
#ifdef INITZERO
bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
#else
bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
#endif
bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
bpnn_randomize_row(newnet->target, n_out);
return (newnet);
}
void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2)
{
float sum;
int j, k;
/*** Set up thresholding unit ***/
l1[0] = 1.0;
#ifdef OPEN
omp_set_num_threads(NUM_THREAD);
#pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(+: sum) schedule(static)
#endif
/*** For each unit in second layer ***/
for (j = 1; j <= n2; j++) {
/*** Compute weighted sum of its inputs ***/
sum = 0.0;
for (k = 0; k <= n1; k++) {
sum += conn[k][j] * l1[k];
}
l2[j] = squash(sum);
}
}
void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err)
{
int j;
float o, t, errsum;
errsum = 0.0;
for (j = 1; j <= nj; j++) {
o = output[j];
t = target[j];
delta[j] = o * (1.0 - o) * (t - o);
errsum += ABS(delta[j]);
}
*err = errsum;
}
void bpnn_hidden_error(float *delta_h,
int nh,
float *delta_o,
int no,
float **who,
float *hidden,
float *err)
{
int j, k;
float h, sum, errsum;
errsum = 0.0;
for (j = 1; j <= nh; j++) {
h = hidden[j];
sum = 0.0;
for (k = 1; k <= no; k++) {
sum += delta_o[k] * who[j][k];
}
delta_h[j] = h * (1.0 - h) * sum;
errsum += ABS(delta_h[j]);
}
*err = errsum;
}
void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw)
{
float new_dw;
int k, j;
ly[0] = 1.0;
#ifdef OPEN
omp_set_num_threads(NUM_THREAD);
#pragma omp parallel for \
shared(oldw, w, delta) \
private(j, k, new_dw) \
firstprivate(ndelta, nly, momentum)
#endif
for (j = 1; j <= ndelta; j++) {
for (k = 0; k <= nly; k++) {
new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
w[k][j] += new_dw;
oldw[k][j] = new_dw;
}
}
}
void bpnn_feedforward(BPNN *net)
{
int in, hid, out;
in = net->input_n;
hid = net->hidden_n;
out = net->output_n;
/*** Feed forward input activations. ***/
bpnn_layerforward(net->input_units, net->hidden_units,
net->input_weights, in, hid);
bpnn_layerforward(net->hidden_units, net->output_units,
net->hidden_weights, hid, out);
}
void bpnn_train(BPNN *net, float *eo, float *eh)
{
int in, hid, out;
float out_err, hid_err;
in = net->input_n;
hid = net->hidden_n;
out = net->output_n;
/*** Feed forward input activations. ***/
bpnn_layerforward(net->input_units, net->hidden_units,
net->input_weights, in, hid);
bpnn_layerforward(net->hidden_units, net->output_units,
net->hidden_weights, hid, out);
/*** Compute error on output and hidden units. ***/
bpnn_output_error(net->output_delta, net->target, net->output_units,
out, &out_err);
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
net->hidden_weights, net->hidden_units, &hid_err);
*eo = out_err;
*eh = hid_err;
/*** Adjust input and hidden weights. ***/
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
net->hidden_weights, net->hidden_prev_weights);
bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
net->input_weights, net->input_prev_weights);
}
void bpnn_save(BPNN *net, char *filename)
{
int n1, n2, n3, i, j, memcnt;
float dvalue, **w;
char *mem;
///add//
FILE *pFile;
pFile = fopen( filename, "w+" );
///////
/*
if ((fd = creat(filename, 0644)) == -1) {
printf("BPNN_SAVE: Cannot create '%s'\n", filename);
return;
}
*/
n1 = net->input_n; n2 = net->hidden_n; n3 = net->output_n;
printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);
//fflush(stdout);
//write(fd, (char *) &n1, sizeof(int));
//write(fd, (char *) &n2, sizeof(int));
//write(fd, (char *) &n3, sizeof(int));
fwrite( (char *) &n1 , sizeof(char), sizeof(char), pFile);
fwrite( (char *) &n2 , sizeof(char), sizeof(char), pFile);
fwrite( (char *) &n3 , sizeof(char), sizeof(char), pFile);
memcnt = 0;
w = net->input_weights;
mem = (char *) malloc ((unsigned) ((n1+1) * (n2+1) * sizeof(float)));
for (i = 0; i <= n1; i++) {
for (j = 0; j <= n2; j++) {
dvalue = w[i][j];
fastcopy(&mem[memcnt], &dvalue, sizeof(float));
memcnt += sizeof(float);
}
}
//write(fd, mem, (n1+1) * (n2+1) * sizeof(float));
fwrite( mem , (unsigned)(sizeof(float)), (unsigned) ((n1+1) * (n2+1) * sizeof(float)) , pFile);
free(mem);
memcnt = 0;
w = net->hidden_weights;
mem = (char *) malloc ((unsigned) ((n2+1) * (n3+1) * sizeof(float)));
for (i = 0; i <= n2; i++) {
for (j = 0; j <= n3; j++) {
dvalue = w[i][j];
fastcopy(&mem[memcnt], &dvalue, sizeof(float));
memcnt += sizeof(float);
}
}
//write(fd, mem, (n2+1) * (n3+1) * sizeof(float));
fwrite( mem , sizeof(float), (unsigned) ((n2+1) * (n3+1) * sizeof(float)) , pFile);
free(mem);
fclose(pFile);
return;
}
BPNN *bpnn_read(char *filename)
{
char *mem;
BPNN *new_t;
int fd, n1, n2, n3, i, j, memcnt;
if ((fd = open(filename, 0, 0644)) == -1) {
return (NULL);
}
printf("Reading '%s'\n", filename); //fflush(stdout);
read(fd, (char *) &n1, sizeof(int));
read(fd, (char *) &n2, sizeof(int));
read(fd, (char *) &n3, sizeof(int));
new_t = bpnn_internal_create(n1, n2, n3);
printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
printf("Reading input weights..."); //fflush(stdout);
memcnt = 0;
mem = (char *) malloc ((unsigned) ((n1+1) * (n2+1) * sizeof(float)));
read(fd, mem, (n1+1) * (n2+1) * sizeof(float));
for (i = 0; i <= n1; i++) {
for (j = 0; j <= n2; j++) {
fastcopy(&(new_t->input_weights[i][j]), &mem[memcnt], sizeof(float));
memcnt += sizeof(float);
}
}
free(mem);
printf("Done\nReading hidden weights..."); //fflush(stdout);
memcnt = 0;
mem = (char *) malloc ((unsigned) ((n2+1) * (n3+1) * sizeof(float)));
read(fd, mem, (n2+1) * (n3+1) * sizeof(float));
for (i = 0; i <= n2; i++) {
for (j = 0; j <= n3; j++) {
fastcopy(&(new_t->hidden_weights[i][j]), &mem[memcnt], sizeof(float));
memcnt += sizeof(float);
}
}
free(mem);
close(fd);
printf("Done\n"); //fflush(stdout);
bpnn_zero_weights(new_t->input_prev_weights, n1, n2);
bpnn_zero_weights(new_t->hidden_prev_weights, n2, n3);
return (new_t);
}

View file

@ -0,0 +1,70 @@
#ifndef _BACKPROP_H_
#define _BACKPROP_H_
#define BIGRND 0x7fffffff
#define THREADS 256
#define WIDTH 16 // shared memory width
#define HEIGHT 16 // shared memory height
#define BLOCK_SIZE 16
#define ETA 0.3 //eta value
#define MOMENTUM 0.3 //momentum value
#define NUM_THREAD 4 //OpenMP threads
#include <CL/cl.h>
typedef struct {
int input_n; /* number of input units */
int hidden_n; /* number of hidden units */
int output_n; /* number of output units */
float *input_units; /* the input units */
float *hidden_units; /* the hidden units */
float *output_units; /* the output units */
float *hidden_delta; /* storage for hidden unit error */
float *output_delta; /* storage for output unit error */
float *target; /* storage for target vector */
float **input_weights; /* weights from input to hidden layer */
float **hidden_weights; /* weights from hidden to output layer */
/*** The next two are for momentum ***/
float **input_prev_weights; /* previous change on input to hidden wgt */
float **hidden_prev_weights; /* previous change on hidden to output wgt */
} BPNN;
/*** User-level functions ***/
//void bpnn_initialize();
void bpnn_initialize(int seed);
BPNN *bpnn_create(int n_in, int n_hidden, int n_out);
void bpnn_free(BPNN *net);
//BPNN *bpnn_create();
//void bpnn_free();
void bpnn_train(BPNN *net, float *eo, float *eh);
//void bpnn_train();
//void bpnn_feedforward();
void bpnn_feedforward(BPNN *net);
void bpnn_save(BPNN *net, char *filename);
//void bpnn_save();
//BPNN *bpnn_read();
BPNN *bpnn_read(char *filename);
void load(BPNN *net);
int bpnn_train_kernel(BPNN *net, float *eo, float *eh);
void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2);
void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err);
void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err);
void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw);
int setup(int argc, char** argv);
float **alloc_2d_dbl(int m, int n);
float squash(float x);
/*** OpenCL config variables ***/
extern int platform_id_inuse;
extern int device_id_inuse;
extern cl_device_type device_type;
#endif

View file

@ -0,0 +1,402 @@
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <sys/time.h>
#include "backprop.h"
#ifdef NV //NVIDIA
#include <oclUtils.h>
#else
#include <CL/cl.h>
#endif
#ifdef TIMING
#include "timing.h"
#endif
////////////////////////////////////////////////////////////////////////////////
// local variables
static cl_context context;
static cl_command_queue cmd_queue;
static cl_device_id * device_list;
static cl_uint num_devices;
// OCL config
int platform_id_inuse = 0; // platform id in use (default: 0)
int device_id_inuse = 0; //device id in use (default : 0)
cl_device_type device_type = CL_DEVICE_TYPE_GPU;
//Primitives for timing
#ifdef TIMING
struct timeval tv;
struct timeval tv_total_start, tv_total_end;
struct timeval tv_init_end;
struct timeval tv_h2d_start, tv_h2d_end;
struct timeval tv_d2h_start, tv_d2h_end;
struct timeval tv_kernel_start, tv_kernel_end;
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
struct timeval tv_close_start, tv_close_end;
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
d2h_time = 0, close_time = 0, total_time = 0;
#endif
static int initialize(void)
{
cl_int result;
size_t size;
cl_uint num_platforms;
// get OpenCL platforms
if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(0,0,*) failed\n"); return -1; }
cl_platform_id all_platform_id[num_platforms];
if (clGetPlatformIDs(num_platforms, all_platform_id, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(*,*,0) failed\n"); return -1; }
cl_platform_id platform_id = all_platform_id[platform_id_inuse];
// get device
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices) != CL_SUCCESS) { printf("ERROR: clGetDeviceIDs failed\n"); return -1; };
printf("num_devices = %d\n", num_devices);
if(device_id_inuse > num_devices) {
printf("Invalid Device Number\n");
return -1;
}
device_list = new cl_device_id[num_devices];
//device_list = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, num_devices, device_list, NULL) != CL_SUCCESS) { printf("ERROR: clGetDeviceIDs failed\n"); return -1; };
// get device type
if (clGetDeviceInfo(device_list[device_id_inuse], CL_DEVICE_TYPE, sizeof(device_type), (void *)&device_type, NULL)!= CL_SUCCESS) { printf("ERROR: clGetDeviceIDs failed\n"); return -1; };
// create OpenCL context
cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0};
context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", device_type == CL_DEVICE_TYPE_GPU ? "GPU" : "CPU"); return -1; }
// create command queue for the specific device
#ifdef TIMING
cmd_queue = clCreateCommandQueue( context, device_list[device_id_inuse], CL_QUEUE_PROFILING_ENABLE, NULL );
#else
cmd_queue = clCreateCommandQueue( context, device_list[device_id_inuse], 0, NULL );
#endif
if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
return 0;
}
static int shutdown()
{
// release resources
if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
if( context ) clReleaseContext( context );
if( device_list ) delete[] device_list;
// reset all variables
cmd_queue = 0;
context = 0;
device_list = 0;
num_devices = 0;
device_type = CL_DEVICE_TYPE_GPU;
return 0;
}
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return 0;
}
unsigned int num_threads = 0;
unsigned int num_blocks = 0;
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{
setup(argc, argv);
}
int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
{
int in, hid, out;
float out_err, hid_err;
in = net->input_n;
hid = net->hidden_n;
out = net->output_n;
int sourcesize = 1024*1024;
char * source = (char *)calloc(sourcesize, sizeof(char));
if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
// read the kernel core source
const char * kernel_bp1 = "bpnn_layerforward_ocl";
const char * kernel_bp2 = "bpnn_adjust_weights_ocl";
//const char * tempchar = "./backprop_kernel.cl";
/*
FILE * fp = fopen(tempchar, "rb");
if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
fread(source + strlen(source), sourcesize, 1, fp);
fclose(fp);
*/
#ifdef TIMING
gettimeofday(&tv_total_start, NULL);
#endif
if(initialize()) return -1;
// compile kernel
cl_int err = 0;
/*
const char * slist[2] = { source, 0 };
cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
{ // show warnings/errors
//static char log[65536]; memset(log, 0, sizeof(log));
//cl_device_id device_id = 0;
//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
}
if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
*/
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
std::abort();
cl_program prog = clCreateProgramWithBinary(
context, 1, &device_list[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
free(kernel_bin);
err = clBuildProgram(prog, 1, &device_list[device_id_inuse], NULL, NULL, NULL);
if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
cl_kernel kernel1;
cl_kernel kernel2;
kernel1 = clCreateKernel(prog, kernel_bp1, &err);
kernel2 = clCreateKernel(prog, kernel_bp2, &err);
if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
clReleaseProgram(prog);
#ifdef TIMING
gettimeofday(&tv_init_end, NULL);
tvsub(&tv_init_end, &tv_total_start, &tv);
init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
float *input_weights_one_dim;
float *input_weights_prev_one_dim;
float * partial_sum;
float sum;
float num_blocks = in / BLOCK_SIZE;
input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
// set global and local workitems
size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
// todo: fix mem allocation
int m = 0;
for (int k = 0; k <= in; k++) {
for (int j = 0; j <= hid; j++) {
input_weights_one_dim[m] = net->input_weights[k][j];
input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
m++;
}
}
cl_mem input_hidden_ocl;
cl_mem input_ocl;
cl_mem output_hidden_ocl;
cl_mem hidden_partial_sum;
cl_mem hidden_delta_ocl;
cl_mem input_prev_weights_ocl;
#ifdef TIMING
gettimeofday(&tv_mem_alloc_start, NULL);
#endif
input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
#ifdef TIMING
gettimeofday(&tv_mem_alloc_end, NULL);
tvsub(&tv_mem_alloc_end, &tv_mem_alloc_start, &tv);
mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
printf("Performing %s computation\n", device_type == CL_DEVICE_TYPE_GPU ? "GPU" : "CPU");
cl_event event;
cl_event write_event[3];
//write buffers
err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, &write_event[0]);
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, &write_event[1]);
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
#ifdef TIMING
h2d_time += probe_event_time(write_event[0],cmd_queue);
h2d_time += probe_event_time(write_event[1],cmd_queue);
#endif
clReleaseEvent(write_event[0]);
clReleaseEvent(write_event[1]);
clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
clSetKernelArg(kernel1, 4, sizeof(float) * HEIGHT, (void*)NULL );
clSetKernelArg(kernel1, 5, sizeof(float ) * HEIGHT * WIDTH, (void*)NULL );
clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 2, NULL, global_work, local_work, 0, 0, &event);
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
#ifdef TIMING
kernel_time += probe_event_time(event,cmd_queue);
#endif
clReleaseEvent(event);
err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, &event);
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: partial sum\n"); return -1; }
#ifdef TIMING
d2h_time += probe_event_time(event,cmd_queue);
#endif
clReleaseEvent(event);
for (int j = 1; j <= hid; j++) {
sum = 0.0;
for (int k = 0; k < num_blocks; k++) {
sum += partial_sum[k * hid + j-1] ;
}
sum += net->input_weights[0][j];
net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
}
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl, 1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, &write_event[0]);
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, &write_event[1]);
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, &write_event[2]);
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
#ifdef TIMING
h2d_time += probe_event_time(write_event[0],cmd_queue);
h2d_time += probe_event_time(write_event[1],cmd_queue);
h2d_time += probe_event_time(write_event[2],cmd_queue);
#endif
clReleaseEvent(write_event[0]);
clReleaseEvent(write_event[1]);
clReleaseEvent(write_event[2]);
clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, &event);
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
#ifdef TIMING
kernel_time += probe_event_time(event,cmd_queue);
#endif
clReleaseEvent(event);
err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, &event);
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_ocl\n"); return -1; }
#ifdef TIMING
d2h_time += probe_event_time(event,cmd_queue);
#endif
clReleaseEvent(event);
err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, &event);
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }
#ifdef TIMING
d2h_time += probe_event_time(event,cmd_queue);
#endif
clReleaseEvent(event);
#ifdef TIMING
gettimeofday(&tv_close_start, NULL);
#endif
clReleaseMemObject(input_ocl);
clReleaseMemObject(output_hidden_ocl);
clReleaseMemObject(input_hidden_ocl);
clReleaseMemObject(hidden_partial_sum);
clReleaseMemObject(input_prev_weights_ocl);
free(input_weights_prev_one_dim);
free(partial_sum);
free(input_weights_one_dim);
shutdown();
#ifdef TIMING
gettimeofday(&tv_close_end, NULL);
tvsub(&tv_close_end, &tv_close_start, &tv);
close_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
tvsub(&tv_close_end, &tv_total_start, &tv);
total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
printf("Init: %f\n", init_time);
printf("MemAlloc: %f\n", mem_alloc_time);
printf("HtoD: %f\n", h2d_time);
printf("Exec: %f\n", kernel_time);
printf("DtoH: %f\n", d2h_time);
printf("Close: %f\n", close_time);
printf("Total: %f\n", total_time);
#endif
}

View file

@ -0,0 +1,123 @@
XLEN ?= 32
TOOLDIR ?= /opt
TARGET ?= opaesim
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
K_CFLAGS += -march=rv64imafd -mabi=ilp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
K_CFLAGS += -I$(VORTEX_KN_PATH)/include -DNDEBUG -DLLVM_VOTEX
K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a -lm
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing -Wno-unused-result
CXXFLAGS += -pthread
CXXFLAGS += -I$(POCL_RT_PATH)/include
ifdef HOSTGPU
CXXFLAGS += -DHOSTGPU
LDFLAGS += -lOpenCL
else
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
endif
# Debugigng
#ifdef DEBUG
CXXFLAGS += -g -O0
#else
# CXXFLAGS += -O2 -DNDEBUG
#endif
ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else
ifeq ($(TARGET), asesim)
OPAE_DRV_PATHS ?= libopae-c-ase.so
else
ifeq ($(TARGET), opaesim)
OPAE_DRV_PATHS ?= libopae-c-sim.so
endif
endif
endif
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
all: $(PROJECT) kernel.pocl
kernel.pocl: kernel.cl
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
#backprop.o: backprop.c
# $(CXX) $(CXXFLAGS) backprop.c -c -Wno-unused-result
%.cc.o: %.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
%.cpp.o: %.cpp
$(CXX) $(CXXFLAGS) -c $< -o $@
%.c.o: %.c
$(CC) $(CXXFLAGS) -c $< -o $@
$(PROJECT): $(OBJS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-hostgpu: $(PROJECT) kernel.pocl
./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) kernel.pocl
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-xrt: $(PROJECT) kernel.pocl
ifeq ($(TARGET), hw)
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
else
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
endif
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.dump *.pocl
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View file

@ -0,0 +1,70 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include "backprop.h"
#include "omp.h"
#include <CL/cl.h>
#include <string.h>
extern char *strcpy();
extern void exit();
int layer_size = 0;
void backprop_face()
{
BPNN *net;
int i;
float out_err, hid_err;
net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
printf("Input layer size : %d\n", layer_size);
load(net);
//entering the training kernel, only one iteration
printf("Starting training kernel\n");
bpnn_train_kernel(net, &out_err, &hid_err);
bpnn_free(net);
printf("\nFinish the training for one iteration\n");
}
int setup(int argc, char **argv)
{
layer_size = -1;
int cur_arg;
for (cur_arg = 1; cur_arg<argc; cur_arg++) {
if (strcmp(argv[cur_arg], "-h") == 0) {
fprintf(stderr, "usage: backprop <-n num of input elements> [-p platform_id] [-d device_id] [-t device_type]\n");
exit(0);
}
else if (strcmp(argv[cur_arg], "-n") == 0) {
if (argc >= cur_arg + 1) {
layer_size = atoi(argv[cur_arg+1]);
cur_arg++;
}
}
else if (strcmp(argv[cur_arg], "-p") == 0) {
if (argc >= cur_arg + 1) {
platform_id_inuse = atoi(argv[cur_arg+1]);
cur_arg++;
}
}
else if (strcmp(argv[cur_arg], "-d") == 0) {
if (argc >= cur_arg + 1) {
device_id_inuse = atoi(argv[cur_arg+1]);
cur_arg++;
}
}
}
if (layer_size % 16 != 0){
fprintf(stderr, "The number of input points must be divided by 16\n");
exit(0);
}
int seed = 7;
bpnn_initialize(seed);
backprop_face();
exit(0);
}

View file

@ -0,0 +1,24 @@
#include <stdio.h>
#include <stdlib.h>
#include "backprop.h"
extern int layer_size;
void load(BPNN *net)
//BPNN *net;
{
float *units;
int nr, nc, imgsize, i, j, k;
nr = layer_size;
imgsize = nr * nc;
units = net->input_units;
k = 1;
for (i = 0; i < nr; i++) {
units[k] = (float) rand()/RAND_MAX ;
k++;
}
}

View file

@ -0,0 +1,90 @@
#define THREADS 256
#define WIDTH 16
#define HEIGHT 16
#define ETA 0.3f
#define MOMENTUM 0.3f
#ifndef _BACKPROP_CUDA_KERNEL_H_
#define _BACKPROP_CUDA_KERNEL_H_
#define WM(i, j) weight_matrix[(j) + (i) * WIDTH]
__kernel void
bpnn_layerforward_ocl(__global float *input_cuda,
__global float *output_hidden_cuda,
__global float *input_hidden_cuda,
__global float *hidden_partial_sum,
__local float *input_node,
__local float *weight_matrix,
int in,
int hid)
{
int by = get_group_id(1);
int tx = get_local_id(0);
int ty = get_local_id(1);
int index = ( hid + 1 ) * HEIGHT * by + ( hid + 1 ) * ty + tx + 1 + ( hid + 1 ) ;
int index_in = HEIGHT * by + ty + 1;
if ( tx == 0 )
input_node[ty] = input_cuda[index_in] ;
barrier(CLK_LOCAL_MEM_FENCE);
weight_matrix[ty * WIDTH + tx] = input_hidden_cuda[index];
barrier(CLK_LOCAL_MEM_FENCE);
weight_matrix[ty * WIDTH + tx]= weight_matrix[ty * WIDTH + tx] * input_node[ty];
barrier(CLK_LOCAL_MEM_FENCE);
for ( int i = 1 ; i <= HEIGHT ; i=i*2){
//for ( int i = 1 ; i <= 4 ; i++){
int power_two = i;
//int power_two = 2 << (i - 1);
if( ty % power_two == 0 )
weight_matrix[ty * WIDTH + tx]= weight_matrix[ty * WIDTH + tx] + weight_matrix[(ty + power_two/2)* WIDTH + tx];
barrier(CLK_LOCAL_MEM_FENCE);
}
input_hidden_cuda[index] = weight_matrix[ty * WIDTH + tx];
barrier(CLK_LOCAL_MEM_FENCE);
if ( tx == 0 ) {
hidden_partial_sum[by * hid + ty] = weight_matrix[tx* WIDTH + ty];
}
}
__kernel void bpnn_adjust_weights_ocl( __global float * delta,
int hid,
__global float * ly,
int in,
__global float * w,
__global float * oldw)
{
int by = get_group_id(1);
int tx = get_local_id(0);
int ty = get_local_id(1);
int index = ( hid + 1 ) * HEIGHT * by + ( hid + 1 ) * ty + tx + 1 + ( hid + 1 ) ;
int index_y = HEIGHT * by + ty + 1;
int index_x = tx + 1;
w[index] += ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
oldw[index] = ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
barrier(CLK_LOCAL_MEM_FENCE);
if (ty == 0 && by ==0){
w[index_x] += ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
}
}
#endif

View file

@ -0,0 +1,40 @@
#include <stdio.h>
#include "timing.h"
void time_measure_start(struct timeval *tv)
{
gettimeofday(tv, NULL);
}
void time_measure_end(struct timeval *tv)
{
struct timeval tv_now, tv_diff;
double d;
gettimeofday(&tv_now, NULL);
tvsub(&tv_now, tv, &tv_diff);
d = (double) tv_diff.tv_sec * 1000.0 + (double) tv_diff.tv_usec / 1000.0;
printf("Time (Memory Copy and Launch) = %f (ms)\n", d);
}
float probe_event_time(cl_event event, cl_command_queue command_queue) {
cl_int error=0;
cl_ulong eventStart,eventEnd;
clFinish(command_queue);
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),&eventStart,NULL);
if (error != CL_SUCCESS) {
printf("ERROR (%d) in event start profiling.\n", error);
return 0;
}
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),&eventEnd,NULL);
if (error != CL_SUCCESS) {
printf("ERROR (%d) in event end profiling.\n", error);
return 0;
}
return (float)((eventEnd-eventStart)/1000000.0);
}

View file

@ -0,0 +1,25 @@
#ifndef __TIMING_H__
#define __TIMING_H__
#include <sys/time.h>
#include <CL/cl.h>
void time_measure_start(struct timeval *tv);
void time_measure_end(struct timeval *tv);
/* tvsub: ret = x - y. */
static inline void tvsub(struct timeval *x,
struct timeval *y,
struct timeval *ret)
{
ret->tv_sec = x->tv_sec - y->tv_sec;
ret->tv_usec = x->tv_usec - y->tv_usec;
if (ret->tv_usec < 0) {
ret->tv_sec--;
ret->tv_usec += 1000000;
}
}
float probe_event_time(cl_event, cl_command_queue);
#endif

View file

@ -0,0 +1,931 @@
//------------------------------------------
//--cambine:helper function for OpenCL
//--programmer: Jianbin Fang
//--date: 27/12/2010
//------------------------------------------
#ifndef _CL_HELPER_
#define _CL_HELPER_
#include <CL/cl.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <string>
#ifdef TIMING
#include "timing.h"
#endif
using std::string;
using std::ifstream;
using std::cerr;
using std::endl;
//using std::cout;
//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
#define WORK_DIM 2 //work-items dimensions
extern float init_time, mem_alloc_time, h2d_time, kernel_time,
d2h_time, close_time, total_time;
struct oclHandleStruct {
cl_context context;
cl_device_id *devices;
cl_command_queue queue;
cl_program program;
cl_int cl_status;
std::string error_str;
std::vector<cl_kernel> kernel;
};
struct oclHandleStruct oclHandles;
struct _clDeviceProp{
char device_name[100];
};
char kernel_file[100] = "Kernels.cl";
int total_kernels = 2;
string kernel_names[2] = {"BFS_1", "BFS_2"};
int work_group_size = 512;
int platform_id_inuse = 0; // platform id in use (default: 0)
int device_id_inuse = 0; //device id in use (default : 0)
cl_device_type device_type = CL_DEVICE_TYPE_GPU;
void _clGetDeviceProperties(int idx, _clDeviceProp *prop) throw(string){
oclHandles.cl_status= clGetDeviceInfo(oclHandles.devices[idx], CL_DEVICE_NAME, 100, prop->device_name, NULL);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS){
oclHandles.error_str = "exception in _clGetDeviceProperties-> ";
switch(oclHandles.cl_status){
case CL_INVALID_DEVICE:
oclHandles.error_str += "CL_INVALID_DEVICE";
break;
case CL_INVALID_VALUE:
oclHandles.error_str += "CL_INVALID_VALUE";
break;
default:
oclHandles.error_str += "unknown reasons";
break;
}
throw(oclHandles.error_str);
}
#endif
}
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return 0;
}
/*
* Converts the contents of a file into a string
*/
string FileToString(const string fileName)
{
ifstream f(fileName.c_str(), ifstream::in | ifstream::binary);
try {
size_t size;
char* str;
string s;
if(f.is_open()) {
size_t fileSize;
f.seekg(0, ifstream::end);
size = fileSize = f.tellg();
f.seekg(0, ifstream::beg);
str = new char[size+1];
if (!str) throw(string("Could not allocate memory"));
f.read(str, fileSize);
f.close();
str[size] = '\0';
s = str;
delete [] str;
return s;
}
} catch(std::string msg) {
printf("Exception caught in FileToString(): %s\n");
if(f.is_open())
f.close();
} catch(...) {
printf("Exception caught in FileToString()\n");
if(f.is_open())
f.close();
}
string errorMsg = "FileToString()::Error: Unable to open file "
+ fileName;
throw(errorMsg);
}
//---------------------------------------
//Read command line parameters
//
void _clCmdParams(int argc, char* argv[])
{
for (int i =0; i < argc; ++i) {
switch (argv[i][1]) {
case 'g': //--g stands for size of work group
if (++i < argc) {
sscanf(argv[i], "%u", &work_group_size);
} else {
printf("Could not read argument after option %d\n", argv[i-1]);
throw;
}
break;
case 'd': //--d stands for device id used in computaion
if (++i < argc) {
sscanf(argv[i], "%u", &device_id_inuse);
} else {
printf("Could not read argument after option %d\n", argv[i-1]);
throw;
}
break;
case 'p': // --p stands for platform id used in computation
if (++i < argc) {
sscanf(argv[i], "%u", &platform_id_inuse);
} else {
printf("Could not read argument after option %d\n", argv[i-1]);
throw;
}
break;
/*
case 't': // --t stands for device type, 0:GPU, 1:CPU
if (++i < argc) {
sscanf(argv[i], "%u", &device_type);
device_type = (device_type == 0) ? CL_DEVICE_TYPE_GPU
: CL_DEVICE_TYPE_CPU;
} else {
std::cerr << "Could not read argument after option " << argv[i-1] << std::endl;
throw;
}
break;
*/
default:
;
}
}
}
//---------------------------------------
//Initlize CL objects
//--description: there are 5 steps to initialize all the OpenCL objects needed
//--revised on 04/01/2011: get the number of devices and
// devices have no relationship with context
void _clInit()
{
cl_int resultCL;
oclHandles.context = NULL;
oclHandles.devices = NULL;
oclHandles.queue = NULL;
oclHandles.program = NULL;
cl_uint deviceListSize;
//-----------------------------------------------
//--cambine-1: find the available platforms and select one
cl_uint numPlatforms;
cl_platform_id targetPlatform = NULL;
resultCL = clGetPlatformIDs(0, NULL, &numPlatforms);
if (resultCL != CL_SUCCESS)
throw (string("InitCL()::Error: Getting number of platforms (clGetPlatformIDs)"));
printf("number of platforms:%d\n",numPlatforms); //by cambine
if (!(numPlatforms > 0))
throw (string("InitCL()::Error: No platforms found (clGetPlatformIDs)"));
cl_platform_id* allPlatforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id));
resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
if (resultCL != CL_SUCCESS)
throw (string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
for (int i = 0; i < numPlatforms; i++) {
char pbuff[128];
resultCL = clGetPlatformInfo( allPlatforms[i],
CL_PLATFORM_VENDOR,
sizeof(pbuff),
pbuff,
NULL);
if (resultCL != CL_SUCCESS)
throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
printf("vendor is %s\n",pbuff);
}
/* Select the target platform. Default: first platform */
targetPlatform = allPlatforms[platform_id_inuse];
free(allPlatforms);
//-----------------------------------------------
//--cambine-3: detect OpenCL devices
/* First, get the size of device list */
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_ALL, 0, NULL, &deviceListSize);
if(oclHandles.cl_status!=CL_SUCCESS) {
throw(string("exception in _clInit -> clGetDeviceIDs"));
}
if (deviceListSize == 0)
throw(string("InitCL()::Error: No devices found."));
printf("device number: %d\n");
/* Now, allocate the device list */
oclHandles.devices = (cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
if (oclHandles.devices == 0)
throw(string("InitCL()::Error: Could not allocate memory."));
/* Next, get the device list data */
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_ALL, deviceListSize,
oclHandles.devices, NULL);
if(oclHandles.cl_status!=CL_SUCCESS) {
throw(string("exception in _clInit -> clGetDeviceIDs-2"));
}
_clDeviceProp prop;
_clGetDeviceProperties(device_id_inuse, &prop);
printf("--cambine: device name=%s\n", prop.device_name);
/* Then, get device type */
/*
oclHandles.cl_status = clGetDeviceInfo(oclHandles.devices[device_id_inuse],
CL_DEVICE_TYPE, sizeof(cl_device_type), (void *)&device_type,
NULL);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string("error in Getting Device Info"));
}
if (device_type == CL_DEVICE_TYPE_GPU)
printf("Creating GPU Context\n");
else if (device_type == CL_DEVICE_TYPE_CPU)
printf("Creating CPU Context\n");
else
throw(string("unsupported device type"));
*/
//-----------------------------------------------
//--cambine-2: create an OpenCL context
cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)targetPlatform, 0 };
/*
oclHandles.context = clCreateContextFromType(cprops,
device_type,
NULL,
NULL,
&resultCL);
*/
oclHandles.context = clCreateContext(0,
deviceListSize,
oclHandles.devices,
NULL,
NULL,
&resultCL);
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
throw (string("InitCL()::Error: Creating Context (clCreateContextFromType)"));
//-----------------------------------------------
//--cambine-4: Create an OpenCL command queue
#ifdef TIMING
oclHandles.queue = clCreateCommandQueue(oclHandles.context,
oclHandles.devices[device_id_inuse], CL_QUEUE_PROFILING_ENABLE,
&resultCL);
#else
oclHandles.queue = clCreateCommandQueue(oclHandles.context,
oclHandles.devices[device_id_inuse], 0, &resultCL);
#endif
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
//-----------------------------------------------
//--cambine-5: Load CL file, build CL program object, create CL kernel object
/*
std::string source_str = FileToString(kernel_file);
const char * source = source_str.c_str();
size_t sourceSize[] = { source_str.length() };
oclHandles.program = clCreateProgramWithSource(oclHandles.context,
1,
&source,
sourceSize,
&resultCL);
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)"));
*/
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
std::abort();
oclHandles.program = clCreateProgramWithBinary(
oclHandles.context, 1, &oclHandles.devices[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
free(kernel_bin);
//insert debug information
//std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
//options += " -cl-nv-opt-level=3";
resultCL = clBuildProgram(oclHandles.program, deviceListSize, oclHandles.devices, NULL, NULL,NULL);
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) {
printf("InitCL()::Error: In clBuildProgram\n");
size_t length;
resultCL = clGetProgramBuildInfo(oclHandles.program,
oclHandles.devices[device_id_inuse],
CL_PROGRAM_BUILD_LOG,
0,
NULL,
&length);
if(resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
char* buffer = (char*)malloc(length);
resultCL = clGetProgramBuildInfo(oclHandles.program,
oclHandles.devices[device_id_inuse],
CL_PROGRAM_BUILD_LOG,
length,
buffer,
NULL);
if(resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
printf("%s\n", buffer);
free(buffer);
throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
}
//get program information in intermediate representation
#ifdef PTX_MSG
size_t binary_sizes[deviceListSize];
char * binaries[deviceListSize];
//figure out number of devices and the sizes of the binary for each device.
oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*deviceListSize, &binary_sizes, NULL );
if(oclHandles.cl_status!=CL_SUCCESS) {
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
}
printf("--cambine: %d", binary_sizes);
//copy over all of the generated binaries.
for(int i=0; i<deviceListSize; i++)
binaries[i] = (char *)malloc( sizeof(char)*(binary_sizes[i]+1));
oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES, sizeof(char *)*deviceListSize, binaries, NULL );
if(oclHandles.cl_status!=CL_SUCCESS) {
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
}
for(int i=0; i<deviceListSize; i++)
binaries[i][binary_sizes[i]] = '\0';
printf("--cambine:writing ptd information...\n");
FILE * ptx_file = fopen("cl.ptx","w");
if(ptx_file==NULL) {
throw(string("exceptions in allocate ptx file."));
}
fprintf(ptx_file,"%s",binaries[DEVICE_ID_INUSE]);
fclose(ptx_file);
printf("--cambine:writing ptd information done.\n");
for(int i=0; i<deviceListSize; i++)
free(binaries[i]);
#endif
for (int nKernel = 0; nKernel < total_kernels; nKernel++) {
/* get a kernel object handle for a kernel with the given name */
cl_kernel kernel = clCreateKernel(oclHandles.program,
(kernel_names[nKernel]).c_str(),
&resultCL);
if ((resultCL != CL_SUCCESS) || (kernel == NULL)) {
string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" + kernel_names[nKernel] + "\"";
throw(errorMsg);
}
oclHandles.kernel.push_back(kernel);
}
//get resource alocation information
#ifdef RES_MSG
char * build_log;
size_t ret_val_size;
oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSE], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
if(oclHandles.cl_status!=CL_SUCCESS) {
throw(string("exceptions in _InitCL -> getting resource information"));
}
build_log = (char *)malloc(ret_val_size+1);
oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSE], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
if(oclHandles.cl_status!=CL_SUCCESS) {
throw(string("exceptions in _InitCL -> getting resources allocation information-2"));
}
build_log[ret_val_size] = '\0';
printf("--cambine: %s\n", build_log);
free(build_log);
#endif
}
//---------------------------------------
//release CL objects
void _clRelease()
{
char errorFlag = false;
for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++) {
if (oclHandles.kernel[nKernel] != NULL) {
cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
if (resultCL != CL_SUCCESS) {
printf("ReleaseCL()::Error: In clReleaseKernel\n");
errorFlag = true;
}
oclHandles.kernel[nKernel] = NULL;
}
oclHandles.kernel.clear();
}
if (oclHandles.program != NULL) {
cl_int resultCL = clReleaseProgram(oclHandles.program);
if (resultCL != CL_SUCCESS) {
printf("ReleaseCL()::Error: In clReleaseProgram\n");
errorFlag = true;
}
oclHandles.program = NULL;
}
if (oclHandles.queue != NULL) {
cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
if (resultCL != CL_SUCCESS) {
printf("ReleaseCL()::Error: In clReleaseCommandQueue\n");
errorFlag = true;
}
oclHandles.queue = NULL;
}
free(oclHandles.devices);
if (oclHandles.context != NULL) {
cl_int resultCL = clReleaseContext(oclHandles.context);
if (resultCL != CL_SUCCESS) {
printf("ReleaseCL()::Error: In clReleaseContext\n");
errorFlag = true;
}
oclHandles.context = NULL;
}
if (errorFlag) throw(string("ReleaseCL()::Error encountered."));
}
//--------------------------------------------------------
//--cambine:create buffer and then copy data from host to device
cl_mem _clCreateAndCpyMem(int size, void * h_mem_source) throw(string)
{
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, \
size, h_mem_source, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem()"));
#endif
return d_mem;
}
//-------------------------------------------------------
//--cambine: create read only buffer for devices
//--date: 17/01/2011
cl_mem _clMallocRW(int size, void * h_mem_ptr) throw(string)
{
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_WRITE, size, NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clMallocRW"));
#endif
return d_mem;
}
//-------------------------------------------------------
//--cambine: create read and write buffer for devices
//--date: 17/01/2011
cl_mem _clMalloc(int size, void * h_mem_ptr) throw(string)
{
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clMalloc"));
#endif
return d_mem;
}
//-------------------------------------------------------
//--cambine: transfer data from host to device
//--date: 17/01/2011
void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string)
{
cl_event event;
oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem,
CL_TRUE, 0, size, h_mem_ptr, 0, NULL, &event);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clMemcpyH2D"));
#endif
#ifdef TIMING
h2d_time += probe_event_time(event, oclHandles.queue);
#endif
}
//--------------------------------------------------------
//--cambine:create buffer and then copy data from host to device with pinned
// memory
cl_mem _clCreateAndCpyPinnedMem(int size, float* h_mem_source) throw(string)
{
cl_mem d_mem, d_mem_pinned;
float * h_mem_pinned = NULL;
d_mem_pinned = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, \
size, NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
#endif
//------------
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, \
size, NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
#endif
//----------
h_mem_pinned = (cl_float *)clEnqueueMapBuffer(oclHandles.queue, d_mem_pinned, CL_TRUE, \
CL_MAP_WRITE, 0, size, 0, NULL, \
NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
#endif
int element_number = size/sizeof(float);
#pragma omp parallel for
for(int i=0; i<element_number; i++) {
h_mem_pinned[i] = h_mem_source[i];
}
//----------
oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem, \
CL_TRUE, 0, size, h_mem_pinned, \
0, NULL, NULL);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
#endif
return d_mem;
}
//--------------------------------------------------------
//--cambine:create write only buffer on device
cl_mem _clMallocWO(int size) throw(string)
{
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0, &oclHandles.cl_status);
#ifdef ERRMSG
if(oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateMem()"));
#endif
return d_mem;
}
//--------------------------------------------------------
//transfer data from device to host
void _clMemcpyD2H(cl_mem d_mem, int size, void * h_mem) throw(string)
{
cl_event event;
oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem, 0,0, &event);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
switch(oclHandles.cl_status) {
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_INVALID_VALUE:
oclHandles.error_str += "CL_INVALID_VALUE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
#ifdef TIMING
d2h_time += probe_event_time(event, oclHandles.queue);
#endif
}
//--------------------------------------------------------
//set kernel arguments
void _clSetArgs(int kernel_id, int arg_idx, void * d_mem, int size = 0) throw(string)
{
if(!size) {
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, sizeof(d_mem), &d_mem);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
switch(oclHandles.cl_status) {
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_ARG_INDEX:
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
break;
case CL_INVALID_ARG_VALUE:
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
break;
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_INVALID_SAMPLER:
oclHandles.error_str += "CL_INVALID_SAMPLER";
break;
case CL_INVALID_ARG_SIZE:
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
} else {
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
switch(oclHandles.cl_status) {
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_ARG_INDEX:
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
break;
case CL_INVALID_ARG_VALUE:
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
break;
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_INVALID_SAMPLER:
oclHandles.error_str += "CL_INVALID_SAMPLER";
break;
case CL_INVALID_ARG_SIZE:
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
}
}
void _clFinish() throw(string)
{
oclHandles.cl_status = clFinish(oclHandles.queue);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clFinish";
switch(oclHandles.cl_status) {
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reasons";
break;
}
if(oclHandles.cl_status!=CL_SUCCESS) {
throw(oclHandles.error_str);
}
#endif
}
//--------------------------------------------------------
//--cambine:enqueue kernel
void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string)
{
cl_uint work_dim = WORK_DIM;
cl_event e[1];
if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size
work_items = work_items + (work_group_size-(work_items%work_group_size));
size_t local_work_size[] = {work_group_size, 1};
size_t global_work_size[] = {work_items, 1};
oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
global_work_size, local_work_size, 0, 0, &(e[0]) );
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
switch(oclHandles.cl_status) {
case CL_INVALID_PROGRAM_EXECUTABLE:
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
break;
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_KERNEL_ARGS:
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
break;
case CL_INVALID_WORK_DIMENSION:
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
break;
case CL_INVALID_GLOBAL_WORK_SIZE:
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
break;
case CL_INVALID_WORK_GROUP_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
break;
case CL_INVALID_WORK_ITEM_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
break;
case CL_INVALID_GLOBAL_OFFSET:
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
#ifdef TIMING
kernel_time += probe_event_time(e[0], oclHandles.queue);
#endif
//_clFinish();
// oclHandles.cl_status = clWaitForEvents(1, &e[0]);
// #ifdef ERRMSG
// if (oclHandles.cl_status!= CL_SUCCESS)
// throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
// #endif
}
void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, int group_y) throw(string)
{
cl_uint work_dim = WORK_DIM;
size_t local_work_size[] = {group_x, group_y};
size_t global_work_size[] = {range_x, range_y};
cl_event e[1];
/*if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size
work_items = work_items + (work_group_size-(work_items%work_group_size));*/
oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
global_work_size, local_work_size, 0, 0, &(e[0]) );
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
switch(oclHandles.cl_status) {
case CL_INVALID_PROGRAM_EXECUTABLE:
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
break;
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_KERNEL_ARGS:
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
break;
case CL_INVALID_WORK_DIMENSION:
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
break;
case CL_INVALID_GLOBAL_WORK_SIZE:
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
break;
case CL_INVALID_WORK_GROUP_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
break;
case CL_INVALID_WORK_ITEM_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
break;
case CL_INVALID_GLOBAL_OFFSET:
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if(oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
#ifdef TIMING
kernel_time += probe_event_time(e[0], oclHandles.queue);
#endif
//_clFinish();
/*oclHandles.cl_status = clWaitForEvents(1, &e[0]);
#ifdef ERRMSG
if (oclHandles.cl_status!= CL_SUCCESS)
throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
#endif*/
}
//--------------------------------------------------------
//release OpenCL objects
void _clFree(cl_mem ob) throw(string)
{
if(ob!=NULL)
oclHandles.cl_status = clReleaseMemObject(ob);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clFree() ->";
switch(oclHandles.cl_status) {
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if (oclHandles.cl_status!= CL_SUCCESS)
throw(oclHandles.error_str);
#endif
}
#endif //_CL_HELPER_

View file

@ -0,0 +1,7 @@
PROJECT = bfs
SRCS = bfs.cpp timer.cc
OPTS ?= graph4096.txt 0 0 0
include ../common.mk

BIN
tests/opencl/bfs2/bfs Executable file

Binary file not shown.

351
tests/opencl/bfs2/bfs.cpp Normal file
View file

@ -0,0 +1,351 @@
//--by Jianbin Fang
#define __CL_ENABLE_EXCEPTIONS
#include <cstdlib>
#include <iostream>
#include <string>
#include <cstring>
#ifdef PROFILING
#include "timer.h"
#endif
#ifdef TIMING
#include "timing.h"
#endif
#include "CLHelper.h"
#include "util.h"
#define MAX_THREADS_PER_BLOCK 256
//Structure to hold a node information
struct Node {
int starting;
int no_of_edges;
};
//Primitives for timing
#ifdef TIMING
struct timeval tv;
struct timeval tv_total_start, tv_total_end;
struct timeval tv_h2d_start, tv_h2d_end;
struct timeval tv_d2h_start, tv_d2h_end;
struct timeval tv_kernel_start, tv_kernel_end;
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
struct timeval tv_close_start, tv_close_end;
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time= 0,
d2h_time = 0, close_time = 0, total_time = 0;
#endif
//----------------------------------------------------------
//--bfs on cpu
//--programmer: jianbin
//--date: 26/01/2011
//--note: width is changed to the new_width
//----------------------------------------------------------
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
char *h_graph_visited, int *h_cost_ref)
{
char stop;
int k = 0;
do {
//if no thread changes this value then the loop stops
stop=false;
for(int tid = 0; tid < no_of_nodes; tid++ ) {
if (h_graph_mask[tid] == true) {
h_graph_mask[tid]=false;
for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++) {
int id = h_graph_edges[i]; //--cambine: node id is connected with node tid
if(!h_graph_visited[id]) { //--cambine: if node id has not been visited, enter the body below
h_cost_ref[id]=h_cost_ref[tid]+1;
h_updating_graph_mask[id]=true;
}
}
}
}
for(int tid=0; tid< no_of_nodes ; tid++ ) {
if (h_updating_graph_mask[tid] == true) {
h_graph_mask[tid]=true;
h_graph_visited[tid]=true;
stop=true;
h_updating_graph_mask[tid]=false;
}
}
k++;
} while(stop);
}
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
char *h_graph_visited, int *h_cost)
throw(std::string)
{
//int number_elements = height*width;
char h_over;
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \
d_graph_visited, d_cost, d_over;
try {
#ifdef TIMING
gettimeofday(&tv_total_start, NULL);
#endif
_clInit();
#ifdef TIMING
gettimeofday(&tv_mem_alloc_start, NULL);
tvsub(&tv_mem_alloc_start, &tv_total_start, &tv);
init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
//--1 transfer data from host to device
d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask);
d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask);
d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited);
d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
d_over = _clMallocRW(sizeof(char), &h_over);
#ifdef TIMING
gettimeofday(&tv_mem_alloc_end, NULL);
tvsub(&tv_mem_alloc_end, &tv_mem_alloc_start, &tv);
mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
#endif
_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask);
_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);
_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);
//--2 invoke kernel
#ifdef PROFILING
timer kernel_timer;
double kernel_time = 0.0;
kernel_timer.reset();
kernel_timer.start();
#endif
do {
h_over = false;
_clMemcpyH2D(d_over, sizeof(char), &h_over);
//--kernel 0
int kernel_id = 0;
int kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_cost);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//int work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
//--kernel 1
kernel_id = 1;
kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_over);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
//work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
_clMemcpyD2H(d_over,sizeof(char), &h_over);
} while(h_over);
_clFinish();
#ifdef PROFILING
kernel_timer.stop();
kernel_time = kernel_timer.getTimeInSeconds();
#endif
//--3 transfer data from device to host
_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
#ifdef TIMING
gettimeofday(&tv_close_start, NULL);
#endif
//--statistics
#ifdef PROFILING
printf("kernel time(s): %d\n", kernel_time );
#endif
//--4 release cl resources.
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
} catch(std::string msg) {
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
std::string e_str = "in run_transpose_gpu -> ";
e_str += msg;
throw(e_str);
}
#ifdef TIMING
gettimeofday(&tv_close_end, NULL);
tvsub(&tv_close_end, &tv_close_start, &tv);
close_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
tvsub(&tv_close_end, &tv_total_start, &tv);
total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
printf("Init: %f\n", init_time);
printf("MemAlloc: %f\n", mem_alloc_time);
printf("HtoD: %f\n", h2d_time);
printf("Exec: %f\n", kernel_time);
printf("DtoH: %f\n", d2h_time);
printf("Close: %f\n", close_time);
printf("Total: %f\n", total_time);
#endif
return ;
}
void Usage(int argc, char**argv)
{
fprintf(stderr,"Usage: %s <input_file> [-p platform] [-d device] [-t gpu(0)/cpu(1)]\n", argv[0]);
}
//----------------------------------------------------------
//--cambine: main function
//--author: created by Jianbin Fang
//--date: 25/01/2011
//----------------------------------------------------------
int main(int argc, char * argv[])
{
int no_of_nodes;
int edge_list_size;
FILE *fp;
Node* h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
try {
char *input_f;
if(argc < 2) {
Usage(argc, argv);
exit(0);
}
_clCmdParams(argc, argv);
input_f = argv[1];
printf("Reading File\n");
//Read in Graph from a file
fp = fopen(input_f,"r");
if(!fp) {
printf("Error Reading graph file\n");
return 0;
}
int source = 0;
fscanf(fp,"%d",&no_of_nodes);
int num_of_blocks = 1;
int num_of_threads_per_block = no_of_nodes;
//Make execution Parameters according to the number of nodes
//Distribute threads across multiple Blocks if necessary
if(no_of_nodes>MAX_THREADS_PER_BLOCK) {
num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
}
work_group_size = num_of_threads_per_block;
// allocate host memory
h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes);
int start, edgeno;
// initalize the memory
for(int i = 0; i < no_of_nodes; i++) {
fscanf(fp,"%d %d",&start,&edgeno);
h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i]=false;
h_updating_graph_mask[i]=false;
h_graph_visited[i]=false;
}
//read the source node from the file
fscanf(fp,"%d",&source);
source=0;
//set the source node as true in the mask
h_graph_mask[source]=true;
h_graph_visited[source]=true;
fscanf(fp,"%d",&edge_list_size);
int id,cost;
int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
for(int i=0; i < edge_list_size ; i++) {
fscanf(fp,"%d",&id);
fscanf(fp,"%d",&cost);
h_graph_edges[i] = id;
}
if(fp)
fclose(fp);
// allocate mem for the result on host side
int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes);
int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes);
for(int i=0; i<no_of_nodes; i++) {
h_cost[i]=-1;
h_cost_ref[i] = -1;
}
h_cost[source]=0;
h_cost_ref[source]=0;
//---------------------------------------------------------
//--gpu entry
run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
//---------------------------------------------------------
//--cpu entry
// initalize the memory again
for(int i = 0; i < no_of_nodes; i++) {
h_graph_mask[i]=false;
h_updating_graph_mask[i]=false;
h_graph_visited[i]=false;
}
//set the source node as true in the mask
source=0;
h_graph_mask[source]=true;
h_graph_visited[source]=true;
run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref);
//---------------------------------------------------------
//--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
//release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
} catch(std::string msg) {
printf("--cambine: exception in main -> %s\n", msg);
//release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
}
return 0;
}

View file

@ -0,0 +1,50 @@
/* ============================================================
//--cambine: kernel funtion of Breadth-First-Search
//--author: created by Jianbin Fang
//--date: 06/12/2010
============================================================ */
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
//Structure to hold a node information
typedef struct{
int starting;
int no_of_edges;
} Node;
//--7 parameters
__kernel void BFS_1( const __global Node* g_graph_nodes,
const __global int* g_graph_edges,
__global char* g_graph_mask,
__global char* g_updating_graph_mask,
__global char* g_graph_visited,
__global int* g_cost,
const int no_of_nodes){
int tid = get_global_id(0);
if( tid<no_of_nodes && g_graph_mask[tid]){
g_graph_mask[tid]=false;
for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++){
int id = g_graph_edges[i];
if(!g_graph_visited[id]){
g_cost[id]=g_cost[tid]+1;
g_updating_graph_mask[id]=true;
}
}
}
}
//--5 parameters
__kernel void BFS_2(__global char* g_graph_mask,
__global char* g_updating_graph_mask,
__global char* g_graph_visited,
__global char* g_over,
const int no_of_nodes
) {
int tid = get_global_id(0);
if( tid<no_of_nodes && g_updating_graph_mask[tid]){
g_graph_mask[tid]=true;
g_graph_visited[tid]=true;
*g_over=true;
g_updating_graph_mask[tid]=false;
}
}

View file

@ -0,0 +1,78 @@
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include "timer.h"
using namespace std;
double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
double timer::get_CPU_speed_in_MHz()
{
#if defined __linux__
ifstream infile("/proc/cpuinfo");
char buffer[256], *colon;
while (infile.good()) {
infile.getline(buffer, 256);
if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
return atof(colon + 2);
}
#endif
return 0.0;
}
void timer::print_time(ostream &str, const char *which, double time) const
{
static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 };
const char **unit = units;
time = 1000.0 * time / CPU_speed_in_MHz;
while (time >= 999.5 && unit[1] != 0) {
time /= 1000.0;
++ unit;
}
str << which << " = " << setprecision(3) << setw(4) << time << *unit;
}
ostream &timer::print(ostream &str)
{
str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right;
if (CPU_speed_in_MHz == 0)
str << "could not determine CPU speed\n";
else if (count > 0) {
double total = static_cast<double>(total_time);
print_time(str, "avg", total / static_cast<double>(count));
print_time(str, ", total", total);
str << ", count = " << setw(9) << count << '\n';
}
else
str << "not used\n";
return str;
}
ostream &operator << (ostream &str, class timer &timer)
{
return timer.print(str);
}
double timer::getTimeInSeconds()
{
double total = static_cast<double>(total_time);
double res = (total / 1000000.0) / CPU_speed_in_MHz;
return res;
}

128
tests/opencl/bfs2/timer.h Normal file
View file

@ -0,0 +1,128 @@
#ifndef timer_h
#define timer_h
#include <iostream>
class timer {
public:
timer(const char *name = 0);
timer(const char *name, std::ostream &write_on_exit);
~timer();
void start(), stop();
void reset();
std::ostream &print(std::ostream &);
double getTimeInSeconds();
private:
void print_time(std::ostream &, const char *which, double time) const;
union {
long long total_time;
struct {
#if defined __PPC__
int high, low;
#else
int low, high;
#endif
};
};
unsigned long long count;
const char *const name;
std::ostream *const write_on_exit;
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
};
std::ostream &operator << (std::ostream &, class timer &);
inline void timer::reset()
{
total_time = 0;
count = 0;
}
inline timer::timer(const char *name)
:
name(name),
write_on_exit(0)
{
reset();
}
inline timer::timer(const char *name, std::ostream &write_on_exit)
:
name(name),
write_on_exit(&write_on_exit)
{
reset();
}
inline timer::~timer()
{
if (write_on_exit != 0)
print(*write_on_exit);
}
inline void timer::start()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
total_time -= ((unsigned long long) edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
asm volatile
(
"rdtsc\n\t"
"subl %%eax, %0\n\t"
"sbbl %%edx, %1"
:
"+m" (low), "+m" (high)
:
:
"eax", "edx"
);
#else
#error Compiler/Architecture not recognized
#endif
}
inline void timer::stop()
{
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx;
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
total_time += ((unsigned long long) edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
asm volatile
(
"rdtsc\n\t"
"addl %%eax, %0\n\t"
"adcl %%edx, %1"
:
"+m" (low), "+m" (high)
:
:
"eax", "edx"
);
#endif
++ count;
}
#endif

72
tests/opencl/bfs2/util.h Normal file
View file

@ -0,0 +1,72 @@
#ifndef _C_UTIL_
#define _C_UTIL_
#include <math.h>
#include <iostream>
#include <omp.h>
//-------------------------------------------------------------------
//--initialize array with maximum limit
//-------------------------------------------------------------------
template<typename datatype>
void fill(datatype *A, const int n, const datatype maxi){
for (int j = 0; j < n; j++)
{
A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
}
}
//--print matrix
template<typename datatype>
void print_matrix(datatype *A, int height, int width){
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int idx = i*width + j;
std::cout<<A[idx]<<" ";
}
std::cout<<std::endl;
}
return;
}
//-------------------------------------------------------------------
//--verify results
//-------------------------------------------------------------------
#define MAX_RELATIVE_ERROR .002
template<typename datatype>
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
char passed = true;
#pragma omp parallel for
for (int i=0; i<size; i++){
if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
passed = false;
}
}
if (passed){
printf("--cambine:passed:-)\n");
}
else{
printf("--cambine: failed:-(\n");
}
return ;
}
template<typename datatype>
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
char passed = true;
//#pragma omp parallel for
for (int i=0; i<size; i++){
if (cpu_results[i]!=gpu_results[i]){
passed = false;
}
}
if (passed){
printf("--cambine:passed:-)\n");
}
else{
printf("--cambine: failed:-(\n");
}
return ;
}
#endif

2
tests/opencl/cfd/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
*.txt
*.out

1572
tests/opencl/cfd/CLHelper.h Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,7 @@
PROJECT = cfd
SRCS = euler3d.cc timing.cpp
OPTS ?= fvcorr.domn.097K
include ../common.mk

View file

@ -0,0 +1,36 @@
include ../../common/make.config
#Can be changed by `make TYPE=CPU`
TYPE = GPU
#Library
ifeq ($(TYPE),GPU)
OPENCL_INC = $(NV_OPENCL_INC)
OPENCL_LIB = $(NV_OPENCL_LIB)
else
OPENCL_INC = $(INTEL_OPENCL_INC)
OPENCL_LIB = $(INTEL_OPENCL_LIB)
endif
#C compiler
CC = g++
FLAGS = -O3
SRC = euler3d.cpp ../util/timing.c
EXE = euler3d.out
release:$(SRC)
$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -I../util -DTIMING $(FLAGS)
errmsg:$(SRC)
$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -D ERRMSG $(FLAGS)
ptx:$(SRC)
$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -D PTX_MSG $(FLAGS)
res:$(SRC)
$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -D RES_MSG $(FLAGS)
clean: $(SRC)
rm -f $(EXE) $(EXE).linkinfo result*

10
tests/opencl/cfd/README Normal file
View file

@ -0,0 +1,10 @@
******Adjustable work group size*****
RD_WG_SIZE for all
RD_WG_SIZE_1 or RD_WG_SIZE_1_0 for initialize_variables
RD_WG_SIZE_2 or RD_WG_SIZE_2_0 for compute_step_factor
RD_WG_SIZE_3 or RD_WG_SIZE_3_0 for compute_flux
RD_WG_SIZE_4 or RD_WG_SIZE_4_0 for time_step
USAGE:
make clean
make KERNEL_DIM="-DRD_WG_SIZE_1=128 -DRD_WG_SIZE_2=192 -DRD_WG_SIZE_3=128 -DRD_WG_SIZE_4=256"

BIN
tests/opencl/cfd/cfd Executable file

Binary file not shown.

418
tests/opencl/cfd/euler3d.cc Normal file
View file

@ -0,0 +1,418 @@
/********************************************************************
euler3d.cpp
: parallelized code of CFD
- original code from the AIAA-2009-4001 by Andrew Corrigan, acorriga@gmu.edu
- parallelization with OpenCL API has been applied by
Jianbin Fang - j.fang@tudelft.nl
Delft University of Technology
Faculty of Electrical Engineering, Mathematics and Computer Science
Department of Software Technology
Parallel and Distributed Systems Group
on 24/03/2011
********************************************************************/
#include <iostream>
#include <fstream>
#include <math.h>
#include "CLHelper.h"
/*
* Options
*
*/
#define GAMMA 1.4f
#define iterations 2000
#ifndef block_length
#define block_length 192
#endif
#define NDIM 3
#define NNB 4
#define RK 3 // 3rd order RK
#define ff_mach 1.2f
#define deg_angle_of_attack 0.0f
/*
* not options
*/
#if block_length > 128
#warning "the kernels may fail too launch on some systems if the block length is too large"
#endif
#define VAR_DENSITY 0
#define VAR_MOMENTUM 1
#define VAR_DENSITY_ENERGY (VAR_MOMENTUM+NDIM)
#define NVAR (VAR_DENSITY_ENERGY+1)
//self-defined user type
typedef struct{
float x;
float y;
float z;
} float3;
/*
* Generic functions
*/
template <typename T>
cl_mem alloc(int N){
cl_mem mem_d = _clMalloc(sizeof(T)*N);
return mem_d;
}
template <typename T>
void dealloc(cl_mem array){
_clFree(array);
}
template <typename T>
void copy(cl_mem dst, cl_mem src, int N){
_clMemcpyD2D(dst, src, N*sizeof(T));
}
template <typename T>
void upload(cl_mem dst, T* src, int N){
_clMemcpyH2D(dst, src, N*sizeof(T));
}
template <typename T>
void download(T* dst, cl_mem src, int N){
_clMemcpyD2H(dst, src, N*sizeof(T));
}
void dump(cl_mem variables, int nel, int nelr){
float* h_variables = new float[nelr*NVAR];
download(h_variables, variables, nelr*NVAR);
{
std::ofstream file("density.txt");
file << nel << " " << nelr << std::endl;
for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY*nelr] << std::endl;
}
{
std::ofstream file("momentum.txt");
file << nel << " " << nelr << std::endl;
for(int i = 0; i < nel; i++)
{
for(int j = 0; j != NDIM; j++)
file << h_variables[i + (VAR_MOMENTUM+j)*nelr] << " ";
file << std::endl;
}
}
{
std::ofstream file("density_energy.txt");
file << nel << " " << nelr << std::endl;
for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY_ENERGY*nelr] << std::endl;
}
delete[] h_variables;
}
void initialize_variables(int nelr, cl_mem variables, cl_mem ff_variable) throw(string){
int work_items = nelr;
int work_group_size = BLOCK_SIZE_1;
int kernel_id = 1;
int arg_idx = 0;
_clSetArgs(kernel_id, arg_idx++, variables);
_clSetArgs(kernel_id, arg_idx++, ff_variable);
_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
_clInvokeKernel(kernel_id, work_items, work_group_size);
}
void compute_step_factor(int nelr, cl_mem variables, cl_mem areas, cl_mem step_factors){
int work_items = nelr;
int work_group_size = BLOCK_SIZE_2;
int kernel_id = 2;
int arg_idx = 0;
_clSetArgs(kernel_id, arg_idx++, variables);
_clSetArgs(kernel_id, arg_idx++, areas);
_clSetArgs(kernel_id, arg_idx++, step_factors);
_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
_clInvokeKernel(kernel_id, work_items, work_group_size);
}
void compute_flux(int nelr, cl_mem elements_surrounding_elements, cl_mem normals, cl_mem variables, cl_mem ff_variable, \
cl_mem fluxes, cl_mem ff_flux_contribution_density_energy,
cl_mem ff_flux_contribution_momentum_x,
cl_mem ff_flux_contribution_momentum_y,
cl_mem ff_flux_contribution_momentum_z){
int work_items = nelr;
int work_group_size = BLOCK_SIZE_3;
int kernel_id = 3;
int arg_idx = 0;
_clSetArgs(kernel_id, arg_idx++, elements_surrounding_elements);
_clSetArgs(kernel_id, arg_idx++, normals);
_clSetArgs(kernel_id, arg_idx++, variables);
_clSetArgs(kernel_id, arg_idx++, ff_variable);
_clSetArgs(kernel_id, arg_idx++, fluxes);
_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_density_energy);
_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_x);
_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_y);
_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_z);
_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
_clInvokeKernel(kernel_id, work_items, work_group_size);
}
void time_step(int j, int nelr, cl_mem old_variables, cl_mem variables, cl_mem step_factors, cl_mem fluxes){
int work_items = nelr;
int work_group_size = BLOCK_SIZE_4;
int kernel_id = 4;
int arg_idx = 0;
_clSetArgs(kernel_id, arg_idx++, &j, sizeof(int));
_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
_clSetArgs(kernel_id, arg_idx++, old_variables);
_clSetArgs(kernel_id, arg_idx++, variables);
_clSetArgs(kernel_id, arg_idx++, step_factors);
_clSetArgs(kernel_id, arg_idx++, fluxes);
_clInvokeKernel(kernel_id, work_items, work_group_size);
}
inline void compute_flux_contribution(float& density, float3& momentum, float& density_energy, float& pressure, float3& velocity, float3& fc_momentum_x, float3& fc_momentum_y, float3& fc_momentum_z, float3& fc_density_energy)
{
fc_momentum_x.x = velocity.x*momentum.x + pressure;
fc_momentum_x.y = velocity.x*momentum.y;
fc_momentum_x.z = velocity.x*momentum.z;
fc_momentum_y.x = fc_momentum_x.y;
fc_momentum_y.y = velocity.y*momentum.y + pressure;
fc_momentum_y.z = velocity.y*momentum.z;
fc_momentum_z.x = fc_momentum_x.z;
fc_momentum_z.y = fc_momentum_y.z;
fc_momentum_z.z = velocity.z*momentum.z + pressure;
float de_p = density_energy+pressure;
fc_density_energy.x = velocity.x*de_p;
fc_density_energy.y = velocity.y*de_p;
fc_density_energy.z = velocity.z*de_p;
}
/*
* Main function
*/
int main(int argc, char** argv){
printf("WG size of kernel:initialize = %d, WG size of kernel:compute_step_factor = %d, WG size of kernel:compute_flux = %d, WG size of kernel:time_step = %d\n", BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
if (argc < 2){
//std::cout << "specify data file name and [device type] [device id]" << std::endl;
return 0;
}
const char* data_file_name = argv[1];
_clCmdParams(argc, argv);
cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy;
cl_mem areas, elements_surrounding_elements, normals;
cl_mem variables, old_variables, fluxes, step_factors;
float h_ff_variable[NVAR];
try{
_clInit(device_type, device_id);
// set far field conditions and load them into constant memory on the gpu
{
//float h_ff_variable[NVAR];
const float angle_of_attack = float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
h_ff_variable[VAR_DENSITY] = float(1.4);
float ff_pressure = float(1.0f);
float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
float ff_speed = float(ff_mach)*ff_speed_of_sound;
float3 ff_velocity;
ff_velocity.x = ff_speed*float(cos((float)angle_of_attack));
ff_velocity.y = ff_speed*float(sin((float)angle_of_attack));
ff_velocity.z = 0.0f;
h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;
h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*(float(0.5f)*(ff_speed*ff_speed)) + (ff_pressure / float(GAMMA-1.0f));
float3 h_ff_momentum;
h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
float3 h_ff_flux_contribution_momentum_x;
float3 h_ff_flux_contribution_momentum_y;
float3 h_ff_flux_contribution_momentum_z;
float3 h_ff_flux_contribution_density_energy;
compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum, h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, ff_velocity, h_ff_flux_contribution_momentum_x, h_ff_flux_contribution_momentum_y, h_ff_flux_contribution_momentum_z, h_ff_flux_contribution_density_energy);
// copy far field conditions to the gpu
//cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy;
ff_variable = _clMalloc(NVAR*sizeof(float));
ff_flux_contribution_momentum_x = _clMalloc(sizeof(float3));
ff_flux_contribution_momentum_y = _clMalloc(sizeof(float3));
ff_flux_contribution_momentum_z = _clMalloc(sizeof(float3));
ff_flux_contribution_density_energy = _clMalloc(sizeof(float3));
_clMemcpyH2D(ff_variable, h_ff_variable, NVAR*sizeof(float));
_clMemcpyH2D(ff_flux_contribution_momentum_x, &h_ff_flux_contribution_momentum_x, sizeof(float3));
_clMemcpyH2D(ff_flux_contribution_momentum_y, &h_ff_flux_contribution_momentum_y, sizeof(float3));
_clMemcpyH2D(ff_flux_contribution_momentum_z, &h_ff_flux_contribution_momentum_z, sizeof(float3));
_clMemcpyH2D(ff_flux_contribution_density_energy, &h_ff_flux_contribution_density_energy, sizeof(float3));
_clFinish();
}
int nel;
int nelr;
// read in domain geometry
//float* areas;
//int* elements_surrounding_elements;
//float* normals;
{
std::ifstream file(data_file_name);
if(!file.good()){
throw(string("can not find/open file!"));
}
file >> nel;
nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));
////std::cout<<"--cambine: nel="<<nel<<", nelr="<<nelr<<std::endl;
float* h_areas = new float[nelr];
int* h_elements_surrounding_elements = new int[nelr*NNB];
float* h_normals = new float[nelr*NDIM*NNB];
// read in data
for(int i = 0; i < nel; i++)
{
file >> h_areas[i];
for(int j = 0; j < NNB; j++)
{
file >> h_elements_surrounding_elements[i + j*nelr];
if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering
for(int k = 0; k < NDIM; k++)
{
file >> h_normals[i + (j + k*NNB)*nelr];
h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
}
}
}
// fill in remaining data
int last = nel-1;
for(int i = nel; i < nelr; i++)
{
h_areas[i] = h_areas[last];
for(int j = 0; j < NNB; j++)
{
// duplicate the last element
h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];
for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
}
}
areas = alloc<float>(nelr);
upload<float>(areas, h_areas, nelr);
elements_surrounding_elements = alloc<int>(nelr*NNB);
upload<int>(elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);
normals = alloc<float>(nelr*NDIM*NNB);
upload<float>(normals, h_normals, nelr*NDIM*NNB);
delete[] h_areas;
delete[] h_elements_surrounding_elements;
delete[] h_normals;
}
// Create arrays and set initial conditions
variables = alloc<float>(nelr*NVAR);
int tp = 0;
initialize_variables(nelr, variables, ff_variable);
old_variables = alloc<float>(nelr*NVAR);
fluxes = alloc<float>(nelr*NVAR);
step_factors = alloc<float>(nelr);
// make sure all memory is floatly allocated before we start timing
initialize_variables(nelr, old_variables, ff_variable);
initialize_variables(nelr, fluxes, ff_variable);
_clMemset(step_factors, 0, sizeof(float)*nelr);
// make sure CUDA isn't still doing something before we start timing
_clFinish();
// these need to be computed the first time in order to compute time step
printf( "Starting...\n");
// Begin iterations
for(int i = 0; i < iterations; i++){
copy<float>(old_variables, variables, nelr*NVAR);
// for the first iteration we compute the time step
compute_step_factor(nelr, variables, areas, step_factors);
for(int j = 0; j < RK; j++){
compute_flux(nelr, elements_surrounding_elements, normals, variables, ff_variable, fluxes, ff_flux_contribution_density_energy, \
ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z);
time_step(j, nelr, old_variables, variables, step_factors, fluxes);
}
}
_clFinish();
//std::cout << "Saving solution..." << std::endl;
dump(variables, nel, nelr);
//std::cout << "Saved solution..." << std::endl;
_clStatistics();
//std::cout << "Cleaning up..." << std::endl;
//--release resources
_clFree(ff_variable);
_clFree(ff_flux_contribution_momentum_x);
_clFree(ff_flux_contribution_momentum_y);
_clFree(ff_flux_contribution_momentum_z);
_clFree(ff_flux_contribution_density_energy);
_clFree(areas);
_clFree(elements_surrounding_elements);
_clFree(normals);
_clFree(variables);
_clFree(old_variables);
_clFree(fluxes);
_clFree(step_factors);
_clRelease();
printf("Done...\n");
_clPrintTiming();
}
catch(string msg){
printf("--cambine:( an exception catched in main body ->%s\n", msg.c_str());
_clFree(ff_variable);
_clFree(ff_flux_contribution_momentum_x);
_clFree(ff_flux_contribution_momentum_y);
_clFree(ff_flux_contribution_momentum_z);
_clFree(ff_flux_contribution_density_energy);
_clFree(areas);
_clFree(elements_surrounding_elements);
_clFree(normals);
_clFree(variables);
_clFree(old_variables);
_clFree(fluxes);
_clFree(step_factors);
_clRelease();
}
catch(...){
//std::cout<<"--cambine:( unknow exceptions in main body..."<<std::endl;
_clFree(ff_variable);
_clFree(ff_flux_contribution_momentum_x);
_clFree(ff_flux_contribution_momentum_y);
_clFree(ff_flux_contribution_momentum_z);
_clFree(ff_flux_contribution_density_energy);
_clFree(areas);
_clFree(elements_surrounding_elements);
_clFree(normals);
_clFree(variables);
_clFree(old_variables);
_clFree(fluxes);
_clFree(step_factors);
_clRelease();
}
return 0;
}

File diff suppressed because it is too large Load diff

284
tests/opencl/cfd/kernel.cl Normal file
View file

@ -0,0 +1,284 @@
/* ============================================================
//--functions: kernel funtion
//--programmer: Jianbin Fang
//--date: 24/03/2011
============================================================ */
#ifndef _KERNEL_
#define _KERNEL_
#define GAMMA (1.4f)
#define NDIM 3
#define NNB 4
#define RK 3 // 3rd order RK
#define ff_mach 1.2f
#define deg_angle_of_attack 0.0f
#define VAR_DENSITY 0
#define VAR_MOMENTUM 1
#define VAR_DENSITY_ENERGY (VAR_MOMENTUM+NDIM)
#define NVAR (VAR_DENSITY_ENERGY+1)
//#pragma OPENCL EXTENSION CL_MAD : enable
//self-defined user type
typedef struct{
float x;
float y;
float z;
} FLOAT3;
/*------------------------------------------------------------
@function: set memory
@params:
mem_d: target memory to be set;
val: set the target memory to value 'val'
num_bytes: the number of bytes all together
@return: through mem_d
------------------------------------------------------------*/
__kernel void memset_kernel(__global char * mem_d, short val, int ct){
const int thread_id = get_global_id(0);
if( thread_id >= ct) return;
mem_d[thread_id] = val;
}
//--cambine: omit &
inline void compute_velocity(float density, FLOAT3 momentum, FLOAT3* velocity){
velocity->x = momentum.x / density;
velocity->y = momentum.y / density;
velocity->z = momentum.z / density;
}
inline float compute_speed_sqd(FLOAT3 velocity){
return velocity.x*velocity.x + velocity.y*velocity.y + velocity.z*velocity.z;
}
inline float compute_pressure(float density, float density_energy, float speed_sqd){
return ((float)(GAMMA) - (float)(1.0f))*(density_energy - (float)(0.5f)*density*speed_sqd);
}
inline float compute_speed_of_sound(float density, float pressure){
//return sqrtf(float(GAMMA)*pressure/density);
return sqrt((float)(GAMMA)*pressure/density);
}
inline void compute_flux_contribution(float density, FLOAT3 momentum, float density_energy, float pressure, FLOAT3 velocity, FLOAT3* fc_momentum_x, FLOAT3* fc_momentum_y, FLOAT3* fc_momentum_z, FLOAT3* fc_density_energy)
{
fc_momentum_x->x = velocity.x*momentum.x + pressure;
fc_momentum_x->y = velocity.x*momentum.y;
fc_momentum_x->z = velocity.x*momentum.z;
fc_momentum_y->x = fc_momentum_x->y;
fc_momentum_y->y = velocity.y*momentum.y + pressure;
fc_momentum_y->z = velocity.y*momentum.z;
fc_momentum_z->x = fc_momentum_x->z;
fc_momentum_z->y = fc_momentum_y->z;
fc_momentum_z->z = velocity.z*momentum.z + pressure;
float de_p = density_energy+pressure;
fc_density_energy->x = velocity.x*de_p;
fc_density_energy->y = velocity.y*de_p;
fc_density_energy->z = velocity.z*de_p;
}
__kernel void initialize_variables(__global float* variables, __constant float* ff_variable, int nelr){
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0);
if( i >= nelr) return;
for(int j = 0; j < NVAR; j++)
variables[i + j*nelr] = ff_variable[j];
}
__kernel void compute_step_factor(__global float* variables,
__global float* areas,
__global float* step_factors,
int nelr){
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0);
if( i >= nelr) return;
float density = variables[i + VAR_DENSITY*nelr];
FLOAT3 momentum;
momentum.x = variables[i + (VAR_MOMENTUM+0)*nelr];
momentum.y = variables[i + (VAR_MOMENTUM+1)*nelr];
momentum.z = variables[i + (VAR_MOMENTUM+2)*nelr];
float density_energy = variables[i + VAR_DENSITY_ENERGY*nelr];
FLOAT3 velocity; compute_velocity(density, momentum, &velocity);
float speed_sqd = compute_speed_sqd(velocity);
//float speed_sqd;
//compute_speed_sqd(velocity, speed_sqd);
float pressure = compute_pressure(density, density_energy, speed_sqd);
float speed_of_sound = compute_speed_of_sound(density, pressure);
// dt = float(0.5f) * sqrtf(areas[i]) / (||v|| + c).... but when we do time stepping, this later would need to be divided by the area, so we just do it all at once
//step_factors[i] = (float)(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
step_factors[i] = (float)(0.5f) / (sqrt(areas[i]) * (sqrt(speed_sqd) + speed_of_sound));
}
__kernel void compute_flux(
__global int* elements_surrounding_elements,
__global float* normals,
__global float* variables,
__constant float* ff_variable,
__global float* fluxes,
__constant FLOAT3* ff_flux_contribution_density_energy,
__constant FLOAT3* ff_flux_contribution_momentum_x,
__constant FLOAT3* ff_flux_contribution_momentum_y,
__constant FLOAT3* ff_flux_contribution_momentum_z,
int nelr){
const float smoothing_coefficient = (float)(0.2f);
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0);
if( i >= nelr) return;
int j, nb;
FLOAT3 normal; float normal_len;
float factor;
float density_i = variables[i + VAR_DENSITY*nelr];
FLOAT3 momentum_i;
momentum_i.x = variables[i + (VAR_MOMENTUM+0)*nelr];
momentum_i.y = variables[i + (VAR_MOMENTUM+1)*nelr];
momentum_i.z = variables[i + (VAR_MOMENTUM+2)*nelr];
float density_energy_i = variables[i + VAR_DENSITY_ENERGY*nelr];
FLOAT3 velocity_i; compute_velocity(density_i, momentum_i, &velocity_i);
float speed_sqd_i = compute_speed_sqd(velocity_i);
//float speed_sqd_i;
//compute_speed_sqd(velocity_i, speed_sqd_i);
//float speed_i = sqrtf(speed_sqd_i);
float speed_i = sqrt(speed_sqd_i);
float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i);
float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
FLOAT3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z;
FLOAT3 flux_contribution_i_density_energy;
compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, &flux_contribution_i_momentum_x, &flux_contribution_i_momentum_y, &flux_contribution_i_momentum_z, &flux_contribution_i_density_energy);
float flux_i_density = (float)(0.0f);
FLOAT3 flux_i_momentum;
flux_i_momentum.x = (float)(0.0f);
flux_i_momentum.y = (float)(0.0f);
flux_i_momentum.z = (float)(0.0f);
float flux_i_density_energy = (float)(0.0f);
FLOAT3 velocity_nb;
float density_nb, density_energy_nb;
FLOAT3 momentum_nb;
FLOAT3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z;
FLOAT3 flux_contribution_nb_density_energy;
float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
#pragma unroll
for(j = 0; j < NNB; j++)
{
nb = elements_surrounding_elements[i + j*nelr];
normal.x = normals[i + (j + 0*NNB)*nelr];
normal.y = normals[i + (j + 1*NNB)*nelr];
normal.z = normals[i + (j + 2*NNB)*nelr];
//normal_len = sqrtf(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);
normal_len = sqrt(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);
if(nb >= 0) // a legitimate neighbor
{
density_nb = variables[nb + VAR_DENSITY*nelr];
momentum_nb.x = variables[nb + (VAR_MOMENTUM+0)*nelr];
momentum_nb.y = variables[nb + (VAR_MOMENTUM+1)*nelr];
momentum_nb.z = variables[nb + (VAR_MOMENTUM+2)*nelr];
density_energy_nb = variables[nb + VAR_DENSITY_ENERGY*nelr];
compute_velocity(density_nb, momentum_nb, &velocity_nb);
speed_sqd_nb = compute_speed_sqd(velocity_nb);
pressure_nb = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
compute_flux_contribution(density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb, &flux_contribution_nb_momentum_x, &flux_contribution_nb_momentum_y, &flux_contribution_nb_momentum_z, &flux_contribution_nb_density_energy);
// artificial viscosity
factor = -normal_len*smoothing_coefficient*(float)(0.5f)*(speed_i + sqrt(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb);
flux_i_density += factor*(density_i-density_nb);
flux_i_density_energy += factor*(density_energy_i-density_energy_nb);
flux_i_momentum.x += factor*(momentum_i.x-momentum_nb.x);
flux_i_momentum.y += factor*(momentum_i.y-momentum_nb.y);
flux_i_momentum.z += factor*(momentum_i.z-momentum_nb.z);
// accumulate cell-centered fluxes
factor = (float)(0.5f)*normal.x;
flux_i_density += factor*(momentum_nb.x+momentum_i.x);
flux_i_density_energy += factor*(flux_contribution_nb_density_energy.x+flux_contribution_i_density_energy.x);
flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.x+flux_contribution_i_momentum_x.x);
flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.x+flux_contribution_i_momentum_y.x);
flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.x+flux_contribution_i_momentum_z.x);
factor = (float)(0.5f)*normal.y;
flux_i_density += factor*(momentum_nb.y+momentum_i.y);
flux_i_density_energy += factor*(flux_contribution_nb_density_energy.y+flux_contribution_i_density_energy.y);
flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.y+flux_contribution_i_momentum_x.y);
flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.y+flux_contribution_i_momentum_y.y);
flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.y+flux_contribution_i_momentum_z.y);
factor = (float)(0.5f)*normal.z;
flux_i_density += factor*(momentum_nb.z+momentum_i.z);
flux_i_density_energy += factor*(flux_contribution_nb_density_energy.z+flux_contribution_i_density_energy.z);
flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.z+flux_contribution_i_momentum_x.z);
flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.z+flux_contribution_i_momentum_y.z);
flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.z+flux_contribution_i_momentum_z.z);
}
else if(nb == -1) // a wing boundary
{
flux_i_momentum.x += normal.x*pressure_i;
flux_i_momentum.y += normal.y*pressure_i;
flux_i_momentum.z += normal.z*pressure_i;
}
else if(nb == -2) // a far field boundary
{
factor = (float)(0.5f)*normal.x;
flux_i_density += factor*(ff_variable[VAR_MOMENTUM+0]+momentum_i.x);
flux_i_density_energy += factor*(ff_flux_contribution_density_energy[0].x+flux_contribution_i_density_energy.x);
flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x[0].x + flux_contribution_i_momentum_x.x);
flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y[0].x + flux_contribution_i_momentum_y.x);
flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z[0].x + flux_contribution_i_momentum_z.x);
factor = (float)(0.5f)*normal.y;
flux_i_density += factor*(ff_variable[VAR_MOMENTUM+1]+momentum_i.y);
flux_i_density_energy += factor*(ff_flux_contribution_density_energy[0].y+flux_contribution_i_density_energy.y);
flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x[0].y + flux_contribution_i_momentum_x.y);
flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y[0].y + flux_contribution_i_momentum_y.y);
flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z[0].y + flux_contribution_i_momentum_z.y);
factor = (float)(0.5f)*normal.z;
flux_i_density += factor*(ff_variable[VAR_MOMENTUM+2]+momentum_i.z);
flux_i_density_energy += factor*(ff_flux_contribution_density_energy[0].z+flux_contribution_i_density_energy.z);
flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x[0].z + flux_contribution_i_momentum_x.z);
flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y[0].z + flux_contribution_i_momentum_y.z);
flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z[0].z + flux_contribution_i_momentum_z.z);
}
}
fluxes[i + VAR_DENSITY*nelr] = flux_i_density;
fluxes[i + (VAR_MOMENTUM+0)*nelr] = flux_i_momentum.x;
fluxes[i + (VAR_MOMENTUM+1)*nelr] = flux_i_momentum.y;
fluxes[i + (VAR_MOMENTUM+2)*nelr] = flux_i_momentum.z;
fluxes[i + VAR_DENSITY_ENERGY*nelr] = flux_i_density_energy;
}
__kernel void time_step(int j, int nelr,
__global float* old_variables,
__global float* variables,
__global float* step_factors,
__global float* fluxes){
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
const int i = get_global_id(0);
if( i >= nelr) return;
float factor = step_factors[i]/(float)(RK+1-j);
variables[i + VAR_DENSITY*nelr] = old_variables[i + VAR_DENSITY*nelr] + factor*fluxes[i + VAR_DENSITY*nelr];
variables[i + VAR_DENSITY_ENERGY*nelr] = old_variables[i + VAR_DENSITY_ENERGY*nelr] + factor*fluxes[i + VAR_DENSITY_ENERGY*nelr];
variables[i + (VAR_MOMENTUM+0)*nelr] = old_variables[i + (VAR_MOMENTUM+0)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+0)*nelr];
variables[i + (VAR_MOMENTUM+1)*nelr] = old_variables[i + (VAR_MOMENTUM+1)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+1)*nelr];
variables[i + (VAR_MOMENTUM+2)*nelr] = old_variables[i + (VAR_MOMENTUM+2)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+2)*nelr];
}
#endif

View file

@ -0,0 +1,40 @@
#include <stdio.h>
#include "timing.h"
void time_measure_start(struct timeval *tv)
{
gettimeofday(tv, NULL);
}
void time_measure_end(struct timeval *tv)
{
struct timeval tv_now, tv_diff;
double d;
gettimeofday(&tv_now, NULL);
tvsub(&tv_now, tv, &tv_diff);
d = (double) tv_diff.tv_sec * 1000.0 + (double) tv_diff.tv_usec / 1000.0;
printf("Time (Memory Copy and Launch) = %f (ms)\n", d);
}
float probe_event_time(cl_event event, cl_command_queue command_queue) {
cl_int error=0;
cl_ulong eventStart,eventEnd;
clFinish(command_queue);
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),&eventStart,NULL);
if (error != CL_SUCCESS) {
printf("ERROR (%d) in event start profiling.\n", error);
return 0;
}
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
sizeof(cl_ulong),&eventEnd,NULL);
if (error != CL_SUCCESS) {
printf("ERROR (%d) in event end profiling.\n", error);
return 0;
}
return (float)((eventEnd-eventStart)/1000000.0);
}

25
tests/opencl/cfd/timing.h Normal file
View file

@ -0,0 +1,25 @@
#ifndef __TIMING_H__
#define __TIMING_H__
#include <sys/time.h>
#include <CL/cl.h>
void time_measure_start(struct timeval *tv);
void time_measure_end(struct timeval *tv);
/* tvsub: ret = x - y. */
static inline void tvsub(struct timeval *x,
struct timeval *y,
struct timeval *ret)
{
ret->tv_sec = x->tv_sec - y->tv_sec;
ret->tv_usec = x->tv_usec - y->tv_usec;
if (ret->tv_usec < 0) {
ret->tv_sec--;
ret->tv_usec += 1000000;
}
}
float probe_event_time(cl_event, cl_command_queue);
#endif

131
tests/opencl/cfd/util.h Normal file
View file

@ -0,0 +1,131 @@
#ifndef _C_UTIL_
#define _C_UTIL_
#include <math.h>
#include <iostream>
#include <omp.h>
#include <sys/time.h>
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE_0 RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE_0 RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_0 RD_WG_SIZE
#else
#define BLOCK_SIZE_0 192
#endif
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE_1 RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE_1 RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_1 RD_WG_SIZE
#else
#define BLOCK_SIZE_1 192
#endif
#ifdef RD_WG_SIZE_2_0
#define BLOCK_SIZE_2 RD_WG_SIZE_2_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE_2 RD_WG_SIZE_2
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_2 RD_WG_SIZE
#else
#define BLOCK_SIZE_2 192
#endif
#ifdef RD_WG_SIZE_3_0
#define BLOCK_SIZE_3 RD_WG_SIZE_3_0
#elif defined(RD_WG_SIZE_3)
#define BLOCK_SIZE_3 RD_WG_SIZE_3
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_3 RD_WG_SIZE
#else
#define BLOCK_SIZE_3 192
#endif
#ifdef RD_WG_SIZE_4_0
#define BLOCK_SIZE_4 RD_WG_SIZE_4_0
#elif defined(RD_WG_SIZE_4)
#define BLOCK_SIZE_4 RD_WG_SIZE_4
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE_4 RD_WG_SIZE
#else
#define BLOCK_SIZE_4 192
#endif
using std::endl;
double gettime() {
struct timeval t;
gettimeofday(&t,NULL);
return t.tv_sec+t.tv_usec*1e-6;
}
//-------------------------------------------------------------------
//--initialize array with maximum limit
//-------------------------------------------------------------------
template<typename datatype>
void fill(datatype *A, const int n, const datatype maxi){
for (int j = 0; j < n; j++){
A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
}
}
//--print matrix
template<typename datatype>
void print_matrix(datatype *A, int height, int width){
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int idx = i*width + j;
std::cout<<A[idx]<<" ";
}
std::cout<<std::endl;
}
return;
}
//-------------------------------------------------------------------
//--verify results
//-------------------------------------------------------------------
#define MAX_RELATIVE_ERROR .002
template<typename datatype>
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
bool passed = true;
#pragma omp parallel for
for (int i=0; i<size; i++){
if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
passed = false;
}
}
if (passed){
std::cout << "--cambine:passed:-)" << std::endl;
}
else{
std::cout << "--cambine: failed:-(" << std::endl;
}
return ;
}
template<typename datatype>
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
bool passed = true;
//#pragma omp parallel for
for (int i=0; i<size; i++){
if (cpu_results[i]!=gpu_results[i]){
passed = false;
}
}
if (passed){
std::cout << "--cambine:passed:-)" << std::endl;
}
else{
std::cout << "--cambine: failed:-(" << std::endl;
}
return ;
}
#endif

View file

@ -0,0 +1,9 @@
PROJECT = cutcp
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
CXXFLAGS += -I.
OPTS ?=
include ../common.mk

617
tests/opencl/cutcp2/args.c Normal file
View file

@ -0,0 +1,617 @@
#include <parboil.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*****************************************************************************/
/* Memory management routines */
/* Free an array of owned strings. */
void
pb_FreeStringArray(char **string_array)
{
char **p;
if (!string_array) return;
for (p = string_array; *p; p++) free(*p);
free(string_array);
}
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version)
{
if (name == NULL) {
fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
exit(-1);
}
struct pb_PlatformParam *ret =
(struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
ret->name = name;
ret->version = version;
return ret;
}
void
pb_FreePlatformParam(struct pb_PlatformParam *p)
{
if (p == NULL) return;
free(p->name);
free(p->version);
free(p);
}
struct pb_DeviceParam *
pb_DeviceParam_index(int index)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_INDEX;
ret->index = index;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_cpu(void)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_CPU;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_gpu(void)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_GPU;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_ACCELERATOR;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_name(char *name)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_NAME;
ret->name = name;
return ret;
}
void
pb_FreeDeviceParam(struct pb_DeviceParam *p)
{
if (p == NULL) return;
switch(p->criterion) {
case pb_Device_NAME:
free(p->name);
break;
case pb_Device_INDEX:
case pb_Device_CPU:
case pb_Device_ACCELERATOR:
break;
default:
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
exit(-1);
}
}
void
pb_FreeParameters(struct pb_Parameters *p)
{
free(p->outFile);
pb_FreeStringArray(p->inpFiles);
pb_FreePlatformParam(p->platform);
pb_FreeDeviceParam(p->device);
free(p);
}
/*****************************************************************************/
/* Parse a comma-delimited list of strings into an
* array of strings. */
static char **
read_string_array(char *in)
{
char **ret;
int i;
int count; /* Number of items in the input */
char *substring; /* Current substring within 'in' */
/* Count the number of items in the string */
count = 1;
for (i = 0; in[i]; i++) if (in[i] == ',') count++;
/* Allocate storage */
ret = (char **)malloc((count + 1) * sizeof(char *));
/* Create copies of the strings from the list */
substring = in;
for (i = 0; i < count; i++) {
char *substring_end;
int substring_length;
/* Find length of substring */
for (substring_end = substring;
(*substring_end != ',') && (*substring_end != 0);
substring_end++);
substring_length = substring_end - substring;
/* Allocate memory and copy the substring */
ret[i] = (char *)malloc(substring_length + 1);
memcpy(ret[i], substring, substring_length);
ret[i][substring_length] = 0;
/* go to next substring */
substring = substring_end + 1;
}
ret[i] = NULL; /* Write the sentinel value */
return ret;
}
static void
report_parse_error(const char *str)
{
fputs(str, stderr);
}
/* Interpret a string as a 'pb_DeviceParam' value.
* Return a pointer to a new value, or NULL on failure.
*/
static struct pb_DeviceParam *
read_device_param(char *str)
{
/* Try different ways of interpreting 'device_string' until one works */
/* If argument is an integer, then interpret it as a device index */
errno = 0;
char *end;
long device_int = strtol(str, &end, 10);
if (!errno) {
/* Negative numbers are not valid */
if (device_int < 0 || device_int > INT_MAX) return NULL;
return pb_DeviceParam_index(device_int);
}
/* Match against predefined strings */
if (strcmp(str, "CPU") == 0)
return pb_DeviceParam_cpu();
if (strcmp(str, "GPU") == 0)
return pb_DeviceParam_gpu();
if (strcmp(str, "ACCELERATOR") == 0)
return pb_DeviceParam_accelerator();
/* Assume any other string is a device name */
return pb_DeviceParam_name(strdup(str));
}
/* Interpret a string as a 'pb_PlatformParam' value.
* Return a pointer to a new value, or NULL on failure.
*/
static struct pb_PlatformParam *
read_platform_param(char *str)
{
int separator_index; /* Index of the '-' character separating
* name and version number. It's -1 if
* there's no '-' character. */
/* Find the last occurrence of '-' in 'str' */
{
char *cur;
separator_index = -1;
for (cur = str; *cur; cur++) {
if (*cur == '-') separator_index = cur - str;
}
}
/* The platform name is either the entire string, or all characters before
* the separator */
int name_length = separator_index == -1 ? strlen(str) : separator_index;
char *name_str = (char *)malloc(name_length + 1);
memcpy(name_str, str, name_length);
name_str[name_length] = 0;
/* The version is either NULL, or all characters after the separator */
char *version_str;
if (separator_index == -1) {
version_str = NULL;
}
else {
const char *version_input_str = str + separator_index + 1;
int version_length = strlen(version_input_str);
version_str = (char *)malloc(version_length + 1);
memcpy(version_str, version_input_str, version_length);
version_str[version_length] = 0;
}
/* Create output structure */
return pb_PlatformParam(name_str, version_str);
}
/****************************************************************************/
/* Argument parsing state */
/* Argument parsing state.
*
* Arguments that are interpreted by the argument parser are removed from
* the list. Variables 'argc' and 'argn' do not count arguments that have
* been removed.
*
* During argument parsing, the array of arguments is compacted, overwriting
* the erased arguments. Variable 'argv_put' points to the array element
* where the next argument will be written. Variable 'argv_get' points to
* the array element where the next argument will be read from.
*/
struct argparse {
int argc; /* Number of arguments. Mutable. */
int argn; /* Current argument index. */
char **argv_get; /* Argument value being read. */
char **argv_put; /* Argument value being written.
* argv_put <= argv_get. */
};
static void
initialize_argparse(struct argparse *ap, int argc, char **argv)
{
ap->argc = argc;
ap->argn = 0;
ap->argv_get = ap->argv_put = argv;
}
/* Finish argument parsing, without processing the remaining arguments.
* Write new argument count into _argc. */
static void
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
{
/* Move the remaining arguments */
for(; ap->argn < ap->argc; ap->argn++)
*ap->argv_put++ = *ap->argv_get++;
/* Update the argument count */
*_argc = ap->argc;
/* Insert a terminating NULL */
argv[ap->argc] = NULL;
}
/* Delete the current argument. The argument will not be visible
* when argument parsing is done. */
static void
delete_argument(struct argparse *ap)
{
if (ap->argn >= ap->argc) {
fprintf(stderr, "delete_argument\n");
}
ap->argc--;
ap->argv_get++;
}
/* Go to the next argument. Also, move the current argument to its
* final location in argv. */
static void
next_argument(struct argparse *ap)
{
if (ap->argn >= ap->argc) {
fprintf(stderr, "next_argument\n");
}
/* Move argument to its new location. */
*ap->argv_put++ = *ap->argv_get++;
ap->argn++;
}
static int
is_end_of_arguments(struct argparse *ap)
{
return ap->argn == ap->argc;
}
/* Get the current argument */
static char *
get_argument(struct argparse *ap)
{
return *ap->argv_get;
}
/* Get the current argument, and also delete it */
static char *
consume_argument(struct argparse *ap)
{
char *ret = get_argument(ap);
delete_argument(ap);
return ret;
}
/****************************************************************************/
/* The result of parsing a command-line argument */
typedef enum {
ARGPARSE_OK, /* Success */
ARGPARSE_ERROR, /* Error */
ARGPARSE_DONE /* Success, and do not continue parsing */
} result;
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
/* A command-line option */
struct option {
char short_name; /* If not 0, the one-character
* name of this option */
const char *long_name; /* If not NULL, the long name of this option */
parse_action *action; /* What to do when this option occurs.
* Sentinel value is NULL.
*/
};
/* Output file
*
* -o FILE
*/
static result
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
{
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting file name after '-o'\n");
return ARGPARSE_ERROR;
}
/* Replace the output file name */
free(params->outFile);
params->outFile = strdup(consume_argument(ap));
return ARGPARSE_OK;
}
/* Input files
*
* -i FILE,FILE,...
*/
static result
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
{
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting file name after '-i'\n");
return ARGPARSE_ERROR;
}
/* Replace the input file list */
pb_FreeStringArray(params->inpFiles);
params->inpFiles = read_string_array(consume_argument(ap));
return ARGPARSE_OK;
}
/* End of options
*
* --
*/
static result
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
{
return ARGPARSE_DONE;
}
/* OpenCL device
*
* --device X
*/
static result
parse_device(struct argparse *ap, struct pb_Parameters *params)
{
/* Read the next argument, which specifies a device */
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting device specification after '--device'\n");
return ARGPARSE_ERROR;
}
char *device_string = consume_argument(ap);
struct pb_DeviceParam *device_param = read_device_param(device_string);
if (!device_param) {
report_parse_error("Unrecognized device specification format on command line\n");
return ARGPARSE_ERROR;
}
/* Save the result */
pb_FreeDeviceParam(params->device);
params->device = device_param;
return ARGPARSE_OK;
}
static result
parse_platform(struct argparse *ap, struct pb_Parameters *params)
{
/* Read the next argument, which specifies a platform */
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting device specification after '--platform'\n");
return ARGPARSE_ERROR;
}
char *platform_string = consume_argument(ap);
struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
if (!platform_param) {
report_parse_error("Unrecognized platform specification format on command line\n");
return ARGPARSE_ERROR;
}
/* Save the result */
pb_FreePlatformParam(params->platform);
params->platform = platform_param;
return ARGPARSE_OK;
}
static struct option options[] = {
{ 'o', NULL, &parse_output_file },
{ 'i', NULL, &parse_input_files },
{ '-', NULL, &parse_end_options },
{ 0, "device", &parse_device },
{ 0, "platform", &parse_platform },
{ 0, NULL, NULL }
};
static int
is_last_option(struct option *op)
{
return op->action == NULL;
}
/****************************************************************************/
/* Parse command-line parameters.
* Return zero on error, nonzero otherwise.
* On error, the other outputs may be invalid.
*
* The information collected from parameters is used to update
* 'ret'. 'ret' should be initialized.
*
* '_argc' and 'argv' are updated to contain only the unprocessed arguments.
*/
static int
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
{
char *err_message;
struct argparse ap;
/* Each argument */
initialize_argparse(&ap, *_argc, argv);
while(!is_end_of_arguments(&ap)) {
result arg_result; /* Result of parsing this option */
char *arg = get_argument(&ap);
/* Process this argument */
if (arg[0] == '-') {
/* Single-character flag */
if ((arg[1] != 0) && (arg[2] == 0)) {
delete_argument(&ap); /* This argument is consumed here */
/* Find a matching short option */
struct option *op;
for (op = options; !is_last_option(op); op++) {
if (op->short_name == arg[1]) {
arg_result = (*op->action)(&ap, ret);
goto option_was_processed;
}
}
/* No option matches */
report_parse_error("Unexpected command-line parameter\n");
arg_result = ARGPARSE_ERROR;
goto option_was_processed;
}
/* Long flag */
if (arg[1] == '-') {
delete_argument(&ap); /* This argument is consumed here */
/* Find a matching long option */
struct option *op;
for (op = options; !is_last_option(op); op++) {
if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
arg_result = (*op->action)(&ap, ret);
goto option_was_processed;
}
}
/* No option matches */
report_parse_error("Unexpected command-line parameter\n");
arg_result = ARGPARSE_ERROR;
goto option_was_processed;
}
}
else {
/* Other arguments are ignored */
next_argument(&ap);
arg_result = ARGPARSE_OK;
goto option_was_processed;
}
option_was_processed:
/* Decide what to do next based on 'arg_result' */
switch(arg_result) {
case ARGPARSE_OK:
/* Continue processing */
break;
case ARGPARSE_ERROR:
/* Error exit from the function */
return 0;
case ARGPARSE_DONE:
/* Normal exit from the argument parsing loop */
goto end_of_options;
}
} /* end for each argument */
/* If all arguments were processed, then normal exit from the loop */
end_of_options:
finalize_argparse(&ap, _argc, argv);
return 1;
}
/*****************************************************************************/
/* Other exported functions */
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv)
{
struct pb_Parameters *ret =
(struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
/* Initialize the parameters structure */
ret->outFile = NULL;
ret->inpFiles = (char **)malloc(sizeof(char *));
ret->inpFiles[0] = NULL;
ret->platform = NULL;
ret->device = NULL;
/* Read parameters and update _argc, argv */
if (!pb_ParseParameters(ret, _argc, argv)) {
/* Parse error */
pb_FreeParameters(ret);
return NULL;
}
return ret;
}
int
pb_Parameters_CountInputs(struct pb_Parameters *p)
{
int n;
for (n = 0; p->inpFiles[n]; n++);
return n;
}

View file

@ -0,0 +1,37 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef ATOM_H
#define ATOM_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct Atom_t {
float x, y, z, q;
} Atom;
typedef struct Atoms_t {
Atom *atoms;
int size;
} Atoms;
typedef struct Vec3_t {
float x, y, z;
} Vec3;
Atoms *read_atom_file(const char *fname);
void free_atom(Atoms *atom);
void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
#ifdef __cplusplus
}
#endif
#endif /* ATOM_H */

BIN
tests/opencl/cutcp2/cutcp Executable file

Binary file not shown.

View file

@ -0,0 +1,195 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#undef DEBUG_PASS_RATE
#define CHECK_CYLINDER_CPU
#define CELLEN 4.f
#define INV_CELLEN (1.f/CELLEN)
extern int cpu_compute_cutoff_potential_lattice(
Lattice *lattice, /* the lattice */
float cutoff, /* cutoff distance */
Atoms *atoms /* array of atoms */
)
{
int nx = lattice->dim.nx;
int ny = lattice->dim.ny;
int nz = lattice->dim.nz;
float xlo = lattice->dim.lo.x;
float ylo = lattice->dim.lo.y;
float zlo = lattice->dim.lo.z;
float gridspacing = lattice->dim.h;
int natoms = atoms->size;
Atom *atom = atoms->atoms;
const float a2 = cutoff * cutoff;
const float inv_a2 = 1.f / a2;
float s;
const float inv_gridspacing = 1.f / gridspacing;
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
/* lattice point radius about each atom */
int n;
int i, j, k;
int ia, ib, ic;
int ja, jb, jc;
int ka, kb, kc;
int index;
int koff, jkoff;
float x, y, z, q;
float dx, dy, dz;
float dz2, dydz2, r2;
float e;
float xstart, ystart;
float *pg;
int gindex;
int ncell, nxcell, nycell, nzcell;
int *first, *next;
float inv_cellen = INV_CELLEN;
Vec3 minext, maxext; /* Extent of atom bounding box */
float xmin, ymin, zmin;
float xmax, ymax, zmax;
#if DEBUG_PASS_RATE
unsigned long long pass_count = 0;
unsigned long long fail_count = 0;
#endif
/* find min and max extent */
get_atom_extent(&minext, &maxext, atoms);
/* number of cells in each dimension */
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
ncell = nxcell * nycell * nzcell;
/* allocate for cursor link list implementation */
first = (int *) malloc(ncell * sizeof(int));
for (gindex = 0; gindex < ncell; gindex++) {
first[gindex] = -1;
}
next = (int *) malloc(natoms * sizeof(int));
for (n = 0; n < natoms; n++) {
next[n] = -1;
}
/* geometric hashing */
for (n = 0; n < natoms; n++) {
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
gindex = (k*nycell + j)*nxcell + i;
next[n] = first[gindex];
first[gindex] = n;
}
/* traverse the grid cells */
for (gindex = 0; gindex < ncell; gindex++) {
for (n = first[gindex]; n != -1; n = next[n]) {
x = atom[n].x - xlo;
y = atom[n].y - ylo;
z = atom[n].z - zlo;
q = atom[n].q;
/* find closest grid point with position less than or equal to atom */
ic = (int) (x * inv_gridspacing);
jc = (int) (y * inv_gridspacing);
kc = (int) (z * inv_gridspacing);
/* find extent of surrounding box of grid points */
ia = ic - radius;
ib = ic + radius + 1;
ja = jc - radius;
jb = jc + radius + 1;
ka = kc - radius;
kb = kc + radius + 1;
/* trim box edges so that they are within grid point lattice */
if (ia < 0) ia = 0;
if (ib >= nx) ib = nx-1;
if (ja < 0) ja = 0;
if (jb >= ny) jb = ny-1;
if (ka < 0) ka = 0;
if (kb >= nz) kb = nz-1;
/* loop over surrounding grid points */
xstart = ia*gridspacing - x;
ystart = ja*gridspacing - y;
dz = ka*gridspacing - z;
for (k = ka; k <= kb; k++, dz += gridspacing) {
koff = k*ny;
dz2 = dz*dz;
dy = ystart;
for (j = ja; j <= jb; j++, dy += gridspacing) {
jkoff = (koff + j)*nx;
dydz2 = dy*dy + dz2;
#ifdef CHECK_CYLINDER_CPU
if (dydz2 >= a2) continue;
#endif
dx = xstart;
index = jkoff + ia;
pg = lattice->lattice + index;
#if defined(__INTEL_COMPILER)
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
r2 = dx*dx + dydz2;
s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
e = q * (1/sqrtf(r2)) * s;
*pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
}
#else
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
r2 = dx*dx + dydz2;
if (r2 >= a2)
{
#ifdef DEBUG_PASS_RATE
fail_count++;
#endif
continue;
}
#ifdef DEBUG_PASS_RATE
pass_count++;
#endif
s = (1.f - r2 * inv_a2);
e = q * (1/sqrtf(r2)) * s * s;
*pg += e;
}
#endif
}
} /* end loop over surrounding grid points */
} /* end loop over atoms in a gridcell */
} /* end loop over gridcells */
/* free memory */
free(next);
free(first);
/* For debugging: print the number of times that the test passed/failed */
#ifdef DEBUG_PASS_RATE
printf ("Pass :%lld\n", pass_count);
printf ("Fail :%lld\n", fail_count);
#endif
return 0;
}

View file

@ -0,0 +1,508 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#include "macros.h"
#include "ocl.h"
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (NULL == filename || NULL == data || 0 == size)
return CL_INVALID_VALUE;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return CL_INVALID_VALUE;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return CL_SUCCESS;
}
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
typedef cl_int4 xyz;
//extern "C" int gpu_compute_cutoff_potential_lattice(
int gpu_compute_cutoff_potential_lattice(
struct pb_TimerSet *timers,
Lattice *lattice, /* the lattice */
float cutoff, /* cutoff distance */
Atoms *atoms, /* array of atoms */
int verbose, /* print info/debug messages */
struct pb_Parameters *parameters
)
{
int nx = lattice->dim.nx;
int ny = lattice->dim.ny;
int nz = lattice->dim.nz;
float xlo = lattice->dim.lo.x;
float ylo = lattice->dim.lo.y;
float zlo = lattice->dim.lo.z;
float h = lattice->dim.h;
int natoms = atoms->size;
Atom *atom = atoms->atoms;
xyz nbrlist[NBRLIST_MAXLEN];
int nbrlistlen = 0;
int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */
int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array element */
int num_excluded = 0;
int xRegionDim, yRegionDim, zRegionDim;
int xRegionIndex, yRegionIndex, zRegionIndex;
int xOffset, yOffset, zOffset;
int lnx, lny, lnz, lnall;
float *regionZeroAddr, *thisRegion;
cl_mem regionZeroCl;
int index, indexRegion;
int c;
xyz binDim;
int nbins;
cl_float4 *binBaseAddr, *binZeroAddr;
cl_mem binBaseCl, binZeroCl;
int *bincntBaseAddr, *bincntZeroAddr;
Atoms *extra = NULL;
cl_mem NbrListLen;
cl_mem NbrList;
int i, j, k, n;
int sum, total;
float avgFillFull, avgFillCover;
const float cutoff2 = cutoff * cutoff;
const float inv_cutoff2 = 1.f / cutoff2;
size_t gridDim[3], blockDim[3];
// The "compute" timer should be active upon entry to this function
/* pad lattice to be factor of 8 in each dimension */
xRegionDim = (int) ceilf(nx/8.f);
yRegionDim = (int) ceilf(ny/8.f);
zRegionDim = (int) ceilf(nz/8.f);
lnx = 8 * xRegionDim;
lny = 8 * yRegionDim;
lnz = 8 * zRegionDim;
lnall = lnx * lny * lnz;
/* will receive energies from OpenCL */
regionZeroAddr = (float *) malloc(lnall * sizeof(float));
/* create bins */
c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
nbins = binDim.x * binDim.y * binDim.z;
binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
/* create neighbor list */
if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
float s = sqrtf(3);
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
int cnt = 0;
/* develop neighbor list around 1 cell */
if (2*c + 1 > NBRLIST_DIM) {
fprintf(stderr, "must have cutoff <= %f\n",
(NBRLIST_DIM-1)/2 * BIN_LENGTH);
return -1;
}
for (k = -c; k <= c; k++) {
for (j = -c; j <= c; j++) {
for (i = -c; i <= c; i++) {
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
nbrlist[cnt].x = i;
nbrlist[cnt].y = j;
nbrlist[cnt].z = k;
cnt++;
}
}
}
nbrlistlen = cnt;
}
else if (8*h <= 2*BIN_LENGTH) {
float s = 2.f*sqrtf(3);
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
int cnt = 0;
/* develop neighbor list around 3-cube of cells */
if (2*c + 3 > NBRLIST_DIM) {
fprintf(stderr, "must have cutoff <= %f\n",
(NBRLIST_DIM-3)/2 * BIN_LENGTH);
return -1;
}
for (k = -c; k <= c; k++) {
for (j = -c; j <= c; j++) {
for (i = -c; i <= c; i++) {
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
nbrlist[cnt].x = i;
nbrlist[cnt].y = j;
nbrlist[cnt].z = k;
cnt++;
}
}
}
nbrlistlen = cnt;
}
else {
fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
return -1;
}
/* perform geometric hashing of atoms into bins */
{
/* array of extra atoms, permit average of one extra per bin */
Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
int extra_len = 0;
for (n = 0; n < natoms; n++) {
cl_float4 p;
p.x = atom[n].x - xlo;
p.y = atom[n].y - ylo;
p.z = atom[n].z - zlo;
p.w = atom[n].q;
i = (int) floorf(p.x * BIN_INVLEN);
j = (int) floorf(p.y * BIN_INVLEN);
k = (int) floorf(p.z * BIN_INVLEN);
if (i >= -c && i < binDim.x - c &&
j >= -c && j < binDim.y - c &&
k >= -c && k < binDim.z - c &&
atom[n].q != 0) {
int index = (k * binDim.y + j) * binDim.x + i;
cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
int bindex = bincntZeroAddr[index];
if (bindex < BIN_DEPTH) {
/* copy atom into bin and increase counter for this bin */
bin[bindex] = p;
bincntZeroAddr[index]++;
}
else {
/* add index to array of extra atoms to be computed with CPU */
if (extra_len >= nbins) {
fprintf(stderr, "exceeded space for storing extra atoms\n");
return -1;
}
extra_atoms[extra_len] = atom[n];
extra_len++;
}
}
else {
/* excluded atoms are either outside bins or neutrally charged */
num_excluded++;
}
}
/* Save result */
extra = (Atoms *)malloc(sizeof(Atoms));
extra->atoms = extra_atoms;
extra->size = extra_len;
}
/* bin stats */
sum = total = 0;
for (n = 0; n < nbins; n++) {
binHistoFull[ bincntBaseAddr[n] ]++;
sum += bincntBaseAddr[n];
total += BIN_DEPTH;
}
avgFillFull = sum / (float) total;
sum = total = 0;
for (k = 0; k < binDim.z - 2*c; k++) {
for (j = 0; j < binDim.y - 2*c; j++) {
for (i = 0; i < binDim.x - 2*c; i++) {
int index = (k * binDim.y + j) * binDim.x + i;
binHistoCover[ bincntZeroAddr[index] ]++;
sum += bincntZeroAddr[index];
total += BIN_DEPTH;
}
}
}
avgFillCover = sum / (float) total;
if (verbose) {
/* report */
printf("number of atoms = %d\n", natoms);
printf("lattice spacing = %g\n", h);
printf("cutoff distance = %g\n", cutoff);
printf("\n");
printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
printf("number of bytes for lattice data = %u\n", (unsigned int) (lnall*sizeof(float)));
printf("\n");
printf("bin padding thickness = %d\n", c);
printf("bin cover dimensions = %d %d %d\n",
binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
printf("number of bins = %d\n", nbins);
printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
printf("%% overhead space = %g\n",
(natoms / (double) (nbins * BIN_DEPTH)) * 100);
printf("number of bytes for bin data = %u\n",
(unsigned int)(nbins * BIN_DEPTH * sizeof(cl_float4)));
printf("\n");
printf("bin histogram with padding:\n");
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]);
sum += binHistoFull[n];
}
printf(" total number of bins: %d\n", sum);
printf(" %% average fill: %g\n", avgFillFull * 100);
printf("\n");
printf("bin histogram excluding padding:\n");
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]);
sum += binHistoCover[n];
}
printf(" total number of bins: %d\n", sum);
printf(" %% average fill: %g\n", avgFillCover * 100);
printf("\n");
printf("number of extra atoms = %d\n", extra->size);
printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
printf("\n");
/* sanity check on bins */
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
sum += n * binHistoFull[n];
}
sum += extra->size + num_excluded;
printf("sanity check on bin histogram with edges: "
"sum + others = %d\n", sum);
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
sum += n * binHistoCover[n];
}
sum += extra->size + num_excluded;
printf("sanity check on bin histogram excluding edges: "
"sum + others = %d\n", sum);
printf("\n");
/* neighbor list */
printf("neighbor list length = %d\n", nbrlistlen);
printf("\n");
}
pb_Context* pb_context;
pb_context = pb_InitOpenCLContext(parameters);
if (pb_context == NULL) {
fprintf (stderr, "Error: No OpenCL platform/device can be found.");
return -1;
}
cl_int clStatus;
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
cl_context clContext = (cl_context) pb_context->clContext;
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
CHECK_ERROR("clCreateCommandQueue")
pb_SetOpenCL(&clContext, &clCommandQueue);
//const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
cl_program clProgram = clCreateProgramWithBinary(
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
char clOptions[50];
sprintf(clOptions,"-I src/opencl_base"); //-cl-nv-verbose
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
if (clStatus != CL_SUCCESS) {
size_t string_size = 0;
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
0, NULL, &string_size);
char* string = (char*)malloc(string_size*sizeof(char));
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
string_size, string, NULL);
puts(string);
}
CHECK_ERROR("clBuildProgram")
cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
CHECK_ERROR("clCreateKernel")
/* setup OpenCL kernel parameters */
blockDim[0] = 8;
blockDim[1] = 8;
blockDim[2] = 2;
gridDim[0] = 4 * xRegionDim * blockDim[0];
gridDim[1] = yRegionDim * blockDim[1];
gridDim[2] = 1 * blockDim[2];
/* allocate and initialize memory on OpenCL device */
pb_SwitchToTimer(timers, pb_TimerID_COPY);
if (verbose) {
printf("Allocating %.2fMB on OpenCL device for potentials\n",
lnall * sizeof(float) / (double) (1024*1024));
}
regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
// clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));
if (verbose) {
printf("Allocating %.2fMB on OpenCL device for atom bins\n",
nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
}
binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
//Sub buffers are not supported in OpenCL v1.0
int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
if (verbose)
printf("\n");
clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl);
clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
CHECK_ERROR("clSetKernelArg")
/* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
printf(" computing plane %d\r", zRegionIndex);
fflush(stdout);
clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
CHECK_ERROR("clSetKernelArg")
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
CHECK_ERROR("clEnqueueNDRangeKernel")
clStatus = clFinish(clCommandQueue);
CHECK_ERROR("clFinish")
}
printf("Finished OpenCL kernel calls\n");
/* copy result regions from OpenCL device */
pb_SwitchToTimer(timers, pb_TimerID_COPY);
clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
CHECK_ERROR("clEnqueueReadBuffer")
/* free OpenCL memory allocations */
clStatus = clReleaseMemObject(regionZeroCl);
clStatus = clReleaseMemObject(binBaseCl);
clStatus = clReleaseMemObject(NbrListLen);
clStatus = clReleaseMemObject(NbrList);
CHECK_ERROR("clReleaseMemObject")
clStatus = clReleaseKernel(clKernel);
clStatus = clReleaseProgram(clProgram);
clStatus = clReleaseCommandQueue(clCommandQueue);
clStatus = clReleaseContext(clContext);
//free((void*)clSource[0]);
/* transpose regions back into lattice */
pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
for (k = 0; k < nz; k++) {
zRegionIndex = (k >> 3);
zOffset = (k & 7);
for (j = 0; j < ny; j++) {
yRegionIndex = (j >> 3);
yOffset = (j & 7);
for (i = 0; i < nx; i++) {
xRegionIndex = (i >> 3);
xOffset = (i & 7);
thisRegion = regionZeroAddr
+ ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
+ xRegionIndex) * REGION_SIZE;
indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
index = (k * ny + j) * nx + i;
lattice->lattice[index] = thisRegion[indexRegion];
}
}
}
/* handle extra atoms */
if (extra->size > 0) {
printf("computing extra atoms on CPU\n");
if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
"for extra atoms\n");
return -1;
}
printf("\n");
}
/* cleanup memory allocations */
free(regionZeroAddr);
free(binBaseAddr);
free(bincntBaseAddr);
free_atom(extra);
return 0;
}

View file

@ -0,0 +1,72 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef CUTOFF_H
#define CUTOFF_H
#ifdef __cplusplus
extern "C" {
#endif
#define SHIFTED
/* A structure to record how points in 3D space map to array
elements. Array element (z, y, x)
where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
*/
typedef struct LatticeDim_t {
/* Number of lattice points in x, y, z dimensions */
int nx, ny, nz;
/* Lowest corner of lattice */
Vec3 lo;
/* Lattice spacing */
float h;
} LatticeDim;
/* An electric potential field sampled on a regular grid. The
lattice size and grid point positions are specified by 'dim'.
*/
typedef struct Lattice_t {
LatticeDim dim;
float *lattice;
} Lattice;
LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
Lattice *create_lattice(LatticeDim dim);
void destroy_lattice(Lattice *);
int gpu_compute_cutoff_potential_lattice(
struct pb_TimerSet *timers,
Lattice *lattice,
float cutoff, /* cutoff distance */
Atoms *atom, /* array of atoms */
int verbose, /* print info/debug messages */
struct pb_Parameters *parameters
);
int cpu_compute_cutoff_potential_lattice(
Lattice *lattice, /* the lattice */
float cutoff, /* cutoff distance */
Atoms *atoms /* array of atoms */
);
int remove_exclusions(
Lattice *lattice, /* the lattice */
float exclcutoff, /* exclusion cutoff distance */
Atoms *atom /* array of atoms */
);
#ifdef __cplusplus
}
#endif
#endif /* CUTOFF_H */

157
tests/opencl/cutcp2/excl.c Normal file
View file

@ -0,0 +1,157 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#define CELLEN 4.f
#define INV_CELLEN (1.f/CELLEN)
extern int remove_exclusions(
Lattice *lattice, /* the lattice */
float cutoff, /* exclusion cutoff distance */
Atoms *atoms /* array of atoms */
)
{
int nx = lattice->dim.nx;
int ny = lattice->dim.ny;
int nz = lattice->dim.nz;
float xlo = lattice->dim.lo.x;
float ylo = lattice->dim.lo.y;
float zlo = lattice->dim.lo.z;
float gridspacing = lattice->dim.h;
Atom *atom = atoms->atoms;
const float a2 = cutoff * cutoff;
const float inv_gridspacing = 1.f / gridspacing;
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
/* lattice point radius about each atom */
int n;
int i, j, k;
int ia, ib, ic;
int ja, jb, jc;
int ka, kb, kc;
int index;
int koff, jkoff;
float x, y, z, q;
float dx, dy, dz;
float dz2, dydz2, r2;
float e;
float xstart, ystart;
float *pg;
int gindex;
int ncell, nxcell, nycell, nzcell;
int *first, *next;
float inv_cellen = INV_CELLEN;
Vec3 minext, maxext;
/* find min and max extent */
get_atom_extent(&minext, &maxext, atoms);
/* number of cells in each dimension */
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
ncell = nxcell * nycell * nzcell;
/* allocate for cursor link list implementation */
first = (int *) malloc(ncell * sizeof(int));
for (gindex = 0; gindex < ncell; gindex++) {
first[gindex] = -1;
}
next = (int *) malloc(atoms->size * sizeof(int));
for (n = 0; n < atoms->size; n++) {
next[n] = -1;
}
/* geometric hashing */
for (n = 0; n < atoms->size; n++) {
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
gindex = (k*nycell + j)*nxcell + i;
next[n] = first[gindex];
first[gindex] = n;
}
/* traverse the grid cells */
for (gindex = 0; gindex < ncell; gindex++) {
for (n = first[gindex]; n != -1; n = next[n]) {
x = atom[n].x - xlo;
y = atom[n].y - ylo;
z = atom[n].z - zlo;
q = atom[n].q;
/* find closest grid point with position less than or equal to atom */
ic = (int) (x * inv_gridspacing);
jc = (int) (y * inv_gridspacing);
kc = (int) (z * inv_gridspacing);
/* find extent of surrounding box of grid points */
ia = ic - radius;
ib = ic + radius + 1;
ja = jc - radius;
jb = jc + radius + 1;
ka = kc - radius;
kb = kc + radius + 1;
/* trim box edges so that they are within grid point lattice */
if (ia < 0) ia = 0;
if (ib >= nx) ib = nx-1;
if (ja < 0) ja = 0;
if (jb >= ny) jb = ny-1;
if (ka < 0) ka = 0;
if (kb >= nz) kb = nz-1;
/* loop over surrounding grid points */
xstart = ia*gridspacing - x;
ystart = ja*gridspacing - y;
dz = ka*gridspacing - z;
for (k = ka; k <= kb; k++, dz += gridspacing) {
koff = k*ny;
dz2 = dz*dz;
dy = ystart;
for (j = ja; j <= jb; j++, dy += gridspacing) {
jkoff = (koff + j)*nx;
dydz2 = dy*dy + dz2;
dx = xstart;
index = jkoff + ia;
pg = lattice->lattice + index;
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
r2 = dx*dx + dydz2;
/* If atom and lattice point are too close, set the lattice value
* to zero */
if (r2 < a2) *pg = 0;
}
}
} /* end loop over surrounding grid points */
} /* end loop over atoms in a gridcell */
} /* end loop over gridcells */
/* free memory */
free(next);
free(first);
return 0;
}

View file

@ -0,0 +1,55 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
//#include <endian.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <inttypes.h>
#include "gpu_info.h"
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
int pad,
int major,
int minor,
int sm)
{
int max_thread;
int max_block=8;
if(major==1)
{
if(minor>=2)
max_thread=1024;
else
max_thread=768;
}
else if(major==2)
max_thread=1536;
else
//newer GPU //keep using 2.0
max_thread=1536;
int _grid;
int _thread;
if(task*pad>sm*max_thread)
{
_thread=max_thread/max_block;
_grid = ((task*pad+_thread-1)/_thread)*_thread;
}
else
{
_thread=pad;
_grid=task*pad;
}
thread[0]=_thread;
grid[0]=_grid;
}

View file

@ -0,0 +1,28 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef __GPUINFOH__
#define __GPUINFOH__
#ifdef __cplusplus
extern "C" {
#endif
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
int pad,
int major,
int minor,
int sm);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,104 @@
/*
* potential lattice is decomposed into size 8^3 lattice point "regions"
*
* THIS IMPLEMENTATION: one thread per lattice point
* thread block size 128 gives 4 thread blocks per region
* kernel is invoked for each x-y plane of regions,
* where gridDim.x is 4*(x region dimension) so that blockIdx.x
* can absorb the z sub-region index in its 2 lowest order bits
*
* Regions are stored contiguously in memory in row-major order
*
* The bins have to not only cover the region, but they need to surround
* the outer edges so that region sides and corners can still use
* neighbor list stencil. The binZeroAddr is actually a shifted pointer into
* the bin array (binZeroAddr = binBaseAddr + (c*binDim_y + c)*binDim_x + c)
* where c = ceil(cutoff / binsize). This allows for negative offsets to
* be added to myBinIndex.
*
* The (0,0,0) spatial origin corresponds to lower left corner of both
* regionZeroAddr and binZeroAddr. The atom coordinates are translated
* during binning to enforce this assumption.
*/
#include "macros.h"
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
typedef int4 xyz;
__kernel void opencl_cutoff_potential_lattice(
int binDim_x,
int binDim_y,
__global float4 *binBaseAddr,
int offset,
float h, /* lattice spacing */
float cutoff2, /* square of cutoff distance */
float inv_cutoff2,
__global float *regionZeroAddr, /* address of lattice regions starting at origin */
int zRegionIndex,
__constant int *NbrListLen,
__constant xyz *NbrList
)
{
__global float4* binZeroAddr = binBaseAddr + offset;
__global float *myRegionAddr;
int Bx, By, Bz;
/* thread id */
const int tid = (get_local_id(2)*get_local_size(1) +
get_local_id(1))*get_local_size(0) + get_local_id(0);
/* this is the start of the sub-region indexed by tid */
myRegionAddr = regionZeroAddr + ((zRegionIndex*get_num_groups(1)
+ get_group_id(1))*(get_num_groups(0)>>2) + (get_group_id(0)>>2))*REGION_SIZE
+ (get_group_id(0)&3)*SUB_REGION_SIZE;
/* spatial coordinate of this lattice point */
float x = (8 * (get_group_id(0) >> 2) + get_local_id(0)) * h;
float y = (8 * get_group_id(1) + get_local_id(1)) * h;
float z = (8 * zRegionIndex + 2*(get_group_id(0)&3) + get_local_id(2)) * h;
float dx;
float dy;
float dz;
float r2;
float s;
int totalbins = 0;
/* bin number determined by center of region */
Bx = (int) floor((8 * (get_group_id(0) >> 2) + 4) * h * BIN_INVLEN);
By = (int) floor((8 * get_group_id(1) + 4) * h * BIN_INVLEN);
Bz = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);
float energy = 0.f;
int bincnt;
for (bincnt = 0; bincnt < *NbrListLen; bincnt++) {
int i = Bx + NbrList[bincnt].x;
int j = By + NbrList[bincnt].y;
int k = Bz + NbrList[bincnt].z;
__global float4* p_global = binZeroAddr +
(((k*binDim_y + j)*binDim_x + i) * BIN_DEPTH);
int m;
for (m = 0; m < BIN_DEPTH; m++) {
float aq = p_global[m].w;
if (0.f != aq) {
dx = p_global[m].x - x;
dy = p_global[m].y - y;
dz = p_global[m].z - z;
r2 = dx*dx + dy*dy + dz*dz;
if (r2 < cutoff2) {
s = (1.f - r2 * inv_cutoff2);
energy += aq * rsqrt(r2) * s * s;
}
}
} /* end loop over atoms in bin */
} /* end loop over neighbor list */
/* store into global memory */
myRegionAddr[tid+0] = energy;
}

View file

@ -0,0 +1,69 @@
#ifndef __MACROSH__
#define __MACROSH__
#ifdef __DEVICE_EMULATION__
#define DEBUG
/* define which grid block and which thread to examine */
#define BX 0
#define BY 0
#define TX 0
#define TY 0
#define TZ 0
#define EMU(code) do { \
if (blockIdx.x==BX && blockIdx.y==BY && \
threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
code; \
} \
} while (0)
#define INT(n) printf("%s = %d\n", #n, n)
#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
(double)(f).y, (double)(f).z, (double)(f).w)
#else
#define EMU(code)
#define INT(n)
#define FLOAT(f)
#define INT3(n)
#define FLOAT4(f)
#endif
/* report error from OpenCL */
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
/*
* neighbor list:
* stored in constant memory as table of offsets
* flat index addressing is computed by kernel
*
* reserve enough memory for 11^3 stencil of grid cells
* this fits within 16K of memory
*/
#define NBRLIST_DIM 11
#define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
/*
* atom bins cached into shared memory for processing
*
* this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
* should permit scheduling of up to 3 thread blocks per SM
*/
#define BIN_DEPTH 8 /* max number of atoms per bin */
#define BIN_SIZE 32 /* size of bin in floats */
#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
#define BIN_LENGTH 4.f /* spatial length in Angstroms */
#define BIN_INVLEN (1.f / BIN_LENGTH)
/* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
* so that bin fill should be 80% (for non-empty regions of space) */
#define REGION_SIZE 512 /* number of floats in lattice region */
#define SUB_REGION_SIZE 128 /* number of floats in lattice sub-region */
#endif

190
tests/opencl/cutcp2/main.cc Normal file
View file

@ -0,0 +1,190 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#include "output.h"
#define ERRTOL 1e-4f
#define NOKERNELS 0
#define CUTOFF1 1
#define CUTOFF6 32
#define CUTOFF6OVERLAP 64
#define CUTOFFCPU 16384
int appenddata(const char *filename, int size, double time) {
FILE *fp;
fp=fopen(filename, "a");
if (fp == NULL) {
printf("error appending to file %s..\n", filename);
return -1;
}
fprintf(fp, "%d %.3f\n", size, time);
fclose(fp);
return 0;
}
LatticeDim
lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
{
LatticeDim ret;
ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
ret.lo = lo;
ret.h = h;
return ret;
}
Lattice *
create_lattice(LatticeDim dim)
{
int size;
Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
if (lat == NULL) {
fprintf(stderr, "Out of memory\n");
exit(1);
}
lat->dim = dim;
/* Round up the allocated size to a multiple of 8 */
size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7;
lat->lattice = (float *)calloc(size, sizeof(float));
if (lat->lattice == NULL) {
fprintf(stderr, "Out of memory\n");
exit(1);
}
return lat;
}
void
destroy_lattice(Lattice *lat)
{
if (lat) {
free(lat->lattice);
free(lat);
}
}
int main(int argc, char *argv[]) {
Atoms *atom;
LatticeDim lattice_dim;
Lattice *gpu_lattice;
Vec3 min_ext, max_ext; /* Bounding box of atoms */
Vec3 lo, hi; /* Bounding box with padding */
float h = 0.5f; /* Lattice spacing */
float cutoff = 12.f; /* Cutoff radius */
float exclcutoff = 1.f; /* Radius for exclusion */
float padding = 0.5f; /* Bounding box padding distance */
int n;
struct pb_Parameters *parameters;
struct pb_TimerSet timers;
/* Read input parameters */
parameters = pb_ReadParameters(&argc, argv);
if (parameters == NULL) {
exit(1);
}
parameters->inpFiles = (char **)malloc(sizeof(char *) * 2);
parameters->inpFiles[0] = (char *)malloc(100);
parameters->inpFiles[1] = NULL;
strncpy(parameters->inpFiles[0], "watbox.sl40.pqr", 100);
/* Expect one input file */
if (pb_Parameters_CountInputs(parameters) != 1) {
fprintf(stderr, "Expecting one input file\n");
exit(1);
}
pb_InitializeTimerSet(&timers);
pb_SwitchToTimer(&timers, pb_TimerID_IO);
{
const char *pqrfilename = parameters->inpFiles[0];
if (!(atom = read_atom_file(pqrfilename))) {
fprintf(stderr, "read_atom_file() failed\n");
exit(1);
}
printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
}
/* find extent of domain */
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
get_atom_extent(&min_ext, &max_ext, atom);
printf("extent of domain is:\n");
printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
printf("padding domain by %g Angstroms\n", padding);
lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
lattice_dim = lattice_from_bounding_box(lo, hi, h);
gpu_lattice = create_lattice(lattice_dim);
printf("\n");
/*
* Run OpenCL kernel
* (Begin and end with COMPUTE timer active)
*/
if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0, parameters)) {
fprintf(stderr, "Computation failed\n");
exit(1);
}
/*
* Zero the lattice points that are too close to an atom. This is
* necessary for numerical stability.
*/
if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
exit(1);
}
printf("\n");
pb_SwitchToTimer(&timers, pb_TimerID_IO);
/* Print output */
if (parameters->outFile) {
//write_lattice_summary(parameters->outFile, gpu_lattice);
}
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
/* Cleanup */
destroy_lattice(gpu_lattice);
free_atom(atom);
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
pb_FreeParameters(parameters);
return 0;
}

49
tests/opencl/cutcp2/ocl.c Normal file
View file

@ -0,0 +1,49 @@
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include "ocl.h"
char* readFile(const char* fileName)
{
FILE* fp;
fp = fopen(fileName,"r");
if(fp == NULL)
{
printf("Error 1!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
long size = ftell(fp);
rewind(fp);
char* buffer = (char*)malloc(sizeof(char)*(size+1));
if(buffer == NULL)
{
printf("Error 2!\n");
fclose(fp);
exit(1);
}
size_t res = fread(buffer,1,size,fp);
if(res != size)
{
printf("Error 3!\n");
fclose(fp);
exit(1);
}
buffer[size] = 0;
fclose(fp);
return buffer;
}
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
{
cl_int clStatus;
char* temp = (char*)malloc(size);
memset(temp,val,size);
clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
free(temp);
}

25
tests/opencl/cutcp2/ocl.h Normal file
View file

@ -0,0 +1,25 @@
#ifndef __OCLH__
#define __OCLH__
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
void clMemSet(cl_command_queue, cl_mem, int, size_t);
char* readFile(const char*);
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,67 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
void
write_lattice_summary(const char *filename, Lattice *lattice)
{
float *lattice_data = lattice->lattice;
int nx = lattice->dim.nx;
int ny = lattice->dim.ny;
int nz = lattice->dim.nz;
/* Open output file */
FILE *outfile = fopen(filename, "w");
if (outfile == NULL) {
fprintf(stderr, "Cannot open output file\n");
exit(1);
}
/* Write the sum of the the absolute values of all lattice potentials */
{
double abspotential = 0.0;
float tmp;
int i;
for (i = 0; i < nx * ny * nz; i++)
abspotential += fabs((double) lattice_data[i]);
tmp = (float) abspotential;
fwrite(&tmp, 1, sizeof(float), outfile);
}
/* Write the size of a lattice plane */
{
uint32_t tmp;
tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
fwrite(&tmp, 1, sizeof(uint32_t), outfile);
}
/* Write the plane of lattice data at z=0 and z = nz-1 */
{
int plane_size = nx * ny;
fwrite(lattice_data, plane_size, sizeof(float), outfile);
fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
outfile);
}
/* Cleanup */
fclose(outfile);
}

View file

@ -0,0 +1,25 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef OUTPUT_H
#define OUTPUT_H
#include "cutoff.h"
#ifdef __cplusplus
extern "C" {
#endif
void
write_lattice_summary(const char *filename, Lattice *lattice);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -0,0 +1,348 @@
/*
* (c) 2010 The Board of Trustees of the University of Illinois.
*/
#ifndef PARBOIL_HEADER
#define PARBOIL_HEADER
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
#include <unistd.h>
/* A platform as specified by the user on the command line */
struct pb_PlatformParam {
char *name; /* The platform name. This string is owned. */
char *version; /* The platform version; may be NULL.
* This string is owned. */
};
/* Create a PlatformParam from the given strings.
* 'name' must not be NULL. 'version' may be NULL.
* If not NULL, the strings should have been allocated by malloc(),
* and they will be owned by the returned object.
*/
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version);
void
pb_FreePlatformParam(struct pb_PlatformParam *);
/* A criterion for how to select a device */
enum pb_DeviceSelectionCriterion {
pb_Device_INDEX, /* Enumerate the devices and select one
* by its number */
pb_Device_CPU, /* Select a CPU device */
pb_Device_GPU, /* Select a GPU device */
pb_Device_ACCELERATOR, /* Select an accelerator device */
pb_Device_NAME /* Select a device by name */
};
/* A device as specified by the user on the command line */
struct pb_DeviceParam {
enum pb_DeviceSelectionCriterion criterion;
union {
int index; /* If criterion == pb_Device_INDEX,
* the index of the device */
char *name; /* If criterion == pb_Device_NAME,
* the name of the device.
* This string is owned. */
};
};
struct pb_DeviceParam *
pb_DeviceParam_index(int index);
struct pb_DeviceParam *
pb_DeviceParam_cpu(void);
struct pb_DeviceParam *
pb_DeviceParam_gpu(void);
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void);
/* Create a by-name device selection criterion.
* The string should have been allocated by malloc(), and it will will be
* owned by the returned object.
*/
struct pb_DeviceParam *
pb_DeviceParam_name(char *name);
void
pb_FreeDeviceParam(struct pb_DeviceParam *);
/* Command line parameters for benchmarks */
struct pb_Parameters {
char *outFile; /* If not NULL, the raw output of the
* computation should be saved to this
* file. The string is owned. */
char **inpFiles; /* A NULL-terminated array of strings
* holding the input file(s) for the
* computation. The array and strings
* are owned. */
struct pb_PlatformParam *platform; /* If not NULL, the platform
* specified on the command line. */
struct pb_DeviceParam *device; /* If not NULL, the device
* specified on the command line. */
};
/* Read command-line parameters.
*
* The argc and argv parameters to main are read, and any parameters
* interpreted by this function are removed from the argument list.
*
* A new instance of struct pb_Parameters is returned.
* If there is an error, then an error message is printed on stderr
* and NULL is returned.
*/
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv);
/* Free an instance of struct pb_Parameters.
*/
void
pb_FreeParameters(struct pb_Parameters *p);
void
pb_FreeStringArray(char **);
/* Count the number of input files in a pb_Parameters instance.
*/
int
pb_Parameters_CountInputs(struct pb_Parameters *p);
/* A time or duration. */
//#if _POSIX_VERSION >= 200112L
typedef unsigned long long pb_Timestamp; /* time in microseconds */
//#else
//# error "Timestamps not implemented"
//#endif
enum pb_TimerState {
pb_Timer_STOPPED,
pb_Timer_RUNNING,
};
struct pb_Timer {
enum pb_TimerState state;
pb_Timestamp elapsed; /* Amount of time elapsed so far */
pb_Timestamp init; /* Beginning of the current time interval,
* if state is RUNNING. End of the last
* recorded time interfal otherwise. */
};
/* Reset a timer.
* Use this to initialize a timer or to clear
* its elapsed time. The reset timer is stopped.
*/
void
pb_ResetTimer(struct pb_Timer *timer);
/* Start a timer. The timer is set to RUNNING mode and
* time elapsed while the timer is running is added to
* the timer.
* The timer should not already be running.
*/
void
pb_StartTimer(struct pb_Timer *timer);
/* Stop a timer.
* This stops adding elapsed time to the timer.
* The timer should not already be stopped.
*/
void
pb_StopTimer(struct pb_Timer *timer);
/* Get the elapsed time in seconds. */
double
pb_GetElapsedTime(struct pb_Timer *timer);
/* Execution time is assigned to one of these categories. */
enum pb_TimerID {
pb_TimerID_NONE = 0,
pb_TimerID_IO, /* Time spent in input/output */
pb_TimerID_KERNEL, /* Time spent computing on the device,
* recorded asynchronously */
pb_TimerID_COPY, /* Time spent synchronously moving data
* to/from device and allocating/freeing
* memory on the device */
pb_TimerID_DRIVER, /* Time spent in the host interacting with the
* driver, primarily for recording the time
* spent queueing asynchronous operations */
pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
pb_TimerID_COMPUTE, /* Time for all program execution other
* than parsing command line arguments,
* I/O, kernel, and copy */
pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and
* host activity: automatically filled in,
* not intended for direct usage */
pb_TimerID_LAST /* Number of timer IDs */
};
/* Dynamic list of asynchronously tracked times between events */
struct pb_async_time_marker_list {
char *label; // actually just a pointer to a string
enum pb_TimerID timerID; /* The ID to which the interval beginning
* with this marker should be attributed */
void * marker;
//cudaEvent_t marker; /* The driver event for this marker */
struct pb_async_time_marker_list *next;
};
struct pb_SubTimer {
char *label;
struct pb_Timer timer;
struct pb_SubTimer *next;
};
struct pb_SubTimerList {
struct pb_SubTimer *current;
struct pb_SubTimer *subtimer_list;
};
/* A set of timers for recording execution times. */
struct pb_TimerSet {
enum pb_TimerID current;
struct pb_async_time_marker_list* async_markers;
pb_Timestamp async_begin;
pb_Timestamp wall_begin;
struct pb_Timer timers[pb_TimerID_LAST];
struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
};
/* Reset all timers in the set. */
void
pb_InitializeTimerSet(struct pb_TimerSet *timers);
void
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
/* Select which timer the next interval of time should be accounted
* to. The selected timer is started and other timers are stopped.
* Using pb_TimerID_NONE stops all timers. */
void
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
void
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
/* Print timer values to standard output. */
void
pb_PrintTimerSet(struct pb_TimerSet *timers);
/* Release timer resources */
void
pb_DestroyTimerSet(struct pb_TimerSet * timers);
void
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
typedef struct pb_Device_tag {
char* name;
void* clDevice;
int id;
unsigned int in_use;
unsigned int available;
} pb_Device;
struct pb_Context_tag;
typedef struct pb_Context_tag pb_Context;
typedef struct pb_Platform_tag {
char* name;
char* version;
void* clPlatform;
unsigned int in_use;
pb_Context** contexts;
pb_Device** devices;
} pb_Platform;
struct pb_Context_tag {
void* clPlatformId;
void* clContext;
void* clDeviceId;
pb_Platform* pb_platform;
pb_Device* pb_device;
};
// verbosely print out list of platforms and their devices to the console.
pb_Platform**
pb_GetPlatforms();
// Choose a platform according to the given platform specification
pb_Platform*
pb_GetPlatform(struct pb_PlatformParam *platform);
// choose a platform: by name, name & version
pb_Platform*
pb_GetPlatformByName(const char* name);
pb_Platform*
pb_GetPlatformByNameAndVersion(const char* name, const char* version);
// Choose a device according to the given device specification
pb_Device*
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
pb_Device**
pb_GetDevices(pb_Platform* pb_platform);
// choose a device by name.
pb_Device*
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
pb_Platform*
pb_GetPlatformByEnvVars();
pb_Context*
pb_InitOpenCLContext(struct pb_Parameters* parameters);
void
pb_ReleasePlatforms();
void
pb_ReleaseContext(pb_Context* c);
void
pb_PrintPlatformInfo(pb_Context* c);
void
perf_init();
//#define MEASURE_KERNEL_TIME
#include <CL/cl.h>
#ifdef MEASURE_KERNEL_TIME
#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
cl_int
pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* work_dim */,
const size_t * /* global_work_offset */,
const size_t * /* global_work_size */,
const size_t * /* local_work_size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */);
#endif
enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
void pb_sig_float(char*, float*, int);
void pb_sig_double(char*, double*, int);
void pb_sig_short(char*, short*, int);
void pb_sig_int(char*, int*, int);
void pb_sig_uchar(char*, unsigned char*, unsigned int);
void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
#ifdef __cplusplus
}
#endif
#endif //PARBOIL_HEADER

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,139 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "atom.h"
#define LINELEN 96
#define INITLEN 20
Atoms *read_atom_file(const char *fname)
{
FILE *file;
char line[LINELEN];
Atom *atom; /* Atom array */
int len = INITLEN; /* Size of atom array */
int cnt = 0; /* Number of atoms read */
/* allocate initial atom array */
atom = (Atom *) malloc(len * sizeof(Atom));
if (NULL==atom) {
fprintf(stderr, "can't allocate memory\n");
return NULL;
}
int i;
for (i = 0; i < len; ++i) {
atom[i].x = i+0;
atom[i].y = i+1;
atom[i].z = i+2;
atom[i].q = 1;
}
#if 0
/* open atom "pqr" file */
file = fopen(fname, "r");
if (NULL==file) {
fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
return NULL;
}
/* loop to read pqr file line by line */
while (fgets(line, LINELEN, file) != NULL) {
if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
continue; /* skip anything that isn't an atom record */
}
if (cnt==len) { /* extend atom array */
void *tmp = realloc(atom, 2*len*sizeof(Atom));
if (NULL==tmp) {
fprintf(stderr, "can't allocate more memory\n");
return NULL;
}
atom = (Atom *) tmp;
len *= 2;
}
/* read position coordinates and charge from atom record */
if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
&(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
return NULL;
}
cnt++; /* count atoms as we store them */
}
/* verify EOF and close file */
if ( !feof(file) ) {
fprintf(stderr, "did not find EOF\n");
return NULL;
}
if (fclose(file)) {
fprintf(stderr, "can't close file\n");
return NULL;
}
#endif
/* Build the output data structure */
{
Atoms *out = (Atoms *)malloc(sizeof(Atoms));
if (NULL == out) {
fprintf(stderr, "can't allocate memory\n");
return NULL;
}
out->size = cnt;
out->atoms = atom;
return out;
}
}
void free_atom(Atoms *atom)
{
if (atom) {
free(atom->atoms);
free(atom);
}
}
void
get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
{
Atom *atoms = atom->atoms;
int natoms = atom->size;
Vec3 lo;
Vec3 hi;
int n;
hi.x = lo.x = atoms[0].x;
hi.y = lo.y = atoms[0].y;
hi.z = lo.z = atoms[0].z;
for (n = 1; n < natoms; n++) {
lo.x = fminf(lo.x, atoms[n].x);
hi.x = fmaxf(hi.x, atoms[n].x);
lo.y = fminf(lo.y, atoms[n].y);
hi.y = fmaxf(hi.y, atoms[n].y);
lo.z = fminf(lo.z, atoms[n].z);
hi.z = fmaxf(hi.z, atoms[n].z);
}
*out_lo = lo;
*out_hi = hi;
}

File diff suppressed because it is too large Load diff

1
tests/opencl/dwt2d/.gitignore vendored Normal file
View file

@ -0,0 +1 @@
*.bmp.dwt.*

BIN
tests/opencl/dwt2d/16.bmp Normal file

Binary file not shown.

BIN
tests/opencl/dwt2d/192.bmp Normal file

Binary file not shown.

View file

@ -0,0 +1,7 @@
PROJECT = dwt2d
SRCS = components.cc dwt.cc main.cc
OPTS ?= dwt2d_inputs/._192.bmp -D 192x192 -f -5 -l 3
include ../common.mk

View file

@ -0,0 +1,46 @@
#all:
# g++ -o components.o -I/usr/local/cuda-5.5/include -c components.cpp
# g++ -o prog -I/usr/local/cuda-5.5/include main.cpp components.o -lOpenCL
#
include ../../common/make.config
#Can be changed by `make TYPE=CPU`
TYPE = GPU
#Library
ifeq ($(TYPE),GPU)
OPENCL_INC = $(NV_OPENCL_INC)
OPENCL_LIB = $(NV_OPENCL_LIB)
else
OPENCL_INC = $(INTEL_OPENCL_INC)
OPENCL_LIB = $(INTEL_OPENCL_LIB)
endif
ifdef OUTPUT
override OUTPUT = -DOUTPUT
endif
OUTPUT = -DOUTPUT
C_C = g++
OCL_LIB = -lOpenCL -L$(OPENCL_LIB)
#OCL_INC = -I/usr/local/cuda-5.5/include
OCL_INC = -I$(OPENCL_INC)
default: dwt2d
components:
$(C_C) -o components.o $(OCL_INC) -c components.cpp
dwt2d:
$(C_C) -o dwt2d.out $(OCL_INC) \
-I../util -DTIMING \
main.cpp ../util/timing.c \
$(components) \
$(OUTPUT) \
$(OCL_LIB)
clean:
rm -rf *.o dwt2d.out
rm *.bmp.dwt.*

22
tests/opencl/dwt2d/README Normal file
View file

@ -0,0 +1,22 @@
//DESCRITPTION
This is the OpenCL version of the code.
The JPEG2000 standard uses 2D Discrete Wavelet Transform (2D DWT), which consumes a significant part of the total encoding time
// USE
**************OUTPUT********************
USAGE:
make clean
make OUTPUT=Y
**************PARAMETERS*****************
USEAGE:
./dwt2d [otpions] src_img.rgb <out_img.dwt>
-d, --dimension dimensions of src img, e.g. 1920x1080
-l, --level DWT level, default 3
-f, --forward forward transform
-5, --53 5/3 transform

View file

@ -0,0 +1,25 @@
#ifndef _COMMON_H
#define _COMMON_H
//24-bit multiplication is faster on G80,
//but we must be sure to multiply integers
//only within [-8M, 8M - 1] range
#define IMUL(a, b) __mul24(a, b)
#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
/*
# define cudaCheckError( msg ) { \
cudaError_t err = cudaGetLastError(); \
if( cudaSuccess != err) { \
fprintf(stderr, "%s: %i: %s: %s.\n", \
__FILE__, __LINE__, msg, cudaGetErrorString( err) ); \
exit(-1); \
} }
# define cudaCheckAsyncError( msg ) { \
cudaThreadSynchronize(); \
cudaCheckError( msg ); \
}
*/
#endif

View file

@ -0,0 +1,29 @@
#include <unistd.h>
#include <error.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
#include "components.h"
#include "common.h"
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/opencl.h>
#endif
/* the old "components.cu" has been separate into two parts
"components.cpp",contains functions
"components.cl", contains all kernel functions
*/
/* Separate compoents of 8bit RGB source image */
//need add some segments

View file

@ -0,0 +1,21 @@
#include <unistd.h>
#include <error.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <assert.h>
//#include <stddef.h>
#ifndef _COMPONENTS_H
#define _COMPONENTS_H
/* Separate compoents of source 8bit RGB image */
template<typename T>
void rgbToComponents(T d_r, T d_g, T d_b, unsigned char * src, int width, int height);
/* Copy a 8bit source image data into a color compoment of type T */
//template<typename T>
//void bwToComponent(T *d_c, unsigned char * src, int width, int height);
#endif

95
tests/opencl/dwt2d/dwt.cc Normal file
View file

@ -0,0 +1,95 @@
#include <stdio.h>
#include <fcntl.h>
#include <assert.h>
#include <errno.h>
#include <sys/time.h>
#include <unistd.h>
#include <error.h>
/*#include "dwt_cuda/dwt.h"
#include "dwt_cuda/common.h"
#include "dwt.h"
#include "common.h"
*/
#include "common.h"
#include "dwt.h"
#include "common.h"
/*
inline void fdwt(float *in, float *out, int width, int height, int levels)
{
dwt_cuda::fdwt97(in, out, width, height, levels);
}
inline void fdwt(int *in, int *out, int width, int height, int levels)
{
dwt_cuda::fdwt53(in, out, width, height, levels);
}
inline void rdwt(float *in, float *out, int width, int height, int levels)
{
dwt_cuda::rdwt97(in, out, width, height, levels);
}
inline void rdwt(int *in, int *out, int width, int height, int levels)
{
dwt_cuda::rdwt53(in, out, width, height, levels);
}
*/
template<typename T>
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
{
//need add segments
}
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
void samplesToChar(unsigned char * dst, float * src, int samplesNum)
{
int i;
for(i = 0; i < samplesNum; i++) {
float r = (src[i]+0.5f) * 255;
if (r > 255) r = 255;
if (r < 0) r = 0;
dst[i] = (unsigned char)r;
}
}
void samplesToChar(unsigned char * dst, int * src, int samplesNum)
{
int i;
for(i = 0; i < samplesNum; i++) {
int r = src[i]+128;
if (r > 255) r = 255;
if (r < 0) r = 0;
dst[i] = (unsigned char)r;
}
}
///* Write output linear orderd*/
template<typename T>
int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
const char * filename, const char * suffix)
{
//need add segments
}
template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
/* Write output visual ordered */
template<typename T>
int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
int stages, const char * filename, const char * suffix)
{
//need add segments
}
template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);

15
tests/opencl/dwt2d/dwt.h Normal file
View file

@ -0,0 +1,15 @@
#ifndef _DWT_H
#define _DWT_H
template<typename T>
int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward);
template<typename T>
int writeNStage2DDWT(T *component_cuda, int width, int height,
int stages, const char * filename, const char * suffix);
template<typename T>
int writeLinear(T *component_cuda, int width, int height,
const char * filename, const char * suffix);
#endif

BIN
tests/opencl/dwt2d/dwt2d Executable file

Binary file not shown.

BIN
tests/opencl/dwt2d/dwt2d_data Executable file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show more