mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-25 06:17:38 -04:00
add benchmakrs
This commit is contained in:
parent
7ee20fca2b
commit
ea26d69751
349 changed files with 190817 additions and 0 deletions
1
tests/opencl/b+tree/.gitignore
vendored
Normal file
1
tests/opencl/b+tree/.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
*.txt
|
7
tests/opencl/b+tree/Makefile
Normal file
7
tests/opencl/b+tree/Makefile
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
PROJECT = btree
|
||||||
|
|
||||||
|
SRCS = main.cc kernel_gpu_opencl_wrapper.cc kernel_gpu_opencl_wrapper_2.cc num.cc opencl.cc timer.cc timing.cpp
|
||||||
|
|
||||||
|
OPTS ?= file mil.txt command command.txt
|
||||||
|
|
||||||
|
include common.mk
|
159
tests/opencl/b+tree/Makefile_old
Normal file
159
tests/opencl/b+tree/Makefile_old
Normal file
|
@ -0,0 +1,159 @@
|
||||||
|
include ../../common/make.config
|
||||||
|
|
||||||
|
#Can be changed by `make TYPE=CPU`
|
||||||
|
TYPE = GPU
|
||||||
|
|
||||||
|
#Library
|
||||||
|
ifeq ($(TYPE),GPU)
|
||||||
|
OPENCL_INC = $(NV_OPENCL_INC)
|
||||||
|
OPENCL_LIB = $(NV_OPENCL_LIB)
|
||||||
|
else
|
||||||
|
OPENCL_INC = $(INTEL_OPENCL_INC)
|
||||||
|
OPENCL_LIB = $(INTEL_OPENCL_LIB)
|
||||||
|
endif
|
||||||
|
|
||||||
|
C_C = gcc
|
||||||
|
OCL_LIB = -lOpenCL
|
||||||
|
|
||||||
|
# ====================================================================================================100
|
||||||
|
# NVCC
|
||||||
|
# ====================================================================================================100
|
||||||
|
|
||||||
|
# CUD_C = nvcc
|
||||||
|
# OMP_FLAG = -Xcompiler paste_one_here
|
||||||
|
|
||||||
|
# ====================================================================================================100
|
||||||
|
# PGCC
|
||||||
|
# ====================================================================================================100
|
||||||
|
|
||||||
|
# C_C = pgcc
|
||||||
|
# OMP_FLAG = -mp
|
||||||
|
# ACC_FLAG = -ta=nvidia \
|
||||||
|
# -Minfo \
|
||||||
|
# -Mbounds
|
||||||
|
|
||||||
|
# ====================================================================================================100
|
||||||
|
# END
|
||||||
|
# ====================================================================================================100
|
||||||
|
|
||||||
|
# ========================================================================================================================================================================================================200
|
||||||
|
# EXECUTABLES (LINK OBJECTS TOGETHER INTO BINARY)
|
||||||
|
# ========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
b+tree.out: ./main.o \
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper.o \
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper_2.o \
|
||||||
|
./util/timer/timer.o \
|
||||||
|
./util/num/num.o \
|
||||||
|
./util/opencl/opencl.o \
|
||||||
|
./timing.o
|
||||||
|
$(C_C) ./main.o \
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper.o \
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper_2.o \
|
||||||
|
./util/timer/timer.o \
|
||||||
|
./util/num/num.o \
|
||||||
|
./util/opencl/opencl.o \
|
||||||
|
./timing.o \
|
||||||
|
-lm \
|
||||||
|
-L$(OPENCL_LIB) \
|
||||||
|
-lOpenCL \
|
||||||
|
-o b+tree.out
|
||||||
|
|
||||||
|
run: b+tree.out
|
||||||
|
./b+tree.out file ../../data/b+tree/mil.txt command ../../data/b+tree/command.txt -p 0 -d 0
|
||||||
|
|
||||||
|
# ========================================================================================================================================================================================================200
|
||||||
|
# OBJECTS (COMPILE SOURCE FILES INTO OBJECTS)
|
||||||
|
# ========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
# ======================================================================================================================================================150
|
||||||
|
# MAIN FUNCTION
|
||||||
|
# ======================================================================================================================================================150
|
||||||
|
|
||||||
|
main.o: ./common.h \
|
||||||
|
./main.h \
|
||||||
|
./main.c
|
||||||
|
$(C_C) $(KERNEL_DIM) ./main.c \
|
||||||
|
-c \
|
||||||
|
-I$(OPENCL_INC) \
|
||||||
|
-o ./main.o \
|
||||||
|
-Wno-unused-result \
|
||||||
|
-O3
|
||||||
|
|
||||||
|
# ======================================================================================================================================================150
|
||||||
|
# KERNELS
|
||||||
|
# ======================================================================================================================================================150
|
||||||
|
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper.o: ./common.h \
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper.h \
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper.c
|
||||||
|
$(C_C) $(KERNEL_DIM) ./kernel/kernel_gpu_opencl_wrapper.c \
|
||||||
|
-c \
|
||||||
|
-o ./kernel/kernel_gpu_opencl_wrapper.o \
|
||||||
|
-O3 \
|
||||||
|
-I../util \
|
||||||
|
-DTIMING \
|
||||||
|
-I$(OPENCL_INC)
|
||||||
|
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper_2.o: ./common.h \
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper_2.h \
|
||||||
|
./kernel/kernel_gpu_opencl_wrapper_2.c
|
||||||
|
$(C_C) $(KERNEL_DIM) ./kernel/kernel_gpu_opencl_wrapper_2.c \
|
||||||
|
-c \
|
||||||
|
-o ./kernel/kernel_gpu_opencl_wrapper_2.o \
|
||||||
|
-O3 \
|
||||||
|
-I../util \
|
||||||
|
-DTIMING \
|
||||||
|
-I$(OPENCL_INC)
|
||||||
|
|
||||||
|
timing.o: ../util/timing.h ../util/timing.c
|
||||||
|
$(C_C) ../util/timing.c -I../util -c -o timing.o -I$(OPENCL_INC)
|
||||||
|
|
||||||
|
|
||||||
|
# ======================================================================================================================================================150
|
||||||
|
# UTILITIES
|
||||||
|
# ======================================================================================================================================================150
|
||||||
|
|
||||||
|
./util/timer/timer.o: ./common.h \
|
||||||
|
./util/timer/timer.h \
|
||||||
|
./util/timer/timer.c
|
||||||
|
$(C_C) ./util/timer/timer.c \
|
||||||
|
-c \
|
||||||
|
-o ./util/timer/timer.o \
|
||||||
|
-O3
|
||||||
|
|
||||||
|
./util/num/num.o: ./common.h \
|
||||||
|
./util/num/num.h \
|
||||||
|
./util/num/num.c
|
||||||
|
$(C_C) ./util/num/num.c \
|
||||||
|
-c \
|
||||||
|
-o ./util/num/num.o \
|
||||||
|
-O3
|
||||||
|
|
||||||
|
./util/opencl/opencl.o: ./common.h \
|
||||||
|
./util/opencl/opencl.h \
|
||||||
|
./util/opencl/opencl.c
|
||||||
|
$(C_C) ./util/opencl/opencl.c \
|
||||||
|
-c \
|
||||||
|
-o ./util/opencl/opencl.o \
|
||||||
|
-O3 \
|
||||||
|
-I$(OPENCL_INC)
|
||||||
|
|
||||||
|
# ======================================================================================================================================================150
|
||||||
|
# END
|
||||||
|
# ======================================================================================================================================================150
|
||||||
|
|
||||||
|
# ========================================================================================================================================================================================================200
|
||||||
|
# DELETE
|
||||||
|
# ========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm *.o *.out \
|
||||||
|
./kernel/*.o \
|
||||||
|
./util/timer/*.o \
|
||||||
|
./util/num/*.o \
|
||||||
|
./util/opencl/*.o \
|
||||||
|
output.txt 2>/dev/null || true
|
||||||
|
# ========================================================================================================================================================================================================200
|
||||||
|
# END
|
||||||
|
# ========================================================================================================================================================================================================200
|
7
tests/opencl/b+tree/README
Normal file
7
tests/opencl/b+tree/README
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
******Adjustable work group size*****
|
||||||
|
The kernel 1: RD_WG_SIZE_0_0 RD_WG_SIZE_0
|
||||||
|
The kernel 2: RD_WG_SIZE_1_0 RD_WG_SIZE_1
|
||||||
|
|
||||||
|
USAGE:
|
||||||
|
make clean
|
||||||
|
make KERNEL_DIM="-DRD_WG_SIZE_0=256 -DRD_WG_SIZE_1=256"
|
BIN
tests/opencl/b+tree/btree
Executable file
BIN
tests/opencl/b+tree/btree
Executable file
Binary file not shown.
476
tests/opencl/b+tree/common.h
Normal file
476
tests/opencl/b+tree/common.h
Normal file
|
@ -0,0 +1,476 @@
|
||||||
|
// # ifdef __cplusplus
|
||||||
|
// extern "C" {
|
||||||
|
// # endif
|
||||||
|
|
||||||
|
#ifndef LIST_H
|
||||||
|
# define LIST_H
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// DEFINE/INCLUDE
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// INCLUDE (for some reason these are not recognized when defined in main file before this one is included)
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include <stdint.h> // (in path known to compiler) needed by uint32_t
|
||||||
|
#include <stdbool.h> // (in path known to compiler) needed by true/false, bool
|
||||||
|
#include <stdlib.h> // (in path known to compiler) needed by malloc
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// DEFINE
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#define fp float
|
||||||
|
|
||||||
|
#define Version "1.5"
|
||||||
|
|
||||||
|
#ifdef WINDOWS
|
||||||
|
#define bool char
|
||||||
|
#define false 0
|
||||||
|
#define true 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* #define DEFAULT_ORDER 256 */
|
||||||
|
|
||||||
|
#ifdef RD_WG_SIZE_0_0
|
||||||
|
#define DEFAULT_ORDER RD_WG_SIZE_0_0
|
||||||
|
#elif defined(RD_WG_SIZE_0)
|
||||||
|
#define DEFAULT_ORDER RD_WG_SIZE_0
|
||||||
|
#elif defined(RD_WG_SIZE)
|
||||||
|
#define DEFAULT_ORDER RD_WG_SIZE
|
||||||
|
#else
|
||||||
|
#define DEFAULT_ORDER 256
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef RD_WG_SIZE_1_0
|
||||||
|
#define DEFAULT_ORDER_2 RD_WG_SIZE_1_0
|
||||||
|
#elif defined(RD_WG_SIZE_1)
|
||||||
|
#define DEFAULT_ORDER_2 RD_WG_SIZE_1
|
||||||
|
#elif defined(RD_WG_SIZE)
|
||||||
|
#define DEFAULT_ORDER_2 RD_WG_SIZE
|
||||||
|
#else
|
||||||
|
#define DEFAULT_ORDER_2 256
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#define malloc(size) ({ \
|
||||||
|
void *_tmp; \
|
||||||
|
\
|
||||||
|
if (!(_tmp = malloc(size))) { \
|
||||||
|
fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__); \
|
||||||
|
exit(-1); \
|
||||||
|
} \
|
||||||
|
\
|
||||||
|
_tmp; \
|
||||||
|
})
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// STRUCTURES
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
// struct list_item;
|
||||||
|
typedef struct list_item list_item_t;
|
||||||
|
|
||||||
|
typedef struct list_t {
|
||||||
|
list_item_t *head, *tail;
|
||||||
|
uint32_t length;
|
||||||
|
int32_t (*compare)(const void *key, const void *with);
|
||||||
|
void (*datum_delete)(void *);
|
||||||
|
} list_t;
|
||||||
|
|
||||||
|
typedef list_item_t *list_iterator_t;
|
||||||
|
typedef list_item_t *list_reverse_iterator_t;
|
||||||
|
|
||||||
|
/* Type representing the record
|
||||||
|
* to which a given key refers.
|
||||||
|
* In a real B+ tree system, the
|
||||||
|
* record would hold data (in a database)
|
||||||
|
* or a file (in an operating system)
|
||||||
|
* or some other information.
|
||||||
|
* Users can rewrite this part of the code
|
||||||
|
* to change the type and content
|
||||||
|
* of the value field.
|
||||||
|
*/
|
||||||
|
typedef struct record {
|
||||||
|
int value;
|
||||||
|
} record;
|
||||||
|
|
||||||
|
/* Type representing a node in the B+ tree.
|
||||||
|
* This type is general enough to serve for both
|
||||||
|
* the leaf and the internal node.
|
||||||
|
* The heart of the node is the array
|
||||||
|
* of keys and the array of corresponding
|
||||||
|
* pointers. The relation between keys
|
||||||
|
* and pointers differs between leaves and
|
||||||
|
* internal nodes. In a leaf, the index
|
||||||
|
* of each key equals the index of its corresponding
|
||||||
|
* pointer, with a maximum of order - 1 key-pointer
|
||||||
|
* pairs. The last pointer points to the
|
||||||
|
* leaf to the right (or NULL in the case
|
||||||
|
* of the rightmost leaf).
|
||||||
|
* In an internal node, the first pointer
|
||||||
|
* refers to lower nodes with keys less than
|
||||||
|
* the smallest key in the keys array. Then,
|
||||||
|
* with indices i starting at 0, the pointer
|
||||||
|
* at i + 1 points to the subtree with keys
|
||||||
|
* greater than or equal to the key in this
|
||||||
|
* node at index i.
|
||||||
|
* The num_keys field is used to keep
|
||||||
|
* track of the number of valid keys.
|
||||||
|
* In an internal node, the number of valid
|
||||||
|
* pointers is always num_keys + 1.
|
||||||
|
* In a leaf, the number of valid pointers
|
||||||
|
* to data is always num_keys. The
|
||||||
|
* last leaf pointer points to the next leaf.
|
||||||
|
*/
|
||||||
|
typedef struct node {
|
||||||
|
void ** pointers;
|
||||||
|
int * keys;
|
||||||
|
struct node * parent;
|
||||||
|
bool is_leaf;
|
||||||
|
int num_keys;
|
||||||
|
struct node * next; // Used for queue.
|
||||||
|
} node;
|
||||||
|
|
||||||
|
//
|
||||||
|
typedef struct knode {
|
||||||
|
int location;
|
||||||
|
int indices [DEFAULT_ORDER + 1];
|
||||||
|
int keys [DEFAULT_ORDER + 1];
|
||||||
|
bool is_leaf;
|
||||||
|
int num_keys;
|
||||||
|
} knode;
|
||||||
|
|
||||||
|
struct list_item {
|
||||||
|
struct list_item *pred, *next;
|
||||||
|
void *datum;
|
||||||
|
};
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// PROTOTYPES
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// Other
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
void
|
||||||
|
list_item_init( list_item_t *li,
|
||||||
|
void *datum);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_item_delete( list_item_t *li,
|
||||||
|
void (*datum_delete)(void *datum));
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_item_tail( list_t *l,
|
||||||
|
list_item_t *i);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_item_before(list_t *l,
|
||||||
|
list_item_t *next,
|
||||||
|
list_item_t *i);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_item_after( list_t *l,
|
||||||
|
list_item_t *pred,
|
||||||
|
list_item_t *i);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_item_sorted(list_t *l,
|
||||||
|
list_item_t *i);
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// ???
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
void
|
||||||
|
list_init( list_t *l,
|
||||||
|
int32_t (*compare)(const void *key, const void *with),
|
||||||
|
void (*datum_delete)(void *datum));
|
||||||
|
|
||||||
|
void
|
||||||
|
list_delete(list_t *l);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_reset(list_t *l);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_head( list_t *l,
|
||||||
|
void *v);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_tail( list_t *l,
|
||||||
|
void *v);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_before(list_t *l,
|
||||||
|
list_item_t *next,
|
||||||
|
void *v);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_after( list_t *l,
|
||||||
|
list_item_t *pred,
|
||||||
|
void *v);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_sorted( list_t *l,
|
||||||
|
void *v);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_insert_item_head( list_t *l,
|
||||||
|
list_item_t *i);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_remove_item( list_t *l,
|
||||||
|
list_item_t *i);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_remove_head(list_t *l);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_remove_tail(list_t *l);
|
||||||
|
|
||||||
|
list_item_t *
|
||||||
|
list_find_item( list_t *l,
|
||||||
|
void *datum);
|
||||||
|
|
||||||
|
list_item_t *
|
||||||
|
list_get_head_item(list_t *l);
|
||||||
|
|
||||||
|
list_item_t *
|
||||||
|
list_get_tail_item(list_t *l);
|
||||||
|
|
||||||
|
void *
|
||||||
|
list_find( list_t *l,
|
||||||
|
void *datum);
|
||||||
|
|
||||||
|
void *
|
||||||
|
list_get_head(list_t *l);
|
||||||
|
|
||||||
|
void *
|
||||||
|
list_get_tail(list_t *l);
|
||||||
|
|
||||||
|
uint32_t
|
||||||
|
list_get_length(list_t *l);
|
||||||
|
|
||||||
|
bool
|
||||||
|
list_is_empty(list_t *l);
|
||||||
|
|
||||||
|
bool
|
||||||
|
list_not_empty(list_t *l);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_visit_items( list_t *l,
|
||||||
|
void (*visitor)(void *v));
|
||||||
|
|
||||||
|
void *
|
||||||
|
list_item_get_datum(list_item_t *li);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_iterator_init( list_t *l,
|
||||||
|
list_iterator_t *li);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_iterator_delete(list_iterator_t *li);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_iterator_next(list_iterator_t *li);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_iterator_prev(list_iterator_t *li);
|
||||||
|
|
||||||
|
void *
|
||||||
|
list_iterator_get_datum(list_iterator_t *li);
|
||||||
|
|
||||||
|
bool
|
||||||
|
list_iterator_is_valid(list_iterator_t *li);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_reverse_iterator_init( list_t *l,
|
||||||
|
list_iterator_t *li);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_reverse_iterator_delete(list_iterator_t *li);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_reverse_iterator_next(list_iterator_t *li);
|
||||||
|
|
||||||
|
void
|
||||||
|
list_reverse_iterator_prev(list_iterator_t *li);
|
||||||
|
|
||||||
|
void *
|
||||||
|
list_reverse_iterator_get_datum(list_iterator_t *li);
|
||||||
|
|
||||||
|
bool
|
||||||
|
list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// Output and utility
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
void *
|
||||||
|
kmalloc(int size);
|
||||||
|
|
||||||
|
long
|
||||||
|
transform_to_cuda( node *n,
|
||||||
|
bool verbose); //returns actual mem used in a long
|
||||||
|
|
||||||
|
void
|
||||||
|
usage_1( void );
|
||||||
|
|
||||||
|
void
|
||||||
|
usage_2( void );
|
||||||
|
|
||||||
|
void
|
||||||
|
enqueue( node * new_node );
|
||||||
|
|
||||||
|
node *
|
||||||
|
dequeue( void );
|
||||||
|
|
||||||
|
int
|
||||||
|
height( node * root );
|
||||||
|
|
||||||
|
int
|
||||||
|
path_to_root( node * root,
|
||||||
|
node * child );
|
||||||
|
|
||||||
|
void
|
||||||
|
print_leaves( node * root );
|
||||||
|
|
||||||
|
void
|
||||||
|
print_tree( node * root );
|
||||||
|
|
||||||
|
node *
|
||||||
|
find_leaf( node * root,
|
||||||
|
int key,
|
||||||
|
bool verbose );
|
||||||
|
|
||||||
|
record *
|
||||||
|
find( node * root,
|
||||||
|
int key,
|
||||||
|
bool verbose );
|
||||||
|
|
||||||
|
int
|
||||||
|
cut( int length );
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// Insertion
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
record *
|
||||||
|
make_record(int value);
|
||||||
|
|
||||||
|
node *
|
||||||
|
make_node( void );
|
||||||
|
|
||||||
|
node *
|
||||||
|
make_leaf( void );
|
||||||
|
|
||||||
|
int
|
||||||
|
get_left_index( node * parent,
|
||||||
|
node * left);
|
||||||
|
|
||||||
|
node *
|
||||||
|
insert_into_leaf( node * leaf,
|
||||||
|
int key, record * pointer );
|
||||||
|
|
||||||
|
node *
|
||||||
|
insert_into_leaf_after_splitting( node * root,
|
||||||
|
node * leaf,
|
||||||
|
int key,
|
||||||
|
record * pointer);
|
||||||
|
|
||||||
|
node *
|
||||||
|
insert_into_node( node * root,
|
||||||
|
node * parent,
|
||||||
|
int left_index,
|
||||||
|
int key,
|
||||||
|
node * right);
|
||||||
|
|
||||||
|
node *
|
||||||
|
insert_into_node_after_splitting( node * root,
|
||||||
|
node * parent,
|
||||||
|
int left_index,
|
||||||
|
int key,
|
||||||
|
node * right);
|
||||||
|
|
||||||
|
node *
|
||||||
|
insert_into_parent( node * root,
|
||||||
|
node * left,
|
||||||
|
int key,
|
||||||
|
node * right);
|
||||||
|
|
||||||
|
node *
|
||||||
|
insert_into_new_root( node * left,
|
||||||
|
int key,
|
||||||
|
node * right);
|
||||||
|
|
||||||
|
node *
|
||||||
|
start_new_tree( int key,
|
||||||
|
record * pointer);
|
||||||
|
|
||||||
|
node *
|
||||||
|
insert( node * root,
|
||||||
|
int key,
|
||||||
|
int value );
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// Deletion
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
int
|
||||||
|
get_neighbor_index(node * n );
|
||||||
|
|
||||||
|
node *
|
||||||
|
adjust_root(node * root);
|
||||||
|
|
||||||
|
node *
|
||||||
|
coalesce_nodes( node * root,
|
||||||
|
node * n,
|
||||||
|
node * neighbor,
|
||||||
|
int neighbor_index,
|
||||||
|
int k_prime);
|
||||||
|
|
||||||
|
node *
|
||||||
|
redistribute_nodes( node * root,
|
||||||
|
node * n,
|
||||||
|
node * neighbor,
|
||||||
|
int neighbor_index,
|
||||||
|
int k_prime_index,
|
||||||
|
int k_prime);
|
||||||
|
|
||||||
|
node *
|
||||||
|
delete_entry( node * root,
|
||||||
|
node * n,
|
||||||
|
int key,
|
||||||
|
void * pointer );
|
||||||
|
|
||||||
|
node *
|
||||||
|
deleteVal( node * root,
|
||||||
|
int key );
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// HEADER
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
extern int platform_id_inuse;
|
||||||
|
extern int device_id_inuse;
|
||||||
|
extern cl_device_type device_type;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// int main( int argc,
|
||||||
|
// char *argv []);
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// END
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
// # ifdef __cplusplus
|
||||||
|
// }
|
||||||
|
# endif
|
126
tests/opencl/b+tree/common.mk
Normal file
126
tests/opencl/b+tree/common.mk
Normal file
|
@ -0,0 +1,126 @@
|
||||||
|
XLEN ?= 32
|
||||||
|
TOOLDIR ?= /opt
|
||||||
|
|
||||||
|
TARGET ?= opaesim
|
||||||
|
|
||||||
|
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
|
||||||
|
XRT_DEVICE_INDEX ?= 0
|
||||||
|
|
||||||
|
ifeq ($(XLEN),64)
|
||||||
|
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||||
|
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||||
|
K_CFLAGS += -march=rv64imafd -mabi=ilp64d
|
||||||
|
STARTUP_ADDR ?= 0x180000000
|
||||||
|
else
|
||||||
|
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||||
|
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||||
|
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||||
|
STARTUP_ADDR ?= 0x80000000
|
||||||
|
endif
|
||||||
|
|
||||||
|
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||||
|
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
|
||||||
|
|
||||||
|
POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
|
||||||
|
POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
|
||||||
|
|
||||||
|
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||||
|
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
|
||||||
|
|
||||||
|
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
|
||||||
|
|
||||||
|
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
|
||||||
|
LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
|
||||||
|
|
||||||
|
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
|
||||||
|
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
|
||||||
|
K_CFLAGS += -I$(VORTEX_KN_PATH)/include -DNDEBUG -DLLVM_VOTEX
|
||||||
|
K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a -lm
|
||||||
|
|
||||||
|
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||||
|
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing -Wno-unused-result
|
||||||
|
CXXFLAGS += -pthread
|
||||||
|
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||||
|
|
||||||
|
ifdef HOSTGPU
|
||||||
|
CXXFLAGS += -DHOSTGPU
|
||||||
|
LDFLAGS += -lOpenCL
|
||||||
|
else
|
||||||
|
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Debugigng
|
||||||
|
#ifdef DEBUG
|
||||||
|
CXXFLAGS += -g -O0
|
||||||
|
#else
|
||||||
|
# CXXFLAGS += -O2 -DNDEBUG
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), fpga)
|
||||||
|
OPAE_DRV_PATHS ?= libopae-c.so
|
||||||
|
else
|
||||||
|
ifeq ($(TARGET), asesim)
|
||||||
|
OPAE_DRV_PATHS ?= libopae-c-ase.so
|
||||||
|
else
|
||||||
|
ifeq ($(TARGET), opaesim)
|
||||||
|
OPAE_DRV_PATHS ?= libopae-c-sim.so
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
|
||||||
|
|
||||||
|
all: $(PROJECT) kernel.pocl kernel2.pocl
|
||||||
|
|
||||||
|
kernel.pocl: kernel_gpu_opencl.cl
|
||||||
|
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel_gpu_opencl.cl
|
||||||
|
|
||||||
|
kernel2.pocl: kernel_gpu_opencl_2.cl
|
||||||
|
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel2.pocl kernel_gpu_opencl_2.cl
|
||||||
|
|
||||||
|
#backprop.o: backprop.c
|
||||||
|
# $(CXX) $(CXXFLAGS) backprop.c -c -Wno-unused-result
|
||||||
|
|
||||||
|
%.cc.o: %.cc
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
%.cpp.o: %.cpp
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
%.c.o: %.c
|
||||||
|
$(CC) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
$(PROJECT): $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||||
|
|
||||||
|
run-hostgpu: $(PROJECT) kernel.pocl
|
||||||
|
./$(PROJECT) $(OPTS)
|
||||||
|
|
||||||
|
run-simx: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
|
||||||
|
run-rtlsim: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
|
||||||
|
run-opae: $(PROJECT) kernel.pocl
|
||||||
|
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
|
||||||
|
run-xrt: $(PROJECT) kernel.pocl
|
||||||
|
ifeq ($(TARGET), hw)
|
||||||
|
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
else
|
||||||
|
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
endif
|
||||||
|
|
||||||
|
.depend: $(SRCS)
|
||||||
|
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf $(PROJECT) *.o .depend
|
||||||
|
|
||||||
|
clean-all: clean
|
||||||
|
rm -rf *.dump *.pocl
|
||||||
|
|
||||||
|
ifneq ($(MAKECMDGOALS),clean)
|
||||||
|
-include .depend
|
||||||
|
endif
|
109
tests/opencl/b+tree/kernel_gpu_opencl.cl
Executable file
109
tests/opencl/b+tree/kernel_gpu_opencl.cl
Executable file
|
@ -0,0 +1,109 @@
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// extern "C" {
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// DEFINE/INCLUDE
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// DEFINE
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
// double precision support (switch between as needed for NVIDIA/AMD)
|
||||||
|
#ifdef AMDAPP
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
|
||||||
|
#else
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// clBuildProgram compiler cannot link this file for some reason, so had to redefine constants and structures below
|
||||||
|
//#include ../common.h // (in directory specified to compiler) main function header
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// DEFINE (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
// change to double if double precision needed
|
||||||
|
#define fp float
|
||||||
|
|
||||||
|
#define DEFAULT_ORDER 256
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// STRUCTURES (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
// Type representing the record to which a given key refers. In a real B+ tree system, the record would hold data (in a database) or a file (in an operating system) or some other information.
|
||||||
|
// Users can rewrite this part of the code to change the type and content of the value field.
|
||||||
|
typedef struct record {
|
||||||
|
int value;
|
||||||
|
} record;
|
||||||
|
|
||||||
|
// ???
|
||||||
|
typedef struct knode {
|
||||||
|
int location;
|
||||||
|
int indices [DEFAULT_ORDER + 1];
|
||||||
|
int keys [DEFAULT_ORDER + 1];
|
||||||
|
bool is_leaf;
|
||||||
|
int num_keys;
|
||||||
|
} knode;
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// findK function
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
findK( long height,
|
||||||
|
__global knode *knodesD,
|
||||||
|
long knodes_elem,
|
||||||
|
__global record *recordsD,
|
||||||
|
|
||||||
|
__global long *currKnodeD,
|
||||||
|
__global long *offsetD,
|
||||||
|
__global int *keysD,
|
||||||
|
__global record *ansD)
|
||||||
|
{
|
||||||
|
|
||||||
|
// private thread IDs
|
||||||
|
int thid = get_local_id(0);
|
||||||
|
int bid = get_group_id(0);
|
||||||
|
|
||||||
|
// processtree levels
|
||||||
|
int i;
|
||||||
|
for(i = 0; i < height; i++){
|
||||||
|
|
||||||
|
// if value is between the two keys
|
||||||
|
if((knodesD[currKnodeD[bid]].keys[thid]) <= keysD[bid] && (knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid])){
|
||||||
|
// this conditional statement is inserted to avoid crush due to but in original code
|
||||||
|
// "offset[bid]" calculated below that addresses knodes[] in the next iteration goes outside of its bounds cause segmentation fault
|
||||||
|
// more specifically, values saved into knodes->indices in the main function are out of bounds of knodes that they address
|
||||||
|
if(knodesD[offsetD[bid]].indices[thid] < knodes_elem){
|
||||||
|
offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//__syncthreads();
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
// set for next tree level
|
||||||
|
if(thid==0){
|
||||||
|
currKnodeD[bid] = offsetD[bid];
|
||||||
|
}
|
||||||
|
//__syncthreads();
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//At this point, we have a candidate leaf node which may contain
|
||||||
|
//the target record. Check each key to hopefully find the record
|
||||||
|
if(knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]){
|
||||||
|
ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// End
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// }
|
||||||
|
// #endif
|
111
tests/opencl/b+tree/kernel_gpu_opencl_2.cl
Executable file
111
tests/opencl/b+tree/kernel_gpu_opencl_2.cl
Executable file
|
@ -0,0 +1,111 @@
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// DEFINE/INCLUDE
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// DEFINE
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
// double precision support (switch between as needed for NVIDIA/AMD)
|
||||||
|
#ifdef AMDAPP
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
|
||||||
|
#else
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// clBuildProgram compiler cannot link this file for some reason, so had to redefine constants and structures below
|
||||||
|
// #include ../common.h // (in directory specified to compiler) main function header
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// DEFINE (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
// change to double if double precision needed
|
||||||
|
#define fp float
|
||||||
|
|
||||||
|
#define DEFAULT_ORDER_2 256
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// STRUCTURES (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
// ???
|
||||||
|
typedef struct knode {
|
||||||
|
int location;
|
||||||
|
int indices [DEFAULT_ORDER_2 + 1];
|
||||||
|
int keys [DEFAULT_ORDER_2 + 1];
|
||||||
|
bool is_leaf;
|
||||||
|
int num_keys;
|
||||||
|
} knode;
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// findRangeK function
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
findRangeK( long height,
|
||||||
|
__global knode *knodesD,
|
||||||
|
long knodes_elem,
|
||||||
|
|
||||||
|
__global long *currKnodeD,
|
||||||
|
__global long *offsetD,
|
||||||
|
__global long *lastKnodeD,
|
||||||
|
__global long *offset_2D,
|
||||||
|
__global int *startD,
|
||||||
|
__global int *endD,
|
||||||
|
__global int *RecstartD,
|
||||||
|
__global int *ReclenD)
|
||||||
|
{
|
||||||
|
|
||||||
|
// private thread IDs
|
||||||
|
int thid = get_local_id(0);
|
||||||
|
int bid = get_group_id(0);
|
||||||
|
|
||||||
|
// ???
|
||||||
|
int i;
|
||||||
|
for(i = 0; i < height; i++){
|
||||||
|
|
||||||
|
if((knodesD[currKnodeD[bid]].keys[thid] <= startD[bid]) && (knodesD[currKnodeD[bid]].keys[thid+1] > startD[bid])){
|
||||||
|
// this conditional statement is inserted to avoid crush due to but in original code
|
||||||
|
// "offset[bid]" calculated below that later addresses part of knodes goes outside of its bounds cause segmentation fault
|
||||||
|
// more specifically, values saved into knodes->indices in the main function are out of bounds of knodes that they address
|
||||||
|
if(knodesD[currKnodeD[bid]].indices[thid] < knodes_elem){
|
||||||
|
offsetD[bid] = knodesD[currKnodeD[bid]].indices[thid];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if((knodesD[lastKnodeD[bid]].keys[thid] <= endD[bid]) && (knodesD[lastKnodeD[bid]].keys[thid+1] > endD[bid])){
|
||||||
|
// this conditional statement is inserted to avoid crush due to but in original code
|
||||||
|
// "offset_2[bid]" calculated below that later addresses part of knodes goes outside of its bounds cause segmentation fault
|
||||||
|
// more specifically, values saved into knodes->indices in the main function are out of bounds of knodes that they address
|
||||||
|
if(knodesD[lastKnodeD[bid]].indices[thid] < knodes_elem){
|
||||||
|
offset_2D[bid] = knodesD[lastKnodeD[bid]].indices[thid];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//__syncthreads();
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
// set for next tree level
|
||||||
|
if(thid==0){
|
||||||
|
currKnodeD[bid] = offsetD[bid];
|
||||||
|
lastKnodeD[bid] = offset_2D[bid];
|
||||||
|
}
|
||||||
|
// __syncthreads();
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the index of the starting record
|
||||||
|
if(knodesD[currKnodeD[bid]].keys[thid] == startD[bid]){
|
||||||
|
RecstartD[bid] = knodesD[currKnodeD[bid]].indices[thid];
|
||||||
|
}
|
||||||
|
// __syncthreads();
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
// Find the index of the ending record
|
||||||
|
if(knodesD[lastKnodeD[bid]].keys[thid] == endD[bid]){
|
||||||
|
ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// End
|
||||||
|
//========================================================================================================================================================================================================200
|
753
tests/opencl/b+tree/kernel_gpu_opencl_wrapper.cc
Normal file
753
tests/opencl/b+tree/kernel_gpu_opencl_wrapper.cc
Normal file
|
@ -0,0 +1,753 @@
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// extern "C" {
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// DEFINE/INCLUDE
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// LIBRARIES
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include <CL/cl.h> // (in directory provided to compiler) needed by OpenCL types and functions
|
||||||
|
#include <string.h> // (in directory known to compiler) needed by memset
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// COMMON
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include "common.h" // (in directory provided here)
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// UTILITIES
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include "opencl.h" // (in directory provided here)
|
||||||
|
#include "timer.h" // (in directory provided here)
|
||||||
|
#ifdef TIMING
|
||||||
|
#include "timing.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// HEADER
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include "kernel_gpu_opencl_wrapper.h" // (in directory provided here)
|
||||||
|
|
||||||
|
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||||
|
if (nullptr == filename || nullptr == data || 0 == size)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
FILE* fp_ = fopen(filename, "r");
|
||||||
|
if (NULL == fp_) {
|
||||||
|
fprintf(stderr, "Failed to load kernel. %s\n", filename);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
fseek(fp_ , 0 , SEEK_END);
|
||||||
|
long fsize = ftell(fp_);
|
||||||
|
rewind(fp_);
|
||||||
|
|
||||||
|
*data = (uint8_t*)malloc(fsize);
|
||||||
|
*size = fread(*data, 1, fsize, fp_);
|
||||||
|
|
||||||
|
fclose(fp_);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// KERNEL_GPU_CUDA_WRAPPER FUNCTION
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
void
|
||||||
|
kernel_gpu_opencl_wrapper( record *records,
|
||||||
|
long records_mem,
|
||||||
|
knode *knodes,
|
||||||
|
long knodes_elem,
|
||||||
|
long knodes_mem,
|
||||||
|
|
||||||
|
int order,
|
||||||
|
long maxheight,
|
||||||
|
int count,
|
||||||
|
|
||||||
|
long *currKnode,
|
||||||
|
long *offset,
|
||||||
|
int *keys,
|
||||||
|
record *ans)
|
||||||
|
{
|
||||||
|
|
||||||
|
cl_device_type device_type = CL_DEVICE_TYPE_GPU;
|
||||||
|
int platform_id_inuse = 0; // platform id in use (default: 0)
|
||||||
|
int device_id_inuse = 0; // platform id in use (default: 0)
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// CPU VARIABLES
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
struct timeval tv;
|
||||||
|
struct timeval tv_total_start, tv_total_end;
|
||||||
|
struct timeval tv_init_end;
|
||||||
|
struct timeval tv_h2d_start, tv_h2d_end;
|
||||||
|
struct timeval tv_d2h_start, tv_d2h_end;
|
||||||
|
struct timeval tv_kernel_start, tv_kernel_end;
|
||||||
|
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
|
||||||
|
struct timeval tv_close_start, tv_close_end;
|
||||||
|
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time= 0,
|
||||||
|
d2h_time = 0, close_time = 0, total_time = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_total_start, NULL);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU SETUP
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// INITIAL DRIVER OVERHEAD
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// cudaThreadSynchronize();
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// COMMON VARIABLES
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// common variables
|
||||||
|
cl_int error;
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// Get the number of available platforms
|
||||||
|
cl_uint num_platforms;
|
||||||
|
error = clGetPlatformIDs( 0,
|
||||||
|
NULL,
|
||||||
|
&num_platforms);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Get the list of available platforms
|
||||||
|
cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
|
||||||
|
error = clGetPlatformIDs( num_platforms,
|
||||||
|
platforms,
|
||||||
|
NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Select the 1st platform
|
||||||
|
cl_platform_id platform = platforms[platform_id_inuse];
|
||||||
|
|
||||||
|
// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
|
||||||
|
char pbuf[100];
|
||||||
|
error = clGetPlatformInfo( platform,
|
||||||
|
CL_PLATFORM_VENDOR,
|
||||||
|
sizeof(pbuf),
|
||||||
|
pbuf,
|
||||||
|
NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
printf("Platform: %s\n", pbuf);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// GET DEVICE INFORMATION
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
cl_uint devices_size;
|
||||||
|
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &devices_size);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
if (devices_size == 0) {
|
||||||
|
printf("There are no devices for Platform %d\n", platform_id_inuse);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
printf("Device num: %u\n", devices_size);
|
||||||
|
// Get the list of devices (previousely selected for the context)
|
||||||
|
cl_device_id *devices = (cl_device_id *) malloc(devices_size);
|
||||||
|
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices_size,
|
||||||
|
devices, NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Select the device
|
||||||
|
cl_device_id device;
|
||||||
|
device = devices[device_id_inuse];
|
||||||
|
|
||||||
|
// Check device type
|
||||||
|
error = clGetDeviceInfo(device, CL_DEVICE_TYPE,
|
||||||
|
sizeof(device_type), (void *)&device_type, NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
if(device_type == CL_DEVICE_TYPE_GPU)
|
||||||
|
printf("Creating GPU Context\n");
|
||||||
|
else if (device_type == CL_DEVICE_TYPE_CPU)
|
||||||
|
printf("Creating CPU Context\n");
|
||||||
|
else
|
||||||
|
printf("This Context Type Not Supported\n");
|
||||||
|
|
||||||
|
// Get the name of the selected device
|
||||||
|
error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf),
|
||||||
|
pbuf, NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
printf("Device: %s\n", pbuf);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// CREATE CONTEXT FOR THE PLATFORM
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// Create context properties for selected platform
|
||||||
|
cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
|
||||||
|
(cl_context_properties) platform,
|
||||||
|
0};
|
||||||
|
|
||||||
|
// Create context for selected platform being GPU
|
||||||
|
cl_context context;
|
||||||
|
context = clCreateContextFromType( context_properties,
|
||||||
|
device_type,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
&error);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// CREATE COMMAND QUEUE FOR THE DEVICE
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// Create a command queue
|
||||||
|
cl_command_queue command_queue;
|
||||||
|
#ifdef TIMING
|
||||||
|
command_queue = clCreateCommandQueue(context, device,
|
||||||
|
CL_QUEUE_PROFILING_ENABLE, &error);
|
||||||
|
#else
|
||||||
|
command_queue = clCreateCommandQueue(context, device, 0, &error);
|
||||||
|
#endif
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// CREATE PROGRAM, COMPILE IT
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// Load kernel source code from file
|
||||||
|
/*
|
||||||
|
const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
|
||||||
|
size_t sourceSize = strlen(source);
|
||||||
|
|
||||||
|
// Create the program
|
||||||
|
cl_program program = clCreateProgramWithSource( context,
|
||||||
|
1,
|
||||||
|
&source,
|
||||||
|
&sourceSize,
|
||||||
|
&error);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
*/
|
||||||
|
uint8_t *kernel_bin = NULL;
|
||||||
|
size_t kernel_size;
|
||||||
|
cl_int binary_status = 0;
|
||||||
|
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||||
|
std::abort();
|
||||||
|
|
||||||
|
cl_program program = clCreateProgramWithBinary(
|
||||||
|
context, 1, &devices[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &error);
|
||||||
|
free(kernel_bin);
|
||||||
|
|
||||||
|
char clOptions[110];
|
||||||
|
// sprintf(clOptions,"-I../../src");
|
||||||
|
sprintf(clOptions,"-I./../");
|
||||||
|
|
||||||
|
#ifdef DEFAULT_ORDER
|
||||||
|
sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER=%d", DEFAULT_ORDER);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*
|
||||||
|
// Compile the program
|
||||||
|
error = clBuildProgram( program,
|
||||||
|
1,
|
||||||
|
&device,
|
||||||
|
clOptions,
|
||||||
|
NULL,
|
||||||
|
NULL);
|
||||||
|
*/
|
||||||
|
error = clBuildProgram(program, 1, &devices[device_id_inuse], NULL, NULL, NULL);
|
||||||
|
if(error != CL_SUCCESS) {
|
||||||
|
printf("ERROR: clBuildProgram() => %d\n", error); return; }
|
||||||
|
|
||||||
|
|
||||||
|
// Print warnings and errors from compilation
|
||||||
|
static char log[65536];
|
||||||
|
memset(log, 0, sizeof(log));
|
||||||
|
clGetProgramBuildInfo( program,
|
||||||
|
device,
|
||||||
|
CL_PROGRAM_BUILD_LOG,
|
||||||
|
sizeof(log)-1,
|
||||||
|
log,
|
||||||
|
NULL);
|
||||||
|
printf("-----OpenCL Compiler Output-----\n");
|
||||||
|
if (strstr(log,"warning:") || strstr(log, "error:"))
|
||||||
|
printf("<<<<\n%s\n>>>>\n", log);
|
||||||
|
printf("--------------------------------\n");
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Create kernel
|
||||||
|
cl_kernel kernel;
|
||||||
|
kernel = clCreateKernel(program,
|
||||||
|
"findK",
|
||||||
|
&error);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_init_end, NULL);
|
||||||
|
tvsub(&tv_init_end, &tv_total_start, &tv);
|
||||||
|
init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// END
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU MEMORY (MALLOC)
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DEVICE IN
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// recordsD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem recordsD;
|
||||||
|
recordsD = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
records_mem,
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// knodesD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem knodesD;
|
||||||
|
knodesD = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
knodes_mem,
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// currKnodeD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem currKnodeD;
|
||||||
|
currKnodeD = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(long),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// offsetD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem offsetD;
|
||||||
|
offsetD = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(long),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// keysD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem keysD;
|
||||||
|
keysD = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(long),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_mem_alloc_end, NULL);
|
||||||
|
tvsub(&tv_mem_alloc_end, &tv_init_end, &tv);
|
||||||
|
mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DEVICE IN/OUT
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// ansD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem ansD;
|
||||||
|
ansD = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(record),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// END
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU MEMORY COPY
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// GPU MEMORY (MALLOC) COPY IN
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// recordsD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_event event;
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
recordsD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
records_mem, // size to be copied
|
||||||
|
records, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// knodesD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
knodesD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
knodes_mem, // size to be copied
|
||||||
|
knodes, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// currKnodeD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
currKnodeD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(long), // size to be copied
|
||||||
|
currKnode, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// offsetD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
offsetD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(long), // size to be copied
|
||||||
|
offset, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// keysD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
keysD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(int), // size to be copied
|
||||||
|
keys, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DEVICE IN/OUT
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// ansD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
ansD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(record), // size to be copied
|
||||||
|
ans, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// END
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// findK kernel
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// Execution Parameters
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
size_t local_work_size[1];
|
||||||
|
local_work_size[0] = order < 1024 ? order : 1024;
|
||||||
|
size_t global_work_size[1];
|
||||||
|
global_work_size[0] = count * local_work_size[0];
|
||||||
|
|
||||||
|
printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// Kernel Arguments
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
0,
|
||||||
|
sizeof(long),
|
||||||
|
(void *) &maxheight);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
1,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &knodesD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
2,
|
||||||
|
sizeof(long),
|
||||||
|
(void *) &knodes_elem);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
3,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &recordsD);
|
||||||
|
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
4,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &currKnodeD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
5,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &offsetD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
6,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &keysD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
7,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &ansD);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// Kernel
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
error = clEnqueueNDRangeKernel( command_queue,
|
||||||
|
kernel,
|
||||||
|
1,
|
||||||
|
NULL,
|
||||||
|
global_work_size,
|
||||||
|
local_work_size,
|
||||||
|
0,
|
||||||
|
NULL,
|
||||||
|
&event);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
|
||||||
|
#ifdef TIMING
|
||||||
|
kernel_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// END
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU MEMORY COPY (CONTD.)
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DEVICE IN/OUT
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// ansD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueReadBuffer(command_queue, // The command queue.
|
||||||
|
ansD, // The image on the device.
|
||||||
|
CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
|
||||||
|
0, // Offset. None in this case.
|
||||||
|
count*sizeof(record), // Size to copy.
|
||||||
|
ans, // The pointer to the image on the host.
|
||||||
|
0, // Number of events in wait list. Not used.
|
||||||
|
NULL, // Event wait list. Not used.
|
||||||
|
&event); // Event object for determining status. Not used.
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
d2h_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// END
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU MEMORY DEALLOCATION
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_close_start, NULL);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Release kernels...
|
||||||
|
clReleaseKernel(kernel);
|
||||||
|
|
||||||
|
// Now the program...
|
||||||
|
clReleaseProgram(program);
|
||||||
|
|
||||||
|
// Clean up the device memory...
|
||||||
|
clReleaseMemObject(recordsD);
|
||||||
|
clReleaseMemObject(knodesD);
|
||||||
|
|
||||||
|
clReleaseMemObject(currKnodeD);
|
||||||
|
clReleaseMemObject(offsetD);
|
||||||
|
clReleaseMemObject(keysD);
|
||||||
|
clReleaseMemObject(ansD);
|
||||||
|
|
||||||
|
// Flush the queue
|
||||||
|
error = clFlush(command_queue);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// ...and finally, the queue and context.
|
||||||
|
clReleaseCommandQueue(command_queue);
|
||||||
|
|
||||||
|
// ???
|
||||||
|
clReleaseContext(context);
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_close_end, NULL);
|
||||||
|
tvsub(&tv_close_end, &tv_close_start, &tv);
|
||||||
|
close_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
tvsub(&tv_close_end, &tv_total_start, &tv);
|
||||||
|
total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// DISPLAY TIMING
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
|
||||||
|
printf("Init: %f\n", init_time);
|
||||||
|
printf("MemAlloc: %f\n", mem_alloc_time);
|
||||||
|
printf("HtoD: %f\n", h2d_time);
|
||||||
|
printf("Exec: %f\n", kernel_time);
|
||||||
|
printf("DtoH: %f\n", d2h_time);
|
||||||
|
printf("Close: %f\n", close_time);
|
||||||
|
printf("Total: %f\n", total_time);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// END
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// END
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// }
|
||||||
|
// #endif
|
31
tests/opencl/b+tree/kernel_gpu_opencl_wrapper.h
Executable file
31
tests/opencl/b+tree/kernel_gpu_opencl_wrapper.h
Executable file
|
@ -0,0 +1,31 @@
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// extern "C" {
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// KERNEL_GPU_OPENCL_WRAPPER HEADER
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
void
|
||||||
|
kernel_gpu_opencl_wrapper( record *records,
|
||||||
|
long records_mem,
|
||||||
|
knode *knodes,
|
||||||
|
long knodes_elem,
|
||||||
|
long knodes_mem,
|
||||||
|
|
||||||
|
int order,
|
||||||
|
long maxheight,
|
||||||
|
int count,
|
||||||
|
|
||||||
|
long *currKnode,
|
||||||
|
long *offset,
|
||||||
|
int *keys,
|
||||||
|
record *ans);
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// End
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// }
|
||||||
|
// #endif
|
886
tests/opencl/b+tree/kernel_gpu_opencl_wrapper_2.cc
Normal file
886
tests/opencl/b+tree/kernel_gpu_opencl_wrapper_2.cc
Normal file
|
@ -0,0 +1,886 @@
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// extern "C" {
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// INCLUDE
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// LIBRARIES
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include <CL/cl.h> // (in directory provided to compiler) needed by OpenCL types and functions
|
||||||
|
#include <string.h> // (in directory known to compiler) needed by memset
|
||||||
|
#include <stdio.h> // (in directory known to compiler) needed by printf, stderr
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// COMMON
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include "common.h" // (in directory provided here)
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// UTILITIES
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include "timer.h" // (in directory provided here)
|
||||||
|
#include "opencl.h"
|
||||||
|
#ifdef TIMING
|
||||||
|
#include "timing.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// HEADER
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include "kernel_gpu_opencl_wrapper_2.h" // (in directory provided here)
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// FUNCTION
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
int read_kernel_file2(const char* filename, uint8_t** data, size_t* size) {
|
||||||
|
if (nullptr == filename || nullptr == data || 0 == size)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
FILE* fp_ = fopen(filename, "r");
|
||||||
|
if (NULL == fp_) {
|
||||||
|
fprintf(stderr, "Failed to load kernel.2 %s\n", filename);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
fseek(fp_ , 0 , SEEK_END);
|
||||||
|
long fsize = ftell(fp_);
|
||||||
|
rewind(fp_);
|
||||||
|
|
||||||
|
*data = (uint8_t*)malloc(fsize);
|
||||||
|
*size = fread(*data, 1, fsize, fp_);
|
||||||
|
|
||||||
|
fclose(fp_);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
kernel_gpu_opencl_wrapper_2(knode *knodes,
|
||||||
|
long knodes_elem,
|
||||||
|
long knodes_mem,
|
||||||
|
|
||||||
|
int order,
|
||||||
|
long maxheight,
|
||||||
|
int count,
|
||||||
|
|
||||||
|
long *currKnode,
|
||||||
|
long *offset,
|
||||||
|
long *lastKnode,
|
||||||
|
long *offset_2,
|
||||||
|
int *start,
|
||||||
|
int *end,
|
||||||
|
int *recstart,
|
||||||
|
int *reclength)
|
||||||
|
{
|
||||||
|
cl_device_type device_type = CL_DEVICE_TYPE_GPU;
|
||||||
|
int platform_id_inuse = 0; // platform id in use (default: 0)
|
||||||
|
int device_id_inuse = 0; // platform id in use (default: 0)
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// CPU VARIABLES
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//Primitives for timing
|
||||||
|
#ifdef TIMING
|
||||||
|
struct timeval tv;
|
||||||
|
struct timeval tv_total_start, tv_total_end;
|
||||||
|
struct timeval tv_init_end;
|
||||||
|
struct timeval tv_h2d_start, tv_h2d_end;
|
||||||
|
struct timeval tv_d2h_start, tv_d2h_end;
|
||||||
|
struct timeval tv_kernel_start, tv_kernel_end;
|
||||||
|
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
|
||||||
|
struct timeval tv_close_start, tv_close_end;
|
||||||
|
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time= 0,
|
||||||
|
d2h_time = 0, close_time = 0, total_time = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_total_start, NULL);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU SETUP
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// INITIAL DRIVER OVERHEAD
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// cudaThreadSynchronize();
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// COMMON VARIABLES
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// common variables
|
||||||
|
cl_int error;
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// Get the number of available platforms
|
||||||
|
cl_uint num_platforms;
|
||||||
|
error = clGetPlatformIDs( 0,
|
||||||
|
NULL,
|
||||||
|
&num_platforms);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Get the list of available platforms
|
||||||
|
cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
|
||||||
|
error = clGetPlatformIDs( num_platforms,
|
||||||
|
platforms,
|
||||||
|
NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Select the 1st platform
|
||||||
|
cl_platform_id platform = platforms[platform_id_inuse];
|
||||||
|
|
||||||
|
// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
|
||||||
|
char pbuf[100];
|
||||||
|
error = clGetPlatformInfo( platform,
|
||||||
|
CL_PLATFORM_VENDOR,
|
||||||
|
sizeof(pbuf),
|
||||||
|
pbuf,
|
||||||
|
NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
printf("Platform: %s\n", pbuf);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// GET DEVICE INFORMATION
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
cl_uint devices_size;
|
||||||
|
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &devices_size);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
if (devices_size == 0) {
|
||||||
|
printf("There are no devices for Platform %d\n", platform_id_inuse);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
printf("Device num: %u\n", devices_size);
|
||||||
|
// Get the list of devices (previousely selected for the context)
|
||||||
|
cl_device_id *devices = (cl_device_id *) malloc(devices_size);
|
||||||
|
error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices_size,
|
||||||
|
devices, NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Select the device
|
||||||
|
cl_device_id device;
|
||||||
|
device = devices[device_id_inuse];
|
||||||
|
|
||||||
|
// Check device type
|
||||||
|
error = clGetDeviceInfo(device, CL_DEVICE_TYPE,
|
||||||
|
sizeof(device_type), (void *)&device_type, NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
if(device_type == CL_DEVICE_TYPE_GPU)
|
||||||
|
printf("Creating GPU Context\n");
|
||||||
|
else if (device_type == CL_DEVICE_TYPE_CPU)
|
||||||
|
printf("Creating CPU Context\n");
|
||||||
|
else
|
||||||
|
printf("This Context Type Not Supported\n");
|
||||||
|
|
||||||
|
// Get the name of the selected device
|
||||||
|
error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf),
|
||||||
|
pbuf, NULL);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
printf("Device: %s\n", pbuf);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// CREATE CONTEXT FOR THE PLATFORM
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// Create context properties for selected platform
|
||||||
|
cl_context_properties context_properties[3] = { CL_CONTEXT_PLATFORM,
|
||||||
|
(cl_context_properties) platform,
|
||||||
|
0};
|
||||||
|
|
||||||
|
// Create context for selected platform being GPU
|
||||||
|
cl_context context;
|
||||||
|
context = clCreateContextFromType( context_properties,
|
||||||
|
device_type,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
&error);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// CREATE COMMAND QUEUE FOR THE DEVICE
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// Create a command queue
|
||||||
|
cl_command_queue command_queue;
|
||||||
|
#ifdef TIMING
|
||||||
|
command_queue = clCreateCommandQueue(context, device,
|
||||||
|
CL_QUEUE_PROFILING_ENABLE, &error);
|
||||||
|
#else
|
||||||
|
command_queue = clCreateCommandQueue(context, device, 0, &error);
|
||||||
|
#endif
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// CREATE PROGRAM, COMPILE IT
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// Load kernel source code from file
|
||||||
|
/*
|
||||||
|
const char *source = load_kernel_source("./kernel/kernel_gpu_opencl_2.cl");
|
||||||
|
size_t sourceSize = strlen(source);
|
||||||
|
|
||||||
|
// Create the program
|
||||||
|
cl_program program = clCreateProgramWithSource( context,
|
||||||
|
1,
|
||||||
|
&source,
|
||||||
|
&sourceSize,
|
||||||
|
&error);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
*/
|
||||||
|
uint8_t *kernel_bin = NULL;
|
||||||
|
size_t kernel_size;
|
||||||
|
cl_int binary_status = 0;
|
||||||
|
if (0 != read_kernel_file2("kernel2.pocl", &kernel_bin, &kernel_size))
|
||||||
|
std::abort();
|
||||||
|
|
||||||
|
cl_program program = clCreateProgramWithBinary(
|
||||||
|
context, 1, &devices[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &error);
|
||||||
|
free(kernel_bin);
|
||||||
|
|
||||||
|
|
||||||
|
char clOptions[110];
|
||||||
|
// sprintf(clOptions,"-I../../src");
|
||||||
|
sprintf(clOptions,"-I./../");
|
||||||
|
|
||||||
|
#ifdef DEFAULT_ORDER_2
|
||||||
|
sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER_2=%d", DEFAULT_ORDER_2);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
error = clBuildProgram(program, 1, &devices[device_id_inuse], NULL, NULL, NULL);
|
||||||
|
if(error != CL_SUCCESS) {
|
||||||
|
printf("ERROR: clBuildProgram() => %d\n", error); return; }
|
||||||
|
|
||||||
|
|
||||||
|
// Compile the program
|
||||||
|
/*
|
||||||
|
error = clBuildProgram( program,
|
||||||
|
1,
|
||||||
|
&device,
|
||||||
|
clOptions,
|
||||||
|
NULL,
|
||||||
|
NULL);*/
|
||||||
|
// Print warnings and errors from compilation
|
||||||
|
static char log[65536];
|
||||||
|
memset(log, 0, sizeof(log));
|
||||||
|
clGetProgramBuildInfo( program,
|
||||||
|
device,
|
||||||
|
CL_PROGRAM_BUILD_LOG,
|
||||||
|
sizeof(log)-1,
|
||||||
|
log,
|
||||||
|
NULL);
|
||||||
|
printf("-----OpenCL Compiler Output-----\n");
|
||||||
|
if (strstr(log,"warning:") || strstr(log, "error:"))
|
||||||
|
printf("<<<<\n%s\n>>>>\n", log);
|
||||||
|
printf("--------------------------------\n");
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Create kernel
|
||||||
|
cl_kernel kernel;
|
||||||
|
kernel = clCreateKernel(program,
|
||||||
|
"findRangeK",
|
||||||
|
&error);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_init_end, NULL);
|
||||||
|
tvsub(&tv_init_end, &tv_total_start, &tv);
|
||||||
|
init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// END
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU MEMORY MALLOC
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DEVICE IN
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// knodesD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem knodesD;
|
||||||
|
knodesD = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
knodes_mem,
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// currKnodeD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem currKnodeD;
|
||||||
|
currKnodeD = clCreateBuffer(context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(long),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// offsetD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem offsetD;
|
||||||
|
offsetD = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(long),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// lastKnodeD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem lastKnodeD;
|
||||||
|
lastKnodeD = clCreateBuffer(context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(long),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// offset_2D
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem offset_2D;
|
||||||
|
offset_2D = clCreateBuffer(context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(long),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// startD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem startD;
|
||||||
|
startD = clCreateBuffer(context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(int),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// endD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem endD;
|
||||||
|
endD = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(int),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DEVICE IN/OUT
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// ansDStart
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem ansDStart;
|
||||||
|
ansDStart = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(int),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// ansDLength
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_mem ansDLength;
|
||||||
|
ansDLength = clCreateBuffer( context,
|
||||||
|
CL_MEM_READ_WRITE,
|
||||||
|
count*sizeof(int),
|
||||||
|
NULL,
|
||||||
|
&error );
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_mem_alloc_end, NULL);
|
||||||
|
tvsub(&tv_mem_alloc_end, &tv_init_end, &tv);
|
||||||
|
mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// END
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU MEMORY COPY
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DEVICE IN
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// knodesD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
cl_event event;
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
knodesD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
knodes_mem, // size to be copied
|
||||||
|
knodes, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// currKnodeD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
currKnodeD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(long), // size to be copied
|
||||||
|
currKnode, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// offsetD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
offsetD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(long), // size to be copied
|
||||||
|
offset, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// lastKnodeD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
lastKnodeD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(long), // size to be copied
|
||||||
|
lastKnode, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// offset_2D
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
offset_2D, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(long), // size to be copied
|
||||||
|
offset_2, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// startD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
startD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(int), // size to be copied
|
||||||
|
start, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// endD
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
endD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(int), // size to be copied
|
||||||
|
end, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DEVICE IN/OUT
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// ansDStart
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
endD, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(int), // size to be copied
|
||||||
|
end, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// ansDLength
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueWriteBuffer( command_queue, // command queue
|
||||||
|
ansDLength, // destination
|
||||||
|
1, // block the source from access until this copy operation complates (1=yes, 0=no)
|
||||||
|
0, // offset in destination to write to
|
||||||
|
count*sizeof(int), // size to be copied
|
||||||
|
reclength, // source
|
||||||
|
0, // # of events in the list of events to wait for
|
||||||
|
NULL, // list of events to wait for
|
||||||
|
&event); // ID of this operation to be used by waiting operations
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// KERNEL
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// Execution Parameters
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
size_t local_work_size[1];
|
||||||
|
local_work_size[0] = order < 1024 ? order : 1024;
|
||||||
|
size_t global_work_size[1];
|
||||||
|
global_work_size[0] = count * local_work_size[0];
|
||||||
|
|
||||||
|
printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// Kernel Arguments
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
0,
|
||||||
|
sizeof(long),
|
||||||
|
(void *) &maxheight);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
1,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &knodesD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
2,
|
||||||
|
sizeof(long),
|
||||||
|
(void *) &knodes_elem);
|
||||||
|
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
3,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &currKnodeD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
4,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &offsetD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
5,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &lastKnodeD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
6,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &offset_2D);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
7,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &startD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
8,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &endD);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
9,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &ansDStart);
|
||||||
|
clSetKernelArg( kernel,
|
||||||
|
10,
|
||||||
|
sizeof(cl_mem),
|
||||||
|
(void *) &ansDLength);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// Kernel
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
error = clEnqueueNDRangeKernel( command_queue,
|
||||||
|
kernel,
|
||||||
|
1,
|
||||||
|
NULL,
|
||||||
|
global_work_size,
|
||||||
|
local_work_size,
|
||||||
|
0,
|
||||||
|
NULL,
|
||||||
|
&event);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
|
||||||
|
#ifdef TIMING
|
||||||
|
kernel_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// END
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU MEMORY COPY (CONTD.)
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DEVICE IN/OUT
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// ansDStart
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueReadBuffer(command_queue, // The command queue.
|
||||||
|
ansDStart, // The image on the device.
|
||||||
|
CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
|
||||||
|
0, // Offset. None in this case.
|
||||||
|
count*sizeof(int), // Size to copy.
|
||||||
|
recstart, // The pointer to the image on the host.
|
||||||
|
0, // Number of events in wait list. Not used.
|
||||||
|
NULL, // Event wait list. Not used.
|
||||||
|
&event); // Event object for determining status. Not used.
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
d2h_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// ansDLength
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
error = clEnqueueReadBuffer(command_queue, // The command queue.
|
||||||
|
ansDLength, // The image on the device.
|
||||||
|
CL_TRUE, // Blocking? (ie. Wait at this line until read has finished?)
|
||||||
|
0, // Offset. None in this case.
|
||||||
|
count*sizeof(int), // Size to copy.
|
||||||
|
reclength, // The pointer to the image on the host.
|
||||||
|
0, // Number of events in wait list. Not used.
|
||||||
|
NULL, // Event wait list. Not used.
|
||||||
|
&event); // Event object for determining status. Not used.
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
#ifdef TIMING
|
||||||
|
d2h_time += probe_event_time(event, command_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
//==================================================50
|
||||||
|
// END
|
||||||
|
//==================================================50
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// END
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// GPU MEMORY DEALLOCATION
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_close_start, NULL);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Release kernels...
|
||||||
|
clReleaseKernel(kernel);
|
||||||
|
|
||||||
|
// Now the program...
|
||||||
|
clReleaseProgram(program);
|
||||||
|
|
||||||
|
// Clean up the device memory...
|
||||||
|
clReleaseMemObject(knodesD);
|
||||||
|
|
||||||
|
clReleaseMemObject(currKnodeD);
|
||||||
|
clReleaseMemObject(offsetD);
|
||||||
|
clReleaseMemObject(lastKnodeD);
|
||||||
|
clReleaseMemObject(offset_2D);
|
||||||
|
clReleaseMemObject(startD);
|
||||||
|
clReleaseMemObject(endD);
|
||||||
|
clReleaseMemObject(ansDStart);
|
||||||
|
clReleaseMemObject(ansDLength);
|
||||||
|
|
||||||
|
// Flush the queue
|
||||||
|
error = clFlush(command_queue);
|
||||||
|
if (error != CL_SUCCESS)
|
||||||
|
fatal_CL(error, __LINE__);
|
||||||
|
|
||||||
|
// ...and finally, the queue and context.
|
||||||
|
clReleaseCommandQueue(command_queue);
|
||||||
|
|
||||||
|
// ???
|
||||||
|
clReleaseContext(context);
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_close_end, NULL);
|
||||||
|
tvsub(&tv_close_end, &tv_close_start, &tv);
|
||||||
|
close_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
tvsub(&tv_close_end, &tv_total_start, &tv);
|
||||||
|
total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// DISPLAY TIMING
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
|
||||||
|
printf("Init: %f\n", init_time);
|
||||||
|
printf("MemAlloc: %f\n", mem_alloc_time);
|
||||||
|
printf("HtoD: %f\n", h2d_time);
|
||||||
|
printf("Exec: %f\n", kernel_time);
|
||||||
|
printf("DtoH: %f\n", d2h_time);
|
||||||
|
printf("Close: %f\n", close_time);
|
||||||
|
printf("Total: %f\n", total_time);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// END
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// END
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// }
|
||||||
|
// #endif
|
33
tests/opencl/b+tree/kernel_gpu_opencl_wrapper_2.h
Executable file
33
tests/opencl/b+tree/kernel_gpu_opencl_wrapper_2.h
Executable file
|
@ -0,0 +1,33 @@
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// extern "C" {
|
||||||
|
// #endif
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// KERNEL_GPU_OPENCL_WRAPPER HEADER
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
void
|
||||||
|
kernel_gpu_opencl_wrapper_2(knode *knodes,
|
||||||
|
long knodes_elem,
|
||||||
|
long knodes_mem,
|
||||||
|
|
||||||
|
int order,
|
||||||
|
long maxheight,
|
||||||
|
int count,
|
||||||
|
|
||||||
|
long *currKnode,
|
||||||
|
long *offset,
|
||||||
|
long *lastKnode,
|
||||||
|
long *offset_2,
|
||||||
|
int *start,
|
||||||
|
int *end,
|
||||||
|
int *recstart,
|
||||||
|
int *reclength);
|
||||||
|
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
// End
|
||||||
|
//========================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
// #ifdef __cplusplus
|
||||||
|
// }
|
||||||
|
// #endif
|
2457
tests/opencl/b+tree/main.cc
Normal file
2457
tests/opencl/b+tree/main.cc
Normal file
File diff suppressed because it is too large
Load diff
19
tests/opencl/b+tree/main.h
Normal file
19
tests/opencl/b+tree/main.h
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
// # ifdef __cplusplus
|
||||||
|
// extern "C" {
|
||||||
|
// # endif
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// HEADER
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
int
|
||||||
|
main( int argc,
|
||||||
|
char *argv []);
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// END
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
// # ifdef __cplusplus
|
||||||
|
// }
|
||||||
|
// # endif
|
53
tests/opencl/b+tree/num.cc
Normal file
53
tests/opencl/b+tree/num.cc
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// DESCRIPTION
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
// Returns: 0 if string does not represent integer
|
||||||
|
// 1 if string represents integer
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// NUM CODE
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// ISINTEGER FUNCTION
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
int isInteger(char *str){
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// make sure it's not empty
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
if (*str == '\0'){
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// if any digit is not a number, return false
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
for(; *str != '\0'; str++){
|
||||||
|
if (*str < 48 || *str > 57){ // digit characters (need to include . if checking for float)
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// it got past all my checks so I think it's a number
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// END NUM CODE
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
21
tests/opencl/b+tree/num.h
Normal file
21
tests/opencl/b+tree/num.h
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// FILE HEADER
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// ISINTEGER FUNCTION PROTOTYPE
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
int isInteger(char *str);
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// END FILE HEADER
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
138
tests/opencl/b+tree/opencl.cc
Normal file
138
tests/opencl/b+tree/opencl.cc
Normal file
|
@ -0,0 +1,138 @@
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// INCLUDE/DEFINE
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
#include "opencl.h" // (in directory) function headers
|
||||||
|
#include <CL/cl.h>
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// LOAD KERNEL SOURCE CODE FUNCTION
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
char *
|
||||||
|
load_kernel_source(const char *filename)
|
||||||
|
{
|
||||||
|
// Open the source file
|
||||||
|
FILE *file = fopen(filename, "r");
|
||||||
|
if (file == NULL){
|
||||||
|
fatal("Error opening kernel source file\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Determine the size of the file
|
||||||
|
if (fseek(file, 0, SEEK_END)){
|
||||||
|
fatal("Error reading kernel source file\n");
|
||||||
|
}
|
||||||
|
size_t size = ftell(file);
|
||||||
|
|
||||||
|
// Allocate space for the source code (plus one for null-terminator)
|
||||||
|
char *source = (char *) malloc(size + 1);
|
||||||
|
|
||||||
|
// Read the source code into the string
|
||||||
|
fseek(file, 0, SEEK_SET);
|
||||||
|
// printf("Number of elements: %lu\nSize = %lu", fread(source, 1, size, file), size);
|
||||||
|
// exit(1);
|
||||||
|
if (fread(source, 1, size, file) != size){
|
||||||
|
fatal("Error reading kernel source file\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Null-terminate the string
|
||||||
|
source[size] = '\0';
|
||||||
|
|
||||||
|
// Return the pointer to the string
|
||||||
|
return source;
|
||||||
|
}
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// PRINT ERROR FUNCTION
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
void
|
||||||
|
fatal(const char *s)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Error: %s\n", s);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// PRINT OPENCL ERROR FUNCTION
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
void
|
||||||
|
fatal_CL(cl_int error, int line_no) {
|
||||||
|
|
||||||
|
printf("Error at line %d: ", line_no);
|
||||||
|
|
||||||
|
switch(error) {
|
||||||
|
|
||||||
|
case CL_SUCCESS: printf("CL_SUCCESS\n"); break;
|
||||||
|
case CL_DEVICE_NOT_FOUND: printf("CL_DEVICE_NOT_FOUND\n"); break;
|
||||||
|
case CL_DEVICE_NOT_AVAILABLE: printf("CL_DEVICE_NOT_AVAILABLE\n"); break;
|
||||||
|
case CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
|
||||||
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
|
||||||
|
case CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
|
||||||
|
case CL_PROFILING_INFO_NOT_AVAILABLE: printf("CL_PROFILING_INFO_NOT_AVAILABLE\n"); break;
|
||||||
|
case CL_MEM_COPY_OVERLAP: printf("CL_MEM_COPY_OVERLAP\n"); break;
|
||||||
|
case CL_IMAGE_FORMAT_MISMATCH: printf("CL_IMAGE_FORMAT_MISMATCH\n"); break;
|
||||||
|
case CL_IMAGE_FORMAT_NOT_SUPPORTED: printf("CL_IMAGE_FORMAT_NOT_SUPPORTED\n"); break;
|
||||||
|
case CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
|
||||||
|
case CL_MAP_FAILURE: printf("CL_MAP_FAILURE\n"); break;
|
||||||
|
case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
|
||||||
|
case CL_INVALID_DEVICE_TYPE: printf("CL_INVALID_DEVICE_TYPE\n"); break;
|
||||||
|
case CL_INVALID_PLATFORM: printf("CL_INVALID_PLATFORM\n"); break;
|
||||||
|
case CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
|
||||||
|
case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
|
||||||
|
case CL_INVALID_QUEUE_PROPERTIES: printf("CL_INVALID_QUEUE_PROPERTIES\n"); break;
|
||||||
|
case CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
|
||||||
|
case CL_INVALID_HOST_PTR: printf("CL_INVALID_HOST_PTR\n"); break;
|
||||||
|
case CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
|
||||||
|
case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n"); break;
|
||||||
|
case CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
|
||||||
|
case CL_INVALID_SAMPLER: printf("CL_INVALID_SAMPLER\n"); break;
|
||||||
|
case CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break;
|
||||||
|
case CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
|
||||||
|
case CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
|
||||||
|
case CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
|
||||||
|
case CL_INVALID_KERNEL_NAME: printf("CL_INVALID_KERNEL_NAME\n"); break;
|
||||||
|
case CL_INVALID_KERNEL_DEFINITION: printf("CL_INVALID_KERNEL_DEFINITION\n"); break;
|
||||||
|
case CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
|
||||||
|
case CL_INVALID_ARG_INDEX: printf("CL_INVALID_ARG_INDEX\n"); break;
|
||||||
|
case CL_INVALID_ARG_VALUE: printf("CL_INVALID_ARG_VALUE\n"); break;
|
||||||
|
case CL_INVALID_ARG_SIZE: printf("CL_INVALID_ARG_SIZE\n"); break;
|
||||||
|
case CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
|
||||||
|
case CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
|
||||||
|
case CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
|
||||||
|
case CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
|
||||||
|
case CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
|
||||||
|
case CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
|
||||||
|
case CL_INVALID_EVENT: printf("CL_INVALID_EVENT\n"); break;
|
||||||
|
case CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
|
||||||
|
case CL_INVALID_GL_OBJECT: printf("CL_INVALID_GL_OBJECT\n"); break;
|
||||||
|
case CL_INVALID_BUFFER_SIZE: printf("CL_INVALID_BUFFER_SIZE\n"); break;
|
||||||
|
case CL_INVALID_MIP_LEVEL: printf("CL_INVALID_MIP_LEVEL\n"); break;
|
||||||
|
case CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
|
||||||
|
|
||||||
|
#ifdef CL_VERSION_1_1
|
||||||
|
case CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
|
||||||
|
case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
|
||||||
|
/* case CL_INVALID_PROPERTY: printf("CL_INVALID_PROPERTY\n"); break; */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
default: printf("Invalid OpenCL error code\n");
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
exit(error);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// END
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
40
tests/opencl/b+tree/opencl.h
Normal file
40
tests/opencl/b+tree/opencl.h
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// INCLUDE/DEFINE
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
#include <stdio.h> // (in path known to compiler) needed by printf
|
||||||
|
|
||||||
|
#include <CL/cl.h> // (in path specified to compiler) needed by OpenCL types
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// LOAD KERNEL SOURCE CODE FUNCTION HEADER
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
char *
|
||||||
|
load_kernel_source(const char *filename);
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// PRINT ERROR FUNCTION HEADER
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
void
|
||||||
|
fatal(const char *s);
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// PRINT OPENCL ERROR FUNCTION HEADER
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
void
|
||||||
|
fatal_CL(cl_int error, int line_no);
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// END
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
37
tests/opencl/b+tree/timer.cc
Normal file
37
tests/opencl/b+tree/timer.cc
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// TIMER CODE
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// INCLUDE/DEFINE
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "timer.h"
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// FUNCTIONS
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
//====================================================================================================100
|
||||||
|
// DISPLAY TIME
|
||||||
|
//====================================================================================================100
|
||||||
|
|
||||||
|
// Returns the current system time in microseconds
|
||||||
|
long long get_time() {
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
return (tv.tv_sec * 1000000) + tv.tv_usec;
|
||||||
|
}
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// END TIMER CODE
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
23
tests/opencl/b+tree/timer.h
Normal file
23
tests/opencl/b+tree/timer.h
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// TIMER HEADER
|
||||||
|
#include <sys/time.h> // (in directory known to compiler) needed by gettimeofday
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
// FUNCTION PROTOTYPES
|
||||||
|
//======================================================================================================================================================150
|
||||||
|
|
||||||
|
long long
|
||||||
|
get_time();
|
||||||
|
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
// END TIMER HEADER
|
||||||
|
//===============================================================================================================================================================================================================200
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
40
tests/opencl/b+tree/timing.cpp
Normal file
40
tests/opencl/b+tree/timing.cpp
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "timing.h"
|
||||||
|
|
||||||
|
void time_measure_start(struct timeval *tv)
|
||||||
|
{
|
||||||
|
gettimeofday(tv, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void time_measure_end(struct timeval *tv)
|
||||||
|
{
|
||||||
|
struct timeval tv_now, tv_diff;
|
||||||
|
double d;
|
||||||
|
|
||||||
|
gettimeofday(&tv_now, NULL);
|
||||||
|
tvsub(&tv_now, tv, &tv_diff);
|
||||||
|
|
||||||
|
d = (double) tv_diff.tv_sec * 1000.0 + (double) tv_diff.tv_usec / 1000.0;
|
||||||
|
printf("Time (Memory Copy and Launch) = %f (ms)\n", d);
|
||||||
|
}
|
||||||
|
|
||||||
|
float probe_event_time(cl_event event, cl_command_queue command_queue) {
|
||||||
|
cl_int error=0;
|
||||||
|
cl_ulong eventStart,eventEnd;
|
||||||
|
clFinish(command_queue);
|
||||||
|
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
|
||||||
|
sizeof(cl_ulong),&eventStart,NULL);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
printf("ERROR (%d) in event start profiling.\n", error);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
|
||||||
|
sizeof(cl_ulong),&eventEnd,NULL);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
printf("ERROR (%d) in event end profiling.\n", error);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (float)((eventEnd-eventStart)/1000000.0);
|
||||||
|
}
|
25
tests/opencl/b+tree/timing.h
Normal file
25
tests/opencl/b+tree/timing.h
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
#ifndef __TIMING_H__
|
||||||
|
#define __TIMING_H__
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
void time_measure_start(struct timeval *tv);
|
||||||
|
void time_measure_end(struct timeval *tv);
|
||||||
|
|
||||||
|
/* tvsub: ret = x - y. */
|
||||||
|
static inline void tvsub(struct timeval *x,
|
||||||
|
struct timeval *y,
|
||||||
|
struct timeval *ret)
|
||||||
|
{
|
||||||
|
ret->tv_sec = x->tv_sec - y->tv_sec;
|
||||||
|
ret->tv_usec = x->tv_usec - y->tv_usec;
|
||||||
|
if (ret->tv_usec < 0) {
|
||||||
|
ret->tv_sec--;
|
||||||
|
ret->tv_usec += 1000000;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
float probe_event_time(cl_event, cl_command_queue);
|
||||||
|
|
||||||
|
#endif
|
8
tests/opencl/backprop/Makefile
Normal file
8
tests/opencl/backprop/Makefile
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
PROJECT = backprop
|
||||||
|
|
||||||
|
SRCS = backprop.cpp imagenet.cpp facetrain.cpp backprop_ocl.cpp timing.cpp
|
||||||
|
|
||||||
|
OPTS ?= -n 1024 -p 0 -d 0
|
||||||
|
#OPTS ?= -n 32768 -p 0 -d 0
|
||||||
|
|
||||||
|
include common.mk
|
473
tests/opencl/backprop/backprop.cpp
Normal file
473
tests/opencl/backprop/backprop.cpp
Normal file
|
@ -0,0 +1,473 @@
|
||||||
|
/*
|
||||||
|
******************************************************************
|
||||||
|
* HISTORY
|
||||||
|
* 15-Oct-94 Jeff Shufelt (js), Carnegie Mellon University
|
||||||
|
* Prepared for 15-681, Fall 1994.
|
||||||
|
* Modified by Shuai Che
|
||||||
|
******************************************************************
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <omp.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "backprop.h"
|
||||||
|
#include <math.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
|
||||||
|
#define ABS(x) (((x) > 0.0) ? (x) : (-(x)))
|
||||||
|
|
||||||
|
#define fastcopy(to,from,len)\
|
||||||
|
{\
|
||||||
|
register char *_to,*_from;\
|
||||||
|
register int _i,_l;\
|
||||||
|
_to = (char *)(to);\
|
||||||
|
_from = (char *)(from);\
|
||||||
|
_l = (len);\
|
||||||
|
for (_i = 0; _i < _l; _i++) *_to++ = *_from++;\
|
||||||
|
}
|
||||||
|
|
||||||
|
/*** Return random number between 0.0 and 1.0 ***/
|
||||||
|
float drnd()
|
||||||
|
{
|
||||||
|
return ((float) rand() / (float) BIGRND);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*** Return random number between -1.0 and 1.0 ***/
|
||||||
|
float dpn1()
|
||||||
|
{
|
||||||
|
return ((drnd() * 2.0) - 1.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*** The squashing function. Currently, it's a sigmoid. ***/
|
||||||
|
|
||||||
|
float squash(float x)
|
||||||
|
{
|
||||||
|
float m;
|
||||||
|
//x = -x;
|
||||||
|
//m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
|
||||||
|
//return(1.0 / (1.0 + m));
|
||||||
|
return (1.0 / (1.0 + exp(-x)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*** Allocate 1d array of floats ***/
|
||||||
|
|
||||||
|
float *alloc_1d_dbl(int n)
|
||||||
|
{
|
||||||
|
float *new_t;
|
||||||
|
|
||||||
|
new_t = (float *) malloc ((unsigned) (n * sizeof (float)));
|
||||||
|
if (new_t == NULL) {
|
||||||
|
printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
|
||||||
|
return (NULL);
|
||||||
|
}
|
||||||
|
return (new_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*** Allocate 2d array of floats ***/
|
||||||
|
|
||||||
|
float **alloc_2d_dbl(int m, int n)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
float **new_t;
|
||||||
|
|
||||||
|
new_t = (float **) malloc ((unsigned) (m * sizeof (float *)));
|
||||||
|
if (new_t == NULL) {
|
||||||
|
printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
|
||||||
|
return (NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < m; i++) {
|
||||||
|
new_t[i] = alloc_1d_dbl(n);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (new_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_randomize_weights(float **w, int m, int n)
|
||||||
|
{
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
for (i = 0; i <= m; i++) {
|
||||||
|
for (j = 0; j <= n; j++) {
|
||||||
|
w[i][j] = (float) rand()/RAND_MAX;
|
||||||
|
// w[i][j] = dpn1();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void bpnn_randomize_row(float *w, int m)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
for (i = 0; i <= m; i++) {
|
||||||
|
//w[i] = (float) rand()/RAND_MAX;
|
||||||
|
w[i] = 0.1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_zero_weights(float **w, int m, int n)
|
||||||
|
{
|
||||||
|
int i, j;
|
||||||
|
|
||||||
|
for (i = 0; i <= m; i++) {
|
||||||
|
for (j = 0; j <= n; j++) {
|
||||||
|
w[i][j] = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_initialize(int seed)
|
||||||
|
{
|
||||||
|
printf("Random number generator seed: %d\n", seed);
|
||||||
|
srand(seed);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
BPNN *bpnn_internal_create(int n_in, int n_hidden, int n_out)
|
||||||
|
{
|
||||||
|
BPNN *newnet;
|
||||||
|
|
||||||
|
newnet = (BPNN *) malloc (sizeof (BPNN));
|
||||||
|
if (newnet == NULL) {
|
||||||
|
printf("BPNN_CREATE: Couldn't allocate neural network\n");
|
||||||
|
return (NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
newnet->input_n = n_in;
|
||||||
|
newnet->hidden_n = n_hidden;
|
||||||
|
newnet->output_n = n_out;
|
||||||
|
newnet->input_units = alloc_1d_dbl(n_in + 1);
|
||||||
|
newnet->hidden_units = alloc_1d_dbl(n_hidden + 1);
|
||||||
|
newnet->output_units = alloc_1d_dbl(n_out + 1);
|
||||||
|
|
||||||
|
newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1);
|
||||||
|
newnet->output_delta = alloc_1d_dbl(n_out + 1);
|
||||||
|
newnet->target = alloc_1d_dbl(n_out + 1);
|
||||||
|
|
||||||
|
newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
|
||||||
|
newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
|
||||||
|
|
||||||
|
newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
|
||||||
|
newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
|
||||||
|
|
||||||
|
return (newnet);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_free(BPNN *net)
|
||||||
|
{
|
||||||
|
int n1, n2, i;
|
||||||
|
|
||||||
|
n1 = net->input_n;
|
||||||
|
n2 = net->hidden_n;
|
||||||
|
|
||||||
|
free((char *) net->input_units);
|
||||||
|
free((char *) net->hidden_units);
|
||||||
|
free((char *) net->output_units);
|
||||||
|
|
||||||
|
free((char *) net->hidden_delta);
|
||||||
|
free((char *) net->output_delta);
|
||||||
|
free((char *) net->target);
|
||||||
|
|
||||||
|
for (i = 0; i <= n1; i++) {
|
||||||
|
free((char *) net->input_weights[i]);
|
||||||
|
free((char *) net->input_prev_weights[i]);
|
||||||
|
}
|
||||||
|
free((char *) net->input_weights);
|
||||||
|
free((char *) net->input_prev_weights);
|
||||||
|
|
||||||
|
for (i = 0; i <= n2; i++) {
|
||||||
|
free((char *) net->hidden_weights[i]);
|
||||||
|
free((char *) net->hidden_prev_weights[i]);
|
||||||
|
}
|
||||||
|
free((char *) net->hidden_weights);
|
||||||
|
free((char *) net->hidden_prev_weights);
|
||||||
|
|
||||||
|
free((char *) net);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*** Creates a new fully-connected network from scratch,
|
||||||
|
with the given numbers of input, hidden, and output units.
|
||||||
|
Threshold units are automatically included. All weights are
|
||||||
|
randomly initialized.
|
||||||
|
|
||||||
|
Space is also allocated for temporary storage (momentum weights,
|
||||||
|
error computations, etc).
|
||||||
|
***/
|
||||||
|
|
||||||
|
BPNN *bpnn_create(int n_in, int n_hidden, int n_out)
|
||||||
|
{
|
||||||
|
|
||||||
|
BPNN *newnet;
|
||||||
|
|
||||||
|
newnet = bpnn_internal_create(n_in, n_hidden, n_out);
|
||||||
|
|
||||||
|
#ifdef INITZERO
|
||||||
|
bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
|
||||||
|
#else
|
||||||
|
bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
|
||||||
|
#endif
|
||||||
|
bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
|
||||||
|
bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
|
||||||
|
bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
|
||||||
|
bpnn_randomize_row(newnet->target, n_out);
|
||||||
|
return (newnet);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2)
|
||||||
|
{
|
||||||
|
float sum;
|
||||||
|
int j, k;
|
||||||
|
|
||||||
|
/*** Set up thresholding unit ***/
|
||||||
|
l1[0] = 1.0;
|
||||||
|
#ifdef OPEN
|
||||||
|
omp_set_num_threads(NUM_THREAD);
|
||||||
|
#pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(+: sum) schedule(static)
|
||||||
|
#endif
|
||||||
|
/*** For each unit in second layer ***/
|
||||||
|
for (j = 1; j <= n2; j++) {
|
||||||
|
|
||||||
|
/*** Compute weighted sum of its inputs ***/
|
||||||
|
sum = 0.0;
|
||||||
|
for (k = 0; k <= n1; k++) {
|
||||||
|
sum += conn[k][j] * l1[k];
|
||||||
|
}
|
||||||
|
l2[j] = squash(sum);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err)
|
||||||
|
{
|
||||||
|
int j;
|
||||||
|
float o, t, errsum;
|
||||||
|
errsum = 0.0;
|
||||||
|
for (j = 1; j <= nj; j++) {
|
||||||
|
o = output[j];
|
||||||
|
t = target[j];
|
||||||
|
delta[j] = o * (1.0 - o) * (t - o);
|
||||||
|
errsum += ABS(delta[j]);
|
||||||
|
}
|
||||||
|
*err = errsum;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_hidden_error(float *delta_h,
|
||||||
|
int nh,
|
||||||
|
float *delta_o,
|
||||||
|
int no,
|
||||||
|
float **who,
|
||||||
|
float *hidden,
|
||||||
|
float *err)
|
||||||
|
{
|
||||||
|
int j, k;
|
||||||
|
float h, sum, errsum;
|
||||||
|
|
||||||
|
errsum = 0.0;
|
||||||
|
for (j = 1; j <= nh; j++) {
|
||||||
|
h = hidden[j];
|
||||||
|
sum = 0.0;
|
||||||
|
for (k = 1; k <= no; k++) {
|
||||||
|
sum += delta_o[k] * who[j][k];
|
||||||
|
}
|
||||||
|
delta_h[j] = h * (1.0 - h) * sum;
|
||||||
|
errsum += ABS(delta_h[j]);
|
||||||
|
}
|
||||||
|
*err = errsum;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw)
|
||||||
|
{
|
||||||
|
float new_dw;
|
||||||
|
int k, j;
|
||||||
|
ly[0] = 1.0;
|
||||||
|
|
||||||
|
#ifdef OPEN
|
||||||
|
omp_set_num_threads(NUM_THREAD);
|
||||||
|
#pragma omp parallel for \
|
||||||
|
shared(oldw, w, delta) \
|
||||||
|
private(j, k, new_dw) \
|
||||||
|
firstprivate(ndelta, nly, momentum)
|
||||||
|
#endif
|
||||||
|
for (j = 1; j <= ndelta; j++) {
|
||||||
|
for (k = 0; k <= nly; k++) {
|
||||||
|
new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
|
||||||
|
w[k][j] += new_dw;
|
||||||
|
oldw[k][j] = new_dw;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_feedforward(BPNN *net)
|
||||||
|
{
|
||||||
|
int in, hid, out;
|
||||||
|
|
||||||
|
in = net->input_n;
|
||||||
|
hid = net->hidden_n;
|
||||||
|
out = net->output_n;
|
||||||
|
|
||||||
|
/*** Feed forward input activations. ***/
|
||||||
|
bpnn_layerforward(net->input_units, net->hidden_units,
|
||||||
|
net->input_weights, in, hid);
|
||||||
|
bpnn_layerforward(net->hidden_units, net->output_units,
|
||||||
|
net->hidden_weights, hid, out);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_train(BPNN *net, float *eo, float *eh)
|
||||||
|
{
|
||||||
|
int in, hid, out;
|
||||||
|
float out_err, hid_err;
|
||||||
|
|
||||||
|
in = net->input_n;
|
||||||
|
hid = net->hidden_n;
|
||||||
|
out = net->output_n;
|
||||||
|
|
||||||
|
/*** Feed forward input activations. ***/
|
||||||
|
bpnn_layerforward(net->input_units, net->hidden_units,
|
||||||
|
net->input_weights, in, hid);
|
||||||
|
bpnn_layerforward(net->hidden_units, net->output_units,
|
||||||
|
net->hidden_weights, hid, out);
|
||||||
|
|
||||||
|
/*** Compute error on output and hidden units. ***/
|
||||||
|
bpnn_output_error(net->output_delta, net->target, net->output_units,
|
||||||
|
out, &out_err);
|
||||||
|
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
|
||||||
|
net->hidden_weights, net->hidden_units, &hid_err);
|
||||||
|
*eo = out_err;
|
||||||
|
*eh = hid_err;
|
||||||
|
|
||||||
|
/*** Adjust input and hidden weights. ***/
|
||||||
|
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
|
||||||
|
net->hidden_weights, net->hidden_prev_weights);
|
||||||
|
bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
|
||||||
|
net->input_weights, net->input_prev_weights);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void bpnn_save(BPNN *net, char *filename)
|
||||||
|
{
|
||||||
|
int n1, n2, n3, i, j, memcnt;
|
||||||
|
float dvalue, **w;
|
||||||
|
char *mem;
|
||||||
|
///add//
|
||||||
|
FILE *pFile;
|
||||||
|
pFile = fopen( filename, "w+" );
|
||||||
|
///////
|
||||||
|
/*
|
||||||
|
if ((fd = creat(filename, 0644)) == -1) {
|
||||||
|
printf("BPNN_SAVE: Cannot create '%s'\n", filename);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
n1 = net->input_n; n2 = net->hidden_n; n3 = net->output_n;
|
||||||
|
printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);
|
||||||
|
//fflush(stdout);
|
||||||
|
|
||||||
|
//write(fd, (char *) &n1, sizeof(int));
|
||||||
|
//write(fd, (char *) &n2, sizeof(int));
|
||||||
|
//write(fd, (char *) &n3, sizeof(int));
|
||||||
|
|
||||||
|
fwrite( (char *) &n1 , sizeof(char), sizeof(char), pFile);
|
||||||
|
fwrite( (char *) &n2 , sizeof(char), sizeof(char), pFile);
|
||||||
|
fwrite( (char *) &n3 , sizeof(char), sizeof(char), pFile);
|
||||||
|
|
||||||
|
memcnt = 0;
|
||||||
|
w = net->input_weights;
|
||||||
|
mem = (char *) malloc ((unsigned) ((n1+1) * (n2+1) * sizeof(float)));
|
||||||
|
for (i = 0; i <= n1; i++) {
|
||||||
|
for (j = 0; j <= n2; j++) {
|
||||||
|
dvalue = w[i][j];
|
||||||
|
fastcopy(&mem[memcnt], &dvalue, sizeof(float));
|
||||||
|
memcnt += sizeof(float);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//write(fd, mem, (n1+1) * (n2+1) * sizeof(float));
|
||||||
|
fwrite( mem , (unsigned)(sizeof(float)), (unsigned) ((n1+1) * (n2+1) * sizeof(float)) , pFile);
|
||||||
|
free(mem);
|
||||||
|
|
||||||
|
memcnt = 0;
|
||||||
|
w = net->hidden_weights;
|
||||||
|
mem = (char *) malloc ((unsigned) ((n2+1) * (n3+1) * sizeof(float)));
|
||||||
|
for (i = 0; i <= n2; i++) {
|
||||||
|
for (j = 0; j <= n3; j++) {
|
||||||
|
dvalue = w[i][j];
|
||||||
|
fastcopy(&mem[memcnt], &dvalue, sizeof(float));
|
||||||
|
memcnt += sizeof(float);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
//write(fd, mem, (n2+1) * (n3+1) * sizeof(float));
|
||||||
|
fwrite( mem , sizeof(float), (unsigned) ((n2+1) * (n3+1) * sizeof(float)) , pFile);
|
||||||
|
free(mem);
|
||||||
|
|
||||||
|
fclose(pFile);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
BPNN *bpnn_read(char *filename)
|
||||||
|
{
|
||||||
|
char *mem;
|
||||||
|
BPNN *new_t;
|
||||||
|
int fd, n1, n2, n3, i, j, memcnt;
|
||||||
|
|
||||||
|
if ((fd = open(filename, 0, 0644)) == -1) {
|
||||||
|
return (NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Reading '%s'\n", filename); //fflush(stdout);
|
||||||
|
|
||||||
|
read(fd, (char *) &n1, sizeof(int));
|
||||||
|
read(fd, (char *) &n2, sizeof(int));
|
||||||
|
read(fd, (char *) &n3, sizeof(int));
|
||||||
|
new_t = bpnn_internal_create(n1, n2, n3);
|
||||||
|
|
||||||
|
printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
|
||||||
|
printf("Reading input weights..."); //fflush(stdout);
|
||||||
|
|
||||||
|
memcnt = 0;
|
||||||
|
mem = (char *) malloc ((unsigned) ((n1+1) * (n2+1) * sizeof(float)));
|
||||||
|
read(fd, mem, (n1+1) * (n2+1) * sizeof(float));
|
||||||
|
for (i = 0; i <= n1; i++) {
|
||||||
|
for (j = 0; j <= n2; j++) {
|
||||||
|
fastcopy(&(new_t->input_weights[i][j]), &mem[memcnt], sizeof(float));
|
||||||
|
memcnt += sizeof(float);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(mem);
|
||||||
|
|
||||||
|
printf("Done\nReading hidden weights..."); //fflush(stdout);
|
||||||
|
|
||||||
|
memcnt = 0;
|
||||||
|
mem = (char *) malloc ((unsigned) ((n2+1) * (n3+1) * sizeof(float)));
|
||||||
|
read(fd, mem, (n2+1) * (n3+1) * sizeof(float));
|
||||||
|
for (i = 0; i <= n2; i++) {
|
||||||
|
for (j = 0; j <= n3; j++) {
|
||||||
|
fastcopy(&(new_t->hidden_weights[i][j]), &mem[memcnt], sizeof(float));
|
||||||
|
memcnt += sizeof(float);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
free(mem);
|
||||||
|
close(fd);
|
||||||
|
|
||||||
|
printf("Done\n"); //fflush(stdout);
|
||||||
|
|
||||||
|
bpnn_zero_weights(new_t->input_prev_weights, n1, n2);
|
||||||
|
bpnn_zero_weights(new_t->hidden_prev_weights, n2, n3);
|
||||||
|
|
||||||
|
return (new_t);
|
||||||
|
}
|
70
tests/opencl/backprop/backprop.h
Normal file
70
tests/opencl/backprop/backprop.h
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
#ifndef _BACKPROP_H_
|
||||||
|
#define _BACKPROP_H_
|
||||||
|
|
||||||
|
#define BIGRND 0x7fffffff
|
||||||
|
#define THREADS 256
|
||||||
|
#define WIDTH 16 // shared memory width
|
||||||
|
#define HEIGHT 16 // shared memory height
|
||||||
|
#define BLOCK_SIZE 16
|
||||||
|
|
||||||
|
#define ETA 0.3 //eta value
|
||||||
|
#define MOMENTUM 0.3 //momentum value
|
||||||
|
#define NUM_THREAD 4 //OpenMP threads
|
||||||
|
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int input_n; /* number of input units */
|
||||||
|
int hidden_n; /* number of hidden units */
|
||||||
|
int output_n; /* number of output units */
|
||||||
|
|
||||||
|
float *input_units; /* the input units */
|
||||||
|
float *hidden_units; /* the hidden units */
|
||||||
|
float *output_units; /* the output units */
|
||||||
|
|
||||||
|
float *hidden_delta; /* storage for hidden unit error */
|
||||||
|
float *output_delta; /* storage for output unit error */
|
||||||
|
|
||||||
|
float *target; /* storage for target vector */
|
||||||
|
|
||||||
|
float **input_weights; /* weights from input to hidden layer */
|
||||||
|
float **hidden_weights; /* weights from hidden to output layer */
|
||||||
|
|
||||||
|
/*** The next two are for momentum ***/
|
||||||
|
float **input_prev_weights; /* previous change on input to hidden wgt */
|
||||||
|
float **hidden_prev_weights; /* previous change on hidden to output wgt */
|
||||||
|
} BPNN;
|
||||||
|
|
||||||
|
|
||||||
|
/*** User-level functions ***/
|
||||||
|
|
||||||
|
//void bpnn_initialize();
|
||||||
|
void bpnn_initialize(int seed);
|
||||||
|
BPNN *bpnn_create(int n_in, int n_hidden, int n_out);
|
||||||
|
void bpnn_free(BPNN *net);
|
||||||
|
//BPNN *bpnn_create();
|
||||||
|
//void bpnn_free();
|
||||||
|
void bpnn_train(BPNN *net, float *eo, float *eh);
|
||||||
|
//void bpnn_train();
|
||||||
|
//void bpnn_feedforward();
|
||||||
|
void bpnn_feedforward(BPNN *net);
|
||||||
|
void bpnn_save(BPNN *net, char *filename);
|
||||||
|
//void bpnn_save();
|
||||||
|
//BPNN *bpnn_read();
|
||||||
|
BPNN *bpnn_read(char *filename);
|
||||||
|
void load(BPNN *net);
|
||||||
|
int bpnn_train_kernel(BPNN *net, float *eo, float *eh);
|
||||||
|
void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2);
|
||||||
|
void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err);
|
||||||
|
void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err);
|
||||||
|
void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw);
|
||||||
|
int setup(int argc, char** argv);
|
||||||
|
float **alloc_2d_dbl(int m, int n);
|
||||||
|
float squash(float x);
|
||||||
|
|
||||||
|
/*** OpenCL config variables ***/
|
||||||
|
extern int platform_id_inuse;
|
||||||
|
extern int device_id_inuse;
|
||||||
|
extern cl_device_type device_type;
|
||||||
|
|
||||||
|
#endif
|
402
tests/opencl/backprop/backprop_ocl.cpp
Normal file
402
tests/opencl/backprop/backprop_ocl.cpp
Normal file
|
@ -0,0 +1,402 @@
|
||||||
|
// includes, system
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include "backprop.h"
|
||||||
|
|
||||||
|
#ifdef NV //NVIDIA
|
||||||
|
#include <oclUtils.h>
|
||||||
|
#else
|
||||||
|
#include <CL/cl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
#include "timing.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// local variables
|
||||||
|
static cl_context context;
|
||||||
|
static cl_command_queue cmd_queue;
|
||||||
|
static cl_device_id * device_list;
|
||||||
|
static cl_uint num_devices;
|
||||||
|
|
||||||
|
// OCL config
|
||||||
|
int platform_id_inuse = 0; // platform id in use (default: 0)
|
||||||
|
int device_id_inuse = 0; //device id in use (default : 0)
|
||||||
|
cl_device_type device_type = CL_DEVICE_TYPE_GPU;
|
||||||
|
|
||||||
|
//Primitives for timing
|
||||||
|
#ifdef TIMING
|
||||||
|
struct timeval tv;
|
||||||
|
struct timeval tv_total_start, tv_total_end;
|
||||||
|
struct timeval tv_init_end;
|
||||||
|
struct timeval tv_h2d_start, tv_h2d_end;
|
||||||
|
struct timeval tv_d2h_start, tv_d2h_end;
|
||||||
|
struct timeval tv_kernel_start, tv_kernel_end;
|
||||||
|
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
|
||||||
|
struct timeval tv_close_start, tv_close_end;
|
||||||
|
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
|
||||||
|
d2h_time = 0, close_time = 0, total_time = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static int initialize(void)
|
||||||
|
{
|
||||||
|
cl_int result;
|
||||||
|
size_t size;
|
||||||
|
cl_uint num_platforms;
|
||||||
|
|
||||||
|
// get OpenCL platforms
|
||||||
|
if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(0,0,*) failed\n"); return -1; }
|
||||||
|
cl_platform_id all_platform_id[num_platforms];
|
||||||
|
if (clGetPlatformIDs(num_platforms, all_platform_id, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(*,*,0) failed\n"); return -1; }
|
||||||
|
cl_platform_id platform_id = all_platform_id[platform_id_inuse];
|
||||||
|
|
||||||
|
// get device
|
||||||
|
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices) != CL_SUCCESS) { printf("ERROR: clGetDeviceIDs failed\n"); return -1; };
|
||||||
|
printf("num_devices = %d\n", num_devices);
|
||||||
|
if(device_id_inuse > num_devices) {
|
||||||
|
printf("Invalid Device Number\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
device_list = new cl_device_id[num_devices];
|
||||||
|
//device_list = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
|
||||||
|
if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
|
||||||
|
if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, num_devices, device_list, NULL) != CL_SUCCESS) { printf("ERROR: clGetDeviceIDs failed\n"); return -1; };
|
||||||
|
|
||||||
|
// get device type
|
||||||
|
if (clGetDeviceInfo(device_list[device_id_inuse], CL_DEVICE_TYPE, sizeof(device_type), (void *)&device_type, NULL)!= CL_SUCCESS) { printf("ERROR: clGetDeviceIDs failed\n"); return -1; };
|
||||||
|
|
||||||
|
// create OpenCL context
|
||||||
|
cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0};
|
||||||
|
context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
|
||||||
|
if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", device_type == CL_DEVICE_TYPE_GPU ? "GPU" : "CPU"); return -1; }
|
||||||
|
|
||||||
|
// create command queue for the specific device
|
||||||
|
#ifdef TIMING
|
||||||
|
cmd_queue = clCreateCommandQueue( context, device_list[device_id_inuse], CL_QUEUE_PROFILING_ENABLE, NULL );
|
||||||
|
#else
|
||||||
|
cmd_queue = clCreateCommandQueue( context, device_list[device_id_inuse], 0, NULL );
|
||||||
|
#endif
|
||||||
|
if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int shutdown()
|
||||||
|
{
|
||||||
|
// release resources
|
||||||
|
if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
|
||||||
|
if( context ) clReleaseContext( context );
|
||||||
|
if( device_list ) delete[] device_list;
|
||||||
|
|
||||||
|
// reset all variables
|
||||||
|
cmd_queue = 0;
|
||||||
|
context = 0;
|
||||||
|
device_list = 0;
|
||||||
|
num_devices = 0;
|
||||||
|
device_type = CL_DEVICE_TYPE_GPU;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
double gettime() {
|
||||||
|
struct timeval t;
|
||||||
|
gettimeofday(&t,NULL);
|
||||||
|
return t.tv_sec+t.tv_usec*1e-6;
|
||||||
|
}
|
||||||
|
|
||||||
|
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||||
|
if (nullptr == filename || nullptr == data || 0 == size)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
FILE* fp = fopen(filename, "r");
|
||||||
|
if (NULL == fp) {
|
||||||
|
fprintf(stderr, "Failed to load kernel.");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
fseek(fp , 0 , SEEK_END);
|
||||||
|
long fsize = ftell(fp);
|
||||||
|
rewind(fp);
|
||||||
|
|
||||||
|
*data = (uint8_t*)malloc(fsize);
|
||||||
|
*size = fread(*data, 1, fsize, fp);
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
unsigned int num_threads = 0;
|
||||||
|
unsigned int num_blocks = 0;
|
||||||
|
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
// Program main
|
||||||
|
////////////////////////////////////////////////////////////////////////////////
|
||||||
|
int
|
||||||
|
main( int argc, char** argv)
|
||||||
|
{
|
||||||
|
setup(argc, argv);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
|
||||||
|
{
|
||||||
|
int in, hid, out;
|
||||||
|
float out_err, hid_err;
|
||||||
|
|
||||||
|
in = net->input_n;
|
||||||
|
hid = net->hidden_n;
|
||||||
|
out = net->output_n;
|
||||||
|
|
||||||
|
int sourcesize = 1024*1024;
|
||||||
|
char * source = (char *)calloc(sourcesize, sizeof(char));
|
||||||
|
if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
|
||||||
|
|
||||||
|
// read the kernel core source
|
||||||
|
const char * kernel_bp1 = "bpnn_layerforward_ocl";
|
||||||
|
const char * kernel_bp2 = "bpnn_adjust_weights_ocl";
|
||||||
|
//const char * tempchar = "./backprop_kernel.cl";
|
||||||
|
/*
|
||||||
|
FILE * fp = fopen(tempchar, "rb");
|
||||||
|
if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
|
||||||
|
fread(source + strlen(source), sourcesize, 1, fp);
|
||||||
|
fclose(fp);
|
||||||
|
*/
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_total_start, NULL);
|
||||||
|
#endif
|
||||||
|
if(initialize()) return -1;
|
||||||
|
|
||||||
|
// compile kernel
|
||||||
|
cl_int err = 0;
|
||||||
|
/*
|
||||||
|
const char * slist[2] = { source, 0 };
|
||||||
|
cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
|
||||||
|
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
|
||||||
|
{ // show warnings/errors
|
||||||
|
//static char log[65536]; memset(log, 0, sizeof(log));
|
||||||
|
//cl_device_id device_id = 0;
|
||||||
|
//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
|
||||||
|
//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
|
||||||
|
//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
|
||||||
|
}
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
|
||||||
|
*/
|
||||||
|
uint8_t *kernel_bin = NULL;
|
||||||
|
size_t kernel_size;
|
||||||
|
cl_int binary_status = 0;
|
||||||
|
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||||
|
std::abort();
|
||||||
|
|
||||||
|
cl_program prog = clCreateProgramWithBinary(
|
||||||
|
context, 1, &device_list[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
|
||||||
|
free(kernel_bin);
|
||||||
|
|
||||||
|
err = clBuildProgram(prog, 1, &device_list[device_id_inuse], NULL, NULL, NULL);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
|
||||||
|
|
||||||
|
|
||||||
|
cl_kernel kernel1;
|
||||||
|
cl_kernel kernel2;
|
||||||
|
kernel1 = clCreateKernel(prog, kernel_bp1, &err);
|
||||||
|
kernel2 = clCreateKernel(prog, kernel_bp2, &err);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
|
||||||
|
clReleaseProgram(prog);
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_init_end, NULL);
|
||||||
|
tvsub(&tv_init_end, &tv_total_start, &tv);
|
||||||
|
init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
float *input_weights_one_dim;
|
||||||
|
float *input_weights_prev_one_dim;
|
||||||
|
float * partial_sum;
|
||||||
|
float sum;
|
||||||
|
float num_blocks = in / BLOCK_SIZE;
|
||||||
|
|
||||||
|
input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
|
||||||
|
input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
|
||||||
|
partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
|
||||||
|
|
||||||
|
// set global and local workitems
|
||||||
|
size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
|
||||||
|
size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
|
||||||
|
|
||||||
|
// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
|
||||||
|
// todo: fix mem allocation
|
||||||
|
int m = 0;
|
||||||
|
for (int k = 0; k <= in; k++) {
|
||||||
|
for (int j = 0; j <= hid; j++) {
|
||||||
|
input_weights_one_dim[m] = net->input_weights[k][j];
|
||||||
|
input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
|
||||||
|
m++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_mem input_hidden_ocl;
|
||||||
|
cl_mem input_ocl;
|
||||||
|
cl_mem output_hidden_ocl;
|
||||||
|
cl_mem hidden_partial_sum;
|
||||||
|
cl_mem hidden_delta_ocl;
|
||||||
|
cl_mem input_prev_weights_ocl;
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_mem_alloc_start, NULL);
|
||||||
|
#endif
|
||||||
|
input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
|
||||||
|
input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
|
||||||
|
output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
|
||||||
|
hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
|
||||||
|
hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
|
||||||
|
input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_mem_alloc_end, NULL);
|
||||||
|
tvsub(&tv_mem_alloc_end, &tv_mem_alloc_start, &tv);
|
||||||
|
mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
printf("Performing %s computation\n", device_type == CL_DEVICE_TYPE_GPU ? "GPU" : "CPU");
|
||||||
|
cl_event event;
|
||||||
|
cl_event write_event[3];
|
||||||
|
|
||||||
|
//write buffers
|
||||||
|
err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, &write_event[0]);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
|
||||||
|
|
||||||
|
err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, &write_event[1]);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(write_event[0],cmd_queue);
|
||||||
|
h2d_time += probe_event_time(write_event[1],cmd_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(write_event[0]);
|
||||||
|
clReleaseEvent(write_event[1]);
|
||||||
|
|
||||||
|
clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
|
||||||
|
clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
|
||||||
|
clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
|
||||||
|
clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
|
||||||
|
clSetKernelArg(kernel1, 4, sizeof(float) * HEIGHT, (void*)NULL );
|
||||||
|
clSetKernelArg(kernel1, 5, sizeof(float ) * HEIGHT * WIDTH, (void*)NULL );
|
||||||
|
clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
|
||||||
|
clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
|
||||||
|
|
||||||
|
err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 2, NULL, global_work, local_work, 0, 0, &event);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
|
||||||
|
#ifdef TIMING
|
||||||
|
kernel_time += probe_event_time(event,cmd_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, &event);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: partial sum\n"); return -1; }
|
||||||
|
#ifdef TIMING
|
||||||
|
d2h_time += probe_event_time(event,cmd_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
for (int j = 1; j <= hid; j++) {
|
||||||
|
sum = 0.0;
|
||||||
|
for (int k = 0; k < num_blocks; k++) {
|
||||||
|
sum += partial_sum[k * hid + j-1] ;
|
||||||
|
}
|
||||||
|
sum += net->input_weights[0][j];
|
||||||
|
net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
|
||||||
|
bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
|
||||||
|
bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
|
||||||
|
bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
|
||||||
|
|
||||||
|
err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl, 1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, &write_event[0]);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
|
||||||
|
|
||||||
|
err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, &write_event[1]);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
|
||||||
|
|
||||||
|
err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, &write_event[2]);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(write_event[0],cmd_queue);
|
||||||
|
h2d_time += probe_event_time(write_event[1],cmd_queue);
|
||||||
|
h2d_time += probe_event_time(write_event[2],cmd_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(write_event[0]);
|
||||||
|
clReleaseEvent(write_event[1]);
|
||||||
|
clReleaseEvent(write_event[2]);
|
||||||
|
|
||||||
|
clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
|
||||||
|
clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
|
||||||
|
clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
|
||||||
|
clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
|
||||||
|
clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
|
||||||
|
clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
|
||||||
|
|
||||||
|
err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, &event);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
|
||||||
|
#ifdef TIMING
|
||||||
|
kernel_time += probe_event_time(event,cmd_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, &event);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_ocl\n"); return -1; }
|
||||||
|
#ifdef TIMING
|
||||||
|
d2h_time += probe_event_time(event,cmd_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, &event);
|
||||||
|
if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }
|
||||||
|
#ifdef TIMING
|
||||||
|
d2h_time += probe_event_time(event,cmd_queue);
|
||||||
|
#endif
|
||||||
|
clReleaseEvent(event);
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_close_start, NULL);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
clReleaseMemObject(input_ocl);
|
||||||
|
clReleaseMemObject(output_hidden_ocl);
|
||||||
|
clReleaseMemObject(input_hidden_ocl);
|
||||||
|
clReleaseMemObject(hidden_partial_sum);
|
||||||
|
clReleaseMemObject(input_prev_weights_ocl);
|
||||||
|
|
||||||
|
free(input_weights_prev_one_dim);
|
||||||
|
free(partial_sum);
|
||||||
|
free(input_weights_one_dim);
|
||||||
|
|
||||||
|
shutdown();
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_close_end, NULL);
|
||||||
|
tvsub(&tv_close_end, &tv_close_start, &tv);
|
||||||
|
close_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
tvsub(&tv_close_end, &tv_total_start, &tv);
|
||||||
|
total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
|
||||||
|
printf("Init: %f\n", init_time);
|
||||||
|
printf("MemAlloc: %f\n", mem_alloc_time);
|
||||||
|
printf("HtoD: %f\n", h2d_time);
|
||||||
|
printf("Exec: %f\n", kernel_time);
|
||||||
|
printf("DtoH: %f\n", d2h_time);
|
||||||
|
printf("Close: %f\n", close_time);
|
||||||
|
printf("Total: %f\n", total_time);
|
||||||
|
#endif
|
||||||
|
}
|
123
tests/opencl/backprop/common.mk
Normal file
123
tests/opencl/backprop/common.mk
Normal file
|
@ -0,0 +1,123 @@
|
||||||
|
XLEN ?= 32
|
||||||
|
TOOLDIR ?= /opt
|
||||||
|
|
||||||
|
TARGET ?= opaesim
|
||||||
|
|
||||||
|
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
|
||||||
|
XRT_DEVICE_INDEX ?= 0
|
||||||
|
|
||||||
|
ifeq ($(XLEN),64)
|
||||||
|
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||||
|
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||||
|
K_CFLAGS += -march=rv64imafd -mabi=ilp64d
|
||||||
|
STARTUP_ADDR ?= 0x180000000
|
||||||
|
else
|
||||||
|
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||||
|
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||||
|
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||||
|
STARTUP_ADDR ?= 0x80000000
|
||||||
|
endif
|
||||||
|
|
||||||
|
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||||
|
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
|
||||||
|
|
||||||
|
POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
|
||||||
|
POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
|
||||||
|
|
||||||
|
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||||
|
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
|
||||||
|
|
||||||
|
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
|
||||||
|
|
||||||
|
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
|
||||||
|
LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
|
||||||
|
|
||||||
|
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
|
||||||
|
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
|
||||||
|
K_CFLAGS += -I$(VORTEX_KN_PATH)/include -DNDEBUG -DLLVM_VOTEX
|
||||||
|
K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a -lm
|
||||||
|
|
||||||
|
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||||
|
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing -Wno-unused-result
|
||||||
|
CXXFLAGS += -pthread
|
||||||
|
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||||
|
|
||||||
|
ifdef HOSTGPU
|
||||||
|
CXXFLAGS += -DHOSTGPU
|
||||||
|
LDFLAGS += -lOpenCL
|
||||||
|
else
|
||||||
|
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
|
||||||
|
endif
|
||||||
|
|
||||||
|
# Debugigng
|
||||||
|
#ifdef DEBUG
|
||||||
|
CXXFLAGS += -g -O0
|
||||||
|
#else
|
||||||
|
# CXXFLAGS += -O2 -DNDEBUG
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ifeq ($(TARGET), fpga)
|
||||||
|
OPAE_DRV_PATHS ?= libopae-c.so
|
||||||
|
else
|
||||||
|
ifeq ($(TARGET), asesim)
|
||||||
|
OPAE_DRV_PATHS ?= libopae-c-ase.so
|
||||||
|
else
|
||||||
|
ifeq ($(TARGET), opaesim)
|
||||||
|
OPAE_DRV_PATHS ?= libopae-c-sim.so
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
|
||||||
|
|
||||||
|
all: $(PROJECT) kernel.pocl
|
||||||
|
|
||||||
|
kernel.pocl: kernel.cl
|
||||||
|
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||||
|
|
||||||
|
#backprop.o: backprop.c
|
||||||
|
# $(CXX) $(CXXFLAGS) backprop.c -c -Wno-unused-result
|
||||||
|
|
||||||
|
%.cc.o: %.cc
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
%.cpp.o: %.cpp
|
||||||
|
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
%.c.o: %.c
|
||||||
|
$(CC) $(CXXFLAGS) -c $< -o $@
|
||||||
|
|
||||||
|
$(PROJECT): $(OBJS)
|
||||||
|
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||||
|
|
||||||
|
run-hostgpu: $(PROJECT) kernel.pocl
|
||||||
|
./$(PROJECT) $(OPTS)
|
||||||
|
|
||||||
|
run-simx: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
|
||||||
|
run-rtlsim: $(PROJECT) kernel.pocl
|
||||||
|
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
|
||||||
|
run-opae: $(PROJECT) kernel.pocl
|
||||||
|
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
|
||||||
|
run-xrt: $(PROJECT) kernel.pocl
|
||||||
|
ifeq ($(TARGET), hw)
|
||||||
|
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
else
|
||||||
|
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||||
|
endif
|
||||||
|
|
||||||
|
.depend: $(SRCS)
|
||||||
|
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf $(PROJECT) *.o .depend
|
||||||
|
|
||||||
|
clean-all: clean
|
||||||
|
rm -rf *.dump *.pocl
|
||||||
|
|
||||||
|
ifneq ($(MAKECMDGOALS),clean)
|
||||||
|
-include .depend
|
||||||
|
endif
|
70
tests/opencl/backprop/facetrain.cpp
Normal file
70
tests/opencl/backprop/facetrain.cpp
Normal file
|
@ -0,0 +1,70 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include "backprop.h"
|
||||||
|
#include "omp.h"
|
||||||
|
#include <CL/cl.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
extern char *strcpy();
|
||||||
|
extern void exit();
|
||||||
|
|
||||||
|
int layer_size = 0;
|
||||||
|
|
||||||
|
void backprop_face()
|
||||||
|
{
|
||||||
|
BPNN *net;
|
||||||
|
int i;
|
||||||
|
float out_err, hid_err;
|
||||||
|
net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
|
||||||
|
|
||||||
|
printf("Input layer size : %d\n", layer_size);
|
||||||
|
load(net);
|
||||||
|
//entering the training kernel, only one iteration
|
||||||
|
printf("Starting training kernel\n");
|
||||||
|
bpnn_train_kernel(net, &out_err, &hid_err);
|
||||||
|
bpnn_free(net);
|
||||||
|
printf("\nFinish the training for one iteration\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
int setup(int argc, char **argv)
|
||||||
|
{
|
||||||
|
layer_size = -1;
|
||||||
|
|
||||||
|
int cur_arg;
|
||||||
|
for (cur_arg = 1; cur_arg<argc; cur_arg++) {
|
||||||
|
if (strcmp(argv[cur_arg], "-h") == 0) {
|
||||||
|
fprintf(stderr, "usage: backprop <-n num of input elements> [-p platform_id] [-d device_id] [-t device_type]\n");
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
else if (strcmp(argv[cur_arg], "-n") == 0) {
|
||||||
|
if (argc >= cur_arg + 1) {
|
||||||
|
layer_size = atoi(argv[cur_arg+1]);
|
||||||
|
cur_arg++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (strcmp(argv[cur_arg], "-p") == 0) {
|
||||||
|
if (argc >= cur_arg + 1) {
|
||||||
|
platform_id_inuse = atoi(argv[cur_arg+1]);
|
||||||
|
cur_arg++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if (strcmp(argv[cur_arg], "-d") == 0) {
|
||||||
|
if (argc >= cur_arg + 1) {
|
||||||
|
device_id_inuse = atoi(argv[cur_arg+1]);
|
||||||
|
cur_arg++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (layer_size % 16 != 0){
|
||||||
|
fprintf(stderr, "The number of input points must be divided by 16\n");
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
int seed = 7;
|
||||||
|
bpnn_initialize(seed);
|
||||||
|
backprop_face();
|
||||||
|
|
||||||
|
exit(0);
|
||||||
|
}
|
24
tests/opencl/backprop/imagenet.cpp
Normal file
24
tests/opencl/backprop/imagenet.cpp
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "backprop.h"
|
||||||
|
|
||||||
|
extern int layer_size;
|
||||||
|
|
||||||
|
void load(BPNN *net)
|
||||||
|
//BPNN *net;
|
||||||
|
{
|
||||||
|
float *units;
|
||||||
|
int nr, nc, imgsize, i, j, k;
|
||||||
|
|
||||||
|
nr = layer_size;
|
||||||
|
|
||||||
|
imgsize = nr * nc;
|
||||||
|
units = net->input_units;
|
||||||
|
|
||||||
|
k = 1;
|
||||||
|
for (i = 0; i < nr; i++) {
|
||||||
|
units[k] = (float) rand()/RAND_MAX ;
|
||||||
|
k++;
|
||||||
|
}
|
||||||
|
}
|
90
tests/opencl/backprop/kernel.cl
Normal file
90
tests/opencl/backprop/kernel.cl
Normal file
|
@ -0,0 +1,90 @@
|
||||||
|
#define THREADS 256
|
||||||
|
#define WIDTH 16
|
||||||
|
#define HEIGHT 16
|
||||||
|
#define ETA 0.3f
|
||||||
|
#define MOMENTUM 0.3f
|
||||||
|
|
||||||
|
#ifndef _BACKPROP_CUDA_KERNEL_H_
|
||||||
|
#define _BACKPROP_CUDA_KERNEL_H_
|
||||||
|
#define WM(i, j) weight_matrix[(j) + (i) * WIDTH]
|
||||||
|
|
||||||
|
__kernel void
|
||||||
|
bpnn_layerforward_ocl(__global float *input_cuda,
|
||||||
|
__global float *output_hidden_cuda,
|
||||||
|
__global float *input_hidden_cuda,
|
||||||
|
__global float *hidden_partial_sum,
|
||||||
|
__local float *input_node,
|
||||||
|
__local float *weight_matrix,
|
||||||
|
int in,
|
||||||
|
int hid)
|
||||||
|
{
|
||||||
|
|
||||||
|
int by = get_group_id(1);
|
||||||
|
int tx = get_local_id(0);
|
||||||
|
int ty = get_local_id(1);
|
||||||
|
|
||||||
|
int index = ( hid + 1 ) * HEIGHT * by + ( hid + 1 ) * ty + tx + 1 + ( hid + 1 ) ;
|
||||||
|
|
||||||
|
int index_in = HEIGHT * by + ty + 1;
|
||||||
|
|
||||||
|
if ( tx == 0 )
|
||||||
|
input_node[ty] = input_cuda[index_in] ;
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
weight_matrix[ty * WIDTH + tx] = input_hidden_cuda[index];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
weight_matrix[ty * WIDTH + tx]= weight_matrix[ty * WIDTH + tx] * input_node[ty];
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
for ( int i = 1 ; i <= HEIGHT ; i=i*2){
|
||||||
|
//for ( int i = 1 ; i <= 4 ; i++){
|
||||||
|
int power_two = i;
|
||||||
|
//int power_two = 2 << (i - 1);
|
||||||
|
|
||||||
|
if( ty % power_two == 0 )
|
||||||
|
weight_matrix[ty * WIDTH + tx]= weight_matrix[ty * WIDTH + tx] + weight_matrix[(ty + power_two/2)* WIDTH + tx];
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
input_hidden_cuda[index] = weight_matrix[ty * WIDTH + tx];
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if ( tx == 0 ) {
|
||||||
|
hidden_partial_sum[by * hid + ty] = weight_matrix[tx* WIDTH + ty];
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
__kernel void bpnn_adjust_weights_ocl( __global float * delta,
|
||||||
|
int hid,
|
||||||
|
__global float * ly,
|
||||||
|
int in,
|
||||||
|
__global float * w,
|
||||||
|
__global float * oldw)
|
||||||
|
{
|
||||||
|
|
||||||
|
int by = get_group_id(1);
|
||||||
|
int tx = get_local_id(0);
|
||||||
|
int ty = get_local_id(1);
|
||||||
|
|
||||||
|
int index = ( hid + 1 ) * HEIGHT * by + ( hid + 1 ) * ty + tx + 1 + ( hid + 1 ) ;
|
||||||
|
int index_y = HEIGHT * by + ty + 1;
|
||||||
|
int index_x = tx + 1;
|
||||||
|
|
||||||
|
w[index] += ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
|
||||||
|
oldw[index] = ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
|
||||||
|
|
||||||
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
|
if (ty == 0 && by ==0){
|
||||||
|
w[index_x] += ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
|
||||||
|
oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif
|
40
tests/opencl/backprop/timing.cpp
Normal file
40
tests/opencl/backprop/timing.cpp
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "timing.h"
|
||||||
|
|
||||||
|
void time_measure_start(struct timeval *tv)
|
||||||
|
{
|
||||||
|
gettimeofday(tv, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void time_measure_end(struct timeval *tv)
|
||||||
|
{
|
||||||
|
struct timeval tv_now, tv_diff;
|
||||||
|
double d;
|
||||||
|
|
||||||
|
gettimeofday(&tv_now, NULL);
|
||||||
|
tvsub(&tv_now, tv, &tv_diff);
|
||||||
|
|
||||||
|
d = (double) tv_diff.tv_sec * 1000.0 + (double) tv_diff.tv_usec / 1000.0;
|
||||||
|
printf("Time (Memory Copy and Launch) = %f (ms)\n", d);
|
||||||
|
}
|
||||||
|
|
||||||
|
float probe_event_time(cl_event event, cl_command_queue command_queue) {
|
||||||
|
cl_int error=0;
|
||||||
|
cl_ulong eventStart,eventEnd;
|
||||||
|
clFinish(command_queue);
|
||||||
|
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
|
||||||
|
sizeof(cl_ulong),&eventStart,NULL);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
printf("ERROR (%d) in event start profiling.\n", error);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
|
||||||
|
sizeof(cl_ulong),&eventEnd,NULL);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
printf("ERROR (%d) in event end profiling.\n", error);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (float)((eventEnd-eventStart)/1000000.0);
|
||||||
|
}
|
25
tests/opencl/backprop/timing.h
Normal file
25
tests/opencl/backprop/timing.h
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
#ifndef __TIMING_H__
|
||||||
|
#define __TIMING_H__
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
void time_measure_start(struct timeval *tv);
|
||||||
|
void time_measure_end(struct timeval *tv);
|
||||||
|
|
||||||
|
/* tvsub: ret = x - y. */
|
||||||
|
static inline void tvsub(struct timeval *x,
|
||||||
|
struct timeval *y,
|
||||||
|
struct timeval *ret)
|
||||||
|
{
|
||||||
|
ret->tv_sec = x->tv_sec - y->tv_sec;
|
||||||
|
ret->tv_usec = x->tv_usec - y->tv_usec;
|
||||||
|
if (ret->tv_usec < 0) {
|
||||||
|
ret->tv_sec--;
|
||||||
|
ret->tv_usec += 1000000;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
float probe_event_time(cl_event, cl_command_queue);
|
||||||
|
|
||||||
|
#endif
|
931
tests/opencl/bfs2/CLHelper.h
Normal file
931
tests/opencl/bfs2/CLHelper.h
Normal file
|
@ -0,0 +1,931 @@
|
||||||
|
//------------------------------------------
|
||||||
|
//--cambine:helper function for OpenCL
|
||||||
|
//--programmer: Jianbin Fang
|
||||||
|
//--date: 27/12/2010
|
||||||
|
//------------------------------------------
|
||||||
|
#ifndef _CL_HELPER_
|
||||||
|
#define _CL_HELPER_
|
||||||
|
|
||||||
|
#include <CL/cl.h>
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
#include "timing.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
using std::string;
|
||||||
|
using std::ifstream;
|
||||||
|
using std::cerr;
|
||||||
|
using std::endl;
|
||||||
|
//using std::cout;
|
||||||
|
//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
|
||||||
|
#define WORK_DIM 2 //work-items dimensions
|
||||||
|
|
||||||
|
extern float init_time, mem_alloc_time, h2d_time, kernel_time,
|
||||||
|
d2h_time, close_time, total_time;
|
||||||
|
|
||||||
|
struct oclHandleStruct {
|
||||||
|
cl_context context;
|
||||||
|
cl_device_id *devices;
|
||||||
|
cl_command_queue queue;
|
||||||
|
cl_program program;
|
||||||
|
cl_int cl_status;
|
||||||
|
std::string error_str;
|
||||||
|
std::vector<cl_kernel> kernel;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct oclHandleStruct oclHandles;
|
||||||
|
|
||||||
|
struct _clDeviceProp{
|
||||||
|
char device_name[100];
|
||||||
|
};
|
||||||
|
|
||||||
|
char kernel_file[100] = "Kernels.cl";
|
||||||
|
int total_kernels = 2;
|
||||||
|
string kernel_names[2] = {"BFS_1", "BFS_2"};
|
||||||
|
int work_group_size = 512;
|
||||||
|
int platform_id_inuse = 0; // platform id in use (default: 0)
|
||||||
|
int device_id_inuse = 0; //device id in use (default : 0)
|
||||||
|
cl_device_type device_type = CL_DEVICE_TYPE_GPU;
|
||||||
|
|
||||||
|
void _clGetDeviceProperties(int idx, _clDeviceProp *prop) throw(string){
|
||||||
|
|
||||||
|
oclHandles.cl_status= clGetDeviceInfo(oclHandles.devices[idx], CL_DEVICE_NAME, 100, prop->device_name, NULL);
|
||||||
|
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS){
|
||||||
|
oclHandles.error_str = "exception in _clGetDeviceProperties-> ";
|
||||||
|
switch(oclHandles.cl_status){
|
||||||
|
case CL_INVALID_DEVICE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_DEVICE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_VALUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_VALUE";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "unknown reasons";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||||
|
if (nullptr == filename || nullptr == data || 0 == size)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
FILE* fp = fopen(filename, "r");
|
||||||
|
if (NULL == fp) {
|
||||||
|
fprintf(stderr, "Failed to load kernel.");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
fseek(fp , 0 , SEEK_END);
|
||||||
|
long fsize = ftell(fp);
|
||||||
|
rewind(fp);
|
||||||
|
|
||||||
|
*data = (uint8_t*)malloc(fsize);
|
||||||
|
*size = fread(*data, 1, fsize, fp);
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Converts the contents of a file into a string
|
||||||
|
*/
|
||||||
|
string FileToString(const string fileName)
|
||||||
|
{
|
||||||
|
ifstream f(fileName.c_str(), ifstream::in | ifstream::binary);
|
||||||
|
|
||||||
|
try {
|
||||||
|
size_t size;
|
||||||
|
char* str;
|
||||||
|
string s;
|
||||||
|
|
||||||
|
if(f.is_open()) {
|
||||||
|
size_t fileSize;
|
||||||
|
f.seekg(0, ifstream::end);
|
||||||
|
size = fileSize = f.tellg();
|
||||||
|
f.seekg(0, ifstream::beg);
|
||||||
|
|
||||||
|
str = new char[size+1];
|
||||||
|
if (!str) throw(string("Could not allocate memory"));
|
||||||
|
|
||||||
|
f.read(str, fileSize);
|
||||||
|
f.close();
|
||||||
|
str[size] = '\0';
|
||||||
|
|
||||||
|
s = str;
|
||||||
|
delete [] str;
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
} catch(std::string msg) {
|
||||||
|
printf("Exception caught in FileToString(): %s\n");
|
||||||
|
if(f.is_open())
|
||||||
|
f.close();
|
||||||
|
} catch(...) {
|
||||||
|
printf("Exception caught in FileToString()\n");
|
||||||
|
if(f.is_open())
|
||||||
|
f.close();
|
||||||
|
}
|
||||||
|
string errorMsg = "FileToString()::Error: Unable to open file "
|
||||||
|
+ fileName;
|
||||||
|
throw(errorMsg);
|
||||||
|
}
|
||||||
|
//---------------------------------------
|
||||||
|
//Read command line parameters
|
||||||
|
//
|
||||||
|
void _clCmdParams(int argc, char* argv[])
|
||||||
|
{
|
||||||
|
for (int i =0; i < argc; ++i) {
|
||||||
|
switch (argv[i][1]) {
|
||||||
|
case 'g': //--g stands for size of work group
|
||||||
|
if (++i < argc) {
|
||||||
|
sscanf(argv[i], "%u", &work_group_size);
|
||||||
|
} else {
|
||||||
|
printf("Could not read argument after option %d\n", argv[i-1]);
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'd': //--d stands for device id used in computaion
|
||||||
|
if (++i < argc) {
|
||||||
|
sscanf(argv[i], "%u", &device_id_inuse);
|
||||||
|
} else {
|
||||||
|
printf("Could not read argument after option %d\n", argv[i-1]);
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'p': // --p stands for platform id used in computation
|
||||||
|
if (++i < argc) {
|
||||||
|
sscanf(argv[i], "%u", &platform_id_inuse);
|
||||||
|
} else {
|
||||||
|
printf("Could not read argument after option %d\n", argv[i-1]);
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
/*
|
||||||
|
case 't': // --t stands for device type, 0:GPU, 1:CPU
|
||||||
|
if (++i < argc) {
|
||||||
|
sscanf(argv[i], "%u", &device_type);
|
||||||
|
device_type = (device_type == 0) ? CL_DEVICE_TYPE_GPU
|
||||||
|
: CL_DEVICE_TYPE_CPU;
|
||||||
|
} else {
|
||||||
|
std::cerr << "Could not read argument after option " << argv[i-1] << std::endl;
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
*/
|
||||||
|
default:
|
||||||
|
;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------------
|
||||||
|
//Initlize CL objects
|
||||||
|
//--description: there are 5 steps to initialize all the OpenCL objects needed
|
||||||
|
//--revised on 04/01/2011: get the number of devices and
|
||||||
|
// devices have no relationship with context
|
||||||
|
void _clInit()
|
||||||
|
{
|
||||||
|
cl_int resultCL;
|
||||||
|
|
||||||
|
oclHandles.context = NULL;
|
||||||
|
oclHandles.devices = NULL;
|
||||||
|
oclHandles.queue = NULL;
|
||||||
|
oclHandles.program = NULL;
|
||||||
|
|
||||||
|
cl_uint deviceListSize;
|
||||||
|
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-1: find the available platforms and select one
|
||||||
|
|
||||||
|
cl_uint numPlatforms;
|
||||||
|
cl_platform_id targetPlatform = NULL;
|
||||||
|
|
||||||
|
resultCL = clGetPlatformIDs(0, NULL, &numPlatforms);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
throw (string("InitCL()::Error: Getting number of platforms (clGetPlatformIDs)"));
|
||||||
|
printf("number of platforms:%d\n",numPlatforms); //by cambine
|
||||||
|
|
||||||
|
if (!(numPlatforms > 0))
|
||||||
|
throw (string("InitCL()::Error: No platforms found (clGetPlatformIDs)"));
|
||||||
|
|
||||||
|
cl_platform_id* allPlatforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id));
|
||||||
|
|
||||||
|
resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
throw (string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
|
||||||
|
|
||||||
|
for (int i = 0; i < numPlatforms; i++) {
|
||||||
|
char pbuff[128];
|
||||||
|
resultCL = clGetPlatformInfo( allPlatforms[i],
|
||||||
|
CL_PLATFORM_VENDOR,
|
||||||
|
sizeof(pbuff),
|
||||||
|
pbuff,
|
||||||
|
NULL);
|
||||||
|
if (resultCL != CL_SUCCESS)
|
||||||
|
throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
|
||||||
|
|
||||||
|
printf("vendor is %s\n",pbuff);
|
||||||
|
|
||||||
|
}
|
||||||
|
/* Select the target platform. Default: first platform */
|
||||||
|
targetPlatform = allPlatforms[platform_id_inuse];
|
||||||
|
free(allPlatforms);
|
||||||
|
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-3: detect OpenCL devices
|
||||||
|
/* First, get the size of device list */
|
||||||
|
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_ALL, 0, NULL, &deviceListSize);
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS) {
|
||||||
|
throw(string("exception in _clInit -> clGetDeviceIDs"));
|
||||||
|
}
|
||||||
|
if (deviceListSize == 0)
|
||||||
|
throw(string("InitCL()::Error: No devices found."));
|
||||||
|
printf("device number: %d\n");
|
||||||
|
|
||||||
|
/* Now, allocate the device list */
|
||||||
|
oclHandles.devices = (cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
|
||||||
|
|
||||||
|
if (oclHandles.devices == 0)
|
||||||
|
throw(string("InitCL()::Error: Could not allocate memory."));
|
||||||
|
|
||||||
|
/* Next, get the device list data */
|
||||||
|
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_ALL, deviceListSize,
|
||||||
|
oclHandles.devices, NULL);
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS) {
|
||||||
|
throw(string("exception in _clInit -> clGetDeviceIDs-2"));
|
||||||
|
}
|
||||||
|
|
||||||
|
_clDeviceProp prop;
|
||||||
|
_clGetDeviceProperties(device_id_inuse, &prop);
|
||||||
|
printf("--cambine: device name=%s\n", prop.device_name);
|
||||||
|
|
||||||
|
/* Then, get device type */
|
||||||
|
/*
|
||||||
|
oclHandles.cl_status = clGetDeviceInfo(oclHandles.devices[device_id_inuse],
|
||||||
|
CL_DEVICE_TYPE, sizeof(cl_device_type), (void *)&device_type,
|
||||||
|
NULL);
|
||||||
|
if (oclHandles.cl_status != CL_SUCCESS) {
|
||||||
|
throw(string("error in Getting Device Info"));
|
||||||
|
}
|
||||||
|
if (device_type == CL_DEVICE_TYPE_GPU)
|
||||||
|
printf("Creating GPU Context\n");
|
||||||
|
else if (device_type == CL_DEVICE_TYPE_CPU)
|
||||||
|
printf("Creating CPU Context\n");
|
||||||
|
else
|
||||||
|
throw(string("unsupported device type"));
|
||||||
|
*/
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-2: create an OpenCL context
|
||||||
|
cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)targetPlatform, 0 };
|
||||||
|
/*
|
||||||
|
oclHandles.context = clCreateContextFromType(cprops,
|
||||||
|
device_type,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
&resultCL);
|
||||||
|
*/
|
||||||
|
oclHandles.context = clCreateContext(0,
|
||||||
|
deviceListSize,
|
||||||
|
oclHandles.devices,
|
||||||
|
NULL,
|
||||||
|
NULL,
|
||||||
|
&resultCL);
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
|
||||||
|
throw (string("InitCL()::Error: Creating Context (clCreateContextFromType)"));
|
||||||
|
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-4: Create an OpenCL command queue
|
||||||
|
#ifdef TIMING
|
||||||
|
oclHandles.queue = clCreateCommandQueue(oclHandles.context,
|
||||||
|
oclHandles.devices[device_id_inuse], CL_QUEUE_PROFILING_ENABLE,
|
||||||
|
&resultCL);
|
||||||
|
#else
|
||||||
|
oclHandles.queue = clCreateCommandQueue(oclHandles.context,
|
||||||
|
oclHandles.devices[device_id_inuse], 0, &resultCL);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
|
||||||
|
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
|
||||||
|
|
||||||
|
//-----------------------------------------------
|
||||||
|
//--cambine-5: Load CL file, build CL program object, create CL kernel object
|
||||||
|
/*
|
||||||
|
std::string source_str = FileToString(kernel_file);
|
||||||
|
const char * source = source_str.c_str();
|
||||||
|
size_t sourceSize[] = { source_str.length() };
|
||||||
|
|
||||||
|
oclHandles.program = clCreateProgramWithSource(oclHandles.context,
|
||||||
|
1,
|
||||||
|
&source,
|
||||||
|
sourceSize,
|
||||||
|
&resultCL);
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
|
||||||
|
throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)"));
|
||||||
|
*/
|
||||||
|
uint8_t *kernel_bin = NULL;
|
||||||
|
size_t kernel_size;
|
||||||
|
cl_int binary_status = 0;
|
||||||
|
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||||
|
std::abort();
|
||||||
|
|
||||||
|
oclHandles.program = clCreateProgramWithBinary(
|
||||||
|
oclHandles.context, 1, &oclHandles.devices[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
|
||||||
|
free(kernel_bin);
|
||||||
|
|
||||||
|
//insert debug information
|
||||||
|
//std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
|
||||||
|
//options += " -cl-nv-opt-level=3";
|
||||||
|
resultCL = clBuildProgram(oclHandles.program, deviceListSize, oclHandles.devices, NULL, NULL,NULL);
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) {
|
||||||
|
printf("InitCL()::Error: In clBuildProgram\n");
|
||||||
|
|
||||||
|
size_t length;
|
||||||
|
resultCL = clGetProgramBuildInfo(oclHandles.program,
|
||||||
|
oclHandles.devices[device_id_inuse],
|
||||||
|
CL_PROGRAM_BUILD_LOG,
|
||||||
|
0,
|
||||||
|
NULL,
|
||||||
|
&length);
|
||||||
|
if(resultCL != CL_SUCCESS)
|
||||||
|
throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
|
||||||
|
|
||||||
|
char* buffer = (char*)malloc(length);
|
||||||
|
resultCL = clGetProgramBuildInfo(oclHandles.program,
|
||||||
|
oclHandles.devices[device_id_inuse],
|
||||||
|
CL_PROGRAM_BUILD_LOG,
|
||||||
|
length,
|
||||||
|
buffer,
|
||||||
|
NULL);
|
||||||
|
if(resultCL != CL_SUCCESS)
|
||||||
|
throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
|
||||||
|
|
||||||
|
printf("%s\n", buffer);
|
||||||
|
free(buffer);
|
||||||
|
|
||||||
|
throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
|
||||||
|
}
|
||||||
|
|
||||||
|
//get program information in intermediate representation
|
||||||
|
#ifdef PTX_MSG
|
||||||
|
size_t binary_sizes[deviceListSize];
|
||||||
|
char * binaries[deviceListSize];
|
||||||
|
//figure out number of devices and the sizes of the binary for each device.
|
||||||
|
oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*deviceListSize, &binary_sizes, NULL );
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS) {
|
||||||
|
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("--cambine: %d", binary_sizes);
|
||||||
|
//copy over all of the generated binaries.
|
||||||
|
for(int i=0; i<deviceListSize; i++)
|
||||||
|
binaries[i] = (char *)malloc( sizeof(char)*(binary_sizes[i]+1));
|
||||||
|
oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES, sizeof(char *)*deviceListSize, binaries, NULL );
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS) {
|
||||||
|
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
|
||||||
|
}
|
||||||
|
for(int i=0; i<deviceListSize; i++)
|
||||||
|
binaries[i][binary_sizes[i]] = '\0';
|
||||||
|
printf("--cambine:writing ptd information...\n");
|
||||||
|
FILE * ptx_file = fopen("cl.ptx","w");
|
||||||
|
if(ptx_file==NULL) {
|
||||||
|
throw(string("exceptions in allocate ptx file."));
|
||||||
|
}
|
||||||
|
fprintf(ptx_file,"%s",binaries[DEVICE_ID_INUSE]);
|
||||||
|
fclose(ptx_file);
|
||||||
|
printf("--cambine:writing ptd information done.\n");
|
||||||
|
for(int i=0; i<deviceListSize; i++)
|
||||||
|
free(binaries[i]);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (int nKernel = 0; nKernel < total_kernels; nKernel++) {
|
||||||
|
/* get a kernel object handle for a kernel with the given name */
|
||||||
|
cl_kernel kernel = clCreateKernel(oclHandles.program,
|
||||||
|
(kernel_names[nKernel]).c_str(),
|
||||||
|
&resultCL);
|
||||||
|
|
||||||
|
if ((resultCL != CL_SUCCESS) || (kernel == NULL)) {
|
||||||
|
string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" + kernel_names[nKernel] + "\"";
|
||||||
|
throw(errorMsg);
|
||||||
|
}
|
||||||
|
|
||||||
|
oclHandles.kernel.push_back(kernel);
|
||||||
|
}
|
||||||
|
//get resource alocation information
|
||||||
|
#ifdef RES_MSG
|
||||||
|
char * build_log;
|
||||||
|
size_t ret_val_size;
|
||||||
|
oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSE], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS) {
|
||||||
|
throw(string("exceptions in _InitCL -> getting resource information"));
|
||||||
|
}
|
||||||
|
|
||||||
|
build_log = (char *)malloc(ret_val_size+1);
|
||||||
|
oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSE], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS) {
|
||||||
|
throw(string("exceptions in _InitCL -> getting resources allocation information-2"));
|
||||||
|
}
|
||||||
|
build_log[ret_val_size] = '\0';
|
||||||
|
printf("--cambine: %s\n", build_log);
|
||||||
|
free(build_log);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
//---------------------------------------
|
||||||
|
//release CL objects
|
||||||
|
void _clRelease()
|
||||||
|
{
|
||||||
|
char errorFlag = false;
|
||||||
|
|
||||||
|
for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++) {
|
||||||
|
if (oclHandles.kernel[nKernel] != NULL) {
|
||||||
|
cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
|
||||||
|
if (resultCL != CL_SUCCESS) {
|
||||||
|
printf("ReleaseCL()::Error: In clReleaseKernel\n");
|
||||||
|
errorFlag = true;
|
||||||
|
}
|
||||||
|
oclHandles.kernel[nKernel] = NULL;
|
||||||
|
}
|
||||||
|
oclHandles.kernel.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (oclHandles.program != NULL) {
|
||||||
|
cl_int resultCL = clReleaseProgram(oclHandles.program);
|
||||||
|
if (resultCL != CL_SUCCESS) {
|
||||||
|
printf("ReleaseCL()::Error: In clReleaseProgram\n");
|
||||||
|
errorFlag = true;
|
||||||
|
}
|
||||||
|
oclHandles.program = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (oclHandles.queue != NULL) {
|
||||||
|
cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
|
||||||
|
if (resultCL != CL_SUCCESS) {
|
||||||
|
printf("ReleaseCL()::Error: In clReleaseCommandQueue\n");
|
||||||
|
errorFlag = true;
|
||||||
|
}
|
||||||
|
oclHandles.queue = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
free(oclHandles.devices);
|
||||||
|
|
||||||
|
if (oclHandles.context != NULL) {
|
||||||
|
cl_int resultCL = clReleaseContext(oclHandles.context);
|
||||||
|
if (resultCL != CL_SUCCESS) {
|
||||||
|
printf("ReleaseCL()::Error: In clReleaseContext\n");
|
||||||
|
errorFlag = true;
|
||||||
|
}
|
||||||
|
oclHandles.context = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (errorFlag) throw(string("ReleaseCL()::Error encountered."));
|
||||||
|
}
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//--cambine:create buffer and then copy data from host to device
|
||||||
|
cl_mem _clCreateAndCpyMem(int size, void * h_mem_source) throw(string)
|
||||||
|
{
|
||||||
|
cl_mem d_mem;
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, \
|
||||||
|
size, h_mem_source, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem()"));
|
||||||
|
#endif
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
//-------------------------------------------------------
|
||||||
|
//--cambine: create read only buffer for devices
|
||||||
|
//--date: 17/01/2011
|
||||||
|
cl_mem _clMallocRW(int size, void * h_mem_ptr) throw(string)
|
||||||
|
{
|
||||||
|
cl_mem d_mem;
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_WRITE, size, NULL, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clMallocRW"));
|
||||||
|
#endif
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
//-------------------------------------------------------
|
||||||
|
//--cambine: create read and write buffer for devices
|
||||||
|
//--date: 17/01/2011
|
||||||
|
cl_mem _clMalloc(int size, void * h_mem_ptr) throw(string)
|
||||||
|
{
|
||||||
|
cl_mem d_mem;
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, NULL, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clMalloc"));
|
||||||
|
#endif
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
//-------------------------------------------------------
|
||||||
|
//--cambine: transfer data from host to device
|
||||||
|
//--date: 17/01/2011
|
||||||
|
void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string)
|
||||||
|
{
|
||||||
|
cl_event event;
|
||||||
|
oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem,
|
||||||
|
CL_TRUE, 0, size, h_mem_ptr, 0, NULL, &event);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clMemcpyH2D"));
|
||||||
|
#endif
|
||||||
|
#ifdef TIMING
|
||||||
|
h2d_time += probe_event_time(event, oclHandles.queue);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//--cambine:create buffer and then copy data from host to device with pinned
|
||||||
|
// memory
|
||||||
|
cl_mem _clCreateAndCpyPinnedMem(int size, float* h_mem_source) throw(string)
|
||||||
|
{
|
||||||
|
cl_mem d_mem, d_mem_pinned;
|
||||||
|
float * h_mem_pinned = NULL;
|
||||||
|
d_mem_pinned = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR, \
|
||||||
|
size, NULL, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
|
||||||
|
#endif
|
||||||
|
//------------
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, \
|
||||||
|
size, NULL, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
|
||||||
|
#endif
|
||||||
|
//----------
|
||||||
|
h_mem_pinned = (cl_float *)clEnqueueMapBuffer(oclHandles.queue, d_mem_pinned, CL_TRUE, \
|
||||||
|
CL_MAP_WRITE, 0, size, 0, NULL, \
|
||||||
|
NULL, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
|
||||||
|
#endif
|
||||||
|
int element_number = size/sizeof(float);
|
||||||
|
#pragma omp parallel for
|
||||||
|
for(int i=0; i<element_number; i++) {
|
||||||
|
h_mem_pinned[i] = h_mem_source[i];
|
||||||
|
}
|
||||||
|
//----------
|
||||||
|
oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem, \
|
||||||
|
CL_TRUE, 0, size, h_mem_pinned, \
|
||||||
|
0, NULL, NULL);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//--cambine:create write only buffer on device
|
||||||
|
cl_mem _clMallocWO(int size) throw(string)
|
||||||
|
{
|
||||||
|
cl_mem d_mem;
|
||||||
|
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0, &oclHandles.cl_status);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clCreateMem()"));
|
||||||
|
#endif
|
||||||
|
return d_mem;
|
||||||
|
}
|
||||||
|
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//transfer data from device to host
|
||||||
|
void _clMemcpyD2H(cl_mem d_mem, int size, void * h_mem) throw(string)
|
||||||
|
{
|
||||||
|
cl_event event;
|
||||||
|
oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem, 0,0, &event);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
|
||||||
|
switch(oclHandles.cl_status) {
|
||||||
|
case CL_INVALID_COMMAND_QUEUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_CONTEXT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_MEM_OBJECT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_VALUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_VALUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_EVENT_WAIT_LIST:
|
||||||
|
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||||
|
break;
|
||||||
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||||
|
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unknown reason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
#ifdef TIMING
|
||||||
|
d2h_time += probe_event_time(event, oclHandles.queue);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//set kernel arguments
|
||||||
|
void _clSetArgs(int kernel_id, int arg_idx, void * d_mem, int size = 0) throw(string)
|
||||||
|
{
|
||||||
|
if(!size) {
|
||||||
|
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, sizeof(d_mem), &d_mem);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
|
||||||
|
switch(oclHandles.cl_status) {
|
||||||
|
case CL_INVALID_KERNEL:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_INDEX:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_VALUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_MEM_OBJECT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_SAMPLER:
|
||||||
|
oclHandles.error_str += "CL_INVALID_SAMPLER";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unknown reason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
|
||||||
|
switch(oclHandles.cl_status) {
|
||||||
|
case CL_INVALID_KERNEL:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_INDEX:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_VALUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_MEM_OBJECT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_SAMPLER:
|
||||||
|
oclHandles.error_str += "CL_INVALID_SAMPLER";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_ARG_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unknown reason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
void _clFinish() throw(string)
|
||||||
|
{
|
||||||
|
oclHandles.cl_status = clFinish(oclHandles.queue);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clFinish";
|
||||||
|
switch(oclHandles.cl_status) {
|
||||||
|
case CL_INVALID_COMMAND_QUEUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unknown reasons";
|
||||||
|
break;
|
||||||
|
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status!=CL_SUCCESS) {
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//--cambine:enqueue kernel
|
||||||
|
void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string)
|
||||||
|
{
|
||||||
|
cl_uint work_dim = WORK_DIM;
|
||||||
|
cl_event e[1];
|
||||||
|
if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size
|
||||||
|
work_items = work_items + (work_group_size-(work_items%work_group_size));
|
||||||
|
size_t local_work_size[] = {work_group_size, 1};
|
||||||
|
size_t global_work_size[] = {work_items, 1};
|
||||||
|
oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
|
||||||
|
global_work_size, local_work_size, 0, 0, &(e[0]) );
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
|
||||||
|
switch(oclHandles.cl_status) {
|
||||||
|
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_COMMAND_QUEUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_KERNEL:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_CONTEXT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_KERNEL_ARGS:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_DIMENSION:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_GLOBAL_WORK_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_GROUP_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_ITEM_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_GLOBAL_OFFSET:
|
||||||
|
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||||
|
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_EVENT_WAIT_LIST:
|
||||||
|
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unkown reseason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
kernel_time += probe_event_time(e[0], oclHandles.queue);
|
||||||
|
#endif
|
||||||
|
//_clFinish();
|
||||||
|
// oclHandles.cl_status = clWaitForEvents(1, &e[0]);
|
||||||
|
// #ifdef ERRMSG
|
||||||
|
// if (oclHandles.cl_status!= CL_SUCCESS)
|
||||||
|
// throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
|
||||||
|
// #endif
|
||||||
|
}
|
||||||
|
void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, int group_y) throw(string)
|
||||||
|
{
|
||||||
|
cl_uint work_dim = WORK_DIM;
|
||||||
|
size_t local_work_size[] = {group_x, group_y};
|
||||||
|
size_t global_work_size[] = {range_x, range_y};
|
||||||
|
cl_event e[1];
|
||||||
|
/*if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size
|
||||||
|
work_items = work_items + (work_group_size-(work_items%work_group_size));*/
|
||||||
|
oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
|
||||||
|
global_work_size, local_work_size, 0, 0, &(e[0]) );
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
|
||||||
|
switch(oclHandles.cl_status) {
|
||||||
|
case CL_INVALID_PROGRAM_EXECUTABLE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_COMMAND_QUEUE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_KERNEL:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_CONTEXT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_CONTEXT";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_KERNEL_ARGS:
|
||||||
|
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_DIMENSION:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_GLOBAL_WORK_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_GROUP_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_WORK_ITEM_SIZE:
|
||||||
|
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_GLOBAL_OFFSET:
|
||||||
|
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
|
||||||
|
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
|
||||||
|
break;
|
||||||
|
case CL_INVALID_EVENT_WAIT_LIST:
|
||||||
|
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unkown reseason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if(oclHandles.cl_status != CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
kernel_time += probe_event_time(e[0], oclHandles.queue);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//_clFinish();
|
||||||
|
/*oclHandles.cl_status = clWaitForEvents(1, &e[0]);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
if (oclHandles.cl_status!= CL_SUCCESS)
|
||||||
|
throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
|
||||||
|
#endif*/
|
||||||
|
}
|
||||||
|
|
||||||
|
//--------------------------------------------------------
|
||||||
|
//release OpenCL objects
|
||||||
|
void _clFree(cl_mem ob) throw(string)
|
||||||
|
{
|
||||||
|
if(ob!=NULL)
|
||||||
|
oclHandles.cl_status = clReleaseMemObject(ob);
|
||||||
|
#ifdef ERRMSG
|
||||||
|
oclHandles.error_str = "excpetion in _clFree() ->";
|
||||||
|
switch(oclHandles.cl_status) {
|
||||||
|
case CL_INVALID_MEM_OBJECT:
|
||||||
|
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_RESOURCES:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
|
||||||
|
break;
|
||||||
|
case CL_OUT_OF_HOST_MEMORY:
|
||||||
|
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
oclHandles.error_str += "Unkown reseason";
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (oclHandles.cl_status!= CL_SUCCESS)
|
||||||
|
throw(oclHandles.error_str);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
#endif //_CL_HELPER_
|
7
tests/opencl/bfs2/Makefile
Normal file
7
tests/opencl/bfs2/Makefile
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
PROJECT = bfs
|
||||||
|
|
||||||
|
SRCS = bfs.cpp timer.cc
|
||||||
|
|
||||||
|
OPTS ?= graph4096.txt 0 0 0
|
||||||
|
|
||||||
|
include ../common.mk
|
BIN
tests/opencl/bfs2/bfs
Executable file
BIN
tests/opencl/bfs2/bfs
Executable file
Binary file not shown.
351
tests/opencl/bfs2/bfs.cpp
Normal file
351
tests/opencl/bfs2/bfs.cpp
Normal file
|
@ -0,0 +1,351 @@
|
||||||
|
//--by Jianbin Fang
|
||||||
|
|
||||||
|
#define __CL_ENABLE_EXCEPTIONS
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <iostream>
|
||||||
|
#include <string>
|
||||||
|
#include <cstring>
|
||||||
|
|
||||||
|
#ifdef PROFILING
|
||||||
|
#include "timer.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
#include "timing.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include "CLHelper.h"
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
#define MAX_THREADS_PER_BLOCK 256
|
||||||
|
|
||||||
|
//Structure to hold a node information
|
||||||
|
struct Node {
|
||||||
|
int starting;
|
||||||
|
int no_of_edges;
|
||||||
|
};
|
||||||
|
|
||||||
|
//Primitives for timing
|
||||||
|
#ifdef TIMING
|
||||||
|
struct timeval tv;
|
||||||
|
struct timeval tv_total_start, tv_total_end;
|
||||||
|
struct timeval tv_h2d_start, tv_h2d_end;
|
||||||
|
struct timeval tv_d2h_start, tv_d2h_end;
|
||||||
|
struct timeval tv_kernel_start, tv_kernel_end;
|
||||||
|
struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
|
||||||
|
struct timeval tv_close_start, tv_close_end;
|
||||||
|
float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time= 0,
|
||||||
|
d2h_time = 0, close_time = 0, total_time = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//----------------------------------------------------------
|
||||||
|
//--bfs on cpu
|
||||||
|
//--programmer: jianbin
|
||||||
|
//--date: 26/01/2011
|
||||||
|
//--note: width is changed to the new_width
|
||||||
|
//----------------------------------------------------------
|
||||||
|
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
|
||||||
|
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
|
||||||
|
char *h_graph_visited, int *h_cost_ref)
|
||||||
|
{
|
||||||
|
char stop;
|
||||||
|
int k = 0;
|
||||||
|
do {
|
||||||
|
//if no thread changes this value then the loop stops
|
||||||
|
stop=false;
|
||||||
|
for(int tid = 0; tid < no_of_nodes; tid++ ) {
|
||||||
|
if (h_graph_mask[tid] == true) {
|
||||||
|
h_graph_mask[tid]=false;
|
||||||
|
for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++) {
|
||||||
|
int id = h_graph_edges[i]; //--cambine: node id is connected with node tid
|
||||||
|
if(!h_graph_visited[id]) { //--cambine: if node id has not been visited, enter the body below
|
||||||
|
h_cost_ref[id]=h_cost_ref[tid]+1;
|
||||||
|
h_updating_graph_mask[id]=true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for(int tid=0; tid< no_of_nodes ; tid++ ) {
|
||||||
|
if (h_updating_graph_mask[tid] == true) {
|
||||||
|
h_graph_mask[tid]=true;
|
||||||
|
h_graph_visited[tid]=true;
|
||||||
|
stop=true;
|
||||||
|
h_updating_graph_mask[tid]=false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
k++;
|
||||||
|
} while(stop);
|
||||||
|
}
|
||||||
|
//----------------------------------------------------------
|
||||||
|
//--breadth first search on GPUs
|
||||||
|
//----------------------------------------------------------
|
||||||
|
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
|
||||||
|
int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
|
||||||
|
char *h_graph_visited, int *h_cost)
|
||||||
|
throw(std::string)
|
||||||
|
{
|
||||||
|
|
||||||
|
//int number_elements = height*width;
|
||||||
|
char h_over;
|
||||||
|
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \
|
||||||
|
d_graph_visited, d_cost, d_over;
|
||||||
|
try {
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_total_start, NULL);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
_clInit();
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_mem_alloc_start, NULL);
|
||||||
|
tvsub(&tv_mem_alloc_start, &tv_total_start, &tv);
|
||||||
|
init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//--1 transfer data from host to device
|
||||||
|
d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
|
||||||
|
d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
|
||||||
|
d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask);
|
||||||
|
d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask);
|
||||||
|
d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited);
|
||||||
|
|
||||||
|
d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
|
||||||
|
d_over = _clMallocRW(sizeof(char), &h_over);
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_mem_alloc_end, NULL);
|
||||||
|
tvsub(&tv_mem_alloc_end, &tv_mem_alloc_start, &tv);
|
||||||
|
mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
_clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
|
||||||
|
_clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
|
||||||
|
_clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask);
|
||||||
|
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask);
|
||||||
|
_clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);
|
||||||
|
_clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);
|
||||||
|
|
||||||
|
//--2 invoke kernel
|
||||||
|
#ifdef PROFILING
|
||||||
|
timer kernel_timer;
|
||||||
|
double kernel_time = 0.0;
|
||||||
|
kernel_timer.reset();
|
||||||
|
kernel_timer.start();
|
||||||
|
#endif
|
||||||
|
do {
|
||||||
|
h_over = false;
|
||||||
|
_clMemcpyH2D(d_over, sizeof(char), &h_over);
|
||||||
|
|
||||||
|
//--kernel 0
|
||||||
|
int kernel_id = 0;
|
||||||
|
int kernel_idx = 0;
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_cost);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
|
||||||
|
|
||||||
|
//int work_items = no_of_nodes;
|
||||||
|
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
|
||||||
|
|
||||||
|
//--kernel 1
|
||||||
|
kernel_id = 1;
|
||||||
|
kernel_idx = 0;
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, d_over);
|
||||||
|
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
|
||||||
|
|
||||||
|
//work_items = no_of_nodes;
|
||||||
|
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
|
||||||
|
|
||||||
|
_clMemcpyD2H(d_over,sizeof(char), &h_over);
|
||||||
|
} while(h_over);
|
||||||
|
|
||||||
|
_clFinish();
|
||||||
|
#ifdef PROFILING
|
||||||
|
kernel_timer.stop();
|
||||||
|
kernel_time = kernel_timer.getTimeInSeconds();
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//--3 transfer data from device to host
|
||||||
|
_clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
|
||||||
|
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_close_start, NULL);
|
||||||
|
#endif
|
||||||
|
//--statistics
|
||||||
|
#ifdef PROFILING
|
||||||
|
printf("kernel time(s): %d\n", kernel_time );
|
||||||
|
#endif
|
||||||
|
//--4 release cl resources.
|
||||||
|
_clFree(d_graph_nodes);
|
||||||
|
_clFree(d_graph_edges);
|
||||||
|
_clFree(d_graph_mask);
|
||||||
|
_clFree(d_updating_graph_mask);
|
||||||
|
_clFree(d_graph_visited);
|
||||||
|
_clFree(d_cost);
|
||||||
|
_clFree(d_over);
|
||||||
|
_clRelease();
|
||||||
|
} catch(std::string msg) {
|
||||||
|
_clFree(d_graph_nodes);
|
||||||
|
_clFree(d_graph_edges);
|
||||||
|
_clFree(d_graph_mask);
|
||||||
|
_clFree(d_updating_graph_mask);
|
||||||
|
_clFree(d_graph_visited);
|
||||||
|
_clFree(d_cost);
|
||||||
|
_clFree(d_over);
|
||||||
|
_clRelease();
|
||||||
|
std::string e_str = "in run_transpose_gpu -> ";
|
||||||
|
e_str += msg;
|
||||||
|
throw(e_str);
|
||||||
|
}
|
||||||
|
#ifdef TIMING
|
||||||
|
gettimeofday(&tv_close_end, NULL);
|
||||||
|
tvsub(&tv_close_end, &tv_close_start, &tv);
|
||||||
|
close_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
tvsub(&tv_close_end, &tv_total_start, &tv);
|
||||||
|
total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
|
||||||
|
|
||||||
|
printf("Init: %f\n", init_time);
|
||||||
|
printf("MemAlloc: %f\n", mem_alloc_time);
|
||||||
|
printf("HtoD: %f\n", h2d_time);
|
||||||
|
printf("Exec: %f\n", kernel_time);
|
||||||
|
printf("DtoH: %f\n", d2h_time);
|
||||||
|
printf("Close: %f\n", close_time);
|
||||||
|
printf("Total: %f\n", total_time);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
void Usage(int argc, char**argv)
|
||||||
|
{
|
||||||
|
|
||||||
|
fprintf(stderr,"Usage: %s <input_file> [-p platform] [-d device] [-t gpu(0)/cpu(1)]\n", argv[0]);
|
||||||
|
|
||||||
|
}
|
||||||
|
//----------------------------------------------------------
|
||||||
|
//--cambine: main function
|
||||||
|
//--author: created by Jianbin Fang
|
||||||
|
//--date: 25/01/2011
|
||||||
|
//----------------------------------------------------------
|
||||||
|
int main(int argc, char * argv[])
|
||||||
|
{
|
||||||
|
int no_of_nodes;
|
||||||
|
int edge_list_size;
|
||||||
|
FILE *fp;
|
||||||
|
Node* h_graph_nodes;
|
||||||
|
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
|
||||||
|
try {
|
||||||
|
char *input_f;
|
||||||
|
if(argc < 2) {
|
||||||
|
Usage(argc, argv);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
_clCmdParams(argc, argv);
|
||||||
|
input_f = argv[1];
|
||||||
|
printf("Reading File\n");
|
||||||
|
//Read in Graph from a file
|
||||||
|
fp = fopen(input_f,"r");
|
||||||
|
if(!fp) {
|
||||||
|
printf("Error Reading graph file\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int source = 0;
|
||||||
|
|
||||||
|
fscanf(fp,"%d",&no_of_nodes);
|
||||||
|
|
||||||
|
int num_of_blocks = 1;
|
||||||
|
int num_of_threads_per_block = no_of_nodes;
|
||||||
|
|
||||||
|
//Make execution Parameters according to the number of nodes
|
||||||
|
//Distribute threads across multiple Blocks if necessary
|
||||||
|
if(no_of_nodes>MAX_THREADS_PER_BLOCK) {
|
||||||
|
num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK);
|
||||||
|
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
|
||||||
|
}
|
||||||
|
work_group_size = num_of_threads_per_block;
|
||||||
|
// allocate host memory
|
||||||
|
h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
|
||||||
|
h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
|
||||||
|
h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
|
||||||
|
h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes);
|
||||||
|
|
||||||
|
int start, edgeno;
|
||||||
|
// initalize the memory
|
||||||
|
for(int i = 0; i < no_of_nodes; i++) {
|
||||||
|
fscanf(fp,"%d %d",&start,&edgeno);
|
||||||
|
h_graph_nodes[i].starting = start;
|
||||||
|
h_graph_nodes[i].no_of_edges = edgeno;
|
||||||
|
h_graph_mask[i]=false;
|
||||||
|
h_updating_graph_mask[i]=false;
|
||||||
|
h_graph_visited[i]=false;
|
||||||
|
}
|
||||||
|
//read the source node from the file
|
||||||
|
fscanf(fp,"%d",&source);
|
||||||
|
source=0;
|
||||||
|
//set the source node as true in the mask
|
||||||
|
h_graph_mask[source]=true;
|
||||||
|
h_graph_visited[source]=true;
|
||||||
|
fscanf(fp,"%d",&edge_list_size);
|
||||||
|
int id,cost;
|
||||||
|
int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
|
||||||
|
for(int i=0; i < edge_list_size ; i++) {
|
||||||
|
fscanf(fp,"%d",&id);
|
||||||
|
fscanf(fp,"%d",&cost);
|
||||||
|
h_graph_edges[i] = id;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(fp)
|
||||||
|
fclose(fp);
|
||||||
|
// allocate mem for the result on host side
|
||||||
|
int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes);
|
||||||
|
int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes);
|
||||||
|
for(int i=0; i<no_of_nodes; i++) {
|
||||||
|
h_cost[i]=-1;
|
||||||
|
h_cost_ref[i] = -1;
|
||||||
|
}
|
||||||
|
h_cost[source]=0;
|
||||||
|
h_cost_ref[source]=0;
|
||||||
|
//---------------------------------------------------------
|
||||||
|
//--gpu entry
|
||||||
|
run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
|
||||||
|
//---------------------------------------------------------
|
||||||
|
//--cpu entry
|
||||||
|
// initalize the memory again
|
||||||
|
for(int i = 0; i < no_of_nodes; i++) {
|
||||||
|
h_graph_mask[i]=false;
|
||||||
|
h_updating_graph_mask[i]=false;
|
||||||
|
h_graph_visited[i]=false;
|
||||||
|
}
|
||||||
|
//set the source node as true in the mask
|
||||||
|
source=0;
|
||||||
|
h_graph_mask[source]=true;
|
||||||
|
h_graph_visited[source]=true;
|
||||||
|
run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref);
|
||||||
|
//---------------------------------------------------------
|
||||||
|
//--result varification
|
||||||
|
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
|
||||||
|
//release host memory
|
||||||
|
free(h_graph_nodes);
|
||||||
|
free(h_graph_mask);
|
||||||
|
free(h_updating_graph_mask);
|
||||||
|
free(h_graph_visited);
|
||||||
|
|
||||||
|
} catch(std::string msg) {
|
||||||
|
printf("--cambine: exception in main -> %s\n", msg);
|
||||||
|
//release host memory
|
||||||
|
free(h_graph_nodes);
|
||||||
|
free(h_graph_mask);
|
||||||
|
free(h_updating_graph_mask);
|
||||||
|
free(h_graph_visited);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
50
tests/opencl/bfs2/kernel.cl
Normal file
50
tests/opencl/bfs2/kernel.cl
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
/* ============================================================
|
||||||
|
//--cambine: kernel funtion of Breadth-First-Search
|
||||||
|
//--author: created by Jianbin Fang
|
||||||
|
//--date: 06/12/2010
|
||||||
|
============================================================ */
|
||||||
|
#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
|
||||||
|
//Structure to hold a node information
|
||||||
|
typedef struct{
|
||||||
|
int starting;
|
||||||
|
int no_of_edges;
|
||||||
|
} Node;
|
||||||
|
//--7 parameters
|
||||||
|
__kernel void BFS_1( const __global Node* g_graph_nodes,
|
||||||
|
const __global int* g_graph_edges,
|
||||||
|
__global char* g_graph_mask,
|
||||||
|
__global char* g_updating_graph_mask,
|
||||||
|
__global char* g_graph_visited,
|
||||||
|
__global int* g_cost,
|
||||||
|
const int no_of_nodes){
|
||||||
|
int tid = get_global_id(0);
|
||||||
|
if( tid<no_of_nodes && g_graph_mask[tid]){
|
||||||
|
g_graph_mask[tid]=false;
|
||||||
|
for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++){
|
||||||
|
int id = g_graph_edges[i];
|
||||||
|
if(!g_graph_visited[id]){
|
||||||
|
g_cost[id]=g_cost[tid]+1;
|
||||||
|
g_updating_graph_mask[id]=true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//--5 parameters
|
||||||
|
__kernel void BFS_2(__global char* g_graph_mask,
|
||||||
|
__global char* g_updating_graph_mask,
|
||||||
|
__global char* g_graph_visited,
|
||||||
|
__global char* g_over,
|
||||||
|
const int no_of_nodes
|
||||||
|
) {
|
||||||
|
int tid = get_global_id(0);
|
||||||
|
if( tid<no_of_nodes && g_updating_graph_mask[tid]){
|
||||||
|
|
||||||
|
g_graph_mask[tid]=true;
|
||||||
|
g_graph_visited[tid]=true;
|
||||||
|
*g_over=true;
|
||||||
|
g_updating_graph_mask[tid]=false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
78
tests/opencl/bfs2/timer.cc
Normal file
78
tests/opencl/bfs2/timer.cc
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
#include <cstdlib>
|
||||||
|
#include <cstring>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iomanip>
|
||||||
|
|
||||||
|
#include "timer.h"
|
||||||
|
|
||||||
|
|
||||||
|
using namespace std;
|
||||||
|
|
||||||
|
double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
|
||||||
|
|
||||||
|
|
||||||
|
double timer::get_CPU_speed_in_MHz()
|
||||||
|
{
|
||||||
|
#if defined __linux__
|
||||||
|
ifstream infile("/proc/cpuinfo");
|
||||||
|
char buffer[256], *colon;
|
||||||
|
|
||||||
|
while (infile.good()) {
|
||||||
|
infile.getline(buffer, 256);
|
||||||
|
|
||||||
|
if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
|
||||||
|
return atof(colon + 2);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void timer::print_time(ostream &str, const char *which, double time) const
|
||||||
|
{
|
||||||
|
static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 };
|
||||||
|
const char **unit = units;
|
||||||
|
|
||||||
|
time = 1000.0 * time / CPU_speed_in_MHz;
|
||||||
|
|
||||||
|
while (time >= 999.5 && unit[1] != 0) {
|
||||||
|
time /= 1000.0;
|
||||||
|
++ unit;
|
||||||
|
}
|
||||||
|
|
||||||
|
str << which << " = " << setprecision(3) << setw(4) << time << *unit;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
ostream &timer::print(ostream &str)
|
||||||
|
{
|
||||||
|
str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right;
|
||||||
|
|
||||||
|
if (CPU_speed_in_MHz == 0)
|
||||||
|
str << "could not determine CPU speed\n";
|
||||||
|
else if (count > 0) {
|
||||||
|
double total = static_cast<double>(total_time);
|
||||||
|
|
||||||
|
print_time(str, "avg", total / static_cast<double>(count));
|
||||||
|
print_time(str, ", total", total);
|
||||||
|
str << ", count = " << setw(9) << count << '\n';
|
||||||
|
}
|
||||||
|
else
|
||||||
|
str << "not used\n";
|
||||||
|
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
ostream &operator << (ostream &str, class timer &timer)
|
||||||
|
{
|
||||||
|
return timer.print(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
double timer::getTimeInSeconds()
|
||||||
|
{
|
||||||
|
double total = static_cast<double>(total_time);
|
||||||
|
double res = (total / 1000000.0) / CPU_speed_in_MHz;
|
||||||
|
return res;
|
||||||
|
}
|
128
tests/opencl/bfs2/timer.h
Normal file
128
tests/opencl/bfs2/timer.h
Normal file
|
@ -0,0 +1,128 @@
|
||||||
|
#ifndef timer_h
|
||||||
|
#define timer_h
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
|
||||||
|
class timer {
|
||||||
|
public:
|
||||||
|
timer(const char *name = 0);
|
||||||
|
timer(const char *name, std::ostream &write_on_exit);
|
||||||
|
|
||||||
|
~timer();
|
||||||
|
|
||||||
|
void start(), stop();
|
||||||
|
void reset();
|
||||||
|
std::ostream &print(std::ostream &);
|
||||||
|
|
||||||
|
double getTimeInSeconds();
|
||||||
|
|
||||||
|
private:
|
||||||
|
void print_time(std::ostream &, const char *which, double time) const;
|
||||||
|
|
||||||
|
union {
|
||||||
|
long long total_time;
|
||||||
|
struct {
|
||||||
|
#if defined __PPC__
|
||||||
|
int high, low;
|
||||||
|
#else
|
||||||
|
int low, high;
|
||||||
|
#endif
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
unsigned long long count;
|
||||||
|
const char *const name;
|
||||||
|
std::ostream *const write_on_exit;
|
||||||
|
|
||||||
|
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
std::ostream &operator << (std::ostream &, class timer &);
|
||||||
|
|
||||||
|
|
||||||
|
inline void timer::reset()
|
||||||
|
{
|
||||||
|
total_time = 0;
|
||||||
|
count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline timer::timer(const char *name)
|
||||||
|
:
|
||||||
|
name(name),
|
||||||
|
write_on_exit(0)
|
||||||
|
{
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline timer::timer(const char *name, std::ostream &write_on_exit)
|
||||||
|
:
|
||||||
|
name(name),
|
||||||
|
write_on_exit(&write_on_exit)
|
||||||
|
{
|
||||||
|
reset();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline timer::~timer()
|
||||||
|
{
|
||||||
|
if (write_on_exit != 0)
|
||||||
|
print(*write_on_exit);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void timer::start()
|
||||||
|
{
|
||||||
|
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
|
||||||
|
unsigned eax, edx;
|
||||||
|
|
||||||
|
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
|
||||||
|
|
||||||
|
total_time -= ((unsigned long long) edx << 32) + eax;
|
||||||
|
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
|
||||||
|
asm volatile
|
||||||
|
(
|
||||||
|
"rdtsc\n\t"
|
||||||
|
"subl %%eax, %0\n\t"
|
||||||
|
"sbbl %%edx, %1"
|
||||||
|
:
|
||||||
|
"+m" (low), "+m" (high)
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"eax", "edx"
|
||||||
|
);
|
||||||
|
#else
|
||||||
|
#error Compiler/Architecture not recognized
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
inline void timer::stop()
|
||||||
|
{
|
||||||
|
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
|
||||||
|
unsigned eax, edx;
|
||||||
|
|
||||||
|
asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
|
||||||
|
|
||||||
|
total_time += ((unsigned long long) edx << 32) + eax;
|
||||||
|
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
|
||||||
|
asm volatile
|
||||||
|
(
|
||||||
|
"rdtsc\n\t"
|
||||||
|
"addl %%eax, %0\n\t"
|
||||||
|
"adcl %%edx, %1"
|
||||||
|
:
|
||||||
|
"+m" (low), "+m" (high)
|
||||||
|
:
|
||||||
|
:
|
||||||
|
"eax", "edx"
|
||||||
|
);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
++ count;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
72
tests/opencl/bfs2/util.h
Normal file
72
tests/opencl/bfs2/util.h
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
#ifndef _C_UTIL_
|
||||||
|
#define _C_UTIL_
|
||||||
|
#include <math.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <omp.h>
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
//--initialize array with maximum limit
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
template<typename datatype>
|
||||||
|
void fill(datatype *A, const int n, const datatype maxi){
|
||||||
|
for (int j = 0; j < n; j++)
|
||||||
|
{
|
||||||
|
A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//--print matrix
|
||||||
|
template<typename datatype>
|
||||||
|
void print_matrix(datatype *A, int height, int width){
|
||||||
|
for(int i=0; i<height; i++){
|
||||||
|
for(int j=0; j<width; j++){
|
||||||
|
int idx = i*width + j;
|
||||||
|
std::cout<<A[idx]<<" ";
|
||||||
|
}
|
||||||
|
std::cout<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
//--verify results
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
#define MAX_RELATIVE_ERROR .002
|
||||||
|
template<typename datatype>
|
||||||
|
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
|
||||||
|
|
||||||
|
char passed = true;
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i=0; i<size; i++){
|
||||||
|
if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
|
||||||
|
passed = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (passed){
|
||||||
|
printf("--cambine:passed:-)\n");
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
printf("--cambine: failed:-(\n");
|
||||||
|
}
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
template<typename datatype>
|
||||||
|
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
|
||||||
|
|
||||||
|
char passed = true;
|
||||||
|
//#pragma omp parallel for
|
||||||
|
for (int i=0; i<size; i++){
|
||||||
|
if (cpu_results[i]!=gpu_results[i]){
|
||||||
|
passed = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (passed){
|
||||||
|
printf("--cambine:passed:-)\n");
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
printf("--cambine: failed:-(\n");
|
||||||
|
}
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
2
tests/opencl/cfd/.gitignore
vendored
Normal file
2
tests/opencl/cfd/.gitignore
vendored
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
*.txt
|
||||||
|
*.out
|
1572
tests/opencl/cfd/CLHelper.h
Normal file
1572
tests/opencl/cfd/CLHelper.h
Normal file
File diff suppressed because it is too large
Load diff
7
tests/opencl/cfd/Makefile
Normal file
7
tests/opencl/cfd/Makefile
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
PROJECT = cfd
|
||||||
|
|
||||||
|
SRCS = euler3d.cc timing.cpp
|
||||||
|
|
||||||
|
OPTS ?= fvcorr.domn.097K
|
||||||
|
|
||||||
|
include ../common.mk
|
36
tests/opencl/cfd/Makefile_old
Normal file
36
tests/opencl/cfd/Makefile_old
Normal file
|
@ -0,0 +1,36 @@
|
||||||
|
include ../../common/make.config
|
||||||
|
|
||||||
|
#Can be changed by `make TYPE=CPU`
|
||||||
|
TYPE = GPU
|
||||||
|
|
||||||
|
#Library
|
||||||
|
ifeq ($(TYPE),GPU)
|
||||||
|
OPENCL_INC = $(NV_OPENCL_INC)
|
||||||
|
OPENCL_LIB = $(NV_OPENCL_LIB)
|
||||||
|
else
|
||||||
|
OPENCL_INC = $(INTEL_OPENCL_INC)
|
||||||
|
OPENCL_LIB = $(INTEL_OPENCL_LIB)
|
||||||
|
endif
|
||||||
|
|
||||||
|
#C compiler
|
||||||
|
CC = g++
|
||||||
|
|
||||||
|
FLAGS = -O3
|
||||||
|
SRC = euler3d.cpp ../util/timing.c
|
||||||
|
|
||||||
|
EXE = euler3d.out
|
||||||
|
|
||||||
|
release:$(SRC)
|
||||||
|
$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -I../util -DTIMING $(FLAGS)
|
||||||
|
|
||||||
|
errmsg:$(SRC)
|
||||||
|
$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -D ERRMSG $(FLAGS)
|
||||||
|
|
||||||
|
ptx:$(SRC)
|
||||||
|
$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -D PTX_MSG $(FLAGS)
|
||||||
|
|
||||||
|
res:$(SRC)
|
||||||
|
$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -D RES_MSG $(FLAGS)
|
||||||
|
|
||||||
|
clean: $(SRC)
|
||||||
|
rm -f $(EXE) $(EXE).linkinfo result*
|
10
tests/opencl/cfd/README
Normal file
10
tests/opencl/cfd/README
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
******Adjustable work group size*****
|
||||||
|
RD_WG_SIZE for all
|
||||||
|
RD_WG_SIZE_1 or RD_WG_SIZE_1_0 for initialize_variables
|
||||||
|
RD_WG_SIZE_2 or RD_WG_SIZE_2_0 for compute_step_factor
|
||||||
|
RD_WG_SIZE_3 or RD_WG_SIZE_3_0 for compute_flux
|
||||||
|
RD_WG_SIZE_4 or RD_WG_SIZE_4_0 for time_step
|
||||||
|
|
||||||
|
USAGE:
|
||||||
|
make clean
|
||||||
|
make KERNEL_DIM="-DRD_WG_SIZE_1=128 -DRD_WG_SIZE_2=192 -DRD_WG_SIZE_3=128 -DRD_WG_SIZE_4=256"
|
BIN
tests/opencl/cfd/cfd
Executable file
BIN
tests/opencl/cfd/cfd
Executable file
Binary file not shown.
418
tests/opencl/cfd/euler3d.cc
Normal file
418
tests/opencl/cfd/euler3d.cc
Normal file
|
@ -0,0 +1,418 @@
|
||||||
|
/********************************************************************
|
||||||
|
euler3d.cpp
|
||||||
|
: parallelized code of CFD
|
||||||
|
|
||||||
|
- original code from the AIAA-2009-4001 by Andrew Corrigan, acorriga@gmu.edu
|
||||||
|
- parallelization with OpenCL API has been applied by
|
||||||
|
Jianbin Fang - j.fang@tudelft.nl
|
||||||
|
Delft University of Technology
|
||||||
|
Faculty of Electrical Engineering, Mathematics and Computer Science
|
||||||
|
Department of Software Technology
|
||||||
|
Parallel and Distributed Systems Group
|
||||||
|
on 24/03/2011
|
||||||
|
********************************************************************/
|
||||||
|
|
||||||
|
#include <iostream>
|
||||||
|
#include <fstream>
|
||||||
|
#include <math.h>
|
||||||
|
#include "CLHelper.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Options
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
#define GAMMA 1.4f
|
||||||
|
#define iterations 2000
|
||||||
|
#ifndef block_length
|
||||||
|
#define block_length 192
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define NDIM 3
|
||||||
|
#define NNB 4
|
||||||
|
|
||||||
|
#define RK 3 // 3rd order RK
|
||||||
|
#define ff_mach 1.2f
|
||||||
|
#define deg_angle_of_attack 0.0f
|
||||||
|
|
||||||
|
/*
|
||||||
|
* not options
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#if block_length > 128
|
||||||
|
#warning "the kernels may fail too launch on some systems if the block length is too large"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#define VAR_DENSITY 0
|
||||||
|
#define VAR_MOMENTUM 1
|
||||||
|
#define VAR_DENSITY_ENERGY (VAR_MOMENTUM+NDIM)
|
||||||
|
#define NVAR (VAR_DENSITY_ENERGY+1)
|
||||||
|
|
||||||
|
//self-defined user type
|
||||||
|
typedef struct{
|
||||||
|
float x;
|
||||||
|
float y;
|
||||||
|
float z;
|
||||||
|
} float3;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Generic functions
|
||||||
|
*/
|
||||||
|
template <typename T>
|
||||||
|
cl_mem alloc(int N){
|
||||||
|
cl_mem mem_d = _clMalloc(sizeof(T)*N);
|
||||||
|
return mem_d;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void dealloc(cl_mem array){
|
||||||
|
_clFree(array);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void copy(cl_mem dst, cl_mem src, int N){
|
||||||
|
_clMemcpyD2D(dst, src, N*sizeof(T));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void upload(cl_mem dst, T* src, int N){
|
||||||
|
_clMemcpyH2D(dst, src, N*sizeof(T));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void download(T* dst, cl_mem src, int N){
|
||||||
|
_clMemcpyD2H(dst, src, N*sizeof(T));
|
||||||
|
}
|
||||||
|
|
||||||
|
void dump(cl_mem variables, int nel, int nelr){
|
||||||
|
float* h_variables = new float[nelr*NVAR];
|
||||||
|
download(h_variables, variables, nelr*NVAR);
|
||||||
|
|
||||||
|
{
|
||||||
|
std::ofstream file("density.txt");
|
||||||
|
file << nel << " " << nelr << std::endl;
|
||||||
|
for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY*nelr] << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
{
|
||||||
|
std::ofstream file("momentum.txt");
|
||||||
|
file << nel << " " << nelr << std::endl;
|
||||||
|
for(int i = 0; i < nel; i++)
|
||||||
|
{
|
||||||
|
for(int j = 0; j != NDIM; j++)
|
||||||
|
file << h_variables[i + (VAR_MOMENTUM+j)*nelr] << " ";
|
||||||
|
file << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
std::ofstream file("density_energy.txt");
|
||||||
|
file << nel << " " << nelr << std::endl;
|
||||||
|
for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY_ENERGY*nelr] << std::endl;
|
||||||
|
}
|
||||||
|
delete[] h_variables;
|
||||||
|
}
|
||||||
|
|
||||||
|
void initialize_variables(int nelr, cl_mem variables, cl_mem ff_variable) throw(string){
|
||||||
|
|
||||||
|
int work_items = nelr;
|
||||||
|
int work_group_size = BLOCK_SIZE_1;
|
||||||
|
int kernel_id = 1;
|
||||||
|
int arg_idx = 0;
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, variables);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, ff_variable);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
|
||||||
|
_clInvokeKernel(kernel_id, work_items, work_group_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_step_factor(int nelr, cl_mem variables, cl_mem areas, cl_mem step_factors){
|
||||||
|
|
||||||
|
int work_items = nelr;
|
||||||
|
int work_group_size = BLOCK_SIZE_2;
|
||||||
|
int kernel_id = 2;
|
||||||
|
int arg_idx = 0;
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, variables);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, areas);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, step_factors);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
|
||||||
|
_clInvokeKernel(kernel_id, work_items, work_group_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
void compute_flux(int nelr, cl_mem elements_surrounding_elements, cl_mem normals, cl_mem variables, cl_mem ff_variable, \
|
||||||
|
cl_mem fluxes, cl_mem ff_flux_contribution_density_energy,
|
||||||
|
cl_mem ff_flux_contribution_momentum_x,
|
||||||
|
cl_mem ff_flux_contribution_momentum_y,
|
||||||
|
cl_mem ff_flux_contribution_momentum_z){
|
||||||
|
|
||||||
|
int work_items = nelr;
|
||||||
|
int work_group_size = BLOCK_SIZE_3;
|
||||||
|
int kernel_id = 3;
|
||||||
|
int arg_idx = 0;
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, elements_surrounding_elements);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, normals);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, variables);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, ff_variable);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, fluxes);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_density_energy);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_x);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_y);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_z);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
|
||||||
|
_clInvokeKernel(kernel_id, work_items, work_group_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
void time_step(int j, int nelr, cl_mem old_variables, cl_mem variables, cl_mem step_factors, cl_mem fluxes){
|
||||||
|
|
||||||
|
int work_items = nelr;
|
||||||
|
int work_group_size = BLOCK_SIZE_4;
|
||||||
|
int kernel_id = 4;
|
||||||
|
int arg_idx = 0;
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, &j, sizeof(int));
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, old_variables);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, variables);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, step_factors);
|
||||||
|
_clSetArgs(kernel_id, arg_idx++, fluxes);
|
||||||
|
|
||||||
|
_clInvokeKernel(kernel_id, work_items, work_group_size);
|
||||||
|
}
|
||||||
|
inline void compute_flux_contribution(float& density, float3& momentum, float& density_energy, float& pressure, float3& velocity, float3& fc_momentum_x, float3& fc_momentum_y, float3& fc_momentum_z, float3& fc_density_energy)
|
||||||
|
{
|
||||||
|
fc_momentum_x.x = velocity.x*momentum.x + pressure;
|
||||||
|
fc_momentum_x.y = velocity.x*momentum.y;
|
||||||
|
fc_momentum_x.z = velocity.x*momentum.z;
|
||||||
|
|
||||||
|
|
||||||
|
fc_momentum_y.x = fc_momentum_x.y;
|
||||||
|
fc_momentum_y.y = velocity.y*momentum.y + pressure;
|
||||||
|
fc_momentum_y.z = velocity.y*momentum.z;
|
||||||
|
|
||||||
|
fc_momentum_z.x = fc_momentum_x.z;
|
||||||
|
fc_momentum_z.y = fc_momentum_y.z;
|
||||||
|
fc_momentum_z.z = velocity.z*momentum.z + pressure;
|
||||||
|
|
||||||
|
float de_p = density_energy+pressure;
|
||||||
|
fc_density_energy.x = velocity.x*de_p;
|
||||||
|
fc_density_energy.y = velocity.y*de_p;
|
||||||
|
fc_density_energy.z = velocity.z*de_p;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Main function
|
||||||
|
*/
|
||||||
|
int main(int argc, char** argv){
|
||||||
|
printf("WG size of kernel:initialize = %d, WG size of kernel:compute_step_factor = %d, WG size of kernel:compute_flux = %d, WG size of kernel:time_step = %d\n", BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
|
||||||
|
|
||||||
|
if (argc < 2){
|
||||||
|
//std::cout << "specify data file name and [device type] [device id]" << std::endl;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
const char* data_file_name = argv[1];
|
||||||
|
_clCmdParams(argc, argv);
|
||||||
|
cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy;
|
||||||
|
cl_mem areas, elements_surrounding_elements, normals;
|
||||||
|
cl_mem variables, old_variables, fluxes, step_factors;
|
||||||
|
float h_ff_variable[NVAR];
|
||||||
|
|
||||||
|
try{
|
||||||
|
_clInit(device_type, device_id);
|
||||||
|
// set far field conditions and load them into constant memory on the gpu
|
||||||
|
{
|
||||||
|
//float h_ff_variable[NVAR];
|
||||||
|
const float angle_of_attack = float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
|
||||||
|
|
||||||
|
h_ff_variable[VAR_DENSITY] = float(1.4);
|
||||||
|
|
||||||
|
float ff_pressure = float(1.0f);
|
||||||
|
float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
|
||||||
|
float ff_speed = float(ff_mach)*ff_speed_of_sound;
|
||||||
|
|
||||||
|
float3 ff_velocity;
|
||||||
|
ff_velocity.x = ff_speed*float(cos((float)angle_of_attack));
|
||||||
|
ff_velocity.y = ff_speed*float(sin((float)angle_of_attack));
|
||||||
|
ff_velocity.z = 0.0f;
|
||||||
|
|
||||||
|
h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
|
||||||
|
h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
|
||||||
|
h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;
|
||||||
|
|
||||||
|
h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*(float(0.5f)*(ff_speed*ff_speed)) + (ff_pressure / float(GAMMA-1.0f));
|
||||||
|
|
||||||
|
float3 h_ff_momentum;
|
||||||
|
h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
|
||||||
|
h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
|
||||||
|
h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
|
||||||
|
float3 h_ff_flux_contribution_momentum_x;
|
||||||
|
float3 h_ff_flux_contribution_momentum_y;
|
||||||
|
float3 h_ff_flux_contribution_momentum_z;
|
||||||
|
float3 h_ff_flux_contribution_density_energy;
|
||||||
|
compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum, h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, ff_velocity, h_ff_flux_contribution_momentum_x, h_ff_flux_contribution_momentum_y, h_ff_flux_contribution_momentum_z, h_ff_flux_contribution_density_energy);
|
||||||
|
|
||||||
|
// copy far field conditions to the gpu
|
||||||
|
//cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy;
|
||||||
|
ff_variable = _clMalloc(NVAR*sizeof(float));
|
||||||
|
ff_flux_contribution_momentum_x = _clMalloc(sizeof(float3));
|
||||||
|
ff_flux_contribution_momentum_y = _clMalloc(sizeof(float3));
|
||||||
|
ff_flux_contribution_momentum_z = _clMalloc(sizeof(float3));
|
||||||
|
ff_flux_contribution_density_energy = _clMalloc(sizeof(float3));
|
||||||
|
_clMemcpyH2D(ff_variable, h_ff_variable, NVAR*sizeof(float));
|
||||||
|
_clMemcpyH2D(ff_flux_contribution_momentum_x, &h_ff_flux_contribution_momentum_x, sizeof(float3));
|
||||||
|
_clMemcpyH2D(ff_flux_contribution_momentum_y, &h_ff_flux_contribution_momentum_y, sizeof(float3));
|
||||||
|
_clMemcpyH2D(ff_flux_contribution_momentum_z, &h_ff_flux_contribution_momentum_z, sizeof(float3));
|
||||||
|
_clMemcpyH2D(ff_flux_contribution_density_energy, &h_ff_flux_contribution_density_energy, sizeof(float3));
|
||||||
|
_clFinish();
|
||||||
|
|
||||||
|
}
|
||||||
|
int nel;
|
||||||
|
int nelr;
|
||||||
|
// read in domain geometry
|
||||||
|
//float* areas;
|
||||||
|
//int* elements_surrounding_elements;
|
||||||
|
//float* normals;
|
||||||
|
{
|
||||||
|
std::ifstream file(data_file_name);
|
||||||
|
if(!file.good()){
|
||||||
|
throw(string("can not find/open file!"));
|
||||||
|
}
|
||||||
|
file >> nel;
|
||||||
|
nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));
|
||||||
|
////std::cout<<"--cambine: nel="<<nel<<", nelr="<<nelr<<std::endl;
|
||||||
|
float* h_areas = new float[nelr];
|
||||||
|
int* h_elements_surrounding_elements = new int[nelr*NNB];
|
||||||
|
float* h_normals = new float[nelr*NDIM*NNB];
|
||||||
|
|
||||||
|
|
||||||
|
// read in data
|
||||||
|
for(int i = 0; i < nel; i++)
|
||||||
|
{
|
||||||
|
file >> h_areas[i];
|
||||||
|
for(int j = 0; j < NNB; j++)
|
||||||
|
{
|
||||||
|
file >> h_elements_surrounding_elements[i + j*nelr];
|
||||||
|
if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
|
||||||
|
h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering
|
||||||
|
|
||||||
|
for(int k = 0; k < NDIM; k++)
|
||||||
|
{
|
||||||
|
file >> h_normals[i + (j + k*NNB)*nelr];
|
||||||
|
h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// fill in remaining data
|
||||||
|
int last = nel-1;
|
||||||
|
for(int i = nel; i < nelr; i++)
|
||||||
|
{
|
||||||
|
h_areas[i] = h_areas[last];
|
||||||
|
for(int j = 0; j < NNB; j++)
|
||||||
|
{
|
||||||
|
// duplicate the last element
|
||||||
|
h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];
|
||||||
|
for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
areas = alloc<float>(nelr);
|
||||||
|
upload<float>(areas, h_areas, nelr);
|
||||||
|
|
||||||
|
elements_surrounding_elements = alloc<int>(nelr*NNB);
|
||||||
|
upload<int>(elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);
|
||||||
|
|
||||||
|
normals = alloc<float>(nelr*NDIM*NNB);
|
||||||
|
upload<float>(normals, h_normals, nelr*NDIM*NNB);
|
||||||
|
|
||||||
|
delete[] h_areas;
|
||||||
|
delete[] h_elements_surrounding_elements;
|
||||||
|
delete[] h_normals;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create arrays and set initial conditions
|
||||||
|
variables = alloc<float>(nelr*NVAR);
|
||||||
|
int tp = 0;
|
||||||
|
initialize_variables(nelr, variables, ff_variable);
|
||||||
|
old_variables = alloc<float>(nelr*NVAR);
|
||||||
|
fluxes = alloc<float>(nelr*NVAR);
|
||||||
|
step_factors = alloc<float>(nelr);
|
||||||
|
// make sure all memory is floatly allocated before we start timing
|
||||||
|
initialize_variables(nelr, old_variables, ff_variable);
|
||||||
|
initialize_variables(nelr, fluxes, ff_variable);
|
||||||
|
_clMemset(step_factors, 0, sizeof(float)*nelr);
|
||||||
|
// make sure CUDA isn't still doing something before we start timing
|
||||||
|
_clFinish();
|
||||||
|
// these need to be computed the first time in order to compute time step
|
||||||
|
printf( "Starting...\n");
|
||||||
|
|
||||||
|
// Begin iterations
|
||||||
|
for(int i = 0; i < iterations; i++){
|
||||||
|
copy<float>(old_variables, variables, nelr*NVAR);
|
||||||
|
// for the first iteration we compute the time step
|
||||||
|
compute_step_factor(nelr, variables, areas, step_factors);
|
||||||
|
for(int j = 0; j < RK; j++){
|
||||||
|
compute_flux(nelr, elements_surrounding_elements, normals, variables, ff_variable, fluxes, ff_flux_contribution_density_energy, \
|
||||||
|
ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z);
|
||||||
|
|
||||||
|
time_step(j, nelr, old_variables, variables, step_factors, fluxes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_clFinish();
|
||||||
|
//std::cout << "Saving solution..." << std::endl;
|
||||||
|
dump(variables, nel, nelr);
|
||||||
|
//std::cout << "Saved solution..." << std::endl;
|
||||||
|
_clStatistics();
|
||||||
|
//std::cout << "Cleaning up..." << std::endl;
|
||||||
|
|
||||||
|
//--release resources
|
||||||
|
_clFree(ff_variable);
|
||||||
|
_clFree(ff_flux_contribution_momentum_x);
|
||||||
|
_clFree(ff_flux_contribution_momentum_y);
|
||||||
|
_clFree(ff_flux_contribution_momentum_z);
|
||||||
|
_clFree(ff_flux_contribution_density_energy);
|
||||||
|
_clFree(areas);
|
||||||
|
_clFree(elements_surrounding_elements);
|
||||||
|
_clFree(normals);
|
||||||
|
_clFree(variables);
|
||||||
|
_clFree(old_variables);
|
||||||
|
_clFree(fluxes);
|
||||||
|
_clFree(step_factors);
|
||||||
|
_clRelease();
|
||||||
|
printf("Done...\n");
|
||||||
|
_clPrintTiming();
|
||||||
|
}
|
||||||
|
catch(string msg){
|
||||||
|
printf("--cambine:( an exception catched in main body ->%s\n", msg.c_str());
|
||||||
|
_clFree(ff_variable);
|
||||||
|
_clFree(ff_flux_contribution_momentum_x);
|
||||||
|
_clFree(ff_flux_contribution_momentum_y);
|
||||||
|
_clFree(ff_flux_contribution_momentum_z);
|
||||||
|
_clFree(ff_flux_contribution_density_energy);
|
||||||
|
_clFree(areas);
|
||||||
|
_clFree(elements_surrounding_elements);
|
||||||
|
_clFree(normals);
|
||||||
|
_clFree(variables);
|
||||||
|
_clFree(old_variables);
|
||||||
|
_clFree(fluxes);
|
||||||
|
_clFree(step_factors);
|
||||||
|
_clRelease();
|
||||||
|
}
|
||||||
|
catch(...){
|
||||||
|
//std::cout<<"--cambine:( unknow exceptions in main body..."<<std::endl;
|
||||||
|
_clFree(ff_variable);
|
||||||
|
_clFree(ff_flux_contribution_momentum_x);
|
||||||
|
_clFree(ff_flux_contribution_momentum_y);
|
||||||
|
_clFree(ff_flux_contribution_momentum_z);
|
||||||
|
_clFree(ff_flux_contribution_density_energy);
|
||||||
|
_clFree(areas);
|
||||||
|
_clFree(elements_surrounding_elements);
|
||||||
|
_clFree(normals);
|
||||||
|
_clFree(variables);
|
||||||
|
_clFree(old_variables);
|
||||||
|
_clFree(fluxes);
|
||||||
|
_clFree(step_factors);
|
||||||
|
_clRelease();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
97047
tests/opencl/cfd/fvcorr.domn.097K
Normal file
97047
tests/opencl/cfd/fvcorr.domn.097K
Normal file
File diff suppressed because it is too large
Load diff
284
tests/opencl/cfd/kernel.cl
Normal file
284
tests/opencl/cfd/kernel.cl
Normal file
|
@ -0,0 +1,284 @@
|
||||||
|
/* ============================================================
|
||||||
|
//--functions: kernel funtion
|
||||||
|
//--programmer: Jianbin Fang
|
||||||
|
//--date: 24/03/2011
|
||||||
|
============================================================ */
|
||||||
|
#ifndef _KERNEL_
|
||||||
|
#define _KERNEL_
|
||||||
|
|
||||||
|
#define GAMMA (1.4f)
|
||||||
|
|
||||||
|
|
||||||
|
#define NDIM 3
|
||||||
|
#define NNB 4
|
||||||
|
|
||||||
|
#define RK 3 // 3rd order RK
|
||||||
|
#define ff_mach 1.2f
|
||||||
|
#define deg_angle_of_attack 0.0f
|
||||||
|
|
||||||
|
#define VAR_DENSITY 0
|
||||||
|
#define VAR_MOMENTUM 1
|
||||||
|
#define VAR_DENSITY_ENERGY (VAR_MOMENTUM+NDIM)
|
||||||
|
#define NVAR (VAR_DENSITY_ENERGY+1)
|
||||||
|
//#pragma OPENCL EXTENSION CL_MAD : enable
|
||||||
|
|
||||||
|
//self-defined user type
|
||||||
|
typedef struct{
|
||||||
|
float x;
|
||||||
|
float y;
|
||||||
|
float z;
|
||||||
|
} FLOAT3;
|
||||||
|
/*------------------------------------------------------------
|
||||||
|
@function: set memory
|
||||||
|
@params:
|
||||||
|
mem_d: target memory to be set;
|
||||||
|
val: set the target memory to value 'val'
|
||||||
|
num_bytes: the number of bytes all together
|
||||||
|
@return: through mem_d
|
||||||
|
------------------------------------------------------------*/
|
||||||
|
__kernel void memset_kernel(__global char * mem_d, short val, int ct){
|
||||||
|
const int thread_id = get_global_id(0);
|
||||||
|
if( thread_id >= ct) return;
|
||||||
|
mem_d[thread_id] = val;
|
||||||
|
}
|
||||||
|
|
||||||
|
//--cambine: omit &
|
||||||
|
inline void compute_velocity(float density, FLOAT3 momentum, FLOAT3* velocity){
|
||||||
|
velocity->x = momentum.x / density;
|
||||||
|
velocity->y = momentum.y / density;
|
||||||
|
velocity->z = momentum.z / density;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline float compute_speed_sqd(FLOAT3 velocity){
|
||||||
|
return velocity.x*velocity.x + velocity.y*velocity.y + velocity.z*velocity.z;
|
||||||
|
}
|
||||||
|
|
||||||
|
inline float compute_pressure(float density, float density_energy, float speed_sqd){
|
||||||
|
return ((float)(GAMMA) - (float)(1.0f))*(density_energy - (float)(0.5f)*density*speed_sqd);
|
||||||
|
}
|
||||||
|
inline float compute_speed_of_sound(float density, float pressure){
|
||||||
|
//return sqrtf(float(GAMMA)*pressure/density);
|
||||||
|
return sqrt((float)(GAMMA)*pressure/density);
|
||||||
|
}
|
||||||
|
inline void compute_flux_contribution(float density, FLOAT3 momentum, float density_energy, float pressure, FLOAT3 velocity, FLOAT3* fc_momentum_x, FLOAT3* fc_momentum_y, FLOAT3* fc_momentum_z, FLOAT3* fc_density_energy)
|
||||||
|
{
|
||||||
|
fc_momentum_x->x = velocity.x*momentum.x + pressure;
|
||||||
|
fc_momentum_x->y = velocity.x*momentum.y;
|
||||||
|
fc_momentum_x->z = velocity.x*momentum.z;
|
||||||
|
|
||||||
|
|
||||||
|
fc_momentum_y->x = fc_momentum_x->y;
|
||||||
|
fc_momentum_y->y = velocity.y*momentum.y + pressure;
|
||||||
|
fc_momentum_y->z = velocity.y*momentum.z;
|
||||||
|
|
||||||
|
fc_momentum_z->x = fc_momentum_x->z;
|
||||||
|
fc_momentum_z->y = fc_momentum_y->z;
|
||||||
|
fc_momentum_z->z = velocity.z*momentum.z + pressure;
|
||||||
|
|
||||||
|
float de_p = density_energy+pressure;
|
||||||
|
fc_density_energy->x = velocity.x*de_p;
|
||||||
|
fc_density_energy->y = velocity.y*de_p;
|
||||||
|
fc_density_energy->z = velocity.z*de_p;
|
||||||
|
}
|
||||||
|
__kernel void initialize_variables(__global float* variables, __constant float* ff_variable, int nelr){
|
||||||
|
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
|
||||||
|
const int i = get_global_id(0);
|
||||||
|
if( i >= nelr) return;
|
||||||
|
for(int j = 0; j < NVAR; j++)
|
||||||
|
variables[i + j*nelr] = ff_variable[j];
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void compute_step_factor(__global float* variables,
|
||||||
|
__global float* areas,
|
||||||
|
__global float* step_factors,
|
||||||
|
int nelr){
|
||||||
|
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
|
||||||
|
const int i = get_global_id(0);
|
||||||
|
if( i >= nelr) return;
|
||||||
|
|
||||||
|
float density = variables[i + VAR_DENSITY*nelr];
|
||||||
|
FLOAT3 momentum;
|
||||||
|
momentum.x = variables[i + (VAR_MOMENTUM+0)*nelr];
|
||||||
|
momentum.y = variables[i + (VAR_MOMENTUM+1)*nelr];
|
||||||
|
momentum.z = variables[i + (VAR_MOMENTUM+2)*nelr];
|
||||||
|
|
||||||
|
float density_energy = variables[i + VAR_DENSITY_ENERGY*nelr];
|
||||||
|
|
||||||
|
FLOAT3 velocity; compute_velocity(density, momentum, &velocity);
|
||||||
|
float speed_sqd = compute_speed_sqd(velocity);
|
||||||
|
//float speed_sqd;
|
||||||
|
//compute_speed_sqd(velocity, speed_sqd);
|
||||||
|
float pressure = compute_pressure(density, density_energy, speed_sqd);
|
||||||
|
float speed_of_sound = compute_speed_of_sound(density, pressure);
|
||||||
|
|
||||||
|
// dt = float(0.5f) * sqrtf(areas[i]) / (||v|| + c).... but when we do time stepping, this later would need to be divided by the area, so we just do it all at once
|
||||||
|
//step_factors[i] = (float)(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
|
||||||
|
step_factors[i] = (float)(0.5f) / (sqrt(areas[i]) * (sqrt(speed_sqd) + speed_of_sound));
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void compute_flux(
|
||||||
|
__global int* elements_surrounding_elements,
|
||||||
|
__global float* normals,
|
||||||
|
__global float* variables,
|
||||||
|
__constant float* ff_variable,
|
||||||
|
__global float* fluxes,
|
||||||
|
__constant FLOAT3* ff_flux_contribution_density_energy,
|
||||||
|
__constant FLOAT3* ff_flux_contribution_momentum_x,
|
||||||
|
__constant FLOAT3* ff_flux_contribution_momentum_y,
|
||||||
|
__constant FLOAT3* ff_flux_contribution_momentum_z,
|
||||||
|
int nelr){
|
||||||
|
const float smoothing_coefficient = (float)(0.2f);
|
||||||
|
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
|
||||||
|
const int i = get_global_id(0);
|
||||||
|
if( i >= nelr) return;
|
||||||
|
int j, nb;
|
||||||
|
FLOAT3 normal; float normal_len;
|
||||||
|
float factor;
|
||||||
|
|
||||||
|
float density_i = variables[i + VAR_DENSITY*nelr];
|
||||||
|
FLOAT3 momentum_i;
|
||||||
|
momentum_i.x = variables[i + (VAR_MOMENTUM+0)*nelr];
|
||||||
|
momentum_i.y = variables[i + (VAR_MOMENTUM+1)*nelr];
|
||||||
|
momentum_i.z = variables[i + (VAR_MOMENTUM+2)*nelr];
|
||||||
|
|
||||||
|
float density_energy_i = variables[i + VAR_DENSITY_ENERGY*nelr];
|
||||||
|
|
||||||
|
FLOAT3 velocity_i; compute_velocity(density_i, momentum_i, &velocity_i);
|
||||||
|
float speed_sqd_i = compute_speed_sqd(velocity_i);
|
||||||
|
//float speed_sqd_i;
|
||||||
|
//compute_speed_sqd(velocity_i, speed_sqd_i);
|
||||||
|
//float speed_i = sqrtf(speed_sqd_i);
|
||||||
|
float speed_i = sqrt(speed_sqd_i);
|
||||||
|
float pressure_i = compute_pressure(density_i, density_energy_i, speed_sqd_i);
|
||||||
|
float speed_of_sound_i = compute_speed_of_sound(density_i, pressure_i);
|
||||||
|
FLOAT3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z;
|
||||||
|
FLOAT3 flux_contribution_i_density_energy;
|
||||||
|
compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, &flux_contribution_i_momentum_x, &flux_contribution_i_momentum_y, &flux_contribution_i_momentum_z, &flux_contribution_i_density_energy);
|
||||||
|
|
||||||
|
float flux_i_density = (float)(0.0f);
|
||||||
|
FLOAT3 flux_i_momentum;
|
||||||
|
flux_i_momentum.x = (float)(0.0f);
|
||||||
|
flux_i_momentum.y = (float)(0.0f);
|
||||||
|
flux_i_momentum.z = (float)(0.0f);
|
||||||
|
float flux_i_density_energy = (float)(0.0f);
|
||||||
|
|
||||||
|
FLOAT3 velocity_nb;
|
||||||
|
float density_nb, density_energy_nb;
|
||||||
|
FLOAT3 momentum_nb;
|
||||||
|
FLOAT3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z;
|
||||||
|
FLOAT3 flux_contribution_nb_density_energy;
|
||||||
|
float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for(j = 0; j < NNB; j++)
|
||||||
|
{
|
||||||
|
nb = elements_surrounding_elements[i + j*nelr];
|
||||||
|
normal.x = normals[i + (j + 0*NNB)*nelr];
|
||||||
|
normal.y = normals[i + (j + 1*NNB)*nelr];
|
||||||
|
normal.z = normals[i + (j + 2*NNB)*nelr];
|
||||||
|
//normal_len = sqrtf(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);
|
||||||
|
normal_len = sqrt(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);
|
||||||
|
|
||||||
|
if(nb >= 0) // a legitimate neighbor
|
||||||
|
{
|
||||||
|
density_nb = variables[nb + VAR_DENSITY*nelr];
|
||||||
|
momentum_nb.x = variables[nb + (VAR_MOMENTUM+0)*nelr];
|
||||||
|
momentum_nb.y = variables[nb + (VAR_MOMENTUM+1)*nelr];
|
||||||
|
momentum_nb.z = variables[nb + (VAR_MOMENTUM+2)*nelr];
|
||||||
|
density_energy_nb = variables[nb + VAR_DENSITY_ENERGY*nelr];
|
||||||
|
compute_velocity(density_nb, momentum_nb, &velocity_nb);
|
||||||
|
speed_sqd_nb = compute_speed_sqd(velocity_nb);
|
||||||
|
pressure_nb = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
|
||||||
|
speed_of_sound_nb = compute_speed_of_sound(density_nb, pressure_nb);
|
||||||
|
compute_flux_contribution(density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb, &flux_contribution_nb_momentum_x, &flux_contribution_nb_momentum_y, &flux_contribution_nb_momentum_z, &flux_contribution_nb_density_energy);
|
||||||
|
|
||||||
|
// artificial viscosity
|
||||||
|
factor = -normal_len*smoothing_coefficient*(float)(0.5f)*(speed_i + sqrt(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb);
|
||||||
|
flux_i_density += factor*(density_i-density_nb);
|
||||||
|
flux_i_density_energy += factor*(density_energy_i-density_energy_nb);
|
||||||
|
flux_i_momentum.x += factor*(momentum_i.x-momentum_nb.x);
|
||||||
|
flux_i_momentum.y += factor*(momentum_i.y-momentum_nb.y);
|
||||||
|
flux_i_momentum.z += factor*(momentum_i.z-momentum_nb.z);
|
||||||
|
|
||||||
|
// accumulate cell-centered fluxes
|
||||||
|
factor = (float)(0.5f)*normal.x;
|
||||||
|
flux_i_density += factor*(momentum_nb.x+momentum_i.x);
|
||||||
|
flux_i_density_energy += factor*(flux_contribution_nb_density_energy.x+flux_contribution_i_density_energy.x);
|
||||||
|
flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.x+flux_contribution_i_momentum_x.x);
|
||||||
|
flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.x+flux_contribution_i_momentum_y.x);
|
||||||
|
flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.x+flux_contribution_i_momentum_z.x);
|
||||||
|
|
||||||
|
factor = (float)(0.5f)*normal.y;
|
||||||
|
flux_i_density += factor*(momentum_nb.y+momentum_i.y);
|
||||||
|
flux_i_density_energy += factor*(flux_contribution_nb_density_energy.y+flux_contribution_i_density_energy.y);
|
||||||
|
flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.y+flux_contribution_i_momentum_x.y);
|
||||||
|
flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.y+flux_contribution_i_momentum_y.y);
|
||||||
|
flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.y+flux_contribution_i_momentum_z.y);
|
||||||
|
|
||||||
|
factor = (float)(0.5f)*normal.z;
|
||||||
|
flux_i_density += factor*(momentum_nb.z+momentum_i.z);
|
||||||
|
flux_i_density_energy += factor*(flux_contribution_nb_density_energy.z+flux_contribution_i_density_energy.z);
|
||||||
|
flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.z+flux_contribution_i_momentum_x.z);
|
||||||
|
flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.z+flux_contribution_i_momentum_y.z);
|
||||||
|
flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.z+flux_contribution_i_momentum_z.z);
|
||||||
|
}
|
||||||
|
else if(nb == -1) // a wing boundary
|
||||||
|
{
|
||||||
|
flux_i_momentum.x += normal.x*pressure_i;
|
||||||
|
flux_i_momentum.y += normal.y*pressure_i;
|
||||||
|
flux_i_momentum.z += normal.z*pressure_i;
|
||||||
|
}
|
||||||
|
else if(nb == -2) // a far field boundary
|
||||||
|
{
|
||||||
|
factor = (float)(0.5f)*normal.x;
|
||||||
|
flux_i_density += factor*(ff_variable[VAR_MOMENTUM+0]+momentum_i.x);
|
||||||
|
flux_i_density_energy += factor*(ff_flux_contribution_density_energy[0].x+flux_contribution_i_density_energy.x);
|
||||||
|
flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x[0].x + flux_contribution_i_momentum_x.x);
|
||||||
|
flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y[0].x + flux_contribution_i_momentum_y.x);
|
||||||
|
flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z[0].x + flux_contribution_i_momentum_z.x);
|
||||||
|
|
||||||
|
factor = (float)(0.5f)*normal.y;
|
||||||
|
flux_i_density += factor*(ff_variable[VAR_MOMENTUM+1]+momentum_i.y);
|
||||||
|
flux_i_density_energy += factor*(ff_flux_contribution_density_energy[0].y+flux_contribution_i_density_energy.y);
|
||||||
|
flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x[0].y + flux_contribution_i_momentum_x.y);
|
||||||
|
flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y[0].y + flux_contribution_i_momentum_y.y);
|
||||||
|
flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z[0].y + flux_contribution_i_momentum_z.y);
|
||||||
|
|
||||||
|
factor = (float)(0.5f)*normal.z;
|
||||||
|
flux_i_density += factor*(ff_variable[VAR_MOMENTUM+2]+momentum_i.z);
|
||||||
|
flux_i_density_energy += factor*(ff_flux_contribution_density_energy[0].z+flux_contribution_i_density_energy.z);
|
||||||
|
flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x[0].z + flux_contribution_i_momentum_x.z);
|
||||||
|
flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y[0].z + flux_contribution_i_momentum_y.z);
|
||||||
|
flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z[0].z + flux_contribution_i_momentum_z.z);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fluxes[i + VAR_DENSITY*nelr] = flux_i_density;
|
||||||
|
fluxes[i + (VAR_MOMENTUM+0)*nelr] = flux_i_momentum.x;
|
||||||
|
fluxes[i + (VAR_MOMENTUM+1)*nelr] = flux_i_momentum.y;
|
||||||
|
fluxes[i + (VAR_MOMENTUM+2)*nelr] = flux_i_momentum.z;
|
||||||
|
fluxes[i + VAR_DENSITY_ENERGY*nelr] = flux_i_density_energy;
|
||||||
|
}
|
||||||
|
|
||||||
|
__kernel void time_step(int j, int nelr,
|
||||||
|
__global float* old_variables,
|
||||||
|
__global float* variables,
|
||||||
|
__global float* step_factors,
|
||||||
|
__global float* fluxes){
|
||||||
|
//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
|
||||||
|
const int i = get_global_id(0);
|
||||||
|
if( i >= nelr) return;
|
||||||
|
|
||||||
|
float factor = step_factors[i]/(float)(RK+1-j);
|
||||||
|
|
||||||
|
variables[i + VAR_DENSITY*nelr] = old_variables[i + VAR_DENSITY*nelr] + factor*fluxes[i + VAR_DENSITY*nelr];
|
||||||
|
variables[i + VAR_DENSITY_ENERGY*nelr] = old_variables[i + VAR_DENSITY_ENERGY*nelr] + factor*fluxes[i + VAR_DENSITY_ENERGY*nelr];
|
||||||
|
variables[i + (VAR_MOMENTUM+0)*nelr] = old_variables[i + (VAR_MOMENTUM+0)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+0)*nelr];
|
||||||
|
variables[i + (VAR_MOMENTUM+1)*nelr] = old_variables[i + (VAR_MOMENTUM+1)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+1)*nelr];
|
||||||
|
variables[i + (VAR_MOMENTUM+2)*nelr] = old_variables[i + (VAR_MOMENTUM+2)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+2)*nelr];
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
40
tests/opencl/cfd/timing.cpp
Normal file
40
tests/opencl/cfd/timing.cpp
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#include "timing.h"
|
||||||
|
|
||||||
|
void time_measure_start(struct timeval *tv)
|
||||||
|
{
|
||||||
|
gettimeofday(tv, NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
void time_measure_end(struct timeval *tv)
|
||||||
|
{
|
||||||
|
struct timeval tv_now, tv_diff;
|
||||||
|
double d;
|
||||||
|
|
||||||
|
gettimeofday(&tv_now, NULL);
|
||||||
|
tvsub(&tv_now, tv, &tv_diff);
|
||||||
|
|
||||||
|
d = (double) tv_diff.tv_sec * 1000.0 + (double) tv_diff.tv_usec / 1000.0;
|
||||||
|
printf("Time (Memory Copy and Launch) = %f (ms)\n", d);
|
||||||
|
}
|
||||||
|
|
||||||
|
float probe_event_time(cl_event event, cl_command_queue command_queue) {
|
||||||
|
cl_int error=0;
|
||||||
|
cl_ulong eventStart,eventEnd;
|
||||||
|
clFinish(command_queue);
|
||||||
|
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
|
||||||
|
sizeof(cl_ulong),&eventStart,NULL);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
printf("ERROR (%d) in event start profiling.\n", error);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
|
||||||
|
sizeof(cl_ulong),&eventEnd,NULL);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
printf("ERROR (%d) in event end profiling.\n", error);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (float)((eventEnd-eventStart)/1000000.0);
|
||||||
|
}
|
25
tests/opencl/cfd/timing.h
Normal file
25
tests/opencl/cfd/timing.h
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
#ifndef __TIMING_H__
|
||||||
|
#define __TIMING_H__
|
||||||
|
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
void time_measure_start(struct timeval *tv);
|
||||||
|
void time_measure_end(struct timeval *tv);
|
||||||
|
|
||||||
|
/* tvsub: ret = x - y. */
|
||||||
|
static inline void tvsub(struct timeval *x,
|
||||||
|
struct timeval *y,
|
||||||
|
struct timeval *ret)
|
||||||
|
{
|
||||||
|
ret->tv_sec = x->tv_sec - y->tv_sec;
|
||||||
|
ret->tv_usec = x->tv_usec - y->tv_usec;
|
||||||
|
if (ret->tv_usec < 0) {
|
||||||
|
ret->tv_sec--;
|
||||||
|
ret->tv_usec += 1000000;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
float probe_event_time(cl_event, cl_command_queue);
|
||||||
|
|
||||||
|
#endif
|
131
tests/opencl/cfd/util.h
Normal file
131
tests/opencl/cfd/util.h
Normal file
|
@ -0,0 +1,131 @@
|
||||||
|
#ifndef _C_UTIL_
|
||||||
|
#define _C_UTIL_
|
||||||
|
#include <math.h>
|
||||||
|
#include <iostream>
|
||||||
|
#include <omp.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
#ifdef RD_WG_SIZE_0_0
|
||||||
|
#define BLOCK_SIZE_0 RD_WG_SIZE_0_0
|
||||||
|
#elif defined(RD_WG_SIZE_0)
|
||||||
|
#define BLOCK_SIZE_0 RD_WG_SIZE_0
|
||||||
|
#elif defined(RD_WG_SIZE)
|
||||||
|
#define BLOCK_SIZE_0 RD_WG_SIZE
|
||||||
|
#else
|
||||||
|
#define BLOCK_SIZE_0 192
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef RD_WG_SIZE_1_0
|
||||||
|
#define BLOCK_SIZE_1 RD_WG_SIZE_1_0
|
||||||
|
#elif defined(RD_WG_SIZE_1)
|
||||||
|
#define BLOCK_SIZE_1 RD_WG_SIZE_1
|
||||||
|
#elif defined(RD_WG_SIZE)
|
||||||
|
#define BLOCK_SIZE_1 RD_WG_SIZE
|
||||||
|
#else
|
||||||
|
#define BLOCK_SIZE_1 192
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef RD_WG_SIZE_2_0
|
||||||
|
#define BLOCK_SIZE_2 RD_WG_SIZE_2_0
|
||||||
|
#elif defined(RD_WG_SIZE_1)
|
||||||
|
#define BLOCK_SIZE_2 RD_WG_SIZE_2
|
||||||
|
#elif defined(RD_WG_SIZE)
|
||||||
|
#define BLOCK_SIZE_2 RD_WG_SIZE
|
||||||
|
#else
|
||||||
|
#define BLOCK_SIZE_2 192
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef RD_WG_SIZE_3_0
|
||||||
|
#define BLOCK_SIZE_3 RD_WG_SIZE_3_0
|
||||||
|
#elif defined(RD_WG_SIZE_3)
|
||||||
|
#define BLOCK_SIZE_3 RD_WG_SIZE_3
|
||||||
|
#elif defined(RD_WG_SIZE)
|
||||||
|
#define BLOCK_SIZE_3 RD_WG_SIZE
|
||||||
|
#else
|
||||||
|
#define BLOCK_SIZE_3 192
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef RD_WG_SIZE_4_0
|
||||||
|
#define BLOCK_SIZE_4 RD_WG_SIZE_4_0
|
||||||
|
#elif defined(RD_WG_SIZE_4)
|
||||||
|
#define BLOCK_SIZE_4 RD_WG_SIZE_4
|
||||||
|
#elif defined(RD_WG_SIZE)
|
||||||
|
#define BLOCK_SIZE_4 RD_WG_SIZE
|
||||||
|
#else
|
||||||
|
#define BLOCK_SIZE_4 192
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
using std::endl;
|
||||||
|
|
||||||
|
double gettime() {
|
||||||
|
struct timeval t;
|
||||||
|
gettimeofday(&t,NULL);
|
||||||
|
return t.tv_sec+t.tv_usec*1e-6;
|
||||||
|
}
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
//--initialize array with maximum limit
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
template<typename datatype>
|
||||||
|
void fill(datatype *A, const int n, const datatype maxi){
|
||||||
|
for (int j = 0; j < n; j++){
|
||||||
|
A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//--print matrix
|
||||||
|
template<typename datatype>
|
||||||
|
void print_matrix(datatype *A, int height, int width){
|
||||||
|
for(int i=0; i<height; i++){
|
||||||
|
for(int j=0; j<width; j++){
|
||||||
|
int idx = i*width + j;
|
||||||
|
std::cout<<A[idx]<<" ";
|
||||||
|
}
|
||||||
|
std::cout<<std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
//--verify results
|
||||||
|
//-------------------------------------------------------------------
|
||||||
|
#define MAX_RELATIVE_ERROR .002
|
||||||
|
template<typename datatype>
|
||||||
|
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
|
||||||
|
|
||||||
|
bool passed = true;
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (int i=0; i<size; i++){
|
||||||
|
if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
|
||||||
|
passed = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (passed){
|
||||||
|
std::cout << "--cambine:passed:-)" << std::endl;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
std::cout << "--cambine: failed:-(" << std::endl;
|
||||||
|
}
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
template<typename datatype>
|
||||||
|
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
|
||||||
|
|
||||||
|
bool passed = true;
|
||||||
|
//#pragma omp parallel for
|
||||||
|
for (int i=0; i<size; i++){
|
||||||
|
if (cpu_results[i]!=gpu_results[i]){
|
||||||
|
passed = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (passed){
|
||||||
|
std::cout << "--cambine:passed:-)" << std::endl;
|
||||||
|
}
|
||||||
|
else{
|
||||||
|
std::cout << "--cambine: failed:-(" << std::endl;
|
||||||
|
}
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
9
tests/opencl/cutcp2/Makefile
Normal file
9
tests/opencl/cutcp2/Makefile
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
PROJECT = cutcp
|
||||||
|
|
||||||
|
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
|
||||||
|
|
||||||
|
CXXFLAGS += -I.
|
||||||
|
|
||||||
|
OPTS ?=
|
||||||
|
|
||||||
|
include ../common.mk
|
617
tests/opencl/cutcp2/args.c
Normal file
617
tests/opencl/cutcp2/args.c
Normal file
|
@ -0,0 +1,617 @@
|
||||||
|
|
||||||
|
#include <parboil.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
/*****************************************************************************/
|
||||||
|
/* Memory management routines */
|
||||||
|
|
||||||
|
/* Free an array of owned strings. */
|
||||||
|
void
|
||||||
|
pb_FreeStringArray(char **string_array)
|
||||||
|
{
|
||||||
|
char **p;
|
||||||
|
|
||||||
|
if (!string_array) return;
|
||||||
|
for (p = string_array; *p; p++) free(*p);
|
||||||
|
free(string_array);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_PlatformParam *
|
||||||
|
pb_PlatformParam(char *name, char *version)
|
||||||
|
{
|
||||||
|
if (name == NULL) {
|
||||||
|
fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_PlatformParam *ret =
|
||||||
|
(struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
|
||||||
|
|
||||||
|
ret->name = name;
|
||||||
|
ret->version = version;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreePlatformParam(struct pb_PlatformParam *p)
|
||||||
|
{
|
||||||
|
if (p == NULL) return;
|
||||||
|
|
||||||
|
free(p->name);
|
||||||
|
free(p->version);
|
||||||
|
free(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_index(int index)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_INDEX;
|
||||||
|
ret->index = index;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_cpu(void)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_CPU;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_gpu(void)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_GPU;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_accelerator(void)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_ACCELERATOR;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_name(char *name)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_NAME;
|
||||||
|
ret->name = name;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreeDeviceParam(struct pb_DeviceParam *p)
|
||||||
|
{
|
||||||
|
if (p == NULL) return;
|
||||||
|
|
||||||
|
switch(p->criterion) {
|
||||||
|
case pb_Device_NAME:
|
||||||
|
free(p->name);
|
||||||
|
break;
|
||||||
|
case pb_Device_INDEX:
|
||||||
|
case pb_Device_CPU:
|
||||||
|
case pb_Device_ACCELERATOR:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreeParameters(struct pb_Parameters *p)
|
||||||
|
{
|
||||||
|
free(p->outFile);
|
||||||
|
pb_FreeStringArray(p->inpFiles);
|
||||||
|
pb_FreePlatformParam(p->platform);
|
||||||
|
pb_FreeDeviceParam(p->device);
|
||||||
|
free(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************************************/
|
||||||
|
|
||||||
|
/* Parse a comma-delimited list of strings into an
|
||||||
|
* array of strings. */
|
||||||
|
static char **
|
||||||
|
read_string_array(char *in)
|
||||||
|
{
|
||||||
|
char **ret;
|
||||||
|
int i;
|
||||||
|
int count; /* Number of items in the input */
|
||||||
|
char *substring; /* Current substring within 'in' */
|
||||||
|
|
||||||
|
/* Count the number of items in the string */
|
||||||
|
count = 1;
|
||||||
|
for (i = 0; in[i]; i++) if (in[i] == ',') count++;
|
||||||
|
|
||||||
|
/* Allocate storage */
|
||||||
|
ret = (char **)malloc((count + 1) * sizeof(char *));
|
||||||
|
|
||||||
|
/* Create copies of the strings from the list */
|
||||||
|
substring = in;
|
||||||
|
for (i = 0; i < count; i++) {
|
||||||
|
char *substring_end;
|
||||||
|
int substring_length;
|
||||||
|
|
||||||
|
/* Find length of substring */
|
||||||
|
for (substring_end = substring;
|
||||||
|
(*substring_end != ',') && (*substring_end != 0);
|
||||||
|
substring_end++);
|
||||||
|
|
||||||
|
substring_length = substring_end - substring;
|
||||||
|
|
||||||
|
/* Allocate memory and copy the substring */
|
||||||
|
ret[i] = (char *)malloc(substring_length + 1);
|
||||||
|
memcpy(ret[i], substring, substring_length);
|
||||||
|
ret[i][substring_length] = 0;
|
||||||
|
|
||||||
|
/* go to next substring */
|
||||||
|
substring = substring_end + 1;
|
||||||
|
}
|
||||||
|
ret[i] = NULL; /* Write the sentinel value */
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
report_parse_error(const char *str)
|
||||||
|
{
|
||||||
|
fputs(str, stderr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Interpret a string as a 'pb_DeviceParam' value.
|
||||||
|
* Return a pointer to a new value, or NULL on failure.
|
||||||
|
*/
|
||||||
|
static struct pb_DeviceParam *
|
||||||
|
read_device_param(char *str)
|
||||||
|
{
|
||||||
|
/* Try different ways of interpreting 'device_string' until one works */
|
||||||
|
|
||||||
|
/* If argument is an integer, then interpret it as a device index */
|
||||||
|
errno = 0;
|
||||||
|
char *end;
|
||||||
|
long device_int = strtol(str, &end, 10);
|
||||||
|
if (!errno) {
|
||||||
|
/* Negative numbers are not valid */
|
||||||
|
if (device_int < 0 || device_int > INT_MAX) return NULL;
|
||||||
|
|
||||||
|
return pb_DeviceParam_index(device_int);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Match against predefined strings */
|
||||||
|
if (strcmp(str, "CPU") == 0)
|
||||||
|
return pb_DeviceParam_cpu();
|
||||||
|
if (strcmp(str, "GPU") == 0)
|
||||||
|
return pb_DeviceParam_gpu();
|
||||||
|
if (strcmp(str, "ACCELERATOR") == 0)
|
||||||
|
return pb_DeviceParam_accelerator();
|
||||||
|
|
||||||
|
/* Assume any other string is a device name */
|
||||||
|
return pb_DeviceParam_name(strdup(str));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Interpret a string as a 'pb_PlatformParam' value.
|
||||||
|
* Return a pointer to a new value, or NULL on failure.
|
||||||
|
*/
|
||||||
|
static struct pb_PlatformParam *
|
||||||
|
read_platform_param(char *str)
|
||||||
|
{
|
||||||
|
int separator_index; /* Index of the '-' character separating
|
||||||
|
* name and version number. It's -1 if
|
||||||
|
* there's no '-' character. */
|
||||||
|
|
||||||
|
/* Find the last occurrence of '-' in 'str' */
|
||||||
|
{
|
||||||
|
char *cur;
|
||||||
|
separator_index = -1;
|
||||||
|
for (cur = str; *cur; cur++) {
|
||||||
|
if (*cur == '-') separator_index = cur - str;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The platform name is either the entire string, or all characters before
|
||||||
|
* the separator */
|
||||||
|
int name_length = separator_index == -1 ? strlen(str) : separator_index;
|
||||||
|
char *name_str = (char *)malloc(name_length + 1);
|
||||||
|
memcpy(name_str, str, name_length);
|
||||||
|
name_str[name_length] = 0;
|
||||||
|
|
||||||
|
/* The version is either NULL, or all characters after the separator */
|
||||||
|
char *version_str;
|
||||||
|
if (separator_index == -1) {
|
||||||
|
version_str = NULL;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
const char *version_input_str = str + separator_index + 1;
|
||||||
|
int version_length = strlen(version_input_str);
|
||||||
|
|
||||||
|
version_str = (char *)malloc(version_length + 1);
|
||||||
|
memcpy(version_str, version_input_str, version_length);
|
||||||
|
version_str[version_length] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Create output structure */
|
||||||
|
return pb_PlatformParam(name_str, version_str);
|
||||||
|
}
|
||||||
|
|
||||||
|
/****************************************************************************/
|
||||||
|
/* Argument parsing state */
|
||||||
|
|
||||||
|
/* Argument parsing state.
|
||||||
|
*
|
||||||
|
* Arguments that are interpreted by the argument parser are removed from
|
||||||
|
* the list. Variables 'argc' and 'argn' do not count arguments that have
|
||||||
|
* been removed.
|
||||||
|
*
|
||||||
|
* During argument parsing, the array of arguments is compacted, overwriting
|
||||||
|
* the erased arguments. Variable 'argv_put' points to the array element
|
||||||
|
* where the next argument will be written. Variable 'argv_get' points to
|
||||||
|
* the array element where the next argument will be read from.
|
||||||
|
*/
|
||||||
|
struct argparse {
|
||||||
|
int argc; /* Number of arguments. Mutable. */
|
||||||
|
int argn; /* Current argument index. */
|
||||||
|
char **argv_get; /* Argument value being read. */
|
||||||
|
char **argv_put; /* Argument value being written.
|
||||||
|
* argv_put <= argv_get. */
|
||||||
|
};
|
||||||
|
|
||||||
|
static void
|
||||||
|
initialize_argparse(struct argparse *ap, int argc, char **argv)
|
||||||
|
{
|
||||||
|
ap->argc = argc;
|
||||||
|
ap->argn = 0;
|
||||||
|
ap->argv_get = ap->argv_put = argv;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Finish argument parsing, without processing the remaining arguments.
|
||||||
|
* Write new argument count into _argc. */
|
||||||
|
static void
|
||||||
|
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
|
||||||
|
{
|
||||||
|
/* Move the remaining arguments */
|
||||||
|
for(; ap->argn < ap->argc; ap->argn++)
|
||||||
|
*ap->argv_put++ = *ap->argv_get++;
|
||||||
|
|
||||||
|
/* Update the argument count */
|
||||||
|
*_argc = ap->argc;
|
||||||
|
|
||||||
|
/* Insert a terminating NULL */
|
||||||
|
argv[ap->argc] = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Delete the current argument. The argument will not be visible
|
||||||
|
* when argument parsing is done. */
|
||||||
|
static void
|
||||||
|
delete_argument(struct argparse *ap)
|
||||||
|
{
|
||||||
|
if (ap->argn >= ap->argc) {
|
||||||
|
fprintf(stderr, "delete_argument\n");
|
||||||
|
}
|
||||||
|
ap->argc--;
|
||||||
|
ap->argv_get++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Go to the next argument. Also, move the current argument to its
|
||||||
|
* final location in argv. */
|
||||||
|
static void
|
||||||
|
next_argument(struct argparse *ap)
|
||||||
|
{
|
||||||
|
if (ap->argn >= ap->argc) {
|
||||||
|
fprintf(stderr, "next_argument\n");
|
||||||
|
}
|
||||||
|
/* Move argument to its new location. */
|
||||||
|
*ap->argv_put++ = *ap->argv_get++;
|
||||||
|
ap->argn++;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
is_end_of_arguments(struct argparse *ap)
|
||||||
|
{
|
||||||
|
return ap->argn == ap->argc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get the current argument */
|
||||||
|
static char *
|
||||||
|
get_argument(struct argparse *ap)
|
||||||
|
{
|
||||||
|
return *ap->argv_get;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get the current argument, and also delete it */
|
||||||
|
static char *
|
||||||
|
consume_argument(struct argparse *ap)
|
||||||
|
{
|
||||||
|
char *ret = get_argument(ap);
|
||||||
|
delete_argument(ap);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/****************************************************************************/
|
||||||
|
|
||||||
|
/* The result of parsing a command-line argument */
|
||||||
|
typedef enum {
|
||||||
|
ARGPARSE_OK, /* Success */
|
||||||
|
ARGPARSE_ERROR, /* Error */
|
||||||
|
ARGPARSE_DONE /* Success, and do not continue parsing */
|
||||||
|
} result;
|
||||||
|
|
||||||
|
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
|
||||||
|
|
||||||
|
|
||||||
|
/* A command-line option */
|
||||||
|
struct option {
|
||||||
|
char short_name; /* If not 0, the one-character
|
||||||
|
* name of this option */
|
||||||
|
const char *long_name; /* If not NULL, the long name of this option */
|
||||||
|
parse_action *action; /* What to do when this option occurs.
|
||||||
|
* Sentinel value is NULL.
|
||||||
|
*/
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Output file
|
||||||
|
*
|
||||||
|
* -o FILE
|
||||||
|
*/
|
||||||
|
static result
|
||||||
|
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
if (is_end_of_arguments(ap))
|
||||||
|
{
|
||||||
|
report_parse_error("Expecting file name after '-o'\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Replace the output file name */
|
||||||
|
free(params->outFile);
|
||||||
|
params->outFile = strdup(consume_argument(ap));
|
||||||
|
|
||||||
|
return ARGPARSE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Input files
|
||||||
|
*
|
||||||
|
* -i FILE,FILE,...
|
||||||
|
*/
|
||||||
|
static result
|
||||||
|
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
if (is_end_of_arguments(ap))
|
||||||
|
{
|
||||||
|
report_parse_error("Expecting file name after '-i'\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Replace the input file list */
|
||||||
|
pb_FreeStringArray(params->inpFiles);
|
||||||
|
params->inpFiles = read_string_array(consume_argument(ap));
|
||||||
|
return ARGPARSE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* End of options
|
||||||
|
*
|
||||||
|
* --
|
||||||
|
*/
|
||||||
|
|
||||||
|
static result
|
||||||
|
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
return ARGPARSE_DONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* OpenCL device
|
||||||
|
*
|
||||||
|
* --device X
|
||||||
|
*/
|
||||||
|
|
||||||
|
static result
|
||||||
|
parse_device(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
/* Read the next argument, which specifies a device */
|
||||||
|
|
||||||
|
if (is_end_of_arguments(ap))
|
||||||
|
{
|
||||||
|
report_parse_error("Expecting device specification after '--device'\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *device_string = consume_argument(ap);
|
||||||
|
struct pb_DeviceParam *device_param = read_device_param(device_string);
|
||||||
|
|
||||||
|
if (!device_param) {
|
||||||
|
report_parse_error("Unrecognized device specification format on command line\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Save the result */
|
||||||
|
pb_FreeDeviceParam(params->device);
|
||||||
|
params->device = device_param;
|
||||||
|
|
||||||
|
return ARGPARSE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
static result
|
||||||
|
parse_platform(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
/* Read the next argument, which specifies a platform */
|
||||||
|
|
||||||
|
if (is_end_of_arguments(ap))
|
||||||
|
{
|
||||||
|
report_parse_error("Expecting device specification after '--platform'\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *platform_string = consume_argument(ap);
|
||||||
|
struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
|
||||||
|
|
||||||
|
if (!platform_param) {
|
||||||
|
report_parse_error("Unrecognized platform specification format on command line\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Save the result */
|
||||||
|
pb_FreePlatformParam(params->platform);
|
||||||
|
params->platform = platform_param;
|
||||||
|
|
||||||
|
return ARGPARSE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static struct option options[] = {
|
||||||
|
{ 'o', NULL, &parse_output_file },
|
||||||
|
{ 'i', NULL, &parse_input_files },
|
||||||
|
{ '-', NULL, &parse_end_options },
|
||||||
|
{ 0, "device", &parse_device },
|
||||||
|
{ 0, "platform", &parse_platform },
|
||||||
|
{ 0, NULL, NULL }
|
||||||
|
};
|
||||||
|
|
||||||
|
static int
|
||||||
|
is_last_option(struct option *op)
|
||||||
|
{
|
||||||
|
return op->action == NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/****************************************************************************/
|
||||||
|
|
||||||
|
/* Parse command-line parameters.
|
||||||
|
* Return zero on error, nonzero otherwise.
|
||||||
|
* On error, the other outputs may be invalid.
|
||||||
|
*
|
||||||
|
* The information collected from parameters is used to update
|
||||||
|
* 'ret'. 'ret' should be initialized.
|
||||||
|
*
|
||||||
|
* '_argc' and 'argv' are updated to contain only the unprocessed arguments.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
|
||||||
|
{
|
||||||
|
char *err_message;
|
||||||
|
struct argparse ap;
|
||||||
|
|
||||||
|
/* Each argument */
|
||||||
|
initialize_argparse(&ap, *_argc, argv);
|
||||||
|
while(!is_end_of_arguments(&ap)) {
|
||||||
|
result arg_result; /* Result of parsing this option */
|
||||||
|
char *arg = get_argument(&ap);
|
||||||
|
|
||||||
|
/* Process this argument */
|
||||||
|
if (arg[0] == '-') {
|
||||||
|
/* Single-character flag */
|
||||||
|
if ((arg[1] != 0) && (arg[2] == 0)) {
|
||||||
|
delete_argument(&ap); /* This argument is consumed here */
|
||||||
|
|
||||||
|
/* Find a matching short option */
|
||||||
|
struct option *op;
|
||||||
|
for (op = options; !is_last_option(op); op++) {
|
||||||
|
if (op->short_name == arg[1]) {
|
||||||
|
arg_result = (*op->action)(&ap, ret);
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* No option matches */
|
||||||
|
report_parse_error("Unexpected command-line parameter\n");
|
||||||
|
arg_result = ARGPARSE_ERROR;
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Long flag */
|
||||||
|
if (arg[1] == '-') {
|
||||||
|
delete_argument(&ap); /* This argument is consumed here */
|
||||||
|
|
||||||
|
/* Find a matching long option */
|
||||||
|
struct option *op;
|
||||||
|
for (op = options; !is_last_option(op); op++) {
|
||||||
|
if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
|
||||||
|
arg_result = (*op->action)(&ap, ret);
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* No option matches */
|
||||||
|
report_parse_error("Unexpected command-line parameter\n");
|
||||||
|
arg_result = ARGPARSE_ERROR;
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* Other arguments are ignored */
|
||||||
|
next_argument(&ap);
|
||||||
|
arg_result = ARGPARSE_OK;
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
|
||||||
|
option_was_processed:
|
||||||
|
/* Decide what to do next based on 'arg_result' */
|
||||||
|
switch(arg_result) {
|
||||||
|
case ARGPARSE_OK:
|
||||||
|
/* Continue processing */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ARGPARSE_ERROR:
|
||||||
|
/* Error exit from the function */
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
case ARGPARSE_DONE:
|
||||||
|
/* Normal exit from the argument parsing loop */
|
||||||
|
goto end_of_options;
|
||||||
|
}
|
||||||
|
} /* end for each argument */
|
||||||
|
|
||||||
|
/* If all arguments were processed, then normal exit from the loop */
|
||||||
|
|
||||||
|
end_of_options:
|
||||||
|
finalize_argparse(&ap, _argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************************************/
|
||||||
|
/* Other exported functions */
|
||||||
|
|
||||||
|
struct pb_Parameters *
|
||||||
|
pb_ReadParameters(int *_argc, char **argv)
|
||||||
|
{
|
||||||
|
struct pb_Parameters *ret =
|
||||||
|
(struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
|
||||||
|
|
||||||
|
/* Initialize the parameters structure */
|
||||||
|
ret->outFile = NULL;
|
||||||
|
ret->inpFiles = (char **)malloc(sizeof(char *));
|
||||||
|
ret->inpFiles[0] = NULL;
|
||||||
|
ret->platform = NULL;
|
||||||
|
ret->device = NULL;
|
||||||
|
|
||||||
|
/* Read parameters and update _argc, argv */
|
||||||
|
if (!pb_ParseParameters(ret, _argc, argv)) {
|
||||||
|
/* Parse error */
|
||||||
|
pb_FreeParameters(ret);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
pb_Parameters_CountInputs(struct pb_Parameters *p)
|
||||||
|
{
|
||||||
|
int n;
|
||||||
|
|
||||||
|
for (n = 0; p->inpFiles[n]; n++);
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
37
tests/opencl/cutcp2/atom.h
Normal file
37
tests/opencl/cutcp2/atom.h
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef ATOM_H
|
||||||
|
#define ATOM_H
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
typedef struct Atom_t {
|
||||||
|
float x, y, z, q;
|
||||||
|
} Atom;
|
||||||
|
|
||||||
|
typedef struct Atoms_t {
|
||||||
|
Atom *atoms;
|
||||||
|
int size;
|
||||||
|
} Atoms;
|
||||||
|
|
||||||
|
typedef struct Vec3_t {
|
||||||
|
float x, y, z;
|
||||||
|
} Vec3;
|
||||||
|
|
||||||
|
Atoms *read_atom_file(const char *fname);
|
||||||
|
void free_atom(Atoms *atom);
|
||||||
|
void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* ATOM_H */
|
BIN
tests/opencl/cutcp2/cutcp
Executable file
BIN
tests/opencl/cutcp2/cutcp
Executable file
Binary file not shown.
195
tests/opencl/cutcp2/cutcpu.c
Normal file
195
tests/opencl/cutcp2/cutcpu.c
Normal file
|
@ -0,0 +1,195 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <parboil.h>
|
||||||
|
#include "atom.h"
|
||||||
|
#include "cutoff.h"
|
||||||
|
|
||||||
|
#undef DEBUG_PASS_RATE
|
||||||
|
#define CHECK_CYLINDER_CPU
|
||||||
|
|
||||||
|
#define CELLEN 4.f
|
||||||
|
#define INV_CELLEN (1.f/CELLEN)
|
||||||
|
|
||||||
|
extern int cpu_compute_cutoff_potential_lattice(
|
||||||
|
Lattice *lattice, /* the lattice */
|
||||||
|
float cutoff, /* cutoff distance */
|
||||||
|
Atoms *atoms /* array of atoms */
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int nx = lattice->dim.nx;
|
||||||
|
int ny = lattice->dim.ny;
|
||||||
|
int nz = lattice->dim.nz;
|
||||||
|
float xlo = lattice->dim.lo.x;
|
||||||
|
float ylo = lattice->dim.lo.y;
|
||||||
|
float zlo = lattice->dim.lo.z;
|
||||||
|
float gridspacing = lattice->dim.h;
|
||||||
|
int natoms = atoms->size;
|
||||||
|
Atom *atom = atoms->atoms;
|
||||||
|
|
||||||
|
const float a2 = cutoff * cutoff;
|
||||||
|
const float inv_a2 = 1.f / a2;
|
||||||
|
float s;
|
||||||
|
const float inv_gridspacing = 1.f / gridspacing;
|
||||||
|
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
|
||||||
|
/* lattice point radius about each atom */
|
||||||
|
|
||||||
|
int n;
|
||||||
|
int i, j, k;
|
||||||
|
int ia, ib, ic;
|
||||||
|
int ja, jb, jc;
|
||||||
|
int ka, kb, kc;
|
||||||
|
int index;
|
||||||
|
int koff, jkoff;
|
||||||
|
|
||||||
|
float x, y, z, q;
|
||||||
|
float dx, dy, dz;
|
||||||
|
float dz2, dydz2, r2;
|
||||||
|
float e;
|
||||||
|
float xstart, ystart;
|
||||||
|
|
||||||
|
float *pg;
|
||||||
|
|
||||||
|
int gindex;
|
||||||
|
int ncell, nxcell, nycell, nzcell;
|
||||||
|
int *first, *next;
|
||||||
|
float inv_cellen = INV_CELLEN;
|
||||||
|
Vec3 minext, maxext; /* Extent of atom bounding box */
|
||||||
|
float xmin, ymin, zmin;
|
||||||
|
float xmax, ymax, zmax;
|
||||||
|
|
||||||
|
#if DEBUG_PASS_RATE
|
||||||
|
unsigned long long pass_count = 0;
|
||||||
|
unsigned long long fail_count = 0;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* find min and max extent */
|
||||||
|
get_atom_extent(&minext, &maxext, atoms);
|
||||||
|
|
||||||
|
/* number of cells in each dimension */
|
||||||
|
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
|
||||||
|
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
|
||||||
|
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
|
||||||
|
ncell = nxcell * nycell * nzcell;
|
||||||
|
|
||||||
|
/* allocate for cursor link list implementation */
|
||||||
|
first = (int *) malloc(ncell * sizeof(int));
|
||||||
|
for (gindex = 0; gindex < ncell; gindex++) {
|
||||||
|
first[gindex] = -1;
|
||||||
|
}
|
||||||
|
next = (int *) malloc(natoms * sizeof(int));
|
||||||
|
for (n = 0; n < natoms; n++) {
|
||||||
|
next[n] = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* geometric hashing */
|
||||||
|
for (n = 0; n < natoms; n++) {
|
||||||
|
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
|
||||||
|
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
|
||||||
|
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
|
||||||
|
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
|
||||||
|
gindex = (k*nycell + j)*nxcell + i;
|
||||||
|
next[n] = first[gindex];
|
||||||
|
first[gindex] = n;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* traverse the grid cells */
|
||||||
|
for (gindex = 0; gindex < ncell; gindex++) {
|
||||||
|
for (n = first[gindex]; n != -1; n = next[n]) {
|
||||||
|
x = atom[n].x - xlo;
|
||||||
|
y = atom[n].y - ylo;
|
||||||
|
z = atom[n].z - zlo;
|
||||||
|
q = atom[n].q;
|
||||||
|
|
||||||
|
/* find closest grid point with position less than or equal to atom */
|
||||||
|
ic = (int) (x * inv_gridspacing);
|
||||||
|
jc = (int) (y * inv_gridspacing);
|
||||||
|
kc = (int) (z * inv_gridspacing);
|
||||||
|
|
||||||
|
/* find extent of surrounding box of grid points */
|
||||||
|
ia = ic - radius;
|
||||||
|
ib = ic + radius + 1;
|
||||||
|
ja = jc - radius;
|
||||||
|
jb = jc + radius + 1;
|
||||||
|
ka = kc - radius;
|
||||||
|
kb = kc + radius + 1;
|
||||||
|
|
||||||
|
/* trim box edges so that they are within grid point lattice */
|
||||||
|
if (ia < 0) ia = 0;
|
||||||
|
if (ib >= nx) ib = nx-1;
|
||||||
|
if (ja < 0) ja = 0;
|
||||||
|
if (jb >= ny) jb = ny-1;
|
||||||
|
if (ka < 0) ka = 0;
|
||||||
|
if (kb >= nz) kb = nz-1;
|
||||||
|
|
||||||
|
/* loop over surrounding grid points */
|
||||||
|
xstart = ia*gridspacing - x;
|
||||||
|
ystart = ja*gridspacing - y;
|
||||||
|
dz = ka*gridspacing - z;
|
||||||
|
for (k = ka; k <= kb; k++, dz += gridspacing) {
|
||||||
|
koff = k*ny;
|
||||||
|
dz2 = dz*dz;
|
||||||
|
dy = ystart;
|
||||||
|
for (j = ja; j <= jb; j++, dy += gridspacing) {
|
||||||
|
jkoff = (koff + j)*nx;
|
||||||
|
dydz2 = dy*dy + dz2;
|
||||||
|
#ifdef CHECK_CYLINDER_CPU
|
||||||
|
if (dydz2 >= a2) continue;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
dx = xstart;
|
||||||
|
index = jkoff + ia;
|
||||||
|
pg = lattice->lattice + index;
|
||||||
|
|
||||||
|
#if defined(__INTEL_COMPILER)
|
||||||
|
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
|
||||||
|
r2 = dx*dx + dydz2;
|
||||||
|
s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
|
||||||
|
e = q * (1/sqrtf(r2)) * s;
|
||||||
|
*pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
|
||||||
|
r2 = dx*dx + dydz2;
|
||||||
|
if (r2 >= a2)
|
||||||
|
{
|
||||||
|
#ifdef DEBUG_PASS_RATE
|
||||||
|
fail_count++;
|
||||||
|
#endif
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
#ifdef DEBUG_PASS_RATE
|
||||||
|
pass_count++;
|
||||||
|
#endif
|
||||||
|
s = (1.f - r2 * inv_a2);
|
||||||
|
e = q * (1/sqrtf(r2)) * s * s;
|
||||||
|
*pg += e;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
} /* end loop over surrounding grid points */
|
||||||
|
|
||||||
|
} /* end loop over atoms in a gridcell */
|
||||||
|
} /* end loop over gridcells */
|
||||||
|
|
||||||
|
/* free memory */
|
||||||
|
free(next);
|
||||||
|
free(first);
|
||||||
|
|
||||||
|
/* For debugging: print the number of times that the test passed/failed */
|
||||||
|
#ifdef DEBUG_PASS_RATE
|
||||||
|
printf ("Pass :%lld\n", pass_count);
|
||||||
|
printf ("Fail :%lld\n", fail_count);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
508
tests/opencl/cutcp2/cutoff.c
Normal file
508
tests/opencl/cutcp2/cutoff.c
Normal file
|
@ -0,0 +1,508 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <parboil.h>
|
||||||
|
|
||||||
|
#include "atom.h"
|
||||||
|
#include "cutoff.h"
|
||||||
|
#include "macros.h"
|
||||||
|
#include "ocl.h"
|
||||||
|
|
||||||
|
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||||
|
if (NULL == filename || NULL == data || 0 == size)
|
||||||
|
return CL_INVALID_VALUE;
|
||||||
|
|
||||||
|
FILE* fp = fopen(filename, "r");
|
||||||
|
if (NULL == fp) {
|
||||||
|
fprintf(stderr, "Failed to load kernel.");
|
||||||
|
return CL_INVALID_VALUE;
|
||||||
|
}
|
||||||
|
fseek(fp , 0 , SEEK_END);
|
||||||
|
long fsize = ftell(fp);
|
||||||
|
rewind(fp);
|
||||||
|
|
||||||
|
*data = (uint8_t*)malloc(fsize);
|
||||||
|
*size = fread(*data, 1, fsize, fp);
|
||||||
|
|
||||||
|
fclose(fp);
|
||||||
|
|
||||||
|
return CL_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
|
||||||
|
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
|
||||||
|
typedef cl_int4 xyz;
|
||||||
|
|
||||||
|
//extern "C" int gpu_compute_cutoff_potential_lattice(
|
||||||
|
int gpu_compute_cutoff_potential_lattice(
|
||||||
|
struct pb_TimerSet *timers,
|
||||||
|
Lattice *lattice, /* the lattice */
|
||||||
|
float cutoff, /* cutoff distance */
|
||||||
|
Atoms *atoms, /* array of atoms */
|
||||||
|
int verbose, /* print info/debug messages */
|
||||||
|
struct pb_Parameters *parameters
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int nx = lattice->dim.nx;
|
||||||
|
int ny = lattice->dim.ny;
|
||||||
|
int nz = lattice->dim.nz;
|
||||||
|
float xlo = lattice->dim.lo.x;
|
||||||
|
float ylo = lattice->dim.lo.y;
|
||||||
|
float zlo = lattice->dim.lo.z;
|
||||||
|
float h = lattice->dim.h;
|
||||||
|
int natoms = atoms->size;
|
||||||
|
Atom *atom = atoms->atoms;
|
||||||
|
|
||||||
|
xyz nbrlist[NBRLIST_MAXLEN];
|
||||||
|
int nbrlistlen = 0;
|
||||||
|
|
||||||
|
int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */
|
||||||
|
int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array element */
|
||||||
|
int num_excluded = 0;
|
||||||
|
|
||||||
|
int xRegionDim, yRegionDim, zRegionDim;
|
||||||
|
int xRegionIndex, yRegionIndex, zRegionIndex;
|
||||||
|
int xOffset, yOffset, zOffset;
|
||||||
|
int lnx, lny, lnz, lnall;
|
||||||
|
float *regionZeroAddr, *thisRegion;
|
||||||
|
cl_mem regionZeroCl;
|
||||||
|
int index, indexRegion;
|
||||||
|
|
||||||
|
int c;
|
||||||
|
xyz binDim;
|
||||||
|
int nbins;
|
||||||
|
cl_float4 *binBaseAddr, *binZeroAddr;
|
||||||
|
cl_mem binBaseCl, binZeroCl;
|
||||||
|
int *bincntBaseAddr, *bincntZeroAddr;
|
||||||
|
Atoms *extra = NULL;
|
||||||
|
|
||||||
|
cl_mem NbrListLen;
|
||||||
|
cl_mem NbrList;
|
||||||
|
|
||||||
|
int i, j, k, n;
|
||||||
|
int sum, total;
|
||||||
|
|
||||||
|
float avgFillFull, avgFillCover;
|
||||||
|
const float cutoff2 = cutoff * cutoff;
|
||||||
|
const float inv_cutoff2 = 1.f / cutoff2;
|
||||||
|
|
||||||
|
size_t gridDim[3], blockDim[3];
|
||||||
|
|
||||||
|
// The "compute" timer should be active upon entry to this function
|
||||||
|
|
||||||
|
/* pad lattice to be factor of 8 in each dimension */
|
||||||
|
xRegionDim = (int) ceilf(nx/8.f);
|
||||||
|
yRegionDim = (int) ceilf(ny/8.f);
|
||||||
|
zRegionDim = (int) ceilf(nz/8.f);
|
||||||
|
|
||||||
|
lnx = 8 * xRegionDim;
|
||||||
|
lny = 8 * yRegionDim;
|
||||||
|
lnz = 8 * zRegionDim;
|
||||||
|
lnall = lnx * lny * lnz;
|
||||||
|
|
||||||
|
/* will receive energies from OpenCL */
|
||||||
|
regionZeroAddr = (float *) malloc(lnall * sizeof(float));
|
||||||
|
|
||||||
|
/* create bins */
|
||||||
|
c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
|
||||||
|
binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
|
||||||
|
binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
|
||||||
|
binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
|
||||||
|
nbins = binDim.x * binDim.y * binDim.z;
|
||||||
|
binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
|
||||||
|
binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
|
||||||
|
|
||||||
|
bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
|
||||||
|
bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
|
||||||
|
|
||||||
|
/* create neighbor list */
|
||||||
|
if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
|
||||||
|
float s = sqrtf(3);
|
||||||
|
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
|
||||||
|
int cnt = 0;
|
||||||
|
/* develop neighbor list around 1 cell */
|
||||||
|
if (2*c + 1 > NBRLIST_DIM) {
|
||||||
|
fprintf(stderr, "must have cutoff <= %f\n",
|
||||||
|
(NBRLIST_DIM-1)/2 * BIN_LENGTH);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
for (k = -c; k <= c; k++) {
|
||||||
|
for (j = -c; j <= c; j++) {
|
||||||
|
for (i = -c; i <= c; i++) {
|
||||||
|
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
|
||||||
|
nbrlist[cnt].x = i;
|
||||||
|
nbrlist[cnt].y = j;
|
||||||
|
nbrlist[cnt].z = k;
|
||||||
|
cnt++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nbrlistlen = cnt;
|
||||||
|
}
|
||||||
|
else if (8*h <= 2*BIN_LENGTH) {
|
||||||
|
float s = 2.f*sqrtf(3);
|
||||||
|
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
|
||||||
|
int cnt = 0;
|
||||||
|
/* develop neighbor list around 3-cube of cells */
|
||||||
|
if (2*c + 3 > NBRLIST_DIM) {
|
||||||
|
fprintf(stderr, "must have cutoff <= %f\n",
|
||||||
|
(NBRLIST_DIM-3)/2 * BIN_LENGTH);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
for (k = -c; k <= c; k++) {
|
||||||
|
for (j = -c; j <= c; j++) {
|
||||||
|
for (i = -c; i <= c; i++) {
|
||||||
|
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
|
||||||
|
nbrlist[cnt].x = i;
|
||||||
|
nbrlist[cnt].y = j;
|
||||||
|
nbrlist[cnt].z = k;
|
||||||
|
cnt++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
nbrlistlen = cnt;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* perform geometric hashing of atoms into bins */
|
||||||
|
{
|
||||||
|
/* array of extra atoms, permit average of one extra per bin */
|
||||||
|
Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
|
||||||
|
int extra_len = 0;
|
||||||
|
|
||||||
|
for (n = 0; n < natoms; n++) {
|
||||||
|
cl_float4 p;
|
||||||
|
p.x = atom[n].x - xlo;
|
||||||
|
p.y = atom[n].y - ylo;
|
||||||
|
p.z = atom[n].z - zlo;
|
||||||
|
p.w = atom[n].q;
|
||||||
|
i = (int) floorf(p.x * BIN_INVLEN);
|
||||||
|
j = (int) floorf(p.y * BIN_INVLEN);
|
||||||
|
k = (int) floorf(p.z * BIN_INVLEN);
|
||||||
|
if (i >= -c && i < binDim.x - c &&
|
||||||
|
j >= -c && j < binDim.y - c &&
|
||||||
|
k >= -c && k < binDim.z - c &&
|
||||||
|
atom[n].q != 0) {
|
||||||
|
int index = (k * binDim.y + j) * binDim.x + i;
|
||||||
|
cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
|
||||||
|
int bindex = bincntZeroAddr[index];
|
||||||
|
if (bindex < BIN_DEPTH) {
|
||||||
|
/* copy atom into bin and increase counter for this bin */
|
||||||
|
bin[bindex] = p;
|
||||||
|
bincntZeroAddr[index]++;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* add index to array of extra atoms to be computed with CPU */
|
||||||
|
if (extra_len >= nbins) {
|
||||||
|
fprintf(stderr, "exceeded space for storing extra atoms\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
extra_atoms[extra_len] = atom[n];
|
||||||
|
extra_len++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* excluded atoms are either outside bins or neutrally charged */
|
||||||
|
num_excluded++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Save result */
|
||||||
|
extra = (Atoms *)malloc(sizeof(Atoms));
|
||||||
|
extra->atoms = extra_atoms;
|
||||||
|
extra->size = extra_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* bin stats */
|
||||||
|
sum = total = 0;
|
||||||
|
for (n = 0; n < nbins; n++) {
|
||||||
|
binHistoFull[ bincntBaseAddr[n] ]++;
|
||||||
|
sum += bincntBaseAddr[n];
|
||||||
|
total += BIN_DEPTH;
|
||||||
|
}
|
||||||
|
avgFillFull = sum / (float) total;
|
||||||
|
sum = total = 0;
|
||||||
|
for (k = 0; k < binDim.z - 2*c; k++) {
|
||||||
|
for (j = 0; j < binDim.y - 2*c; j++) {
|
||||||
|
for (i = 0; i < binDim.x - 2*c; i++) {
|
||||||
|
int index = (k * binDim.y + j) * binDim.x + i;
|
||||||
|
binHistoCover[ bincntZeroAddr[index] ]++;
|
||||||
|
sum += bincntZeroAddr[index];
|
||||||
|
total += BIN_DEPTH;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
avgFillCover = sum / (float) total;
|
||||||
|
|
||||||
|
if (verbose) {
|
||||||
|
/* report */
|
||||||
|
printf("number of atoms = %d\n", natoms);
|
||||||
|
printf("lattice spacing = %g\n", h);
|
||||||
|
printf("cutoff distance = %g\n", cutoff);
|
||||||
|
printf("\n");
|
||||||
|
printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
|
||||||
|
printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
|
||||||
|
printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
|
||||||
|
printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
|
||||||
|
printf("number of bytes for lattice data = %u\n", (unsigned int) (lnall*sizeof(float)));
|
||||||
|
printf("\n");
|
||||||
|
printf("bin padding thickness = %d\n", c);
|
||||||
|
printf("bin cover dimensions = %d %d %d\n",
|
||||||
|
binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
|
||||||
|
printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
|
||||||
|
printf("number of bins = %d\n", nbins);
|
||||||
|
printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
|
||||||
|
printf("%% overhead space = %g\n",
|
||||||
|
(natoms / (double) (nbins * BIN_DEPTH)) * 100);
|
||||||
|
printf("number of bytes for bin data = %u\n",
|
||||||
|
(unsigned int)(nbins * BIN_DEPTH * sizeof(cl_float4)));
|
||||||
|
printf("\n");
|
||||||
|
printf("bin histogram with padding:\n");
|
||||||
|
sum = 0;
|
||||||
|
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||||
|
printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]);
|
||||||
|
sum += binHistoFull[n];
|
||||||
|
}
|
||||||
|
printf(" total number of bins: %d\n", sum);
|
||||||
|
printf(" %% average fill: %g\n", avgFillFull * 100);
|
||||||
|
printf("\n");
|
||||||
|
printf("bin histogram excluding padding:\n");
|
||||||
|
sum = 0;
|
||||||
|
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||||
|
printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]);
|
||||||
|
sum += binHistoCover[n];
|
||||||
|
}
|
||||||
|
printf(" total number of bins: %d\n", sum);
|
||||||
|
printf(" %% average fill: %g\n", avgFillCover * 100);
|
||||||
|
printf("\n");
|
||||||
|
printf("number of extra atoms = %d\n", extra->size);
|
||||||
|
printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
/* sanity check on bins */
|
||||||
|
sum = 0;
|
||||||
|
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||||
|
sum += n * binHistoFull[n];
|
||||||
|
}
|
||||||
|
sum += extra->size + num_excluded;
|
||||||
|
printf("sanity check on bin histogram with edges: "
|
||||||
|
"sum + others = %d\n", sum);
|
||||||
|
sum = 0;
|
||||||
|
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||||
|
sum += n * binHistoCover[n];
|
||||||
|
}
|
||||||
|
sum += extra->size + num_excluded;
|
||||||
|
printf("sanity check on bin histogram excluding edges: "
|
||||||
|
"sum + others = %d\n", sum);
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
/* neighbor list */
|
||||||
|
printf("neighbor list length = %d\n", nbrlistlen);
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
pb_Context* pb_context;
|
||||||
|
pb_context = pb_InitOpenCLContext(parameters);
|
||||||
|
if (pb_context == NULL) {
|
||||||
|
fprintf (stderr, "Error: No OpenCL platform/device can be found.");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_int clStatus;
|
||||||
|
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
|
||||||
|
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
|
||||||
|
cl_context clContext = (cl_context) pb_context->clContext;
|
||||||
|
|
||||||
|
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
|
||||||
|
CHECK_ERROR("clCreateCommandQueue")
|
||||||
|
|
||||||
|
pb_SetOpenCL(&clContext, &clCommandQueue);
|
||||||
|
|
||||||
|
//const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
|
||||||
|
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
|
||||||
|
uint8_t *kernel_bin = NULL;
|
||||||
|
size_t kernel_size;
|
||||||
|
cl_int binary_status = 0;
|
||||||
|
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||||
|
CHECK_ERROR("read_kernel_file")
|
||||||
|
cl_program clProgram = clCreateProgramWithBinary(
|
||||||
|
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
|
||||||
|
CHECK_ERROR("clCreateProgramWithSource")
|
||||||
|
|
||||||
|
char clOptions[50];
|
||||||
|
sprintf(clOptions,"-I src/opencl_base"); //-cl-nv-verbose
|
||||||
|
|
||||||
|
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
|
||||||
|
if (clStatus != CL_SUCCESS) {
|
||||||
|
size_t string_size = 0;
|
||||||
|
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
|
||||||
|
0, NULL, &string_size);
|
||||||
|
char* string = (char*)malloc(string_size*sizeof(char));
|
||||||
|
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
|
||||||
|
string_size, string, NULL);
|
||||||
|
puts(string);
|
||||||
|
}
|
||||||
|
|
||||||
|
CHECK_ERROR("clBuildProgram")
|
||||||
|
|
||||||
|
cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
|
||||||
|
CHECK_ERROR("clCreateKernel")
|
||||||
|
|
||||||
|
/* setup OpenCL kernel parameters */
|
||||||
|
blockDim[0] = 8;
|
||||||
|
blockDim[1] = 8;
|
||||||
|
blockDim[2] = 2;
|
||||||
|
gridDim[0] = 4 * xRegionDim * blockDim[0];
|
||||||
|
gridDim[1] = yRegionDim * blockDim[1];
|
||||||
|
gridDim[2] = 1 * blockDim[2];
|
||||||
|
|
||||||
|
/* allocate and initialize memory on OpenCL device */
|
||||||
|
pb_SwitchToTimer(timers, pb_TimerID_COPY);
|
||||||
|
if (verbose) {
|
||||||
|
printf("Allocating %.2fMB on OpenCL device for potentials\n",
|
||||||
|
lnall * sizeof(float) / (double) (1024*1024));
|
||||||
|
}
|
||||||
|
|
||||||
|
regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
|
||||||
|
CHECK_ERROR("clCreateBuffer")
|
||||||
|
|
||||||
|
// clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));
|
||||||
|
|
||||||
|
if (verbose) {
|
||||||
|
printf("Allocating %.2fMB on OpenCL device for atom bins\n",
|
||||||
|
nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
|
||||||
|
}
|
||||||
|
|
||||||
|
binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
|
||||||
|
CHECK_ERROR("clCreateBuffer")
|
||||||
|
|
||||||
|
clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
|
||||||
|
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||||
|
|
||||||
|
//Sub buffers are not supported in OpenCL v1.0
|
||||||
|
int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
|
||||||
|
|
||||||
|
NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
|
||||||
|
CHECK_ERROR("clCreateBuffer")
|
||||||
|
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
|
||||||
|
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||||
|
|
||||||
|
NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
|
||||||
|
CHECK_ERROR("clCreateBuffer")
|
||||||
|
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
|
||||||
|
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||||
|
|
||||||
|
if (verbose)
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
|
||||||
|
clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
|
||||||
|
clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
|
||||||
|
clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
|
||||||
|
clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
|
||||||
|
clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
|
||||||
|
clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
|
||||||
|
clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),®ionZeroCl);
|
||||||
|
clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
|
||||||
|
clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
|
||||||
|
CHECK_ERROR("clSetKernelArg")
|
||||||
|
|
||||||
|
/* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
|
||||||
|
pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
|
||||||
|
printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
|
||||||
|
for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
|
||||||
|
printf(" computing plane %d\r", zRegionIndex);
|
||||||
|
fflush(stdout);
|
||||||
|
|
||||||
|
clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
|
||||||
|
CHECK_ERROR("clSetKernelArg")
|
||||||
|
|
||||||
|
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
|
||||||
|
|
||||||
|
CHECK_ERROR("clEnqueueNDRangeKernel")
|
||||||
|
|
||||||
|
clStatus = clFinish(clCommandQueue);
|
||||||
|
|
||||||
|
CHECK_ERROR("clFinish")
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Finished OpenCL kernel calls\n");
|
||||||
|
|
||||||
|
/* copy result regions from OpenCL device */
|
||||||
|
pb_SwitchToTimer(timers, pb_TimerID_COPY);
|
||||||
|
clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
|
||||||
|
CHECK_ERROR("clEnqueueReadBuffer")
|
||||||
|
|
||||||
|
/* free OpenCL memory allocations */
|
||||||
|
clStatus = clReleaseMemObject(regionZeroCl);
|
||||||
|
clStatus = clReleaseMemObject(binBaseCl);
|
||||||
|
clStatus = clReleaseMemObject(NbrListLen);
|
||||||
|
clStatus = clReleaseMemObject(NbrList);
|
||||||
|
CHECK_ERROR("clReleaseMemObject")
|
||||||
|
|
||||||
|
clStatus = clReleaseKernel(clKernel);
|
||||||
|
clStatus = clReleaseProgram(clProgram);
|
||||||
|
clStatus = clReleaseCommandQueue(clCommandQueue);
|
||||||
|
clStatus = clReleaseContext(clContext);
|
||||||
|
|
||||||
|
//free((void*)clSource[0]);
|
||||||
|
|
||||||
|
/* transpose regions back into lattice */
|
||||||
|
pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
|
||||||
|
for (k = 0; k < nz; k++) {
|
||||||
|
zRegionIndex = (k >> 3);
|
||||||
|
zOffset = (k & 7);
|
||||||
|
|
||||||
|
for (j = 0; j < ny; j++) {
|
||||||
|
yRegionIndex = (j >> 3);
|
||||||
|
yOffset = (j & 7);
|
||||||
|
|
||||||
|
for (i = 0; i < nx; i++) {
|
||||||
|
xRegionIndex = (i >> 3);
|
||||||
|
xOffset = (i & 7);
|
||||||
|
|
||||||
|
thisRegion = regionZeroAddr
|
||||||
|
+ ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
|
||||||
|
+ xRegionIndex) * REGION_SIZE;
|
||||||
|
|
||||||
|
indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
|
||||||
|
index = (k * ny + j) * nx + i;
|
||||||
|
|
||||||
|
lattice->lattice[index] = thisRegion[indexRegion];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* handle extra atoms */
|
||||||
|
if (extra->size > 0) {
|
||||||
|
printf("computing extra atoms on CPU\n");
|
||||||
|
if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
|
||||||
|
fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
|
||||||
|
"for extra atoms\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* cleanup memory allocations */
|
||||||
|
free(regionZeroAddr);
|
||||||
|
free(binBaseAddr);
|
||||||
|
free(bincntBaseAddr);
|
||||||
|
free_atom(extra);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
72
tests/opencl/cutcp2/cutoff.h
Normal file
72
tests/opencl/cutcp2/cutoff.h
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef CUTOFF_H
|
||||||
|
#define CUTOFF_H
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define SHIFTED
|
||||||
|
|
||||||
|
/* A structure to record how points in 3D space map to array
|
||||||
|
elements. Array element (z, y, x)
|
||||||
|
where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
|
||||||
|
maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
|
||||||
|
*/
|
||||||
|
typedef struct LatticeDim_t {
|
||||||
|
/* Number of lattice points in x, y, z dimensions */
|
||||||
|
int nx, ny, nz;
|
||||||
|
|
||||||
|
/* Lowest corner of lattice */
|
||||||
|
Vec3 lo;
|
||||||
|
|
||||||
|
/* Lattice spacing */
|
||||||
|
float h;
|
||||||
|
} LatticeDim;
|
||||||
|
|
||||||
|
/* An electric potential field sampled on a regular grid. The
|
||||||
|
lattice size and grid point positions are specified by 'dim'.
|
||||||
|
*/
|
||||||
|
typedef struct Lattice_t {
|
||||||
|
LatticeDim dim;
|
||||||
|
float *lattice;
|
||||||
|
} Lattice;
|
||||||
|
|
||||||
|
LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
|
||||||
|
|
||||||
|
Lattice *create_lattice(LatticeDim dim);
|
||||||
|
void destroy_lattice(Lattice *);
|
||||||
|
|
||||||
|
int gpu_compute_cutoff_potential_lattice(
|
||||||
|
struct pb_TimerSet *timers,
|
||||||
|
Lattice *lattice,
|
||||||
|
float cutoff, /* cutoff distance */
|
||||||
|
Atoms *atom, /* array of atoms */
|
||||||
|
int verbose, /* print info/debug messages */
|
||||||
|
struct pb_Parameters *parameters
|
||||||
|
);
|
||||||
|
|
||||||
|
int cpu_compute_cutoff_potential_lattice(
|
||||||
|
Lattice *lattice, /* the lattice */
|
||||||
|
float cutoff, /* cutoff distance */
|
||||||
|
Atoms *atoms /* array of atoms */
|
||||||
|
);
|
||||||
|
|
||||||
|
int remove_exclusions(
|
||||||
|
Lattice *lattice, /* the lattice */
|
||||||
|
float exclcutoff, /* exclusion cutoff distance */
|
||||||
|
Atoms *atom /* array of atoms */
|
||||||
|
);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif /* CUTOFF_H */
|
157
tests/opencl/cutcp2/excl.c
Normal file
157
tests/opencl/cutcp2/excl.c
Normal file
|
@ -0,0 +1,157 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <parboil.h>
|
||||||
|
|
||||||
|
#include "atom.h"
|
||||||
|
#include "cutoff.h"
|
||||||
|
|
||||||
|
#define CELLEN 4.f
|
||||||
|
#define INV_CELLEN (1.f/CELLEN)
|
||||||
|
|
||||||
|
extern int remove_exclusions(
|
||||||
|
Lattice *lattice, /* the lattice */
|
||||||
|
float cutoff, /* exclusion cutoff distance */
|
||||||
|
Atoms *atoms /* array of atoms */
|
||||||
|
)
|
||||||
|
{
|
||||||
|
int nx = lattice->dim.nx;
|
||||||
|
int ny = lattice->dim.ny;
|
||||||
|
int nz = lattice->dim.nz;
|
||||||
|
float xlo = lattice->dim.lo.x;
|
||||||
|
float ylo = lattice->dim.lo.y;
|
||||||
|
float zlo = lattice->dim.lo.z;
|
||||||
|
float gridspacing = lattice->dim.h;
|
||||||
|
Atom *atom = atoms->atoms;
|
||||||
|
|
||||||
|
const float a2 = cutoff * cutoff;
|
||||||
|
const float inv_gridspacing = 1.f / gridspacing;
|
||||||
|
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
|
||||||
|
/* lattice point radius about each atom */
|
||||||
|
|
||||||
|
int n;
|
||||||
|
int i, j, k;
|
||||||
|
int ia, ib, ic;
|
||||||
|
int ja, jb, jc;
|
||||||
|
int ka, kb, kc;
|
||||||
|
int index;
|
||||||
|
int koff, jkoff;
|
||||||
|
|
||||||
|
float x, y, z, q;
|
||||||
|
float dx, dy, dz;
|
||||||
|
float dz2, dydz2, r2;
|
||||||
|
float e;
|
||||||
|
float xstart, ystart;
|
||||||
|
|
||||||
|
float *pg;
|
||||||
|
|
||||||
|
int gindex;
|
||||||
|
int ncell, nxcell, nycell, nzcell;
|
||||||
|
int *first, *next;
|
||||||
|
float inv_cellen = INV_CELLEN;
|
||||||
|
Vec3 minext, maxext;
|
||||||
|
|
||||||
|
/* find min and max extent */
|
||||||
|
get_atom_extent(&minext, &maxext, atoms);
|
||||||
|
|
||||||
|
/* number of cells in each dimension */
|
||||||
|
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
|
||||||
|
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
|
||||||
|
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
|
||||||
|
ncell = nxcell * nycell * nzcell;
|
||||||
|
|
||||||
|
/* allocate for cursor link list implementation */
|
||||||
|
first = (int *) malloc(ncell * sizeof(int));
|
||||||
|
for (gindex = 0; gindex < ncell; gindex++) {
|
||||||
|
first[gindex] = -1;
|
||||||
|
}
|
||||||
|
next = (int *) malloc(atoms->size * sizeof(int));
|
||||||
|
for (n = 0; n < atoms->size; n++) {
|
||||||
|
next[n] = -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* geometric hashing */
|
||||||
|
for (n = 0; n < atoms->size; n++) {
|
||||||
|
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
|
||||||
|
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
|
||||||
|
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
|
||||||
|
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
|
||||||
|
gindex = (k*nycell + j)*nxcell + i;
|
||||||
|
next[n] = first[gindex];
|
||||||
|
first[gindex] = n;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* traverse the grid cells */
|
||||||
|
for (gindex = 0; gindex < ncell; gindex++) {
|
||||||
|
for (n = first[gindex]; n != -1; n = next[n]) {
|
||||||
|
x = atom[n].x - xlo;
|
||||||
|
y = atom[n].y - ylo;
|
||||||
|
z = atom[n].z - zlo;
|
||||||
|
q = atom[n].q;
|
||||||
|
|
||||||
|
/* find closest grid point with position less than or equal to atom */
|
||||||
|
ic = (int) (x * inv_gridspacing);
|
||||||
|
jc = (int) (y * inv_gridspacing);
|
||||||
|
kc = (int) (z * inv_gridspacing);
|
||||||
|
|
||||||
|
/* find extent of surrounding box of grid points */
|
||||||
|
ia = ic - radius;
|
||||||
|
ib = ic + radius + 1;
|
||||||
|
ja = jc - radius;
|
||||||
|
jb = jc + radius + 1;
|
||||||
|
ka = kc - radius;
|
||||||
|
kb = kc + radius + 1;
|
||||||
|
|
||||||
|
/* trim box edges so that they are within grid point lattice */
|
||||||
|
if (ia < 0) ia = 0;
|
||||||
|
if (ib >= nx) ib = nx-1;
|
||||||
|
if (ja < 0) ja = 0;
|
||||||
|
if (jb >= ny) jb = ny-1;
|
||||||
|
if (ka < 0) ka = 0;
|
||||||
|
if (kb >= nz) kb = nz-1;
|
||||||
|
|
||||||
|
/* loop over surrounding grid points */
|
||||||
|
xstart = ia*gridspacing - x;
|
||||||
|
ystart = ja*gridspacing - y;
|
||||||
|
dz = ka*gridspacing - z;
|
||||||
|
for (k = ka; k <= kb; k++, dz += gridspacing) {
|
||||||
|
koff = k*ny;
|
||||||
|
dz2 = dz*dz;
|
||||||
|
|
||||||
|
dy = ystart;
|
||||||
|
for (j = ja; j <= jb; j++, dy += gridspacing) {
|
||||||
|
jkoff = (koff + j)*nx;
|
||||||
|
dydz2 = dy*dy + dz2;
|
||||||
|
|
||||||
|
dx = xstart;
|
||||||
|
index = jkoff + ia;
|
||||||
|
pg = lattice->lattice + index;
|
||||||
|
|
||||||
|
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
|
||||||
|
r2 = dx*dx + dydz2;
|
||||||
|
|
||||||
|
/* If atom and lattice point are too close, set the lattice value
|
||||||
|
* to zero */
|
||||||
|
if (r2 < a2) *pg = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} /* end loop over surrounding grid points */
|
||||||
|
|
||||||
|
} /* end loop over atoms in a gridcell */
|
||||||
|
} /* end loop over gridcells */
|
||||||
|
|
||||||
|
/* free memory */
|
||||||
|
free(next);
|
||||||
|
free(first);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
55
tests/opencl/cutcp2/gpu_info.c
Normal file
55
tests/opencl/cutcp2/gpu_info.c
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
//#include <endian.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <malloc.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
|
||||||
|
#include "gpu_info.h"
|
||||||
|
|
||||||
|
void compute_active_thread(size_t *thread,
|
||||||
|
size_t *grid,
|
||||||
|
int task,
|
||||||
|
int pad,
|
||||||
|
int major,
|
||||||
|
int minor,
|
||||||
|
int sm)
|
||||||
|
{
|
||||||
|
int max_thread;
|
||||||
|
int max_block=8;
|
||||||
|
if(major==1)
|
||||||
|
{
|
||||||
|
if(minor>=2)
|
||||||
|
max_thread=1024;
|
||||||
|
else
|
||||||
|
max_thread=768;
|
||||||
|
}
|
||||||
|
else if(major==2)
|
||||||
|
max_thread=1536;
|
||||||
|
else
|
||||||
|
//newer GPU //keep using 2.0
|
||||||
|
max_thread=1536;
|
||||||
|
|
||||||
|
int _grid;
|
||||||
|
int _thread;
|
||||||
|
|
||||||
|
if(task*pad>sm*max_thread)
|
||||||
|
{
|
||||||
|
_thread=max_thread/max_block;
|
||||||
|
_grid = ((task*pad+_thread-1)/_thread)*_thread;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_thread=pad;
|
||||||
|
_grid=task*pad;
|
||||||
|
}
|
||||||
|
|
||||||
|
thread[0]=_thread;
|
||||||
|
grid[0]=_grid;
|
||||||
|
}
|
28
tests/opencl/cutcp2/gpu_info.h
Normal file
28
tests/opencl/cutcp2/gpu_info.h
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef __GPUINFOH__
|
||||||
|
#define __GPUINFOH__
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void compute_active_thread(size_t *thread,
|
||||||
|
size_t *grid,
|
||||||
|
int task,
|
||||||
|
int pad,
|
||||||
|
int major,
|
||||||
|
int minor,
|
||||||
|
int sm);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
104
tests/opencl/cutcp2/kernel.cl
Normal file
104
tests/opencl/cutcp2/kernel.cl
Normal file
|
@ -0,0 +1,104 @@
|
||||||
|
/*
|
||||||
|
* potential lattice is decomposed into size 8^3 lattice point "regions"
|
||||||
|
*
|
||||||
|
* THIS IMPLEMENTATION: one thread per lattice point
|
||||||
|
* thread block size 128 gives 4 thread blocks per region
|
||||||
|
* kernel is invoked for each x-y plane of regions,
|
||||||
|
* where gridDim.x is 4*(x region dimension) so that blockIdx.x
|
||||||
|
* can absorb the z sub-region index in its 2 lowest order bits
|
||||||
|
*
|
||||||
|
* Regions are stored contiguously in memory in row-major order
|
||||||
|
*
|
||||||
|
* The bins have to not only cover the region, but they need to surround
|
||||||
|
* the outer edges so that region sides and corners can still use
|
||||||
|
* neighbor list stencil. The binZeroAddr is actually a shifted pointer into
|
||||||
|
* the bin array (binZeroAddr = binBaseAddr + (c*binDim_y + c)*binDim_x + c)
|
||||||
|
* where c = ceil(cutoff / binsize). This allows for negative offsets to
|
||||||
|
* be added to myBinIndex.
|
||||||
|
*
|
||||||
|
* The (0,0,0) spatial origin corresponds to lower left corner of both
|
||||||
|
* regionZeroAddr and binZeroAddr. The atom coordinates are translated
|
||||||
|
* during binning to enforce this assumption.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "macros.h"
|
||||||
|
|
||||||
|
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
|
||||||
|
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
|
||||||
|
typedef int4 xyz;
|
||||||
|
|
||||||
|
__kernel void opencl_cutoff_potential_lattice(
|
||||||
|
int binDim_x,
|
||||||
|
int binDim_y,
|
||||||
|
__global float4 *binBaseAddr,
|
||||||
|
int offset,
|
||||||
|
float h, /* lattice spacing */
|
||||||
|
float cutoff2, /* square of cutoff distance */
|
||||||
|
float inv_cutoff2,
|
||||||
|
__global float *regionZeroAddr, /* address of lattice regions starting at origin */
|
||||||
|
int zRegionIndex,
|
||||||
|
__constant int *NbrListLen,
|
||||||
|
__constant xyz *NbrList
|
||||||
|
)
|
||||||
|
{
|
||||||
|
__global float4* binZeroAddr = binBaseAddr + offset;
|
||||||
|
|
||||||
|
__global float *myRegionAddr;
|
||||||
|
int Bx, By, Bz;
|
||||||
|
|
||||||
|
/* thread id */
|
||||||
|
const int tid = (get_local_id(2)*get_local_size(1) +
|
||||||
|
get_local_id(1))*get_local_size(0) + get_local_id(0);
|
||||||
|
|
||||||
|
/* this is the start of the sub-region indexed by tid */
|
||||||
|
myRegionAddr = regionZeroAddr + ((zRegionIndex*get_num_groups(1)
|
||||||
|
+ get_group_id(1))*(get_num_groups(0)>>2) + (get_group_id(0)>>2))*REGION_SIZE
|
||||||
|
+ (get_group_id(0)&3)*SUB_REGION_SIZE;
|
||||||
|
|
||||||
|
/* spatial coordinate of this lattice point */
|
||||||
|
float x = (8 * (get_group_id(0) >> 2) + get_local_id(0)) * h;
|
||||||
|
float y = (8 * get_group_id(1) + get_local_id(1)) * h;
|
||||||
|
float z = (8 * zRegionIndex + 2*(get_group_id(0)&3) + get_local_id(2)) * h;
|
||||||
|
|
||||||
|
float dx;
|
||||||
|
float dy;
|
||||||
|
float dz;
|
||||||
|
float r2;
|
||||||
|
float s;
|
||||||
|
|
||||||
|
int totalbins = 0;
|
||||||
|
|
||||||
|
/* bin number determined by center of region */
|
||||||
|
Bx = (int) floor((8 * (get_group_id(0) >> 2) + 4) * h * BIN_INVLEN);
|
||||||
|
By = (int) floor((8 * get_group_id(1) + 4) * h * BIN_INVLEN);
|
||||||
|
Bz = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);
|
||||||
|
|
||||||
|
float energy = 0.f;
|
||||||
|
int bincnt;
|
||||||
|
for (bincnt = 0; bincnt < *NbrListLen; bincnt++) {
|
||||||
|
int i = Bx + NbrList[bincnt].x;
|
||||||
|
int j = By + NbrList[bincnt].y;
|
||||||
|
int k = Bz + NbrList[bincnt].z;
|
||||||
|
|
||||||
|
__global float4* p_global = binZeroAddr +
|
||||||
|
(((k*binDim_y + j)*binDim_x + i) * BIN_DEPTH);
|
||||||
|
|
||||||
|
int m;
|
||||||
|
for (m = 0; m < BIN_DEPTH; m++) {
|
||||||
|
float aq = p_global[m].w;
|
||||||
|
if (0.f != aq) {
|
||||||
|
dx = p_global[m].x - x;
|
||||||
|
dy = p_global[m].y - y;
|
||||||
|
dz = p_global[m].z - z;
|
||||||
|
r2 = dx*dx + dy*dy + dz*dz;
|
||||||
|
if (r2 < cutoff2) {
|
||||||
|
s = (1.f - r2 * inv_cutoff2);
|
||||||
|
energy += aq * rsqrt(r2) * s * s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} /* end loop over atoms in bin */
|
||||||
|
} /* end loop over neighbor list */
|
||||||
|
|
||||||
|
/* store into global memory */
|
||||||
|
myRegionAddr[tid+0] = energy;
|
||||||
|
}
|
69
tests/opencl/cutcp2/macros.h
Normal file
69
tests/opencl/cutcp2/macros.h
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
#ifndef __MACROSH__
|
||||||
|
#define __MACROSH__
|
||||||
|
|
||||||
|
#ifdef __DEVICE_EMULATION__
|
||||||
|
#define DEBUG
|
||||||
|
/* define which grid block and which thread to examine */
|
||||||
|
#define BX 0
|
||||||
|
#define BY 0
|
||||||
|
#define TX 0
|
||||||
|
#define TY 0
|
||||||
|
#define TZ 0
|
||||||
|
#define EMU(code) do { \
|
||||||
|
if (blockIdx.x==BX && blockIdx.y==BY && \
|
||||||
|
threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
|
||||||
|
code; \
|
||||||
|
} \
|
||||||
|
} while (0)
|
||||||
|
#define INT(n) printf("%s = %d\n", #n, n)
|
||||||
|
#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
|
||||||
|
#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
|
||||||
|
#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
|
||||||
|
(double)(f).y, (double)(f).z, (double)(f).w)
|
||||||
|
#else
|
||||||
|
#define EMU(code)
|
||||||
|
#define INT(n)
|
||||||
|
#define FLOAT(f)
|
||||||
|
#define INT3(n)
|
||||||
|
#define FLOAT4(f)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* report error from OpenCL */
|
||||||
|
#define CHECK_ERROR(errorMessage) \
|
||||||
|
if(clStatus != CL_SUCCESS) \
|
||||||
|
{ \
|
||||||
|
printf("Error: %s!\n",errorMessage); \
|
||||||
|
printf("Line: %d\n",__LINE__); \
|
||||||
|
exit(1); \
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* neighbor list:
|
||||||
|
* stored in constant memory as table of offsets
|
||||||
|
* flat index addressing is computed by kernel
|
||||||
|
*
|
||||||
|
* reserve enough memory for 11^3 stencil of grid cells
|
||||||
|
* this fits within 16K of memory
|
||||||
|
*/
|
||||||
|
#define NBRLIST_DIM 11
|
||||||
|
#define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
|
||||||
|
|
||||||
|
/*
|
||||||
|
* atom bins cached into shared memory for processing
|
||||||
|
*
|
||||||
|
* this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
|
||||||
|
* should permit scheduling of up to 3 thread blocks per SM
|
||||||
|
*/
|
||||||
|
#define BIN_DEPTH 8 /* max number of atoms per bin */
|
||||||
|
#define BIN_SIZE 32 /* size of bin in floats */
|
||||||
|
#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
|
||||||
|
|
||||||
|
#define BIN_LENGTH 4.f /* spatial length in Angstroms */
|
||||||
|
#define BIN_INVLEN (1.f / BIN_LENGTH)
|
||||||
|
/* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
|
||||||
|
* so that bin fill should be 80% (for non-empty regions of space) */
|
||||||
|
|
||||||
|
#define REGION_SIZE 512 /* number of floats in lattice region */
|
||||||
|
#define SUB_REGION_SIZE 128 /* number of floats in lattice sub-region */
|
||||||
|
|
||||||
|
#endif
|
190
tests/opencl/cutcp2/main.cc
Normal file
190
tests/opencl/cutcp2/main.cc
Normal file
|
@ -0,0 +1,190 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <parboil.h>
|
||||||
|
|
||||||
|
#include "atom.h"
|
||||||
|
#include "cutoff.h"
|
||||||
|
#include "output.h"
|
||||||
|
|
||||||
|
#define ERRTOL 1e-4f
|
||||||
|
|
||||||
|
#define NOKERNELS 0
|
||||||
|
#define CUTOFF1 1
|
||||||
|
#define CUTOFF6 32
|
||||||
|
#define CUTOFF6OVERLAP 64
|
||||||
|
#define CUTOFFCPU 16384
|
||||||
|
|
||||||
|
|
||||||
|
int appenddata(const char *filename, int size, double time) {
|
||||||
|
FILE *fp;
|
||||||
|
fp=fopen(filename, "a");
|
||||||
|
if (fp == NULL) {
|
||||||
|
printf("error appending to file %s..\n", filename);
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
fprintf(fp, "%d %.3f\n", size, time);
|
||||||
|
fclose(fp);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
LatticeDim
|
||||||
|
lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
|
||||||
|
{
|
||||||
|
LatticeDim ret;
|
||||||
|
|
||||||
|
ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
|
||||||
|
ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
|
||||||
|
ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
|
||||||
|
ret.lo = lo;
|
||||||
|
ret.h = h;
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
Lattice *
|
||||||
|
create_lattice(LatticeDim dim)
|
||||||
|
{
|
||||||
|
int size;
|
||||||
|
Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
|
||||||
|
|
||||||
|
if (lat == NULL) {
|
||||||
|
fprintf(stderr, "Out of memory\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
lat->dim = dim;
|
||||||
|
|
||||||
|
/* Round up the allocated size to a multiple of 8 */
|
||||||
|
size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7;
|
||||||
|
lat->lattice = (float *)calloc(size, sizeof(float));
|
||||||
|
|
||||||
|
if (lat->lattice == NULL) {
|
||||||
|
fprintf(stderr, "Out of memory\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return lat;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void
|
||||||
|
destroy_lattice(Lattice *lat)
|
||||||
|
{
|
||||||
|
if (lat) {
|
||||||
|
free(lat->lattice);
|
||||||
|
free(lat);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
Atoms *atom;
|
||||||
|
|
||||||
|
LatticeDim lattice_dim;
|
||||||
|
Lattice *gpu_lattice;
|
||||||
|
Vec3 min_ext, max_ext; /* Bounding box of atoms */
|
||||||
|
Vec3 lo, hi; /* Bounding box with padding */
|
||||||
|
|
||||||
|
float h = 0.5f; /* Lattice spacing */
|
||||||
|
float cutoff = 12.f; /* Cutoff radius */
|
||||||
|
float exclcutoff = 1.f; /* Radius for exclusion */
|
||||||
|
float padding = 0.5f; /* Bounding box padding distance */
|
||||||
|
|
||||||
|
int n;
|
||||||
|
|
||||||
|
struct pb_Parameters *parameters;
|
||||||
|
struct pb_TimerSet timers;
|
||||||
|
|
||||||
|
/* Read input parameters */
|
||||||
|
parameters = pb_ReadParameters(&argc, argv);
|
||||||
|
if (parameters == NULL) {
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
parameters->inpFiles = (char **)malloc(sizeof(char *) * 2);
|
||||||
|
parameters->inpFiles[0] = (char *)malloc(100);
|
||||||
|
parameters->inpFiles[1] = NULL;
|
||||||
|
strncpy(parameters->inpFiles[0], "watbox.sl40.pqr", 100);
|
||||||
|
|
||||||
|
/* Expect one input file */
|
||||||
|
if (pb_Parameters_CountInputs(parameters) != 1) {
|
||||||
|
fprintf(stderr, "Expecting one input file\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
pb_InitializeTimerSet(&timers);
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||||
|
|
||||||
|
{
|
||||||
|
const char *pqrfilename = parameters->inpFiles[0];
|
||||||
|
|
||||||
|
if (!(atom = read_atom_file(pqrfilename))) {
|
||||||
|
fprintf(stderr, "read_atom_file() failed\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* find extent of domain */
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||||
|
get_atom_extent(&min_ext, &max_ext, atom);
|
||||||
|
printf("extent of domain is:\n");
|
||||||
|
printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
|
||||||
|
printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
|
||||||
|
|
||||||
|
printf("padding domain by %g Angstroms\n", padding);
|
||||||
|
lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
|
||||||
|
hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
|
||||||
|
printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
|
||||||
|
|
||||||
|
lattice_dim = lattice_from_bounding_box(lo, hi, h);
|
||||||
|
gpu_lattice = create_lattice(lattice_dim);
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Run OpenCL kernel
|
||||||
|
* (Begin and end with COMPUTE timer active)
|
||||||
|
*/
|
||||||
|
if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0, parameters)) {
|
||||||
|
fprintf(stderr, "Computation failed\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Zero the lattice points that are too close to an atom. This is
|
||||||
|
* necessary for numerical stability.
|
||||||
|
*/
|
||||||
|
if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
|
||||||
|
fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n");
|
||||||
|
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||||
|
|
||||||
|
/* Print output */
|
||||||
|
if (parameters->outFile) {
|
||||||
|
//write_lattice_summary(parameters->outFile, gpu_lattice);
|
||||||
|
}
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||||
|
|
||||||
|
/* Cleanup */
|
||||||
|
destroy_lattice(gpu_lattice);
|
||||||
|
free_atom(atom);
|
||||||
|
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
|
||||||
|
pb_PrintTimerSet(&timers);
|
||||||
|
pb_FreeParameters(parameters);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
49
tests/opencl/cutcp2/ocl.c
Normal file
49
tests/opencl/cutcp2/ocl.c
Normal file
|
@ -0,0 +1,49 @@
|
||||||
|
#include <CL/cl.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include "ocl.h"
|
||||||
|
|
||||||
|
char* readFile(const char* fileName)
|
||||||
|
{
|
||||||
|
FILE* fp;
|
||||||
|
fp = fopen(fileName,"r");
|
||||||
|
if(fp == NULL)
|
||||||
|
{
|
||||||
|
printf("Error 1!\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fseek(fp,0,SEEK_END);
|
||||||
|
long size = ftell(fp);
|
||||||
|
rewind(fp);
|
||||||
|
|
||||||
|
char* buffer = (char*)malloc(sizeof(char)*(size+1));
|
||||||
|
if(buffer == NULL)
|
||||||
|
{
|
||||||
|
printf("Error 2!\n");
|
||||||
|
fclose(fp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t res = fread(buffer,1,size,fp);
|
||||||
|
if(res != size)
|
||||||
|
{
|
||||||
|
printf("Error 3!\n");
|
||||||
|
fclose(fp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer[size] = 0;
|
||||||
|
fclose(fp);
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
|
||||||
|
{
|
||||||
|
cl_int clStatus;
|
||||||
|
char* temp = (char*)malloc(size);
|
||||||
|
memset(temp,val,size);
|
||||||
|
clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
|
||||||
|
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||||
|
free(temp);
|
||||||
|
}
|
25
tests/opencl/cutcp2/ocl.h
Normal file
25
tests/opencl/cutcp2/ocl.h
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
#ifndef __OCLH__
|
||||||
|
#define __OCLH__
|
||||||
|
|
||||||
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void clMemSet(cl_command_queue, cl_mem, int, size_t);
|
||||||
|
char* readFile(const char*);
|
||||||
|
|
||||||
|
#define CHECK_ERROR(errorMessage) \
|
||||||
|
if(clStatus != CL_SUCCESS) \
|
||||||
|
{ \
|
||||||
|
printf("Error: %s!\n",errorMessage); \
|
||||||
|
printf("Line: %d\n",__LINE__); \
|
||||||
|
exit(1); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
67
tests/opencl/cutcp2/output.c
Normal file
67
tests/opencl/cutcp2/output.c
Normal file
|
@ -0,0 +1,67 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include <parboil.h>
|
||||||
|
|
||||||
|
#include "atom.h"
|
||||||
|
#include "cutoff.h"
|
||||||
|
|
||||||
|
void
|
||||||
|
write_lattice_summary(const char *filename, Lattice *lattice)
|
||||||
|
{
|
||||||
|
float *lattice_data = lattice->lattice;
|
||||||
|
int nx = lattice->dim.nx;
|
||||||
|
int ny = lattice->dim.ny;
|
||||||
|
int nz = lattice->dim.nz;
|
||||||
|
|
||||||
|
/* Open output file */
|
||||||
|
FILE *outfile = fopen(filename, "w");
|
||||||
|
|
||||||
|
if (outfile == NULL) {
|
||||||
|
fprintf(stderr, "Cannot open output file\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Write the sum of the the absolute values of all lattice potentials */
|
||||||
|
{
|
||||||
|
double abspotential = 0.0;
|
||||||
|
float tmp;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < nx * ny * nz; i++)
|
||||||
|
abspotential += fabs((double) lattice_data[i]);
|
||||||
|
|
||||||
|
tmp = (float) abspotential;
|
||||||
|
|
||||||
|
fwrite(&tmp, 1, sizeof(float), outfile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Write the size of a lattice plane */
|
||||||
|
{
|
||||||
|
uint32_t tmp;
|
||||||
|
|
||||||
|
tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
|
||||||
|
fwrite(&tmp, 1, sizeof(uint32_t), outfile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Write the plane of lattice data at z=0 and z = nz-1 */
|
||||||
|
{
|
||||||
|
int plane_size = nx * ny;
|
||||||
|
|
||||||
|
fwrite(lattice_data, plane_size, sizeof(float), outfile);
|
||||||
|
fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
|
||||||
|
outfile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Cleanup */
|
||||||
|
fclose(outfile);
|
||||||
|
}
|
25
tests/opencl/cutcp2/output.h
Normal file
25
tests/opencl/cutcp2/output.h
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef OUTPUT_H
|
||||||
|
#define OUTPUT_H
|
||||||
|
|
||||||
|
#include "cutoff.h"
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void
|
||||||
|
write_lattice_summary(const char *filename, Lattice *lattice);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif
|
348
tests/opencl/cutcp2/parboil.h
Normal file
348
tests/opencl/cutcp2/parboil.h
Normal file
|
@ -0,0 +1,348 @@
|
||||||
|
/*
|
||||||
|
* (c) 2010 The Board of Trustees of the University of Illinois.
|
||||||
|
*/
|
||||||
|
#ifndef PARBOIL_HEADER
|
||||||
|
#define PARBOIL_HEADER
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
/* A platform as specified by the user on the command line */
|
||||||
|
struct pb_PlatformParam {
|
||||||
|
char *name; /* The platform name. This string is owned. */
|
||||||
|
char *version; /* The platform version; may be NULL.
|
||||||
|
* This string is owned. */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Create a PlatformParam from the given strings.
|
||||||
|
* 'name' must not be NULL. 'version' may be NULL.
|
||||||
|
* If not NULL, the strings should have been allocated by malloc(),
|
||||||
|
* and they will be owned by the returned object.
|
||||||
|
*/
|
||||||
|
struct pb_PlatformParam *
|
||||||
|
pb_PlatformParam(char *name, char *version);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreePlatformParam(struct pb_PlatformParam *);
|
||||||
|
|
||||||
|
/* A criterion for how to select a device */
|
||||||
|
enum pb_DeviceSelectionCriterion {
|
||||||
|
pb_Device_INDEX, /* Enumerate the devices and select one
|
||||||
|
* by its number */
|
||||||
|
pb_Device_CPU, /* Select a CPU device */
|
||||||
|
pb_Device_GPU, /* Select a GPU device */
|
||||||
|
pb_Device_ACCELERATOR, /* Select an accelerator device */
|
||||||
|
pb_Device_NAME /* Select a device by name */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* A device as specified by the user on the command line */
|
||||||
|
struct pb_DeviceParam {
|
||||||
|
enum pb_DeviceSelectionCriterion criterion;
|
||||||
|
union {
|
||||||
|
int index; /* If criterion == pb_Device_INDEX,
|
||||||
|
* the index of the device */
|
||||||
|
char *name; /* If criterion == pb_Device_NAME,
|
||||||
|
* the name of the device.
|
||||||
|
* This string is owned. */
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_index(int index);
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_cpu(void);
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_gpu(void);
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_accelerator(void);
|
||||||
|
|
||||||
|
/* Create a by-name device selection criterion.
|
||||||
|
* The string should have been allocated by malloc(), and it will will be
|
||||||
|
* owned by the returned object.
|
||||||
|
*/
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_name(char *name);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreeDeviceParam(struct pb_DeviceParam *);
|
||||||
|
|
||||||
|
/* Command line parameters for benchmarks */
|
||||||
|
struct pb_Parameters {
|
||||||
|
char *outFile; /* If not NULL, the raw output of the
|
||||||
|
* computation should be saved to this
|
||||||
|
* file. The string is owned. */
|
||||||
|
char **inpFiles; /* A NULL-terminated array of strings
|
||||||
|
* holding the input file(s) for the
|
||||||
|
* computation. The array and strings
|
||||||
|
* are owned. */
|
||||||
|
struct pb_PlatformParam *platform; /* If not NULL, the platform
|
||||||
|
* specified on the command line. */
|
||||||
|
struct pb_DeviceParam *device; /* If not NULL, the device
|
||||||
|
* specified on the command line. */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Read command-line parameters.
|
||||||
|
*
|
||||||
|
* The argc and argv parameters to main are read, and any parameters
|
||||||
|
* interpreted by this function are removed from the argument list.
|
||||||
|
*
|
||||||
|
* A new instance of struct pb_Parameters is returned.
|
||||||
|
* If there is an error, then an error message is printed on stderr
|
||||||
|
* and NULL is returned.
|
||||||
|
*/
|
||||||
|
struct pb_Parameters *
|
||||||
|
pb_ReadParameters(int *_argc, char **argv);
|
||||||
|
|
||||||
|
/* Free an instance of struct pb_Parameters.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pb_FreeParameters(struct pb_Parameters *p);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreeStringArray(char **);
|
||||||
|
|
||||||
|
/* Count the number of input files in a pb_Parameters instance.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
pb_Parameters_CountInputs(struct pb_Parameters *p);
|
||||||
|
|
||||||
|
/* A time or duration. */
|
||||||
|
//#if _POSIX_VERSION >= 200112L
|
||||||
|
typedef unsigned long long pb_Timestamp; /* time in microseconds */
|
||||||
|
//#else
|
||||||
|
//# error "Timestamps not implemented"
|
||||||
|
//#endif
|
||||||
|
|
||||||
|
enum pb_TimerState {
|
||||||
|
pb_Timer_STOPPED,
|
||||||
|
pb_Timer_RUNNING,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pb_Timer {
|
||||||
|
enum pb_TimerState state;
|
||||||
|
pb_Timestamp elapsed; /* Amount of time elapsed so far */
|
||||||
|
pb_Timestamp init; /* Beginning of the current time interval,
|
||||||
|
* if state is RUNNING. End of the last
|
||||||
|
* recorded time interfal otherwise. */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Reset a timer.
|
||||||
|
* Use this to initialize a timer or to clear
|
||||||
|
* its elapsed time. The reset timer is stopped.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pb_ResetTimer(struct pb_Timer *timer);
|
||||||
|
|
||||||
|
/* Start a timer. The timer is set to RUNNING mode and
|
||||||
|
* time elapsed while the timer is running is added to
|
||||||
|
* the timer.
|
||||||
|
* The timer should not already be running.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pb_StartTimer(struct pb_Timer *timer);
|
||||||
|
|
||||||
|
/* Stop a timer.
|
||||||
|
* This stops adding elapsed time to the timer.
|
||||||
|
* The timer should not already be stopped.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pb_StopTimer(struct pb_Timer *timer);
|
||||||
|
|
||||||
|
/* Get the elapsed time in seconds. */
|
||||||
|
double
|
||||||
|
pb_GetElapsedTime(struct pb_Timer *timer);
|
||||||
|
|
||||||
|
/* Execution time is assigned to one of these categories. */
|
||||||
|
enum pb_TimerID {
|
||||||
|
pb_TimerID_NONE = 0,
|
||||||
|
pb_TimerID_IO, /* Time spent in input/output */
|
||||||
|
pb_TimerID_KERNEL, /* Time spent computing on the device,
|
||||||
|
* recorded asynchronously */
|
||||||
|
pb_TimerID_COPY, /* Time spent synchronously moving data
|
||||||
|
* to/from device and allocating/freeing
|
||||||
|
* memory on the device */
|
||||||
|
pb_TimerID_DRIVER, /* Time spent in the host interacting with the
|
||||||
|
* driver, primarily for recording the time
|
||||||
|
* spent queueing asynchronous operations */
|
||||||
|
pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
|
||||||
|
pb_TimerID_COMPUTE, /* Time for all program execution other
|
||||||
|
* than parsing command line arguments,
|
||||||
|
* I/O, kernel, and copy */
|
||||||
|
pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and
|
||||||
|
* host activity: automatically filled in,
|
||||||
|
* not intended for direct usage */
|
||||||
|
pb_TimerID_LAST /* Number of timer IDs */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Dynamic list of asynchronously tracked times between events */
|
||||||
|
struct pb_async_time_marker_list {
|
||||||
|
char *label; // actually just a pointer to a string
|
||||||
|
enum pb_TimerID timerID; /* The ID to which the interval beginning
|
||||||
|
* with this marker should be attributed */
|
||||||
|
void * marker;
|
||||||
|
//cudaEvent_t marker; /* The driver event for this marker */
|
||||||
|
struct pb_async_time_marker_list *next;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pb_SubTimer {
|
||||||
|
char *label;
|
||||||
|
struct pb_Timer timer;
|
||||||
|
struct pb_SubTimer *next;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pb_SubTimerList {
|
||||||
|
struct pb_SubTimer *current;
|
||||||
|
struct pb_SubTimer *subtimer_list;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* A set of timers for recording execution times. */
|
||||||
|
struct pb_TimerSet {
|
||||||
|
enum pb_TimerID current;
|
||||||
|
struct pb_async_time_marker_list* async_markers;
|
||||||
|
pb_Timestamp async_begin;
|
||||||
|
pb_Timestamp wall_begin;
|
||||||
|
struct pb_Timer timers[pb_TimerID_LAST];
|
||||||
|
struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Reset all timers in the set. */
|
||||||
|
void
|
||||||
|
pb_InitializeTimerSet(struct pb_TimerSet *timers);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
|
||||||
|
|
||||||
|
/* Select which timer the next interval of time should be accounted
|
||||||
|
* to. The selected timer is started and other timers are stopped.
|
||||||
|
* Using pb_TimerID_NONE stops all timers. */
|
||||||
|
void
|
||||||
|
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
|
||||||
|
|
||||||
|
/* Print timer values to standard output. */
|
||||||
|
void
|
||||||
|
pb_PrintTimerSet(struct pb_TimerSet *timers);
|
||||||
|
|
||||||
|
/* Release timer resources */
|
||||||
|
void
|
||||||
|
pb_DestroyTimerSet(struct pb_TimerSet * timers);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct pb_Device_tag {
|
||||||
|
char* name;
|
||||||
|
void* clDevice;
|
||||||
|
int id;
|
||||||
|
unsigned int in_use;
|
||||||
|
unsigned int available;
|
||||||
|
} pb_Device;
|
||||||
|
|
||||||
|
struct pb_Context_tag;
|
||||||
|
typedef struct pb_Context_tag pb_Context;
|
||||||
|
|
||||||
|
typedef struct pb_Platform_tag {
|
||||||
|
char* name;
|
||||||
|
char* version;
|
||||||
|
void* clPlatform;
|
||||||
|
unsigned int in_use;
|
||||||
|
pb_Context** contexts;
|
||||||
|
pb_Device** devices;
|
||||||
|
} pb_Platform;
|
||||||
|
|
||||||
|
struct pb_Context_tag {
|
||||||
|
void* clPlatformId;
|
||||||
|
void* clContext;
|
||||||
|
void* clDeviceId;
|
||||||
|
pb_Platform* pb_platform;
|
||||||
|
pb_Device* pb_device;
|
||||||
|
};
|
||||||
|
|
||||||
|
// verbosely print out list of platforms and their devices to the console.
|
||||||
|
pb_Platform**
|
||||||
|
pb_GetPlatforms();
|
||||||
|
|
||||||
|
// Choose a platform according to the given platform specification
|
||||||
|
pb_Platform*
|
||||||
|
pb_GetPlatform(struct pb_PlatformParam *platform);
|
||||||
|
|
||||||
|
// choose a platform: by name, name & version
|
||||||
|
pb_Platform*
|
||||||
|
pb_GetPlatformByName(const char* name);
|
||||||
|
|
||||||
|
pb_Platform*
|
||||||
|
pb_GetPlatformByNameAndVersion(const char* name, const char* version);
|
||||||
|
|
||||||
|
// Choose a device according to the given device specification
|
||||||
|
pb_Device*
|
||||||
|
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
|
||||||
|
|
||||||
|
pb_Device**
|
||||||
|
pb_GetDevices(pb_Platform* pb_platform);
|
||||||
|
|
||||||
|
// choose a device by name.
|
||||||
|
pb_Device*
|
||||||
|
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
|
||||||
|
|
||||||
|
pb_Platform*
|
||||||
|
pb_GetPlatformByEnvVars();
|
||||||
|
|
||||||
|
pb_Context*
|
||||||
|
pb_InitOpenCLContext(struct pb_Parameters* parameters);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_ReleasePlatforms();
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_ReleaseContext(pb_Context* c);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_PrintPlatformInfo(pb_Context* c);
|
||||||
|
|
||||||
|
void
|
||||||
|
perf_init();
|
||||||
|
|
||||||
|
//#define MEASURE_KERNEL_TIME
|
||||||
|
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
#ifdef MEASURE_KERNEL_TIME
|
||||||
|
#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
|
||||||
|
cl_int
|
||||||
|
pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
|
||||||
|
cl_kernel /* kernel */,
|
||||||
|
cl_uint /* work_dim */,
|
||||||
|
const size_t * /* global_work_offset */,
|
||||||
|
const size_t * /* global_work_size */,
|
||||||
|
const size_t * /* local_work_size */,
|
||||||
|
cl_uint /* num_events_in_wait_list */,
|
||||||
|
const cl_event * /* event_wait_list */,
|
||||||
|
cl_event * /* event */);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
|
||||||
|
void pb_sig_float(char*, float*, int);
|
||||||
|
void pb_sig_double(char*, double*, int);
|
||||||
|
void pb_sig_short(char*, short*, int);
|
||||||
|
void pb_sig_int(char*, int*, int);
|
||||||
|
void pb_sig_uchar(char*, unsigned char*, unsigned int);
|
||||||
|
void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif //PARBOIL_HEADER
|
||||||
|
|
1394
tests/opencl/cutcp2/parboil_opencl.c
Normal file
1394
tests/opencl/cutcp2/parboil_opencl.c
Normal file
File diff suppressed because it is too large
Load diff
139
tests/opencl/cutcp2/readatom.c
Normal file
139
tests/opencl/cutcp2/readatom.c
Normal file
|
@ -0,0 +1,139 @@
|
||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <math.h>
|
||||||
|
#include "atom.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define LINELEN 96
|
||||||
|
#define INITLEN 20
|
||||||
|
|
||||||
|
|
||||||
|
Atoms *read_atom_file(const char *fname)
|
||||||
|
{
|
||||||
|
FILE *file;
|
||||||
|
char line[LINELEN];
|
||||||
|
|
||||||
|
Atom *atom; /* Atom array */
|
||||||
|
int len = INITLEN; /* Size of atom array */
|
||||||
|
int cnt = 0; /* Number of atoms read */
|
||||||
|
|
||||||
|
/* allocate initial atom array */
|
||||||
|
atom = (Atom *) malloc(len * sizeof(Atom));
|
||||||
|
if (NULL==atom) {
|
||||||
|
fprintf(stderr, "can't allocate memory\n");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
int i;
|
||||||
|
for (i = 0; i < len; ++i) {
|
||||||
|
atom[i].x = i+0;
|
||||||
|
atom[i].y = i+1;
|
||||||
|
atom[i].z = i+2;
|
||||||
|
atom[i].q = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
/* open atom "pqr" file */
|
||||||
|
file = fopen(fname, "r");
|
||||||
|
if (NULL==file) {
|
||||||
|
fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* loop to read pqr file line by line */
|
||||||
|
while (fgets(line, LINELEN, file) != NULL) {
|
||||||
|
|
||||||
|
if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
|
||||||
|
continue; /* skip anything that isn't an atom record */
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cnt==len) { /* extend atom array */
|
||||||
|
void *tmp = realloc(atom, 2*len*sizeof(Atom));
|
||||||
|
if (NULL==tmp) {
|
||||||
|
fprintf(stderr, "can't allocate more memory\n");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
atom = (Atom *) tmp;
|
||||||
|
len *= 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* read position coordinates and charge from atom record */
|
||||||
|
if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
|
||||||
|
&(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
|
||||||
|
fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
cnt++; /* count atoms as we store them */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* verify EOF and close file */
|
||||||
|
if ( !feof(file) ) {
|
||||||
|
fprintf(stderr, "did not find EOF\n");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
if (fclose(file)) {
|
||||||
|
fprintf(stderr, "can't close file\n");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Build the output data structure */
|
||||||
|
{
|
||||||
|
Atoms *out = (Atoms *)malloc(sizeof(Atoms));
|
||||||
|
|
||||||
|
if (NULL == out) {
|
||||||
|
fprintf(stderr, "can't allocate memory\n");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
out->size = cnt;
|
||||||
|
out->atoms = atom;
|
||||||
|
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void free_atom(Atoms *atom)
|
||||||
|
{
|
||||||
|
if (atom) {
|
||||||
|
free(atom->atoms);
|
||||||
|
free(atom);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
|
||||||
|
{
|
||||||
|
Atom *atoms = atom->atoms;
|
||||||
|
int natoms = atom->size;
|
||||||
|
Vec3 lo;
|
||||||
|
Vec3 hi;
|
||||||
|
int n;
|
||||||
|
|
||||||
|
hi.x = lo.x = atoms[0].x;
|
||||||
|
hi.y = lo.y = atoms[0].y;
|
||||||
|
hi.z = lo.z = atoms[0].z;
|
||||||
|
|
||||||
|
for (n = 1; n < natoms; n++) {
|
||||||
|
lo.x = fminf(lo.x, atoms[n].x);
|
||||||
|
hi.x = fmaxf(hi.x, atoms[n].x);
|
||||||
|
lo.y = fminf(lo.y, atoms[n].y);
|
||||||
|
hi.y = fmaxf(hi.y, atoms[n].y);
|
||||||
|
lo.z = fminf(lo.z, atoms[n].z);
|
||||||
|
hi.z = fmaxf(hi.z, atoms[n].z);
|
||||||
|
}
|
||||||
|
|
||||||
|
*out_lo = lo;
|
||||||
|
*out_hi = hi;
|
||||||
|
}
|
5945
tests/opencl/cutcp2/watbox.sl40.pqr
Executable file
5945
tests/opencl/cutcp2/watbox.sl40.pqr
Executable file
File diff suppressed because it is too large
Load diff
1
tests/opencl/dwt2d/.gitignore
vendored
Normal file
1
tests/opencl/dwt2d/.gitignore
vendored
Normal file
|
@ -0,0 +1 @@
|
||||||
|
*.bmp.dwt.*
|
BIN
tests/opencl/dwt2d/16.bmp
Normal file
BIN
tests/opencl/dwt2d/16.bmp
Normal file
Binary file not shown.
BIN
tests/opencl/dwt2d/192.bmp
Normal file
BIN
tests/opencl/dwt2d/192.bmp
Normal file
Binary file not shown.
7
tests/opencl/dwt2d/Makefile
Normal file
7
tests/opencl/dwt2d/Makefile
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
PROJECT = dwt2d
|
||||||
|
|
||||||
|
SRCS = components.cc dwt.cc main.cc
|
||||||
|
|
||||||
|
OPTS ?= dwt2d_inputs/._192.bmp -D 192x192 -f -5 -l 3
|
||||||
|
|
||||||
|
include ../common.mk
|
46
tests/opencl/dwt2d/Makefile_old
Normal file
46
tests/opencl/dwt2d/Makefile_old
Normal file
|
@ -0,0 +1,46 @@
|
||||||
|
#all:
|
||||||
|
# g++ -o components.o -I/usr/local/cuda-5.5/include -c components.cpp
|
||||||
|
# g++ -o prog -I/usr/local/cuda-5.5/include main.cpp components.o -lOpenCL
|
||||||
|
#
|
||||||
|
include ../../common/make.config
|
||||||
|
|
||||||
|
#Can be changed by `make TYPE=CPU`
|
||||||
|
TYPE = GPU
|
||||||
|
|
||||||
|
#Library
|
||||||
|
ifeq ($(TYPE),GPU)
|
||||||
|
OPENCL_INC = $(NV_OPENCL_INC)
|
||||||
|
OPENCL_LIB = $(NV_OPENCL_LIB)
|
||||||
|
else
|
||||||
|
OPENCL_INC = $(INTEL_OPENCL_INC)
|
||||||
|
OPENCL_LIB = $(INTEL_OPENCL_LIB)
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifdef OUTPUT
|
||||||
|
override OUTPUT = -DOUTPUT
|
||||||
|
endif
|
||||||
|
|
||||||
|
OUTPUT = -DOUTPUT
|
||||||
|
|
||||||
|
C_C = g++
|
||||||
|
OCL_LIB = -lOpenCL -L$(OPENCL_LIB)
|
||||||
|
#OCL_INC = -I/usr/local/cuda-5.5/include
|
||||||
|
OCL_INC = -I$(OPENCL_INC)
|
||||||
|
|
||||||
|
default: dwt2d
|
||||||
|
|
||||||
|
components:
|
||||||
|
$(C_C) -o components.o $(OCL_INC) -c components.cpp
|
||||||
|
|
||||||
|
dwt2d:
|
||||||
|
$(C_C) -o dwt2d.out $(OCL_INC) \
|
||||||
|
-I../util -DTIMING \
|
||||||
|
main.cpp ../util/timing.c \
|
||||||
|
$(components) \
|
||||||
|
$(OUTPUT) \
|
||||||
|
$(OCL_LIB)
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf *.o dwt2d.out
|
||||||
|
rm *.bmp.dwt.*
|
||||||
|
|
22
tests/opencl/dwt2d/README
Normal file
22
tests/opencl/dwt2d/README
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
//DESCRITPTION
|
||||||
|
|
||||||
|
This is the OpenCL version of the code.
|
||||||
|
|
||||||
|
The JPEG2000 standard uses 2D Discrete Wavelet Transform (2D DWT), which consumes a significant part of the total encoding time
|
||||||
|
|
||||||
|
|
||||||
|
// USE
|
||||||
|
**************OUTPUT********************
|
||||||
|
USAGE:
|
||||||
|
make clean
|
||||||
|
make OUTPUT=Y
|
||||||
|
|
||||||
|
|
||||||
|
**************PARAMETERS*****************
|
||||||
|
USEAGE:
|
||||||
|
./dwt2d [otpions] src_img.rgb <out_img.dwt>
|
||||||
|
|
||||||
|
-d, --dimension dimensions of src img, e.g. 1920x1080
|
||||||
|
-l, --level DWT level, default 3
|
||||||
|
-f, --forward forward transform
|
||||||
|
-5, --53 5/3 transform
|
25
tests/opencl/dwt2d/common.h
Normal file
25
tests/opencl/dwt2d/common.h
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
#ifndef _COMMON_H
|
||||||
|
#define _COMMON_H
|
||||||
|
|
||||||
|
//24-bit multiplication is faster on G80,
|
||||||
|
//but we must be sure to multiply integers
|
||||||
|
//only within [-8M, 8M - 1] range
|
||||||
|
#define IMUL(a, b) __mul24(a, b)
|
||||||
|
|
||||||
|
#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
|
||||||
|
/*
|
||||||
|
# define cudaCheckError( msg ) { \
|
||||||
|
cudaError_t err = cudaGetLastError(); \
|
||||||
|
if( cudaSuccess != err) { \
|
||||||
|
fprintf(stderr, "%s: %i: %s: %s.\n", \
|
||||||
|
__FILE__, __LINE__, msg, cudaGetErrorString( err) ); \
|
||||||
|
exit(-1); \
|
||||||
|
} }
|
||||||
|
|
||||||
|
# define cudaCheckAsyncError( msg ) { \
|
||||||
|
cudaThreadSynchronize(); \
|
||||||
|
cudaCheckError( msg ); \
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
#endif
|
29
tests/opencl/dwt2d/components.cc
Normal file
29
tests/opencl/dwt2d/components.cc
Normal file
|
@ -0,0 +1,29 @@
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <error.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <assert.h>
|
||||||
|
|
||||||
|
|
||||||
|
#include "components.h"
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
#include <OpenCL/cl.h>
|
||||||
|
#else
|
||||||
|
#include <CL/opencl.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* the old "components.cu" has been separate into two parts
|
||||||
|
"components.cpp",contains functions
|
||||||
|
"components.cl", contains all kernel functions
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Separate compoents of 8bit RGB source image */
|
||||||
|
|
||||||
|
//need add some segments
|
21
tests/opencl/dwt2d/components.h
Normal file
21
tests/opencl/dwt2d/components.h
Normal file
|
@ -0,0 +1,21 @@
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <error.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <assert.h>
|
||||||
|
//#include <stddef.h>
|
||||||
|
#ifndef _COMPONENTS_H
|
||||||
|
#define _COMPONENTS_H
|
||||||
|
|
||||||
|
/* Separate compoents of source 8bit RGB image */
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
void rgbToComponents(T d_r, T d_g, T d_b, unsigned char * src, int width, int height);
|
||||||
|
|
||||||
|
|
||||||
|
/* Copy a 8bit source image data into a color compoment of type T */
|
||||||
|
//template<typename T>
|
||||||
|
//void bwToComponent(T *d_c, unsigned char * src, int width, int height);
|
||||||
|
|
||||||
|
#endif
|
95
tests/opencl/dwt2d/dwt.cc
Normal file
95
tests/opencl/dwt2d/dwt.cc
Normal file
|
@ -0,0 +1,95 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <fcntl.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <error.h>
|
||||||
|
/*#include "dwt_cuda/dwt.h"
|
||||||
|
#include "dwt_cuda/common.h"
|
||||||
|
#include "dwt.h"
|
||||||
|
#include "common.h"
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "dwt.h"
|
||||||
|
#include "common.h"
|
||||||
|
/*
|
||||||
|
inline void fdwt(float *in, float *out, int width, int height, int levels)
|
||||||
|
{
|
||||||
|
dwt_cuda::fdwt97(in, out, width, height, levels);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void fdwt(int *in, int *out, int width, int height, int levels)
|
||||||
|
{
|
||||||
|
dwt_cuda::fdwt53(in, out, width, height, levels);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void rdwt(float *in, float *out, int width, int height, int levels)
|
||||||
|
{
|
||||||
|
dwt_cuda::rdwt97(in, out, width, height, levels);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline void rdwt(int *in, int *out, int width, int height, int levels)
|
||||||
|
{
|
||||||
|
dwt_cuda::rdwt53(in, out, width, height, levels);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
template<typename T>
|
||||||
|
int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
|
||||||
|
{
|
||||||
|
//need add segments
|
||||||
|
}
|
||||||
|
template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
|
||||||
|
template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
|
||||||
|
|
||||||
|
void samplesToChar(unsigned char * dst, float * src, int samplesNum)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for(i = 0; i < samplesNum; i++) {
|
||||||
|
float r = (src[i]+0.5f) * 255;
|
||||||
|
if (r > 255) r = 255;
|
||||||
|
if (r < 0) r = 0;
|
||||||
|
dst[i] = (unsigned char)r;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void samplesToChar(unsigned char * dst, int * src, int samplesNum)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for(i = 0; i < samplesNum; i++) {
|
||||||
|
int r = src[i]+128;
|
||||||
|
if (r > 255) r = 255;
|
||||||
|
if (r < 0) r = 0;
|
||||||
|
dst[i] = (unsigned char)r;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
///* Write output linear orderd*/
|
||||||
|
template<typename T>
|
||||||
|
int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
|
||||||
|
const char * filename, const char * suffix)
|
||||||
|
{
|
||||||
|
//need add segments
|
||||||
|
|
||||||
|
}
|
||||||
|
template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
|
||||||
|
template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* Write output visual ordered */
|
||||||
|
template<typename T>
|
||||||
|
int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight,
|
||||||
|
int stages, const char * filename, const char * suffix)
|
||||||
|
{
|
||||||
|
//need add segments
|
||||||
|
}
|
||||||
|
template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
|
||||||
|
template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
15
tests/opencl/dwt2d/dwt.h
Normal file
15
tests/opencl/dwt2d/dwt.h
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
|
||||||
|
#ifndef _DWT_H
|
||||||
|
#define _DWT_H
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward);
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
int writeNStage2DDWT(T *component_cuda, int width, int height,
|
||||||
|
int stages, const char * filename, const char * suffix);
|
||||||
|
template<typename T>
|
||||||
|
int writeLinear(T *component_cuda, int width, int height,
|
||||||
|
const char * filename, const char * suffix);
|
||||||
|
|
||||||
|
#endif
|
BIN
tests/opencl/dwt2d/dwt2d
Executable file
BIN
tests/opencl/dwt2d/dwt2d
Executable file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_data
Executable file
BIN
tests/opencl/dwt2d/dwt2d_data
Executable file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/._192.bmp
Executable file
BIN
tests/opencl/dwt2d/dwt2d_inputs/._192.bmp
Executable file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/._inputGen
Executable file
BIN
tests/opencl/dwt2d/dwt2d_inputs/._inputGen
Executable file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/._rgb.bmp
Executable file
BIN
tests/opencl/dwt2d/dwt2d_inputs/._rgb.bmp
Executable file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._16.bmp
Normal file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._16.bmp
Normal file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._192.bmp
Normal file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._192.bmp
Normal file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._4.bmp
Normal file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._4.bmp
Normal file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._64.bmp
Normal file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._64.bmp
Normal file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._8.bmp
Normal file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._8.bmp
Normal file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._bmp.cpp
Executable file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._bmp.cpp
Executable file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._bmp.h
Executable file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._bmp.h
Executable file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._creat.out
Executable file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._creat.out
Executable file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._create.cpp
Executable file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._create.cpp
Executable file
Binary file not shown.
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._create.out
Executable file
BIN
tests/opencl/dwt2d/dwt2d_inputs/inputGen/._create.out
Executable file
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue