add benchmakrs

2025-04-25 14:27:32 -04:00 · 2024-04-23 00:12:51 -04:00 · 2024-04-23 00:12:51 -04:00 · ea26d69751
commit ea26d69751
parent 7ee20fca2b
349 changed files with 190817 additions and 0 deletions
--- a/tests/opencl/b+tree/.gitignore
+++ b/tests/opencl/b+tree/.gitignore
@ -0,0 +1 @@
+*.txt
--- a/tests/opencl/b+tree/Makefile
+++ b/tests/opencl/b+tree/Makefile
@ -0,0 +1,7 @@
+PROJECT = btree
+
+SRCS = main.cc kernel_gpu_opencl_wrapper.cc kernel_gpu_opencl_wrapper_2.cc num.cc opencl.cc timer.cc timing.cpp
+
+OPTS ?= file mil.txt command command.txt 
+
+include common.mk
--- a/tests/opencl/b+tree/Makefile_old
+++ b/tests/opencl/b+tree/Makefile_old
@ -0,0 +1,159 @@
+include ../../common/make.config
+
+#Can be changed by `make TYPE=CPU`
+TYPE = GPU
+
+#Library
+ifeq ($(TYPE),GPU)
+	OPENCL_INC = $(NV_OPENCL_INC)
+	OPENCL_LIB = $(NV_OPENCL_LIB)
+else
+	OPENCL_INC = $(INTEL_OPENCL_INC)
+	OPENCL_LIB = $(INTEL_OPENCL_LIB)
+endif
+
+C_C = gcc
+OCL_LIB = -lOpenCL
+
+# ====================================================================================================100
+#	NVCC
+# ====================================================================================================100
+
+# CUD_C = nvcc
+# OMP_FLAG = 	-Xcompiler paste_one_here
+
+# ====================================================================================================100
+#	PGCC
+# ====================================================================================================100
+
+# C_C = pgcc
+# OMP_FLAG = -mp
+# ACC_FLAG = 	-ta=nvidia \
+			# -Minfo \
+			# -Mbounds
+
+# ====================================================================================================100
+#	END
+# ====================================================================================================100
+
+# ========================================================================================================================================================================================================200
+#	EXECUTABLES (LINK OBJECTS TOGETHER INTO BINARY)
+# ========================================================================================================================================================================================================200
+
+b+tree.out:	./main.o \
+		./kernel/kernel_gpu_opencl_wrapper.o \
+		./kernel/kernel_gpu_opencl_wrapper_2.o \
+		./util/timer/timer.o \
+		./util/num/num.o \
+		./util/opencl/opencl.o \
+        ./timing.o
+	$(C_C)	./main.o \
+			./kernel/kernel_gpu_opencl_wrapper.o \
+			./kernel/kernel_gpu_opencl_wrapper_2.o \
+			./util/timer/timer.o \
+			./util/num/num.o \
+			./util/opencl/opencl.o \
+            ./timing.o \
+			-lm \
+			-L$(OPENCL_LIB) \
+			-lOpenCL \
+	                -o b+tree.out
+
+run: b+tree.out
+	./b+tree.out file ../../data/b+tree/mil.txt command ../../data/b+tree/command.txt -p 0 -d 0
+
+# ========================================================================================================================================================================================================200
+#	OBJECTS (COMPILE SOURCE FILES INTO OBJECTS)
+# ========================================================================================================================================================================================================200
+
+# ======================================================================================================================================================150
+#	MAIN FUNCTION
+# ======================================================================================================================================================150
+
+main.o:	./common.h \
+		./main.h \
+		./main.c
+	$(C_C)	$(KERNEL_DIM) ./main.c \
+			-c \
+		    -I$(OPENCL_INC) \
+			-o ./main.o \
+			-Wno-unused-result \
+			-O3
+
+# ======================================================================================================================================================150
+#	KERNELS
+# ======================================================================================================================================================150
+
+./kernel/kernel_gpu_opencl_wrapper.o:	./common.h \
+										./kernel/kernel_gpu_opencl_wrapper.h \
+										./kernel/kernel_gpu_opencl_wrapper.c
+	$(C_C)	$(KERNEL_DIM) ./kernel/kernel_gpu_opencl_wrapper.c \
+			-c \
+			-o ./kernel/kernel_gpu_opencl_wrapper.o \
+			-O3 \
+			-I../util \
+			-DTIMING \
+			-I$(OPENCL_INC)
+
+./kernel/kernel_gpu_opencl_wrapper_2.o:	./common.h \
+										./kernel/kernel_gpu_opencl_wrapper_2.h \
+										./kernel/kernel_gpu_opencl_wrapper_2.c
+	$(C_C)	$(KERNEL_DIM) ./kernel/kernel_gpu_opencl_wrapper_2.c \
+			-c \
+			-o ./kernel/kernel_gpu_opencl_wrapper_2.o \
+			-O3 \
+			-I../util \
+			-DTIMING \
+			-I$(OPENCL_INC)
+
+timing.o: ../util/timing.h ../util/timing.c
+	$(C_C) ../util/timing.c -I../util -c -o timing.o -I$(OPENCL_INC)
+
+
+# ======================================================================================================================================================150
+#	UTILITIES
+# ======================================================================================================================================================150
+
+./util/timer/timer.o:	./common.h \
+						./util/timer/timer.h \
+						./util/timer/timer.c
+	$(C_C)	./util/timer/timer.c \
+			-c \
+			-o ./util/timer/timer.o \
+			-O3
+
+./util/num/num.o:	./common.h \
+					./util/num/num.h \
+					./util/num/num.c
+	$(C_C)	./util/num/num.c \
+			-c \
+			-o ./util/num/num.o \
+			-O3
+
+./util/opencl/opencl.o:	./common.h \
+						./util/opencl/opencl.h \
+						./util/opencl/opencl.c
+	$(C_C)	./util/opencl/opencl.c \
+			-c \
+			-o ./util/opencl/opencl.o \
+			-O3 \
+			-I$(OPENCL_INC)
+
+# ======================================================================================================================================================150
+#	END
+# ======================================================================================================================================================150
+
+# ========================================================================================================================================================================================================200
+#	DELETE
+# ========================================================================================================================================================================================================200
+
+clean:
+	rm	*.o *.out \
+		./kernel/*.o \
+		./util/timer/*.o \
+		./util/num/*.o \
+		./util/opencl/*.o \
+                output.txt 2>/dev/null || true
+# ========================================================================================================================================================================================================200
+#	END
+# ========================================================================================================================================================================================================200
--- a/tests/opencl/b+tree/README
+++ b/tests/opencl/b+tree/README
@ -0,0 +1,7 @@
+******Adjustable work group size*****
+The kernel 1: RD_WG_SIZE_0_0 RD_WG_SIZE_0
+The kernel 2: RD_WG_SIZE_1_0 RD_WG_SIZE_1
+
+USAGE:
+make clean
+make KERNEL_DIM="-DRD_WG_SIZE_0=256 -DRD_WG_SIZE_1=256"
--- a/tests/opencl/b+tree/btree
+++ b/tests/opencl/b+tree/btree
--- a/tests/opencl/b+tree/common.h
+++ b/tests/opencl/b+tree/common.h
@ -0,0 +1,476 @@
+// # ifdef __cplusplus
+// extern "C" {
+// # endif
+
+#ifndef LIST_H
+# define LIST_H
+
+//===============================================================================================================================================================================================================200
+//	DEFINE/INCLUDE
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	INCLUDE (for some reason these are not recognized when defined in main file before this one is included)
+//======================================================================================================================================================150
+
+#include <stdint.h>					// (in path known to compiler)			needed by uint32_t
+#include <stdbool.h>				// (in path known to compiler)			needed by true/false, bool
+#include <stdlib.h>					// (in path known to compiler)			needed by malloc
+#include <stdio.h>
+
+//======================================================================================================================================================150
+//	DEFINE
+//======================================================================================================================================================150
+
+#define fp float
+
+#define Version "1.5"
+
+#ifdef WINDOWS
+	#define bool char
+	#define false 0
+	#define true 1
+#endif
+
+/* #define DEFAULT_ORDER 256 */
+
+#ifdef RD_WG_SIZE_0_0
+        #define  DEFAULT_ORDER RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+        #define  DEFAULT_ORDER RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+        #define  DEFAULT_ORDER RD_WG_SIZE
+#else
+        #define  DEFAULT_ORDER 256
+#endif
+
+#ifdef RD_WG_SIZE_1_0
+        #define  DEFAULT_ORDER_2 RD_WG_SIZE_1_0
+#elif defined(RD_WG_SIZE_1)
+        #define  DEFAULT_ORDER_2 RD_WG_SIZE_1
+#elif defined(RD_WG_SIZE)
+        #define  DEFAULT_ORDER_2 RD_WG_SIZE
+#else
+        #define  DEFAULT_ORDER_2 256
+#endif
+
+
+#define malloc(size) ({                                                   \
+  void *_tmp;                                                             \
+                                                                          \
+  if (!(_tmp = malloc(size))) {                                           \
+    fprintf(stderr, "Allocation failed at %s:%d!\n", __FILE__, __LINE__); \
+    exit(-1);                                                             \
+  }                                                                       \
+                                                                          \
+  _tmp;                                                                   \
+})
+
+//======================================================================================================================================================150
+//	STRUCTURES
+//======================================================================================================================================================150
+
+// struct list_item;
+typedef struct list_item list_item_t;
+
+typedef struct list_t {
+  list_item_t *head, *tail;
+  uint32_t length;
+  int32_t (*compare)(const void *key, const void *with);
+  void (*datum_delete)(void *);
+} list_t;
+
+typedef list_item_t *list_iterator_t;
+typedef list_item_t *list_reverse_iterator_t;
+
+/* Type representing the record
+* to which a given key refers.
+* In a real B+ tree system, the
+* record would hold data (in a database)
+* or a file (in an operating system)
+* or some other information.
+* Users can rewrite this part of the code
+* to change the type and content
+* of the value field.
+*/
+typedef struct record {
+	int value;
+} record;
+
+/* Type representing a node in the B+ tree.
+* This type is general enough to serve for both
+* the leaf and the internal node.
+* The heart of the node is the array
+* of keys and the array of corresponding
+* pointers.  The relation between keys
+* and pointers differs between leaves and
+* internal nodes.  In a leaf, the index
+* of each key equals the index of its corresponding
+* pointer, with a maximum of order - 1 key-pointer
+* pairs.  The last pointer points to the
+* leaf to the right (or NULL in the case
+* of the rightmost leaf).
+* In an internal node, the first pointer
+* refers to lower nodes with keys less than
+* the smallest key in the keys array.  Then,
+* with indices i starting at 0, the pointer
+* at i + 1 points to the subtree with keys
+* greater than or equal to the key in this
+* node at index i.
+* The num_keys field is used to keep
+* track of the number of valid keys.
+* In an internal node, the number of valid
+* pointers is always num_keys + 1.
+* In a leaf, the number of valid pointers
+* to data is always num_keys.  The
+* last leaf pointer points to the next leaf.
+*/
+typedef struct node {
+	void ** pointers;
+	int * keys;
+	struct node * parent;
+	bool is_leaf;
+	int num_keys;
+	struct node * next; // Used for queue.
+} node;
+
+// 
+typedef struct knode {
+	int location;
+	int indices [DEFAULT_ORDER + 1];
+	int  keys [DEFAULT_ORDER + 1];
+	bool is_leaf;
+	int num_keys;
+} knode; 
+
+struct list_item {
+  struct list_item *pred, *next;
+  void *datum;
+};
+
+//===============================================================================================================================================================================================================200
+//	PROTOTYPES
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+// Other
+//======================================================================================================================================================150
+
+void 
+list_item_init(	list_item_t *li, 
+				void *datum);
+
+void 
+list_item_delete(	list_item_t *li, 
+					void (*datum_delete)(void *datum));
+
+void 
+list_insert_item_tail(	list_t *l, 
+						list_item_t *i);
+
+void 
+list_insert_item_before(list_t *l, 
+						list_item_t *next, 
+						list_item_t *i);
+
+void 
+list_insert_item_after(	list_t *l, 
+						list_item_t *pred, 
+						list_item_t *i);
+
+void 
+list_insert_item_sorted(list_t *l, 
+						list_item_t *i);
+
+//======================================================================================================================================================150
+// ???
+//======================================================================================================================================================150
+
+void 
+list_init(	list_t *l,
+			int32_t (*compare)(const void *key, const void *with),
+			void (*datum_delete)(void *datum));
+
+void 
+list_delete(list_t *l);
+
+void 
+list_reset(list_t *l);
+
+void 
+list_insert_head(	list_t *l, 
+					void *v);
+
+void 
+list_insert_tail(	list_t *l, 
+					void *v);
+
+void 
+list_insert_before(list_t *l, 
+					list_item_t *next, 
+					void *v);
+
+void 
+list_insert_after(	list_t *l, 
+					list_item_t *pred, 
+					void *v);
+
+void 
+list_insert_sorted(	list_t *l, 
+					void *v);
+
+void 
+list_insert_item_head(	list_t *l, 
+						list_item_t *i);
+
+void 
+list_remove_item(	list_t *l, 
+					list_item_t *i);
+
+void 
+list_remove_head(list_t *l);
+
+void 
+list_remove_tail(list_t *l);
+
+list_item_t *
+list_find_item(	list_t *l, 
+				void *datum);
+
+list_item_t *
+list_get_head_item(list_t *l);
+
+list_item_t *
+list_get_tail_item(list_t *l);
+
+void *
+list_find(	list_t *l, 
+			void *datum);
+
+void *
+list_get_head(list_t *l);
+
+void *
+list_get_tail(list_t *l);
+
+uint32_t 
+list_get_length(list_t *l);
+
+bool 
+list_is_empty(list_t *l);
+
+bool 
+list_not_empty(list_t *l);
+
+void 
+list_visit_items(	list_t *l, 
+					void (*visitor)(void *v));
+
+void *
+list_item_get_datum(list_item_t *li);
+
+void 
+list_iterator_init(	list_t *l, 
+					list_iterator_t *li);
+
+void 
+list_iterator_delete(list_iterator_t *li);
+
+void 
+list_iterator_next(list_iterator_t *li);
+
+void 
+list_iterator_prev(list_iterator_t *li);
+
+void *
+list_iterator_get_datum(list_iterator_t *li);
+
+bool 
+list_iterator_is_valid(list_iterator_t *li);
+
+void 
+list_reverse_iterator_init(	list_t *l, 
+							list_iterator_t *li);
+
+void 
+list_reverse_iterator_delete(list_iterator_t *li);
+
+void 
+list_reverse_iterator_next(list_iterator_t *li);
+
+void 
+list_reverse_iterator_prev(list_iterator_t *li);
+
+void *
+list_reverse_iterator_get_datum(list_iterator_t *li);
+
+bool 
+list_reverse_iterator_is_valid(list_reverse_iterator_t *li);
+
+//======================================================================================================================================================150
+// Output and utility
+//======================================================================================================================================================150
+
+void *
+kmalloc(int size);
+
+long 
+transform_to_cuda(	node *n, 
+					bool verbose); //returns actual mem used in a long
+
+void 
+usage_1( void );
+
+void 
+usage_2( void );
+
+void 
+enqueue( node * new_node );
+
+node * 
+dequeue( void );
+
+int 
+height( node * root );
+
+int 
+path_to_root(	node * root, 
+				node * child );
+
+void 
+print_leaves( node * root );
+
+void 
+print_tree( node * root );
+
+node * 
+find_leaf(	node * root, 
+			int key, 
+			bool verbose );
+
+record * 
+find(	node * root, 
+		int key, 
+		bool verbose );
+
+int 
+cut( int length );
+
+//======================================================================================================================================================150
+// Insertion
+//======================================================================================================================================================150
+
+record * 
+make_record(int value);
+
+node * 
+make_node( void );
+
+node * 
+make_leaf( void );
+
+int 
+get_left_index(	node * parent, 
+				node * left);
+
+node * 
+insert_into_leaf(	node * leaf, 
+					int key, record * pointer );
+
+node * 
+insert_into_leaf_after_splitting(	node * root, 
+									node * leaf, 
+									int key, 
+									record * pointer);
+
+node * 
+insert_into_node(	node * root, 
+					node * parent, 
+					int left_index, 
+					int key, 
+					node * right);
+
+node * 
+insert_into_node_after_splitting(	node * root, 
+									node * parent, 
+									int left_index, 
+									int key, 
+									node * right);
+
+node * 
+insert_into_parent(	node * root, 
+					node * left, 
+					int key, 
+					node * right);
+
+node * 
+insert_into_new_root(	node * left, 
+						int key, 
+						node * right);
+
+node * 
+start_new_tree(	int key, 
+				record * pointer);
+
+node * 
+insert(	node * root, 
+		int key, 
+		int value );
+
+//======================================================================================================================================================150
+// Deletion
+//======================================================================================================================================================150
+
+int 
+get_neighbor_index(node * n );
+
+node * 
+adjust_root(node * root);
+
+node * 
+coalesce_nodes(	node * root, 
+				node * n, 
+				node * neighbor, 
+				int neighbor_index, 
+				int k_prime);
+
+node * 
+redistribute_nodes(	node * root, 
+					node * n, 
+					node * neighbor, 
+					int neighbor_index, 
+					int k_prime_index, 
+					int k_prime);
+
+node * 
+delete_entry(	node * root, 
+				node * n, 
+				int key, 
+				void * pointer );
+
+node * 
+deleteVal(	node * root, 
+			int key );
+
+//===============================================================================================================================================================================================================200
+//	HEADER
+//===============================================================================================================================================================================================================200
+
+extern int platform_id_inuse;
+extern int device_id_inuse;
+extern cl_device_type device_type;
+
+
+
+// int main(	int argc, 
+			// char *argv []);
+
+//===============================================================================================================================================================================================================200
+//	END
+//===============================================================================================================================================================================================================200
+
+// #endif
+
+// # ifdef __cplusplus
+// }
+# endif
--- a/tests/opencl/b+tree/common.mk
+++ b/tests/opencl/b+tree/common.mk
@ -0,0 +1,126 @@
+XLEN ?= 32
+TOOLDIR ?= /opt
+
+TARGET ?= opaesim
+
+XRT_SYN_DIR  ?= ../../../hw/syn/xilinx/xrt
+XRT_DEVICE_INDEX ?= 0
+
+ifeq ($(XLEN),64)
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
+VX_CFLAGS += -march=rv64imafd -mabi=lp64d
+K_CFLAGS += -march=rv64imafd -mabi=ilp64d
+STARTUP_ADDR ?= 0x180000000
+else
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
+VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
+K_CFLAGS += -march=rv32imaf -mabi=ilp32f
+STARTUP_ADDR ?= 0x80000000
+endif
+
+RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
+RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
+
+POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
+POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
+
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+VORTEX_KN_PATH ?= $(realpath ../../../kernel)
+
+FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
+
+LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
+LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
+
+K_CFLAGS   += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
+K_CFLAGS   += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
+K_CFLAGS   += -I$(VORTEX_KN_PATH)/include -DNDEBUG -DLLVM_VOTEX
+K_LDFLAGS  += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a -lm
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
+CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing -Wno-unused-result
+CXXFLAGS += -pthread
+CXXFLAGS += -I$(POCL_RT_PATH)/include
+
+ifdef HOSTGPU
+	CXXFLAGS += -DHOSTGPU
+	LDFLAGS += -lOpenCL
+else
+	LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
+endif
+
+# Debugigng
+#ifdef DEBUG
+	CXXFLAGS += -g -O0
+#else    
+#	CXXFLAGS += -O2 -DNDEBUG
+#endif
+
+ifeq ($(TARGET), fpga)
+	OPAE_DRV_PATHS ?= libopae-c.so
+else
+ifeq ($(TARGET), asesim)
+	OPAE_DRV_PATHS ?= libopae-c-ase.so
+else
+ifeq ($(TARGET), opaesim)
+	OPAE_DRV_PATHS ?= libopae-c-sim.so
+endif	
+endif
+endif
+
+OBJS := $(addsuffix .o, $(notdir $(SRCS)))
+
+all: $(PROJECT) kernel.pocl kernel2.pocl
+ 
+kernel.pocl: kernel_gpu_opencl.cl  
+	LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel_gpu_opencl.cl 
+
+kernel2.pocl: kernel_gpu_opencl_2.cl  
+	LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel2.pocl kernel_gpu_opencl_2.cl
+
+#backprop.o: backprop.c
+#	$(CXX) $(CXXFLAGS) backprop.c -c -Wno-unused-result
+	
+%.cc.o: %.cc
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+%.cpp.o: %.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+%.c.o: %.c
+	$(CC) $(CXXFLAGS) -c $< -o $@
+
+$(PROJECT): $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run-hostgpu: $(PROJECT) kernel.pocl
+	./$(PROJECT) $(OPTS)
+
+run-simx: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-rtlsim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-opae: $(PROJECT) kernel.pocl
+	SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-xrt: $(PROJECT) kernel.pocl
+ifeq ($(TARGET), hw)
+	SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+else
+	XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)	
+endif
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf $(PROJECT) *.o .depend
+
+clean-all: clean
+	rm -rf *.dump *.pocl
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
--- a/tests/opencl/b+tree/kernel_gpu_opencl.cl
+++ b/tests/opencl/b+tree/kernel_gpu_opencl.cl
@ -0,0 +1,109 @@
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+//========================================================================================================================================================================================================200
+//	DEFINE/INCLUDE
+//========================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	DEFINE
+//======================================================================================================================================================150
+
+// double precision support (switch between as needed for NVIDIA/AMD)
+#ifdef AMDAPP
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+// clBuildProgram compiler cannot link this file for some reason, so had to redefine constants and structures below
+//#include ../common.h						// (in directory specified to compiler)			main function header
+
+//======================================================================================================================================================150
+//	DEFINE (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
+//======================================================================================================================================================150
+
+// change to double if double precision needed
+#define fp float
+
+#define DEFAULT_ORDER 256
+
+//======================================================================================================================================================150
+//	STRUCTURES (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
+//======================================================================================================================================================150
+
+// Type representing the record to which a given key refers. In a real B+ tree system, the record would hold data (in a database) or a file (in an operating system) or some other information.
+// Users can rewrite this part of the code to change the type and content of the value field.
+typedef struct record {
+	int value;
+} record;
+
+// ???
+typedef struct knode {
+	int location;
+	int indices [DEFAULT_ORDER + 1];
+	int  keys [DEFAULT_ORDER + 1];
+	bool is_leaf;
+	int num_keys;
+} knode; 
+
+//========================================================================================================================================================================================================200
+//	findK function
+//========================================================================================================================================================================================================200
+
+__kernel void 
+findK(	long height,
+		__global knode *knodesD,
+		long knodes_elem,
+		__global record *recordsD,
+
+		__global long *currKnodeD,
+		__global long *offsetD,
+		__global int *keysD, 
+		__global record *ansD)
+{
+
+	// private thread IDs
+	int thid = get_local_id(0);
+	int bid = get_group_id(0);
+
+	// processtree levels
+	int i;
+	for(i = 0; i < height; i++){
+
+		// if value is between the two keys
+		if((knodesD[currKnodeD[bid]].keys[thid]) <= keysD[bid] && (knodesD[currKnodeD[bid]].keys[thid+1] > keysD[bid])){
+			// this conditional statement is inserted to avoid crush due to but in original code
+			// "offset[bid]" calculated below that addresses knodes[] in the next iteration goes outside of its bounds cause segmentation fault
+			// more specifically, values saved into knodes->indices in the main function are out of bounds of knodes that they address
+			if(knodesD[offsetD[bid]].indices[thid] < knodes_elem){
+				offsetD[bid] = knodesD[offsetD[bid]].indices[thid];
+			}
+		}
+		//__syncthreads();
+		barrier(CLK_LOCAL_MEM_FENCE);
+		// set for next tree level
+		if(thid==0){
+			currKnodeD[bid] = offsetD[bid];
+		}
+		//__syncthreads();
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+	}
+
+	//At this point, we have a candidate leaf node which may contain
+	//the target record.  Check each key to hopefully find the record
+	if(knodesD[currKnodeD[bid]].keys[thid] == keysD[bid]){
+		ansD[bid].value = recordsD[knodesD[currKnodeD[bid]].indices[thid]].value;
+	}
+
+}
+
+//========================================================================================================================================================================================================200
+//	End
+//========================================================================================================================================================================================================200
+
+// #ifdef __cplusplus
+// }
+// #endif
--- a/tests/opencl/b+tree/kernel_gpu_opencl_2.cl
+++ b/tests/opencl/b+tree/kernel_gpu_opencl_2.cl
@ -0,0 +1,111 @@
+//========================================================================================================================================================================================================200
+//	DEFINE/INCLUDE
+//========================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	DEFINE
+//======================================================================================================================================================150
+
+// double precision support (switch between as needed for NVIDIA/AMD)
+#ifdef AMDAPP
+#pragma OPENCL EXTENSION cl_amd_fp64 : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#endif
+
+// clBuildProgram compiler cannot link this file for some reason, so had to redefine constants and structures below
+// #include ../common.h						// (in directory specified to compiler)			main function header
+
+//======================================================================================================================================================150
+//	DEFINE (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
+//======================================================================================================================================================150
+
+// change to double if double precision needed
+#define fp float
+
+#define DEFAULT_ORDER_2 256
+
+//======================================================================================================================================================150
+//	STRUCTURES (had to bring from ../common.h here because feature of including headers in clBuildProgram does not work for some reason)
+//======================================================================================================================================================150
+
+// ???
+typedef struct knode {
+	int location;
+	int indices [DEFAULT_ORDER_2 + 1];
+	int  keys [DEFAULT_ORDER_2 + 1];
+	bool is_leaf;
+	int num_keys;
+} knode; 
+
+//========================================================================================================================================================================================================200
+//	findRangeK function
+//========================================================================================================================================================================================================200
+
+__kernel void 
+findRangeK(	long height,
+			__global knode *knodesD,
+			long knodes_elem,
+
+			__global long *currKnodeD,
+			__global long *offsetD,
+			__global long *lastKnodeD,
+			__global long *offset_2D,
+			__global int *startD,
+			__global int *endD,
+			__global int *RecstartD, 
+			__global int *ReclenD)
+{
+
+	// private thread IDs
+	int thid = get_local_id(0);
+	int bid = get_group_id(0);
+
+	// ???
+	int i;
+	for(i = 0; i < height; i++){
+
+		if((knodesD[currKnodeD[bid]].keys[thid] <= startD[bid]) && (knodesD[currKnodeD[bid]].keys[thid+1] > startD[bid])){
+			// this conditional statement is inserted to avoid crush due to but in original code
+			// "offset[bid]" calculated below that later addresses part of knodes goes outside of its bounds cause segmentation fault
+			// more specifically, values saved into knodes->indices in the main function are out of bounds of knodes that they address
+			if(knodesD[currKnodeD[bid]].indices[thid] < knodes_elem){
+				offsetD[bid] = knodesD[currKnodeD[bid]].indices[thid];
+			}
+		}
+		if((knodesD[lastKnodeD[bid]].keys[thid] <= endD[bid]) && (knodesD[lastKnodeD[bid]].keys[thid+1] > endD[bid])){
+			// this conditional statement is inserted to avoid crush due to but in original code
+			// "offset_2[bid]" calculated below that later addresses part of knodes goes outside of its bounds cause segmentation fault
+			// more specifically, values saved into knodes->indices in the main function are out of bounds of knodes that they address
+			if(knodesD[lastKnodeD[bid]].indices[thid] < knodes_elem){
+				offset_2D[bid] = knodesD[lastKnodeD[bid]].indices[thid];
+			}
+		}
+		//__syncthreads();
+		barrier(CLK_LOCAL_MEM_FENCE);
+		// set for next tree level
+		if(thid==0){
+			currKnodeD[bid] = offsetD[bid];
+			lastKnodeD[bid] = offset_2D[bid];
+		}
+		//	__syncthreads();
+		barrier(CLK_LOCAL_MEM_FENCE);
+	}
+
+	// Find the index of the starting record
+	if(knodesD[currKnodeD[bid]].keys[thid] == startD[bid]){
+		RecstartD[bid] = knodesD[currKnodeD[bid]].indices[thid];
+	}
+	//	__syncthreads();
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+	// Find the index of the ending record
+	if(knodesD[lastKnodeD[bid]].keys[thid] == endD[bid]){
+		ReclenD[bid] = knodesD[lastKnodeD[bid]].indices[thid] - RecstartD[bid]+1;
+	}
+
+}
+
+//========================================================================================================================================================================================================200
+//	End
+//========================================================================================================================================================================================================200
--- a/tests/opencl/b+tree/kernel_gpu_opencl_wrapper.cc
+++ b/tests/opencl/b+tree/kernel_gpu_opencl_wrapper.cc
@ -0,0 +1,753 @@
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+//========================================================================================================================================================================================================200
+//	DEFINE/INCLUDE
+//========================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	LIBRARIES
+//======================================================================================================================================================150
+
+#include <CL/cl.h>									// (in directory provided to compiler)		needed by OpenCL types and functions
+#include <string.h>									// (in directory known to compiler)			needed by memset
+
+//======================================================================================================================================================150
+//	COMMON
+//======================================================================================================================================================150
+
+#include "common.h"								// (in directory provided here)
+
+//======================================================================================================================================================150
+//	UTILITIES
+//======================================================================================================================================================150
+
+#include "opencl.h"					// (in directory provided here)
+#include "timer.h"					// (in directory provided here)
+#ifdef TIMING
+#include "timing.h"
+#endif
+
+//======================================================================================================================================================150
+//	HEADER
+//======================================================================================================================================================150
+
+#include "kernel_gpu_opencl_wrapper.h"			// (in directory provided here)
+
+int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp_ = fopen(filename, "r");
+  if (NULL == fp_) {
+    fprintf(stderr, "Failed to load kernel. %s\n", filename);
+    return -1;
+  }
+  fseek(fp_ , 0 , SEEK_END);
+  long fsize = ftell(fp_);
+  rewind(fp_);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp_);
+  
+  fclose(fp_);
+  
+  return 0;
+}
+
+
+//========================================================================================================================================================================================================200
+//	KERNEL_GPU_CUDA_WRAPPER FUNCTION
+//========================================================================================================================================================================================================200
+void 
+kernel_gpu_opencl_wrapper(	record *records,
+							long records_mem,
+							knode *knodes,
+							long knodes_elem,
+							long knodes_mem,
+
+							int order,
+							long maxheight,
+							int count,
+
+							long *currKnode,
+							long *offset,
+							int *keys,
+							record *ans)
+{
+  
+  cl_device_type device_type = CL_DEVICE_TYPE_GPU;
+  int platform_id_inuse = 0;            // platform id in use (default: 0)
+  int device_id_inuse = 0;            // platform id in use (default: 0)
+	//======================================================================================================================================================150
+	//	CPU VARIABLES
+	//======================================================================================================================================================150
+
+#ifdef TIMING
+	struct timeval tv;
+	struct timeval tv_total_start, tv_total_end;
+	struct timeval tv_init_end;
+	struct timeval tv_h2d_start, tv_h2d_end;
+	struct timeval tv_d2h_start, tv_d2h_end;
+	struct timeval tv_kernel_start, tv_kernel_end;
+	struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
+	struct timeval tv_close_start, tv_close_end;
+	float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time= 0,
+		  d2h_time = 0, close_time = 0, total_time = 0;
+#endif
+
+#ifdef  TIMING
+    gettimeofday(&tv_total_start, NULL);
+#endif
+
+	//======================================================================================================================================================150
+	//	GPU SETUP
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	INITIAL DRIVER OVERHEAD
+	//====================================================================================================100
+
+	// cudaThreadSynchronize();
+
+	//====================================================================================================100
+	//	COMMON VARIABLES
+	//====================================================================================================100
+
+	// common variables
+	cl_int error;
+
+	//====================================================================================================100
+	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
+	//====================================================================================================100
+
+	// Get the number of available platforms
+	cl_uint num_platforms;
+	error = clGetPlatformIDs(	0, 
+								NULL, 
+								&num_platforms);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// Get the list of available platforms
+	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
+	error = clGetPlatformIDs(	num_platforms, 
+								platforms, 
+								NULL);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// Select the 1st platform
+	cl_platform_id platform = platforms[platform_id_inuse];
+
+	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
+	char pbuf[100];
+	error = clGetPlatformInfo(	platform, 
+								CL_PLATFORM_VENDOR, 
+								sizeof(pbuf), 
+								pbuf, 
+								NULL);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+	printf("Platform: %s\n", pbuf);
+
+	//====================================================================================================100
+	//	GET DEVICE INFORMATION
+	//====================================================================================================100
+
+	cl_uint devices_size;
+    error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &devices_size);
+	if (error != CL_SUCCESS)
+		fatal_CL(error, __LINE__);
+    if (devices_size == 0) {
+        printf("There are no devices for Platform %d\n", platform_id_inuse);
+        exit(0);
+    }
+    printf("Device num: %u\n", devices_size);
+    // Get the list of devices (previousely selected for the context)
+    cl_device_id *devices = (cl_device_id *) malloc(devices_size);
+    error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices_size,
+            devices, NULL);
+    if (error != CL_SUCCESS) 
+        fatal_CL(error, __LINE__);
+
+	// Select the device
+	cl_device_id device;
+	device = devices[device_id_inuse];
+
+	// Check device type
+	error = clGetDeviceInfo(device, CL_DEVICE_TYPE,
+            sizeof(device_type), (void *)&device_type, NULL);
+	if (error != CL_SUCCESS)
+		fatal_CL(error, __LINE__);
+	if(device_type == CL_DEVICE_TYPE_GPU)
+	    printf("Creating GPU Context\n");
+	else if (device_type == CL_DEVICE_TYPE_CPU)
+        printf("Creating CPU Context\n");
+	else
+        printf("This Context Type Not Supported\n");
+
+	// Get the name of the selected device
+	error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf),
+            pbuf, NULL);
+	if (error != CL_SUCCESS)
+		fatal_CL(error, __LINE__);
+	printf("Device: %s\n", pbuf);
+
+	//====================================================================================================100
+	//	CREATE CONTEXT FOR THE PLATFORM
+	//====================================================================================================100
+
+	// Create context properties for selected platform
+	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
+													(cl_context_properties) platform, 
+													0};
+
+	// Create context for selected platform being GPU
+	cl_context context;
+	context = clCreateContextFromType(	context_properties, 
+										device_type, 
+										NULL, 
+										NULL, 
+										&error);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+
+	//====================================================================================================100
+	//	CREATE COMMAND QUEUE FOR THE DEVICE
+	//====================================================================================================100
+
+	// Create a command queue
+	cl_command_queue command_queue;
+#ifdef TIMING
+	command_queue = clCreateCommandQueue(context, device,
+            CL_QUEUE_PROFILING_ENABLE, &error);
+#else
+	command_queue = clCreateCommandQueue(context, device, 0, &error);
+#endif
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//====================================================================================================100
+	//	CREATE PROGRAM, COMPILE IT
+	//====================================================================================================100
+
+	// Load kernel source code from file
+  /*
+	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl.cl");
+	size_t sourceSize = strlen(source);
+
+	// Create the program
+  cl_program program = clCreateProgramWithSource(	context, 
+													1, 
+													&source, 
+													&sourceSize, 
+													&error);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+  */
+      uint8_t *kernel_bin = NULL;
+    size_t kernel_size;
+    cl_int binary_status = 0;
+    if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+      std::abort();
+
+    cl_program program = clCreateProgramWithBinary(
+          context, 1, &devices[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &error);
+    free(kernel_bin);
+
+	char clOptions[110];
+	//  sprintf(clOptions,"-I../../src");                                                                                 
+	sprintf(clOptions,"-I./../");
+
+#ifdef DEFAULT_ORDER
+	sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER=%d", DEFAULT_ORDER);
+#endif
+
+  /*
+	// Compile the program
+	error = clBuildProgram(	program, 
+							1, 
+							&device, 
+							clOptions, 
+							NULL, 
+							NULL);
+  */
+	  error = clBuildProgram(program, 1, &devices[device_id_inuse], NULL, NULL, NULL);
+	  if(error != CL_SUCCESS) { 
+        printf("ERROR: clBuildProgram() => %d\n", error); return; }
+
+
+	// Print warnings and errors from compilation
+	static char log[65536]; 
+	memset(log, 0, sizeof(log));
+	clGetProgramBuildInfo(	program, 
+							device, 
+							CL_PROGRAM_BUILD_LOG, 
+							sizeof(log)-1, 
+							log, 
+							NULL);
+	printf("-----OpenCL Compiler Output-----\n");
+	if (strstr(log,"warning:") || strstr(log, "error:")) 
+		printf("<<<<\n%s\n>>>>\n", log);
+	printf("--------------------------------\n");
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// Create kernel
+	cl_kernel kernel;
+	kernel = clCreateKernel(program, 
+							"findK", 
+							&error);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef  TIMING
+	gettimeofday(&tv_init_end, NULL);
+	tvsub(&tv_init_end, &tv_total_start, &tv);
+	init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+	//====================================================================================================100
+	//	END
+	//====================================================================================================100
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY				(MALLOC)
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	DEVICE IN
+	//====================================================================================================100
+
+	//==================================================50
+	//	recordsD
+	//==================================================50
+
+	cl_mem recordsD;
+	recordsD = clCreateBuffer(	context, 
+								CL_MEM_READ_WRITE, 
+								records_mem, 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	knodesD
+	//==================================================50
+
+	cl_mem knodesD;
+	knodesD = clCreateBuffer(	context, 
+								CL_MEM_READ_WRITE, 
+								knodes_mem, 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	currKnodeD
+	//==================================================50
+
+	cl_mem currKnodeD;
+	currKnodeD = clCreateBuffer(	context, 
+								CL_MEM_READ_WRITE, 
+								count*sizeof(long), 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	offsetD
+	//==================================================50
+
+	cl_mem offsetD;
+	offsetD = clCreateBuffer(	context, 
+								CL_MEM_READ_WRITE, 
+								count*sizeof(long), 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	keysD
+	//==================================================50
+
+	cl_mem keysD;
+	keysD = clCreateBuffer(	context, 
+								CL_MEM_READ_WRITE, 
+								count*sizeof(long), 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef  TIMING
+	gettimeofday(&tv_mem_alloc_end, NULL);
+	tvsub(&tv_mem_alloc_end, &tv_init_end, &tv);
+	mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//====================================================================================================100
+	//	DEVICE IN/OUT
+	//====================================================================================================100
+
+	//==================================================50
+	//	ansD
+	//==================================================50
+
+	cl_mem ansD;
+	ansD = clCreateBuffer(	context, 
+								CL_MEM_READ_WRITE, 
+								count*sizeof(record), 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//====================================================================================================100
+	//	END
+	//====================================================================================================100
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY			COPY
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	GPU MEMORY				(MALLOC) COPY IN
+	//====================================================================================================100
+
+	//==================================================50
+	//	recordsD
+	//==================================================50
+
+    cl_event event;
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									recordsD,				// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									records_mem,			// size to be copied
+									records,				// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	knodesD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									knodesD,				// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									knodes_mem,				// size to be copied
+									knodes,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	currKnodeD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									currKnodeD,				// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(long),		// size to be copied
+									currKnode,				// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	offsetD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									offsetD,				// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(long),		// size to be copied
+									offset,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	keysD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									keysD,					// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(int),		// size to be copied
+									keys,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//====================================================================================================100
+	//	DEVICE IN/OUT
+	//====================================================================================================100
+
+	//==================================================50
+	//	ansD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									ansD,					// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(record),	// size to be copied
+									ans,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//====================================================================================================100
+	//	END
+	//====================================================================================================100
+
+	//======================================================================================================================================================150
+	// findK kernel
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	Execution Parameters
+	//====================================================================================================100
+
+	size_t local_work_size[1];
+	local_work_size[0] = order < 1024 ? order : 1024;
+	size_t global_work_size[1];
+	global_work_size[0] = count * local_work_size[0];
+
+	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
+
+	//====================================================================================================100
+	//	Kernel Arguments
+	//====================================================================================================100
+
+	clSetKernelArg(	kernel, 
+					0, 
+					sizeof(long), 
+					(void *) &maxheight);
+	clSetKernelArg(	kernel, 
+					1, 
+					sizeof(cl_mem), 
+					(void *) &knodesD);
+	clSetKernelArg(	kernel, 
+					2, 
+					sizeof(long), 
+					(void *) &knodes_elem);
+	clSetKernelArg(	kernel, 
+					3, 
+					sizeof(cl_mem), 
+					(void *) &recordsD);
+
+	clSetKernelArg(	kernel, 
+					4, 
+					sizeof(cl_mem), 
+					(void *) &currKnodeD);
+	clSetKernelArg(	kernel, 
+					5, 
+					sizeof(cl_mem), 
+					(void *) &offsetD);
+	clSetKernelArg(	kernel, 
+					6, 
+					sizeof(cl_mem), 
+					(void *) &keysD);
+	clSetKernelArg(	kernel, 
+					7, 
+					sizeof(cl_mem), 
+					(void *) &ansD);
+
+	//====================================================================================================100
+	//	Kernel
+	//====================================================================================================100
+
+	error = clEnqueueNDRangeKernel(	command_queue, 
+									kernel, 
+									1, 
+									NULL, 
+									global_work_size, 
+									local_work_size, 
+									0, 
+									NULL, 
+									&event);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
+#ifdef TIMING
+    kernel_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//====================================================================================================100
+	//	END
+	//====================================================================================================100
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY			COPY (CONTD.)
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	DEVICE IN/OUT
+	//====================================================================================================100
+
+	//==================================================50
+	//	ansD
+	//==================================================50
+
+	error = clEnqueueReadBuffer(command_queue,				// The command queue.
+								ansD,						// The image on the device.
+								CL_TRUE,					// Blocking? (ie. Wait at this line until read has finished?)
+								0,							// Offset. None in this case.
+								count*sizeof(record),		// Size to copy.
+								ans,						// The pointer to the image on the host.
+								0,							// Number of events in wait list. Not used.
+								NULL,						// Event wait list. Not used.
+								&event);						// Event object for determining status. Not used.
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    d2h_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//====================================================================================================100
+	//	END
+	//====================================================================================================100
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY DEALLOCATION
+	//======================================================================================================================================================150
+
+#ifdef  TIMING
+	gettimeofday(&tv_close_start, NULL);
+#endif
+
+	// Release kernels...
+	clReleaseKernel(kernel);
+
+	// Now the program...
+	clReleaseProgram(program);
+
+	// Clean up the device memory...
+	clReleaseMemObject(recordsD);
+	clReleaseMemObject(knodesD);
+
+	clReleaseMemObject(currKnodeD);
+	clReleaseMemObject(offsetD);
+	clReleaseMemObject(keysD);
+	clReleaseMemObject(ansD);
+
+	// Flush the queue
+	error = clFlush(command_queue);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// ...and finally, the queue and context.
+	clReleaseCommandQueue(command_queue);
+
+	// ???
+	clReleaseContext(context);
+
+#ifdef  TIMING
+	gettimeofday(&tv_close_end, NULL);
+	tvsub(&tv_close_end, &tv_close_start, &tv);
+	close_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+    tvsub(&tv_close_end, &tv_total_start, &tv);
+    total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+
+	//======================================================================================================================================================150
+	//	DISPLAY TIMING
+	//======================================================================================================================================================150
+
+#ifdef TIMING
+	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
+	printf("Init: %f\n", init_time);
+	printf("MemAlloc: %f\n", mem_alloc_time);
+	printf("HtoD: %f\n", h2d_time);
+	printf("Exec: %f\n", kernel_time);
+	printf("DtoH: %f\n", d2h_time);
+	printf("Close: %f\n", close_time);
+	printf("Total: %f\n", total_time);
+#endif
+
+	//======================================================================================================================================================150
+	//	END
+	//======================================================================================================================================================150
+
+}
+
+//========================================================================================================================================================================================================200
+//	END
+//========================================================================================================================================================================================================200
+
+// #ifdef __cplusplus
+// }
+// #endif
--- a/tests/opencl/b+tree/kernel_gpu_opencl_wrapper.h
+++ b/tests/opencl/b+tree/kernel_gpu_opencl_wrapper.h
@ -0,0 +1,31 @@
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+//========================================================================================================================================================================================================200
+//	KERNEL_GPU_OPENCL_WRAPPER HEADER
+//========================================================================================================================================================================================================200
+
+void 
+kernel_gpu_opencl_wrapper(	record *records,
+							long records_mem,
+							knode *knodes,
+							long knodes_elem,
+							long knodes_mem,
+
+							int order,
+							long maxheight,
+							int count,
+
+							long *currKnode,
+							long *offset,
+							int *keys,
+							record *ans);
+
+//========================================================================================================================================================================================================200
+//	End
+//========================================================================================================================================================================================================200
+
+// #ifdef __cplusplus
+// }
+// #endif
--- a/tests/opencl/b+tree/kernel_gpu_opencl_wrapper_2.cc
+++ b/tests/opencl/b+tree/kernel_gpu_opencl_wrapper_2.cc
@ -0,0 +1,886 @@
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+//========================================================================================================================================================================================================200
+//	INCLUDE
+//========================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	LIBRARIES
+//======================================================================================================================================================150
+
+#include <CL/cl.h>									// (in directory provided to compiler)		needed by OpenCL types and functions
+#include <string.h>									// (in directory known to compiler)			needed by memset
+#include <stdio.h>									// (in directory known to compiler)			needed by printf, stderr
+
+//======================================================================================================================================================150
+//	COMMON
+//======================================================================================================================================================150
+
+#include "common.h"									// (in directory provided here)
+
+//======================================================================================================================================================150
+//	UTILITIES
+//======================================================================================================================================================150
+
+#include "timer.h"						// (in directory provided here)
+#include "opencl.h"
+#ifdef TIMING
+#include "timing.h"
+#endif
+
+//======================================================================================================================================================150
+//	HEADER
+//======================================================================================================================================================150
+
+#include "kernel_gpu_opencl_wrapper_2.h"				// (in directory provided here)
+
+//========================================================================================================================================================================================================200
+//	FUNCTION
+//========================================================================================================================================================================================================200
+int read_kernel_file2(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp_ = fopen(filename, "r");
+  if (NULL == fp_) {
+    fprintf(stderr, "Failed to load kernel.2 %s\n", filename);
+    return -1;
+  }
+  fseek(fp_ , 0 , SEEK_END);
+  long fsize = ftell(fp_);
+  rewind(fp_);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp_);
+  
+  fclose(fp_);
+  
+  return 0;
+}
+
+
+void 
+kernel_gpu_opencl_wrapper_2(knode *knodes,
+							long knodes_elem,
+							long knodes_mem,
+
+							int order,
+							long maxheight,
+							int count,
+
+							long *currKnode,
+							long *offset,
+							long *lastKnode,
+							long *offset_2,
+							int *start,
+							int *end,
+							int *recstart,
+							int *reclength)
+{
+  cl_device_type device_type = CL_DEVICE_TYPE_GPU;
+  int platform_id_inuse = 0;            // platform id in use (default: 0)
+  int device_id_inuse = 0;            // platform id in use (default: 0)
+	
+	//======================================================================================================================================================150
+	//	CPU VARIABLES
+	//======================================================================================================================================================150
+
+//Primitives for timing
+#ifdef TIMING
+	struct timeval tv;
+	struct timeval tv_total_start, tv_total_end;
+	struct timeval tv_init_end;
+	struct timeval tv_h2d_start, tv_h2d_end;
+	struct timeval tv_d2h_start, tv_d2h_end;
+	struct timeval tv_kernel_start, tv_kernel_end;
+	struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
+	struct timeval tv_close_start, tv_close_end;
+	float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time= 0,
+		  d2h_time = 0, close_time = 0, total_time = 0;
+#endif
+
+#ifdef  TIMING
+    gettimeofday(&tv_total_start, NULL);
+#endif
+
+	//======================================================================================================================================================150
+	//	GPU SETUP
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	INITIAL DRIVER OVERHEAD
+	//====================================================================================================100
+
+	// cudaThreadSynchronize();
+
+	//====================================================================================================100
+	//	COMMON VARIABLES
+	//====================================================================================================100
+
+	// common variables
+	cl_int error;
+
+	//====================================================================================================100
+	//	GET PLATFORMS (Intel, AMD, NVIDIA, based on provided library), SELECT ONE
+	//====================================================================================================100
+
+	// Get the number of available platforms
+	cl_uint num_platforms;
+	error = clGetPlatformIDs(	0, 
+								NULL, 
+								&num_platforms);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// Get the list of available platforms
+	cl_platform_id *platforms = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms);
+	error = clGetPlatformIDs(	num_platforms, 
+								platforms, 
+								NULL);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// Select the 1st platform
+	cl_platform_id platform = platforms[platform_id_inuse];
+
+	// Get the name of the selected platform and print it (if there are multiple platforms, choose the first one)
+	char pbuf[100];
+	error = clGetPlatformInfo(	platform, 
+								CL_PLATFORM_VENDOR, 
+								sizeof(pbuf), 
+								pbuf, 
+								NULL);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+	printf("Platform: %s\n", pbuf);
+
+	//====================================================================================================100
+	//	GET DEVICE INFORMATION
+	//====================================================================================================100
+
+	cl_uint devices_size;
+    error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &devices_size);
+	if (error != CL_SUCCESS)
+		fatal_CL(error, __LINE__);
+    if (devices_size == 0) {
+        printf("There are no devices for Platform %d\n", platform_id_inuse);
+        exit(0);
+    }
+    printf("Device num: %u\n", devices_size);
+    // Get the list of devices (previousely selected for the context)
+    cl_device_id *devices = (cl_device_id *) malloc(devices_size);
+    error = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, devices_size,
+            devices, NULL);
+    if (error != CL_SUCCESS) 
+        fatal_CL(error, __LINE__);
+
+	// Select the device
+	cl_device_id device;
+	device = devices[device_id_inuse];
+
+	// Check device type
+	error = clGetDeviceInfo(device, CL_DEVICE_TYPE,
+            sizeof(device_type), (void *)&device_type, NULL);
+	if (error != CL_SUCCESS)
+		fatal_CL(error, __LINE__);
+	if(device_type == CL_DEVICE_TYPE_GPU)
+	    printf("Creating GPU Context\n");
+	else if (device_type == CL_DEVICE_TYPE_CPU)
+        printf("Creating CPU Context\n");
+	else
+        printf("This Context Type Not Supported\n");
+
+	// Get the name of the selected device
+	error = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(pbuf),
+            pbuf, NULL);
+	if (error != CL_SUCCESS)
+		fatal_CL(error, __LINE__);
+	printf("Device: %s\n", pbuf);
+
+	//====================================================================================================100
+	//	CREATE CONTEXT FOR THE PLATFORM
+	//====================================================================================================100
+
+	// Create context properties for selected platform
+	cl_context_properties context_properties[3] = {	CL_CONTEXT_PLATFORM, 
+													(cl_context_properties) platform, 
+													0};
+
+	// Create context for selected platform being GPU
+	cl_context context;
+	context = clCreateContextFromType(	context_properties, 
+										device_type, 
+										NULL, 
+										NULL, 
+										&error);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//====================================================================================================100
+	//	CREATE COMMAND QUEUE FOR THE DEVICE
+	//====================================================================================================100
+
+	// Create a command queue
+	cl_command_queue command_queue;
+#ifdef TIMING
+	command_queue = clCreateCommandQueue(context, device,
+            CL_QUEUE_PROFILING_ENABLE, &error);
+#else
+	command_queue = clCreateCommandQueue(context, device, 0, &error);
+#endif
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//====================================================================================================100
+	//	CREATE PROGRAM, COMPILE IT
+	//====================================================================================================100
+
+	// Load kernel source code from file
+/*
+	const char *source = load_kernel_source("./kernel/kernel_gpu_opencl_2.cl");
+	size_t sourceSize = strlen(source);
+
+	// Create the program
+  cl_program program = clCreateProgramWithSource(	context, 
+													1, 
+													&source, 
+													&sourceSize, 
+													&error);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+*/
+  uint8_t *kernel_bin = NULL;
+    size_t kernel_size;
+    cl_int binary_status = 0;
+    if (0 != read_kernel_file2("kernel2.pocl", &kernel_bin, &kernel_size))
+      std::abort();
+
+    cl_program program = clCreateProgramWithBinary(
+          context, 1, &devices[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &error);
+    free(kernel_bin);
+
+
+	char clOptions[110];
+	//  sprintf(clOptions,"-I../../src");                                                                                 
+	sprintf(clOptions,"-I./../");
+
+#ifdef DEFAULT_ORDER_2
+	sprintf(clOptions + strlen(clOptions), " -DDEFAULT_ORDER_2=%d", DEFAULT_ORDER_2);
+#endif
+
+  error = clBuildProgram(program, 1, &devices[device_id_inuse], NULL, NULL, NULL);
+	  if(error != CL_SUCCESS) { 
+        printf("ERROR: clBuildProgram() => %d\n", error); return; }
+
+
+	// Compile the program
+  /*
+	error = clBuildProgram(	program, 
+							1, 
+							&device, 
+							clOptions, 
+							NULL, 
+							NULL);*/
+	// Print warnings and errors from compilation
+	static char log[65536]; 
+	memset(log, 0, sizeof(log));
+	clGetProgramBuildInfo(	program, 
+							device, 
+							CL_PROGRAM_BUILD_LOG, 
+							sizeof(log)-1, 
+							log, 
+							NULL);
+	printf("-----OpenCL Compiler Output-----\n");
+	if (strstr(log,"warning:") || strstr(log, "error:")) 
+		printf("<<<<\n%s\n>>>>\n", log);
+	printf("--------------------------------\n");
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// Create kernel
+	cl_kernel kernel;
+	kernel = clCreateKernel(program, 
+							"findRangeK", 
+							&error);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+#ifdef  TIMING
+	gettimeofday(&tv_init_end, NULL);
+	tvsub(&tv_init_end, &tv_total_start, &tv);
+	init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+	//====================================================================================================100
+	//	END
+	//====================================================================================================100
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY				MALLOC
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	DEVICE IN
+	//====================================================================================================100
+
+	//==================================================50
+	//	knodesD
+	//==================================================50
+
+	cl_mem knodesD;
+	knodesD = clCreateBuffer(	context, 
+								CL_MEM_READ_WRITE, 
+								knodes_mem, 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	currKnodeD
+	//==================================================50
+
+	cl_mem currKnodeD;
+	currKnodeD = clCreateBuffer(context, 
+								CL_MEM_READ_WRITE, 
+								count*sizeof(long), 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	offsetD
+	//==================================================50
+
+	cl_mem offsetD;
+	offsetD = clCreateBuffer(	context, 
+								CL_MEM_READ_WRITE, 
+								count*sizeof(long), 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	lastKnodeD
+	//==================================================50
+
+	cl_mem lastKnodeD;
+	lastKnodeD = clCreateBuffer(context, 
+								CL_MEM_READ_WRITE, 
+								count*sizeof(long), 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	offset_2D
+	//==================================================50
+
+	cl_mem offset_2D;
+	offset_2D = clCreateBuffer(context, 
+								CL_MEM_READ_WRITE, 
+								count*sizeof(long), 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	startD
+	//==================================================50
+
+	cl_mem startD;
+	startD = clCreateBuffer(context, 
+								CL_MEM_READ_WRITE, 
+								count*sizeof(int), 
+								NULL, 
+								&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	endD
+	//==================================================50
+
+	cl_mem endD;
+	endD = clCreateBuffer(	context, 
+							CL_MEM_READ_WRITE, 
+							count*sizeof(int), 
+							NULL, 
+							&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//====================================================================================================100
+	//	DEVICE IN/OUT
+	//====================================================================================================100
+
+	//==================================================50
+	//	ansDStart
+	//==================================================50
+
+	cl_mem ansDStart;
+	ansDStart = clCreateBuffer(	context, 
+							CL_MEM_READ_WRITE, 
+							count*sizeof(int), 
+							NULL, 
+							&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	//==================================================50
+	//	ansDLength
+	//==================================================50
+
+	cl_mem ansDLength;
+	ansDLength = clCreateBuffer(	context, 
+							CL_MEM_READ_WRITE, 
+							count*sizeof(int), 
+							NULL, 
+							&error );
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+#ifdef  TIMING
+	gettimeofday(&tv_mem_alloc_end, NULL);
+	tvsub(&tv_mem_alloc_end, &tv_init_end, &tv);
+	mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//====================================================================================================100
+	//	END
+	//====================================================================================================100
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY			COPY
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	DEVICE IN
+	//====================================================================================================100
+
+	//==================================================50
+	//	knodesD
+	//==================================================50
+
+    cl_event event;
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									knodesD,				// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									knodes_mem,				// size to be copied
+									knodes,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	currKnodeD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									currKnodeD,				// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(long),		// size to be copied
+									currKnode,				// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	offsetD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									offsetD,				// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(long),		// size to be copied
+									offset,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	lastKnodeD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									lastKnodeD,				// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(long),		// size to be copied
+									lastKnode,				// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	offset_2D
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									offset_2D,				// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(long),		// size to be copied
+									offset_2,				// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	startD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									startD,					// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(int),		// size to be copied
+									start,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	endD
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									endD,					// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(int),		// size to be copied
+									end,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//====================================================================================================100
+	//	DEVICE IN/OUT
+	//====================================================================================================100
+
+	//==================================================50
+	//	ansDStart
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									endD,					// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(int),		// size to be copied
+									end,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+								    &event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	ansDLength
+	//==================================================50
+
+	error = clEnqueueWriteBuffer(	command_queue,			// command queue
+									ansDLength,					// destination
+									1,						// block the source from access until this copy operation complates (1=yes, 0=no)
+									0,						// offset in destination to write to
+									count*sizeof(int),		// size to be copied
+									reclength,					// source
+									0,						// # of events in the list of events to wait for
+									NULL,					// list of events to wait for
+									&event);					// ID of this operation to be used by waiting operations
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    h2d_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//======================================================================================================================================================150
+	//	KERNEL
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	Execution Parameters
+	//====================================================================================================100
+
+	size_t local_work_size[1];
+	local_work_size[0] = order < 1024 ? order : 1024;
+	size_t global_work_size[1];
+	global_work_size[0] = count * local_work_size[0];
+
+	printf("# of blocks = %d, # of threads/block = %d (ensure that device can handle)\n", (int)(global_work_size[0]/local_work_size[0]), (int)local_work_size[0]);
+
+	//====================================================================================================100
+	//	Kernel Arguments
+	//====================================================================================================100
+
+	clSetKernelArg(	kernel, 
+					0, 
+					sizeof(long), 
+					(void *) &maxheight);
+	clSetKernelArg(	kernel, 
+					1, 
+					sizeof(cl_mem), 
+					(void *) &knodesD);
+	clSetKernelArg(	kernel, 
+					2, 
+					sizeof(long), 
+					(void *) &knodes_elem);
+
+	clSetKernelArg(	kernel, 
+					3, 
+					sizeof(cl_mem), 
+					(void *) &currKnodeD);
+	clSetKernelArg(	kernel, 
+					4, 
+					sizeof(cl_mem), 
+					(void *) &offsetD);
+	clSetKernelArg(	kernel, 
+					5, 
+					sizeof(cl_mem), 
+					(void *) &lastKnodeD);
+	clSetKernelArg(	kernel, 
+					6, 
+					sizeof(cl_mem), 
+					(void *) &offset_2D);
+	clSetKernelArg(	kernel, 
+					7, 
+					sizeof(cl_mem), 
+					(void *) &startD);
+	clSetKernelArg(	kernel, 
+					8, 
+					sizeof(cl_mem), 
+					(void *) &endD);
+	clSetKernelArg(	kernel, 
+					9, 
+					sizeof(cl_mem), 
+					(void *) &ansDStart);
+	clSetKernelArg(	kernel, 
+					10, 
+					sizeof(cl_mem), 
+					(void *) &ansDLength);
+
+	//====================================================================================================100
+	//	Kernel
+	//====================================================================================================100
+
+	error = clEnqueueNDRangeKernel(	command_queue, 
+									kernel, 
+									1, 
+									NULL, 
+									global_work_size, 
+									local_work_size, 
+									0, 
+									NULL, 
+									&event);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// Wait for all operations to finish NOT SURE WHERE THIS SHOULD GO
+#ifdef TIMING
+    kernel_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//====================================================================================================100
+	//	END
+	//====================================================================================================100
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY			COPY (CONTD.)
+	//======================================================================================================================================================150
+
+	//====================================================================================================100
+	//	DEVICE IN/OUT
+	//====================================================================================================100
+
+	//==================================================50
+	//	ansDStart
+	//==================================================50
+
+	error = clEnqueueReadBuffer(command_queue,				// The command queue.
+								ansDStart,					// The image on the device.
+								CL_TRUE,					// Blocking? (ie. Wait at this line until read has finished?)
+								0,							// Offset. None in this case.
+								count*sizeof(int),			// Size to copy.
+								recstart,					// The pointer to the image on the host.
+								0,							// Number of events in wait list. Not used.
+								NULL,						// Event wait list. Not used.
+								&event);						// Event object for determining status. Not used.
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    d2h_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	ansDLength
+	//==================================================50
+
+	error = clEnqueueReadBuffer(command_queue,				// The command queue.
+								ansDLength,					// The image on the device.
+								CL_TRUE,					// Blocking? (ie. Wait at this line until read has finished?)
+								0,							// Offset. None in this case.
+								count*sizeof(int),			// Size to copy.
+								reclength,					// The pointer to the image on the host.
+								0,							// Number of events in wait list. Not used.
+								NULL,						// Event wait list. Not used.
+								&event);						// Event object for determining status. Not used.
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+#ifdef TIMING
+    d2h_time += probe_event_time(event, command_queue);
+#endif
+    clReleaseEvent(event);
+
+	//==================================================50
+	//	END
+	//==================================================50
+
+	//====================================================================================================100
+	//	END
+	//====================================================================================================100
+
+	//======================================================================================================================================================150
+	//	GPU MEMORY DEALLOCATION
+	//======================================================================================================================================================150
+
+#ifdef  TIMING
+	gettimeofday(&tv_close_start, NULL);
+#endif
+
+	// Release kernels...
+	clReleaseKernel(kernel);
+
+	// Now the program...
+	clReleaseProgram(program);
+
+	// Clean up the device memory...
+	clReleaseMemObject(knodesD);
+
+	clReleaseMemObject(currKnodeD);
+	clReleaseMemObject(offsetD);
+	clReleaseMemObject(lastKnodeD);
+	clReleaseMemObject(offset_2D);
+	clReleaseMemObject(startD);
+	clReleaseMemObject(endD);
+	clReleaseMemObject(ansDStart);
+	clReleaseMemObject(ansDLength);
+
+	// Flush the queue
+	error = clFlush(command_queue);
+	if (error != CL_SUCCESS) 
+		fatal_CL(error, __LINE__);
+
+	// ...and finally, the queue and context.
+	clReleaseCommandQueue(command_queue);
+
+	// ???
+	clReleaseContext(context);
+
+#ifdef  TIMING
+	gettimeofday(&tv_close_end, NULL);
+	tvsub(&tv_close_end, &tv_close_start, &tv);
+	close_time += tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+    tvsub(&tv_close_end, &tv_total_start, &tv);
+    total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+	//======================================================================================================================================================150
+	//	DISPLAY TIMING
+	//======================================================================================================================================================150
+
+#ifdef TIMING
+	printf("Time spent in different stages of GPU_CUDA KERNEL:\n");
+	printf("Init: %f\n", init_time);
+	printf("MemAlloc: %f\n", mem_alloc_time);
+	printf("HtoD: %f\n", h2d_time);
+	printf("Exec: %f\n", kernel_time);
+	printf("DtoH: %f\n", d2h_time);
+	printf("Close: %f\n", close_time);
+	printf("Total: %f\n", total_time);
+#endif
+
+	//======================================================================================================================================================150
+	//	END
+	//======================================================================================================================================================150
+
+}
+
+//========================================================================================================================================================================================================200
+//	END
+//========================================================================================================================================================================================================200
+
+// #ifdef __cplusplus
+// }
+// #endif
--- a/tests/opencl/b+tree/kernel_gpu_opencl_wrapper_2.h
+++ b/tests/opencl/b+tree/kernel_gpu_opencl_wrapper_2.h
@ -0,0 +1,33 @@
+// #ifdef __cplusplus
+// extern "C" {
+// #endif
+
+//========================================================================================================================================================================================================200
+//	KERNEL_GPU_OPENCL_WRAPPER HEADER
+//========================================================================================================================================================================================================200
+
+void 
+kernel_gpu_opencl_wrapper_2(knode *knodes,
+							long knodes_elem,
+							long knodes_mem,
+
+							int order,
+							long maxheight,
+							int count,
+
+							long *currKnode,
+							long *offset,
+							long *lastKnode,
+							long *offset_2,
+							int *start,
+							int *end,
+							int *recstart,
+							int *reclength);
+
+//========================================================================================================================================================================================================200
+//	End
+//========================================================================================================================================================================================================200
+
+// #ifdef __cplusplus
+// }
+// #endif
--- a/tests/opencl/b+tree/main.cc
+++ b/tests/opencl/b+tree/main.cc
--- a/tests/opencl/b+tree/main.h
+++ b/tests/opencl/b+tree/main.h
@ -0,0 +1,19 @@
+// # ifdef __cplusplus
+// extern "C" {
+// # endif
+
+//===============================================================================================================================================================================================================200
+//	HEADER
+//===============================================================================================================================================================================================================200
+
+int 
+main(	int argc, 
+		char *argv []);
+
+//===============================================================================================================================================================================================================200
+//	END
+//===============================================================================================================================================================================================================200
+
+// # ifdef __cplusplus
+// }
+// # endif
--- a/tests/opencl/b+tree/num.cc
+++ b/tests/opencl/b+tree/num.cc
@ -0,0 +1,53 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	DESCRIPTION
+//===============================================================================================================================================================================================================200
+
+// Returns:	0 if string does not represent integer
+//			1 if string represents integer
+
+//===============================================================================================================================================================================================================200
+//	NUM CODE
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	ISINTEGER FUNCTION
+//======================================================================================================================================================150
+
+int isInteger(char *str){
+
+	//====================================================================================================100
+	//	make sure it's not empty
+	//====================================================================================================100
+
+	if (*str == '\0'){
+		return 0;
+	}
+
+	//====================================================================================================100
+	//	if any digit is not a number, return false
+	//====================================================================================================100
+
+	for(; *str != '\0'; str++){
+		if (*str < 48 || *str > 57){	// digit characters (need to include . if checking for float)
+			return 0;
+		}
+	}
+
+	//====================================================================================================100
+	//	it got past all my checks so I think it's a number
+	//====================================================================================================100
+
+	return 1;
+}
+
+//===============================================================================================================================================================================================================200
+//	END NUM CODE
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/tests/opencl/b+tree/num.h
+++ b/tests/opencl/b+tree/num.h
@ -0,0 +1,21 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	FILE HEADER
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	ISINTEGER FUNCTION PROTOTYPE
+//======================================================================================================================================================150
+
+int isInteger(char *str);
+
+//===============================================================================================================================================================================================================200
+//	END FILE HEADER
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/tests/opencl/b+tree/opencl.cc
+++ b/tests/opencl/b+tree/opencl.cc
@ -0,0 +1,138 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	INCLUDE/DEFINE
+//===============================================================================================================================================================================================================200
+
+#include "opencl.h"					// (in directory)	function headers
+#include <CL/cl.h>
+//===============================================================================================================================================================================================================200
+//	LOAD KERNEL SOURCE CODE FUNCTION
+//===============================================================================================================================================================================================================200
+
+char *
+load_kernel_source(const char *filename)
+{
+	// Open the source file
+	FILE *file = fopen(filename, "r");
+	if (file == NULL){
+		fatal("Error opening kernel source file\n");
+	}
+
+	// Determine the size of the file
+	if (fseek(file, 0, SEEK_END)){
+		fatal("Error reading kernel source file\n");
+	}
+	size_t size = ftell(file);
+
+	// Allocate space for the source code (plus one for null-terminator)
+	char *source = (char *) malloc(size + 1);
+
+	// Read the source code into the string
+	fseek(file, 0, SEEK_SET);
+	// printf("Number of elements: %lu\nSize = %lu", fread(source, 1, size, file), size);
+	// exit(1);
+	if (fread(source, 1, size, file) != size){
+		fatal("Error reading kernel source file\n");
+	}
+
+	// Null-terminate the string
+	source[size] = '\0';
+
+	// Return the pointer to the string
+	return source;
+}
+
+//===============================================================================================================================================================================================================200
+//	PRINT ERROR FUNCTION
+//===============================================================================================================================================================================================================200
+
+void 
+fatal(const char *s)
+{
+	fprintf(stderr, "Error: %s\n", s);
+	exit(1);
+}
+
+//===============================================================================================================================================================================================================200
+//	PRINT OPENCL ERROR FUNCTION
+//===============================================================================================================================================================================================================200
+
+void 
+fatal_CL(cl_int error, int line_no) {
+
+	printf("Error at line %d: ", line_no);
+
+	switch(error) {
+
+		case CL_SUCCESS: 									printf("CL_SUCCESS\n"); break;
+		case CL_DEVICE_NOT_FOUND: 							printf("CL_DEVICE_NOT_FOUND\n"); break;
+		case CL_DEVICE_NOT_AVAILABLE: 						printf("CL_DEVICE_NOT_AVAILABLE\n"); break;
+		case CL_COMPILER_NOT_AVAILABLE: 					printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
+		case CL_MEM_OBJECT_ALLOCATION_FAILURE: 				printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
+		case CL_OUT_OF_RESOURCES: 							printf("CL_OUT_OF_RESOURCES\n"); break;
+		case CL_OUT_OF_HOST_MEMORY: 						printf("CL_OUT_OF_HOST_MEMORY\n"); break;
+		case CL_PROFILING_INFO_NOT_AVAILABLE: 				printf("CL_PROFILING_INFO_NOT_AVAILABLE\n"); break;
+		case CL_MEM_COPY_OVERLAP: 							printf("CL_MEM_COPY_OVERLAP\n"); break;
+		case CL_IMAGE_FORMAT_MISMATCH: 						printf("CL_IMAGE_FORMAT_MISMATCH\n"); break;
+		case CL_IMAGE_FORMAT_NOT_SUPPORTED: 				printf("CL_IMAGE_FORMAT_NOT_SUPPORTED\n"); break;
+		case CL_BUILD_PROGRAM_FAILURE: 						printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
+		case CL_MAP_FAILURE: 								printf("CL_MAP_FAILURE\n"); break;
+		case CL_INVALID_VALUE: 								printf("CL_INVALID_VALUE\n"); break;
+		case CL_INVALID_DEVICE_TYPE: 						printf("CL_INVALID_DEVICE_TYPE\n"); break;
+		case CL_INVALID_PLATFORM: 							printf("CL_INVALID_PLATFORM\n"); break;
+		case CL_INVALID_DEVICE: 							printf("CL_INVALID_DEVICE\n"); break;
+		case CL_INVALID_CONTEXT: 							printf("CL_INVALID_CONTEXT\n"); break;
+		case CL_INVALID_QUEUE_PROPERTIES: 					printf("CL_INVALID_QUEUE_PROPERTIES\n"); break;
+		case CL_INVALID_COMMAND_QUEUE: 						printf("CL_INVALID_COMMAND_QUEUE\n"); break;
+		case CL_INVALID_HOST_PTR: 							printf("CL_INVALID_HOST_PTR\n"); break;
+		case CL_INVALID_MEM_OBJECT: 						printf("CL_INVALID_MEM_OBJECT\n"); break;
+		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: 			printf("CL_INVALID_IMAGE_FORMAT_DESCRIPTOR\n"); break;
+		case CL_INVALID_IMAGE_SIZE: 						printf("CL_INVALID_IMAGE_SIZE\n"); break;
+		case CL_INVALID_SAMPLER: 							printf("CL_INVALID_SAMPLER\n"); break;
+		case CL_INVALID_BINARY: 							printf("CL_INVALID_BINARY\n"); break;
+		case CL_INVALID_BUILD_OPTIONS: 						printf("CL_INVALID_BUILD_OPTIONS\n"); break;
+		case CL_INVALID_PROGRAM: 							printf("CL_INVALID_PROGRAM\n"); break;
+		case CL_INVALID_PROGRAM_EXECUTABLE: 				printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
+		case CL_INVALID_KERNEL_NAME: 						printf("CL_INVALID_KERNEL_NAME\n"); break;
+		case CL_INVALID_KERNEL_DEFINITION: 					printf("CL_INVALID_KERNEL_DEFINITION\n"); break;
+		case CL_INVALID_KERNEL: 							printf("CL_INVALID_KERNEL\n"); break;
+		case CL_INVALID_ARG_INDEX: 							printf("CL_INVALID_ARG_INDEX\n"); break;
+		case CL_INVALID_ARG_VALUE: 							printf("CL_INVALID_ARG_VALUE\n"); break;
+		case CL_INVALID_ARG_SIZE: 							printf("CL_INVALID_ARG_SIZE\n"); break;
+		case CL_INVALID_KERNEL_ARGS: 						printf("CL_INVALID_KERNEL_ARGS\n"); break;
+		case CL_INVALID_WORK_DIMENSION: 					printf("CL_INVALID_WORK_DIMENSION\n"); break;
+		case CL_INVALID_WORK_GROUP_SIZE: 					printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
+		case CL_INVALID_WORK_ITEM_SIZE: 					printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
+		case CL_INVALID_GLOBAL_OFFSET: 						printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
+		case CL_INVALID_EVENT_WAIT_LIST: 					printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
+		case CL_INVALID_EVENT: 								printf("CL_INVALID_EVENT\n"); break;
+		case CL_INVALID_OPERATION: 							printf("CL_INVALID_OPERATION\n"); break;
+		case CL_INVALID_GL_OBJECT: 							printf("CL_INVALID_GL_OBJECT\n"); break;
+		case CL_INVALID_BUFFER_SIZE: 						printf("CL_INVALID_BUFFER_SIZE\n"); break;
+		case CL_INVALID_MIP_LEVEL: 							printf("CL_INVALID_MIP_LEVEL\n"); break;
+		case CL_INVALID_GLOBAL_WORK_SIZE: 					printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
+
+		#ifdef CL_VERSION_1_1
+		case CL_MISALIGNED_SUB_BUFFER_OFFSET: 				printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
+		case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST:	printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
+		/* case CL_INVALID_PROPERTY: 							printf("CL_INVALID_PROPERTY\n"); break; */
+		#endif
+
+		default:											printf("Invalid OpenCL error code\n");
+
+	}
+
+	exit(error);
+
+}
+
+//===============================================================================================================================================================================================================200
+//	END
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/tests/opencl/b+tree/opencl.h
+++ b/tests/opencl/b+tree/opencl.h
@ -0,0 +1,40 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	INCLUDE/DEFINE
+//===============================================================================================================================================================================================================200
+
+#include <stdio.h>					// (in path known to compiler)		needed by printf
+
+#include <CL/cl.h>					// (in path specified to compiler)	needed by OpenCL types
+
+//===============================================================================================================================================================================================================200
+//	LOAD KERNEL SOURCE CODE FUNCTION HEADER
+//===============================================================================================================================================================================================================200
+
+char *
+load_kernel_source(const char *filename);
+
+//===============================================================================================================================================================================================================200
+//	PRINT ERROR FUNCTION HEADER
+//===============================================================================================================================================================================================================200
+
+void 
+fatal(const char *s);
+
+//===============================================================================================================================================================================================================200
+//	PRINT OPENCL ERROR FUNCTION HEADER
+//===============================================================================================================================================================================================================200
+
+void 
+fatal_CL(cl_int error, int line_no);
+
+//===============================================================================================================================================================================================================200
+//	END
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/tests/opencl/b+tree/timer.cc
+++ b/tests/opencl/b+tree/timer.cc
@ -0,0 +1,37 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	TIMER CODE
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	INCLUDE/DEFINE
+//======================================================================================================================================================150
+
+#include <stdlib.h>
+#include "timer.h"
+
+//======================================================================================================================================================150
+//	FUNCTIONS
+//======================================================================================================================================================150
+
+//====================================================================================================100
+//	DISPLAY TIME
+//====================================================================================================100
+
+ // Returns the current system time in microseconds
+long long get_time() {
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+	return (tv.tv_sec * 1000000) + tv.tv_usec;
+}
+
+//===============================================================================================================================================================================================================200
+//	END TIMER CODE
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/tests/opencl/b+tree/timer.h
+++ b/tests/opencl/b+tree/timer.h
@ -0,0 +1,23 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//===============================================================================================================================================================================================================200
+//	TIMER HEADER
+#include <sys/time.h>		    					// (in directory known to compiler)			needed by gettimeofday
+//===============================================================================================================================================================================================================200
+
+//======================================================================================================================================================150
+//	FUNCTION PROTOTYPES
+//======================================================================================================================================================150
+
+long long 
+get_time();
+
+//===============================================================================================================================================================================================================200
+//	END TIMER HEADER
+//===============================================================================================================================================================================================================200
+
+#ifdef __cplusplus
+}
+#endif
--- a/tests/opencl/b+tree/timing.cpp
+++ b/tests/opencl/b+tree/timing.cpp
@ -0,0 +1,40 @@
+#include <stdio.h>
+
+#include "timing.h"
+
+void time_measure_start(struct timeval *tv)
+{
+	gettimeofday(tv, NULL);
+}
+
+void time_measure_end(struct timeval *tv)
+{
+	struct timeval tv_now, tv_diff;
+	double d;
+
+	gettimeofday(&tv_now, NULL);
+	tvsub(&tv_now, tv, &tv_diff);
+
+	d = (double) tv_diff.tv_sec * 1000.0 + (double) tv_diff.tv_usec / 1000.0;
+	printf("Time (Memory Copy and Launch) = %f (ms)\n", d);
+}
+
+float probe_event_time(cl_event event, cl_command_queue command_queue) {
+    cl_int error=0;
+    cl_ulong eventStart,eventEnd;
+    clFinish(command_queue);
+    error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
+                                    sizeof(cl_ulong),&eventStart,NULL);
+    if (error != CL_SUCCESS) {
+        printf("ERROR (%d) in event start profiling.\n", error);
+        return 0;
+    }
+    error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
+                                    sizeof(cl_ulong),&eventEnd,NULL);
+    if (error != CL_SUCCESS) {
+        printf("ERROR (%d) in event end profiling.\n", error);
+        return 0;
+    }
+
+    return (float)((eventEnd-eventStart)/1000000.0);
+}
--- a/tests/opencl/b+tree/timing.h
+++ b/tests/opencl/b+tree/timing.h
@ -0,0 +1,25 @@
+#ifndef __TIMING_H__
+#define __TIMING_H__
+
+#include <sys/time.h>
+#include <CL/cl.h>
+
+void time_measure_start(struct timeval *tv);
+void time_measure_end(struct timeval *tv);
+
+/* tvsub: ret = x - y. */
+static inline void tvsub(struct timeval *x,
+						 struct timeval *y,
+						 struct timeval *ret)
+{
+	ret->tv_sec = x->tv_sec - y->tv_sec;
+	ret->tv_usec = x->tv_usec - y->tv_usec;
+	if (ret->tv_usec < 0) {
+		ret->tv_sec--;
+		ret->tv_usec += 1000000;
+	}
+}
+
+float probe_event_time(cl_event, cl_command_queue);
+
+#endif
--- a/tests/opencl/backprop/Makefile
+++ b/tests/opencl/backprop/Makefile
@ -0,0 +1,8 @@
+PROJECT = backprop
+
+SRCS = backprop.cpp imagenet.cpp facetrain.cpp backprop_ocl.cpp timing.cpp
+
+OPTS ?= -n 1024 -p 0 -d 0
+#OPTS ?= -n 32768 -p 0 -d 0
+
+include common.mk
--- a/tests/opencl/backprop/backprop.cpp
+++ b/tests/opencl/backprop/backprop.cpp
@ -0,0 +1,473 @@
+/*
+ ******************************************************************
+ * HISTORY
+ * 15-Oct-94  Jeff Shufelt (js), Carnegie Mellon University
+ *	Prepared for 15-681, Fall 1994.
+ * Modified by Shuai Che
+ ******************************************************************
+ */
+
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "backprop.h"
+#include <math.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define ABS(x)          (((x) > 0.0) ? (x) : (-(x)))
+
+#define fastcopy(to,from,len)\
+{\
+  register char *_to,*_from;\
+  register int _i,_l;\
+  _to = (char *)(to);\
+  _from = (char *)(from);\
+  _l = (len);\
+  for (_i = 0; _i < _l; _i++) *_to++ = *_from++;\
+}
+
+/*** Return random number between 0.0 and 1.0 ***/
+float drnd()
+{
+  return ((float) rand() / (float) BIGRND);
+}
+
+/*** Return random number between -1.0 and 1.0 ***/
+float dpn1()
+{
+  return ((drnd() * 2.0) - 1.0);
+}
+
+/*** The squashing function.  Currently, it's a sigmoid. ***/
+
+float squash(float x)
+{
+  float m;
+  //x = -x;
+  //m = 1 + x + x*x/2 + x*x*x/6 + x*x*x*x/24 + x*x*x*x*x/120;
+  //return(1.0 / (1.0 + m));
+  return (1.0 / (1.0 + exp(-x)));
+}
+
+
+/*** Allocate 1d array of floats ***/
+
+float *alloc_1d_dbl(int n)
+{
+  float *new_t;
+
+  new_t = (float *) malloc ((unsigned) (n * sizeof (float)));
+  if (new_t == NULL) {
+    printf("ALLOC_1D_DBL: Couldn't allocate array of floats\n");
+    return (NULL);
+  }
+  return (new_t);
+}
+
+
+/*** Allocate 2d array of floats ***/
+
+float **alloc_2d_dbl(int m, int n)
+{
+  int i;
+  float **new_t;
+
+  new_t = (float **) malloc ((unsigned) (m * sizeof (float *)));
+  if (new_t == NULL) {
+    printf("ALLOC_2D_DBL: Couldn't allocate array of dbl ptrs\n");
+    return (NULL);
+  }
+
+  for (i = 0; i < m; i++) {
+    new_t[i] = alloc_1d_dbl(n);
+  }
+
+  return (new_t);
+}
+
+
+void bpnn_randomize_weights(float **w, int m, int n)
+{
+  int i, j;
+
+  for (i = 0; i <= m; i++) {
+    for (j = 0; j <= n; j++) {
+     w[i][j] = (float) rand()/RAND_MAX;
+    //  w[i][j] = dpn1();
+    }
+  }
+}
+
+void bpnn_randomize_row(float *w, int m)
+{
+	int i;
+	for (i = 0; i <= m; i++) {
+     //w[i] = (float) rand()/RAND_MAX;
+	 w[i] = 0.1;
+    }
+}
+
+
+void bpnn_zero_weights(float **w, int m, int n)
+{
+  int i, j;
+
+  for (i = 0; i <= m; i++) {
+    for (j = 0; j <= n; j++) {
+      w[i][j] = 0.0;
+    }
+  }
+}
+
+
+void bpnn_initialize(int seed)
+{
+  printf("Random number generator seed: %d\n", seed);
+  srand(seed);
+}
+
+
+BPNN *bpnn_internal_create(int n_in, int n_hidden, int n_out)
+{
+  BPNN *newnet;
+
+  newnet = (BPNN *) malloc (sizeof (BPNN));
+  if (newnet == NULL) {
+    printf("BPNN_CREATE: Couldn't allocate neural network\n");
+    return (NULL);
+  }
+
+  newnet->input_n = n_in;
+  newnet->hidden_n = n_hidden;
+  newnet->output_n = n_out;
+  newnet->input_units = alloc_1d_dbl(n_in + 1);
+  newnet->hidden_units = alloc_1d_dbl(n_hidden + 1);
+  newnet->output_units = alloc_1d_dbl(n_out + 1);
+
+  newnet->hidden_delta = alloc_1d_dbl(n_hidden + 1);
+  newnet->output_delta = alloc_1d_dbl(n_out + 1);
+  newnet->target = alloc_1d_dbl(n_out + 1);
+
+  newnet->input_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
+  newnet->hidden_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
+
+  newnet->input_prev_weights = alloc_2d_dbl(n_in + 1, n_hidden + 1);
+  newnet->hidden_prev_weights = alloc_2d_dbl(n_hidden + 1, n_out + 1);
+
+  return (newnet);
+}
+
+
+void bpnn_free(BPNN *net)
+{
+  int n1, n2, i;
+
+  n1 = net->input_n;
+  n2 = net->hidden_n;
+
+  free((char *) net->input_units);
+  free((char *) net->hidden_units);
+  free((char *) net->output_units);
+
+  free((char *) net->hidden_delta);
+  free((char *) net->output_delta);
+  free((char *) net->target);
+
+  for (i = 0; i <= n1; i++) {
+    free((char *) net->input_weights[i]);
+    free((char *) net->input_prev_weights[i]);
+  }
+  free((char *) net->input_weights);
+  free((char *) net->input_prev_weights);
+
+  for (i = 0; i <= n2; i++) {
+    free((char *) net->hidden_weights[i]);
+    free((char *) net->hidden_prev_weights[i]);
+  }
+  free((char *) net->hidden_weights);
+  free((char *) net->hidden_prev_weights);
+
+  free((char *) net);
+}
+
+
+/*** Creates a new fully-connected network from scratch,
+     with the given numbers of input, hidden, and output units.
+     Threshold units are automatically included.  All weights are
+     randomly initialized.
+
+     Space is also allocated for temporary storage (momentum weights,
+     error computations, etc).
+***/
+
+BPNN *bpnn_create(int n_in, int n_hidden, int n_out)
+{
+
+  BPNN *newnet;
+
+  newnet = bpnn_internal_create(n_in, n_hidden, n_out);
+
+#ifdef INITZERO
+  bpnn_zero_weights(newnet->input_weights, n_in, n_hidden);
+#else
+  bpnn_randomize_weights(newnet->input_weights, n_in, n_hidden);
+#endif
+  bpnn_randomize_weights(newnet->hidden_weights, n_hidden, n_out);
+  bpnn_zero_weights(newnet->input_prev_weights, n_in, n_hidden);
+  bpnn_zero_weights(newnet->hidden_prev_weights, n_hidden, n_out);
+  bpnn_randomize_row(newnet->target, n_out);
+  return (newnet);
+}
+
+
+void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2)
+{
+  float sum;
+  int j, k;
+
+  /*** Set up thresholding unit ***/
+  l1[0] = 1.0;
+#ifdef OPEN
+  omp_set_num_threads(NUM_THREAD);
+  #pragma omp parallel for shared(conn, n1, n2, l1) private(k, j) reduction(+: sum) schedule(static)
+#endif 
+  /*** For each unit in second layer ***/
+  for (j = 1; j <= n2; j++) {
+
+    /*** Compute weighted sum of its inputs ***/
+    sum = 0.0;
+    for (k = 0; k <= n1; k++) {	
+      sum += conn[k][j] * l1[k]; 
+    }
+    l2[j] = squash(sum);
+  }
+}
+
+void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err)  
+{
+  int j;
+  float o, t, errsum;
+  errsum = 0.0;
+  for (j = 1; j <= nj; j++) {
+    o = output[j];
+    t = target[j];
+    delta[j] = o * (1.0 - o) * (t - o);
+    errsum += ABS(delta[j]);
+  }
+  *err = errsum;
+}
+
+
+void bpnn_hidden_error(float *delta_h,   
+					   int nh, 
+					   float *delta_o, 
+					   int no, 
+					   float **who, 
+					   float *hidden, 
+					   float *err)
+{
+  int j, k;
+  float h, sum, errsum;
+
+  errsum = 0.0;
+  for (j = 1; j <= nh; j++) {
+    h = hidden[j];
+    sum = 0.0;
+    for (k = 1; k <= no; k++) {
+      sum += delta_o[k] * who[j][k];
+    }
+    delta_h[j] = h * (1.0 - h) * sum;
+    errsum += ABS(delta_h[j]);
+  }
+  *err = errsum;
+}
+
+
+void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw)
+{
+  float new_dw;
+  int k, j;
+  ly[0] = 1.0;
+
+#ifdef OPEN
+  omp_set_num_threads(NUM_THREAD);
+  #pragma omp parallel for  \
+      shared(oldw, w, delta) \
+	  private(j, k, new_dw) \
+	  firstprivate(ndelta, nly, momentum) 
+#endif 
+  for (j = 1; j <= ndelta; j++) {
+    for (k = 0; k <= nly; k++) {
+      new_dw = ((ETA * delta[j] * ly[k]) + (MOMENTUM * oldw[k][j]));
+	  w[k][j] += new_dw;
+	  oldw[k][j] = new_dw;
+    }
+  }
+}
+
+
+void bpnn_feedforward(BPNN *net)
+{
+  int in, hid, out;
+
+  in = net->input_n;
+  hid = net->hidden_n;
+  out = net->output_n;
+
+  /*** Feed forward input activations. ***/
+  bpnn_layerforward(net->input_units, net->hidden_units,
+      net->input_weights, in, hid);
+  bpnn_layerforward(net->hidden_units, net->output_units,
+      net->hidden_weights, hid, out);
+
+}
+
+
+void bpnn_train(BPNN *net, float *eo, float *eh)
+{
+  int in, hid, out;
+  float out_err, hid_err;
+
+  in = net->input_n;
+  hid = net->hidden_n;
+  out = net->output_n;
+
+  /*** Feed forward input activations. ***/
+  bpnn_layerforward(net->input_units, net->hidden_units,
+      net->input_weights, in, hid);
+  bpnn_layerforward(net->hidden_units, net->output_units,
+      net->hidden_weights, hid, out);
+
+  /*** Compute error on output and hidden units. ***/
+  bpnn_output_error(net->output_delta, net->target, net->output_units,
+      out, &out_err);
+  bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out,
+      net->hidden_weights, net->hidden_units, &hid_err);
+  *eo = out_err;
+  *eh = hid_err;
+
+  /*** Adjust input and hidden weights. ***/
+  bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid,
+      net->hidden_weights, net->hidden_prev_weights);
+  bpnn_adjust_weights(net->hidden_delta, hid, net->input_units, in,
+      net->input_weights, net->input_prev_weights);
+
+}
+
+
+void bpnn_save(BPNN *net, char *filename)
+{
+  int n1, n2, n3, i, j, memcnt;
+  float dvalue, **w;
+  char *mem;
+  ///add//
+  FILE *pFile;
+  pFile = fopen( filename, "w+" );
+  ///////
+  /*
+  if ((fd = creat(filename, 0644)) == -1) {
+    printf("BPNN_SAVE: Cannot create '%s'\n", filename);
+    return;
+  }
+  */
+
+  n1 = net->input_n;  n2 = net->hidden_n;  n3 = net->output_n;
+  printf("Saving %dx%dx%d network to '%s'\n", n1, n2, n3, filename);
+  //fflush(stdout);
+
+  //write(fd, (char *) &n1, sizeof(int));
+  //write(fd, (char *) &n2, sizeof(int));
+  //write(fd, (char *) &n3, sizeof(int));
+
+  fwrite( (char *) &n1 , sizeof(char), sizeof(char), pFile);
+  fwrite( (char *) &n2 , sizeof(char), sizeof(char), pFile);
+  fwrite( (char *) &n3 , sizeof(char), sizeof(char), pFile);
+
+  memcnt = 0;
+  w = net->input_weights;
+  mem = (char *) malloc ((unsigned) ((n1+1) * (n2+1) * sizeof(float)));
+  for (i = 0; i <= n1; i++) {
+    for (j = 0; j <= n2; j++) {
+      dvalue = w[i][j];
+      fastcopy(&mem[memcnt], &dvalue, sizeof(float));
+      memcnt += sizeof(float);
+    }
+  }
+  //write(fd, mem, (n1+1) * (n2+1) * sizeof(float));
+  fwrite( mem , (unsigned)(sizeof(float)), (unsigned) ((n1+1) * (n2+1) * sizeof(float)) , pFile);
+  free(mem);
+
+  memcnt = 0;
+  w = net->hidden_weights;
+  mem = (char *) malloc ((unsigned) ((n2+1) * (n3+1) * sizeof(float)));
+  for (i = 0; i <= n2; i++) {
+    for (j = 0; j <= n3; j++) {
+      dvalue = w[i][j];
+      fastcopy(&mem[memcnt], &dvalue, sizeof(float));
+      memcnt += sizeof(float);
+    }
+  }
+  //write(fd, mem, (n2+1) * (n3+1) * sizeof(float));
+  fwrite( mem , sizeof(float), (unsigned) ((n2+1) * (n3+1) * sizeof(float)) , pFile);
+  free(mem);
+
+  fclose(pFile);
+  return;
+}
+
+
+BPNN *bpnn_read(char *filename)
+{
+  char *mem;
+  BPNN *new_t;
+  int fd, n1, n2, n3, i, j, memcnt;
+
+  if ((fd = open(filename, 0, 0644)) == -1) {
+    return (NULL);
+  }
+
+  printf("Reading '%s'\n", filename);  //fflush(stdout);
+
+  read(fd, (char *) &n1, sizeof(int));
+  read(fd, (char *) &n2, sizeof(int));
+  read(fd, (char *) &n3, sizeof(int));
+  new_t = bpnn_internal_create(n1, n2, n3);
+
+  printf("'%s' contains a %dx%dx%d network\n", filename, n1, n2, n3);
+  printf("Reading input weights...");  //fflush(stdout);
+
+  memcnt = 0;
+  mem = (char *) malloc ((unsigned) ((n1+1) * (n2+1) * sizeof(float)));
+  read(fd, mem, (n1+1) * (n2+1) * sizeof(float));
+  for (i = 0; i <= n1; i++) {
+    for (j = 0; j <= n2; j++) {
+      fastcopy(&(new_t->input_weights[i][j]), &mem[memcnt], sizeof(float));
+      memcnt += sizeof(float);
+    }
+  }
+  free(mem);
+
+  printf("Done\nReading hidden weights...");  //fflush(stdout);
+
+  memcnt = 0;
+  mem = (char *) malloc ((unsigned) ((n2+1) * (n3+1) * sizeof(float)));
+  read(fd, mem, (n2+1) * (n3+1) * sizeof(float));
+  for (i = 0; i <= n2; i++) {
+    for (j = 0; j <= n3; j++) {
+      fastcopy(&(new_t->hidden_weights[i][j]), &mem[memcnt], sizeof(float));
+      memcnt += sizeof(float);
+    }
+  }
+  free(mem);
+  close(fd);
+
+  printf("Done\n");  //fflush(stdout);
+
+  bpnn_zero_weights(new_t->input_prev_weights, n1, n2);
+  bpnn_zero_weights(new_t->hidden_prev_weights, n2, n3);
+
+  return (new_t);
+}
--- a/tests/opencl/backprop/backprop.h
+++ b/tests/opencl/backprop/backprop.h
@ -0,0 +1,70 @@
+#ifndef _BACKPROP_H_
+#define _BACKPROP_H_
+
+#define BIGRND 0x7fffffff
+#define THREADS 256
+#define WIDTH 16  // shared memory width  
+#define HEIGHT 16 // shared memory height
+#define BLOCK_SIZE 16
+
+#define ETA 0.3       //eta value
+#define MOMENTUM 0.3  //momentum value
+#define NUM_THREAD 4  //OpenMP threads
+
+#include <CL/cl.h>
+
+typedef struct {
+  int input_n;                  /* number of input units */
+  int hidden_n;                 /* number of hidden units */
+  int output_n;                 /* number of output units */
+
+  float *input_units;          /* the input units */
+  float *hidden_units;         /* the hidden units */
+  float *output_units;         /* the output units */
+
+  float *hidden_delta;         /* storage for hidden unit error */
+  float *output_delta;         /* storage for output unit error */
+
+  float *target;               /* storage for target vector */
+
+  float **input_weights;       /* weights from input to hidden layer */
+  float **hidden_weights;      /* weights from hidden to output layer */
+
+                                /*** The next two are for momentum ***/
+  float **input_prev_weights;  /* previous change on input to hidden wgt */
+  float **hidden_prev_weights; /* previous change on hidden to output wgt */
+} BPNN;
+
+
+/*** User-level functions ***/
+
+//void bpnn_initialize();
+void bpnn_initialize(int seed);
+BPNN *bpnn_create(int n_in, int n_hidden, int n_out);
+void bpnn_free(BPNN *net);
+//BPNN *bpnn_create();
+//void bpnn_free();
+void bpnn_train(BPNN *net, float *eo, float *eh);
+//void bpnn_train();
+//void bpnn_feedforward();
+void bpnn_feedforward(BPNN *net);
+void bpnn_save(BPNN *net, char *filename);
+//void bpnn_save();
+//BPNN *bpnn_read();
+BPNN *bpnn_read(char *filename);
+void load(BPNN *net);
+int bpnn_train_kernel(BPNN *net, float *eo, float *eh);
+void bpnn_layerforward(float *l1, float *l2, float **conn, int n1, int n2);
+void bpnn_output_error(float *delta, float *target, float *output, int nj, float *err);
+void bpnn_hidden_error(float *delta_h, int nh, float *delta_o, int no, float **who, float *hidden, float *err); 
+void bpnn_adjust_weights(float *delta, int ndelta, float *ly, int nly, float **w, float **oldw);
+int setup(int argc, char** argv);
+float **alloc_2d_dbl(int m, int n);
+float squash(float x);
+
+/*** OpenCL config variables ***/
+extern int platform_id_inuse;
+extern int device_id_inuse;
+extern cl_device_type device_type;
+
+#endif
--- a/tests/opencl/backprop/backprop_ocl.cpp
+++ b/tests/opencl/backprop/backprop_ocl.cpp
@ -0,0 +1,402 @@
+// includes, system
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <sys/time.h>
+#include "backprop.h"
+
+#ifdef NV //NVIDIA
+	#include <oclUtils.h>
+#else
+	#include <CL/cl.h>
+#endif
+
+#ifdef TIMING
+    #include "timing.h"
+#endif
+
+////////////////////////////////////////////////////////////////////////////////
+
+// local variables
+static cl_context	    context;
+static cl_command_queue cmd_queue;
+static cl_device_id   * device_list;
+static cl_uint          num_devices;
+
+// OCL config
+int platform_id_inuse = 0;            // platform id in use (default: 0)
+int device_id_inuse = 0;              //device id in use (default : 0)
+cl_device_type device_type = CL_DEVICE_TYPE_GPU;
+
+//Primitives for timing
+#ifdef TIMING
+struct timeval tv;
+struct timeval tv_total_start, tv_total_end;
+struct timeval tv_init_end;
+struct timeval tv_h2d_start, tv_h2d_end;
+struct timeval tv_d2h_start, tv_d2h_end;
+struct timeval tv_kernel_start, tv_kernel_end;
+struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
+struct timeval tv_close_start, tv_close_end;
+float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time = 0,
+      d2h_time = 0, close_time = 0, total_time = 0;
+#endif
+
+static int initialize(void)
+{
+	cl_int result;
+	size_t size;
+    cl_uint num_platforms;
+
+    // get OpenCL platforms
+	if (clGetPlatformIDs(0, NULL, &num_platforms) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(0,0,*) failed\n"); return -1; }
+	cl_platform_id all_platform_id[num_platforms];
+	if (clGetPlatformIDs(num_platforms, all_platform_id, NULL) != CL_SUCCESS) { printf("ERROR: clGetPlatformIDs(*,*,0) failed\n"); return -1; }
+    cl_platform_id platform_id = all_platform_id[platform_id_inuse];
+
+    // get device
+    if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices) != CL_SUCCESS) { printf("ERROR: clGetDeviceIDs failed\n"); return -1; };
+	printf("num_devices = %d\n", num_devices);
+    if(device_id_inuse > num_devices) {
+        printf("Invalid Device Number\n");
+        return -1;
+    }
+	device_list = new cl_device_id[num_devices];
+	//device_list = (cl_device_id *)malloc(sizeof(cl_device_id)*num_devices);
+	if( !device_list ) { printf("ERROR: new cl_device_id[] failed\n"); return -1; }
+    if (clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, num_devices, device_list, NULL) != CL_SUCCESS) { printf("ERROR: clGetDeviceIDs failed\n"); return -1; };
+
+    // get device type
+    if (clGetDeviceInfo(device_list[device_id_inuse], CL_DEVICE_TYPE, sizeof(device_type), (void *)&device_type, NULL)!= CL_SUCCESS) { printf("ERROR: clGetDeviceIDs failed\n"); return -1; };
+
+	// create OpenCL context
+	cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0};
+	context = clCreateContextFromType( ctxprop, device_type, NULL, NULL, NULL );
+	if( !context ) { printf("ERROR: clCreateContextFromType(%s) failed\n", device_type == CL_DEVICE_TYPE_GPU ? "GPU" : "CPU"); return -1; }
+
+	// create command queue for the specific device
+#ifdef TIMING
+	cmd_queue = clCreateCommandQueue( context, device_list[device_id_inuse], CL_QUEUE_PROFILING_ENABLE, NULL );
+#else
+	cmd_queue = clCreateCommandQueue( context, device_list[device_id_inuse], 0, NULL );
+#endif
+	if( !cmd_queue ) { printf("ERROR: clCreateCommandQueue() failed\n"); return -1; }
+	return 0;
+}
+
+static int shutdown()
+{
+	// release resources
+	if( cmd_queue ) clReleaseCommandQueue( cmd_queue );
+	if( context ) clReleaseContext( context );
+	if( device_list ) delete[] device_list;
+
+	// reset all variables
+	cmd_queue = 0;
+	context = 0;
+	device_list = 0;
+	num_devices = 0;
+	device_type = CL_DEVICE_TYPE_GPU;
+
+	return 0;
+}
+
+double gettime() {
+  struct timeval t;
+  gettimeofday(&t,NULL);
+  return t.tv_sec+t.tv_usec*1e-6;
+}
+
+int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+  
+  fclose(fp);
+  
+  return 0;
+}
+
+
+unsigned int num_threads = 0;
+unsigned int num_blocks = 0;
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int
+main( int argc, char** argv)
+{
+	setup(argc, argv);
+}
+
+
+int bpnn_train_kernel(BPNN *net, float *eo, float *eh)
+{
+	int in, hid, out;
+	float out_err, hid_err;
+
+	in = net->input_n;
+	hid = net->hidden_n;
+	out = net->output_n;
+
+	int sourcesize = 1024*1024;
+	char * source = (char *)calloc(sourcesize, sizeof(char));
+	if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; }
+
+	// read the kernel core source
+	const char * kernel_bp1  = "bpnn_layerforward_ocl";
+	const char * kernel_bp2  = "bpnn_adjust_weights_ocl";
+	//const char * tempchar = "./backprop_kernel.cl";
+  /*
+	FILE * fp = fopen(tempchar, "rb");
+	if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; }
+	fread(source + strlen(source), sourcesize, 1, fp);
+	fclose(fp);
+  */
+#ifdef  TIMING
+    gettimeofday(&tv_total_start, NULL);
+#endif
+	if(initialize()) return -1;
+
+	// compile kernel
+	cl_int err = 0;
+  /*
+	const char * slist[2] = { source, 0 };
+	cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; }
+	err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
+	{ // show warnings/errors
+		//static char log[65536]; memset(log, 0, sizeof(log));
+		//cl_device_id device_id = 0;
+		//err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL);
+		//clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL);
+		//if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log);
+	}
+	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
+  */
+    uint8_t *kernel_bin = NULL;
+    size_t kernel_size;
+    cl_int binary_status = 0;
+    if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+      std::abort();
+
+    cl_program prog = clCreateProgramWithBinary(
+          context, 1, &device_list[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &err);
+    free(kernel_bin);
+
+	err = clBuildProgram(prog, 1, &device_list[device_id_inuse], NULL, NULL, NULL);
+	if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; }
+
+
+	cl_kernel kernel1;
+	cl_kernel kernel2;
+	kernel1 = clCreateKernel(prog, kernel_bp1, &err);
+	kernel2 = clCreateKernel(prog, kernel_bp2, &err);
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; }
+	clReleaseProgram(prog);
+
+#ifdef  TIMING
+	gettimeofday(&tv_init_end, NULL);
+	tvsub(&tv_init_end, &tv_total_start, &tv);
+	init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+
+	float *input_weights_one_dim;
+    float *input_weights_prev_one_dim;
+	float * partial_sum;
+	float sum;
+	float num_blocks = in / BLOCK_SIZE;
+
+	input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
+	input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float));
+	partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float));
+
+	// set global and local workitems
+	size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 };
+	size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 };
+
+	// this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights
+	// todo: fix mem allocation
+	int m = 0;
+	for (int k = 0; k <= in; k++) {
+		for (int j = 0; j <= hid; j++) {
+		input_weights_one_dim[m] = net->input_weights[k][j];
+		input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j];
+	    m++;
+		}
+	}
+
+	cl_mem input_hidden_ocl;
+	cl_mem input_ocl;
+	cl_mem output_hidden_ocl;
+	cl_mem hidden_partial_sum;
+	cl_mem hidden_delta_ocl;
+	cl_mem input_prev_weights_ocl;
+
+#ifdef  TIMING
+    gettimeofday(&tv_mem_alloc_start, NULL);
+#endif
+	input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;}
+	input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;}
+	output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;}
+	hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;}
+	hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;}
+	input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err );
+	if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;}
+#ifdef  TIMING
+    gettimeofday(&tv_mem_alloc_end, NULL);
+    tvsub(&tv_mem_alloc_end, &tv_mem_alloc_start, &tv);
+    mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+	printf("Performing %s computation\n", device_type == CL_DEVICE_TYPE_GPU ? "GPU" : "CPU");
+    cl_event event;
+    cl_event write_event[3];
+
+	//write buffers
+	err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, &write_event[0]);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; }
+
+	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, &write_event[1]);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
+#ifdef TIMING
+    h2d_time += probe_event_time(write_event[0],cmd_queue);
+    h2d_time += probe_event_time(write_event[1],cmd_queue);
+#endif
+    clReleaseEvent(write_event[0]);
+    clReleaseEvent(write_event[1]);
+
+	clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl);
+	clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl);
+	clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl);
+	clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum );
+	clSetKernelArg(kernel1, 4, sizeof(float) *  HEIGHT, (void*)NULL );
+	clSetKernelArg(kernel1, 5, sizeof(float ) *  HEIGHT * WIDTH, (void*)NULL );
+	clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in);
+	clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid);
+
+	err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 2, NULL, global_work, local_work, 0, 0, &event);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
+#ifdef TIMING
+    kernel_time += probe_event_time(event,cmd_queue);
+#endif
+    clReleaseEvent(event);
+
+	err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, &event);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: partial sum\n"); return -1; }
+#ifdef TIMING
+    d2h_time += probe_event_time(event,cmd_queue);
+#endif
+    clReleaseEvent(event);
+
+	for (int j = 1; j <= hid; j++) {
+		sum = 0.0;
+		for (int k = 0; k < num_blocks; k++) {
+		sum += partial_sum[k * hid + j-1] ;
+    }
+		sum += net->input_weights[0][j];
+		net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum)));
+	}
+
+
+	bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out);
+	bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err);
+	bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err);
+	bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights);
+
+	err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl,       1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, &write_event[0]);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; }
+
+	err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, &write_event[1]);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; }
+
+	err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl,       1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, &write_event[2]);
+	if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; }
+#ifdef TIMING
+    h2d_time += probe_event_time(write_event[0],cmd_queue);
+    h2d_time += probe_event_time(write_event[1],cmd_queue);
+    h2d_time += probe_event_time(write_event[2],cmd_queue);
+#endif
+    clReleaseEvent(write_event[0]);
+    clReleaseEvent(write_event[1]);
+    clReleaseEvent(write_event[2]);
+
+	clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl);
+	clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid);
+	clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl);
+	clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in);
+	clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl);
+	clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl );
+
+	err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, &event);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; }
+#ifdef TIMING
+    kernel_time += probe_event_time(event,cmd_queue);
+#endif
+    clReleaseEvent(event);
+
+	err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, &event);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_ocl\n"); return -1; }
+#ifdef TIMING
+    d2h_time += probe_event_time(event,cmd_queue);
+#endif
+    clReleaseEvent(event);
+
+	err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, &event);
+	if(err != CL_SUCCESS) { printf("ERROR: 1  clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; }
+#ifdef TIMING
+    d2h_time += probe_event_time(event,cmd_queue);
+#endif
+    clReleaseEvent(event);
+
+#ifdef  TIMING
+	gettimeofday(&tv_close_start, NULL);
+#endif
+
+	clReleaseMemObject(input_ocl);
+	clReleaseMemObject(output_hidden_ocl);
+	clReleaseMemObject(input_hidden_ocl);
+	clReleaseMemObject(hidden_partial_sum);
+	clReleaseMemObject(input_prev_weights_ocl);
+
+	free(input_weights_prev_one_dim);
+	free(partial_sum);
+	free(input_weights_one_dim);
+
+    shutdown();
+
+#ifdef  TIMING
+	gettimeofday(&tv_close_end, NULL);
+	tvsub(&tv_close_end, &tv_close_start, &tv);
+	close_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+	tvsub(&tv_close_end, &tv_total_start, &tv);
+	total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+
+	printf("Init: %f\n", init_time);
+	printf("MemAlloc: %f\n", mem_alloc_time);
+	printf("HtoD: %f\n", h2d_time);
+	printf("Exec: %f\n", kernel_time);
+	printf("DtoH: %f\n", d2h_time);
+	printf("Close: %f\n", close_time);
+	printf("Total: %f\n", total_time);
+#endif
+}
--- a/tests/opencl/backprop/common.mk
+++ b/tests/opencl/backprop/common.mk
@ -0,0 +1,123 @@
+XLEN ?= 32
+TOOLDIR ?= /opt
+
+TARGET ?= opaesim
+
+XRT_SYN_DIR  ?= ../../../hw/syn/xilinx/xrt
+XRT_DEVICE_INDEX ?= 0
+
+ifeq ($(XLEN),64)
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
+VX_CFLAGS += -march=rv64imafd -mabi=lp64d
+K_CFLAGS += -march=rv64imafd -mabi=ilp64d
+STARTUP_ADDR ?= 0x180000000
+else
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
+VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
+K_CFLAGS += -march=rv32imaf -mabi=ilp32f
+STARTUP_ADDR ?= 0x80000000
+endif
+
+RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
+RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
+
+POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
+POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
+
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+VORTEX_KN_PATH ?= $(realpath ../../../kernel)
+
+FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
+
+LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
+LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
+
+K_CFLAGS   += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
+K_CFLAGS   += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
+K_CFLAGS   += -I$(VORTEX_KN_PATH)/include -DNDEBUG -DLLVM_VOTEX
+K_LDFLAGS  += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a -lm
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
+CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing -Wno-unused-result
+CXXFLAGS += -pthread
+CXXFLAGS += -I$(POCL_RT_PATH)/include
+
+ifdef HOSTGPU
+	CXXFLAGS += -DHOSTGPU
+	LDFLAGS += -lOpenCL
+else
+	LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
+endif
+
+# Debugigng
+#ifdef DEBUG
+	CXXFLAGS += -g -O0
+#else    
+#	CXXFLAGS += -O2 -DNDEBUG
+#endif
+
+ifeq ($(TARGET), fpga)
+	OPAE_DRV_PATHS ?= libopae-c.so
+else
+ifeq ($(TARGET), asesim)
+	OPAE_DRV_PATHS ?= libopae-c-ase.so
+else
+ifeq ($(TARGET), opaesim)
+	OPAE_DRV_PATHS ?= libopae-c-sim.so
+endif	
+endif
+endif
+
+OBJS := $(addsuffix .o, $(notdir $(SRCS)))
+
+all: $(PROJECT) kernel.pocl
+ 
+kernel.pocl: kernel.cl
+	LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
+
+#backprop.o: backprop.c
+#	$(CXX) $(CXXFLAGS) backprop.c -c -Wno-unused-result
+	
+%.cc.o: %.cc
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+%.cpp.o: %.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+%.c.o: %.c
+	$(CC) $(CXXFLAGS) -c $< -o $@
+
+$(PROJECT): $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run-hostgpu: $(PROJECT) kernel.pocl
+	./$(PROJECT) $(OPTS)
+
+run-simx: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-rtlsim: $(PROJECT) kernel.pocl   
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-opae: $(PROJECT) kernel.pocl
+	SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-xrt: $(PROJECT) kernel.pocl
+ifeq ($(TARGET), hw)
+	SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+else
+	XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)	
+endif
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf $(PROJECT) *.o .depend
+
+clean-all: clean
+	rm -rf *.dump *.pocl
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
--- a/tests/opencl/backprop/facetrain.cpp
+++ b/tests/opencl/backprop/facetrain.cpp
@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "backprop.h"
+#include "omp.h"
+#include <CL/cl.h>
+#include <string.h>
+
+extern char *strcpy();
+extern void exit();
+
+int layer_size = 0;
+
+void backprop_face()
+{
+  BPNN *net;
+  int i;
+  float out_err, hid_err;
+  net = bpnn_create(layer_size, 16, 1); // (16, 1 can not be changed)
+
+  printf("Input layer size : %d\n", layer_size);
+  load(net);
+  //entering the training kernel, only one iteration
+  printf("Starting training kernel\n");
+  bpnn_train_kernel(net, &out_err, &hid_err);
+  bpnn_free(net);
+  printf("\nFinish the training for one iteration\n");
+}
+
+int setup(int argc, char **argv)
+{
+    layer_size = -1;
+
+    int cur_arg;
+	for (cur_arg = 1; cur_arg<argc; cur_arg++) {
+        if (strcmp(argv[cur_arg], "-h") == 0) {
+            fprintf(stderr, "usage: backprop <-n num of input elements> [-p platform_id] [-d device_id] [-t device_type]\n");
+            exit(0);
+        }
+        else if (strcmp(argv[cur_arg], "-n") == 0) {
+            if (argc >= cur_arg + 1) {
+                layer_size = atoi(argv[cur_arg+1]);
+                cur_arg++;
+            }
+        }
+        else if (strcmp(argv[cur_arg], "-p") == 0) {
+            if (argc >= cur_arg + 1) {
+                platform_id_inuse = atoi(argv[cur_arg+1]);
+                cur_arg++;
+            }
+        }
+        else if (strcmp(argv[cur_arg], "-d") == 0) {
+            if (argc >= cur_arg + 1) {
+                device_id_inuse = atoi(argv[cur_arg+1]);
+                cur_arg++;
+            }
+        }
+    }
+
+    if (layer_size % 16 != 0){
+        fprintf(stderr, "The number of input points must be divided by 16\n");
+        exit(0);
+    }
+
+    int seed = 7;
+    bpnn_initialize(seed);
+    backprop_face();
+
+    exit(0);
+}
--- a/tests/opencl/backprop/imagenet.cpp
+++ b/tests/opencl/backprop/imagenet.cpp
@ -0,0 +1,24 @@
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "backprop.h"
+
+extern int layer_size;
+
+void load(BPNN *net)
+//BPNN *net;
+{
+  float *units;
+  int nr, nc, imgsize, i, j, k;
+
+  nr = layer_size;
+  
+  imgsize = nr * nc;
+  units = net->input_units;
+
+  k = 1;
+  for (i = 0; i < nr; i++) {
+	  units[k] = (float) rand()/RAND_MAX ;
+	  k++;
+    }
+}
--- a/tests/opencl/backprop/kernel.cl
+++ b/tests/opencl/backprop/kernel.cl
@ -0,0 +1,90 @@
+#define THREADS 256
+#define WIDTH 16  
+#define HEIGHT 16 
+#define ETA 0.3f       
+#define MOMENTUM 0.3f  
+
+#ifndef _BACKPROP_CUDA_KERNEL_H_
+#define _BACKPROP_CUDA_KERNEL_H_
+#define WM(i, j)   weight_matrix[(j) + (i) * WIDTH]
+
+__kernel void 
+bpnn_layerforward_ocl(__global float *input_cuda,
+	                  __global float *output_hidden_cuda,
+					  __global float *input_hidden_cuda,
+					  __global float *hidden_partial_sum,
+					  __local float *input_node,
+					  __local float *weight_matrix,
+					  int in,
+					  int hid) 
+{
+
+   int by = get_group_id(1);
+   int tx = get_local_id(0);
+   int ty = get_local_id(1);
+
+   int index =  ( hid + 1 ) * HEIGHT * by + ( hid + 1 ) * ty + tx + 1 + ( hid + 1 ) ;  
+
+   int index_in = HEIGHT * by + ty + 1;
+   
+	if ( tx == 0 )
+		input_node[ty] = input_cuda[index_in] ;
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+		weight_matrix[ty * WIDTH + tx] =  input_hidden_cuda[index];
+		barrier(CLK_LOCAL_MEM_FENCE);
+   
+		weight_matrix[ty * WIDTH + tx]= weight_matrix[ty * WIDTH + tx] * input_node[ty];
+		barrier(CLK_LOCAL_MEM_FENCE);
+   
+		for ( int i = 1 ; i <= HEIGHT ; i=i*2){
+	//for ( int i = 1 ; i <= 4 ; i++){
+      int power_two = i; 
+		//int power_two = 2 << (i - 1);
+
+	    if( ty % power_two == 0 )
+		  weight_matrix[ty * WIDTH + tx]= weight_matrix[ty * WIDTH + tx] + weight_matrix[(ty + power_two/2)* WIDTH + tx];
+		  
+		barrier(CLK_LOCAL_MEM_FENCE);
+
+    }
+   
+    input_hidden_cuda[index] =  weight_matrix[ty * WIDTH + tx];
+   
+	barrier(CLK_LOCAL_MEM_FENCE);
+
+    if ( tx == 0 ) {
+	  hidden_partial_sum[by * hid + ty] = weight_matrix[tx* WIDTH + ty];
+    }
+
+}
+
+
+__kernel void  bpnn_adjust_weights_ocl( __global float * delta,   
+										 int hid,         
+										__global float * ly,      
+										 int in,          
+										__global float * w,       
+										__global float * oldw)  									
+{
+   
+   int by = get_group_id(1);
+   int tx = get_local_id(0);
+   int ty = get_local_id(1);
+	
+   int index =  ( hid + 1 ) * HEIGHT * by + ( hid + 1 ) * ty + tx + 1 + ( hid + 1 ) ;  
+   int index_y = HEIGHT * by + ty + 1;
+   int index_x = tx + 1;
+
+   w[index] += ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
+   oldw[index] = ((ETA * delta[index_x] * ly[index_y]) + (MOMENTUM * oldw[index]));
+
+   barrier(CLK_LOCAL_MEM_FENCE);
+
+   if (ty == 0 && by ==0){
+	w[index_x] += ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
+	oldw[index_x] = ((ETA * delta[index_x]) + (MOMENTUM * oldw[index_x]));
+   }
+
+}
+#endif 
--- a/tests/opencl/backprop/timing.cpp
+++ b/tests/opencl/backprop/timing.cpp
@ -0,0 +1,40 @@
+#include <stdio.h>
+
+#include "timing.h"
+
+void time_measure_start(struct timeval *tv)
+{
+	gettimeofday(tv, NULL);
+}
+
+void time_measure_end(struct timeval *tv)
+{
+	struct timeval tv_now, tv_diff;
+	double d;
+
+	gettimeofday(&tv_now, NULL);
+	tvsub(&tv_now, tv, &tv_diff);
+
+	d = (double) tv_diff.tv_sec * 1000.0 + (double) tv_diff.tv_usec / 1000.0;
+	printf("Time (Memory Copy and Launch) = %f (ms)\n", d);
+}
+
+float probe_event_time(cl_event event, cl_command_queue command_queue) {
+    cl_int error=0;
+    cl_ulong eventStart,eventEnd;
+    clFinish(command_queue);
+    error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
+                                    sizeof(cl_ulong),&eventStart,NULL);
+    if (error != CL_SUCCESS) {
+        printf("ERROR (%d) in event start profiling.\n", error);
+        return 0;
+    }
+    error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
+                                    sizeof(cl_ulong),&eventEnd,NULL);
+    if (error != CL_SUCCESS) {
+        printf("ERROR (%d) in event end profiling.\n", error);
+        return 0;
+    }
+
+    return (float)((eventEnd-eventStart)/1000000.0);
+}
--- a/tests/opencl/backprop/timing.h
+++ b/tests/opencl/backprop/timing.h
@ -0,0 +1,25 @@
+#ifndef __TIMING_H__
+#define __TIMING_H__
+
+#include <sys/time.h>
+#include <CL/cl.h>
+
+void time_measure_start(struct timeval *tv);
+void time_measure_end(struct timeval *tv);
+
+/* tvsub: ret = x - y. */
+static inline void tvsub(struct timeval *x,
+						 struct timeval *y,
+						 struct timeval *ret)
+{
+	ret->tv_sec = x->tv_sec - y->tv_sec;
+	ret->tv_usec = x->tv_usec - y->tv_usec;
+	if (ret->tv_usec < 0) {
+		ret->tv_sec--;
+		ret->tv_usec += 1000000;
+	}
+}
+
+float probe_event_time(cl_event, cl_command_queue);
+
+#endif
--- a/tests/opencl/bfs2/CLHelper.h
+++ b/tests/opencl/bfs2/CLHelper.h
@ -0,0 +1,931 @@
+//------------------------------------------
+//--cambine:helper function for OpenCL
+//--programmer:	Jianbin Fang
+//--date:	27/12/2010
+//------------------------------------------
+#ifndef _CL_HELPER_
+#define _CL_HELPER_
+
+#include <CL/cl.h>
+#include <vector>
+#include <iostream>
+#include <fstream>
+#include <string>
+
+#ifdef TIMING
+#include "timing.h"
+#endif
+
+using std::string;
+using std::ifstream;
+using std::cerr;
+using std::endl;
+//using std::cout;
+//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
+#define WORK_DIM 2	//work-items dimensions
+
+extern float init_time, mem_alloc_time, h2d_time, kernel_time,
+      d2h_time, close_time, total_time;
+
+struct oclHandleStruct {
+    cl_context              context;
+    cl_device_id            *devices;
+    cl_command_queue        queue;
+    cl_program              program;
+    cl_int		cl_status;
+    std::string error_str;
+    std::vector<cl_kernel>  kernel;
+};
+
+struct oclHandleStruct oclHandles;
+
+struct _clDeviceProp{
+  char device_name[100];
+};
+
+char kernel_file[100]  = "Kernels.cl";
+int total_kernels = 2;
+string kernel_names[2] = {"BFS_1", "BFS_2"};
+int work_group_size = 512;
+int platform_id_inuse = 0; // platform id in use (default: 0)
+int device_id_inuse = 0; //device id in use (default : 0)
+cl_device_type device_type = CL_DEVICE_TYPE_GPU;
+
+void _clGetDeviceProperties(int idx, _clDeviceProp *prop) throw(string){
+   
+   oclHandles.cl_status= clGetDeviceInfo(oclHandles.devices[idx], CL_DEVICE_NAME, 100, prop->device_name, NULL);
+   
+#ifdef ERRMSG
+	if(oclHandles.cl_status != CL_SUCCESS){
+	  oclHandles.error_str = "exception in _clGetDeviceProperties-> ";
+	  switch(oclHandles.cl_status){
+	  	case CL_INVALID_DEVICE:
+	  		oclHandles.error_str += "CL_INVALID_DEVICE";
+	  		break;
+	  	case CL_INVALID_VALUE:
+	  		oclHandles.error_str += "CL_INVALID_VALUE";
+	  		break;	   
+	  	default:
+	  		oclHandles.error_str += "unknown reasons";
+	  		break;	  		
+	  }
+	throw(oclHandles.error_str);
+	}	
+#endif
+}
+
+int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+  
+  fclose(fp);
+  
+  return 0;
+}
+
+/*
+ * Converts the contents of a file into a string
+ */
+string FileToString(const string fileName)
+{
+    ifstream f(fileName.c_str(), ifstream::in | ifstream::binary);
+
+    try {
+        size_t size;
+        char*  str;
+        string s;
+
+        if(f.is_open()) {
+            size_t fileSize;
+            f.seekg(0, ifstream::end);
+            size = fileSize = f.tellg();
+            f.seekg(0, ifstream::beg);
+
+            str = new char[size+1];
+            if (!str) throw(string("Could not allocate memory"));
+
+            f.read(str, fileSize);
+            f.close();
+            str[size] = '\0';
+
+            s = str;
+            delete [] str;
+            return s;
+        }
+    } catch(std::string msg) {
+        printf("Exception caught in FileToString(): %s\n");
+        if(f.is_open())
+            f.close();
+    } catch(...) {
+        printf("Exception caught in FileToString()\n");
+        if(f.is_open())
+            f.close();
+    }
+    string errorMsg = "FileToString()::Error: Unable to open file "
+                      + fileName;
+    throw(errorMsg);
+}
+//---------------------------------------
+//Read command line parameters
+//
+void _clCmdParams(int argc, char* argv[])
+{
+    for (int i =0; i < argc; ++i) {
+        switch (argv[i][1]) {
+        case 'g':	//--g stands for size of work group
+            if (++i < argc) {
+                sscanf(argv[i], "%u", &work_group_size);
+            } else {
+                printf("Could not read argument after option %d\n", argv[i-1]);
+                throw;
+            }
+            break;
+        case 'd':	 //--d stands for device id used in computaion
+            if (++i < argc) {
+                sscanf(argv[i], "%u", &device_id_inuse);
+            } else {
+                printf("Could not read argument after option %d\n", argv[i-1]);
+                throw;
+            }
+            break;
+        case 'p':   // --p stands for platform id used in computation
+            if (++i < argc) {
+                sscanf(argv[i], "%u", &platform_id_inuse);
+            } else {
+                printf("Could not read argument after option %d\n", argv[i-1]);
+                throw;
+            }
+            break;
+/*
+        case 't':   // --t stands for device type, 0:GPU, 1:CPU
+            if (++i < argc) {
+                sscanf(argv[i], "%u", &device_type);
+                device_type = (device_type == 0) ? CL_DEVICE_TYPE_GPU
+                    : CL_DEVICE_TYPE_CPU;
+            } else {
+                std::cerr << "Could not read argument after option " << argv[i-1] << std::endl;
+                throw;
+            }
+            break;
+*/
+        default:
+            ;
+        }
+    }
+
+}
+
+//---------------------------------------
+//Initlize CL objects
+//--description: there are 5 steps to initialize all the OpenCL objects needed
+//--revised on 04/01/2011: get the number of devices  and
+//  devices have no relationship with context
+void _clInit()
+{
+    cl_int resultCL;
+
+    oclHandles.context = NULL;
+    oclHandles.devices = NULL;
+    oclHandles.queue = NULL;
+    oclHandles.program = NULL;
+
+    cl_uint deviceListSize;
+
+    //-----------------------------------------------
+    //--cambine-1: find the available platforms and select one
+
+    cl_uint numPlatforms;
+    cl_platform_id targetPlatform = NULL;
+
+    resultCL = clGetPlatformIDs(0, NULL, &numPlatforms);
+    if (resultCL != CL_SUCCESS)
+        throw (string("InitCL()::Error: Getting number of platforms (clGetPlatformIDs)"));
+    printf("number of platforms:%d\n",numPlatforms);	//by cambine
+
+    if (!(numPlatforms > 0))
+        throw (string("InitCL()::Error: No platforms found (clGetPlatformIDs)"));
+
+    cl_platform_id* allPlatforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id));
+
+    resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
+    if (resultCL != CL_SUCCESS)
+        throw (string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
+
+    for (int i = 0; i < numPlatforms; i++) {
+        char pbuff[128];
+        resultCL = clGetPlatformInfo( allPlatforms[i],
+                                      CL_PLATFORM_VENDOR,
+                                      sizeof(pbuff),
+                                      pbuff,
+                                      NULL);
+        if (resultCL != CL_SUCCESS)
+            throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
+
+        printf("vendor is %s\n",pbuff);
+
+    }
+    /* Select the target platform. Default: first platform */
+    targetPlatform = allPlatforms[platform_id_inuse];
+    free(allPlatforms);
+
+    //-----------------------------------------------
+    //--cambine-3: detect OpenCL devices
+    /* First, get the size of device list */
+    oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_ALL, 0, NULL, &deviceListSize);
+    if(oclHandles.cl_status!=CL_SUCCESS) {
+        throw(string("exception in _clInit -> clGetDeviceIDs"));
+    }
+    if (deviceListSize == 0)
+        throw(string("InitCL()::Error: No devices found."));
+    printf("device number: %d\n");
+
+    /* Now, allocate the device list */
+    oclHandles.devices = (cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
+
+    if (oclHandles.devices == 0)
+        throw(string("InitCL()::Error: Could not allocate memory."));
+
+    /* Next, get the device list data */
+    oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_ALL, deviceListSize,
+            oclHandles.devices, NULL);
+    if(oclHandles.cl_status!=CL_SUCCESS) {
+        throw(string("exception in _clInit -> clGetDeviceIDs-2"));
+    }
+
+	  _clDeviceProp prop;
+  	_clGetDeviceProperties(device_id_inuse, &prop);
+	  printf("--cambine: device name=%s\n", prop.device_name);
+	
+    /* Then, get device type */
+    /*
+	oclHandles.cl_status = clGetDeviceInfo(oclHandles.devices[device_id_inuse],
+            CL_DEVICE_TYPE, sizeof(cl_device_type),	(void *)&device_type,
+            NULL);
+	if (oclHandles.cl_status != CL_SUCCESS) {
+        throw(string("error in Getting Device Info"));
+    }
+	if (device_type == CL_DEVICE_TYPE_GPU)
+        printf("Creating GPU Context\n");
+	else if (device_type == CL_DEVICE_TYPE_CPU)
+        printf("Creating CPU Context\n");
+	else
+        throw(string("unsupported device type"));
+      */
+    //-----------------------------------------------
+    //--cambine-2: create an OpenCL context
+    cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)targetPlatform, 0 };
+    /*
+    oclHandles.context = clCreateContextFromType(cprops,
+                         device_type,
+                         NULL,
+                         NULL,
+                         &resultCL);
+    */
+    oclHandles.context = clCreateContext(0, 
+                                        deviceListSize, 
+                                        oclHandles.devices, 
+                                        NULL,
+										                    NULL,
+                                        &resultCL);
+ 
+    if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
+        throw (string("InitCL()::Error: Creating Context (clCreateContextFromType)"));
+
+    //-----------------------------------------------
+    //--cambine-4: Create an OpenCL command queue
+#ifdef TIMING
+    oclHandles.queue = clCreateCommandQueue(oclHandles.context,
+            oclHandles.devices[device_id_inuse], CL_QUEUE_PROFILING_ENABLE,
+            &resultCL);
+#else
+    oclHandles.queue = clCreateCommandQueue(oclHandles.context,
+            oclHandles.devices[device_id_inuse], 0, &resultCL);
+
+#endif
+
+    if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
+        throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
+
+    //-----------------------------------------------
+    //--cambine-5: Load CL file, build CL program object, create CL kernel object
+    /* 
+    std::string  source_str = FileToString(kernel_file);
+    const char * source    = source_str.c_str();
+    size_t sourceSize[]    = { source_str.length() };
+
+    oclHandles.program = clCreateProgramWithSource(oclHandles.context,
+                         1,
+                         &source,
+                         sourceSize,
+                         &resultCL);
+
+    if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
+        throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)"));
+    */
+   uint8_t *kernel_bin = NULL;
+    size_t kernel_size;
+    cl_int binary_status = 0;
+    if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+      std::abort();
+
+    oclHandles.program = clCreateProgramWithBinary(
+          oclHandles.context, 1, &oclHandles.devices[device_id_inuse], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &resultCL);
+    free(kernel_bin);
+
+    //insert debug information
+    //std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
+    //options += " -cl-nv-opt-level=3";
+    resultCL = clBuildProgram(oclHandles.program, deviceListSize, oclHandles.devices, NULL, NULL,NULL);
+
+    if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) {
+        printf("InitCL()::Error: In clBuildProgram\n");
+
+        size_t length;
+        resultCL = clGetProgramBuildInfo(oclHandles.program,
+                                         oclHandles.devices[device_id_inuse],
+                                         CL_PROGRAM_BUILD_LOG,
+                                         0,
+                                         NULL,
+                                         &length);
+        if(resultCL != CL_SUCCESS)
+            throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
+
+        char* buffer = (char*)malloc(length);
+        resultCL = clGetProgramBuildInfo(oclHandles.program,
+                                         oclHandles.devices[device_id_inuse],
+                                         CL_PROGRAM_BUILD_LOG,
+                                         length,
+                                         buffer,
+                                         NULL);
+        if(resultCL != CL_SUCCESS)
+            throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)"));
+
+        printf("%s\n", buffer);
+        free(buffer);
+
+        throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
+    }
+
+    //get program information in intermediate representation
+#ifdef PTX_MSG
+    size_t binary_sizes[deviceListSize];
+    char * binaries[deviceListSize];
+    //figure out number of devices and the sizes of the binary for each device.
+    oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*deviceListSize, &binary_sizes, NULL );
+    if(oclHandles.cl_status!=CL_SUCCESS) {
+        throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
+    }
+
+    printf("--cambine: %d", binary_sizes);
+    //copy over all of the generated binaries.
+    for(int i=0; i<deviceListSize; i++)
+        binaries[i] = (char *)malloc( sizeof(char)*(binary_sizes[i]+1));
+    oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES, sizeof(char *)*deviceListSize, binaries, NULL );
+    if(oclHandles.cl_status!=CL_SUCCESS) {
+        throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
+    }
+    for(int i=0; i<deviceListSize; i++)
+        binaries[i][binary_sizes[i]] = '\0';
+    printf("--cambine:writing ptd information...\n");
+    FILE * ptx_file = fopen("cl.ptx","w");
+    if(ptx_file==NULL) {
+        throw(string("exceptions in allocate ptx file."));
+    }
+    fprintf(ptx_file,"%s",binaries[DEVICE_ID_INUSE]);
+    fclose(ptx_file);
+    printf("--cambine:writing ptd information done.\n");
+    for(int i=0; i<deviceListSize; i++)
+        free(binaries[i]);
+#endif
+
+    for (int nKernel = 0; nKernel < total_kernels; nKernel++) {
+        /* get a kernel object handle for a kernel with the given name */
+        cl_kernel kernel = clCreateKernel(oclHandles.program,
+                                          (kernel_names[nKernel]).c_str(),
+                                          &resultCL);
+
+        if ((resultCL != CL_SUCCESS) || (kernel == NULL)) {
+            string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" + kernel_names[nKernel] + "\"";
+            throw(errorMsg);
+        }
+
+        oclHandles.kernel.push_back(kernel);
+    }
+    //get resource alocation information
+#ifdef RES_MSG
+    char * build_log;
+    size_t ret_val_size;
+    oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSE], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
+    if(oclHandles.cl_status!=CL_SUCCESS) {
+        throw(string("exceptions in _InitCL -> getting resource information"));
+    }
+
+    build_log = (char *)malloc(ret_val_size+1);
+    oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSE], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
+    if(oclHandles.cl_status!=CL_SUCCESS) {
+        throw(string("exceptions in _InitCL -> getting resources allocation information-2"));
+    }
+    build_log[ret_val_size] = '\0';
+    printf("--cambine: %s\n", build_log);
+    free(build_log);
+#endif
+}
+
+//---------------------------------------
+//release CL objects
+void _clRelease()
+{
+    char errorFlag = false;
+
+    for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++) {
+        if (oclHandles.kernel[nKernel] != NULL) {
+            cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
+            if (resultCL != CL_SUCCESS) {
+                printf("ReleaseCL()::Error: In clReleaseKernel\n");
+                errorFlag = true;
+            }
+            oclHandles.kernel[nKernel] = NULL;
+        }
+        oclHandles.kernel.clear();
+    }
+
+    if (oclHandles.program != NULL) {
+        cl_int resultCL = clReleaseProgram(oclHandles.program);
+        if (resultCL != CL_SUCCESS) {
+            printf("ReleaseCL()::Error: In clReleaseProgram\n");
+            errorFlag = true;
+        }
+        oclHandles.program = NULL;
+    }
+
+    if (oclHandles.queue != NULL) {
+        cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
+        if (resultCL != CL_SUCCESS) {
+            printf("ReleaseCL()::Error: In clReleaseCommandQueue\n");
+            errorFlag = true;
+        }
+        oclHandles.queue = NULL;
+    }
+
+    free(oclHandles.devices);
+
+    if (oclHandles.context != NULL) {
+        cl_int resultCL = clReleaseContext(oclHandles.context);
+        if (resultCL != CL_SUCCESS) {
+            printf("ReleaseCL()::Error: In clReleaseContext\n");
+            errorFlag = true;
+        }
+        oclHandles.context = NULL;
+    }
+
+    if (errorFlag) throw(string("ReleaseCL()::Error encountered."));
+}
+//--------------------------------------------------------
+//--cambine:create buffer and then copy data from host to device
+cl_mem _clCreateAndCpyMem(int size, void * h_mem_source) throw(string)
+{
+    cl_mem d_mem;
+    d_mem = clCreateBuffer(oclHandles.context,	CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR,  \
+                           size, h_mem_source, &oclHandles.cl_status);
+#ifdef ERRMSG
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(string("excpetion in _clCreateAndCpyMem()"));
+#endif
+    return d_mem;
+}
+//-------------------------------------------------------
+//--cambine:	create read only  buffer for devices
+//--date:	17/01/2011
+cl_mem _clMallocRW(int size, void * h_mem_ptr) throw(string)
+{
+    cl_mem d_mem;
+    d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_WRITE, size, NULL, &oclHandles.cl_status);
+#ifdef ERRMSG
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(string("excpetion in _clMallocRW"));
+#endif
+    return d_mem;
+}
+//-------------------------------------------------------
+//--cambine:	create read and write buffer for devices
+//--date:	17/01/2011
+cl_mem _clMalloc(int size, void * h_mem_ptr) throw(string)
+{
+    cl_mem d_mem;
+    d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, NULL, &oclHandles.cl_status);
+#ifdef ERRMSG
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(string("excpetion in _clMalloc"));
+#endif
+    return d_mem;
+}
+
+//-------------------------------------------------------
+//--cambine:	transfer data from host to device
+//--date:	17/01/2011
+void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string)
+{
+    cl_event event;
+    oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem,
+            CL_TRUE, 0, size, h_mem_ptr, 0, NULL, &event);
+#ifdef ERRMSG
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(string("excpetion in _clMemcpyH2D"));
+#endif
+#ifdef TIMING
+    h2d_time += probe_event_time(event, oclHandles.queue);
+#endif
+}
+//--------------------------------------------------------
+//--cambine:create buffer and then copy data from host to device with pinned
+// memory
+cl_mem _clCreateAndCpyPinnedMem(int size, float* h_mem_source) throw(string)
+{
+    cl_mem d_mem, d_mem_pinned;
+    float * h_mem_pinned = NULL;
+    d_mem_pinned = clCreateBuffer(oclHandles.context,	CL_MEM_READ_ONLY|CL_MEM_ALLOC_HOST_PTR,  \
+                                  size, NULL, &oclHandles.cl_status);
+#ifdef ERRMSG
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
+#endif
+    //------------
+    d_mem = clCreateBuffer(oclHandles.context,	CL_MEM_READ_ONLY,  \
+                           size, NULL, &oclHandles.cl_status);
+#ifdef ERRMSG
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
+#endif
+    //----------
+    h_mem_pinned = (cl_float *)clEnqueueMapBuffer(oclHandles.queue, d_mem_pinned, CL_TRUE,  \
+                   CL_MAP_WRITE, 0, size, 0, NULL,  \
+                   NULL,  &oclHandles.cl_status);
+#ifdef ERRMSG
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
+#endif
+    int element_number = size/sizeof(float);
+    #pragma omp parallel for
+    for(int i=0; i<element_number; i++) {
+        h_mem_pinned[i] = h_mem_source[i];
+    }
+    //----------
+    oclHandles.cl_status = clEnqueueWriteBuffer(oclHandles.queue, d_mem, 	\
+                           CL_TRUE, 0, size, h_mem_pinned,  \
+                           0, NULL, NULL);
+#ifdef ERRMSG
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
+#endif
+
+    return d_mem;
+}
+
+
+//--------------------------------------------------------
+//--cambine:create write only buffer on device
+cl_mem _clMallocWO(int size) throw(string)
+{
+    cl_mem d_mem;
+    d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0, &oclHandles.cl_status);
+#ifdef ERRMSG
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(string("excpetion in _clCreateMem()"));
+#endif
+    return d_mem;
+}
+
+//--------------------------------------------------------
+//transfer data from device to host
+void _clMemcpyD2H(cl_mem d_mem, int size, void * h_mem) throw(string)
+{
+    cl_event event;
+    oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem, 0,0, &event);
+#ifdef ERRMSG
+    oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
+    switch(oclHandles.cl_status) {
+    case CL_INVALID_COMMAND_QUEUE:
+        oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+        break;
+    case CL_INVALID_CONTEXT:
+        oclHandles.error_str += "CL_INVALID_CONTEXT";
+        break;
+    case CL_INVALID_MEM_OBJECT:
+        oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+        break;
+    case CL_INVALID_VALUE:
+        oclHandles.error_str += "CL_INVALID_VALUE";
+        break;
+    case CL_INVALID_EVENT_WAIT_LIST:
+        oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
+        break;
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+        oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+        break;
+    case CL_OUT_OF_HOST_MEMORY:
+        oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+        break;
+    default:
+        oclHandles.error_str += "Unknown reason";
+        break;
+    }
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(oclHandles.error_str);
+#endif
+#ifdef TIMING
+    d2h_time += probe_event_time(event, oclHandles.queue);
+#endif
+}
+
+//--------------------------------------------------------
+//set kernel arguments
+void _clSetArgs(int kernel_id, int arg_idx, void * d_mem, int size = 0) throw(string)
+{
+    if(!size) {
+        oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, sizeof(d_mem), &d_mem);
+#ifdef ERRMSG
+        oclHandles.error_str = "excpetion in _clSetKernelArg() ";
+        switch(oclHandles.cl_status) {
+        case CL_INVALID_KERNEL:
+            oclHandles.error_str += "CL_INVALID_KERNEL";
+            break;
+        case CL_INVALID_ARG_INDEX:
+            oclHandles.error_str += "CL_INVALID_ARG_INDEX";
+            break;
+        case CL_INVALID_ARG_VALUE:
+            oclHandles.error_str += "CL_INVALID_ARG_VALUE";
+            break;
+        case CL_INVALID_MEM_OBJECT:
+            oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+            break;
+        case CL_INVALID_SAMPLER:
+            oclHandles.error_str += "CL_INVALID_SAMPLER";
+            break;
+        case CL_INVALID_ARG_SIZE:
+            oclHandles.error_str += "CL_INVALID_ARG_SIZE";
+            break;
+        case CL_OUT_OF_RESOURCES:
+            oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+            break;
+        case CL_OUT_OF_HOST_MEMORY:
+            oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+            break;
+        default:
+            oclHandles.error_str += "Unknown reason";
+            break;
+        }
+        if(oclHandles.cl_status != CL_SUCCESS)
+            throw(oclHandles.error_str);
+#endif
+    } else {
+        oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
+#ifdef ERRMSG
+        oclHandles.error_str = "excpetion in _clSetKernelArg() ";
+        switch(oclHandles.cl_status) {
+        case CL_INVALID_KERNEL:
+            oclHandles.error_str += "CL_INVALID_KERNEL";
+            break;
+        case CL_INVALID_ARG_INDEX:
+            oclHandles.error_str += "CL_INVALID_ARG_INDEX";
+            break;
+        case CL_INVALID_ARG_VALUE:
+            oclHandles.error_str += "CL_INVALID_ARG_VALUE";
+            break;
+        case CL_INVALID_MEM_OBJECT:
+            oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+            break;
+        case CL_INVALID_SAMPLER:
+            oclHandles.error_str += "CL_INVALID_SAMPLER";
+            break;
+        case CL_INVALID_ARG_SIZE:
+            oclHandles.error_str += "CL_INVALID_ARG_SIZE";
+            break;
+        case CL_OUT_OF_RESOURCES:
+            oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+            break;
+        case CL_OUT_OF_HOST_MEMORY:
+            oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+            break;
+        default:
+            oclHandles.error_str += "Unknown reason";
+            break;
+        }
+        if(oclHandles.cl_status != CL_SUCCESS)
+            throw(oclHandles.error_str);
+#endif
+    }
+}
+void _clFinish() throw(string)
+{
+    oclHandles.cl_status = clFinish(oclHandles.queue);
+#ifdef ERRMSG
+    oclHandles.error_str = "excpetion in _clFinish";
+    switch(oclHandles.cl_status) {
+    case CL_INVALID_COMMAND_QUEUE:
+        oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+        break;
+    case CL_OUT_OF_RESOURCES:
+        oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+        break;
+    case CL_OUT_OF_HOST_MEMORY:
+        oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+        break;
+    default:
+        oclHandles.error_str += "Unknown reasons";
+        break;
+
+    }
+    if(oclHandles.cl_status!=CL_SUCCESS) {
+        throw(oclHandles.error_str);
+    }
+#endif
+}
+//--------------------------------------------------------
+//--cambine:enqueue kernel
+void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string)
+{
+    cl_uint work_dim = WORK_DIM;
+    cl_event e[1];
+    if(work_items%work_group_size != 0)	//process situations that work_items cannot be divided by work_group_size
+        work_items = work_items + (work_group_size-(work_items%work_group_size));
+    size_t local_work_size[] = {work_group_size, 1};
+    size_t global_work_size[] = {work_items, 1};
+    oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
+                           global_work_size, local_work_size, 0, 0, &(e[0]) );
+#ifdef ERRMSG
+    oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
+    switch(oclHandles.cl_status) {
+    case CL_INVALID_PROGRAM_EXECUTABLE:
+        oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
+        break;
+    case CL_INVALID_COMMAND_QUEUE:
+        oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+        break;
+    case CL_INVALID_KERNEL:
+        oclHandles.error_str += "CL_INVALID_KERNEL";
+        break;
+    case CL_INVALID_CONTEXT:
+        oclHandles.error_str += "CL_INVALID_CONTEXT";
+        break;
+    case CL_INVALID_KERNEL_ARGS:
+        oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
+        break;
+    case CL_INVALID_WORK_DIMENSION:
+        oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
+        break;
+    case CL_INVALID_GLOBAL_WORK_SIZE:
+        oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
+        break;
+    case CL_INVALID_WORK_GROUP_SIZE:
+        oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
+        break;
+    case CL_INVALID_WORK_ITEM_SIZE:
+        oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
+        break;
+    case CL_INVALID_GLOBAL_OFFSET:
+        oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
+        break;
+    case CL_OUT_OF_RESOURCES:
+        oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+        break;
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+        oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+        break;
+    case CL_INVALID_EVENT_WAIT_LIST:
+        oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
+        break;
+    case CL_OUT_OF_HOST_MEMORY:
+        oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+        break;
+    default:
+        oclHandles.error_str += "Unkown reseason";
+        break;
+    }
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(oclHandles.error_str);
+#endif
+
+#ifdef TIMING
+    kernel_time += probe_event_time(e[0], oclHandles.queue);
+#endif
+    //_clFinish();
+    // oclHandles.cl_status = clWaitForEvents(1, &e[0]);
+    // #ifdef ERRMSG
+    // if (oclHandles.cl_status!= CL_SUCCESS)
+    //     throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
+    // #endif
+}
+void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, int group_y) throw(string)
+{
+    cl_uint work_dim = WORK_DIM;
+    size_t local_work_size[] = {group_x, group_y};
+    size_t global_work_size[] = {range_x, range_y};
+    cl_event e[1];
+    /*if(work_items%work_group_size != 0)	//process situations that work_items cannot be divided by work_group_size
+      work_items = work_items + (work_group_size-(work_items%work_group_size));*/
+    oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \
+                           global_work_size, local_work_size, 0, 0, &(e[0]) );
+#ifdef ERRMSG
+    oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
+    switch(oclHandles.cl_status) {
+    case CL_INVALID_PROGRAM_EXECUTABLE:
+        oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
+        break;
+    case CL_INVALID_COMMAND_QUEUE:
+        oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
+        break;
+    case CL_INVALID_KERNEL:
+        oclHandles.error_str += "CL_INVALID_KERNEL";
+        break;
+    case CL_INVALID_CONTEXT:
+        oclHandles.error_str += "CL_INVALID_CONTEXT";
+        break;
+    case CL_INVALID_KERNEL_ARGS:
+        oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
+        break;
+    case CL_INVALID_WORK_DIMENSION:
+        oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
+        break;
+    case CL_INVALID_GLOBAL_WORK_SIZE:
+        oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
+        break;
+    case CL_INVALID_WORK_GROUP_SIZE:
+        oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
+        break;
+    case CL_INVALID_WORK_ITEM_SIZE:
+        oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
+        break;
+    case CL_INVALID_GLOBAL_OFFSET:
+        oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
+        break;
+    case CL_OUT_OF_RESOURCES:
+        oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+        break;
+    case CL_MEM_OBJECT_ALLOCATION_FAILURE:
+        oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
+        break;
+    case CL_INVALID_EVENT_WAIT_LIST:
+        oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
+        break;
+    case CL_OUT_OF_HOST_MEMORY:
+        oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+        break;
+    default:
+        oclHandles.error_str += "Unkown reseason";
+        break;
+    }
+    if(oclHandles.cl_status != CL_SUCCESS)
+        throw(oclHandles.error_str);
+#endif
+
+#ifdef TIMING
+    kernel_time += probe_event_time(e[0], oclHandles.queue);
+#endif
+
+    //_clFinish();
+    /*oclHandles.cl_status = clWaitForEvents(1, &e[0]);
+    #ifdef ERRMSG
+        if (oclHandles.cl_status!= CL_SUCCESS)
+            throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
+    #endif*/
+}
+
+//--------------------------------------------------------
+//release OpenCL objects
+void _clFree(cl_mem ob) throw(string)
+{
+    if(ob!=NULL)
+        oclHandles.cl_status = clReleaseMemObject(ob);
+#ifdef ERRMSG
+    oclHandles.error_str = "excpetion in _clFree() ->";
+    switch(oclHandles.cl_status) {
+    case CL_INVALID_MEM_OBJECT:
+        oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
+        break;
+    case CL_OUT_OF_RESOURCES:
+        oclHandles.error_str += "CL_OUT_OF_RESOURCES";
+        break;
+    case CL_OUT_OF_HOST_MEMORY:
+        oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
+        break;
+    default:
+        oclHandles.error_str += "Unkown reseason";
+        break;
+    }
+    if (oclHandles.cl_status!= CL_SUCCESS)
+        throw(oclHandles.error_str);
+#endif
+}
+#endif //_CL_HELPER_
--- a/tests/opencl/bfs2/Makefile
+++ b/tests/opencl/bfs2/Makefile
@ -0,0 +1,7 @@
+PROJECT = bfs
+
+SRCS = bfs.cpp timer.cc
+
+OPTS ?= graph4096.txt 0 0 0 
+
+include ../common.mk
--- a/tests/opencl/bfs2/bfs
+++ b/tests/opencl/bfs2/bfs
--- a/tests/opencl/bfs2/bfs.cpp
+++ b/tests/opencl/bfs2/bfs.cpp
@ -0,0 +1,351 @@
+//--by Jianbin Fang
+
+#define __CL_ENABLE_EXCEPTIONS
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <cstring>
+
+#ifdef  PROFILING
+#include "timer.h"
+#endif
+
+#ifdef TIMING
+#include "timing.h"
+#endif
+
+#include "CLHelper.h"
+#include "util.h"
+
+#define MAX_THREADS_PER_BLOCK 256
+
+//Structure to hold a node information
+struct Node {
+    int starting;
+    int no_of_edges;
+};
+
+//Primitives for timing
+#ifdef TIMING
+struct timeval tv;
+struct timeval tv_total_start, tv_total_end;
+struct timeval tv_h2d_start, tv_h2d_end;
+struct timeval tv_d2h_start, tv_d2h_end;
+struct timeval tv_kernel_start, tv_kernel_end;
+struct timeval tv_mem_alloc_start, tv_mem_alloc_end;
+struct timeval tv_close_start, tv_close_end;
+float init_time = 0, mem_alloc_time = 0, h2d_time = 0, kernel_time= 0,
+      d2h_time = 0, close_time = 0, total_time = 0;
+#endif
+
+//----------------------------------------------------------
+//--bfs on cpu
+//--programmer:	jianbin
+//--date:	26/01/2011
+//--note: width is changed to the new_width
+//----------------------------------------------------------
+void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
+                 int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
+                 char *h_graph_visited, int *h_cost_ref)
+{
+    char stop;
+    int k = 0;
+    do {
+        //if no thread changes this value then the loop stops
+        stop=false;
+        for(int tid = 0; tid < no_of_nodes; tid++ ) {
+            if (h_graph_mask[tid] == true) {
+                h_graph_mask[tid]=false;
+                for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++) {
+                    int id = h_graph_edges[i];	//--cambine: node id is connected with node tid
+                    if(!h_graph_visited[id]) {	//--cambine: if node id has not been visited, enter the body below
+                        h_cost_ref[id]=h_cost_ref[tid]+1;
+                        h_updating_graph_mask[id]=true;
+                    }
+                }
+            }
+        }
+
+        for(int tid=0; tid< no_of_nodes ; tid++ ) {
+            if (h_updating_graph_mask[tid] == true) {
+                h_graph_mask[tid]=true;
+                h_graph_visited[tid]=true;
+                stop=true;
+                h_updating_graph_mask[tid]=false;
+            }
+        }
+        k++;
+    } while(stop);
+}
+//----------------------------------------------------------
+//--breadth first search on GPUs
+//----------------------------------------------------------
+void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \
+                 int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \
+                 char *h_graph_visited, int *h_cost)
+throw(std::string)
+{
+
+    //int number_elements = height*width;
+    char h_over;
+    cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \
+    d_graph_visited, d_cost, d_over;
+    try {
+#ifdef  TIMING
+        gettimeofday(&tv_total_start, NULL);
+#endif
+
+        _clInit();
+
+#ifdef  TIMING
+        gettimeofday(&tv_mem_alloc_start, NULL);
+        tvsub(&tv_mem_alloc_start, &tv_total_start, &tv);
+        init_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+        //--1 transfer data from host to device
+        d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes);
+        d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges);
+        d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask);
+        d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask);
+        d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited);
+
+        d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost);
+        d_over = _clMallocRW(sizeof(char), &h_over);
+
+#ifdef  TIMING
+        gettimeofday(&tv_mem_alloc_end, NULL);
+        tvsub(&tv_mem_alloc_end, &tv_mem_alloc_start, &tv);
+        mem_alloc_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+#endif
+
+        _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes);
+        _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges);
+        _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask);
+        _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask);
+        _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited);
+        _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost);
+
+        //--2 invoke kernel
+#ifdef	PROFILING
+        timer kernel_timer;
+        double kernel_time = 0.0;
+        kernel_timer.reset();
+        kernel_timer.start();
+#endif
+        do {
+            h_over = false;
+            _clMemcpyH2D(d_over, sizeof(char), &h_over);
+
+            //--kernel 0
+            int kernel_id = 0;
+            int kernel_idx = 0;
+            _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
+            _clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
+            _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
+            _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
+            _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
+            _clSetArgs(kernel_id, kernel_idx++, d_cost);
+            _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
+
+            //int work_items = no_of_nodes;
+            _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
+
+            //--kernel 1
+            kernel_id = 1;
+            kernel_idx = 0;
+            _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
+            _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
+            _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
+            _clSetArgs(kernel_id, kernel_idx++, d_over);
+            _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
+
+            //work_items = no_of_nodes;
+            _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
+
+            _clMemcpyD2H(d_over,sizeof(char), &h_over);
+        } while(h_over);
+
+        _clFinish();
+#ifdef	PROFILING
+        kernel_timer.stop();
+        kernel_time = kernel_timer.getTimeInSeconds();
+#endif
+
+        //--3 transfer data from device to host
+        _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost);
+
+#ifdef  TIMING
+        gettimeofday(&tv_close_start, NULL);
+#endif
+        //--statistics
+#ifdef	PROFILING
+        printf("kernel time(s): %d\n", kernel_time );
+#endif
+        //--4 release cl resources.
+        _clFree(d_graph_nodes);
+        _clFree(d_graph_edges);
+        _clFree(d_graph_mask);
+        _clFree(d_updating_graph_mask);
+        _clFree(d_graph_visited);
+        _clFree(d_cost);
+        _clFree(d_over);
+        _clRelease();
+    } catch(std::string msg) {
+        _clFree(d_graph_nodes);
+        _clFree(d_graph_edges);
+        _clFree(d_graph_mask);
+        _clFree(d_updating_graph_mask);
+        _clFree(d_graph_visited);
+        _clFree(d_cost);
+        _clFree(d_over);
+        _clRelease();
+        std::string e_str = "in run_transpose_gpu -> ";
+        e_str += msg;
+        throw(e_str);
+    }
+#ifdef  TIMING
+        gettimeofday(&tv_close_end, NULL);
+        tvsub(&tv_close_end, &tv_close_start, &tv);
+        close_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+        tvsub(&tv_close_end, &tv_total_start, &tv);
+        total_time = tv.tv_sec * 1000.0 + (float) tv.tv_usec / 1000.0;
+
+        printf("Init: %f\n", init_time);
+        printf("MemAlloc: %f\n", mem_alloc_time);
+        printf("HtoD: %f\n", h2d_time);
+        printf("Exec: %f\n", kernel_time);
+        printf("DtoH: %f\n", d2h_time);
+        printf("Close: %f\n", close_time);
+        printf("Total: %f\n", total_time);
+#endif
+
+    return ;
+}
+void Usage(int argc, char**argv)
+{
+
+    fprintf(stderr,"Usage: %s <input_file> [-p platform] [-d device] [-t gpu(0)/cpu(1)]\n", argv[0]);
+
+}
+//----------------------------------------------------------
+//--cambine:	main function
+//--author:		created by Jianbin Fang
+//--date:		25/01/2011
+//----------------------------------------------------------
+int main(int argc, char * argv[])
+{
+    int no_of_nodes;
+    int edge_list_size;
+    FILE *fp;
+    Node* h_graph_nodes;
+    char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
+    try {
+        char *input_f;
+        if(argc < 2) {
+            Usage(argc, argv);
+            exit(0);
+        }
+
+        _clCmdParams(argc, argv);
+        input_f = argv[1];
+        printf("Reading File\n");
+        //Read in Graph from a file
+        fp = fopen(input_f,"r");
+        if(!fp) {
+            printf("Error Reading graph file\n");
+            return 0;
+        }
+
+        int source = 0;
+
+        fscanf(fp,"%d",&no_of_nodes);
+
+        int num_of_blocks = 1;
+        int num_of_threads_per_block = no_of_nodes;
+
+        //Make execution Parameters according to the number of nodes
+        //Distribute threads across multiple Blocks if necessary
+        if(no_of_nodes>MAX_THREADS_PER_BLOCK) {
+            num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK);
+            num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
+        }
+        work_group_size = num_of_threads_per_block;
+        // allocate host memory
+        h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes);
+        h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
+        h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes);
+        h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes);
+
+        int start, edgeno;
+        // initalize the memory
+        for(int i = 0; i < no_of_nodes; i++) {
+            fscanf(fp,"%d %d",&start,&edgeno);
+            h_graph_nodes[i].starting = start;
+            h_graph_nodes[i].no_of_edges = edgeno;
+            h_graph_mask[i]=false;
+            h_updating_graph_mask[i]=false;
+            h_graph_visited[i]=false;
+        }
+        //read the source node from the file
+        fscanf(fp,"%d",&source);
+        source=0;
+        //set the source node as true in the mask
+        h_graph_mask[source]=true;
+        h_graph_visited[source]=true;
+        fscanf(fp,"%d",&edge_list_size);
+        int id,cost;
+        int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size);
+        for(int i=0; i < edge_list_size ; i++) {
+            fscanf(fp,"%d",&id);
+            fscanf(fp,"%d",&cost);
+            h_graph_edges[i] = id;
+        }
+
+        if(fp)
+            fclose(fp);
+        // allocate mem for the result on host side
+        int	*h_cost = (int*) malloc(sizeof(int)*no_of_nodes);
+        int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes);
+        for(int i=0; i<no_of_nodes; i++) {
+            h_cost[i]=-1;
+            h_cost_ref[i] = -1;
+        }
+        h_cost[source]=0;
+        h_cost_ref[source]=0;
+        //---------------------------------------------------------
+        //--gpu entry
+        run_bfs_gpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
+        //---------------------------------------------------------
+        //--cpu entry
+        // initalize the memory again
+        for(int i = 0; i < no_of_nodes; i++) {
+            h_graph_mask[i]=false;
+            h_updating_graph_mask[i]=false;
+            h_graph_visited[i]=false;
+        }
+        //set the source node as true in the mask
+        source=0;
+        h_graph_mask[source]=true;
+        h_graph_visited[source]=true;
+        run_bfs_cpu(no_of_nodes,h_graph_nodes,edge_list_size,h_graph_edges, h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost_ref);
+        //---------------------------------------------------------
+        //--result varification
+        compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
+        //release host memory
+        free(h_graph_nodes);
+        free(h_graph_mask);
+        free(h_updating_graph_mask);
+        free(h_graph_visited);
+
+    } catch(std::string msg) {
+        printf("--cambine: exception in main -> %s\n", msg);
+        //release host memory
+        free(h_graph_nodes);
+        free(h_graph_mask);
+        free(h_updating_graph_mask);
+        free(h_graph_visited);
+    }
+
+    return 0;
+}
--- a/tests/opencl/bfs2/kernel.cl
+++ b/tests/opencl/bfs2/kernel.cl
@ -0,0 +1,50 @@
+/* ============================================================
+//--cambine: kernel funtion of Breadth-First-Search
+//--author:	created by Jianbin Fang
+//--date:	06/12/2010
+============================================================ */
+#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
+//Structure to hold a node information
+typedef struct{
+	int starting;
+	int no_of_edges;
+} Node;
+//--7 parameters
+__kernel void BFS_1( const __global Node* g_graph_nodes,
+					const __global int* g_graph_edges, 
+					__global char* g_graph_mask, 
+					__global char* g_updating_graph_mask, 
+					__global char* g_graph_visited, 
+					__global int* g_cost, 
+					const  int no_of_nodes){
+	int tid = get_global_id(0);
+	if( tid<no_of_nodes && g_graph_mask[tid]){
+		g_graph_mask[tid]=false;
+		for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++){
+			int id = g_graph_edges[i];
+			if(!g_graph_visited[id]){
+				g_cost[id]=g_cost[tid]+1;
+				g_updating_graph_mask[id]=true;
+				}
+			}
+	}	
+}
+
+//--5 parameters
+__kernel void BFS_2(__global char* g_graph_mask, 
+					__global char* g_updating_graph_mask, 
+					__global char* g_graph_visited, 
+					__global char* g_over,
+					const  int no_of_nodes
+					) {
+	int tid = get_global_id(0);
+	if( tid<no_of_nodes && g_updating_graph_mask[tid]){
+
+		g_graph_mask[tid]=true;
+		g_graph_visited[tid]=true;
+		*g_over=true;
+		g_updating_graph_mask[tid]=false;
+	}
+}
+
+
--- a/tests/opencl/bfs2/timer.cc
+++ b/tests/opencl/bfs2/timer.cc
@ -0,0 +1,78 @@
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <iomanip>
+
+#include "timer.h"
+
+
+using namespace std;
+
+double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
+
+
+double timer::get_CPU_speed_in_MHz()
+{
+#if defined __linux__
+    ifstream infile("/proc/cpuinfo");
+    char     buffer[256], *colon;
+
+    while (infile.good()) {
+	infile.getline(buffer, 256);
+
+	if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
+	    return atof(colon + 2);
+    }
+#endif
+
+    return 0.0;
+}
+
+
+void timer::print_time(ostream &str, const char *which, double time) const
+{
+    static const char *units[] = { " ns", " us", " ms", "  s", " ks", 0 };
+    const char	      **unit   = units;
+
+    time = 1000.0 * time / CPU_speed_in_MHz;
+
+    while (time >= 999.5 && unit[1] != 0) {
+	time /= 1000.0;
+	++ unit;
+    }
+
+    str << which << " = " << setprecision(3) << setw(4) << time << *unit;
+}
+
+
+ostream &timer::print(ostream &str)
+{
+    str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right;
+
+    if (CPU_speed_in_MHz == 0)
+	str << "could not determine CPU speed\n";
+    else if (count > 0) {
+	double total = static_cast<double>(total_time);
+
+	print_time(str, "avg", total / static_cast<double>(count));
+	print_time(str, ", total", total);
+	str << ", count = " << setw(9) << count << '\n';
+    }
+    else
+	str << "not used\n";
+
+    return str;
+}
+
+
+ostream &operator << (ostream &str, class timer &timer)
+{
+    return timer.print(str);
+}
+
+double timer::getTimeInSeconds()
+{
+    double total = static_cast<double>(total_time);
+    double res = (total / 1000000.0) / CPU_speed_in_MHz;
+    return res;
+}
--- a/tests/opencl/bfs2/timer.h
+++ b/tests/opencl/bfs2/timer.h
@ -0,0 +1,128 @@
+#ifndef timer_h
+#define timer_h
+
+#include <iostream>
+
+
+class timer {
+    public:
+			   timer(const char *name = 0);
+			   timer(const char *name, std::ostream &write_on_exit);
+
+			   ~timer();
+
+	void		   start(), stop();
+	void		   reset();
+	std::ostream   	   &print(std::ostream &);
+
+	double             getTimeInSeconds();
+
+    private:
+	void		   print_time(std::ostream &, const char *which, double time) const;
+
+	union {
+	    long long	   total_time;
+	    struct {
+#if defined __PPC__
+		int	   high, low;
+#else
+		int	   low, high;
+#endif
+	    };
+	};
+
+	unsigned long long count;
+	const char	   *const name;
+	std::ostream	   *const write_on_exit;
+
+	static double	   CPU_speed_in_MHz, get_CPU_speed_in_MHz();
+};
+
+
+std::ostream &operator << (std::ostream &, class timer &);
+
+
+inline void timer::reset()
+{
+    total_time = 0;
+    count      = 0;
+}
+
+
+inline timer::timer(const char *name)
+:
+    name(name),
+    write_on_exit(0)
+{
+    reset();
+}
+
+
+inline timer::timer(const char *name, std::ostream &write_on_exit)
+:
+    name(name),
+    write_on_exit(&write_on_exit)
+{
+    reset();
+}
+
+
+inline timer::~timer()
+{
+    if (write_on_exit != 0)
+	print(*write_on_exit);
+}
+
+
+inline void timer::start()
+{
+#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
+    unsigned eax, edx;
+
+    asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
+
+    total_time -= ((unsigned long long) edx << 32) + eax;
+#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
+    asm volatile
+    (
+	"rdtsc\n\t"
+	"subl %%eax, %0\n\t"
+	"sbbl %%edx, %1"
+    :
+	"+m" (low), "+m" (high)
+    :
+    :
+	"eax", "edx"
+    );
+#else
+#error Compiler/Architecture not recognized
+#endif
+}
+
+
+inline void timer::stop()
+{
+#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
+    unsigned eax, edx;
+
+    asm volatile ("rdtsc" : "=a" (eax), "=d" (edx));
+
+    total_time += ((unsigned long long) edx << 32) + eax;
+#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64)
+    asm volatile
+    (
+	"rdtsc\n\t"
+	"addl %%eax, %0\n\t"
+	"adcl %%edx, %1"
+    :
+	"+m" (low), "+m" (high)
+    :
+    :
+	"eax", "edx"
+    );
+#endif
+
+    ++ count;
+}
+
+#endif
--- a/tests/opencl/bfs2/util.h
+++ b/tests/opencl/bfs2/util.h
@ -0,0 +1,72 @@
+#ifndef _C_UTIL_
+#define _C_UTIL_
+#include <math.h>
+#include <iostream>
+#include <omp.h>
+//-------------------------------------------------------------------
+//--initialize array with maximum limit
+//-------------------------------------------------------------------
+template<typename datatype>
+void fill(datatype *A, const int n, const datatype maxi){
+    for (int j = 0; j < n; j++) 
+    {
+        A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
+    }
+}
+
+//--print matrix
+template<typename datatype>
+void print_matrix(datatype *A, int height, int width){
+	for(int i=0; i<height; i++){
+		for(int j=0; j<width; j++){
+			int idx = i*width + j;
+			std::cout<<A[idx]<<" ";
+		}
+		std::cout<<std::endl;
+	}
+
+	return;
+}
+//-------------------------------------------------------------------
+//--verify results
+//-------------------------------------------------------------------
+#define MAX_RELATIVE_ERROR  .002
+template<typename datatype>
+void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
+
+    char passed = true; 
+#pragma omp parallel for
+    for (int i=0; i<size; i++){
+      if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
+         passed = false; 
+      }
+    }
+    if (passed){
+        printf("--cambine:passed:-)\n");
+    }
+    else{
+        printf("--cambine: failed:-(\n");
+    }
+    return ;
+}
+template<typename datatype>
+void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
+
+    char passed = true; 
+//#pragma omp parallel for
+    for (int i=0; i<size; i++){
+      if (cpu_results[i]!=gpu_results[i]){
+         passed = false; 
+      }
+    }
+    if (passed){
+        printf("--cambine:passed:-)\n");
+    }
+    else{
+        printf("--cambine: failed:-(\n");
+    }
+    return ;
+}
+
+#endif
+
--- a/tests/opencl/cfd/.gitignore
+++ b/tests/opencl/cfd/.gitignore
@ -0,0 +1,2 @@
+*.txt
+*.out
--- a/tests/opencl/cfd/CLHelper.h
+++ b/tests/opencl/cfd/CLHelper.h
--- a/tests/opencl/cfd/Makefile
+++ b/tests/opencl/cfd/Makefile
@ -0,0 +1,7 @@
+PROJECT = cfd
+
+SRCS = euler3d.cc timing.cpp
+
+OPTS ?= fvcorr.domn.097K
+
+include ../common.mk
--- a/tests/opencl/cfd/Makefile_old
+++ b/tests/opencl/cfd/Makefile_old
@ -0,0 +1,36 @@
+include ../../common/make.config
+
+#Can be changed by `make TYPE=CPU`
+TYPE = GPU
+
+#Library
+ifeq ($(TYPE),GPU)
+	OPENCL_INC = $(NV_OPENCL_INC)
+	OPENCL_LIB = $(NV_OPENCL_LIB)
+else
+	OPENCL_INC = $(INTEL_OPENCL_INC)
+	OPENCL_LIB = $(INTEL_OPENCL_LIB)
+endif
+
+#C compiler
+CC = g++
+
+FLAGS = -O3
+SRC = euler3d.cpp ../util/timing.c
+
+EXE = euler3d.out
+
+release:$(SRC)
+	$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -I../util -DTIMING $(FLAGS)
+
+errmsg:$(SRC)
+	$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB) -lOpenCL -D ERRMSG $(FLAGS)
+
+ptx:$(SRC)
+	$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB)  -lOpenCL -D PTX_MSG $(FLAGS)
+
+res:$(SRC)
+	$(CC) $(KERNEL_DIM) $(SRC) -o $(EXE) -I$(OPENCL_INC) -L$(OPENCL_LIB)  -lOpenCL -D RES_MSG $(FLAGS)
+
+clean: $(SRC)
+	rm -f $(EXE) $(EXE).linkinfo result*
--- a/tests/opencl/cfd/README
+++ b/tests/opencl/cfd/README
@ -0,0 +1,10 @@
+******Adjustable work group size*****
+RD_WG_SIZE  for all
+RD_WG_SIZE_1 or RD_WG_SIZE_1_0 for initialize_variables
+RD_WG_SIZE_2 or RD_WG_SIZE_2_0 for compute_step_factor
+RD_WG_SIZE_3 or RD_WG_SIZE_3_0 for compute_flux
+RD_WG_SIZE_4 or RD_WG_SIZE_4_0 for time_step
+
+USAGE:
+make clean
+make KERNEL_DIM="-DRD_WG_SIZE_1=128 -DRD_WG_SIZE_2=192 -DRD_WG_SIZE_3=128 -DRD_WG_SIZE_4=256"
--- a/tests/opencl/cfd/cfd
+++ b/tests/opencl/cfd/cfd
--- a/tests/opencl/cfd/euler3d.cc
+++ b/tests/opencl/cfd/euler3d.cc
@ -0,0 +1,418 @@
+/********************************************************************
+	euler3d.cpp
+	: parallelized code of CFD
+	
+	- original code from the AIAA-2009-4001 by Andrew Corrigan, acorriga@gmu.edu
+	- parallelization with OpenCL API has been applied by
+	Jianbin Fang - j.fang@tudelft.nl
+	Delft University of Technology
+	Faculty of Electrical Engineering, Mathematics and Computer Science
+	Department of Software Technology
+	Parallel and Distributed Systems Group
+	on 24/03/2011
+********************************************************************/
+
+#include <iostream>
+#include <fstream>
+#include <math.h>
+#include "CLHelper.h" 
+ 
+/*
+ * Options 
+ * 
+ */ 
+#define GAMMA 1.4f
+#define iterations 2000
+#ifndef block_length
+	#define block_length 192
+#endif
+
+#define NDIM 3
+#define NNB 4
+
+#define RK 3	// 3rd order RK
+#define ff_mach 1.2f
+#define deg_angle_of_attack 0.0f
+
+/*
+ * not options
+ */
+
+
+#if block_length > 128
+#warning "the kernels may fail too launch on some systems if the block length is too large"
+#endif
+
+
+#define VAR_DENSITY 0
+#define VAR_MOMENTUM  1
+#define VAR_DENSITY_ENERGY (VAR_MOMENTUM+NDIM)
+#define NVAR (VAR_DENSITY_ENERGY+1)
+
+//self-defined user type
+typedef struct{
+	float x;
+	float y;
+	float z;
+} float3;
+
+/*
+ * Generic functions
+ */
+template <typename T>
+cl_mem alloc(int N){
+	cl_mem mem_d = _clMalloc(sizeof(T)*N);
+	return mem_d;
+}
+
+template <typename T>
+void dealloc(cl_mem array){
+	_clFree(array);
+}
+
+template <typename T>
+void copy(cl_mem dst, cl_mem src, int N){
+	_clMemcpyD2D(dst, src, N*sizeof(T));
+}
+
+template <typename T>
+void upload(cl_mem dst, T* src, int N){
+	_clMemcpyH2D(dst, src, N*sizeof(T));
+}
+
+template <typename T>
+void download(T* dst, cl_mem src, int N){
+	_clMemcpyD2H(dst, src, N*sizeof(T));
+}
+
+void dump(cl_mem variables, int nel, int nelr){
+	float* h_variables = new float[nelr*NVAR];
+	download(h_variables, variables, nelr*NVAR);
+
+	{
+		std::ofstream file("density.txt");
+		file << nel << " " << nelr << std::endl;
+		for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY*nelr] << std::endl;
+	}
+
+
+	{
+		std::ofstream file("momentum.txt");
+		file << nel << " " << nelr << std::endl;
+		for(int i = 0; i < nel; i++)
+		{
+			for(int j = 0; j != NDIM; j++)
+				file << h_variables[i + (VAR_MOMENTUM+j)*nelr] << " ";
+			file << std::endl;
+		}
+	}
+	
+	{
+		std::ofstream file("density_energy.txt");
+		file << nel << " " << nelr << std::endl;
+		for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY_ENERGY*nelr] << std::endl;
+	}
+	delete[] h_variables;
+}
+
+void initialize_variables(int nelr, cl_mem variables, cl_mem ff_variable) throw(string){
+
+	int work_items = nelr;
+	int work_group_size = BLOCK_SIZE_1;
+	int kernel_id = 1;
+	int arg_idx = 0;	
+	_clSetArgs(kernel_id, arg_idx++, variables);
+	_clSetArgs(kernel_id, arg_idx++, ff_variable);
+	_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
+	_clInvokeKernel(kernel_id, work_items, work_group_size);
+}
+
+void compute_step_factor(int nelr, cl_mem variables, cl_mem areas, cl_mem step_factors){
+
+	int work_items = nelr;
+	int work_group_size = BLOCK_SIZE_2;
+	int kernel_id = 2;
+	int arg_idx = 0;
+	_clSetArgs(kernel_id, arg_idx++, variables);
+	_clSetArgs(kernel_id, arg_idx++, areas);
+	_clSetArgs(kernel_id, arg_idx++, step_factors);
+	_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
+	_clInvokeKernel(kernel_id, work_items, work_group_size);
+}
+
+void compute_flux(int nelr, cl_mem elements_surrounding_elements, cl_mem normals, cl_mem variables, cl_mem ff_variable, \
+			cl_mem fluxes, cl_mem ff_flux_contribution_density_energy,
+			cl_mem ff_flux_contribution_momentum_x,
+			cl_mem ff_flux_contribution_momentum_y,
+			cl_mem ff_flux_contribution_momentum_z){
+
+	int work_items = nelr;
+	int work_group_size = BLOCK_SIZE_3;
+	int kernel_id = 3;
+	int arg_idx = 0;
+	_clSetArgs(kernel_id, arg_idx++, elements_surrounding_elements);
+	_clSetArgs(kernel_id, arg_idx++, normals);
+	_clSetArgs(kernel_id, arg_idx++, variables);
+	_clSetArgs(kernel_id, arg_idx++, ff_variable);
+	_clSetArgs(kernel_id, arg_idx++, fluxes);
+	_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_density_energy);
+	_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_x);
+	_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_y);
+	_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_z);
+	_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
+	_clInvokeKernel(kernel_id, work_items, work_group_size);
+}
+
+void time_step(int j, int nelr, cl_mem old_variables, cl_mem variables, cl_mem step_factors, cl_mem fluxes){
+
+	int work_items = nelr;
+	int work_group_size = BLOCK_SIZE_4;
+	int kernel_id = 4;
+	int arg_idx = 0;
+	_clSetArgs(kernel_id, arg_idx++, &j, sizeof(int));
+	_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
+	_clSetArgs(kernel_id, arg_idx++, old_variables);
+	_clSetArgs(kernel_id, arg_idx++, variables);
+	_clSetArgs(kernel_id, arg_idx++, step_factors);
+	_clSetArgs(kernel_id, arg_idx++, fluxes);
+	
+	_clInvokeKernel(kernel_id, work_items, work_group_size);
+}
+inline void compute_flux_contribution(float& density, float3& momentum, float& density_energy, float& pressure, float3& velocity, float3& fc_momentum_x, float3& fc_momentum_y, float3& fc_momentum_z, float3& fc_density_energy)
+{
+	fc_momentum_x.x = velocity.x*momentum.x + pressure;
+	fc_momentum_x.y = velocity.x*momentum.y;
+	fc_momentum_x.z = velocity.x*momentum.z;
+	
+	
+	fc_momentum_y.x = fc_momentum_x.y;
+	fc_momentum_y.y = velocity.y*momentum.y + pressure;
+	fc_momentum_y.z = velocity.y*momentum.z;
+
+	fc_momentum_z.x = fc_momentum_x.z;
+	fc_momentum_z.y = fc_momentum_y.z;
+	fc_momentum_z.z = velocity.z*momentum.z + pressure;
+
+	float de_p = density_energy+pressure;
+	fc_density_energy.x = velocity.x*de_p;
+	fc_density_energy.y = velocity.y*de_p;
+	fc_density_energy.z = velocity.z*de_p;
+}
+
+/*
+ * Main function
+ */
+int main(int argc, char** argv){
+  printf("WG size of kernel:initialize = %d, WG size of kernel:compute_step_factor = %d, WG size of kernel:compute_flux = %d, WG size of kernel:time_step = %d\n", BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
+
+	if (argc < 2){
+		//std::cout << "specify data file name and [device type] [device id]" << std::endl;
+		return 0;
+	}
+	const char* data_file_name = argv[1];
+	_clCmdParams(argc, argv);
+	cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z,  ff_flux_contribution_density_energy;
+	cl_mem areas, elements_surrounding_elements, normals;
+	cl_mem variables, old_variables, fluxes, step_factors;
+	float h_ff_variable[NVAR];
+
+	try{		
+		_clInit(device_type, device_id);		
+		// set far field conditions and load them into constant memory on the gpu
+		{
+			//float h_ff_variable[NVAR];
+			const float angle_of_attack = float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
+			
+			h_ff_variable[VAR_DENSITY] = float(1.4);
+			
+			float ff_pressure = float(1.0f);
+			float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
+			float ff_speed = float(ff_mach)*ff_speed_of_sound;
+			
+			float3 ff_velocity;
+			ff_velocity.x = ff_speed*float(cos((float)angle_of_attack));
+			ff_velocity.y = ff_speed*float(sin((float)angle_of_attack));
+			ff_velocity.z = 0.0f;
+			
+			h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
+			h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
+			h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;
+					
+			h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*(float(0.5f)*(ff_speed*ff_speed)) + (ff_pressure / float(GAMMA-1.0f));
+
+			float3 h_ff_momentum;
+			h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
+			h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
+			h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
+			float3 h_ff_flux_contribution_momentum_x;
+			float3 h_ff_flux_contribution_momentum_y;
+			float3 h_ff_flux_contribution_momentum_z;
+			float3 h_ff_flux_contribution_density_energy;
+			compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum, h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, ff_velocity, h_ff_flux_contribution_momentum_x, h_ff_flux_contribution_momentum_y, h_ff_flux_contribution_momentum_z, h_ff_flux_contribution_density_energy);
+
+			// copy far field conditions to the gpu
+			//cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z,  ff_flux_contribution_density_energy;
+			ff_variable = _clMalloc(NVAR*sizeof(float));
+			ff_flux_contribution_momentum_x = _clMalloc(sizeof(float3));
+			ff_flux_contribution_momentum_y = _clMalloc(sizeof(float3));
+			ff_flux_contribution_momentum_z = _clMalloc(sizeof(float3));
+			ff_flux_contribution_density_energy = _clMalloc(sizeof(float3));
+			_clMemcpyH2D(ff_variable,          h_ff_variable,          NVAR*sizeof(float));
+			_clMemcpyH2D(ff_flux_contribution_momentum_x, &h_ff_flux_contribution_momentum_x, sizeof(float3));
+			_clMemcpyH2D(ff_flux_contribution_momentum_y, &h_ff_flux_contribution_momentum_y, sizeof(float3));
+			_clMemcpyH2D(ff_flux_contribution_momentum_z, &h_ff_flux_contribution_momentum_z, sizeof(float3));		
+			_clMemcpyH2D(ff_flux_contribution_density_energy, &h_ff_flux_contribution_density_energy, sizeof(float3));
+			_clFinish();
+		
+		}
+		int nel;
+		int nelr;
+		// read in domain geometry
+		//float* areas;
+		//int* elements_surrounding_elements;
+		//float* normals;
+		{
+			std::ifstream file(data_file_name);
+			if(!file.good()){
+				throw(string("can not find/open file!"));
+			}
+			file >> nel;
+			nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));
+			////std::cout<<"--cambine: nel="<<nel<<", nelr="<<nelr<<std::endl;
+			float* h_areas = new float[nelr];
+			int* h_elements_surrounding_elements = new int[nelr*NNB];
+			float* h_normals = new float[nelr*NDIM*NNB];
+
+					
+			// read in data
+			for(int i = 0; i < nel; i++)
+			{
+				file >> h_areas[i];
+				for(int j = 0; j < NNB; j++)
+				{
+					file >> h_elements_surrounding_elements[i + j*nelr];
+					if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
+					h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering				
+					
+					for(int k = 0; k < NDIM; k++)
+					{
+						file >> h_normals[i + (j + k*NNB)*nelr];
+						h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
+					}
+				}
+			}
+			
+			// fill in remaining data
+			int last = nel-1;
+			for(int i = nel; i < nelr; i++)
+			{
+				h_areas[i] = h_areas[last];
+				for(int j = 0; j < NNB; j++)
+				{
+					// duplicate the last element
+					h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];	
+					for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
+				}
+			}
+			
+			areas = alloc<float>(nelr);
+			upload<float>(areas, h_areas, nelr);
+
+			elements_surrounding_elements = alloc<int>(nelr*NNB);
+			upload<int>(elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);
+
+			normals = alloc<float>(nelr*NDIM*NNB);
+			upload<float>(normals, h_normals, nelr*NDIM*NNB);
+					
+			delete[] h_areas;
+			delete[] h_elements_surrounding_elements;
+			delete[] h_normals;
+		}
+		
+		// Create arrays and set initial conditions
+		variables = alloc<float>(nelr*NVAR);				
+		int tp = 0;
+		initialize_variables(nelr, variables, ff_variable);			
+		old_variables = alloc<float>(nelr*NVAR);   	
+		fluxes = alloc<float>(nelr*NVAR);
+		step_factors = alloc<float>(nelr); 
+		// make sure all memory is floatly allocated before we start timing
+		initialize_variables(nelr, old_variables, ff_variable);	
+		initialize_variables(nelr, fluxes, ff_variable);		
+		_clMemset(step_factors, 0, sizeof(float)*nelr);
+		// make sure CUDA isn't still doing something before we start timing
+		_clFinish();
+		// these need to be computed the first time in order to compute time step
+		printf( "Starting...\n");
+
+		// Begin iterations
+		for(int i = 0; i < iterations; i++){
+			copy<float>(old_variables, variables, nelr*NVAR);
+			// for the first iteration we compute the time step
+			compute_step_factor(nelr, variables, areas, step_factors);
+			for(int j = 0; j < RK; j++){
+				compute_flux(nelr, elements_surrounding_elements, normals, variables, ff_variable, fluxes, ff_flux_contribution_density_energy, \
+				ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z);
+			
+				time_step(j, nelr, old_variables, variables, step_factors, fluxes);
+			}
+		}
+		_clFinish();
+		//std::cout << "Saving solution..." << std::endl;
+		dump(variables, nel, nelr);
+		//std::cout << "Saved solution..." << std::endl;
+		_clStatistics();
+		//std::cout << "Cleaning up..." << std::endl;
+		
+		//--release resources
+		_clFree(ff_variable);
+		_clFree(ff_flux_contribution_momentum_x);
+		_clFree(ff_flux_contribution_momentum_y);
+		_clFree(ff_flux_contribution_momentum_z);
+		_clFree(ff_flux_contribution_density_energy);
+		_clFree(areas);
+		_clFree(elements_surrounding_elements);
+		_clFree(normals);
+		_clFree(variables);
+		_clFree(old_variables);
+		_clFree(fluxes);
+		_clFree(step_factors);
+		_clRelease();
+		printf("Done...\n");
+		_clPrintTiming();
+	}
+	catch(string msg){
+		printf("--cambine:( an exception catched in main body ->%s\n", msg.c_str());		
+		_clFree(ff_variable);
+		_clFree(ff_flux_contribution_momentum_x);
+		_clFree(ff_flux_contribution_momentum_y);
+		_clFree(ff_flux_contribution_momentum_z);
+		_clFree(ff_flux_contribution_density_energy);
+		_clFree(areas);
+		_clFree(elements_surrounding_elements);
+		_clFree(normals);
+		_clFree(variables);
+		_clFree(old_variables);
+		_clFree(fluxes);
+		_clFree(step_factors);
+		_clRelease();		
+	}
+	catch(...){
+		//std::cout<<"--cambine:( unknow exceptions in main body..."<<std::endl;		
+		_clFree(ff_variable);
+		_clFree(ff_flux_contribution_momentum_x);
+		_clFree(ff_flux_contribution_momentum_y);
+		_clFree(ff_flux_contribution_momentum_z);
+		_clFree(ff_flux_contribution_density_energy);
+		_clFree(areas);
+		_clFree(elements_surrounding_elements);
+		_clFree(normals);
+		_clFree(variables);
+		_clFree(old_variables);
+		_clFree(fluxes);
+		_clFree(step_factors);
+		_clRelease();		
+	}
+		
+	return 0;
+}
--- a/tests/opencl/cfd/fvcorr.domn.097K
+++ b/tests/opencl/cfd/fvcorr.domn.097K
--- a/tests/opencl/cfd/kernel.cl
+++ b/tests/opencl/cfd/kernel.cl
@ -0,0 +1,284 @@
+/* ============================================================
+//--functions: 	kernel funtion
+//--programmer:	Jianbin Fang
+//--date:		24/03/2011
+============================================================ */
+#ifndef _KERNEL_
+#define _KERNEL_
+
+#define GAMMA (1.4f)
+
+
+#define NDIM 3
+#define NNB 4
+
+#define RK 3	// 3rd order RK
+#define ff_mach 1.2f
+#define deg_angle_of_attack 0.0f
+
+#define VAR_DENSITY 0
+#define VAR_MOMENTUM  1
+#define VAR_DENSITY_ENERGY (VAR_MOMENTUM+NDIM)
+#define NVAR (VAR_DENSITY_ENERGY+1)
+//#pragma OPENCL EXTENSION CL_MAD : enable
+
+//self-defined user type
+typedef struct{
+	float x;
+	float y;
+	float z;
+} FLOAT3;
+/*------------------------------------------------------------
+	@function:	set memory 
+	@params:	
+		mem_d: 		target memory to be set;
+		val:		set the target memory to value 'val'
+		num_bytes:	the number of bytes all together
+	@return:	through mem_d
+------------------------------------------------------------*/
+__kernel void memset_kernel(__global char * mem_d, short val, int ct){
+	const int thread_id = get_global_id(0);
+	if( thread_id >= ct) return;
+	mem_d[thread_id] = val;
+}
+
+//--cambine: omit &
+inline void compute_velocity(float  density, FLOAT3 momentum, FLOAT3* velocity){
+	velocity->x = momentum.x / density;
+	velocity->y = momentum.y / density;
+	velocity->z = momentum.z / density;
+}
+	
+inline float compute_speed_sqd(FLOAT3 velocity){
+	return velocity.x*velocity.x + velocity.y*velocity.y + velocity.z*velocity.z;
+}
+
+inline float compute_pressure(float density, float density_energy, float speed_sqd){
+	return ((float)(GAMMA) - (float)(1.0f))*(density_energy - (float)(0.5f)*density*speed_sqd);
+}
+inline float compute_speed_of_sound(float density, float pressure){
+	//return sqrtf(float(GAMMA)*pressure/density);
+	return sqrt((float)(GAMMA)*pressure/density);
+}
+inline void compute_flux_contribution(float density, FLOAT3 momentum, float density_energy, float pressure, FLOAT3 velocity, FLOAT3* fc_momentum_x, FLOAT3* fc_momentum_y, FLOAT3* fc_momentum_z, FLOAT3* fc_density_energy)
+{
+	fc_momentum_x->x = velocity.x*momentum.x + pressure;
+	fc_momentum_x->y = velocity.x*momentum.y;
+	fc_momentum_x->z = velocity.x*momentum.z;
+	
+	
+	fc_momentum_y->x = fc_momentum_x->y;
+	fc_momentum_y->y = velocity.y*momentum.y + pressure;
+	fc_momentum_y->z = velocity.y*momentum.z;
+
+	fc_momentum_z->x = fc_momentum_x->z;
+	fc_momentum_z->y = fc_momentum_y->z;
+	fc_momentum_z->z = velocity.z*momentum.z + pressure;
+
+	float de_p = density_energy+pressure;
+	fc_density_energy->x = velocity.x*de_p;
+	fc_density_energy->y = velocity.y*de_p;
+	fc_density_energy->z = velocity.z*de_p;
+}
+__kernel void initialize_variables(__global float* variables, __constant float* ff_variable, int nelr){
+	//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
+	const int i = get_global_id(0);
+	if( i >= nelr) return;
+	for(int j = 0; j < NVAR; j++)
+		variables[i + j*nelr] = ff_variable[j];
+	
+}
+
+__kernel void compute_step_factor(__global float* variables, 
+							__global float* areas, 
+							__global float* step_factors,
+							int nelr){
+	//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
+	const int i = get_global_id(0);
+	if( i >= nelr) return;
+
+	float density = variables[i + VAR_DENSITY*nelr];
+	FLOAT3 momentum;
+	momentum.x = variables[i + (VAR_MOMENTUM+0)*nelr];
+	momentum.y = variables[i + (VAR_MOMENTUM+1)*nelr];
+	momentum.z = variables[i + (VAR_MOMENTUM+2)*nelr];
+	
+	float density_energy = variables[i + VAR_DENSITY_ENERGY*nelr];
+	
+	FLOAT3 velocity;       compute_velocity(density, momentum, &velocity);
+	float speed_sqd      = compute_speed_sqd(velocity);
+	//float speed_sqd;
+	//compute_speed_sqd(velocity, speed_sqd);
+	float pressure       = compute_pressure(density, density_energy, speed_sqd);
+	float speed_of_sound = compute_speed_of_sound(density, pressure);
+
+	// dt = float(0.5f) * sqrtf(areas[i]) /  (||v|| + c).... but when we do time stepping, this later would need to be divided by the area, so we just do it all at once
+	//step_factors[i] = (float)(0.5f) / (sqrtf(areas[i]) * (sqrtf(speed_sqd) + speed_of_sound));
+	step_factors[i] = (float)(0.5f) / (sqrt(areas[i]) * (sqrt(speed_sqd) + speed_of_sound));
+}
+
+__kernel void compute_flux(
+					__global int* elements_surrounding_elements, 
+					__global float* normals, 
+					__global float* variables, 
+					__constant float* ff_variable,
+					__global float* fluxes,
+					__constant FLOAT3* ff_flux_contribution_density_energy,
+					__constant FLOAT3* ff_flux_contribution_momentum_x,
+					__constant FLOAT3* ff_flux_contribution_momentum_y,
+					__constant FLOAT3* ff_flux_contribution_momentum_z,
+					int nelr){
+	const float smoothing_coefficient = (float)(0.2f);
+	//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
+	const int i = get_global_id(0);
+	if( i >= nelr) return;
+	int j, nb;
+	FLOAT3 normal; float normal_len;
+	float factor;
+	
+	float density_i = variables[i + VAR_DENSITY*nelr];
+	FLOAT3 momentum_i;
+	momentum_i.x = variables[i + (VAR_MOMENTUM+0)*nelr];
+	momentum_i.y = variables[i + (VAR_MOMENTUM+1)*nelr];
+	momentum_i.z = variables[i + (VAR_MOMENTUM+2)*nelr];
+
+	float density_energy_i = variables[i + VAR_DENSITY_ENERGY*nelr];
+
+	FLOAT3 velocity_i;             				compute_velocity(density_i, momentum_i, &velocity_i);
+	float speed_sqd_i                          = compute_speed_sqd(velocity_i);
+	//float speed_sqd_i;
+	//compute_speed_sqd(velocity_i, speed_sqd_i);
+	//float speed_i                              = sqrtf(speed_sqd_i);
+	float speed_i                              = sqrt(speed_sqd_i);
+	float pressure_i                           = compute_pressure(density_i, density_energy_i, speed_sqd_i);
+	float speed_of_sound_i                     = compute_speed_of_sound(density_i, pressure_i);
+	FLOAT3 flux_contribution_i_momentum_x, flux_contribution_i_momentum_y, flux_contribution_i_momentum_z;
+	FLOAT3 flux_contribution_i_density_energy;	
+	compute_flux_contribution(density_i, momentum_i, density_energy_i, pressure_i, velocity_i, &flux_contribution_i_momentum_x, &flux_contribution_i_momentum_y, &flux_contribution_i_momentum_z, &flux_contribution_i_density_energy);
+	
+	float flux_i_density = (float)(0.0f);
+	FLOAT3 flux_i_momentum;
+	flux_i_momentum.x = (float)(0.0f);
+	flux_i_momentum.y = (float)(0.0f);
+	flux_i_momentum.z = (float)(0.0f);
+	float flux_i_density_energy = (float)(0.0f);
+		
+	FLOAT3 velocity_nb;
+	float density_nb, density_energy_nb;
+	FLOAT3 momentum_nb;
+	FLOAT3 flux_contribution_nb_momentum_x, flux_contribution_nb_momentum_y, flux_contribution_nb_momentum_z;
+	FLOAT3 flux_contribution_nb_density_energy;	
+	float speed_sqd_nb, speed_of_sound_nb, pressure_nb;
+	
+	#pragma unroll
+	for(j = 0; j < NNB; j++)
+	{
+		nb = elements_surrounding_elements[i + j*nelr];
+		normal.x = normals[i + (j + 0*NNB)*nelr];
+		normal.y = normals[i + (j + 1*NNB)*nelr];
+		normal.z = normals[i + (j + 2*NNB)*nelr];
+		//normal_len = sqrtf(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);
+		normal_len = sqrt(normal.x*normal.x + normal.y*normal.y + normal.z*normal.z);
+		
+		if(nb >= 0) 	// a legitimate neighbor
+		{
+			density_nb = variables[nb + VAR_DENSITY*nelr];
+			momentum_nb.x = variables[nb + (VAR_MOMENTUM+0)*nelr];
+			momentum_nb.y = variables[nb + (VAR_MOMENTUM+1)*nelr];
+			momentum_nb.z = variables[nb + (VAR_MOMENTUM+2)*nelr];
+			density_energy_nb = variables[nb + VAR_DENSITY_ENERGY*nelr];
+												compute_velocity(density_nb, momentum_nb, &velocity_nb);
+			speed_sqd_nb                      = compute_speed_sqd(velocity_nb);
+			pressure_nb                       = compute_pressure(density_nb, density_energy_nb, speed_sqd_nb);
+			speed_of_sound_nb                 = compute_speed_of_sound(density_nb, pressure_nb);
+			                                    compute_flux_contribution(density_nb, momentum_nb, density_energy_nb, pressure_nb, velocity_nb, &flux_contribution_nb_momentum_x, &flux_contribution_nb_momentum_y, &flux_contribution_nb_momentum_z, &flux_contribution_nb_density_energy);
+			
+			// artificial viscosity
+			factor = -normal_len*smoothing_coefficient*(float)(0.5f)*(speed_i + sqrt(speed_sqd_nb) + speed_of_sound_i + speed_of_sound_nb);
+			flux_i_density += factor*(density_i-density_nb);
+			flux_i_density_energy += factor*(density_energy_i-density_energy_nb);
+			flux_i_momentum.x += factor*(momentum_i.x-momentum_nb.x);
+			flux_i_momentum.y += factor*(momentum_i.y-momentum_nb.y);
+			flux_i_momentum.z += factor*(momentum_i.z-momentum_nb.z);
+
+			// accumulate cell-centered fluxes
+			factor = (float)(0.5f)*normal.x;
+			flux_i_density += factor*(momentum_nb.x+momentum_i.x);
+			flux_i_density_energy += factor*(flux_contribution_nb_density_energy.x+flux_contribution_i_density_energy.x);
+			flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.x+flux_contribution_i_momentum_x.x);
+			flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.x+flux_contribution_i_momentum_y.x);
+			flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.x+flux_contribution_i_momentum_z.x);
+			
+			factor = (float)(0.5f)*normal.y;
+			flux_i_density += factor*(momentum_nb.y+momentum_i.y);
+			flux_i_density_energy += factor*(flux_contribution_nb_density_energy.y+flux_contribution_i_density_energy.y);
+			flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.y+flux_contribution_i_momentum_x.y);
+			flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.y+flux_contribution_i_momentum_y.y);
+			flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.y+flux_contribution_i_momentum_z.y);
+			
+			factor = (float)(0.5f)*normal.z;
+			flux_i_density += factor*(momentum_nb.z+momentum_i.z);
+			flux_i_density_energy += factor*(flux_contribution_nb_density_energy.z+flux_contribution_i_density_energy.z);
+			flux_i_momentum.x += factor*(flux_contribution_nb_momentum_x.z+flux_contribution_i_momentum_x.z);
+			flux_i_momentum.y += factor*(flux_contribution_nb_momentum_y.z+flux_contribution_i_momentum_y.z);
+			flux_i_momentum.z += factor*(flux_contribution_nb_momentum_z.z+flux_contribution_i_momentum_z.z);
+		}
+		else if(nb == -1)	// a wing boundary
+		{
+			flux_i_momentum.x += normal.x*pressure_i;
+			flux_i_momentum.y += normal.y*pressure_i;
+			flux_i_momentum.z += normal.z*pressure_i;
+		}
+		else if(nb == -2) // a far field boundary
+		{
+			factor = (float)(0.5f)*normal.x;
+			flux_i_density += factor*(ff_variable[VAR_MOMENTUM+0]+momentum_i.x);
+			flux_i_density_energy += factor*(ff_flux_contribution_density_energy[0].x+flux_contribution_i_density_energy.x);
+			flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x[0].x + flux_contribution_i_momentum_x.x);
+			flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y[0].x + flux_contribution_i_momentum_y.x);
+			flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z[0].x + flux_contribution_i_momentum_z.x);
+			
+			factor = (float)(0.5f)*normal.y;
+			flux_i_density += factor*(ff_variable[VAR_MOMENTUM+1]+momentum_i.y);
+			flux_i_density_energy += factor*(ff_flux_contribution_density_energy[0].y+flux_contribution_i_density_energy.y);
+			flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x[0].y + flux_contribution_i_momentum_x.y);
+			flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y[0].y + flux_contribution_i_momentum_y.y);
+			flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z[0].y + flux_contribution_i_momentum_z.y);
+
+			factor = (float)(0.5f)*normal.z;
+			flux_i_density += factor*(ff_variable[VAR_MOMENTUM+2]+momentum_i.z);
+			flux_i_density_energy += factor*(ff_flux_contribution_density_energy[0].z+flux_contribution_i_density_energy.z);
+			flux_i_momentum.x += factor*(ff_flux_contribution_momentum_x[0].z + flux_contribution_i_momentum_x.z);
+			flux_i_momentum.y += factor*(ff_flux_contribution_momentum_y[0].z + flux_contribution_i_momentum_y.z);
+			flux_i_momentum.z += factor*(ff_flux_contribution_momentum_z[0].z + flux_contribution_i_momentum_z.z);
+
+		}
+	}
+
+	fluxes[i + VAR_DENSITY*nelr] = flux_i_density;
+	fluxes[i + (VAR_MOMENTUM+0)*nelr] = flux_i_momentum.x;
+	fluxes[i + (VAR_MOMENTUM+1)*nelr] = flux_i_momentum.y;
+	fluxes[i + (VAR_MOMENTUM+2)*nelr] = flux_i_momentum.z;
+	fluxes[i + VAR_DENSITY_ENERGY*nelr] = flux_i_density_energy;
+}
+
+__kernel void time_step(int j, int nelr, 
+				__global float* old_variables, 
+				__global float* variables, 
+				__global float* step_factors, 
+				__global float* fluxes){
+	//const int i = (blockDim.x*blockIdx.x + threadIdx.x);
+	const int i = get_global_id(0);
+	if( i >= nelr) return;
+
+	float factor = step_factors[i]/(float)(RK+1-j);
+
+	variables[i + VAR_DENSITY*nelr] = old_variables[i + VAR_DENSITY*nelr] + factor*fluxes[i + VAR_DENSITY*nelr];
+	variables[i + VAR_DENSITY_ENERGY*nelr] = old_variables[i + VAR_DENSITY_ENERGY*nelr] + factor*fluxes[i + VAR_DENSITY_ENERGY*nelr];
+	variables[i + (VAR_MOMENTUM+0)*nelr] = old_variables[i + (VAR_MOMENTUM+0)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+0)*nelr];
+	variables[i + (VAR_MOMENTUM+1)*nelr] = old_variables[i + (VAR_MOMENTUM+1)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+1)*nelr];	
+	variables[i + (VAR_MOMENTUM+2)*nelr] = old_variables[i + (VAR_MOMENTUM+2)*nelr] + factor*fluxes[i + (VAR_MOMENTUM+2)*nelr];	
+	
+}
+
+#endif
--- a/tests/opencl/cfd/timing.cpp
+++ b/tests/opencl/cfd/timing.cpp
@ -0,0 +1,40 @@
+#include <stdio.h>
+
+#include "timing.h"
+
+void time_measure_start(struct timeval *tv)
+{
+	gettimeofday(tv, NULL);
+}
+
+void time_measure_end(struct timeval *tv)
+{
+	struct timeval tv_now, tv_diff;
+	double d;
+
+	gettimeofday(&tv_now, NULL);
+	tvsub(&tv_now, tv, &tv_diff);
+
+	d = (double) tv_diff.tv_sec * 1000.0 + (double) tv_diff.tv_usec / 1000.0;
+	printf("Time (Memory Copy and Launch) = %f (ms)\n", d);
+}
+
+float probe_event_time(cl_event event, cl_command_queue command_queue) {
+    cl_int error=0;
+    cl_ulong eventStart,eventEnd;
+    clFinish(command_queue);
+    error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_START,
+                                    sizeof(cl_ulong),&eventStart,NULL);
+    if (error != CL_SUCCESS) {
+        printf("ERROR (%d) in event start profiling.\n", error);
+        return 0;
+    }
+    error = clGetEventProfilingInfo(event,CL_PROFILING_COMMAND_END,
+                                    sizeof(cl_ulong),&eventEnd,NULL);
+    if (error != CL_SUCCESS) {
+        printf("ERROR (%d) in event end profiling.\n", error);
+        return 0;
+    }
+
+    return (float)((eventEnd-eventStart)/1000000.0);
+}
--- a/tests/opencl/cfd/timing.h
+++ b/tests/opencl/cfd/timing.h
@ -0,0 +1,25 @@
+#ifndef __TIMING_H__
+#define __TIMING_H__
+
+#include <sys/time.h>
+#include <CL/cl.h>
+
+void time_measure_start(struct timeval *tv);
+void time_measure_end(struct timeval *tv);
+
+/* tvsub: ret = x - y. */
+static inline void tvsub(struct timeval *x,
+						 struct timeval *y,
+						 struct timeval *ret)
+{
+	ret->tv_sec = x->tv_sec - y->tv_sec;
+	ret->tv_usec = x->tv_usec - y->tv_usec;
+	if (ret->tv_usec < 0) {
+		ret->tv_sec--;
+		ret->tv_usec += 1000000;
+	}
+}
+
+float probe_event_time(cl_event, cl_command_queue);
+
+#endif
--- a/tests/opencl/cfd/util.h
+++ b/tests/opencl/cfd/util.h
@ -0,0 +1,131 @@
+#ifndef _C_UTIL_
+#define _C_UTIL_
+#include <math.h>
+#include <iostream>
+#include <omp.h>
+#include <sys/time.h>
+
+#ifdef RD_WG_SIZE_0_0
+	#define BLOCK_SIZE_0 RD_WG_SIZE_0_0
+#elif defined(RD_WG_SIZE_0)
+	#define BLOCK_SIZE_0 RD_WG_SIZE_0
+#elif defined(RD_WG_SIZE)
+	#define BLOCK_SIZE_0 RD_WG_SIZE
+#else
+	#define BLOCK_SIZE_0 192
+#endif
+
+#ifdef RD_WG_SIZE_1_0
+	#define BLOCK_SIZE_1 RD_WG_SIZE_1_0
+#elif defined(RD_WG_SIZE_1)
+	#define BLOCK_SIZE_1 RD_WG_SIZE_1
+#elif defined(RD_WG_SIZE)
+	#define BLOCK_SIZE_1 RD_WG_SIZE
+#else
+	#define BLOCK_SIZE_1 192
+#endif
+
+#ifdef RD_WG_SIZE_2_0
+	#define BLOCK_SIZE_2 RD_WG_SIZE_2_0
+#elif defined(RD_WG_SIZE_1)
+	#define BLOCK_SIZE_2 RD_WG_SIZE_2
+#elif defined(RD_WG_SIZE)
+	#define BLOCK_SIZE_2 RD_WG_SIZE
+#else
+	#define BLOCK_SIZE_2 192
+#endif
+
+#ifdef RD_WG_SIZE_3_0
+	#define BLOCK_SIZE_3 RD_WG_SIZE_3_0
+#elif defined(RD_WG_SIZE_3)
+	#define BLOCK_SIZE_3 RD_WG_SIZE_3
+#elif defined(RD_WG_SIZE)
+	#define BLOCK_SIZE_3 RD_WG_SIZE
+#else
+	#define BLOCK_SIZE_3 192
+#endif
+
+#ifdef RD_WG_SIZE_4_0
+	#define BLOCK_SIZE_4 RD_WG_SIZE_4_0
+#elif defined(RD_WG_SIZE_4)
+	#define BLOCK_SIZE_4 RD_WG_SIZE_4
+#elif defined(RD_WG_SIZE)
+	#define BLOCK_SIZE_4 RD_WG_SIZE
+#else
+	#define BLOCK_SIZE_4 192
+#endif
+
+
+using std::endl;
+
+double gettime() {
+  struct timeval t;
+  gettimeofday(&t,NULL);
+  return t.tv_sec+t.tv_usec*1e-6;
+}
+//-------------------------------------------------------------------
+//--initialize array with maximum limit
+//-------------------------------------------------------------------
+template<typename datatype>
+void fill(datatype *A, const int n, const datatype maxi){
+    for (int j = 0; j < n; j++){
+        A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
+    }
+}
+
+//--print matrix
+template<typename datatype>
+void print_matrix(datatype *A, int height, int width){
+	for(int i=0; i<height; i++){
+		for(int j=0; j<width; j++){
+			int idx = i*width + j;
+			std::cout<<A[idx]<<" ";
+		}
+		std::cout<<std::endl;
+	}
+
+	return;
+}
+//-------------------------------------------------------------------
+//--verify results
+//-------------------------------------------------------------------
+#define MAX_RELATIVE_ERROR  .002
+template<typename datatype>
+void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
+
+    bool passed = true; 
+#pragma omp parallel for
+    for (int i=0; i<size; i++){
+      if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
+         passed = false; 
+      }
+    }
+    if (passed){
+        std::cout << "--cambine:passed:-)" << std::endl;
+    }
+    else{
+        std::cout << "--cambine: failed:-(" << std::endl;
+    }
+    return ;
+}
+template<typename datatype>
+void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
+
+    bool passed = true; 
+//#pragma omp parallel for
+    for (int i=0; i<size; i++){
+      if (cpu_results[i]!=gpu_results[i]){
+         passed = false; 
+      }
+    }
+    if (passed){
+        std::cout << "--cambine:passed:-)" << std::endl;
+    }
+    else{
+        std::cout << "--cambine: failed:-(" << std::endl;
+    }
+    return ;
+}
+
+#endif
+
--- a/tests/opencl/cutcp2/Makefile
+++ b/tests/opencl/cutcp2/Makefile
@ -0,0 +1,9 @@
+PROJECT = cutcp
+
+SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
+
+CXXFLAGS += -I.
+
+OPTS ?= 
+
+include ../common.mk
--- a/tests/opencl/cutcp2/args.c
+++ b/tests/opencl/cutcp2/args.c
@ -0,0 +1,617 @@
+
+#include <parboil.h>
+#include <errno.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+
+/*****************************************************************************/
+/* Memory management routines */
+
+/* Free an array of owned strings. */
+void
+pb_FreeStringArray(char **string_array)
+{
+  char **p;
+
+  if (!string_array) return;
+  for (p = string_array; *p; p++) free(*p);
+  free(string_array);
+}
+
+struct pb_PlatformParam *
+pb_PlatformParam(char *name, char *version)
+{
+  if (name == NULL) {
+    fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
+    exit(-1);
+  }
+
+  struct pb_PlatformParam *ret =
+    (struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
+
+  ret->name = name;
+  ret->version = version;
+  return ret;
+}
+
+void
+pb_FreePlatformParam(struct pb_PlatformParam *p)
+{
+  if (p == NULL) return;
+
+  free(p->name);
+  free(p->version);
+  free(p);
+}
+
+struct pb_DeviceParam *
+pb_DeviceParam_index(int index)
+{
+  struct pb_DeviceParam *ret =
+    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
+  ret->criterion = pb_Device_INDEX;
+  ret->index = index;
+  return ret;
+}
+
+struct pb_DeviceParam *
+pb_DeviceParam_cpu(void)
+{
+  struct pb_DeviceParam *ret =
+    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
+  ret->criterion = pb_Device_CPU;
+  return ret;
+}
+
+struct pb_DeviceParam *
+pb_DeviceParam_gpu(void)
+{
+  struct pb_DeviceParam *ret =
+    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
+  ret->criterion = pb_Device_GPU;
+  return ret;
+}
+
+struct pb_DeviceParam *
+pb_DeviceParam_accelerator(void)
+{
+  struct pb_DeviceParam *ret =
+    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
+  ret->criterion = pb_Device_ACCELERATOR;
+  return ret;
+}
+
+struct pb_DeviceParam *
+pb_DeviceParam_name(char *name)
+{
+  struct pb_DeviceParam *ret =
+    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
+  ret->criterion = pb_Device_NAME;
+  ret->name = name;
+  return ret;
+}
+
+void
+pb_FreeDeviceParam(struct pb_DeviceParam *p)
+{
+  if (p == NULL) return;
+
+  switch(p->criterion) {
+  case pb_Device_NAME:
+    free(p->name);
+    break;
+  case pb_Device_INDEX:
+  case pb_Device_CPU:
+  case pb_Device_ACCELERATOR:
+    break;
+  default:
+    fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
+    exit(-1);
+  }
+}
+
+void
+pb_FreeParameters(struct pb_Parameters *p)
+{
+  free(p->outFile);
+  pb_FreeStringArray(p->inpFiles);
+  pb_FreePlatformParam(p->platform);
+  pb_FreeDeviceParam(p->device);
+  free(p);
+}
+
+/*****************************************************************************/
+
+/* Parse a comma-delimited list of strings into an
+ * array of strings. */
+static char ** 
+read_string_array(char *in)
+{
+  char **ret;
+  int i;
+  int count;			/* Number of items in the input */
+  char *substring;		/* Current substring within 'in' */
+
+  /* Count the number of items in the string */
+  count = 1;
+  for (i = 0; in[i]; i++) if (in[i] == ',') count++;
+
+  /* Allocate storage */
+  ret = (char **)malloc((count + 1) * sizeof(char *));
+
+  /* Create copies of the strings from the list */
+  substring = in;
+  for (i = 0; i < count; i++) {
+    char *substring_end;
+    int substring_length;
+
+    /* Find length of substring */
+    for (substring_end = substring;
+	 (*substring_end != ',') && (*substring_end != 0);
+	 substring_end++);
+
+    substring_length = substring_end - substring;
+
+    /* Allocate memory and copy the substring */
+    ret[i] = (char *)malloc(substring_length + 1);
+    memcpy(ret[i], substring, substring_length);
+    ret[i][substring_length] = 0;
+
+    /* go to next substring */
+    substring = substring_end + 1;
+  }
+  ret[i] = NULL;		/* Write the sentinel value */
+
+  return ret;
+}
+
+static void
+report_parse_error(const char *str)
+{
+  fputs(str, stderr);
+}
+
+/* Interpret a string as a 'pb_DeviceParam' value.
+ * Return a pointer to a new value, or NULL on failure.
+ */
+static struct pb_DeviceParam *
+read_device_param(char *str)
+{  
+  /* Try different ways of interpreting 'device_string' until one works */
+
+  /* If argument is an integer, then interpret it as a device index */
+  errno = 0;
+  char *end;
+  long device_int = strtol(str, &end, 10);
+  if (!errno) {
+    /* Negative numbers are not valid */
+    if (device_int < 0 || device_int > INT_MAX) return NULL;
+
+    return pb_DeviceParam_index(device_int);
+  }
+
+  /* Match against predefined strings */
+  if (strcmp(str, "CPU") == 0)
+    return pb_DeviceParam_cpu();
+  if (strcmp(str, "GPU") == 0)
+    return pb_DeviceParam_gpu();
+  if (strcmp(str, "ACCELERATOR") == 0)
+    return pb_DeviceParam_accelerator();
+
+  /* Assume any other string is a device name */
+  return pb_DeviceParam_name(strdup(str));
+}
+
+/* Interpret a string as a 'pb_PlatformParam' value.
+ * Return a pointer to a new value, or NULL on failure.
+ */
+static struct pb_PlatformParam *
+read_platform_param(char *str)
+{
+  int separator_index;          /* Index of the '-' character separating
+                                 * name and version number.  It's -1 if
+                                 * there's no '-' character. */
+
+  /* Find the last occurrence of '-' in 'str' */
+  {
+    char *cur;
+    separator_index = -1;
+    for (cur = str; *cur; cur++) {
+      if (*cur == '-') separator_index = cur - str;
+    }
+  }
+
+  /* The platform name is either the entire string, or all characters before
+   * the separator */
+  int name_length = separator_index == -1 ? strlen(str) : separator_index;
+  char *name_str = (char *)malloc(name_length + 1);
+  memcpy(name_str, str, name_length);
+  name_str[name_length] = 0;
+
+  /* The version is either NULL, or all characters after the separator */
+  char *version_str;
+  if (separator_index == -1) {
+    version_str = NULL;
+  }
+  else {
+    const char *version_input_str = str + separator_index + 1;
+    int version_length = strlen(version_input_str);
+
+    version_str = (char *)malloc(version_length + 1);
+    memcpy(version_str, version_input_str, version_length);
+    version_str[version_length] = 0;
+  }
+
+  /* Create output structure */
+  return pb_PlatformParam(name_str, version_str);
+}
+
+/****************************************************************************/
+/* Argument parsing state */
+
+/* Argument parsing state.
+ *
+ * Arguments that are interpreted by the argument parser are removed from
+ * the list.  Variables 'argc' and 'argn' do not count arguments that have
+ * been removed.
+ *
+ * During argument parsing, the array of arguments is compacted, overwriting
+ * the erased arguments.  Variable 'argv_put' points to the array element
+ * where the next argument will be written.  Variable 'argv_get' points to
+ * the array element where the next argument will be read from.
+ */
+struct argparse {
+  int argc;			/* Number of arguments.  Mutable. */
+  int argn;			/* Current argument index. */
+  char **argv_get;		/* Argument value being read. */
+  char **argv_put;		/* Argument value being written.
+				 * argv_put <= argv_get. */
+};
+
+static void
+initialize_argparse(struct argparse *ap, int argc, char **argv)
+{
+  ap->argc = argc;
+  ap->argn = 0;
+  ap->argv_get = ap->argv_put = argv;
+}
+
+/* Finish argument parsing, without processing the remaining arguments.
+ * Write new argument count into _argc. */
+static void
+finalize_argparse(struct argparse *ap, int *_argc, char **argv)
+{
+  /* Move the remaining arguments */
+  for(; ap->argn < ap->argc; ap->argn++)
+    *ap->argv_put++ = *ap->argv_get++;
+
+  /* Update the argument count */
+  *_argc = ap->argc;
+
+  /* Insert a terminating NULL */
+  argv[ap->argc] = NULL;
+}
+
+/* Delete the current argument.  The argument will not be visible
+ * when argument parsing is done. */
+static void
+delete_argument(struct argparse *ap)
+{
+  if (ap->argn >= ap->argc) {
+    fprintf(stderr, "delete_argument\n");
+  }
+  ap->argc--;
+  ap->argv_get++;
+}
+
+/* Go to the next argument.  Also, move the current argument to its
+ * final location in argv. */
+static void
+next_argument(struct argparse *ap)
+{
+  if (ap->argn >= ap->argc) {
+    fprintf(stderr, "next_argument\n");
+  }
+  /* Move argument to its new location. */
+  *ap->argv_put++ = *ap->argv_get++;
+  ap->argn++;
+}
+
+static int
+is_end_of_arguments(struct argparse *ap)
+{
+  return ap->argn == ap->argc;
+}
+
+/* Get the current argument */
+static char *
+get_argument(struct argparse *ap)
+{
+  return *ap->argv_get;
+}
+
+/* Get the current argument, and also delete it */
+static char *
+consume_argument(struct argparse *ap)
+{
+  char *ret = get_argument(ap);
+  delete_argument(ap);
+  return ret;
+}
+
+/****************************************************************************/
+
+/* The result of parsing a command-line argument */
+typedef enum {
+  ARGPARSE_OK,                  /* Success */
+  ARGPARSE_ERROR,               /* Error */
+  ARGPARSE_DONE                 /* Success, and do not continue parsing */
+} result;
+
+typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
+
+
+/* A command-line option */
+struct option {
+  char short_name;              /* If not 0, the one-character
+                                 * name of this option */
+  const char *long_name;        /* If not NULL, the long name of this option */
+  parse_action *action;       /* What to do when this option occurs.
+                               * Sentinel value is NULL.
+                               */
+};
+
+/* Output file
+ *
+ * -o FILE
+ */
+static result
+parse_output_file(struct argparse *ap, struct pb_Parameters *params)
+{
+  if (is_end_of_arguments(ap))
+    {
+      report_parse_error("Expecting file name after '-o'\n");
+      return ARGPARSE_ERROR;
+    }
+
+  /* Replace the output file name */
+  free(params->outFile);
+  params->outFile = strdup(consume_argument(ap));
+
+  return ARGPARSE_OK;
+}
+
+/* Input files
+ *
+ * -i FILE,FILE,...
+ */
+static result
+parse_input_files(struct argparse *ap, struct pb_Parameters *params)
+{
+  if (is_end_of_arguments(ap))
+    {
+      report_parse_error("Expecting file name after '-i'\n");
+      return ARGPARSE_ERROR;
+    }
+
+  /* Replace the input file list */
+  pb_FreeStringArray(params->inpFiles);
+  params->inpFiles = read_string_array(consume_argument(ap));
+  return ARGPARSE_OK;
+}
+
+/* End of options
+ *
+ * --
+ */
+
+static result
+parse_end_options(struct argparse *ap, struct pb_Parameters *params)
+{
+  return ARGPARSE_DONE;
+}
+
+/* OpenCL device
+ *
+ * --device X
+ */
+
+static result
+parse_device(struct argparse *ap, struct pb_Parameters *params)
+{
+  /* Read the next argument, which specifies a device */
+
+  if (is_end_of_arguments(ap))
+    {
+      report_parse_error("Expecting device specification after '--device'\n");
+      return ARGPARSE_ERROR;
+    }
+
+  char *device_string = consume_argument(ap);
+  struct pb_DeviceParam *device_param = read_device_param(device_string);
+
+  if (!device_param) {
+    report_parse_error("Unrecognized device specification format on command line\n");
+    return ARGPARSE_ERROR;
+  }
+
+  /* Save the result */
+  pb_FreeDeviceParam(params->device);
+  params->device = device_param;
+
+  return ARGPARSE_OK;
+}
+
+static result
+parse_platform(struct argparse *ap, struct pb_Parameters *params)
+{
+  /* Read the next argument, which specifies a platform */
+
+  if (is_end_of_arguments(ap))
+    {
+      report_parse_error("Expecting device specification after '--platform'\n");
+      return ARGPARSE_ERROR;
+    }
+
+  char *platform_string = consume_argument(ap);
+  struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
+
+  if (!platform_param) {
+    report_parse_error("Unrecognized platform specification format on command line\n");
+    return ARGPARSE_ERROR;
+  }
+
+  /* Save the result */
+  pb_FreePlatformParam(params->platform);
+  params->platform = platform_param;
+
+  return ARGPARSE_OK;
+}
+  
+
+static struct option options[] = {
+  { 'o', NULL, &parse_output_file },
+  { 'i', NULL, &parse_input_files },
+  { '-', NULL, &parse_end_options },
+  { 0, "device", &parse_device },
+  { 0, "platform", &parse_platform },
+  { 0, NULL, NULL }
+};
+
+static int
+is_last_option(struct option *op)
+{
+  return op->action == NULL;
+}
+
+/****************************************************************************/
+
+/* Parse command-line parameters.
+ * Return zero on error, nonzero otherwise.
+ * On error, the other outputs may be invalid.
+ *
+ * The information collected from parameters is used to update
+ * 'ret'.  'ret' should be initialized.
+ *
+ * '_argc' and 'argv' are updated to contain only the unprocessed arguments.
+ */
+static int
+pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
+{
+  char *err_message;
+  struct argparse ap;
+
+  /* Each argument */
+  initialize_argparse(&ap, *_argc, argv);
+  while(!is_end_of_arguments(&ap)) {
+    result arg_result;          /* Result of parsing this option */
+    char *arg = get_argument(&ap);
+
+    /* Process this argument */
+    if (arg[0] == '-') {
+      /* Single-character flag */
+      if ((arg[1] != 0) && (arg[2] == 0)) {
+        delete_argument(&ap);	/* This argument is consumed here */
+
+        /* Find a matching short option */
+        struct option *op;
+        for (op = options; !is_last_option(op); op++) {
+          if (op->short_name == arg[1]) {
+            arg_result = (*op->action)(&ap, ret);
+            goto option_was_processed;
+          }
+        }
+
+        /* No option matches */
+        report_parse_error("Unexpected command-line parameter\n");
+        arg_result = ARGPARSE_ERROR;
+        goto option_was_processed;
+      }
+
+      /* Long flag */
+      if (arg[1] == '-') {
+        delete_argument(&ap);	/* This argument is consumed here */
+
+        /* Find a matching long option */
+        struct option *op;
+        for (op = options; !is_last_option(op); op++) {
+          if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
+            arg_result = (*op->action)(&ap, ret);
+            goto option_was_processed;
+          }
+        }
+
+        /* No option matches */
+        report_parse_error("Unexpected command-line parameter\n");
+        arg_result = ARGPARSE_ERROR;
+        goto option_was_processed;
+      }
+    }
+    else {
+      /* Other arguments are ignored */
+      next_argument(&ap);
+      arg_result = ARGPARSE_OK;
+      goto option_was_processed;
+    }
+
+  option_was_processed:
+    /* Decide what to do next based on 'arg_result' */
+    switch(arg_result) {
+    case ARGPARSE_OK:
+      /* Continue processing */
+      break;
+
+    case ARGPARSE_ERROR:
+      /* Error exit from the function */
+      return 0;
+
+    case ARGPARSE_DONE:
+      /* Normal exit from the argument parsing loop */
+      goto end_of_options;
+    }
+  } /* end for each argument */
+
+  /* If all arguments were processed, then normal exit from the loop */
+
+ end_of_options:
+  finalize_argparse(&ap, _argc, argv);
+  return 1;
+}
+
+/*****************************************************************************/
+/* Other exported functions */
+
+struct pb_Parameters *
+pb_ReadParameters(int *_argc, char **argv)
+{
+  struct pb_Parameters *ret =
+    (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
+
+  /* Initialize the parameters structure */
+  ret->outFile = NULL;
+  ret->inpFiles = (char **)malloc(sizeof(char *));
+  ret->inpFiles[0] = NULL;
+  ret->platform = NULL;
+  ret->device = NULL;
+
+  /* Read parameters and update _argc, argv */
+  if (!pb_ParseParameters(ret, _argc, argv)) {
+    /* Parse error */
+    pb_FreeParameters(ret);
+    return NULL;
+  }
+
+  return ret;
+}
+
+int
+pb_Parameters_CountInputs(struct pb_Parameters *p)
+{
+  int n;
+
+  for (n = 0; p->inpFiles[n]; n++);
+  return n;
+}
+
--- a/tests/opencl/cutcp2/atom.h
+++ b/tests/opencl/cutcp2/atom.h
@ -0,0 +1,37 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2008-2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef ATOM_H
+#define ATOM_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  typedef struct Atom_t {
+    float x, y, z, q;
+  } Atom;
+
+  typedef struct Atoms_t {
+    Atom *atoms;
+    int size;
+  } Atoms;
+
+  typedef struct Vec3_t {
+    float x, y, z;
+  } Vec3;
+
+  Atoms *read_atom_file(const char *fname);
+  void free_atom(Atoms *atom);
+  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ATOM_H */
--- a/tests/opencl/cutcp2/cutcp
+++ b/tests/opencl/cutcp2/cutcp
--- a/tests/opencl/cutcp2/cutcpu.c
+++ b/tests/opencl/cutcp2/cutcpu.c
@ -0,0 +1,195 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2008-2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <parboil.h>
+#include "atom.h"
+#include "cutoff.h"
+
+#undef DEBUG_PASS_RATE
+#define CHECK_CYLINDER_CPU
+
+#define CELLEN      4.f
+#define INV_CELLEN  (1.f/CELLEN)
+
+extern int cpu_compute_cutoff_potential_lattice(
+    Lattice *lattice,                  /* the lattice */
+    float cutoff,                      /* cutoff distance */
+    Atoms *atoms                       /* array of atoms */
+    )
+{
+  int nx = lattice->dim.nx;
+  int ny = lattice->dim.ny;
+  int nz = lattice->dim.nz;
+  float xlo = lattice->dim.lo.x;
+  float ylo = lattice->dim.lo.y;
+  float zlo = lattice->dim.lo.z;
+  float gridspacing = lattice->dim.h;
+  int natoms = atoms->size;
+  Atom *atom = atoms->atoms;
+
+  const float a2 = cutoff * cutoff;
+  const float inv_a2 = 1.f / a2;
+  float s;
+  const float inv_gridspacing = 1.f / gridspacing;
+  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
+    /* lattice point radius about each atom */
+
+  int n;
+  int i, j, k;
+  int ia, ib, ic;
+  int ja, jb, jc;
+  int ka, kb, kc;
+  int index;
+  int koff, jkoff;
+
+  float x, y, z, q;
+  float dx, dy, dz;
+  float dz2, dydz2, r2;
+  float e;
+  float xstart, ystart;
+
+  float *pg;
+
+  int gindex;
+  int ncell, nxcell, nycell, nzcell;
+  int *first, *next;
+  float inv_cellen = INV_CELLEN;
+  Vec3 minext, maxext;		/* Extent of atom bounding box */
+  float xmin, ymin, zmin;
+  float xmax, ymax, zmax;
+
+#if DEBUG_PASS_RATE
+  unsigned long long pass_count = 0;
+  unsigned long long fail_count = 0;
+#endif
+
+  /* find min and max extent */
+  get_atom_extent(&minext, &maxext, atoms);
+
+  /* number of cells in each dimension */
+  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
+  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
+  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  ncell = nxcell * nycell * nzcell;
+
+  /* allocate for cursor link list implementation */
+  first = (int *) malloc(ncell * sizeof(int));
+  for (gindex = 0;  gindex < ncell;  gindex++) {
+    first[gindex] = -1;
+  }
+  next = (int *) malloc(natoms * sizeof(int));
+  for (n = 0;  n < natoms;  n++) {
+    next[n] = -1;
+  }
+
+  /* geometric hashing */
+  for (n = 0;  n < natoms;  n++) {
+    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
+    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k*nycell + j)*nxcell + i;
+    next[n] = first[gindex];
+    first[gindex] = n;
+  }
+
+  /* traverse the grid cells */
+  for (gindex = 0;  gindex < ncell;  gindex++) {
+    for (n = first[gindex];  n != -1;  n = next[n]) {
+      x = atom[n].x - xlo;
+      y = atom[n].y - ylo;
+      z = atom[n].z - zlo;
+      q = atom[n].q;
+
+      /* find closest grid point with position less than or equal to atom */
+      ic = (int) (x * inv_gridspacing);
+      jc = (int) (y * inv_gridspacing);
+      kc = (int) (z * inv_gridspacing);
+
+      /* find extent of surrounding box of grid points */
+      ia = ic - radius;
+      ib = ic + radius + 1;
+      ja = jc - radius;
+      jb = jc + radius + 1;
+      ka = kc - radius;
+      kb = kc + radius + 1;
+
+      /* trim box edges so that they are within grid point lattice */
+      if (ia < 0)   ia = 0;
+      if (ib >= nx) ib = nx-1;
+      if (ja < 0)   ja = 0;
+      if (jb >= ny) jb = ny-1;
+      if (ka < 0)   ka = 0;
+      if (kb >= nz) kb = nz-1;
+
+      /* loop over surrounding grid points */
+      xstart = ia*gridspacing - x;
+      ystart = ja*gridspacing - y;
+      dz = ka*gridspacing - z;
+      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
+        koff = k*ny;
+        dz2 = dz*dz;
+        dy = ystart;
+        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
+          jkoff = (koff + j)*nx;
+          dydz2 = dy*dy + dz2;
+#ifdef CHECK_CYLINDER_CPU
+          if (dydz2 >= a2) continue;
+#endif
+
+          dx = xstart;
+          index = jkoff + ia;
+          pg = lattice->lattice + index;
+
+#if defined(__INTEL_COMPILER)
+          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
+            r2 = dx*dx + dydz2;
+            s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
+            e = q * (1/sqrtf(r2)) * s;
+            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
+          }
+#else
+          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
+            r2 = dx*dx + dydz2;
+            if (r2 >= a2)
+		{
+#ifdef DEBUG_PASS_RATE
+		  fail_count++;
+#endif
+		  continue;
+		}
+#ifdef DEBUG_PASS_RATE
+	    pass_count++;
+#endif
+            s = (1.f - r2 * inv_a2);
+            e = q * (1/sqrtf(r2)) * s * s;
+            *pg += e;
+          }
+#endif
+        }
+      } /* end loop over surrounding grid points */
+
+    } /* end loop over atoms in a gridcell */
+  } /* end loop over gridcells */
+
+  /* free memory */
+  free(next);
+  free(first);
+
+  /* For debugging: print the number of times that the test passed/failed */
+#ifdef DEBUG_PASS_RATE
+  printf ("Pass :%lld\n", pass_count);
+  printf ("Fail :%lld\n", fail_count);
+#endif
+
+  return 0;
+}
--- a/tests/opencl/cutcp2/cutoff.c
+++ b/tests/opencl/cutcp2/cutoff.c
@ -0,0 +1,508 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2008-2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include <CL/cl.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <parboil.h>
+
+#include "atom.h"
+#include "cutoff.h"
+#include "macros.h"
+#include "ocl.h"
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (NULL == filename || NULL == data || 0 == size)
+    return CL_INVALID_VALUE;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return CL_INVALID_VALUE;
+  }
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+  
+  fclose(fp);
+  
+  return CL_SUCCESS;
+}
+
+// OpenCL 1.1 support for int3 is not uniform on all implementations, so
+// we use int4 instead.  Only the 'x', 'y', and 'z' fields of xyz are used.
+typedef cl_int4 xyz;
+
+//extern "C" int gpu_compute_cutoff_potential_lattice(
+int gpu_compute_cutoff_potential_lattice(
+    struct pb_TimerSet *timers,
+    Lattice *lattice,                  /* the lattice */
+    float cutoff,                      /* cutoff distance */
+    Atoms *atoms,                      /* array of atoms */
+    int verbose,                        /* print info/debug messages */
+    struct pb_Parameters *parameters
+    )
+{
+  int nx = lattice->dim.nx;
+  int ny = lattice->dim.ny;
+  int nz = lattice->dim.nz;
+  float xlo = lattice->dim.lo.x;
+  float ylo = lattice->dim.lo.y;
+  float zlo = lattice->dim.lo.z;
+  float h = lattice->dim.h;
+  int natoms = atoms->size;
+  Atom *atom = atoms->atoms;
+
+  xyz nbrlist[NBRLIST_MAXLEN];
+  int nbrlistlen = 0;
+
+  int binHistoFull[BIN_DEPTH+1] = { 0 };   /* clear every array element */
+  int binHistoCover[BIN_DEPTH+1] = { 0 };  /* clear every array element */
+  int num_excluded = 0;
+
+  int xRegionDim, yRegionDim, zRegionDim;
+  int xRegionIndex, yRegionIndex, zRegionIndex;
+  int xOffset, yOffset, zOffset;
+  int lnx, lny, lnz, lnall;
+  float *regionZeroAddr, *thisRegion;
+  cl_mem regionZeroCl;
+  int index, indexRegion;
+
+  int c;
+  xyz binDim;
+  int nbins;
+  cl_float4 *binBaseAddr, *binZeroAddr;
+  cl_mem binBaseCl, binZeroCl;
+  int *bincntBaseAddr, *bincntZeroAddr;
+  Atoms *extra = NULL;
+
+  cl_mem NbrListLen;
+  cl_mem NbrList;
+
+  int i, j, k, n;
+  int sum, total;
+
+  float avgFillFull, avgFillCover;
+  const float cutoff2 = cutoff * cutoff;
+  const float inv_cutoff2 = 1.f / cutoff2;
+
+  size_t gridDim[3], blockDim[3];
+
+  // The "compute" timer should be active upon entry to this function
+
+  /* pad lattice to be factor of 8 in each dimension */
+  xRegionDim = (int) ceilf(nx/8.f);
+  yRegionDim = (int) ceilf(ny/8.f);
+  zRegionDim = (int) ceilf(nz/8.f);
+
+  lnx = 8 * xRegionDim;
+  lny = 8 * yRegionDim;
+  lnz = 8 * zRegionDim;
+  lnall = lnx * lny * lnz;
+
+  /* will receive energies from OpenCL */
+  regionZeroAddr = (float *) malloc(lnall * sizeof(float));
+
+  /* create bins */
+  c = (int) ceil(cutoff * BIN_INVLEN);  /* count extra bins around lattice */
+  binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
+  binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
+  binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
+  nbins = binDim.x * binDim.y * binDim.z;
+  binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
+  binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
+
+  bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
+  bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
+
+  /* create neighbor list */
+  if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
+    float s = sqrtf(3);
+    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
+    int cnt = 0;
+    /* develop neighbor list around 1 cell */
+    if (2*c + 1 > NBRLIST_DIM) {
+      fprintf(stderr, "must have cutoff <= %f\n",
+          (NBRLIST_DIM-1)/2 * BIN_LENGTH);
+      return -1;
+    }
+    for (k = -c;  k <= c;  k++) {
+      for (j = -c;  j <= c;  j++) {
+        for (i = -c;  i <= c;  i++) {
+          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
+          nbrlist[cnt].x = i;
+          nbrlist[cnt].y = j;
+          nbrlist[cnt].z = k;
+          cnt++;
+        }
+      }
+    }
+    nbrlistlen = cnt;
+  }
+  else if (8*h <= 2*BIN_LENGTH) {
+    float s = 2.f*sqrtf(3);
+    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
+    int cnt = 0;
+    /* develop neighbor list around 3-cube of cells */
+    if (2*c + 3 > NBRLIST_DIM) {
+      fprintf(stderr, "must have cutoff <= %f\n",
+          (NBRLIST_DIM-3)/2 * BIN_LENGTH);
+      return -1;
+    }
+    for (k = -c;  k <= c;  k++) {
+      for (j = -c;  j <= c;  j++) {
+        for (i = -c;  i <= c;  i++) {
+          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
+          nbrlist[cnt].x = i;
+          nbrlist[cnt].y = j;
+          nbrlist[cnt].z = k;
+          cnt++;
+        }
+      }
+    }
+    nbrlistlen = cnt;
+  }
+  else {
+    fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
+    return -1;
+  }
+
+  /* perform geometric hashing of atoms into bins */
+  {
+    /* array of extra atoms, permit average of one extra per bin */
+    Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
+    int extra_len = 0;
+    
+    for (n = 0;  n < natoms;  n++) {
+      cl_float4 p;
+      p.x = atom[n].x - xlo;
+      p.y = atom[n].y - ylo;
+      p.z = atom[n].z - zlo;
+      p.w = atom[n].q;
+      i = (int) floorf(p.x * BIN_INVLEN);
+      j = (int) floorf(p.y * BIN_INVLEN);
+      k = (int) floorf(p.z * BIN_INVLEN);
+      if (i >= -c && i < binDim.x - c &&
+	  j >= -c && j < binDim.y - c &&
+	  k >= -c && k < binDim.z - c &&
+	  atom[n].q != 0) {
+	int index = (k * binDim.y + j) * binDim.x + i;
+	cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
+	int bindex = bincntZeroAddr[index];
+	if (bindex < BIN_DEPTH) {
+	  /* copy atom into bin and increase counter for this bin */
+	  bin[bindex] = p;
+	  bincntZeroAddr[index]++;
+	}
+	else {
+	  /* add index to array of extra atoms to be computed with CPU */
+	  if (extra_len >= nbins) {
+	    fprintf(stderr, "exceeded space for storing extra atoms\n");
+	    return -1;
+	  }
+	  extra_atoms[extra_len] = atom[n];
+	  extra_len++;
+	}
+      }
+      else {
+	/* excluded atoms are either outside bins or neutrally charged */
+	num_excluded++;
+      }
+    }
+
+    /* Save result */
+    extra = (Atoms *)malloc(sizeof(Atoms));
+    extra->atoms = extra_atoms;
+    extra->size = extra_len;
+  }
+
+  /* bin stats */
+  sum = total = 0;
+  for (n = 0;  n < nbins;  n++) {
+    binHistoFull[ bincntBaseAddr[n] ]++;
+    sum += bincntBaseAddr[n];
+    total += BIN_DEPTH;
+  }
+  avgFillFull = sum / (float) total;
+  sum = total = 0;
+  for (k = 0;  k < binDim.z - 2*c;  k++) {
+    for (j = 0;  j < binDim.y - 2*c;  j++) {
+      for (i = 0;  i < binDim.x - 2*c;  i++) {
+        int index = (k * binDim.y + j) * binDim.x + i;
+        binHistoCover[ bincntZeroAddr[index] ]++;
+        sum += bincntZeroAddr[index];
+        total += BIN_DEPTH;
+      }
+    }
+  }
+  avgFillCover = sum / (float) total;
+
+  if (verbose) {
+    /* report */
+    printf("number of atoms = %d\n", natoms);
+    printf("lattice spacing = %g\n", h);
+    printf("cutoff distance = %g\n", cutoff);
+    printf("\n");
+    printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
+    printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
+    printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
+    printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
+    printf("number of bytes for lattice data = %u\n", (unsigned int) (lnall*sizeof(float)));
+    printf("\n");
+    printf("bin padding thickness = %d\n", c);
+    printf("bin cover dimensions = %d %d %d\n",
+        binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
+    printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
+    printf("number of bins = %d\n", nbins);
+    printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
+    printf("%% overhead space = %g\n",
+        (natoms / (double) (nbins * BIN_DEPTH)) * 100);
+    printf("number of bytes for bin data = %u\n",
+        (unsigned int)(nbins * BIN_DEPTH * sizeof(cl_float4)));
+    printf("\n");
+    printf("bin histogram with padding:\n");
+    sum = 0;
+    for (n = 0;  n <= BIN_DEPTH;  n++) {
+      printf("     number of bins with %d atoms:  %d\n", n, binHistoFull[n]);
+      sum += binHistoFull[n];
+    }
+    printf("     total number of bins:  %d\n", sum);
+    printf("     %% average fill:  %g\n", avgFillFull * 100);
+    printf("\n");
+    printf("bin histogram excluding padding:\n");
+    sum = 0;
+    for (n = 0;  n <= BIN_DEPTH;  n++) {
+      printf("     number of bins with %d atoms:  %d\n", n, binHistoCover[n]);
+      sum += binHistoCover[n];
+    }
+    printf("     total number of bins:  %d\n", sum);
+    printf("     %% average fill:  %g\n", avgFillCover * 100);
+    printf("\n");
+    printf("number of extra atoms = %d\n", extra->size);
+    printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
+    printf("\n");
+
+    /* sanity check on bins */
+    sum = 0;
+    for (n = 0;  n <= BIN_DEPTH;  n++) {
+      sum += n * binHistoFull[n];
+    }
+    sum += extra->size + num_excluded;
+    printf("sanity check on bin histogram with edges:  "
+        "sum + others = %d\n", sum);
+    sum = 0;
+    for (n = 0;  n <= BIN_DEPTH;  n++) {
+      sum += n * binHistoCover[n];
+    }
+    sum += extra->size + num_excluded;
+    printf("sanity check on bin histogram excluding edges:  "
+        "sum + others = %d\n", sum);
+    printf("\n");
+
+    /* neighbor list */
+    printf("neighbor list length = %d\n", nbrlistlen);
+    printf("\n");
+  }
+
+  pb_Context* pb_context;
+  pb_context = pb_InitOpenCLContext(parameters);
+  if (pb_context == NULL) {
+    fprintf (stderr, "Error: No OpenCL platform/device can be found."); 
+    return -1;
+  }
+
+  cl_int clStatus;
+  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
+  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
+  cl_context clContext = (cl_context) pb_context->clContext;
+
+  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
+  CHECK_ERROR("clCreateCommandQueue")
+
+  pb_SetOpenCL(&clContext, &clCommandQueue);
+  
+  //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
+  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
+  uint8_t *kernel_bin = NULL;
+  size_t kernel_size;
+  cl_int binary_status = 0;  
+  clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
+  CHECK_ERROR("read_kernel_file")  
+	cl_program clProgram = clCreateProgramWithBinary(
+      clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
+  CHECK_ERROR("clCreateProgramWithSource")
+
+  char clOptions[50];
+  sprintf(clOptions,"-I src/opencl_base");  //-cl-nv-verbose
+
+  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
+  if (clStatus != CL_SUCCESS) {
+    size_t string_size = 0;
+    clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 
+                          0, NULL, &string_size);
+    char* string = (char*)malloc(string_size*sizeof(char));
+    clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 
+                          string_size, string, NULL);
+    puts(string);
+  }
+
+  CHECK_ERROR("clBuildProgram")
+
+  cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
+  CHECK_ERROR("clCreateKernel")
+
+  /* setup OpenCL kernel parameters */
+  blockDim[0] = 8;
+  blockDim[1] = 8;
+  blockDim[2] = 2;
+  gridDim[0] = 4 * xRegionDim * blockDim[0];
+  gridDim[1] = yRegionDim * blockDim[1];
+  gridDim[2] = 1 * blockDim[2];
+
+  /* allocate and initialize memory on OpenCL device */
+  pb_SwitchToTimer(timers, pb_TimerID_COPY);
+  if (verbose) {
+    printf("Allocating %.2fMB on OpenCL device for potentials\n",
+           lnall * sizeof(float) / (double) (1024*1024));
+  }
+
+  regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+
+  // clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));
+
+  if (verbose) {
+    printf("Allocating %.2fMB on OpenCL device for atom bins\n",
+           nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
+  }
+
+  binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+ 
+  clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  //Sub buffers are not supported in OpenCL v1.0
+  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;  
+
+  NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
+  CHECK_ERROR("clCreateBuffer")
+  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
+  CHECK_ERROR("clEnqueueWriteBuffer")
+
+  if (verbose) 
+    printf("\n");
+
+  clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
+  clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
+  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
+  clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
+  clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
+  clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
+  clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
+  clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl);
+  clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
+  clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
+  CHECK_ERROR("clSetKernelArg")
+
+  /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
+  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
+  printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
+  for (zRegionIndex = 0;  zRegionIndex < zRegionDim;  zRegionIndex++) {
+    printf("  computing plane %d\r", zRegionIndex);
+    fflush(stdout);
+
+    clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
+    CHECK_ERROR("clSetKernelArg")
+
+    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
+
+    CHECK_ERROR("clEnqueueNDRangeKernel")
+
+    clStatus = clFinish(clCommandQueue);
+
+    CHECK_ERROR("clFinish")
+  }
+
+  printf("Finished OpenCL kernel calls\n");
+
+  /* copy result regions from OpenCL device */
+  pb_SwitchToTimer(timers, pb_TimerID_COPY);
+  clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
+  CHECK_ERROR("clEnqueueReadBuffer")
+
+  /* free OpenCL memory allocations */
+  clStatus = clReleaseMemObject(regionZeroCl);
+  clStatus = clReleaseMemObject(binBaseCl);
+  clStatus = clReleaseMemObject(NbrListLen);
+  clStatus = clReleaseMemObject(NbrList);
+  CHECK_ERROR("clReleaseMemObject")
+
+  clStatus = clReleaseKernel(clKernel);
+  clStatus = clReleaseProgram(clProgram);
+  clStatus = clReleaseCommandQueue(clCommandQueue);
+  clStatus = clReleaseContext(clContext);
+
+  //free((void*)clSource[0]);
+
+  /* transpose regions back into lattice */
+  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
+  for (k = 0;  k < nz;  k++) {
+    zRegionIndex = (k >> 3);
+    zOffset = (k & 7);
+
+    for (j = 0;  j < ny;  j++) {
+      yRegionIndex = (j >> 3);
+      yOffset = (j & 7);
+
+      for (i = 0;  i < nx;  i++) {
+        xRegionIndex = (i >> 3);
+        xOffset = (i & 7);
+
+        thisRegion = regionZeroAddr
+          + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
+              + xRegionIndex) * REGION_SIZE;
+
+        indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
+        index = (k * ny + j) * nx + i;
+
+        lattice->lattice[index] = thisRegion[indexRegion];
+      }
+    }
+  }
+
+  /* handle extra atoms */
+  if (extra->size > 0) {
+    printf("computing extra atoms on CPU\n");
+    if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
+      fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
+          "for extra atoms\n");
+      return -1;
+    }
+    printf("\n");
+  }
+
+  /* cleanup memory allocations */
+  free(regionZeroAddr);
+  free(binBaseAddr);
+  free(bincntBaseAddr);
+  free_atom(extra);
+
+  return 0;
+}
--- a/tests/opencl/cutcp2/cutoff.h
+++ b/tests/opencl/cutcp2/cutoff.h
@ -0,0 +1,72 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2008-2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef CUTOFF_H
+#define CUTOFF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SHIFTED
+
+  /* A structure to record how points in 3D space map to array
+     elements.  Array element (z, y, x)
+     where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
+     maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
+  */
+  typedef struct LatticeDim_t {
+    /* Number of lattice points in x, y, z dimensions */
+    int nx, ny, nz;
+
+    /* Lowest corner of lattice */
+    Vec3 lo;
+
+    /* Lattice spacing */
+    float h;
+  } LatticeDim;
+
+  /* An electric potential field sampled on a regular grid.  The
+     lattice size and grid point positions are specified by 'dim'.
+  */
+  typedef struct Lattice_t {
+    LatticeDim dim;
+    float *lattice;
+  } Lattice;
+
+  LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
+
+  Lattice *create_lattice(LatticeDim dim);
+  void destroy_lattice(Lattice *);
+
+  int gpu_compute_cutoff_potential_lattice(
+      struct pb_TimerSet *timers,
+      Lattice *lattice,
+      float cutoff,                      /* cutoff distance */
+      Atoms *atom,                       /* array of atoms */
+      int verbose,                        /* print info/debug messages */
+      struct pb_Parameters *parameters
+    );
+
+  int cpu_compute_cutoff_potential_lattice(
+      Lattice *lattice,                  /* the lattice */
+      float cutoff,                      /* cutoff distance */
+      Atoms *atoms                       /* array of atoms */
+    );
+
+  int remove_exclusions(
+      Lattice *lattice,                  /* the lattice */
+      float exclcutoff,                  /* exclusion cutoff distance */
+      Atoms *atom                        /* array of atoms */
+    );
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* CUTOFF_H */
--- a/tests/opencl/cutcp2/excl.c
+++ b/tests/opencl/cutcp2/excl.c
@ -0,0 +1,157 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2008-2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <parboil.h>
+
+#include "atom.h"
+#include "cutoff.h"
+
+#define CELLEN      4.f
+#define INV_CELLEN  (1.f/CELLEN)
+
+extern int remove_exclusions(
+    Lattice *lattice,                  /* the lattice */
+    float cutoff,                      /* exclusion cutoff distance */
+    Atoms *atoms                       /* array of atoms */
+    )
+{
+  int nx = lattice->dim.nx;
+  int ny = lattice->dim.ny;
+  int nz = lattice->dim.nz;
+  float xlo = lattice->dim.lo.x;
+  float ylo = lattice->dim.lo.y;
+  float zlo = lattice->dim.lo.z;
+  float gridspacing = lattice->dim.h;
+  Atom *atom = atoms->atoms;
+
+  const float a2 = cutoff * cutoff;
+  const float inv_gridspacing = 1.f / gridspacing;
+  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
+    /* lattice point radius about each atom */
+
+  int n;
+  int i, j, k;
+  int ia, ib, ic;
+  int ja, jb, jc;
+  int ka, kb, kc;
+  int index;
+  int koff, jkoff;
+
+  float x, y, z, q;
+  float dx, dy, dz;
+  float dz2, dydz2, r2;
+  float e;
+  float xstart, ystart;
+
+  float *pg;
+
+  int gindex;
+  int ncell, nxcell, nycell, nzcell;
+  int *first, *next;
+  float inv_cellen = INV_CELLEN;
+  Vec3 minext, maxext;
+
+  /* find min and max extent */
+  get_atom_extent(&minext, &maxext, atoms);
+
+  /* number of cells in each dimension */
+  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
+  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
+  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
+  ncell = nxcell * nycell * nzcell;
+
+  /* allocate for cursor link list implementation */
+  first = (int *) malloc(ncell * sizeof(int));
+  for (gindex = 0;  gindex < ncell;  gindex++) {
+    first[gindex] = -1;
+  }
+  next = (int *) malloc(atoms->size * sizeof(int));
+  for (n = 0;  n < atoms->size;  n++) {
+    next[n] = -1;
+  }
+
+  /* geometric hashing */
+  for (n = 0;  n < atoms->size;  n++) {
+    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
+    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
+    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
+    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
+    gindex = (k*nycell + j)*nxcell + i;
+    next[n] = first[gindex];
+    first[gindex] = n;
+  }
+
+  /* traverse the grid cells */
+  for (gindex = 0;  gindex < ncell;  gindex++) {
+    for (n = first[gindex];  n != -1;  n = next[n]) {
+      x = atom[n].x - xlo;
+      y = atom[n].y - ylo;
+      z = atom[n].z - zlo;
+      q = atom[n].q;
+
+      /* find closest grid point with position less than or equal to atom */
+      ic = (int) (x * inv_gridspacing);
+      jc = (int) (y * inv_gridspacing);
+      kc = (int) (z * inv_gridspacing);
+
+      /* find extent of surrounding box of grid points */
+      ia = ic - radius;
+      ib = ic + radius + 1;
+      ja = jc - radius;
+      jb = jc + radius + 1;
+      ka = kc - radius;
+      kb = kc + radius + 1;
+
+      /* trim box edges so that they are within grid point lattice */
+      if (ia < 0)   ia = 0;
+      if (ib >= nx) ib = nx-1;
+      if (ja < 0)   ja = 0;
+      if (jb >= ny) jb = ny-1;
+      if (ka < 0)   ka = 0;
+      if (kb >= nz) kb = nz-1;
+
+      /* loop over surrounding grid points */
+      xstart = ia*gridspacing - x;
+      ystart = ja*gridspacing - y;
+      dz = ka*gridspacing - z;
+      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
+        koff = k*ny;
+        dz2 = dz*dz;
+
+        dy = ystart;
+        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
+          jkoff = (koff + j)*nx;
+          dydz2 = dy*dy + dz2;
+
+          dx = xstart;
+          index = jkoff + ia;
+          pg = lattice->lattice + index;
+
+          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
+            r2 = dx*dx + dydz2;
+
+	    /* If atom and lattice point are too close, set the lattice value
+	     * to zero */
+            if (r2 < a2) *pg = 0;
+          }
+        }
+      } /* end loop over surrounding grid points */
+
+    } /* end loop over atoms in a gridcell */
+  } /* end loop over gridcells */
+
+  /* free memory */
+  free(next);
+  free(first);
+
+  return 0;
+}
--- a/tests/opencl/cutcp2/gpu_info.c
+++ b/tests/opencl/cutcp2/gpu_info.c
@ -0,0 +1,55 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+//#include <endian.h>
+#include <stdlib.h>
+#include <malloc.h>
+#include <stdio.h>
+#include <inttypes.h>
+
+#include "gpu_info.h"
+
+void compute_active_thread(size_t *thread,
+			   size_t *grid,
+			   int task,
+			   int pad,
+			   int major,
+			   int minor,
+			   int sm)
+{
+	int max_thread;
+	int max_block=8;
+	if(major==1)
+	{
+		if(minor>=2)
+			max_thread=1024;
+		else
+			max_thread=768;
+	}
+	else if(major==2)
+		max_thread=1536;
+	else
+		//newer GPU  //keep using 2.0
+		max_thread=1536;
+	
+	int _grid;
+	int _thread;
+	
+	if(task*pad>sm*max_thread)
+	{
+		_thread=max_thread/max_block;
+		_grid = ((task*pad+_thread-1)/_thread)*_thread;
+	}
+	else
+	{
+		_thread=pad;
+		_grid=task*pad;
+	}
+
+	thread[0]=_thread;
+	grid[0]=_grid;
+}
--- a/tests/opencl/cutcp2/gpu_info.h
+++ b/tests/opencl/cutcp2/gpu_info.h
@ -0,0 +1,28 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef __GPUINFOH__
+#define __GPUINFOH__
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void compute_active_thread(size_t *thread,
+			   size_t *grid,
+			   int task,
+			   int pad,
+			   int major,
+			   int minor,
+			   int sm);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/tests/opencl/cutcp2/kernel.cl
+++ b/tests/opencl/cutcp2/kernel.cl
@ -0,0 +1,104 @@
+/*
+ * potential lattice is decomposed into size 8^3 lattice point "regions"
+ *
+ * THIS IMPLEMENTATION:  one thread per lattice point
+ * thread block size 128 gives 4 thread blocks per region
+ * kernel is invoked for each x-y plane of regions,
+ * where gridDim.x is 4*(x region dimension) so that blockIdx.x 
+ * can absorb the z sub-region index in its 2 lowest order bits
+ *
+ * Regions are stored contiguously in memory in row-major order
+ *
+ * The bins have to not only cover the region, but they need to surround
+ * the outer edges so that region sides and corners can still use
+ * neighbor list stencil.  The binZeroAddr is actually a shifted pointer into
+ * the bin array (binZeroAddr = binBaseAddr + (c*binDim_y + c)*binDim_x + c)
+ * where c = ceil(cutoff / binsize).  This allows for negative offsets to
+ * be added to myBinIndex.
+ *
+ * The (0,0,0) spatial origin corresponds to lower left corner of both
+ * regionZeroAddr and binZeroAddr.  The atom coordinates are translated
+ * during binning to enforce this assumption.
+ */
+
+#include "macros.h"
+
+// OpenCL 1.1 support for int3 is not uniform on all implementations, so
+// we use int4 instead.  Only the 'x', 'y', and 'z' fields of xyz are used.
+typedef int4 xyz;
+
+__kernel void opencl_cutoff_potential_lattice(
+    int binDim_x,
+    int binDim_y,
+    __global float4 *binBaseAddr,
+    int offset,
+    float h,                /* lattice spacing */
+    float cutoff2,          /* square of cutoff distance */
+    float inv_cutoff2,
+    __global float *regionZeroAddr,  /* address of lattice regions starting at origin */
+    int zRegionIndex,
+    __constant int *NbrListLen,
+    __constant xyz *NbrList
+    )
+{
+  __global float4* binZeroAddr = binBaseAddr + offset;
+
+  __global float *myRegionAddr;
+  int Bx, By, Bz;
+
+  /* thread id */
+  const int tid = (get_local_id(2)*get_local_size(1) + 
+                      get_local_id(1))*get_local_size(0) + get_local_id(0);
+
+  /* this is the start of the sub-region indexed by tid */
+  myRegionAddr = regionZeroAddr + ((zRegionIndex*get_num_groups(1)
+	+ get_group_id(1))*(get_num_groups(0)>>2) + (get_group_id(0)>>2))*REGION_SIZE
+	+ (get_group_id(0)&3)*SUB_REGION_SIZE;
+
+  /* spatial coordinate of this lattice point */
+  float x = (8 * (get_group_id(0) >> 2) + get_local_id(0)) * h;
+  float y = (8 * get_group_id(1) + get_local_id(1)) * h;
+  float z = (8 * zRegionIndex + 2*(get_group_id(0)&3) + get_local_id(2)) * h;
+
+  float dx;
+  float dy;
+  float dz;
+  float r2;
+  float s;
+
+  int totalbins = 0;
+
+  /* bin number determined by center of region */
+  Bx = (int) floor((8 * (get_group_id(0) >> 2) + 4) * h * BIN_INVLEN);
+  By = (int) floor((8 * get_group_id(1) + 4) * h * BIN_INVLEN);
+  Bz = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);
+
+  float energy = 0.f;
+  int bincnt;
+  for (bincnt = 0; bincnt < *NbrListLen;  bincnt++) {
+    int i = Bx + NbrList[bincnt].x;
+    int j = By + NbrList[bincnt].y;
+    int k = Bz + NbrList[bincnt].z;
+
+  	__global float4* p_global = binZeroAddr + 
+  	                       (((k*binDim_y + j)*binDim_x + i) * BIN_DEPTH);
+
+    int m;
+    for (m = 0;  m < BIN_DEPTH;  m++) {
+    	float aq = p_global[m].w;
+      if (0.f != aq) {
+        dx = p_global[m].x - x;
+        dy = p_global[m].y - y;
+        dz = p_global[m].z - z;
+        r2 = dx*dx + dy*dy + dz*dz;
+        if (r2 < cutoff2) {
+          s = (1.f - r2 * inv_cutoff2);
+          energy += aq * rsqrt(r2) * s * s;
+        }
+      }
+    } /* end loop over atoms in bin */
+  } /* end loop over neighbor list */
+
+  /* store into global memory */
+  myRegionAddr[tid+0] = energy;
+}
--- a/tests/opencl/cutcp2/macros.h
+++ b/tests/opencl/cutcp2/macros.h
@ -0,0 +1,69 @@
+#ifndef __MACROSH__
+#define __MACROSH__
+
+#ifdef __DEVICE_EMULATION__
+#define DEBUG
+/* define which grid block and which thread to examine */
+#define BX  0
+#define BY  0
+#define TX  0
+#define TY  0
+#define TZ  0
+#define EMU(code) do { \
+  if (blockIdx.x==BX && blockIdx.y==BY && \
+      threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
+    code; \
+  } \
+} while (0)
+#define INT(n)    printf("%s = %d\n", #n, n)
+#define FLOAT(f)  printf("%s = %g\n", #f, (double)(f))
+#define INT3(n)   printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
+#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
+    (double)(f).y, (double)(f).z, (double)(f).w)
+#else
+#define EMU(code)
+#define INT(n)
+#define FLOAT(f)
+#define INT3(n)
+#define FLOAT4(f)
+#endif
+
+/* report error from OpenCL */
+#define CHECK_ERROR(errorMessage)           \
+  if(clStatus != CL_SUCCESS)                \
+  {                                         \
+     printf("Error: %s!\n",errorMessage);   \
+     printf("Line: %d\n",__LINE__);         \
+     exit(1);                               \
+  }
+
+/*
+ * neighbor list:
+ * stored in constant memory as table of offsets
+ * flat index addressing is computed by kernel
+ *
+ * reserve enough memory for 11^3 stencil of grid cells
+ * this fits within 16K of memory
+ */
+#define NBRLIST_DIM  11
+#define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
+
+/*
+ * atom bins cached into shared memory for processing
+ *
+ * this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
+ * should permit scheduling of up to 3 thread blocks per SM
+ */
+#define BIN_DEPTH         8  /* max number of atoms per bin */
+#define BIN_SIZE         32  /* size of bin in floats */
+#define BIN_CACHE_MAXLEN 32  /* max number of atom bins to cache */
+
+#define BIN_LENGTH      4.f  /* spatial length in Angstroms */
+#define BIN_INVLEN  (1.f / BIN_LENGTH)
+/* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
+ * so that bin fill should be 80% (for non-empty regions of space) */
+
+#define REGION_SIZE     512  /* number of floats in lattice region */
+#define SUB_REGION_SIZE 128  /* number of floats in lattice sub-region */
+
+#endif
--- a/tests/opencl/cutcp2/main.cc
+++ b/tests/opencl/cutcp2/main.cc
@ -0,0 +1,190 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2008-2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <parboil.h>
+
+#include "atom.h"
+#include "cutoff.h"
+#include "output.h"
+
+#define ERRTOL 1e-4f
+
+#define NOKERNELS             0
+#define CUTOFF1               1
+#define CUTOFF6              32
+#define CUTOFF6OVERLAP       64
+#define CUTOFFCPU         16384
+
+
+int appenddata(const char *filename, int size, double time) {
+  FILE *fp;
+  fp=fopen(filename, "a");
+  if (fp == NULL) {
+    printf("error appending to file %s..\n", filename);
+    return -1;
+  }
+  fprintf(fp, "%d  %.3f\n", size, time);
+  fclose(fp);
+  return 0;
+}
+
+LatticeDim
+lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
+{
+  LatticeDim ret;
+
+  ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
+  ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
+  ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
+  ret.lo = lo;
+  ret.h = h;
+
+  return ret;
+}
+
+Lattice *
+create_lattice(LatticeDim dim)
+{
+  int size;
+  Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
+
+  if (lat == NULL) {
+    fprintf(stderr, "Out of memory\n");
+    exit(1);
+  }
+
+  lat->dim = dim;
+
+  /* Round up the allocated size to a multiple of 8 */
+  size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7;
+  lat->lattice = (float *)calloc(size, sizeof(float));
+
+  if (lat->lattice == NULL) {
+    fprintf(stderr, "Out of memory\n");
+    exit(1);
+  }
+
+  return lat;
+}
+
+
+void
+destroy_lattice(Lattice *lat)
+{
+  if (lat) {
+    free(lat->lattice);
+    free(lat);
+  }
+}
+
+int main(int argc, char *argv[]) {
+  Atoms *atom;
+
+  LatticeDim lattice_dim;
+  Lattice *gpu_lattice;
+  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
+  Vec3 lo, hi;			/* Bounding box with padding  */
+
+  float h = 0.5f;		/* Lattice spacing */
+  float cutoff = 12.f;		/* Cutoff radius */
+  float exclcutoff = 1.f;	/* Radius for exclusion */
+  float padding = 0.5f;		/* Bounding box padding distance */
+
+  int n;
+
+  struct pb_Parameters *parameters;
+  struct pb_TimerSet timers;
+
+  /* Read input parameters */
+  parameters = pb_ReadParameters(&argc, argv);
+  if (parameters == NULL) {
+    exit(1);
+  }
+
+  parameters->inpFiles = (char **)malloc(sizeof(char *) * 2);
+  parameters->inpFiles[0] = (char *)malloc(100);
+  parameters->inpFiles[1] = NULL;
+  strncpy(parameters->inpFiles[0], "watbox.sl40.pqr", 100);
+
+  /* Expect one input file */
+  if (pb_Parameters_CountInputs(parameters) != 1) {
+    fprintf(stderr, "Expecting one input file\n");
+    exit(1);
+  }
+
+  pb_InitializeTimerSet(&timers);
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+
+  {
+    const char *pqrfilename = parameters->inpFiles[0];
+
+    if (!(atom = read_atom_file(pqrfilename))) {
+      fprintf(stderr, "read_atom_file() failed\n");
+      exit(1);
+    }
+    printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
+  }
+
+  /* find extent of domain */
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+  get_atom_extent(&min_ext, &max_ext, atom);
+  printf("extent of domain is:\n");
+  printf("  minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
+  printf("  maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
+
+  printf("padding domain by %g Angstroms\n", padding);
+  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
+  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
+  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
+
+  lattice_dim = lattice_from_bounding_box(lo, hi, h);
+  gpu_lattice = create_lattice(lattice_dim);
+  printf("\n");
+
+  /*
+   *  Run OpenCL kernel
+   *  (Begin and end with COMPUTE timer active)
+   */
+  if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0, parameters)) {
+    fprintf(stderr, "Computation failed\n");
+    exit(1);
+  }
+
+  /*
+   * Zero the lattice points that are too close to an atom.  This is
+   * necessary for numerical stability.
+   */
+  if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
+    fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
+    exit(1);
+  }
+
+  printf("\n");
+
+  pb_SwitchToTimer(&timers, pb_TimerID_IO);
+
+  /* Print output */
+  if (parameters->outFile) {
+    //write_lattice_summary(parameters->outFile, gpu_lattice);
+  }
+  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
+
+  /* Cleanup */
+  destroy_lattice(gpu_lattice);
+  free_atom(atom);
+
+  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
+  pb_PrintTimerSet(&timers);
+  pb_FreeParameters(parameters);
+
+  return 0;
+}
--- a/tests/opencl/cutcp2/ocl.c
+++ b/tests/opencl/cutcp2/ocl.c
@ -0,0 +1,49 @@
+#include <CL/cl.h>
+#include <stdio.h>
+#include <string.h>
+#include "ocl.h"
+
+char* readFile(const char* fileName)
+{
+        FILE* fp;
+        fp = fopen(fileName,"r");
+        if(fp == NULL)
+        {
+                printf("Error 1!\n");
+                exit(1);
+        }
+
+        fseek(fp,0,SEEK_END);
+        long size = ftell(fp);
+        rewind(fp);
+
+        char* buffer = (char*)malloc(sizeof(char)*(size+1));
+        if(buffer  == NULL)
+        {
+                printf("Error 2!\n");
+                fclose(fp);
+                exit(1);
+        }
+
+        size_t res = fread(buffer,1,size,fp);
+        if(res != size)
+        {
+                printf("Error 3!\n");
+                fclose(fp);
+                exit(1);
+        }
+
+	buffer[size] = 0;
+        fclose(fp);
+        return buffer;
+}
+
+void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
+{
+	cl_int clStatus;
+	char* temp = (char*)malloc(size);
+	memset(temp,val,size);
+	clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
+	CHECK_ERROR("clEnqueueWriteBuffer")
+	free(temp);
+}
--- a/tests/opencl/cutcp2/ocl.h
+++ b/tests/opencl/cutcp2/ocl.h
@ -0,0 +1,25 @@
+#ifndef __OCLH__
+#define __OCLH__
+
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void clMemSet(cl_command_queue, cl_mem, int, size_t);
+char* readFile(const char*);
+
+#define CHECK_ERROR(errorMessage)           \
+  if(clStatus != CL_SUCCESS)                \
+  {                                         \
+     printf("Error: %s!\n",errorMessage);   \
+     printf("Line: %d\n",__LINE__);         \
+     exit(1);                               \
+  }
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/tests/opencl/cutcp2/output.c
+++ b/tests/opencl/cutcp2/output.c
@ -0,0 +1,67 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2008-2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <math.h>
+#include <parboil.h>
+
+#include "atom.h"
+#include "cutoff.h"
+
+void
+write_lattice_summary(const char *filename, Lattice *lattice)
+{
+  float *lattice_data = lattice->lattice;
+  int nx = lattice->dim.nx;
+  int ny = lattice->dim.ny;
+  int nz = lattice->dim.nz;
+
+  /* Open output file */
+  FILE *outfile = fopen(filename, "w");
+
+  if (outfile == NULL) {
+    fprintf(stderr, "Cannot open output file\n");
+    exit(1);
+  }
+
+  /* Write the sum of the the absolute values of all lattice potentials */
+  {
+    double abspotential = 0.0;
+    float tmp;
+    int i;
+
+    for (i = 0; i < nx * ny * nz; i++)
+      abspotential += fabs((double) lattice_data[i]);
+
+    tmp = (float) abspotential;
+
+    fwrite(&tmp, 1, sizeof(float), outfile);
+  }
+
+  /* Write the size of a lattice plane */
+  {
+    uint32_t tmp;
+
+    tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
+    fwrite(&tmp, 1, sizeof(uint32_t), outfile);
+  }
+
+  /* Write the plane of lattice data at z=0 and z = nz-1 */
+  {
+    int plane_size = nx * ny;
+
+    fwrite(lattice_data, plane_size, sizeof(float), outfile);
+    fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
+	   outfile);
+  }
+
+  /* Cleanup */
+  fclose(outfile);
+}
--- a/tests/opencl/cutcp2/output.h
+++ b/tests/opencl/cutcp2/output.h
@ -0,0 +1,25 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2008-2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#ifndef OUTPUT_H
+#define OUTPUT_H
+
+#include "cutoff.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void
+write_lattice_summary(const char *filename, Lattice *lattice);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/tests/opencl/cutcp2/parboil.h
+++ b/tests/opencl/cutcp2/parboil.h
@ -0,0 +1,348 @@
+/*
+ * (c) 2010 The Board of Trustees of the University of Illinois.
+ */
+#ifndef PARBOIL_HEADER
+#define PARBOIL_HEADER
+
+#include <stdio.h>
+#include <string.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <unistd.h>
+
+/* A platform as specified by the user on the command line */
+struct pb_PlatformParam {
+  char *name;                   /* The platform name.  This string is owned. */
+  char *version;                /* The platform version; may be NULL.
+                                 * This string is owned. */
+};
+
+/* Create a PlatformParam from the given strings.
+ * 'name' must not be NULL.  'version' may be NULL.
+ * If not NULL, the strings should have been allocated by malloc(),
+ * and they will be owned by the returned object.
+ */
+struct pb_PlatformParam *
+pb_PlatformParam(char *name, char *version);
+
+void
+pb_FreePlatformParam(struct pb_PlatformParam *);
+
+/* A criterion for how to select a device */
+enum pb_DeviceSelectionCriterion {
+  pb_Device_INDEX,             /* Enumerate the devices and select one
+                                * by its number */
+  pb_Device_CPU,                /* Select a CPU device */
+  pb_Device_GPU,                /* Select a GPU device  */
+  pb_Device_ACCELERATOR,        /* Select an accelerator device */
+  pb_Device_NAME                /* Select a device by name */
+};
+
+/* A device as specified by the user on the command line */
+struct pb_DeviceParam {
+  enum pb_DeviceSelectionCriterion criterion;
+  union {
+    int index;                  /* If criterion == pb_Device_INDEX,
+                                 * the index of the device */
+    char *name;                 /* If criterion == pb_Device_NAME,
+                                 * the name of the device.
+                                 * This string is owned. */
+  };
+};
+
+struct pb_DeviceParam *
+pb_DeviceParam_index(int index);
+
+struct pb_DeviceParam *
+pb_DeviceParam_cpu(void);
+
+struct pb_DeviceParam *
+pb_DeviceParam_gpu(void);
+
+struct pb_DeviceParam *
+pb_DeviceParam_accelerator(void);
+
+/* Create a by-name device selection criterion.
+ * The string should have been allocated by malloc(), and it will will be
+ * owned by the returned object.
+ */
+struct pb_DeviceParam *
+pb_DeviceParam_name(char *name);
+
+void
+pb_FreeDeviceParam(struct pb_DeviceParam *);
+
+/* Command line parameters for benchmarks */
+struct pb_Parameters {
+  char *outFile;		/* If not NULL, the raw output of the
+				 * computation should be saved to this
+				 * file. The string is owned. */
+  char **inpFiles;		/* A NULL-terminated array of strings
+				 * holding the input file(s) for the
+				 * computation.  The array and strings
+				 * are owned. */
+  struct pb_PlatformParam *platform; /* If not NULL, the platform
+                                      * specified on the command line. */
+  struct pb_DeviceParam *device; /* If not NULL, the device
+                                      * specified on the command line. */
+};
+
+/* Read command-line parameters.
+ *
+ * The argc and argv parameters to main are read, and any parameters
+ * interpreted by this function are removed from the argument list.
+ *
+ * A new instance of struct pb_Parameters is returned.
+ * If there is an error, then an error message is printed on stderr
+ * and NULL is returned.
+ */
+struct pb_Parameters *
+pb_ReadParameters(int *_argc, char **argv);
+
+/* Free an instance of struct pb_Parameters.
+ */
+void
+pb_FreeParameters(struct pb_Parameters *p);
+
+void
+pb_FreeStringArray(char **);
+
+/* Count the number of input files in a pb_Parameters instance.
+ */
+int
+pb_Parameters_CountInputs(struct pb_Parameters *p);
+
+/* A time or duration. */
+//#if _POSIX_VERSION >= 200112L
+typedef unsigned long long pb_Timestamp; /* time in microseconds */
+//#else
+//# error "Timestamps not implemented"
+//#endif
+
+enum pb_TimerState {
+  pb_Timer_STOPPED,
+  pb_Timer_RUNNING,
+};
+
+struct pb_Timer {
+  enum pb_TimerState state;
+  pb_Timestamp elapsed;		/* Amount of time elapsed so far */
+  pb_Timestamp init;		/* Beginning of the current time interval,
+				 * if state is RUNNING.  End of the last 
+				 * recorded time interfal otherwise.  */
+};
+
+/* Reset a timer.
+ * Use this to initialize a timer or to clear
+ * its elapsed time.  The reset timer is stopped.
+ */
+void
+pb_ResetTimer(struct pb_Timer *timer);
+
+/* Start a timer.  The timer is set to RUNNING mode and
+ * time elapsed while the timer is running is added to
+ * the timer.
+ * The timer should not already be running.
+ */
+void
+pb_StartTimer(struct pb_Timer *timer);
+
+/* Stop a timer.
+ * This stops adding elapsed time to the timer.
+ * The timer should not already be stopped.
+ */
+void
+pb_StopTimer(struct pb_Timer *timer);
+
+/* Get the elapsed time in seconds. */
+double
+pb_GetElapsedTime(struct pb_Timer *timer);
+
+/* Execution time is assigned to one of these categories. */
+enum pb_TimerID {
+  pb_TimerID_NONE = 0,
+  pb_TimerID_IO,		/* Time spent in input/output */
+  pb_TimerID_KERNEL,		/* Time spent computing on the device, 
+				 * recorded asynchronously */
+  pb_TimerID_COPY,		/* Time spent synchronously moving data 
+				 * to/from device and allocating/freeing 
+				 * memory on the device */
+  pb_TimerID_DRIVER,		/* Time spent in the host interacting with the 
+				 * driver, primarily for recording the time 
+                                 * spent queueing asynchronous operations */
+  pb_TimerID_COPY_ASYNC,	/* Time spent in asynchronous transfers */
+  pb_TimerID_COMPUTE,		/* Time for all program execution other
+				 * than parsing command line arguments,
+				 * I/O, kernel, and copy */
+  pb_TimerID_OVERLAP,		/* Time double-counted in asynchronous and 
+				 * host activity: automatically filled in, 
+				 * not intended for direct usage */
+  pb_TimerID_LAST		/* Number of timer IDs */
+};
+
+/* Dynamic list of asynchronously tracked times between events */
+struct pb_async_time_marker_list {
+  char *label; // actually just a pointer to a string
+  enum pb_TimerID timerID;	/* The ID to which the interval beginning 
+                                 * with this marker should be attributed */
+  void * marker; 
+  //cudaEvent_t marker; 		/* The driver event for this marker */
+  struct pb_async_time_marker_list *next; 
+};
+
+struct pb_SubTimer {
+  char *label;
+  struct pb_Timer timer;
+  struct pb_SubTimer *next;
+};
+
+struct pb_SubTimerList {
+  struct pb_SubTimer *current;
+  struct pb_SubTimer *subtimer_list;
+};
+
+/* A set of timers for recording execution times. */
+struct pb_TimerSet {
+  enum pb_TimerID current;
+  struct pb_async_time_marker_list* async_markers;
+  pb_Timestamp async_begin;
+  pb_Timestamp wall_begin;
+  struct pb_Timer timers[pb_TimerID_LAST];
+  struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
+};
+
+/* Reset all timers in the set. */
+void
+pb_InitializeTimerSet(struct pb_TimerSet *timers);
+
+void
+pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
+
+/* Select which timer the next interval of time should be accounted
+ * to. The selected timer is started and other timers are stopped.
+ * Using pb_TimerID_NONE stops all timers. */
+void
+pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
+
+void
+pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
+
+/* Print timer values to standard output. */
+void
+pb_PrintTimerSet(struct pb_TimerSet *timers);
+
+/* Release timer resources */
+void
+pb_DestroyTimerSet(struct pb_TimerSet * timers);
+
+void
+pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
+
+
+typedef struct pb_Device_tag {
+  char* name;
+  void* clDevice;
+  int id;
+  unsigned int in_use;
+  unsigned int available;
+} pb_Device;
+
+struct pb_Context_tag;
+typedef struct pb_Context_tag pb_Context;
+
+typedef struct pb_Platform_tag {
+  char* name;
+  char* version;
+  void* clPlatform;
+  unsigned int in_use;
+  pb_Context** contexts;
+  pb_Device** devices;
+} pb_Platform;
+
+struct pb_Context_tag {
+  void* clPlatformId;
+  void* clContext;
+  void* clDeviceId;
+  pb_Platform* pb_platform;
+  pb_Device* pb_device;
+};
+
+// verbosely print out list of platforms and their devices to the console.
+pb_Platform**
+pb_GetPlatforms();
+
+// Choose a platform according to the given platform specification
+pb_Platform*
+pb_GetPlatform(struct pb_PlatformParam *platform);
+
+// choose a platform: by name, name & version
+pb_Platform*
+pb_GetPlatformByName(const char* name);
+
+pb_Platform*
+pb_GetPlatformByNameAndVersion(const char* name, const char* version);
+
+// Choose a device according to the given device specification
+pb_Device*
+pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
+
+pb_Device**
+pb_GetDevices(pb_Platform* pb_platform);
+
+// choose a device by name.
+pb_Device*
+pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
+
+pb_Platform*
+pb_GetPlatformByEnvVars();
+
+pb_Context*
+pb_InitOpenCLContext(struct pb_Parameters* parameters);
+
+void
+pb_ReleasePlatforms();
+
+void
+pb_ReleaseContext(pb_Context* c);
+
+void
+pb_PrintPlatformInfo(pb_Context* c);
+
+void
+perf_init();
+
+//#define MEASURE_KERNEL_TIME
+
+#include <CL/cl.h>
+
+#ifdef MEASURE_KERNEL_TIME
+#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
+cl_int
+pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
+                       cl_kernel        /* kernel */,
+                       cl_uint          /* work_dim */,
+                       const size_t *   /* global_work_offset */,
+                       const size_t *   /* global_work_size */,
+                       const size_t *   /* local_work_size */,
+                       cl_uint          /* num_events_in_wait_list */,
+                       const cl_event * /* event_wait_list */,
+                       cl_event *       /* event */);
+#endif
+
+enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
+void pb_sig_float(char*, float*, int);
+void pb_sig_double(char*, double*, int);
+void pb_sig_short(char*, short*, int);
+void pb_sig_int(char*, int*, int);
+void pb_sig_uchar(char*, unsigned char*, unsigned int);
+void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //PARBOIL_HEADER
+
--- a/tests/opencl/cutcp2/parboil_opencl.c
+++ b/tests/opencl/cutcp2/parboil_opencl.c
--- a/tests/opencl/cutcp2/readatom.c
+++ b/tests/opencl/cutcp2/readatom.c
@ -0,0 +1,139 @@
+/***************************************************************************
+ *cr
+ *cr            (C) Copyright 2008-2010 The Board of Trustees of the
+ *cr                        University of Illinois
+ *cr                         All Rights Reserved
+ *cr
+ ***************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "atom.h"
+
+
+#define LINELEN 96
+#define INITLEN 20
+
+
+Atoms *read_atom_file(const char *fname)
+{
+  FILE *file;
+  char line[LINELEN];
+
+  Atom *atom;			/* Atom array */
+  int len = INITLEN;		/* Size of atom array */
+  int cnt = 0;			/* Number of atoms read */
+
+  /* allocate initial atom array */
+  atom = (Atom *) malloc(len * sizeof(Atom));
+  if (NULL==atom) {
+    fprintf(stderr, "can't allocate memory\n");
+    return NULL;
+  }
+
+  int i;
+  for (i = 0; i < len; ++i) {
+    atom[i].x = i+0;
+    atom[i].y = i+1;
+    atom[i].z = i+2;
+    atom[i].q = 1;
+  }
+
+#if 0
+  /* open atom "pqr" file */
+  file = fopen(fname, "r");
+  if (NULL==file) {
+    fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
+    return NULL;
+  }
+
+  /* loop to read pqr file line by line */
+  while (fgets(line, LINELEN, file) != NULL) {
+
+    if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
+      continue;  /* skip anything that isn't an atom record */
+    }
+
+    if (cnt==len) {  /* extend atom array */
+      void *tmp = realloc(atom, 2*len*sizeof(Atom));
+      if (NULL==tmp) {
+        fprintf(stderr, "can't allocate more memory\n");
+        return NULL;
+      }
+      atom = (Atom *) tmp;
+      len *= 2;
+    }
+
+    /* read position coordinates and charge from atom record */
+    if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
+          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
+      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
+      return NULL;
+    }
+
+    cnt++;  /* count atoms as we store them */
+  }
+
+  /* verify EOF and close file */
+  if ( !feof(file) ) {
+    fprintf(stderr, "did not find EOF\n");
+    return NULL;
+  }
+  if (fclose(file)) {
+    fprintf(stderr, "can't close file\n");
+    return NULL;
+  }
+#endif
+
+  /* Build the output data structure */
+  {
+    Atoms *out = (Atoms *)malloc(sizeof(Atoms));
+
+    if (NULL == out) {
+      fprintf(stderr, "can't allocate memory\n");
+      return NULL;
+    }
+
+    out->size = cnt;
+    out->atoms = atom;
+
+    return out;
+  }
+}
+
+
+void free_atom(Atoms *atom)
+{
+  if (atom) {
+    free(atom->atoms);
+    free(atom);
+  }
+}
+
+void
+get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
+{
+  Atom *atoms = atom->atoms;
+  int natoms = atom->size;
+  Vec3 lo;
+  Vec3 hi;
+  int n;
+
+  hi.x = lo.x = atoms[0].x;
+  hi.y = lo.y = atoms[0].y;
+  hi.z = lo.z = atoms[0].z;
+
+  for (n = 1; n < natoms; n++) {
+    lo.x = fminf(lo.x, atoms[n].x);
+    hi.x = fmaxf(hi.x, atoms[n].x);
+    lo.y = fminf(lo.y, atoms[n].y);
+    hi.y = fmaxf(hi.y, atoms[n].y);
+    lo.z = fminf(lo.z, atoms[n].z);
+    hi.z = fmaxf(hi.z, atoms[n].z);
+  }
+
+  *out_lo = lo;
+  *out_hi = hi;
+}
--- a/tests/opencl/cutcp2/watbox.sl40.pqr
+++ b/tests/opencl/cutcp2/watbox.sl40.pqr
--- a/tests/opencl/dwt2d/.gitignore
+++ b/tests/opencl/dwt2d/.gitignore
@ -0,0 +1 @@
+*.bmp.dwt.*
--- a/tests/opencl/dwt2d/16.bmp
+++ b/tests/opencl/dwt2d/16.bmp
--- a/tests/opencl/dwt2d/192.bmp
+++ b/tests/opencl/dwt2d/192.bmp
--- a/tests/opencl/dwt2d/Makefile
+++ b/tests/opencl/dwt2d/Makefile
@ -0,0 +1,7 @@
+PROJECT = dwt2d
+
+SRCS = components.cc dwt.cc main.cc
+
+OPTS ?= dwt2d_inputs/._192.bmp -D 192x192 -f -5 -l 3 
+
+include ../common.mk
--- a/tests/opencl/dwt2d/Makefile_old
+++ b/tests/opencl/dwt2d/Makefile_old
@ -0,0 +1,46 @@
+#all:
+#	g++ -o components.o -I/usr/local/cuda-5.5/include -c components.cpp
+#	g++ -o prog  -I/usr/local/cuda-5.5/include main.cpp components.o -lOpenCL
+#
+include ../../common/make.config
+
+#Can be changed by `make TYPE=CPU`
+TYPE = GPU
+
+#Library
+ifeq ($(TYPE),GPU)
+	OPENCL_INC = $(NV_OPENCL_INC)
+	OPENCL_LIB = $(NV_OPENCL_LIB)
+else
+	OPENCL_INC = $(INTEL_OPENCL_INC)
+	OPENCL_LIB = $(INTEL_OPENCL_LIB)
+endif
+
+ifdef OUTPUT
+	override OUTPUT = -DOUTPUT
+endif
+
+OUTPUT = -DOUTPUT
+
+C_C = g++
+OCL_LIB = -lOpenCL -L$(OPENCL_LIB)
+#OCL_INC = -I/usr/local/cuda-5.5/include
+OCL_INC = -I$(OPENCL_INC)
+
+default:	dwt2d
+
+components:
+	$(C_C) -o components.o $(OCL_INC) -c components.cpp 
+
+dwt2d:
+	$(C_C) -o dwt2d.out $(OCL_INC) \
+		-I../util -DTIMING \
+		main.cpp ../util/timing.c \
+		$(components) \
+		$(OUTPUT) \
+		$(OCL_LIB)
+
+clean:	
+	rm -rf *.o dwt2d.out
+	rm *.bmp.dwt.*
+
--- a/tests/opencl/dwt2d/README
+++ b/tests/opencl/dwt2d/README
@ -0,0 +1,22 @@
+//DESCRITPTION
+
+This is the OpenCL version of the code.
+
+The JPEG2000 standard uses 2D Discrete Wavelet Transform (2D DWT), which consumes a significant part of the total encoding time
+
+
+// USE
+**************OUTPUT********************
+USAGE:
+make clean
+make OUTPUT=Y
+
+
+**************PARAMETERS*****************
+USEAGE:
+./dwt2d [otpions] src_img.rgb <out_img.dwt>
+
+  -d, --dimension     dimensions of src img, e.g. 1920x1080
+  -l, --level         DWT level, default 3
+  -f, --forward       forward transform
+  -5, --53            5/3 transform
--- a/tests/opencl/dwt2d/common.h
+++ b/tests/opencl/dwt2d/common.h
@ -0,0 +1,25 @@
+#ifndef _COMMON_H
+#define _COMMON_H
+
+//24-bit multiplication is faster on G80,
+//but we must be sure to multiply integers
+//only within [-8M, 8M - 1] range
+#define IMUL(a, b) __mul24(a, b)
+
+#define DIVANDRND(a, b) ((((a) % (b)) != 0) ? ((a) / (b) + 1) : ((a) / (b)))
+/*
+#  define cudaCheckError( msg ) {                                            \
+    cudaError_t err = cudaGetLastError();                                    \
+    if( cudaSuccess != err) {                                                \
+        fprintf(stderr, "%s: %i: %s: %s.\n",                                 \
+                __FILE__, __LINE__, msg, cudaGetErrorString( err) );         \
+        exit(-1);                                                            \
+    } }
+
+#  define cudaCheckAsyncError( msg ) {                                       \
+    cudaThreadSynchronize();                                                 \
+    cudaCheckError( msg );                                                   \
+    }
+*/
+
+#endif
--- a/tests/opencl/dwt2d/components.cc
+++ b/tests/opencl/dwt2d/components.cc
@ -0,0 +1,29 @@
+#include <unistd.h>
+#include <error.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+
+
+#include "components.h"
+#include "common.h"
+
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/opencl.h>
+#endif
+
+
+
+
+/* the old "components.cu" has been separate into two parts 
+   "components.cpp",contains functions
+   "components.cl", contains all kernel functions
+*/
+
+/* Separate compoents of 8bit RGB source image */
+
+//need add some segments
--- a/tests/opencl/dwt2d/components.h
+++ b/tests/opencl/dwt2d/components.h
@ -0,0 +1,21 @@
+#include <unistd.h>
+#include <error.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <assert.h>
+//#include <stddef.h>
+#ifndef _COMPONENTS_H
+#define _COMPONENTS_H
+
+/* Separate compoents of source 8bit RGB image */
+
+template<typename T>
+void rgbToComponents(T d_r, T d_g, T d_b, unsigned char * src, int width, int height);
+
+
+/* Copy a 8bit source image data into a color compoment of type T */
+//template<typename T>
+//void bwToComponent(T *d_c, unsigned char * src, int width, int height);
+
+#endif
--- a/tests/opencl/dwt2d/dwt.cc
+++ b/tests/opencl/dwt2d/dwt.cc
@ -0,0 +1,95 @@
+#include <stdio.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <error.h>
+/*#include "dwt_cuda/dwt.h"
+#include "dwt_cuda/common.h"
+#include "dwt.h"
+#include "common.h"
+*/
+
+#include "common.h"
+#include "dwt.h"
+#include "common.h"
+/*
+inline void fdwt(float *in, float *out, int width, int height, int levels)
+{
+        dwt_cuda::fdwt97(in, out, width, height, levels);
+}
+
+inline void fdwt(int *in, int *out, int width, int height, int levels)
+{
+        dwt_cuda::fdwt53(in, out, width, height, levels);
+}
+
+inline void rdwt(float *in, float *out, int width, int height, int levels)
+{
+        dwt_cuda::rdwt97(in, out, width, height, levels);
+}
+
+inline void rdwt(int *in, int *out, int width, int height, int levels)
+{
+        dwt_cuda::rdwt53(in, out, width, height, levels);
+}
+*/
+template<typename T>
+int nStage2dDWT(T * in, T * out, T * backup, int pixWidth, int pixHeight, int stages, bool forward)
+{
+    //need add segments
+}
+template int nStage2dDWT<float>(float*, float*, float*, int, int, int, bool);
+template int nStage2dDWT<int>(int*, int*, int*, int, int, int, bool);
+
+void samplesToChar(unsigned char * dst, float * src, int samplesNum)
+{
+    int i;
+
+    for(i = 0; i < samplesNum; i++) {
+        float r = (src[i]+0.5f) * 255;
+        if (r > 255) r = 255; 
+        if (r < 0)   r = 0; 
+        dst[i] = (unsigned char)r;
+    }
+}
+
+void samplesToChar(unsigned char * dst, int * src, int samplesNum)
+{
+    int i;
+
+    for(i = 0; i < samplesNum; i++) {
+        int r = src[i]+128;
+        if (r > 255) r = 255;
+        if (r < 0)   r = 0; 
+        dst[i] = (unsigned char)r;
+    }
+}
+
+///* Write output linear orderd*/
+template<typename T>
+int writeLinear(T *component_cuda, int pixWidth, int pixHeight,
+                const char * filename, const char * suffix)
+{
+	//need add segments
+
+}
+template int writeLinear<float>(float *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); 
+template int writeLinear<int>(int *component_cuda, int pixWidth, int pixHeight, const char * filename, const char * suffix); 
+
+
+
+/* Write output visual ordered */
+template<typename T>
+int writeNStage2DDWT(T *component_cuda, int pixWidth, int pixHeight, 
+                     int stages, const char * filename, const char * suffix) 
+{
+    //need add segments
+}
+template int writeNStage2DDWT<float>(float *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); 
+template int writeNStage2DDWT<int>(int *component_cuda, int pixWidth, int pixHeight, int stages, const char * filename, const char * suffix); 
+
+
+
+
--- a/tests/opencl/dwt2d/dwt.h
+++ b/tests/opencl/dwt2d/dwt.h
@ -0,0 +1,15 @@
+
+#ifndef _DWT_H
+#define _DWT_H
+
+template<typename T> 
+int nStage2dDWT(T *in, T *out, T * backup, int pixWidth, int pixHeight, int stages, bool forward);
+
+template<typename T>
+int writeNStage2DDWT(T *component_cuda, int width, int height, 
+                     int stages, const char * filename, const char * suffix);
+template<typename T>
+int writeLinear(T *component_cuda, int width, int height, 
+                     const char * filename, const char * suffix);
+
+#endif
--- a/tests/opencl/dwt2d/dwt2d
+++ b/tests/opencl/dwt2d/dwt2d
--- a/tests/opencl/dwt2d/dwt2d_data
+++ b/tests/opencl/dwt2d/dwt2d_data
--- a/tests/opencl/dwt2d/dwt2d_inputs/._192.bmp
+++ b/tests/opencl/dwt2d/dwt2d_inputs/._192.bmp
--- a/tests/opencl/dwt2d/dwt2d_inputs/._inputGen
+++ b/tests/opencl/dwt2d/dwt2d_inputs/._inputGen
--- a/tests/opencl/dwt2d/dwt2d_inputs/._rgb.bmp
+++ b/tests/opencl/dwt2d/dwt2d_inputs/._rgb.bmp
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._16.bmp
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._16.bmp
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._192.bmp
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._192.bmp
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._4.bmp
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._4.bmp
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._64.bmp
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._64.bmp
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._8.bmp
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._8.bmp
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._bmp.cpp
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._bmp.cpp
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._bmp.h
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._bmp.h
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._creat.out
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._creat.out
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._create.cpp
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._create.cpp
--- a/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._create.out
+++ b/tests/opencl/dwt2d/dwt2d_inputs/inputGen/._create.out
--- a/Show more
+++ b/Show more