fpga_synthesis merge

This commit is contained in:
Blaise Tine 2020-06-23 12:41:26 -07:00
commit b56fb31a6a
1384 changed files with 5533630 additions and 791310 deletions

6
.gitignore vendored
View file

@ -1,6 +0,0 @@
./rtl/obj_dir/
./rtl/obj_dir/*.vcd
./rtl/.*
./rtl/modelsim/*.vcd
*.vcd
.*

6
Makefile Normal file
View file

@ -0,0 +1,6 @@
all:
$(MAKE) -C hw
$(MAKE) -C driver
$(MAKE) -C runtime
$(MAKE) -C simX

View file

@ -6,13 +6,15 @@ Vortex currently supported RISC-V RV32I ISA
/docs contains documentation.
/runtime contains the runtime software support for Vortex.
/hw constains hardware sources.
/emulator contains a software emulator for Vortex.
/driver contains the driver software.
/runtime contains the kernel runtime software.
/SimX contains a cycle-approximate simulator for Vortex.
/rtl constains Vortex processor hardware description.
/evaluation contains the synthesis/runtime reports.
Basic Instructions to run OpenCL Benchmarks on Vortex
-----------------------------------------------------
@ -24,6 +26,8 @@ Install development tools
Install gnu-riscv-tools
$ export RISC_GNU_TOOLS_PATH=/opt/riscv-gnu-toolchain
$ sudo apt-get -y install \
binutils build-essential libtool texinfo \
gzip zip unzip patchutils curl git \
@ -35,26 +39,27 @@ Install gnu-riscv-tools
$ cd riscv-gnu-toolchain
$ git submodule update --init --recursive
$ mkdir build
$ cd build
$ export RISC_GNU_TOOLS_PATH=$PWD/../drops
$ cd build
$ ../configure --prefix=$RISC_GNU_TOOLS_PATH --with-arch=rv32im --with-abi=ilp32
$ make -j`nproc`
$ make -j`nproc` build-qemu
Install Verilator
You need into build the latest version using the instructions on their website
$ https://www.veripool.org/projects/verilator/wiki/Installing
Install Vortex
$ sudo apt-get install verilator
$ git clone https://github.gatech.edu/casl/Vortex.git
Build SimX
$ cd Vortex/simx
$ cd Vortex
$ make
Run SGEMM OpenCL Benchmark
$ cd Vortex/benchmarks/opencl/sgemm
$ make
<<<<<<< HEAD
$ make run
Basic Instructions to build the OpenCL Compiler for Vortex
@ -82,3 +87,6 @@ Build pocl for RISCV
$ rm -rf *
$ cmake -G Ninja -DCMAKE_INSTALL_PREFIX=$POCL_RT_PATH -DCMAKE_BUILD_TYPE=Debug -DOCS_AVAILABLE=OFF -DBUILD_SHARED_LIBS=OFF -DNEWLIB_BSP=ON -DNEWLIB_DEVICE_ADDRESS_BIT=32 -DNEWLIB_DEVICE_MARCH=rv32im -DBUILD_TESTS=OFF -DHOST_DEVICE_BUILD_HASH=basic-riscv32-unknown-elf -DCMAKE_TOOLCHAIN_FILE=../RISCV_newlib.cmake -DENABLE_TRACING=OFF -DENABLE_ICD=OFF -DPOCL_DEBUG_MESSAGES=ON ..
$ cmake --build . --target install
=======
$ make run
>>>>>>> fpga_synthesis

View file

@ -12,15 +12,14 @@ HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()

View file

@ -12,15 +12,14 @@ HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()

View file

@ -12,15 +12,14 @@ HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()

View file

@ -0,0 +1,816 @@
//------------------------------------------
//--cambine:helper function for OpenCL
//--programmer: Jianbin Fang
//--date: 27/12/2010
//------------------------------------------
#ifndef _CL_HELPER_
#define _CL_HELPER_
#include <CL/cl.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using std::string;
using std::ifstream;
using std::cerr;
using std::endl;
using std::cout;
//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
#define WORK_DIM 2 // work-items dimensions
struct oclHandleStruct {
cl_context context;
cl_device_id *devices;
cl_command_queue queue;
cl_program program;
cl_int cl_status;
std::string error_str;
std::vector<cl_kernel> kernel;
};
struct oclHandleStruct oclHandles;
char kernel_file[100] = "Kernels.cl";
int total_kernels = 2;
string kernel_names[2] = {"BFS_1", "BFS_2"};
int work_group_size = 512;
int device_id_inused = 0; // deviced id used (default : 0)
/*
* Converts the contents of a file into a string
*/
string FileToString(const string fileName) {
ifstream f(fileName.c_str(), ifstream::in | ifstream::binary);
try {
size_t size;
char *str;
string s;
if (f.is_open()) {
size_t fileSize;
f.seekg(0, ifstream::end);
size = fileSize = f.tellg();
f.seekg(0, ifstream::beg);
str = new char[size + 1];
if (!str)
throw(string("Could not allocate memory"));
f.read(str, fileSize);
f.close();
str[size] = '\0';
s = str;
delete[] str;
return s;
}
} catch (std::string msg) {
cerr << "Exception caught in FileToString(): " << msg << endl;
if (f.is_open())
f.close();
} catch (...) {
cerr << "Exception caught in FileToString()" << endl;
if (f.is_open())
f.close();
}
string errorMsg = "FileToString()::Error: Unable to open file " + fileName;
throw(errorMsg);
}
//---------------------------------------
// Read command line parameters
//
void _clCmdParams(int argc, char *argv[]) {
for (int i = 0; i < argc; ++i) {
switch (argv[i][1]) {
case 'g': //--g stands for size of work group
if (++i < argc) {
sscanf(argv[i], "%u", &work_group_size);
} else {
std::cerr << "Could not read argument after option " << argv[i - 1]
<< std::endl;
throw;
}
break;
case 'd': //--d stands for device id used in computaion
if (++i < argc) {
sscanf(argv[i], "%u", &device_id_inused);
} else {
std::cerr << "Could not read argument after option " << argv[i - 1]
<< std::endl;
throw;
}
break;
default:;
}
}
}
//---------------------------------------
// Initlize CL objects
//--description: there are 5 steps to initialize all the OpenCL objects needed
//--revised on 04/01/2011: get the number of devices and
// devices have no relationship with context
void _clInit() {
printf("_clInit()\n");
int DEVICE_ID_INUSED = device_id_inused;
cl_int resultCL;
oclHandles.context = NULL;
oclHandles.devices = NULL;
oclHandles.queue = NULL;
oclHandles.program = NULL;
cl_uint deviceListSize;
//-----------------------------------------------
//--cambine-1: find the available platforms and select one
cl_uint numPlatforms = 1;
cl_platform_id targetPlatform = NULL;
cl_platform_id *allPlatforms =
(cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
if (resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
// Select the target platform. Default: first platform
targetPlatform = allPlatforms[0];
/*for (int i = 0; i < numPlatforms; i++)
{
char pbuff[128];
resultCL = clGetPlatformInfo( allPlatforms[i],
CL_PLATFORM_VENDOR,
sizeof(pbuff),
pbuff,
NULL);
if (resultCL != CL_SUCCESS)
throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
//printf("vedor is %s\n",pbuff);
}
free(allPlatforms);*/
//-----------------------------------------------
//--cambine-2: create an OpenCL context
/*cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM,
(cl_context_properties)targetPlatform, 0 };
oclHandles.context = clCreateContextFromType(cprops,
CL_DEVICE_TYPE_GPU,
NULL,
NULL,
&resultCL);
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
throw (string("InitCL()::Error: Creating Context
(clCreateContextFromType)"));
//-----------------------------------------------
//--cambine-3: detect OpenCL devices
// First, get the size of device list
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0,
NULL, &deviceListSize);
if(oclHandles.cl_status!=CL_SUCCESS){
throw(string("exception in _clInit -> clGetDeviceIDs"));
}
if (deviceListSize == 0)
throw(string("InitCL()::Error: No devices found."));
printf("OK1()\n");
//std::cout<<"device number:"<<deviceListSize<<std::endl;*/
// Now, allocate the device list
deviceListSize = 1;
oclHandles.devices =
(cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
if (oclHandles.devices == 0)
throw(string("InitCL()::Error: Could not allocate memory."));
//* Next, get the device list data
oclHandles.cl_status =
clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_DEFAULT, deviceListSize,
oclHandles.devices, NULL);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string("exception in _clInit -> clGetDeviceIDs-2"));
}
oclHandles.context = clCreateContext(NULL, deviceListSize, oclHandles.devices,
NULL, NULL, &resultCL);
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
throw(string("InitCL()::Error: Creating Context (clCreateContext)"));
//-----------------------------------------------
//--cambine-4: Create an OpenCL command queue
oclHandles.queue = clCreateCommandQueue(
oclHandles.context, oclHandles.devices[DEVICE_ID_INUSED], 0, &resultCL);
printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue);
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
//-----------------------------------------------
//--cambine-5: Load CL file, build CL program object, create CL kernel object
/*std::string source_str = FileToString(kernel_file);
const char * source = source_str.c_str();
size_t sourceSize[] = { source_str.length() };*/
oclHandles.program = clCreateProgramWithBuiltInKernels(
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED],
"BFS_1;BFS_2", &resultCL);
/*oclHandles.program = clCreateProgramWithSource(oclHandles.context,
1,
&source,
sourceSize,
&resultCL);*/
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. "
"(clCreateProgramWithBinary)"));
// insert debug information
// std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
// options += " -cl-nv-opt-level=3";
resultCL = clBuildProgram(oclHandles.program, deviceListSize,
oclHandles.devices, NULL, NULL, NULL);
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) {
cerr << "InitCL()::Error: In clBuildProgram" << endl;
size_t length;
resultCL = clGetProgramBuildInfo(oclHandles.program,
oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
if (resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting Program build "
"info(clGetProgramBuildInfo)"));
char *buffer = (char *)malloc(length);
resultCL = clGetProgramBuildInfo(
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG, length, buffer, NULL);
if (resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting Program build "
"info(clGetProgramBuildInfo)"));
cerr << buffer << endl;
free(buffer);
throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
}
// get program information in intermediate representation
#ifdef PTX_MSG
size_t binary_sizes[deviceListSize];
char *binaries[deviceListSize];
// figure out number of devices and the sizes of the binary for each device.
oclHandles.cl_status =
clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES,
sizeof(size_t) * deviceListSize, &binary_sizes, NULL);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
}
std::cout << "--cambine:" << binary_sizes << std::endl;
// copy over all of the generated binaries.
for (int i = 0; i < deviceListSize; i++)
binaries[i] = (char *)malloc(sizeof(char) * (binary_sizes[i] + 1));
oclHandles.cl_status =
clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES,
sizeof(char *) * deviceListSize, binaries, NULL);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
}
for (int i = 0; i < deviceListSize; i++)
binaries[i][binary_sizes[i]] = '\0';
std::cout << "--cambine:writing ptd information..." << std::endl;
FILE *ptx_file = fopen("cl.ptx", "w");
if (ptx_file == NULL) {
throw(string("exceptions in allocate ptx file."));
}
fprintf(ptx_file, "%s", binaries[DEVICE_ID_INUSED]);
fclose(ptx_file);
std::cout << "--cambine:writing ptd information done." << std::endl;
for (int i = 0; i < deviceListSize; i++)
free(binaries[i]);
#endif
for (int nKernel = 0; nKernel < total_kernels; nKernel++) {
/* get a kernel object handle for a kernel with the given name */
cl_kernel kernel = clCreateKernel(
oclHandles.program, (kernel_names[nKernel]).c_str(), &resultCL);
if ((resultCL != CL_SUCCESS) || (kernel == NULL)) {
string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" +
kernel_names[nKernel] + "\"";
throw(errorMsg);
}
oclHandles.kernel.push_back(kernel);
}
// get resource alocation information
#ifdef RES_MSG
char *build_log;
size_t ret_val_size;
oclHandles.cl_status = clGetProgramBuildInfo(
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string("exceptions in _InitCL -> getting resource information"));
}
build_log = (char *)malloc(ret_val_size + 1);
oclHandles.cl_status = clGetProgramBuildInfo(
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string(
"exceptions in _InitCL -> getting resources allocation information-2"));
}
build_log[ret_val_size] = '\0';
std::cout << "--cambine:" << build_log << std::endl;
free(build_log);
#endif
}
//---------------------------------------
// release CL objects
void _clRelease() {
char errorFlag = false;
for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++) {
if (oclHandles.kernel[nKernel] != NULL) {
cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
if (resultCL != CL_SUCCESS) {
cerr << "ReleaseCL()::Error: In clReleaseKernel" << endl;
errorFlag = true;
}
oclHandles.kernel[nKernel] = NULL;
}
oclHandles.kernel.clear();
}
if (oclHandles.program != NULL) {
cl_int resultCL = clReleaseProgram(oclHandles.program);
if (resultCL != CL_SUCCESS) {
cerr << "ReleaseCL()::Error: In clReleaseProgram" << endl;
errorFlag = true;
}
oclHandles.program = NULL;
}
if (oclHandles.queue != NULL) {
cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
if (resultCL != CL_SUCCESS) {
cerr << "ReleaseCL()::Error: In clReleaseCommandQueue" << endl;
errorFlag = true;
}
oclHandles.queue = NULL;
}
free(oclHandles.devices);
if (oclHandles.context != NULL) {
cl_int resultCL = clReleaseContext(oclHandles.context);
if (resultCL != CL_SUCCESS) {
cerr << "ReleaseCL()::Error: In clReleaseContext" << endl;
errorFlag = true;
}
oclHandles.context = NULL;
}
if (errorFlag)
throw(string("ReleaseCL()::Error encountered."));
}
//--------------------------------------------------------
//--cambine:create buffer and then copy data from host to device
cl_mem _clCreateAndCpyMem(int size, void *h_mem_source) throw(string) {
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, size,
h_mem_source, &oclHandles.cl_status);
#ifdef ERRMSG
if (oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem()"));
#endif
return d_mem;
}
//-------------------------------------------------------
//--cambine: create read only buffer for devices
//--date: 17/01/2011
cl_mem _clMallocRW(int size, void *h_mem_ptr) throw(string) {
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context,
CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, size,
h_mem_ptr, &oclHandles.cl_status);
#ifdef ERRMSG
if (oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clMallocRW"));
#endif
return d_mem;
}
//-------------------------------------------------------
//--cambine: create read and write buffer for devices
//--date: 17/01/2011
cl_mem _clMalloc(int size, void *h_mem_ptr) throw(string) {
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context,
CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, size,
h_mem_ptr, &oclHandles.cl_status);
#ifdef ERRMSG
if (oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clMalloc"));
#endif
return d_mem;
}
//-------------------------------------------------------
//--cambine: transfer data from host to device
//--date: 17/01/2011
void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string) {
oclHandles.cl_status = clEnqueueWriteBuffer(
oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_ptr, 0, NULL, NULL);
#ifdef ERRMSG
if (oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clMemcpyH2D"));
#endif
}
//--------------------------------------------------------
//--cambine:create buffer and then copy data from host to device with pinned
// memory
cl_mem _clCreateAndCpyPinnedMem(int size, float *h_mem_source) throw(string) {
cl_mem d_mem, d_mem_pinned;
float *h_mem_pinned = NULL;
d_mem_pinned = clCreateBuffer(oclHandles.context,
CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size,
NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if (oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
#endif
//------------
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, size, NULL,
&oclHandles.cl_status);
#ifdef ERRMSG
if (oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
#endif
//----------
h_mem_pinned = (cl_float *)clEnqueueMapBuffer(
oclHandles.queue, d_mem_pinned, CL_TRUE, CL_MAP_WRITE, 0, size, 0, NULL,
NULL, &oclHandles.cl_status);
#ifdef ERRMSG
if (oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
#endif
int element_number = size / sizeof(float);
#pragma omp parallel for
for (int i = 0; i < element_number; i++) {
h_mem_pinned[i] = h_mem_source[i];
}
//----------
oclHandles.cl_status = clEnqueueWriteBuffer(
oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_pinned, 0, NULL, NULL);
#ifdef ERRMSG
if (oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
#endif
return d_mem;
}
//--------------------------------------------------------
//--cambine:create write only buffer on device
cl_mem _clMallocWO(int size) throw(string) {
cl_mem d_mem;
d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0,
&oclHandles.cl_status);
#ifdef ERRMSG
if (oclHandles.cl_status != CL_SUCCESS)
throw(string("excpetion in _clCreateMem()"));
#endif
return d_mem;
}
//--------------------------------------------------------
// transfer data from device to host
void _clMemcpyD2H(cl_mem d_mem, int size, void *h_mem) throw(string) {
oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE,
0, size, h_mem, 0, 0, 0);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
switch (oclHandles.cl_status) {
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_INVALID_VALUE:
oclHandles.error_str += "CL_INVALID_VALUE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reason";
break;
}
if (oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
}
//--------------------------------------------------------
// set kernel arguments
void _clSetArgs(int kernel_id, int arg_idx, void *d_mem,
int size = 0) throw(string) {
if (!size) {
oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx,
sizeof(d_mem), &d_mem);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
switch (oclHandles.cl_status) {
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_ARG_INDEX:
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
break;
case CL_INVALID_ARG_VALUE:
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
break;
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_INVALID_SAMPLER:
oclHandles.error_str += "CL_INVALID_SAMPLER";
break;
case CL_INVALID_ARG_SIZE:
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reason";
break;
}
if (oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
} else {
oclHandles.cl_status =
clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clSetKernelArg() ";
switch (oclHandles.cl_status) {
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_ARG_INDEX:
oclHandles.error_str += "CL_INVALID_ARG_INDEX";
break;
case CL_INVALID_ARG_VALUE:
oclHandles.error_str += "CL_INVALID_ARG_VALUE";
break;
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_INVALID_SAMPLER:
oclHandles.error_str += "CL_INVALID_SAMPLER";
break;
case CL_INVALID_ARG_SIZE:
oclHandles.error_str += "CL_INVALID_ARG_SIZE";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reason";
break;
}
if (oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
}
}
void _clFinish() throw(string) {
oclHandles.cl_status = clFinish(oclHandles.queue);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clFinish";
switch (oclHandles.cl_status) {
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unknown reasons";
break;
}
if (oclHandles.cl_status != CL_SUCCESS) {
throw(oclHandles.error_str);
}
#endif
}
//--------------------------------------------------------
//--cambine:enqueue kernel
void _clInvokeKernel(int kernel_id, int work_items,
int work_group_size) throw(string) {
cl_uint work_dim = WORK_DIM;
cl_event e[1];
if (work_items % work_group_size != 0) // process situations that work_items
// cannot be divided by work_group_size
work_items =
work_items + (work_group_size - (work_items % work_group_size));
size_t local_work_size[] = {work_group_size, 1};
size_t global_work_size[] = {work_items, 1};
oclHandles.cl_status = clEnqueueNDRangeKernel(
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
global_work_size, local_work_size, 0, 0, &(e[0]));
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
switch (oclHandles.cl_status) {
case CL_INVALID_PROGRAM_EXECUTABLE:
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
break;
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_KERNEL_ARGS:
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
break;
case CL_INVALID_WORK_DIMENSION:
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
break;
case CL_INVALID_GLOBAL_WORK_SIZE:
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
break;
case CL_INVALID_WORK_GROUP_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
break;
case CL_INVALID_WORK_ITEM_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
break;
case CL_INVALID_GLOBAL_OFFSET:
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if (oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
//_clFinish();
// oclHandles.cl_status = clWaitForEvents(1, &e[0]);
// #ifdef ERRMSG
// if (oclHandles.cl_status!= CL_SUCCESS)
// throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
// #endif
}
void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x,
int group_y) throw(string) {
cl_uint work_dim = WORK_DIM;
size_t local_work_size[] = {group_x, group_y};
size_t global_work_size[] = {range_x, range_y};
cl_event e[1];
/*if(work_items%work_group_size != 0) //process situations that work_items
cannot be divided by work_group_size
work_items = work_items + (work_group_size-(work_items%work_group_size));*/
oclHandles.cl_status = clEnqueueNDRangeKernel(
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
global_work_size, local_work_size, 0, 0, &(e[0]));
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
switch (oclHandles.cl_status) {
case CL_INVALID_PROGRAM_EXECUTABLE:
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
break;
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_KERNEL_ARGS:
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
break;
case CL_INVALID_WORK_DIMENSION:
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
break;
case CL_INVALID_GLOBAL_WORK_SIZE:
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
break;
case CL_INVALID_WORK_GROUP_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
break;
case CL_INVALID_WORK_ITEM_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
break;
case CL_INVALID_GLOBAL_OFFSET:
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if (oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
//_clFinish();
/*oclHandles.cl_status = clWaitForEvents(1, &e[0]);
#ifdef ERRMSG
if (oclHandles.cl_status!= CL_SUCCESS)
throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
#endif*/
}
//--------------------------------------------------------
// release OpenCL objects
void _clFree(cl_mem ob) throw(string) {
if (ob != NULL)
oclHandles.cl_status = clReleaseMemObject(ob);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clFree() ->";
switch (oclHandles.cl_status) {
case CL_INVALID_MEM_OBJECT:
oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if (oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
}
#endif //_CL_HELPER_

View file

@ -0,0 +1,67 @@
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = bfs
SRCS = main.cc
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,53 @@
/* ============================================================
//--cambine: kernel funtion of Breadth-First-Search
//--author: created by Jianbin Fang
//--date: 06/12/2010
============================================================ */
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
//Structure to hold a node information
typedef struct{
int starting;
int no_of_edges;
} Node;
//--7 parameters
__kernel void BFS_1( const __global Node* g_graph_nodes,
const __global int* g_graph_edges,
__global char* g_graph_mask,
__global char* g_updating_graph_mask,
__global char* g_graph_visited,
__global int* g_cost,
const int no_of_nodes){
int tid = get_global_id(0);
if( tid<no_of_nodes && g_graph_mask[tid]){
g_graph_mask[tid]=false;
for(int i=g_graph_nodes[tid].starting; i<(g_graph_nodes[tid].no_of_edges + g_graph_nodes[tid].starting); i++){
int id = g_graph_edges[i];
if(!g_graph_visited[id]){
g_cost[id]=g_cost[tid]+1;
g_updating_graph_mask[id]=true;
}
}
}
}
//--5 parameters
__kernel void BFS_2(__global char* g_graph_mask,
__global char* g_updating_graph_mask,
__global char* g_graph_visited,
__global char* g_over,
const int no_of_nodes
) {
int tid = get_global_id(0);
if( tid<no_of_nodes && g_updating_graph_mask[tid]){
g_graph_mask[tid]=true;
g_graph_visited[tid]=true;
*g_over=true;
g_updating_graph_mask[tid]=false;
}
}

297
benchmarks/old_opencl/bfs/main.cc Executable file
View file

@ -0,0 +1,297 @@
//--by Jianbin Fang
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#ifdef PROFILING
#include "timer.h"
#endif
#include "CLHelper.h"
#include "util.h"
#define MAX_THREADS_PER_BLOCK 256
// Structure to hold a node information
struct Node {
int starting;
int no_of_edges;
};
//----------------------------------------------------------
//--bfs on cpu
//--programmer: jianbin
//--date: 26/01/2011
//--note: width is changed to the new_width
//----------------------------------------------------------
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask,
char *h_updating_graph_mask, char *h_graph_visited,
int *h_cost_ref) {
char stop;
int k = 0;
do {
// if no thread changes this value then the loop stops
stop = false;
for (int tid = 0; tid < no_of_nodes; tid++) {
if (h_graph_mask[tid] == true) {
h_graph_mask[tid] = false;
for (int i = h_graph_nodes[tid].starting;
i < (h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting);
i++) {
int id =
h_graph_edges[i]; //--cambine: node id is connected with node tid
if (!h_graph_visited[id]) { //--cambine: if node id has not been
//visited, enter the body below
h_cost_ref[id] = h_cost_ref[tid] + 1;
h_updating_graph_mask[id] = true;
}
}
}
}
for (int tid = 0; tid < no_of_nodes; tid++) {
if (h_updating_graph_mask[tid] == true) {
h_graph_mask[tid] = true;
h_graph_visited[tid] = true;
stop = true;
h_updating_graph_mask[tid] = false;
}
}
k++;
} while (stop);
}
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask,
char *h_updating_graph_mask, char *h_graph_visited,
int *h_cost) throw(std::string) {
// int number_elements = height*width;
char h_over;
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask,
d_graph_visited, d_cost, d_over;
try {
//--1 transfer data from host to device
_clInit();
d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
d_updating_graph_mask =
_clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
d_over = _clMallocRW(sizeof(char), &h_over);
_clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
_clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
_clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
h_updating_graph_mask);
_clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
_clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
//--2 invoke kernel
#ifdef PROFILING
timer kernel_timer;
double kernel_time = 0.0;
kernel_timer.reset();
kernel_timer.start();
#endif
do {
h_over = false;
_clMemcpyH2D(d_over, sizeof(char), &h_over);
//--kernel 0
int kernel_id = 0;
int kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
_clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_cost);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
// int work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
//--kernel 1
kernel_id = 1;
kernel_idx = 0;
_clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
_clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
_clSetArgs(kernel_id, kernel_idx++, d_over);
_clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
// work_items = no_of_nodes;
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
_clMemcpyD2H(d_over, sizeof(char), &h_over);
} while (h_over);
_clFinish();
#ifdef PROFILING
kernel_timer.stop();
kernel_time = kernel_timer.getTimeInSeconds();
#endif
//--3 transfer data from device to host
_clMemcpyD2H(d_cost, no_of_nodes * sizeof(int), h_cost);
//--statistics
#ifdef PROFILING
std::cout << "kernel time(s):" << kernel_time << std::endl;
#endif
//--4 release cl resources.
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
} catch (std::string msg) {
_clFree(d_graph_nodes);
_clFree(d_graph_edges);
_clFree(d_graph_mask);
_clFree(d_updating_graph_mask);
_clFree(d_graph_visited);
_clFree(d_cost);
_clFree(d_over);
_clRelease();
std::string e_str = "in run_transpose_gpu -> ";
e_str += msg;
throw(e_str);
}
return;
}
//----------------------------------------------------------
//--cambine: main function
//--author: created by Jianbin Fang
//--date: 25/01/2011
//----------------------------------------------------------
int main(int argc, char *argv[]) {
printf("enter demo main\n");
int no_of_nodes;
int edge_list_size;
FILE *fp;
Node *h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
try {
char *input_f = "graph4096.txt";
printf("Reading File\n");
// Read in Graph from a file
fp = fopen(input_f, "r");
if (!fp) {
printf("Error Reading graph file\n");
return 0;
}
printf("Reading File completed!\n");
int source = 0;
fscanf(fp, "%d", &no_of_nodes);
int num_of_blocks = 1;
int num_of_threads_per_block = no_of_nodes;
// Make execution Parameters according to the number of nodes
// Distribute threads across multiple Blocks if necessary
if (no_of_nodes > MAX_THREADS_PER_BLOCK) {
num_of_blocks = (int)ceil(no_of_nodes / (double)MAX_THREADS_PER_BLOCK);
num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
}
work_group_size = num_of_threads_per_block;
// allocate host memory
h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes);
int start, edgeno;
// initalize the memory
for (int i = 0; i < no_of_nodes; i++) {
fscanf(fp, "%d %d", &start, &edgeno);
h_graph_nodes[i].starting = start;
h_graph_nodes[i].no_of_edges = edgeno;
h_graph_mask[i] = false;
h_updating_graph_mask[i] = false;
h_graph_visited[i] = false;
}
// read the source node from the file
fscanf(fp, "%d", &source);
source = 0;
// set the source node as true in the mask
h_graph_mask[source] = true;
h_graph_visited[source] = true;
fscanf(fp, "%d", &edge_list_size);
int id, cost;
int *h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
for (int i = 0; i < edge_list_size; i++) {
fscanf(fp, "%d", &id);
fscanf(fp, "%d", &cost);
h_graph_edges[i] = id;
}
if (fp)
fclose(fp);
// allocate mem for the result on host side
int *h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
int *h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes);
for (int i = 0; i < no_of_nodes; i++) {
h_cost[i] = -1;
h_cost_ref[i] = -1;
}
h_cost[source] = 0;
h_cost_ref[source] = 0;
//---------------------------------------------------------
//--gpu entry
run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
//---------------------------------------------------------
//--cpu entry
// initalize the memory again
for (int i = 0; i < no_of_nodes; i++) {
h_graph_mask[i] = false;
h_updating_graph_mask[i] = false;
h_graph_visited[i] = false;
}
// set the source node as true in the mask
source = 0;
h_graph_mask[source] = true;
h_graph_visited[source] = true;
run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
h_graph_mask, h_updating_graph_mask, h_graph_visited,
h_cost_ref);
//---------------------------------------------------------
//--result varification
compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
// release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
} catch (std::string msg) {
std::cout << "--cambine: exception in main ->" << msg << std::endl;
// release host memory
free(h_graph_nodes);
free(h_graph_mask);
free(h_updating_graph_mask);
free(h_graph_visited);
}
return 0;
}

1
benchmarks/old_opencl/bfs/run Executable file
View file

@ -0,0 +1 @@
./bfs ../../data/bfs/graph1MW_6.txt

View file

@ -0,0 +1,78 @@
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include "timer.h"
using namespace std;
double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
double timer::get_CPU_speed_in_MHz()
{
#if defined __linux__
ifstream infile("/proc/cpuinfo");
char buffer[256], *colon;
while (infile.good()) {
infile.getline(buffer, 256);
if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
return atof(colon + 2);
}
#endif
return 0.0;
}
void timer::print_time(ostream &str, const char *which, double time) const
{
static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 };
const char **unit = units;
time = 1000.0 * time / CPU_speed_in_MHz;
while (time >= 999.5 && unit[1] != 0) {
time /= 1000.0;
++ unit;
}
str << which << " = " << setprecision(3) << setw(4) << time << *unit;
}
ostream &timer::print(ostream &str)
{
str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right;
if (CPU_speed_in_MHz == 0)
str << "could not determine CPU speed\n";
else if (count > 0) {
double total = static_cast<double>(total_time);
print_time(str, "avg", total / static_cast<double>(count));
print_time(str, ", total", total);
str << ", count = " << setw(9) << count << '\n';
}
else
str << "not used\n";
return str;
}
ostream &operator << (ostream &str, class timer &timer)
{
return timer.print(str);
}
double timer::getTimeInSeconds()
{
double total = static_cast<double>(total_time);
double res = (total / 1000000.0) / CPU_speed_in_MHz;
return res;
}

101
benchmarks/old_opencl/bfs/timer.h Executable file
View file

@ -0,0 +1,101 @@
#ifndef timer_h
#define timer_h
#include <iostream>
class timer {
public:
timer(const char *name = 0);
timer(const char *name, std::ostream &write_on_exit);
~timer();
void start(), stop();
void reset();
std::ostream &print(std::ostream &);
double getTimeInSeconds();
private:
void print_time(std::ostream &, const char *which, double time) const;
union {
long long total_time;
struct {
#if defined __PPC__
int high, low;
#else
int low, high;
#endif
};
};
unsigned long long count;
const char *const name;
std::ostream *const write_on_exit;
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
};
std::ostream &operator<<(std::ostream &, class timer &);
inline void timer::reset() {
total_time = 0;
count = 0;
}
inline timer::timer(const char *name) : name(name), write_on_exit(0) {
reset();
}
inline timer::timer(const char *name, std::ostream &write_on_exit)
: name(name), write_on_exit(&write_on_exit) {
reset();
}
inline timer::~timer() {
if (write_on_exit != 0)
print(*write_on_exit);
}
inline void timer::start() {
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx;
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time -= ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
(defined __i386 || defined __x86_64)
asm volatile("rdtsc\n\t"
"subl %%eax, %0\n\t"
"sbbl %%edx, %1"
: "+m"(low), "+m"(high)
:
: "eax", "edx");
#else
#error Compiler/Architecture not recognized
#endif
}
inline void timer::stop() {
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
unsigned eax, edx;
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time += ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
(defined __i386 || defined __x86_64)
asm volatile("rdtsc\n\t"
"addl %%eax, %0\n\t"
"adcl %%edx, %1"
: "+m"(low), "+m"(high)
:
: "eax", "edx");
#endif
++count;
}
#endif

View file

@ -0,0 +1,72 @@
#ifndef _C_UTIL_
#define _C_UTIL_
#include <math.h>
#include <iostream>
//-------------------------------------------------------------------
//--initialize array with maximum limit
//-------------------------------------------------------------------
template<typename datatype>
void fill(datatype *A, const int n, const datatype maxi){
for (int j = 0; j < n; j++)
{
A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f)));
}
}
//--print matrix
template<typename datatype>
void print_matrix(datatype *A, int height, int width){
for(int i=0; i<height; i++){
for(int j=0; j<width; j++){
int idx = i*width + j;
std::cout<<A[idx]<<" ";
}
std::cout<<std::endl;
}
return;
}
//-------------------------------------------------------------------
//--verify results
//-------------------------------------------------------------------
#define MAX_RELATIVE_ERROR .002
template<typename datatype>
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
char passed = true;
#pragma omp parallel for
for (int i=0; i<size; i++){
if (fabs(cpuResults[i] - gpuResults[i]) / cpuResults[i] > MAX_RELATIVE_ERROR){
passed = false;
}
}
if (passed){
std::cout << "--cambine:passed:-)" << endl;
}
else{
std::cout << "--cambine: failed:-(" << endl;
}
return ;
}
template<typename datatype>
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
char passed = true;
//#pragma omp parallel for
for (int i=0; i<size; i++){
if (cpu_results[i]!=gpu_results[i]){
passed = false;
}
}
if (passed){
std::cout << "--cambine:passed:-)" << endl;
}
else{
std::cout << "--cambine: failed:-(" << endl;
}
return ;
}
#endif

View file

@ -0,0 +1,67 @@
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = convolution
SRCS = main.cpp utils.cpp
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

View file

@ -0,0 +1,54 @@
__kernel
void convolution(
__read_only image2d_t sourceImage,
__write_only image2d_t outputImage,
int rows,
int cols,
__constant float* filter,
int filterWidth,
sampler_t sampler)
{
// Store each work-items unique row and column
int column = get_global_id(0);
int row = get_global_id(1);
// Half the width of the filter is needed for indexing
// memory later
int halfWidth = (int)(filterWidth/2);
// All accesses to images return data as four-element vector
// (i.e., float4), although only the 'x' component will contain
// meaningful data in this code
float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
// Iterator for the filter
int filterIdx = 0;
// Each work-item iterates around its local area based on the
// size of the filter
int2 coords; // Coordinates for accessing the image
// Iterate the filter rows
for(int i = -halfWidth; i <= halfWidth; i++) {
coords.y = row + i;
// Iterate over the filter columns
for(int j = -halfWidth; j <= halfWidth; j++) {
coords.x = column + j;
float4 pixel;
// Read a pixel from the image. A single channel image
// stores the pixel in the 'x' coordinate of the returned
// vector.
pixel = read_imagef(sourceImage, sampler, coords);
sum.x += pixel.x * filter[filterIdx++];
}
}
// Copy the data to the output image if the
// work-item is in bounds
if(row < rows && column < cols) {
coords.x = column;
coords.y = row;
write_imagef(outputImage, coords, sum);
}
}

View file

@ -0,0 +1,261 @@
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include "utils.h"
// This function takes a positive integer and rounds it up to
// the nearest multiple of another provided integer
unsigned int roundUp(unsigned int value, unsigned int multiple) {
// Determine how far past the nearest multiple the value is
unsigned int remainder = value % multiple;
// Add the difference to make the value a multiple
if(remainder != 0) {
value += (multiple-remainder);
}
return value;
}
// This function reads in a text file and stores it as a char pointer
char* readSource(char* kernelPath) {
cl_int status;
FILE *fp;
char *source;
long int size;
printf("Program file is: %s\n", kernelPath);
fp = fopen(kernelPath, "rb");
if(!fp) {
printf("Could not open kernel file\n");
exit(-1);
}
status = fseek(fp, 0, SEEK_END);
if(status != 0) {
printf("Error seeking to end of file\n");
exit(-1);
}
size = ftell(fp);
if(size < 0) {
printf("Error getting file position\n");
exit(-1);
}
rewind(fp);
source = (char *)malloc(size + 1);
int i;
for (i = 0; i < size+1; i++) {
source[i]='\0';
}
if(source == NULL) {
printf("Error allocating space for the kernel source\n");
exit(-1);
}
fread(source, 1, size, fp);
source[size] = '\0';
return source;
}
void chk(cl_int status, const char* cmd) {
if(status != CL_SUCCESS) {
printf("%s failed (%d)\n", cmd, status);
exit(-1);
}
}
int main() {
int i, j, k, l;
// Rows and columns in the input image
int imageHeight;
int imageWidth;
const char* inputFile = "input.bmp";
const char* outputFile = "output.bmp";
// Homegrown function to read a BMP from file
float* inputImage = readImage(inputFile, &imageWidth,
&imageHeight);
// Size of the input and output images on the host
int dataSize = imageHeight*imageWidth*sizeof(float);
// Output image on the host
float* outputImage = NULL;
outputImage = (float*)malloc(dataSize);
float* refImage = NULL;
refImage = (float*)malloc(dataSize);
// 45 degree motion blur
float filter[49] =
{0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, -1, 0, 1, 0, 0,
0, 0, -2, 0, 2, 0, 0,
0, 0, -1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0};
// The convolution filter is 7x7
int filterWidth = 7;
int filterSize = filterWidth*filterWidth; // Assume a square kernel
// Set up the OpenCL environment
cl_int status;
// Discovery platform
cl_platform_id platform;
status = clGetPlatformIDs(1, &platform, NULL);
chk(status, "clGetPlatformIDs");
// Discover device
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
chk(status, "clGetDeviceIDs");
// Create context
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
(cl_context_properties)(platform), 0};
cl_context context;
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
chk(status, "clCreateContext");
// Create command queue
cl_command_queue queue;
queue = clCreateCommandQueue(context, device, 0, &status);
chk(status, "clCreateCommandQueue");
// The image format describes how the data will be stored in memory
cl_image_format format;
format.image_channel_order = CL_R; // single channel
format.image_channel_data_type = CL_FLOAT; // float data type
// Create space for the source image on the device
cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
imageHeight, 0, NULL, &status);
chk(status, "clCreateImage2D");
// Create space for the output image on the device
cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
imageHeight, 0, NULL, &status);
chk(status, "clCreateImage2D");
// Create space for the 7x7 filter on the device
cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
NULL, &status);
chk(status, "clCreateBuffer");
// Copy the source image to the device
size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
0, 0, inputImage, 0, NULL, NULL);
chk(status, "clEnqueueWriteImage");
// Copy the 7x7 filter to the device
status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
filterSize*sizeof(float), filter, 0, NULL, NULL);
chk(status, "clEnqueueWriteBuffer");
// Create the image sampler
cl_sampler sampler = clCreateSampler(context, CL_FALSE,
CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
chk(status, "clCreateSampler");
const char* source = readSource("kernel.cl");
// Create a program object with source and build it
cl_program program;
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
chk(status, "clCreateProgramWithSource");
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
chk(status, "clBuildProgram");
// Create the kernel object
cl_kernel kernel;
kernel = clCreateKernel(program, "convolution", &status);
chk(status, "clCreateKernel");
// Set the kernel arguments
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
chk(status, "clSetKernelArg");
// Set the work item dimensions
size_t globalSize[2] = {imageWidth, imageHeight};
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
NULL, NULL);
chk(status, "clEnqueueNDRange");
// Read the image back to the host
status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
region, 0, 0, outputImage, 0, NULL, NULL);
chk(status, "clEnqueueReadImage");
// Write the output image to file
storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
// Compute the reference image
for(i = 0; i < imageHeight; i++) {
for(j = 0; j < imageWidth; j++) {
refImage[i*imageWidth+j] = 0;
}
}
// Iterate over the rows of the source image
int halfFilterWidth = filterWidth/2;
float sum;
for(i = 0; i < imageHeight; i++) {
// Iterate over the columns of the source image
for(j = 0; j < imageWidth; j++) {
sum = 0; // Reset sum for new source pixel
// Apply the filter to the neighborhood
for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
if(i+k >= 0 && i+k < imageHeight &&
j+l >= 0 && j+l < imageWidth) {
sum += inputImage[(i+k)*imageWidth + j+l] *
filter[(k+halfFilterWidth)*filterWidth +
l+halfFilterWidth];
}
}
}
refImage[i*imageWidth+j] = sum;
}
}
int failed = 0;
for(i = 0; i < imageHeight; i++) {
for(j = 0; j < imageWidth; j++) {
if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
printf("Results are INCORRECT\n");
printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
failed = 1;
}
if(failed) break;
}
if(failed) break;
}
if(!failed) {
printf("Results are correct\n");
}
return 0;
}

View file

@ -0,0 +1,180 @@
#include <stdio.h>
#include <stdlib.h>
#include "utils.h"
void storeImage(float *imageOut,
const char *filename,
int rows,
int cols,
const char* refFilename) {
FILE *ifp, *ofp;
unsigned char tmp;
int offset;
unsigned char *buffer;
int i, j;
int bytes;
int height, width;
ifp = fopen(refFilename, "rb");
if(ifp == NULL) {
perror(filename);
exit(-1);
}
fseek(ifp, 10, SEEK_SET);
fread(&offset, 4, 1, ifp);
fseek(ifp, 18, SEEK_SET);
fread(&width, 4, 1, ifp);
fread(&height, 4, 1, ifp);
fseek(ifp, 0, SEEK_SET);
buffer = (unsigned char *)malloc(offset);
if(buffer == NULL) {
perror("malloc");
exit(-1);
}
fread(buffer, 1, offset, ifp);
printf("Writing output image to %s\n", filename);
ofp = fopen(filename, "wb");
if(ofp == NULL) {
perror("opening output file");
exit(-1);
}
bytes = fwrite(buffer, 1, offset, ofp);
if(bytes != offset) {
printf("error writing header!\n");
exit(-1);
}
// NOTE bmp formats store data in reverse raster order (see comment in
// readImage function), so we need to flip it upside down here.
int mod = width % 4;
if(mod != 0) {
mod = 4 - mod;
}
// printf("mod = %d\n", mod);
for(i = height-1; i >= 0; i--) {
for(j = 0; j < width; j++) {
tmp = (unsigned char)imageOut[i*cols+j];
fwrite(&tmp, sizeof(char), 1, ofp);
}
// In bmp format, rows must be a multiple of 4-bytes.
// So if we're not at a multiple of 4, add junk padding.
for(j = 0; j < mod; j++) {
fwrite(&tmp, sizeof(char), 1, ofp);
}
}
fclose(ofp);
fclose(ifp);
free(buffer);
}
/*
* Read bmp image and convert to byte array. Also output the width and height
*/
float* readImage(const char *filename, int* widthOut, int* heightOut) {
uchar* imageData;
int height, width;
uchar tmp;
int offset;
int i, j;
printf("Reading input image from %s\n", filename);
FILE *fp = fopen(filename, "rb");
if(fp == NULL) {
perror(filename);
exit(-1);
}
fseek(fp, 10, SEEK_SET);
fread(&offset, 4, 1, fp);
fseek(fp, 18, SEEK_SET);
fread(&width, 4, 1, fp);
fread(&height, 4, 1, fp);
printf("width = %d\n", width);
printf("height = %d\n", height);
*widthOut = width;
*heightOut = height;
imageData = (uchar*)malloc(width*height);
if(imageData == NULL) {
perror("malloc");
exit(-1);
}
fseek(fp, offset, SEEK_SET);
fflush(NULL);
int mod = width % 4;
if(mod != 0) {
mod = 4 - mod;
}
// NOTE bitmaps are stored in upside-down raster order. So we begin
// reading from the bottom left pixel, then going from left-to-right,
// read from the bottom to the top of the image. For image analysis,
// we want the image to be right-side up, so we'll modify it here.
// First we read the image in upside-down
// Read in the actual image
for(i = 0; i < height; i++) {
// add actual data to the image
for(j = 0; j < width; j++) {
fread(&tmp, sizeof(char), 1, fp);
imageData[i*width + j] = tmp;
}
// For the bmp format, each row has to be a multiple of 4,
// so I need to read in the junk data and throw it away
for(j = 0; j < mod; j++) {
fread(&tmp, sizeof(char), 1, fp);
}
}
// Then we flip it over
int flipRow;
for(i = 0; i < height/2; i++) {
flipRow = height - (i+1);
for(j = 0; j < width; j++) {
tmp = imageData[i*width+j];
imageData[i*width+j] = imageData[flipRow*width+j];
imageData[flipRow*width+j] = tmp;
}
}
fclose(fp);
// Input image on the host
float* floatImage = NULL;
floatImage = (float*)malloc(sizeof(float)*width*height);
if(floatImage == NULL) {
perror("malloc");
exit(-1);
}
// Convert the BMP image to float (not required)
for(i = 0; i < height; i++) {
for(j = 0; j < width; j++) {
floatImage[i*width+j] = (float)imageData[i*width+j];
}
}
free(imageData);
return floatImage;
}

View file

@ -0,0 +1,11 @@
#ifndef __UTILS__
#define __UTILS__
typedef unsigned char uchar;
float* readImage(const char *filename, int* widthOut, int* heightOut);
void storeImage(float *imageOut, const char *filename, int rows, int cols,
const char* refFilename);
#endif

View file

@ -12,15 +12,14 @@ HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()

View file

@ -0,0 +1,67 @@
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = gaussian
SRCS = main.cc clutils.cpp utils.cpp
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

View file

@ -0,0 +1,241 @@
/*-----------------------------------------------------------
** ge_p.c -- The program is to solve a linear system Ax = b
** by using Gaussian Elimination. The algorithm on page 101
** ("Foundations of Parallel Programming") is used.
** The sequential version is ge_s.c. This parallel
** implementation converts three independent for() loops
** into three Fans. Use the data file ge_3.dat to verify
** the correction of the output.
**
** Written by Andreas Kura, 02/15/95
** Modified by Chong-wei Xu, /04/20/95
**-----------------------------------------------------------
*/
#include <us.h>
#include <stdio.h>
int Size, t;
float **a, *b;
BEGIN_SHARED_DECL
float **m;
END_SHARED_DECL;
FILE *fp;
void InitProblemOnce();
void InitPerRun();
void ForwardSub();
void Fan1();
void Fan2();
void Fan3();
void InitMat();
void InitAry();
void PrintMat();
void PrintAry();
main ()
{
InitializeUs();
MakeSharedVariables; /* to make SHARED m */
InitProblemOnce();
InitPerRun();
ForwardSub();
printf("The result of matrix m is: \n");
PrintMat(SHARED m, Size, Size);
printf("The result of matrix a is: \n");
PrintMat(a, Size, Size);
printf("The result of array b is: \n");
PrintAry(b, Size);
}
/*------------------------------------------------------
** InitProblemOnce -- Initialize all of matrices and
** vectors by opening a data file specified by the user.
**
** We used dynamic array **a, *b, and **m to allocate
** the memory storages.
**------------------------------------------------------
*/
void InitProblemOnce()
{
char filename[30];
printf("Enter the data file name: ");
scanf("%s", filename);
printf("The file name is: %s\n", filename);
fp = fopen(filename, "r");
fscanf(fp, "%d", &Size);
a = (float **) UsAllocScatterMatrix(Size, Size, sizeof(float));
/*
a = (float **) malloc(Size * sizeof(float *));
for (i=0; i<Size; i++) {
a[i] = (float *) malloc(Size * sizeof(float));
}
*/
InitMat(a, Size, Size);
printf("The input matrix a is:\n");
PrintMat(a, Size, Size);
b = (float *) UsAlloc(Size * sizeof(float));
/*
b = (float *) malloc(Size * sizeof(float));
*/
InitAry(b, Size);
printf("The input array b is:\n");
PrintAry(b, Size);
SHARED m = (float **) UsAllocScatterMatrix(Size, Size, sizeof(float));
/*
m = (float **) malloc(Size * sizeof(float *));
for (i=0; i<Size; i++) {
m[i] = (float *) malloc(Size * sizeof(float));
}
*/
Share(&Size);
Share(&a);
Share(&b);
}
/*------------------------------------------------------
** InitPerRun() -- Initialize the contents of the
** multipier matrix **m
**------------------------------------------------------
*/
void InitPerRun()
{
int i, j;
for (i=0; i<Size; i++)
for (j=0; j<Size; j++)
SHARED m[i][j] = 0.0;
}
/*------------------------------------------------------
** ForwardSub() -- Forward substitution of Gaussian
** elimination.
**------------------------------------------------------
*/
void ForwardSub()
{
for (t=0; t<(Size-1); t++) {
Share(&t);
GenOnI(Fan1, Size-1-t); /* t=0 to (Size-2), the range is
** Size-2-t+1 = Size-1-t
*/
GenOnA(Fan2, Size-1-t, Size-t);
GenOnI(Fan3, Size-1-t);
}
}
/*-------------------------------------------------------
** Fan1() -- Calculate multiplier matrix
** Pay attention to the index. Index i give the range
** which starts from 0 to range-1. The real values of
** the index should be adjust and related with the value
** of t which is defined on the ForwardSub().
**-------------------------------------------------------
*/
void Fan1(dummy, i)
int dummy, i;
{
/* Use these printf() to display the nodes and index */
printf("from node #%d\n", PhysProcToUsProc(Proc_Node));
SHARED m[i+t+1][t] = a[i+t+1][t] / a[t][t];
printf("i=%d, a[%d][%d]=%.2f, a[%d][%d]=%.2f, m[%d][%d]=%.2f\n",
(i+t+1),t,t,a[t][t],(i+t+1),t,a[i+t+1][t],(i+t+1),t,
SHARED m[i+t+1][t]);
}
/*-------------------------------------------------------
** Fan2() -- Modify the matrix A into LUD
**-------------------------------------------------------
*/
void Fan2(dummy, i, j)
int dummy, i, j;
{
a[i+1+t][j+t] -= SHARED m[i+1+t][t] * a[t][j+t];
Share (&a);
}
/*-------------------------------------------------------
** Fan3() -- Modify the array b
**-------------------------------------------------------
*/
void Fan3(dummy, i)
int dummy, i;
{
b[i+1+t] -= SHARED m[i+1+t][t] * b[t];
}
/*------------------------------------------------------
** InitMat() -- Initialize the matrix by reading data
** from the data file
**------------------------------------------------------
*/
void InitMat(ary, nrow, ncol)
float **ary;
int nrow, ncol;
{
int i, j;
for (i=0; i<nrow; i++) {
for (j=0; j<ncol; j++) {
fscanf(fp, "%f", &ary[i][j]);
}
}
}
/*------------------------------------------------------
** PrintMat() -- Print the contents of the matrix
**------------------------------------------------------
*/
void PrintMat(ary, nrow, ncol)
float **ary;
int nrow, ncol;
{
int i, j;
for (i=0; i<nrow; i++) {
for (j=0; j<ncol; j++) {
printf("%8.2f ", ary[i][j]);
}
printf("\n");
}
printf("\n");
}
/*------------------------------------------------------
** InitAry() -- Initialize the array (vector) by reading
** data from the data file
**------------------------------------------------------
*/
void InitAry(ary, ary_size)
float *ary;
int ary_size;
{
int i;
for (i=0; i<ary_size; i++) {
fscanf(fp, "%f", &ary[i]);
}
}
/*------------------------------------------------------
** PrintAry() -- Print the contents of the array (vector)
**------------------------------------------------------
*/
void PrintAry(ary, ary_size)
float *ary;
int ary_size;
{
int i;
for (i=0; i<ary_size; i++) {
printf("%.2f ", ary[i]);
}
printf("\n");
}

View file

@ -0,0 +1,60 @@
The Gaussian Elimination application solves systems of equations using the
gaussian elimination method.
The application analyzes an n x n matrix and an associated 1 x n vector to solve a
set of equations with n variables and n unknowns. The matrix and vector describe equations
of the form:
a0x + b0y + c0z + d0w = e0
a1x + b1y + c1z + d1w = e1
a2x + b2y + c2z + d2w = e2
a3x + b3y + c3z + d3w = e3
where in this case n=4. The matrix for the above equations would be as follows:
[a0 b0 c0 d0]
[a1 b1 c1 d1]
[a2 b2 c2 d2]
[a3 b3 c3 d3]
and the vector would be:
[e0]
[e1]
[e2]
[e3]
The application creates a solution vector:
[x]
[y]
[z]
[w]
The Makefile may need to be adjusted for different machines, but it was written for Mac OS X and
Linux with either NVIDIA or AMD OpenCL SDKs.
Additional input files can be created with the matrixGenerator.py file in the data folder.
Gaussian Elimination Usage
gaussianElimination [filename] [-hqt] [-p [int] -d [int]]
example:
$ ./gaussianElimination matrix4.txt
filename the filename that holds the matrix data
-h, --help Display the help file
-q Quiet mode. Suppress all text output.
-t Print timing information.
-p [int] Choose the platform (must choose both platform and device)
-d [int] Choose the device (must choose both platform and device)
Notes: 1. The filename is required as the first parameter.
2. If you declare either the device or the platform,
you must declare both.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,281 @@
/****************************************************************************\
* Copyright (c) 2011, Advanced Micro Devices, Inc. *
* All rights reserved. *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* Redistributions of source code must retain the above copyright notice, *
* this list of conditions and the following disclaimer. *
* *
* Redistributions in binary form must reproduce the above copyright notice, *
* this list of conditions and the following disclaimer in the documentation *
* and/or other materials provided with the distribution. *
* *
* Neither the name of the copyright holder nor the names of its contributors *
* may be used to endorse or promote products derived from this software *
* without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR *
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* If you use the software (in whole or in part), you shall adhere to all *
* applicable U.S., European, and other export laws, including but not *
* limited to the U.S. Export Administration Regulations (“EAR”), (15 C.F.R. *
* Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 *
* of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you *
* hereby certify that, except pursuant to a license granted by the United *
* States Department of Commerce Bureau of Industry and Security or as *
* otherwise permitted pursuant to a License Exception under the U.S. Export *
* Administration Regulations ("EAR"), you will not (1) export, re-export or *
* release to a national of a country in Country Groups D:1, E:1 or E:2 any *
* restricted technology, software, or source code you receive hereunder, *
* or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
* technology or software, if such foreign produced direct product is subject *
* to national security controls as identified on the Commerce Control List *
*(currently found in Supplement 1 to Part 774 of EAR). For the most current *
* Country Group listings, or for additional information about the EAR or *
* your obligations under those regulations, please refer to the U.S. Bureau *
* of Industry and Security’s website at http://www.bis.doc.gov/. *
\****************************************************************************/
#ifndef __CL_UTILS_H__
#define __CL_UTILS_H__
#include <CL/cl.h>
// The cl_time type is OS specific
#ifdef _WIN32
#include <tchar.h>
#include <Windows.h>
typedef __int64 cl_time;
#else
#include <sys/time.h>
typedef double cl_time;
#endif
//-------------------------------------------------------
// Initialization and Cleanup
//-------------------------------------------------------
// Detects platforms and devices, creates context and command queue
cl_context cl_init(char devicePreference='\0');
// Creates a context given a platform and a device
cl_context cl_init_context(int platform,int dev,int quiet=0);
// Releases resources used by clutils
void cl_cleanup();
// Releases a kernel object
void cl_freeKernel(cl_kernel kernel);
// Releases a memory object
void cl_freeMem(cl_mem mem);
// Releases a program object
void cl_freeProgram(cl_program program);
// Returns the global command queue
cl_command_queue cl_getCommandQueue();
//-------------------------------------------------------
// Synchronization functions
//-------------------------------------------------------
// Performs a clFinish on the command queue
void cl_sync();
//-------------------------------------------------------
// Memory allocation
//-------------------------------------------------------
// Allocates a regular buffer on the device
cl_mem cl_allocBuffer(size_t mem_size,
cl_mem_flags flags = CL_MEM_READ_WRITE);
// XXX I don't think this does exactly what we want it to do
// Allocates a read-only buffer and transfers the data
cl_mem cl_allocBufferConst(size_t mem_size, void* host_ptr);
// Allocates pinned memory on the host
cl_mem cl_allocBufferPinned(size_t mem_size);
// Allocates an image on the device
cl_mem cl_allocImage(size_t height, size_t width, char type,
cl_mem_flags flags = CL_MEM_READ_WRITE);
//-------------------------------------------------------
// Data transfers
//-------------------------------------------------------
// Copies a buffer from the device to pinned memory on the host and
// maps it so it can be read
void* cl_copyAndMapBuffer(cl_mem dst, cl_mem src, size_t size);
// Copies from one buffer to another
void cl_copyBufferToBuffer(cl_mem dst, cl_mem src, size_t size);
// Copies data to a buffer on the device
void cl_copyBufferToDevice(cl_mem dst, void *src, size_t mem_size,
cl_bool blocking = CL_TRUE);
// Copies data to an image on the device
void cl_copyImageToDevice(cl_mem dst, void* src, size_t height, size_t width);
// Copies an image from the device to the host
void cl_copyImageToHost(void* dst, cl_mem src, size_t height, size_t width);
// Copies data from a device buffer to the host
void cl_copyBufferToHost(void *dst, cl_mem src, size_t mem_size,
cl_bool blocking = CL_TRUE);
// Copies data from a buffer on the device to an image on the device
void cl_copyBufferToImage(cl_mem src, cl_mem dst, int height, int width);
// Maps a buffer
void* cl_mapBuffer(cl_mem mem, size_t mem_size, cl_mem_flags flags);
// Unmaps a buffer
void cl_unmapBuffer(cl_mem mem, void *ptr);
// Writes data to a zero-copy buffer on the device
void cl_writeToZCBuffer(cl_mem mem, void* data, size_t size);
//-------------------------------------------------------
// Program and kernels
//-------------------------------------------------------
// Compiles a program
cl_program cl_compileProgram(char* kernelPath, char* compileoptions,
bool verboseoptions = 0);
// Creates a kernel
cl_kernel cl_createKernel(cl_program program, const char* kernelName);
// Sets a kernel argument
void cl_setKernelArg(cl_kernel kernel, unsigned int index, size_t size,
void* data);
//-------------------------------------------------------
// Profiling/events
//-------------------------------------------------------
// Computes the execution time (start to end) for an event
double cl_computeExecTime(cl_event);
// Compute the elapsed time between two CPU timer values
double cl_computeTime(cl_time start, cl_time end);
// Creates an event from CPU timers
void cl_createUserEvent(cl_time start, cl_time end, char* desc);
// Disable logging of events
void cl_disableEvents();
// Enable logging of events
void cl_enableEvents();
// Query the current system time
void cl_getTime(cl_time* time);
// Calls a function which prints events to the terminal
void cl_printEvents();
// Calls a function which writes the events to a file
void cl_writeEventsToFile(char* path);
//-------------------------------------------------------
// Error handling
//-------------------------------------------------------
// Compare a status value to CL_SUCCESS and optionally exit on error
int cl_errChk(const cl_int status, const char *msg, bool exitOnErr);
// Queries the supported image formats for the device and prints
// them to the screen
void printSupportedImageFormats();
//-------------------------------------------------------
// Platform and device information
//-------------------------------------------------------
bool cl_deviceIsAMD(cl_device_id dev=NULL);
bool cl_deviceIsNVIDIA(cl_device_id dev=NULL);
bool cl_platformIsNVIDIA(cl_platform_id plat=NULL);
char* cl_getDeviceDriverVersion(cl_device_id dev=NULL);
char* cl_getDeviceName(cl_device_id dev=NULL);
char* cl_getDeviceVendor(cl_device_id dev=NULL);
char* cl_getDeviceVersion(cl_device_id dev=NULL);
char* cl_getPlatformName(cl_platform_id platform);
char* cl_getPlatformVendor(cl_platform_id platform);
//-------------------------------------------------------
// Utility functions
//-------------------------------------------------------
char* catStringWithInt(const char* str, int integer);
char* itoa_portable(int value, char* result, int base);
//-------------------------------------------------------
// Data types
//-------------------------------------------------------
typedef struct{
int x;
int y;
} int2;
typedef struct{
float x;
float y;
}float2;
typedef struct{
float x;
float y;
float z;
float w;
}float4;
//-------------------------------------------------------
// Defines
//-------------------------------------------------------
#define MAX_ERR_VAL 64
#define NUM_PROGRAMS 7
#define NUM_KERNELS 13
#define KERNEL_INIT_DET 0
#define KERNEL_BUILD_DET 1
#define KERNEL_SURF_DESC 2
#define KERNEL_NORM_DESC 3
#define KERNEL_NON_MAX_SUP 4
#define KERNEL_GET_ORIENT1 5
#define KERNEL_GET_ORIENT2 6
#define KERNEL_NN 7
#define KERNEL_SCAN 8
#define KERNEL_SCAN4 9
#define KERNEL_TRANSPOSE 10
#define KERNEL_SCANIMAGE 11
#define KERNEL_TRANSPOSEIMAGE 12
#endif

View file

@ -0,0 +1,40 @@
#ifndef _GAUSSIANELIM
#define _GAUSSIANELIM
#include <iostream>
#include <vector>
#include <float.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include "clutils.h"
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
float *OpenClGaussianElimination(
cl_context context,
int timing);
void printUsage();
int parseCommandline(int argc, char *argv[], char* filename,
int *q, int *t, int *p, int *d);
void InitPerRun(int size,float *m);
void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing);
void BackSub(float *a, float *b, float *finalVec, int size);
void Fan1(float *m, float *a, int Size, int t);
void Fan2(float *m, float *a, float *b,int Size, int j1, int t);
//void Fan3(float *m, float *b, int Size, int t);
void InitMat(FILE *fp, int size, float *ary, int nrow, int ncol);
void InitAry(FILE *fp, float *ary, int ary_size);
void PrintMat(float *ary, int size, int nrow, int ncolumn);
void PrintAry(float *ary, int ary_size);
float eventTime(cl_event event,cl_command_queue command_queue);
#endif

View file

@ -0,0 +1,49 @@
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
typedef struct latLong
{
float lat;
float lng;
} LatLong;
__kernel void Fan1(__global float *m_dev,
__global float *a_dev,
__global float *b_dev,
const int size,
const int t) {
int globalId = get_global_id(0);
if (globalId < size-1-t) {
*(m_dev + size * (globalId + t + 1)+t) = *(a_dev + size * (globalId + t + 1) + t) / *(a_dev + size * t + t);
}
}
__kernel void Fan2(__global float *m_dev,
__global float *a_dev,
__global float *b_dev,
const int size,
const int t) {
int globalId = get_global_id(0);
int globalIdx = get_global_id(0);
int globalIdy = get_global_id(1);
if (globalIdx < size-1-t && globalIdy < size-t) {
a_dev[size*(globalIdx+1+t)+(globalIdy+t)] -= m_dev[size*(globalIdx+1+t)+t] * a_dev[size*t+(globalIdy+t)];
if(globalIdy == 0){
b_dev[globalIdx+1+t] -= m_dev[size*(globalIdx+1+t)+(globalIdy+t)] * b_dev[t];
}
}
// One dimensional
// int globalIdx = globalId % size;
// int globalIdy = globalId / size;
//
// if (globalIdx < size-1-t && globalIdy < size-t) {
// a_dev[size*(globalIdx+1+t)+(globalIdy+t)] -= m_dev[size*(globalIdx+1+t)+t] * a_dev[size*t+(globalIdy+t)];
// }
// if(globalIdy == 0){
// b_dev[globalIdx+1+t] -= m_dev[size*(globalIdx+1+t)+(globalIdy+t)] * b_dev[t];
// }
}

View file

@ -0,0 +1,412 @@
#ifndef __GAUSSIAN_ELIMINATION__
#define __GAUSSIAN_ELIMINATION__
#include "gaussianElim.h"
cl_context context = NULL;
int main(int argc, char *argv[]) {
printf("enter demo main\n");
float *a = NULL, *b = NULL, *finalVec = NULL;
float *m = NULL;
int size;
FILE *fp;
// args
char filename[100];
int quiet = 0, timing = 0, platform = -1, device = -1;
// parse command line
if (parseCommandline(argc, argv, filename, &quiet, &timing, &platform,
&device)) {
printUsage();
return 0;
}
context = cl_init_context(platform, device, quiet);
fp = fopen(filename, "r");
fscanf(fp, "%d", &size);
a = (float *)malloc(size * size * sizeof(float));
printf("OK\n");
InitMat(fp, size, a, size, size);
// printf("The input matrix a is:\n");
// PrintMat(a, size, size, size);
b = (float *)malloc(size * sizeof(float));
InitAry(fp, b, size);
// printf("The input array b is:\n");
// PrintAry(b, size);
// create the solution matrix
m = (float *)malloc(size * size * sizeof(float));
// create a new vector to hold the final answer
finalVec = (float *)malloc(size * sizeof(float));
InitPerRun(size, m);
// begin timing
// run kernels
ForwardSub(context, a, b, m, size, timing);
// end timing
if (!quiet) {
printf("The result of matrix m is: \n");
PrintMat(m, size, size, size);
printf("The result of matrix a is: \n");
PrintMat(a, size, size, size);
printf("The result of array b is: \n");
PrintAry(b, size);
BackSub(a, b, finalVec, size);
printf("The final solution is: \n");
PrintAry(finalVec, size);
}
fclose(fp);
free(m);
free(a);
free(b);
free(finalVec);
// OpenClGaussianElimination(context,timing);
return 0;
}
/*------------------------------------------------------
** ForwardSub() -- Forward substitution of Gaussian
** elimination.
**------------------------------------------------------
*/
void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
int timing) {
// 1. set up kernels
cl_kernel fan1_kernel, fan2_kernel;
cl_int status = 0;
cl_program gaussianElim_program;
cl_event writeEvent, kernelEvent, readEvent;
float writeTime = 0, readTime = 0, kernelTime = 0;
float writeMB = 0, readMB = 0;
gaussianElim_program =
cl_compileProgram((char *)"gaussianElim_kernels.cl", NULL);
fan1_kernel = clCreateKernel(gaussianElim_program, "Fan1", &status);
status = cl_errChk(status, (char *)"Error Creating Fan1 kernel", true);
if (status)
exit(1);
fan2_kernel = clCreateKernel(gaussianElim_program, "Fan2", &status);
status = cl_errChk(status, (char *)"Error Creating Fan2 kernel", true);
if (status)
exit(1);
// 2. set up memory on device and send ipts data to device
cl_mem a_dev, b_dev, m_dev;
cl_int error = 0;
a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
sizeof(float) * size * size, NULL, &error);
b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * size, NULL,
&error);
m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
sizeof(float) * size * size, NULL, &error);
cl_command_queue command_queue = cl_getCommandQueue();
error = clEnqueueWriteBuffer(command_queue, a_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size * size, a, 0, NULL,
&writeEvent);
if (timing)
writeTime += eventTime(writeEvent, command_queue);
clReleaseEvent(writeEvent);
error = clEnqueueWriteBuffer(command_queue, b_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size, b, 0, NULL, &writeEvent);
if (timing)
writeTime += eventTime(writeEvent, command_queue);
clReleaseEvent(writeEvent);
error = clEnqueueWriteBuffer(command_queue, m_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size * size, m, 0, NULL,
&writeEvent);
if (timing)
writeTime += eventTime(writeEvent, command_queue);
clReleaseEvent(writeEvent);
writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
// 3. Determine block sizes
size_t globalWorksizeFan1[1];
size_t globalWorksizeFan2[2];
globalWorksizeFan1[0] = size;
globalWorksizeFan2[0] = size;
globalWorksizeFan2[1] = size;
int t;
// 4. Setup and Run kernels
for (t = 0; t < (size - 1); t++) {
// kernel args
cl_int argchk;
argchk = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size);
argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t);
cl_errChk(argchk, "ERROR in Setting Fan1 kernel args", true);
// launch kernel
error =
clEnqueueNDRangeKernel(command_queue, fan1_kernel, 1, 0,
globalWorksizeFan1, NULL, 0, NULL, &kernelEvent);
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
if (timing) {
// printf("here1a\n");
kernelTime += eventTime(kernelEvent, command_queue);
// printf("here1b\n");
}
clReleaseEvent(kernelEvent);
// Fan1<<<dimGrid,dimBlock>>>(m_cuda,a_cuda,Size,t);
// cudaThreadSynchronize();
// kernel args
argchk = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size);
argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t);
cl_errChk(argchk, "ERROR in Setting Fan2 kernel args", true);
// launch kernel
error =
clEnqueueNDRangeKernel(command_queue, fan2_kernel, 2, 0,
globalWorksizeFan2, NULL, 0, NULL, &kernelEvent);
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
if (timing) {
// printf("here2a\n");
kernelTime += eventTime(kernelEvent, command_queue);
// printf("here2b\n");
}
clReleaseEvent(kernelEvent);
// Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t);
// cudaThreadSynchronize();
}
// 5. transfer data off of device
error =
clEnqueueReadBuffer(command_queue, a_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size * size, a, 0, NULL, &readEvent);
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
if (timing)
readTime += eventTime(readEvent, command_queue);
clReleaseEvent(readEvent);
error = clEnqueueReadBuffer(command_queue, b_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size, b, 0, NULL, &readEvent);
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
if (timing)
readTime += eventTime(readEvent, command_queue);
clReleaseEvent(readEvent);
error =
clEnqueueReadBuffer(command_queue, m_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size * size, m, 0, NULL, &readEvent);
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
if (timing)
readTime += eventTime(readEvent, command_queue);
clReleaseEvent(readEvent);
readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
if (timing) {
printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s) "
"[size]\t\tTotal(s)\n");
printf("%dx%d \t", size, size);
printf("%f [%.2fMB]\t", writeTime, writeMB);
printf("%f\t", kernelTime);
printf("%f [%.2fMB]\t", readTime, readMB);
printf("%f\n\n", writeTime + kernelTime + readTime);
}
}
float eventTime(cl_event event, cl_command_queue command_queue) {
cl_int error = 0;
cl_ulong eventStart, eventEnd;
clFinish(command_queue);
error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &eventStart, NULL);
cl_errChk(error, "ERROR in Event Profiling.", true);
error = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &eventEnd, NULL);
cl_errChk(error, "ERROR in Event Profiling.", true);
return (float)((eventEnd - eventStart) / 1e9);
}
int parseCommandline(int argc, char *argv[], char *filename, int *q, int *t,
int *p, int *d) {
int i;
// if (argc < 2) return 1; // error
strncpy(filename, "matrix4.txt", 100);
char flag;
for (i = 1; i < argc; i++) {
if (argv[i][0] == '-') { // flag
flag = argv[i][1];
switch (flag) {
case 'h': // help
return 1;
break;
case 'q': // quiet
*q = 1;
break;
case 't': // timing
*t = 1;
break;
case 'p': // platform
i++;
*p = atoi(argv[i]);
break;
case 'd': // device
i++;
*d = atoi(argv[i]);
break;
}
}
}
if ((*d >= 0 && *p < 0) ||
(*p >= 0 &&
*d < 0)) // both p and d must be specified if either are specified
return 1;
return 0;
}
void printUsage() {
printf("Gaussian Elimination Usage\n");
printf("\n");
printf("gaussianElimination [filename] [-hqt] [-p [int] -d [int]]\n");
printf("\n");
printf("example:\n");
printf("$ ./gaussianElimination matrix4.txt\n");
printf("\n");
printf("filename the filename that holds the matrix data\n");
printf("\n");
printf("-h Display the help file\n");
printf("-q Quiet mode. Suppress all text output.\n");
printf("-t Print timing information.\n");
printf("\n");
printf("-p [int] Choose the platform (must choose both platform and "
"device)\n");
printf("-d [int] Choose the device (must choose both platform and "
"device)\n");
printf("\n");
printf("\n");
printf("Notes: 1. The filename is required as the first parameter.\n");
printf(" 2. If you declare either the device or the platform,\n");
printf(" you must declare both.\n\n");
}
/*------------------------------------------------------
** InitPerRun() -- Initialize the contents of the
** multipier matrix **m
**------------------------------------------------------
*/
void InitPerRun(int size, float *m) {
int i;
for (i = 0; i < size * size; i++)
*(m + i) = 0.0;
}
void BackSub(float *a, float *b, float *finalVec, int size) {
// solve "bottom up"
int i, j;
for (i = 0; i < size; i++) {
finalVec[size - i - 1] = b[size - i - 1];
for (j = 0; j < i; j++) {
finalVec[size - i - 1] -= *(a + size * (size - i - 1) + (size - j - 1)) *
finalVec[size - j - 1];
}
finalVec[size - i - 1] =
finalVec[size - i - 1] / *(a + size * (size - i - 1) + (size - i - 1));
}
}
void InitMat(FILE *fp, int size, float *ary, int nrow, int ncol) {
int i, j;
for (i = 0; i < nrow; i++) {
for (j = 0; j < ncol; j++) {
fscanf(fp, "%f", ary + size * i + j);
}
}
}
/*------------------------------------------------------
** InitAry() -- Initialize the array (vector) by reading
** data from the data file
**------------------------------------------------------
*/
void InitAry(FILE *fp, float *ary, int ary_size) {
int i;
for (i = 0; i < ary_size; i++) {
fscanf(fp, "%f", &ary[i]);
}
}
/*------------------------------------------------------
** PrintMat() -- Print the contents of the matrix
**------------------------------------------------------
*/
void PrintMat(float *ary, int size, int nrow, int ncol) {
int i, j;
for (i = 0; i < nrow; i++) {
for (j = 0; j < ncol; j++) {
printf("%8.2f ", *(ary + size * i + j));
}
printf("\n");
}
printf("\n");
}
/*------------------------------------------------------
** PrintAry() -- Print the contents of the array (vector)
**------------------------------------------------------
*/
void PrintAry(float *ary, int ary_size) {
int i;
for (i = 0; i < ary_size; i++) {
printf("%.2f ", ary[i]);
}
printf("\n\n");
}
#endif

View file

@ -0,0 +1,11 @@
4
-0.6 -0.5 0.7 0.3
-0.3 -0.9 0.3 0.7
-0.4 -0.5 -0.3 -0.8
0.0 -0.1 0.2 0.9
-0.85 -0.68 0.24 -0.53
0.7 0.0 -0.4 -0.5

View file

@ -0,0 +1 @@
./gaussian ../../data/gaussian/matrix4.txt

View file

@ -0,0 +1,204 @@
/****************************************************************************\
* Copyright (c) 2011, Advanced Micro Devices, Inc. *
* All rights reserved. *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* Redistributions of source code must retain the above copyright notice, *
* this list of conditions and the following disclaimer. *
* *
* Redistributions in binary form must reproduce the above copyright notice, *
* this list of conditions and the following disclaimer in the documentation *
* and/or other materials provided with the distribution. *
* *
* Neither the name of the copyright holder nor the names of its contributors *
* may be used to endorse or promote products derived from this software *
* without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR *
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* If you use the software (in whole or in part), you shall adhere to all *
* applicable U.S., European, and other export laws, including but not *
* limited to the U.S. Export Administration Regulations (“EAR”), (15 C.F.R. *
* Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 *
* of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you *
* hereby certify that, except pursuant to a license granted by the United *
* States Department of Commerce Bureau of Industry and Security or as *
* otherwise permitted pursuant to a License Exception under the U.S. Export *
* Administration Regulations ("EAR"), you will not (1) export, re-export or *
* release to a national of a country in Country Groups D:1, E:1 or E:2 any *
* restricted technology, software, or source code you receive hereunder, *
* or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
* technology or software, if such foreign produced direct product is subject *
* to national security controls as identified on the Commerce Control List *
*(currently found in Supplement 1 to Part 774 of EAR). For the most current *
* Country Group listings, or for additional information about the EAR or *
* your obligations under those regulations, please refer to the U.S. Bureau *
* of Industry and Security’s website at http://www.bis.doc.gov/. *
\****************************************************************************/
#include <stdio.h>
#include <sys/stat.h>
#include <string.h>
#include <stdlib.h>
#include "utils.h"
static bool usingImages = true;
//! A wrapper for malloc that checks the return value
void* alloc(size_t size) {
void* ptr = NULL;
ptr = malloc(size);
if(ptr == NULL) {
perror("malloc");
exit(-1);
}
return ptr;
}
// This function checks to make sure a file exists before we open it
void checkFile(char* filename)
{
struct stat fileStatus;
if(stat(filename, &fileStatus) != 0) {
printf("Error opening file: %s\n", filename);
exit(-1);
}
else {
if(!(S_IFREG & fileStatus.st_mode)) {
printf("File %s is not a regular file\n", filename);
exit(-1);
}
}
}
// This function checks to make sure a directory exists
void checkDir(char* dirpath)
{
struct stat fileStatus;
if(stat(dirpath, &fileStatus) != 0) {
printf("Directory does not exist: %s\n", dirpath);
exit(-1);
}
else {
if(!(S_IFDIR & fileStatus.st_mode)) {
printf("Directory was not provided: %s\n", dirpath);
exit(-1);
}
}
}
// Parse the command line arguments
void parseArguments(int argc, char** argv, char** input, char** events,
char** ipts, char* devicePref, bool* verifyResults)
{
for(int i = 2; i < argc; i++) {
if(strcmp(argv[i], "-d") == 0) { // Event dump found
if(i == argc-1) {
printf("Usage: -e Needs directory path\n");
exit(-1);
}
devicePref[0] = argv[i+1][0];
i++;
continue;
}
if(strcmp(argv[i], "-e") == 0) { // Event dump found
if(i == argc-1) {
printf("Usage: -e Needs directory path\n");
exit(-1);
}
*events = argv[i+1];
i++;
continue;
}
if(strcmp(argv[i], "-i") == 0) { // Input found
if(i == argc-1) {
printf("Usage: -i Needs directory path\n");
exit(-1);
}
*input = argv[i+1];
i++;
continue;
}
if(strcmp(argv[i], "-l") == 0) { // Ipts dump found
if(i == argc-1) {
printf("Usage: -l Needs directory path\n");
exit(-1);
}
*ipts = argv[i+1];
i++;
continue;
}
if(strcmp(argv[i], "-n") == 0) { // Don't use OpenCL images
setUsingImages(false);
continue;
}
if(strcmp(argv[i], "-v") == 0) { // Verify results
*verifyResults = true;
continue;
}
}
}
// This function that takes a positive integer 'value' and returns
// the nearest multiple of 'multiple' (used for padding columns)
unsigned int roundUp(unsigned int value, unsigned int multiple) {
unsigned int remainder = value % multiple;
// Make the value a multiple of multiple
if(remainder != 0) {
value += (multiple-remainder);
}
return value;
}
// Concatenate two strings and return a pointer to the new string
char* smartStrcat(char* str1, char* str2)
{
char* newStr = NULL;
newStr = (char*)alloc((strlen(str1)+strlen(str2)+1)*sizeof(char));
strcpy(newStr, str1);
strcat(newStr, str2);
return newStr;
}
// Set the value of using images to true if they are being
// used, or false if they are not
void setUsingImages(bool val)
{
usingImages = val;
}
// Return whether or not images are being used
bool isUsingImages()
{
return usingImages;
}

View file

@ -0,0 +1,84 @@
/****************************************************************************\
* Copyright (c) 2011, Advanced Micro Devices, Inc. *
* All rights reserved. *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* Redistributions of source code must retain the above copyright notice, *
* this list of conditions and the following disclaimer. *
* *
* Redistributions in binary form must reproduce the above copyright notice, *
* this list of conditions and the following disclaimer in the documentation *
* and/or other materials provided with the distribution. *
* *
* Neither the name of the copyright holder nor the names of its contributors *
* may be used to endorse or promote products derived from this software *
* without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR *
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* If you use the software (in whole or in part), you shall adhere to all *
* applicable U.S., European, and other export laws, including but not *
* limited to the U.S. Export Administration Regulations (“EAR”), (15 C.F.R. *
* Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 *
* of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you *
* hereby certify that, except pursuant to a license granted by the United *
* States Department of Commerce Bureau of Industry and Security or as *
* otherwise permitted pursuant to a License Exception under the U.S. Export *
* Administration Regulations ("EAR"), you will not (1) export, re-export or *
* release to a national of a country in Country Groups D:1, E:1 or E:2 any *
* restricted technology, software, or source code you receive hereunder, *
* or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
* technology or software, if such foreign produced direct product is subject *
* to national security controls as identified on the Commerce Control List *
*(currently found in Supplement 1 to Part 774 of EAR). For the most current *
* Country Group listings, or for additional information about the EAR or *
* your obligations under those regulations, please refer to the U.S. Bureau *
* of Industry and Security’s website at http://www.bis.doc.gov/. *
\****************************************************************************/
#ifndef _UTILS_
#define _UTILS_
// Wrapper for malloc
void* alloc(size_t size);
// Checks for existence of directory
void checkDir(char* dirpath);
// Check for existence of file
void checkFile(char* filename);
// Parse the input command line options to the program
void parseArguments(int argc, char** argv, char** input, char** events,
char** ipts, char* devicePref, bool* verifyResults);
// Print the program usage information
void printUsage();
// Rounds up size to the nearest multiple of multiple
unsigned int roundUp(unsigned int value, unsigned int multiple);
// Concatenate two strings, creating a new one
char* smartStrcat(char* str1, char* str2);
// Set the value of usingImages
void setUsingImages(bool val);
// Return whether or not images are being used
bool isUsingImages();
#endif

View file

@ -0,0 +1,78 @@
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.S
VX_SRCS += $(VX_RT_PATH)/io/vx_io.S $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.S
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/startup/vx_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = kmeans
SRCS = main.cc read_input.c rmse.c cluster.c kmeans_clustering.c
all: $(PROJECT).dump $(PROJECT).hex
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
kmeans_clustering.o: kmeans_clustering.c
$(CC) $(CXXFLAGS) -c kmeans_clustering.c
cluster.o: cluster.c
$(CC) $(CXXFLAGS) -c cluster.c
read_input.o: read_input.c
$(CC) $(CXXFLAGS) -c read_input.c
rmse.o: rmse.c
$(CC) $(CXXFLAGS) -c rmse.c
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

View file

View file

@ -0,0 +1,155 @@
/*****************************************************************************/
/*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
/*By downloading, copying, installing or using the software you agree */
/*to this license. If you do not agree to this license, do not download, */
/*install, copy or use the software. */
/* */
/* */
/*Copyright (c) 2005 Northwestern University */
/*All rights reserved. */
/*Redistribution of the software in source and binary forms, */
/*with or without modification, is permitted provided that the */
/*following conditions are met: */
/* */
/*1 Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* */
/*2 Redistributions in binary form must reproduce the above copyright */
/* notice, this list of conditions and the following disclaimer in the */
/* documentation and/or other materials provided with the distribution.*/
/* */
/*3 Neither the name of Northwestern University nor the names of its */
/* contributors may be used to endorse or promote products derived */
/* from this software without specific prior written permission. */
/* */
/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/*POSSIBILITY OF SUCH DAMAGE. */
/******************************************************************************/
/*************************************************************************/
/** File: cluster.c **/
/** Description: Takes as input a file, containing 1 data point per **/
/** per line, and performs a fuzzy c-means clustering **/
/** on the data. Fuzzy clustering is performed using **/
/** min to max clusters and the clustering that gets **/
/** the best score according to a compactness and **/
/** separation criterion are returned. **/
/** Author: Brendan McCane **/
/** James Cook University of North Queensland. **/
/** Australia. email: mccane@cs.jcu.edu.au **/
/** **/
/** Edited by: Jay Pisharath, Wei-keng Liao **/
/** Northwestern University. **/
/** **/
/** ================================================================ **/
/** **/
/** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee **/
/** University of Virginia **/
/** **/
/** Description: No longer supports fuzzy c-means clustering; **/
/** only regular k-means clustering. **/
/** No longer performs "validity" function to analyze **/
/** compactness and separation crietria; instead **/
/** calculate root mean squared error. **/
/** **/
/*************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <math.h>
#include <float.h>
#include "kmeans.h"
float min_rmse_ref = FLT_MAX;
extern double wtime(void);
/* reference min_rmse value */
/*---< cluster() >-----------------------------------------------------------*/
int cluster(int npoints, /* number of data points */
int nfeatures, /* number of attributes for each point */
float **features, /* array: [npoints][nfeatures] */
int min_nclusters, /* range of min to max number of clusters */
int max_nclusters,
float threshold, /* loop terminating factor */
int *best_nclusters, /* out: number between min and max with lowest RMSE */
float ***cluster_centres, /* out: [best_nclusters][nfeatures] */
float *min_rmse, /* out: minimum RMSE */
int isRMSE, /* calculate RMSE */
int nloops /* number of iteration for each number of clusters */
)
{
int nclusters; /* number of clusters k */
int index =0; /* number of iteration to reach the best RMSE */
int rmse; /* RMSE for each clustering */
int *membership; /* which cluster a data point belongs to */
float **tmp_cluster_centres; /* hold coordinates of cluster centers */
int i;
/* allocate memory for membership */
membership = (int*) malloc(npoints * sizeof(int));
/* sweep k from min to max_nclusters to find the best number of clusters */
for(nclusters = min_nclusters; nclusters <= max_nclusters; nclusters++)
{
if (nclusters > npoints) break; /* cannot have more clusters than points */
/* allocate device memory, invert data array (@ kmeans_cuda.cu) */
allocate(npoints, nfeatures, nclusters, features);
/* iterate nloops times for each number of clusters */
for(i = 0; i < nloops; i++)
{
/* initialize initial cluster centers, CUDA calls (@ kmeans_cuda.cu) */
tmp_cluster_centres = kmeans_clustering(features,
nfeatures,
npoints,
nclusters,
threshold,
membership);
if (*cluster_centres) {
free((*cluster_centres)[0]);
free(*cluster_centres);
}
*cluster_centres = tmp_cluster_centres;
/* find the number of clusters with the best RMSE */
if(isRMSE)
{
rmse = rms_err(features,
nfeatures,
npoints,
tmp_cluster_centres,
nclusters);
if(rmse < min_rmse_ref){
min_rmse_ref = rmse; //update reference min RMSE
*min_rmse = min_rmse_ref; //update return min RMSE
*best_nclusters = nclusters; //update optimum number of clusters
index = i; //update number of iteration to reach best RMSE
}
}
}
deallocateMemory(); /* free device memory (@ kmeans_cuda.cu) */
}
free(membership);
return index;
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,191 @@
/* getopt.h */
/* Declarations for getopt.
Copyright (C) 1989-1994, 1996-1999, 2001 Free Software
Foundation, Inc. This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute
it and/or modify it under the terms of the GNU Lesser
General Public License as published by the Free Software
Foundation; either version 2.1 of the License, or
(at your option) any later version.
The GNU C Library is distributed in the hope that it will
be useful, but WITHOUT ANY WARRANTY; without even the
implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General
Public License along with the GNU C Library; if not, write
to the Free Software Foundation, Inc., 59 Temple Place,
Suite 330, Boston, MA 02111-1307 USA. */
#ifndef _GETOPT_H
#ifndef __need_getopt
# define _GETOPT_H 1
#endif
/* If __GNU_LIBRARY__ is not already defined, either we are being used
standalone, or this is the first header included in the source file.
If we are being used with glibc, we need to include <features.h>, but
that does not exist if we are standalone. So: if __GNU_LIBRARY__ is
not defined, include <ctype.h>, which will pull in <features.h> for us
if it's from glibc. (Why ctype.h? It's guaranteed to exist and it
doesn't flood the namespace with stuff the way some other headers do.) */
#if !defined __GNU_LIBRARY__
# include <ctype.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* For communication from `getopt' to the caller.
When `getopt' finds an option that takes an argument,
the argument value is returned here.
Also, when `ordering' is RETURN_IN_ORDER,
each non-option ARGV-element is returned here. */
extern char *optarg;
/* Index in ARGV of the next element to be scanned.
This is used for communication to and from the caller
and for communication between successive calls to `getopt'.
On entry to `getopt', zero means this is the first call; initialize.
When `getopt' returns -1, this is the index of the first of the
non-option elements that the caller should itself scan.
Otherwise, `optind' communicates from one call to the next
how much of ARGV has been scanned so far. */
extern int optind;
/* Callers store zero here to inhibit the error message `getopt' prints
for unrecognized options. */
extern int opterr;
/* Set to an option character which was unrecognized. */
extern int optopt;
#ifndef __need_getopt
/* Describe the long-named options requested by the application.
The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
of `struct option' terminated by an element containing a name which is
zero.
The field `has_arg' is:
no_argument (or 0) if the option does not take an argument,
required_argument (or 1) if the option requires an argument,
optional_argument (or 2) if the option takes an optional argument.
If the field `flag' is not NULL, it points to a variable that is set
to the value given in the field `val' when the option is found, but
left unchanged if the option is not found.
To have a long-named option do something other than set an `int' to
a compiled-in constant, such as set a value from `optarg', set the
option's `flag' field to zero and its `val' field to a nonzero
value (the equivalent single-letter option character, if there is
one). For long options that have a zero `flag' field, `getopt'
returns the contents of the `val' field. */
struct option
{
# if (defined __STDC__ && __STDC__) || defined __cplusplus
const char *name;
# else
char *name;
# endif
/* has_arg can't be an enum because some compilers complain about
type mismatches in all the code that assumes it is an int. */
int has_arg;
int *flag;
int val;
};
/* Names for the values of the `has_arg' field of `struct option'. */
# define no_argument 0
# define required_argument 1
# define optional_argument 2
#endif /* need getopt */
/* Get definitions and prototypes for functions to process the
arguments in ARGV (ARGC of them, minus the program name) for
options given in OPTS.
Return the option character from OPTS just read. Return -1 when
there are no more options. For unrecognized options, or options
missing arguments, `optopt' is set to the option letter, and '?' is
returned.
The OPTS string is a list of characters which are recognized option
letters, optionally followed by colons, specifying that that letter
takes an argument, to be placed in `optarg'.
If a letter in OPTS is followed by two colons, its argument is
optional. This behavior is specific to the GNU `getopt'.
The argument `--' causes premature termination of argument
scanning, explicitly telling `getopt' that there are no more
options.
If OPTS begins with `--', then non-option arguments are treated as
arguments to the option '\0'. This behavior is specific to the GNU
`getopt'. */
#if (defined __STDC__ && __STDC__) || defined __cplusplus
# ifdef __GNU_LIBRARY__
/* Many other libraries have conflicting prototypes for getopt, with
differences in the consts, in stdlib.h. To avoid compilation
errors, only prototype getopt for the GNU C library. */
extern int getopt (int ___argc, char *const *___argv, const char *__shortopts);
# else /* not __GNU_LIBRARY__ */
extern int getopt ();
# endif /* __GNU_LIBRARY__ */
# ifndef __need_getopt
extern int getopt_long (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind);
extern int getopt_long_only (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind);
/* Internal only. Users should not call this directly. */
extern int _getopt_internal (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind,
int __long_only);
# endif
#else /* not __STDC__ */
extern int getopt ();
# ifndef __need_getopt
extern int getopt_long ();
extern int getopt_long_only ();
extern int _getopt_internal ();
# endif
#endif /* __STDC__ */
#ifdef __cplusplus
}
#endif
/* Make sure we later can get all the definitions and declarations. */
#undef __need_getopt
#endif /* getopt.h */

View file

@ -0,0 +1,61 @@
#ifndef FLT_MAX
#define FLT_MAX 3.40282347e+38
#endif
__kernel void
kmeans_kernel_c(__global float *feature,
__global float *clusters,
__global int *membership,
int npoints,
int nclusters,
int nfeatures,
int offset,
int size
)
{
unsigned int point_id = get_global_id(0);
int index = 0;
//const unsigned int point_id = get_global_id(0);
if (point_id < npoints)
{
float min_dist=FLT_MAX;
for (int i=0; i < nclusters; i++) {
float dist = 0;
float ans = 0;
for (int l=0; l<nfeatures; l++){
ans += (feature[l * npoints + point_id]-clusters[i*nfeatures+l])*
(feature[l * npoints + point_id]-clusters[i*nfeatures+l]);
}
dist = ans;
if (dist < min_dist) {
min_dist = dist;
index = i;
}
}
//printf("%d\n", index);
membership[point_id] = index;
}
return;
}
__kernel void
kmeans_swap(__global float *feature,
__global float *feature_swap,
int npoints,
int nfeatures
){
unsigned int tid = get_global_id(0);
//for(int i = 0; i < nfeatures; i++)
// feature_swap[i * npoints + tid] = feature[tid * nfeatures + i];
//Lingjie Zhang modificated at 11/05/2015
if (tid < npoints){
for(int i = 0; i < nfeatures; i++)
feature_swap[i * npoints + tid] = feature[tid * nfeatures + i];
}
// end of Lingjie Zhang's modification
}

Some files were not shown because too many files have changed in this diff Show more