Merge branch 'develop'

This commit is contained in:
Blaise Tine 2023-12-31 11:26:48 -08:00
commit ec2a35def9
65 changed files with 5705 additions and 18904 deletions

View file

@ -82,64 +82,9 @@ module VX_cluster import VX_gpu_pkg::*; #(
`endif
VX_mem_bus_if #(
.DATA_SIZE (L2_WORD_SIZE),
.TAG_WIDTH (L2_TAG_WIDTH)
) l2_mem_bus_if[L2_NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) per_socket_icache_mem_bus_if[`NUM_SOCKETS]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) per_socket_dcache_mem_bus_if[`NUM_SOCKETS]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_ARB_TAG_WIDTH)
) icache_mem_bus_if[1]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_ARB_TAG_WIDTH)
) dcache_mem_bus_if[1]();
`RESET_RELAY (l1_mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (`NUM_SOCKETS),
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) icache_mem_arb (
.clk (clk),
.reset (l1_mem_arb_reset),
.bus_in_if (per_socket_icache_mem_bus_if),
.bus_out_if (icache_mem_bus_if)
);
VX_mem_arb #(
.NUM_INPUTS (`NUM_SOCKETS),
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) dcache_mem_arb (
.clk (clk),
.reset (l1_mem_arb_reset),
.bus_in_if (per_socket_dcache_mem_bus_if),
.bus_out_if (dcache_mem_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[ICACHE_MEM_ARB_IDX], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_ARB_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l2_mem_bus_if[DCACHE_MEM_ARB_IDX], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) per_socket_mem_bus_if[`NUM_SOCKETS]();
`RESET_RELAY (l2_reset, reset);
@ -155,7 +100,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
.MREQ_SIZE (`L2_MREQ_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_REG (2),
@ -168,7 +113,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.l2cache),
`endif
.core_bus_if (l2_mem_bus_if),
.core_bus_if (per_socket_mem_bus_if),
.mem_bus_if (mem_bus_if)
);
@ -209,8 +154,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.dcr_bus_if (socket_dcr_bus_if),
.icache_mem_bus_if (per_socket_icache_mem_bus_if[i]),
.dcache_mem_bus_if (per_socket_dcache_mem_bus_if[i]),
.mem_bus_if (per_socket_mem_bus_if[i]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[i]),

View file

@ -292,8 +292,8 @@
// Floating-Point Units ///////////////////////////////////////////////////////
// Size of FPU Request Queue
`ifndef FPU_REQ_QUEUE_SIZE
`define FPU_REQ_QUEUE_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
`ifndef FPUQ_SIZE
`define FPUQ_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
`endif
// FNCP Latency

View file

@ -141,10 +141,9 @@ package VX_gpu_pkg;
/////////////////////////////// L1 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_TAG_WIDTH = (ICACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS));
localparam DCACHE_MEM_ARB_TAG_WIDTH = (DCACHE_MEM_TAG_WIDTH + `CLOG2(`NUM_SOCKETS));
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_ARB_TAG_WIDTH, DCACHE_MEM_ARB_TAG_WIDTH);
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
/////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
@ -154,10 +153,10 @@ package VX_gpu_pkg;
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
// Input request size
localparam L2_NUM_REQS = 2;
localparam L2_NUM_REQS = `NUM_SOCKETS;
// Core request tag bits
localparam L2_TAG_WIDTH = L1_MEM_TAG_WIDTH;
localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH;
// Memory request data bits
localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8);

View file

@ -30,8 +30,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_dcr_bus_if.slave dcr_bus_if,
// Memory
VX_mem_bus_if.master icache_mem_bus_if,
VX_mem_bus_if.master dcache_mem_bus_if,
VX_mem_bus_if.master mem_bus_if,
`ifdef GBAR_ENABLE
// Barrier
@ -79,6 +78,11 @@ module VX_socket import VX_gpu_pkg::*; #(
.TAG_WIDTH (ICACHE_TAG_WIDTH)
) per_core_icache_bus_if[`SOCKET_SIZE]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if();
`RESET_RELAY (icache_reset, reset);
VX_cache_cluster #(
@ -117,6 +121,11 @@ module VX_socket import VX_gpu_pkg::*; #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if();
`RESET_RELAY (dcache_reset, reset);
@ -151,6 +160,40 @@ module VX_socket import VX_gpu_pkg::*; #(
.mem_bus_if (dcache_mem_bus_if)
);
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) l1_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
///////////////////////////////////////////////////////////////////////////
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;

View file

@ -30,7 +30,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
localparam NUM_LANES = `NUM_FPU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam TAG_WIDTH = `LOG2UP(`FPU_REQ_QUEUE_SIZE);
localparam TAG_WIDTH = `LOG2UP(`FPUQ_SIZE);
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
VX_execute_if #(
@ -87,7 +87,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_index_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
.SIZE (`FPU_REQ_QUEUE_SIZE)
.SIZE (`FPUQ_SIZE)
) tag_store (
.clk (clk),
.reset (reset),

View file

@ -15,10 +15,6 @@ all:
$(MAKE) -C blackscholes
$(MAKE) -C transpose
$(MAKE) -C convolution
# $(MAKE) -C cutcp
# $(MAKE) -C sgemm2
# $(MAKE) -C vectorhypot
# $(MAKE) -C mri-q run-simx
run-simx:
$(MAKE) -C vecadd run-simx
@ -37,10 +33,6 @@ run-simx:
$(MAKE) -C blackscholes run-simx
$(MAKE) -C transpose run-simx
$(MAKE) -C convolution run-simx
# $(MAKE) -C cutcp run-simx
# $(MAKE) -C sgemm2 run-simx
# $(MAKE) -C vectorhypot run-simx
# $(MAKE) -C mri-q run-simx
run-rtlsim:
$(MAKE) -C vecadd run-rtlsim
@ -59,10 +51,6 @@ run-rtlsim:
$(MAKE) -C oclprintf run-rtlsim
$(MAKE) -C blackscholes run-rtlsim
$(MAKE) -C convolution run-rtlsim
# $(MAKE) -C cutcp run-rtlsim
# $(MAKE) -C sgemm2 run-rtlsim
# $(MAKE) -C vectorhypot run-rtlsim
# $(MAKE) -C mri-q run-rtlsim
run-opae:
$(MAKE) -C vecadd run-opae
@ -81,10 +69,6 @@ run-opae:
$(MAKE) -C oclprintf run-opae
$(MAKE) -C blackscholes run-opae
$(MAKE) -C convolution run-opae
# $(MAKE) -C cutcp run-opae
# $(MAKE) -C sgemm2 run-opae
# $(MAKE) -C vectorhypot run-opae
# $(MAKE) -C mri-q run-opae
clean:
$(MAKE) -C vecadd clean
@ -103,10 +87,6 @@ clean:
$(MAKE) -C oclprintf clean
$(MAKE) -C blackscholes clean
$(MAKE) -C convolution clean
# $(MAKE) -C cutcp clean
# $(MAKE) -C sgemm2 clean
# $(MAKE) -C vectorhypot clean
# $(MAKE) -C mri-q clean
clean-all:
$(MAKE) -C vecadd clean-all
@ -124,8 +104,4 @@ clean-all:
$(MAKE) -C lbm clean-all
$(MAKE) -C oclprintf clean-all
$(MAKE) -C blackscholes clean-all
$(MAKE) -C convolution clean-all
# $(MAKE) -C cutcp clean-all
# $(MAKE) -C sgemm2 clean-all
# $(MAKE) -C vectorhypot clean-all
# $(MAKE) -C mri-q clean-all
$(MAKE) -C convolution clean-all

View file

@ -1,9 +0,0 @@
PROJECT = cutcp
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
CXXFLAGS += -I.
OPTS ?=
include ../common.mk

View file

@ -1,617 +0,0 @@
#include <parboil.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*****************************************************************************/
/* Memory management routines */
/* Free an array of owned strings. */
void
pb_FreeStringArray(char **string_array)
{
char **p;
if (!string_array) return;
for (p = string_array; *p; p++) free(*p);
free(string_array);
}
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version)
{
if (name == NULL) {
fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
exit(-1);
}
struct pb_PlatformParam *ret =
(struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
ret->name = name;
ret->version = version;
return ret;
}
void
pb_FreePlatformParam(struct pb_PlatformParam *p)
{
if (p == NULL) return;
free(p->name);
free(p->version);
free(p);
}
struct pb_DeviceParam *
pb_DeviceParam_index(int index)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_INDEX;
ret->index = index;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_cpu(void)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_CPU;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_gpu(void)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_GPU;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_ACCELERATOR;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_name(char *name)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_NAME;
ret->name = name;
return ret;
}
void
pb_FreeDeviceParam(struct pb_DeviceParam *p)
{
if (p == NULL) return;
switch(p->criterion) {
case pb_Device_NAME:
free(p->name);
break;
case pb_Device_INDEX:
case pb_Device_CPU:
case pb_Device_ACCELERATOR:
break;
default:
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
exit(-1);
}
}
void
pb_FreeParameters(struct pb_Parameters *p)
{
free(p->outFile);
pb_FreeStringArray(p->inpFiles);
pb_FreePlatformParam(p->platform);
pb_FreeDeviceParam(p->device);
free(p);
}
/*****************************************************************************/
/* Parse a comma-delimited list of strings into an
* array of strings. */
static char **
read_string_array(char *in)
{
char **ret;
int i;
int count; /* Number of items in the input */
char *substring; /* Current substring within 'in' */
/* Count the number of items in the string */
count = 1;
for (i = 0; in[i]; i++) if (in[i] == ',') count++;
/* Allocate storage */
ret = (char **)malloc((count + 1) * sizeof(char *));
/* Create copies of the strings from the list */
substring = in;
for (i = 0; i < count; i++) {
char *substring_end;
int substring_length;
/* Find length of substring */
for (substring_end = substring;
(*substring_end != ',') && (*substring_end != 0);
substring_end++);
substring_length = substring_end - substring;
/* Allocate memory and copy the substring */
ret[i] = (char *)malloc(substring_length + 1);
memcpy(ret[i], substring, substring_length);
ret[i][substring_length] = 0;
/* go to next substring */
substring = substring_end + 1;
}
ret[i] = NULL; /* Write the sentinel value */
return ret;
}
static void
report_parse_error(const char *str)
{
fputs(str, stderr);
}
/* Interpret a string as a 'pb_DeviceParam' value.
* Return a pointer to a new value, or NULL on failure.
*/
static struct pb_DeviceParam *
read_device_param(char *str)
{
/* Try different ways of interpreting 'device_string' until one works */
/* If argument is an integer, then interpret it as a device index */
errno = 0;
char *end;
long device_int = strtol(str, &end, 10);
if (!errno) {
/* Negative numbers are not valid */
if (device_int < 0 || device_int > INT_MAX) return NULL;
return pb_DeviceParam_index(device_int);
}
/* Match against predefined strings */
if (strcmp(str, "CPU") == 0)
return pb_DeviceParam_cpu();
if (strcmp(str, "GPU") == 0)
return pb_DeviceParam_gpu();
if (strcmp(str, "ACCELERATOR") == 0)
return pb_DeviceParam_accelerator();
/* Assume any other string is a device name */
return pb_DeviceParam_name(strdup(str));
}
/* Interpret a string as a 'pb_PlatformParam' value.
* Return a pointer to a new value, or NULL on failure.
*/
static struct pb_PlatformParam *
read_platform_param(char *str)
{
int separator_index; /* Index of the '-' character separating
* name and version number. It's -1 if
* there's no '-' character. */
/* Find the last occurrence of '-' in 'str' */
{
char *cur;
separator_index = -1;
for (cur = str; *cur; cur++) {
if (*cur == '-') separator_index = cur - str;
}
}
/* The platform name is either the entire string, or all characters before
* the separator */
int name_length = separator_index == -1 ? strlen(str) : separator_index;
char *name_str = (char *)malloc(name_length + 1);
memcpy(name_str, str, name_length);
name_str[name_length] = 0;
/* The version is either NULL, or all characters after the separator */
char *version_str;
if (separator_index == -1) {
version_str = NULL;
}
else {
const char *version_input_str = str + separator_index + 1;
int version_length = strlen(version_input_str);
version_str = (char *)malloc(version_length + 1);
memcpy(version_str, version_input_str, version_length);
version_str[version_length] = 0;
}
/* Create output structure */
return pb_PlatformParam(name_str, version_str);
}
/****************************************************************************/
/* Argument parsing state */
/* Argument parsing state.
*
* Arguments that are interpreted by the argument parser are removed from
* the list. Variables 'argc' and 'argn' do not count arguments that have
* been removed.
*
* During argument parsing, the array of arguments is compacted, overwriting
* the erased arguments. Variable 'argv_put' points to the array element
* where the next argument will be written. Variable 'argv_get' points to
* the array element where the next argument will be read from.
*/
struct argparse {
int argc; /* Number of arguments. Mutable. */
int argn; /* Current argument index. */
char **argv_get; /* Argument value being read. */
char **argv_put; /* Argument value being written.
* argv_put <= argv_get. */
};
static void
initialize_argparse(struct argparse *ap, int argc, char **argv)
{
ap->argc = argc;
ap->argn = 0;
ap->argv_get = ap->argv_put = argv;
}
/* Finish argument parsing, without processing the remaining arguments.
* Write new argument count into _argc. */
static void
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
{
/* Move the remaining arguments */
for(; ap->argn < ap->argc; ap->argn++)
*ap->argv_put++ = *ap->argv_get++;
/* Update the argument count */
*_argc = ap->argc;
/* Insert a terminating NULL */
argv[ap->argc] = NULL;
}
/* Delete the current argument. The argument will not be visible
* when argument parsing is done. */
static void
delete_argument(struct argparse *ap)
{
if (ap->argn >= ap->argc) {
fprintf(stderr, "delete_argument\n");
}
ap->argc--;
ap->argv_get++;
}
/* Go to the next argument. Also, move the current argument to its
* final location in argv. */
static void
next_argument(struct argparse *ap)
{
if (ap->argn >= ap->argc) {
fprintf(stderr, "next_argument\n");
}
/* Move argument to its new location. */
*ap->argv_put++ = *ap->argv_get++;
ap->argn++;
}
static int
is_end_of_arguments(struct argparse *ap)
{
return ap->argn == ap->argc;
}
/* Get the current argument */
static char *
get_argument(struct argparse *ap)
{
return *ap->argv_get;
}
/* Get the current argument, and also delete it */
static char *
consume_argument(struct argparse *ap)
{
char *ret = get_argument(ap);
delete_argument(ap);
return ret;
}
/****************************************************************************/
/* The result of parsing a command-line argument */
typedef enum {
ARGPARSE_OK, /* Success */
ARGPARSE_ERROR, /* Error */
ARGPARSE_DONE /* Success, and do not continue parsing */
} result;
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
/* A command-line option */
struct option {
char short_name; /* If not 0, the one-character
* name of this option */
const char *long_name; /* If not NULL, the long name of this option */
parse_action *action; /* What to do when this option occurs.
* Sentinel value is NULL.
*/
};
/* Output file
*
* -o FILE
*/
static result
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
{
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting file name after '-o'\n");
return ARGPARSE_ERROR;
}
/* Replace the output file name */
free(params->outFile);
params->outFile = strdup(consume_argument(ap));
return ARGPARSE_OK;
}
/* Input files
*
* -i FILE,FILE,...
*/
static result
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
{
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting file name after '-i'\n");
return ARGPARSE_ERROR;
}
/* Replace the input file list */
pb_FreeStringArray(params->inpFiles);
params->inpFiles = read_string_array(consume_argument(ap));
return ARGPARSE_OK;
}
/* End of options
*
* --
*/
static result
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
{
return ARGPARSE_DONE;
}
/* OpenCL device
*
* --device X
*/
static result
parse_device(struct argparse *ap, struct pb_Parameters *params)
{
/* Read the next argument, which specifies a device */
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting device specification after '--device'\n");
return ARGPARSE_ERROR;
}
char *device_string = consume_argument(ap);
struct pb_DeviceParam *device_param = read_device_param(device_string);
if (!device_param) {
report_parse_error("Unrecognized device specification format on command line\n");
return ARGPARSE_ERROR;
}
/* Save the result */
pb_FreeDeviceParam(params->device);
params->device = device_param;
return ARGPARSE_OK;
}
static result
parse_platform(struct argparse *ap, struct pb_Parameters *params)
{
/* Read the next argument, which specifies a platform */
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting device specification after '--platform'\n");
return ARGPARSE_ERROR;
}
char *platform_string = consume_argument(ap);
struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
if (!platform_param) {
report_parse_error("Unrecognized platform specification format on command line\n");
return ARGPARSE_ERROR;
}
/* Save the result */
pb_FreePlatformParam(params->platform);
params->platform = platform_param;
return ARGPARSE_OK;
}
static struct option options[] = {
{ 'o', NULL, &parse_output_file },
{ 'i', NULL, &parse_input_files },
{ '-', NULL, &parse_end_options },
{ 0, "device", &parse_device },
{ 0, "platform", &parse_platform },
{ 0, NULL, NULL }
};
static int
is_last_option(struct option *op)
{
return op->action == NULL;
}
/****************************************************************************/
/* Parse command-line parameters.
* Return zero on error, nonzero otherwise.
* On error, the other outputs may be invalid.
*
* The information collected from parameters is used to update
* 'ret'. 'ret' should be initialized.
*
* '_argc' and 'argv' are updated to contain only the unprocessed arguments.
*/
static int
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
{
char *err_message;
struct argparse ap;
/* Each argument */
initialize_argparse(&ap, *_argc, argv);
while(!is_end_of_arguments(&ap)) {
result arg_result; /* Result of parsing this option */
char *arg = get_argument(&ap);
/* Process this argument */
if (arg[0] == '-') {
/* Single-character flag */
if ((arg[1] != 0) && (arg[2] == 0)) {
delete_argument(&ap); /* This argument is consumed here */
/* Find a matching short option */
struct option *op;
for (op = options; !is_last_option(op); op++) {
if (op->short_name == arg[1]) {
arg_result = (*op->action)(&ap, ret);
goto option_was_processed;
}
}
/* No option matches */
report_parse_error("Unexpected command-line parameter\n");
arg_result = ARGPARSE_ERROR;
goto option_was_processed;
}
/* Long flag */
if (arg[1] == '-') {
delete_argument(&ap); /* This argument is consumed here */
/* Find a matching long option */
struct option *op;
for (op = options; !is_last_option(op); op++) {
if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
arg_result = (*op->action)(&ap, ret);
goto option_was_processed;
}
}
/* No option matches */
report_parse_error("Unexpected command-line parameter\n");
arg_result = ARGPARSE_ERROR;
goto option_was_processed;
}
}
else {
/* Other arguments are ignored */
next_argument(&ap);
arg_result = ARGPARSE_OK;
goto option_was_processed;
}
option_was_processed:
/* Decide what to do next based on 'arg_result' */
switch(arg_result) {
case ARGPARSE_OK:
/* Continue processing */
break;
case ARGPARSE_ERROR:
/* Error exit from the function */
return 0;
case ARGPARSE_DONE:
/* Normal exit from the argument parsing loop */
goto end_of_options;
}
} /* end for each argument */
/* If all arguments were processed, then normal exit from the loop */
end_of_options:
finalize_argparse(&ap, _argc, argv);
return 1;
}
/*****************************************************************************/
/* Other exported functions */
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv)
{
struct pb_Parameters *ret =
(struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
/* Initialize the parameters structure */
ret->outFile = NULL;
ret->inpFiles = (char **)malloc(sizeof(char *));
ret->inpFiles[0] = NULL;
ret->platform = NULL;
ret->device = NULL;
/* Read parameters and update _argc, argv */
if (!pb_ParseParameters(ret, _argc, argv)) {
/* Parse error */
pb_FreeParameters(ret);
return NULL;
}
return ret;
}
int
pb_Parameters_CountInputs(struct pb_Parameters *p)
{
int n;
for (n = 0; p->inpFiles[n]; n++);
return n;
}

View file

@ -1,37 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef ATOM_H
#define ATOM_H
#ifdef __cplusplus
extern "C" {
#endif
typedef struct Atom_t {
float x, y, z, q;
} Atom;
typedef struct Atoms_t {
Atom *atoms;
int size;
} Atoms;
typedef struct Vec3_t {
float x, y, z;
} Vec3;
Atoms *read_atom_file(const char *fname);
void free_atom(Atoms *atom);
void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
#ifdef __cplusplus
}
#endif
#endif /* ATOM_H */

View file

@ -1,195 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#undef DEBUG_PASS_RATE
#define CHECK_CYLINDER_CPU
#define CELLEN 4.f
#define INV_CELLEN (1.f/CELLEN)
extern int cpu_compute_cutoff_potential_lattice(
Lattice *lattice, /* the lattice */
float cutoff, /* cutoff distance */
Atoms *atoms /* array of atoms */
)
{
int nx = lattice->dim.nx;
int ny = lattice->dim.ny;
int nz = lattice->dim.nz;
float xlo = lattice->dim.lo.x;
float ylo = lattice->dim.lo.y;
float zlo = lattice->dim.lo.z;
float gridspacing = lattice->dim.h;
int natoms = atoms->size;
Atom *atom = atoms->atoms;
const float a2 = cutoff * cutoff;
const float inv_a2 = 1.f / a2;
float s;
const float inv_gridspacing = 1.f / gridspacing;
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
/* lattice point radius about each atom */
int n;
int i, j, k;
int ia, ib, ic;
int ja, jb, jc;
int ka, kb, kc;
int index;
int koff, jkoff;
float x, y, z, q;
float dx, dy, dz;
float dz2, dydz2, r2;
float e;
float xstart, ystart;
float *pg;
int gindex;
int ncell, nxcell, nycell, nzcell;
int *first, *next;
float inv_cellen = INV_CELLEN;
Vec3 minext, maxext; /* Extent of atom bounding box */
float xmin, ymin, zmin;
float xmax, ymax, zmax;
#if DEBUG_PASS_RATE
unsigned long long pass_count = 0;
unsigned long long fail_count = 0;
#endif
/* find min and max extent */
get_atom_extent(&minext, &maxext, atoms);
/* number of cells in each dimension */
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
ncell = nxcell * nycell * nzcell;
/* allocate for cursor link list implementation */
first = (int *) malloc(ncell * sizeof(int));
for (gindex = 0; gindex < ncell; gindex++) {
first[gindex] = -1;
}
next = (int *) malloc(natoms * sizeof(int));
for (n = 0; n < natoms; n++) {
next[n] = -1;
}
/* geometric hashing */
for (n = 0; n < natoms; n++) {
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
gindex = (k*nycell + j)*nxcell + i;
next[n] = first[gindex];
first[gindex] = n;
}
/* traverse the grid cells */
for (gindex = 0; gindex < ncell; gindex++) {
for (n = first[gindex]; n != -1; n = next[n]) {
x = atom[n].x - xlo;
y = atom[n].y - ylo;
z = atom[n].z - zlo;
q = atom[n].q;
/* find closest grid point with position less than or equal to atom */
ic = (int) (x * inv_gridspacing);
jc = (int) (y * inv_gridspacing);
kc = (int) (z * inv_gridspacing);
/* find extent of surrounding box of grid points */
ia = ic - radius;
ib = ic + radius + 1;
ja = jc - radius;
jb = jc + radius + 1;
ka = kc - radius;
kb = kc + radius + 1;
/* trim box edges so that they are within grid point lattice */
if (ia < 0) ia = 0;
if (ib >= nx) ib = nx-1;
if (ja < 0) ja = 0;
if (jb >= ny) jb = ny-1;
if (ka < 0) ka = 0;
if (kb >= nz) kb = nz-1;
/* loop over surrounding grid points */
xstart = ia*gridspacing - x;
ystart = ja*gridspacing - y;
dz = ka*gridspacing - z;
for (k = ka; k <= kb; k++, dz += gridspacing) {
koff = k*ny;
dz2 = dz*dz;
dy = ystart;
for (j = ja; j <= jb; j++, dy += gridspacing) {
jkoff = (koff + j)*nx;
dydz2 = dy*dy + dz2;
#ifdef CHECK_CYLINDER_CPU
if (dydz2 >= a2) continue;
#endif
dx = xstart;
index = jkoff + ia;
pg = lattice->lattice + index;
#if defined(__INTEL_COMPILER)
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
r2 = dx*dx + dydz2;
s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
e = q * (1/sqrtf(r2)) * s;
*pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
}
#else
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
r2 = dx*dx + dydz2;
if (r2 >= a2)
{
#ifdef DEBUG_PASS_RATE
fail_count++;
#endif
continue;
}
#ifdef DEBUG_PASS_RATE
pass_count++;
#endif
s = (1.f - r2 * inv_a2);
e = q * (1/sqrtf(r2)) * s * s;
*pg += e;
}
#endif
}
} /* end loop over surrounding grid points */
} /* end loop over atoms in a gridcell */
} /* end loop over gridcells */
/* free memory */
free(next);
free(first);
/* For debugging: print the number of times that the test passed/failed */
#ifdef DEBUG_PASS_RATE
printf ("Pass :%lld\n", pass_count);
printf ("Fail :%lld\n", fail_count);
#endif
return 0;
}

View file

@ -1,508 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#include "macros.h"
#include "ocl.h"
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (NULL == filename || NULL == data || 0 == size)
return CL_INVALID_VALUE;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return CL_INVALID_VALUE;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return CL_SUCCESS;
}
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
typedef cl_int4 xyz;
//extern "C" int gpu_compute_cutoff_potential_lattice(
int gpu_compute_cutoff_potential_lattice(
struct pb_TimerSet *timers,
Lattice *lattice, /* the lattice */
float cutoff, /* cutoff distance */
Atoms *atoms, /* array of atoms */
int verbose, /* print info/debug messages */
struct pb_Parameters *parameters
)
{
int nx = lattice->dim.nx;
int ny = lattice->dim.ny;
int nz = lattice->dim.nz;
float xlo = lattice->dim.lo.x;
float ylo = lattice->dim.lo.y;
float zlo = lattice->dim.lo.z;
float h = lattice->dim.h;
int natoms = atoms->size;
Atom *atom = atoms->atoms;
xyz nbrlist[NBRLIST_MAXLEN];
int nbrlistlen = 0;
int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */
int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array element */
int num_excluded = 0;
int xRegionDim, yRegionDim, zRegionDim;
int xRegionIndex, yRegionIndex, zRegionIndex;
int xOffset, yOffset, zOffset;
int lnx, lny, lnz, lnall;
float *regionZeroAddr, *thisRegion;
cl_mem regionZeroCl;
int index, indexRegion;
int c;
xyz binDim;
int nbins;
cl_float4 *binBaseAddr, *binZeroAddr;
cl_mem binBaseCl, binZeroCl;
int *bincntBaseAddr, *bincntZeroAddr;
Atoms *extra = NULL;
cl_mem NbrListLen;
cl_mem NbrList;
int i, j, k, n;
int sum, total;
float avgFillFull, avgFillCover;
const float cutoff2 = cutoff * cutoff;
const float inv_cutoff2 = 1.f / cutoff2;
size_t gridDim[3], blockDim[3];
// The "compute" timer should be active upon entry to this function
/* pad lattice to be factor of 8 in each dimension */
xRegionDim = (int) ceilf(nx/8.f);
yRegionDim = (int) ceilf(ny/8.f);
zRegionDim = (int) ceilf(nz/8.f);
lnx = 8 * xRegionDim;
lny = 8 * yRegionDim;
lnz = 8 * zRegionDim;
lnall = lnx * lny * lnz;
/* will receive energies from OpenCL */
regionZeroAddr = (float *) malloc(lnall * sizeof(float));
/* create bins */
c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
nbins = binDim.x * binDim.y * binDim.z;
binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
/* create neighbor list */
if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
float s = sqrtf(3);
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
int cnt = 0;
/* develop neighbor list around 1 cell */
if (2*c + 1 > NBRLIST_DIM) {
fprintf(stderr, "must have cutoff <= %f\n",
(NBRLIST_DIM-1)/2 * BIN_LENGTH);
return -1;
}
for (k = -c; k <= c; k++) {
for (j = -c; j <= c; j++) {
for (i = -c; i <= c; i++) {
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
nbrlist[cnt].x = i;
nbrlist[cnt].y = j;
nbrlist[cnt].z = k;
cnt++;
}
}
}
nbrlistlen = cnt;
}
else if (8*h <= 2*BIN_LENGTH) {
float s = 2.f*sqrtf(3);
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
int cnt = 0;
/* develop neighbor list around 3-cube of cells */
if (2*c + 3 > NBRLIST_DIM) {
fprintf(stderr, "must have cutoff <= %f\n",
(NBRLIST_DIM-3)/2 * BIN_LENGTH);
return -1;
}
for (k = -c; k <= c; k++) {
for (j = -c; j <= c; j++) {
for (i = -c; i <= c; i++) {
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
nbrlist[cnt].x = i;
nbrlist[cnt].y = j;
nbrlist[cnt].z = k;
cnt++;
}
}
}
nbrlistlen = cnt;
}
else {
fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
return -1;
}
/* perform geometric hashing of atoms into bins */
{
/* array of extra atoms, permit average of one extra per bin */
Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
int extra_len = 0;
for (n = 0; n < natoms; n++) {
cl_float4 p;
p.x = atom[n].x - xlo;
p.y = atom[n].y - ylo;
p.z = atom[n].z - zlo;
p.w = atom[n].q;
i = (int) floorf(p.x * BIN_INVLEN);
j = (int) floorf(p.y * BIN_INVLEN);
k = (int) floorf(p.z * BIN_INVLEN);
if (i >= -c && i < binDim.x - c &&
j >= -c && j < binDim.y - c &&
k >= -c && k < binDim.z - c &&
atom[n].q != 0) {
int index = (k * binDim.y + j) * binDim.x + i;
cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
int bindex = bincntZeroAddr[index];
if (bindex < BIN_DEPTH) {
/* copy atom into bin and increase counter for this bin */
bin[bindex] = p;
bincntZeroAddr[index]++;
}
else {
/* add index to array of extra atoms to be computed with CPU */
if (extra_len >= nbins) {
fprintf(stderr, "exceeded space for storing extra atoms\n");
return -1;
}
extra_atoms[extra_len] = atom[n];
extra_len++;
}
}
else {
/* excluded atoms are either outside bins or neutrally charged */
num_excluded++;
}
}
/* Save result */
extra = (Atoms *)malloc(sizeof(Atoms));
extra->atoms = extra_atoms;
extra->size = extra_len;
}
/* bin stats */
sum = total = 0;
for (n = 0; n < nbins; n++) {
binHistoFull[ bincntBaseAddr[n] ]++;
sum += bincntBaseAddr[n];
total += BIN_DEPTH;
}
avgFillFull = sum / (float) total;
sum = total = 0;
for (k = 0; k < binDim.z - 2*c; k++) {
for (j = 0; j < binDim.y - 2*c; j++) {
for (i = 0; i < binDim.x - 2*c; i++) {
int index = (k * binDim.y + j) * binDim.x + i;
binHistoCover[ bincntZeroAddr[index] ]++;
sum += bincntZeroAddr[index];
total += BIN_DEPTH;
}
}
}
avgFillCover = sum / (float) total;
if (verbose) {
/* report */
printf("number of atoms = %d\n", natoms);
printf("lattice spacing = %g\n", h);
printf("cutoff distance = %g\n", cutoff);
printf("\n");
printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
printf("number of bytes for lattice data = %u\n", (unsigned int) (lnall*sizeof(float)));
printf("\n");
printf("bin padding thickness = %d\n", c);
printf("bin cover dimensions = %d %d %d\n",
binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
printf("number of bins = %d\n", nbins);
printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
printf("%% overhead space = %g\n",
(natoms / (double) (nbins * BIN_DEPTH)) * 100);
printf("number of bytes for bin data = %u\n",
(unsigned int)(nbins * BIN_DEPTH * sizeof(cl_float4)));
printf("\n");
printf("bin histogram with padding:\n");
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]);
sum += binHistoFull[n];
}
printf(" total number of bins: %d\n", sum);
printf(" %% average fill: %g\n", avgFillFull * 100);
printf("\n");
printf("bin histogram excluding padding:\n");
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]);
sum += binHistoCover[n];
}
printf(" total number of bins: %d\n", sum);
printf(" %% average fill: %g\n", avgFillCover * 100);
printf("\n");
printf("number of extra atoms = %d\n", extra->size);
printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
printf("\n");
/* sanity check on bins */
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
sum += n * binHistoFull[n];
}
sum += extra->size + num_excluded;
printf("sanity check on bin histogram with edges: "
"sum + others = %d\n", sum);
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
sum += n * binHistoCover[n];
}
sum += extra->size + num_excluded;
printf("sanity check on bin histogram excluding edges: "
"sum + others = %d\n", sum);
printf("\n");
/* neighbor list */
printf("neighbor list length = %d\n", nbrlistlen);
printf("\n");
}
pb_Context* pb_context;
pb_context = pb_InitOpenCLContext(parameters);
if (pb_context == NULL) {
fprintf (stderr, "Error: No OpenCL platform/device can be found.");
return -1;
}
cl_int clStatus;
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
cl_context clContext = (cl_context) pb_context->clContext;
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
CHECK_ERROR("clCreateCommandQueue")
pb_SetOpenCL(&clContext, &clCommandQueue);
//const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("read_kernel_file")
cl_program clProgram = clCreateProgramWithBinary(
clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
char clOptions[50];
sprintf(clOptions,"-I src/opencl_base"); //-cl-nv-verbose
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
if (clStatus != CL_SUCCESS) {
size_t string_size = 0;
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
0, NULL, &string_size);
char* string = (char*)malloc(string_size*sizeof(char));
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
string_size, string, NULL);
puts(string);
}
CHECK_ERROR("clBuildProgram")
cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
CHECK_ERROR("clCreateKernel")
/* setup OpenCL kernel parameters */
blockDim[0] = 8;
blockDim[1] = 8;
blockDim[2] = 2;
gridDim[0] = 4 * xRegionDim * blockDim[0];
gridDim[1] = yRegionDim * blockDim[1];
gridDim[2] = 1 * blockDim[2];
/* allocate and initialize memory on OpenCL device */
pb_SwitchToTimer(timers, pb_TimerID_COPY);
if (verbose) {
printf("Allocating %.2fMB on OpenCL device for potentials\n",
lnall * sizeof(float) / (double) (1024*1024));
}
regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
// clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));
if (verbose) {
printf("Allocating %.2fMB on OpenCL device for atom bins\n",
nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
}
binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
//Sub buffers are not supported in OpenCL v1.0
int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
if (verbose)
printf("\n");
clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl);
clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
CHECK_ERROR("clSetKernelArg")
/* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
printf(" computing plane %d\r", zRegionIndex);
fflush(stdout);
clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
CHECK_ERROR("clSetKernelArg")
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
CHECK_ERROR("clEnqueueNDRangeKernel")
clStatus = clFinish(clCommandQueue);
CHECK_ERROR("clFinish")
}
printf("Finished OpenCL kernel calls\n");
/* copy result regions from OpenCL device */
pb_SwitchToTimer(timers, pb_TimerID_COPY);
clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
CHECK_ERROR("clEnqueueReadBuffer")
/* free OpenCL memory allocations */
clStatus = clReleaseMemObject(regionZeroCl);
clStatus = clReleaseMemObject(binBaseCl);
clStatus = clReleaseMemObject(NbrListLen);
clStatus = clReleaseMemObject(NbrList);
CHECK_ERROR("clReleaseMemObject")
clStatus = clReleaseKernel(clKernel);
clStatus = clReleaseProgram(clProgram);
clStatus = clReleaseCommandQueue(clCommandQueue);
clStatus = clReleaseContext(clContext);
//free((void*)clSource[0]);
/* transpose regions back into lattice */
pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
for (k = 0; k < nz; k++) {
zRegionIndex = (k >> 3);
zOffset = (k & 7);
for (j = 0; j < ny; j++) {
yRegionIndex = (j >> 3);
yOffset = (j & 7);
for (i = 0; i < nx; i++) {
xRegionIndex = (i >> 3);
xOffset = (i & 7);
thisRegion = regionZeroAddr
+ ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
+ xRegionIndex) * REGION_SIZE;
indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
index = (k * ny + j) * nx + i;
lattice->lattice[index] = thisRegion[indexRegion];
}
}
}
/* handle extra atoms */
if (extra->size > 0) {
printf("computing extra atoms on CPU\n");
if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
"for extra atoms\n");
return -1;
}
printf("\n");
}
/* cleanup memory allocations */
free(regionZeroAddr);
free(binBaseAddr);
free(bincntBaseAddr);
free_atom(extra);
return 0;
}

View file

@ -1,72 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef CUTOFF_H
#define CUTOFF_H
#ifdef __cplusplus
extern "C" {
#endif
#define SHIFTED
/* A structure to record how points in 3D space map to array
elements. Array element (z, y, x)
where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
*/
typedef struct LatticeDim_t {
/* Number of lattice points in x, y, z dimensions */
int nx, ny, nz;
/* Lowest corner of lattice */
Vec3 lo;
/* Lattice spacing */
float h;
} LatticeDim;
/* An electric potential field sampled on a regular grid. The
lattice size and grid point positions are specified by 'dim'.
*/
typedef struct Lattice_t {
LatticeDim dim;
float *lattice;
} Lattice;
LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
Lattice *create_lattice(LatticeDim dim);
void destroy_lattice(Lattice *);
int gpu_compute_cutoff_potential_lattice(
struct pb_TimerSet *timers,
Lattice *lattice,
float cutoff, /* cutoff distance */
Atoms *atom, /* array of atoms */
int verbose, /* print info/debug messages */
struct pb_Parameters *parameters
);
int cpu_compute_cutoff_potential_lattice(
Lattice *lattice, /* the lattice */
float cutoff, /* cutoff distance */
Atoms *atoms /* array of atoms */
);
int remove_exclusions(
Lattice *lattice, /* the lattice */
float exclcutoff, /* exclusion cutoff distance */
Atoms *atom /* array of atoms */
);
#ifdef __cplusplus
}
#endif
#endif /* CUTOFF_H */

View file

@ -1,157 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#define CELLEN 4.f
#define INV_CELLEN (1.f/CELLEN)
extern int remove_exclusions(
Lattice *lattice, /* the lattice */
float cutoff, /* exclusion cutoff distance */
Atoms *atoms /* array of atoms */
)
{
int nx = lattice->dim.nx;
int ny = lattice->dim.ny;
int nz = lattice->dim.nz;
float xlo = lattice->dim.lo.x;
float ylo = lattice->dim.lo.y;
float zlo = lattice->dim.lo.z;
float gridspacing = lattice->dim.h;
Atom *atom = atoms->atoms;
const float a2 = cutoff * cutoff;
const float inv_gridspacing = 1.f / gridspacing;
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
/* lattice point radius about each atom */
int n;
int i, j, k;
int ia, ib, ic;
int ja, jb, jc;
int ka, kb, kc;
int index;
int koff, jkoff;
float x, y, z, q;
float dx, dy, dz;
float dz2, dydz2, r2;
float e;
float xstart, ystart;
float *pg;
int gindex;
int ncell, nxcell, nycell, nzcell;
int *first, *next;
float inv_cellen = INV_CELLEN;
Vec3 minext, maxext;
/* find min and max extent */
get_atom_extent(&minext, &maxext, atoms);
/* number of cells in each dimension */
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
ncell = nxcell * nycell * nzcell;
/* allocate for cursor link list implementation */
first = (int *) malloc(ncell * sizeof(int));
for (gindex = 0; gindex < ncell; gindex++) {
first[gindex] = -1;
}
next = (int *) malloc(atoms->size * sizeof(int));
for (n = 0; n < atoms->size; n++) {
next[n] = -1;
}
/* geometric hashing */
for (n = 0; n < atoms->size; n++) {
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
gindex = (k*nycell + j)*nxcell + i;
next[n] = first[gindex];
first[gindex] = n;
}
/* traverse the grid cells */
for (gindex = 0; gindex < ncell; gindex++) {
for (n = first[gindex]; n != -1; n = next[n]) {
x = atom[n].x - xlo;
y = atom[n].y - ylo;
z = atom[n].z - zlo;
q = atom[n].q;
/* find closest grid point with position less than or equal to atom */
ic = (int) (x * inv_gridspacing);
jc = (int) (y * inv_gridspacing);
kc = (int) (z * inv_gridspacing);
/* find extent of surrounding box of grid points */
ia = ic - radius;
ib = ic + radius + 1;
ja = jc - radius;
jb = jc + radius + 1;
ka = kc - radius;
kb = kc + radius + 1;
/* trim box edges so that they are within grid point lattice */
if (ia < 0) ia = 0;
if (ib >= nx) ib = nx-1;
if (ja < 0) ja = 0;
if (jb >= ny) jb = ny-1;
if (ka < 0) ka = 0;
if (kb >= nz) kb = nz-1;
/* loop over surrounding grid points */
xstart = ia*gridspacing - x;
ystart = ja*gridspacing - y;
dz = ka*gridspacing - z;
for (k = ka; k <= kb; k++, dz += gridspacing) {
koff = k*ny;
dz2 = dz*dz;
dy = ystart;
for (j = ja; j <= jb; j++, dy += gridspacing) {
jkoff = (koff + j)*nx;
dydz2 = dy*dy + dz2;
dx = xstart;
index = jkoff + ia;
pg = lattice->lattice + index;
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
r2 = dx*dx + dydz2;
/* If atom and lattice point are too close, set the lattice value
* to zero */
if (r2 < a2) *pg = 0;
}
}
} /* end loop over surrounding grid points */
} /* end loop over atoms in a gridcell */
} /* end loop over gridcells */
/* free memory */
free(next);
free(first);
return 0;
}

View file

@ -1,55 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
//#include <endian.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <inttypes.h>
#include "gpu_info.h"
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
int pad,
int major,
int minor,
int sm)
{
int max_thread;
int max_block=8;
if(major==1)
{
if(minor>=2)
max_thread=1024;
else
max_thread=768;
}
else if(major==2)
max_thread=1536;
else
//newer GPU //keep using 2.0
max_thread=1536;
int _grid;
int _thread;
if(task*pad>sm*max_thread)
{
_thread=max_thread/max_block;
_grid = ((task*pad+_thread-1)/_thread)*_thread;
}
else
{
_thread=pad;
_grid=task*pad;
}
thread[0]=_thread;
grid[0]=_grid;
}

View file

@ -1,28 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef __GPUINFOH__
#define __GPUINFOH__
#ifdef __cplusplus
extern "C" {
#endif
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
int pad,
int major,
int minor,
int sm);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -1,104 +0,0 @@
/*
* potential lattice is decomposed into size 8^3 lattice point "regions"
*
* THIS IMPLEMENTATION: one thread per lattice point
* thread block size 128 gives 4 thread blocks per region
* kernel is invoked for each x-y plane of regions,
* where gridDim.x is 4*(x region dimension) so that blockIdx.x
* can absorb the z sub-region index in its 2 lowest order bits
*
* Regions are stored contiguously in memory in row-major order
*
* The bins have to not only cover the region, but they need to surround
* the outer edges so that region sides and corners can still use
* neighbor list stencil. The binZeroAddr is actually a shifted pointer into
* the bin array (binZeroAddr = binBaseAddr + (c*binDim_y + c)*binDim_x + c)
* where c = ceil(cutoff / binsize). This allows for negative offsets to
* be added to myBinIndex.
*
* The (0,0,0) spatial origin corresponds to lower left corner of both
* regionZeroAddr and binZeroAddr. The atom coordinates are translated
* during binning to enforce this assumption.
*/
#include "macros.h"
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
typedef int4 xyz;
__kernel void opencl_cutoff_potential_lattice(
int binDim_x,
int binDim_y,
__global float4 *binBaseAddr,
int offset,
float h, /* lattice spacing */
float cutoff2, /* square of cutoff distance */
float inv_cutoff2,
__global float *regionZeroAddr, /* address of lattice regions starting at origin */
int zRegionIndex,
__constant int *NbrListLen,
__constant xyz *NbrList
)
{
__global float4* binZeroAddr = binBaseAddr + offset;
__global float *myRegionAddr;
int Bx, By, Bz;
/* thread id */
const int tid = (get_local_id(2)*get_local_size(1) +
get_local_id(1))*get_local_size(0) + get_local_id(0);
/* this is the start of the sub-region indexed by tid */
myRegionAddr = regionZeroAddr + ((zRegionIndex*get_num_groups(1)
+ get_group_id(1))*(get_num_groups(0)>>2) + (get_group_id(0)>>2))*REGION_SIZE
+ (get_group_id(0)&3)*SUB_REGION_SIZE;
/* spatial coordinate of this lattice point */
float x = (8 * (get_group_id(0) >> 2) + get_local_id(0)) * h;
float y = (8 * get_group_id(1) + get_local_id(1)) * h;
float z = (8 * zRegionIndex + 2*(get_group_id(0)&3) + get_local_id(2)) * h;
float dx;
float dy;
float dz;
float r2;
float s;
int totalbins = 0;
/* bin number determined by center of region */
Bx = (int) floor((8 * (get_group_id(0) >> 2) + 4) * h * BIN_INVLEN);
By = (int) floor((8 * get_group_id(1) + 4) * h * BIN_INVLEN);
Bz = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);
float energy = 0.f;
int bincnt;
for (bincnt = 0; bincnt < *NbrListLen; bincnt++) {
int i = Bx + NbrList[bincnt].x;
int j = By + NbrList[bincnt].y;
int k = Bz + NbrList[bincnt].z;
__global float4* p_global = binZeroAddr +
(((k*binDim_y + j)*binDim_x + i) * BIN_DEPTH);
int m;
for (m = 0; m < BIN_DEPTH; m++) {
float aq = p_global[m].w;
if (0.f != aq) {
dx = p_global[m].x - x;
dy = p_global[m].y - y;
dz = p_global[m].z - z;
r2 = dx*dx + dy*dy + dz*dz;
if (r2 < cutoff2) {
s = (1.f - r2 * inv_cutoff2);
energy += aq * rsqrt(r2) * s * s;
}
}
} /* end loop over atoms in bin */
} /* end loop over neighbor list */
/* store into global memory */
myRegionAddr[tid+0] = energy;
}

View file

@ -1,69 +0,0 @@
#ifndef __MACROSH__
#define __MACROSH__
#ifdef __DEVICE_EMULATION__
#define DEBUG
/* define which grid block and which thread to examine */
#define BX 0
#define BY 0
#define TX 0
#define TY 0
#define TZ 0
#define EMU(code) do { \
if (blockIdx.x==BX && blockIdx.y==BY && \
threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
code; \
} \
} while (0)
#define INT(n) printf("%s = %d\n", #n, n)
#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
(double)(f).y, (double)(f).z, (double)(f).w)
#else
#define EMU(code)
#define INT(n)
#define FLOAT(f)
#define INT3(n)
#define FLOAT4(f)
#endif
/* report error from OpenCL */
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
/*
* neighbor list:
* stored in constant memory as table of offsets
* flat index addressing is computed by kernel
*
* reserve enough memory for 11^3 stencil of grid cells
* this fits within 16K of memory
*/
#define NBRLIST_DIM 11
#define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
/*
* atom bins cached into shared memory for processing
*
* this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
* should permit scheduling of up to 3 thread blocks per SM
*/
#define BIN_DEPTH 8 /* max number of atoms per bin */
#define BIN_SIZE 32 /* size of bin in floats */
#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
#define BIN_LENGTH 4.f /* spatial length in Angstroms */
#define BIN_INVLEN (1.f / BIN_LENGTH)
/* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
* so that bin fill should be 80% (for non-empty regions of space) */
#define REGION_SIZE 512 /* number of floats in lattice region */
#define SUB_REGION_SIZE 128 /* number of floats in lattice sub-region */
#endif

View file

@ -1,190 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#include "output.h"
#define ERRTOL 1e-4f
#define NOKERNELS 0
#define CUTOFF1 1
#define CUTOFF6 32
#define CUTOFF6OVERLAP 64
#define CUTOFFCPU 16384
int appenddata(const char *filename, int size, double time) {
FILE *fp;
fp=fopen(filename, "a");
if (fp == NULL) {
printf("error appending to file %s..\n", filename);
return -1;
}
fprintf(fp, "%d %.3f\n", size, time);
fclose(fp);
return 0;
}
LatticeDim
lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
{
LatticeDim ret;
ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
ret.lo = lo;
ret.h = h;
return ret;
}
Lattice *
create_lattice(LatticeDim dim)
{
int size;
Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
if (lat == NULL) {
fprintf(stderr, "Out of memory\n");
exit(1);
}
lat->dim = dim;
/* Round up the allocated size to a multiple of 8 */
size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7;
lat->lattice = (float *)calloc(size, sizeof(float));
if (lat->lattice == NULL) {
fprintf(stderr, "Out of memory\n");
exit(1);
}
return lat;
}
void
destroy_lattice(Lattice *lat)
{
if (lat) {
free(lat->lattice);
free(lat);
}
}
int main(int argc, char *argv[]) {
Atoms *atom;
LatticeDim lattice_dim;
Lattice *gpu_lattice;
Vec3 min_ext, max_ext; /* Bounding box of atoms */
Vec3 lo, hi; /* Bounding box with padding */
float h = 0.5f; /* Lattice spacing */
float cutoff = 12.f; /* Cutoff radius */
float exclcutoff = 1.f; /* Radius for exclusion */
float padding = 0.5f; /* Bounding box padding distance */
int n;
struct pb_Parameters *parameters;
struct pb_TimerSet timers;
/* Read input parameters */
parameters = pb_ReadParameters(&argc, argv);
if (parameters == NULL) {
exit(1);
}
parameters->inpFiles = (char **)malloc(sizeof(char *) * 2);
parameters->inpFiles[0] = (char *)malloc(100);
parameters->inpFiles[1] = NULL;
strncpy(parameters->inpFiles[0], "watbox.sl40.pqr", 100);
/* Expect one input file */
if (pb_Parameters_CountInputs(parameters) != 1) {
fprintf(stderr, "Expecting one input file\n");
exit(1);
}
pb_InitializeTimerSet(&timers);
pb_SwitchToTimer(&timers, pb_TimerID_IO);
{
const char *pqrfilename = parameters->inpFiles[0];
if (!(atom = read_atom_file(pqrfilename))) {
fprintf(stderr, "read_atom_file() failed\n");
exit(1);
}
printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
}
/* find extent of domain */
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
get_atom_extent(&min_ext, &max_ext, atom);
printf("extent of domain is:\n");
printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
printf("padding domain by %g Angstroms\n", padding);
lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
lattice_dim = lattice_from_bounding_box(lo, hi, h);
gpu_lattice = create_lattice(lattice_dim);
printf("\n");
/*
* Run OpenCL kernel
* (Begin and end with COMPUTE timer active)
*/
if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0, parameters)) {
fprintf(stderr, "Computation failed\n");
exit(1);
}
/*
* Zero the lattice points that are too close to an atom. This is
* necessary for numerical stability.
*/
if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
exit(1);
}
printf("\n");
pb_SwitchToTimer(&timers, pb_TimerID_IO);
/* Print output */
if (parameters->outFile) {
//write_lattice_summary(parameters->outFile, gpu_lattice);
}
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
/* Cleanup */
destroy_lattice(gpu_lattice);
free_atom(atom);
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
pb_FreeParameters(parameters);
return 0;
}

View file

@ -1,49 +0,0 @@
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include "ocl.h"
char* readFile(const char* fileName)
{
FILE* fp;
fp = fopen(fileName,"r");
if(fp == NULL)
{
printf("Error 1!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
long size = ftell(fp);
rewind(fp);
char* buffer = (char*)malloc(sizeof(char)*(size+1));
if(buffer == NULL)
{
printf("Error 2!\n");
fclose(fp);
exit(1);
}
size_t res = fread(buffer,1,size,fp);
if(res != size)
{
printf("Error 3!\n");
fclose(fp);
exit(1);
}
buffer[size] = 0;
fclose(fp);
return buffer;
}
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
{
cl_int clStatus;
char* temp = (char*)malloc(size);
memset(temp,val,size);
clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
free(temp);
}

View file

@ -1,25 +0,0 @@
#ifndef __OCLH__
#define __OCLH__
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
void clMemSet(cl_command_queue, cl_mem, int, size_t);
char* readFile(const char*);
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
#ifdef __cplusplus
}
#endif
#endif

View file

@ -1,67 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
void
write_lattice_summary(const char *filename, Lattice *lattice)
{
float *lattice_data = lattice->lattice;
int nx = lattice->dim.nx;
int ny = lattice->dim.ny;
int nz = lattice->dim.nz;
/* Open output file */
FILE *outfile = fopen(filename, "w");
if (outfile == NULL) {
fprintf(stderr, "Cannot open output file\n");
exit(1);
}
/* Write the sum of the the absolute values of all lattice potentials */
{
double abspotential = 0.0;
float tmp;
int i;
for (i = 0; i < nx * ny * nz; i++)
abspotential += fabs((double) lattice_data[i]);
tmp = (float) abspotential;
fwrite(&tmp, 1, sizeof(float), outfile);
}
/* Write the size of a lattice plane */
{
uint32_t tmp;
tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
fwrite(&tmp, 1, sizeof(uint32_t), outfile);
}
/* Write the plane of lattice data at z=0 and z = nz-1 */
{
int plane_size = nx * ny;
fwrite(lattice_data, plane_size, sizeof(float), outfile);
fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
outfile);
}
/* Cleanup */
fclose(outfile);
}

View file

@ -1,25 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef OUTPUT_H
#define OUTPUT_H
#include "cutoff.h"
#ifdef __cplusplus
extern "C" {
#endif
void
write_lattice_summary(const char *filename, Lattice *lattice);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -1,348 +0,0 @@
/*
* (c) 2010 The Board of Trustees of the University of Illinois.
*/
#ifndef PARBOIL_HEADER
#define PARBOIL_HEADER
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
#include <unistd.h>
/* A platform as specified by the user on the command line */
struct pb_PlatformParam {
char *name; /* The platform name. This string is owned. */
char *version; /* The platform version; may be NULL.
* This string is owned. */
};
/* Create a PlatformParam from the given strings.
* 'name' must not be NULL. 'version' may be NULL.
* If not NULL, the strings should have been allocated by malloc(),
* and they will be owned by the returned object.
*/
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version);
void
pb_FreePlatformParam(struct pb_PlatformParam *);
/* A criterion for how to select a device */
enum pb_DeviceSelectionCriterion {
pb_Device_INDEX, /* Enumerate the devices and select one
* by its number */
pb_Device_CPU, /* Select a CPU device */
pb_Device_GPU, /* Select a GPU device */
pb_Device_ACCELERATOR, /* Select an accelerator device */
pb_Device_NAME /* Select a device by name */
};
/* A device as specified by the user on the command line */
struct pb_DeviceParam {
enum pb_DeviceSelectionCriterion criterion;
union {
int index; /* If criterion == pb_Device_INDEX,
* the index of the device */
char *name; /* If criterion == pb_Device_NAME,
* the name of the device.
* This string is owned. */
};
};
struct pb_DeviceParam *
pb_DeviceParam_index(int index);
struct pb_DeviceParam *
pb_DeviceParam_cpu(void);
struct pb_DeviceParam *
pb_DeviceParam_gpu(void);
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void);
/* Create a by-name device selection criterion.
* The string should have been allocated by malloc(), and it will will be
* owned by the returned object.
*/
struct pb_DeviceParam *
pb_DeviceParam_name(char *name);
void
pb_FreeDeviceParam(struct pb_DeviceParam *);
/* Command line parameters for benchmarks */
struct pb_Parameters {
char *outFile; /* If not NULL, the raw output of the
* computation should be saved to this
* file. The string is owned. */
char **inpFiles; /* A NULL-terminated array of strings
* holding the input file(s) for the
* computation. The array and strings
* are owned. */
struct pb_PlatformParam *platform; /* If not NULL, the platform
* specified on the command line. */
struct pb_DeviceParam *device; /* If not NULL, the device
* specified on the command line. */
};
/* Read command-line parameters.
*
* The argc and argv parameters to main are read, and any parameters
* interpreted by this function are removed from the argument list.
*
* A new instance of struct pb_Parameters is returned.
* If there is an error, then an error message is printed on stderr
* and NULL is returned.
*/
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv);
/* Free an instance of struct pb_Parameters.
*/
void
pb_FreeParameters(struct pb_Parameters *p);
void
pb_FreeStringArray(char **);
/* Count the number of input files in a pb_Parameters instance.
*/
int
pb_Parameters_CountInputs(struct pb_Parameters *p);
/* A time or duration. */
//#if _POSIX_VERSION >= 200112L
typedef unsigned long long pb_Timestamp; /* time in microseconds */
//#else
//# error "Timestamps not implemented"
//#endif
enum pb_TimerState {
pb_Timer_STOPPED,
pb_Timer_RUNNING,
};
struct pb_Timer {
enum pb_TimerState state;
pb_Timestamp elapsed; /* Amount of time elapsed so far */
pb_Timestamp init; /* Beginning of the current time interval,
* if state is RUNNING. End of the last
* recorded time interfal otherwise. */
};
/* Reset a timer.
* Use this to initialize a timer or to clear
* its elapsed time. The reset timer is stopped.
*/
void
pb_ResetTimer(struct pb_Timer *timer);
/* Start a timer. The timer is set to RUNNING mode and
* time elapsed while the timer is running is added to
* the timer.
* The timer should not already be running.
*/
void
pb_StartTimer(struct pb_Timer *timer);
/* Stop a timer.
* This stops adding elapsed time to the timer.
* The timer should not already be stopped.
*/
void
pb_StopTimer(struct pb_Timer *timer);
/* Get the elapsed time in seconds. */
double
pb_GetElapsedTime(struct pb_Timer *timer);
/* Execution time is assigned to one of these categories. */
enum pb_TimerID {
pb_TimerID_NONE = 0,
pb_TimerID_IO, /* Time spent in input/output */
pb_TimerID_KERNEL, /* Time spent computing on the device,
* recorded asynchronously */
pb_TimerID_COPY, /* Time spent synchronously moving data
* to/from device and allocating/freeing
* memory on the device */
pb_TimerID_DRIVER, /* Time spent in the host interacting with the
* driver, primarily for recording the time
* spent queueing asynchronous operations */
pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
pb_TimerID_COMPUTE, /* Time for all program execution other
* than parsing command line arguments,
* I/O, kernel, and copy */
pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and
* host activity: automatically filled in,
* not intended for direct usage */
pb_TimerID_LAST /* Number of timer IDs */
};
/* Dynamic list of asynchronously tracked times between events */
struct pb_async_time_marker_list {
char *label; // actually just a pointer to a string
enum pb_TimerID timerID; /* The ID to which the interval beginning
* with this marker should be attributed */
void * marker;
//cudaEvent_t marker; /* The driver event for this marker */
struct pb_async_time_marker_list *next;
};
struct pb_SubTimer {
char *label;
struct pb_Timer timer;
struct pb_SubTimer *next;
};
struct pb_SubTimerList {
struct pb_SubTimer *current;
struct pb_SubTimer *subtimer_list;
};
/* A set of timers for recording execution times. */
struct pb_TimerSet {
enum pb_TimerID current;
struct pb_async_time_marker_list* async_markers;
pb_Timestamp async_begin;
pb_Timestamp wall_begin;
struct pb_Timer timers[pb_TimerID_LAST];
struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
};
/* Reset all timers in the set. */
void
pb_InitializeTimerSet(struct pb_TimerSet *timers);
void
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
/* Select which timer the next interval of time should be accounted
* to. The selected timer is started and other timers are stopped.
* Using pb_TimerID_NONE stops all timers. */
void
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
void
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
/* Print timer values to standard output. */
void
pb_PrintTimerSet(struct pb_TimerSet *timers);
/* Release timer resources */
void
pb_DestroyTimerSet(struct pb_TimerSet * timers);
void
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
typedef struct pb_Device_tag {
char* name;
void* clDevice;
int id;
unsigned int in_use;
unsigned int available;
} pb_Device;
struct pb_Context_tag;
typedef struct pb_Context_tag pb_Context;
typedef struct pb_Platform_tag {
char* name;
char* version;
void* clPlatform;
unsigned int in_use;
pb_Context** contexts;
pb_Device** devices;
} pb_Platform;
struct pb_Context_tag {
void* clPlatformId;
void* clContext;
void* clDeviceId;
pb_Platform* pb_platform;
pb_Device* pb_device;
};
// verbosely print out list of platforms and their devices to the console.
pb_Platform**
pb_GetPlatforms();
// Choose a platform according to the given platform specification
pb_Platform*
pb_GetPlatform(struct pb_PlatformParam *platform);
// choose a platform: by name, name & version
pb_Platform*
pb_GetPlatformByName(const char* name);
pb_Platform*
pb_GetPlatformByNameAndVersion(const char* name, const char* version);
// Choose a device according to the given device specification
pb_Device*
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
pb_Device**
pb_GetDevices(pb_Platform* pb_platform);
// choose a device by name.
pb_Device*
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
pb_Platform*
pb_GetPlatformByEnvVars();
pb_Context*
pb_InitOpenCLContext(struct pb_Parameters* parameters);
void
pb_ReleasePlatforms();
void
pb_ReleaseContext(pb_Context* c);
void
pb_PrintPlatformInfo(pb_Context* c);
void
perf_init();
//#define MEASURE_KERNEL_TIME
#include <CL/cl.h>
#ifdef MEASURE_KERNEL_TIME
#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
cl_int
pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* work_dim */,
const size_t * /* global_work_offset */,
const size_t * /* global_work_size */,
const size_t * /* local_work_size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */);
#endif
enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
void pb_sig_float(char*, float*, int);
void pb_sig_double(char*, double*, int);
void pb_sig_short(char*, short*, int);
void pb_sig_int(char*, int*, int);
void pb_sig_uchar(char*, unsigned char*, unsigned int);
void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
#ifdef __cplusplus
}
#endif
#endif //PARBOIL_HEADER

File diff suppressed because it is too large Load diff

View file

@ -1,139 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "atom.h"
#define LINELEN 96
#define INITLEN 20
Atoms *read_atom_file(const char *fname)
{
FILE *file;
char line[LINELEN];
Atom *atom; /* Atom array */
int len = INITLEN; /* Size of atom array */
int cnt = 0; /* Number of atoms read */
/* allocate initial atom array */
atom = (Atom *) malloc(len * sizeof(Atom));
if (NULL==atom) {
fprintf(stderr, "can't allocate memory\n");
return NULL;
}
int i;
for (i = 0; i < len; ++i) {
atom[i].x = i+0;
atom[i].y = i+1;
atom[i].z = i+2;
atom[i].q = 1;
}
#if 0
/* open atom "pqr" file */
file = fopen(fname, "r");
if (NULL==file) {
fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
return NULL;
}
/* loop to read pqr file line by line */
while (fgets(line, LINELEN, file) != NULL) {
if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
continue; /* skip anything that isn't an atom record */
}
if (cnt==len) { /* extend atom array */
void *tmp = realloc(atom, 2*len*sizeof(Atom));
if (NULL==tmp) {
fprintf(stderr, "can't allocate more memory\n");
return NULL;
}
atom = (Atom *) tmp;
len *= 2;
}
/* read position coordinates and charge from atom record */
if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
&(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
return NULL;
}
cnt++; /* count atoms as we store them */
}
/* verify EOF and close file */
if ( !feof(file) ) {
fprintf(stderr, "did not find EOF\n");
return NULL;
}
if (fclose(file)) {
fprintf(stderr, "can't close file\n");
return NULL;
}
#endif
/* Build the output data structure */
{
Atoms *out = (Atoms *)malloc(sizeof(Atoms));
if (NULL == out) {
fprintf(stderr, "can't allocate memory\n");
return NULL;
}
out->size = cnt;
out->atoms = atom;
return out;
}
}
void free_atom(Atoms *atom)
{
if (atom) {
free(atom->atoms);
free(atom);
}
}
void
get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
{
Atom *atoms = atom->atoms;
int natoms = atom->size;
Vec3 lo;
Vec3 hi;
int n;
hi.x = lo.x = atoms[0].x;
hi.y = lo.y = atoms[0].y;
hi.z = lo.z = atoms[0].z;
for (n = 1; n < natoms; n++) {
lo.x = fminf(lo.x, atoms[n].x);
hi.x = fmaxf(hi.x, atoms[n].x);
lo.y = fminf(lo.y, atoms[n].y);
hi.y = fmaxf(hi.y, atoms[n].y);
lo.z = fminf(lo.z, atoms[n].z);
hi.z = fmaxf(hi.z, atoms[n].z);
}
*out_lo = lo;
*out_hi = hi;
}

File diff suppressed because it is too large Load diff

7
tests/opencl/fft/.depend Normal file
View file

@ -0,0 +1,7 @@
main.o: main.cc /opt/pocl/runtime/include/CL/opencl.h \
/opt/pocl/runtime/include/CL/cl.h \
/opt/pocl/runtime/include/CL/cl_version.h \
/opt/pocl/runtime/include/CL/cl_platform.h \
/opt/pocl/runtime/include/CL/cl_gl.h \
/opt/pocl/runtime/include/CL/cl_gl_ext.h \
/opt/pocl/runtime/include/CL/cl_ext.h common.h

View file

@ -1,4 +1,4 @@
PROJECT = sgemm2
PROJECT = fft4
SRCS = main.cc

View file

@ -0,0 +1,3 @@
#pragma once
#define LOCAL_SIZE 16

BIN
tests/opencl/fft/fft4 Executable file

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,63 @@
#include "common.h"
__kernel void fft_radix4(__global float2* input, __global float2* output, const unsigned int N) {
int globalId = get_global_id(0);
int localId = get_local_id(0);
int groupId = get_group_id(0);
// Allocate local memory to store intermediate results and twiddle factors
__local float2 localData[LOCAL_SIZE];
__local float2 twiddleFactors[LOCAL_SIZE / 4];
// Calculate twiddle factors for this FFT stage and store in local memory
if (localId < LOCAL_SIZE / 4) {
float angle = -2 * M_PI * localId / LOCAL_SIZE;
twiddleFactors[localId] = (float2)(cos(angle), sin(angle));
}
barrier(CLK_LOCAL_MEM_FENCE);
// Calculate the offset for the data this work-group will process
int offset = groupId * LOCAL_SIZE;
// Load a chunk of input into local memory for faster access
if (globalId < N) {
localData[localId] = input[globalId];
}
barrier(CLK_LOCAL_MEM_FENCE);
// Perform the Radix-4 FFT on the data chunk in local memory
for (unsigned int stride = 1; stride < LOCAL_SIZE; stride *= 4) {
int twiddleIndex = (localId / stride) % 4;
float2 twiddle = twiddleFactors[twiddleIndex * (LOCAL_SIZE / (4 * stride))];
// Load data
float2 data0 = localData[localId];
float2 data1 = localData[localId + stride];
float2 data2 = localData[localId + 2 * stride];
float2 data3 = localData[localId + 3 * stride];
// Apply twiddle factors
data1 *= twiddle;
data2 *= twiddle * twiddle;
data3 *= twiddle * twiddle * twiddle;
// Radix-4 butterfly operations
float2 t0 = data0 + data2;
float2 t1 = data0 - data2;
float2 t2 = data1 + data3;
float2 t3 = (data1 - data3) * (float2)(0, -1);
// Store results
localData[localId] = t0 + t2;
localData[localId + stride] = t1 + t3;
localData[localId + 2 * stride] = t0 - t2;
localData[localId + 3 * stride] = t1 - t3;
barrier(CLK_LOCAL_MEM_FENCE);
}
// Write the results back to global memory
if (globalId < N) {
output[globalId] = localData[localId];
}
}

Binary file not shown.

View file

@ -7,12 +7,31 @@
#include <unistd.h>
#include <chrono>
#include <vector>
#include <cmath>
#include "common.h"
#define LOCAL_SIZE 16
#define KERNEL_NAME "fft_radix4"
#define FLOAT_ULP 6
#define KERNEL_NAME "sgemm2"
struct float2 {
float x;
float y;
float2(float real = 0.0f, float imag = 0.0f) : x(real), y(imag) {}
float2 operator+(const float2& other) const {
return {x + other.x, y + other.y};
}
float2 operator-(const float2& other) const {
return {x - other.x, y - other.y};
}
float2 operator*(const float2& other) const {
return {x * other.x - y * other.y, x * other.y + y * other.x};
}
};
#define CL_CHECK(_expr) \
do { \
@ -45,7 +64,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
@ -58,25 +76,33 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
return 0;
}
static bool compare_equal(float a, float b) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
auto d = std::abs(fa.i - fb.i);
return d <= FLOAT_ULP;
static std::vector<float2> referenceDFT(const std::vector<float2>& input) {
std::vector<float2> output(input.size());
for (unsigned int k = 0; k < input.size(); ++k) { // For each output element
output[k] = {0, 0}; // Initialize to zero
for (unsigned int n = 0; n < input.size(); ++n) { // For each input element
float angle = -2 * M_PI * k * n / input.size();
float2 twiddle = {cos(angle), sin(angle)};
output[k].x += input[n].x * twiddle.x - input[n].y * twiddle.y;
output[k].y += input[n].x * twiddle.y + input[n].y * twiddle.x;
}
}
return output;
}
static void matmul_cpu(float *C, float *A, float *B, int N) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
float sum = 0.0f;
for (int k = 0; k < N; k++) {
sum += A[i * N + k] * B[k * N + j];
}
C[i * N + j] = sum;
}
static int verifyOutput(const std::vector<float2>& output,
const std::vector<float2>& reference,
unsigned int N) {
int errors = 0;
for (unsigned int i = 0; i < N; ++i) {
float2 diff = {output[i].x - reference[i].x, output[i].y - reference[i].y};
float error = sqrt(diff.x * diff.x + diff.y * diff.y);
if (error > 1e-5) {
printf("*** error: [%d] expected=(%f,%f), actual=(%f,%f)\n", i, reference[i].x, reference[i].y, output[i].x, output[i].y);
++errors;
}
}
return errors;
}
cl_device_id device_id = NULL;
@ -84,24 +110,22 @@ cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_mem a_memobj = NULL;
cl_mem b_memobj = NULL;
cl_mem c_memobj = NULL;
cl_mem i_memobj = NULL;
cl_mem o_memobj = NULL;
uint8_t *kernel_bin = NULL;
static void cleanup() {
if (commandQueue) clReleaseCommandQueue(commandQueue);
if (kernel) clReleaseKernel(kernel);
if (program) clReleaseProgram(program);
if (a_memobj) clReleaseMemObject(a_memobj);
if (b_memobj) clReleaseMemObject(b_memobj);
if (c_memobj) clReleaseMemObject(c_memobj);
if (i_memobj) clReleaseMemObject(i_memobj);
if (o_memobj) clReleaseMemObject(o_memobj);
if (context) clReleaseContext(context);
if (device_id) clReleaseDevice(device_id);
if (kernel_bin) free(kernel_bin);
}
int size = 32;
int size = 64;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
@ -124,19 +148,13 @@ static void parse_args(int argc, char **argv) {
exit(-1);
}
}
printf("Workload size=%d\n", size);
}
int main (int argc, char **argv) {
// parse command arguments
parse_args(argc, argv);
uint32_t num_points = size * size;
printf("Matrix size=%d\n", size);
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
return -1;
}
cl_platform_id platform_id;
size_t kernel_size;
@ -148,15 +166,10 @@ int main (int argc, char **argv) {
printf("Create context\n");
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
char device_string[1024];
clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
printf("Using device: %s\n", device_string);
printf("Allocate device buffers\n");
size_t nbytes = num_points * sizeof(float);
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
size_t nbytes = size * sizeof(float2);
i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
@ -169,11 +182,7 @@ int main (int argc, char **argv) {
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
if (program == NULL) {
cleanup();
return -1;
}
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -181,61 +190,47 @@ int main (int argc, char **argv) {
// Create kernel
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
size_t global_size[2] = {size, size};
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
CL_CHECK(clSetKernelArg(kernel, 4, local_size[0]*local_size[1]*sizeof(float), NULL));
CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&i_memobj));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&o_memobj));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(int), (void *)&size));
// Allocate memories for input arrays and output arrays.
std::vector<float> h_a(num_points);
std::vector<float> h_b(num_points);
std::vector<float> h_c(num_points);
// Allocate memories for input arrays and output arrays.
std::vector<float2> h_i(size);
std::vector<float2> h_o(size);
// Generate input values
for (uint32_t i = 0; i < num_points; ++i) {
h_a[i] = static_cast<float>(rand()) / RAND_MAX;
h_b[i] = static_cast<float>(rand()) / RAND_MAX;
for (int i = 0; i < size; ++i) {
h_i[i].x = sin(2 * M_PI * i / size); // Sine wave as an example
h_i[i].y = 0.0f; // Zero imaginary part
}
// Creating command queue
commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
printf("Upload source buffers\n");
CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a.data(), 0, NULL, NULL));
CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b.data(), 0, NULL, NULL));
CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, nbytes, h_i.data(), 0, NULL, NULL));
printf("Execute the kernel\n");
size_t global_work_size[1] = {size};
size_t local_work_size[1] = {LOCAL_SIZE};
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
CL_CHECK(clFinish(commandQueue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
printf("Download destination buffer\n");
CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, nbytes, h_o.data(), 0, NULL, NULL));
printf("Verify result\n");
std::vector<float> ref_vec(num_points);
matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size);
int errors = 0;
for (uint32_t i = 0; i < num_points; ++i) {
if (!compare_equal(h_c[i], ref_vec[i])) {
if (errors < 100)
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
++errors;
}
}
if (errors != 0) {
printf("FAILED! - %d errors\n", errors);
} else {
std::vector<float2> reference = referenceDFT(h_i);
auto errors = verifyOutput(h_o, reference, size);
if (0 == errors) {
printf("PASSED!\n");
} else {
printf("FAILED! - %d errors\n", errors);
}
// Clean up

BIN
tests/opencl/fft/main.cc.o Normal file

Binary file not shown.

View file

@ -1,9 +0,0 @@
PROJECT = mri-q
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.cc computeQ.c
CXXFLAGS += -I.
OPTS ?=
include ../common.mk

View file

@ -1,617 +0,0 @@
#include <parboil.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*****************************************************************************/
/* Memory management routines */
/* Free an array of owned strings. */
void
pb_FreeStringArray(char **string_array)
{
char **p;
if (!string_array) return;
for (p = string_array; *p; p++) free(*p);
free(string_array);
}
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version)
{
if (name == NULL) {
fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
exit(-1);
}
struct pb_PlatformParam *ret =
(struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
ret->name = name;
ret->version = version;
return ret;
}
void
pb_FreePlatformParam(struct pb_PlatformParam *p)
{
if (p == NULL) return;
free(p->name);
free(p->version);
free(p);
}
struct pb_DeviceParam *
pb_DeviceParam_index(int index)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_INDEX;
ret->index = index;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_cpu(void)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_CPU;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_gpu(void)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_GPU;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_ACCELERATOR;
return ret;
}
struct pb_DeviceParam *
pb_DeviceParam_name(char *name)
{
struct pb_DeviceParam *ret =
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
ret->criterion = pb_Device_NAME;
ret->name = name;
return ret;
}
void
pb_FreeDeviceParam(struct pb_DeviceParam *p)
{
if (p == NULL) return;
switch(p->criterion) {
case pb_Device_NAME:
free(p->name);
break;
case pb_Device_INDEX:
case pb_Device_CPU:
case pb_Device_ACCELERATOR:
break;
default:
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
exit(-1);
}
}
void
pb_FreeParameters(struct pb_Parameters *p)
{
free(p->outFile);
pb_FreeStringArray(p->inpFiles);
pb_FreePlatformParam(p->platform);
pb_FreeDeviceParam(p->device);
free(p);
}
/*****************************************************************************/
/* Parse a comma-delimited list of strings into an
* array of strings. */
static char **
read_string_array(char *in)
{
char **ret;
int i;
int count; /* Number of items in the input */
char *substring; /* Current substring within 'in' */
/* Count the number of items in the string */
count = 1;
for (i = 0; in[i]; i++) if (in[i] == ',') count++;
/* Allocate storage */
ret = (char **)malloc((count + 1) * sizeof(char *));
/* Create copies of the strings from the list */
substring = in;
for (i = 0; i < count; i++) {
char *substring_end;
int substring_length;
/* Find length of substring */
for (substring_end = substring;
(*substring_end != ',') && (*substring_end != 0);
substring_end++);
substring_length = substring_end - substring;
/* Allocate memory and copy the substring */
ret[i] = (char *)malloc(substring_length + 1);
memcpy(ret[i], substring, substring_length);
ret[i][substring_length] = 0;
/* go to next substring */
substring = substring_end + 1;
}
ret[i] = NULL; /* Write the sentinel value */
return ret;
}
static void
report_parse_error(const char *str)
{
fputs(str, stderr);
}
/* Interpret a string as a 'pb_DeviceParam' value.
* Return a pointer to a new value, or NULL on failure.
*/
static struct pb_DeviceParam *
read_device_param(char *str)
{
/* Try different ways of interpreting 'device_string' until one works */
/* If argument is an integer, then interpret it as a device index */
errno = 0;
char *end;
long device_int = strtol(str, &end, 10);
if (!errno) {
/* Negative numbers are not valid */
if (device_int < 0 || device_int > INT_MAX) return NULL;
return pb_DeviceParam_index(device_int);
}
/* Match against predefined strings */
if (strcmp(str, "CPU") == 0)
return pb_DeviceParam_cpu();
if (strcmp(str, "GPU") == 0)
return pb_DeviceParam_gpu();
if (strcmp(str, "ACCELERATOR") == 0)
return pb_DeviceParam_accelerator();
/* Assume any other string is a device name */
return pb_DeviceParam_name(strdup(str));
}
/* Interpret a string as a 'pb_PlatformParam' value.
* Return a pointer to a new value, or NULL on failure.
*/
static struct pb_PlatformParam *
read_platform_param(char *str)
{
int separator_index; /* Index of the '-' character separating
* name and version number. It's -1 if
* there's no '-' character. */
/* Find the last occurrence of '-' in 'str' */
{
char *cur;
separator_index = -1;
for (cur = str; *cur; cur++) {
if (*cur == '-') separator_index = cur - str;
}
}
/* The platform name is either the entire string, or all characters before
* the separator */
int name_length = separator_index == -1 ? strlen(str) : separator_index;
char *name_str = (char *)malloc(name_length + 1);
memcpy(name_str, str, name_length);
name_str[name_length] = 0;
/* The version is either NULL, or all characters after the separator */
char *version_str;
if (separator_index == -1) {
version_str = NULL;
}
else {
const char *version_input_str = str + separator_index + 1;
int version_length = strlen(version_input_str);
version_str = (char *)malloc(version_length + 1);
memcpy(version_str, version_input_str, version_length);
version_str[version_length] = 0;
}
/* Create output structure */
return pb_PlatformParam(name_str, version_str);
}
/****************************************************************************/
/* Argument parsing state */
/* Argument parsing state.
*
* Arguments that are interpreted by the argument parser are removed from
* the list. Variables 'argc' and 'argn' do not count arguments that have
* been removed.
*
* During argument parsing, the array of arguments is compacted, overwriting
* the erased arguments. Variable 'argv_put' points to the array element
* where the next argument will be written. Variable 'argv_get' points to
* the array element where the next argument will be read from.
*/
struct argparse {
int argc; /* Number of arguments. Mutable. */
int argn; /* Current argument index. */
char **argv_get; /* Argument value being read. */
char **argv_put; /* Argument value being written.
* argv_put <= argv_get. */
};
static void
initialize_argparse(struct argparse *ap, int argc, char **argv)
{
ap->argc = argc;
ap->argn = 0;
ap->argv_get = ap->argv_put = argv;
}
/* Finish argument parsing, without processing the remaining arguments.
* Write new argument count into _argc. */
static void
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
{
/* Move the remaining arguments */
for(; ap->argn < ap->argc; ap->argn++)
*ap->argv_put++ = *ap->argv_get++;
/* Update the argument count */
*_argc = ap->argc;
/* Insert a terminating NULL */
argv[ap->argc] = NULL;
}
/* Delete the current argument. The argument will not be visible
* when argument parsing is done. */
static void
delete_argument(struct argparse *ap)
{
if (ap->argn >= ap->argc) {
fprintf(stderr, "delete_argument\n");
}
ap->argc--;
ap->argv_get++;
}
/* Go to the next argument. Also, move the current argument to its
* final location in argv. */
static void
next_argument(struct argparse *ap)
{
if (ap->argn >= ap->argc) {
fprintf(stderr, "next_argument\n");
}
/* Move argument to its new location. */
*ap->argv_put++ = *ap->argv_get++;
ap->argn++;
}
static int
is_end_of_arguments(struct argparse *ap)
{
return ap->argn == ap->argc;
}
/* Get the current argument */
static char *
get_argument(struct argparse *ap)
{
return *ap->argv_get;
}
/* Get the current argument, and also delete it */
static char *
consume_argument(struct argparse *ap)
{
char *ret = get_argument(ap);
delete_argument(ap);
return ret;
}
/****************************************************************************/
/* The result of parsing a command-line argument */
typedef enum {
ARGPARSE_OK, /* Success */
ARGPARSE_ERROR, /* Error */
ARGPARSE_DONE /* Success, and do not continue parsing */
} result;
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
/* A command-line option */
struct option {
char short_name; /* If not 0, the one-character
* name of this option */
const char *long_name; /* If not NULL, the long name of this option */
parse_action *action; /* What to do when this option occurs.
* Sentinel value is NULL.
*/
};
/* Output file
*
* -o FILE
*/
static result
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
{
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting file name after '-o'\n");
return ARGPARSE_ERROR;
}
/* Replace the output file name */
free(params->outFile);
params->outFile = strdup(consume_argument(ap));
return ARGPARSE_OK;
}
/* Input files
*
* -i FILE,FILE,...
*/
static result
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
{
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting file name after '-i'\n");
return ARGPARSE_ERROR;
}
/* Replace the input file list */
pb_FreeStringArray(params->inpFiles);
params->inpFiles = read_string_array(consume_argument(ap));
return ARGPARSE_OK;
}
/* End of options
*
* --
*/
static result
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
{
return ARGPARSE_DONE;
}
/* OpenCL device
*
* --device X
*/
static result
parse_device(struct argparse *ap, struct pb_Parameters *params)
{
/* Read the next argument, which specifies a device */
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting device specification after '--device'\n");
return ARGPARSE_ERROR;
}
char *device_string = consume_argument(ap);
struct pb_DeviceParam *device_param = read_device_param(device_string);
if (!device_param) {
report_parse_error("Unrecognized device specification format on command line\n");
return ARGPARSE_ERROR;
}
/* Save the result */
pb_FreeDeviceParam(params->device);
params->device = device_param;
return ARGPARSE_OK;
}
static result
parse_platform(struct argparse *ap, struct pb_Parameters *params)
{
/* Read the next argument, which specifies a platform */
if (is_end_of_arguments(ap))
{
report_parse_error("Expecting device specification after '--platform'\n");
return ARGPARSE_ERROR;
}
char *platform_string = consume_argument(ap);
struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
if (!platform_param) {
report_parse_error("Unrecognized platform specification format on command line\n");
return ARGPARSE_ERROR;
}
/* Save the result */
pb_FreePlatformParam(params->platform);
params->platform = platform_param;
return ARGPARSE_OK;
}
static struct option options[] = {
{ 'o', NULL, &parse_output_file },
{ 'i', NULL, &parse_input_files },
{ '-', NULL, &parse_end_options },
{ 0, "device", &parse_device },
{ 0, "platform", &parse_platform },
{ 0, NULL, NULL }
};
static int
is_last_option(struct option *op)
{
return op->action == NULL;
}
/****************************************************************************/
/* Parse command-line parameters.
* Return zero on error, nonzero otherwise.
* On error, the other outputs may be invalid.
*
* The information collected from parameters is used to update
* 'ret'. 'ret' should be initialized.
*
* '_argc' and 'argv' are updated to contain only the unprocessed arguments.
*/
static int
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
{
char *err_message;
struct argparse ap;
/* Each argument */
initialize_argparse(&ap, *_argc, argv);
while(!is_end_of_arguments(&ap)) {
result arg_result; /* Result of parsing this option */
char *arg = get_argument(&ap);
/* Process this argument */
if (arg[0] == '-') {
/* Single-character flag */
if ((arg[1] != 0) && (arg[2] == 0)) {
delete_argument(&ap); /* This argument is consumed here */
/* Find a matching short option */
struct option *op;
for (op = options; !is_last_option(op); op++) {
if (op->short_name == arg[1]) {
arg_result = (*op->action)(&ap, ret);
goto option_was_processed;
}
}
/* No option matches */
report_parse_error("Unexpected command-line parameter\n");
arg_result = ARGPARSE_ERROR;
goto option_was_processed;
}
/* Long flag */
if (arg[1] == '-') {
delete_argument(&ap); /* This argument is consumed here */
/* Find a matching long option */
struct option *op;
for (op = options; !is_last_option(op); op++) {
if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
arg_result = (*op->action)(&ap, ret);
goto option_was_processed;
}
}
/* No option matches */
report_parse_error("Unexpected command-line parameter\n");
arg_result = ARGPARSE_ERROR;
goto option_was_processed;
}
}
else {
/* Other arguments are ignored */
next_argument(&ap);
arg_result = ARGPARSE_OK;
goto option_was_processed;
}
option_was_processed:
/* Decide what to do next based on 'arg_result' */
switch(arg_result) {
case ARGPARSE_OK:
/* Continue processing */
break;
case ARGPARSE_ERROR:
/* Error exit from the function */
return 0;
case ARGPARSE_DONE:
/* Normal exit from the argument parsing loop */
goto end_of_options;
}
} /* end for each argument */
/* If all arguments were processed, then normal exit from the loop */
end_of_options:
finalize_argparse(&ap, _argc, argv);
return 1;
}
/*****************************************************************************/
/* Other exported functions */
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv)
{
struct pb_Parameters *ret =
(struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
/* Initialize the parameters structure */
ret->outFile = NULL;
ret->inpFiles = (char **)malloc(sizeof(char *));
ret->inpFiles[0] = NULL;
ret->platform = NULL;
ret->device = NULL;
/* Read parameters and update _argc, argv */
if (!pb_ParseParameters(ret, _argc, argv)) {
/* Parse error */
pb_FreeParameters(ret);
return NULL;
}
return ret;
}
int
pb_Parameters_CountInputs(struct pb_Parameters *p)
{
int n;
for (n = 0; p->inpFiles[n]; n++);
return n;
}

View file

@ -1,118 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2007 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <malloc.h>
#include <CL/cl.h>
#include "ocl.h"
#include "macros.h"
#include "computeQ.h"
#include "parboil.h"
#define NC 1
void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm)
{
int phiMagBlocks = numK / KERNEL_PHI_MAG_THREADS_PER_BLOCK;
if (numK % KERNEL_PHI_MAG_THREADS_PER_BLOCK)
phiMagBlocks++;
size_t DimPhiMagBlock = KERNEL_PHI_MAG_THREADS_PER_BLOCK;
size_t DimPhiMagGrid = phiMagBlocks*KERNEL_PHI_MAG_THREADS_PER_BLOCK;
cl_int clStatus;
clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(cl_mem),&phiR_d);
clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(cl_mem),&phiI_d);
clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&phiMag_d);
clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(int),&numK);
CHECK_ERROR("clSetKernelArg")
clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimPhiMagGrid,&DimPhiMagBlock,0,NULL,NULL);
CHECK_ERROR("clEnqueueNDRangeKernel")
}
static
unsigned long long int
readElapsedTime(cl_event internal)
{
cl_int status;
cl_ulong t_begin, t_end;
status = clGetEventProfilingInfo(internal, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong), &t_begin, NULL);
if (status != CL_SUCCESS) return 0;
status = clGetEventProfilingInfo(internal, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &t_end, NULL);
if (status != CL_SUCCESS) return 0;
return (unsigned long long int)(t_end - t_begin);
}
void computeQ_GPU (int numK,int numX,
cl_mem x_d, cl_mem y_d, cl_mem z_d,
struct kValues* kVals,
cl_mem Qr_d, cl_mem Qi_d,
clPrmtr* clPrm)
{
int QGrids = numK / KERNEL_Q_K_ELEMS_PER_GRID;
if (numK % KERNEL_Q_K_ELEMS_PER_GRID)
QGrids++;
int QBlocks = numX / KERNEL_Q_THREADS_PER_BLOCK;
if (numX % KERNEL_Q_THREADS_PER_BLOCK)
QBlocks++;
size_t DimQBlock = KERNEL_Q_THREADS_PER_BLOCK/NC;
size_t DimQGrid = QBlocks*KERNEL_Q_THREADS_PER_BLOCK/NC;
cl_int clStatus;
cl_mem ck;
ck = clCreateBuffer(clPrm->clContext,CL_MEM_READ_WRITE,KERNEL_Q_K_ELEMS_PER_GRID*sizeof(struct kValues),NULL,&clStatus);
int QGrid;
for (QGrid = 0; QGrid < QGrids; QGrid++) {
// Put the tile of K values into constant mem
int QGridBase = QGrid * KERNEL_Q_K_ELEMS_PER_GRID;
struct kValues* kValsTile = kVals + QGridBase;
int numElems = MIN(KERNEL_Q_K_ELEMS_PER_GRID, numK - QGridBase);
clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,ck,CL_TRUE,0,numElems*sizeof(struct kValues),kValsTile,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
clStatus = clSetKernelArg(clPrm->clKernel,0,sizeof(int),&numK);
clStatus = clSetKernelArg(clPrm->clKernel,1,sizeof(int),&QGridBase);
clStatus = clSetKernelArg(clPrm->clKernel,2,sizeof(cl_mem),&x_d);
clStatus = clSetKernelArg(clPrm->clKernel,3,sizeof(cl_mem),&y_d);
clStatus = clSetKernelArg(clPrm->clKernel,4,sizeof(cl_mem),&z_d);
clStatus = clSetKernelArg(clPrm->clKernel,5,sizeof(cl_mem),&Qr_d);
clStatus = clSetKernelArg(clPrm->clKernel,6,sizeof(cl_mem),&Qi_d);
clStatus = clSetKernelArg(clPrm->clKernel,7,sizeof(cl_mem),&ck);
CHECK_ERROR("clSetKernelArg")
printf ("Grid: %d, Block: %d\n", DimQGrid, DimQBlock);
#define TIMED_EXECUTION
#ifdef TIMED_EXECUTION
cl_event e;
clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,&e);
CHECK_ERROR("clEnqueueNDRangeKernel")
clWaitForEvents(1, &e);
printf ("%llu\n", readElapsedTime(e));
#else
clStatus = clEnqueueNDRangeKernel(clPrm->clCommandQueue,clPrm->clKernel,1,NULL,&DimQGrid,&DimQBlock,0,NULL,NULL);
CHECK_ERROR("clEnqueueNDRangeKernel")
#endif
}
}
void createDataStructsCPU(int numK, int numX, float** phiMag,
float** Qr, float** Qi)
{
*phiMag = (float* ) memalign(16, numK * sizeof(float));
*Qr = (float*) memalign(16, numX * sizeof (float));
*Qi = (float*) memalign(16, numX * sizeof (float));
}

View file

@ -1,22 +0,0 @@
#ifndef __COMPUTEQ__
#define __COMPUTEQ__
#ifdef __cplusplus
extern "C" {
#endif
void computePhiMag_GPU(int numK,cl_mem phiR_d,cl_mem phiI_d,cl_mem phiMag_d,clPrmtr* clPrm);
void computeQ_GPU (int numK,int numX,
cl_mem x_d, cl_mem y_d, cl_mem z_d,
struct kValues* kVals,
cl_mem Qr_d, cl_mem Qi_d,
clPrmtr* clPrm);
void createDataStructsCPU(int numK, int numX, float** phiMag,
float** Qr, float** Qi);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -1,78 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2007 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
//#include <endian.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <inttypes.h>
#include "file.h"
#if __BYTE_ORDER != __LITTLE_ENDIAN
# error "File I/O is not implemented for this system: wrong endianness."
#endif
extern "C"
void inputData(char* fName, int* _numK, int* _numX,
float** kx, float** ky, float** kz,
float** x, float** y, float** z,
float** phiR, float** phiI)
{
int numK, numX;
FILE* fid = fopen(fName, "r");
if (fid == NULL)
{
fprintf(stderr, "Cannot open input file\n");
exit(-1);
}
fread (&numK, sizeof (int), 1, fid);
*_numK = numK;
fread (&numX, sizeof (int), 1, fid);
*_numX = numX;
*kx = (float *) memalign(16, numK * sizeof (float));
fread (*kx, sizeof (float), numK, fid);
*ky = (float *) memalign(16, numK * sizeof (float));
fread (*ky, sizeof (float), numK, fid);
*kz = (float *) memalign(16, numK * sizeof (float));
fread (*kz, sizeof (float), numK, fid);
*x = (float *) memalign(16, numX * sizeof (float));
fread (*x, sizeof (float), numX, fid);
*y = (float *) memalign(16, numX * sizeof (float));
fread (*y, sizeof (float), numX, fid);
*z = (float *) memalign(16, numX * sizeof (float));
fread (*z, sizeof (float), numX, fid);
*phiR = (float *) memalign(16, numK * sizeof (float));
fread (*phiR, sizeof (float), numK, fid);
*phiI = (float *) memalign(16, numK * sizeof (float));
fread (*phiI, sizeof (float), numK, fid);
fclose (fid);
}
extern "C"
void outputData(char* fName, float* outR, float* outI, int numX)
{
FILE* fid = fopen(fName, "w");
uint32_t tmp32;
if (fid == NULL)
{
fprintf(stderr, "Cannot open output file\n");
exit(-1);
}
/* Write the data size */
tmp32 = numX;
fwrite(&tmp32, sizeof(uint32_t), 1, fid);
/* Write the reconstructed data */
fwrite (outR, sizeof (float), numX, fid);
fwrite (outI, sizeof (float), numX, fid);
fclose (fid);
}

View file

@ -1,22 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2007 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifdef __cplusplus
extern "C" {
#endif
void inputData(char* fName, int* _numK, int* _numX,
float** kx, float** ky, float** kz,
float** x, float** y, float** z,
float** phiR, float** phiI);
void outputData(char* fName, float* outR, float* outI, int numX);
#ifdef __cplusplus
}
#endif

View file

@ -1,55 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
//#include <endian.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <inttypes.h>
#include "gpu_info.h"
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
int pad,
int major,
int minor,
int sm)
{
int max_thread;
int max_block=8;
if(major==1)
{
if(minor>=2)
max_thread=1024;
else
max_thread=768;
}
else if(major==2)
max_thread=1536;
else
//newer GPU //keep using 2.0
max_thread=1536;
int _grid;
int _thread;
if(task*pad>sm*max_thread)
{
_thread=max_thread/max_block;
_grid = ((task*pad+_thread-1)/_thread)*_thread;
}
else
{
_thread=pad;
_grid=task*pad;
}
thread[0]=_thread;
grid[0]=_grid;
}

View file

@ -1,28 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef __GPUINFOH__
#define __GPUINFOH__
#ifdef __cplusplus
extern "C" {
#endif
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
int pad,
int major,
int minor,
int sm);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -1,51 +0,0 @@
#include "macros.h"
__kernel void
ComputePhiMag_GPU(__global float* phiR, __global float* phiI, __global float* phiMag, int numK) {
int indexK = get_global_id(0);
float real = indexK;
float imag = indexK;
if (indexK < numK) {
/*float*/ real = phiR[indexK];
/*float*/ imag = phiI[indexK];
phiMag[indexK] = real*real + imag*imag;
}
}
__kernel void
ComputeQ_GPU(int numK, int kGlobalIndex,
__global float* x, __global float* y, __global float* z,
__global float* Qr, __global float* Qi, __global struct kValues* ck)
{
float sX;
float sY;
float sZ;
float sQr;
float sQi;
// Determine the element of the X arrays computed by this thread
int xIndex = get_group_id(0)*KERNEL_Q_THREADS_PER_BLOCK + get_local_id(0);
// Read block's X values from global mem to shared mem
sX = x[xIndex];
sY = y[xIndex];
sZ = z[xIndex];
sQr = Qr[xIndex];
sQi = Qi[xIndex];
int kIndex = 0;
for (; (kIndex < KERNEL_Q_K_ELEMS_PER_GRID); kIndex++) {
if (kGlobalIndex < numK) {
float expArg;
expArg = PIx2 * (ck[kIndex].Kx * sX +
ck[kIndex].Ky * sY +
ck[kIndex].Kz * sZ);
sQr = sQr + ck[kIndex].PhiMag * cos(expArg); // native_cos(expArg);
sQi = sQi + ck[kIndex].PhiMag * sin(expArg); // native_sin(expArg);
}
kGlobalIndex++;
}
Qr[xIndex] = sQr;
Qi[xIndex] = sQi;
}

View file

@ -1,21 +0,0 @@
#ifndef __MACROS__
#define __MACROS__
#define PI 3.1415926535897932384626433832795029f
#define PIx2 6.2831853071795864769252867665590058f
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define K_ELEMS_PER_GRID 2048
#define KERNEL_PHI_MAG_THREADS_PER_BLOCK 256
#define KERNEL_Q_THREADS_PER_BLOCK 256
#define KERNEL_Q_K_ELEMS_PER_GRID 1024
struct kValues {
float Kx;
float Ky;
float Kz;
float PhiMag;
};
#endif

View file

@ -1,321 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2007 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*
* C code for creating the Q data structure for fast convolution-based
* Hessian multiplication for arbitrary k-space trajectories.
*
* Inputs:
* kx - VECTOR of kx values, same length as ky and kz
* ky - VECTOR of ky values, same length as kx and kz
* kz - VECTOR of kz values, same length as kx and ky
* x - VECTOR of x values, same length as y and z
* y - VECTOR of y values, same length as x and z
* z - VECTOR of z values, same length as x and y
* phi - VECTOR of the Fourier transform of the spatial basis
* function, evaluated at [kx, ky, kz]. Same length as kx, ky, and kz.
*
* recommended g++ options:
* -O3 -lm -ffast-math -funroll-all-loops
*/
#include <stdio.h>
#include <sys/time.h>
#include <parboil.h>
#include <CL/cl.h>
#include "ocl.h"
#include "file.h"
#include "macros.h"
#include "computeQ.h"
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return CL_INVALID_VALUE;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return CL_INVALID_VALUE;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return CL_SUCCESS;
}
static void
setupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr,clPrmtr* clPrm)
{
cl_int clStatus;
*dev_ptr = clCreateBuffer(clPrm->clContext,CL_MEM_READ_ONLY,num*size,NULL,&clStatus);
CHECK_ERROR("clCreateBuffer");
clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
CHECK_ERROR("clEnequeueWriteBuffer");
}
static void
cleanupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr, clPrmtr* clPrm)
{
cl_int clStatus;
clStatus = clEnqueueReadBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
CHECK_ERROR("clEnqueueReadBuffer")
clStatus = clReleaseMemObject(*dev_ptr);
CHECK_ERROR("clReleaseMemObject")
}
int
main (int argc, char *argv[]) {
int numX, numK; /* Number of X and K values */
int original_numK; /* Number of K values in input file */
float *kx, *ky, *kz; /* K trajectory (3D vectors) */
float *x, *y, *z; /* X coordinates (3D vectors) */
float *phiR, *phiI; /* Phi values (complex) */
float *phiMag; /* Magnitude of Phi */
float *Qr, *Qi; /* Q signal (complex) */
struct kValues* kVals;
struct pb_Parameters *params;
struct pb_TimerSet timers;
pb_InitializeTimerSet(&timers);
/* Read command line */
params = pb_ReadParameters(&argc, argv);
params->inpFiles = (char **)malloc(sizeof(char *) * 2);
params->inpFiles[0] = (char *)malloc(100);
params->inpFiles[1] = NULL;
strncpy(params->inpFiles[0], "32_32_32_dataset.bin", 100);
if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
{
fprintf(stderr, "Expecting one input filename\n");
exit(-1);
}
/* Read in data */
pb_SwitchToTimer(&timers, pb_TimerID_IO);
inputData(params->inpFiles[0],
&original_numK, &numX,
&kx, &ky, &kz,
&x, &y, &z,
&phiR, &phiI);
/* Reduce the number of k-space samples if a number is given
* on the command line */
if (argc < 2)
numK = original_numK;
else
{
int inputK;
char *end;
inputK = strtol(argv[1], &end, 10);
if (end == argv[1])
{
fprintf(stderr, "Expecting an integer parameter\n");
exit(-1);
}
numK = MIN(inputK, original_numK);
}
printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
numX, original_numK, numK);
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
clPrmtr clPrm;
pb_Context* pb_context;
pb_context = pb_InitOpenCLContext(params);
if (pb_context == NULL) {
fprintf (stderr, "Error: No OpenCL platform/device can be found.");
return -1;
}
cl_int clStatus;
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
clPrm.clContext = (cl_context) pb_context->clContext;
clPrm.clCommandQueue = clCreateCommandQueue(clPrm.clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
CHECK_ERROR("clCreateCommandQueue")
pb_SetOpenCL(&(clPrm.clContext), &(clPrm.clCommandQueue));
#ifdef HOSTGPU
const char* clSource[] = {readFile("kernel.cl")};
CHECK_ERROR("clCreateProgramWithSource")
cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
#else
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
CHECK_ERROR("read_kernel_file")
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
CHECK_ERROR("clCreateProgramWithSource")
cl_program clProgram = clCreateProgramWithBinary(
clPrm.clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
#endif
char options[50];
sprintf(options,"-I src/opencl_nvidia");
clStatus = clBuildProgram(clProgram,0,NULL,options,NULL,NULL);
if (clStatus != CL_SUCCESS) {
char buf[4096];
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 4096, buf, NULL);
printf ("%s\n", buf);
CHECK_ERROR("clBuildProgram")
}
/* Create CPU data structures */
createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);
/* GPU section 1 (precompute PhiMag) */
{
clPrm.clKernel = clCreateKernel(clProgram,"ComputePhiMag_GPU",&clStatus);
CHECK_ERROR("clCreateKernel")
/* Mirror several data structures on the device */
cl_mem phiR_d;
cl_mem phiI_d;
cl_mem phiMag_d;
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
setupMemoryGPU(numK,sizeof(float),&phiR_d,phiR,&clPrm);
setupMemoryGPU(numK,sizeof(float),&phiI_d,phiI,&clPrm);
phiMag_d = clCreateBuffer(clPrm.clContext,CL_MEM_WRITE_ONLY,numK*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clFinish(clPrm.clCommandQueue);
CHECK_ERROR("clFinish")
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
computePhiMag_GPU(numK, phiR_d, phiI_d, phiMag_d, &clPrm);
clStatus = clFinish(clPrm.clCommandQueue);
CHECK_ERROR("clFinish")
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
cleanupMemoryGPU(numK,sizeof(float),&phiMag_d,phiMag,&clPrm);
clStatus = clReleaseMemObject(phiR_d);
CHECK_ERROR("clReleaseMemObject")
clStatus = clReleaseMemObject(phiI_d);
CHECK_ERROR("clReleaseMemObject")
}
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));
int k;
for (k = 0; k < numK; k++) {
kVals[k].Kx = kx[k];
kVals[k].Ky = ky[k];
kVals[k].Kz = kz[k];
kVals[k].PhiMag = phiMag[k];
}
free(phiMag);
clStatus = clReleaseKernel(clPrm.clKernel);
/* GPU section 2 */
{
clPrm.clKernel = clCreateKernel(clProgram,"ComputeQ_GPU",&clStatus);
CHECK_ERROR("clCreateKernel")
cl_mem x_d;
cl_mem y_d;
cl_mem z_d;
cl_mem Qr_d;
cl_mem Qi_d;
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
setupMemoryGPU(numX,sizeof(float),&x_d,x,&clPrm);
setupMemoryGPU(numX,sizeof(float),&y_d,y,&clPrm);
setupMemoryGPU(numX,sizeof(float),&z_d,z,&clPrm);
Qr_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clMemSet(&clPrm,Qr_d,0,numX*sizeof(float));
Qi_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clMemSet(&clPrm,Qi_d,0,numX*sizeof(float));
clStatus = clFinish(clPrm.clCommandQueue);
CHECK_ERROR("clFinish")
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
computeQ_GPU(numK, numX, x_d, y_d, z_d, kVals, Qr_d, Qi_d, &clPrm);
clStatus = clFinish(clPrm.clCommandQueue);
CHECK_ERROR("clFinish")
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
clStatus = clReleaseMemObject(x_d);
CHECK_ERROR("clReleaseMemObject")
clStatus = clReleaseMemObject(y_d);
CHECK_ERROR("clReleaseMemObject")
clStatus = clReleaseMemObject(z_d);
CHECK_ERROR("clReleaseMemObject")
cleanupMemoryGPU(numX,sizeof(float),&Qr_d,Qr,&clPrm);
cleanupMemoryGPU(numX,sizeof(float),&Qi_d,Qi,&clPrm);
}
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
if (params->outFile)
{
/* Write Q to file */
pb_SwitchToTimer(&timers, pb_TimerID_IO);
outputData(params->outFile, Qr, Qi, numX);
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
}
free (kx);
free (ky);
free (kz);
free (x);
free (y);
free (z);
free (phiR);
free (phiI);
free (kVals);
free (Qr);
free (Qi);
//free((void*)clSource[0]);
clStatus = clReleaseKernel(clPrm.clKernel);
clStatus = clReleaseProgram(clProgram);
clStatus = clReleaseCommandQueue(clPrm.clCommandQueue);
clStatus = clReleaseContext(clPrm.clContext);
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
pb_FreeParameters(params);
return 0;
}

View file

@ -1,50 +0,0 @@
#include <CL/cl.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "ocl.h"
char* readFile(const char* fileName)
{
FILE* fp;
fp = fopen(fileName,"r");
if(fp == NULL)
{
printf("Error 1!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
long size = ftell(fp);
rewind(fp);
char* buffer = (char*)malloc(sizeof(char)*(size+1));
if(buffer == NULL)
{
printf("Error 2!\n");
fclose(fp);
exit(1);
}
size_t res = fread(buffer,1,size,fp);
if(res != size)
{
printf("Error 3!\n");
fclose(fp);
exit(1);
}
buffer[size] = 0;
fclose(fp);
return buffer;
}
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
{
cl_int clStatus;
char* temp = (char*)malloc(size);
memset(temp,val,size);
clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
free(temp);
}

View file

@ -1,21 +0,0 @@
#ifndef __OCLH__
#define __OCLH__
typedef struct {
cl_uint major;
cl_uint minor;
cl_uint multiProcessorCount;
} OpenCLDeviceProp;
void clMemSet(cl_command_queue, cl_mem, int, size_t);
char* readFile(const char*);
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
#endif

View file

@ -1,50 +0,0 @@
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include "ocl.h"
#include <parboil.h>
char* readFile(const char* fileName)
{
FILE* fp;
fp = fopen(fileName,"r");
if(fp == NULL)
{
printf("Error 1!\n");
exit(1);
}
fseek(fp,0,SEEK_END);
long size = ftell(fp);
rewind(fp);
char* buffer = (char*)malloc(sizeof(char)*(size+1));
if(buffer == NULL)
{
printf("Error 2!\n");
fclose(fp);
exit(1);
}
size_t res = fread(buffer,1,size,fp);
if(res != size)
{
printf("Error 3!\n");
fclose(fp);
exit(1);
}
buffer[size] = 0;
fclose(fp);
return buffer;
}
void clMemSet(clPrmtr* clPrm, cl_mem buf, int val, size_t size)
{
cl_int clStatus;
char* temp = (char*)malloc(size);
memset(temp,val,size);
clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
free(temp);
}

View file

@ -1,31 +0,0 @@
#ifndef __OCLH__
#define __OCLH__
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef struct {
cl_context clContext;
cl_command_queue clCommandQueue;
cl_kernel clKernel;
} clPrmtr;
void clMemSet(clPrmtr*, cl_mem, int, size_t);
char* readFile(const char*);
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
#ifdef __cplusplus
}
#endif
#endif

View file

@ -1,348 +0,0 @@
/*
* (c) 2010 The Board of Trustees of the University of Illinois.
*/
#ifndef PARBOIL_HEADER
#define PARBOIL_HEADER
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
#include <unistd.h>
/* A platform as specified by the user on the command line */
struct pb_PlatformParam {
char *name; /* The platform name. This string is owned. */
char *version; /* The platform version; may be NULL.
* This string is owned. */
};
/* Create a PlatformParam from the given strings.
* 'name' must not be NULL. 'version' may be NULL.
* If not NULL, the strings should have been allocated by malloc(),
* and they will be owned by the returned object.
*/
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version);
void
pb_FreePlatformParam(struct pb_PlatformParam *);
/* A criterion for how to select a device */
enum pb_DeviceSelectionCriterion {
pb_Device_INDEX, /* Enumerate the devices and select one
* by its number */
pb_Device_CPU, /* Select a CPU device */
pb_Device_GPU, /* Select a GPU device */
pb_Device_ACCELERATOR, /* Select an accelerator device */
pb_Device_NAME /* Select a device by name */
};
/* A device as specified by the user on the command line */
struct pb_DeviceParam {
enum pb_DeviceSelectionCriterion criterion;
union {
int index; /* If criterion == pb_Device_INDEX,
* the index of the device */
char *name; /* If criterion == pb_Device_NAME,
* the name of the device.
* This string is owned. */
};
};
struct pb_DeviceParam *
pb_DeviceParam_index(int index);
struct pb_DeviceParam *
pb_DeviceParam_cpu(void);
struct pb_DeviceParam *
pb_DeviceParam_gpu(void);
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void);
/* Create a by-name device selection criterion.
* The string should have been allocated by malloc(), and it will will be
* owned by the returned object.
*/
struct pb_DeviceParam *
pb_DeviceParam_name(char *name);
void
pb_FreeDeviceParam(struct pb_DeviceParam *);
/* Command line parameters for benchmarks */
struct pb_Parameters {
char *outFile; /* If not NULL, the raw output of the
* computation should be saved to this
* file. The string is owned. */
char **inpFiles; /* A NULL-terminated array of strings
* holding the input file(s) for the
* computation. The array and strings
* are owned. */
struct pb_PlatformParam *platform; /* If not NULL, the platform
* specified on the command line. */
struct pb_DeviceParam *device; /* If not NULL, the device
* specified on the command line. */
};
/* Read command-line parameters.
*
* The argc and argv parameters to main are read, and any parameters
* interpreted by this function are removed from the argument list.
*
* A new instance of struct pb_Parameters is returned.
* If there is an error, then an error message is printed on stderr
* and NULL is returned.
*/
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv);
/* Free an instance of struct pb_Parameters.
*/
void
pb_FreeParameters(struct pb_Parameters *p);
void
pb_FreeStringArray(char **);
/* Count the number of input files in a pb_Parameters instance.
*/
int
pb_Parameters_CountInputs(struct pb_Parameters *p);
/* A time or duration. */
//#if _POSIX_VERSION >= 200112L
typedef unsigned long long pb_Timestamp; /* time in microseconds */
//#else
//# error "Timestamps not implemented"
//#endif
enum pb_TimerState {
pb_Timer_STOPPED,
pb_Timer_RUNNING,
};
struct pb_Timer {
enum pb_TimerState state;
pb_Timestamp elapsed; /* Amount of time elapsed so far */
pb_Timestamp init; /* Beginning of the current time interval,
* if state is RUNNING. End of the last
* recorded time interfal otherwise. */
};
/* Reset a timer.
* Use this to initialize a timer or to clear
* its elapsed time. The reset timer is stopped.
*/
void
pb_ResetTimer(struct pb_Timer *timer);
/* Start a timer. The timer is set to RUNNING mode and
* time elapsed while the timer is running is added to
* the timer.
* The timer should not already be running.
*/
void
pb_StartTimer(struct pb_Timer *timer);
/* Stop a timer.
* This stops adding elapsed time to the timer.
* The timer should not already be stopped.
*/
void
pb_StopTimer(struct pb_Timer *timer);
/* Get the elapsed time in seconds. */
double
pb_GetElapsedTime(struct pb_Timer *timer);
/* Execution time is assigned to one of these categories. */
enum pb_TimerID {
pb_TimerID_NONE = 0,
pb_TimerID_IO, /* Time spent in input/output */
pb_TimerID_KERNEL, /* Time spent computing on the device,
* recorded asynchronously */
pb_TimerID_COPY, /* Time spent synchronously moving data
* to/from device and allocating/freeing
* memory on the device */
pb_TimerID_DRIVER, /* Time spent in the host interacting with the
* driver, primarily for recording the time
* spent queueing asynchronous operations */
pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
pb_TimerID_COMPUTE, /* Time for all program execution other
* than parsing command line arguments,
* I/O, kernel, and copy */
pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and
* host activity: automatically filled in,
* not intended for direct usage */
pb_TimerID_LAST /* Number of timer IDs */
};
/* Dynamic list of asynchronously tracked times between events */
struct pb_async_time_marker_list {
char *label; // actually just a pointer to a string
enum pb_TimerID timerID; /* The ID to which the interval beginning
* with this marker should be attributed */
void * marker;
//cudaEvent_t marker; /* The driver event for this marker */
struct pb_async_time_marker_list *next;
};
struct pb_SubTimer {
char *label;
struct pb_Timer timer;
struct pb_SubTimer *next;
};
struct pb_SubTimerList {
struct pb_SubTimer *current;
struct pb_SubTimer *subtimer_list;
};
/* A set of timers for recording execution times. */
struct pb_TimerSet {
enum pb_TimerID current;
struct pb_async_time_marker_list* async_markers;
pb_Timestamp async_begin;
pb_Timestamp wall_begin;
struct pb_Timer timers[pb_TimerID_LAST];
struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
};
/* Reset all timers in the set. */
void
pb_InitializeTimerSet(struct pb_TimerSet *timers);
void
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
/* Select which timer the next interval of time should be accounted
* to. The selected timer is started and other timers are stopped.
* Using pb_TimerID_NONE stops all timers. */
void
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
void
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
/* Print timer values to standard output. */
void
pb_PrintTimerSet(struct pb_TimerSet *timers);
/* Release timer resources */
void
pb_DestroyTimerSet(struct pb_TimerSet * timers);
void
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
typedef struct pb_Device_tag {
char* name;
void* clDevice;
int id;
unsigned int in_use;
unsigned int available;
} pb_Device;
struct pb_Context_tag;
typedef struct pb_Context_tag pb_Context;
typedef struct pb_Platform_tag {
char* name;
char* version;
void* clPlatform;
unsigned int in_use;
pb_Context** contexts;
pb_Device** devices;
} pb_Platform;
struct pb_Context_tag {
void* clPlatformId;
void* clContext;
void* clDeviceId;
pb_Platform* pb_platform;
pb_Device* pb_device;
};
// verbosely print out list of platforms and their devices to the console.
pb_Platform**
pb_GetPlatforms();
// Choose a platform according to the given platform specification
pb_Platform*
pb_GetPlatform(struct pb_PlatformParam *platform);
// choose a platform: by name, name & version
pb_Platform*
pb_GetPlatformByName(const char* name);
pb_Platform*
pb_GetPlatformByNameAndVersion(const char* name, const char* version);
// Choose a device according to the given device specification
pb_Device*
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
pb_Device**
pb_GetDevices(pb_Platform* pb_platform);
// choose a device by name.
pb_Device*
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
pb_Platform*
pb_GetPlatformByEnvVars();
pb_Context*
pb_InitOpenCLContext(struct pb_Parameters* parameters);
void
pb_ReleasePlatforms();
void
pb_ReleaseContext(pb_Context* c);
void
pb_PrintPlatformInfo(pb_Context* c);
void
perf_init();
//#define MEASURE_KERNEL_TIME
#include <CL/cl.h>
#ifdef MEASURE_KERNEL_TIME
#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
cl_int
pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* work_dim */,
const size_t * /* global_work_offset */,
const size_t * /* global_work_size */,
const size_t * /* local_work_size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */);
#endif
enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
void pb_sig_float(char*, float*, int);
void pb_sig_double(char*, double*, int);
void pb_sig_short(char*, short*, int);
void pb_sig_int(char*, int*, int);
void pb_sig_uchar(char*, unsigned char*, unsigned int);
void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
#ifdef __cplusplus
}
#endif
#endif //PARBOIL_HEADER

File diff suppressed because it is too large Load diff

View file

@ -1,73 +0,0 @@
__kernel void sgemm2(__global float *A,
__global float *B,
__global float *C,
const unsigned int N,
__local float *localA,
__local float *localB)
{
int globalRow = get_global_id(1);
int globalCol = get_global_id(0);
int localRow = get_local_id(1);
int localCol = get_local_id(0);
int localSize = get_local_size(0); // assuming square local size
float sum = 0.0f;
// Loop over all blocks of both matrices
for (int k = 0; k < N; k += localSize) {
// Load block of matrix A to local memory
localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
// Load block of matrix B to local memory, adjusting for column-major access
localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
// Synchronize to make sure the tiles are loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Multiply the two matrix blocks and accumulate result
for (int j = 0; j < localSize; j++) {
sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
}
// Ensure computation is done before loading next block
barrier(CLK_LOCAL_MEM_FENCE);
}
C[globalRow * N + globalCol] = sum;
}
/*__kernel void sgemm2(__global float *A,
__global float *B,
__global float *C,
const unsigned int N)
{
int globalRow = get_global_id(1);
int globalCol = get_global_id(0);
int localRow = get_local_id(1);
int localCol = get_local_id(0);
// Static local memory declaration
__local float localA[16][16];
__local float localB[16][16];
float sum = 0.0f;
// Iterate over blocks
for (int k = 0; k < N; k += 16) {
// Load a block of matrix A into local memory
localA[localRow][localCol] = A[globalRow * N + k + localCol];
// Load a block of matrix B into local memory
localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
// Ensure the entire block is loaded
barrier(CLK_LOCAL_MEM_FENCE);
// Compute multiplication for this block
for (int j = 0; j < 16; j++) {
sum += localA[localRow][j] * localB[j][localCol];
}
}
C[globalRow * N + globalCol] = sum;
}*/

View file

@ -1,10 +0,0 @@
PROJECT = vectorhypot
SRCS = main.cc oclUtils.cpp shrUtils.cpp cmd_arg_reader.cpp
CXXFLAGS += -I.
OPTS ?=
include ../common.mk

View file

@ -1,152 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
// includes, file
#include "cmd_arg_reader.h"
// includes, system
#include <vector>
// internal unnamed namespace
namespace
{
// types, internal (class, enum, struct, union, typedef)
// variables, internal
} // namespace {
// variables, exported
/*static*/ CmdArgReader* CmdArgReader::self;
/*static*/ char** CmdArgReader::rargv;
/*static*/ int CmdArgReader::rargc;
// functions, exported
////////////////////////////////////////////////////////////////////////////////
//! Public construction interface
//! @return a handle to the class instance
//! @param argc number of command line arguments (as given to main())
//! @param argv command line argument string (as given to main())
////////////////////////////////////////////////////////////////////////////////
/*static*/ void
CmdArgReader::init( const int argc, const char** argv)
{
if ( NULL != self)
{
return;
}
// command line arguments
if (( 0 == argc) || ( 0 == argv))
{
LOGIC_EXCEPTION( "No command line arguments given.");
}
self = new CmdArgReader();
self->createArgsMaps( argc, argv);
rargc = argc;
rargv = const_cast<char**>( argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Constructor, default
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::CmdArgReader() :
args(),
unprocessed(),
iter(),
iter_unprocessed()
{ }
////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
CmdArgReader::~CmdArgReader()
{
for( iter = args.begin(); iter != args.end(); ++iter)
{
if( *(iter->second.first) == typeid( int))
{
delete static_cast<int*>( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( bool))
{
delete static_cast<bool*>( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( std::string))
{
delete static_cast<std::string*>( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( std::vector< std::string>) )
{
delete static_cast< std::vector< std::string>* >( iter->second.second);
break;
}
else if( *(iter->second.first) == typeid( std::vector<int>) )
{
delete static_cast< std::vector<int>* >( iter->second.second);
break;
}
}
}
////////////////////////////////////////////////////////////////////////////////
//! Read args as token value pair into map for better processing (Even the
//! values remain strings until the parameter values is requested by the
//! program.)
//! @param argc the argument count (as given to 'main')
//! @param argv the char* array containing the command line arguments
////////////////////////////////////////////////////////////////////////////////
void
CmdArgReader::createArgsMaps( const int argc, const char** argv) {
std::string token;
std::string val_str;
std::map< std::string, std::string> args;
std::string::size_type pos;
std::string arg;
for( int i=1; i<argc; ++i)
{
arg = argv[i];
// check if valid command line argument: all arguments begin with - or --
if (arg[0] != '-')
{
RUNTIME_EXCEPTION("Invalid command line argument.");
}
int numDashes = (arg[1] == '-' ? 2 : 1);
// check if only flag or if a value is given
if ( (pos = arg.find( '=')) == std::string::npos)
{
unprocessed[ std::string( arg, numDashes, arg.length()-numDashes)] = "FLAG";
}
else
{
unprocessed[ std::string( arg, numDashes, pos-numDashes)] =
std::string( arg, pos+1, arg.length()-1);
}
}
}

View file

@ -1,488 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _CMDARGREADER_H_
#define _CMDARGREADER_H_
// includes, system
#include <map>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <typeinfo>
// includes, project
#include "exception.h"
//! Preprocessed command line arguments
//! @note Lazy evaluation: The arguments are converted from strings to
//! the correct data type upon request. Converted values are stored
//! in an additonal map so that no additional conversion is
//! necessary. Arrays of command line arguments are stored in
//! std::vectors
//! @note Usage:
//! const std::string* file =
//! CmdArgReader::getArg< std::string>( "model")
//! const std::vector< std::string>* files =
//! CmdArgReader::getArg< std::vector< std::string> >( "model")
//! @note All command line arguments begin with '--' followed by the token;
//! token and value are seperated by '='; example --samples=50
//! @note Arrays have the form --model=[one.obj,two.obj,three.obj]
//! (without whitespaces)
//! Command line argument parser
class CmdArgReader
{
template<class> friend class TestCmdArgReader;
protected:
//! @param self handle to the only instance of this class
static CmdArgReader* self;
public:
//! Public construction interface
//! @return a handle to the class instance
//! @param argc number of command line arguments (as given to main())
//! @param argv command line argument string (as given to main())
static void init( const int argc, const char** argv);
public:
//! Get the value of the command line argument with given name
//! @return A const handle to the requested argument.
//! If the argument does not exist or if it
//! is not from type T NULL is returned
//! @param name the name of the requested argument
//! @note T the type of the argument requested
template<class T>
static inline const T* getArg( const std::string& name);
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//! otherwise false
//! @param name name of the command line argument in question
static inline bool existArg( const std::string& name);
//! Get the original / raw argc program argument
static inline int& getRArgc();
//! Get the original / raw argv program argument
static inline char**& getRArgv();
public:
//! Destructor
~CmdArgReader();
protected:
//! Constructor, default
CmdArgReader();
private:
// private helper functions
//! Get the value of the command line argument with given name
//! @note Private helper function for 'getArg' to work on the members
//! @return A const handle to the requested argument. If the argument
//! does not exist or if it is not from type T a NULL pointer
//! is returned.
//! @param name the name of the requested argument
//! @note T the type of the argument requested
template<class T>
inline const T* getArgHelper( const std::string& name);
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//! otherwise false
//! @param name the name of the requested argument
inline bool existArgHelper( const std::string& name) const;
//! Read args as token value pair into map for better processing
//! (Even the values remain strings until the parameter values is
//! requested by the program.)
//! @param argc the argument count (as given to 'main')
//! @param argv the char* array containing the command line arguments
void createArgsMaps( const int argc, const char** argv);
//! Helper for "casting" the strings from the map with the unprocessed
//! values to the correct
//! data type.
//! @return true if conversion succeeded, otherwise false
//! @param element the value as string
//! @param val the value as type T
template<class T>
static inline bool convertToT( const std::string& element, T& val);
public:
// typedefs internal
//! container for a processed command line argument
//! typeid is used to easily be able to decide if a re-requested token-value
//! pair match the type of the first conversion
typedef std::pair< const std::type_info*, void*> ValType;
//! map of already converted values
typedef std::map< std::string, ValType > ArgsMap;
//! iterator for the map of already converted values
typedef ArgsMap::iterator ArgsMapIter;
typedef ArgsMap::const_iterator ConstArgsMapIter;
//! map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string> UnpMap;
//! iterator for the map of unprocessed (means unconverted) token-value pairs
typedef std::map< std::string, std::string>::iterator UnpMapIter;
private:
#ifdef _WIN32
# pragma warning( disable: 4251)
#endif
//! rargc original value of argc
static int rargc;
//! rargv contains command line arguments in raw format
static char** rargv;
//! args Map containing the already converted token-value pairs
ArgsMap args;
//! args Map containing the unprocessed / unconverted token-value pairs
UnpMap unprocessed;
//! iter Iterator for the map with the already converted token-value
//! pairs (to avoid frequent reallocation)
ArgsMapIter iter;
//! iter Iterator for the map with the unconverted token-value
//! pairs (to avoid frequent reallocation)
UnpMapIter iter_unprocessed;
#ifdef _WIN32
# pragma warning( default: 4251)
#endif
private:
//! Constructor, copy (not implemented)
CmdArgReader( const CmdArgReader&);
//! Assignment operator (not implemented)
CmdArgReader& operator=( const CmdArgReader&);
};
// variables, exported (extern)
// functions, inlined (inline)
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line argument arrays
//! @note This function is used each type for which no template specialization
//! exist (which will cause errors if the type does not fulfill the std::vector
//! interface).
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ inline bool
CmdArgReader::convertToT( const std::string& element, T& val)
{
// preallocate storage
val.resize( std::count( element.begin(), element.end(), ',') + 1);
unsigned int i = 0;
std::string::size_type pos_start = 1; // leave array prefix '['
std::string::size_type pos_end = 0;
// do for all elements of the comma seperated list
while( std::string::npos != ( pos_end = element.find(',', pos_end+1)) )
{
// convert each element by the appropriate function
if ( ! convertToT< typename T::value_type >(
std::string( element, pos_start, pos_end - pos_start), val[i]))
{
return false;
}
pos_start = pos_end + 1;
++i;
}
std::string tmp1( element, pos_start, element.length() - pos_start - 1);
// process last element (leave array postfix ']')
if ( ! convertToT< typename T::value_type >( std::string( element,
pos_start,
element.length() - pos_start - 1),
val[i]))
{
return false;
}
// possible to process all elements?
return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type int
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<int>( const std::string& element, int& val)
{
std::istringstream ios( element);
ios >> val;
bool ret_val = false;
if ( ios.eof())
{
ret_val = true;
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type float
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<float>( const std::string& element, float& val)
{
std::istringstream ios( element);
ios >> val;
bool ret_val = false;
if ( ios.eof())
{
ret_val = true;
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type double
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<double>( const std::string& element, double& val)
{
std::istringstream ios( element);
ios >> val;
bool ret_val = false;
if ( ios.eof())
{
ret_val = true;
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type string
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<std::string>( const std::string& element,
std::string& val)
{
val = element;
return true;
}
////////////////////////////////////////////////////////////////////////////////
//! Conversion function for command line arguments of type bool
////////////////////////////////////////////////////////////////////////////////
template<>
inline bool
CmdArgReader::convertToT<bool>( const std::string& element, bool& val)
{
// check if value is given as string-type { true | false }
if ( "true" == element)
{
val = true;
return true;
}
else if ( "false" == element)
{
val = false;
return true;
}
// check if argument is given as integer { 0 | 1 }
else
{
int tmp;
if ( convertToT<int>( element, tmp))
{
if ( 1 == tmp)
{
val = true;
return true;
}
else if ( 0 == tmp)
{
val = false;
return true;
}
}
}
return false;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the value of the command line argument with given name
//! @return A const handle to the requested argument. If the argument does
//! not exist or if it is not from type T NULL is returned
//! @param T the type of the argument requested
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
/*static*/ const T*
CmdArgReader::getArg( const std::string& name)
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
return NULL;
}
return self->getArgHelper<T>( name);
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with the given name exists
//! @return true if a command line argument with name \a name exists,
//! otherwise false
//! @param name name of the command line argument in question
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline bool
CmdArgReader::existArg( const std::string& name)
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getArg(): CmdArgReader not initialized.");
return false;
}
return self->existArgHelper( name);
}
////////////////////////////////////////////////////////////////////////////////
//! @brief Get the value of the command line argument with given name
//! @return A const handle to the requested argument. If the argument does
//! not exist or if it is not from type T NULL is returned
//! @param T the type of the argument requested
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
template<class T>
const T*
CmdArgReader::getArgHelper( const std::string& name)
{
// check if argument already processed and stored in correct type
if ( args.end() != (iter = args.find( name)))
{
if ( (*(iter->second.first)) == typeid( T) )
{
return (T*) iter->second.second;
}
}
else
{
T* tmp = new T;
// check the array with unprocessed values
if ( unprocessed.end() != (iter_unprocessed = unprocessed.find( name)))
{
// try to "cast" the string to the type requested
if ( convertToT< T >( iter_unprocessed->second, *tmp))
{
// add the token element pair to map of already converted values
args[name] = std::make_pair( &(typeid( T)), (void*) tmp);
return tmp;
}
}
// not used while not inserted into the map -> cleanup
delete tmp;
}
// failed, argument not available
return NULL;
}
////////////////////////////////////////////////////////////////////////////////
//! Check if a command line argument with name \a name exists
//! @return true if a command line argument of name \a name exists,
//! otherwise false
//! @param name the name of the requested argument
////////////////////////////////////////////////////////////////////////////////
inline bool
CmdArgReader::existArgHelper( const std::string& name) const
{
bool ret_val = false;
// check if argument already processed and stored in correct type
if( args.end() != args.find( name))
{
ret_val = true;
}
else
{
// check the array with unprocessed values
if ( unprocessed.end() != unprocessed.find( name))
{
ret_val = true;
}
}
return ret_val;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argc program argument
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline int&
CmdArgReader::getRArgc()
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
}
return rargc;
}
////////////////////////////////////////////////////////////////////////////////
//! Get the original / raw argv program argument
////////////////////////////////////////////////////////////////////////////////
/*static*/ inline char**&
CmdArgReader::getRArgv()
{
if( ! self)
{
RUNTIME_EXCEPTION("CmdArgReader::getRArgc(): CmdArgReader not initialized.");
}
return rargv;
}
// functions, exported (extern)
#endif // #ifndef _CMDARGREADER_H_

View file

@ -1,151 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/* CUda UTility Library */
#ifndef _EXCEPTION_H_
#define _EXCEPTION_H_
// includes, system
#include <exception>
#include <stdexcept>
#include <iostream>
#include <stdlib.h>
//! Exception wrapper.
//! @param Std_Exception Exception out of namespace std for easy typing.
template<class Std_Exception>
class Exception : public Std_Exception
{
public:
//! @brief Static construction interface
//! @return Alwayss throws ( Located_Exception<Exception>)
//! @param file file in which the Exception occurs
//! @param line line in which the Exception occurs
//! @param detailed details on the code fragment causing the Exception
static void throw_it( const char* file,
const int line,
const char* detailed = "-" );
//! Static construction interface
//! @return Alwayss throws ( Located_Exception<Exception>)
//! @param file file in which the Exception occurs
//! @param line line in which the Exception occurs
//! @param detailed details on the code fragment causing the Exception
static void throw_it( const char* file,
const int line,
const std::string& detailed);
//! Destructor
virtual ~Exception() throw();
private:
//! Constructor, default (private)
Exception();
//! Constructor, standard
//! @param str string returned by what()
Exception( const std::string& str);
};
////////////////////////////////////////////////////////////////////////////////
//! Exception handler function for arbitrary exceptions
//! @param ex exception to handle
////////////////////////////////////////////////////////////////////////////////
template<class Exception_Typ>
inline void
handleException( const Exception_Typ& ex)
{
std::cerr << ex.what() << std::endl;
exit( EXIT_FAILURE);
}
//! Convenience macros
//! Exception caused by dynamic program behavior, e.g. file does not exist
#define RUNTIME_EXCEPTION( msg) \
Exception<std::runtime_error>::throw_it( __FILE__, __LINE__, msg)
//! Logic exception in program, e.g. an assert failed
#define LOGIC_EXCEPTION( msg) \
Exception<std::logic_error>::throw_it( __FILE__, __LINE__, msg)
//! Out of range exception
#define RANGE_EXCEPTION( msg) \
Exception<std::range_error>::throw_it( __FILE__, __LINE__, msg)
////////////////////////////////////////////////////////////////////////////////
//! Implementation
// includes, system
#include <sstream>
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const char* detailed)
{
std::stringstream s;
// Quiet heavy-weight but exceptions are not for
// performance / release versions
s << "Exception in file '" << file << "' in line " << line << "\n"
<< "Detailed description: " << detailed << "\n";
throw Exception( s.str());
}
////////////////////////////////////////////////////////////////////////////////
//! Static construction interface.
//! @param Exception causing code fragment (file and line) and detailed infos.
////////////////////////////////////////////////////////////////////////////////
/*static*/ template<class Std_Exception>
void
Exception<Std_Exception>::
throw_it( const char* file, const int line, const std::string& msg)
{
throw_it( file, line, msg.c_str());
}
////////////////////////////////////////////////////////////////////////////////
//! Constructor, default (private).
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::Exception() :
Exception("Unknown Exception.\n")
{ }
////////////////////////////////////////////////////////////////////////////////
//! Constructor, standard (private).
//! String returned by what().
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::Exception( const std::string& s) :
Std_Exception( s)
{ }
////////////////////////////////////////////////////////////////////////////////
//! Destructor
////////////////////////////////////////////////////////////////////////////////
template<class Std_Exception>
Exception<Std_Exception>::~Exception() throw() { }
// functions, exported
#endif // #ifndef _EXCEPTION_H_

View file

@ -1,41 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// OpenCL Kernel Function Naive Implementation for hyptenuse
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
{
// get index into global data array
size_t szGlobalOffset = get_global_id(0) + uiOffset;
// bound check
if (szGlobalOffset >= uiNumElements)
{
return;
}
// Processing 4 elements per work item, so read fgA and fgB source values from GMEM
float4 f4A = fg4A[szGlobalOffset];
float4 f4B = fg4B[szGlobalOffset];
float4 f4H = (float4)0.0f;
// Get the hypotenuses the vectors of 'legs', but exaggerate the time needed with loop
for (int i = 0; i < iInnerLoopCount; i++)
{
// compute the 4 hypotenuses using built-in function
f4H.x = hypot (f4A.x, f4B.x);
f4H.y = hypot (f4A.y, f4B.y);
f4H.z = hypot (f4A.z, f4B.z);
f4H.w = hypot (f4A.w, f4B.w);
}
// Write 4 result values back out to GMEM
fg4Hypot[szGlobalOffset] = f4H;
}

View file

@ -1,702 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// oclCopyComputeOverlap Notes:
//
// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
// element by element vector hyptenuse computation using 2 input float arrays
// and 1 output float array.
//
// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
// with respect to GPU computation (and with respect to host thread).
//
// Because the overlap acheivable for this computation and data set on a given system depends upon the GPU being used and the
// GPU/Host bandwidth, the sample adjust the computation duration to test the most ideal case and test against a consistent standard.
// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
//
// After setup, warmup and calibration to the system, the sample runs 4 scenarios:
// A) Computations with 2 command queues on GPU
// A multiple-cycle sequence is executed, timed and compared against the host
// B) Computations with 1 command queue on GPU
// A multiple-cycle sequence is executed, timed and compared against the host
//
// The 2-command queue approach ought to be substantially faster
//
// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently
// increases compute time without increasing data size (via a loop inside the kernel)
//
// At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
// (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
//
// If we name the time to copy single input vector H2D (or outpute vector D2H) as "T", then the optimum comparison case is:
//
// Single Queue with all the data and all the work
// Ttot (serial) = 4T + 4T + 2T = 10T
//
// Dual Queue, where each queue has 1/2 the data and 1/2 the work
// Tq0 (overlap) = 2T + 2T + T ....
// Tq1 (overlap) = .... 2T + 2T + T
//
// Ttot (elapsed, wall) = 2T + 2T + 2T + T = 7T
//
// Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 % (Tesla arch 1.2 or 1.3, single copy engine)
//
// For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
// This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.
// *********************************************************************
// common SDK header for standard utilities and system libs
#include <oclUtils.h>
#include <shrQATest.h>
#include <iostream>
// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
// values greater than 0.0f represent a speed-up relative to non-overlapped
#define EXPECTED_OVERLAP 30.0f
#define EXPECTED_OVERLAP_FERMI 45.0f
#define PASS_FACTOR 0.60f
#define RETRIES_ON_FAILURE 1
// Base sizes for parameters manipulated dynamically or on the command line
#define BASE_WORK_ITEMS 64
#define BASE_ARRAY_LENGTH 40000
#define BASE_LOOP_COUNT 32
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return CL_INVALID_VALUE;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return CL_INVALID_VALUE;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return CL_SUCCESS;
}
// Vars
// *********************************************************************
cl_platform_id cpPlatform; // OpenCL platform
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQueue[2]; // OpenCL command queues
cl_device_id* cdDevices; // OpenCL device list
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue
cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A
cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B
cl_mem cmPinnedResult; // OpenCL pinned host result buffer
float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer
float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer
float* fResult = NULL; // Mapped pointer for pinned Host result buffer
cl_mem cmDevSrcA; // OpenCL device source buffer A
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevResult; // OpenCL device result buffer
size_t szBuffBytes; // Size of main buffers
size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range
size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group
cl_int ciErrNum; // Error code var
char* cPathAndName = NULL; // Var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
const char* cExecutableName = NULL;
// demo config vars
const char* cSourceFile = "kernel.cl"; // OpenCL computation kernel source code
float* Golden = NULL; // temp buffer to hold golden results for cross check
bool bNoPrompt = false; // Command line switch to skip exit prompt
bool bQATest = false; // Command line switch to test
// Forward Declarations
// *********************************************************************
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles);
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
void Cleanup (int iExitCode);
void (*pCleanup)(int) = &Cleanup;
int *gp_argc = 0;
const char *** gp_argv = NULL;
// Main function
// *********************************************************************
int main(int argc, const char **argv)
{
//Locals
size_t szKernelLength; // Byte size of kernel code
double dBuildTime; // Compile time
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
cl_uint uiNumDevices; // Number of devices available
int iDevCap = -1; // Capability of device
int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel
const int iTestCycles = 10; // How many times to run the external test loop
const int iWarmupCycles = 8; // How many times to run the warmup sequence
cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item)
cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=<n>") to optionally increase vector sizes
bool bPassFlag = false; // Var to accumulate test pass/fail
shrBOOL bMatch = shrFALSE; // Cross check result
shrBOOL bTestOverlap = shrFALSE;
double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test
double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator)
float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch
gp_argc = &argc;
gp_argv = &argv;
shrQAStart(argc, (char **)argv);
// start logs
cExecutableName = argv[0];
shrSetLogFileName ("oclCopyComputeOverlap.txt");
shrLog("%s Starting...\n\n", argv[0]);
// get basic command line args
bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
// Optional Command-line multiplier for vector size
// Default val of 4 gives 10.24 million float elements per vector
// Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);
uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
shrLog("Array sizes = %u float elements\n", uiNumElements);
// Optional Command-line multiplier for workgroup size (x 64 work items)
// Default val of 4 gives szLocalWorkSize of 256.
// Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10);
szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
shrLog("Workgroup Size = %u\n\n", szLocalWorkSize);
// Get the NVIDIA platform if available, otherwise use default
shrLog("Get the Platform ID...\n\n");
ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Get OpenCL platform name and version
char cBuffer[256];
ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("Platform Name = %s\n\n", cBuffer);
// Get all the devices
shrLog("Get the Device info and select Device...\n");
uiNumDevices = 1;
cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, cdDevices, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Set target device and check capabilities
shrLog(" # of Devices Available = %u\n", uiNumDevices);
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
shrLog(" Using Device %u, ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
/*iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
if (iDevCap > 0) {
shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
} else {
shrLog("\n\n", iDevCap);
}
if (strstr(cBuffer, "NVIDIA") != NULL)
{
if (iDevCap < 12)
{
shrLog("Device doesn't have overlap capability. Skipping test...\n");
Cleanup (EXIT_SUCCESS);
}
// Device and Platform eligible for overlap testing
bTestOverlap = shrTRUE;
// If device has overlap capability, proceed
fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines
if (iDevCap != 20)
{
// Single copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles
}
else
{
char cDevName[1024];
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
{
// Tesla or Quadro (arch = 2.0) ... Dual copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles
}
else
{
// Geforce ... Single copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles
}
}
}*/
// Create the context
shrLog("clCreateContext...\n");
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create 2 command-queues
cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateCommandQueue [0]...\n");
cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateCommandQueue [1]...\n");
// Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
szBuffBytes = sizeof(cl_float) * uiNumElements;
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements);
// Allocate pinned source and result host buffers:
// Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements);
// Get mapped pointers to pinned input host buffers
// Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n");
// Alloc temp golden buffer for cross checks
Golden = (float*)malloc(szBuffBytes);
oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
#ifdef HOSTGPU
// Read the OpenCL kernel in from source file
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
oclCheckError(cPathAndName != NULL, shrTRUE);
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
// Create the program object
shrLog("clCreateProgramWithSource...\n");
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
#else
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cpProgram = clCreateProgramWithBinary(
cxGPUContext, 1, &cdDevices[uiTargetDevice], &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
#endif
// Build the program for the target device
clFinish(cqCommandQueue[0]);
shrDeltaT(0);
ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
shrLog("clBuildProgram...");
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
Cleanup(EXIT_FAILURE);
}
dBuildTime = shrDeltaT(0);
// Create the kernel
ckKernel[0] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ckKernel[1] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateKernel (ckKernel[2])...\n");
// Offsets for 2 queues
cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
// Set the Argument values for the 1st kernel instance (queue 0)
ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n");
// Set the Argument values for the 2d kernel instance (queue 1)
ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n");
//*******************************************
// Warmup the driver with dual queue sequence
//*******************************************
// Warmup with dual queue sequence for iTestCycles
shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
DualQueueSequence(iWarmupCycles, uiNumElements, false);
// Use single queue config to adjust compute intensity
shrLog("Adjust compute for GPU / system...\n");
iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles);
shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount);
//*******************************************
// Run and time with 2 command-queues
//*******************************************
for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
// Run the sequence iTestCycles times
dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
// Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
shrLog(" Device vs Host Result Comparison\t: ");
VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
shrDeltaT(0);
for (int i = 0; i < iTestCycles; i++)
{
VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
}
dHostTime[0] = shrDeltaT(0)/iTestCycles;
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
bPassFlag = (bMatch == shrTRUE);
//*******************************************
// Run and time with 1 command queue
//*******************************************
// Run the sequence iTestCycles times
dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
// Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
shrLog(" Device vs Host Result Comparison\t: ");
shrDeltaT(0);
for (int i = 0; i < iTestCycles; i++)
{
VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);
}
dHostTime[1] = shrDeltaT(0)/iTestCycles;
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
bPassFlag &= (bMatch == shrTRUE);
//*******************************************
// Compare Single and Dual queue timing
shrLog("\nResult Summary:\n");
// Log GPU and CPU Time for 2-queue scenario
shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
// Log GPU and CPU Time for 1-queue scenario
shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
// Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
if( bTestOverlap ) {
bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
// Log info to master log in standard format
shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize);
bPassFlag &= bAvgOverlapOK;
break;
}
}
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
}
//*******************************************
// Report pass/fail, cleanup and exit
Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
return 0;
}
// Run 1 queue sequence for n cycles
// *********************************************************************
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
shrFillArray(fSourceA, (int)uiNumElements);
shrFillArray(fSourceB, (int)uiNumElements);
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
// *** Make sure queues are empty and then start timer
double dAvgTime = 0.0;
clFinish(cqCommandQueue[0]);
clFinish(cqCommandQueue[1]);
shrDeltaT(0);
// Run the sequence iCycles times
for (int i = 0; i < iCycles; i++)
{
// Nonblocking Write of all of input data from host to device in command-queue 0
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Launch kernel computation, command-queue 0
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Non Blocking Read of output data from device to host, command-queue 0
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
clFlush(cqCommandQueue[0]);
}
// *** Assure sync to host and return average sequence time
clFinish(cqCommandQueue[0]);
dAvgTime = shrDeltaT(0)/(double)iCycles;
// Log config if asked for
if (bShowConfig)
{
shrLog("\n1-Queue sequence Configuration:\n");
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n",
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
}
return dAvgTime;
}
// Run 2 queue sequence for n cycles
// *********************************************************************
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
// Locals
size_t szHalfBuffer = szBuffBytes / 2;
size_t szHalfOffset = szHalfBuffer / sizeof(float);
double dAvgTime = 0.0;
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
shrFillArray(fSourceA, (int)uiNumElements);
shrFillArray(fSourceB, (int)uiNumElements);
// Set Global work size for 2 command-queues, and log work sizes & dimensions
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
// Make sure queues are empty and then start timer
clFinish(cqCommandQueue[0]);
clFinish(cqCommandQueue[1]);
shrDeltaT(0);
for (int i = 0; i < iCycles; i++)
{
// Mid Phase 0
// Nonblocking Write of 1st half of input data from host to device in command-queue 0
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
// (not necessary on Linux, Mac OSX or WinXP)
clFlush(cqCommandQueue[0]);
clFlush(cqCommandQueue[1]);
// Start Phase 1 ***********************************
// Launch kernel computation, command-queue 0
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Nonblocking Write of 2nd half of input data from host to device in command-queue 1
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Push out the compute for queue 0 and write for queue 1 to the driver
// (not necessary on Linux, Mac OSX or WinXP)
clFlush(cqCommandQueue[0]);
clFlush(cqCommandQueue[1]);
// Start Phase 2 ***********************************
// Launch kernel computation, command-queue 1
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Non Blocking Read of 1st half of output data from device to host, command-queue 0
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Push out the compute for queue 1 and the read for queue 0 to the driver
// (not necessary on Linux, Mac OSX or WinXP)
clFlush(cqCommandQueue[0]);
clFlush(cqCommandQueue[1]);
// Start Phase 0 (Rolls over) ***********************************
// Non Blocking Read of 2nd half of output data from device to host, command-queue 1
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
}
// *** Sync to host and get average sequence time
clFinish(cqCommandQueue[0]);
clFinish(cqCommandQueue[1]);
dAvgTime = shrDeltaT(0)/(double)iCycles;
// Log config if asked for
if (bShowConfig)
{
shrLog("\n2-Queue sequence Configuration:\n");
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n",
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
}
return dAvgTime;
}
// Function to adjust compute task according to device capability
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
// *********************************************************************
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
{
// Locals
double dCopyTime, dComputeTime;
int iComputedLoopCount;
// Change Source Data
shrFillArray(fSourceA, (int)uiNumElements);
shrFillArray(fSourceB, (int)uiNumElements);
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
// *** Make sure queues are empty and then start timer
clFinish(cqCommandQueue[0]);
clFinish(cqCommandQueue[1]);
shrDeltaT(0);
// Run the copy iCycles times and measure copy time on this system
for (int i = 0; i < iCycles; i++)
{
// Nonblocking Write of all of input data from host to device in command-queue 0
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
ciErrNum |= clFlush(cqCommandQueue[0]);
shrCheckError(ciErrNum, CL_SUCCESS);
}
clFinish(cqCommandQueue[0]);
dCopyTime = shrDeltaT(0);
// Run the compute iCycles times and measure compute time on this system
for (int i = 0; i < iCycles; i++)
{
// Launch kernel computation, command-queue 0
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
ciErrNum |= clFlush(cqCommandQueue[0]);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
}
clFinish(cqCommandQueue[0]);
dComputeTime = shrDeltaT(0);
// Determine number of core loop cycles proportional to copy/compute time ratio
dComputeTime = MAX(dComputeTime, 1.0e-6);
iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
return (iComputedLoopCount);
}
// Cleanup/Exit function
// *********************************************************************
void Cleanup (int iExitCode)
{
// Cleanup allocated objects
shrLog("Starting Cleanup...\n\n");
if(cPathAndName)free(cPathAndName);
if(cSourceCL)free(cSourceCL);
if(Golden)free(Golden);
if(ckKernel[0])clReleaseKernel(ckKernel[0]);
if(ckKernel[1])clReleaseKernel(ckKernel[1]);
if(cpProgram)clReleaseProgram(cpProgram);
if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
if(cmDevResult)clReleaseMemObject(cmDevResult);
if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
if(cxGPUContext)clReleaseContext(cxGPUContext);
if(cdDevices)free(cdDevices);
// Master status Pass/Fail (all tests)
shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
}
// "Golden" Host processing vector hyptenuse function for comparison purposes
// *********************************************************************
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
{
for (unsigned int i = 0; i < uiNumElements; i++)
{
float fA = pfData1[i];
float fB = pfData2[i];
float fC = sqrtf(fA * fA + fB * fB);
pfResult[i] = fC;
}
}

View file

@ -1,806 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
#include <fstream>
#include <vector>
#include <iostream>
#include <algorithm>
#include <stdarg.h>
#include "oclUtils.h"
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platoform ID
//////////////////////////////////////////////////////////////////////////////
cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID)
{
char chBuffer[1024];
cl_uint num_platforms;
cl_platform_id* clPlatformIDs;
cl_int ciErrNum;
*clSelectedPlatformID = NULL;
// Get OpenCL platform count
ciErrNum = clGetPlatformIDs (0, NULL, &num_platforms);
if (ciErrNum != CL_SUCCESS)
{
shrLog(" Error %i in clGetPlatformIDs Call !!!\n\n", ciErrNum);
return -1000;
}
else
{
if(num_platforms == 0)
{
shrLog("No OpenCL platform found!\n\n");
return -2000;
}
else
{
// if there's a platform or more, make space for ID's
if ((clPlatformIDs = (cl_platform_id*)malloc(num_platforms * sizeof(cl_platform_id))) == NULL)
{
shrLog("Failed to allocate memory for cl_platform ID's!\n\n");
return -3000;
}
// get platform info for each platform and trap the NVIDIA platform if found
ciErrNum = clGetPlatformIDs (num_platforms, clPlatformIDs, NULL);
for(cl_uint i = 0; i < num_platforms; ++i)
{
ciErrNum = clGetPlatformInfo (clPlatformIDs[i], CL_PLATFORM_NAME, 1024, &chBuffer, NULL);
if(ciErrNum == CL_SUCCESS)
{
if(strstr(chBuffer, "NVIDIA") != NULL)
{
*clSelectedPlatformID = clPlatformIDs[i];
break;
}
}
}
// default to zeroeth platform if NVIDIA not found
if(*clSelectedPlatformID == NULL)
{
shrLog("WARNING: NVIDIA OpenCL platform not found - defaulting to first platform!\n\n");
*clSelectedPlatformID = clPlatformIDs[0];
}
free(clPlatformIDs);
}
}
return CL_SUCCESS;
}
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevName(int iLogMode, cl_device_id device)
{
char device_string[1024];
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, "%s\n", device_string);
}
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
void oclPrintDevInfo(int iLogMode, cl_device_id device)
{
char device_string[1024];
bool nv_device_attibute_query = false;
// CL_DEVICE_NAME
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_NAME: \t\t\t%s\n", device_string);
// CL_DEVICE_VENDOR
clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VENDOR: \t\t\t%s\n", device_string);
// CL_DRIVER_VERSION
clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DRIVER_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_VERSION
clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_VERSION: \t\t\t%s\n", device_string);
// CL_DEVICE_OPENCL_C_VERSION (if CL_DEVICE_VERSION version > 1.0)
if(strncmp("OpenCL 1.0", device_string, 10) != 0)
{
// This code is unused for devices reporting OpenCL 1.0, but a def is needed anyway to allow compilation using v 1.0 headers
// This constant isn't #defined in 1.0
#ifndef CL_DEVICE_OPENCL_C_VERSION
#define CL_DEVICE_OPENCL_C_VERSION 0x103D
#endif
clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, sizeof(device_string), &device_string, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_OPENCL_C_VERSION: \t\t%s\n", device_string);
}
// CL_DEVICE_TYPE
cl_device_type type;
clGetDeviceInfo(device, CL_DEVICE_TYPE, sizeof(type), &type, NULL);
if( type & CL_DEVICE_TYPE_CPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_CPU");
if( type & CL_DEVICE_TYPE_GPU )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_GPU");
if( type & CL_DEVICE_TYPE_ACCELERATOR )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_ACCELERATOR");
if( type & CL_DEVICE_TYPE_DEFAULT )
shrLogEx(iLogMode, 0, " CL_DEVICE_TYPE:\t\t\t%s\n", "CL_DEVICE_TYPE_DEFAULT");
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_COMPUTE_UNITS:\t\t%u\n", compute_units);
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
size_t workitem_dims;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims);
// CL_DEVICE_MAX_WORK_ITEM_SIZES
size_t workitem_size[3];
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]);
// CL_DEVICE_MAX_WORK_GROUP_SIZE
size_t workgroup_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency);
// CL_DEVICE_ADDRESS_BITS
cl_uint addr_bits;
clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(addr_bits), &addr_bits, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ADDRESS_BITS:\t\t%u\n", addr_bits);
// CL_DEVICE_MAX_MEM_ALLOC_SIZE
cl_ulong max_mem_alloc_size;
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_mem_alloc_size), &max_mem_alloc_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_MEM_ALLOC_SIZE:\t\t%u MByte\n", (unsigned int)(max_mem_alloc_size / (1024 * 1024)));
// CL_DEVICE_GLOBAL_MEM_SIZE
cl_ulong mem_size;
clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GLOBAL_MEM_SIZE:\t\t%u MByte\n", (unsigned int)(mem_size / (1024 * 1024)));
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
cl_bool error_correction_support;
clGetDeviceInfo(device, CL_DEVICE_ERROR_CORRECTION_SUPPORT, sizeof(error_correction_support), &error_correction_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_ERROR_CORRECTION_SUPPORT:\t%s\n", error_correction_support == CL_TRUE ? "yes" : "no");
// CL_DEVICE_LOCAL_MEM_TYPE
cl_device_local_mem_type local_mem_type;
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_TYPE, sizeof(local_mem_type), &local_mem_type, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_TYPE:\t\t%s\n", local_mem_type == 1 ? "local" : "global");
// CL_DEVICE_LOCAL_MEM_SIZE
clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_LOCAL_MEM_SIZE:\t\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(mem_size), &mem_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:\t%u KByte\n", (unsigned int)(mem_size / 1024));
// CL_DEVICE_QUEUE_PROPERTIES
cl_command_queue_properties queue_properties;
clGetDeviceInfo(device, CL_DEVICE_QUEUE_PROPERTIES, sizeof(queue_properties), &queue_properties, NULL);
if( queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE");
if( queue_properties & CL_QUEUE_PROFILING_ENABLE )
shrLogEx(iLogMode, 0, " CL_DEVICE_QUEUE_PROPERTIES:\t\t%s\n", "CL_QUEUE_PROFILING_ENABLE");
// CL_DEVICE_IMAGE_SUPPORT
cl_bool image_support;
clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(image_support), &image_support, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_IMAGE_SUPPORT:\t\t%u\n", image_support);
// CL_DEVICE_MAX_READ_IMAGE_ARGS
cl_uint max_read_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(max_read_image_args), &max_read_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_READ_IMAGE_ARGS:\t%u\n", max_read_image_args);
// CL_DEVICE_MAX_WRITE_IMAGE_ARGS
cl_uint max_write_image_args;
clGetDeviceInfo(device, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(max_write_image_args), &max_write_image_args, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_MAX_WRITE_IMAGE_ARGS:\t%u\n", max_write_image_args);
// CL_DEVICE_SINGLE_FP_CONFIG
cl_device_fp_config fp_config;
clGetDeviceInfo(device, CL_DEVICE_SINGLE_FP_CONFIG, sizeof(cl_device_fp_config), &fp_config, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_SINGLE_FP_CONFIG:\t\t%s%s%s%s%s%s\n",
fp_config & CL_FP_DENORM ? "denorms " : "",
fp_config & CL_FP_INF_NAN ? "INF-quietNaNs " : "",
fp_config & CL_FP_ROUND_TO_NEAREST ? "round-to-nearest " : "",
fp_config & CL_FP_ROUND_TO_ZERO ? "round-to-zero " : "",
fp_config & CL_FP_ROUND_TO_INF ? "round-to-inf " : "",
fp_config & CL_FP_FMA ? "fma " : "");
// CL_DEVICE_IMAGE2D_MAX_WIDTH, CL_DEVICE_IMAGE2D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_WIDTH, CL_DEVICE_IMAGE3D_MAX_HEIGHT, CL_DEVICE_IMAGE3D_MAX_DEPTH
size_t szMaxDims[5];
shrLogEx(iLogMode, 0, "\n CL_DEVICE_IMAGE <dim>");
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &szMaxDims[0], NULL);
shrLogEx(iLogMode, 0, "\t\t\t2D_MAX_WIDTH\t %u\n", szMaxDims[0]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[1], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t2D_MAX_HEIGHT\t %u\n", szMaxDims[1]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_WIDTH, sizeof(size_t), &szMaxDims[2], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_WIDTH\t %u\n", szMaxDims[2]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_HEIGHT, sizeof(size_t), &szMaxDims[3], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_HEIGHT\t %u\n", szMaxDims[3]);
clGetDeviceInfo(device, CL_DEVICE_IMAGE3D_MAX_DEPTH, sizeof(size_t), &szMaxDims[4], NULL);
shrLogEx(iLogMode, 0, "\t\t\t\t\t3D_MAX_DEPTH\t %u\n", szMaxDims[4]);
// CL_DEVICE_EXTENSIONS: get device extensions, and if any then parse & log the string onto separate lines
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(device_string), &device_string, NULL);
if (device_string != 0)
{
shrLogEx(iLogMode, 0, "\n CL_DEVICE_EXTENSIONS:");
std::string stdDevString;
stdDevString = std::string(device_string);
size_t szOldPos = 0;
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
while (szSpacePos != stdDevString.npos)
{
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
nv_device_attibute_query = true;
if (szOldPos > 0)
{
shrLogEx(iLogMode, 0, "\t\t");
}
shrLogEx(iLogMode, 0, "\t\t\t%s\n", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str());
do {
szOldPos = szSpacePos + 1;
szSpacePos = stdDevString.find(' ', szOldPos);
} while (szSpacePos == szOldPos);
}
shrLogEx(iLogMode, 0, "\n");
}
else
{
shrLogEx(iLogMode, 0, " CL_DEVICE_EXTENSIONS: None\n");
}
if(nv_device_attibute_query)
{
cl_uint compute_capability_major, compute_capability_minor;
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &compute_capability_major, NULL);
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &compute_capability_minor, NULL);
shrLogEx(iLogMode, 0, "\n CL_DEVICE_COMPUTE_CAPABILITY_NV:\t%u.%u\n", compute_capability_major, compute_capability_minor);
shrLogEx(iLogMode, 0, " NUMBER OF MULTIPROCESSORS:\t\t%u\n", compute_units); // this is the same value reported by CL_DEVICE_MAX_COMPUTE_UNITS
shrLogEx(iLogMode, 0, " NUMBER OF CUDA CORES:\t\t\t%u\n", ConvertSMVer2Cores(compute_capability_major, compute_capability_minor) * compute_units);
cl_uint regs_per_block;
clGetDeviceInfo(device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(cl_uint), &regs_per_block, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_REGISTERS_PER_BLOCK_NV:\t%u\n", regs_per_block);
cl_uint warp_size;
clGetDeviceInfo(device, CL_DEVICE_WARP_SIZE_NV, sizeof(cl_uint), &warp_size, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_WARP_SIZE_NV:\t\t%u\n", warp_size);
cl_bool gpu_overlap;
clGetDeviceInfo(device, CL_DEVICE_GPU_OVERLAP_NV, sizeof(cl_bool), &gpu_overlap, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_GPU_OVERLAP_NV:\t\t%s\n", gpu_overlap == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool exec_timeout;
clGetDeviceInfo(device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(cl_bool), &exec_timeout, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:\t%s\n", exec_timeout == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
cl_bool integrated_memory;
clGetDeviceInfo(device, CL_DEVICE_INTEGRATED_MEMORY_NV, sizeof(cl_bool), &integrated_memory, NULL);
shrLogEx(iLogMode, 0, " CL_DEVICE_INTEGRATED_MEMORY_NV:\t%s\n", integrated_memory == CL_TRUE ? "CL_TRUE" : "CL_FALSE");
}
// CL_DEVICE_PREFERRED_VECTOR_WIDTH_<type>
shrLogEx(iLogMode, 0, " CL_DEVICE_PREFERRED_VECTOR_WIDTH_<t>\t");
cl_uint vec_width [6];
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, sizeof(cl_uint), &vec_width[0], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, sizeof(cl_uint), &vec_width[1], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), &vec_width[2], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, sizeof(cl_uint), &vec_width[3], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, sizeof(cl_uint), &vec_width[4], NULL);
clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &vec_width[5], NULL);
shrLogEx(iLogMode, 0, "CHAR %u, SHORT %u, INT %u, LONG %u, FLOAT %u, DOUBLE %u\n\n\n",
vec_width[0], vec_width[1], vec_width[2], vec_width[3], vec_width[4], vec_width[5]);
}
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
int oclGetDevCap(cl_device_id device)
{
char cDevString[1024];
bool bDevAttributeQuery = false;
int iDevArch = -1;
// Get device extensions, and if any then search for cl_nv_device_attribute_query
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sizeof(cDevString), &cDevString, NULL);
if (cDevString != 0)
{
std::string stdDevString;
stdDevString = std::string(cDevString);
size_t szOldPos = 0;
size_t szSpacePos = stdDevString.find(' ', szOldPos); // extensions string is space delimited
while (szSpacePos != stdDevString.npos)
{
if( strcmp("cl_nv_device_attribute_query", stdDevString.substr(szOldPos, szSpacePos - szOldPos).c_str()) == 0 )
{
bDevAttributeQuery = true;
}
do {
szOldPos = szSpacePos + 1;
szSpacePos = stdDevString.find(' ', szOldPos);
} while (szSpacePos == szOldPos);
}
}
// if search succeeded, get device caps
if(bDevAttributeQuery)
{
cl_int iComputeCapMajor, iComputeCapMinor;
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), (void*)&iComputeCapMajor, NULL);
clGetDeviceInfo(device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), (void*)&iComputeCapMinor, NULL);
iDevArch = (10 * iComputeCapMajor) + iComputeCapMinor;
}
return iDevArch;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetFirstDev(cl_context cxGPUContext)
{
size_t szParmDataBytes;
cl_device_id* cdDevices;
// get the list of GPU devices associated with context
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
cl_device_id first = cdDevices[0];
free(cdDevices);
return first;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext)
{
size_t szParmDataBytes;
cl_device_id* cdDevices;
// get the list of GPU devices associated with context
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
size_t device_count = szParmDataBytes / sizeof(cl_device_id);
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
cl_device_id max_flops_device = cdDevices[0];
int max_flops = 0;
size_t current_device = 0;
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
max_flops = compute_units * clock_frequency;
++current_device;
while( current_device < device_count )
{
// CL_DEVICE_MAX_COMPUTE_UNITS
cl_uint compute_units;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL);
// CL_DEVICE_MAX_CLOCK_FREQUENCY
cl_uint clock_frequency;
clGetDeviceInfo(cdDevices[current_device], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL);
int flops = compute_units * clock_frequency;
if( flops > max_flops )
{
max_flops = flops;
max_flops_device = cdDevices[current_device];
}
++current_device;
}
free(cdDevices);
return max_flops_device;
}
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
{
// locals
FILE* pFileStream = NULL;
size_t szSourceLength;
// open the OpenCL source code file
#ifdef _WIN32 // Windows version
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
{
return NULL;
}
#else // Linux version
pFileStream = fopen(cFilename, "rb");
if(pFileStream == 0)
{
return NULL;
}
#endif
size_t szPreambleLength = strlen(cPreamble);
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
szSourceLength = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
memcpy(cSourceString, cPreamble, szPreambleLength);
if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
{
fclose(pFileStream);
free(cSourceString);
return 0;
}
// close the file and return the total length of the combined (preamble + source) string
fclose(pFileStream);
if(szFinalLength != 0)
{
*szFinalLength = szSourceLength + szPreambleLength;
}
cSourceString[szSourceLength + szPreambleLength] = '\0';
return cSourceString;
}
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int nr)
{
size_t szParmDataBytes;
cl_device_id* cdDevices;
// get the list of GPU devices associated with context
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &szParmDataBytes);
if( szParmDataBytes / sizeof(cl_device_id) <= nr ) {
return (cl_device_id)-1;
}
cdDevices = (cl_device_id*) malloc(szParmDataBytes);
clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, szParmDataBytes, cdDevices, NULL);
cl_device_id device = cdDevices[nr];
free(cdDevices);
return device;
}
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length)
{
// Grab the number of devices associated witht the program
cl_uint num_devices;
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
// Grab the device ids
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
// Grab the sizes of the binaries
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
// Now get the binaries
char** ptx_code = (char**) malloc(num_devices * sizeof(char*));
for( unsigned int i=0; i<num_devices; ++i) {
ptx_code[i]= (char*)malloc(binary_sizes[i]);
}
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
// Find the index of the device of interest
unsigned int idx = 0;
while( idx<num_devices && devices[idx] != cdDevice ) ++idx;
// If it is associated prepare the result
if( idx < num_devices )
{
*binary = ptx_code[idx];
*length = binary_sizes[idx];
}
// Cleanup
free( devices );
free( binary_sizes );
for( unsigned int i=0; i<num_devices; ++i) {
if( i != idx ) free(ptx_code[i]);
}
free( ptx_code );
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param const char* cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName)
{
// Grab the number of devices associated with the program
cl_uint num_devices;
clGetProgramInfo(cpProgram, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &num_devices, NULL);
// Grab the device ids
cl_device_id* devices = (cl_device_id*) malloc(num_devices * sizeof(cl_device_id));
clGetProgramInfo(cpProgram, CL_PROGRAM_DEVICES, num_devices * sizeof(cl_device_id), devices, 0);
// Grab the sizes of the binaries
size_t* binary_sizes = (size_t*)malloc(num_devices * sizeof(size_t));
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARY_SIZES, num_devices * sizeof(size_t), binary_sizes, NULL);
// Now get the binaries
char** ptx_code = (char**)malloc(num_devices * sizeof(char*));
for( unsigned int i=0; i<num_devices; ++i)
{
ptx_code[i] = (char*)malloc(binary_sizes[i]);
}
clGetProgramInfo(cpProgram, CL_PROGRAM_BINARIES, 0, ptx_code, NULL);
// Find the index of the device of interest
unsigned int idx = 0;
while((idx < num_devices) && (devices[idx] != cdDevice))
{
++idx;
}
// If the index is associated, log the result
if(idx < num_devices)
{
// if a separate filename is supplied, dump ptx there
if (NULL != cPtxFileName)
{
shrLog("\nWriting ptx to separate file: %s ...\n\n", cPtxFileName);
FILE* pFileStream = NULL;
#ifdef _WIN32
fopen_s(&pFileStream, cPtxFileName, "wb");
#else
pFileStream = fopen(cPtxFileName, "wb");
#endif
fwrite(ptx_code[idx], binary_sizes[idx], 1, pFileStream);
fclose(pFileStream);
}
else // log to logfile and console if no ptx file specified
{
shrLog("\n%s\nProgram Binary:\n%s\n%s\n", HDASHLINE, ptx_code[idx], HDASHLINE);
}
}
// Cleanup
free(devices);
free(binary_sizes);
for(unsigned int i = 0; i < num_devices; ++i)
{
free(ptx_code[i]);
}
free( ptx_code );
}
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice)
{
// write out the build log and ptx, then exit
char cBuildLog[10240];
clGetProgramBuildInfo(cpProgram, cdDevice, CL_PROGRAM_BUILD_LOG,
sizeof(cBuildLog), cBuildLog, NULL );
shrLog("\n%s\nBuild Log:\n%s\n%s\n", HDASHLINE, cBuildLog, HDASHLINE);
}
// Helper function for De-allocating cl objects
// *********************************************************************
void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs)
{
int i;
for (i = 0; i < iNumObjs; i++)
{
if (cmMemObjs[i])clReleaseMemObject(cmMemObjs[i]);
}
}
// Helper function to get OpenCL error string from constant
// *********************************************************************
const char* oclErrorString(cl_int error)
{
static const char* errorString[] = {
"CL_SUCCESS",
"CL_DEVICE_NOT_FOUND",
"CL_DEVICE_NOT_AVAILABLE",
"CL_COMPILER_NOT_AVAILABLE",
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
"CL_OUT_OF_RESOURCES",
"CL_OUT_OF_HOST_MEMORY",
"CL_PROFILING_INFO_NOT_AVAILABLE",
"CL_MEM_COPY_OVERLAP",
"CL_IMAGE_FORMAT_MISMATCH",
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
"CL_BUILD_PROGRAM_FAILURE",
"CL_MAP_FAILURE",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"",
"CL_INVALID_VALUE",
"CL_INVALID_DEVICE_TYPE",
"CL_INVALID_PLATFORM",
"CL_INVALID_DEVICE",
"CL_INVALID_CONTEXT",
"CL_INVALID_QUEUE_PROPERTIES",
"CL_INVALID_COMMAND_QUEUE",
"CL_INVALID_HOST_PTR",
"CL_INVALID_MEM_OBJECT",
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
"CL_INVALID_IMAGE_SIZE",
"CL_INVALID_SAMPLER",
"CL_INVALID_BINARY",
"CL_INVALID_BUILD_OPTIONS",
"CL_INVALID_PROGRAM",
"CL_INVALID_PROGRAM_EXECUTABLE",
"CL_INVALID_KERNEL_NAME",
"CL_INVALID_KERNEL_DEFINITION",
"CL_INVALID_KERNEL",
"CL_INVALID_ARG_INDEX",
"CL_INVALID_ARG_VALUE",
"CL_INVALID_ARG_SIZE",
"CL_INVALID_KERNEL_ARGS",
"CL_INVALID_WORK_DIMENSION",
"CL_INVALID_WORK_GROUP_SIZE",
"CL_INVALID_WORK_ITEM_SIZE",
"CL_INVALID_GLOBAL_OFFSET",
"CL_INVALID_EVENT_WAIT_LIST",
"CL_INVALID_EVENT",
"CL_INVALID_OPERATION",
"CL_INVALID_GL_OBJECT",
"CL_INVALID_BUFFER_SIZE",
"CL_INVALID_MIP_LEVEL",
"CL_INVALID_GLOBAL_WORK_SIZE",
};
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
const int index = -error;
return (index >= 0 && index < errorCount) ? errorString[index] : "Unspecified Error";
}
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
const char* oclImageFormatString(cl_uint uiImageFormat)
{
// cl_channel_order
if (uiImageFormat == CL_R)return "CL_R";
if (uiImageFormat == CL_A)return "CL_A";
if (uiImageFormat == CL_RG)return "CL_RG";
if (uiImageFormat == CL_RA)return "CL_RA";
if (uiImageFormat == CL_RGB)return "CL_RGB";
if (uiImageFormat == CL_RGBA)return "CL_RGBA";
if (uiImageFormat == CL_BGRA)return "CL_BGRA";
if (uiImageFormat == CL_ARGB)return "CL_ARGB";
if (uiImageFormat == CL_INTENSITY)return "CL_INTENSITY";
if (uiImageFormat == CL_LUMINANCE)return "CL_LUMINANCE";
// cl_channel_type
if (uiImageFormat == CL_SNORM_INT8)return "CL_SNORM_INT8";
if (uiImageFormat == CL_SNORM_INT16)return "CL_SNORM_INT16";
if (uiImageFormat == CL_UNORM_INT8)return "CL_UNORM_INT8";
if (uiImageFormat == CL_UNORM_INT16)return "CL_UNORM_INT16";
if (uiImageFormat == CL_UNORM_SHORT_565)return "CL_UNORM_SHORT_565";
if (uiImageFormat == CL_UNORM_SHORT_555)return "CL_UNORM_SHORT_555";
if (uiImageFormat == CL_UNORM_INT_101010)return "CL_UNORM_INT_101010";
if (uiImageFormat == CL_SIGNED_INT8)return "CL_SIGNED_INT8";
if (uiImageFormat == CL_SIGNED_INT16)return "CL_SIGNED_INT16";
if (uiImageFormat == CL_SIGNED_INT32)return "CL_SIGNED_INT32";
if (uiImageFormat == CL_UNSIGNED_INT8)return "CL_UNSIGNED_INT8";
if (uiImageFormat == CL_UNSIGNED_INT16)return "CL_UNSIGNED_INT16";
if (uiImageFormat == CL_UNSIGNED_INT32)return "CL_UNSIGNED_INT32";
if (uiImageFormat == CL_HALF_FLOAT)return "CL_HALF_FLOAT";
if (uiImageFormat == CL_FLOAT)return "CL_FLOAT";
// unknown constant
return "Unknown";
}

View file

@ -1,198 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef OCL_UTILS_H
#define OCL_UTILS_H
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
// Common headers: Cross-API utililties and OpenCL header
#include <shrUtils.h>
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Includes
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// For systems with CL_EXT that are not updated with these extensions, we copied these
// extensions from <CL/cl_ext.h>
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#endif
// reminders for build output window and log
#ifdef _WIN32
#pragma message ("Note: including shrUtils.h")
#pragma message ("Note: including opencl.h")
#endif
// SDK Revision #
#define OCL_SDKREVISION "7027912"
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" int oclGetDevCap(cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param const char* cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
// Helper function for De-allocating cl objects
// *********************************************************************
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
// Helper function to get OpenCL error string from constant
// *********************************************************************
extern "C" const char* oclErrorString(cl_int error);
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
// *********************************************************************
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
// An error condition is defined by the sample/test value not equal to the reference
if (iReference != iSample)
{
// If the sample/test value isn't equal to the ref, it's an error by defnition, so override 0 sample/test value
iSample = (iSample == 0) ? -9999 : iSample;
// Log the error info
shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);
// Cleanup and exit, or just exit if no cleanup function pointer provided. Use iSample (error code in this case) as process exit code.
if (pCleanup != NULL)
{
pCleanup(iSample);
}
else
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
exit(iSample);
}
}
}
#endif

View file

@ -1,238 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_QATEST_H
#define SHR_QATEST_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// OS dependent includes
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#pragma message ("Note: including time.h")
// Headers needed for Windows
#include <windows.h>
#include <time.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <time.h>
#endif
#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP _stricmp
#else
#define STRCASECMP strcasecmp
#endif
#endif
#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
// Standardized QA Start/Finish for CUDA SDK tests
#define shrQAStart(a, b) __shrQAStart(a, b)
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
inline int findExeNameStart(const char *exec_name)
{
int exename_start = (int)strlen(exec_name);
while( (exename_start > 0) &&
(exec_name[exename_start] != '\\') &&
(exec_name[exename_start] != '/') )
{
exename_start--;
}
if (exec_name[exename_start] == '\\' ||
exec_name[exename_start] == '/')
{
return exename_start+1;
} else {
return exename_start;
}
}
inline int __shrQAStart(int argc, char **argv)
{
bool bQATest = false;
// First clear the output buffer
fflush(stdout);
fflush(stdout);
for (int i=1; i < argc; i++) {
int string_start = 0;
while (argv[i][string_start] == '-')
string_start++;
char *string_argv = &argv[i][string_start];
if (!STRCASECMP(string_argv, "qatest")) {
bQATest = true;
}
}
// We don't want to print the entire path, so we search for the first
int exename_start = findExeNameStart(argv[0]);
if (bQATest) {
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
fprintf(stdout, "\n");
} else {
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
}
fflush(stdout);
printf("\n"); fflush(stdout);
return exename_start;
}
enum eQAstatus {
QA_FAILED = 0,
QA_PASSED = 1,
QA_WAIVED = 2
};
inline void __ExitInTime(int seconds)
{
fprintf(stdout, "> exiting in %d seconds: ", seconds);
fflush(stdout);
time_t t;
int count;
for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
fprintf(stdout, "%d...", count);
#ifdef WIN32
Sleep(1000);
#else
sleep(1);
#endif
}
fprintf(stdout,"done!\n\n");
fflush(stdout);
}
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
{
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
for (int i=1; i < argc; i++) {
int string_start = 0;
while (argv[i][string_start] == '-')
string_start++;
const char *string_argv = &argv[i][string_start];
if (!STRCASECMP(string_argv, "qatest")) {
bQATest = true;
}
// For SDK individual samples that don't specify -noprompt or -prompt,
// a 3 second delay will happen before exiting, giving a user time to view results
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
bNoPrompt = true;
bQuitInTime = false;
}
if (!STRCASECMP(string_argv, "prompt")) {
bNoPrompt = false;
bQuitInTime = false;
}
}
int exename_start = findExeNameStart(argv[0]);
if (bQATest) {
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
fprintf(stdout, "\n");
} else {
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
}
fflush(stdout);
printf("\n"); fflush(stdout);
if (bQuitInTime) {
__ExitInTime(3);
} else {
if (!bNoPrompt) {
fprintf(stdout, "\nPress <Enter> to exit...\n");
fflush(stdout);
getchar();
}
}
}
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
{
bool bQuitInTime = true;
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
for (int i=1; i < argc; i++) {
int string_start = 0;
while (argv[i][string_start] == '-')
string_start++;
const char *string_argv = &argv[i][string_start];
// For SDK individual samples that don't specify -noprompt or -prompt,
// a 3 second delay will happen before exiting, giving a user time to view results
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
bQuitInTime = false;
}
if (!STRCASECMP(string_argv, "prompt")) {
bQuitInTime = false;
}
}
int exename_start = findExeNameStart(argv[0]);
if (bQATest) {
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
fprintf(stdout, "\n");
} else {
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
}
fflush(stdout);
if (bQuitInTime) {
__ExitInTime(3);
}
}
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
{
__shrQAFinish(argc, argv, iStatus);
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
}
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
{
__shrQAFinish2(bQAtest, argc, argv, iStatus);
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
}
#endif

File diff suppressed because it is too large Load diff

View file

@ -1,642 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_UTILS_H
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#endif
// OS dependent includes
#ifdef _WIN32
// Headers needed for Windows
#include <windows.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#endif
// Other headers needed for both Windows and Linux
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Un-comment the following #define to enable profiling code in SDK apps
//#define GPU_PROFILING
// Beginning of GPU Architecture definitions
inline int ConvertSMVer2Cores(int major, int minor)
{
// Defines for GPU Architecture types (using the SM version to determine the # of cores per SM
typedef struct {
int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version
int Cores;
} sSMtoCores;
sSMtoCores nGpuArchCoresPerSM[] =
{ { 0x10, 8 }, // Tesla Generation (SM 1.0) G80 class
{ 0x11, 8 }, // Tesla Generation (SM 1.1) G8x class
{ 0x12, 8 }, // Tesla Generation (SM 1.2) G9x class
{ 0x13, 8 }, // Tesla Generation (SM 1.3) GT200 class
{ 0x20, 32 }, // Fermi Generation (SM 2.0) GF100 class
{ 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
{ 0x30, 192}, // Fermi Generation (SM 3.0) GK10x class
{ -1, -1 }
};
int index = 0;
while (nGpuArchCoresPerSM[index].SM != -1) {
if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) {
return nGpuArchCoresPerSM[index].Cores;
}
index++;
}
printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
return -1;
}
// end of GPU Architecture definitions
// Defines and enum for use with logging functions
// *********************************************************************
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
#define MASTERLOGFILE "SdkMasterLog.csv"
enum LOGMODES
{
LOGCONSOLE = 1, // bit to signal "log to console"
LOGFILE = 2, // bit to signal "log to file"
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
MASTER = 8, // bit to signal master .csv log output
ERRORMSG = 16, // bit to signal "pre-pend Error"
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
};
#define HDASHLINE "-----------------------------------------------------------\n"
// Standardized boolean
enum shrBOOL
{
shrFALSE = 0,
shrTRUE = 1
};
// Standardized MAX, MIN and CLAMP
#define MAX(a, b) ((a > b) ? a : b)
#define MIN(a, b) ((a < b) ? a : b)
#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a
#define TOPCLAMP(a, b) (a < b ? a:b) // single top side clip of input a
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
// Standardized Exit Macro for leaving main()... extended version
// (Companion Inline Function lower on page)
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
// Standardized Exit Macro for leaving main()... short version
// (Companion Inline Function lower on page)
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
// Simple argument checker macro
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
// Define for user-customized error handling
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
// Function to deallocate memory allocated within shrUtils
// *********************************************************************
extern "C" void shrFree(void* ptr);
// *********************************************************************
// Helper function to log standardized information to Console, to File or to both
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
//!
//! Automatically opens file and stores handle if needed and not done yet
//! Closes file and nulls handle on request
//!
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
//! @param 2 dValue:
//! Positive val = double value for time in secs to be formatted to 6 decimals.
//! Negative val is an error code and this give error preformatting.
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
//! Single byte char type specifiers (%s and %c) ARE supported
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
// *********************************************************************
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
// *********************************************************************
extern "C" int shrLog(const char* cFormatString, ...);
// *********************************************************************
// Delta timer function for up to 3 independent timers using host high performance counters
// Maintains state for 3 independent counters
//! Example: double dElapsedTime = shrDeltaTime(0);
//!
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
// *********************************************************************
extern "C" double shrDeltaT(int iCounterID);
// Optional LogFileNameOverride function
// *********************************************************************
extern "C" void shrSetLogFileName (const char* cOverRideName);
// Helper function to init data arrays
// *********************************************************************
extern "C" void shrFillArray(float* pfData, int iSize);
// Helper function to print data arrays
// *********************************************************************
extern "C" void shrPrintArray(float* pfData, int iSize);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param OutData handle to the data read
//! @param w width of the image
//! @param h height of the image
//!
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are seperated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag-name is given
//! @return shrTRUE if command line argument \a flag_name has been given,
//! otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type unsigned int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
const char* arg_name, unsigned int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list those element are strings
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned integer arrays, with epsilon and threshold
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integers with a tolernance for # of byte errors
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays witha n epsilon tolerance for equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality and a
//! threshold for # pixel errors
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two PPM image files with an epsilon tolerance for equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param data filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
////////////////////////////////////////////////////////////////////////////////
//! Compare two PGM image files with an epsilon tolerance for equality
//! @return shrTRUEif \a reference and \a data are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param data filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
extern "C" size_t shrRoundUp(int group_size, int global_size);
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
// *********************************************************************
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
if (iReference != iSample)
{
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
if (pCleanup != NULL)
{
pCleanup(EXIT_FAILURE);
}
else
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
exit(EXIT_FAILURE);
}
}
}
// Standardized Exit
// *********************************************************************
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
{
#ifdef WIN32
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
#else
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
#endif
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
getchar();
}
else
{
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
}
fflush(stderr);
exit(iExitCode);
}
#endif