mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
position-independent kernel fix
This commit is contained in:
parent
2488e4736c
commit
15dc9afe93
20 changed files with 497 additions and 720 deletions
|
@ -20,8 +20,6 @@ REPOSITORY=https://github.com/vortexgpgpu/vortex-toolchain-prebuilt/raw/master
|
|||
TOOLDIR=${TOOLDIR:=@TOOLDIR@}
|
||||
OSDIR=${OSDIR:=@OSDIR@}
|
||||
|
||||
OS="${OS:=ubuntu/bionic}"
|
||||
|
||||
riscv32()
|
||||
{
|
||||
case $OSDIR in
|
||||
|
|
|
@ -137,32 +137,6 @@ int dcr_initialize(vx_device_h hdevice) {
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr) {
|
||||
std::ifstream ifs(filename);
|
||||
if (!ifs) {
|
||||
std::cout << "error: " << filename << " not found" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// read file content
|
||||
ifs.seekg(0, ifs.end);
|
||||
auto size = ifs.tellg();
|
||||
std::vector<char> content(size);
|
||||
ifs.seekg(0, ifs.beg);
|
||||
ifs.read(content.data(), size);
|
||||
|
||||
uint64_t _addr = STARTUP_ADDR;
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(hdevice, _addr, content.data(), size), {
|
||||
vx_mem_free(hdevice, _addr);
|
||||
return _ret;
|
||||
});
|
||||
|
||||
*addr = _addr;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr) {
|
||||
if (NULL == content || 0 == size || NULL == addr)
|
||||
return -1;
|
||||
|
|
|
@ -95,9 +95,6 @@ int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value);
|
|||
|
||||
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
|
||||
|
||||
// upload kernel file to device
|
||||
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr);
|
||||
|
||||
// upload bytes to device
|
||||
int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr);
|
||||
|
||||
|
|
|
@ -7,11 +7,11 @@ XRT_DEVICE_INDEX ?= 0
|
|||
|
||||
ifeq ($(XLEN),64)
|
||||
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
K_CFLAGS += -march=rv64imafd -mabi=ilp64d
|
||||
VX_CFLAGS += -march=rv64imafd -mabi=ilp64d
|
||||
STARTUP_ADDR ?= 0x180000000
|
||||
else
|
||||
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
STARTUP_ADDR ?= 0x80000000
|
||||
endif
|
||||
|
||||
|
@ -22,13 +22,13 @@ LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
|
|||
|
||||
LIBC_LIB += -L$(LIBC_VORTEX)/lib -lm -lc -lgcc
|
||||
|
||||
K_CFLAGS += -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
|
||||
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections
|
||||
#K_CFLAGS += -mllvm -vortex-branch-divergence=0
|
||||
#K_CFLAGS += -mllvm -print-after-all
|
||||
K_CFLAGS += -mllvm -disable-loop-idiom-all # disable memset/memcpy loop idiom
|
||||
K_CFLAGS += -I$(ROOT_DIR)/hw -I$(VORTEX_KN_PATH)/include -DXLEN_$(XLEN) -DNDEBUG
|
||||
K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(ROOT_DIR)/kernel/libvortexrt.a $(LIBC_LIB)
|
||||
VX_CFLAGS += -O3 -mcmodel=medany --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
|
||||
VX_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections
|
||||
#VX_CFLAGS += -mllvm -vortex-branch-divergence=0
|
||||
#VX_CFLAGS += -mllvm -print-after-all
|
||||
VX_CFLAGS += -mllvm -disable-loop-idiom-all # disable memset/memcpy loop idiom
|
||||
VX_CFLAGS += -I$(ROOT_DIR)/hw -I$(VORTEX_KN_PATH)/include -DXLEN_$(XLEN) -DNDEBUG
|
||||
VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(ROOT_DIR)/kernel/libvortexrt.a $(LIBC_LIB)
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
|
||||
|
@ -64,7 +64,7 @@ kernel.cl: $(SRC_DIR)/kernel.cl
|
|||
cp $(SRC_DIR)/kernel.cl $@
|
||||
|
||||
kernel.pocl: $(SRC_DIR)/kernel.cl
|
||||
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_KERNEL_CACHE=0 POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl $^
|
||||
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_KERNEL_CACHE=0 POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl $^
|
||||
|
||||
%.cc.o: $(SRC_DIR)/%.cc
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
|
|
@ -6,6 +6,8 @@
|
|||
#include <vector>
|
||||
#include "common.h"
|
||||
|
||||
#define NONCE 0xdeadbeef
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
do { \
|
||||
int _ret = _expr; \
|
||||
|
@ -23,7 +25,6 @@ int test = -1;
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -68,63 +69,46 @@ void cleanup() {
|
|||
}
|
||||
}
|
||||
|
||||
uint64_t shuffle(int i, uint64_t value) {
|
||||
inline uint32_t shuffle(int i, uint32_t value) {
|
||||
return (value << i) | (value & ((1 << i)-1));;
|
||||
}
|
||||
|
||||
int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
|
||||
int errors = 0;
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
int run_memcopy_test(const kernel_arg_t& kernel_arg) {
|
||||
uint32_t num_points = kernel_arg.count;
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
int num_blocks_8 = (64 * num_blocks) / 8;
|
||||
std::vector<uint32_t> h_src(num_points);
|
||||
std::vector<uint32_t> h_dst(num_points);
|
||||
|
||||
// update source buffer
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
((uint64_t*)staging_buf.data())[i] = shuffle(i, value);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src[i] = shuffle(i, NONCE);
|
||||
}
|
||||
|
||||
/*for (int i = 0; i < num_blocks; ++i) {
|
||||
std::cout << "data[" << i << "]=0x";
|
||||
for (int j = 7; j >= 0; --j) {
|
||||
std::cout << std::hex << ((uint64_t*)staging_buf.data())[i * 8 +j];
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}*/
|
||||
|
||||
// write source buffer to local memory
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// upload source buffer
|
||||
std::cout << "write source buffer to local memory" << std::endl;
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(device, dev_addr, staging_buf.data(), 64 * num_blocks));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, h_src.data(), buf_size));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear destination buffer
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
((uint64_t*)staging_buf.data())[i] = 0;
|
||||
}
|
||||
|
||||
// read destination buffer from local memory
|
||||
// download destination buffer
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), dev_addr, 64 * num_blocks));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// verify result
|
||||
int errors = 0;
|
||||
std::cout << "verify result" << std::endl;
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
auto curr = ((uint64_t*)staging_buf.data())[i];
|
||||
auto ref = shuffle(i, value);
|
||||
if (curr != ref) {
|
||||
std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i)
|
||||
<< ": actual 0x" << curr << ", expected 0x" << ref << std::endl;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto cur = h_dst[i];
|
||||
auto ref = shuffle(i, NONCE);
|
||||
if (cur != ref) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", i, ref, cur);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
|
@ -137,37 +121,35 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
|
|||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Total elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
return 0;
|
||||
return errors;
|
||||
}
|
||||
|
||||
int run_kernel_test(const kernel_arg_t& kernel_arg,
|
||||
uint32_t buf_size,
|
||||
uint32_t num_points) {
|
||||
int errors = 0;
|
||||
int run_kernel_test(const kernel_arg_t& kernel_arg) {
|
||||
uint32_t num_points = kernel_arg.count;
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::vector<uint32_t> h_src(num_points);
|
||||
std::vector<uint32_t> h_dst(num_points);
|
||||
|
||||
// update source buffer
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src[i] = shuffle(i, NONCE);
|
||||
}
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// update source buffer
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i;
|
||||
}
|
||||
}
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
// upload source buffer
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// start device
|
||||
std::cout << "start execution" << std::endl;
|
||||
|
@ -176,29 +158,22 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
|
|||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// read destination buffer from local memory
|
||||
// download destination buffer
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t4 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
auto t5 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
|
||||
// verify result
|
||||
// verify result
|
||||
int errors = 0;
|
||||
std::cout << "verify result" << std::endl;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int32_t curr = ((int32_t*)staging_buf.data())[i];
|
||||
int32_t ref = i;
|
||||
if (curr != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << curr << ", expected 0x" << ref << std::endl;
|
||||
auto cur = h_dst[i];
|
||||
auto ref = shuffle(i, NONCE);
|
||||
if (cur != ref) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", i, ref, cur);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
|
@ -213,7 +188,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
|
|||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Total elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
return 0;
|
||||
return errors;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
@ -232,8 +207,7 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
|
||||
uint32_t num_points = count * num_cores;
|
||||
uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64;
|
||||
uint32_t buf_size = num_blocks * 64;
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
@ -248,32 +222,28 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
staging_buf.resize(buf_size);
|
||||
int errors = 0;
|
||||
|
||||
// run tests
|
||||
if (0 == test || -1 == test) {
|
||||
std::cout << "run memcopy test" << std::endl;
|
||||
RT_CHECK(run_memcopy_test(kernel_arg.src_addr, 0x0badf00d40ff40ff, num_blocks));
|
||||
errors = run_memcopy_test(kernel_arg);
|
||||
}
|
||||
|
||||
if (1 == test || -1 == test) {
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
std::cout << "run kernel test" << std::endl;
|
||||
RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));
|
||||
errors = run_kernel_test(kernel_arg);
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "Test PASSED" << std::endl;
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ VX_CP = $(LLVM_VORTEX)/bin/llvm-objcopy
|
|||
#VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
|
||||
#VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
|
||||
|
||||
VX_CFLAGS += -O3 -std=c++11 -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections
|
||||
VX_CFLAGS += -O3 -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections
|
||||
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(ROOT_DIR)/hw
|
||||
VX_CFLAGS += -DXLEN_$(XLEN)
|
||||
VX_CFLAGS += -DNDEBUG
|
||||
|
|
|
@ -157,6 +157,10 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "matrix size: " << size << "x" << size << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_points;
|
||||
kernel_arg.width = size;
|
||||
kernel_arg.log2_width = log2(size);
|
||||
|
||||
uint32_t o_points = size * size;
|
||||
uint32_t i_points = (size+2) * (size+2);
|
||||
uint32_t w_points = 3 * 3;
|
||||
|
@ -184,10 +188,6 @@ int main(int argc, char *argv[]) {
|
|||
kernel_arg.lmem_addr = 0;
|
||||
}
|
||||
|
||||
kernel_arg.num_tasks = num_points;
|
||||
kernel_arg.width = size;
|
||||
kernel_arg.log2_width = log2(size);
|
||||
|
||||
std::cout << "dev_argI=0x" << std::hex << kernel_arg.I_addr << std::endl;
|
||||
std::cout << "dev_argW=0x" << std::hex << kernel_arg.W_addr << std::endl;
|
||||
std::cout << "dev_argO=0x" << std::hex << kernel_arg.O_addr << std::endl;
|
||||
|
@ -223,7 +223,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
|
@ -248,12 +248,12 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_copy_from_dev(device, h_O.data(), kernel_arg.O_addr, o_nbytes));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
{
|
||||
std::vector<TYPE> h_ref(o_points);
|
||||
convolution_cpu(h_ref.data(), h_I.data(), h_W.data(), size, size);
|
||||
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < h_ref.size(); ++i) {
|
||||
auto ref = h_ref[i];
|
||||
auto cur = h_O[i];
|
||||
|
@ -261,16 +261,17 @@ int main(int argc, char *argv[]) {
|
|||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
|
|
|
@ -73,8 +73,6 @@ const char* kernel_file = "kernel.bin";
|
|||
uint32_t count = 16;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<TYPE> source_data;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -140,52 +138,42 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
staging_buf.resize(buf_size);
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<TYPE> h_src0(num_points);
|
||||
std::vector<TYPE> h_src1(num_points);
|
||||
std::vector<TYPE> h_dst(num_points);
|
||||
|
||||
// generate source data
|
||||
source_data.resize(2 * num_points);
|
||||
for (uint32_t i = 0; i < source_data.size(); ++i) {
|
||||
source_data[i] = Comparator<TYPE>::generate();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src0[i] = Comparator<TYPE>::generate();
|
||||
h_src1[i] = Comparator<TYPE>::generate();
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = source_data[2 * i + 0];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = source_data[2 * i + 1];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
|
@ -201,30 +189,28 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto ref = h_src0[i] + h_src1[i];
|
||||
auto cur = h_dst[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
|
|
|
@ -21,11 +21,7 @@
|
|||
const char* kernel_file = "kernel.bin";
|
||||
uint32_t count = 0;
|
||||
|
||||
std::vector<int> src_data;
|
||||
std::vector<int> ref_data;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -67,20 +63,18 @@ void cleanup() {
|
|||
}
|
||||
}
|
||||
|
||||
void gen_input_data(uint32_t num_points) {
|
||||
src_data.resize(num_points);
|
||||
|
||||
for (uint32_t i = 0; i < src_data.size(); ++i) {
|
||||
void gen_src_data(std::vector<int>& src_data, uint32_t size) {
|
||||
src_data.resize(size);
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
int value = std::rand();
|
||||
src_data[i] = value;
|
||||
//std::cout << std::dec << i << ": value=0x" << std::hex << value << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void gen_ref_data(uint32_t num_points) {
|
||||
ref_data.resize(num_points);
|
||||
|
||||
for (int i = 0; i < (int)ref_data.size(); ++i) {
|
||||
void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data, uint32_t size) {
|
||||
ref_data.resize(size);
|
||||
for (int i = 0; i < (int)size; ++i) {
|
||||
int value = src_data.at(i);
|
||||
|
||||
// none taken
|
||||
|
@ -132,7 +126,7 @@ void gen_ref_data(uint32_t num_points) {
|
|||
value *= 5;
|
||||
break;
|
||||
default:
|
||||
assert(i < (int)num_points);
|
||||
assert(i < (int)size);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -161,61 +155,46 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint32_t num_points = count;
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
// generate input data
|
||||
gen_input_data(num_points);
|
||||
|
||||
// generate reference data
|
||||
gen_ref_data(num_points);
|
||||
|
||||
uint32_t src_buf_size = src_data.size() * sizeof(int32_t);
|
||||
uint32_t dst_buf_size = ref_data.size() * sizeof(int32_t);
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size, dst_buf_size);
|
||||
staging_buf.resize(staging_buf_size);
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<int32_t> h_src;
|
||||
std::vector<int32_t> h_dst(num_points);
|
||||
gen_src_data(h_src, num_points);
|
||||
|
||||
// upload source buffer
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
// upload source buffer
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto buf_ptr = staging_buf.data();
|
||||
memcpy(buf_ptr, src_data.data(), num_points * sizeof(int32_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
|
||||
}
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
|
@ -226,32 +205,35 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
std::vector<int32_t> h_ref;
|
||||
gen_ref_data(h_ref, h_src, num_points);
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = ref_data.at(i);
|
||||
int cur = buf_ptr[i];
|
||||
int ref = h_ref[i];
|
||||
int cur = h_dst[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
|
|
|
@ -116,6 +116,9 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
|
||||
|
@ -123,9 +126,6 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
|
||||
|
@ -144,7 +144,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload program
|
||||
std::cout << "upload kernel" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// execute tests
|
||||
for (int t = testid_s; t <= testid_e; ++t) {
|
||||
|
|
|
@ -21,7 +21,6 @@ const char* kernel_file = "kernel.bin";
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -88,56 +87,42 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
staging_buf.resize(buf_size);
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<int32_t> h_src0(num_points);
|
||||
std::vector<int32_t> h_src1(num_points);
|
||||
std::vector<int32_t> h_dst(num_points);
|
||||
|
||||
// generate source data
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src0[i] = i-1;
|
||||
h_src1[i] = i+1;
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i-1;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i+1;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
|
@ -153,32 +138,30 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = i + i;
|
||||
int cur = buf_ptr[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = i + i;
|
||||
int cur = h_dst[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
|
|
|
@ -25,13 +25,8 @@ uint32_t count = 0;
|
|||
|
||||
static uint64_t io_base_addr = IO_CSR_ADDR + IO_CSR_SIZE;
|
||||
|
||||
uint64_t usr_test_mem;
|
||||
|
||||
std::vector<uint64_t> src_addrs;
|
||||
std::vector<int32_t> ref_data;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t usr_test_mem;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -74,11 +69,10 @@ void cleanup() {
|
|||
}
|
||||
}
|
||||
|
||||
void gen_src_addrs(uint32_t num_points) {
|
||||
src_addrs.resize(num_points);
|
||||
|
||||
void gen_src_addrs(std::vector<uint64_t>& src_addrs, uint32_t size) {
|
||||
src_addrs.resize(size);
|
||||
uint32_t u = 0, k = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (0 ==(i % 4)) {
|
||||
k = (i + u) % NUM_ADDRS;
|
||||
++u;
|
||||
|
@ -90,18 +84,15 @@ void gen_src_addrs(uint32_t num_points) {
|
|||
}
|
||||
}
|
||||
|
||||
// Builds the host-side reference (expected) values used when verifying the
// kernel output: element i is set to (i % NUM_ADDRS) squared.
//
// NOTE(review): this span is a rendered diff hunk, so two revisions of the
// signature, the resize call and the loop header appear back-to-back. The
// first revision takes only an element count and resizes a `ref_data` that is
// not a parameter (presumably the file-scope vector declared near the top of
// this test) -- confirm against the pre-commit file.
void gen_ref_data(uint32_t num_points) {
|
||||
ref_data.resize(num_points);
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
// Second revision (appears to be the post-commit form): the destination
// vector is passed in by reference and `size` is the number of entries to
// produce, so the function no longer depends on a global.
void gen_ref_data(std::vector<int32_t>& ref_data, uint32_t size) {
|
||||
ref_data.resize(size);
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
// j wraps the index into the range [0, NUM_ADDRS).
int32_t j = i % NUM_ADDRS;
|
||||
// Expected value is the square of the wrapped index.
ref_data[i] = j * j;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
uint64_t value;
|
||||
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
|
@ -115,71 +106,58 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint32_t num_points = count;
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
RT_CHECK(vx_mem_alloc(device, NUM_ADDRS * sizeof(int32_t), &usr_test_mem));
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
|
||||
// generate input data
|
||||
gen_src_addrs(num_points);
|
||||
|
||||
// generate reference data
|
||||
gen_ref_data(num_points);
|
||||
|
||||
uint32_t src_buf_size = num_points * sizeof(uint64_t);
|
||||
uint32_t src_buf_size = NUM_ADDRS * sizeof(int32_t);
|
||||
uint32_t addr_buf_size = num_points * sizeof(uint64_t);
|
||||
uint32_t dst_buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << std::dec << num_points << std::endl;
|
||||
std::cout << "usr buffer size: " << src_buf_size << " bytes" << std::endl;
|
||||
std::cout << "addr buffer size: " << addr_buf_size << " bytes" << std::endl;
|
||||
std::cout << "dst buffer size: " << dst_buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
|
||||
kernel_arg.src_addr = value;
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
|
||||
kernel_arg.dst_addr = value;
|
||||
kernel_arg.num_points = num_points;
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, &usr_test_mem));
|
||||
RT_CHECK(vx_mem_alloc(device, addr_buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint64_t),
|
||||
std::max<uint32_t>(src_buf_size, dst_buf_size));
|
||||
staging_buf.resize(staging_buf_size);
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<uint64_t> h_addr;
|
||||
std::vector<uint32_t> h_src(NUM_ADDRS);
|
||||
std::vector<int32_t> h_dst(num_points);
|
||||
|
||||
// generate source data
|
||||
gen_src_addrs(h_addr, num_points);
|
||||
for (uint32_t i = 0; i < NUM_ADDRS; ++i) {
|
||||
h_src[i] = i * i;
|
||||
}
|
||||
|
||||
// upload user address data
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, usr_test_mem, h_src.data(), src_buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(device, io_base_addr, h_src.data(), src_buf_size));
|
||||
|
||||
// upload test address data
|
||||
{
|
||||
std::cout << "upload test address data" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < NUM_ADDRS; ++i) {
|
||||
buf_ptr[i] = i * i;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, io_base_addr, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
|
||||
RT_CHECK(vx_copy_to_dev(device, usr_test_mem, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
|
||||
}
|
||||
|
||||
// upload source buffer
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto buf_ptr = (uint64_t*)staging_buf.data();
|
||||
memcpy(buf_ptr, src_addrs.data(), src_buf_size);
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
|
||||
}
|
||||
|
||||
std::cout << "upload address buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_addr.data(), addr_buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
|
@ -195,32 +173,35 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, dst_buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
std::vector<int32_t> h_ref;
|
||||
gen_ref_data(h_ref, num_points);
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = ref_data.at(i);
|
||||
int cur = buf_ptr[i];
|
||||
int ref = h_ref[i];
|
||||
int cur = h_dst[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
|
|
|
@ -73,8 +73,6 @@ const char* kernel_file = "kernel.bin";
|
|||
uint32_t count = 16;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<TYPE> source_data;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -140,9 +138,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
|
@ -150,51 +147,37 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
staging_buf.resize(buf_size);
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<TYPE> h_src0(num_points);
|
||||
std::vector<TYPE> h_src1(num_points);
|
||||
std::vector<TYPE> h_dst(num_points);
|
||||
|
||||
// generate source data
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src0[i] = Comparator<TYPE>::generate();
|
||||
h_src1[i] = Comparator<TYPE>::generate();
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
// generate source data
|
||||
source_data.resize(2 * num_points);
|
||||
for (uint32_t i = 0; i < source_data.size(); ++i) {
|
||||
source_data[i] = Comparator<TYPE>::generate();
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = source_data[2 * i + 0];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = source_data[2 * i + 1];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
|
@ -206,24 +189,16 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto ref = h_src0[i] + h_src1[i];
|
||||
auto cur = h_dst[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -231,6 +206,12 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -68,11 +68,7 @@ inline bool almost_equal(float a, float b) {
|
|||
const char* kernel_file = "kernel.bin";
|
||||
uint32_t count = 0;
|
||||
|
||||
std::vector<float> test_data;
|
||||
std::vector<uint32_t> addr_table;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -115,16 +111,19 @@ void cleanup() {
|
|||
}
|
||||
}
|
||||
|
||||
void gen_input_data(uint32_t num_points) {
|
||||
void gen_src_data(std::vector<float>& test_data,
|
||||
std::vector<uint32_t>& addr_table,
|
||||
uint32_t num_points,
|
||||
uint32_t num_addrs) {
|
||||
test_data.resize(num_points);
|
||||
addr_table.resize(num_points + NUM_LOADS - 1);
|
||||
|
||||
addr_table.resize(num_addrs);
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
float r = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
test_data[i] = r;
|
||||
}
|
||||
|
||||
for (uint32_t i = 0; i < addr_table.size(); ++i) {
|
||||
for (uint32_t i = 0; i < num_addrs; ++i) {
|
||||
float r = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
uint32_t index = static_cast<uint32_t>(r * num_points);
|
||||
assert(index < num_points);
|
||||
|
@ -153,20 +152,19 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t num_addrs = num_points + NUM_LOADS - 1;
|
||||
|
||||
// generate input data
|
||||
gen_input_data(num_points);
|
||||
|
||||
uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t);
|
||||
uint32_t src_buf_size = test_data.size() * sizeof(int32_t);
|
||||
uint32_t dst_buf_size = test_data.size() * sizeof(int32_t);
|
||||
uint32_t addr_buf_size = num_addrs * sizeof(int32_t);
|
||||
uint32_t src_buf_size = num_points * sizeof(int32_t);
|
||||
uint32_t dst_buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
|
||||
std::cout << "addr buffer size: " << addr_buf_size << " bytes" << std::endl;
|
||||
std::cout << "src buffer size: " << src_buf_size << " bytes" << std::endl;
|
||||
std::cout << "dst buffer size: " << dst_buf_size << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.stride = count;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
|
@ -174,48 +172,32 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.stride = count;
|
||||
|
||||
std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
|
||||
std::max<uint32_t>(addr_buf_size, dst_buf_size));
|
||||
staging_buf.resize(staging_buf_size);
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<uint32_t> h_addr;
|
||||
std::vector<float> h_src;
|
||||
std::vector<float> h_dst(num_points);
|
||||
gen_src_data(h_src, h_addr, num_points, num_addrs);
|
||||
|
||||
// upload source buffer0
|
||||
std::cout << "upload address buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_addr.data(), addr_buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src.data(), src_buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload address buffer" << std::endl;
|
||||
auto buf_ptr = staging_buf.data();
|
||||
memcpy(buf_ptr, addr_table.data(), addr_table.size() * sizeof(int32_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), addr_buf_size));
|
||||
}
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto buf_ptr = staging_buf.data();
|
||||
memcpy(buf_ptr, test_data.data(), test_data.size() * sizeof(int32_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), src_buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < test_data.size(); ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
|
||||
}
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
|
@ -227,37 +209,26 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, dst_buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (float*)staging_buf.data();
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
|
||||
float ref = 0.0f;
|
||||
for (uint32_t j = 0; j < NUM_LOADS; ++j) {
|
||||
uint32_t addr = i + j;
|
||||
uint32_t index = addr_table.at(addr);
|
||||
float value = test_data.at(index);
|
||||
//printf("*** [%d] addr=%d, index=%d, value=%f\n", i, addr, index, value);
|
||||
ref *= value;
|
||||
}
|
||||
|
||||
float cur = buf_ptr[i];
|
||||
if (!almost_equal(cur, ref)) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< ": actual " << cur << ", expected " << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
float ref = 0.0f;
|
||||
for (uint32_t j = 0; j < NUM_LOADS; ++j) {
|
||||
uint32_t addr = i + j;
|
||||
uint32_t index = h_addr[addr];
|
||||
float value = h_src[index];
|
||||
//printf("*** [%d] addr=%d, index=%d, value=%f\n", i, addr, index, value);
|
||||
ref *= value;
|
||||
}
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
|
||||
float cur = h_dst[i];
|
||||
if (!almost_equal(cur, ref)) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< ": actual " << cur << ", expected " << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -265,6 +236,12 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -21,7 +21,6 @@ const char* kernel_file = "kernel.bin";
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -76,52 +75,42 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint32_t num_points = count;
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
kernel_arg.size = num_points;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.size = num_points;
|
||||
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
staging_buf.resize(buf_size);
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<int32_t> h_src(num_points);
|
||||
std::vector<int32_t> h_dst(num_points);
|
||||
|
||||
// generate source data
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src[i] = i-1;
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i-1;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
|
@ -133,32 +122,30 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = i-1;
|
||||
int cur = buf_ptr[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = i-1;
|
||||
int cur = h_dst[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
|
||||
// Per-task kernel entry: reads one element of the source buffer at index
// task_id, offsets the character 'A' by it, and prints the core id, task id
// and resulting character through vx_printf.
//
// task_id : index of the element this invocation reads from the source buffer.
// arg     : kernel arguments; only arg->src_addr is used here.
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
int cid = vx_core_id();
|
||||
// NOTE(review): this span is a rendered diff hunk, so two revisions of the
// src_ptr declaration appear back-to-back (int* first, then char*). Only one
// can exist in a compilable file; the char* form appears to be the post-commit
// one -- confirm against the committed kernel source.
int* src_ptr = (int*)arg->src_addr;
|
||||
char* src_ptr = (char*)arg->src_addr;
|
||||
// The sum is stored in a char, so it is narrowed to a single character
// before being printed with %c.
char value = 'A' + src_ptr[task_id];
|
||||
vx_printf("cid=%d: task=%d, value=%c\n", cid, task_id, value);
|
||||
}
|
||||
|
|
|
@ -21,7 +21,6 @@ const char* kernel_file = "kernel.bin";
|
|||
uint32_t count = 4;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -74,45 +73,46 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_warps, num_threads;
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
uint32_t num_points = count;
|
||||
uint32_t buf_size = count * sizeof(int32_t);
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t buf_size = num_points * sizeof(char);
|
||||
|
||||
std::cout << "number of points: " << count << std::endl;
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
staging_buf.resize(buf_size);
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<char> h_src(num_points);
|
||||
|
||||
// generate input data
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src[i] = (char)i;
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
|
@ -122,8 +122,6 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
return 0;
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
|
|
@ -143,16 +143,16 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "matrix size: " << size << "x" << size << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_points;
|
||||
kernel_arg.size = size;
|
||||
kernel_arg.log2_size = log2(size);
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.A_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.B_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.C_addr));
|
||||
|
||||
kernel_arg.num_tasks = num_points;
|
||||
kernel_arg.size = size;
|
||||
kernel_arg.log2_size = log2(size);
|
||||
|
||||
std::cout << "dev_argA=0x" << std::hex << kernel_arg.A_addr << std::endl;
|
||||
std::cout << "dev_argB=0x" << std::hex << kernel_arg.B_addr << std::endl;
|
||||
std::cout << "dev_argC=0x" << std::hex << kernel_arg.C_addr << std::endl;
|
||||
|
@ -182,7 +182,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
|
@ -207,27 +207,28 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_copy_from_dev(device, h_C.data(), kernel_arg.C_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
{
|
||||
std::vector<TYPE> h_ref(num_points);
|
||||
matmul_cpu(h_ref.data(), h_A.data(), h_B.data(), size, size);
|
||||
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < h_ref.size(); ++i) {
|
||||
if (!Comparator<TYPE>::compare(h_C[i], h_ref[i], i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
|
|
|
@ -20,11 +20,7 @@
|
|||
const char* kernel_file = "kernel.bin";
|
||||
uint32_t count = 0;
|
||||
|
||||
std::vector<TYPE> src_data;
|
||||
std::vector<TYPE> ref_data;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -66,24 +62,22 @@ void cleanup() {
|
|||
}
|
||||
}
|
||||
|
||||
void gen_input_data(uint32_t num_points) {
|
||||
src_data.resize(num_points);
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
void gen_src_data(std::vector<TYPE>& src_data, uint32_t size) {
|
||||
src_data.resize(size);
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
auto r = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
auto value = static_cast<TYPE>(r * num_points);
|
||||
auto value = static_cast<TYPE>(r * size);
|
||||
src_data[i] = value;
|
||||
std::cout << std::dec << i << ": value=" << value << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void gen_ref_data(uint32_t num_points) {
|
||||
ref_data.resize(num_points);
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
void gen_ref_data(std::vector<TYPE>& ref_data, const std::vector<TYPE>& src_data, uint32_t size) {
|
||||
ref_data.resize(size);
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
TYPE ref_value = src_data.at(i);
|
||||
uint32_t pos = 0;
|
||||
for (uint32_t j = 0; j < num_points; ++j) {
|
||||
for (uint32_t j = 0; j < size; ++j) {
|
||||
TYPE cur_value = src_data.at(j);
|
||||
pos += (cur_value < ref_value) || (cur_value == ref_value && j < i);
|
||||
}
|
||||
|
@ -105,58 +99,41 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint32_t num_points = count;
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
// generate input data
|
||||
gen_input_data(num_points);
|
||||
|
||||
// generate reference data
|
||||
gen_ref_data(num_points);
|
||||
|
||||
uint32_t src_buf_size = src_data.size() * sizeof(int32_t);
|
||||
uint32_t dst_buf_size = ref_data.size() * sizeof(int32_t);
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
{
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size, dst_buf_size);
|
||||
staging_buf.resize(staging_buf_size);
|
||||
}
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<TYPE> h_src;
|
||||
std::vector<TYPE> h_dst(num_points);
|
||||
gen_src_data(h_src, num_points);
|
||||
|
||||
// upload source buffer
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto buf_ptr = staging_buf.data();
|
||||
memcpy(buf_ptr, src_data.data(), num_points * sizeof(TYPE));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
|
||||
}
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
|
@ -172,33 +149,36 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
std::vector<TYPE> h_ref;
|
||||
gen_ref_data(h_ref, h_src, num_points);
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
TYPE ref = ref_data.at(i);
|
||||
TYPE cur = buf_ptr[i];
|
||||
TYPE ref = h_ref[i];
|
||||
TYPE cur = h_dst[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -73,8 +73,6 @@ const char* kernel_file = "kernel.bin";
|
|||
uint32_t size = 16;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<TYPE> source_data;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
@ -127,14 +125,6 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
std::cout << "number of cores: " << num_cores << std::endl;
|
||||
std::cout << "number of warps: " << num_warps << std::endl;
|
||||
std::cout << "number of threads: " << num_threads << std::endl;
|
||||
|
||||
uint32_t num_points = size;
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
|
||||
|
@ -142,51 +132,44 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
staging_buf.resize(buf_size);
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<TYPE> h_src0(num_points);
|
||||
std::vector<TYPE> h_src1(num_points);
|
||||
std::vector<TYPE> h_dst(num_points);
|
||||
|
||||
// generate source data
|
||||
source_data.resize(2 * num_points);
|
||||
for (uint32_t i = 0; i < source_data.size(); ++i) {
|
||||
source_data[i] = Comparator<TYPE>::generate();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src0[i] = Comparator<TYPE>::generate();
|
||||
h_src1[i] = Comparator<TYPE>::generate();
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = source_data[2 * i + 0];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
|
||||
}
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = source_data[2 * i + 1];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
|
||||
}
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
RT_CHECK(vx_upload_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
|
@ -202,24 +185,16 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto ref = h_src0[i] + h_src1[i];
|
||||
auto cur = h_dst[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -227,6 +202,12 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue