diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index ea5e2ca06..1910f7f19 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -55,6 +55,7 @@ package VX_gpu_pkg; typedef struct packed { logic [`XLEN-1:0] startup_addr; + logic [`XLEN-1:0] startup_arg; logic [7:0] mpm_class; } base_dcrs_t; diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 16339fbce..8d602b57f 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -22,8 +22,10 @@ `define VX_DCR_BASE_STATE_BEGIN 12'h001 `define VX_DCR_BASE_STARTUP_ADDR0 12'h001 `define VX_DCR_BASE_STARTUP_ADDR1 12'h002 -`define VX_DCR_BASE_MPM_CLASS 12'h003 -`define VX_DCR_BASE_STATE_END 12'h004 +`define VX_DCR_BASE_STARTUP_ARG0 12'h003 +`define VX_DCR_BASE_STARTUP_ARG1 12'h004 +`define VX_DCR_BASE_MPM_CLASS 12'h005 +`define VX_DCR_BASE_STATE_END 12'h006 `define VX_DCR_BASE_STATE(addr) ((addr) - `VX_DCR_BASE_STATE_BEGIN) `define VX_DCR_BASE_STATE_COUNT (`VX_DCR_BASE_STATE_END-`VX_DCR_BASE_STATE_BEGIN) diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index 469f75081..9982d2ca4 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -121,6 +121,9 @@ import VX_fpu_pkg::*; `endif always @(posedge clk) begin + if (reset) begin + mscratch <= base_dcrs.startup_arg; + end if (write_enable) begin case (write_addr) `ifdef EXT_F_ENABLE diff --git a/hw/rtl/core/VX_dcr_data.sv b/hw/rtl/core/VX_dcr_data.sv index a83b8b0c7..2cd8cef57 100644 --- a/hw/rtl/core/VX_dcr_data.sv +++ b/hw/rtl/core/VX_dcr_data.sv @@ -35,6 +35,10 @@ module VX_dcr_data import VX_gpu_pkg::*; ( `VX_DCR_BASE_STARTUP_ADDR0 : dcrs.startup_addr[31:0] <= dcr_bus_if.write_data; `ifdef XLEN_64 `VX_DCR_BASE_STARTUP_ADDR1 : dcrs.startup_addr[63:32] <= dcr_bus_if.write_data; + `endif + `VX_DCR_BASE_STARTUP_ARG0 : dcrs.startup_arg[31:0] <= dcr_bus_if.write_data; + `ifdef XLEN_64 + `VX_DCR_BASE_STARTUP_ARG1 : dcrs.startup_arg[63:32] <= dcr_bus_if.write_data; `endif `VX_DCR_BASE_MPM_CLASS : dcrs.mpm_class <= 
dcr_bus_if.write_data[7:0]; default:; diff --git a/hw/rtl/core/VX_trace.vh b/hw/rtl/core/VX_trace.vh index 364f2f9ba..21412878b 100644 --- a/hw/rtl/core/VX_trace.vh +++ b/hw/rtl/core/VX_trace.vh @@ -348,6 +348,8 @@ task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr); case (addr) `VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0")); `VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1")); + `VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0")); + `VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1")); `VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS")); default: `TRACE(level, ("?")); endcase diff --git a/hw/syn/xilinx/test/kernel/main.c b/hw/syn/xilinx/test/kernel/main.c index e448ce354..4fcfd99c0 100644 --- a/hw/syn/xilinx/test/kernel/main.c +++ b/hw/syn/xilinx/test/kernel/main.c @@ -14,8 +14,6 @@ #include #include -#define KERNEL_ARG_DEV_MEM_ADDR 0x40 - typedef struct { uint32_t count; uint32_t src_addr; @@ -23,7 +21,7 @@ typedef struct { } kernel_arg_t; int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); uint32_t count = arg->count; int32_t* src_ptr = (int32_t*)arg->src_addr; int32_t* dst_ptr = (int32_t*)arg->dst_addr; diff --git a/kernel/Makefile b/kernel/Makefile index 16664c8d1..d2e86eb84 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -20,6 +20,7 @@ LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex -mllvm -vortex-branch-div #CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS) #CXX = $(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS) +#AR = $(LLVM_VORTEX)/bin/llvm-ar #DP = $(LLVM_VORTEX)/bin/llvm-objdump #CP = $(LLVM_VORTEX)/bin/llvm-objcopy @@ -29,7 +30,7 @@ AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy -CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections +CFLAGS += -O3 
-mcmodel=medany -fno-exceptions -fdata-sections -ffunction-sections CFLAGS += -I$(INC_DIR) -I$(ROOT_DIR)/hw CFLAGS += -DXLEN_$(XLEN) diff --git a/kernel/include/vx_intrinsics.h b/kernel/include/vx_intrinsics.h index bddea02c2..c7235a40c 100644 --- a/kernel/include/vx_intrinsics.h +++ b/kernel/include/vx_intrinsics.h @@ -15,7 +15,6 @@ #define __VX_INTRINSICS_H__ #include -#include #include #if defined(__clang__) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 5c8657be5..82af2df86 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -20,8 +20,6 @@ extern "C" { #endif -#define NUM_CORES_MAX 1024 - #ifndef MIN #define MIN(a, b) ((a) < (b) ? (a) : (b)) #endif @@ -99,11 +97,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { int NC = vx_num_cores(); int NW = vx_num_warps(); int NT = vx_num_threads(); - - // current core id int core_id = vx_core_id(); - if (core_id >= NUM_CORES_MAX) - return; // calculate necessary active cores int WT = NW * NT; @@ -258,11 +252,7 @@ void vx_spawn_pocl_kernel(pocl_kernel_context_t * ctx, pocl_kernel_cb callback, int NC = vx_num_cores(); int NW = vx_num_warps(); int NT = vx_num_threads(); - - // current core id - int core_id = vx_core_id(); - if (core_id >= NUM_CORES_MAX) - return; + int core_id = vx_core_id(); // calculate necessary active cores int WT = NW * NT; diff --git a/miscs/musl_libc.patch b/miscs/musl_libc.patch new file mode 100644 index 000000000..3925fc5ea --- /dev/null +++ b/miscs/musl_libc.patch @@ -0,0 +1,83 @@ +diff --git a/src/setjmp/riscv32/longjmp.S b/src/setjmp/riscv32/longjmp.S +index f9cb3318..0980ea23 100644 +--- a/src/setjmp/riscv32/longjmp.S ++++ b/src/setjmp/riscv32/longjmp.S +@@ -23,6 +23,20 @@ longjmp: + lw ra, 52(a0) + + #ifndef __riscv_float_abi_soft ++#ifdef __riscv_float_abi_single ++ flw fs0, 56(a0) ++ flw fs1, 64(a0) ++ flw fs2, 72(a0) ++ flw fs3, 80(a0) ++ flw fs4, 88(a0) ++ flw fs5, 96(a0) ++ flw fs6, 104(a0) ++ flw fs7, 112(a0) ++ flw fs8, 
120(a0) ++ flw fs9, 128(a0) ++ flw fs10, 136(a0) ++ flw fs11, 144(a0) ++#else + fld fs0, 56(a0) + fld fs1, 64(a0) + fld fs2, 72(a0) +@@ -35,6 +49,7 @@ longjmp: + fld fs9, 128(a0) + fld fs10, 136(a0) + fld fs11, 144(a0) ++#endif + #endif + + seqz a0, a1 +diff --git a/src/setjmp/riscv32/setjmp.S b/src/setjmp/riscv32/setjmp.S +index 8a75cf55..7efb10e0 100644 +--- a/src/setjmp/riscv32/setjmp.S ++++ b/src/setjmp/riscv32/setjmp.S +@@ -23,18 +23,33 @@ setjmp: + sw ra, 52(a0) + + #ifndef __riscv_float_abi_soft +- fsd fs0, 56(a0) +- fsd fs1, 64(a0) +- fsd fs2, 72(a0) +- fsd fs3, 80(a0) +- fsd fs4, 88(a0) +- fsd fs5, 96(a0) +- fsd fs6, 104(a0) +- fsd fs7, 112(a0) +- fsd fs8, 120(a0) +- fsd fs9, 128(a0) +- fsd fs10, 136(a0) +- fsd fs11, 144(a0) ++#ifdef __riscv_float_abi_single ++ fsw fs0, 56(a0) ++ fsw fs1, 64(a0) ++ fsw fs2, 72(a0) ++ fsw fs3, 80(a0) ++ fsw fs4, 88(a0) ++ fsw fs5, 96(a0) ++ fsw fs6, 104(a0) ++ fsw fs7, 112(a0) ++ fsw fs8, 120(a0) ++ fsw fs9, 128(a0) ++ fsw fs10, 136(a0) ++ fsw fs11, 144(a0) ++#else ++ fsd fs0, 56(a0) ++ fsd fs1, 64(a0) ++ fsd fs2, 72(a0) ++ fsd fs3, 80(a0) ++ fsd fs4, 88(a0) ++ fsd fs5, 96(a0) ++ fsd fs6, 104(a0) ++ fsd fs7, 112(a0) ++ fsd fs8, 120(a0) ++ fsd fs9, 128(a0) ++ fsd fs10, 136(a0) ++ fsd fs11, 144(a0) ++#endif + #endif + + li a0, 0 diff --git a/runtime/common/utils.cpp b/runtime/common/utils.cpp index 456164e0c..383f4b301 100644 --- a/runtime/common/utils.cpp +++ b/runtime/common/utils.cpp @@ -96,45 +96,6 @@ void perf_remove_device(vx_device_h hdevice) { /////////////////////////////////////////////////////////////////////////////// -extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size) { - int err = 0; - - if (NULL == content || 0 == size) - return -1; - - uint64_t kernel_base_addr; - err = vx_dev_caps(hdevice, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr); - if (err != 0) - return err; - - return vx_copy_to_dev(hdevice, kernel_base_addr, content, size); -} - -extern int 
vx_upload_kernel_file(vx_device_h hdevice, const char* filename) { - std::ifstream ifs(filename); - if (!ifs) { - std::cout << "error: " << filename << " not found" << std::endl; - return -1; - } - - // read file content - ifs.seekg(0, ifs.end); - auto size = ifs.tellg(); - auto content = new char [size]; - ifs.seekg(0, ifs.beg); - ifs.read(content, size); - - // upload - int err = vx_upload_kernel_bytes(hdevice, content, size); - - // release buffer - delete[] content; - - return err; -} - -/////////////////////////////////////////////////////////////////////////////// - void DeviceConfig::write(uint32_t addr, uint32_t value) { data_[addr] = value; } @@ -146,18 +107,29 @@ uint32_t DeviceConfig::read(uint32_t addr) const { return data_.at(addr); } +/////////////////////////////////////////////////////////////////////////////// + int dcr_initialize(vx_device_h hdevice) { const uint64_t startup_addr(STARTUP_ADDR); + RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), { - return -1; + return _ret; }); RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), { - return -1; + return _ret; + }); + + RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, 0), { + return _ret; + }); + + RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, 0), { + return _ret; }); RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), { - return -1; + return _ret; }); return 0; @@ -165,14 +137,94 @@ int dcr_initialize(vx_device_h hdevice) { /////////////////////////////////////////////////////////////////////////////// -static uint64_t get_csr_64(const void* ptr, int addr) { - int offset = addr - VX_CSR_MPM_BASE; - return ((const uint64_t*)ptr)[offset]; +int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr) { + std::ifstream ifs(filename); + if (!ifs) { + std::cout << "error: " << filename << " not found" << std::endl; + return -1; + } + + // read file content + ifs.seekg(0, ifs.end); + auto size 
= ifs.tellg(); + std::vector content(size); + ifs.seekg(0, ifs.beg); + ifs.read(content.data(), size); + + uint64_t _addr = STARTUP_ADDR; + + RT_CHECK(vx_copy_to_dev(hdevice, _addr, content.data(), size), { + vx_mem_free(hdevice, _addr); + return _ret; + }); + + *addr = _addr; + + return 0; } -extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { - int ret = 0; +extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr) { + if (NULL == content || 0 == size || NULL == addr) + return -1; + uint64_t _addr; + + RT_CHECK(vx_mem_alloc(hdevice, size, &_addr), { + return _ret; + }); + + RT_CHECK(vx_copy_to_dev(hdevice, _addr, content, size), { + vx_mem_free(hdevice, _addr); + return _ret; + }); + + *addr = _addr; + + return 0; +} + +extern int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* addr) { + std::ifstream ifs(filename); + if (!ifs) { + std::cout << "error: " << filename << " not found" << std::endl; + return -1; + } + + // read file content + ifs.seekg(0, ifs.end); + auto size = ifs.tellg(); + std::vector content(size); + ifs.seekg(0, ifs.beg); + ifs.read(content.data(), size); + + // upload buffer + RT_CHECK(vx_upload_bytes(hdevice, content.data(), size, addr), { + return _ret; + }); + + return 0; +} + +extern int vx_set_kernel_args(vx_device_h hdevice, const void* content, uint64_t size) { + if (NULL == content || 0 == size) + return -1; + + uint64_t startup_arg; + RT_CHECK(vx_mem_alloc(hdevice, size, &startup_arg), { + return _ret; + }); + + RT_CHECK(vx_copy_to_dev(hdevice, startup_arg, content, size), { + vx_mem_free(hdevice, startup_arg); + return _ret; + }); + + return 0; +} + +/////////////////////////////////////////////////////////////////////////////// + +extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t total_instrs = 0; uint64_t total_cycles = 0; uint64_t max_cycles = 0; @@ -234,16 +286,15 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { #endif 
uint64_t num_cores; - ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores); - if (ret != 0) - return ret; + RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), { + return _ret; + }); #ifdef PERF_ENABLE uint64_t isa_flags; - ret = vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags); - if (ret != 0) - return ret; - + RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), { + return _ret; + }); bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE; bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE; bool l2cache_enable = isa_flags & VX_ISA_EXT_L2CACHE; @@ -251,16 +302,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { bool lmem_enable = isa_flags & VX_ISA_EXT_LMEM; #endif - std::vector staging_buf(32 * sizeof(uint64_t)); + std::vector staging_buf(32); + + auto get_mpm_csr = [&staging_buf](int csr_addr) { + return staging_buf.at(csr_addr - VX_CSR_MPM_BASE); + }; for (unsigned core_id = 0; core_id < num_cores; ++core_id) { - uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size(); - ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size()); - if (ret != 0) - return ret; + uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size() * sizeof(staging_buf[0]); + RT_CHECK(vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size() * sizeof(staging_buf[0])), { + return _ret; + }); - uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE); - uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET); + uint64_t cycles_per_core = get_mpm_csr(VX_CSR_MCYCLE); + uint64_t instrs_per_core = get_mpm_csr(VX_CSR_MINSTRET); #ifdef PERF_ENABLE switch (perf_class) { @@ -268,7 +323,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { // PERF: pipeline // scheduler idles { - uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID); + uint64_t sched_idles_per_core = get_mpm_csr(VX_CSR_MPM_SCHED_ID); if (num_cores > 1) { int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core); fprintf(stream, "PERF: core%d: 
scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core); @@ -277,7 +332,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } // scheduler stalls { - uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST); + uint64_t sched_stalls_per_core = get_mpm_csr(VX_CSR_MPM_SCHED_ST); if (num_cores > 1) { int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core); fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core); @@ -286,7 +341,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } // ibuffer_stalls { - uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST); + uint64_t ibuffer_stalls_per_core = get_mpm_csr(VX_CSR_MPM_IBUF_ST); if (num_cores > 1) { int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core); fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core); @@ -295,11 +350,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } // issue_stalls { - uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST); - uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU); - uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU); - uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU); - uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU); + uint64_t scrb_stalls_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_ST); + uint64_t scrb_alu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_ALU); + uint64_t scrb_fpu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_FPU); + uint64_t scrb_lsu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_LSU); + uint64_t scrb_sfu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_SFU); scrb_alu += scrb_alu_per_core; scrb_fpu += scrb_fpu_per_core; scrb_lsu += 
scrb_lsu_per_core; @@ -316,9 +371,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } // sfu_stalls { - uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU); - uint64_t scrb_wctl_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_WCTL); - uint64_t scrb_csrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_CSRS); + uint64_t scrb_sfu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_SFU); + uint64_t scrb_wctl_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_WCTL); + uint64_t scrb_csrs_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_CSRS); if (num_cores > 1) { uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core; fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n" @@ -334,11 +389,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { // PERF: memory // ifetches { - uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCHES); + uint64_t ifetches_per_core = get_mpm_csr(VX_CSR_MPM_IFETCHES); if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core); ifetches += ifetches_per_core; - uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT); + uint64_t ifetch_lat_per_core = get_mpm_csr(VX_CSR_MPM_IFETCH_LT); if (num_cores > 1) { int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core); fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat); @@ -347,11 +402,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } // loads { - uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS); + uint64_t loads_per_core = get_mpm_csr(VX_CSR_MPM_LOADS); if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core); loads += loads_per_core; - uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT); + uint64_t load_lat_per_core = get_mpm_csr(VX_CSR_MPM_LOAD_LT); if (num_cores > 1) { int mem_avg_lat = 
caclAverage(load_lat_per_core, loads_per_core); fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat); @@ -360,7 +415,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } // stores { - uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES); + uint64_t stores_per_core = get_mpm_csr(VX_CSR_MPM_STORES); if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core); stores += stores_per_core; } @@ -368,9 +423,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { case VX_DCR_MPM_CLASS_MEM: { if (lmem_enable) { // PERF: lmem - uint64_t lmem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_LMEM_READS); - uint64_t lmem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_LMEM_WRITES); - uint64_t lmem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_LMEM_BANK_ST); + uint64_t lmem_reads = get_mpm_csr(VX_CSR_MPM_LMEM_READS); + uint64_t lmem_writes = get_mpm_csr(VX_CSR_MPM_LMEM_WRITES); + uint64_t lmem_bank_stalls = get_mpm_csr(VX_CSR_MPM_LMEM_BANK_ST); int lmem_bank_utilization = calcAvgPercent(lmem_reads + lmem_writes, lmem_reads + lmem_writes + lmem_bank_stalls); fprintf(stream, "PERF: core%d: lmem reads=%ld\n", core_id, lmem_reads); fprintf(stream, "PERF: core%d: lmem writes=%ld\n", core_id, lmem_writes); @@ -379,9 +434,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (icache_enable) { // PERF: Icache - uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS); - uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R); - uint64_t icache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MSHR_ST); + uint64_t icache_reads = get_mpm_csr(VX_CSR_MPM_ICACHE_READS); + uint64_t icache_read_misses = get_mpm_csr(VX_CSR_MPM_ICACHE_MISS_R); + uint64_t icache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_ICACHE_MSHR_ST); int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads); int 
mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls); fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads); @@ -391,12 +446,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (dcache_enable) { // PERF: Dcache - uint64_t dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS); - uint64_t dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES); - uint64_t dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R); - uint64_t dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W); - uint64_t dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST); - uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST); + uint64_t dcache_reads = get_mpm_csr(VX_CSR_MPM_DCACHE_READS); + uint64_t dcache_writes = get_mpm_csr(VX_CSR_MPM_DCACHE_WRITES); + uint64_t dcache_read_misses = get_mpm_csr(VX_CSR_MPM_DCACHE_MISS_R); + uint64_t dcache_write_misses = get_mpm_csr(VX_CSR_MPM_DCACHE_MISS_W); + uint64_t dcache_bank_stalls = get_mpm_csr(VX_CSR_MPM_DCACHE_BANK_ST); + uint64_t dcache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_DCACHE_MSHR_ST); int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads); int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes); int dcache_bank_utilization = calcAvgPercent(dcache_reads + dcache_writes, dcache_reads + dcache_writes + dcache_bank_stalls); @@ -411,29 +466,29 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { if (l2cache_enable) { // PERF: L2cache - l2cache_reads += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS); - l2cache_writes += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES); - l2cache_read_misses += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R); - l2cache_write_misses += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W); - l2cache_bank_stalls += 
get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST); - l2cache_mshr_stalls += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST); + l2cache_reads += get_mpm_csr(VX_CSR_MPM_L2CACHE_READS); + l2cache_writes += get_mpm_csr(VX_CSR_MPM_L2CACHE_WRITES); + l2cache_read_misses += get_mpm_csr(VX_CSR_MPM_L2CACHE_MISS_R); + l2cache_write_misses += get_mpm_csr(VX_CSR_MPM_L2CACHE_MISS_W); + l2cache_bank_stalls += get_mpm_csr(VX_CSR_MPM_L2CACHE_BANK_ST); + l2cache_mshr_stalls += get_mpm_csr(VX_CSR_MPM_L2CACHE_MSHR_ST); } if (0 == core_id) { if (l3cache_enable) { // PERF: L3cache - l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS); - l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES); - l3cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_R); - l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W); - l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST); - l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST); + l3cache_reads = get_mpm_csr(VX_CSR_MPM_L3CACHE_READS); + l3cache_writes = get_mpm_csr(VX_CSR_MPM_L3CACHE_WRITES); + l3cache_read_misses = get_mpm_csr(VX_CSR_MPM_L3CACHE_MISS_R); + l3cache_write_misses = get_mpm_csr(VX_CSR_MPM_L3CACHE_MISS_W); + l3cache_bank_stalls = get_mpm_csr(VX_CSR_MPM_L3CACHE_BANK_ST); + l3cache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_L3CACHE_MSHR_ST); } // PERF: memory - mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS); - mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES); - mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LT); + mem_reads = get_mpm_csr(VX_CSR_MPM_MEM_READS); + mem_writes = get_mpm_csr(VX_CSR_MPM_MEM_WRITES); + mem_lat = get_mpm_csr(VX_CSR_MPM_MEM_LT); } } break; default: @@ -528,18 +583,18 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { } extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, 
uint64_t* value) { - int ret = 0; uint64_t num_cores; - ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores); - if (ret != 0) - return ret; + + RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), { + return _ret; + }); if (core_id >= (int)num_cores) { std::cout << "error: core_id out of range" << std::endl; return -1; } - std::vector staging_buf(64 * sizeof(uint32_t)); + std::vector staging_buf(32); uint64_t _value = 0; @@ -551,11 +606,11 @@ extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64 for (i = 0; i < num_cores; ++i) { - uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size(); - ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size()); - if (ret != 0) - return ret; + uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size() * sizeof(staging_buf[0]); + RT_CHECK(vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size() * sizeof(staging_buf[0])), { + return _ret; + }); - auto per_core_value = get_csr_64(staging_buf.data(), counter); + auto per_core_value = staging_buf.at(counter-VX_CSR_MPM_BASE); if (counter == VX_CSR_MCYCLE) { _value = std::max(per_core_value, _value); } else { diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h index a74ad577f..770edbbda 100644 --- a/runtime/include/vortex.h +++ b/runtime/include/vortex.h @@ -33,8 +33,7 @@ typedef void* vx_device_h; #define VX_CAPS_GLOBAL_MEM_SIZE 0x5 #define VX_CAPS_LOCAL_MEM_SIZE 0x6 #define VX_CAPS_LOCAL_MEM_ADDR 0x7 -#define VX_CAPS_KERNEL_BASE_ADDR 0x8 -#define VX_CAPS_ISA_FLAGS 0x9 +#define VX_CAPS_ISA_FLAGS 0x8 // device isa flags #define VX_ISA_STD_A (1ull << 0) @@ -83,21 +82,27 @@ int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size); // Start device execution -int vx_start(vx_device_h hdevice); +int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr); // Wait for device ready with milliseconds timeout int vx_ready_wait(vx_device_h hdevice, uint64_t timeout); +// 
read device configuration registers +int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value); + // write device configuration registers -int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value); +int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value); ////////////////////////////// UTILITY FUNCTIONS ////////////////////////////// -// upload kernel bytes to device -int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size); - // upload kernel file to device -int vx_upload_kernel_file(vx_device_h hdevice, const char* filename); +int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr); + +// upload bytes to device +int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr); + +// upload file to device +int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* addr); // performance counters int vx_dump_perf(vx_device_h hdevice, FILE* stream); diff --git a/runtime/opae/vortex.cpp b/runtime/opae/vortex.cpp index eea4b51d8..effb52b57 100755 --- a/runtime/opae/vortex.cpp +++ b/runtime/opae/vortex.cpp @@ -59,6 +59,12 @@ #define RAM_PAGE_SIZE 4096 +#ifndef NDEBUG +#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0) +#else +#define DBGPRINT(format, ...) 
((void)0) +#endif + #define CHECK_HANDLE(handle, _expr, _cleanup) \ auto handle = _expr; \ if (handle == nullptr) { \ @@ -66,7 +72,7 @@ _cleanup \ } -#define CHECK_ERR(_expr, _cleanup) \ +#define CHECK_FPGA_ERR(_expr, _cleanup) \ do { \ auto err = _expr; \ if (err == 0) \ @@ -75,6 +81,15 @@ _cleanup \ } while (false) +#define CHECK_ERR(_expr, _cleanup) \ + do { \ + auto err = _expr; \ + if (err == 0) \ + break; \ + printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \ + _cleanup \ + } while (false) + /////////////////////////////////////////////////////////////////////////////// class vx_device { @@ -100,12 +115,12 @@ public: } // allocate new buffer - CHECK_ERR(api.fpgaPrepareBuffer(fpga, asize, (void**)&staging_ptr, &staging_wsid, 0), { + CHECK_FPGA_ERR(api.fpgaPrepareBuffer(fpga, asize, (void**)&staging_ptr, &staging_wsid, 0), { return -1; }); // get the physical address of the buffer in the accelerator - CHECK_ERR(api.fpgaGetIOAddress(fpga, staging_wsid, &staging_ioaddr), { + CHECK_FPGA_ERR(api.fpgaGetIOAddress(fpga, staging_wsid, &staging_ioaddr), { api.fpgaReleaseBuffer(fpga, staging_wsid); return -1; }); @@ -161,10 +176,6 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { case VX_CAPS_LOCAL_MEM_ADDR: *value = LMEM_BASE_ADDR; break; - case VX_CAPS_KERNEL_BASE_ADDR: - *value = (uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | - device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0); - break; case VX_CAPS_ISA_FLAGS: *value = device->isa_caps; break; @@ -197,11 +208,11 @@ extern int vx_dev_open(vx_device_h* hdevice) { } // Set up a filter that will search for an accelerator - CHECK_ERR(api.fpgaGetProperties(nullptr, &filter), { + CHECK_FPGA_ERR(api.fpgaGetProperties(nullptr, &filter), { return -1; }); - CHECK_ERR(api.fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR), { + CHECK_FPGA_ERR(api.fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR), { api.fpgaDestroyProperties(&filter); return -1; }); @@ -210,19 
+221,19 @@ extern int vx_dev_open(vx_device_h* hdevice) { std::string s_uuid(AFU_ACCEL_UUID); std::replace(s_uuid.begin(), s_uuid.end(), '_', '-'); uuid_parse(s_uuid.c_str(), guid); - CHECK_ERR(api.fpgaPropertiesSetGUID(filter, guid), { + CHECK_FPGA_ERR(api.fpgaPropertiesSetGUID(filter, guid), { api.fpgaDestroyProperties(&filter); return -1; }); // Do the search across the available FPGA contexts - CHECK_ERR(api.fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches), { + CHECK_FPGA_ERR(api.fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches), { api.fpgaDestroyProperties(&filter); return -1; }); // Not needed anymore - CHECK_ERR(api.fpgaDestroyProperties(&filter), { + CHECK_FPGA_ERR(api.fpgaDestroyProperties(&filter), { api.fpgaDestroyToken(&accel_token); return -1; }); @@ -234,13 +245,13 @@ extern int vx_dev_open(vx_device_h* hdevice) { } // Open accelerator - CHECK_ERR(api.fpgaOpen(accel_token, &accel_handle, 0), { + CHECK_FPGA_ERR(api.fpgaOpen(accel_token, &accel_handle, 0), { api.fpgaDestroyToken(&accel_token); return -1; }); // Done with token - CHECK_ERR(api.fpgaDestroyToken(&accel_token), { + CHECK_FPGA_ERR(api.fpgaDestroyToken(&accel_token), { api.fpgaClose(accel_handle); return -1; }); @@ -257,19 +268,19 @@ extern int vx_dev_open(vx_device_h* hdevice) { { // retrieve FPGA global memory size - CHECK_ERR(api.fpgaPropertiesGetLocalMemorySize(filter, &device->global_mem_size), { + CHECK_FPGA_ERR(api.fpgaPropertiesGetLocalMemorySize(filter, &device->global_mem_size), { // assume 8GB as default device->global_mem_size = GLOBAL_MEM_SIZE; }); // Load ISA CAPS - CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_ISA_CAPS, &device->isa_caps), { + CHECK_FPGA_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_ISA_CAPS, &device->isa_caps), { api.fpgaClose(accel_handle); return -1; }); // Load device CAPS - CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_DEV_CAPS, &device->dev_caps), { + CHECK_FPGA_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_DEV_CAPS, 
&device->dev_caps), { api.fpgaClose(accel_handle); return -1; }); @@ -383,6 +394,8 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* ho auto device = (vx_device*)hdevice; auto& api = device->api; + DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size); + if (device->ensure_staging(size) != 0) return -1; @@ -405,16 +418,16 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* ho auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), { return -1; }); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), { return -1; }); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), { return -1; }); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE), { return -1; }); @@ -432,6 +445,8 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad auto device = (vx_device*)hdevice; auto& api = device->api; + DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, asize); + if (device->ensure_staging(size) != 0) return -1; @@ -451,16 +466,16 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, 
device->staging_ioaddr >> ls_shift), { return -1; }); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), { return -1; }); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), { return -1; }); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ), { return -1; }); @@ -474,19 +489,35 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad return 0; } -extern int vx_start(vx_device_h hdevice) { +extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) { if (nullptr == hdevice) return -1; auto device = ((vx_device*)hdevice); auto& api = device->api; - // Ensure ready for new command + DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr); + + // ensure ready for new command if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0) - return -1; + return -1; + + // set kernel info + CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff), { + return -1; + }); + CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32), { + return -1; + }); + CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff), { + return -1; + }); + CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32), { + return -1; + }); - // start execution - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN), { + // start execution + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN), { return -1; }); @@ -502,6 +533,8 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { auto device = ((vx_device*)hdevice); auto& api = device->api; + 
DBGPRINT("%s\n", "WAIT"); + struct timespec sleep_time; sleep_time.tv_sec = 0; @@ -512,7 +545,7 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { for (;;) { uint64_t status; - CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), { + CHECK_FPGA_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), { return -1; }); @@ -529,7 +562,7 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { std::cout << std::dec << "#" << cout_tid << ": " << ss_buf.str() << std::flush; ss_buf.str(""); } - CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), { + CHECK_FPGA_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), { return -1; }); cout_data = status >> STATUS_STATE_BITS; @@ -558,29 +591,43 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { return 0; } -extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) { +extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) { + if (nullptr == hdevice || nullptr == value) + return -1; + + auto device = (vx_device*)hdevice; + + *value = device->dcrs.read(addr); + + DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value); + + return 0; +} + +extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) { if (nullptr == hdevice) return -1; auto device = ((vx_device*)hdevice); auto& api = device->api; + DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value); + // Ensure ready for new command if (vx_ready_wait(hdevice, -1) != 0) return -1; // write DCR value - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, addr), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, addr), { return -1; }); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, value), { + CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, value), { return -1; }); - CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_DCR_WRITE), { + 
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_DCR_WRITE), { return -1; }); - // save the value device->dcrs.write(addr, value); return 0; diff --git a/runtime/rtlsim/vortex.cpp b/runtime/rtlsim/vortex.cpp index 60f5e08b8..05fc24e06 100644 --- a/runtime/rtlsim/vortex.cpp +++ b/runtime/rtlsim/vortex.cpp @@ -32,6 +32,12 @@ #define RAM_PAGE_SIZE 4096 +#ifndef NDEBUG +#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0) +#else +#define DBGPRINT(format, ...) ((void)0) +#endif + using namespace vortex; /////////////////////////////////////////////////////////////////////////////// @@ -108,11 +114,18 @@ public: return 0; } - int start() { + int start(uint64_t krnl_addr, uint64_t args_addr) { // ensure prior run completed if (future_.valid()) { future_.wait(); } + + // set kernel info + this->write_dcr(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff); + this->write_dcr(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32); + this->write_dcr(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff); + this->write_dcr(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32); + // start new run future_ = std::async(std::launch::async, [&]{ processor_.run(); @@ -163,7 +176,7 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; - vx_device *device = ((vx_device*)hdevice); + //vx_device *device = ((vx_device*)hdevice); switch (caps_id) { case VX_CAPS_VERSION: @@ -190,10 +203,6 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { case VX_CAPS_LOCAL_MEM_ADDR: *value = LMEM_BASE_ADDR; break; - case VX_CAPS_KERNEL_BASE_ADDR: - *value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32) - | device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0); - break; case VX_CAPS_ISA_FLAGS: *value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; @@ -278,6 +287,9 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const 
void* ho return -1; auto device = (vx_device*)hdevice; + + DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size); + return device->upload(dev_addr, host_ptr, size); } @@ -286,26 +298,50 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad return -1; auto device = (vx_device*)hdevice; + + DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size); + return device->download(host_ptr, dev_addr, size); } -extern int vx_start(vx_device_h hdevice) { +extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) { if (nullptr == hdevice) - return -1; + return -1; + + DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr); vx_device *device = ((vx_device*)hdevice); - return device->start(); + return device->start(krnl_addr, args_addr); } extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) - return -1; + return -1; + + DBGPRINT("%s\n", "WAIT"); vx_device *device = ((vx_device*)hdevice); return device->wait(timeout); } -extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) { +extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) { + if (nullptr == hdevice || NULL == value) + return -1; + + vx_device *device = ((vx_device*)hdevice); + + // Ensure ready for new command + if (vx_ready_wait(hdevice, -1) != 0) + return -1; + + *value = device->read_dcr(addr); + + DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value); + + return 0; +} + +extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) { if (nullptr == hdevice) return -1; @@ -314,5 +350,8 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) { // Ensure ready for new command if (vx_ready_wait(hdevice, -1) != 0) return -1; + + DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value); + return device->write_dcr(addr, value); } \ No newline at end 
of file diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index 3b198d6c6..fe9ba1b20 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -43,47 +43,6 @@ using namespace vortex; /////////////////////////////////////////////////////////////////////////////// -class vx_device; - -class vx_buffer { -public: - vx_buffer(uint64_t size, vx_device* device) - : size_(size) - , device_(device) { - uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE); - data_ = aligned_malloc(aligned_asize, CACHE_BLOCK_SIZE); - // set uninitialized data to "baadf00d" - for (uint32_t i = 0; i < aligned_asize; ++i) { - ((uint8_t*)data_)[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff; - } - } - - ~vx_buffer() { - if (data_) { - aligned_free(data_); - } - } - - void* data() const { - return data_; - } - - uint64_t size() const { - return size_; - } - - vx_device* device() const { - return device_; - } - -private: - uint64_t size_; - vx_device* device_; - void* data_; -}; - -/////////////////////////////////////////////////////////////////////////////// - class vx_device { public: vx_device() @@ -152,11 +111,17 @@ public: return 0; } - int start() { + int start(uint64_t krnl_addr, uint64_t args_addr) { // ensure prior run completed if (future_.valid()) { future_.wait(); } + + // set kernel info + this->write_dcr(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff); + this->write_dcr(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32); + this->write_dcr(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff); + this->write_dcr(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32); // start new run future_ = std::async(std::launch::async, [&]{ @@ -251,7 +216,7 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { if (nullptr == hdevice) return -1; - vx_device *device = ((vx_device*)hdevice); + //vx_device *device = ((vx_device*)hdevice); switch (caps_id) { case VX_CAPS_VERSION: @@ -277,11 +242,7 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, 
uint64_t *value) { break; case VX_CAPS_LOCAL_MEM_ADDR: *value = LMEM_BASE_ADDR; - break; - case VX_CAPS_KERNEL_BASE_ADDR: - *value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32) - | device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0); - break; + break; case VX_CAPS_ISA_FLAGS: *value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; @@ -345,25 +306,44 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad return device->download(host_ptr, dev_addr, size); } -extern int vx_start(vx_device_h hdevice) { +extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) { if (nullptr == hdevice) return -1; - DBGPRINT("START\n"); + DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr); vx_device *device = ((vx_device*)hdevice); - return device->start(); + return device->start(krnl_addr, args_addr); } extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { if (nullptr == hdevice) return -1; + DBGPRINT("%s\n", "WAIT"); + vx_device *device = ((vx_device*)hdevice); return device->wait(timeout); } -extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) { +extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) { + if (nullptr == hdevice || NULL == value) + return -1; + + vx_device *device = ((vx_device*)hdevice); + + // Ensure ready for new command + if (vx_ready_wait(hdevice, -1) != 0) + return -1; + + *value = device->read_dcr(addr); + + DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value); + + return 0; +} + +extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) { if (nullptr == hdevice) return -1; @@ -373,7 +353,7 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) { if (vx_ready_wait(hdevice, -1) != 0) return -1; - DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value); + DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value); return 
device->write_dcr(addr, value); } diff --git a/runtime/stub/vortex.cpp b/runtime/stub/vortex.cpp index a60008352..acf6ca1b3 100644 --- a/runtime/stub/vortex.cpp +++ b/runtime/stub/vortex.cpp @@ -45,7 +45,7 @@ extern int vx_copy_from_dev(vx_device_h /*hdevice*/, void* /*host_ptr*/, uint64_ return -1; } -extern int vx_start(vx_device_h /*hdevice*/) { +extern int vx_start(vx_device_h /*hdevice*/, uint64_t /*krnl_addr*/, uint64_t /*args_add*/) { return -1; } @@ -53,6 +53,11 @@ extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) { return -1; } -extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint64_t /*value*/) { +extern int vx_dcr_read(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t* /*value*/) { + return -1; +} + + +extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t /*value*/) { return -1; } diff --git a/runtime/xrt/vortex.cpp b/runtime/xrt/vortex.cpp index b9a09bba0..67b306349 100644 --- a/runtime/xrt/vortex.cpp +++ b/runtime/xrt/vortex.cpp @@ -518,10 +518,6 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) { case VX_CAPS_LOCAL_MEM_ADDR: *value = LMEM_BASE_ADDR; break; - case VX_CAPS_KERNEL_BASE_ADDR: - *value = (uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | - device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0); - break; case VX_CAPS_ISA_FLAGS: *value = device->isa_caps; break; @@ -766,6 +762,8 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* ho auto device = (vx_device*)hdevice; + DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size); + // check alignment if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE)) return -1; @@ -779,8 +777,6 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* ho CHECK_ERR(device->upload(dev_addr, host_ptr, asize), { return -1; }); - - DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, 
(uintptr_t)host_ptr, size); return 0; } @@ -791,6 +787,8 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad auto device = (vx_device*)hdevice; + DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size); + // check alignment if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE)) return -1; @@ -804,25 +802,38 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad CHECK_ERR(device->download(host_ptr, dev_addr, asize), { return -1; }); - - DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, asize); return 0; } -extern int vx_start(vx_device_h hdevice) { +extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) { if (nullptr == hdevice) return -1; + //wait_for_enter("\nPress ENTER to continue after setting up ILA trigger..."); + + DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr); + + // set kernel info + CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff), { + return -1; + }); + CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32), { + return -1; + }); + CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff), { + return -1; + }); + CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32), { + return -1; + }); + auto device = (vx_device*)hdevice; - //wait_for_enter("\nPress ENTER to continue after setting up ILA trigger..."); - + // start execution CHECK_ERR(device->write_register(MMIO_CTL_ADDR, CTL_AP_START), { return -1; }); - - DBGPRINT("START\n"); return 0; } @@ -833,6 +844,8 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { auto device = (vx_device*)hdevice; + DBGPRINT("%s\n", "WAIT"); + struct timespec sleep_time; #ifndef NDEBUG @@ -862,11 +875,26 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) { return 0; } -extern int 
vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) { +extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) { if (nullptr == hdevice) return -1; auto device = (vx_device*)hdevice; + + *value = device->dcrs.read(addr); + + DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value); + + return 0; +} + +extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) { + if (nullptr == hdevice) + return -1; + + auto device = (vx_device*)hdevice; + + DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value); CHECK_ERR(device->write_register(MMIO_DCR_ADDR, addr), { return -1; @@ -875,9 +903,7 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) { CHECK_ERR(device->write_register(MMIO_DCR_ADDR + 4, value), { return -1; }); - - // save the value - DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value); + device->dcrs.write(addr, value); return 0; diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 6fe526d05..caea25f1d 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -46,22 +46,23 @@ Emulator::warp_t::warp_t(const Arch& arch) , freg_file(arch.num_threads(), std::vector(arch.num_regs())) {} -void Emulator::warp_t::clear(const Arch& arch, const DCRS &dcrs) { - this->PC = dcrs.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0); -#if (XLEN == 64) - this->PC = (uint64_t(dcrs.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | this->PC; -#endif +void Emulator::warp_t::clear(uint64_t startup_addr) { + this->PC = startup_addr; this->tmask.reset(); - for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) { - for (auto& reg : this->ireg_file.at(i)) { - reg = 0; - } - for (auto& reg : this->freg_file.at(i)) { + this->uui_gen.reset(); + this->fcsr = 0; + + for (auto& reg_file : this->ireg_file) { + for (auto& reg : reg_file) { reg = 0; } } - this->fcsr = 0; - this->uui_gen.reset(); + + for (auto& reg_file : this->freg_file) { + for (auto& reg : reg_file) { + reg = 0; + } + } } 
/////////////////////////////////////////////////////////////////////////////// @@ -81,13 +82,25 @@ Emulator::~Emulator() { } void Emulator::clear() { + uint64_t startup_addr = dcrs_.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0); +#if (XLEN == 64) + startup_addr |= (uint64_t(dcrs_.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32); +#endif + + uint64_t startup_arg = dcrs_.base_dcrs.read(VX_DCR_BASE_STARTUP_ARG0); +#if (XLEN == 64) + startup_arg |= (uint64_t(dcrs_.base_dcrs.read(VX_DCR_BASE_STARTUP_ARG1)) << 32); +#endif + for (auto& warp : warps_) { - warp.clear(arch_, dcrs_); + warp.clear(startup_addr); } for (auto& barrier : barriers_) { barrier.reset(); } + + csr_mscratch_ = startup_arg; stalled_warps_.reset(); active_warps_.reset(); diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h index 872c780d0..bbfa5a72c 100644 --- a/sim/simx/emulator.h +++ b/sim/simx/emulator.h @@ -65,7 +65,7 @@ private: struct warp_t { warp_t(const Arch& arch); - void clear(const Arch& arch, const DCRS &dcrs); + void clear(uint64_t startup_addr); Word PC; ThreadMask tmask; diff --git a/tests/kernel/common.mk b/tests/kernel/common.mk index 4cd0b8a3f..17c17bce5 100644 --- a/tests/kernel/common.mk +++ b/tests/kernel/common.mk @@ -6,10 +6,25 @@ else CFLAGS += -march=rv32imaf -mabi=ilp32f endif -CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc -AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar -DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump -CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy +LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT) +LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) +LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex -mllvm -vortex-branch-divergence=0 +#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX) +#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0 +#LLVM_CFLAGS += -Wl,-L$(RISCV_TOOLCHAIN_PATH)/lib/gcc/$(RISCV_PREFIX)/9.2.0 +#LLVM_CFLAGS += --rtlib=libgcc + +#CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS) +#CXX = 
$(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS) +#AR = $(LLVM_VORTEX)/bin/llvm-ar +#DP = $(LLVM_VORTEX)/bin/llvm-objdump +#CP = $(LLVM_VORTEX)/bin/llvm-objcopy + +CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc +CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++ +AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar +DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump +CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(ROOT_DIR)/hw diff --git a/tests/kernel/conform/tests.cpp b/tests/kernel/conform/tests.cpp index 7ee9fb1d5..baed991f9 100644 --- a/tests/kernel/conform/tests.cpp +++ b/tests/kernel/conform/tests.cpp @@ -1,6 +1,7 @@ #include "tests.h" #include #include +#include #include #include #include diff --git a/tests/opencl/common.mk b/tests/opencl/common.mk index 091e4f926..857fb6e12 100644 --- a/tests/opencl/common.mk +++ b/tests/opencl/common.mk @@ -22,12 +22,12 @@ LLVM_POCL ?= $(TOOLDIR)/llvm-vortex LIBC_LIB += -L$(LIBC_VORTEX)/lib -lm -lc -lgcc -K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex +K_CFLAGS += -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections #K_CFLAGS += -mllvm -vortex-branch-divergence=0 #K_CFLAGS += -mllvm -print-after-all K_CFLAGS += -mllvm -disable-loop-idiom-all # disable memset/memcpy loop idiom -K_CFLAGS += -I$(VORTEX_KN_PATH)/include -DXLEN_$(XLEN) -DNDEBUG +K_CFLAGS += -I$(ROOT_DIR)/hw -I$(VORTEX_KN_PATH)/include -DXLEN_$(XLEN) -DNDEBUG K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(ROOT_DIR)/kernel/libvortexrt.a $(LIBC_LIB) CXXFLAGS += -std=c++11 -Wall -Wextra 
-Wfatal-errors diff --git a/tests/regression/basic/common.h b/tests/regression/basic/common.h index 88748dc96..90b0b7904 100644 --- a/tests/regression/basic/common.h +++ b/tests/regression/basic/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - typedef struct { uint32_t count; uint64_t src_addr; diff --git a/tests/regression/basic/kernel.cpp b/tests/regression/basic/kernel.cpp index fae6019b6..d39390dcb 100644 --- a/tests/regression/basic/kernel.cpp +++ b/tests/regression/basic/kernel.cpp @@ -3,7 +3,7 @@ #include "common.h" int main() { - kernel_arg_t* __UNIFORM__ arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* __UNIFORM__ arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); uint32_t count = arg->count; int32_t* src_ptr = (int32_t*)arg->src_addr; int32_t* dst_ptr = (int32_t*)arg->dst_addr; diff --git a/tests/regression/basic/main.cpp b/tests/regression/basic/main.cpp index 1f4cc2b3e..22cb019e3 100755 --- a/tests/regression/basic/main.cpp +++ b/tests/regression/basic/main.cpp @@ -24,6 +24,8 @@ uint32_t count = 0; vx_device_h device = nullptr; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -60,6 +62,8 @@ void cleanup() { if (device) { vx_mem_free(device, kernel_arg.src_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } @@ -168,7 +172,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg, // start device std::cout << "start execution" << std::endl; auto t2 = std::chrono::high_resolution_clock::now(); - RT_CHECK(vx_start(device)); + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); auto t3 = std::chrono::high_resolution_clock::now(); @@ -246,8 +250,7 @@ int main(int argc, char *argv[]) { // allocate staging buffer std::cout << "allocate 
staging buffer" << std::endl; - uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - staging_buf.resize(alloc_size); + staging_buf.resize(buf_size); // run tests if (0 == test || -1 == test) { @@ -258,12 +261,11 @@ int main(int argc, char *argv[]) { if (1 == test || -1 == test) { // upload program std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); // upload kernel argument std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); std::cout << "run kernel test" << std::endl; RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points)); diff --git a/tests/regression/common.mk b/tests/regression/common.mk index bd76b36f1..27c3a6a77 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -33,7 +33,7 @@ VX_CP = $(LLVM_VORTEX)/bin/llvm-objcopy #VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump #VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy -VX_CFLAGS += -v -O3 -std=c++11 -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections +VX_CFLAGS += -O3 -std=c++11 -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(ROOT_DIR)/hw VX_CFLAGS += -DXLEN_$(XLEN) VX_CFLAGS += -DNDEBUG diff --git a/tests/regression/conv3x/common.h b/tests/regression/conv3x/common.h index 7b1207bf4..c38b2c174 100644 --- a/tests/regression/conv3x/common.h +++ b/tests/regression/conv3x/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - #ifndef TYPE #define TYPE float #endif diff --git 
a/tests/regression/conv3x/kernel.cpp b/tests/regression/conv3x/kernel.cpp index c1a83cdb1..c7a8864af 100644 --- a/tests/regression/conv3x/kernel.cpp +++ b/tests/regression/conv3x/kernel.cpp @@ -45,7 +45,7 @@ void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); if (arg->lmem_addr != 0) { // populate local memory auto W = reinterpret_cast(arg->W_addr); diff --git a/tests/regression/conv3x/main.cpp b/tests/regression/conv3x/main.cpp index 7c6ca10f7..753e3bad5 100644 --- a/tests/regression/conv3x/main.cpp +++ b/tests/regression/conv3x/main.cpp @@ -36,7 +36,7 @@ public: static bool compare(int a, int b, int index, int errors) { if (a != b) { if (errors < 100) { - printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a); } return false; } @@ -61,7 +61,7 @@ public: auto d = std::abs(fa.i - fb.i); if (d > FLOAT_ULP) { if (errors < 100) { - printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); } return false; } @@ -95,7 +95,8 @@ int size = 32; bool use_lmem = false; vx_device_h device = nullptr; -std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -135,6 +136,8 @@ void cleanup() { vx_mem_free(device, kernel_arg.W_addr); } vx_mem_free(device, kernel_arg.O_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } @@ -158,10 +161,6 @@ int main(int argc, char *argv[]) { uint32_t i_points = (size+2) * (size+2); uint32_t w_points = 3 * 3; - // upload program - std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - // allocate device memory std::cout << "allocate device memory" << 
std::endl; size_t i_nbytes = i_points * sizeof(TYPE); @@ -192,16 +191,6 @@ int main(int argc, char *argv[]) { std::cout << "dev_argI=0x" << std::hex << kernel_arg.I_addr << std::endl; std::cout << "dev_argW=0x" << std::hex << kernel_arg.W_addr << std::endl; std::cout << "dev_argO=0x" << std::hex << kernel_arg.O_addr << std::endl; - - // allocate staging buffer - std::cout << "allocate staging buffer" << std::endl; - uint32_t alloc_size = std::max(i_nbytes, sizeof(kernel_arg_t)); - staging_buf.resize(alloc_size); - - // upload kernel argument - std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); // Generate input values std::vector h_I(i_points); @@ -219,38 +208,32 @@ int main(int argc, char *argv[]) { for (uint32_t i = 0; i < w_points; ++i) { h_W[i] = static_cast(rand()) / RAND_MAX; } - convolution_cpu(h_O.data(), h_I.data(), h_W.data(), size, size); // upload input buffer { std::cout << "upload source buffer" << std::endl; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < i_points; ++i) { - buf_ptr[i] = h_I[i]; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.I_addr, staging_buf.data(), i_nbytes)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.I_addr, h_I.data(), i_nbytes)); } // upload weight buffer { std::cout << "upload weight buffer" << std::endl; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < w_points; ++i) { - buf_ptr[i] = h_W[i]; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.W_addr, staging_buf.data(), w_nbytes)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.W_addr, h_W.data(), w_nbytes)); } - // clear destination buffer - std::cout << "clear destination buffer" << std::endl; - memset(staging_buf.data(), 0, o_nbytes); - RT_CHECK(vx_copy_to_dev(device, kernel_arg.O_addr, staging_buf.data(), o_nbytes)); + // upload program + std::cout << "upload 
program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); auto time_start = std::chrono::high_resolution_clock::now(); // start device std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); // wait for completion std::cout << "wait for completion" << std::endl; @@ -262,16 +245,18 @@ int main(int argc, char *argv[]) { // download destination buffer std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.O_addr, o_nbytes)); + RT_CHECK(vx_copy_from_dev(device, h_O.data(), kernel_arg.O_addr, o_nbytes)); // verify result std::cout << "verify result" << std::endl; { + std::vector h_ref(o_points); + convolution_cpu(h_ref.data(), h_I.data(), h_W.data(), size, size); + int errors = 0; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < h_O.size(); ++i) { - auto ref = h_O[i]; - auto cur = buf_ptr[i]; + for (uint32_t i = 0; i < h_ref.size(); ++i) { + auto ref = h_ref[i]; + auto cur = h_O[i]; if (!Comparator::compare(cur, ref, i, errors)) { ++errors; } diff --git a/tests/regression/demo/common.h b/tests/regression/demo/common.h index 941983ac4..98b8ff587 100644 --- a/tests/regression/demo/common.h +++ b/tests/regression/demo/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - #ifndef TYPE #define TYPE float #endif diff --git a/tests/regression/demo/kernel.cpp b/tests/regression/demo/kernel.cpp index 499454409..8a9fc691c 100644 --- a/tests/regression/demo/kernel.cpp +++ b/tests/regression/demo/kernel.cpp @@ -17,7 +17,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = 
(kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/demo/main.cpp b/tests/regression/demo/main.cpp index 942c3f094..36585a0d4 100644 --- a/tests/regression/demo/main.cpp +++ b/tests/regression/demo/main.cpp @@ -34,7 +34,7 @@ public: static bool compare(int a, int b, int index, int errors) { if (a != b) { if (errors < 100) { - printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a); } return false; } @@ -61,7 +61,7 @@ public: auto d = std::abs(fa.i - fb.i); if (d > FLOAT_ULP) { if (errors < 100) { - printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + printf("*** error: [%d] expected=%f(0x%x), actual=%f(0x%x), ulp=%d\n", index, b, fb.i, a, fa.i, d); } return false; } @@ -75,6 +75,8 @@ uint32_t count = 16; vx_device_h device = nullptr; std::vector source_data; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -109,16 +111,89 @@ void cleanup() { vx_mem_free(device, kernel_arg.src0_addr); vx_mem_free(device, kernel_arg.src1_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, - uint32_t num_points) { +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + uint64_t num_cores, num_warps, num_threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); 
+ + uint32_t num_tasks = num_cores * num_warps * num_threads; + uint32_t num_points = count * num_tasks; + uint32_t buf_size = num_points * sizeof(TYPE); + + std::cout << "data type: " << Comparator::type_str() << std::endl; + std::cout << "number of points: " << num_points << std::endl; + std::cout << "buffer size: " << buf_size << " bytes" << std::endl; + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr)); + + kernel_arg.num_tasks = num_tasks; + kernel_arg.task_size = count; + + std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl; + std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl; + std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; + + // allocate staging buffer + std::cout << "allocate staging buffer" << std::endl; + staging_buf.resize(buf_size); + + // generate source data + source_data.resize(2 * num_points); + for (uint32_t i = 0; i < source_data.size(); ++i) { + source_data[i] = Comparator::generate(); + } + + // upload source buffer0 + { + std::cout << "upload source buffer0" << std::endl; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = source_data[2 * i + 0]; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size)); + } + + // upload source buffer1 + { + std::cout << "upload source buffer1" << std::endl; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = source_data[2 * i + 1]; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size)); + } + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); 
+ + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); + // start device std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); // wait for completion std::cout << "wait for completion" << std::endl; @@ -147,94 +222,6 @@ int run_test(const kernel_arg_t& kernel_arg, } } - return 0; -} - -int main(int argc, char *argv[]) { - // parse command arguments - parse_args(argc, argv); - - std::srand(50); - - // open device connection - std::cout << "open device connection" << std::endl; - RT_CHECK(vx_dev_open(&device)); - - uint64_t num_cores, num_warps, num_threads; - RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores)); - RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); - RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); - - uint32_t num_tasks = num_cores * num_warps * num_threads; - uint32_t num_points = count * num_tasks; - uint32_t buf_size = num_points * sizeof(TYPE); - - std::cout << "data type: " << Comparator::type_str() << std::endl; - std::cout << "number of points: " << num_points << std::endl; - std::cout << "buffer size: " << buf_size << " bytes" << std::endl; - - // upload program - std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - - // allocate device memory - std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr)); - - kernel_arg.num_tasks = num_tasks; - kernel_arg.task_size = count; - - std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl; - std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl; - std::cout << "dev_dst=0x" << std::hex 
<< kernel_arg.dst_addr << std::endl; - - // allocate staging buffer - std::cout << "allocate staging buffer" << std::endl; - uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - staging_buf.resize(alloc_size); - - // upload kernel argument - std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - - // generate source data - source_data.resize(2 * num_points); - for (uint32_t i = 0; i < source_data.size(); ++i) { - source_data[i] = Comparator::generate(); - } - - // upload source buffer0 - { - std::cout << "upload source buffer0" << std::endl; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = source_data[2 * i + 0]; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size)); - } - - // upload source buffer1 - { - std::cout << "upload source buffer1" << std::endl; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = source_data[2 * i + 1]; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size)); - } - - // clear destination buffer - std::cout << "clear destination buffer" << std::endl; - memset(staging_buf.data(), 0, num_points * sizeof(TYPE)); - RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size)); - - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, buf_size, num_points)); - // cleanup std::cout << "cleanup" << std::endl; cleanup(); diff --git a/tests/regression/diverge/common.h b/tests/regression/diverge/common.h index 5e5bf23f2..edf054657 100644 --- a/tests/regression/diverge/common.h +++ b/tests/regression/diverge/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - typedef struct { uint32_t 
num_points; uint64_t src_addr; diff --git a/tests/regression/diverge/kernel.cpp b/tests/regression/diverge/kernel.cpp index 3924b9204..fe5064d4d 100644 --- a/tests/regression/diverge/kernel.cpp +++ b/tests/regression/diverge/kernel.cpp @@ -77,7 +77,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/diverge/main.cpp b/tests/regression/diverge/main.cpp index ea16f0ec7..c6d028e27 100644 --- a/tests/regression/diverge/main.cpp +++ b/tests/regression/diverge/main.cpp @@ -26,6 +26,8 @@ std::vector ref_data; vx_device_h device = nullptr; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -59,6 +61,8 @@ void cleanup() { if (device) { vx_mem_free(device, kernel_arg.src_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } @@ -143,45 +147,6 @@ void gen_ref_data(uint32_t num_points) { } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, - uint32_t num_points) { - // start device - std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); - - // wait for completion - std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); - - // download destination buffer - std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size)); - - // verify result - std::cout << "verify result" << std::endl; - { - int errors = 0; - auto buf_ptr = (int32_t*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - int ref = ref_data.at(i); - int cur = buf_ptr[i]; - if (cur != ref) 
{ - std::cout << "error at result #" << std::dec << i - << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; - ++errors; - } - } - if (errors != 0) { - std::cout << "Found " << std::dec << errors << " errors!" << std::endl; - std::cout << "FAILED!" << std::endl; - return 1; - } - } - - return 0; -} - int main(int argc, char *argv[]) { // parse command arguments parse_args(argc, argv); @@ -212,7 +177,7 @@ int main(int argc, char *argv[]) { // upload program std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); // allocate device memory std::cout << "allocate device memory" << std::endl; @@ -226,15 +191,12 @@ int main(int argc, char *argv[]) { // allocate staging buffer std::cout << "allocate staging buffer" << std::endl; - uint32_t staging_buf_size = std::max(src_buf_size, - std::max(dst_buf_size, - sizeof(kernel_arg_t))); + uint32_t staging_buf_size = std::max(src_buf_size, dst_buf_size); staging_buf.resize(staging_buf_size); // upload kernel argument std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); // upload source buffer { @@ -254,9 +216,38 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size)); } - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points)); + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination 
buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + int ref = ref_data.at(i); + int cur = buf_ptr[i]; + if (cur != ref) { + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } // cleanup std::cout << "cleanup" << std::endl; diff --git a/tests/regression/dogfood/common.h b/tests/regression/dogfood/common.h index 35f30d429..6824eca1d 100644 --- a/tests/regression/dogfood/common.h +++ b/tests/regression/dogfood/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - typedef struct { uint32_t testid; uint32_t num_tasks; diff --git a/tests/regression/dogfood/kernel.cpp b/tests/regression/dogfood/kernel.cpp index 312848ece..b88a5a3b6 100644 --- a/tests/regression/dogfood/kernel.cpp +++ b/tests/regression/dogfood/kernel.cpp @@ -397,34 +397,36 @@ void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) { dst_ptr[task_id] += 1; } -static const PFN_Kernel sc_tests[] = { - kernel_iadd, - kernel_imul, - kernel_idiv, - kernel_idiv_mul, - kernel_fadd, - kernel_fsub, - kernel_fmul, - kernel_fmadd, - kernel_fmsub, - kernel_fnmadd, - kernel_fnmsub, - kernel_fnmadd_madd, - kernel_fdiv, - kernel_fdiv2, - kernel_fsqrt, - kernel_ftoi, - kernel_ftou, - kernel_itof, - kernel_utof, - kernel_fclamp, - kernel_trigo, - kernel_bar, - kernel_gbar -}; +static PFN_Kernel sc_tests[23]; +void register_tests() { + sc_tests[0] = kernel_iadd; + sc_tests[1] = kernel_imul; + sc_tests[2] = kernel_idiv; + sc_tests[3] = 
kernel_idiv_mul; + sc_tests[4] = kernel_fadd; + sc_tests[5] = kernel_fsub; + sc_tests[6] = kernel_fmul; + sc_tests[7] = kernel_fmadd; + sc_tests[8] = kernel_fmsub; + sc_tests[9] = kernel_fnmadd; + sc_tests[10] = kernel_fnmsub; + sc_tests[11] = kernel_fnmadd_madd; + sc_tests[12] = kernel_fdiv; + sc_tests[13] = kernel_fdiv2; + sc_tests[14] = kernel_fsqrt; + sc_tests[15] = kernel_ftoi; + sc_tests[16] = kernel_ftou; + sc_tests[17] = kernel_itof; + sc_tests[18] = kernel_utof; + sc_tests[19] = kernel_fclamp; + sc_tests[20] = kernel_trigo; + sc_tests[21] = kernel_bar; + sc_tests[22] = kernel_gbar; +} int main() { - auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + register_tests(); + auto arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg); return 0; } diff --git a/tests/regression/dogfood/main.cpp b/tests/regression/dogfood/main.cpp index 972a9fcf4..fd4300bda 100644 --- a/tests/regression/dogfood/main.cpp +++ b/tests/regression/dogfood/main.cpp @@ -20,10 +20,11 @@ int testid_e = 0; bool stop_on_error = true; vx_device_h device = nullptr; -std::vector arg_buf; std::vector src1_buf; std::vector src2_buf; std::vector dst_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -77,6 +78,8 @@ void cleanup() { vx_mem_free(device, kernel_arg.src0_addr); vx_mem_free(device, kernel_arg.src1_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } @@ -113,15 +116,12 @@ int main(int argc, char *argv[]) { std::cout << "number of points: " << num_points << std::endl; std::cout << "buffer size: " << buf_size << " bytes" << std::endl; - // upload program - std::cout << "upload kernel" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - // allocate device memory std::cout << "allocate device memory" << std::endl; 
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr)); RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr)); RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr)); + RT_CHECK(vx_mem_alloc(device, sizeof(kernel_arg_t), &kernel_args_addr)); kernel_arg.num_tasks = num_tasks; kernel_arg.task_size = count; @@ -132,7 +132,6 @@ int main(int argc, char *argv[]) { // allocate staging buffer std::cout << "allocate staging buffer" << std::endl; - arg_buf.resize(sizeof(kernel_arg_t)); src1_buf.resize(buf_size); src2_buf.resize(buf_size); dst_buf.resize(buf_size); @@ -142,6 +141,11 @@ int main(int argc, char *argv[]) { if (testid_e == 0) { testid_e = (testSuite->size() - 1); } + + // upload program + std::cout << "upload kernel" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); + // execute tests for (int t = testid_s; t <= testid_e; ++t) { auto test = testSuite->get_test(t); @@ -159,12 +163,6 @@ int main(int argc, char *argv[]) { std::cout << "Test" << t << ": " << name << std::endl; - // upload kernel argument - std::cout << "upload kernel argument" << std::endl; - kernel_arg.testid = t; - memcpy(arg_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, arg_buf.data(), sizeof(kernel_arg_t))); - // get test arguments std::cout << "get test arguments" << std::endl; RT_CHECK(test->setup(num_points, (void*)src1_buf.data(), (void*)src2_buf.data())); @@ -184,9 +182,14 @@ int main(int argc, char *argv[]) { } RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, dst_buf.data(), buf_size)); + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + kernel_arg.testid = t; + RT_CHECK(vx_copy_to_dev(device, kernel_args_addr, &kernel_arg, sizeof(kernel_arg_t))); + // start device std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); // wait for completion 
std::cout << "wait for completion" << std::endl; diff --git a/tests/regression/dogfood/testcases.h b/tests/regression/dogfood/testcases.h index 0a0290998..63eb7de6f 100644 --- a/tests/regression/dogfood/testcases.h +++ b/tests/regression/dogfood/testcases.h @@ -32,6 +32,19 @@ inline float fround(float x, int32_t precision = 8) { return std::round(x * power_of_10) / power_of_10; } +inline bool almost_equal_precision(float a, float b, int precision = 4) { + auto power_of_10 = std::pow(10, precision); + auto ap = std::round(a * power_of_10) / power_of_10; + auto bp = std::round(b * power_of_10) / power_of_10; + auto eps = std::numeric_limits::epsilon(); + auto d = fabs(ap - bp); + if (d > eps) { + std::cout << "*** almost_equal_precision: d=" << d << ", precision=" << precision << std::endl; + return false; + } + return true; +} + inline bool almost_equal_eps(float a, float b, int ulp = 128) { auto eps = std::numeric_limits::epsilon() * (std::max(fabs(a), fabs(b)) * ulp); auto d = fabs(a - b); @@ -727,8 +740,8 @@ public: auto a = (float*)src1; auto b = (float*)src2; for (uint32_t i = 0; i < n; ++i) { - a[i] = fround((n - i) * (1.0f/n)); - b[i] = fround((n + i) * (1.0f/n)); + a[i] = fround((2*i-n) * (1.0f/n) * 3.1416); + b[i] = fround((2*i-n) * (1.0f/n) * 3.1416); } return 0; } @@ -740,7 +753,7 @@ public: auto c = (float*)dst; for (uint32_t i = 0; i < n; ++i) { auto ref = sin(a[i]) + cos(b[i]); - if (!almost_equal_ulp(c[i], ref, 20)) { + if (!almost_equal(c[i], ref)) { std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl; ++errors; } diff --git a/tests/regression/fence/common.h b/tests/regression/fence/common.h index a57e54840..16551609c 100644 --- a/tests/regression/fence/common.h +++ b/tests/regression/fence/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - typedef struct { uint32_t num_tasks; uint32_t task_size; diff --git 
a/tests/regression/fence/kernel.cpp b/tests/regression/fence/kernel.cpp index 15e1c25e9..6aa8066c7 100644 --- a/tests/regression/fence/kernel.cpp +++ b/tests/regression/fence/kernel.cpp @@ -18,7 +18,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/fence/main.cpp b/tests/regression/fence/main.cpp index 029021aed..4964c25c1 100644 --- a/tests/regression/fence/main.cpp +++ b/tests/regression/fence/main.cpp @@ -22,6 +22,8 @@ uint32_t count = 0; vx_device_h device = nullptr; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -56,49 +58,12 @@ void cleanup() { vx_mem_free(device, kernel_arg.src0_addr); vx_mem_free(device, kernel_arg.src1_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, - uint32_t num_points) { - // start device - std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); - - // wait for completion - std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); - - // download destination buffer - std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size)); - - // verify result - std::cout << "verify result" << std::endl; - { - int errors = 0; - auto buf_ptr = (int32_t*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - int ref = i + i; - int cur = buf_ptr[i]; - if (cur != ref) { - std::cout << "error at result #" << std::dec << i - << std::hex << ": actual 0x" << cur << ", 
expected 0x" << ref << std::endl; - ++errors; - } - } - if (errors != 0) { - std::cout << "Found " << std::dec << errors << " errors!" << std::endl; - std::cout << "FAILED!" << std::endl; - return 1; - } - } - - return 0; -} - int main(int argc, char *argv[]) { // parse command arguments parse_args(argc, argv); @@ -123,10 +88,6 @@ int main(int argc, char *argv[]) { std::cout << "number of points: " << num_points << std::endl; std::cout << "buffer size: " << buf_size << " bytes" << std::endl; - // upload program - std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - // allocate device memory std::cout << "allocate device memory" << std::endl; RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr)); @@ -141,14 +102,8 @@ int main(int argc, char *argv[]) { std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; // allocate staging buffer - std::cout << "allocate staging buffer" << std::endl; - uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - staging_buf.resize(alloc_size); - - // upload kernel argument - std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + std::cout << "allocate staging buffer" << std::endl; + staging_buf.resize(buf_size); // upload source buffer0 { @@ -180,9 +135,46 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size)); } - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, buf_size, num_points)); + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), 
&kernel_args_addr)); + + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + int ref = i + i; + int cur = buf_ptr[i]; + if (cur != ref) { + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } // cleanup std::cout << "cleanup" << std::endl; diff --git a/tests/regression/io_addr/common.h b/tests/regression/io_addr/common.h index 5e5bf23f2..edf054657 100644 --- a/tests/regression/io_addr/common.h +++ b/tests/regression/io_addr/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - typedef struct { uint32_t num_points; uint64_t src_addr; diff --git a/tests/regression/io_addr/kernel.cpp b/tests/regression/io_addr/kernel.cpp index 5328e2b9c..4d2ea30c9 100644 --- a/tests/regression/io_addr/kernel.cpp +++ b/tests/regression/io_addr/kernel.cpp @@ -13,7 +13,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/io_addr/main.cpp 
b/tests/regression/io_addr/main.cpp index 7001a063a..54cb8d5ed 100644 --- a/tests/regression/io_addr/main.cpp +++ b/tests/regression/io_addr/main.cpp @@ -32,6 +32,8 @@ std::vector ref_data; vx_device_h device = nullptr; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -65,6 +67,8 @@ void cleanup() { if (device) { vx_mem_free(device, kernel_arg.src_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_mem_free(device, usr_test_mem); vx_dev_close(device); } @@ -95,45 +99,6 @@ void gen_ref_data(uint32_t num_points) { } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, - uint32_t num_points) { - // start device - std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); - - // wait for completion - std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); - - // download destination buffer - std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size)); - - // verify result - std::cout << "verify result" << std::endl; - { - int errors = 0; - auto buf_ptr = (int32_t*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - int ref = ref_data.at(i); - int cur = buf_ptr[i]; - if (cur != ref) { - std::cout << "error at result #" << std::dec << i - << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; - ++errors; - } - } - if (errors != 0) { - std::cout << "Found " << std::dec << errors << " errors!" << std::endl; - std::cout << "FAILED!" 
<< std::endl; - return 1; - } - } - - return 0; -} - int main(int argc, char *argv[]) { uint64_t value; @@ -165,10 +130,6 @@ int main(int argc, char *argv[]) { std::cout << "number of points: " << std::dec << num_points << std::endl; - // upload program - std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - // allocate device memory std::cout << "allocate device memory" << std::endl; @@ -184,16 +145,9 @@ int main(int argc, char *argv[]) { // allocate staging buffer std::cout << "allocate staging buffer" << std::endl; uint32_t staging_buf_size = std::max(NUM_ADDRS * sizeof(uint64_t), - std::max(src_buf_size, - std::max(dst_buf_size, - sizeof(kernel_arg_t)))); + std::max(src_buf_size, dst_buf_size)); staging_buf.resize(staging_buf_size); - // upload kernel argument - std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - // upload test address data { std::cout << "upload test address data" << std::endl; @@ -223,9 +177,46 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size)); } - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points)); + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); + + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination 
buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + int ref = ref_data.at(i); + int cur = buf_ptr[i]; + if (cur != ref) { + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } // cleanup std::cout << "cleanup" << std::endl; diff --git a/tests/regression/lmem/common.h b/tests/regression/lmem/common.h index 941983ac4..98b8ff587 100644 --- a/tests/regression/lmem/common.h +++ b/tests/regression/lmem/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - #ifndef TYPE #define TYPE float #endif diff --git a/tests/regression/lmem/kernel.cpp b/tests/regression/lmem/kernel.cpp index 499454409..8a9fc691c 100644 --- a/tests/regression/lmem/kernel.cpp +++ b/tests/regression/lmem/kernel.cpp @@ -17,7 +17,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/lmem/main.cpp b/tests/regression/lmem/main.cpp index 942c3f094..fb997642e 100644 --- a/tests/regression/lmem/main.cpp +++ b/tests/regression/lmem/main.cpp @@ -75,6 +75,8 @@ uint32_t count = 16; vx_device_h device = nullptr; std::vector source_data; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() 
{ @@ -109,47 +111,12 @@ void cleanup() { vx_mem_free(device, kernel_arg.src0_addr); vx_mem_free(device, kernel_arg.src1_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, - uint32_t num_points) { - // start device - std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); - - // wait for completion - std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); - - // download destination buffer - std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size)); - - // verify result - std::cout << "verify result" << std::endl; - { - int errors = 0; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - auto ref = source_data[2 * i + 0] + source_data[2 * i + 1]; - auto cur = buf_ptr[i]; - if (!Comparator::compare(cur, ref, i, errors)) { - ++errors; - } - } - if (errors != 0) { - std::cout << "Found " << std::dec << errors << " errors!" << std::endl; - std::cout << "FAILED!" 
<< std::endl; - return 1; - } - } - - return 0; -} - int main(int argc, char *argv[]) { // parse command arguments parse_args(argc, argv); @@ -175,7 +142,7 @@ int main(int argc, char *argv[]) { // upload program std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); // allocate device memory std::cout << "allocate device memory" << std::endl; @@ -191,14 +158,12 @@ int main(int argc, char *argv[]) { std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; // allocate staging buffer - std::cout << "allocate staging buffer" << std::endl; - uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - staging_buf.resize(alloc_size); + std::cout << "allocate staging buffer" << std::endl; + staging_buf.resize(buf_size); // upload kernel argument std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); // generate source data source_data.resize(2 * num_points); @@ -231,9 +196,36 @@ int main(int argc, char *argv[]) { memset(staging_buf.data(), 0, num_points * sizeof(TYPE)); RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size)); - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, buf_size, num_points)); + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), 
kernel_arg.dst_addr, buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + auto ref = source_data[2 * i + 0] + source_data[2 * i + 1]; + auto cur = buf_ptr[i]; + if (!Comparator::compare(cur, ref, i, errors)) { + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } // cleanup std::cout << "cleanup" << std::endl; diff --git a/tests/regression/mstress/common.h b/tests/regression/mstress/common.h index 3fb4169d4..27d572a7b 100644 --- a/tests/regression/mstress/common.h +++ b/tests/regression/mstress/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - #define NUM_LOADS 8 typedef struct { diff --git a/tests/regression/mstress/kernel.cpp b/tests/regression/mstress/kernel.cpp index 535dfd10c..231517934 100644 --- a/tests/regression/mstress/kernel.cpp +++ b/tests/regression/mstress/kernel.cpp @@ -23,7 +23,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/mstress/main.cpp b/tests/regression/mstress/main.cpp index ee31cc1d8..1d9e3e3d1 100644 --- a/tests/regression/mstress/main.cpp +++ b/tests/regression/mstress/main.cpp @@ -73,6 +73,8 @@ std::vector addr_table; vx_device_h device = nullptr; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -107,6 +109,8 @@ void cleanup() { vx_mem_free(device, kernel_arg.src0_addr); vx_mem_free(device, kernel_arg.src1_addr); vx_mem_free(device, kernel_arg.dst_addr); + 
vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } @@ -128,12 +132,94 @@ void gen_input_data(uint32_t num_points) { } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t dst_buf_size, - uint32_t num_points) { +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + if (count == 0) { + count = 1; + } + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + uint64_t num_cores, num_warps, num_threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); + + uint32_t num_tasks = num_cores * num_warps * num_threads; + uint32_t num_points = count * num_tasks; + + // generate input data + gen_input_data(num_points); + + uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t); + uint32_t src_buf_size = test_data.size() * sizeof(int32_t); + uint32_t dst_buf_size = test_data.size() * sizeof(int32_t); + + std::cout << "number of points: " << num_points << std::endl; + std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, addr_buf_size, &kernel_arg.src0_addr)); + RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src1_addr)); + RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr)); + + kernel_arg.num_tasks = num_tasks; + kernel_arg.stride = count; + + std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl; + std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl; + std::cout << "dev_dst=0x" << std::hex << 
kernel_arg.dst_addr << std::endl; + + // allocate staging buffer + std::cout << "allocate staging buffer" << std::endl; + uint32_t staging_buf_size = std::max(src_buf_size, + std::max(addr_buf_size, dst_buf_size)); + staging_buf.resize(staging_buf_size); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); + + // upload source buffer0 + { + std::cout << "upload address buffer" << std::endl; + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, addr_table.data(), addr_table.size() * sizeof(int32_t)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), addr_buf_size)); + } + + // upload source buffer1 + { + std::cout << "upload source buffer" << std::endl; + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, test_data.data(), test_data.size() * sizeof(int32_t)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), src_buf_size)); + } + + // clear destination buffer + { + std::cout << "clear destination buffer" << std::endl; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < test_data.size(); ++i) { + buf_ptr[i] = 0xdeadbeef; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size)); + } + // start device std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); // wait for completion std::cout << "wait for completion" << std::endl; @@ -175,101 +261,6 @@ int run_test(const kernel_arg_t& kernel_arg, } } - return 0; -} - -int main(int argc, char *argv[]) { - // parse command arguments - parse_args(argc, argv); - - if (count == 0) { - count = 1; - } - - std::srand(50); - - // open device connection - std::cout << "open device connection" << std::endl; - RT_CHECK(vx_dev_open(&device)); - - uint64_t num_cores, num_warps, num_threads; - RT_CHECK(vx_dev_caps(device, 
VX_CAPS_NUM_CORES, &num_cores)); - RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); - RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); - - uint32_t num_tasks = num_cores * num_warps * num_threads; - uint32_t num_points = count * num_tasks; - - // generate input data - gen_input_data(num_points); - - uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t); - uint32_t src_buf_size = test_data.size() * sizeof(int32_t); - uint32_t dst_buf_size = test_data.size() * sizeof(int32_t); - - std::cout << "number of points: " << num_points << std::endl; - std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; - - // upload program - std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - - // allocate device memory - std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, addr_buf_size, &kernel_arg.src0_addr)); - RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src1_addr)); - RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr)); - - kernel_arg.num_tasks = num_tasks; - kernel_arg.stride = count; - - std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl; - std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl; - std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; - - // allocate staging buffer - std::cout << "allocate staging buffer" << std::endl; - uint32_t staging_buf_size = std::max(src_buf_size, - std::max(addr_buf_size, - std::max(dst_buf_size, - sizeof(kernel_arg_t)))); - staging_buf.resize(staging_buf_size); - - // upload kernel argument - std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - - // upload source buffer0 - { - std::cout << "upload address buffer" << std::endl; - auto buf_ptr = 
staging_buf.data(); - memcpy(buf_ptr, addr_table.data(), addr_table.size() * sizeof(int32_t)); - RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), addr_buf_size)); - } - - // upload source buffer1 - { - std::cout << "upload source buffer" << std::endl; - auto buf_ptr = staging_buf.data(); - memcpy(buf_ptr, test_data.data(), test_data.size() * sizeof(int32_t)); - RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), src_buf_size)); - } - - // clear destination buffer - { - std::cout << "clear destination buffer" << std::endl; - auto buf_ptr = (int32_t*)staging_buf.data(); - for (uint32_t i = 0; i < test_data.size(); ++i) { - buf_ptr[i] = 0xdeadbeef; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size)); - } - - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points)); - // cleanup std::cout << "cleanup" << std::endl; cleanup(); diff --git a/tests/regression/no_mf_ext/common.h b/tests/regression/no_mf_ext/common.h index 05a761557..3f0b7132c 100644 --- a/tests/regression/no_mf_ext/common.h +++ b/tests/regression/no_mf_ext/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - typedef struct { uint32_t size; uint64_t src_addr; diff --git a/tests/regression/no_mf_ext/kernel.cpp b/tests/regression/no_mf_ext/kernel.cpp index b378f9535..df8e8e646 100644 --- a/tests/regression/no_mf_ext/kernel.cpp +++ b/tests/regression/no_mf_ext/kernel.cpp @@ -4,7 +4,7 @@ #include "common.h" int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); uint32_t size = arg->size; int32_t* src_ptr = (int32_t*)arg->src_addr; diff --git a/tests/regression/no_mf_ext/main.cpp b/tests/regression/no_mf_ext/main.cpp index 5ff9e2fcf..fa746ec5a 100644 --- a/tests/regression/no_mf_ext/main.cpp +++ b/tests/regression/no_mf_ext/main.cpp 
@@ -22,6 +22,8 @@ uint32_t count = 0; vx_device_h device = nullptr; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -55,16 +57,75 @@ void cleanup() { if (device) { vx_mem_free(device, kernel_arg.src_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, - uint32_t num_points) { +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + if (count == 0) { + count = 1; + } + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + uint32_t num_points = count; + uint32_t buf_size = num_points * sizeof(int32_t); + + std::cout << "number of points: " << num_points << std::endl; + std::cout << "buffer size: " << buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr)); + + kernel_arg.size = num_points; + + std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl; + std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; + + // allocate staging buffer + std::cout << "allocate staging buffer" << std::endl; + staging_buf.resize(buf_size); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); + + // upload source buffer0 + { + std::cout << "upload source buffer" << std::endl; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t 
i = 0; i < num_points; ++i) { + buf_ptr[i] = i-1; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size)); + } + + // clear destination buffer + { + std::cout << "clear destination buffer" << std::endl; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = 0xdeadbeef; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size)); + } + // start device std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); // wait for completion std::cout << "wait for completion" << std::endl; @@ -95,75 +156,6 @@ int run_test(const kernel_arg_t& kernel_arg, } } - return 0; -} - -int main(int argc, char *argv[]) { - // parse command arguments - parse_args(argc, argv); - - if (count == 0) { - count = 1; - } - - // open device connection - std::cout << "open device connection" << std::endl; - RT_CHECK(vx_dev_open(&device)); - - uint32_t num_points = count; - uint32_t buf_size = num_points * sizeof(int32_t); - - std::cout << "number of points: " << num_points << std::endl; - std::cout << "buffer size: " << buf_size << " bytes" << std::endl; - - // upload program - std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - - // allocate device memory - std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr)); - - kernel_arg.size = num_points; - - std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl; - std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; - - // allocate staging buffer - std::cout << "allocate staging buffer" << std::endl; - uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - staging_buf.resize(alloc_size); - - // upload kernel argument - std::cout << 
"upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - - // upload source buffer0 - { - std::cout << "upload source buffer" << std::endl; - auto buf_ptr = (int32_t*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = i-1; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size)); - } - - // clear destination buffer - { - std::cout << "clear destination buffer" << std::endl; - auto buf_ptr = (int32_t*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = 0xdeadbeef; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size)); - } - - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, buf_size, num_points)); - // cleanup std::cout << "cleanup" << std::endl; cleanup(); diff --git a/tests/regression/printf/common.h b/tests/regression/printf/common.h index ac1009490..3841cd5bb 100644 --- a/tests/regression/printf/common.h +++ b/tests/regression/printf/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - typedef struct { uint32_t num_points; uint64_t src_addr; diff --git a/tests/regression/printf/kernel.cpp b/tests/regression/printf/kernel.cpp index 8f35de7e8..dba73d8e0 100644 --- a/tests/regression/printf/kernel.cpp +++ b/tests/regression/printf/kernel.cpp @@ -12,7 +12,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/printf/main.cpp b/tests/regression/printf/main.cpp index 5edfa8e8f..d4a6c2ea4 100644 --- a/tests/regression/printf/main.cpp +++ 
b/tests/regression/printf/main.cpp @@ -22,6 +22,8 @@ uint32_t count = 4; vx_device_h device = nullptr; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -54,22 +56,12 @@ static void parse_args(int argc, char **argv) { void cleanup() { if (device) { vx_mem_free(device, kernel_arg.src_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } -int run_test() { - // start device - std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); - - // wait for completion - std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); - - return 0; -} - int main(int argc, char *argv[]) { // parse command arguments parse_args(argc, argv); @@ -94,7 +86,7 @@ int main(int argc, char *argv[]) { // upload program std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); // allocate device memory std::cout << "allocate device memory" << std::endl; @@ -105,14 +97,12 @@ int main(int argc, char *argv[]) { std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl; // allocate staging buffer - std::cout << "allocate staging buffer" << std::endl; - uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - staging_buf.resize(alloc_size); + std::cout << "allocate staging buffer" << std::endl; + staging_buf.resize(buf_size); // upload kernel argument std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); // upload source buffer0 { @@ -124,9 +114,15 @@ int main(int argc, char *argv[]) { 
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size)); } - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test()); + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // no early return here: fall through so the cleanup section below still runs // cleanup std::cout << "cleanup" << std::endl; diff --git a/tests/regression/sgemmx/common.h b/tests/regression/sgemmx/common.h index 20d612071..6e5c460ff 100644 --- a/tests/regression/sgemmx/common.h +++ b/tests/regression/sgemmx/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - #ifndef TYPE #define TYPE float #endif diff --git a/tests/regression/sgemmx/kernel.cpp b/tests/regression/sgemmx/kernel.cpp index cbee7affc..29026fdef 100644 --- a/tests/regression/sgemmx/kernel.cpp +++ b/tests/regression/sgemmx/kernel.cpp @@ -31,7 +31,7 @@ void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/sgemmx/main.cpp b/tests/regression/sgemmx/main.cpp index 1b405e688..994270ec1 100644 --- a/tests/regression/sgemmx/main.cpp +++ b/tests/regression/sgemmx/main.cpp @@ -36,7 +36,7 @@ public: static bool compare(int a, int b, int index, int errors) { if (a != b) { if (errors < 100) { - printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a); } return false; } @@ -61,7 +61,7 @@ public: auto d = std::abs(fa.i - fb.i); if (d > FLOAT_ULP) { if (errors < 100) { - printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + printf("*** error: [%d] expected=%f, 
actual=%f\n", index, b, a); } return false; } @@ -85,7 +85,8 @@ const char* kernel_file = "kernel.bin"; uint32_t size = 32; vx_device_h device = nullptr; -std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -120,6 +121,8 @@ void cleanup() { vx_mem_free(device, kernel_arg.A_addr); vx_mem_free(device, kernel_arg.B_addr); vx_mem_free(device, kernel_arg.C_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } @@ -140,10 +143,6 @@ int main(int argc, char *argv[]) { std::cout << "data type: " << Comparator::type_str() << std::endl; std::cout << "matrix size: " << size << "x" << size << std::endl; - // upload program - std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - // allocate device memory std::cout << "allocate device memory" << std::endl; RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.A_addr)); @@ -157,16 +156,6 @@ int main(int argc, char *argv[]) { std::cout << "dev_argA=0x" << std::hex << kernel_arg.A_addr << std::endl; std::cout << "dev_argB=0x" << std::hex << kernel_arg.B_addr << std::endl; std::cout << "dev_argC=0x" << std::hex << kernel_arg.C_addr << std::endl; - - // allocate staging buffer - std::cout << "allocate staging buffer" << std::endl; - uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - staging_buf.resize(alloc_size); - - // upload kernel argument - std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); // generate source data std::vector h_A(num_points); @@ -178,38 +167,32 @@ int main(int argc, char *argv[]) { h_A[i] = static_cast(a * size); h_B[i] = static_cast(b * size); } - matmul_cpu(h_C.data(), h_A.data(), h_B.data(), size, size); // upload matrix A buffer { 
std::cout << "upload matrix A buffer" << std::endl; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = h_A[i]; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, staging_buf.data(), buf_size)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, h_A.data(), buf_size)); } // upload matrix B buffer { std::cout << "upload matrix B buffer" << std::endl; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = h_B[i]; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, staging_buf.data(), buf_size)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, h_B.data(), buf_size)); } - // clear destination buffer - std::cout << "clear destination buffer" << std::endl; - memset(staging_buf.data(), 0, buf_size); - RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size)); + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); auto time_start = std::chrono::high_resolution_clock::now(); // start device std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); // wait for completion std::cout << "wait for completion" << std::endl; @@ -221,17 +204,17 @@ int main(int argc, char *argv[]) { // download destination buffer std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size)); + RT_CHECK(vx_copy_from_dev(device, h_C.data(), kernel_arg.C_addr, buf_size)); // verify result std::cout << "verify result" << std::endl; { - int errors = 0; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < h_C.size(); ++i) { - auto 
ref = h_C[i]; - auto cur = buf_ptr[i]; - if (!Comparator::compare(cur, ref, i, errors)) { + std::vector h_ref(num_points); + matmul_cpu(h_ref.data(), h_A.data(), h_B.data(), size, size); + + int errors = 0; + for (uint32_t i = 0; i < h_ref.size(); ++i) { + if (!Comparator::compare(h_C[i], h_ref[i], i, errors)) { ++errors; } } diff --git a/tests/regression/sort/common.h b/tests/regression/sort/common.h index 92ceeb91c..4ebfb742d 100644 --- a/tests/regression/sort/common.h +++ b/tests/regression/sort/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - #ifndef TYPE #define TYPE int #endif diff --git a/tests/regression/sort/kernel.cpp b/tests/regression/sort/kernel.cpp index 2e9d3453a..bea1ab5eb 100644 --- a/tests/regression/sort/kernel.cpp +++ b/tests/regression/sort/kernel.cpp @@ -19,7 +19,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/sort/main.cpp b/tests/regression/sort/main.cpp index 566e6d415..4806c14f5 100644 --- a/tests/regression/sort/main.cpp +++ b/tests/regression/sort/main.cpp @@ -25,6 +25,8 @@ std::vector ref_data; vx_device_h device = nullptr; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -58,6 +60,8 @@ void cleanup() { if (device) { vx_mem_free(device, kernel_arg.src_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } @@ -87,45 +91,6 @@ void gen_ref_data(uint32_t num_points) { } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, - uint32_t num_points) { - // start device - std::cout << "start device" << std::endl; 
- RT_CHECK(vx_start(device)); - - // wait for completion - std::cout << "wait for completion" << std::endl; - RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); - - // download destination buffer - std::cout << "download destination buffer" << std::endl; - RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size)); - - // verify result - std::cout << "verify result" << std::endl; - { - int errors = 0; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - TYPE ref = ref_data.at(i); - TYPE cur = buf_ptr[i]; - if (cur != ref) { - std::cout << "error at result #" << std::dec << i - << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl; - ++errors; - } - } - if (errors != 0) { - std::cout << "Found " << std::dec << errors << " errors!" << std::endl; - std::cout << "FAILED!" << std::endl; - return 1; - } - } - - return 0; -} - int main(int argc, char *argv[]) { // parse command arguments parse_args(argc, argv); @@ -154,10 +119,6 @@ int main(int argc, char *argv[]) { std::cout << "number of points: " << num_points << std::endl; std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; - // upload program - std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - // allocate device memory std::cout << "allocate device memory" << std::endl; RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src_addr)); @@ -171,17 +132,10 @@ int main(int argc, char *argv[]) { // allocate staging buffer { std::cout << "allocate staging buffer" << std::endl; - uint32_t staging_buf_size = std::max(src_buf_size, - std::max(dst_buf_size, - sizeof(kernel_arg_t))); + uint32_t staging_buf_size = std::max(src_buf_size, dst_buf_size); staging_buf.resize(staging_buf_size); } - // upload kernel argument - std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, 
KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - // upload source buffer { std::cout << "upload source buffer" << std::endl; @@ -200,9 +154,46 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size)); } - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points)); + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); + + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; + RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + TYPE ref = ref_data.at(i); + TYPE cur = buf_ptr[i]; + if (cur != ref) { + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" 
<< std::endl; + return 1; + } + } // cleanup std::cout << "cleanup" << std::endl; diff --git a/tests/regression/vecaddx/common.h b/tests/regression/vecaddx/common.h index 2b8f164a5..b511332c1 100644 --- a/tests/regression/vecaddx/common.h +++ b/tests/regression/vecaddx/common.h @@ -1,8 +1,6 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 - #ifndef TYPE #define TYPE float #endif diff --git a/tests/regression/vecaddx/kernel.cpp b/tests/regression/vecaddx/kernel.cpp index 6ed421642..596ce8d1a 100644 --- a/tests/regression/vecaddx/kernel.cpp +++ b/tests/regression/vecaddx/kernel.cpp @@ -12,7 +12,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { } int main() { - kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; + kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp index a3f1124d6..6522c9409 100644 --- a/tests/regression/vecaddx/main.cpp +++ b/tests/regression/vecaddx/main.cpp @@ -34,7 +34,7 @@ public: static bool compare(int a, int b, int index, int errors) { if (a != b) { if (errors < 100) { - printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b); + printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a); } return false; } @@ -61,7 +61,7 @@ public: auto d = std::abs(fa.i - fb.i); if (d > FLOAT_ULP) { if (errors < 100) { - printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b); + printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a); } return false; } @@ -75,6 +75,8 @@ uint32_t size = 16; vx_device_h device = nullptr; std::vector source_data; std::vector staging_buf; +uint64_t kernel_prog_addr; +uint64_t kernel_args_addr; kernel_arg_t kernel_arg = {}; static void show_usage() { @@ -109,16 +111,90 @@ void cleanup() { vx_mem_free(device, kernel_arg.src0_addr); vx_mem_free(device, 
kernel_arg.src1_addr); vx_mem_free(device, kernel_arg.dst_addr); + vx_mem_free(device, kernel_prog_addr); + vx_mem_free(device, kernel_args_addr); vx_dev_close(device); } } -int run_test(const kernel_arg_t& kernel_arg, - uint32_t buf_size, - uint32_t num_points) { +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + uint64_t num_cores, num_warps, num_threads; + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); + RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); + std::cout << "number of cores: " << num_cores << std::endl; + std::cout << "number of warps: " << num_warps << std::endl; + std::cout << "number of threads: " << num_threads << std::endl; + + uint32_t num_points = size; + uint32_t buf_size = num_points * sizeof(TYPE); + + std::cout << "number of points: " << num_points << std::endl; + std::cout << "data type: " << Comparator::type_str() << std::endl; + std::cout << "buffer size: " << buf_size << " bytes" << std::endl; + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr)); + RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr)); + + kernel_arg.num_points = num_points; + + std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl; + std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl; + std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; + + // allocate staging buffer + std::cout << "allocate staging buffer" << std::endl; + staging_buf.resize(buf_size); + + // generate source data + source_data.resize(2 * num_points); + for (uint32_t i = 0; i < 
source_data.size(); ++i) { + source_data[i] = Comparator::generate(); + } + + // upload source buffer0 + { + std::cout << "upload source buffer0" << std::endl; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = source_data[2 * i + 0]; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size)); + } + + // upload source buffer1 + { + std::cout << "upload source buffer1" << std::endl; + auto buf_ptr = (TYPE*)staging_buf.data(); + for (uint32_t i = 0; i < num_points; ++i) { + buf_ptr[i] = source_data[2 * i + 1]; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size)); + } + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr)); + + // upload kernel argument + std::cout << "upload kernel argument" << std::endl; + RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr)); + // start device std::cout << "start device" << std::endl; - RT_CHECK(vx_start(device)); + RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr)); // wait for completion std::cout << "wait for completion" << std::endl; @@ -147,95 +223,6 @@ int run_test(const kernel_arg_t& kernel_arg, } } - return 0; -} - -int main(int argc, char *argv[]) { - // parse command arguments - parse_args(argc, argv); - - std::srand(50); - - // open device connection - std::cout << "open device connection" << std::endl; - RT_CHECK(vx_dev_open(&device)); - - uint64_t num_cores, num_warps, num_threads; - RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores)); - RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps)); - RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads)); - std::cout << "number of cores: " << num_cores << std::endl; - std::cout << "number of warps: " << num_warps << std::endl; - std::cout << "number of threads: " << num_threads << std::endl; - - 
uint32_t num_points = size; - uint32_t buf_size = num_points * sizeof(TYPE); - - std::cout << "number of points: " << num_points << std::endl; - std::cout << "data type: " << Comparator::type_str() << std::endl; - std::cout << "buffer size: " << buf_size << " bytes" << std::endl; - - // upload program - std::cout << "upload program" << std::endl; - RT_CHECK(vx_upload_kernel_file(device, kernel_file)); - - // allocate device memory - std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr)); - - kernel_arg.num_points = num_points; - - std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl; - std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl; - std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl; - - // allocate staging buffer - std::cout << "allocate staging buffer" << std::endl; - uint32_t alloc_size = std::max(buf_size, sizeof(kernel_arg_t)); - staging_buf.resize(alloc_size); - - // upload kernel argument - std::cout << "upload kernel argument" << std::endl; - memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); - RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); - - // generate source data - source_data.resize(2 * num_points); - for (uint32_t i = 0; i < source_data.size(); ++i) { - source_data[i] = Comparator::generate(); - } - - // upload source buffer0 - { - std::cout << "upload source buffer0" << std::endl; - auto buf_ptr = (TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = source_data[2 * i + 0]; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size)); - } - - // upload source buffer1 - { - std::cout << "upload source buffer1" << std::endl; - auto buf_ptr = 
(TYPE*)staging_buf.data(); - for (uint32_t i = 0; i < num_points; ++i) { - buf_ptr[i] = source_data[2 * i + 1]; - } - RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size)); - } - - // clear destination buffer - std::cout << "clear destination buffer" << std::endl; - memset(staging_buf.data(), 0, num_points * sizeof(TYPE)); - RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size)); - - // run tests - std::cout << "run tests" << std::endl; - RT_CHECK(run_test(kernel_arg, buf_size, num_points)); - // cleanup std::cout << "cleanup" << std::endl; cleanup();