enabling explicit kernel address and arguments allocation

This commit is contained in:
Blaise Tine 2024-04-12 06:58:42 -07:00
parent b32ea5b750
commit 25d0c76d14
66 changed files with 1223 additions and 1084 deletions

View file

@ -55,6 +55,7 @@ package VX_gpu_pkg;
// Base device configuration registers (DCRs) carried as one packed struct.
typedef struct packed {
logic [`XLEN-1:0] startup_addr; // XLEN-bit startup (kernel entry) address
logic [`XLEN-1:0] startup_arg; // XLEN-bit startup argument; the CSR logic copies it into mscratch on reset
logic [7:0] mpm_class; // 8-bit performance-monitor class (presumably selects the counter group; confirm)
} base_dcrs_t;

View file

@ -22,8 +22,10 @@
`define VX_DCR_BASE_STATE_BEGIN 12'h001
`define VX_DCR_BASE_STARTUP_ADDR0 12'h001
`define VX_DCR_BASE_STARTUP_ADDR1 12'h002
`define VX_DCR_BASE_MPM_CLASS 12'h003
`define VX_DCR_BASE_STATE_END 12'h004
`define VX_DCR_BASE_STARTUP_ARG0 12'h003
`define VX_DCR_BASE_STARTUP_ARG1 12'h004
`define VX_DCR_BASE_MPM_CLASS 12'h005
`define VX_DCR_BASE_STATE_END 12'h006
`define VX_DCR_BASE_STATE(addr) ((addr) - `VX_DCR_BASE_STATE_BEGIN)
`define VX_DCR_BASE_STATE_COUNT (`VX_DCR_BASE_STATE_END-`VX_DCR_BASE_STATE_BEGIN)

View file

@ -121,6 +121,9 @@ import VX_fpu_pkg::*;
`endif
always @(posedge clk) begin
if (reset) begin
mscratch <= base_dcrs.startup_arg;
end
if (write_enable) begin
case (write_addr)
`ifdef EXT_F_ENABLE

View file

@ -35,6 +35,10 @@ module VX_dcr_data import VX_gpu_pkg::*; (
`VX_DCR_BASE_STARTUP_ADDR0 : dcrs.startup_addr[31:0] <= dcr_bus_if.write_data;
`ifdef XLEN_64
`VX_DCR_BASE_STARTUP_ADDR1 : dcrs.startup_addr[63:32] <= dcr_bus_if.write_data;
`endif
`VX_DCR_BASE_STARTUP_ARG0 : dcrs.startup_arg[31:0] <= dcr_bus_if.write_data;
`ifdef XLEN_64
`VX_DCR_BASE_STARTUP_ARG1 : dcrs.startup_arg[63:32] <= dcr_bus_if.write_data;
`endif
`VX_DCR_BASE_MPM_CLASS : dcrs.mpm_class <= dcr_bus_if.write_data[7:0];
default:;

View file

@ -348,6 +348,8 @@ task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
case (addr)
`VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0"));
`VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1"));
`VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0"));
`VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1"));
`VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS"));
default: `TRACE(level, ("?"));
endcase

View file

@ -14,8 +14,6 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#define KERNEL_ARG_DEV_MEM_ADDR 0x40
typedef struct {
uint32_t count;
uint32_t src_addr;
@ -23,7 +21,7 @@ typedef struct {
} kernel_arg_t;
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
uint32_t count = arg->count;
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;

View file

@ -20,6 +20,7 @@ LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex -mllvm -vortex-branch-div
#CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS)
#CXX = $(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS)
#AR = $(LLVM_VORTEX)/bin/llvm-ar
#DP = $(LLVM_VORTEX)/bin/llvm-objdump
#CP = $(LLVM_VORTEX)/bin/llvm-objcopy
@ -29,7 +30,7 @@ AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -fdata-sections -ffunction-sections
CFLAGS += -I$(INC_DIR) -I$(ROOT_DIR)/hw
CFLAGS += -DXLEN_$(XLEN)

View file

@ -15,7 +15,6 @@
#define __VX_INTRINSICS_H__
#include <stddef.h>
#include <VX_config.h>
#include <VX_types.h>
#if defined(__clang__)

View file

@ -20,8 +20,6 @@
extern "C" {
#endif
#define NUM_CORES_MAX 1024
#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif
@ -99,11 +97,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
int NC = vx_num_cores();
int NW = vx_num_warps();
int NT = vx_num_threads();
// current core id
int core_id = vx_core_id();
if (core_id >= NUM_CORES_MAX)
return;
// calculate necessary active cores
int WT = NW * NT;
@ -258,11 +252,7 @@ void vx_spawn_pocl_kernel(pocl_kernel_context_t * ctx, pocl_kernel_cb callback,
int NC = vx_num_cores();
int NW = vx_num_warps();
int NT = vx_num_threads();
// current core id
int core_id = vx_core_id();
if (core_id >= NUM_CORES_MAX)
return;
// calculate necessary active cores
int WT = NW * NT;

83
miscs/musl_libc.patch Normal file
View file

@ -0,0 +1,83 @@
diff --git a/src/setjmp/riscv32/longjmp.S b/src/setjmp/riscv32/longjmp.S
index f9cb3318..0980ea23 100644
--- a/src/setjmp/riscv32/longjmp.S
+++ b/src/setjmp/riscv32/longjmp.S
@@ -23,6 +23,20 @@ longjmp:
lw ra, 52(a0)
#ifndef __riscv_float_abi_soft
+#ifdef __riscv_float_abi_single
+ flw fs0, 56(a0)
+ flw fs1, 64(a0)
+ flw fs2, 72(a0)
+ flw fs3, 80(a0)
+ flw fs4, 88(a0)
+ flw fs5, 96(a0)
+ flw fs6, 104(a0)
+ flw fs7, 112(a0)
+ flw fs8, 120(a0)
+ flw fs9, 128(a0)
+ flw fs10, 136(a0)
+ flw fs11, 144(a0)
+#else
fld fs0, 56(a0)
fld fs1, 64(a0)
fld fs2, 72(a0)
@@ -35,6 +49,7 @@ longjmp:
fld fs9, 128(a0)
fld fs10, 136(a0)
fld fs11, 144(a0)
+#endif
#endif
seqz a0, a1
diff --git a/src/setjmp/riscv32/setjmp.S b/src/setjmp/riscv32/setjmp.S
index 8a75cf55..7efb10e0 100644
--- a/src/setjmp/riscv32/setjmp.S
+++ b/src/setjmp/riscv32/setjmp.S
@@ -23,18 +23,33 @@ setjmp:
sw ra, 52(a0)
#ifndef __riscv_float_abi_soft
- fsd fs0, 56(a0)
- fsd fs1, 64(a0)
- fsd fs2, 72(a0)
- fsd fs3, 80(a0)
- fsd fs4, 88(a0)
- fsd fs5, 96(a0)
- fsd fs6, 104(a0)
- fsd fs7, 112(a0)
- fsd fs8, 120(a0)
- fsd fs9, 128(a0)
- fsd fs10, 136(a0)
- fsd fs11, 144(a0)
+#ifdef __riscv_float_abi_single
+ fsw fs0, 56(a0)
+ fsw fs1, 64(a0)
+ fsw fs2, 72(a0)
+ fsw fs3, 80(a0)
+ fsw fs4, 88(a0)
+ fsw fs5, 96(a0)
+ fsw fs6, 104(a0)
+ fsw fs7, 112(a0)
+ fsw fs8, 120(a0)
+ fsw fs9, 128(a0)
+ fsw fs10, 136(a0)
+ fsw fs11, 144(a0)
+#else
+ fsd fs0, 56(a0)
+ fsd fs1, 64(a0)
+ fsd fs2, 72(a0)
+ fsd fs3, 80(a0)
+ fsd fs4, 88(a0)
+ fsd fs5, 96(a0)
+ fsd fs6, 104(a0)
+ fsd fs7, 112(a0)
+ fsd fs8, 120(a0)
+ fsd fs9, 128(a0)
+ fsd fs10, 136(a0)
+ fsd fs11, 144(a0)
+#endif
#endif
li a0, 0

View file

@ -96,45 +96,6 @@ void perf_remove_device(vx_device_h hdevice) {
///////////////////////////////////////////////////////////////////////////////
// Copies a kernel image to the device at the address reported by
// vx_dev_caps(VX_CAPS_KERNEL_BASE_ADDR).
//
// Returns -1 when `content` is null or `size` is zero; otherwise the non-zero
// status of vx_dev_caps if that query fails, else the status of vx_copy_to_dev.
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size) {
  // nothing to upload
  if (content == NULL) {
    return -1;
  }
  if (size == 0) {
    return -1;
  }

  // ask the device where the kernel image is expected to live
  uint64_t base_addr;
  const int caps_status = vx_dev_caps(hdevice, VX_CAPS_KERNEL_BASE_ADDR, &base_addr);
  if (caps_status != 0) {
    return caps_status;
  }

  const int copy_status = vx_copy_to_dev(hdevice, base_addr, content, size);
  return copy_status;
}
// Reads the kernel image stored in `filename` and uploads it through
// vx_upload_kernel_bytes().
//
// Returns -1 if `filename` is null, the file cannot be opened, or its contents
// cannot be read; otherwise the status of vx_upload_kernel_bytes() (which itself
// returns -1 for an empty image).
extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename) {
  if (nullptr == filename)
    return -1;

  // binary mode: the image must reach the device byte-for-byte
  std::ifstream ifs(filename, std::ios::binary);
  if (!ifs) {
    std::cout << "error: " << filename << " not found" << std::endl;
    return -1;
  }

  // read file content; tellg() reports -1 on failure, which must not be used
  // as a buffer length
  ifs.seekg(0, ifs.end);
  const std::streamoff size = ifs.tellg();
  if (size < 0) {
    std::cout << "error: cannot determine size of " << filename << std::endl;
    return -1;
  }
  std::vector<char> content(static_cast<size_t>(size));
  ifs.seekg(0, ifs.beg);
  ifs.read(content.data(), size);
  if (!ifs) {
    std::cout << "error: cannot read " << filename << std::endl;
    return -1;
  }

  // upload; the vector releases the staging buffer on every return path
  return vx_upload_kernel_bytes(hdevice, content.data(), content.size());
}
///////////////////////////////////////////////////////////////////////////////
// Stores `value` for configuration register `addr` in data_, replacing any
// value stored for that address before (assuming data_ is a map-like
// container keyed by address; its declaration is not visible here).
void DeviceConfig::write(uint32_t addr, uint32_t value) {
data_[addr] = value;
}
@ -146,18 +107,29 @@ uint32_t DeviceConfig::read(uint32_t addr) const {
return data_.at(addr);
}
///////////////////////////////////////////////////////////////////////////////
int dcr_initialize(vx_device_h hdevice) {
const uint64_t startup_addr(STARTUP_ADDR);
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
return -1;
return _ret;
});
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
return -1;
return _ret;
});
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, 0), {
return _ret;
});
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, 0), {
return _ret;
});
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
return -1;
return _ret;
});
return 0;
@ -165,14 +137,94 @@ int dcr_initialize(vx_device_h hdevice) {
///////////////////////////////////////////////////////////////////////////////
static uint64_t get_csr_64(const void* ptr, int addr) {
int offset = addr - VX_CSR_MPM_BASE;
return ((const uint64_t*)ptr)[offset];
// Reads the kernel image stored in `filename`, copies it to the fixed device
// address STARTUP_ADDR and reports that address through `*addr`.
//
// Returns -1 if `filename` or `addr` is null, or if the file cannot be opened
// or read; the status of vx_copy_to_dev() if the transfer fails; 0 on success.
// `*addr` is only written on success.
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr) {
  if (nullptr == filename || nullptr == addr)
    return -1;

  // binary mode: the image must reach the device byte-for-byte
  std::ifstream ifs(filename, std::ios::binary);
  if (!ifs) {
    std::cout << "error: " << filename << " not found" << std::endl;
    return -1;
  }

  // read file content; tellg() reports -1 on failure, which must not be used
  // as a buffer length
  ifs.seekg(0, ifs.end);
  const std::streamoff size = ifs.tellg();
  if (size < 0) {
    std::cout << "error: cannot determine size of " << filename << std::endl;
    return -1;
  }
  std::vector<char> content(static_cast<size_t>(size));
  ifs.seekg(0, ifs.beg);
  ifs.read(content.data(), size);
  if (!ifs) {
    std::cout << "error: cannot read " << filename << std::endl;
    return -1;
  }

  // The destination is the fixed startup address, not a vx_mem_alloc()
  // allocation, so there is nothing to free if the copy fails.
  uint64_t _addr = STARTUP_ADDR;
  RT_CHECK(vx_copy_to_dev(hdevice, _addr, content.data(), size), {
    return _ret;
  });

  *addr = _addr;
  return 0;
}
extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int ret = 0;
// Allocates `size` bytes of device memory, copies `content` into it and hands
// the device address back through `*addr`.
//
// Returns -1 when `content` or `addr` is null or `size` is zero, the failing
// call's status when allocation or the copy fails (the allocation is released
// again if the copy fails), and 0 on success. `*addr` is written only on success.
extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr) {
  // argument validation, one guard per requirement
  if (content == NULL) {
    return -1;
  }
  if (size == 0) {
    return -1;
  }
  if (addr == NULL) {
    return -1;
  }

  // reserve the device buffer
  uint64_t _addr;
  RT_CHECK(vx_mem_alloc(hdevice, size, &_addr), {
    return _ret;
  });

  // fill it; give the buffer back if the transfer does not succeed
  RT_CHECK(vx_copy_to_dev(hdevice, _addr, content, size), {
    vx_mem_free(hdevice, _addr);
    return _ret;
  });

  // publish the address only once the data is in place
  *addr = _addr;
  return 0;
}
// Reads the whole of `filename` into host memory and uploads it with
// vx_upload_bytes(), which allocates the device buffer and stores its address
// in `*addr`.
//
// Returns -1 if `filename` is null or the file cannot be opened or read;
// otherwise the status of vx_upload_bytes() (which rejects a null `addr` and
// empty content), 0 on success.
extern int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* addr) {
  if (nullptr == filename)
    return -1;

  // binary mode: the file must reach the device byte-for-byte
  std::ifstream ifs(filename, std::ios::binary);
  if (!ifs) {
    std::cout << "error: " << filename << " not found" << std::endl;
    return -1;
  }

  // read file content; tellg() reports -1 on failure, which must not be used
  // as a buffer length
  ifs.seekg(0, ifs.end);
  const std::streamoff size = ifs.tellg();
  if (size < 0) {
    std::cout << "error: cannot determine size of " << filename << std::endl;
    return -1;
  }
  std::vector<char> content(static_cast<size_t>(size));
  ifs.seekg(0, ifs.beg);
  ifs.read(content.data(), size);
  if (!ifs) {
    std::cout << "error: cannot read " << filename << std::endl;
    return -1;
  }

  // upload buffer
  RT_CHECK(vx_upload_bytes(hdevice, content.data(), size, addr), {
    return _ret;
  });

  return 0;
}
// Uploads the kernel argument block to freshly allocated device memory and
// programs its address into the STARTUP_ARG DCRs.
//
// Returns -1 when `content` is null or `size` is zero, the failing call's
// status when allocation, copy or a DCR write fails (the allocation is released
// on those paths), and 0 on success.
//
// NOTE(review): the signature offers no way to hand the buffer back to the
// caller, so a buffer published by an earlier successful call cannot be freed
// from here — confirm who owns it.
extern int vx_set_kernel_args(vx_device_h hdevice, const void* content, uint64_t size) {
  if (NULL == content || 0 == size)
    return -1;

  uint64_t startup_arg;
  RT_CHECK(vx_mem_alloc(hdevice, size, &startup_arg), {
    return _ret;
  });

  RT_CHECK(vx_copy_to_dev(hdevice, startup_arg, content, size), {
    vx_mem_free(hdevice, startup_arg);
    return _ret;
  });

  // Without this step the address was dropped on return: the device never
  // learned where the arguments are and the allocation was leaked.
  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, static_cast<uint32_t>(startup_arg & 0xffffffff)), {
    vx_mem_free(hdevice, startup_arg);
    return _ret;
  });

  RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, static_cast<uint32_t>(startup_arg >> 32)), {
    vx_mem_free(hdevice, startup_arg);
    return _ret;
  });

  return 0;
}
///////////////////////////////////////////////////////////////////////////////
extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t total_instrs = 0;
uint64_t total_cycles = 0;
uint64_t max_cycles = 0;
@ -234,16 +286,15 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
#endif
uint64_t num_cores;
ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
if (ret != 0)
return ret;
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
return _ret;
});
#ifdef PERF_ENABLE
uint64_t isa_flags;
ret = vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags);
if (ret != 0)
return ret;
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
return _ret;
});
bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE;
bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE;
bool l2cache_enable = isa_flags & VX_ISA_EXT_L2CACHE;
@ -251,16 +302,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
bool lmem_enable = isa_flags & VX_ISA_EXT_LMEM;
#endif
std::vector<uint8_t> staging_buf(32 * sizeof(uint64_t));
std::vector<uint64_t> staging_buf(32);
auto get_mpm_csr = [&staging_buf](int csr_addr) {
return staging_buf.at(csr_addr - VX_CSR_MPM_BASE);
};
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
// staging_buf holds uint64_t counters, so size() is an element count; the
// per-core stride in device memory has to be expressed in bytes
uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * (staging_buf.size() * sizeof(uint64_t));
ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
if (ret != 0)
return ret;
// the transfer length is a byte count, while staging_buf.size() counts
// uint64_t elements; scale it so every counter slot is actually fetched
RT_CHECK(vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size() * sizeof(uint64_t)), {
  return _ret;
});
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
uint64_t cycles_per_core = get_mpm_csr(VX_CSR_MCYCLE);
uint64_t instrs_per_core = get_mpm_csr(VX_CSR_MINSTRET);
#ifdef PERF_ENABLE
switch (perf_class) {
@ -268,7 +323,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
// PERF: pipeline
// scheduler idles
{
uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);
uint64_t sched_idles_per_core = get_mpm_csr(VX_CSR_MPM_SCHED_ID);
if (num_cores > 1) {
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
@ -277,7 +332,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// scheduler stalls
{
uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
uint64_t sched_stalls_per_core = get_mpm_csr(VX_CSR_MPM_SCHED_ST);
if (num_cores > 1) {
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
@ -286,7 +341,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// ibuffer_stalls
{
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
uint64_t ibuffer_stalls_per_core = get_mpm_csr(VX_CSR_MPM_IBUF_ST);
if (num_cores > 1) {
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
@ -295,11 +350,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// issue_stalls
{
uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU);
uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU);
uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU);
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
uint64_t scrb_stalls_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_ST);
uint64_t scrb_alu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_ALU);
uint64_t scrb_fpu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_FPU);
uint64_t scrb_lsu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_LSU);
uint64_t scrb_sfu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_SFU);
scrb_alu += scrb_alu_per_core;
scrb_fpu += scrb_fpu_per_core;
scrb_lsu += scrb_lsu_per_core;
@ -316,9 +371,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// sfu_stalls
{
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
uint64_t scrb_wctl_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_WCTL);
uint64_t scrb_csrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_CSRS);
uint64_t scrb_sfu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_SFU);
uint64_t scrb_wctl_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_WCTL);
uint64_t scrb_csrs_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_CSRS);
if (num_cores > 1) {
uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core;
fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
@ -334,11 +389,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
// PERF: memory
// ifetches
{
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCHES);
uint64_t ifetches_per_core = get_mpm_csr(VX_CSR_MPM_IFETCHES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
ifetches += ifetches_per_core;
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);
uint64_t ifetch_lat_per_core = get_mpm_csr(VX_CSR_MPM_IFETCH_LT);
if (num_cores > 1) {
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
@ -347,11 +402,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// loads
{
uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
uint64_t loads_per_core = get_mpm_csr(VX_CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
loads += loads_per_core;
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);
uint64_t load_lat_per_core = get_mpm_csr(VX_CSR_MPM_LOAD_LT);
if (num_cores > 1) {
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
@ -360,7 +415,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// stores
{
uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
uint64_t stores_per_core = get_mpm_csr(VX_CSR_MPM_STORES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
stores += stores_per_core;
}
@ -368,9 +423,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
case VX_DCR_MPM_CLASS_MEM: {
if (lmem_enable) {
// PERF: lmem
uint64_t lmem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_LMEM_READS);
uint64_t lmem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_LMEM_WRITES);
uint64_t lmem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_LMEM_BANK_ST);
uint64_t lmem_reads = get_mpm_csr(VX_CSR_MPM_LMEM_READS);
uint64_t lmem_writes = get_mpm_csr(VX_CSR_MPM_LMEM_WRITES);
uint64_t lmem_bank_stalls = get_mpm_csr(VX_CSR_MPM_LMEM_BANK_ST);
int lmem_bank_utilization = calcAvgPercent(lmem_reads + lmem_writes, lmem_reads + lmem_writes + lmem_bank_stalls);
fprintf(stream, "PERF: core%d: lmem reads=%ld\n", core_id, lmem_reads);
fprintf(stream, "PERF: core%d: lmem writes=%ld\n", core_id, lmem_writes);
@ -379,9 +434,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
if (icache_enable) {
// PERF: Icache
uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
uint64_t icache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MSHR_ST);
uint64_t icache_reads = get_mpm_csr(VX_CSR_MPM_ICACHE_READS);
uint64_t icache_read_misses = get_mpm_csr(VX_CSR_MPM_ICACHE_MISS_R);
uint64_t icache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_ICACHE_MSHR_ST);
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls);
fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads);
@ -391,12 +446,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
if (dcache_enable) {
// PERF: Dcache
uint64_t dcache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_READS);
uint64_t dcache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_WRITES);
uint64_t dcache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_R);
uint64_t dcache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MISS_W);
uint64_t dcache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_BANK_ST);
uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
uint64_t dcache_reads = get_mpm_csr(VX_CSR_MPM_DCACHE_READS);
uint64_t dcache_writes = get_mpm_csr(VX_CSR_MPM_DCACHE_WRITES);
uint64_t dcache_read_misses = get_mpm_csr(VX_CSR_MPM_DCACHE_MISS_R);
uint64_t dcache_write_misses = get_mpm_csr(VX_CSR_MPM_DCACHE_MISS_W);
uint64_t dcache_bank_stalls = get_mpm_csr(VX_CSR_MPM_DCACHE_BANK_ST);
uint64_t dcache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_DCACHE_MSHR_ST);
int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
int dcache_bank_utilization = calcAvgPercent(dcache_reads + dcache_writes, dcache_reads + dcache_writes + dcache_bank_stalls);
@ -411,29 +466,29 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
if (l2cache_enable) {
// PERF: L2cache
l2cache_reads += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_READS);
l2cache_writes += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_WRITES);
l2cache_read_misses += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_R);
l2cache_write_misses += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MISS_W);
l2cache_bank_stalls += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_BANK_ST);
l2cache_mshr_stalls += get_csr_64(staging_buf.data(), VX_CSR_MPM_L2CACHE_MSHR_ST);
l2cache_reads += get_mpm_csr(VX_CSR_MPM_L2CACHE_READS);
l2cache_writes += get_mpm_csr(VX_CSR_MPM_L2CACHE_WRITES);
l2cache_read_misses += get_mpm_csr(VX_CSR_MPM_L2CACHE_MISS_R);
l2cache_write_misses += get_mpm_csr(VX_CSR_MPM_L2CACHE_MISS_W);
l2cache_bank_stalls += get_mpm_csr(VX_CSR_MPM_L2CACHE_BANK_ST);
l2cache_mshr_stalls += get_mpm_csr(VX_CSR_MPM_L2CACHE_MSHR_ST);
}
if (0 == core_id) {
if (l3cache_enable) {
// PERF: L3cache
l3cache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_READS);
l3cache_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_WRITES);
l3cache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_R);
l3cache_write_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MISS_W);
l3cache_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_BANK_ST);
l3cache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_L3CACHE_MSHR_ST);
l3cache_reads = get_mpm_csr(VX_CSR_MPM_L3CACHE_READS);
l3cache_writes = get_mpm_csr(VX_CSR_MPM_L3CACHE_WRITES);
l3cache_read_misses = get_mpm_csr(VX_CSR_MPM_L3CACHE_MISS_R);
l3cache_write_misses = get_mpm_csr(VX_CSR_MPM_L3CACHE_MISS_W);
l3cache_bank_stalls = get_mpm_csr(VX_CSR_MPM_L3CACHE_BANK_ST);
l3cache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_L3CACHE_MSHR_ST);
}
// PERF: memory
mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES);
mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LT);
mem_reads = get_mpm_csr(VX_CSR_MPM_MEM_READS);
mem_writes = get_mpm_csr(VX_CSR_MPM_MEM_WRITES);
mem_lat = get_mpm_csr(VX_CSR_MPM_MEM_LT);
}
} break;
default:
@ -528,18 +583,18 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value) {
int ret = 0;
uint64_t num_cores;
ret = vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores);
if (ret != 0)
return ret;
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
return _ret;
});
if (core_id >= (int)num_cores) {
std::cout << "error: core_id out of range" << std::endl;
return -1;
}
std::vector<uint8_t> staging_buf(64 * sizeof(uint32_t));
std::vector<uint64_t> staging_buf(64);
uint64_t _value = 0;
@ -551,11 +606,11 @@ extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64
for (i = 0; i < num_cores; ++i) {
uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size();
ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
if (ret != 0)
return ret;
RT_CHECK(vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size()), {
return _ret;
});
auto per_core_value = get_csr_64(staging_buf.data(), counter);
auto per_core_value = staging_buf.at(counter-VX_CSR_MPM_BASE);
if (counter == VX_CSR_MCYCLE) {
_value = std::max<uint64_t>(per_core_value, _value);
} else {

View file

@ -33,8 +33,7 @@ typedef void* vx_device_h;
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
#define VX_CAPS_LOCAL_MEM_ADDR 0x7
#define VX_CAPS_KERNEL_BASE_ADDR 0x8
#define VX_CAPS_ISA_FLAGS 0x9
#define VX_CAPS_ISA_FLAGS 0x8
// device isa flags
#define VX_ISA_STD_A (1ull << 0)
@ -83,21 +82,27 @@ int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr,
int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size);
// Start device execution
int vx_start(vx_device_h hdevice);
int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr);
// Wait for device ready with milliseconds timeout
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
// read device configuration registers
int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value);
// write device configuration registers
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value);
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value);
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
// upload kernel bytes to device
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size);
// upload kernel file to device
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename);
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr);
// upload bytes to device
int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr);
// upload file to device
int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* addr);
// performance counters
int vx_dump_perf(vx_device_h hdevice, FILE* stream);

View file

@ -59,6 +59,12 @@
#define RAM_PAGE_SIZE 4096
// Debug trace helper. Without NDEBUG it printf()s the formatted message with
// a "[VXDRV] " prefix; with NDEBUG defined it expands to a no-op expression,
// so its arguments are not evaluated.
#ifndef NDEBUG
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
#else
#define DBGPRINT(format, ...) ((void)0)
#endif
#define CHECK_HANDLE(handle, _expr, _cleanup) \
auto handle = _expr; \
if (handle == nullptr) { \
@ -66,7 +72,7 @@
_cleanup \
}
#define CHECK_ERR(_expr, _cleanup) \
#define CHECK_FPGA_ERR(_expr, _cleanup) \
do { \
auto err = _expr; \
if (err == 0) \
@ -75,6 +81,15 @@
_cleanup \
} while (false)
// Evaluates _expr once. A zero result means success and nothing else happens;
// any other result is printed (expression text plus the value cast to int,
// prefixed with "[VXDRV] Error:") and then the _cleanup statements run.
// _cleanup executes inside the macro's own do/while, so a `break` written
// there leaves the macro rather than a loop in the calling code.
#define CHECK_ERR(_expr, _cleanup) \
do { \
auto err = _expr; \
if (err == 0) \
break; \
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
_cleanup \
} while (false)
///////////////////////////////////////////////////////////////////////////////
class vx_device {
@ -100,12 +115,12 @@ public:
}
// allocate new buffer
CHECK_ERR(api.fpgaPrepareBuffer(fpga, asize, (void**)&staging_ptr, &staging_wsid, 0), {
CHECK_FPGA_ERR(api.fpgaPrepareBuffer(fpga, asize, (void**)&staging_ptr, &staging_wsid, 0), {
return -1;
});
// get the physical address of the buffer in the accelerator
CHECK_ERR(api.fpgaGetIOAddress(fpga, staging_wsid, &staging_ioaddr), {
CHECK_FPGA_ERR(api.fpgaGetIOAddress(fpga, staging_wsid, &staging_ioaddr), {
api.fpgaReleaseBuffer(fpga, staging_wsid);
return -1;
});
@ -161,10 +176,6 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
case VX_CAPS_LOCAL_MEM_ADDR:
*value = LMEM_BASE_ADDR;
break;
case VX_CAPS_KERNEL_BASE_ADDR:
*value = (uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) |
device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
break;
case VX_CAPS_ISA_FLAGS:
*value = device->isa_caps;
break;
@ -197,11 +208,11 @@ extern int vx_dev_open(vx_device_h* hdevice) {
}
// Set up a filter that will search for an accelerator
CHECK_ERR(api.fpgaGetProperties(nullptr, &filter), {
CHECK_FPGA_ERR(api.fpgaGetProperties(nullptr, &filter), {
return -1;
});
CHECK_ERR(api.fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR), {
CHECK_FPGA_ERR(api.fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR), {
api.fpgaDestroyProperties(&filter);
return -1;
});
@ -210,19 +221,19 @@ extern int vx_dev_open(vx_device_h* hdevice) {
std::string s_uuid(AFU_ACCEL_UUID);
std::replace(s_uuid.begin(), s_uuid.end(), '_', '-');
uuid_parse(s_uuid.c_str(), guid);
CHECK_ERR(api.fpgaPropertiesSetGUID(filter, guid), {
CHECK_FPGA_ERR(api.fpgaPropertiesSetGUID(filter, guid), {
api.fpgaDestroyProperties(&filter);
return -1;
});
// Do the search across the available FPGA contexts
CHECK_ERR(api.fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches), {
CHECK_FPGA_ERR(api.fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches), {
api.fpgaDestroyProperties(&filter);
return -1;
});
// Not needed anymore
CHECK_ERR(api.fpgaDestroyProperties(&filter), {
CHECK_FPGA_ERR(api.fpgaDestroyProperties(&filter), {
api.fpgaDestroyToken(&accel_token);
return -1;
});
@ -234,13 +245,13 @@ extern int vx_dev_open(vx_device_h* hdevice) {
}
// Open accelerator
CHECK_ERR(api.fpgaOpen(accel_token, &accel_handle, 0), {
CHECK_FPGA_ERR(api.fpgaOpen(accel_token, &accel_handle, 0), {
api.fpgaDestroyToken(&accel_token);
return -1;
});
// Done with token
CHECK_ERR(api.fpgaDestroyToken(&accel_token), {
CHECK_FPGA_ERR(api.fpgaDestroyToken(&accel_token), {
api.fpgaClose(accel_handle);
return -1;
});
@ -257,19 +268,19 @@ extern int vx_dev_open(vx_device_h* hdevice) {
{
// retrieve FPGA global memory size
CHECK_ERR(api.fpgaPropertiesGetLocalMemorySize(filter, &device->global_mem_size), {
CHECK_FPGA_ERR(api.fpgaPropertiesGetLocalMemorySize(filter, &device->global_mem_size), {
// assume 8GB as default
device->global_mem_size = GLOBAL_MEM_SIZE;
});
// Load ISA CAPS
CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_ISA_CAPS, &device->isa_caps), {
CHECK_FPGA_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_ISA_CAPS, &device->isa_caps), {
api.fpgaClose(accel_handle);
return -1;
});
// Load device CAPS
CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_DEV_CAPS, &device->dev_caps), {
CHECK_FPGA_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_DEV_CAPS, &device->dev_caps), {
api.fpgaClose(accel_handle);
return -1;
});
@ -383,6 +394,8 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* ho
auto device = (vx_device*)hdevice;
auto& api = device->api;
DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size);
if (device->ensure_staging(size) != 0)
return -1;
@ -405,16 +418,16 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* ho
auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
return -1;
});
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
return -1;
});
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
return -1;
});
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE), {
return -1;
});
@ -432,6 +445,8 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad
auto device = (vx_device*)hdevice;
auto& api = device->api;
DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, asize);
if (device->ensure_staging(size) != 0)
return -1;
@ -451,16 +466,16 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad
auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, device->staging_ioaddr >> ls_shift), {
return -1;
});
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, dev_addr >> ls_shift), {
return -1;
});
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG2, asize >> ls_shift), {
return -1;
});
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ), {
return -1;
});
@ -474,19 +489,35 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad
return 0;
}
extern int vx_start(vx_device_h hdevice) {
extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) {
if (nullptr == hdevice)
return -1;
auto device = ((vx_device*)hdevice);
auto& api = device->api;
// Ensure ready for new command
DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr);
// ensure ready for new command
if (vx_ready_wait(hdevice, VX_MAX_TIMEOUT) != 0)
return -1;
// set kernel info
// Each 64-bit address is split across a DCR pair: the low 32 bits go to the
// *0 register and the high 32 bits go to the *1 register. Writing the high
// word to the *0 register again would overwrite the low word just written.
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff), {
  return -1;
});
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32), {
  return -1;
});
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff), {
  return -1;
});
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32), {
  return -1;
});
// start execution
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN), {
return -1;
});
@ -502,6 +533,8 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
auto device = ((vx_device*)hdevice);
auto& api = device->api;
DBGPRINT("%s\n", "WAIT");
struct timespec sleep_time;
sleep_time.tv_sec = 0;
@ -512,7 +545,7 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
for (;;) {
uint64_t status;
CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), {
CHECK_FPGA_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), {
return -1;
});
@ -529,7 +562,7 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
std::cout << std::dec << "#" << cout_tid << ": " << ss_buf.str() << std::flush;
ss_buf.str("");
}
CHECK_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), {
CHECK_FPGA_ERR(api.fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status), {
return -1;
});
cout_data = status >> STATUS_STATE_BITS;
@ -558,29 +591,43 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
return 0;
}
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
// Reads the value recorded for DCR `addr` from the device's `dcrs` store
// (the same store vx_dcr_write updates after a successful write) and returns
// it through `value`.
// Returns 0 on success, -1 when `hdevice` or `value` is null.
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
  // reject a null output pointer as well, matching the other driver backends
  if (nullptr == hdevice || nullptr == value)
    return -1;

  auto device = (vx_device*)hdevice;

  *value = device->dcrs.read(addr);

  DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value);

  return 0;
}
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
if (nullptr == hdevice)
return -1;
auto device = ((vx_device*)hdevice);
auto& api = device->api;
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value);
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
// write DCR value
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, addr), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG0, addr), {
return -1;
});
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, value), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_ARG1, value), {
return -1;
});
CHECK_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_DCR_WRITE), {
CHECK_FPGA_ERR(api.fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_DCR_WRITE), {
return -1;
});
// save the value
device->dcrs.write(addr, value);
return 0;

View file

@ -32,6 +32,12 @@
#define RAM_PAGE_SIZE 4096
#ifndef NDEBUG
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
#else
#define DBGPRINT(format, ...) ((void)0)
#endif
using namespace vortex;
///////////////////////////////////////////////////////////////////////////////
@ -108,11 +114,18 @@ public:
return 0;
}
int start() {
int start(uint64_t krnl_addr, uint64_t args_addr) {
// ensure prior run completed
if (future_.valid()) {
future_.wait();
}
// set kernel info
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
this->write_dcr(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
this->write_dcr(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
// start new run
future_ = std::async(std::launch::async, [&]{
processor_.run();
@ -163,7 +176,7 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
if (nullptr == hdevice)
return -1;
vx_device *device = ((vx_device*)hdevice);
//vx_device *device = ((vx_device*)hdevice);
switch (caps_id) {
case VX_CAPS_VERSION:
@ -190,10 +203,6 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
case VX_CAPS_LOCAL_MEM_ADDR:
*value = LMEM_BASE_ADDR;
break;
case VX_CAPS_KERNEL_BASE_ADDR:
*value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32)
| device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0);
break;
case VX_CAPS_ISA_FLAGS:
*value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
break;
@ -278,6 +287,9 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* ho
return -1;
auto device = (vx_device*)hdevice;
DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size);
return device->upload(dev_addr, host_ptr, size);
}
@ -286,26 +298,50 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad
return -1;
auto device = (vx_device*)hdevice;
DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size);
return device->download(host_ptr, dev_addr, size);
}
extern int vx_start(vx_device_h hdevice) {
extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) {
if (nullptr == hdevice)
return -1;
DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr);
vx_device *device = ((vx_device*)hdevice);
return device->start();
return device->start(krnl_addr, args_addr);
}
// Waits on the device by forwarding `timeout` to vx_device::wait().
// Returns -1 for a null handle, otherwise whatever wait() returns.
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (hdevice == nullptr) {
    return -1;
  }

  DBGPRINT("%s\n", "WAIT");

  auto dev = (vx_device*)hdevice;
  return dev->wait(timeout);
}
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
// Reads DCR `addr` through vx_device::read_dcr() and stores it in `*value`.
// Returns 0 on success, -1 when either pointer is null or the device could
// not be brought to the ready state.
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
  if (hdevice == nullptr || value == nullptr) {
    return -1;
  }

  // Ensure ready for new command
  if (0 != vx_ready_wait(hdevice, -1)) {
    return -1;
  }

  auto dev = (vx_device*)hdevice;
  *value = dev->read_dcr(addr);

  DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value);

  return 0;
}
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
if (nullptr == hdevice)
return -1;
@ -314,5 +350,8 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value);
return device->write_dcr(addr, value);
}

View file

@ -43,47 +43,6 @@ using namespace vortex;
///////////////////////////////////////////////////////////////////////////////
class vx_device;
// Host-side buffer tied to a device. The backing storage is allocated with
// aligned_malloc(), sized up to a multiple of CACHE_BLOCK_SIZE, pre-filled
// with the repeating byte pattern of 0xbaadf00d, and released in the
// destructor.
// NOTE(review): the class owns a raw allocation but is still copyable; a copy
// would free `data_` twice. Confirm no caller copies it before relying on that.
class vx_buffer {
public:
  // size:   requested size in bytes (reported unchanged by size())
  // device: device this buffer is associated with (stored, not owned here)
  vx_buffer(uint64_t size, vx_device* device)
    : size_(size)
    , device_(device) {
    uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE);
    data_ = aligned_malloc(aligned_asize, CACHE_BLOCK_SIZE);
    // the destructor treats data_ as nullable, so do not write through it
    // when the allocation did not succeed
    if (nullptr == data_)
      return;
    // set uninitialized data to "baadf00d"
    // the index is 64-bit to match aligned_asize; a 32-bit index would wrap
    // and never reach the bound for buffers of 4 GiB or more
    auto bytes = (uint8_t*)data_;
    for (uint64_t i = 0; i < aligned_asize; ++i) {
      bytes[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff;
    }
  }

  ~vx_buffer() {
    if (data_) {
      aligned_free(data_);
    }
  }

  // Pointer to the aligned backing storage (may be null if allocation failed).
  void* data() const {
    return data_;
  }

  // Size in bytes as requested at construction (not the aligned size).
  uint64_t size() const {
    return size_;
  }

  // Device passed at construction.
  vx_device* device() const {
    return device_;
  }

private:
  uint64_t size_;
  vx_device* device_;
  void* data_;
};
///////////////////////////////////////////////////////////////////////////////
class vx_device {
public:
vx_device()
@ -152,12 +111,18 @@ public:
return 0;
}
int start() {
int start(uint64_t krnl_addr, uint64_t args_addr) {
// ensure prior run completed
if (future_.valid()) {
future_.wait();
}
// set kernel info
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
this->write_dcr(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
this->write_dcr(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
// start new run
future_ = std::async(std::launch::async, [&]{
processor_.run();
@ -251,7 +216,7 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
if (nullptr == hdevice)
return -1;
vx_device *device = ((vx_device*)hdevice);
//vx_device *device = ((vx_device*)hdevice);
switch (caps_id) {
case VX_CAPS_VERSION:
@ -278,10 +243,6 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
case VX_CAPS_LOCAL_MEM_ADDR:
*value = LMEM_BASE_ADDR;
break;
case VX_CAPS_KERNEL_BASE_ADDR:
*value = (uint64_t(device->read_dcr(VX_DCR_BASE_STARTUP_ADDR1)) << 32)
| device->read_dcr(VX_DCR_BASE_STARTUP_ADDR0);
break;
case VX_CAPS_ISA_FLAGS:
*value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
break;
@ -345,25 +306,44 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad
return device->download(host_ptr, dev_addr, size);
}
extern int vx_start(vx_device_h hdevice) {
extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) {
if (nullptr == hdevice)
return -1;
DBGPRINT("START\n");
DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr);
vx_device *device = ((vx_device*)hdevice);
return device->start();
return device->start(krnl_addr, args_addr);
}
// Delegates to vx_device::wait() with the caller's `timeout`.
// A null device handle yields -1 without touching the device.
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
  if (!hdevice) {
    return -1;
  }

  DBGPRINT("%s\n", "WAIT");

  auto target = (vx_device*)hdevice;
  int status = target->wait(timeout);
  return status;
}
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
// Fetches DCR `addr` via vx_device::read_dcr() into `*value`.
// Fails with -1 on a null handle, a null output pointer, or when
// vx_ready_wait() reports an error; returns 0 otherwise.
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
  const bool bad_args = (nullptr == hdevice) || (nullptr == value);
  if (bad_args) {
    return -1;
  }

  // Ensure ready for new command
  const int ready = vx_ready_wait(hdevice, -1);
  if (ready != 0) {
    return -1;
  }

  auto target = (vx_device*)hdevice;
  *value = target->read_dcr(addr);

  DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value);

  return 0;
}
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
if (nullptr == hdevice)
return -1;
@ -373,7 +353,7 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value);
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value);
return device->write_dcr(addr, value);
}

View file

@ -45,7 +45,7 @@ extern int vx_copy_from_dev(vx_device_h /*hdevice*/, void* /*host_ptr*/, uint64_
return -1;
}
extern int vx_start(vx_device_h /*hdevice*/) {
// Stub implementation: ignores all arguments and always reports failure (-1).
extern int vx_start(vx_device_h /*hdevice*/, uint64_t /*krnl_addr*/, uint64_t /*args_addr*/) {
return -1;
}
@ -53,6 +53,11 @@ extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) {
return -1;
}
extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint64_t /*value*/) {
// Stub implementation: ignores all arguments, leaves the output untouched and
// always reports failure (-1).
extern int vx_dcr_read(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t* /*value*/) {
return -1;
}
// Stub implementation: ignores all arguments and always reports failure (-1).
extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t /*value*/) {
return -1;
}

View file

@ -518,10 +518,6 @@ extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
case VX_CAPS_LOCAL_MEM_ADDR:
*value = LMEM_BASE_ADDR;
break;
case VX_CAPS_KERNEL_BASE_ADDR:
*value = (uint64_t(device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) |
device->dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
break;
case VX_CAPS_ISA_FLAGS:
*value = device->isa_caps;
break;
@ -766,6 +762,8 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* ho
auto device = (vx_device*)hdevice;
DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size);
// check alignment
if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
return -1;
@ -780,8 +778,6 @@ extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* ho
return -1;
});
DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, size);
return 0;
}
@ -791,6 +787,8 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad
auto device = (vx_device*)hdevice;
DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, asize);
// check alignment
if (!is_aligned(dev_addr, CACHE_BLOCK_SIZE))
return -1;
@ -805,24 +803,37 @@ extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_ad
return -1;
});
DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%lx, size=%ld\n", dev_addr, (uintptr_t)host_ptr, asize);
return 0;
}
extern int vx_start(vx_device_h hdevice) {
extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) {
if (nullptr == hdevice)
return -1;
auto device = (vx_device*)hdevice;
//wait_for_enter("\nPress ENTER to continue after setting up ILA trigger...");
CHECK_ERR(device->write_register(MMIO_CTL_ADDR, CTL_AP_START), {
DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr);
// set kernel info
// The kernel entry address and the argument address are 64-bit values, each
// programmed through a pair of 32-bit DCRs: low word -> *0, high word -> *1.
// Sending the high word to the *0 register would replace the low word.
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff), {
  return -1;
});
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32), {
  return -1;
});
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff), {
  return -1;
});
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32), {
  return -1;
});
DBGPRINT("START\n");
auto device = (vx_device*)hdevice;
// start execution
CHECK_ERR(device->write_register(MMIO_CTL_ADDR, CTL_AP_START), {
return -1;
});
return 0;
}
@ -833,6 +844,8 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
auto device = (vx_device*)hdevice;
DBGPRINT("%s\n", "WAIT");
struct timespec sleep_time;
#ifndef NDEBUG
@ -862,12 +875,27 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
return 0;
}
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
// Returns, through `value`, the entry for DCR `addr` held in the device's
// `dcrs` store (vx_dcr_write records each written value there).
// Returns 0 on success, -1 when `hdevice` or `value` is null.
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
  // `value` is dereferenced below, so it must be validated together with the
  // device handle
  if (nullptr == hdevice || nullptr == value)
    return -1;

  auto device = (vx_device*)hdevice;

  *value = device->dcrs.read(addr);

  DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value);

  return 0;
}
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
if (nullptr == hdevice)
return -1;
auto device = (vx_device*)hdevice;
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value);
CHECK_ERR(device->write_register(MMIO_DCR_ADDR, addr), {
return -1;
});
@ -876,8 +904,6 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint64_t value) {
return -1;
});
// save the value
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%lx\n", addr, value);
device->dcrs.write(addr, value);
return 0;

View file

@ -46,22 +46,23 @@ Emulator::warp_t::warp_t(const Arch& arch)
, freg_file(arch.num_threads(), std::vector<uint64_t>(arch.num_regs()))
{}
void Emulator::warp_t::clear(const Arch& arch, const DCRS &dcrs) {
this->PC = dcrs.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
#if (XLEN == 64)
this->PC = (uint64_t(dcrs.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32) | this->PC;
#endif
void Emulator::warp_t::clear(uint64_t startup_addr) {
this->PC = startup_addr;
this->tmask.reset();
for (uint32_t i = 0, n = arch.num_threads(); i < n; ++i) {
for (auto& reg : this->ireg_file.at(i)) {
reg = 0;
}
for (auto& reg : this->freg_file.at(i)) {
reg = 0;
}
}
this->fcsr = 0;
this->uui_gen.reset();
this->fcsr = 0;
for (auto& reg_file : this->ireg_file) {
for (auto& reg : reg_file) {
reg = 0;
}
}
for (auto& reg_file : this->freg_file) {
for (auto& reg : reg_file) {
reg = 0;
}
}
}
///////////////////////////////////////////////////////////////////////////////
@ -81,14 +82,26 @@ Emulator::~Emulator() {
}
void Emulator::clear() {
// The startup PC and the startup argument are each stored as two 32-bit DCRs.
// Assemble them in 64-bit locals so the upper word OR-ed in for XLEN == 64 is
// not truncated, and read through the dcrs_ member in both halves.
uint64_t startup_addr = dcrs_.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR0);
#if (XLEN == 64)
startup_addr |= (uint64_t(dcrs_.base_dcrs.read(VX_DCR_BASE_STARTUP_ADDR1)) << 32);
#endif
uint64_t startup_arg = dcrs_.base_dcrs.read(VX_DCR_BASE_STARTUP_ARG0);
#if (XLEN == 64)
startup_arg |= (uint64_t(dcrs_.base_dcrs.read(VX_DCR_BASE_STARTUP_ARG1)) << 32);
#endif
for (auto& warp : warps_) {
warp.clear(arch_, dcrs_);
warp.clear(startup_addr);
}
for (auto& barrier : barriers_) {
barrier.reset();
}
csr_mscratch_ = startup_arg;
stalled_warps_.reset();
active_warps_.reset();

View file

@ -65,7 +65,7 @@ private:
struct warp_t {
warp_t(const Arch& arch);
void clear(const Arch& arch, const DCRS &dcrs);
void clear(uint64_t startup_addr);
Word PC;
ThreadMask tmask;

View file

@ -6,7 +6,22 @@ else
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex -mllvm -vortex-branch-divergence=0
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0
#LLVM_CFLAGS += -Wl,-L$(RISCV_TOOLCHAIN_PATH)/lib/gcc/$(RISCV_PREFIX)/9.2.0
#LLVM_CFLAGS += --rtlib=libgcc
#CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS)
#CXX = $(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS)
#AR = $(LLVM_VORTEX)/bin/llvm-ar
#DP = $(LLVM_VORTEX)/bin/llvm-objdump
#CP = $(LLVM_VORTEX)/bin/llvm-objcopy
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

View file

@ -1,6 +1,7 @@
#include "tests.h"
#include <stdio.h>
#include <algorithm>
#include <VX_config.h>
#include <vx_intrinsics.h>
#include <vx_print.h>
#include <vx_spawn.h>

View file

@ -22,12 +22,12 @@ LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
LIBC_LIB += -L$(LIBC_VORTEX)/lib -lm -lc -lgcc
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
K_CFLAGS += -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections
#K_CFLAGS += -mllvm -vortex-branch-divergence=0
#K_CFLAGS += -mllvm -print-after-all
K_CFLAGS += -mllvm -disable-loop-idiom-all # disable memset/memcpy loop idiom
K_CFLAGS += -I$(VORTEX_KN_PATH)/include -DXLEN_$(XLEN) -DNDEBUG
K_CFLAGS += -I$(ROOT_DIR)/hw -I$(VORTEX_KN_PATH)/include -DXLEN_$(XLEN) -DNDEBUG
K_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(ROOT_DIR)/kernel/libvortexrt.a $(LIBC_LIB)
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t count;
uint64_t src_addr;

View file

@ -3,7 +3,7 @@
#include "common.h"
int main() {
kernel_arg_t* __UNIFORM__ arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* __UNIFORM__ arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
uint32_t count = arg->count;
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;

View file

@ -24,6 +24,8 @@ uint32_t count = 0;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -60,6 +62,8 @@ void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
@ -168,7 +172,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
// start device
std::cout << "start execution" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_start(device));
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
auto t3 = std::chrono::high_resolution_clock::now();
@ -246,8 +250,7 @@ int main(int argc, char *argv[]) {
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
staging_buf.resize(buf_size);
// run tests
if (0 == test || -1 == test) {
@ -258,12 +261,11 @@ int main(int argc, char *argv[]) {
if (1 == test || -1 == test) {
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
std::cout << "run kernel test" << std::endl;
RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));

View file

@ -33,7 +33,7 @@ VX_CP = $(LLVM_VORTEX)/bin/llvm-objcopy
#VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
#VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
VX_CFLAGS += -v -O3 -std=c++11 -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections
VX_CFLAGS += -O3 -std=c++11 -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -nostdlib -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(ROOT_DIR)/hw
VX_CFLAGS += -DXLEN_$(XLEN)
VX_CFLAGS += -DNDEBUG

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif

View file

@ -45,7 +45,7 @@ void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
if (arg->lmem_addr != 0) {
// populate local memory
auto W = reinterpret_cast<TYPE*>(arg->W_addr);

View file

@ -36,7 +36,7 @@ public:
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
}
return false;
}
@ -61,7 +61,7 @@ public:
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
}
return false;
}
@ -95,7 +95,8 @@ int size = 32;
bool use_lmem = false;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -135,6 +136,8 @@ void cleanup() {
vx_mem_free(device, kernel_arg.W_addr);
}
vx_mem_free(device, kernel_arg.O_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
@ -158,10 +161,6 @@ int main(int argc, char *argv[]) {
uint32_t i_points = (size+2) * (size+2);
uint32_t w_points = 3 * 3;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
size_t i_nbytes = i_points * sizeof(TYPE);
@ -193,16 +192,6 @@ int main(int argc, char *argv[]) {
std::cout << "dev_argW=0x" << std::hex << kernel_arg.W_addr << std::endl;
std::cout << "dev_argO=0x" << std::hex << kernel_arg.O_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(i_nbytes, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// Generate input values
std::vector<TYPE> h_I(i_points);
std::vector<TYPE> h_W(w_points);
@ -219,38 +208,32 @@ int main(int argc, char *argv[]) {
for (uint32_t i = 0; i < w_points; ++i) {
h_W[i] = static_cast<TYPE>(rand()) / RAND_MAX;
}
convolution_cpu(h_O.data(), h_I.data(), h_W.data(), size, size);
// upload input buffer
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < i_points; ++i) {
buf_ptr[i] = h_I[i];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.I_addr, staging_buf.data(), i_nbytes));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.I_addr, h_I.data(), i_nbytes));
}
// upload weight buffer
{
std::cout << "upload weight buffer" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < w_points; ++i) {
buf_ptr[i] = h_W[i];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.W_addr, staging_buf.data(), w_nbytes));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.W_addr, h_W.data(), w_nbytes));
}
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, o_nbytes);
RT_CHECK(vx_copy_to_dev(device, kernel_arg.O_addr, staging_buf.data(), o_nbytes));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
auto time_start = std::chrono::high_resolution_clock::now();
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -262,16 +245,18 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.O_addr, o_nbytes));
RT_CHECK(vx_copy_from_dev(device, h_O.data(), kernel_arg.O_addr, o_nbytes));
// verify result
std::cout << "verify result" << std::endl;
{
std::vector<TYPE> h_ref(o_points);
convolution_cpu(h_ref.data(), h_I.data(), h_W.data(), size, size);
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < h_O.size(); ++i) {
auto ref = h_O[i];
auto cur = buf_ptr[i];
for (uint32_t i = 0; i < h_ref.size(); ++i) {
auto ref = h_ref[i];
auto cur = h_O[i];
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
++errors;
}

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif

View file

@ -17,7 +17,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -34,7 +34,7 @@ public:
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
}
return false;
}
@ -61,7 +61,7 @@ public:
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
printf("*** error: [%d] expected=%f(0x%x), actual=%f(0x%x), ulp=%d\n", index, b, fb.i, a, fa.i, d);
}
return false;
}
@ -75,6 +75,8 @@ uint32_t count = 16;
vx_device_h device = nullptr;
std::vector<TYPE> source_data;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -109,16 +111,89 @@ void cleanup() {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
staging_buf.resize(buf_size);
// generate source data
source_data.resize(2 * num_points);
for (uint32_t i = 0; i < source_data.size(); ++i) {
source_data[i] = Comparator<TYPE>::generate();
}
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 0];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 1];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -147,94 +222,6 @@ int run_test(const kernel_arg_t& kernel_arg,
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// generate source data
source_data.resize(2 * num_points);
for (uint32_t i = 0; i < source_data.size(); ++i) {
source_data[i] = Comparator<TYPE>::generate();
}
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 0];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 1];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_points;
uint64_t src_addr;

View file

@ -77,7 +77,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -26,6 +26,8 @@ std::vector<int> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -59,6 +61,8 @@ void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
@ -143,45 +147,6 @@ void gen_ref_data(uint32_t num_points) {
}
}
// Starts the device, waits for completion, downloads the destination buffer
// into staging_buf and compares it element by element against ref_data.
// Returns 0 when all num_points results match and 1 otherwise; every
// mismatch is reported on stdout. Runtime calls are wrapped in RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  const auto results = reinterpret_cast<int32_t*>(staging_buf.data());
  int mismatches = 0;
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    const int expected = ref_data.at(idx);
    const int actual = results[idx];
    if (actual == expected) {
      continue;
    }
    std::cout << "error at result #" << std::dec << idx
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }

  if (mismatches == 0) {
    return 0;
  }
  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -212,7 +177,7 @@ int main(int argc, char *argv[]) {
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
@ -226,15 +191,12 @@ int main(int argc, char *argv[]) {
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size, dst_buf_size);
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// upload source buffer
{
@ -254,9 +216,38 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
  // Compare every downloaded result word against the host reference data.
  int errors = 0;
  auto buf_ptr = (int32_t*)staging_buf.data();
  for (uint32_t i = 0; i < num_points; ++i) {
    int ref = ref_data.at(i);
    int cur = buf_ptr[i];
    if (cur != ref) {
      std::cout << "error at result #" << std::dec << i
                << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
      ++errors;
    }
  }
  if (errors != 0) {
    std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
    std::cout << "FAILED!" << std::endl;
    // Release the device buffers and close the device before bailing out:
    // this early return skips the cleanup step that follows verification.
    cleanup();
    return 1;
  }
}
// cleanup
std::cout << "cleanup" << std::endl;

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t testid;
uint32_t num_tasks;

View file

@ -397,34 +397,36 @@ void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
dst_ptr[task_id] += 1;
}
static const PFN_Kernel sc_tests[] = {
kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
kernel_fclamp,
kernel_trigo,
kernel_bar,
kernel_gbar
};
// Table of test kernels; main() dispatches through sc_tests[arg->testid].
// The table is filled at run time by register_tests() instead of by a
// static initializer.
// NOTE(review): presumably this is so the function addresses are computed at
// run time now that the kernel image is uploaded to an allocated address --
// confirm before turning this back into a const initializer list.
static PFN_Kernel sc_tests[23];
// Populates sc_tests. The slot index of each kernel is the test id it is
// selected by, so the order here presumably has to match the host-side test
// suite numbering (verify against the host test list). Must be called before
// sc_tests is indexed.
void register_tests() {
sc_tests[0] = kernel_iadd;
sc_tests[1] = kernel_imul;
sc_tests[2] = kernel_idiv;
sc_tests[3] = kernel_idiv_mul;
sc_tests[4] = kernel_fadd;
sc_tests[5] = kernel_fsub;
sc_tests[6] = kernel_fmul;
sc_tests[7] = kernel_fmadd;
sc_tests[8] = kernel_fmsub;
sc_tests[9] = kernel_fnmadd;
sc_tests[10] = kernel_fnmsub;
sc_tests[11] = kernel_fnmadd_madd;
sc_tests[12] = kernel_fdiv;
sc_tests[13] = kernel_fdiv2;
sc_tests[14] = kernel_fsqrt;
sc_tests[15] = kernel_ftoi;
sc_tests[16] = kernel_ftou;
sc_tests[17] = kernel_itof;
sc_tests[18] = kernel_utof;
sc_tests[19] = kernel_fclamp;
sc_tests[20] = kernel_trigo;
sc_tests[21] = kernel_bar;
sc_tests[22] = kernel_gbar;
}
int main() {
auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
register_tests();
auto arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
return 0;
}

View file

@ -20,10 +20,11 @@ int testid_e = 0;
bool stop_on_error = true;
vx_device_h device = nullptr;
std::vector<uint8_t> arg_buf;
std::vector<uint8_t> src1_buf;
std::vector<uint8_t> src2_buf;
std::vector<uint8_t> dst_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -77,6 +78,8 @@ void cleanup() {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
@ -113,15 +116,12 @@ int main(int argc, char *argv[]) {
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload kernel" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
RT_CHECK(vx_mem_alloc(device, sizeof(kernel_arg_t), &kernel_args_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
@ -132,7 +132,6 @@ int main(int argc, char *argv[]) {
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
arg_buf.resize(sizeof(kernel_arg_t));
src1_buf.resize(buf_size);
src2_buf.resize(buf_size);
dst_buf.resize(buf_size);
@ -142,6 +141,11 @@ int main(int argc, char *argv[]) {
if (testid_e == 0) {
testid_e = (testSuite->size() - 1);
}
// upload program
std::cout << "upload kernel" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// execute tests
for (int t = testid_s; t <= testid_e; ++t) {
auto test = testSuite->get_test(t);
@ -159,12 +163,6 @@ int main(int argc, char *argv[]) {
std::cout << "Test" << t << ": " << name << std::endl;
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
kernel_arg.testid = t;
memcpy(arg_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, arg_buf.data(), sizeof(kernel_arg_t)));
// get test arguments
std::cout << "get test arguments" << std::endl;
RT_CHECK(test->setup(num_points, (void*)src1_buf.data(), (void*)src2_buf.data()));
@ -184,9 +182,14 @@ int main(int argc, char *argv[]) {
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, dst_buf.data(), buf_size));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
kernel_arg.testid = t;
RT_CHECK(vx_copy_to_dev(device, kernel_args_addr, &kernel_arg, sizeof(kernel_arg_t)));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;

View file

@ -32,6 +32,19 @@ inline float fround(float x, int32_t precision = 8) {
return std::round(x * power_of_10) / power_of_10;
}
// Compares two floats after rounding both to `precision` decimal digits.
//
// Both operands are scaled by 10^precision, rounded to the nearest integer and
// scaled back (in double precision); the values are considered equal when the
// rounded results differ by no more than float epsilon. A mismatch is logged
// to stdout.
//
// @param a          first value
// @param b          second value
// @param precision  number of decimal digits kept before comparing
// @return true when the values agree at the requested precision
inline bool almost_equal_precision(float a, float b, int precision = 4) {
  // Exact matches (including equal infinities, whose difference would be NaN)
  // are always equal.
  if (a == b) {
    return true;
  }
  const double power_of_10 = std::pow(10.0, precision);
  const double ap = std::round(a * power_of_10) / power_of_10;
  const double bp = std::round(b * power_of_10) / power_of_10;
  const double eps = std::numeric_limits<float>::epsilon();
  const double d = std::fabs(ap - bp);
  // Written as !(d <= eps) rather than (d > eps) so that a NaN difference
  // (NaN operand, or infinities of opposite kind) is rejected instead of
  // silently passing: every ordered comparison with NaN is false.
  if (!(d <= eps)) {
    std::cout << "*** almost_equal_precision: d=" << d << ", precision=" << precision << std::endl;
    return false;
  }
  return true;
}
inline bool almost_equal_eps(float a, float b, int ulp = 128) {
auto eps = std::numeric_limits<float>::epsilon() * (std::max(fabs(a), fabs(b)) * ulp);
auto d = fabs(a - b);
@ -727,8 +740,8 @@ public:
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
// Do the subtraction in float: i is an unsigned 32-bit index, so an integer
// 2*i-n wraps around to a huge positive value whenever 2*i < n instead of
// going negative. Presumably the intent is inputs spanning [-3.1416, 3.1416).
a[i] = fround((2.0f * i - n) * (1.0f/n) * 3.1416);
b[i] = fround((2.0f * i - n) * (1.0f/n) * 3.1416);
}
return 0;
}
@ -740,7 +753,7 @@ public:
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = sin(a[i]) + cos(b[i]);
if (!almost_equal_ulp(c[i], ref, 20)) {
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_tasks;
uint32_t task_size;

View file

@ -18,7 +18,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -22,6 +22,8 @@ uint32_t count = 0;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -56,49 +58,12 @@ void cleanup() {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
// Starts the device, waits for completion, downloads the destination buffer
// into staging_buf and checks that element i equals i + i.
// Returns 0 when all num_points results match and 1 otherwise; every
// mismatch is reported on stdout. Runtime calls are wrapped in RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  const auto results = reinterpret_cast<int32_t*>(staging_buf.data());
  int mismatches = 0;
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    const int expected = idx + idx;
    const int actual = results[idx];
    if (actual == expected) {
      continue;
    }
    std::cout << "error at result #" << std::dec << idx
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }

  if (mismatches == 0) {
    return 0;
  }
  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -123,10 +88,6 @@ int main(int argc, char *argv[]) {
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
@ -142,13 +103,7 @@ int main(int argc, char *argv[]) {
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
staging_buf.resize(buf_size);
// upload source buffer0
{
@ -180,9 +135,46 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
  // Element i of the downloaded result is expected to equal i + i.
  int errors = 0;
  auto buf_ptr = (int32_t*)staging_buf.data();
  for (uint32_t i = 0; i < num_points; ++i) {
    int ref = i + i;
    int cur = buf_ptr[i];
    if (cur != ref) {
      std::cout << "error at result #" << std::dec << i
                << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
      ++errors;
    }
  }
  if (errors != 0) {
    std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
    std::cout << "FAILED!" << std::endl;
    // Release the device buffers and close the device before bailing out:
    // this early return skips the cleanup step that follows verification.
    cleanup();
    return 1;
  }
}
// cleanup
std::cout << "cleanup" << std::endl;

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_points;
uint64_t src_addr;

View file

@ -13,7 +13,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -32,6 +32,8 @@ std::vector<int32_t> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -65,6 +67,8 @@ void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_mem_free(device, usr_test_mem);
vx_dev_close(device);
}
@ -95,45 +99,6 @@ void gen_ref_data(uint32_t num_points) {
}
}
// Starts the device, waits for completion, downloads the destination buffer
// into staging_buf and compares it element by element against ref_data.
// Returns 0 when all num_points results match and 1 otherwise; every
// mismatch is reported on stdout. Runtime calls are wrapped in RT_CHECK.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  // start device
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // wait for completion
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // download destination buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  // verify result
  std::cout << "verify result" << std::endl;
  const auto results = reinterpret_cast<int32_t*>(staging_buf.data());
  int mismatches = 0;
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    const int expected = ref_data.at(idx);
    const int actual = results[idx];
    if (actual == expected) {
      continue;
    }
    std::cout << "error at result #" << std::dec << idx
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }

  if (mismatches == 0) {
    return 0;
  }
  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
int main(int argc, char *argv[]) {
uint64_t value;
@ -165,10 +130,6 @@ int main(int argc, char *argv[]) {
std::cout << "number of points: " << std::dec << num_points << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
@ -184,16 +145,9 @@ int main(int argc, char *argv[]) {
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint64_t),
std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
std::max<uint32_t>(src_buf_size, dst_buf_size));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload test address data
{
std::cout << "upload test address data" << std::endl;
@ -223,9 +177,46 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
  // Compare every downloaded result word against the host reference data.
  int errors = 0;
  auto buf_ptr = (int32_t*)staging_buf.data();
  for (uint32_t i = 0; i < num_points; ++i) {
    int ref = ref_data.at(i);
    int cur = buf_ptr[i];
    if (cur != ref) {
      std::cout << "error at result #" << std::dec << i
                << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
      ++errors;
    }
  }
  if (errors != 0) {
    std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
    std::cout << "FAILED!" << std::endl;
    // Release the device buffers and close the device before bailing out:
    // this early return skips the cleanup step that follows verification.
    cleanup();
    return 1;
  }
}
// cleanup
std::cout << "cleanup" << std::endl;

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif

View file

@ -17,7 +17,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -75,6 +75,8 @@ uint32_t count = 16;
vx_device_h device = nullptr;
std::vector<TYPE> source_data;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -109,47 +111,12 @@ void cleanup() {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
auto cur = buf_ptr[i];
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -175,7 +142,7 @@ int main(int argc, char *argv[]) {
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
@ -192,13 +159,11 @@ int main(int argc, char *argv[]) {
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
staging_buf.resize(buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// generate source data
source_data.resize(2 * num_points);
@ -231,9 +196,36 @@ int main(int argc, char *argv[]) {
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
  // Each result is expected to equal the sum of the two source values
  // generated for that index.
  int errors = 0;
  auto buf_ptr = (TYPE*)staging_buf.data();
  for (uint32_t i = 0; i < num_points; ++i) {
    auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
    auto cur = buf_ptr[i];
    if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
      ++errors;
    }
  }
  if (errors != 0) {
    std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
    std::cout << "FAILED!" << std::endl;
    // Release the device buffers and close the device before bailing out:
    // this early return skips the cleanup step that follows verification.
    cleanup();
    return 1;
  }
}
// cleanup
std::cout << "cleanup" << std::endl;

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#define NUM_LOADS 8
typedef struct {

View file

@ -23,7 +23,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -73,6 +73,8 @@ std::vector<uint32_t> addr_table;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -107,6 +109,8 @@ void cleanup() {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
@ -128,12 +132,94 @@ void gen_input_data(uint32_t num_points) {
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t dst_buf_size,
uint32_t num_points) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
// generate input data
gen_input_data(num_points);
uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t);
uint32_t src_buf_size = test_data.size() * sizeof(int32_t);
uint32_t dst_buf_size = test_data.size() * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.stride = count;
std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(addr_buf_size, dst_buf_size));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// upload source buffer0
{
std::cout << "upload address buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, addr_table.data(), addr_table.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), addr_buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, test_data.data(), test_data.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), src_buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -175,101 +261,6 @@ int run_test(const kernel_arg_t& kernel_arg,
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
// generate input data
gen_input_data(num_points);
uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t);
uint32_t src_buf_size = test_data.size() * sizeof(int32_t);
uint32_t dst_buf_size = test_data.size() * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.stride = count;
std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(addr_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload address buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, addr_table.data(), addr_table.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), addr_buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, test_data.data(), test_data.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), src_buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t size;
uint64_t src_addr;

View file

@ -4,7 +4,7 @@
#include "common.h"
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
uint32_t size = arg->size;
int32_t* src_ptr = (int32_t*)arg->src_addr;

View file

@ -22,6 +22,8 @@ uint32_t count = 0;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -55,16 +57,75 @@ void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
staging_buf.resize(buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// upload source buffer0
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -95,75 +156,6 @@ int run_test(const kernel_arg_t& kernel_arg,
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_points;
uint64_t src_addr;

View file

@ -12,7 +12,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
// Kernel entry point: obtains the kernel_arg_t pointer, then passes
// arg->num_points, kernel_body and the pointer itself to vx_spawn_tasks.
// NOTE(review): this listing carries two declarations of `arg`; they look like
// the before/after lines of a single-line change, so only one of them is
// expected to exist in the actual file — confirm against the repository.
int main() {
// Variant 1: argument block located at the fixed address KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
// Variant 2: argument pointer read from the MSCRATCH CSR via csr_read.
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -22,6 +22,8 @@ uint32_t count = 4;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -54,22 +56,12 @@ static void parse_args(int argc, char **argv) {
// Frees the device allocations tracked by this test (source buffer, kernel
// program, kernel arguments) and then closes the device handle.
// Does nothing when no device handle is set.
void cleanup() {
  if (!device) {
    return;
  }
  vx_mem_free(device, kernel_arg.src_addr);
  vx_mem_free(device, kernel_prog_addr);
  vx_mem_free(device, kernel_args_addr);
  vx_dev_close(device);
}
// Starts the device with vx_start and then waits on it with vx_ready_wait,
// using VX_MAX_TIMEOUT as the timeout. Returns 0 when both calls get past
// RT_CHECK.
// NOTE(review): RT_CHECK is defined outside this view; presumably it aborts or
// returns on a non-zero status — confirm.
int run_test() {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -94,7 +86,7 @@ int main(int argc, char *argv[]) {
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
@ -106,13 +98,11 @@ int main(int argc, char *argv[]) {
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
staging_buf.resize(buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// upload source buffer0
{
@ -124,9 +114,15 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test());
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
return 0;
// cleanup
std::cout << "cleanup" << std::endl;

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif

View file

@ -31,7 +31,7 @@ void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
}
// Kernel entry point: obtains the kernel_arg_t pointer, then passes
// arg->num_tasks, kernel_body and the pointer itself to vx_spawn_tasks.
// NOTE(review): this listing carries two declarations of `arg`; they look like
// the before/after lines of a single-line change, so only one of them is
// expected to exist in the actual file — confirm against the repository.
int main() {
// Variant 1: argument block located at the fixed address KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
// Variant 2: argument pointer read from the MSCRATCH CSR via csr_read.
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -36,7 +36,7 @@ public:
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
}
return false;
}
@ -61,7 +61,7 @@ public:
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
}
return false;
}
@ -85,7 +85,8 @@ const char* kernel_file = "kernel.bin";
uint32_t size = 32;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -120,6 +121,8 @@ void cleanup() {
vx_mem_free(device, kernel_arg.A_addr);
vx_mem_free(device, kernel_arg.B_addr);
vx_mem_free(device, kernel_arg.C_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
@ -140,10 +143,6 @@ int main(int argc, char *argv[]) {
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "matrix size: " << size << "x" << size << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.A_addr));
@ -158,16 +157,6 @@ int main(int argc, char *argv[]) {
std::cout << "dev_argB=0x" << std::hex << kernel_arg.B_addr << std::endl;
std::cout << "dev_argC=0x" << std::hex << kernel_arg.C_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// generate source data
std::vector<TYPE> h_A(num_points);
std::vector<TYPE> h_B(num_points);
@ -178,38 +167,32 @@ int main(int argc, char *argv[]) {
h_A[i] = static_cast<TYPE>(a * size);
h_B[i] = static_cast<TYPE>(b * size);
}
matmul_cpu(h_C.data(), h_A.data(), h_B.data(), size, size);
// upload matrix A buffer
{
std::cout << "upload matrix A buffer" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = h_A[i];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, staging_buf.data(), buf_size));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, h_A.data(), buf_size));
}
// upload matrix B buffer
{
std::cout << "upload matrix B buffer" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = h_B[i];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, staging_buf.data(), buf_size));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, h_B.data(), buf_size));
}
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, buf_size);
RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
auto time_start = std::chrono::high_resolution_clock::now();
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -221,17 +204,17 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));
RT_CHECK(vx_copy_from_dev(device, h_C.data(), kernel_arg.C_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
std::vector<TYPE> h_ref(num_points);
matmul_cpu(h_ref.data(), h_A.data(), h_B.data(), size, size);
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < h_C.size(); ++i) {
auto ref = h_C[i];
auto cur = buf_ptr[i];
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
for (uint32_t i = 0; i < h_ref.size(); ++i) {
if (!Comparator<TYPE>::compare(h_C[i], h_ref[i], i, errors)) {
++errors;
}
}

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE int
#endif

View file

@ -19,7 +19,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
// Kernel entry point: obtains the kernel_arg_t pointer, then passes
// arg->num_points, kernel_body and the pointer itself to vx_spawn_tasks.
// NOTE(review): this listing carries two declarations of `arg`; they look like
// the before/after lines of a single-line change, so only one of them is
// expected to exist in the actual file — confirm against the repository.
int main() {
// Variant 1: argument block located at the fixed address KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
// Variant 2: argument pointer read from the MSCRATCH CSR via csr_read.
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -25,6 +25,8 @@ std::vector<TYPE> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -58,6 +60,8 @@ void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
@ -87,45 +91,6 @@ void gen_ref_data(uint32_t num_points) {
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
TYPE ref = ref_data.at(i);
TYPE cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -154,10 +119,6 @@ int main(int argc, char *argv[]) {
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src_addr));
@ -171,17 +132,10 @@ int main(int argc, char *argv[]) {
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size, dst_buf_size);
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer
{
std::cout << "upload source buffer" << std::endl;
@ -200,9 +154,46 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
TYPE ref = ref_data.at(i);
TYPE cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
// cleanup
std::cout << "cleanup" << std::endl;

View file

@ -1,8 +1,6 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif

View file

@ -12,7 +12,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
}
// Kernel entry point: obtains the kernel_arg_t pointer, then passes
// arg->num_points, kernel_body and the pointer itself to vx_spawn_tasks.
// NOTE(review): this listing carries two declarations of `arg`; they look like
// the before/after lines of a single-line change, so only one of them is
// expected to exist in the actual file — confirm against the repository.
int main() {
// Variant 1: argument block located at the fixed address KERNEL_ARG_DEV_MEM_ADDR.
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
// Variant 2: argument pointer read from the MSCRATCH CSR via csr_read.
kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -34,7 +34,7 @@ public:
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
}
return false;
}
@ -61,7 +61,7 @@ public:
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
}
return false;
}
@ -75,6 +75,8 @@ uint32_t size = 16;
vx_device_h device = nullptr;
std::vector<TYPE> source_data;
std::vector<uint8_t> staging_buf;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -109,16 +111,90 @@ void cleanup() {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
std::cout << "number of cores: " << num_cores << std::endl;
std::cout << "number of warps: " << num_warps << std::endl;
std::cout << "number of threads: " << num_threads << std::endl;
uint32_t num_points = size;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
staging_buf.resize(buf_size);
// generate source data
source_data.resize(2 * num_points);
for (uint32_t i = 0; i < source_data.size(); ++i) {
source_data[i] = Comparator<TYPE>::generate();
}
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 0];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 1];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -147,95 +223,6 @@ int run_test(const kernel_arg_t& kernel_arg,
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
std::cout << "number of cores: " << num_cores << std::endl;
std::cout << "number of warps: " << num_warps << std::endl;
std::cout << "number of threads: " << num_threads << std::endl;
uint32_t num_points = size;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// generate source data
source_data.resize(2 * num_points);
for (uint32_t i = 0; i < source_data.size(); ++i) {
source_data[i] = Comparator<TYPE>::generate();
}
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 0];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 1];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();