mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
runtime refactoring
This commit is contained in:
parent
405d6b468f
commit
c1000f6a3b
13 changed files with 317 additions and 484 deletions
77
runtime/common/common.h
Normal file
77
runtime/common/common.h
Normal file
|
@ -0,0 +1,77 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vortex.h>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
#include <callbacks.h>
|
||||
#include <malloc.h>
|
||||
|
||||
#include <cstdint>
|
||||
#include <unordered_map>
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
#define ALLOC_BASE_ADDR CACHE_BLOCK_SIZE
|
||||
|
||||
#if (XLEN == 64)
|
||||
#define GLOBAL_MEM_SIZE 0x200000000 // 8 GB
|
||||
#else
|
||||
#define GLOBAL_MEM_SIZE 0x100000000 // 4 GB
|
||||
#endif
|
||||
|
||||
// DBGPRINT(format, ...): debug trace helper.
// - When NDEBUG is not defined, it calls printf with "[VXDRV] " prepended to
//   `format`. The prefix is joined by adjacent string-literal concatenation,
//   so `format` must be a string literal, not a runtime pointer.
// - When NDEBUG is defined, it expands to ((void)0); the arguments are then
//   not evaluated, so they must not carry required side effects.
#ifndef NDEBUG
|
||||
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define DBGPRINT(format, ...) ((void)0)
|
||||
#endif
|
||||
|
||||
// CHECK_ERR(_expr, _cleanup): evaluate `_expr` once and store the result in a
// macro-local variable named `err`.
// - If `err` equals 0, the macro does nothing further.
// - Otherwise it prints the stringified expression and `err` (cast to int)
//   with printf, then executes `_cleanup`.
// `_cleanup` is expanded in the scope where `err` is declared, so it may refer
// to it (e.g. `{ return err; }`). It is also expanded inside the macro's
// do/while, so a `break` or `continue` in it acts on that loop rather than on
// an enclosing one.
#define CHECK_ERR(_expr, _cleanup) \
|
||||
do { \
|
||||
auto err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
/// Address-to-value store for 32-bit configuration registers.
///
/// Values are kept in an in-memory hash map; an address has no value until it
/// has been written at least once.
class DeviceConfig {
public:
  /// Stores `value` for `addr`, replacing any value written earlier.
  void write(uint32_t addr, uint32_t value) {
    store_[addr] = value;
  }

  /// Fetches the value last written to `addr`.
  ///
  /// @param addr   register address to look up.
  /// @param value  receives the stored value on success; not modified on failure.
  /// @return 0 on success, -1 if `value` is null or `addr` was never written.
  int read(uint32_t addr, uint32_t* value) const {
    // Guard the out-pointer so a bad caller gets an error code instead of a
    // null dereference.
    if (value == nullptr)
      return -1;
    auto it = store_.find(addr);
    if (it == store_.end())
      return -1;
    *value = it->second;
    return 0;
  }

private:
  std::unordered_map<uint32_t, uint32_t> store_;
};
|
||||
|
||||
/// Rounds `size` up to the nearest multiple of `alignment`.
///
/// @param size       value to round up.
/// @param alignment  must be a non-zero power of two (checked by assert when
///                   NDEBUG is not defined).
/// @return the smallest multiple of `alignment` that is >= `size`
///         (modulo 2^64 if `size + alignment - 1` wraps).
inline uint64_t aligned_size(uint64_t size, uint64_t alignment) {
  // The single-bit test alone also accepts 0, and a zero alignment would mask
  // every result down to 0, so zero is rejected explicitly.
  assert(alignment != 0 && 0 == (alignment & (alignment - 1)));
  const uint64_t mask = alignment - 1;
  return (size + mask) & ~mask;
}
|
||||
|
||||
/// Reports whether `addr` is a multiple of `alignment`.
///
/// @param addr       value to test.
/// @param alignment  must be a non-zero power of two (checked by assert when
///                   NDEBUG is not defined).
/// @return true when the low bits of `addr` below `alignment` are all zero.
inline bool is_aligned(uint64_t addr, uint64_t alignment) {
  // The single-bit test alone also accepts 0; with a zero alignment the mask
  // becomes all ones and the check degenerates to `addr == 0`, so zero is
  // rejected explicitly.
  assert(alignment != 0 && 0 == (alignment & (alignment - 1)));
  const uint64_t mask = alignment - 1;
  return (addr & mask) == 0;
}
|
|
@ -1,50 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vortex.h>
|
||||
#include <cstdint>
|
||||
#include <unordered_map>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
class DeviceConfig {
|
||||
public:
|
||||
void write(uint32_t addr, uint32_t value);
|
||||
int read(uint32_t addr, uint32_t* value) const;
|
||||
private:
|
||||
std::unordered_map<uint32_t, uint32_t> store_;
|
||||
};
|
||||
|
||||
int dcr_initialize(vx_device_h device);
|
||||
|
||||
uint64_t aligned_size(uint64_t size, uint64_t alignment);
|
||||
|
||||
bool is_aligned(uint64_t addr, uint64_t alignment);
|
||||
|
||||
int profiling_add(vx_device_h device);
|
||||
|
||||
void profiling_remove(int id);
|
||||
|
||||
void profiling_begin(int id);
|
||||
|
||||
void profiling_end(int id);
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#define ALLOC_BASE_ADDR CACHE_BLOCK_SIZE
|
||||
#if (XLEN == 64)
|
||||
#define GLOBAL_MEM_SIZE 0x200000000 // 8 GB
|
||||
#else
|
||||
#define GLOBAL_MEM_SIZE 0x100000000 // 4 GB
|
||||
#endif
|
|
@ -20,7 +20,7 @@ CXXFLAGS += $(CONFIGS)
|
|||
|
||||
LDFLAGS += -shared -luuid -ldl -pthread
|
||||
|
||||
SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp $(COMMON_DIR)/utils.cpp
|
||||
SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp
|
||||
|
||||
# set up target types
|
||||
ifeq ($(TARGET), opaesim)
|
||||
|
@ -49,11 +49,6 @@ ifdef SCOPE
|
|||
SRCS += $(COMMON_DIR)/scope.cpp
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
CXXFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
PROJECT := libvortex-opae.so
|
||||
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
|
|
@ -11,9 +11,14 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <utils.h>
|
||||
#include <malloc.h>
|
||||
#include <common.h>
|
||||
#include <vortex_afu.h>
|
||||
|
||||
#include "driver.h"
|
||||
#ifdef SCOPE
|
||||
#include "scope.h"
|
||||
#endif
|
||||
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -29,16 +34,6 @@
|
|||
#include <memory>
|
||||
#include <list>
|
||||
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
#include <vortex_afu.h>
|
||||
|
||||
#ifdef SCOPE
|
||||
#include "scope.h"
|
||||
#endif
|
||||
|
||||
#include <callbacks.h>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
#define CMD_MEM_READ AFU_IMAGE_CMD_MEM_READ
|
||||
|
@ -58,14 +53,6 @@ using namespace vortex;
|
|||
|
||||
#define STATUS_STATE_BITS 8
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
#ifndef NDEBUG
|
||||
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define DBGPRINT(format, ...) ((void)0)
|
||||
#endif
|
||||
|
||||
#define CHECK_HANDLE(handle, _expr, _cleanup) \
|
||||
auto handle = _expr; \
|
||||
if (handle == nullptr) { \
|
||||
|
@ -82,15 +69,6 @@ using namespace vortex;
|
|||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
#define CHECK_ERR(_expr, _cleanup) \
|
||||
do { \
|
||||
auto err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
|
@ -113,8 +91,6 @@ public:
|
|||
}
|
||||
api_.fpgaClose(fpga_);
|
||||
}
|
||||
|
||||
profiling_remove(profiling_id_);
|
||||
}
|
||||
|
||||
int init() {
|
||||
|
@ -211,12 +187,6 @@ public:
|
|||
}
|
||||
#endif
|
||||
|
||||
CHECK_ERR(dcr_initialize(this), {
|
||||
return err;
|
||||
});
|
||||
|
||||
profiling_id_ = profiling_add(this);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -406,8 +376,6 @@ public:
|
|||
return err;
|
||||
});
|
||||
|
||||
profiling_begin(profiling_id_);
|
||||
|
||||
// start execution
|
||||
CHECK_FPGA_ERR(api_.fpgaWriteMMIO64(fpga_, 0, MMIO_CMD_TYPE, CMD_RUN), {
|
||||
return -1;
|
||||
|
@ -475,8 +443,6 @@ public:
|
|||
timeout -= sleep_time_ms;
|
||||
};
|
||||
|
||||
profiling_end(profiling_id_);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -553,7 +519,6 @@ private:
|
|||
uint8_t* staging_ptr_;
|
||||
uint64_t staging_size_;
|
||||
std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
|
||||
int profiling_id_;
|
||||
};
|
||||
|
||||
struct vx_buffer {
|
||||
|
|
|
@ -17,7 +17,7 @@ CXXFLAGS += $(CONFIGS)
|
|||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -L$(DESTDIR) -lrtlsim
|
||||
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
|
||||
SRCS := $(SRC_DIR)/vortex.cpp
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
|
@ -26,11 +26,6 @@ else
|
|||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
CXXFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
PROJECT := libvortex-rtlsim.so
|
||||
|
||||
all: $(DESTDIR)/$(PROJECT)
|
||||
|
|
|
@ -11,6 +11,12 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <common.h>
|
||||
|
||||
#include <mem.h>
|
||||
#include <util.h>
|
||||
#include <processor.h>
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -20,39 +26,8 @@
|
|||
#include <list>
|
||||
#include <chrono>
|
||||
|
||||
#include <vortex.h>
|
||||
#include <malloc.h>
|
||||
#include <utils.h>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
#include <mem.h>
|
||||
#include <util.h>
|
||||
#include <processor.h>
|
||||
|
||||
#include <callbacks.h>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
#ifndef NDEBUG
|
||||
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define DBGPRINT(format, ...) ((void)0)
|
||||
#endif
|
||||
|
||||
#define CHECK_ERR(_expr, _cleanup) \
|
||||
do { \
|
||||
auto err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
|
@ -66,14 +41,9 @@ public:
|
|||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
profiling_remove(profiling_id_);
|
||||
}
|
||||
|
||||
int init() {
|
||||
CHECK_ERR(dcr_initialize(this), {
|
||||
return err;
|
||||
});
|
||||
profiling_id_ = profiling_add(this);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -219,8 +189,6 @@ public:
|
|||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
|
||||
profiling_begin(profiling_id_);
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
|
@ -245,7 +213,6 @@ public:
|
|||
if (0 == timeout_sec--)
|
||||
return -1;
|
||||
}
|
||||
profiling_end(profiling_id_);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -284,7 +251,6 @@ private:
|
|||
DeviceConfig dcrs_;
|
||||
std::future<void> future_;
|
||||
std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
|
||||
int profiling_id_;
|
||||
};
|
||||
|
||||
struct vx_buffer {
|
||||
|
|
|
@ -13,7 +13,7 @@ CXXFLAGS += -DXLEN_$(XLEN)
|
|||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -L$(DESTDIR) -lsimx
|
||||
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
|
||||
SRCS := $(SRC_DIR)/vortex.cpp
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
|
|
|
@ -11,6 +11,14 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <common.h>
|
||||
|
||||
#include <util.h>
|
||||
#include <processor.h>
|
||||
#include <arch.h>
|
||||
#include <mem.h>
|
||||
#include <constants.h>
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -19,40 +27,8 @@
|
|||
#include <future>
|
||||
#include <chrono>
|
||||
|
||||
#include <utils.h>
|
||||
#include <malloc.h>
|
||||
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include <processor.h>
|
||||
#include <arch.h>
|
||||
#include <mem.h>
|
||||
#include <constants.h>
|
||||
|
||||
#include <callbacks.h>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
#ifndef NDEBUG
|
||||
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define DBGPRINT(format, ...) ((void)0)
|
||||
#endif
|
||||
|
||||
#define CHECK_ERR(_expr, _cleanup) \
|
||||
do { \
|
||||
auto err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
|
@ -69,14 +45,9 @@ public:
|
|||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
profiling_remove(profiling_id_);
|
||||
}
|
||||
|
||||
int init() {
|
||||
CHECK_ERR(dcr_initialize(this), {
|
||||
return err;
|
||||
});
|
||||
profiling_id_ = profiling_add(this);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -213,8 +184,6 @@ public:
|
|||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
|
||||
profiling_begin(profiling_id_);
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
|
@ -239,7 +208,6 @@ public:
|
|||
if (0 == timeout_sec--)
|
||||
return -1;
|
||||
}
|
||||
profiling_end(profiling_id_);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -278,7 +246,6 @@ private:
|
|||
DeviceConfig dcrs_;
|
||||
std::future<void> future_;
|
||||
std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
|
||||
int profiling_id_;
|
||||
};
|
||||
|
||||
struct vx_buffer {
|
||||
|
|
|
@ -4,13 +4,20 @@ DESTDIR ?= $(CURDIR)/..
|
|||
|
||||
SRC_DIR := $(VORTEX_HOME)/runtime/stub
|
||||
|
||||
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
CXXFLAGS += -I$(INC_DIR) -I$(COMMON_DIR) -I$(ROOT_DIR)/hw -I$(SIM_DIR)/common
|
||||
CXXFLAGS += -fPIC
|
||||
|
||||
LDFLAGS += -shared -pthread -ldl
|
||||
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(SRC_DIR)/utils.cpp
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT := libvortex.so
|
||||
|
||||
|
|
|
@ -11,7 +11,8 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "utils.h"
|
||||
#include <common.h>
|
||||
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <list>
|
||||
|
@ -21,129 +22,30 @@
|
|||
#include <vortex.h>
|
||||
#include <assert.h>
|
||||
|
||||
#define RT_CHECK(_expr, _cleanup) \
|
||||
do { \
|
||||
int _ret = _expr; \
|
||||
if (0 == _ret) \
|
||||
break; \
|
||||
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return (size + alignment - 1) & ~(alignment - 1);
|
||||
}
|
||||
|
||||
bool is_aligned(uint64_t addr, uint64_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return 0 == (addr & (alignment - 1));
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class AutoPerfDump {
|
||||
class ProfilingMode {
|
||||
public:
|
||||
AutoPerfDump() : perf_class_(0) {
|
||||
ProfilingMode() : perf_class_(0) {
|
||||
auto profiling_s = getenv("VORTEX_PROFILING");
|
||||
if (profiling_s) {
|
||||
perf_class_ = std::atoi(profiling_s);
|
||||
}
|
||||
}
|
||||
|
||||
~AutoPerfDump() {}
|
||||
|
||||
int add(vx_device_h hdevice) {
|
||||
int ret = devices_.size();
|
||||
devices_[ret] = hdevice;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void remove(int id) {
|
||||
devices_.erase(id);
|
||||
}
|
||||
|
||||
void begin(int id) {
|
||||
auto device = devices_.at(id);
|
||||
vx_dcr_write(device, VX_DCR_BASE_MPM_CLASS, perf_class_);
|
||||
}
|
||||
|
||||
void end(int id) {
|
||||
auto device = devices_.at(id);
|
||||
vx_dump_perf(device, stdout);
|
||||
}
|
||||
~ProfilingMode() {}
|
||||
|
||||
int perf_class() const {
|
||||
return perf_class_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unordered_map<int, vx_device_h> devices_;
|
||||
int perf_class_;
|
||||
};
|
||||
|
||||
static AutoPerfDump gAutoPerfDump;
|
||||
|
||||
int profiling_add(vx_device_h hdevice) {
|
||||
return gAutoPerfDump.add(hdevice);
|
||||
int get_profiling_mode() {
|
||||
static ProfilingMode gProfilingMode;
|
||||
return gProfilingMode.perf_class();
|
||||
}
|
||||
|
||||
void profiling_remove(int id) {
|
||||
gAutoPerfDump.remove(id);
|
||||
}
|
||||
|
||||
void profiling_begin(int id) {
|
||||
gAutoPerfDump.begin(id);
|
||||
}
|
||||
|
||||
void profiling_end(int id) {
|
||||
gAutoPerfDump.end(id);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void DeviceConfig::write(uint32_t addr, uint32_t value) {
|
||||
store_[addr] = value;
|
||||
}
|
||||
|
||||
int DeviceConfig::read(uint32_t addr, uint32_t* value) const {
|
||||
auto it = store_.find(addr);
|
||||
if (it == store_.end())
|
||||
return -1;
|
||||
*value = it->second;
|
||||
return 0;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int dcr_initialize(vx_device_h hdevice) {
|
||||
const uint64_t startup_addr(STARTUP_ADDR);
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, 0), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, 0), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice || nullptr == content || size <= 8 || nullptr == hbuffer)
|
||||
return -1;
|
||||
|
@ -157,30 +59,30 @@ extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint
|
|||
|
||||
vx_buffer_h _hbuffer;
|
||||
#ifndef NDEBUG
|
||||
RT_CHECK(vx_mem_reserve(hdevice, min_vma, runtime_size, 0, &_hbuffer), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mem_reserve(hdevice, min_vma, runtime_size, 0, &_hbuffer), {
|
||||
return err;
|
||||
});
|
||||
#else
|
||||
RT_CHECK(vx_mem_alloc(hdevice, runtime_size, 0, &_hbuffer), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mem_alloc(hdevice, runtime_size, 0, &_hbuffer), {
|
||||
return err;
|
||||
});
|
||||
#endif
|
||||
|
||||
// mark binary region as read-only
|
||||
RT_CHECK(vx_mem_access(_hbuffer, 0, bin_size, VX_MEM_READ), {
|
||||
CHECK_ERR(vx_mem_access(_hbuffer, 0, bin_size, VX_MEM_READ), {
|
||||
vx_mem_free(_hbuffer);
|
||||
return _ret;
|
||||
return err;
|
||||
});
|
||||
|
||||
// mark global variables region as read-write
|
||||
RT_CHECK(vx_mem_access(_hbuffer, bin_size, runtime_size - bin_size, VX_MEM_READ_WRITE), {
|
||||
CHECK_ERR(vx_mem_access(_hbuffer, bin_size, runtime_size - bin_size, VX_MEM_READ_WRITE), {
|
||||
vx_mem_free(_hbuffer);
|
||||
return _ret;
|
||||
return err;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(_hbuffer, bytes, 0, bin_size), {
|
||||
CHECK_ERR(vx_copy_to_dev(_hbuffer, bytes, 0, bin_size), {
|
||||
vx_mem_free(_hbuffer);
|
||||
return _ret;
|
||||
return err;
|
||||
});
|
||||
|
||||
*hbuffer = _hbuffer;
|
||||
|
@ -206,8 +108,8 @@ extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, vx_b
|
|||
ifs.read(content.data(), size);
|
||||
|
||||
// upload buffer
|
||||
RT_CHECK(vx_upload_kernel_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_upload_kernel_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return err;
|
||||
});
|
||||
|
||||
return 0;
|
||||
|
@ -219,13 +121,13 @@ extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t si
|
|||
|
||||
vx_buffer_h _hbuffer;
|
||||
|
||||
RT_CHECK(vx_mem_alloc(hdevice, size, VX_MEM_READ, &_hbuffer), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mem_alloc(hdevice, size, VX_MEM_READ, &_hbuffer), {
|
||||
return err;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(_hbuffer, content, 0, size), {
|
||||
CHECK_ERR(vx_copy_to_dev(_hbuffer, content, 0, size), {
|
||||
vx_mem_free(_hbuffer);
|
||||
return _ret;
|
||||
return err;
|
||||
});
|
||||
|
||||
*hbuffer = _hbuffer;
|
||||
|
@ -251,8 +153,8 @@ extern int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h
|
|||
ifs.read(content.data(), size);
|
||||
|
||||
// upload buffer
|
||||
RT_CHECK(vx_upload_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_upload_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return err;
|
||||
});
|
||||
|
||||
return 0;
|
||||
|
@ -265,8 +167,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t total_cycles = 0;
|
||||
uint64_t max_cycles = 0;
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
|
||||
auto calcRatio = [&](uint64_t part, uint64_t total)->int {
|
||||
if (total == 0)
|
||||
return 0;
|
||||
|
@ -283,8 +183,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
return int(caclAverage(part, total) * 100);
|
||||
};
|
||||
|
||||
auto perf_class = gAutoPerfDump.perf_class();
|
||||
|
||||
// PERF: pipeline stalls
|
||||
uint64_t sched_idles = 0;
|
||||
uint64_t sched_stalls = 0;
|
||||
|
@ -319,45 +217,44 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t mem_reads = 0;
|
||||
uint64_t mem_writes = 0;
|
||||
uint64_t mem_lat = 0;
|
||||
#endif
|
||||
|
||||
uint64_t num_cores;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
|
||||
return err;
|
||||
});
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
uint64_t isa_flags;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
|
||||
return err;
|
||||
});
|
||||
|
||||
bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE;
|
||||
bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE;
|
||||
bool l2cache_enable = isa_flags & VX_ISA_EXT_L2CACHE;
|
||||
bool l3cache_enable = isa_flags & VX_ISA_EXT_L3CACHE;
|
||||
bool lmem_enable = isa_flags & VX_ISA_EXT_LMEM;
|
||||
#endif
|
||||
|
||||
auto perf_class = get_profiling_mode();
|
||||
|
||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||
uint64_t cycles_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MCYCLE, core_id, &cycles_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MCYCLE, core_id, &cycles_per_core), {
|
||||
return err;
|
||||
});
|
||||
|
||||
uint64_t instrs_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MINSTRET, core_id, &instrs_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MINSTRET, core_id, &instrs_per_core), {
|
||||
return err;
|
||||
});
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
// PERF: pipeline
|
||||
// scheduler idles
|
||||
{
|
||||
uint64_t sched_idles_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ID, core_id, &sched_idles_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ID, core_id, &sched_idles_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
|
||||
|
@ -368,8 +265,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// scheduler stalls
|
||||
{
|
||||
uint64_t sched_stalls_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ST, core_id, &sched_stalls_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ST, core_id, &sched_stalls_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
|
||||
|
@ -380,8 +277,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// ibuffer_stalls
|
||||
{
|
||||
uint64_t ibuffer_stalls_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IBUF_ST, core_id, &ibuffer_stalls_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IBUF_ST, core_id, &ibuffer_stalls_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
|
||||
|
@ -392,24 +289,24 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// issue_stalls
|
||||
{
|
||||
uint64_t scrb_stalls_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_alu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ALU, core_id, &scrb_alu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ALU, core_id, &scrb_alu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_fpu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_FPU, core_id, &scrb_fpu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_FPU, core_id, &scrb_fpu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_lsu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_sfu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return err;
|
||||
});
|
||||
scrb_alu += scrb_alu_per_core;
|
||||
scrb_fpu += scrb_fpu_per_core;
|
||||
|
@ -428,16 +325,16 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// sfu_stalls
|
||||
{
|
||||
uint64_t scrb_sfu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_wctl_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_csrs_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core;
|
||||
|
@ -455,15 +352,15 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// ifetches
|
||||
{
|
||||
uint64_t ifetches_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCHES, core_id, &ifetches_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCHES, core_id, &ifetches_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
||||
ifetches += ifetches_per_core;
|
||||
|
||||
uint64_t ifetch_lat_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCH_LT, core_id, &ifetch_lat_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCH_LT, core_id, &ifetch_lat_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
||||
|
@ -474,15 +371,15 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// loads
|
||||
{
|
||||
uint64_t loads_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOADS, core_id, &loads_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LOADS, core_id, &loads_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||
loads += loads_per_core;
|
||||
|
||||
uint64_t load_lat_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOAD_LT, core_id, &load_lat_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LOAD_LT, core_id, &load_lat_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
||||
|
@ -493,8 +390,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// stores
|
||||
{
|
||||
uint64_t stores_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_STORES, core_id, &stores_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_STORES, core_id, &stores_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
|
||||
stores += stores_per_core;
|
||||
|
@ -504,16 +401,16 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
if (lmem_enable) {
|
||||
// PERF: lmem
|
||||
uint64_t lmem_reads;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_READS, core_id, &lmem_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_READS, core_id, &lmem_reads), {
|
||||
return err;
|
||||
});
|
||||
uint64_t lmem_writes;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_WRITES, core_id, &lmem_writes), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_WRITES, core_id, &lmem_writes), {
|
||||
return err;
|
||||
});
|
||||
uint64_t lmem_bank_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_BANK_ST, core_id, &lmem_bank_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_BANK_ST, core_id, &lmem_bank_stalls), {
|
||||
return err;
|
||||
});
|
||||
int lmem_bank_utilization = calcAvgPercent(lmem_reads + lmem_writes, lmem_reads + lmem_writes + lmem_bank_stalls);
|
||||
fprintf(stream, "PERF: core%d: lmem reads=%ld\n", core_id, lmem_reads);
|
||||
|
@ -524,16 +421,16 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
if (icache_enable) {
|
||||
// PERF: Icache
|
||||
uint64_t icache_reads;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_READS, core_id, &icache_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_READS, core_id, &icache_reads), {
|
||||
return err;
|
||||
});
|
||||
uint64_t icache_read_misses;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MISS_R, core_id, &icache_read_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MISS_R, core_id, &icache_read_misses), {
|
||||
return err;
|
||||
});
|
||||
uint64_t icache_mshr_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MSHR_ST, core_id, &icache_mshr_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MSHR_ST, core_id, &icache_mshr_stalls), {
|
||||
return err;
|
||||
});
|
||||
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
|
||||
int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls);
|
||||
|
@ -545,28 +442,28 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
if (dcache_enable) {
|
||||
// PERF: Dcache
|
||||
uint64_t dcache_reads;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_READS, core_id, &dcache_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_READS, core_id, &dcache_reads), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_writes;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_read_misses;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_write_misses;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_W, core_id, &dcache_write_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_W, core_id, &dcache_write_misses), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_bank_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_BANK_ST, core_id, &dcache_bank_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_BANK_ST, core_id, &dcache_bank_stalls), {
|
||||
return err;
|
||||
});
|
||||
uint64_t dcache_mshr_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MSHR_ST, core_id, &dcache_mshr_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MSHR_ST, core_id, &dcache_mshr_stalls), {
|
||||
return err;
|
||||
});
|
||||
int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
|
||||
int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
|
||||
|
@ -583,74 +480,73 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
if (l2cache_enable) {
|
||||
// PERF: L2cache
|
||||
uint64_t tmp;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_READS, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_READS, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_reads += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_WRITES, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_WRITES, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_writes += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_R, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_R, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_read_misses += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_W, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MISS_W, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_write_misses += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_BANK_ST, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_BANK_ST, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_bank_stalls += tmp;
|
||||
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MSHR_ST, core_id, &tmp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L2CACHE_MSHR_ST, core_id, &tmp), {
|
||||
return err;
|
||||
});
|
||||
l2cache_mshr_stalls += tmp;
|
||||
}
|
||||
if (0 == core_id) {
|
||||
if (l3cache_enable) {
|
||||
// PERF: L3cache
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_READS, core_id, &l3cache_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_READS, core_id, &l3cache_reads), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_WRITES, core_id, &l3cache_writes), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_WRITES, core_id, &l3cache_writes), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_R, core_id, &l3cache_read_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_R, core_id, &l3cache_read_misses), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_W, core_id, &l3cache_write_misses), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MISS_W, core_id, &l3cache_write_misses), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_BANK_ST, core_id, &l3cache_bank_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_BANK_ST, core_id, &l3cache_bank_stalls), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MSHR_ST, core_id, &l3cache_mshr_stalls), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_L3CACHE_MSHR_ST, core_id, &l3cache_mshr_stalls), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
// PERF: memory
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_READS, core_id, &mem_reads), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_READS, core_id, &mem_reads), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_WRITES, core_id, &mem_writes), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_WRITES, core_id, &mem_writes), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
||||
|
@ -659,7 +555,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
max_cycles = std::max<uint64_t>(cycles_per_core, max_cycles);
|
||||
}
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
|
||||
|
@ -728,7 +623,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
float IPC = (float)(double(total_instrs) / double(max_cycles));
|
||||
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
|
||||
|
@ -741,11 +635,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_barriers, uint32_t* max_localmem) {
|
||||
// check group size
|
||||
uint64_t warps_per_core, threads_per_warp;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_WARPS, &warps_per_core), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_WARPS, &warps_per_core), {
|
||||
return err;
|
||||
});
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_THREADS, &threads_per_warp), {
|
||||
return err;
|
||||
});
|
||||
uint32_t threads_per_core = warps_per_core * threads_per_warp;
|
||||
if (group_size > threads_per_core) {
|
||||
|
@ -760,8 +654,8 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
|
|||
// check barriers capacity
|
||||
if (max_barriers) {
|
||||
uint64_t num_barriers;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_BARRIERS, &num_barriers), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_BARRIERS, &num_barriers), {
|
||||
return err;
|
||||
});
|
||||
if (warps_per_group < 2) {
|
||||
*max_barriers = -1;
|
||||
|
@ -773,8 +667,8 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
|
|||
// check local memory capacity
|
||||
if (max_localmem) {
|
||||
uint64_t local_mem_size;
|
||||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size), {
|
||||
return _ret;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_LOCAL_MEM_SIZE, &local_mem_size), {
|
||||
return err;
|
||||
});
|
||||
*max_localmem = local_mem_size / groups_per_core;
|
||||
}
|
|
@ -11,7 +11,8 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <callbacks.h>
|
||||
#include <common.h>
|
||||
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
@ -19,12 +20,42 @@
|
|||
#include <dlfcn.h>
|
||||
#include <iostream>
|
||||
|
||||
int get_profiling_mode();
|
||||
|
||||
// Programs the device's base DCRs with their startup values through
// vx_dcr_write(): the startup address (split into two 32-bit halves), the two
// startup argument registers, and the MPM class register.
//
// hdevice: device handle forwarded to every vx_dcr_write() call.
// Returns 0 when all writes succeed; otherwise the first non-zero error code
// produced by vx_dcr_write() (CHECK_ERR logs it and runs the `return err;`
// cleanup block).
static int dcr_initialize(vx_device_h hdevice) {
|
||||
const uint64_t startup_addr(STARTUP_ADDR);
|
||||
|
||||
// Low 32 bits of the 64-bit startup address.
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff), {
|
||||
return err;
|
||||
});
|
||||
|
||||
// High 32 bits of the 64-bit startup address.
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32), {
|
||||
return err;
|
||||
});
|
||||
|
||||
// Both startup argument registers are cleared to 0.
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG0, 0), {
|
||||
return err;
|
||||
});
|
||||
|
||||
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_STARTUP_ARG1, 0), {
|
||||
return err;
|
||||
});
|
||||
|
||||
// MPM class register is set to 0.
// NOTE(review): presumably 0 selects the default perf-counter class — confirm against VX_types.h.
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
|
||||
return err;
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static callbacks_t g_callbacks;
|
||||
static void* g_drv_handle = nullptr;
|
||||
|
||||
typedef int (*vx_dev_init_t)(callbacks_t*);
|
||||
|
||||
int vx_dev_open(vx_device_h* hdevice) {
|
||||
extern int vx_dev_open(vx_device_h* hdevice) {
|
||||
{
|
||||
const char* driverName = getenv("VORTEX_DRIVER");
|
||||
if (driverName == nullptr) {
|
||||
|
@ -50,67 +81,86 @@ int vx_dev_open(vx_device_h* hdevice) {
|
|||
g_drv_handle = handle;
|
||||
}
|
||||
|
||||
return (g_callbacks.dev_open)(hdevice);
|
||||
vx_device_h _hdevice;
|
||||
|
||||
CHECK_ERR((g_callbacks.dev_open)(&_hdevice), {
|
||||
return err;
|
||||
});
|
||||
|
||||
CHECK_ERR(dcr_initialize(_hdevice), {
|
||||
return err;
|
||||
});
|
||||
|
||||
*hdevice = _hdevice;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int vx_dev_close(vx_device_h hdevice) {
|
||||
extern int vx_dev_close(vx_device_h hdevice) {
|
||||
vx_dump_perf(hdevice, stdout);
|
||||
int ret = (g_callbacks.dev_close)(hdevice);
|
||||
dlclose(g_drv_handle);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t* value) {
|
||||
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t* value) {
|
||||
return (g_callbacks.dev_caps)(hdevice, caps_id, value);
|
||||
}
|
||||
|
||||
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
return (g_callbacks.mem_alloc)(hdevice, size, flags, hbuffer);
|
||||
}
|
||||
|
||||
int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
extern int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
return (g_callbacks.mem_reserve)(hdevice, address, size, flags, hbuffer);
|
||||
}
|
||||
|
||||
int vx_mem_free(vx_buffer_h hbuffer) {
|
||||
extern int vx_mem_free(vx_buffer_h hbuffer) {
|
||||
return (g_callbacks.mem_free)(hbuffer);
|
||||
}
|
||||
|
||||
int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) {
|
||||
extern int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) {
|
||||
return (g_callbacks.mem_access)(hbuffer, offset, size, flags);
|
||||
}
|
||||
|
||||
int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address) {
|
||||
extern int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address) {
|
||||
return (g_callbacks.mem_address)(hbuffer, address);
|
||||
}
|
||||
|
||||
int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) {
|
||||
extern int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) {
|
||||
return (g_callbacks.mem_info)(hdevice, mem_free, mem_used);
|
||||
}
|
||||
|
||||
int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) {
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) {
|
||||
return (g_callbacks.copy_to_dev)(hbuffer, host_ptr, dst_offset, size);
|
||||
}
|
||||
|
||||
int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
|
||||
extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
|
||||
return (g_callbacks.copy_from_dev)(host_ptr, hbuffer, src_offset, size);
|
||||
}
|
||||
|
||||
int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
|
||||
extern int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
|
||||
int profiling_mode = get_profiling_mode();
|
||||
if (profiling_mode != 0) {
|
||||
CHECK_ERR(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, profiling_mode), {
|
||||
return err;
|
||||
});
|
||||
}
|
||||
return (g_callbacks.start)(hdevice, hkernel, harguments);
|
||||
}
|
||||
|
||||
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
return (g_callbacks.ready_wait)(hdevice, timeout);
|
||||
}
|
||||
|
||||
int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
|
||||
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
|
||||
return (g_callbacks.dcr_read)(hdevice, addr, value);
|
||||
}
|
||||
|
||||
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
|
||||
extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
|
||||
return (g_callbacks.dcr_write)(hdevice, addr, value);
|
||||
}
|
||||
|
||||
int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
|
||||
extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
|
||||
return (g_callbacks.mpm_query)(hdevice, addr, core_id, value);
|
||||
}
|
|
@ -13,7 +13,7 @@ CXXFLAGS += -fPIC
|
|||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -L$(XILINX_XRT)/lib
|
||||
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp $(SIM_DIR)/common/util.cpp
|
||||
SRCS := $(SRC_DIR)/vortex.cpp $(SIM_DIR)/common/util.cpp
|
||||
|
||||
# set up target types
|
||||
ifeq ($(TARGET), xrtsim)
|
||||
|
|
|
@ -11,16 +11,7 @@
|
|||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <malloc.h>
|
||||
#include <utils.h>
|
||||
#include <VX_config.h>
|
||||
#include <VX_types.h>
|
||||
#include <stdarg.h>
|
||||
#include <util.h>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <common.h>
|
||||
|
||||
#ifdef SCOPE
|
||||
#include "scope.h"
|
||||
|
@ -38,7 +29,12 @@
|
|||
#include <fpga.h>
|
||||
#endif
|
||||
|
||||
#include <callbacks.h>
|
||||
#include <stdarg.h>
|
||||
#include <util.h>
|
||||
#include <limits>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
|
@ -91,20 +87,12 @@ static const platform_info_t g_platforms [] = {
|
|||
|
||||
#endif
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
#define DEFAULT_DEVICE_INDEX 0
|
||||
|
||||
#define DEFAULT_XCLBIN_PATH "vortex_afu.xclbin"
|
||||
|
||||
#define KERNEL_NAME "vortex_afu"
|
||||
|
||||
#ifndef NDEBUG
|
||||
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
|
||||
#else
|
||||
#define DBGPRINT(format, ...) ((void)0)
|
||||
#endif
|
||||
|
||||
#define CHECK_HANDLE(handle, _expr, _cleanup) \
|
||||
auto handle = _expr; \
|
||||
if (handle == nullptr) { \
|
||||
|
@ -112,15 +100,6 @@ static const platform_info_t g_platforms [] = {
|
|||
_cleanup \
|
||||
}
|
||||
|
||||
#define CHECK_ERR(_expr, _cleanup) \
|
||||
do { \
|
||||
auto err = _expr; \
|
||||
if (err == 0) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
|
||||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
#ifndef CPP_API
|
||||
|
||||
static void dump_xrt_error(xrtDeviceHandle xrtDevice, xrtErrorCode err) {
|
||||
|
@ -164,7 +143,6 @@ public:
|
|||
#ifndef CPP_API
|
||||
|
||||
~vx_device() {
|
||||
profiling_remove(profiling_id_);
|
||||
for (auto& entry : xrtBuffers_) {
|
||||
#ifdef BANK_INTERLEAVE
|
||||
xrtBOFree(entry);
|
||||
|
@ -238,12 +216,6 @@ public:
|
|||
}
|
||||
#endif
|
||||
|
||||
CHECK_ERR(dcr_initialize(this), {
|
||||
return err;
|
||||
});
|
||||
|
||||
profiling_id_ = profiling_add(this);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -522,8 +494,6 @@ public:
|
|||
return err;
|
||||
});
|
||||
|
||||
profiling_begin(profiling_id_);
|
||||
|
||||
// start execution
|
||||
CHECK_ERR(this->write_register(MMIO_CTL_ADDR, CTL_AP_START), {
|
||||
return err;
|
||||
|
@ -563,8 +533,6 @@ public:
|
|||
timeout -= sleep_time_ms;
|
||||
};
|
||||
|
||||
profiling_end(profiling_id_);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -608,7 +576,6 @@ private:
|
|||
uint64_t global_mem_size_;
|
||||
DeviceConfig dcrs_;
|
||||
std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
|
||||
int profiling_id_;
|
||||
|
||||
#ifdef BANK_INTERLEAVE
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue