perf counters profiling refactoring

This commit is contained in:
Blaise Tine 2024-05-11 17:10:08 -07:00
parent dc27d3c014
commit 98f080340a
10 changed files with 161 additions and 137 deletions

View file

@ -1,12 +1,12 @@
#!/bin/sh
# Copyright © 2019-2023
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -91,12 +91,12 @@ case $i in
;;
--scope)
SCOPE=1
CORES=1
CORES=1
shift
;;
--perf=*)
PERF_FLAG=-DPERF_ENABLE
PERF_CLASS=${i#*=}
PERF_CLASS=${i#*=}
shift
;;
--args=*)
@ -117,8 +117,8 @@ case $i in
exit 0
;;
*)
show_usage
exit -1
show_usage
exit -1
;;
esac
done
@ -162,7 +162,7 @@ else
exit -1
fi
if [ "$DRIVER" = "gpu" ];
if [ "$DRIVER" = "gpu" ];
then
# running application
if [ $HAS_ARGS -eq 1 ]
@ -183,11 +183,11 @@ CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_TH
echo "CONFIGS=$CONFIGS"
if [ $REBUILD -ne 0 ]
if [ $REBUILD -ne 0 ]
then
BLACKBOX_CACHE=blackbox.$DRIVER.cache
BLACKBOX_CACHE=blackbox.$DRIVER.cache
if [ -f "$BLACKBOX_CACHE" ]
then
then
LAST_CONFIGS=`cat $BLACKBOX_CACHE`
fi
@ -199,7 +199,7 @@ then
fi
# export performance monitor class identifier
export PERF_CLASS=$PERF_CLASS
export VORTEX_PROFILING=$PERF_CLASS
status=0
@ -210,7 +210,7 @@ make -C $ROOT_DIR/hw config > /dev/null
make -C $ROOT_DIR/runtime/stub > /dev/null
if [ $DEBUG -ne 0 ]
then
then
# running application
if [ $TEMPBUILD -eq 1 ]
then
@ -265,18 +265,18 @@ then
status=$?
fi
fi
if [ -f "$APP_PATH/trace.vcd" ]
then
then
mv -f $APP_PATH/trace.vcd .
fi
else
else
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization
if [ $SCOPE -eq 1 ]
then
@ -286,7 +286,7 @@ else
echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
@ -302,7 +302,7 @@ else
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then

View file

@ -17,81 +17,87 @@
#include <list>
#include <cstring>
#include <vector>
#include <unordered_map>
#include <vortex.h>
#include <assert.h>
// Evaluates `_expr` exactly once and stores the result in an int.
// A result of 0 means success and the macro does nothing further.
// Any other result prints the failing expression text and its value to
// stdout, then executes `_cleanup` (a statement or braced block supplied by
// the caller, typically ending in a `return`).
// The do/while(false) wrapper makes the macro usable as a single statement.
#define RT_CHECK(_expr, _cleanup)                               \
  do {                                                          \
    int _ret = (_expr);                                         \
    if (0 == _ret)                                              \
      break;                                                    \
    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);    \
    _cleanup                                                    \
  } while (false)
// Rounds `size` up to the nearest multiple of `alignment`.
//
// `alignment` is expected to be a power of two; this is checked with an
// assert, which is compiled out when NDEBUG is defined. Note that the check
// also lets an alignment of 0 through, in which case the result is 0.
// The addition wraps around if `size + alignment - 1` exceeds UINT64_MAX.
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
  assert(0 == (alignment & (alignment - 1)));
  // Adding (alignment - 1) and masking off the low bits rounds up.
  return (size + alignment - 1) & ~(alignment - 1);
}
// Returns true when `addr` is a multiple of `alignment`.
//
// `alignment` is expected to be a power of two; this is checked with an
// assert, which is compiled out when NDEBUG is defined.
bool is_aligned(uint64_t addr, uint64_t alignment) {
  assert(0 == (alignment & (alignment - 1)));
  // For a power-of-two alignment the low bits of an aligned address are zero.
  return 0 == (addr & (alignment - 1));
}
///////////////////////////////////////////////////////////////////////////////
class AutoPerfDump {
public:
AutoPerfDump() : perf_class_(0) {}
~AutoPerfDump() {
for (auto hdevice : hdevices_) {
vx_dump_perf(hdevice, stdout);
}
AutoPerfDump() : perf_class_(0) {
auto profiling_s = getenv("VORTEX_PROFILING");
if (profiling_s) {
perf_class_ = std::atoi(profiling_s);
}
}
void add_device(vx_device_h hdevice) {
auto perf_class_s = getenv("PERF_CLASS");
if (perf_class_s) {
perf_class_ = std::atoi(perf_class_s);
vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, perf_class_);
}
hdevices_.push_back(hdevice);
}
~AutoPerfDump() {}
void remove_device(vx_device_h hdevice) {
hdevices_.remove(hdevice);
vx_dump_perf(hdevice, stdout);
}
int add(vx_device_h hdevice) {
int ret = devices_.size();
devices_[ret] = hdevice;
return ret;
}
int get_perf_class() const {
return perf_class_;
}
void remove(int id) {
devices_.erase(id);
}
void begin(int id) {
auto device = devices_.at(id);
vx_dcr_write(device, VX_DCR_BASE_MPM_CLASS, perf_class_);
}
void end(int id) {
auto device = devices_.at(id);
vx_dump_perf(device, stdout);
}
int get_perf_class() const {
return perf_class_;
}
private:
std::list<vx_device_h> hdevices_;
int perf_class_;
std::unordered_map<int, vx_device_h> devices_;
int perf_class_;
};
#ifdef DUMP_PERF_STATS
AutoPerfDump gAutoPerfDump;
#endif
void perf_add_device(vx_device_h hdevice) {
#ifdef DUMP_PERF_STATS
gAutoPerfDump.add_device(hdevice);
#else
(void)hdevice;
#endif
int profiling_add(vx_device_h hdevice) {
return gAutoPerfDump.add(hdevice);
}
void perf_remove_device(vx_device_h hdevice) {
#ifdef DUMP_PERF_STATS
gAutoPerfDump.remove_device(hdevice);
#else
(void)hdevice;
#endif
void profiling_remove(int id) {
gAutoPerfDump.remove(id);
}
void profiling_begin(int id) {
gAutoPerfDump.begin(id);
}
void profiling_end(int id) {
gAutoPerfDump.end(id);
}
///////////////////////////////////////////////////////////////////////////////

View file

@ -33,9 +33,13 @@ uint64_t aligned_size(uint64_t size, uint64_t alignment);
bool is_aligned(uint64_t addr, uint64_t alignment);
void perf_add_device(vx_device_h device);
int profiling_add(vx_device_h device);
void perf_remove_device(vx_device_h device);
void profiling_remove(int id);
void profiling_begin(int id);
void profiling_end(int id);
#define CACHE_BLOCK_SIZE 64
#define ALLOC_BASE_ADDR CACHE_BLOCK_SIZE

View file

@ -18,9 +18,6 @@ CXXFLAGS += -fPIC
# Add external configuration
CXXFLAGS += $(CONFIGS)
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -luuid -ldl -pthread
SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp $(COMMON_DIR)/utils.cpp
@ -42,13 +39,13 @@ endif
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
else
CXXFLAGS += -O2 -DNDEBUG
endif
# Enable scope logic analyzer
ifdef SCOPE
CXXFLAGS += -DSCOPE
CXXFLAGS += -DSCOPE
SRCS += $(COMMON_DIR)/scope.cpp
endif

View file

@ -112,6 +112,8 @@ public:
}
api_.fpgaClose(fpga_);
}
profiling_remove(profiling_id_);
}
int init() {
@ -208,7 +210,13 @@ public:
}
#endif
return dcr_initialize(this);
CHECK_ERR(dcr_initialize(this), {
return err;
});
profiling_id_ = profiling_add(this);
return 0;
}
int get_caps(uint32_t caps_id, uint64_t *value) {
@ -397,6 +405,8 @@ public:
return err;
});
profiling_begin(profiling_id_);
// start execution
CHECK_FPGA_ERR(api_.fpgaWriteMMIO64(fpga_, 0, MMIO_CMD_TYPE, CMD_RUN), {
return -1;
@ -455,6 +465,7 @@ public:
}
if (state != 0) {
fprintf(stdout, "[VXDRV] ready-wait timed out: state=%d\n", state);
return -1;
}
break;
}
@ -462,6 +473,9 @@ public:
nanosleep(&sleep_time, nullptr);
timeout -= sleep_time_ms;
};
profiling_end(profiling_id_);
return 0;
}
@ -538,6 +552,7 @@ private:
uint8_t* staging_ptr_;
uint64_t staging_size_;
std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
int profiling_id_;
};
struct vx_buffer {
@ -569,10 +584,6 @@ extern int vx_dev_open(vx_device_h* hdevice) {
return err;
});
#ifdef DUMP_PERF_STATS
perf_add_device(device);
#endif
DBGPRINT("DEV_OPEN: hdevice=%p\n", (void*)device);
*hdevice = device;
@ -592,10 +603,6 @@ extern int vx_dev_close(vx_device_h hdevice) {
vx_scope_stop(hdevice);
#endif
#ifdef DUMP_PERF_STATS
perf_remove_device(hdevice);
#endif
delete device;
drv_close();
@ -758,11 +765,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t ds
DBGPRINT("COPY_TO_DEV: hbuffer=%p, host_addr=%p, dst_offset=%ld, size=%ld\n", hbuffer, host_ptr, dst_offset, size);
CHECK_ERR(device->upload(buffer->addr + dst_offset, host_ptr, size), {
return err;
});
return 0;
return device->upload(buffer->addr + dst_offset, host_ptr, size);
}
extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
@ -777,11 +780,7 @@ extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_of
DBGPRINT("COPY_FROM_DEV: hbuffer=%p, host_addr=%p, src_offset=%ld, size=%ld\n", hbuffer, host_ptr, src_offset, size);
CHECK_ERR(device->download(host_ptr, buffer->addr + src_offset, size), {
return err;
});
return 0;
return device->download(host_ptr, buffer->addr + src_offset, size);
}
extern int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {

View file

@ -14,9 +14,6 @@ CXXFLAGS += -fPIC
# Add external configuration
CXXFLAGS += $(CONFIGS)
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
LDFLAGS += -L$(DESTDIR) -lrtlsim
@ -25,7 +22,7 @@ SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
else
CXXFLAGS += -O2 -DNDEBUG
endif
@ -37,7 +34,7 @@ endif
PROJECT := libvortex.so
all: $(DESTDIR)/$(PROJECT)
$(DESTDIR)/$(PROJECT): $(SRCS)
DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/rtlsim $(DESTDIR)/librtlsim.so
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $@

View file

@ -64,6 +64,15 @@ public:
if (future_.valid()) {
future_.wait();
}
profiling_remove(profiling_id_);
}
// Second-phase initialization of the device.
// Runs dcr_initialize() on this device and, once that succeeds, registers the
// device with the profiling helpers, keeping the returned id in
// profiling_id_. Returns 0 on success.
int init() {
// NOTE(review): CHECK_ERR is defined outside this view; presumably it binds a
// non-zero result to `err` before running the block, so failures are
// propagated to the caller and the device is never registered - confirm.
CHECK_ERR(dcr_initialize(this), {
return err;
});
// Remember the profiling id; it is passed to the other profiling_* calls.
profiling_id_ = profiling_add(this);
return 0;
}
int get_caps(uint32_t caps_id, uint64_t *value) {
@ -208,6 +217,8 @@ public:
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
profiling_begin(profiling_id_);
// start new run
future_ = std::async(std::launch::async, [&]{
processor_.run();
@ -227,10 +238,12 @@ public:
for (;;) {
// wait for 1 sec and check status
auto status = future_.wait_for(wait_time);
if (status == std::future_status::ready
|| 0 == timeout_sec--)
if (status == std::future_status::ready)
break;
if (0 == timeout_sec--)
return -1;
}
profiling_end(profiling_id_);
return 0;
}
@ -269,6 +282,7 @@ private:
DeviceConfig dcrs_;
std::future<void> future_;
std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
int profiling_id_;
};
struct vx_buffer {
@ -287,15 +301,10 @@ extern int vx_dev_open(vx_device_h* hdevice) {
if (device == nullptr)
return -1;
int err = dcr_initialize(device);
if (err != 0) {
CHECK_ERR(device->init(), {
delete device;
return err;
}
#ifdef DUMP_PERF_STATS
perf_add_device(device);
#endif
});
DBGPRINT("DEV_OPEN: hdevice=%p\n", (void*)device);
@ -310,11 +319,7 @@ extern int vx_dev_close(vx_device_h hdevice) {
DBGPRINT("DEV_CLOSE: hdevice=%p\n", hdevice);
#ifdef DUMP_PERF_STATS
perf_remove_device(hdevice);
#endif
vx_device *device = ((vx_device*)hdevice);
auto device = ((vx_device*)hdevice);
delete device;
@ -512,6 +517,7 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
DBGPRINT("READY_WAIT: hdevice=%p, timeout=%ld\n", hdevice, timeout);
auto device = ((vx_device*)hdevice);
return device->ready_wait(timeout);
}

View file

@ -8,7 +8,6 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I$(INC_DIR) -I../common -I$(ROOT_DIR)/hw -I$(SIM_DIR)/simx -I$(COMMON_DIR) -I$(SIM_DIR)/common
CXXFLAGS += $(CONFIGS)
CXXFLAGS += -DDUMP_PERF_STATS
CXXFLAGS += -DXLEN_$(XLEN)
LDFLAGS += -shared -pthread
@ -19,7 +18,7 @@ SRCS := $(SRC_DIR)/vortex.cpp $(COMMON_DIR)/utils.cpp
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
else
CXXFLAGS += -O2 -DNDEBUG
endif

View file

@ -68,6 +68,15 @@ public:
if (future_.valid()) {
future_.wait();
}
profiling_remove(profiling_id_);
}
// Second-phase initialization of the device.
// Runs dcr_initialize() on this device and, once that succeeds, registers the
// device with the profiling helpers, keeping the returned id in
// profiling_id_. Returns 0 on success.
int init() {
// NOTE(review): CHECK_ERR is defined outside this view; presumably it binds a
// non-zero result to `err` before running the block, so failures are
// propagated to the caller and the device is never registered - confirm.
CHECK_ERR(dcr_initialize(this), {
return err;
});
// Remember the profiling id; it is passed to the other profiling_* calls.
profiling_id_ = profiling_add(this);
return 0;
}
int get_caps(uint32_t caps_id, uint64_t *value) {
@ -203,6 +212,8 @@ public:
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
profiling_begin(profiling_id_);
// start new run
future_ = std::async(std::launch::async, [&]{
processor_.run();
@ -222,10 +233,12 @@ public:
for (;;) {
// wait for 1 sec and check status
auto status = future_.wait_for(wait_time);
if (status == std::future_status::ready
|| 0 == timeout_sec--)
if (status == std::future_status::ready)
break;
if (0 == timeout_sec--)
return -1;
}
profiling_end(profiling_id_);
return 0;
}
@ -264,6 +277,7 @@ private:
DeviceConfig dcrs_;
std::future<void> future_;
std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
int profiling_id_;
};
struct vx_buffer {
@ -282,15 +296,10 @@ extern int vx_dev_open(vx_device_h* hdevice) {
if (device == nullptr)
return -1;
int err = dcr_initialize(device);
if (err != 0) {
CHECK_ERR(device->init(), {
delete device;
return err;
}
#ifdef DUMP_PERF_STATS
perf_add_device(device);
#endif
});
DBGPRINT("DEV_OPEN: hdevice=%p\n", (void*)device);
@ -307,10 +316,6 @@ extern int vx_dev_close(vx_device_h hdevice) {
auto device = ((vx_device*)hdevice);
#ifdef DUMP_PERF_STATS
perf_remove_device(hdevice);
#endif
delete device;
return 0;
@ -507,6 +512,7 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
DBGPRINT("READY_WAIT: hdevice=%p, timeout=%ld\n", hdevice, timeout);
auto device = ((vx_device*)hdevice);
return device->ready_wait(timeout);
}

View file

@ -154,6 +154,7 @@ public:
#ifndef CPP_API
~vx_device() {
profiling_remove(profiling_id_);
for (auto& entry : xrtBuffers_) {
#ifdef BANK_INTERLEAVE
xrtBOFree(entry);
@ -227,6 +228,12 @@ public:
}
#endif
CHECK_ERR(dcr_initialize(this), {
return err;
});
profiling_id_ = profiling_add(this);
return 0;
}
@ -505,6 +512,8 @@ public:
return err;
});
profiling_begin(profiling_id_);
// start execution
CHECK_ERR(device->write_register(MMIO_CTL_ADDR, CTL_AP_START), {
return err;
@ -535,12 +544,17 @@ public:
return err;
});
bool is_done = (status & CTL_AP_DONE) == CTL_AP_DONE;
if (is_done || 0 == timeout) {
if (is_done)
break;
if (0 == timeout) {
return -1;
}
nanosleep(&sleep_time, nullptr);
timeout -= sleep_time_ms;
};
profiling_end(profiling_id_);
return 0;
}
@ -584,6 +598,7 @@ private:
uint64_t global_mem_size_;
DeviceConfig dcrs_;
std::unordered_map<uint32_t, std::array<uint64_t, 32>> mpm_cache_;
int profiling_id_;
#ifdef BANK_INTERLEAVE
@ -841,15 +856,6 @@ extern int vx_dev_open(vx_device_h* hdevice) {
}
#endif
CHECK_ERR(dcr_initialize(device), {
delete device;
return err;
});
#ifdef DUMP_PERF_STATS
perf_add_device(device);
#endif
DBGPRINT("DEV_OPEN: hdevice=%p\n", (void*)device);
*hdevice = device;
@ -1078,7 +1084,11 @@ extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
auto device = ((vx_device*)hdevice);
return device->ready_wait(timeout);
CHECK_ERR(device->ready_wait(timeout), {
return err;
});
return 0;
}
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {