runtime perf counter

This commit is contained in:
Blaise Tine 2022-07-26 15:07:59 -07:00
parent 6c1b08f45d
commit 500afe661e
10 changed files with 95 additions and 32 deletions

View file

@ -326,7 +326,7 @@
// Number of raster units
`ifndef NUM_RASTER_UNITS
`define NUM_RASTER_UNITS 1
`define NUM_RASTER_UNITS `UP(`NUM_CORES / 16)
`endif
// RASTER memory pending size
@ -363,7 +363,7 @@
// Number of rop units
`ifndef NUM_ROP_UNITS
`define NUM_ROP_UNITS 1
`define NUM_ROP_UNITS `UP(`NUM_CORES / 16)
`endif
// ROP memory pending size
@ -712,7 +712,7 @@
// Size of cache in bytes
`ifndef L2_CACHE_SIZE
`define L2_CACHE_SIZE 1048576
`define L2_CACHE_SIZE 2097152
`endif
// Number of banks

View file

@ -27,7 +27,7 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_ROP
DBG_FLAGS += $(DBG_TRACE_FLAGS)
#CONFIGS += -DEXT_GFX_ENABLE
CONFIGS += -DEXT_GFX_ENABLE
#CONFIGS += -DL1_DISABLE
#CONFIGS += -DSM_DISABLE

12
perf/cache/perf.sh vendored
View file

@ -10,17 +10,17 @@ sgemm()
{
echo "begin cache tests"
CONFIGS="-DICACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' > ./perf/cache/cache_perf.log
CONFIGS="-DICACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' > ./perf/cache/cache_perf.log
echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log
CONFIGS="-DDCACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
CONFIGS="-DDCACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log
CONFIGS="-DICACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
CONFIGS="-DICACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log
CONFIGS="-DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
CONFIGS="-DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log
CONFIGS="-DICACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
CONFIGS="-DICACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log
CONFIGS="-DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
CONFIGS="-DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log
echo "cache tests done!"
}

View file

@ -1,27 +1,23 @@
#!/bin/bash
LOG=./perf/draw3d/draw3d_perf.log
declare -a traces=(vase filmtv skybox coverflow evilskull polybump tekkaman carnival)
# exit when any command fails
set -e
# ensure build
make -s
WIDTH=1920
HEIGHT=1080
LOG_FILE=./perf/draw3d/perf_${DEVICE_FAMILY}_${WIDTH}_${HEIGHT}.log
declare -a traces=(vase filmtv skybox coverflow evilskull polybump tekkaman carnival)
# draw3d benchmarks
draw3d(){
echo > $LOG # clear log
for TRACE in "${traces[@]}"
echo > $LOG_FILE # clear log
for trace in "${traces[@]}"
do
echo -e "\n**************************************\n" >> $LOG
echo -e "draw3d $TRACE benchmark\n" >> $LOG
if [ $ALL = true ]
then
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-t$TRACE.cgltrace -w512 -h512" >> $LOG
else
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-t$TRACE.cgltrace -w512 -h512" | grep 'PERF' >> $LOG
fi
echo -e "\n**************************************\n" >> $LOG_FILE
echo -e "draw3d $trace benchmark\n" >> $LOG_FILE
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=fpga --app=draw3d --args="-t$trace.cgltrace -w${WIDTH} -h${HEIGHT}" | grep 'Total elapsed time:' >> $LOG_FILE
done
echo "draw3d tests done!"
}

View file

@ -18,7 +18,7 @@ rtlsim()
for i in 1 4 16
do
echo "NUM_CORES = " $i >> $LOG
CONFIGS="-DEXT_ROP_ENABLE" ./ci/blackbox.sh --driver=rtlsim --cores=$i --warps=4 --threads=4 --app=rop --args="-w128 -h128 -b -d" --perf=4 | grep 'PERF' >> $LOG
CONFIGS="-DEXT_ROP_ENABLE" ./ci/blackbox.sh --driver=rtlsim --cores=$i --warps=4 --threads=4 --app=rop --args="-w128 -h128 -b -d" --perf=4 | grep 'PERF\|Total elapsed time' >> $LOG
echo "**************************************" >> $LOG
done

View file

@ -535,7 +535,62 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
// release allocated resources
vx_buf_free(staging_buf);
return ret;
return 0;
}
// Reads one performance counter from the device's per-core counter memory.
//
// Parameters:
//   device  - device handle passed through to vx_dev_caps / vx_buf_alloc.
//   counter - CSR identifier forwarded to get_csr_64 to select the counter.
//   core_id - index of the core to query, or -1 to aggregate over all cores.
//   value   - output; receives the counter value on success.
//
// Aggregation: for CSR_MCYCLE the maximum across the queried cores is
// reported; every other counter is summed across the queried cores.
//
// Returns 0 on success, -1 when `value` is null or `core_id` is out of range,
// or the non-zero error code of the failing runtime call.
extern int vx_perf_counter(vx_device_h device, int counter, int core_id, uint64_t* value) {
    if (value == nullptr)
        return -1;

    uint64_t num_cores;
    int ret = vx_dev_caps(device, VX_CAPS_MAX_CORES, &num_cores);
    if (ret != 0)
        return ret;

    // -1 is the only valid negative value (meaning "all cores").
    if (core_id < -1 || core_id >= static_cast<int>(num_cores)) {
        std::cout << "error: core_id out of range" << std::endl;
        return -1;
    }

    // Size in bytes of one core's counter block (64 32-bit words).
    const uint64_t mpm_mem_size = 64 * sizeof(uint32_t);

    vx_buffer_h staging_buf;
    ret = vx_buf_alloc(device, mpm_mem_size, &staging_buf);
    if (ret != 0)
        return ret;

    auto staging_ptr = (uint32_t*)vx_host_ptr(staging_buf);

    // Select the half-open core range [first_core, last_core) to visit:
    // every core by default, or exactly one core when core_id is given.
    uint64_t first_core = 0;
    uint64_t last_core = num_cores;
    if (core_id != -1) {
        first_core = static_cast<uint64_t>(core_id);
        last_core = first_core + 1;
    }

    uint64_t _value = 0;
    for (uint64_t i = first_core; i < last_core; ++i) {
        // Each core's counter block is laid out back-to-back from IO_CSR_ADDR.
        const uint64_t mpm_mem_addr = IO_CSR_ADDR + i * mpm_mem_size;
        ret = vx_copy_from_dev(staging_buf, mpm_mem_addr, mpm_mem_size, 0);
        if (ret != 0) {
            vx_buf_free(staging_buf);
            return ret;
        }

        const uint64_t per_core_value = get_csr_64(staging_ptr, counter);
        if (counter == CSR_MCYCLE) {
            _value = std::max<uint64_t>(per_core_value, _value);
        } else {
            _value += per_core_value;
        }
    }

    // release allocated resources
    vx_buf_free(staging_buf);

    // output
    *value = _value;

    return 0;
}
// Deprecated API functions

View file

@ -91,8 +91,9 @@ int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t siz
// upload kernel file to device
int vx_upload_kernel_file(vx_device_h device, const char* filename);
// dump performance counters
// performance counters
int vx_dump_perf(vx_device_h device, FILE* stream);
int vx_perf_counter(vx_device_h device, int counter, int core_id, uint64_t* value);
//////////////////////////// DEPRECATED FUNCTIONS /////////////////////////////
int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr);

View file

@ -137,6 +137,9 @@ int render(const CGLTrace& trace) {
uint32_t draw_idx = 0;
uint64_t instrs = 0;
uint64_t cycles = 0;
// render each draw call
for (auto& drawcall : trace.drawcalls) {
auto& states = drawcall.states;
@ -344,9 +347,16 @@ int render(const CGLTrace& trace) {
printf("Elapsed time: %lg ms\n", elapsed);
if (draw_idx < trace.drawcalls.size()-1) {
vx_dump_perf(device, stdout);
vx_dump_perf(device, stdout);
}
uint64_t instrs_;
uint64_t cycles_;
RT_CHECK(vx_perf_counter(device, CSR_MCYCLE, -1, &cycles_));
RT_CHECK(vx_perf_counter(device, CSR_MINSTRET, -1, &instrs_));
cycles += cycles_;
instrs += instrs_;
++draw_idx;
}
@ -364,7 +374,8 @@ int render(const CGLTrace& trace) {
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_begin).count();
printf("Total elapsed time: %lg ms\n", elapsed);
float IPC = (float)(double(instrs) / double(cycles));
printf("Total elapsed time: %lg ms, instrs=%ld, cycles=%ld, IPC=%f\n", elapsed, instrs, cycles, IPC);
// save output image
std::cout << "save output image" << std::endl;

View file

@ -33,7 +33,7 @@ CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(VORTEX_KN_PATH)/../sim/common -I$(VORTEX_KN_PATH)/../third_party
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(VORTEX_KN_PATH)/../third_party/cocogfx/libcocogfx.a -lpng -lz -lboost_serialization
LDFLAGS += -L/homes/tinebp/tools/boost/lib -L$(VORTEX_RT_PATH)/stub -lvortex $(VORTEX_KN_PATH)/../third_party/cocogfx/libcocogfx.a -lpng -lz -lboost_serialization
# Debugging
ifdef DEBUG

View file

@ -22,7 +22,7 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(VORTEX_KN_PATH)/../sim/common -I$(VORTEX_KN_PATH)/../third_party
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(VORTEX_KN_PATH)/../third_party/cocogfx/libcocogfx.a -lpng -lz
LDFLAGS += -L/homes/tinebp/tools/boost/lib -L$(VORTEX_RT_PATH)/stub -lvortex $(VORTEX_KN_PATH)/../third_party/cocogfx/libcocogfx.a -lpng -lz
# Debugging
ifdef DEBUG