Added a performance counter for the number of cycles during which the ROP is not used

This commit is contained in:
Santosh Srivatsan 2022-04-10 14:07:20 -04:00
parent 1a871385a0
commit 283837dffb
10 changed files with 303 additions and 25 deletions

View file

@ -146,6 +146,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
uint64_t rop_mem_reads = 0;
uint64_t rop_mem_writes = 0;
uint64_t rop_mem_lat = 0;
uint64_t rop_inactive_cycles = 0;
#endif
#endif
@ -302,6 +303,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
rop_mem_reads = get_csr_64(staging_ptr, CSR_MPM_ROP_READS);
rop_mem_writes = get_csr_64(staging_ptr, CSR_MPM_ROP_WRITES);
rop_mem_lat = get_csr_64(staging_ptr, CSR_MPM_ROP_LAT);
rop_inactive_cycles = get_csr_64(staging_ptr, CSR_MPM_ROP_INACTIVE_CYC);
#endif
#endif
}
@ -345,9 +347,10 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat);
#endif
#ifdef EXT_ROP_ENABLE
fprintf(stream, "PERF: rop memory reads=%ld\n", rop_mem_reads);
fprintf(stream, "PERF: rop memory writes=%ld\n", rop_mem_writes);
fprintf(stream, "PERF: rop memory latency=%ld\n", rop_mem_lat);
fprintf(stream, "PERF: rop memory reads=%ld\n", rop_mem_reads);
fprintf(stream, "PERF: rop memory writes=%ld\n", rop_mem_writes);
fprintf(stream, "PERF: rop memory latency=%ld\n", rop_mem_lat);
fprintf(stream, "PERF: rop inactive cycles=%ld\n", rop_inactive_cycles);
#endif
#endif

View file

@ -172,6 +172,7 @@ module VX_cluster #(
`UNUSED_VAR (rop_perf_if.mem_reads)
`UNUSED_VAR (rop_perf_if.mem_writes)
`UNUSED_VAR (rop_perf_if.mem_latency)
`UNUSED_VAR (rop_perf_if.inactive_cycles)
`UNUSED_VAR (perf_ocache_if.reads)
`UNUSED_VAR (perf_ocache_if.writes)
`UNUSED_VAR (perf_ocache_if.read_misses)

View file

@ -290,6 +290,13 @@ module VX_csr_data #(
`CSR_MPM_TEX_LAT : read_data_r = {`NUM_THREADS{tex_perf_if.mem_latency[31:0]}};
`CSR_MPM_TEX_LAT_H : read_data_r = {`NUM_THREADS{32'(tex_perf_if.mem_latency[`PERF_CTR_BITS-1:32])}};
`endif
`ifdef EXT_RASTER_ENABLE
// PERF: rasterunit
`CSR_MPM_RAS_READS : read_data_r = {`NUM_THREADS{raster_perf_if.mem_reads[31:0]}};
`CSR_MPM_RAS_READS_H : read_data_r = {`NUM_THREADS{32'(raster_perf_if.mem_reads[`PERF_CTR_BITS-1:32])}};
`CSR_MPM_RAS_LAT : read_data_r = {`NUM_THREADS{raster_perf_if.mem_latency[31:0]}};
`CSR_MPM_RAS_LAT_H : read_data_r = {`NUM_THREADS{32'(raster_perf_if.mem_latency[`PERF_CTR_BITS-1:32])}};
`endif
`ifdef EXT_ROP_ENABLE
// PERF: ropunit
`CSR_MPM_ROP_READS : read_data_r = {`NUM_THREADS{rop_perf_if.mem_reads[31:0]}};
@ -298,12 +305,8 @@ module VX_csr_data #(
`CSR_MPM_ROP_WRITES_H : read_data_r = {`NUM_THREADS{32'(rop_perf_if.mem_writes[`PERF_CTR_BITS-1:32])}};
`CSR_MPM_ROP_LAT : read_data_r = {`NUM_THREADS{rop_perf_if.mem_latency[31:0]}};
`CSR_MPM_ROP_LAT_H : read_data_r = {`NUM_THREADS{32'(rop_perf_if.mem_latency[`PERF_CTR_BITS-1:32])}};
`ifdef EXT_RASTER_ENABLE
// PERF: rasterunit
`CSR_MPM_RAS_READS : read_data_r = {`NUM_THREADS{raster_perf_if.mem_reads[31:0]}};
`CSR_MPM_RAS_READS_H : read_data_r = {`NUM_THREADS{32'(raster_perf_if.mem_reads[`PERF_CTR_BITS-1:32])}};
`CSR_MPM_RAS_LAT : read_data_r = {`NUM_THREADS{raster_perf_if.mem_latency[31:0]}};
`CSR_MPM_RAS_LAT_H : read_data_r = {`NUM_THREADS{32'(raster_perf_if.mem_latency[`PERF_CTR_BITS-1:32])}};
`CSR_MPM_ROP_INACTIVE_CYC:read_data_r = {`NUM_THREADS{rop_perf_if.inactive_cycles[31:0]}};
`CSR_MPM_ROP_INACTIVE_CYC_H: read_data_r = {`NUM_THREADS{32'(rop_perf_if.inactive_cycles[`PERF_CTR_BITS-1:32])}};
`endif
// PERF: reserved
`CSR_MPM_RESERVED : read_data_r = '0;

View file

@ -33,7 +33,7 @@ module VX_mem_unit # (
);
`ifdef PERF_ENABLE
VX_perf_cache_if perf_icache_if(), perf_dcache_if(), perf_smem_if();
VX_perf_cache_if perf_icache_if(), perf_dcache_if(), perf_tcache_if(), perf_smem_if();
`endif
///////////////////////////////////////////////////////////////////////////

View file

@ -97,13 +97,16 @@
`define CSR_MPM_ROP_READS_H 12'hB9D
`define CSR_MPM_ROP_WRITES 12'hB1E // rop memory writes
`define CSR_MPM_ROP_WRITES_H 12'hB9E
`define CSR_MPM_ROP_LAT 12'hB1F // rop memory latency
`define CSR_MPM_ROP_LAT_H 12'hB9F
`define CSR_MPM_ROP_LAT 12'hB20 // rop memory latency
`define CSR_MPM_ROP_LAT_H 12'hBA0
`define CSR_MPM_ROP_INACTIVE_CYC 12'hB1F // rop inactive cycles
`define CSR_MPM_ROP_INACTIVE_CYC_H 12'hB9F
// PERF: rasterunit
`define CSR_MPM_RAS_READS 12'hB20 // raster accesses
`define CSR_MPM_RAS_READS_H 12'hBA0
`define CSR_MPM_RAS_LAT 12'hB21 // raster latency
`define CSR_MPM_RAS_LAT_H 12'hBA1
`define CSR_MPM_RAS_READS 12'hB21 // raster accesses
`define CSR_MPM_RAS_READS_H 12'hBA1
`define CSR_MPM_RAS_LAT 12'hB22 // raster latency
`define CSR_MPM_RAS_LAT_H 12'hBA2
// Machine Information Registers
`define CSR_MVENDORID 12'hF11

View file

@ -5,18 +5,20 @@ interface VX_rop_perf_if ();
wire [`PERF_CTR_BITS-1:0] mem_reads;
wire [`PERF_CTR_BITS-1:0] mem_writes;
wire [`PERF_CTR_BITS-1:0] mem_latency;
wire [`PERF_CTR_BITS-1:0] rop_inactive;
wire [`PERF_CTR_BITS-1:0] inactive_cycles;
modport master (
output mem_reads,
output mem_writes,
output mem_latency
output mem_latency,
output inactive_cycles
);
modport slave (
input mem_reads,
input mem_writes,
input mem_latency
input mem_latency,
input inactive_cycles
);
endinterface

View file

@ -41,19 +41,19 @@ module VX_rop_unit #(
);
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_rop_inactive;
reg [`PERF_CTR_BITS-1:0] perf_inactive_cycles;
wire perf_rop_inactive_cycle = ~rop_req_if.valid & rop_req_if.ready;
wire perf_inactive_cycle = ~rop_req_if.valid & rop_req_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_rop_inactive <= 0;
perf_inactive_cycles <= 0;
end else begin
perf_rop_inactive <= perf_rop_inactive + `PERF_CTR_BITS(perf_rop_inactive_cycle);
perf_inactive_cycles <= perf_inactive_cycles + `PERF_CTR_BITS'(perf_inactive_cycle);
end
end
assign rop_perf_if.rop_inactive = perf_rop_inactive;
assign rop_perf_if.inactive_cycles = perf_inactive_cycles;
`endif
endmodule

View file

@ -179,7 +179,7 @@ module VX_tex_unit #(
wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_rsp_per_cycle;
wire [`NUM_THREADS-1:0] perf_mem_req_per_req = cache_req_if.valid & cache_req_if.ready;
wire [`NUM_THREADS-1:0] perf_mem_rsp_per_req = cache_rsp_if.tmask & {`NUM_THREADS{cache_rsp_if.valid & cache_rsp_if.ready}};
wire [`NUM_THREADS-1:0] perf_mem_rsp_per_req = cache_rsp_if.valid & cache_rsp_if.ready;
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_req);
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_req);

94
perf/rop/perf.sh Executable file
View file

@ -0,0 +1,94 @@
#!/bin/bash
# Runs the "rop" app on the rtlsim driver through ./ci/blackbox.sh with --perf
# for several image sizes and records the output in ./perf/rop/rop_perf.log.
# The first argument selects the variant (see usage); with no recognised
# argument the plain test set is run.
# All paths used below are relative to the current working directory.
# exit when any command fails
set -e
# ensure build
make -s
# Plain ROP runs (no extra app flags) for square images of 8..128 pixels.
# Only output lines containing "PERF" are kept. The first run starts a fresh
# log; every later run is appended after a separator line.
simple()
{
    local log=./perf/rop/rop_perf.log
    local dim

    echo "begin rop tests"

    # The smallest size truncates the log so results from earlier runs are dropped.
    CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=rop --args="-rwhitebox_8.png -w8 -h8" --perf | grep 'PERF' > "$log"

    for dim in 16 32 64 128; do
        echo -e "\n**************************************\n" >> "$log"
        CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=rop --args="-rwhitebox_${dim}.png -w${dim} -h${dim}" --perf | grep 'PERF' >> "$log"
    done

    echo "rop tests done!"
}
# ROP runs with the app's -d flag for square images of 8..128 pixels.
# The complete output of every run is written to the log (no filtering).
# The first run starts a fresh log; later runs are appended after a separator.
depth_stencil()
{
    local log=./perf/rop/rop_perf.log
    local dim

    echo "begin rop tests (with depth-stencil)"

    # The smallest size truncates the log so results from earlier runs are dropped.
    CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=rop --args="-rwhitebox_8.png -w8 -h8 -d" --perf > "$log"

    for dim in 16 32 64 128; do
        echo -e "\n**************************************\n" >> "$log"
        CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=rop --args="-rwhitebox_${dim}.png -w${dim} -h${dim} -d" --perf >> "$log"
    done

    echo "rop tests done!"
}
# ROP runs with the app's -b flag for square images of 8..128 pixels.
# The complete output of every run is written to the log (no filtering).
# The first run starts a fresh log; later runs are appended after a separator.
blend()
{
    local log=./perf/rop/rop_perf.log
    local dim

    echo "begin rop tests (with blend)"

    # The smallest size truncates the log so results from earlier runs are dropped.
    CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=rop --args="-rwhitebox_8.png -w8 -h8 -b" --perf > "$log"

    for dim in 16 32 64 128; do
        echo -e "\n**************************************\n" >> "$log"
        CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=rop --args="-rwhitebox_${dim}.png -w${dim} -h${dim} -b" --perf >> "$log"
    done

    echo "rop tests done!"
}
# ROP runs with both the -b and -d app flags for square images of 8..128 pixels.
# The complete output of every run is written to the log (no filtering).
# The first run starts a fresh log; later runs are appended after a separator.
depth_stencil_blend()
{
    local log=./perf/rop/rop_perf.log
    local dim

    echo "begin rop tests (with depth-stencil & blend)"

    # The smallest size truncates the log so results from earlier runs are dropped.
    CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=rop --args="-rwhitebox_8.png -w8 -h8 -b -d" --perf > "$log"

    for dim in 16 32 64 128; do
        echo -e "\n**************************************\n" >> "$log"
        CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=rop --args="-rwhitebox_${dim}.png -w${dim} -h${dim} -b -d" --perf >> "$log"
    done

    echo "rop tests done!"
}
# Prints the one-line summary of accepted command-line options to stdout.
usage()
{
    printf '%s\n' "usage: [-d] [-b] [-db] [-h|--help]"
}
# Select the test variant from the first command-line argument:
#   -d          depth-stencil runs
#   -b          blend runs
#   -db         depth-stencil and blend runs
#   -h, --help  print usage
#   anything else (including no argument) runs the plain test set
case $1 in
    -d )
        depth_stencil
        ;;
    -b )
        blend
        ;;
    -db )
        depth_stencil_blend
        ;;
    -h | --help )
        usage
        ;;
    * )
        simple
        ;;
esac

# `shift` returns a non-zero status when there are no positional parameters.
# Because the script runs under `set -e`, an unguarded `shift` made the
# no-argument invocation exit with a failure status even though every test had
# completed. Only consume the argument when one was actually supplied.
if [ "$#" -gt 0 ]; then
    shift
fi

172
perf/rop/rop_perf.log Normal file
View file

@ -0,0 +1,172 @@
CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE
running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE make -C ./ci/../driver/rtlsim
PERF: instrs=1005, cycles=3685, IPC=0.272727
PERF: ibuffer stalls=14
PERF: scoreboard stalls=421
PERF: alu unit stalls=0
PERF: lsu unit stalls=0
PERF: csr unit stalls=0
PERF: fpu unit stalls=0
PERF: gpu unit stalls=0
PERF: loads=124
PERF: stores=107
PERF: branches=185
PERF: icache reads=685
PERF: icache read misses=38 (hit ratio=94%)
PERF: dcache reads=32
PERF: dcache writes=37
PERF: dcache read misses=7 (hit ratio=78%)
PERF: dcache write misses=32 (hit ratio=13%)
PERF: dcache bank stalls=0 (utilization=100%)
PERF: dcache mshr stalls=0
PERF: smem reads=62
PERF: smem writes=59
PERF: smem bank stalls=0 (utilization=100%)
PERF: memory requests=103 (reads=41, writes=62)
PERF: memory average latency=14 cycles
PERF: tex memory reads=0
PERF: tex memory latency=-2147483648 cycles
PERF: rop memory reads=0
PERF: rop memory writes=64
PERF: rop memory latency=0
PERF: rop inactive cycles=4102
**************************************
CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE
running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE make -C ./ci/../driver/rtlsim
PERF: instrs=2005, cycles=5275, IPC=0.380095
PERF: ibuffer stalls=14
PERF: scoreboard stalls=679
PERF: alu unit stalls=0
PERF: lsu unit stalls=0
PERF: csr unit stalls=0
PERF: fpu unit stalls=0
PERF: gpu unit stalls=0
PERF: loads=124
PERF: stores=107
PERF: branches=385
PERF: icache reads=1185
PERF: icache read misses=38 (hit ratio=96%)
PERF: dcache reads=32
PERF: dcache writes=37
PERF: dcache read misses=7 (hit ratio=78%)
PERF: dcache write misses=32 (hit ratio=13%)
PERF: dcache bank stalls=0 (utilization=100%)
PERF: dcache mshr stalls=0
PERF: smem reads=62
PERF: smem writes=59
PERF: smem bank stalls=0 (utilization=100%)
PERF: memory requests=103 (reads=41, writes=62)
PERF: memory average latency=17 cycles
PERF: tex memory reads=0
PERF: tex memory latency=-2147483648 cycles
PERF: rop memory reads=0
PERF: rop memory writes=256
PERF: rop memory latency=0
PERF: rop inactive cycles=5596
**************************************
CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE
running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE make -C ./ci/../driver/rtlsim
PERF: instrs=5925, cycles=10831, IPC=0.547041
PERF: ibuffer stalls=14
PERF: scoreboard stalls=1675
PERF: alu unit stalls=0
PERF: lsu unit stalls=0
PERF: csr unit stalls=0
PERF: fpu unit stalls=0
PERF: gpu unit stalls=0
PERF: loads=124
PERF: stores=107
PERF: branches=1169
PERF: icache reads=3145
PERF: icache read misses=38 (hit ratio=98%)
PERF: dcache reads=32
PERF: dcache writes=37
PERF: dcache read misses=7 (hit ratio=78%)
PERF: dcache write misses=32 (hit ratio=13%)
PERF: dcache bank stalls=0 (utilization=100%)
PERF: dcache mshr stalls=0
PERF: smem reads=62
PERF: smem writes=59
PERF: smem bank stalls=0 (utilization=100%)
PERF: memory requests=103 (reads=41, writes=62)
PERF: memory average latency=14 cycles
PERF: tex memory reads=0
PERF: tex memory latency=-2147483648 cycles
PERF: rop memory reads=0
PERF: rop memory writes=1024
PERF: rop memory latency=0
PERF: rop inactive cycles=10768
**************************************
CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE
running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE make -C ./ci/../driver/rtlsim
PERF: instrs=21445, cycles=33425, IPC=0.641586
PERF: ibuffer stalls=14
PERF: scoreboard stalls=5587
PERF: alu unit stalls=0
PERF: lsu unit stalls=0
PERF: csr unit stalls=0
PERF: fpu unit stalls=0
PERF: gpu unit stalls=0
PERF: loads=124
PERF: stores=107
PERF: branches=4273
PERF: icache reads=10905
PERF: icache read misses=38 (hit ratio=99%)
PERF: dcache reads=32
PERF: dcache writes=37
PERF: dcache read misses=7 (hit ratio=78%)
PERF: dcache write misses=32 (hit ratio=13%)
PERF: dcache bank stalls=0 (utilization=100%)
PERF: dcache mshr stalls=0
PERF: smem reads=62
PERF: smem writes=59
PERF: smem bank stalls=0 (utilization=100%)
PERF: memory requests=103 (reads=41, writes=62)
PERF: memory average latency=16 cycles
PERF: tex memory reads=0
PERF: tex memory latency=-2147483648 cycles
PERF: rop memory reads=0
PERF: rop memory writes=4096
PERF: rop memory latency=0
PERF: rop inactive cycles=31826
**************************************
CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE
running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DEXT_GFX_ENABLE make -C ./ci/../driver/rtlsim
PERF: instrs=83205, cycles=122919, IPC=0.676909
PERF: ibuffer stalls=14
PERF: scoreboard stalls=21091
PERF: alu unit stalls=0
PERF: lsu unit stalls=0
PERF: csr unit stalls=0
PERF: fpu unit stalls=0
PERF: gpu unit stalls=0
PERF: loads=124
PERF: stores=107
PERF: branches=16625
PERF: icache reads=41785
PERF: icache read misses=38 (hit ratio=99%)
PERF: dcache reads=32
PERF: dcache writes=37
PERF: dcache read misses=7 (hit ratio=78%)
PERF: dcache write misses=32 (hit ratio=13%)
PERF: dcache bank stalls=0 (utilization=100%)
PERF: dcache mshr stalls=0
PERF: smem reads=62
PERF: smem writes=59
PERF: smem bank stalls=0 (utilization=100%)
PERF: memory requests=103 (reads=41, writes=62)
PERF: memory average latency=14 cycles
PERF: tex memory reads=0
PERF: tex memory latency=-2147483648 cycles
PERF: rop memory reads=0
PERF: rop memory writes=16384
PERF: rop memory latency=0
PERF: rop inactive cycles=115176