mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
perf update
This commit is contained in:
parent
649e15c2b3
commit
59ed24dc0b
8 changed files with 40 additions and 134 deletions
|
@ -397,27 +397,27 @@
|
|||
|
||||
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
|
||||
if (count > 1) begin \
|
||||
wire [count-1:0][width-1:0] __reduce_add_i_``field; \
|
||||
wire [width-1:0] __reduce_add_o_``field; \
|
||||
wire [count-1:0][width-1:0] __reduce_add_i_field; \
|
||||
wire [width-1:0] __reduce_add_o_field; \
|
||||
for (genvar __i = 0; __i < count; ++__i) begin \
|
||||
assign __reduce_add_i_``field[__i] = src[__i].``field; \
|
||||
assign __reduce_add_i_field[__i] = src[__i].``field; \
|
||||
end \
|
||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``field ( \
|
||||
__reduce_add_i_``field, \
|
||||
__reduce_add_o_``field \
|
||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
|
||||
__reduce_add_i_field, \
|
||||
__reduce_add_o_field \
|
||||
); \
|
||||
if (reg_enable) begin \
|
||||
reg [width-1:0] __reduce_add_r_``field; \
|
||||
reg [width-1:0] __reduce_add_r_field; \
|
||||
always @(posedge clk) begin \
|
||||
if (reset) begin \
|
||||
__reduce_add_r_``field <= '0; \
|
||||
__reduce_add_r_field <= '0; \
|
||||
end else begin \
|
||||
__reduce_add_r_``field <= __reduce_add_o_``field; \
|
||||
__reduce_add_r_field <= __reduce_add_o_field; \
|
||||
end \
|
||||
end \
|
||||
assign dst.``field = __reduce_add_r_``field; \
|
||||
assign dst.``field = __reduce_add_r_field; \
|
||||
end else begin \
|
||||
assign dst.``field = __reduce_add_o_``field; \
|
||||
assign dst.``field = __reduce_add_o_field; \
|
||||
end \
|
||||
end else begin \
|
||||
assign dst.``field = src[0].``field; \
|
||||
|
|
|
@ -89,7 +89,6 @@ package VX_gpu_pkg;
|
|||
logic [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
logic [`PERF_CTR_BITS-1:0] opd_stalls;
|
||||
logic [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] units_uses;
|
||||
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
|
||||
} issue_perf_t;
|
||||
|
||||
//////////////////////// instruction arguments ////////////////////////////
|
||||
|
|
|
@ -93,23 +93,19 @@
|
|||
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
|
||||
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
|
||||
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
|
||||
`define VX_CSR_MPM_SCRB_WCTL 12'hB0B
|
||||
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8B
|
||||
`define VX_CSR_MPM_SCRB_CSRS 12'hB0C
|
||||
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
|
||||
`define VX_CSR_MPM_OPDS_ST 12'hB0D
|
||||
`define VX_CSR_MPM_OPDS_ST_H 12'hB8D
|
||||
`define VX_CSR_MPM_OPDS_ST 12'hB0B
|
||||
`define VX_CSR_MPM_OPDS_ST_H 12'hB8B
|
||||
// PERF: memory
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0E
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
|
||||
`define VX_CSR_MPM_LOADS 12'hB0F
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB8F
|
||||
`define VX_CSR_MPM_STORES 12'hB10
|
||||
`define VX_CSR_MPM_STORES_H 12'hB90
|
||||
`define VX_CSR_MPM_IFETCH_LT 12'hB11
|
||||
`define VX_CSR_MPM_IFETCH_LT_H 12'hB91
|
||||
`define VX_CSR_MPM_LOAD_LT 12'hB12
|
||||
`define VX_CSR_MPM_LOAD_LT_H 12'hB92
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0C
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8C
|
||||
`define VX_CSR_MPM_LOADS 12'hB0D
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB8D
|
||||
`define VX_CSR_MPM_STORES 12'hB0E
|
||||
`define VX_CSR_MPM_STORES_H 12'hB8E
|
||||
`define VX_CSR_MPM_IFETCH_LT 12'hB1F
|
||||
`define VX_CSR_MPM_IFETCH_LT_H 12'hB9F
|
||||
`define VX_CSR_MPM_LOAD_LT 12'hB10
|
||||
`define VX_CSR_MPM_LOAD_LT_H 12'hB90
|
||||
|
||||
// Machine Performance-monitoring memory counters (class 2) ///////////////////
|
||||
|
||||
|
|
|
@ -224,8 +224,6 @@ import VX_fpu_pkg::*;
|
|||
`endif
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
|
||||
// PERF: memory
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
|
||||
|
|
|
@ -36,10 +36,7 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
end
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
end
|
||||
`endif
|
||||
|
||||
|
|
|
@ -61,7 +61,6 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
.perf_stalls (issue_perf.scb_stalls),
|
||||
.perf_units_uses(issue_perf.units_uses),
|
||||
.perf_sfu_uses (issue_perf.sfu_uses),
|
||||
`endif
|
||||
.writeback_if (writeback_if),
|
||||
.ibuffer_if (ibuffer_if),
|
||||
|
|
|
@ -22,7 +22,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_stalls,
|
||||
output reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_uses,
|
||||
output reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_uses,
|
||||
`endif
|
||||
|
||||
VX_writeback_if.slave writeback_if,
|
||||
|
@ -32,18 +31,13 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
|
||||
|
||||
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
|
||||
reg [PER_ISSUE_WARPS-1:0] operands_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [PER_ISSUE_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
|
||||
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
|
||||
|
||||
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
|
||||
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
|
||||
wire [PER_ISSUE_WARPS-1:0] perf_issue_stalls_per_cycle;
|
||||
wire [`CLOG2(PER_ISSUE_WARPS+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
|
||||
|
||||
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
|
||||
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
|
@ -53,24 +47,20 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
.data_out (perf_units_per_cycle)
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_SFU_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
) perf_sfu_reduce (
|
||||
.data_in (perf_inuse_sfu_per_cycle),
|
||||
.data_out (perf_sfu_per_cycle)
|
||||
);
|
||||
|
||||
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
|
||||
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
|
||||
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
|
||||
|
||||
wire [PER_ISSUE_WARPS-1:0] stg_valid_in;
|
||||
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
|
||||
assign stg_valid_in[i] = staging_if[i].valid;
|
||||
end
|
||||
|
||||
wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready));
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_stalls <= '0;
|
||||
end else begin
|
||||
perf_stalls <= perf_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
|
||||
perf_stalls <= perf_stalls + `PERF_CTR_BITS'(perf_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -83,21 +73,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sfu_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
|
||||
reg [PER_ISSUE_WARPS-1:0] operands_ready;
|
||||
|
||||
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
|
@ -129,49 +106,24 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
|
||||
reg [`SFU_WIDTH-1:0] sfu_type;
|
||||
always @(*) begin
|
||||
case (staging_if[i].data.op_type)
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
|
||||
default: sfu_type = `SFU_WCTL;
|
||||
endcase
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
perf_inuse_units_per_cycle[i] = '0;
|
||||
perf_inuse_sfu_per_cycle[i] = '0;
|
||||
if (staging_if[i].valid) begin
|
||||
if (operands_busy[0]) begin
|
||||
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rd]] = 1;
|
||||
if (inuse_units[staging_if[i].data.rd] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rd]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[1]) begin
|
||||
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs1]] = 1;
|
||||
if (inuse_units[staging_if[i].data.rs1] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs1]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[2]) begin
|
||||
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs2]] = 1;
|
||||
if (inuse_units[staging_if[i].data.rs2] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs2]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[3]) begin
|
||||
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs3]] = 1;
|
||||
if (inuse_units[staging_if[i].data.rs3] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs3]] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
assign perf_issue_stalls_per_cycle[i] = staging_if[i].valid && ~staging_if[i].ready;
|
||||
`endif
|
||||
|
||||
always @(*) begin
|
||||
|
@ -245,9 +197,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
if (staging_fire && staging_if[i].data.wb) begin
|
||||
inuse_units[staging_if[i].data.rd] <= staging_if[i].data.ex_type;
|
||||
if (staging_if[i].data.ex_type == `EX_SFU) begin
|
||||
inuse_sfu[staging_if[i].data.rd] <= sfu_type;
|
||||
end
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
|
|
@ -187,8 +187,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t scrb_fpu = 0;
|
||||
uint64_t scrb_lsu = 0;
|
||||
uint64_t scrb_sfu = 0;
|
||||
uint64_t scrb_wctl = 0;
|
||||
uint64_t scrb_csrs = 0;
|
||||
uint64_t ifetches = 0;
|
||||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
|
@ -281,7 +279,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
}
|
||||
ibuffer_stalls += ibuffer_stalls_per_core;
|
||||
}
|
||||
// issue stalls
|
||||
// scoreboard stalls
|
||||
{
|
||||
uint64_t scrb_stalls_per_core;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
|
||||
|
@ -309,7 +307,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
scrb_sfu += scrb_sfu_per_core;
|
||||
if (num_cores > 1) {
|
||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
|
||||
fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
|
||||
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d, fpu=%d, lsu=%d, sfu=%d)\n", core_id, scrb_stalls_per_core, scrb_percent_per_core,
|
||||
calcAvgPercent(scrb_alu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_fpu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_lsu_per_core, scrb_total),
|
||||
|
@ -317,32 +316,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
}
|
||||
scrb_stalls += scrb_stalls_per_core;
|
||||
}
|
||||
// sfu stalls
|
||||
{
|
||||
uint64_t scrb_sfu_per_core;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_wctl_per_core;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_csrs_per_core;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
|
||||
return err;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core;
|
||||
fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
|
||||
, core_id
|
||||
, scrb_sfu_per_core
|
||||
, calcAvgPercent(scrb_csrs_per_core, sfu_total)
|
||||
, calcAvgPercent(scrb_wctl_per_core, sfu_total)
|
||||
);
|
||||
}
|
||||
scrb_wctl += scrb_wctl_per_core;
|
||||
scrb_csrs += scrb_csrs_per_core;
|
||||
}
|
||||
// operands stalls
|
||||
{
|
||||
uint64_t opds_stalls_per_core;
|
||||
|
@ -567,24 +540,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
|
||||
int sched_stalls_percent = calcAvgPercent(sched_stalls, total_cycles);
|
||||
int ibuffer_percent = calcAvgPercent(ibuffer_stalls, total_cycles);
|
||||
int scrb_percent = calcAvgPercent(scrb_stalls, total_cycles);
|
||||
int opds_percent = calcAvgPercent(opds_stalls, total_cycles);
|
||||
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
|
||||
int load_avg_lat = (int)(double(load_lat) / double(loads));
|
||||
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
|
||||
uint64_t sfu_total = scrb_wctl + scrb_csrs;
|
||||
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
|
||||
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
||||
fprintf(stream, "PERF: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d, fpu=%d, lsu=%d, sfu=%d)\n", scrb_stalls, scrb_percent,
|
||||
calcAvgPercent(scrb_alu, scrb_total),
|
||||
calcAvgPercent(scrb_fpu, scrb_total),
|
||||
calcAvgPercent(scrb_lsu, scrb_total),
|
||||
calcAvgPercent(scrb_sfu, scrb_total));
|
||||
fprintf(stream, "PERF: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
|
||||
, scrb_sfu
|
||||
, calcAvgPercent(scrb_csrs, sfu_total)
|
||||
, calcAvgPercent(scrb_wctl, sfu_total)
|
||||
);
|
||||
fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent);
|
||||
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue