Merge branch 'develop' of https://github.com/vortexgpgpu/vortex into develop

Blaise Tine 2023-12-15 01:14:21 -08:00
commit 4e51544402
13 changed files with 92 additions and 83 deletions


@@ -70,10 +70,10 @@
 `define VX_CSR_MINSTRET 12'hB02
 `define VX_CSR_MINSTRET_H 12'hB82
 // PERF: pipeline
-`define VX_CSR_MPM_SCHED_ST 12'hB03
-`define VX_CSR_MPM_SCHED_ST_H 12'hB83
-`define VX_CSR_MPM_FETCH_ST 12'hB04
-`define VX_CSR_MPM_FETCH_ST_H 12'hB84
+`define VX_CSR_MPM_SCHED_ID 12'hB03
+`define VX_CSR_MPM_SCHED_ID_H 12'hB83
+`define VX_CSR_MPM_SCHED_ST 12'hB04
+`define VX_CSR_MPM_SCHED_ST_H 12'hB84
 `define VX_CSR_MPM_IBUF_ST 12'hB05
 `define VX_CSR_MPM_IBUF_ST_H 12'hB85
 `define VX_CSR_MPM_SCRB_ST 12'hB06
@@ -101,10 +101,10 @@
 `define VX_CSR_MPM_LOADS_H 12'hB90
 `define VX_CSR_MPM_STORES 12'hB11
 `define VX_CSR_MPM_STORES_H 12'hB91
-`define VX_CSR_MPM_IFETCH_LAT 12'hB12
-`define VX_CSR_MPM_IFETCH_LAT_H 12'hB92
-`define VX_CSR_MPM_LOAD_LAT 12'hB13
-`define VX_CSR_MPM_LOAD_LAT_H 12'hB93
+`define VX_CSR_MPM_IFETCH_LT 12'hB12
+`define VX_CSR_MPM_IFETCH_LT_H 12'hB92
+`define VX_CSR_MPM_LOAD_LT 12'hB13
+`define VX_CSR_MPM_LOAD_LT_H 12'hB93
 // Machine Performance-monitoring memory counters
 // PERF: icache
@@ -158,8 +158,8 @@
 `define VX_CSR_MPM_MEM_READS_H 12'hB98
 `define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
 `define VX_CSR_MPM_MEM_WRITES_H 12'hB99
-`define VX_CSR_MPM_MEM_LAT 12'hB1A // memory latency
-`define VX_CSR_MPM_MEM_LAT_H 12'hB9A
+`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
+`define VX_CSR_MPM_MEM_LT_H 12'hB9A
 // PERF: smem
 `define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
 `define VX_CSR_MPM_SMEM_READS_H 12'hB9B
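These counters are 64 bits wide but the CSR space is 32-bit, so each one is exposed as a pair: the low word at the base address and the high word 0x80 above it (e.g. VX_CSR_MPM_MEM_LT at 12'hB1A, VX_CSR_MPM_MEM_LT_H at 12'hB9A). A minimal host-side sketch of reassembling a counter from such a pair; the staging-buffer layout below is an assumption for illustration, not the driver's actual get_csr_64:

    #include <cstdint>

    // Hypothetical layout: one 32-bit word per MPM CSR, linearly indexed
    // from an assumed base address of 0xB00 (illustration only).
    static uint64_t read_mpm_csr64(const uint32_t* staging_buf, int addr) {
        uint32_t lo = staging_buf[addr - 0xB00];        // e.g. VX_CSR_MPM_MEM_LT   (0xB1A)
        uint32_t hi = staging_buf[addr + 0x80 - 0xB00]; // e.g. VX_CSR_MPM_MEM_LT_H (0xB9A)
        return (uint64_t(hi) << 32) | lo;               // reassemble the 64-bit counter
    }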


@@ -181,16 +181,15 @@ module Vortex import VX_gpu_pkg::*; (
         end
     end
+    wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
+    wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
     always @(posedge clk) begin
         if (reset) begin
             mem_perf <= '0;
-        end else begin
-            if (mem_req_fire && ~mem_bus_if.req_data.rw) begin
-                mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1);
-            end
-            if (mem_req_fire && mem_bus_if.req_data.rw) begin
-                mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1);
-            end
+        end else begin
+            mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
+            mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
             mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
         end
     end
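The rewrite folds the two guarded if blocks into unconditional adds of a 0/1 handshake signal, which reads better and synthesizes to the same logic. The same pattern in C++, with illustrative names; note how summing the outstanding-read count every cycle yields total read latency, so latency/reads later gives the average:

    #include <cstdint>

    struct MemPerf { uint64_t reads = 0, writes = 0, latency = 0; };

    // Per-cycle update mirroring the RTL above (names illustrative):
    void update_mem_perf(MemPerf& p, bool rd_fire, bool wr_fire, uint64_t pending_reads) {
        p.reads   += rd_fire;        // +1 only when a read request fires
        p.writes  += wr_fire;        // +1 only when a write request fires
        p.latency += pending_reads;  // one tick per outstanding read per cycle
    }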


@@ -186,11 +186,11 @@ import VX_fpu_pkg::*;
     case (base_dcrs.mpm_class)
     `VX_DCR_MPM_CLASS_CORE: begin
         case (read_addr)
-        // PERF: pipeline
+        // PERF: pipeline
+        `VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
+        `VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
         `VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
-        `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
-        `VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0];
-        `VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]);
+        `VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
         `VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
         `VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
         `VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
@@ -228,10 +228,10 @@ import VX_fpu_pkg::*;
         `VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
         `VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
         `VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
-        `VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
-        `VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
-        `VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
-        `VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
+        `VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
+        `VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
+        `VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
+        `VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
         default:;
         endcase
     end
@@ -295,8 +295,8 @@ import VX_fpu_pkg::*;
         `VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
         `VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
         `VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
-        `VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
-        `VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
+        `VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
+        `VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
         default:;
         endcase
     end
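The `_H` cases hand back bits [`PERF_CTR_BITS-1:32] of the same counter. A one-line C++ analogue of that split (helper names are mine, not the RTL's):

    #include <cstdint>

    // The two 32-bit words a base CSR and its _H companion return:
    inline uint32_t csr_lo(uint64_t ctr) { return uint32_t(ctr & 0xffffffff); }
    inline uint32_t csr_hi(uint64_t ctr) { return uint32_t(ctr >> 32); }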


@@ -156,13 +156,14 @@ module VX_issue #(
 `ifdef PERF_ENABLE
     reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
+    wire decode_stall = decode_if.valid && ~decode_if.ready;
     always @(posedge clk) begin
         if (reset) begin
             perf_ibf_stalls <= '0;
         end else begin
-            if (decode_if.valid && ~decode_if.ready) begin
-                perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
-            end
+            perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
         end
     end
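decode_stall captures the standard valid/ready back-pressure condition: data is offered but not accepted this cycle. A hedged C++ sketch of the same probe, with illustrative names:

    #include <cstdint>

    // A "stall" cycle: the producer asserts valid but the consumer is not ready.
    struct Handshake { bool valid; bool ready; };

    inline void count_stall(uint64_t& stalls, const Handshake& h) {
        stalls += (h.valid && !h.ready);  // adds 0 or 1 per cycle
    }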


@@ -554,7 +554,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
     VX_stream_arb #(
         .NUM_INPUTS (2),
         .DATAW (RSP_ARB_DATAW),
-        .OUT_REG (1)
+        .OUT_REG (2)
     ) rsp_arb (
         .clk (clk),
         .reset (commit_reset),


@@ -381,20 +381,24 @@ module VX_schedule import VX_gpu_pkg::*; #(
     `RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
 `ifdef PERF_ENABLE
+    reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
     reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
-    reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls;
+    wire schedule_idle = ~schedule_valid;
+    wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
     always @(posedge clk) begin
         if (reset) begin
-            perf_sched_stalls <= '0;
-            perf_fetch_stalls <= '0;
+            perf_sched_idles <= '0;
+            perf_sched_stalls <= '0;
         end else begin
-            perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(!schedule_valid);
-            perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_if.valid && !schedule_if.ready);
+            perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
+            perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
         end
     end
-    assign perf_schedule_if.sched_stalls = perf_sched_stalls;
-    assign perf_schedule_if.fetch_stalls = perf_fetch_stalls;
+    assign perf_schedule_if.sched_idles = perf_sched_idles;
+    assign perf_schedule_if.sched_stalls = perf_sched_stalls;
 `endif
 endmodule
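The rename makes the two tracked conditions explicit: an idle cycle, where no warp is ready to schedule (~schedule_valid), versus a stall cycle, where a schedule is offered but the downstream stage is not ready. A small C++ sketch of the distinction, with illustrative names:

    #include <cstdint>

    struct SchedPerf { uint64_t idles = 0, stalls = 0; };

    // Per-cycle scheduler accounting (illustrative, not the RTL's interface):
    void sched_tick(SchedPerf& p, bool schedule_valid, bool out_valid, bool out_ready) {
        p.idles  += !schedule_valid;           // nothing eligible to issue
        p.stalls += (out_valid && !out_ready); // issued but back-pressured
    }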


@@ -196,11 +196,14 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
 `ifdef PERF_ENABLE
     reg [`PERF_CTR_BITS-1:0] perf_wctl_stalls;
+    wire wctl_execute_stall = wctl_execute_if.valid && ~wctl_execute_if.ready;
     always @(posedge clk) begin
         if (reset) begin
             perf_wctl_stalls <= '0;
         end else begin
-            perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_if.valid && ~wctl_execute_if.ready);
+            perf_wctl_stalls <= perf_wctl_stalls + `PERF_CTR_BITS'(wctl_execute_stall);
         end
     end
     assign sfu_perf_if.wctl_stalls = perf_wctl_stalls;


@@ -49,7 +49,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
     wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
     wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
-    wire [LANE_BITS-1:0] tid;
+    wire [`UP(LANE_BITS)-1:0] tid;
     if (LANE_BITS != 0) begin
         assign tid = execute_if.data.tid[0 +: LANE_BITS];
     end else begin
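`UP(x)` clamps a width to at least 1, so the declaration stays legal when LANE_BITS is 0 (a single-lane configuration), where `[LANE_BITS-1:0]` would degenerate to `[-1:0]`; the generate branches then either slice the real tid or tie it off. An illustrative C++ analogue of the same guard:

    #include <cstdint>
    #include <cstddef>

    // Analogue of the `UP() macro: never let a declared size reach zero.
    constexpr std::size_t up(std::size_t n) { return n ? n : 1; }

    // With LANE_BITS == 0 a zero-length array would be ill-formed, so the
    // storage is clamped to one element, mirroring the RTL fix.
    template <std::size_t LANE_BITS>
    struct TidHolder {
        uint32_t tid_bits[up(LANE_BITS)];
    };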


@@ -14,8 +14,8 @@
 `include "VX_define.vh"
 interface VX_pipeline_perf_if ();
+    wire [`PERF_CTR_BITS-1:0] sched_idles;
     wire [`PERF_CTR_BITS-1:0] sched_stalls;
-    wire [`PERF_CTR_BITS-1:0] fetch_stalls;
     wire [`PERF_CTR_BITS-1:0] ibf_stalls;
     wire [`PERF_CTR_BITS-1:0] scb_stalls;
     wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
@@ -28,8 +28,8 @@ interface VX_pipeline_perf_if ();
     wire [`PERF_CTR_BITS-1:0] load_latency;
     modport schedule (
-        output sched_stalls,
-        output fetch_stalls
+        output sched_idles,
+        output sched_stalls
     );
     modport issue (
@@ -40,8 +40,8 @@ interface VX_pipeline_perf_if ();
     );
     modport slave (
+        input sched_idles,
         input sched_stalls,
-        input fetch_stalls,
         input ibf_stalls,
         input scb_stalls,
         input scb_uses,


@@ -175,8 +175,9 @@ static uint64_t get_csr_64(const void* ptr, int addr) {
 extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
     int ret = 0;
-    uint64_t instrs = 0;
-    uint64_t cycles = 0;
+    uint64_t total_instrs = 0;
+    uint64_t total_cycles = 0;
+    uint64_t max_cycles = 0;
 #ifdef PERF_ENABLE
@@ -199,8 +200,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
     auto perf_class = gAutoPerfDump.get_perf_class();
     // PERF: pipeline stalls
-    uint64_t scheduler_stalls = 0;
-    uint64_t fetch_stalls = 0;
+    uint64_t sched_idles = 0;
+    uint64_t sched_stalls = 0;
     uint64_t ibuffer_stalls = 0;
     uint64_t scrb_stalls = 0;
     uint64_t lsu_stalls = 0;
@@ -269,19 +270,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
         switch (perf_class) {
         case VX_DCR_MPM_CLASS_CORE: {
             // PERF: pipeline
-            // schedule stalls
+            // scheduler idles
             {
-                uint64_t scheduler_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
-                int scheduler_percent_per_core = calcAvgPercent(scheduler_stalls_per_core, cycles_per_core);
-                if (num_cores > 1) fprintf(stream, "PERF: core%d: schedule stalls=%ld (%d%%)\n", core_id, scheduler_stalls_per_core, scheduler_percent_per_core);
-                scheduler_stalls += scheduler_stalls_per_core;
+                uint64_t sched_idles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ID);
+                int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
+                if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler idles=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
+                sched_idles += sched_idles_per_core;
             }
-            // fetch stalls
+            // scheduler stalls
             {
-                uint64_t fetch_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FETCH_ST);
-                int fetch_percent_per_core = calcAvgPercent(fetch_stalls_per_core, cycles_per_core);
-                if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch stalls=%ld (%d%%)\n", core_id, fetch_stalls_per_core, fetch_percent_per_core);
-                fetch_stalls += fetch_stalls_per_core;
+                uint64_t sched_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
+                int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
+                if (num_cores > 1) fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
+                sched_stalls += sched_stalls_per_core;
             }
             // ibuffer_stalls
             {
@@ -340,7 +341,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
                 if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
                 ifetches += ifetches_per_core;
-                uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
+                uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LT);
                 int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
                 if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
                 ifetch_lat += ifetch_lat_per_core;
@@ -351,7 +352,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
                 if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
                 loads += loads_per_core;
-                uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
+                uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LT);
                 int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
                 if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
                 load_lat += load_lat_per_core;
@@ -431,7 +432,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
             // PERF: memory
             mem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_READS);
             mem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_WRITES);
-            mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LAT);
+            mem_lat = get_csr_64(staging_buf.data(), VX_CSR_MPM_MEM_LT);
         }
         } break;
         default:
@@ -441,21 +442,22 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
         float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
         if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
-        instrs += instrs_per_core;
-        cycles = std::max<uint64_t>(cycles_per_core, cycles);
+        total_instrs += instrs_per_core;
+        total_cycles += cycles_per_core;
+        max_cycles = std::max<uint64_t>(cycles_per_core, max_cycles);
     }
 #ifdef PERF_ENABLE
     switch (perf_class) {
     case VX_DCR_MPM_CLASS_CORE: {
-        int scheduler_percent = calcAvgPercent(scheduler_stalls, cycles);
-        int fetch_percent = calcAvgPercent(fetch_stalls, cycles);
-        int ibuffer_percent = calcAvgPercent(ibuffer_stalls, cycles);
+        int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
+        int sched_stalls_percent = calcAvgPercent(sched_stalls, total_cycles);
+        int ibuffer_percent = calcAvgPercent(ibuffer_stalls, total_cycles);
         int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
         int load_avg_lat = (int)(double(load_lat) / double(loads));
         uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
-        fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", scheduler_stalls, scheduler_percent);
-        fprintf(stream, "PERF: fetch stalls=%ld (%d%%)\n", fetch_stalls, fetch_percent);
+        fprintf(stream, "PERF: scheduler idles=%ld (%d%%)\n", sched_idles, sched_idles_percent);
+        fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
         fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
         fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
             calcAvgPercent(scrb_alu, scrb_total),
@@ -514,8 +516,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
     }
 #endif
-    float IPC = (float)(double(instrs) / double(cycles));
-    fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
+    float IPC = (float)(double(total_instrs) / double(max_cycles));
+    fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
     fflush(stream);
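Aggregation now keeps two denominators: total_cycles (the sum over cores) for stall and idle percentages, and max_cycles (the slowest core) for IPC, since cores execute concurrently and the wall-clock length of the run is set by the last core to finish. A sketch of the scheme; calcAvgPercent is assumed here, as its definition is not part of this diff:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Assumed helper: the percentage of `total` that `part` represents.
    static int calcAvgPercent(uint64_t part, uint64_t total) {
        return total ? int(100.0 * double(part) / double(total)) : 0;
    }

    struct CoreSample { uint64_t instrs, cycles, sched_stalls; };

    void aggregate(const std::vector<CoreSample>& cores, FILE* stream) {
        uint64_t total_instrs = 0, total_cycles = 0, max_cycles = 0, sched_stalls = 0;
        for (const auto& c : cores) {
            total_instrs += c.instrs;
            total_cycles += c.cycles;                    // denominator for percentages
            max_cycles = std::max(max_cycles, c.cycles); // wall-clock: cores overlap
            sched_stalls += c.sched_stalls;
        }
        fprintf(stream, "IPC=%f, sched stalls=%d%%\n",
                double(total_instrs) / double(max_cycles),
                calcAvgPercent(sched_stalls, total_cycles));
    }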


@@ -167,7 +167,7 @@ void Core::schedule() {
         }
     }
     if (scheduled_warp == -1) {
-        ++perf_stats_.sched_stalls;
+        ++perf_stats_.sched_idles;
         return;
     }
@@ -548,10 +548,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
         break;
     case VX_DCR_MPM_CLASS_CORE: {
         switch (addr) {
+        case VX_CSR_MPM_SCHED_ID: return perf_stats_.sched_idles & 0xffffffff;
+        case VX_CSR_MPM_SCHED_ID_H:return perf_stats_.sched_idles >> 32;
         case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
         case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
-        case VX_CSR_MPM_FETCH_ST: return perf_stats_.fetch_stalls & 0xffffffff;
-        case VX_CSR_MPM_FETCH_ST_H:return perf_stats_.fetch_stalls >> 32;
         case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
         case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
         case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
@@ -579,10 +579,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
         case VX_CSR_MPM_LOADS_H: return perf_stats_.loads >> 32;
         case VX_CSR_MPM_STORES: return perf_stats_.stores & 0xffffffff;
        case VX_CSR_MPM_STORES_H: return perf_stats_.stores >> 32;
-        case VX_CSR_MPM_IFETCH_LAT: return perf_stats_.ifetch_latency & 0xffffffff;
-        case VX_CSR_MPM_IFETCH_LAT_H: return perf_stats_.ifetch_latency >> 32;
-        case VX_CSR_MPM_LOAD_LAT: return perf_stats_.load_latency & 0xffffffff;
-        case VX_CSR_MPM_LOAD_LAT_H: return perf_stats_.load_latency >> 32;
+        case VX_CSR_MPM_IFETCH_LT: return perf_stats_.ifetch_latency & 0xffffffff;
+        case VX_CSR_MPM_IFETCH_LT_H: return perf_stats_.ifetch_latency >> 32;
+        case VX_CSR_MPM_LOAD_LT: return perf_stats_.load_latency & 0xffffffff;
+        case VX_CSR_MPM_LOAD_LT_H: return perf_stats_.load_latency >> 32;
        }
    } break;
    case VX_DCR_MPM_CLASS_MEM: {
@@ -638,8 +638,8 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
        case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
        case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
        case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
-        case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff;
-        case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32;
+        case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
+        case VX_CSR_MPM_MEM_LT_H: return proc_perf.mem_latency >> 32;
        case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
        case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;


@@ -49,8 +49,8 @@ public:
    struct PerfStats {
        uint64_t cycles;
        uint64_t instrs;
+        uint64_t sched_idles;
        uint64_t sched_stalls;
-        uint64_t fetch_stalls;
        uint64_t ibuf_stalls;
        uint64_t scrb_stalls;
        uint64_t alu_stalls;
@@ -70,8 +70,8 @@ public:
    PerfStats()
        : cycles(0)
        , instrs(0)
+        , sched_idles(0)
        , sched_stalls(0)
-        , fetch_stalls(0)
        , ibuf_stalls(0)
        , scrb_stalls(0)
        , alu_stalls(0)


@@ -37,10 +37,10 @@ run-simx:
 	$(MAKE) -C blackscholes run-simx
 	$(MAKE) -C transpose run-simx
 	$(MAKE) -C convolution run-simx
-	$(MAKE) -C cutcp run-simx
-	$(MAKE) -C sgemm2 run-simx
+	$(MAKE) -C cutcp run-simx
+	$(MAKE) -C vectorhypot run-simx
 	$(MAKE) -C mri-q run-simx
+	# $(MAKE) -C sgemm2 run-simx
 run-rtlsim:
 	$(MAKE) -C vecadd run-rtlsim