perf counters refactoring + uuid fixes

This commit is contained in:
Blaise Tine 2022-07-30 17:05:41 -07:00
parent 47847ec920
commit 611e76ae0e
27 changed files with 522 additions and 292 deletions

View file

@ -49,7 +49,7 @@ localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8;
localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE);
localparam AVS_RD_QUEUE_SIZE = 4;
localparam AVS_RD_QUEUE_SIZE = 64;
localparam _VX_MEM_TAG_WIDTH = `VX_MEM_TAG_WIDTH;
localparam _AVS_REQ_TAGW_VX = _VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
localparam _AVS_REQ_TAGW_VX2 = `MAX(_VX_MEM_TAG_WIDTH, _AVS_REQ_TAGW_VX);

View file

@ -312,14 +312,14 @@
`define NUM_TEX_UNITS `UP(`NUM_CORES / 8)
`endif
// Size of texture Request Queue
// Size of texture Request Queue (Quad=4)
`ifndef TEX_REQ_QUEUE_SIZE
`define TEX_REQ_QUEUE_SIZE `MAX(2, `NUM_WARPS * 4)
`endif
// Texture Unit memory pending Queue
`ifndef TEX_MEM_QUEUE_SIZE
`define TEX_MEM_QUEUE_SIZE `MAX(2, `NUM_WARPS * 4)
`define TEX_MEM_QUEUE_SIZE (`TEX_REQ_QUEUE_SIZE * 2)
`endif
// Raster Units ////////////////////////////////////////////////////////////////
@ -407,7 +407,7 @@
// Miss Handling Register Size
`ifndef ICACHE_MSHR_SIZE
`define ICACHE_MSHR_SIZE `CLAMP(`NUM_WARPS * `UP(`NUM_CORES / `UP(`NUM_ICACHES)), 2, 16)
`define ICACHE_MSHR_SIZE 16
`endif
// Memory Request Queue Size
@ -471,7 +471,7 @@
// Miss Handling Register Size
`ifndef DCACHE_MSHR_SIZE
`define DCACHE_MSHR_SIZE `CLAMP(`LSUQ_SIZE * `UP(`NUM_CORES / `UP(`NUM_DCACHES)), 2, 16)
`define DCACHE_MSHR_SIZE 32
`endif
// Memory Request Queue Size
@ -562,7 +562,7 @@
// Miss Handling Register Size
`ifndef TCACHE_MSHR_SIZE
`define TCACHE_MSHR_SIZE (8 * 8)
`define TCACHE_MSHR_SIZE 32
`endif
// Memory Request Queue Size
@ -626,7 +626,7 @@
// Miss Handling Register Size
`ifndef RCACHE_MSHR_SIZE
`define RCACHE_MSHR_SIZE 8
`define RCACHE_MSHR_SIZE 16
`endif
// Memory Request Queue Size
@ -690,7 +690,7 @@
// Miss Handling Register Size
`ifndef OCACHE_MSHR_SIZE
`define OCACHE_MSHR_SIZE (8 * 8)
`define OCACHE_MSHR_SIZE 32
`endif
// Memory Request Queue Size
@ -741,7 +741,7 @@
// Miss Handling Register Size
`ifndef L2_MSHR_SIZE
`define L2_MSHR_SIZE 16
`define L2_MSHR_SIZE 64
`endif
// Memory Request Queue Size
@ -792,7 +792,7 @@
// Miss Handling Register Size
`ifndef L3_MSHR_SIZE
`define L3_MSHR_SIZE 16
`define L3_MSHR_SIZE 64
`endif
// Memory Request Queue Size

View file

@ -129,10 +129,7 @@ module VX_core #(
.CORE_ID(CORE_ID)
) decode (
.clk (clk),
.reset (decode_reset),
`ifdef PERF_ENABLE
.perf_decode_if (perf_pipeline_if.decode),
`endif
.reset (decode_reset),
.ifetch_rsp_if (ifetch_rsp_if),
.decode_if (decode_if),
.wrelease_if (wrelease_if),
@ -174,7 +171,7 @@ module VX_core #(
.base_dcrs (base_dcrs),
`ifdef PERF_ENABLE
.perf_memsys_if (perf_memsys_if),
.perf_memsys_if (perf_memsys_if),
.perf_pipeline_if(perf_pipeline_if),
`endif
@ -252,5 +249,79 @@ module VX_core #(
.sim_wb_value (sim_wb_value)
);
`ifdef PERF_ENABLE

    // Core-level fetch/load/store performance counters.
    //
    // Every cycle the number of icache/dcache requests and responses that
    // complete a valid/ready handshake is counted. From those counts this
    // section maintains:
    //   - running totals of instruction fetches, loads and stores;
    //   - the number of reads currently outstanding on each cache port;
    //   - latency accumulators that add the outstanding-read count each cycle.
    // NOTE(review): the latency accumulators are presumably divided by the
    // matching request total in software to obtain an average latency - confirm.

    // Per-cycle handshake counts (population count of the fire vectors below).
    wire [$clog2(ICACHE_NUM_REQS+1)-1:0] perf_icache_req_per_cycle;
    wire [$clog2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
    wire [$clog2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;

    wire [$clog2(ICACHE_NUM_REQS+1)-1:0] perf_icache_rsp_per_cycle;
    wire [$clog2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;

    // Per-cycle (requests - responses) delta; one bit wider than the counts
    // so the difference can be interpreted as a signed value.
    wire [$clog2(ICACHE_NUM_REQS+1)+1-1:0] perf_icache_pending_read_cycle;
    wire [$clog2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;

    // Reads issued but not yet answered.
    reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
    reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;

    // Running totals.
    reg [`PERF_CTR_BITS-1:0] perf_ifetches;
    reg [`PERF_CTR_BITS-1:0] perf_loads;
    reg [`PERF_CTR_BITS-1:0] perf_stores;

    // A request/response "fires" on each lane where valid and ready are both set.
    wire [ICACHE_NUM_REQS-1:0] perf_icache_req_fire = icache_req_if.valid & icache_req_if.ready;
    wire [ICACHE_NUM_REQS-1:0] perf_icache_rsp_fire = icache_rsp_if.valid & icache_rsp_if.ready;

    // Dcache requests are split into reads (rw == 0) and writes (rw == 1).
    wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire = dcache_req_if.valid & ~dcache_req_if.rw & dcache_req_if.ready;
    wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire = dcache_req_if.valid & dcache_req_if.rw & dcache_req_if.ready;
    wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire = dcache_rsp_if.valid & dcache_rsp_if.ready;

    `POP_COUNT(perf_icache_req_per_cycle, perf_icache_req_fire);
    `POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire);
    `POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire);

    `POP_COUNT(perf_icache_rsp_per_cycle, perf_icache_rsp_fire);
    `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);

    assign perf_icache_pending_read_cycle = perf_icache_req_per_cycle - perf_icache_rsp_per_cycle;
    assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;

    // Track outstanding reads: the per-cycle delta is sign-extended to the
    // counter width so a negative delta decrements the counter.
    always @(posedge clk) begin
        if (reset) begin
            perf_icache_pending_reads <= 0;
            perf_dcache_pending_reads <= 0;
        end else begin
            perf_icache_pending_reads <= perf_icache_pending_reads + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
            perf_dcache_pending_reads <= perf_dcache_pending_reads + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
        end
    end

    // Latency accumulators.
    reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
    reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;

    // Accumulate request totals and add the current outstanding-read count
    // to the latency accumulators on every cycle.
    always @(posedge clk) begin
        if (reset) begin
            perf_ifetches   <= 0;
            perf_loads      <= 0;
            perf_stores     <= 0;
            perf_icache_lat <= 0;
            perf_dcache_lat <= 0;
        end else begin
            perf_ifetches   <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_per_cycle);
            perf_loads      <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
            perf_stores     <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
            perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
            perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
        end
    end

    // Export the counters; each interface signal is driven exactly once.
    assign perf_pipeline_if.ifetches       = perf_ifetches;
    assign perf_pipeline_if.loads          = perf_loads;
    assign perf_pipeline_if.stores         = perf_stores;
    assign perf_pipeline_if.ifetch_latency = perf_icache_lat;
    assign perf_pipeline_if.load_latency   = perf_dcache_lat;

`endif
endmodule

View file

@ -161,32 +161,41 @@ module VX_csr_data #(
`DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`CSR_MPM_IBUF_ST : read_data_ro_r = perf_pipeline_if.ibf_stalls[31:0];
`CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(perf_pipeline_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SCRB_ST : read_data_ro_r = perf_pipeline_if.scb_stalls[31:0];
`CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(perf_pipeline_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_ALU_ST : read_data_ro_r = perf_pipeline_if.alu_stalls[31:0];
`CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.alu_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_LSU_ST : read_data_ro_r = perf_pipeline_if.lsu_stalls[31:0];
`CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_CSR_ST : read_data_ro_r = perf_pipeline_if.csr_stalls[31:0];
`CSR_MPM_CSR_ST_H : read_data_ro_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_IBUF_ST : read_data_ro_r = perf_pipeline_if.ibf_stalls[31:0];
`CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(perf_pipeline_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SCRB_ST : read_data_ro_r = perf_pipeline_if.scb_stalls[31:0];
`CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(perf_pipeline_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_ALU_ST : read_data_ro_r = perf_pipeline_if.alu_stalls[31:0];
`CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.alu_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_LSU_ST : read_data_ro_r = perf_pipeline_if.lsu_stalls[31:0];
`CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_CSR_ST : read_data_ro_r = perf_pipeline_if.csr_stalls[31:0];
`CSR_MPM_CSR_ST_H : read_data_ro_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
`ifdef EXT_F_ENABLE
`CSR_MPM_FPU_ST : read_data_ro_r = perf_pipeline_if.fpu_stalls[31:0];
`CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_FPU_ST : read_data_ro_r = perf_pipeline_if.fpu_stalls[31:0];
`CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
`else
`CSR_MPM_FPU_ST : read_data_ro_r = '0;
`CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
`CSR_MPM_FPU_ST : read_data_ro_r = '0;
`CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
`endif
`CSR_MPM_GPU_ST : read_data_ro_r = perf_pipeline_if.gpu_stalls[31:0];
`CSR_MPM_GPU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
// PERF: decode
`CSR_MPM_LOADS : read_data_ro_r = perf_pipeline_if.loads[31:0];
`CSR_MPM_LOADS_H : read_data_ro_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_STORES : read_data_ro_r = perf_pipeline_if.stores[31:0];
`CSR_MPM_STORES_H : read_data_ro_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]);
`CSR_MPM_BRANCHES : read_data_ro_r = perf_pipeline_if.branches[31:0];
`CSR_MPM_BRANCHES_H : read_data_ro_r = 32'(perf_pipeline_if.branches[`PERF_CTR_BITS-1:32]);
`CSR_MPM_GPU_ST : read_data_ro_r = perf_pipeline_if.gpu_stalls[31:0];
`CSR_MPM_GPU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
// PERF: memory
`CSR_MPM_IFETCHES : read_data_ro_r = perf_pipeline_if.ifetches[31:0];
`CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(perf_pipeline_if.ifetches[`PERF_CTR_BITS-1:32]);
`CSR_MPM_LOADS : read_data_ro_r = perf_pipeline_if.loads[31:0];
`CSR_MPM_LOADS_H : read_data_ro_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_STORES : read_data_ro_r = perf_pipeline_if.stores[31:0];
`CSR_MPM_STORES_H : read_data_ro_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]);
`CSR_MPM_IFETCH_LAT : read_data_ro_r = perf_pipeline_if.ifetch_latency[31:0];
`CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(perf_pipeline_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`CSR_MPM_LOAD_LAT : read_data_ro_r = perf_pipeline_if.load_latency[31:0];
`CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(perf_pipeline_if.load_latency[`PERF_CTR_BITS-1:32]);
default:;
endcase
end
`DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`CSR_MPM_ICACHE_READS : read_data_ro_r = perf_memsys_if.icache_reads[31:0];
`CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]);
@ -206,22 +215,45 @@ module VX_csr_data #(
`CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = perf_memsys_if.dcache_mshr_stalls[31:0];
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem
`CSR_MPM_SMEM_READS : read_data_ro_r = perf_memsys_if.smem_reads[31:0];
`CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_WRITES : read_data_ro_r = perf_memsys_if.smem_writes[31:0];
`CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_BANK_ST : read_data_ro_r = perf_memsys_if.smem_bank_stalls[31:0];
`CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_READS : read_data_ro_r = perf_memsys_if.smem_reads[31:0];
`CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_WRITES : read_data_ro_r = perf_memsys_if.smem_writes[31:0];
`CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_BANK_ST : read_data_ro_r = perf_memsys_if.smem_bank_stalls[31:0];
`CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l2cache
`CSR_MPM_L2CACHE_READS : read_data_ro_r = perf_memsys_if.l2cache_reads[31:0];
`CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L2CACHE_WRITES : read_data_ro_r = perf_memsys_if.l2cache_writes[31:0];
`CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = perf_memsys_if.l2cache_read_misses[31:0];
`CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_read_misses[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = perf_memsys_if.l2cache_write_misses[31:0];
`CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_write_misses[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = perf_memsys_if.l2cache_bank_stalls[31:0];
`CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_bank_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = perf_memsys_if.l2cache_mshr_stalls[31:0];
`CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l3cache
`CSR_MPM_L3CACHE_READS : read_data_ro_r = perf_memsys_if.l3cache_reads[31:0];
`CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L3CACHE_WRITES : read_data_ro_r = perf_memsys_if.l3cache_writes[31:0];
`CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = perf_memsys_if.l3cache_read_misses[31:0];
`CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_read_misses[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = perf_memsys_if.l3cache_write_misses[31:0];
`CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_write_misses[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = perf_memsys_if.l3cache_bank_stalls[31:0];
`CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_bank_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = perf_memsys_if.l3cache_mshr_stalls[31:0];
`CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: memory
`CSR_MPM_MEM_READS : read_data_ro_r = perf_memsys_if.mem_reads[31:0];
`CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_MEM_WRITES : read_data_ro_r = perf_memsys_if.mem_writes[31:0];
`CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_MEM_LAT : read_data_ro_r = perf_memsys_if.mem_latency[31:0];
`CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
// PERF: wctl
`CSR_MPM_WCTL_ISSUE_ST : read_data_ro_r = perf_gpu_if.wctl_stalls[31:0];
`CSR_MPM_WCTL_ISSUE_ST_H : read_data_ro_r = 32'(perf_gpu_if.wctl_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_MEM_READS : read_data_ro_r = perf_memsys_if.mem_reads[31:0];
`CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_MEM_WRITES : read_data_ro_r = perf_memsys_if.mem_writes[31:0];
`CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_MEM_LAT : read_data_ro_r = perf_memsys_if.mem_latency[31:0];
`CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
default:;
endcase
end
@ -327,6 +359,8 @@ module VX_csr_data #(
wire [`PERF_CTR_BITS-1:0] perf_imadd_stalls = perf_gpu_if.imadd_stalls;
`UNUSED_VAR (perf_imadd_stalls);
`endif
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = perf_gpu_if.wctl_stalls;
`UNUSED_VAR (perf_wctl_stalls);
`endif
`ifdef EXT_F_ENABLE

View file

@ -18,10 +18,6 @@ module VX_decode #(
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_perf_pipeline_if.decode perf_decode_if,
`endif
// inputs
VX_ifetch_rsp_if.slave ifetch_rsp_if,
@ -488,42 +484,6 @@ module VX_decode #(
assign ifetch_rsp_if.ibuf_pop = decode_if.ibuf_pop;
assign ifetch_rsp_if.ready = decode_if.ready;
`ifdef PERF_ENABLE
wire [$clog2(`NUM_THREADS+1)-1:0] perf_loads_per_cycle;
wire [$clog2(`NUM_THREADS+1)-1:0] perf_stores_per_cycle;
wire [$clog2(`NUM_THREADS+1)-1:0] perf_branches_per_cycle;
wire [`NUM_THREADS-1:0] perf_loads_per_thread = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && decode_if.wb}};
wire [`NUM_THREADS-1:0] perf_stores_per_thread = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && ~decode_if.wb}};
wire [`NUM_THREADS-1:0] perf_branches_per_thread = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_ALU && `INST_ALU_IS_BR(decode_if.op_mod)}};
`POP_COUNT(perf_loads_per_cycle, perf_loads_per_thread);
`POP_COUNT(perf_stores_per_cycle, perf_stores_per_thread);
`POP_COUNT(perf_branches_per_cycle, perf_branches_per_thread);
reg [`PERF_CTR_BITS-1:0] perf_loads;
reg [`PERF_CTR_BITS-1:0] perf_stores;
reg [`PERF_CTR_BITS-1:0] perf_branches;
always @(posedge clk) begin
if (reset) begin
perf_loads <= 0;
perf_stores <= 0;
perf_branches <= 0;
end else begin
if (decode_if.valid && decode_if.ready) begin
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_loads_per_cycle);
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_stores_per_cycle);
perf_branches <= perf_branches + `PERF_CTR_BITS'(perf_branches_per_cycle);
end
end
end
assign perf_decode_if.loads = perf_loads;
assign perf_decode_if.stores = perf_stores;
assign perf_decode_if.branches = perf_branches;
`endif
`ifdef DBG_TRACE_CORE_PIPELINE
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin

View file

@ -65,7 +65,7 @@ module VX_fetch #(
`SCOPE_BIND_VX_fetch_icache_stage
.clk (clk),
.reset (reset),
.reset (reset),
.icache_rsp_if (icache_rsp_if),
.icache_req_if (icache_req_if),

View file

@ -12,7 +12,7 @@ module VX_icache_stage #(
input wire clk,
input wire reset,
// Icache interface
VX_cache_req_if.master icache_req_if,
VX_cache_rsp_if.slave icache_rsp_if,

View file

@ -205,7 +205,6 @@ module VX_lsu_unit #(
.rsp_tag (mem_rsp_tag),
.rsp_eop (mem_rsp_eop),
.rsp_ready (mem_rsp_ready),
`UNUSED_PIN (write_notify),
// Memory request
.mem_req_valid (cache_req_tmp_if.valid),

View file

@ -583,41 +583,32 @@ module VX_mem_unit # (
assign perf_memsys_if.smem_bank_stalls = 0;
`endif
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
always @(posedge clk) begin
if (reset) begin
perf_mem_pending_reads <= 0;
end else begin
perf_mem_pending_reads <= perf_mem_pending_reads +
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw))));
end
end
`ifdef L2_ENABLE
assign perf_memsys_if.l2cache_reads = perf_l2cache_if.reads;
assign perf_memsys_if.l2cache_writes = perf_l2cache_if.writes;
assign perf_memsys_if.l2cache_read_misses = perf_l2cache_if.read_misses;
assign perf_memsys_if.l2cache_write_misses= perf_l2cache_if.write_misses;
assign perf_memsys_if.l2cache_bank_stalls = perf_l2cache_if.bank_stalls;
assign perf_memsys_if.l2cache_mshr_stalls = perf_l2cache_if.mshr_stalls;
`else
assign perf_memsys_if.l2cache_reads = 0;
assign perf_memsys_if.l2cache_writes = 0;
assign perf_memsys_if.l2cache_read_misses = 0;
assign perf_memsys_if.l2cache_write_misses= 0;
assign perf_memsys_if.l2cache_bank_stalls = 0;
assign perf_memsys_if.l2cache_mshr_stalls = 0;
`endif
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
assign perf_memsys_if.l3cache_reads = 0;
assign perf_memsys_if.l3cache_writes = 0;
assign perf_memsys_if.l3cache_read_misses = 0;
assign perf_memsys_if.l3cache_write_misses= 0;
assign perf_memsys_if.l3cache_bank_stalls = 0;
assign perf_memsys_if.l3cache_mshr_stalls = 0;
always @(posedge clk) begin
if (reset) begin
perf_mem_reads <= 0;
perf_mem_writes <= 0;
perf_mem_lat <= 0;
end else begin
if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(1);
end
if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'(1);
end
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
end
end
assign perf_memsys_if.mem_reads = perf_mem_reads;
assign perf_memsys_if.mem_writes = perf_mem_writes;
assign perf_memsys_if.mem_latency = perf_mem_lat;
assign perf_memsys_if.mem_reads = 0;
assign perf_memsys_if.mem_writes = 0;
assign perf_memsys_if.mem_latency = 0;
`endif

View file

@ -18,9 +18,10 @@
`define DCR_MPM_CLASS_NONE 0
`define DCR_MPM_CLASS_CORE 1
`define DCR_MPM_CLASS_TEX 2
`define DCR_MPM_CLASS_RASTER 3
`define DCR_MPM_CLASS_ROP 4
`define DCR_MPM_CLASS_MEM 2
`define DCR_MPM_CLASS_TEX 3
`define DCR_MPM_CLASS_RASTER 4
`define DCR_MPM_CLASS_ROP 5
// User Floating-Point CSRs
@ -68,48 +69,77 @@
`define CSR_MPM_FPU_ST_H 12'hB88
`define CSR_MPM_GPU_ST 12'hB09
`define CSR_MPM_GPU_ST_H 12'hB89
// PERF: decode
`define CSR_MPM_LOADS 12'hB0A
`define CSR_MPM_LOADS_H 12'hB8A
`define CSR_MPM_STORES 12'hB0B
`define CSR_MPM_STORES_H 12'hB8B
`define CSR_MPM_BRANCHES 12'hB0C
`define CSR_MPM_BRANCHES_H 12'hB8C
// PERF: icache
`define CSR_MPM_ICACHE_READS 12'hB0D // total reads
`define CSR_MPM_ICACHE_READS_H 12'hB8D
`define CSR_MPM_ICACHE_MISS_R 12'hB0E // read misses
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8E
// PERF: dcache
`define CSR_MPM_DCACHE_READS 12'hB0F // total reads
`define CSR_MPM_DCACHE_READS_H 12'hB8F
`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes
`define CSR_MPM_DCACHE_WRITES_H 12'hB90
`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses
`define CSR_MPM_DCACHE_MISS_R_H 12'hB91
`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses
`define CSR_MPM_DCACHE_MISS_W_H 12'hB92
`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93
`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94
// PERF: smem
`define CSR_MPM_SMEM_READS 12'hB15 // total reads
`define CSR_MPM_SMEM_READS_H 12'hB95
`define CSR_MPM_SMEM_WRITES 12'hB16 // total writes
`define CSR_MPM_SMEM_WRITES_H 12'hB96
`define CSR_MPM_SMEM_BANK_ST 12'hB17 // bank conflicts
`define CSR_MPM_SMEM_BANK_ST_H 12'hB97
// PERF: memory
`define CSR_MPM_MEM_READS 12'hB18 // memory reads
`define CSR_MPM_MEM_READS_H 12'hB98
`define CSR_MPM_MEM_WRITES 12'hB19 // memory writes
`define CSR_MPM_MEM_WRITES_H 12'hB99
`define CSR_MPM_MEM_LAT 12'hB1A // memory latency
`define CSR_MPM_MEM_LAT_H 12'hB9A
// PERF: wctl
`define CSR_MPM_WCTL_ISSUE_ST 12'hB1B // issue stalls
`define CSR_MPM_WCTL_ISSUE_ST_H 12'hB9B
`define CSR_MPM_IFETCHES 12'hB0A
`define CSR_MPM_IFETCHES_H 12'hB8A
`define CSR_MPM_LOADS 12'hB0B
`define CSR_MPM_LOADS_H 12'hB8B
`define CSR_MPM_STORES 12'hB0C
`define CSR_MPM_STORES_H 12'hB8C
`define CSR_MPM_IFETCH_LAT 12'hB0D
`define CSR_MPM_IFETCH_LAT_H 12'hB8D
`define CSR_MPM_LOAD_LAT 12'hB0E
`define CSR_MPM_LOAD_LAT_H 12'hB8E
// Machine Performance-monitoring memory counters
// PERF: icache
`define CSR_MPM_ICACHE_READS 12'hB03 // total reads
`define CSR_MPM_ICACHE_READS_H 12'hB83
`define CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
`define CSR_MPM_ICACHE_MISS_R_H 12'hB84
// PERF: dcache
`define CSR_MPM_DCACHE_READS 12'hB05 // total reads
`define CSR_MPM_DCACHE_READS_H 12'hB85
`define CSR_MPM_DCACHE_WRITES 12'hB06 // total writes
`define CSR_MPM_DCACHE_WRITES_H 12'hB86
`define CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses
`define CSR_MPM_DCACHE_MISS_R_H 12'hB87
`define CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses
`define CSR_MPM_DCACHE_MISS_W_H 12'hB88
`define CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB89
`define CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A
// PERF: smem
`define CSR_MPM_SMEM_READS 12'hB0B // memory reads
`define CSR_MPM_SMEM_READS_H 12'hB8B
`define CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
`define CSR_MPM_SMEM_WRITES_H 12'hB8C
`define CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
`define CSR_MPM_SMEM_BANK_ST_H 12'hB8D
// PERF: l2cache
`define CSR_MPM_L2CACHE_READS 12'hB0E // total reads
`define CSR_MPM_L2CACHE_READS_H 12'hB8E
`define CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes
`define CSR_MPM_L2CACHE_WRITES_H 12'hB8F
`define CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses
`define CSR_MPM_L2CACHE_MISS_R_H 12'hB90
`define CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses
`define CSR_MPM_L2CACHE_MISS_W_H 12'hB91
`define CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts
`define CSR_MPM_L2CACHE_BANK_ST_H 12'hB92
`define CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls
`define CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93
// PERF: l3cache
`define CSR_MPM_L3CACHE_READS 12'hB14 // total reads
`define CSR_MPM_L3CACHE_READS_H 12'hB94
`define CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes
`define CSR_MPM_L3CACHE_WRITES_H 12'hB95
`define CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses
`define CSR_MPM_L3CACHE_MISS_R_H 12'hB96
`define CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses
`define CSR_MPM_L3CACHE_MISS_W_H 12'hB97
`define CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts
`define CSR_MPM_L3CACHE_BANK_ST_H 12'hB98
`define CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls
`define CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99
// PERF: memory
`define CSR_MPM_MEM_READS 12'hB1A // total reads
`define CSR_MPM_MEM_READS_H 12'hB9A
`define CSR_MPM_MEM_WRITES 12'hB1B // total writes
`define CSR_MPM_MEM_WRITES_H 12'hB9B
`define CSR_MPM_MEM_LAT 12'hB1C // memory latency
`define CSR_MPM_MEM_LAT_H 12'hB9C
// Machine Performance-monitoring texture counters
// PERF: texture unit

View file

@ -139,10 +139,10 @@ module VX_warp_sched #(
stalled_warps[schedule_wid] <= 1;
// release wspawn
use_wspawn[schedule_wid] <= 0;
if (use_wspawn[schedule_wid]) begin
thread_masks[schedule_wid] <= 1;
end
use_wspawn[schedule_wid] <= 0;
issued_instrs[schedule_wid] <= issued_instrs[schedule_wid] + `UP(`UUID_BITS)'(1);
end
@ -248,8 +248,8 @@ module VX_warp_sched #(
`ifdef SIMULATION
assign instr_uuid = (issued_instrs[schedule_wid] * `NUM_WARPS * `NUM_CORES * `NUM_CLUSTERS)
+ (schedule_wid * `NUM_CORES * `NUM_CLUSTERS)
+ `UUID_BITS'(CORE_ID);
+ (`NUM_WARPS * CORE_ID)
+ `UUID_BITS'(schedule_wid);
`else
assign instr_uuid = 0;
`endif

View file

@ -52,7 +52,7 @@ module Vortex (
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if[`NUM_CLUSTERS]();
VX_perf_memsys_if perf_memsys_total_if();
VX_perf_memsys_if perf_memsys_total_if();
VX_perf_cache_if perf_l3cache_if();
`PERF_MEMSYS_ADD (perf_memsys_total_if, perf_memsys_if, `NUM_CLUSTERS);
`endif
@ -237,6 +237,62 @@ module Vortex (
.mem_rsp_if (mem_rsp_if)
);
`ifdef PERF_ENABLE
`ifdef L3_ENABLE
assign perf_memsys_total_if.l3cache_reads = perf_l3cache_if.reads;
assign perf_memsys_total_if.l3cache_writes = perf_l3cache_if.writes;
assign perf_memsys_total_if.l3cache_read_misses = perf_l3cache_if.read_misses;
assign perf_memsys_total_if.l3cache_write_misses= perf_l3cache_if.write_misses;
assign perf_memsys_total_if.l3cache_bank_stalls = perf_l3cache_if.bank_stalls;
assign perf_memsys_total_if.l3cache_mshr_stalls = perf_l3cache_if.mshr_stalls;
`else
assign perf_memsys_total_if.l3cache_reads = 0;
assign perf_memsys_total_if.l3cache_writes = 0;
assign perf_memsys_total_if.l3cache_read_misses = 0;
assign perf_memsys_total_if.l3cache_write_misses= 0;
assign perf_memsys_total_if.l3cache_bank_stalls = 0;
assign perf_memsys_total_if.l3cache_mshr_stalls = 0;
`endif
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
always @(posedge clk) begin
if (reset) begin
perf_mem_pending_reads <= 0;
end else begin
perf_mem_pending_reads <= perf_mem_pending_reads +
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw))));
end
end
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
always @(posedge clk) begin
if (reset) begin
perf_mem_reads <= 0;
perf_mem_writes <= 0;
perf_mem_lat <= 0;
end else begin
if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(1);
end
if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'(1);
end
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
end
end
assign perf_memsys_total_if.mem_reads = perf_mem_reads;
assign perf_memsys_total_if.mem_writes = perf_mem_writes;
assign perf_memsys_total_if.mem_latency = perf_mem_lat;
`endif
`SCOPE_ASSIGN (reset, reset);
`SCOPE_ASSIGN (mem_req_fire, mem_req_valid && mem_req_ready);
`SCOPE_ASSIGN (mem_req_addr, `TO_FULL_ADDR(mem_req_addr));

View file

@ -124,6 +124,12 @@
`REDUCE_ADD (dst, src, smem_reads, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, smem_writes, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, smem_bank_stalls, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, l2cache_reads, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, l2cache_writes, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, l2cache_read_misses, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, l2cache_write_misses, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, l2cache_bank_stalls, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, l2cache_mshr_stalls, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, mem_reads, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, mem_writes, `PERF_CTR_BITS, count); \
`REDUCE_ADD (dst, src, mem_latency, `PERF_CTR_BITS, count)

View file

@ -4,15 +4,32 @@ interface VX_perf_memsys_if ();
wire [`PERF_CTR_BITS-1:0] icache_reads;
wire [`PERF_CTR_BITS-1:0] icache_read_misses;
wire [`PERF_CTR_BITS-1:0] dcache_reads;
wire [`PERF_CTR_BITS-1:0] dcache_writes;
wire [`PERF_CTR_BITS-1:0] dcache_read_misses;
wire [`PERF_CTR_BITS-1:0] dcache_write_misses;
wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls;
wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls;
wire [`PERF_CTR_BITS-1:0] smem_reads;
wire [`PERF_CTR_BITS-1:0] smem_writes;
wire [`PERF_CTR_BITS-1:0] smem_bank_stalls;
wire [`PERF_CTR_BITS-1:0] l2cache_reads;
wire [`PERF_CTR_BITS-1:0] l2cache_writes;
wire [`PERF_CTR_BITS-1:0] l2cache_read_misses;
wire [`PERF_CTR_BITS-1:0] l2cache_write_misses;
wire [`PERF_CTR_BITS-1:0] l2cache_bank_stalls;
wire [`PERF_CTR_BITS-1:0] l2cache_mshr_stalls;
wire [`PERF_CTR_BITS-1:0] l3cache_reads;
wire [`PERF_CTR_BITS-1:0] l3cache_writes;
wire [`PERF_CTR_BITS-1:0] l3cache_read_misses;
wire [`PERF_CTR_BITS-1:0] l3cache_write_misses;
wire [`PERF_CTR_BITS-1:0] l3cache_bank_stalls;
wire [`PERF_CTR_BITS-1:0] l3cache_mshr_stalls;
wire [`PERF_CTR_BITS-1:0] mem_reads;
wire [`PERF_CTR_BITS-1:0] mem_writes;
wire [`PERF_CTR_BITS-1:0] mem_latency;
@ -20,15 +37,32 @@ interface VX_perf_memsys_if ();
modport master (
output icache_reads,
output icache_read_misses,
output dcache_reads,
output dcache_writes,
output dcache_read_misses,
output dcache_write_misses,
output dcache_bank_stalls,
output dcache_mshr_stalls,
output smem_reads,
output smem_writes,
output smem_bank_stalls,
output l2cache_reads,
output l2cache_writes,
output l2cache_read_misses,
output l2cache_write_misses,
output l2cache_bank_stalls,
output l2cache_mshr_stalls,
output l3cache_reads,
output l3cache_writes,
output l3cache_read_misses,
output l3cache_write_misses,
output l3cache_bank_stalls,
output l3cache_mshr_stalls,
output mem_reads,
output mem_writes,
output mem_latency
@ -37,15 +71,32 @@ interface VX_perf_memsys_if ();
modport slave (
input icache_reads,
input icache_read_misses,
input dcache_reads,
input dcache_writes,
input dcache_read_misses,
input dcache_write_misses,
input dcache_bank_stalls,
input dcache_mshr_stalls,
input smem_reads,
input smem_writes,
input smem_bank_stalls,
input l2cache_reads,
input l2cache_writes,
input l2cache_read_misses,
input l2cache_write_misses,
input l2cache_bank_stalls,
input l2cache_mshr_stalls,
input l3cache_reads,
input l3cache_writes,
input l3cache_read_misses,
input l3cache_write_misses,
input l3cache_bank_stalls,
input l3cache_mshr_stalls,
input mem_reads,
input mem_writes,
input mem_latency

View file

@ -1,10 +1,6 @@
`include "VX_define.vh"
interface VX_perf_pipeline_if ();
wire [`PERF_CTR_BITS-1:0] loads;
wire [`PERF_CTR_BITS-1:0] stores;
wire [`PERF_CTR_BITS-1:0] branches;
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] lsu_stalls;
@ -15,11 +11,11 @@ interface VX_perf_pipeline_if ();
`endif
wire [`PERF_CTR_BITS-1:0] gpu_stalls;
modport decode (
output loads,
output stores,
output branches
);
wire [`PERF_CTR_BITS-1:0] ifetches;
wire [`PERF_CTR_BITS-1:0] loads;
wire [`PERF_CTR_BITS-1:0] stores;
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency;
modport issue (
output ibf_stalls,
@ -34,9 +30,6 @@ interface VX_perf_pipeline_if ();
);
modport slave (
input loads,
input stores,
input branches,
input ibf_stalls,
input scb_stalls,
input lsu_stalls,
@ -45,7 +38,12 @@ interface VX_perf_pipeline_if ();
`ifdef EXT_F_ENABLE
input fpu_stalls,
`endif
input gpu_stalls
input gpu_stalls,
input ifetches,
input loads,
input stores,
input ifetch_latency,
input load_latency
);
endinterface

View file

@ -33,6 +33,7 @@ module VX_mem_scheduler #(
input wire [TAG_WIDTH-1:0] req_tag,
output wire req_empty,
output wire req_ready,
output wire write_notify,
// Output response
output wire rsp_valid,
@ -41,7 +42,6 @@ module VX_mem_scheduler #(
output wire [TAG_WIDTH-1:0] rsp_tag,
output wire rsp_eop,
input wire rsp_ready,
output wire write_notify,
// Memory request
output wire [NUM_BANKS-1:0] mem_req_valid,
@ -308,7 +308,7 @@ module VX_mem_scheduler #(
// Handle memory responses ////////////////////////////////////////////////
reg [REQ_SIZEW-1:0] rsp_rem_size [QUEUE_SIZE-1:0];
wire [REQ_SIZEW-1:0] rsp_rem_size_n;
wire [REQ_SIZEW-1:0] rsp_rem_size_n;
wire [`UP(BATCH_SEL_BITS)-1:0] rsp_batch_idx;
// Select memory response
@ -383,7 +383,7 @@ module VX_mem_scheduler #(
reg [NUM_BATCHES-1:0][NUM_BANKS-1:0][DATA_WIDTH-1:0] rsp_store [QUEUE_SIZE-1:0];
wire [NUM_BATCHES-1:0][NUM_BANKS-1:0][DATA_WIDTH-1:0] rsp_store_n;
reg [QUEUE_SIZE-1:0][NUM_REQS-1:0] rsp_orig_mask;
reg [NUM_REQS-1:0] rsp_orig_mask [QUEUE_SIZE-1:0];
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
for (genvar j = 0; j < NUM_BANKS; ++j) begin

View file

@ -274,6 +274,7 @@ module VX_raster_mem #(
.req_tag (mem_req_tag),
`UNUSED_PIN (req_empty),
.req_ready (mem_req_ready),
`UNUSED_PIN (write_notify),
// Output response
.rsp_valid (mem_rsp_valid),
@ -281,8 +282,7 @@ module VX_raster_mem #(
.rsp_data (mem_rsp_data),
.rsp_tag (mem_rsp_tag),
`UNUSED_PIN (rsp_eop),
.rsp_ready (mem_rsp_ready),
`UNUSED_PIN (write_notify),
.rsp_ready (mem_rsp_ready),
// Memory request
.mem_req_valid (cache_req_if.valid),

View file

@ -259,11 +259,11 @@ module VX_raster_unit #(
wire [$clog2(RCACHE_NUM_REQS+1)-1:0] perf_mem_rsp_per_cycle;
wire [$clog2(RCACHE_NUM_REQS+1)+1-1:0] perf_pending_reads_cycle;
wire [RCACHE_NUM_REQS-1:0] perf_mem_req_per_req = cache_req_if.valid & cache_req_if.ready;
wire [RCACHE_NUM_REQS-1:0] perf_mem_rsp_per_req = cache_rsp_if.valid & cache_rsp_if.ready;
wire [RCACHE_NUM_REQS-1:0] perf_mem_req_fire = cache_req_if.valid & cache_req_if.ready;
wire [RCACHE_NUM_REQS-1:0] perf_mem_rsp_fire = cache_rsp_if.valid & cache_rsp_if.ready;
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_req);
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_req);
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_fire);
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_fire);
reg [`PERF_CTR_BITS-1:0] perf_pending_reads;
assign perf_pending_reads_cycle = perf_mem_req_per_cycle - perf_mem_rsp_per_cycle;

View file

@ -34,6 +34,7 @@ module VX_rop_mem #(
input wire [NUM_LANES-1:0] req_face,
input wire [TAG_WIDTH-1:0] req_tag,
output wire req_ready,
output wire write_notify,
// Response interface
output wire rsp_valid,
@ -42,8 +43,7 @@ module VX_rop_mem #(
output wire [NUM_LANES-1:0][`ROP_DEPTH_BITS-1:0] rsp_depth,
output wire [NUM_LANES-1:0][`ROP_STENCIL_BITS-1:0] rsp_stencil,
output wire [TAG_WIDTH-1:0] rsp_tag,
input wire rsp_ready,
output wire write_notify
input wire rsp_ready
);
localparam NUM_REQS = ROP_MEM_REQS;
@ -214,6 +214,7 @@ module VX_rop_mem #(
.req_tag (mreq_tag_r),
`UNUSED_PIN (req_empty),
.req_ready (mreq_ready_r),
.write_notify (write_notify),
.rsp_valid (mrsp_valid),
.rsp_mask (mrsp_mask),
@ -221,7 +222,6 @@ module VX_rop_mem #(
.rsp_tag (mrsp_tag),
`UNUSED_PIN (rsp_eop),
.rsp_ready (mrsp_ready),
.write_notify (write_notify),
.mem_req_valid (cache_req_if.valid),
.mem_req_rw (cache_req_if.rw),

View file

@ -84,6 +84,7 @@ module VX_rop_unit #(
.req_face (mem_req_face_r),
.req_tag (mem_req_tag_r),
.req_ready (mem_req_ready_r),
.write_notify (mem_write_notify),
.rsp_valid (mem_rsp_valid),
.rsp_mask (mem_rsp_mask),
@ -91,8 +92,7 @@ module VX_rop_unit #(
.rsp_depth (mem_rsp_depth),
.rsp_stencil (mem_rsp_stencil),
.rsp_tag (mem_rsp_tag),
.rsp_ready (mem_rsp_ready),
.write_notify (mem_write_notify)
.rsp_ready (mem_rsp_ready)
);
///////////////////////////////////////////////////////////////////////////
@ -293,13 +293,13 @@ module VX_rop_unit #(
wire [$clog2(OCACHE_NUM_REQS+1)-1:0] perf_mem_rd_rsp_per_cycle;
wire [$clog2(OCACHE_NUM_REQS+1)+1-1:0] perf_pending_reads_cycle;
wire [OCACHE_NUM_REQS-1:0] perf_mem_rd_req_per_mask = cache_req_if.valid & ~cache_req_if.rw & cache_req_if.ready;
wire [OCACHE_NUM_REQS-1:0] perf_mem_wr_req_per_mask = cache_req_if.valid & cache_req_if.rw & cache_req_if.ready;
wire [OCACHE_NUM_REQS-1:0] perf_mem_rd_rsp_per_mask = cache_rsp_if.valid & cache_rsp_if.ready;
wire [OCACHE_NUM_REQS-1:0] perf_mem_rd_req_fire = cache_req_if.valid & ~cache_req_if.rw & cache_req_if.ready;
wire [OCACHE_NUM_REQS-1:0] perf_mem_wr_req_fire = cache_req_if.valid & cache_req_if.rw & cache_req_if.ready;
wire [OCACHE_NUM_REQS-1:0] perf_mem_rd_rsp_fire = cache_rsp_if.valid & cache_rsp_if.ready;
`POP_COUNT(perf_mem_rd_req_per_cycle, perf_mem_rd_req_per_mask);
`POP_COUNT(perf_mem_wr_req_per_cycle, perf_mem_wr_req_per_mask);
`POP_COUNT(perf_mem_rd_rsp_per_cycle, perf_mem_rd_rsp_per_mask);
`POP_COUNT(perf_mem_rd_req_per_cycle, perf_mem_rd_req_fire);
`POP_COUNT(perf_mem_wr_req_per_cycle, perf_mem_wr_req_fire);
`POP_COUNT(perf_mem_rd_rsp_per_cycle, perf_mem_rd_rsp_fire);
reg [`PERF_CTR_BITS-1:0] perf_pending_reads;
assign perf_pending_reads_cycle = perf_mem_rd_req_per_cycle - perf_mem_rd_rsp_per_cycle;

View file

@ -122,6 +122,7 @@ module VX_tex_mem #(
.req_tag (mem_req_tag),
`UNUSED_PIN (req_empty),
.req_ready (mem_req_ready),
`UNUSED_PIN (write_notify),
// Output response
.rsp_valid (mem_rsp_valid),
@ -130,7 +131,6 @@ module VX_tex_mem #(
.rsp_tag (mem_rsp_tag),
`UNUSED_PIN (rsp_eop),
.rsp_ready (mem_rsp_ready),
`UNUSED_PIN (write_notify),
// Memory request
.mem_req_valid (cache_req_if.valid),

View file

@ -206,11 +206,11 @@ module VX_tex_unit #(
wire [$clog2(TCACHE_NUM_REQS+1)-1:0] perf_mem_rsp_per_cycle;
wire [$clog2(TCACHE_NUM_REQS+1)+1-1:0] perf_pending_reads_cycle;
wire [TCACHE_NUM_REQS-1:0] perf_mem_req_per_req = cache_req_if.valid & cache_req_if.ready;
wire [TCACHE_NUM_REQS-1:0] perf_mem_rsp_per_req = cache_rsp_if.valid & cache_rsp_if.ready;
wire [TCACHE_NUM_REQS-1:0] perf_mem_req_fire = cache_req_if.valid & cache_req_if.ready;
wire [TCACHE_NUM_REQS-1:0] perf_mem_rsp_fire = cache_rsp_if.valid & cache_rsp_if.ready;
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_req);
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_req);
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_fire);
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_fire);
reg [`PERF_CTR_BITS-1:0] perf_pending_reads;
assign perf_pending_reads_cycle = perf_mem_req_per_cycle - perf_mem_rsp_per_cycle;

View file

@ -21,7 +21,7 @@ set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON
set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON
set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON
set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON
set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON
#set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON
set_global_assignment -name USE_HIGH_SPEED_ADDER ON
set_global_assignment -name MUX_RESTRUCTURE ON
@ -29,7 +29,7 @@ set_global_assignment -name ADV_NETLIST_OPT_SYNTH_WYSIWYG_REMAP ON
set_global_assignment -name PROGRAMMABLE_POWER_TECHNOLOGY_SETTING "FORCE ALL TILES WITH FAILING TIMING PATHS TO HIGH SPEED"
set_global_assignment -name PHYSICAL_SYNTHESIS_COMBO_LOGIC ON
set_global_assignment -name PHYSICAL_SYNTHESIS_REGISTER_RETIMING ON
set_global_assignment -name PHYSICAL_SYNTHESIS_REGISTER_DUPLICATION ON
#set_global_assignment -name PHYSICAL_SYNTHESIS_REGISTER_DUPLICATION ON
set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0
set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100

View file

@ -6,7 +6,11 @@ set -e
WIDTH=1920
HEIGHT=1080
LOG_FILE=./perf/draw3d/perf_${DEVICE_FAMILY}_${WIDTH}_${HEIGHT}.log
TOKEN=${1:-}_${DEVICE_FAMILY}_${WIDTH}x${HEIGHT}
LOG_DIR=./perf/draw3d
LOG_FILE=${LOG_DIR}/perf_${TOKEN}.log
declare -a traces=(vase filmtv skybox coverflow evilskull polybump tekkaman carnival)

View file

@ -201,12 +201,12 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
uint64_t fpu_stalls = 0;
uint64_t csr_stalls = 0;
uint64_t alu_stalls = 0;
uint64_t gpu_stalls = 0;
uint64_t wctl_issue_stalls = 0;
// PERF: decode
uint64_t gpu_stalls = 0;
uint64_t ifetches = 0;
uint64_t loads = 0;
uint64_t stores = 0;
uint64_t branches = 0;
uint64_t ifetch_lat = 0;
uint64_t load_lat = 0;
// PERF: Icache
uint64_t icache_reads = 0;
uint64_t icache_read_misses = 0;
@ -221,6 +221,20 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
uint64_t smem_reads = 0;
uint64_t smem_writes = 0;
uint64_t smem_bank_stalls = 0;
// PERF: l2cache
uint64_t l2cache_reads = 0;
uint64_t l2cache_writes = 0;
uint64_t l2cache_read_misses = 0;
uint64_t l2cache_write_misses = 0;
uint64_t l2cache_bank_stalls = 0;
uint64_t l2cache_mshr_stalls = 0;
// PERF: l3cache
uint64_t l3cache_reads = 0;
uint64_t l3cache_writes = 0;
uint64_t l3cache_read_misses = 0;
uint64_t l3cache_write_misses = 0;
uint64_t l3cache_bank_stalls = 0;
uint64_t l3cache_mshr_stalls = 0;
// PERF: memory
uint64_t mem_reads = 0;
uint64_t mem_writes = 0;
@ -327,12 +341,11 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
uint64_t gpu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_GPU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, gpu_stalls_per_core);
gpu_stalls += gpu_stalls_per_core;
// wctl_stall
uint64_t wctl_issue_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_WCTL_ISSUE_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: wctl issue stalls=%ld\n", core_id, wctl_issue_stalls_per_core);
wctl_issue_stalls += wctl_issue_stalls_per_core;
// PERF: decode
// PERF: memory
// ifetches
uint64_t ifetches_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
ifetches += ifetches_per_core;
// loads
uint64_t loads_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
@ -341,63 +354,57 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
uint64_t stores_per_core = get_csr_64(staging_ptr, CSR_MPM_STORES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
stores += stores_per_core;
// branches
uint64_t branches_per_core = get_csr_64(staging_ptr, CSR_MPM_BRANCHES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: branches=%ld\n", core_id, branches_per_core);
branches += branches_per_core;
// PERF: Icache
// total reads
// ifetch latency: accumulated fetch latency for this core; the per-core line prints the
// average per fetch. The division is guarded because converting NaN/inf (x/0) to int is
// undefined behaviour; a core with no fetches reports 0 cycles.
uint64_t ifetch_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_IFETCH_LAT);
if (num_cores > 1) {
int mem_avg_lat = (ifetches_per_core != 0) ? (int)(double(ifetch_lat_per_core) / double(ifetches_per_core)) : 0;
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
}
ifetch_lat += ifetch_lat_per_core;
// load latency: accumulated load latency for this core; same zero-count guard as above,
// since a kernel may legitimately execute no loads on a given core.
uint64_t load_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_LOAD_LAT);
if (num_cores > 1) {
int mem_avg_lat = (loads_per_core != 0) ? (int)(double(load_lat_per_core) / double(loads_per_core)) : 0;
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
}
load_lat += load_lat_per_core;
} break;
case DCR_MPM_CLASS_MEM: {
if (0 == core_id) {
// PERF: Icache
icache_reads = get_csr_64(staging_ptr, CSR_MPM_ICACHE_READS);
}
// read misses
if (0 == core_id) {
icache_read_misses = get_csr_64(staging_ptr, CSR_MPM_ICACHE_MISS_R);
}
// PERF: Dcache
// total reads
if (0 == core_id) {
// PERF: Dcache
dcache_reads = get_csr_64(staging_ptr, CSR_MPM_DCACHE_READS);
}
// total write
if (0 == core_id) {
dcache_writes = get_csr_64(staging_ptr, CSR_MPM_DCACHE_WRITES);
}
// read misses
if (0 == core_id) {
dcache_read_misses = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MISS_R);
}
// read misses
if (0 == core_id) {
dcache_write_misses = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MISS_W);
}
// bank_stalls
if (0 == core_id) {
dcache_bank_stalls = get_csr_64(staging_ptr, CSR_MPM_DCACHE_BANK_ST);
}
// mshr_stalls
if (0 == core_id) {
dcache_mshr_stalls = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MSHR_ST);
}
// PERF: SMEM
// total reads
if (0 == core_id) {
smem_reads = get_csr_64(staging_ptr, CSR_MPM_SMEM_READS);
}
// total write
if (0 == core_id) {
// PERF: smem
smem_reads = get_csr_64(staging_ptr, CSR_MPM_SMEM_READS);
smem_writes = get_csr_64(staging_ptr, CSR_MPM_SMEM_WRITES);
}
// bank_stalls
if (0 == core_id) {
smem_bank_stalls = get_csr_64(staging_ptr, CSR_MPM_SMEM_BANK_ST);
}
// PERF: memory
if (0 == core_id) {
// PERF: L2cache
l2cache_reads = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_READS);
l2cache_writes = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_WRITES);
l2cache_read_misses = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_MISS_R);
l2cache_write_misses = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_MISS_W);
l2cache_bank_stalls = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_BANK_ST);
l2cache_mshr_stalls = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_MSHR_ST);
// PERF: L3cache
l3cache_reads = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_READS);
l3cache_writes = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_WRITES);
l3cache_read_misses = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_MISS_R);
l3cache_write_misses = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_MISS_W);
l3cache_bank_stalls = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_BANK_ST);
l3cache_mshr_stalls = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_MSHR_ST);
// PERF: memory
mem_reads = get_csr_64(staging_ptr, CSR_MPM_MEM_READS);
mem_writes = get_csr_64(staging_ptr, CSR_MPM_MEM_WRITES);
mem_lat = get_csr_64(staging_ptr, CSR_MPM_MEM_LAT);
@ -470,13 +477,9 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
#ifdef PERF_ENABLE
switch (perf_class) {
case DCR_MPM_CLASS_CORE: {
int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
case DCR_MPM_CLASS_CORE: {
// Average latencies over all cores. A zero event count yields 0 instead of dividing by
// zero, because converting the resulting NaN/inf to int is undefined behaviour.
int ifetch_avg_lat = (ifetches != 0) ? (int)(double(ifetch_lat) / double(ifetches)) : 0;
int load_avg_lat = (loads != 0) ? (int)(double(load_lat) / double(loads)) : 0;
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
@ -484,10 +487,26 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
fprintf(stream, "PERF: csr unit stalls=%ld\n", csr_stalls);
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls);
fprintf(stream, "PERF: wctl issue stalls=%ld\n", wctl_issue_stalls);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
fprintf(stream, "PERF: loads=%ld\n", loads);
fprintf(stream, "PERF: stores=%ld\n", stores);
fprintf(stream, "PERF: branches=%ld\n", branches);
fprintf(stream, "PERF: stores=%ld\n", stores);
fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
} break;
case DCR_MPM_CLASS_MEM: {
// Derived percentages and averages for the memory-class report.
// Every expression checks its denominator first: a counter that stayed at zero would
// otherwise produce NaN (0/0), and converting NaN to int is undefined behaviour.
// A metric with no samples is reported as 0.
int icache_read_hit_ratio = (icache_reads != 0) ? (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100) : 0;
int dcache_read_hit_ratio = (dcache_reads != 0) ? (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100) : 0;
int dcache_write_hit_ratio = (dcache_writes != 0) ? (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100) : 0;
int dcache_bank_utilization = ((dcache_reads + dcache_writes + dcache_bank_stalls) != 0) ? (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100) : 0;
int l2cache_read_hit_ratio = (l2cache_reads != 0) ? (int)((1.0 - (double(l2cache_read_misses) / double(l2cache_reads))) * 100) : 0;
int l2cache_write_hit_ratio = (l2cache_writes != 0) ? (int)((1.0 - (double(l2cache_write_misses) / double(l2cache_writes))) * 100) : 0;
int l2cache_bank_utilization = ((l2cache_reads + l2cache_writes + l2cache_bank_stalls) != 0) ? (int)((double(l2cache_reads + l2cache_writes) / double(l2cache_reads + l2cache_writes + l2cache_bank_stalls)) * 100) : 0;
int l3cache_read_hit_ratio = (l3cache_reads != 0) ? (int)((1.0 - (double(l3cache_read_misses) / double(l3cache_reads))) * 100) : 0;
int l3cache_write_hit_ratio = (l3cache_writes != 0) ? (int)((1.0 - (double(l3cache_write_misses) / double(l3cache_writes))) * 100) : 0;
int l3cache_bank_utilization = ((l3cache_reads + l3cache_writes + l3cache_bank_stalls) != 0) ? (int)((double(l3cache_reads + l3cache_writes) / double(l3cache_reads + l3cache_writes + l3cache_bank_stalls)) * 100) : 0;
int smem_bank_utilization = ((smem_reads + smem_writes + smem_bank_stalls) != 0) ? (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100) : 0;
int mem_avg_lat = (mem_reads != 0) ? (int)(double(mem_lat) / double(mem_reads)) : 0;
fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
@ -499,14 +518,26 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat);
} break;
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
} break;
case DCR_MPM_CLASS_TEX: {
#ifdef EXT_TEX_ENABLE
int tex_avg_lat = (int)(double(tex_mem_lat) / double(tex_mem_reads));
fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads);
fprintf(stream, "PERF: tex memory average latency=%d cycles\n", tex_avg_lat);
fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat);
fprintf(stream, "PERF: tex issue stalls=%ld\n", tex_issue_stalls);
int tcache_read_hit_ratio = (int)((1.0 - (double(tcache_read_misses) / double(tcache_reads))) * 100);
int tcache_bank_utilization = (int)((double(tcache_reads) / double(tcache_reads + tcache_bank_stalls)) * 100);
@ -539,7 +570,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
int rop_stall_cycles_ratio = (int)(100 * double(rop_stall_cycles) / cycles);
fprintf(stream, "PERF: rop memory reads=%ld\n", rop_mem_reads);
fprintf(stream, "PERF: rop memory writes=%ld\n", rop_mem_writes);
fprintf(stream, "PERF: rop memory average latency=%d cycles\n", rop_mem_avg_lat);
fprintf(stream, "PERF: rop memory latency=%d cycles\n", rop_mem_avg_lat);
fprintf(stream, "PERF: rop stall cycles=%ld cycles (%d%%)\n", rop_stall_cycles, rop_stall_cycles_ratio);
fprintf(stream, "PERF: rop issue stalls=%ld\n", rop_issue_stalls);
// cache perf counters

View file

@ -424,14 +424,13 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
return tid + (wid * arch_.num_threads());
case CSR_GTID:
// Processor threadID
return tid + (wid * arch_.num_threads()) +
(arch_.num_threads() * arch_.num_warps() * core_id_);
return (core_id_ * arch_.num_warps() + wid) * arch_.num_threads() + tid;
case CSR_LWID:
// Core warpID
return wid;
case CSR_GWID:
// Processor warpID
return wid + (arch_.num_warps() * core_id_);
return core_id_ * arch_.num_warps() + wid;
case CSR_GCID:
// Processor coreID
return core_id_;

View file

@ -42,7 +42,7 @@ void Warp::clear() {
pipeline_trace_t* Warp::eval() {
assert(tmask_.any());
uint64_t uuid = ((issued_instrs_++ * arch_.num_warps() + warp_id_) * arch_.num_cores()) + core_->id();
uint64_t uuid = (issued_instrs_++ * arch_.num_cores() + core_->id()) * arch_.num_warps() + warp_id_;
DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)