mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 13:27:29 -04:00
perf counters refactoring + uuid fixes
This commit is contained in:
parent
47847ec920
commit
611e76ae0e
27 changed files with 522 additions and 292 deletions
|
@ -49,7 +49,7 @@ localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8;
|
|||
localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE);
|
||||
|
||||
|
||||
localparam AVS_RD_QUEUE_SIZE = 4;
|
||||
localparam AVS_RD_QUEUE_SIZE = 64;
|
||||
localparam _VX_MEM_TAG_WIDTH = `VX_MEM_TAG_WIDTH;
|
||||
localparam _AVS_REQ_TAGW_VX = _VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
|
||||
localparam _AVS_REQ_TAGW_VX2 = `MAX(_VX_MEM_TAG_WIDTH, _AVS_REQ_TAGW_VX);
|
||||
|
|
|
@ -312,14 +312,14 @@
|
|||
`define NUM_TEX_UNITS `UP(`NUM_CORES / 8)
|
||||
`endif
|
||||
|
||||
// Size of texture Request Queue
|
||||
// Size of texture Request Queue (Quad=4)
|
||||
`ifndef TEX_REQ_QUEUE_SIZE
|
||||
`define TEX_REQ_QUEUE_SIZE `MAX(2, `NUM_WARPS * 4)
|
||||
`endif
|
||||
|
||||
// Texture Unit memory pending Queue
|
||||
`ifndef TEX_MEM_QUEUE_SIZE
|
||||
`define TEX_MEM_QUEUE_SIZE `MAX(2, `NUM_WARPS * 4)
|
||||
`define TEX_MEM_QUEUE_SIZE (`TEX_REQ_QUEUE_SIZE * 2)
|
||||
`endif
|
||||
|
||||
// Raster Units ////////////////////////////////////////////////////////////////
|
||||
|
@ -407,7 +407,7 @@
|
|||
|
||||
// Miss Handling Register Size
|
||||
`ifndef ICACHE_MSHR_SIZE
|
||||
`define ICACHE_MSHR_SIZE `CLAMP(`NUM_WARPS * `UP(`NUM_CORES / `UP(`NUM_ICACHES)), 2, 16)
|
||||
`define ICACHE_MSHR_SIZE 16
|
||||
`endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
|
@ -471,7 +471,7 @@
|
|||
|
||||
// Miss Handling Register Size
|
||||
`ifndef DCACHE_MSHR_SIZE
|
||||
`define DCACHE_MSHR_SIZE `CLAMP(`LSUQ_SIZE * `UP(`NUM_CORES / `UP(`NUM_DCACHES)), 2, 16)
|
||||
`define DCACHE_MSHR_SIZE 32
|
||||
`endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
|
@ -562,7 +562,7 @@
|
|||
|
||||
// Miss Handling Register Size
|
||||
`ifndef TCACHE_MSHR_SIZE
|
||||
`define TCACHE_MSHR_SIZE (8 * 8)
|
||||
`define TCACHE_MSHR_SIZE 32
|
||||
`endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
|
@ -626,7 +626,7 @@
|
|||
|
||||
// Miss Handling Register Size
|
||||
`ifndef RCACHE_MSHR_SIZE
|
||||
`define RCACHE_MSHR_SIZE 8
|
||||
`define RCACHE_MSHR_SIZE 16
|
||||
`endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
|
@ -690,7 +690,7 @@
|
|||
|
||||
// Miss Handling Register Size
|
||||
`ifndef OCACHE_MSHR_SIZE
|
||||
`define OCACHE_MSHR_SIZE (8 * 8)
|
||||
`define OCACHE_MSHR_SIZE 32
|
||||
`endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
|
@ -741,7 +741,7 @@
|
|||
|
||||
// Miss Handling Register Size
|
||||
`ifndef L2_MSHR_SIZE
|
||||
`define L2_MSHR_SIZE 16
|
||||
`define L2_MSHR_SIZE 64
|
||||
`endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
|
@ -792,7 +792,7 @@
|
|||
|
||||
// Miss Handling Register Size
|
||||
`ifndef L3_MSHR_SIZE
|
||||
`define L3_MSHR_SIZE 16
|
||||
`define L3_MSHR_SIZE 64
|
||||
`endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
|
|
|
@ -129,10 +129,7 @@ module VX_core #(
|
|||
.CORE_ID(CORE_ID)
|
||||
) decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_decode_if (perf_pipeline_if.decode),
|
||||
`endif
|
||||
.reset (decode_reset),
|
||||
.ifetch_rsp_if (ifetch_rsp_if),
|
||||
.decode_if (decode_if),
|
||||
.wrelease_if (wrelease_if),
|
||||
|
@ -174,7 +171,7 @@ module VX_core #(
|
|||
.base_dcrs (base_dcrs),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if(perf_pipeline_if),
|
||||
`endif
|
||||
|
||||
|
@ -252,5 +249,79 @@ module VX_core #(
|
|||
|
||||
.sim_wb_value (sim_wb_value)
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE

    // Core-level memory performance counters.
    //
    // Counts the instruction-fetch, load and store requests accepted on the
    // icache/dcache request interfaces, and accumulates on every cycle the
    // number of reads that are still waiting for a response. The totals are
    // published on perf_pipeline_if (ifetches, loads, stores, ifetch_latency,
    // load_latency); each field is driven by exactly one continuous assignment.
    // NOTE(review): presumably software divides each latency total by the
    // matching request count to get an average latency -- confirm against the
    // code that reads these counters.

    // Number of request/response lanes that fire in the current cycle.
    wire [$clog2(ICACHE_NUM_REQS+1)-1:0] perf_icache_req_per_cycle;
    wire [$clog2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
    wire [$clog2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;

    wire [$clog2(ICACHE_NUM_REQS+1)-1:0] perf_icache_rsp_per_cycle;
    wire [$clog2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;

    // Per-cycle change in outstanding reads (requests minus responses).
    // One bit wider than the operands so a negative difference is held in
    // two's complement; it is sign-extended through $signed() below.
    wire [$clog2(ICACHE_NUM_REQS+1)+1-1:0] perf_icache_pending_read_cycle;
    wire [$clog2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;

    // Running count of reads issued but not yet answered.
    reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
    reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;

    // Running totals of accepted requests.
    reg [`PERF_CTR_BITS-1:0] perf_ifetches;
    reg [`PERF_CTR_BITS-1:0] perf_loads;
    reg [`PERF_CTR_BITS-1:0] perf_stores;

    // A lane "fires" when valid and ready are both asserted in the same cycle.
    wire [ICACHE_NUM_REQS-1:0] perf_icache_req_fire = icache_req_if.valid & icache_req_if.ready;
    wire [ICACHE_NUM_REQS-1:0] perf_icache_rsp_fire = icache_rsp_if.valid & icache_rsp_if.ready;

    // dcache requests are split by the rw flag: rw=0 counts as a load, rw=1 as a store.
    wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire = dcache_req_if.valid & ~dcache_req_if.rw & dcache_req_if.ready;
    wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire = dcache_req_if.valid & dcache_req_if.rw & dcache_req_if.ready;
    // NOTE(review): every dcache response is counted against outstanding reads;
    // this assumes stores never produce a response -- confirm.
    wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire = dcache_rsp_if.valid & dcache_rsp_if.ready;

    `POP_COUNT(perf_icache_req_per_cycle, perf_icache_req_fire);
    `POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire);
    `POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire);

    `POP_COUNT(perf_icache_rsp_per_cycle, perf_icache_rsp_fire);
    `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);

    assign perf_icache_pending_read_cycle = perf_icache_req_per_cycle - perf_icache_rsp_per_cycle;
    assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;

    // Track outstanding reads: add the signed per-cycle delta each clock.
    always @(posedge clk) begin
        if (reset) begin
            perf_icache_pending_reads <= 0;
            perf_dcache_pending_reads <= 0;
        end else begin
            perf_icache_pending_reads <= perf_icache_pending_reads + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
            perf_dcache_pending_reads <= perf_dcache_pending_reads + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
        end
    end

    // Latency accumulators: sum of outstanding reads over all cycles.
    reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
    reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;

    always @(posedge clk) begin
        if (reset) begin
            perf_ifetches   <= 0;
            perf_loads      <= 0;
            perf_stores     <= 0;
            perf_icache_lat <= 0;
            perf_dcache_lat <= 0;
        end else begin
            perf_ifetches   <= perf_ifetches   + `PERF_CTR_BITS'(perf_icache_req_per_cycle);
            perf_loads      <= perf_loads      + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
            perf_stores     <= perf_stores     + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
            perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
            perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
        end
    end

    assign perf_pipeline_if.ifetches       = perf_ifetches;
    assign perf_pipeline_if.loads          = perf_loads;
    assign perf_pipeline_if.stores         = perf_stores;
    assign perf_pipeline_if.ifetch_latency = perf_icache_lat;
    assign perf_pipeline_if.load_latency   = perf_dcache_lat;

`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -161,32 +161,41 @@ module VX_csr_data #(
|
|||
`DCR_MPM_CLASS_CORE: begin
|
||||
case (read_addr)
|
||||
// PERF: pipeline
|
||||
`CSR_MPM_IBUF_ST : read_data_ro_r = perf_pipeline_if.ibf_stalls[31:0];
|
||||
`CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(perf_pipeline_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SCRB_ST : read_data_ro_r = perf_pipeline_if.scb_stalls[31:0];
|
||||
`CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(perf_pipeline_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ALU_ST : read_data_ro_r = perf_pipeline_if.alu_stalls[31:0];
|
||||
`CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.alu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_LSU_ST : read_data_ro_r = perf_pipeline_if.lsu_stalls[31:0];
|
||||
`CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_CSR_ST : read_data_ro_r = perf_pipeline_if.csr_stalls[31:0];
|
||||
`CSR_MPM_CSR_ST_H : read_data_ro_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_IBUF_ST : read_data_ro_r = perf_pipeline_if.ibf_stalls[31:0];
|
||||
`CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(perf_pipeline_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SCRB_ST : read_data_ro_r = perf_pipeline_if.scb_stalls[31:0];
|
||||
`CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(perf_pipeline_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ALU_ST : read_data_ro_r = perf_pipeline_if.alu_stalls[31:0];
|
||||
`CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.alu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_LSU_ST : read_data_ro_r = perf_pipeline_if.lsu_stalls[31:0];
|
||||
`CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_CSR_ST : read_data_ro_r = perf_pipeline_if.csr_stalls[31:0];
|
||||
`CSR_MPM_CSR_ST_H : read_data_ro_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`CSR_MPM_FPU_ST : read_data_ro_r = perf_pipeline_if.fpu_stalls[31:0];
|
||||
`CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_FPU_ST : read_data_ro_r = perf_pipeline_if.fpu_stalls[31:0];
|
||||
`CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`else
|
||||
`CSR_MPM_FPU_ST : read_data_ro_r = '0;
|
||||
`CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
|
||||
`CSR_MPM_FPU_ST : read_data_ro_r = '0;
|
||||
`CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
|
||||
`endif
|
||||
`CSR_MPM_GPU_ST : read_data_ro_r = perf_pipeline_if.gpu_stalls[31:0];
|
||||
`CSR_MPM_GPU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: decode
|
||||
`CSR_MPM_LOADS : read_data_ro_r = perf_pipeline_if.loads[31:0];
|
||||
`CSR_MPM_LOADS_H : read_data_ro_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_STORES : read_data_ro_r = perf_pipeline_if.stores[31:0];
|
||||
`CSR_MPM_STORES_H : read_data_ro_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_BRANCHES : read_data_ro_r = perf_pipeline_if.branches[31:0];
|
||||
`CSR_MPM_BRANCHES_H : read_data_ro_r = 32'(perf_pipeline_if.branches[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_GPU_ST : read_data_ro_r = perf_pipeline_if.gpu_stalls[31:0];
|
||||
`CSR_MPM_GPU_ST_H : read_data_ro_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: memory
|
||||
`CSR_MPM_IFETCHES : read_data_ro_r = perf_pipeline_if.ifetches[31:0];
|
||||
`CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(perf_pipeline_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_LOADS : read_data_ro_r = perf_pipeline_if.loads[31:0];
|
||||
`CSR_MPM_LOADS_H : read_data_ro_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_STORES : read_data_ro_r = perf_pipeline_if.stores[31:0];
|
||||
`CSR_MPM_STORES_H : read_data_ro_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_IFETCH_LAT : read_data_ro_r = perf_pipeline_if.ifetch_latency[31:0];
|
||||
`CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(perf_pipeline_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_LOAD_LAT : read_data_ro_r = perf_pipeline_if.load_latency[31:0];
|
||||
`CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(perf_pipeline_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`DCR_MPM_CLASS_MEM: begin
|
||||
case (read_addr)
|
||||
// PERF: icache
|
||||
`CSR_MPM_ICACHE_READS : read_data_ro_r = perf_memsys_if.icache_reads[31:0];
|
||||
`CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]);
|
||||
|
@ -206,22 +215,45 @@ module VX_csr_data #(
|
|||
`CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = perf_memsys_if.dcache_mshr_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: smem
|
||||
`CSR_MPM_SMEM_READS : read_data_ro_r = perf_memsys_if.smem_reads[31:0];
|
||||
`CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_WRITES : read_data_ro_r = perf_memsys_if.smem_writes[31:0];
|
||||
`CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_BANK_ST : read_data_ro_r = perf_memsys_if.smem_bank_stalls[31:0];
|
||||
`CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_READS : read_data_ro_r = perf_memsys_if.smem_reads[31:0];
|
||||
`CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_WRITES : read_data_ro_r = perf_memsys_if.smem_writes[31:0];
|
||||
`CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_BANK_ST : read_data_ro_r = perf_memsys_if.smem_bank_stalls[31:0];
|
||||
`CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: l2cache
|
||||
`CSR_MPM_L2CACHE_READS : read_data_ro_r = perf_memsys_if.l2cache_reads[31:0];
|
||||
`CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L2CACHE_WRITES : read_data_ro_r = perf_memsys_if.l2cache_writes[31:0];
|
||||
`CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = perf_memsys_if.l2cache_read_misses[31:0];
|
||||
`CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = perf_memsys_if.l2cache_write_misses[31:0];
|
||||
`CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = perf_memsys_if.l2cache_bank_stalls[31:0];
|
||||
`CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = perf_memsys_if.l2cache_mshr_stalls[31:0];
|
||||
`CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(perf_memsys_if.l2cache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: l3cache
|
||||
`CSR_MPM_L3CACHE_READS : read_data_ro_r = perf_memsys_if.l3cache_reads[31:0];
|
||||
`CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L3CACHE_WRITES : read_data_ro_r = perf_memsys_if.l3cache_writes[31:0];
|
||||
`CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = perf_memsys_if.l3cache_read_misses[31:0];
|
||||
`CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = perf_memsys_if.l3cache_write_misses[31:0];
|
||||
`CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = perf_memsys_if.l3cache_bank_stalls[31:0];
|
||||
`CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = perf_memsys_if.l3cache_mshr_stalls[31:0];
|
||||
`CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(perf_memsys_if.l3cache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: memory
|
||||
`CSR_MPM_MEM_READS : read_data_ro_r = perf_memsys_if.mem_reads[31:0];
|
||||
`CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_WRITES : read_data_ro_r = perf_memsys_if.mem_writes[31:0];
|
||||
`CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_LAT : read_data_ro_r = perf_memsys_if.mem_latency[31:0];
|
||||
`CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: wctl
|
||||
`CSR_MPM_WCTL_ISSUE_ST : read_data_ro_r = perf_gpu_if.wctl_stalls[31:0];
|
||||
`CSR_MPM_WCTL_ISSUE_ST_H : read_data_ro_r = 32'(perf_gpu_if.wctl_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_READS : read_data_ro_r = perf_memsys_if.mem_reads[31:0];
|
||||
`CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_WRITES : read_data_ro_r = perf_memsys_if.mem_writes[31:0];
|
||||
`CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_LAT : read_data_ro_r = perf_memsys_if.mem_latency[31:0];
|
||||
`CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
@ -327,6 +359,8 @@ module VX_csr_data #(
|
|||
wire [`PERF_CTR_BITS-1:0] perf_imadd_stalls = perf_gpu_if.imadd_stalls;
|
||||
`UNUSED_VAR (perf_imadd_stalls);
|
||||
`endif
|
||||
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = perf_gpu_if.wctl_stalls;
|
||||
`UNUSED_VAR (perf_wctl_stalls);
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
|
|
@ -18,10 +18,6 @@ module VX_decode #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_if.decode perf_decode_if,
|
||||
`endif
|
||||
|
||||
// inputs
|
||||
VX_ifetch_rsp_if.slave ifetch_rsp_if,
|
||||
|
||||
|
@ -488,42 +484,6 @@ module VX_decode #(
|
|||
assign ifetch_rsp_if.ibuf_pop = decode_if.ibuf_pop;
|
||||
assign ifetch_rsp_if.ready = decode_if.ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_loads_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_stores_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_branches_per_cycle;
|
||||
|
||||
wire [`NUM_THREADS-1:0] perf_loads_per_thread = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && decode_if.wb}};
|
||||
wire [`NUM_THREADS-1:0] perf_stores_per_thread = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && ~decode_if.wb}};
|
||||
wire [`NUM_THREADS-1:0] perf_branches_per_thread = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_ALU && `INST_ALU_IS_BR(decode_if.op_mod)}};
|
||||
|
||||
`POP_COUNT(perf_loads_per_cycle, perf_loads_per_thread);
|
||||
`POP_COUNT(perf_stores_per_cycle, perf_stores_per_thread);
|
||||
`POP_COUNT(perf_branches_per_cycle, perf_branches_per_thread);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_branches;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_loads <= 0;
|
||||
perf_stores <= 0;
|
||||
perf_branches <= 0;
|
||||
end else begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_loads_per_cycle);
|
||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_stores_per_cycle);
|
||||
perf_branches <= perf_branches + `PERF_CTR_BITS'(perf_branches_per_cycle);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_decode_if.loads = perf_loads;
|
||||
assign perf_decode_if.stores = perf_stores;
|
||||
assign perf_decode_if.branches = perf_branches;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
|
|
|
@ -65,7 +65,7 @@ module VX_fetch #(
|
|||
`SCOPE_BIND_VX_fetch_icache_stage
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (reset),
|
||||
|
||||
.icache_rsp_if (icache_rsp_if),
|
||||
.icache_req_if (icache_req_if),
|
||||
|
|
|
@ -12,7 +12,7 @@ module VX_icache_stage #(
|
|||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
|
||||
// Icache interface
|
||||
VX_cache_req_if.master icache_req_if,
|
||||
VX_cache_rsp_if.slave icache_rsp_if,
|
||||
|
|
|
@ -205,7 +205,6 @@ module VX_lsu_unit #(
|
|||
.rsp_tag (mem_rsp_tag),
|
||||
.rsp_eop (mem_rsp_eop),
|
||||
.rsp_ready (mem_rsp_ready),
|
||||
`UNUSED_PIN (write_notify),
|
||||
|
||||
// Memory request
|
||||
.mem_req_valid (cache_req_tmp_if.valid),
|
||||
|
|
|
@ -583,41 +583,32 @@ module VX_mem_unit # (
|
|||
assign perf_memsys_if.smem_bank_stalls = 0;
|
||||
`endif
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_pending_reads <= 0;
|
||||
end else begin
|
||||
perf_mem_pending_reads <= perf_mem_pending_reads +
|
||||
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
|
||||
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw))));
|
||||
end
|
||||
end
|
||||
`ifdef L2_ENABLE
|
||||
assign perf_memsys_if.l2cache_reads = perf_l2cache_if.reads;
|
||||
assign perf_memsys_if.l2cache_writes = perf_l2cache_if.writes;
|
||||
assign perf_memsys_if.l2cache_read_misses = perf_l2cache_if.read_misses;
|
||||
assign perf_memsys_if.l2cache_write_misses= perf_l2cache_if.write_misses;
|
||||
assign perf_memsys_if.l2cache_bank_stalls = perf_l2cache_if.bank_stalls;
|
||||
assign perf_memsys_if.l2cache_mshr_stalls = perf_l2cache_if.mshr_stalls;
|
||||
`else
|
||||
assign perf_memsys_if.l2cache_reads = 0;
|
||||
assign perf_memsys_if.l2cache_writes = 0;
|
||||
assign perf_memsys_if.l2cache_read_misses = 0;
|
||||
assign perf_memsys_if.l2cache_write_misses= 0;
|
||||
assign perf_memsys_if.l2cache_bank_stalls = 0;
|
||||
assign perf_memsys_if.l2cache_mshr_stalls = 0;
|
||||
`endif
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
|
||||
assign perf_memsys_if.l3cache_reads = 0;
|
||||
assign perf_memsys_if.l3cache_writes = 0;
|
||||
assign perf_memsys_if.l3cache_read_misses = 0;
|
||||
assign perf_memsys_if.l3cache_write_misses= 0;
|
||||
assign perf_memsys_if.l3cache_bank_stalls = 0;
|
||||
assign perf_memsys_if.l3cache_mshr_stalls = 0;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_reads <= 0;
|
||||
perf_mem_writes <= 0;
|
||||
perf_mem_lat <= 0;
|
||||
end else begin
|
||||
if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin
|
||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin
|
||||
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_memsys_if.mem_reads = perf_mem_reads;
|
||||
assign perf_memsys_if.mem_writes = perf_mem_writes;
|
||||
assign perf_memsys_if.mem_latency = perf_mem_lat;
|
||||
assign perf_memsys_if.mem_reads = 0;
|
||||
assign perf_memsys_if.mem_writes = 0;
|
||||
assign perf_memsys_if.mem_latency = 0;
|
||||
|
||||
`endif
|
||||
|
||||
|
|
|
@ -18,9 +18,10 @@
|
|||
|
||||
`define DCR_MPM_CLASS_NONE 0
|
||||
`define DCR_MPM_CLASS_CORE 1
|
||||
`define DCR_MPM_CLASS_TEX 2
|
||||
`define DCR_MPM_CLASS_RASTER 3
|
||||
`define DCR_MPM_CLASS_ROP 4
|
||||
`define DCR_MPM_CLASS_MEM 2
|
||||
`define DCR_MPM_CLASS_TEX 3
|
||||
`define DCR_MPM_CLASS_RASTER 4
|
||||
`define DCR_MPM_CLASS_ROP 5
|
||||
|
||||
// User Floating-Point CSRs
|
||||
|
||||
|
@ -68,48 +69,77 @@
|
|||
`define CSR_MPM_FPU_ST_H 12'hB88
|
||||
`define CSR_MPM_GPU_ST 12'hB09
|
||||
`define CSR_MPM_GPU_ST_H 12'hB89
|
||||
// PERF: decode
|
||||
`define CSR_MPM_LOADS 12'hB0A
|
||||
`define CSR_MPM_LOADS_H 12'hB8A
|
||||
`define CSR_MPM_STORES 12'hB0B
|
||||
`define CSR_MPM_STORES_H 12'hB8B
|
||||
`define CSR_MPM_BRANCHES 12'hB0C
|
||||
`define CSR_MPM_BRANCHES_H 12'hB8C
|
||||
// PERF: icache
|
||||
`define CSR_MPM_ICACHE_READS 12'hB0D // total reads
|
||||
`define CSR_MPM_ICACHE_READS_H 12'hB8D
|
||||
`define CSR_MPM_ICACHE_MISS_R 12'hB0E // read misses
|
||||
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8E
|
||||
// PERF: dcache
|
||||
`define CSR_MPM_DCACHE_READS 12'hB0F // total reads
|
||||
`define CSR_MPM_DCACHE_READS_H 12'hB8F
|
||||
`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes
|
||||
`define CSR_MPM_DCACHE_WRITES_H 12'hB90
|
||||
`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses
|
||||
`define CSR_MPM_DCACHE_MISS_R_H 12'hB91
|
||||
`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses
|
||||
`define CSR_MPM_DCACHE_MISS_W_H 12'hB92
|
||||
`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts
|
||||
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93
|
||||
`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls
|
||||
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94
|
||||
// PERF: smem
|
||||
`define CSR_MPM_SMEM_READS 12'hB15 // total reads
|
||||
`define CSR_MPM_SMEM_READS_H 12'hB95
|
||||
`define CSR_MPM_SMEM_WRITES 12'hB16 // total writes
|
||||
`define CSR_MPM_SMEM_WRITES_H 12'hB96
|
||||
`define CSR_MPM_SMEM_BANK_ST 12'hB17 // bank conflicts
|
||||
`define CSR_MPM_SMEM_BANK_ST_H 12'hB97
|
||||
// PERF: memory
|
||||
`define CSR_MPM_MEM_READS 12'hB18 // memory reads
|
||||
`define CSR_MPM_MEM_READS_H 12'hB98
|
||||
`define CSR_MPM_MEM_WRITES 12'hB19 // memory writes
|
||||
`define CSR_MPM_MEM_WRITES_H 12'hB99
|
||||
`define CSR_MPM_MEM_LAT 12'hB1A // memory latency
|
||||
`define CSR_MPM_MEM_LAT_H 12'hB9A
|
||||
// PERF: wctl
|
||||
`define CSR_MPM_WCTL_ISSUE_ST 12'hB1B // issue stalls
|
||||
`define CSR_MPM_WCTL_ISSUE_ST_H 12'hB9B
|
||||
`define CSR_MPM_IFETCHES 12'hB0A
|
||||
`define CSR_MPM_IFETCHES_H 12'hB8A
|
||||
`define CSR_MPM_LOADS 12'hB0B
|
||||
`define CSR_MPM_LOADS_H 12'hB8B
|
||||
`define CSR_MPM_STORES 12'hB0C
|
||||
`define CSR_MPM_STORES_H 12'hB8C
|
||||
`define CSR_MPM_IFETCH_LAT 12'hB0D
|
||||
`define CSR_MPM_IFETCH_LAT_H 12'hB8D
|
||||
`define CSR_MPM_LOAD_LAT 12'hB0E
|
||||
`define CSR_MPM_LOAD_LAT_H 12'hB8E
|
||||
|
||||
// Machine Performance-monitoring memory counters
|
||||
// PERF: icache
|
||||
`define CSR_MPM_ICACHE_READS 12'hB03 // total reads
|
||||
`define CSR_MPM_ICACHE_READS_H 12'hB83
|
||||
`define CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
|
||||
`define CSR_MPM_ICACHE_MISS_R_H 12'hB84
|
||||
// PERF: dcache
|
||||
`define CSR_MPM_DCACHE_READS 12'hB05 // total reads
|
||||
`define CSR_MPM_DCACHE_READS_H 12'hB85
|
||||
`define CSR_MPM_DCACHE_WRITES 12'hB06 // total writes
|
||||
`define CSR_MPM_DCACHE_WRITES_H 12'hB86
|
||||
`define CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses
|
||||
`define CSR_MPM_DCACHE_MISS_R_H 12'hB87
|
||||
`define CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses
|
||||
`define CSR_MPM_DCACHE_MISS_W_H 12'hB88
|
||||
`define CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts
|
||||
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB89
|
||||
`define CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls
|
||||
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A
|
||||
// PERF: smem
|
||||
`define CSR_MPM_SMEM_READS 12'hB0B // memory reads
|
||||
`define CSR_MPM_SMEM_READS_H 12'hB8B
|
||||
`define CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
|
||||
`define CSR_MPM_SMEM_WRITES_H 12'hB8C
|
||||
`define CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
|
||||
`define CSR_MPM_SMEM_BANK_ST_H 12'hB8D
|
||||
// PERF: l2cache
|
||||
`define CSR_MPM_L2CACHE_READS 12'hB0E // total reads
|
||||
`define CSR_MPM_L2CACHE_READS_H 12'hB8E
|
||||
`define CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes
|
||||
`define CSR_MPM_L2CACHE_WRITES_H 12'hB8F
|
||||
`define CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses
|
||||
`define CSR_MPM_L2CACHE_MISS_R_H 12'hB90
|
||||
`define CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses
|
||||
`define CSR_MPM_L2CACHE_MISS_W_H 12'hB91
|
||||
`define CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts
|
||||
`define CSR_MPM_L2CACHE_BANK_ST_H 12'hB92
|
||||
`define CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls
|
||||
`define CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93
|
||||
// PERF: l3cache
|
||||
`define CSR_MPM_L3CACHE_READS 12'hB14 // total reads
|
||||
`define CSR_MPM_L3CACHE_READS_H 12'hB94
|
||||
`define CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes
|
||||
`define CSR_MPM_L3CACHE_WRITES_H 12'hB95
|
||||
`define CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses
|
||||
`define CSR_MPM_L3CACHE_MISS_R_H 12'hB96
|
||||
`define CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses
|
||||
`define CSR_MPM_L3CACHE_MISS_W_H 12'hB97
|
||||
`define CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts
|
||||
`define CSR_MPM_L3CACHE_BANK_ST_H 12'hB98
|
||||
`define CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls
|
||||
`define CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99
|
||||
// PERF: memory
|
||||
`define CSR_MPM_MEM_READS 12'hB1A // total reads
|
||||
`define CSR_MPM_MEM_READS_H 12'hB9A
|
||||
`define CSR_MPM_MEM_WRITES 12'hB1B // total writes
|
||||
`define CSR_MPM_MEM_WRITES_H 12'hB9B
|
||||
`define CSR_MPM_MEM_LAT 12'hB1C // memory latency
|
||||
`define CSR_MPM_MEM_LAT_H 12'hB9C
|
||||
|
||||
// Machine Performance-monitoring texture counters
|
||||
// PERF: texture unit
|
||||
|
|
|
@ -139,10 +139,10 @@ module VX_warp_sched #(
|
|||
stalled_warps[schedule_wid] <= 1;
|
||||
|
||||
// release wspawn
|
||||
use_wspawn[schedule_wid] <= 0;
|
||||
if (use_wspawn[schedule_wid]) begin
|
||||
thread_masks[schedule_wid] <= 1;
|
||||
end
|
||||
use_wspawn[schedule_wid] <= 0;
|
||||
|
||||
issued_instrs[schedule_wid] <= issued_instrs[schedule_wid] + `UP(`UUID_BITS)'(1);
|
||||
end
|
||||
|
@ -248,8 +248,8 @@ module VX_warp_sched #(
|
|||
|
||||
`ifdef SIMULATION
|
||||
assign instr_uuid = (issued_instrs[schedule_wid] * `NUM_WARPS * `NUM_CORES * `NUM_CLUSTERS)
|
||||
+ (schedule_wid * `NUM_CORES * `NUM_CLUSTERS)
|
||||
+ `UUID_BITS'(CORE_ID);
|
||||
+ (`NUM_WARPS * CORE_ID)
|
||||
+ `UUID_BITS'(schedule_wid);
|
||||
`else
|
||||
assign instr_uuid = 0;
|
||||
`endif
|
||||
|
|
|
@ -52,7 +52,7 @@ module Vortex (
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_memsys_if perf_memsys_if[`NUM_CLUSTERS]();
|
||||
VX_perf_memsys_if perf_memsys_total_if();
|
||||
VX_perf_memsys_if perf_memsys_total_if();
|
||||
VX_perf_cache_if perf_l3cache_if();
|
||||
`PERF_MEMSYS_ADD (perf_memsys_total_if, perf_memsys_if, `NUM_CLUSTERS);
|
||||
`endif
|
||||
|
@ -237,6 +237,62 @@ module Vortex (
|
|||
.mem_rsp_if (mem_rsp_if)
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
`ifdef L3_ENABLE
|
||||
assign perf_memsys_total_if.l3cache_reads = perf_l3cache_if.reads;
|
||||
assign perf_memsys_total_if.l3cache_writes = perf_l3cache_if.writes;
|
||||
assign perf_memsys_total_if.l3cache_read_misses = perf_l3cache_if.read_misses;
|
||||
assign perf_memsys_total_if.l3cache_write_misses= perf_l3cache_if.write_misses;
|
||||
assign perf_memsys_total_if.l3cache_bank_stalls = perf_l3cache_if.bank_stalls;
|
||||
assign perf_memsys_total_if.l3cache_mshr_stalls = perf_l3cache_if.mshr_stalls;
|
||||
`else
|
||||
assign perf_memsys_total_if.l3cache_reads = 0;
|
||||
assign perf_memsys_total_if.l3cache_writes = 0;
|
||||
assign perf_memsys_total_if.l3cache_read_misses = 0;
|
||||
assign perf_memsys_total_if.l3cache_write_misses= 0;
|
||||
assign perf_memsys_total_if.l3cache_bank_stalls = 0;
|
||||
assign perf_memsys_total_if.l3cache_mshr_stalls = 0;
|
||||
`endif
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_pending_reads <= 0;
|
||||
end else begin
|
||||
perf_mem_pending_reads <= perf_mem_pending_reads +
|
||||
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
|
||||
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw))));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_reads <= 0;
|
||||
perf_mem_writes <= 0;
|
||||
perf_mem_lat <= 0;
|
||||
end else begin
|
||||
if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin
|
||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin
|
||||
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_memsys_total_if.mem_reads = perf_mem_reads;
|
||||
assign perf_memsys_total_if.mem_writes = perf_mem_writes;
|
||||
assign perf_memsys_total_if.mem_latency = perf_mem_lat;
|
||||
|
||||
`endif
|
||||
|
||||
`SCOPE_ASSIGN (reset, reset);
|
||||
`SCOPE_ASSIGN (mem_req_fire, mem_req_valid && mem_req_ready);
|
||||
`SCOPE_ASSIGN (mem_req_addr, `TO_FULL_ADDR(mem_req_addr));
|
||||
|
|
6
hw/rtl/cache/VX_cache_define.vh
vendored
6
hw/rtl/cache/VX_cache_define.vh
vendored
|
@ -124,6 +124,12 @@
|
|||
`REDUCE_ADD (dst, src, smem_reads, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, smem_writes, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, smem_bank_stalls, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, l2cache_reads, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, l2cache_writes, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, l2cache_read_misses, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, l2cache_write_misses, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, l2cache_bank_stalls, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, l2cache_mshr_stalls, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, mem_reads, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, mem_writes, `PERF_CTR_BITS, count); \
|
||||
`REDUCE_ADD (dst, src, mem_latency, `PERF_CTR_BITS, count)
|
||||
|
|
|
@ -4,15 +4,32 @@ interface VX_perf_memsys_if ();
|
|||
|
||||
wire [`PERF_CTR_BITS-1:0] icache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_read_misses;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_write_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] smem_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] smem_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] smem_bank_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] l2cache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] l2cache_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] l2cache_read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] l2cache_write_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] l2cache_bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] l2cache_mshr_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] l3cache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] l3cache_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] l3cache_read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] l3cache_write_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] l3cache_bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] l3cache_mshr_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] mem_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_latency;
|
||||
|
@ -20,15 +37,32 @@ interface VX_perf_memsys_if ();
|
|||
modport master (
|
||||
output icache_reads,
|
||||
output icache_read_misses,
|
||||
|
||||
output dcache_reads,
|
||||
output dcache_writes,
|
||||
output dcache_read_misses,
|
||||
output dcache_write_misses,
|
||||
output dcache_bank_stalls,
|
||||
output dcache_mshr_stalls,
|
||||
|
||||
output smem_reads,
|
||||
output smem_writes,
|
||||
output smem_bank_stalls,
|
||||
|
||||
output l2cache_reads,
|
||||
output l2cache_writes,
|
||||
output l2cache_read_misses,
|
||||
output l2cache_write_misses,
|
||||
output l2cache_bank_stalls,
|
||||
output l2cache_mshr_stalls,
|
||||
|
||||
output l3cache_reads,
|
||||
output l3cache_writes,
|
||||
output l3cache_read_misses,
|
||||
output l3cache_write_misses,
|
||||
output l3cache_bank_stalls,
|
||||
output l3cache_mshr_stalls,
|
||||
|
||||
output mem_reads,
|
||||
output mem_writes,
|
||||
output mem_latency
|
||||
|
@ -37,15 +71,32 @@ interface VX_perf_memsys_if ();
|
|||
modport slave (
|
||||
input icache_reads,
|
||||
input icache_read_misses,
|
||||
|
||||
input dcache_reads,
|
||||
input dcache_writes,
|
||||
input dcache_read_misses,
|
||||
input dcache_write_misses,
|
||||
input dcache_bank_stalls,
|
||||
input dcache_mshr_stalls,
|
||||
|
||||
input smem_reads,
|
||||
input smem_writes,
|
||||
input smem_bank_stalls,
|
||||
|
||||
input l2cache_reads,
|
||||
input l2cache_writes,
|
||||
input l2cache_read_misses,
|
||||
input l2cache_write_misses,
|
||||
input l2cache_bank_stalls,
|
||||
input l2cache_mshr_stalls,
|
||||
|
||||
input l3cache_reads,
|
||||
input l3cache_writes,
|
||||
input l3cache_read_misses,
|
||||
input l3cache_write_misses,
|
||||
input l3cache_bank_stalls,
|
||||
input l3cache_mshr_stalls,
|
||||
|
||||
input mem_reads,
|
||||
input mem_writes,
|
||||
input mem_latency
|
||||
|
|
|
@ -1,10 +1,6 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
interface VX_perf_pipeline_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] loads;
|
||||
wire [`PERF_CTR_BITS-1:0] stores;
|
||||
wire [`PERF_CTR_BITS-1:0] branches;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] lsu_stalls;
|
||||
|
@ -15,11 +11,11 @@ interface VX_perf_pipeline_if ();
|
|||
`endif
|
||||
wire [`PERF_CTR_BITS-1:0] gpu_stalls;
|
||||
|
||||
modport decode (
|
||||
output loads,
|
||||
output stores,
|
||||
output branches
|
||||
);
|
||||
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||
wire [`PERF_CTR_BITS-1:0] loads;
|
||||
wire [`PERF_CTR_BITS-1:0] stores;
|
||||
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||
|
||||
modport issue (
|
||||
output ibf_stalls,
|
||||
|
@ -34,9 +30,6 @@ interface VX_perf_pipeline_if ();
|
|||
);
|
||||
|
||||
modport slave (
|
||||
input loads,
|
||||
input stores,
|
||||
input branches,
|
||||
input ibf_stalls,
|
||||
input scb_stalls,
|
||||
input lsu_stalls,
|
||||
|
@ -45,7 +38,12 @@ interface VX_perf_pipeline_if ();
|
|||
`ifdef EXT_F_ENABLE
|
||||
input fpu_stalls,
|
||||
`endif
|
||||
input gpu_stalls
|
||||
input gpu_stalls,
|
||||
input ifetches,
|
||||
input loads,
|
||||
input stores,
|
||||
input ifetch_latency,
|
||||
input load_latency
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
|
|
@ -33,6 +33,7 @@ module VX_mem_scheduler #(
|
|||
input wire [TAG_WIDTH-1:0] req_tag,
|
||||
output wire req_empty,
|
||||
output wire req_ready,
|
||||
output wire write_notify,
|
||||
|
||||
// Output response
|
||||
output wire rsp_valid,
|
||||
|
@ -41,7 +42,6 @@ module VX_mem_scheduler #(
|
|||
output wire [TAG_WIDTH-1:0] rsp_tag,
|
||||
output wire rsp_eop,
|
||||
input wire rsp_ready,
|
||||
output wire write_notify,
|
||||
|
||||
// Memory request
|
||||
output wire [NUM_BANKS-1:0] mem_req_valid,
|
||||
|
@ -308,7 +308,7 @@ module VX_mem_scheduler #(
|
|||
// Handle memory responses ////////////////////////////////////////////////
|
||||
|
||||
reg [REQ_SIZEW-1:0] rsp_rem_size [QUEUE_SIZE-1:0];
|
||||
wire [REQ_SIZEW-1:0] rsp_rem_size_n;
|
||||
wire [REQ_SIZEW-1:0] rsp_rem_size_n;
|
||||
wire [`UP(BATCH_SEL_BITS)-1:0] rsp_batch_idx;
|
||||
|
||||
// Select memory response
|
||||
|
@ -383,7 +383,7 @@ module VX_mem_scheduler #(
|
|||
|
||||
reg [NUM_BATCHES-1:0][NUM_BANKS-1:0][DATA_WIDTH-1:0] rsp_store [QUEUE_SIZE-1:0];
|
||||
wire [NUM_BATCHES-1:0][NUM_BANKS-1:0][DATA_WIDTH-1:0] rsp_store_n;
|
||||
reg [QUEUE_SIZE-1:0][NUM_REQS-1:0] rsp_orig_mask;
|
||||
reg [NUM_REQS-1:0] rsp_orig_mask [QUEUE_SIZE-1:0];
|
||||
|
||||
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
|
||||
for (genvar j = 0; j < NUM_BANKS; ++j) begin
|
||||
|
|
|
@ -274,6 +274,7 @@ module VX_raster_mem #(
|
|||
.req_tag (mem_req_tag),
|
||||
`UNUSED_PIN (req_empty),
|
||||
.req_ready (mem_req_ready),
|
||||
`UNUSED_PIN (write_notify),
|
||||
|
||||
// Output response
|
||||
.rsp_valid (mem_rsp_valid),
|
||||
|
@ -281,8 +282,7 @@ module VX_raster_mem #(
|
|||
.rsp_data (mem_rsp_data),
|
||||
.rsp_tag (mem_rsp_tag),
|
||||
`UNUSED_PIN (rsp_eop),
|
||||
.rsp_ready (mem_rsp_ready),
|
||||
`UNUSED_PIN (write_notify),
|
||||
.rsp_ready (mem_rsp_ready),
|
||||
|
||||
// Memory request
|
||||
.mem_req_valid (cache_req_if.valid),
|
||||
|
|
|
@ -259,11 +259,11 @@ module VX_raster_unit #(
|
|||
wire [$clog2(RCACHE_NUM_REQS+1)-1:0] perf_mem_rsp_per_cycle;
|
||||
wire [$clog2(RCACHE_NUM_REQS+1)+1-1:0] perf_pending_reads_cycle;
|
||||
|
||||
wire [RCACHE_NUM_REQS-1:0] perf_mem_req_per_req = cache_req_if.valid & cache_req_if.ready;
|
||||
wire [RCACHE_NUM_REQS-1:0] perf_mem_rsp_per_req = cache_rsp_if.valid & cache_rsp_if.ready;
|
||||
wire [RCACHE_NUM_REQS-1:0] perf_mem_req_fire = cache_req_if.valid & cache_req_if.ready;
|
||||
wire [RCACHE_NUM_REQS-1:0] perf_mem_rsp_fire = cache_rsp_if.valid & cache_rsp_if.ready;
|
||||
|
||||
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_req);
|
||||
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_req);
|
||||
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_fire);
|
||||
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_fire);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_pending_reads;
|
||||
assign perf_pending_reads_cycle = perf_mem_req_per_cycle - perf_mem_rsp_per_cycle;
|
||||
|
|
|
@ -34,6 +34,7 @@ module VX_rop_mem #(
|
|||
input wire [NUM_LANES-1:0] req_face,
|
||||
input wire [TAG_WIDTH-1:0] req_tag,
|
||||
output wire req_ready,
|
||||
output wire write_notify,
|
||||
|
||||
// Response interface
|
||||
output wire rsp_valid,
|
||||
|
@ -42,8 +43,7 @@ module VX_rop_mem #(
|
|||
output wire [NUM_LANES-1:0][`ROP_DEPTH_BITS-1:0] rsp_depth,
|
||||
output wire [NUM_LANES-1:0][`ROP_STENCIL_BITS-1:0] rsp_stencil,
|
||||
output wire [TAG_WIDTH-1:0] rsp_tag,
|
||||
input wire rsp_ready,
|
||||
output wire write_notify
|
||||
input wire rsp_ready
|
||||
);
|
||||
|
||||
localparam NUM_REQS = ROP_MEM_REQS;
|
||||
|
@ -214,6 +214,7 @@ module VX_rop_mem #(
|
|||
.req_tag (mreq_tag_r),
|
||||
`UNUSED_PIN (req_empty),
|
||||
.req_ready (mreq_ready_r),
|
||||
.write_notify (write_notify),
|
||||
|
||||
.rsp_valid (mrsp_valid),
|
||||
.rsp_mask (mrsp_mask),
|
||||
|
@ -221,7 +222,6 @@ module VX_rop_mem #(
|
|||
.rsp_tag (mrsp_tag),
|
||||
`UNUSED_PIN (rsp_eop),
|
||||
.rsp_ready (mrsp_ready),
|
||||
.write_notify (write_notify),
|
||||
|
||||
.mem_req_valid (cache_req_if.valid),
|
||||
.mem_req_rw (cache_req_if.rw),
|
||||
|
|
|
@ -84,6 +84,7 @@ module VX_rop_unit #(
|
|||
.req_face (mem_req_face_r),
|
||||
.req_tag (mem_req_tag_r),
|
||||
.req_ready (mem_req_ready_r),
|
||||
.write_notify (mem_write_notify),
|
||||
|
||||
.rsp_valid (mem_rsp_valid),
|
||||
.rsp_mask (mem_rsp_mask),
|
||||
|
@ -91,8 +92,7 @@ module VX_rop_unit #(
|
|||
.rsp_depth (mem_rsp_depth),
|
||||
.rsp_stencil (mem_rsp_stencil),
|
||||
.rsp_tag (mem_rsp_tag),
|
||||
.rsp_ready (mem_rsp_ready),
|
||||
.write_notify (mem_write_notify)
|
||||
.rsp_ready (mem_rsp_ready)
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
@ -293,13 +293,13 @@ module VX_rop_unit #(
|
|||
wire [$clog2(OCACHE_NUM_REQS+1)-1:0] perf_mem_rd_rsp_per_cycle;
|
||||
wire [$clog2(OCACHE_NUM_REQS+1)+1-1:0] perf_pending_reads_cycle;
|
||||
|
||||
wire [OCACHE_NUM_REQS-1:0] perf_mem_rd_req_per_mask = cache_req_if.valid & ~cache_req_if.rw & cache_req_if.ready;
|
||||
wire [OCACHE_NUM_REQS-1:0] perf_mem_wr_req_per_mask = cache_req_if.valid & cache_req_if.rw & cache_req_if.ready;
|
||||
wire [OCACHE_NUM_REQS-1:0] perf_mem_rd_rsp_per_mask = cache_rsp_if.valid & cache_rsp_if.ready;
|
||||
wire [OCACHE_NUM_REQS-1:0] perf_mem_rd_req_fire = cache_req_if.valid & ~cache_req_if.rw & cache_req_if.ready;
|
||||
wire [OCACHE_NUM_REQS-1:0] perf_mem_wr_req_fire = cache_req_if.valid & cache_req_if.rw & cache_req_if.ready;
|
||||
wire [OCACHE_NUM_REQS-1:0] perf_mem_rd_rsp_fire = cache_rsp_if.valid & cache_rsp_if.ready;
|
||||
|
||||
`POP_COUNT(perf_mem_rd_req_per_cycle, perf_mem_rd_req_per_mask);
|
||||
`POP_COUNT(perf_mem_wr_req_per_cycle, perf_mem_wr_req_per_mask);
|
||||
`POP_COUNT(perf_mem_rd_rsp_per_cycle, perf_mem_rd_rsp_per_mask);
|
||||
`POP_COUNT(perf_mem_rd_req_per_cycle, perf_mem_rd_req_fire);
|
||||
`POP_COUNT(perf_mem_wr_req_per_cycle, perf_mem_wr_req_fire);
|
||||
`POP_COUNT(perf_mem_rd_rsp_per_cycle, perf_mem_rd_rsp_fire);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_pending_reads;
|
||||
assign perf_pending_reads_cycle = perf_mem_rd_req_per_cycle - perf_mem_rd_rsp_per_cycle;
|
||||
|
|
|
@ -122,6 +122,7 @@ module VX_tex_mem #(
|
|||
.req_tag (mem_req_tag),
|
||||
`UNUSED_PIN (req_empty),
|
||||
.req_ready (mem_req_ready),
|
||||
`UNUSED_PIN (write_notify),
|
||||
|
||||
// Output response
|
||||
.rsp_valid (mem_rsp_valid),
|
||||
|
@ -130,7 +131,6 @@ module VX_tex_mem #(
|
|||
.rsp_tag (mem_rsp_tag),
|
||||
`UNUSED_PIN (rsp_eop),
|
||||
.rsp_ready (mem_rsp_ready),
|
||||
`UNUSED_PIN (write_notify),
|
||||
|
||||
// Memory request
|
||||
.mem_req_valid (cache_req_if.valid),
|
||||
|
|
|
@ -206,11 +206,11 @@ module VX_tex_unit #(
|
|||
wire [$clog2(TCACHE_NUM_REQS+1)-1:0] perf_mem_rsp_per_cycle;
|
||||
wire [$clog2(TCACHE_NUM_REQS+1)+1-1:0] perf_pending_reads_cycle;
|
||||
|
||||
wire [TCACHE_NUM_REQS-1:0] perf_mem_req_per_req = cache_req_if.valid & cache_req_if.ready;
|
||||
wire [TCACHE_NUM_REQS-1:0] perf_mem_rsp_per_req = cache_rsp_if.valid & cache_rsp_if.ready;
|
||||
wire [TCACHE_NUM_REQS-1:0] perf_mem_req_fire = cache_req_if.valid & cache_req_if.ready;
|
||||
wire [TCACHE_NUM_REQS-1:0] perf_mem_rsp_fire = cache_rsp_if.valid & cache_rsp_if.ready;
|
||||
|
||||
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_req);
|
||||
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_req);
|
||||
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_fire);
|
||||
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_fire);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_pending_reads;
|
||||
assign perf_pending_reads_cycle = perf_mem_req_per_cycle - perf_mem_rsp_per_cycle;
|
||||
|
|
|
@ -21,7 +21,7 @@ set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON
|
|||
set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON
|
||||
set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON
|
||||
set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON
|
||||
set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON
|
||||
#set_global_assignment -name TIMEQUEST_DO_CCPP_REMOVAL ON
|
||||
|
||||
set_global_assignment -name USE_HIGH_SPEED_ADDER ON
|
||||
set_global_assignment -name MUX_RESTRUCTURE ON
|
||||
|
@ -29,7 +29,7 @@ set_global_assignment -name ADV_NETLIST_OPT_SYNTH_WYSIWYG_REMAP ON
|
|||
set_global_assignment -name PROGRAMMABLE_POWER_TECHNOLOGY_SETTING "FORCE ALL TILES WITH FAILING TIMING PATHS TO HIGH SPEED"
|
||||
set_global_assignment -name PHYSICAL_SYNTHESIS_COMBO_LOGIC ON
|
||||
set_global_assignment -name PHYSICAL_SYNTHESIS_REGISTER_RETIMING ON
|
||||
set_global_assignment -name PHYSICAL_SYNTHESIS_REGISTER_DUPLICATION ON
|
||||
#set_global_assignment -name PHYSICAL_SYNTHESIS_REGISTER_DUPLICATION ON
|
||||
|
||||
set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0
|
||||
set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100
|
||||
|
|
|
@ -6,7 +6,11 @@ set -e
|
|||
WIDTH=1920
|
||||
HEIGHT=1080
|
||||
|
||||
LOG_FILE=./perf/draw3d/perf_${DEVICE_FAMILY}_${WIDTH}_${HEIGHT}.log
|
||||
TOKEN=${1:-}_${DEVICE_FAMILY}_${WIDTH}x${HEIGHT}
|
||||
|
||||
LOG_DIR=./perf/draw3d
|
||||
|
||||
LOG_FILE=${LOG_DIR}/perf_${TOKEN}.log
|
||||
|
||||
declare -a traces=(vase filmtv skybox coverflow evilskull polybump tekkaman carnival)
|
||||
|
||||
|
|
|
@ -201,12 +201,12 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t fpu_stalls = 0;
|
||||
uint64_t csr_stalls = 0;
|
||||
uint64_t alu_stalls = 0;
|
||||
uint64_t gpu_stalls = 0;
|
||||
uint64_t wctl_issue_stalls = 0;
|
||||
// PERF: decode
|
||||
uint64_t gpu_stalls = 0;
|
||||
uint64_t ifetches = 0;
|
||||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
uint64_t branches = 0;
|
||||
uint64_t ifetch_lat = 0;
|
||||
uint64_t load_lat = 0;
|
||||
// PERF: Icache
|
||||
uint64_t icache_reads = 0;
|
||||
uint64_t icache_read_misses = 0;
|
||||
|
@ -221,6 +221,20 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t smem_reads = 0;
|
||||
uint64_t smem_writes = 0;
|
||||
uint64_t smem_bank_stalls = 0;
|
||||
// PERF: l2cache
|
||||
uint64_t l2cache_reads = 0;
|
||||
uint64_t l2cache_writes = 0;
|
||||
uint64_t l2cache_read_misses = 0;
|
||||
uint64_t l2cache_write_misses = 0;
|
||||
uint64_t l2cache_bank_stalls = 0;
|
||||
uint64_t l2cache_mshr_stalls = 0;
|
||||
// PERF: l3cache
|
||||
uint64_t l3cache_reads = 0;
|
||||
uint64_t l3cache_writes = 0;
|
||||
uint64_t l3cache_read_misses = 0;
|
||||
uint64_t l3cache_write_misses = 0;
|
||||
uint64_t l3cache_bank_stalls = 0;
|
||||
uint64_t l3cache_mshr_stalls = 0;
|
||||
// PERF: memory
|
||||
uint64_t mem_reads = 0;
|
||||
uint64_t mem_writes = 0;
|
||||
|
@ -327,12 +341,11 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t gpu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_GPU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, gpu_stalls_per_core);
|
||||
gpu_stalls += gpu_stalls_per_core;
|
||||
// wctl_stall
|
||||
uint64_t wctl_issue_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_WCTL_ISSUE_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: wctl issue stalls=%ld\n", core_id, wctl_issue_stalls_per_core);
|
||||
wctl_issue_stalls += wctl_issue_stalls_per_core;
|
||||
|
||||
// PERF: decode
|
||||
// PERF: memory
|
||||
// ifetches
|
||||
uint64_t ifetches_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
||||
ifetches += ifetches_per_core;
|
||||
// loads
|
||||
uint64_t loads_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||
|
@ -341,63 +354,57 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t stores_per_core = get_csr_64(staging_ptr, CSR_MPM_STORES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
|
||||
stores += stores_per_core;
|
||||
// branches
|
||||
uint64_t branches_per_core = get_csr_64(staging_ptr, CSR_MPM_BRANCHES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: branches=%ld\n", core_id, branches_per_core);
|
||||
branches += branches_per_core;
|
||||
|
||||
// PERF: Icache
|
||||
// total reads
|
||||
// ifetch latency
|
||||
uint64_t ifetch_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_IFETCH_LAT);
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = (int)(double(ifetch_lat_per_core) / double(ifetches_per_core));
|
||||
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
}
|
||||
ifetch_lat += ifetch_lat_per_core;
|
||||
// load latency
|
||||
uint64_t load_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_LOAD_LAT);
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = (int)(double(load_lat_per_core) / double(loads_per_core));
|
||||
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
}
|
||||
load_lat += load_lat_per_core;
|
||||
} break;
|
||||
case DCR_MPM_CLASS_MEM: {
|
||||
if (0 == core_id) {
|
||||
// PERF: Icache
|
||||
icache_reads = get_csr_64(staging_ptr, CSR_MPM_ICACHE_READS);
|
||||
}
|
||||
// read misses
|
||||
if (0 == core_id) {
|
||||
icache_read_misses = get_csr_64(staging_ptr, CSR_MPM_ICACHE_MISS_R);
|
||||
}
|
||||
|
||||
// PERF: Dcache
|
||||
// total reads
|
||||
if (0 == core_id) {
|
||||
|
||||
// PERF: Dcache
|
||||
dcache_reads = get_csr_64(staging_ptr, CSR_MPM_DCACHE_READS);
|
||||
}
|
||||
// total write
|
||||
if (0 == core_id) {
|
||||
dcache_writes = get_csr_64(staging_ptr, CSR_MPM_DCACHE_WRITES);
|
||||
}
|
||||
// read misses
|
||||
if (0 == core_id) {
|
||||
dcache_read_misses = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MISS_R);
|
||||
}
|
||||
// read misses
|
||||
if (0 == core_id) {
|
||||
dcache_write_misses = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MISS_W);
|
||||
}
|
||||
// bank_stalls
|
||||
if (0 == core_id) {
|
||||
dcache_bank_stalls = get_csr_64(staging_ptr, CSR_MPM_DCACHE_BANK_ST);
|
||||
}
|
||||
// mshr_stalls
|
||||
if (0 == core_id) {
|
||||
dcache_mshr_stalls = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MSHR_ST);
|
||||
}
|
||||
|
||||
// PERF: SMEM
|
||||
// total reads
|
||||
if (0 == core_id) {
|
||||
smem_reads = get_csr_64(staging_ptr, CSR_MPM_SMEM_READS);
|
||||
}
|
||||
// total write
|
||||
if (0 == core_id) {
|
||||
|
||||
// PERF: smem
|
||||
smem_reads = get_csr_64(staging_ptr, CSR_MPM_SMEM_READS);
|
||||
smem_writes = get_csr_64(staging_ptr, CSR_MPM_SMEM_WRITES);
|
||||
}
|
||||
// bank_stalls
|
||||
if (0 == core_id) {
|
||||
smem_bank_stalls = get_csr_64(staging_ptr, CSR_MPM_SMEM_BANK_ST);
|
||||
}
|
||||
|
||||
// PERF: memory
|
||||
if (0 == core_id) {
|
||||
|
||||
// PERF: L2cache
|
||||
l2cache_reads = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_READS);
|
||||
l2cache_writes = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_WRITES);
|
||||
l2cache_read_misses = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_MISS_R);
|
||||
l2cache_write_misses = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_MISS_W);
|
||||
l2cache_bank_stalls = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_BANK_ST);
|
||||
l2cache_mshr_stalls = get_csr_64(staging_ptr, CSR_MPM_L2CACHE_MSHR_ST);
|
||||
|
||||
// PERF: L3cache
|
||||
l3cache_reads = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_READS);
|
||||
l3cache_writes = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_WRITES);
|
||||
l3cache_read_misses = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_MISS_R);
|
||||
l3cache_write_misses = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_MISS_W);
|
||||
l3cache_bank_stalls = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_BANK_ST);
|
||||
l3cache_mshr_stalls = get_csr_64(staging_ptr, CSR_MPM_L3CACHE_MSHR_ST);
|
||||
|
||||
// PERF: memory
|
||||
mem_reads = get_csr_64(staging_ptr, CSR_MPM_MEM_READS);
|
||||
mem_writes = get_csr_64(staging_ptr, CSR_MPM_MEM_WRITES);
|
||||
mem_lat = get_csr_64(staging_ptr, CSR_MPM_MEM_LAT);
|
||||
|
@ -470,13 +477,9 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
|
||||
#ifdef PERF_ENABLE
|
||||
switch (perf_class) {
|
||||
case DCR_MPM_CLASS_CORE: {
|
||||
int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
|
||||
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
|
||||
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
|
||||
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
|
||||
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
|
||||
case DCR_MPM_CLASS_CORE: {
|
||||
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
|
||||
int load_avg_lat = (int)(double(load_lat) / double(loads));
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
|
||||
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
|
||||
|
@ -484,10 +487,26 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
fprintf(stream, "PERF: csr unit stalls=%ld\n", csr_stalls);
|
||||
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
|
||||
fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls);
|
||||
fprintf(stream, "PERF: wctl issue stalls=%ld\n", wctl_issue_stalls);
|
||||
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||
fprintf(stream, "PERF: branches=%ld\n", branches);
|
||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||
fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
|
||||
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
|
||||
|
||||
} break;
|
||||
case DCR_MPM_CLASS_MEM: {
|
||||
int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
|
||||
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
|
||||
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
|
||||
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
|
||||
int l2cache_read_hit_ratio = (int)((1.0 - (double(l2cache_read_misses) / double(l2cache_reads))) * 100);
|
||||
int l2cache_write_hit_ratio = (int)((1.0 - (double(l2cache_write_misses) / double(l2cache_writes))) * 100);
|
||||
int l2cache_bank_utilization = (int)((double(l2cache_reads + l2cache_writes) / double(l2cache_reads + l2cache_writes + l2cache_bank_stalls)) * 100);
|
||||
int l3cache_read_hit_ratio = (int)((1.0 - (double(l3cache_read_misses) / double(l3cache_reads))) * 100);
|
||||
int l3cache_write_hit_ratio = (int)((1.0 - (double(l3cache_write_misses) / double(l3cache_writes))) * 100);
|
||||
int l3cache_bank_utilization = (int)((double(l3cache_reads + l3cache_writes) / double(l3cache_reads + l3cache_writes + l3cache_bank_stalls)) * 100);
|
||||
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
|
||||
fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
|
||||
fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
|
||||
|
@ -499,14 +518,26 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
|
||||
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
|
||||
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
|
||||
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
|
||||
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
|
||||
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
|
||||
fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
|
||||
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
|
||||
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
|
||||
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
|
||||
fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat);
|
||||
} break;
|
||||
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
|
||||
} break;
|
||||
case DCR_MPM_CLASS_TEX: {
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
int tex_avg_lat = (int)(double(tex_mem_lat) / double(tex_mem_reads));
|
||||
fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads);
|
||||
fprintf(stream, "PERF: tex memory average latency=%d cycles\n", tex_avg_lat);
|
||||
fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat);
|
||||
fprintf(stream, "PERF: tex issue stalls=%ld\n", tex_issue_stalls);
|
||||
int tcache_read_hit_ratio = (int)((1.0 - (double(tcache_read_misses) / double(tcache_reads))) * 100);
|
||||
int tcache_bank_utilization = (int)((double(tcache_reads) / double(tcache_reads + tcache_bank_stalls)) * 100);
|
||||
|
@ -539,7 +570,7 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
int rop_stall_cycles_ratio = (int)(100 * double(rop_stall_cycles) / cycles);
|
||||
fprintf(stream, "PERF: rop memory reads=%ld\n", rop_mem_reads);
|
||||
fprintf(stream, "PERF: rop memory writes=%ld\n", rop_mem_writes);
|
||||
fprintf(stream, "PERF: rop memory average latency=%d cycles\n", rop_mem_avg_lat);
|
||||
fprintf(stream, "PERF: rop memory latency=%d cycles\n", rop_mem_avg_lat);
|
||||
fprintf(stream, "PERF: rop stall cycles=%ld cycles (%d%%)\n", rop_stall_cycles, rop_stall_cycles_ratio);
|
||||
fprintf(stream, "PERF: rop issue stalls=%ld\n", rop_issue_stalls);
|
||||
// cache perf counters
|
||||
|
|
|
@ -424,14 +424,13 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
return tid + (wid * arch_.num_threads());
|
||||
case CSR_GTID:
|
||||
// Processor threadID
|
||||
return tid + (wid * arch_.num_threads()) +
|
||||
(arch_.num_threads() * arch_.num_warps() * core_id_);
|
||||
return (core_id_ * arch_.num_warps() + wid) * arch_.num_threads() + tid;
|
||||
case CSR_LWID:
|
||||
// Core warpID
|
||||
return wid;
|
||||
case CSR_GWID:
|
||||
// Processor warpID
|
||||
return wid + (arch_.num_warps() * core_id_);
|
||||
return core_id_ * arch_.num_warps() + wid;
|
||||
case CSR_GCID:
|
||||
// Processor coreID
|
||||
return core_id_;
|
||||
|
|
|
@ -42,7 +42,7 @@ void Warp::clear() {
|
|||
pipeline_trace_t* Warp::eval() {
|
||||
assert(tmask_.any());
|
||||
|
||||
uint64_t uuid = ((issued_instrs_++ * arch_.num_warps() + warp_id_) * arch_.num_cores()) + core_->id();
|
||||
uint64_t uuid = (issued_instrs_++ * arch_.num_cores() + core_->id()) * arch_.num_warps() + warp_id_;
|
||||
|
||||
DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << warp_id_ << ", tmask=");
|
||||
for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue