memory mem_coalescer miss perf counter

RTL perf counters refactoring
tinebp 2024-12-26 08:00:36 -08:00
parent f478bdcf25
commit 704f525fd6
41 changed files with 581 additions and 521 deletions

View file

@ -24,7 +24,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
input sysmem_perf_t sysmem_perf,
`endif
// DCRs
@ -43,12 +43,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
cache_perf_t l2_perf;
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.l2cache = l2_perf;
end
`endif
`ifdef GBAR_ENABLE
@ -111,7 +111,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.clk (clk),
.reset (l2_reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.l2cache),
.cache_perf (l2_perf),
`endif
.core_bus_if (per_socket_mem_bus_if),
.mem_bus_if (mem_bus_if)
@ -140,7 +140,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.reset (socket_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.sysmem_perf (sysmem_perf_tmp),
`endif
.dcr_bus_if (socket_dcr_bus_if),

View file

@ -329,19 +329,19 @@
VX_edge_trigger #( \
.POS (0), \
.INIT (0) \
) __``dst``__ ( \
) __neg_edge`__LINE__ ( \
.clk (clk), \
.reset (1'b0), \
.data_in (src), \
.data_out (dst) \
)
`define BUFFER_EX(dst, src, ena, RSTW, latency) \
`define BUFFER_EX(dst, src, ena, resetw, latency) \
VX_pipe_register #( \
.DATAW ($bits(dst)), \
.RESETW (RSTW), \
.RESETW (resetw), \
.DEPTH (latency) \
) __``dst``__ ( \
) __buffer_ex`__LINE__ ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
@ -349,13 +349,13 @@
.data_out (dst) \
)
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 0, 1)
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, $bits(dst), 1)
`define POP_COUNT_EX(out, in, model) \
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out``__ ( \
) __pop_count_ex`__LINE__ ( \
.data_in (in), \
.data_out (out) \
)
@ -482,7 +482,7 @@
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_field[__i] = src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
__reduce_add_i_field, \
__reduce_add_o_field \
); \

View file

@ -73,6 +73,17 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
} cache_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
logic [`PERF_CTR_BITS-1:0] bank_stalls;
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
} lmem_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] misses;
} coalescer_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
@ -92,6 +103,26 @@ package VX_gpu_pkg;
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t;
typedef struct packed {
cache_perf_t icache;
cache_perf_t dcache;
cache_perf_t l2cache;
cache_perf_t l3cache;
lmem_perf_t lmem;
coalescer_perf_t coalescer;
mem_perf_t mem;
} sysmem_perf_t;
typedef struct packed {
sched_perf_t sched;
issue_perf_t issue;
logic [`PERF_CTR_BITS-1:0] ifetches;
logic [`PERF_CTR_BITS-1:0] loads;
logic [`PERF_CTR_BITS-1:0] stores;
logic [`PERF_CTR_BITS-1:0] ifetch_latency;
logic [`PERF_CTR_BITS-1:0] load_latency;
} pipeline_perf_t;
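The packed structs above replace the old VX_mem_perf_if / VX_pipeline_perf_if interfaces: sysmem_perf_t flattens every memory-side counter group into one record that is passed down the hierarchy, and each level overrides only the fields it owns. A minimal C++ sketch of that pattern (names are illustrative, not taken from the codebase):

```cpp
#include <cstdint>

struct CachePerf     { uint64_t reads{}, writes{}, read_misses{}, write_misses{},
                       bank_stalls{}, mshr_stalls{}, mem_stalls{}, crsp_stalls{}; };
struct LmemPerf      { uint64_t reads{}, writes{}, bank_stalls{}, crsp_stalls{}; };
struct CoalescerPerf { uint64_t misses{}; };
struct MemPerf       { uint64_t reads{}, writes{}, latency{}; };

// One flat record for the whole memory system, like sysmem_perf_t.
struct SysMemPerf {
  CachePerf     icache, dcache, l2cache, l3cache;
  LmemPerf      lmem;
  CoalescerPerf coalescer;
  MemPerf       mem;
};

// Each level copies the incoming record and patches only its own fields,
// mirroring the `always @(*)` override blocks added in VX_cluster, VX_socket
// and VX_core.
SysMemPerf socket_view(SysMemPerf up, const CachePerf& icache, const CachePerf& dcache) {
  up.icache = icache; // filled in at the socket level
  up.dcache = dcache;
  return up;          // every other field passes through unchanged
}
```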
//////////////////////// instruction arguments ////////////////////////////
typedef struct packed {
@ -145,6 +176,7 @@ package VX_gpu_pkg;
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
localparam LMEM_TAG_WIDTH = LSU_TAG_WIDTH + `CLOG2(`NUM_LSU_BLOCKS);
////////////////////////// Icache Parameters //////////////////////////////

View file

@ -24,7 +24,7 @@ module VX_socket import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
input sysmem_perf_t sysmem_perf,
`endif
// DCRs
@ -63,11 +63,13 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
cache_perf_t icache_perf, dcache_perf;
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.icache = icache_perf;
sysmem_perf_tmp.dcache = dcache_perf;
end
`endif
///////////////////////////////////////////////////////////////////////////
@ -110,7 +112,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_BUF (2)
) icache (
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.icache),
.cache_perf (icache_perf),
`endif
.clk (clk),
.reset (icache_reset),
@ -160,7 +162,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_BUF (2)
) dcache (
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.dcache),
.cache_perf (dcache_perf),
`endif
.clk (clk),
.reset (dcache_reset),
@ -187,6 +189,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_arb #(
.NUM_INPUTS (2),
.NUM_OUTPUTS(1),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX(0),
@ -234,7 +237,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.reset (core_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.sysmem_perf (sysmem_perf_tmp),
`endif
.dcr_bus_if (core_dcr_bus_if),

View file

@ -177,6 +177,9 @@
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
// PERF: coalescer
`define VX_CSR_MPM_COALESCE_MISS 12'hB1E // coalescer misses
`define VX_CSR_MPM_COALESCE_MISS_H 12'hB9E
// Machine Performance-monitoring memory counters (class 3) ///////////////////
// <Add your own counters: use addresses hB03..B1F, hB83..hB9F>
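Like the other MPM counters, the new coalescer-miss counter is exposed as a low/high CSR pair, with the high word 0x80 above the low word (0xB1E / 0xB9E above). A sketch of how software on a 32-bit build could stitch the two reads back into one 64-bit value; `read_csr32` is a hypothetical accessor, not an API from this repository:

```cpp
#include <cstdint>

uint32_t read_csr32(uint32_t addr); // assumed accessor: returns one 32-bit CSR word

// Assemble a 64-bit MPM counter from its low/high pair, re-reading the high
// word in case the low word wrapped between the two accesses.
uint64_t read_mpm_counter(uint32_t lo_addr) {
  const uint32_t hi_addr = lo_addr + 0x80; // the _H registers sit 0x80 above the low word
  uint64_t hi, lo;
  do {
    hi = read_csr32(hi_addr);
    lo = read_csr32(lo_addr);
  } while (hi != read_csr32(hi_addr));
  return (hi << 32) | lo;
}
```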

View file

@ -50,11 +50,14 @@ module Vortex import VX_gpu_pkg::*; (
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.lmem = 'x;
cache_perf_t l3_perf;
mem_perf_t mem_perf;
sysmem_perf_t sysmem_perf;
always @(*) begin
sysmem_perf = '0;
sysmem_perf.l3cache = l3_perf;
sysmem_perf.mem = mem_perf;
end
`endif
VX_mem_bus_if #(
@ -98,7 +101,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (l3_reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_if.l3cache),
.cache_perf (l3_perf),
`endif
.core_bus_if (per_cluster_mem_bus_if),
@ -146,7 +149,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (cluster_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.sysmem_perf (sysmem_perf),
`endif
.dcr_bus_if (cluster_dcr_bus_if),
@ -182,7 +185,6 @@ module Vortex import VX_gpu_pkg::*; (
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin
if (reset) begin
@ -202,7 +204,6 @@ module Vortex import VX_gpu_pkg::*; (
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end
end
assign mem_perf_if.mem = mem_perf;
`endif

View file

@ -620,6 +620,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
VX_mem_arb #(
.NUM_INPUTS (2),
.NUM_OUTPUTS (1),
.DATA_SIZE (LMEM_DATA_SIZE),
.ADDR_WIDTH (CCI_VX_ADDR_WIDTH),
.TAG_WIDTH (CCI_VX_TAG_WIDTH),
@ -1097,7 +1098,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire vx_mem_req_fire = vx_mem_req_valid[0] && vx_mem_req_ready[0];
wire vx_mem_rsp_fire = vx_mem_rsp_valid[0] && vx_mem_rsp_ready[0];
wire avs_req_fire = (avs_write[0] || avs_read[0]) && ~avs_waitrequest[0];
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, {
vx_reset,

View file

@ -330,7 +330,7 @@ module VX_afu_wrap #(
wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0];
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0];
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, {
ap_reset,

View file

@ -210,7 +210,59 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
end
`ifdef PERF_ENABLE
assign cache_perf = '0;
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
assign perf_core_reads_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && ~core_bus_if[i].req_data.rw;
assign perf_core_writes_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && core_bus_if[i].req_data.rw;
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
end
// per cycle: core reads, core writes, memory stalls, core response stalls
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin
perf_core_reads <= '0;
perf_core_writes <= '0;
perf_mem_stalls <= '0;
perf_crsp_stalls <= '0;
end else begin
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
end
end
assign cache_perf.reads = perf_core_reads;
assign cache_perf.writes = perf_core_writes;
assign cache_perf.read_misses = '0;
assign cache_perf.write_misses = '0;
assign cache_perf.bank_stalls = '0;
assign cache_perf.mshr_stalls = '0;
assign cache_perf.mem_stalls = perf_mem_stalls;
assign cache_perf.crsp_stalls = perf_crsp_stalls;
`endif
end
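The bypass path above has no banks or MSHRs, so the only meaningful counters are request counts and stall cycles, accumulated from per-cycle popcounts of the handshake and back-pressure bits. A behavioral C++ sketch of that accumulation (names are illustrative):

```cpp
#include <bitset>
#include <cstddef>
#include <cstdint>

struct BypassPerf {
  uint64_t core_reads = 0, core_writes = 0, mem_stalls = 0, crsp_stalls = 0;

  template <size_t NUM_REQS, size_t MEM_PORTS>
  void tick(const std::bitset<NUM_REQS>& read_fire,
            const std::bitset<NUM_REQS>& write_fire,
            const std::bitset<NUM_REQS>& crsp_stall,
            const std::bitset<MEM_PORTS>& mem_stall) {
    core_reads  += read_fire.count();   // popcount of per-request fire bits,
    core_writes += write_fire.count();  // like `POP_COUNT + `PERF_CTR_BITS'(...)
    crsp_stalls += crsp_stall.count();
    mem_stalls  += mem_stall.count();
  }
};
```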
@ -220,13 +272,13 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
if (core_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.tag.uuid))
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.tag.uuid))
end
end
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, i, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
end
end
end

View file

@ -101,7 +101,7 @@ module VX_commit import VX_gpu_pkg::*; #(
.data_out ({commit_fire_any_r, commit_size_r})
);
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (COMMIT_SIZEW),
.DATAW_OUT (COMMIT_ALL_SIZEW),
.N (`ISSUE_WIDTH),

View file

@ -28,7 +28,7 @@ module VX_core import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
input sysmem_perf_t sysmem_perf,
`endif
VX_dcr_bus_if.slave dcr_bus_if,
@ -65,14 +65,15 @@ module VX_core import VX_gpu_pkg::*; #(
) lsu_mem_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
VX_pipeline_perf_if pipeline_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
lmem_perf_t lmem_perf;
coalescer_perf_t coalescer_perf;
pipeline_perf_t pipeline_perf;
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.lmem = lmem_perf;
sysmem_perf_tmp.coalescer = coalescer_perf;
end
`endif
base_dcrs_t base_dcrs;
@ -94,7 +95,7 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.sched_perf (pipeline_perf_if.sched),
.sched_perf (pipeline_perf.sched),
`endif
.base_dcrs (base_dcrs),
@ -144,7 +145,7 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.issue_perf (pipeline_perf_if.issue),
.issue_perf (pipeline_perf.issue),
`endif
.decode_if (decode_if),
@ -162,8 +163,8 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if),
.sysmem_perf (sysmem_perf_tmp),
.pipeline_perf (pipeline_perf),
`endif
.base_dcrs (base_dcrs),
@ -200,7 +201,8 @@ module VX_core import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.lmem_perf (mem_perf_tmp_if.lmem),
.lmem_perf (lmem_perf),
.coalescer_perf(coalescer_perf),
`endif
.lsu_mem_if (lsu_mem_if),
.dcache_bus_if (dcache_bus_if)
@ -276,12 +278,11 @@ module VX_core import VX_gpu_pkg::*; #(
end
end
assign pipeline_perf_if.ifetches = perf_ifetches;
assign pipeline_perf_if.loads = perf_loads;
assign pipeline_perf_if.stores = perf_stores;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf.ifetches = perf_ifetches;
assign pipeline_perf.loads = perf_loads;
assign pipeline_perf.stores = perf_stores;
assign pipeline_perf.ifetch_latency = perf_icache_lat;
assign pipeline_perf.load_latency = perf_dcache_lat;
`endif

View file

@ -127,13 +127,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_rsp_ready = icache_bus_if.rsp_ready;
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0;
assign mem_perf_if.lmem = '0;
assign mem_perf_if.mem = '0;
sysmem_perf_t mem_perf;
assign mem_perf.icache = '0;
assign mem_perf.dcache = '0;
assign mem_perf.l2cache = '0;
assign mem_perf.l3cache = '0;
assign mem_perf.lmem = '0;
assign mem_perf.mem = '0;
`endif
`ifdef SCOPE
@ -152,7 +152,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.sysmem_perf (sysmem_perf),
`endif
.dcr_bus_if (dcr_bus_if),

View file

@ -41,8 +41,8 @@ import VX_fpu_pkg::*;
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
VX_commit_csr_if.slave commit_csr_if,
@ -212,65 +212,67 @@ import VX_fpu_pkg::*;
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf_if.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf_if.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf_if.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf_if.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf_if.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_FPU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]);
`else
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
`endif
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf_if.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf_if.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf_if.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf_if.load_latency);
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf.load_latency);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, mem_perf_if.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, mem_perf_if.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, mem_perf_if.icache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
// PERF: dcache
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, mem_perf_if.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, mem_perf_if.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, mem_perf_if.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, mem_perf_if.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, mem_perf_if.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, mem_perf_if.dcache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls);
// PERF: lmem
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, mem_perf_if.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, mem_perf_if.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, mem_perf_if.lmem.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
// PERF: l2cache
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, mem_perf_if.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, mem_perf_if.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, mem_perf_if.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, mem_perf_if.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l2cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls);
// PERF: l3cache
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, mem_perf_if.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, mem_perf_if.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, mem_perf_if.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, mem_perf_if.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l3cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, mem_perf_if.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, mem_perf_if.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, mem_perf_if.mem.latency);
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency);
// PERF: coalescer
`CSR_READ_64(`VX_CSR_MPM_COALESCE_MISS, read_data_ro_w, sysmem_perf.coalescer.misses);
default:;
endcase
end
@ -290,8 +292,8 @@ import VX_fpu_pkg::*;
`RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE
`UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.lmem);
`UNUSED_VAR (sysmem_perf.icache);
`UNUSED_VAR (sysmem_perf.lmem);
`endif
endmodule

View file

@ -24,8 +24,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
`ifdef EXT_F_ENABLE
@ -82,8 +82,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.base_dcrs (base_dcrs),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
`endif
.commit_csr_if (commit_csr_if),

View file

@ -23,8 +23,8 @@ module VX_execute import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
input base_dcrs_t base_dcrs,
@ -93,8 +93,8 @@ module VX_execute import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if (pipeline_perf_if),
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
`endif
.base_dcrs (base_dcrs),
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),

View file

@ -137,6 +137,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 1, 6, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +

View file

@ -93,6 +93,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
`SCOPE_IO_SWITCH (1);
wire decode_fire = decode_if.valid && decode_if.ready;
wire operands_fire = operands_if.valid && operands_if.ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 2, 4, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 +

View file

@ -535,6 +535,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`ifdef SCOPE
`ifdef DBG_SCOPE_LSU
`SCOPE_IO_SWITCH (1);
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 3, 4, 2, (
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH

View file

@ -20,7 +20,8 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
output cache_perf_t lmem_perf,
output lmem_perf_t lmem_perf,
output coalescer_perf_t coalescer_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS],
@ -60,46 +61,58 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
);
end
VX_mem_bus_if #(
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS]();
.TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_arb_if[1]();
VX_lsu_mem_arb #(
.NUM_INPUTS (`NUM_LSU_BLOCKS),
.NUM_OUTPUTS(1),
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(2)
) lmem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (lsu_lmem_if),
.bus_out_if (lmem_arb_if)
);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_adapters
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
.TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_adapt_if[`NUM_LSU_LANES]();
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
.TAG_WIDTH (LMEM_TAG_WIDTH),
.TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (2)
.RSP_OUT_BUF (0)
) lmem_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (lsu_lmem_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
.lsu_mem_if (lmem_arb_if[0]),
.mem_bus_if (lmem_adapt_if)
);
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_lmem_bus_if
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
end
end
VX_local_mem #(
.INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (LSU_NUM_REQS),
.NUM_REQS (`NUM_LSU_LANES),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_WIDTH (LMEM_TAG_WIDTH),
.OUT_BUF (3)
) local_mem (
.clk (clk),
@ -107,7 +120,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.lmem_perf (lmem_perf),
`endif
.mem_bus_if (lmem_bus_if)
.mem_bus_if (lmem_adapt_if)
);
`else
@ -115,6 +128,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
assign lmem_perf = '0;
`endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if
`ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
@ -127,6 +141,21 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses;
wire [`PERF_CTR_BITS-1:0] coalescer_misses;
VX_reduce_tree #(
.DATAW_IN (`PERF_CTR_BITS),
.DATAW_OUT (`PERF_CTR_BITS),
.N (`NUM_LSU_BLOCKS),
.OP ("+")
) coalescer_reduce (
.data_in (per_block_coalescer_misses),
.data_out (coalescer_misses)
);
`BUFFER(coalescer_perf.misses, coalescer_misses);
`endif
if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers
@ -139,11 +168,16 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
.QUEUE_SIZE (`LSUQ_OUT_SIZE),
.PERF_CTR_BITS (`PERF_CTR_BITS)
) mem_coalescer (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.misses (per_block_coalescer_misses[i]),
`endif
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
@ -186,6 +220,9 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if
`ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
`ifdef PERF_ENABLE
assign per_block_coalescer_misses[i] = '0;
`endif
end
end

View file

@ -106,7 +106,6 @@ module VX_operands import VX_gpu_pkg::*; #(
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (0) // no output buffering
) req_xbar (
.clk (clk),

View file

@ -44,7 +44,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (`NUM_EX_UNITS),
.N (PER_ISSUE_WARPS),
.OP ("|")
@ -53,7 +53,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
.data_out (perf_units_per_cycle)
);
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (PER_ISSUE_WARPS),
.OP ("|")

View file

@ -21,8 +21,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
input base_dcrs_t base_dcrs,
@ -121,8 +121,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.execute_if (pe_execute_if[PE_IDX_CSRS]),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
`endif
`ifdef EXT_F_ENABLE

View file

@ -1,46 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
interface VX_pipeline_perf_if import VX_gpu_pkg::*; ();
sched_perf_t sched;
issue_perf_t issue;
wire [`PERF_CTR_BITS-1:0] ifetches;
wire [`PERF_CTR_BITS-1:0] loads;
wire [`PERF_CTR_BITS-1:0] stores;
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency;
modport master (
output sched,
output issue,
output ifetches,
output loads,
output stores,
output ifetch_latency,
output load_latency
);
modport slave (
input sched,
input issue,
input ifetches,
input loads,
input stores,
input ifetch_latency,
input load_latency
);
endinterface

View file

@ -1,27 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
interface VX_sfu_perf_if ();
wire [`PERF_CTR_BITS-1:0] wctl_stalls;
modport master (
output wctl_stalls
);
modport slave (
input wctl_stalls
);
endinterface

View file

@ -24,6 +24,7 @@ module VX_mem_coalescer #(
parameter TAG_WIDTH = 8,
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
parameter QUEUE_SIZE = 8,
parameter PERF_CTR_BITS = `CLOG2(NUM_REQS+1),
parameter DATA_IN_WIDTH = DATA_IN_SIZE * 8,
parameter DATA_OUT_WIDTH= DATA_OUT_SIZE * 8,
@ -37,6 +38,8 @@ module VX_mem_coalescer #(
input wire clk,
input wire reset,
output wire [PERF_CTR_BITS-1:0] misses,
// Input request
input wire in_req_valid,
input wire in_req_rw,
@ -323,6 +326,23 @@ module VX_mem_coalescer #(
assign in_rsp_tag = {out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout_tag};
assign out_rsp_ready = in_rsp_ready;
// compute coalescing misses
// misses are partial transfers (not fully coalesced)
reg [PERF_CTR_BITS-1:0] misses_r;
wire partial_transfer = (out_req_fire && req_rem_mask_r != '1);
always @(posedge clk) begin
if (reset) begin
misses_r <= '0;
end else begin
misses_r <= misses_r + PERF_CTR_BITS'(partial_transfer);
end
end
assign misses = misses_r;
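A "miss" here is an output transfer that fires while the request's lane mask is not yet fully covered, i.e. the access could not be folded into a single coalesced transfer. The cycle-level simulator later in this commit counts the analogous condition (cur_mask.count() != in_req.mask.count()); a small C++ sketch of that check, with illustrative names:

```cpp
#include <bitset>
#include <cstdint>

template <size_t NUM_REQS>
struct CoalescerMissCounter {
  uint64_t misses = 0;

  // `covered` = lanes served by this output transfer,
  // `requested` = lanes of the original input request.
  void on_out_req_fire(const std::bitset<NUM_REQS>& covered,
                       const std::bitset<NUM_REQS>& requested) {
    if (covered != requested) // partial transfer => not fully coalesced
      ++misses;
  }
};
```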
`ifdef DBG_TRACE_MEM
wire [`UP(UUID_WIDTH)-1:0] out_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] out_rsp_uuid;

View file

@ -237,6 +237,8 @@ module VX_mem_scheduler #(
.clk (clk),
.reset (reset),
`UNUSED_PIN (misses),
// Input request
.in_req_valid (reqq_valid),
.in_req_mask (reqq_mask),

View file

@ -1,78 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`TRACING_OFF
module VX_reduce #(
parameter DATAW_IN = 1,
parameter DATAW_OUT = DATAW_IN,
parameter N = 1,
parameter `STRING OP = "+"
) (
input wire [N-1:0][DATAW_IN-1:0] data_in,
output wire [DATAW_OUT-1:0] data_out
);
if (N == 1) begin : g_passthru
assign data_out = DATAW_OUT'(data_in[0]);
end else begin : g_reduce
localparam int N_A = N / 2;
localparam int N_B = N - N_A;
wire [N_A-1:0][DATAW_IN-1:0] in_A;
wire [N_B-1:0][DATAW_IN-1:0] in_B;
wire [DATAW_OUT-1:0] out_A, out_B;
for (genvar i = 0; i < N_A; i++) begin : g_in_A
assign in_A[i] = data_in[i];
end
for (genvar i = 0; i < N_B; i++) begin : g_in_B
assign in_B[i] = data_in[N_A + i];
end
VX_reduce #(
.DATAW_IN (DATAW_IN),
.DATAW_OUT (DATAW_OUT),
.N (N_A),
.OP (OP)
) reduce_A (
.data_in (in_A),
.data_out (out_A)
);
VX_reduce #(
.DATAW_IN (DATAW_IN),
.DATAW_OUT (DATAW_OUT),
.N (N_B),
.OP (OP)
) reduce_B (
.data_in (in_B),
.data_out (out_B)
);
if (OP == "+") begin : g_plus
assign data_out = out_A + out_B;
end else if (OP == "^") begin : g_xor
assign data_out = out_A ^ out_B;
end else if (OP == "&") begin : g_and
assign data_out = out_A & out_B;
end else if (OP == "|") begin : g_or
assign data_out = out_A | out_B;
end else begin : g_error
`ERROR(("invalid parameter"));
end
end
endmodule
`TRACING_ON
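The deleted VX_reduce splits its inputs into two halves, reduces each half recursively, and combines the results with OP; its call sites now use VX_reduce_tree, which presumably keeps the same divide-and-conquer shape under the new name. A software analogue of that recursion, for reference:

```cpp
#include <cstddef>
#include <functional>
#include <vector>

template <typename T, typename Op = std::plus<T>>
T reduce_tree(const std::vector<T>& in, size_t lo, size_t hi, Op op = Op{}) {
  const size_t n = hi - lo;
  if (n == 1) return in[lo];                 // g_passthru case
  const size_t n_a = n / 2;                  // N_A = N / 2, N_B = N - N_A
  T out_a = reduce_tree(in, lo, lo + n_a, op);
  T out_b = reduce_tree(in, lo + n_a, hi, op);
  return op(out_a, out_b);                   // "+", "|", "&" or "^" in the RTL
}

// e.g. reduce_tree(per_block_misses, 0, per_block_misses.size()) sums
// hypothetical per-block counters, like the coalescer reduction in VX_mem_unit.
```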

View file

@ -206,13 +206,13 @@ module VX_stream_xbar #(
reg [PERF_CTR_BITS-1:0] collisions_r;
always @(*) begin
per_cycle_collision = 0;
per_cycle_collision = '0;
for (integer i = 0; i < NUM_INPUTS; ++i) begin
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
for (integer j = i + 1; j < NUM_INPUTS; ++j) begin
per_cycle_collision[i] |= valid_in[i]
&& valid_in[j+i]
&& (sel_in[i] == sel_in[j+i])
&& (ready_in[i] | ready_in[j+i]);
&& valid_in[j]
&& (sel_in[i] == sel_in[j])
&& (ready_in[i] | ready_in[j]);
end
end
end
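The rewritten scan above walks every later input j for each input i (the old `j + i` indexing visited the same pairs but was harder to read), flagging input i whenever another input targets the same output in the same cycle. A C++ sketch of the per-cycle collision count, with illustrative names:

```cpp
#include <cstddef>
#include <vector>

size_t count_collisions(const std::vector<bool>& valid,
                        const std::vector<bool>& ready,
                        const std::vector<size_t>& sel) {
  const size_t n = valid.size();
  size_t collisions = 0;
  for (size_t i = 0; i < n; ++i) {
    bool hit = false;
    for (size_t j = i + 1; j < n; ++j) {      // j ranges over later inputs only
      hit |= valid[i] && valid[j] && (sel[i] == sel[j]) && (ready[i] || ready[j]);
    }
    collisions += hit;                        // one per colliding input, like the POP_COUNT
  }
  return collisions;
}
```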

View file

@ -43,7 +43,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
// PERF
`ifdef PERF_ENABLE
output cache_perf_t lmem_perf,
output lmem_perf_t lmem_perf,
`endif
VX_mem_bus_if.slave mem_bus_if [NUM_REQS]
@ -288,11 +288,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
assign lmem_perf.reads = perf_reads;
assign lmem_perf.writes = perf_writes;
assign lmem_perf.read_misses = '0;
assign lmem_perf.write_misses = '0;
assign lmem_perf.bank_stalls = perf_collisions;
assign lmem_perf.mshr_stalls = '0;
assign lmem_perf.mem_stalls = '0;
assign lmem_perf.crsp_stalls = perf_crsp_stalls;
`endif
@ -321,15 +317,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end
end
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, data=0x%h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end
end
@ -339,15 +335,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
if (per_bank_req_rw[i]) begin
`TRACE(2, ("%t: %s-bank%0d wr-req: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s bank-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
end else begin
`TRACE(2, ("%t: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s bank-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
end
end
if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin
`TRACE(2, ("%t: %s-bank%0d rd-rsp: data=0x%h, tag=0x%0h (#%0d)\n",
`TRACE(2, ("%t: %s bank-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_rsp_data[i], per_bank_rsp_tag_value[i], per_bank_rsp_uuid[i]))
end
end

View file

@ -1,43 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
interface VX_mem_perf_if import VX_gpu_pkg::*; ();
cache_perf_t icache;
cache_perf_t dcache;
cache_perf_t l2cache;
cache_perf_t l3cache;
cache_perf_t lmem;
mem_perf_t mem;
modport master (
output icache,
output dcache,
output l2cache,
output l3cache,
output lmem,
output mem
);
modport slave (
input icache,
input dcache,
input l2cache,
input l3cache,
input lmem,
input mem
);
endinterface

View file

@ -437,6 +437,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
}
uint64_t dcache_requests_per_core = 0;
if (dcache_enable) {
// PERF: Dcache
uint64_t dcache_reads;
@ -447,6 +449,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
return err;
});
dcache_requests_per_core += dcache_reads + dcache_writes;
uint64_t dcache_read_misses;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
return err;
@ -475,6 +478,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
}
// PERF: coalescer
uint64_t coalescer_misses;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_COALESCE_MISS, core_id, &coalescer_misses), {
return err;
});
int coalescer_utilization = calcAvgPercent(dcache_requests_per_core - coalescer_misses, dcache_requests_per_core);
fprintf(stream, "PERF: core%d: coalescer misses=%ld (hit ratio=%d%%)\n", core_id, coalescer_misses, coalescer_utilization);
if (l2cache_enable) {
// PERF: L2cache
uint64_t tmp;

View file

@ -104,6 +104,27 @@ inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
return (bits << shift) >> (shift + start);
}
inline uint64_t bit_reverse(uint64_t bits) {
bits = ((bits & 0xAAAAAAAAAAAAAAAA) >> 1) | ((bits & 0x5555555555555555) << 1);
bits = ((bits & 0xCCCCCCCCCCCCCCCC) >> 2) | ((bits & 0x3333333333333333) << 2);
bits = ((bits & 0xF0F0F0F0F0F0F0F0) >> 4) | ((bits & 0x0F0F0F0F0F0F0F0F) << 4);
bits = ((bits & 0xFF00FF00FF00FF00) >> 8) | ((bits & 0x00FF00FF00FF00FF) << 8);
bits = ((bits & 0xFFFF0000FFFF0000) >> 16) | ((bits & 0x0000FFFF0000FFFF) << 16);
bits = (bits >> 32) | (bits << 32);
return bits;
}
inline uint64_t bit_reverse(uint64_t bits, uint32_t width) {
assert(width <= 64);
uint64_t reversed(0);
for (uint32_t i = 0; i < width; ++i) {
if (bits & (1ULL << i)) {
reversed |= (1ULL << (width - 1 - i));
}
}
return reversed;
}
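The two overloads are related: reversing just the low `width` bits gives the same result as a full 64-bit reverse shifted back down by 64 − width. A quick usage sketch:

```cpp
#include <cassert>
#include <cstdint>

void bit_reverse_example() {
  uint64_t v = 0b1011;                                  // 4 significant bits
  assert(bit_reverse(v, 4) == 0b1101);                  // width-limited reverse
  assert(bit_reverse(v, 4) == (bit_reverse(v) >> (64 - 4)));
}
```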
template <typename T = uint32_t>
T sext(const T& word, uint32_t width) {
assert(width > 1);

View file

@ -21,32 +21,32 @@ template <typename T = uint32_t>
class BitVector {
private:
static constexpr size_t BITS_PER_WORD = sizeof(T) * 8;
std::vector<T> bits_;
std::vector<T> words_;
size_t size_;
bool all_zero_;
size_t wordIndex(size_t pos) const {
constexpr size_t wordIndex(size_t pos) const {
return pos / BITS_PER_WORD;
}
T bitMask(size_t pos) const {
constexpr T bitMask(size_t pos) const {
return T(1) << (pos % BITS_PER_WORD);
}
void updateAllZero() {
all_zero_ = std::all_of(bits_.begin(), bits_.end(), [](T word) { return word == 0; });
all_zero_ = std::all_of(words_.begin(), words_.end(), [](T word) { return word == 0; });
}
public:
explicit BitVector(size_t size = 0)
: bits_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD)
: words_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD)
, size_(size)
, all_zero_(true)
{}
void set(size_t pos) {
if (pos >= size_) throw std::out_of_range("Index out of range");
bits_[this->wordIndex(pos)] |= this->bitMask(pos);
words_[this->wordIndex(pos)] |= this->bitMask(pos);
all_zero_ = false;
}
@ -59,19 +59,19 @@ public:
}
void reset() {
std::fill(bits_.begin(), bits_.end(), 0);
std::fill(words_.begin(), words_.end(), 0);
all_zero_ = true;
}
void reset(size_t pos) {
if (pos >= size_) throw std::out_of_range("Index out of range");
bits_[this->wordIndex(pos)] &= ~this->bitMask(pos);
words_[this->wordIndex(pos)] &= ~this->bitMask(pos);
this->updateAllZero();
}
bool test(size_t pos) const {
if (pos >= size_) throw std::out_of_range("Index out of range");
return bits_[this->wordIndex(pos)] & this->bitMask(pos);
return words_[this->wordIndex(pos)] & this->bitMask(pos);
}
size_t size() const {
@ -80,12 +80,12 @@ public:
void resize(size_t new_size) {
size_ = new_size;
bits_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0);
words_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0);
this->updateAllZero();
}
bool operator==(const BitVector& other) const {
return (size_ == other.size_) && (bits_ == other.bits_);
return (size_ == other.size_) && (words_ == other.words_);
}
bool operator!=(const BitVector& other) const {
@ -98,8 +98,8 @@ public:
BitVector& operator&=(const BitVector& other) {
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
for (size_t i = 0; i < bits_.size(); ++i) {
bits_[i] &= other.bits_[i];
for (size_t i = 0; i < words_.size(); ++i) {
words_[i] &= other.words_[i];
}
this->updateAllZero();
return *this;
@ -107,8 +107,8 @@ public:
BitVector& operator|=(const BitVector& other) {
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
for (size_t i = 0; i < bits_.size(); ++i) {
bits_[i] |= other.bits_[i];
for (size_t i = 0; i < words_.size(); ++i) {
words_[i] |= other.words_[i];
}
this->updateAllZero();
return *this;
@ -116,8 +116,8 @@ public:
BitVector& operator^=(const BitVector& other) {
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
for (size_t i = 0; i < bits_.size(); ++i) {
bits_[i] ^= other.bits_[i];
for (size_t i = 0; i < words_.size(); ++i) {
words_[i] ^= other.words_[i];
}
this->updateAllZero();
return *this;
@ -125,23 +125,48 @@ public:
BitVector operator~() const {
BitVector result(size_);
for (size_t i = 0; i < bits_.size(); ++i) {
result.bits_[i] = ~bits_[i];
for (size_t i = 0; i < words_.size(); ++i) {
result.words_[i] = ~words_[i];
}
result.updateAllZero();
return result;
}
void flip() {
for (auto &word : bits_) {
for (auto &word : words_) {
word = ~word;
}
this->updateAllZero();
}
void reverse() {
if (size_ == 0)
return;
size_t remaining_bits = size_ % BITS_PER_WORD;
if (remaining_bits != 0) {
std::vector<T> reversed_words(words_.size(), 0);
for (size_t i = 0; i < size_; ++i) {
size_t reversed_pos = size_ - 1 - i;
size_t src_word = i / BITS_PER_WORD;
size_t src_offset = i % BITS_PER_WORD;
size_t dst_word = reversed_pos / BITS_PER_WORD;
size_t dst_offset = reversed_pos % BITS_PER_WORD;
if (words_[src_word] & (T(1) << src_offset)) {
reversed_words[dst_word] |= (T(1) << dst_offset);
}
}
words_ = std::move(reversed_words);
} else {
std::reverse(words_.begin(), words_.end());
for (auto &word : words_) {
word = static_cast<T>(bit_reverse(static_cast<uint64_t>(word), BITS_PER_WORD)); // reverse within one word, not the full 64 bits
}
}
}
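When the size is a whole number of words, reverse() swaps the word order and then bit-reverses each word; otherwise it falls back to moving bits one at a time. A small usage sketch with illustrative values:

```cpp
#include <cassert>
#include <cstdint>

void bitvector_reverse_example() {
  BitVector<uint64_t> bv(128);      // word-aligned size: takes the fast path
  bv.set(0);
  bv.set(5);
  bv.reverse();                     // bit i moves to position 127 - i
  assert(bv.test(127) && bv.test(122));
  assert(bv.count() == 2);
}
```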
size_t count() const {
size_t count = 0;
for (const auto &word : bits_) {
for (const auto &word : words_) {
count += std::bitset<BITS_PER_WORD>(word).count();
}
return count;
@ -160,12 +185,12 @@ public:
size_t remaining_bits = size_ % BITS_PER_WORD;
T full_mask = ~T(0);
for (size_t i = 0; i < full_bits; ++i) {
if (bits_[i] != full_mask)
if (words_[i] != full_mask)
return false;
}
if (remaining_bits > 0) {
T partial_mask = (T(1) << remaining_bits) - 1;
if ((bits_[full_bits] & partial_mask) != partial_mask)
if ((words_[full_bits] & partial_mask) != partial_mask)
return false;
}
return true;
@ -181,17 +206,17 @@ public:
size_t bit_shift = pos % BITS_PER_WORD;
if (word_shift > 0) {
for (size_t i = bits_.size() - 1; i >= word_shift; --i) {
bits_[i] = bits_[i - word_shift];
for (size_t i = words_.size() - 1; i >= word_shift; --i) {
words_[i] = words_[i - word_shift];
}
std::fill(bits_.begin(), bits_.begin() + word_shift, 0);
std::fill(words_.begin(), words_.begin() + word_shift, 0);
}
if (bit_shift > 0) {
for (size_t i = bits_.size() - 1; i > 0; --i) {
bits_[i] = (bits_[i] << bit_shift) | (bits_[i - 1] >> (BITS_PER_WORD - bit_shift));
for (size_t i = words_.size() - 1; i > 0; --i) {
words_[i] = (words_[i] << bit_shift) | (words_[i - 1] >> (BITS_PER_WORD - bit_shift));
}
bits_[0] <<= bit_shift;
words_[0] <<= bit_shift;
}
this->updateAllZero();
@ -208,17 +233,17 @@ public:
size_t bit_shift = pos % BITS_PER_WORD;
if (word_shift > 0) {
for (size_t i = 0; i < bits_.size() - word_shift; ++i) {
bits_[i] = bits_[i + word_shift];
for (size_t i = 0; i < words_.size() - word_shift; ++i) {
words_[i] = words_[i + word_shift];
}
std::fill(bits_.end() - word_shift, bits_.end(), 0);
std::fill(words_.end() - word_shift, words_.end(), 0);
}
if (bit_shift > 0) {
for (size_t i = 0; i < bits_.size() - 1; ++i) {
bits_[i] = (bits_[i] >> bit_shift) | (bits_[i + 1] << (BITS_PER_WORD - bit_shift));
for (size_t i = 0; i < words_.size() - 1; ++i) {
words_[i] = (words_[i] >> bit_shift) | (words_[i + 1] << (BITS_PER_WORD - bit_shift));
}
bits_.back() >>= bit_shift;
words_.back() >>= bit_shift;
}
this->updateAllZero();

View file

@ -53,25 +53,25 @@ public:
SimPort(SimObjectBase* module)
: SimPortBase(module)
, peer_(nullptr)
, sink_(nullptr)
, tx_cb_(nullptr)
{}
void bind(SimPort<Pkt>* peer) {
assert(peer_ == nullptr);
peer_ = peer;
void bind(SimPort<Pkt>* sink) {
assert(sink_ == nullptr);
sink_ = sink;
}
void unbind() {
peer_ = nullptr;
sink_ = nullptr;
}
bool connected() const {
return (peer_ != nullptr);
return (sink_ != nullptr);
}
SimPort* peer() const {
return peer_;
SimPort* sink() const {
return sink_;
}
bool empty() const {
@ -111,15 +111,15 @@ protected:
};
std::queue<timed_pkt_t> queue_;
SimPort* peer_;
SimPort* sink_;
TxCallback tx_cb_;
void transfer(const Pkt& data, uint64_t cycles) {
if (tx_cb_) {
tx_cb_(data, cycles);
}
if (peer_) {
peer_->transfer(data, cycles);
if (sink_) {
sink_->transfer(data, cycles);
} else {
queue_.push({data, cycles});
}
@ -402,8 +402,8 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
template <typename Pkt>
void SimPort<Pkt>::push(const Pkt& pkt, uint64_t delay) const {
if (peer_ && !tx_cb_) {
reinterpret_cast<const SimPort<Pkt>*>(peer_)->push(pkt, delay);
if (sink_ && !tx_cb_) {
reinterpret_cast<const SimPort<Pkt>*>(sink_)->push(pkt, delay);
} else {
SimPlatform::instance().schedule(this, pkt, delay);
}

View file

@ -46,8 +46,6 @@ Core::Core(const SimContext& ctx,
, func_units_((uint32_t)FUType::Count)
, lmem_switch_(NUM_LSU_BLOCKS)
, mem_coalescers_(NUM_LSU_BLOCKS)
, lsu_dcache_adapter_(NUM_LSU_BLOCKS)
, lsu_lmem_adapter_(NUM_LSU_BLOCKS)
, pending_icache_(arch_.num_warps())
, commit_arbs_(ISSUE_WIDTH)
{
@ -64,11 +62,11 @@ Core::Core(const SimContext& ctx,
}
// create local memory
snprintf(sname, 100, "%s-local_mem", this->name().c_str());
snprintf(sname, 100, "%s-lmem", this->name().c_str());
local_mem_ = LocalMem::Create(sname, LocalMem::Config{
(1 << LMEM_LOG_SIZE),
LSU_WORD_SIZE,
LSU_NUM_REQS,
LSU_CHANNELS,
log2ceil(LMEM_NUM_BANKS),
false
});
@ -79,48 +77,52 @@ Core::Core(const SimContext& ctx,
lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1);
}
// create lsu dcache adapter
// create dcache adapter
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter(NUM_LSU_BLOCKS);
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "%s-lsu_dcache_adapter%d", this->name().c_str(), i);
lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
lsu_dcache_adapter.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
}
// create lsu lmem adapter
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "%s-lsu_lmem_adapter%d", this->name().c_str(), i);
lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
}
// create lmem arbiter
snprintf(sname, 100, "%s-lmem_arb", this->name().c_str());
auto lmem_arb = LsuArbiter::Create(sname, ArbiterType::RoundRobin, NUM_LSU_BLOCKS, 1);
// connect lsu demux
// create lmem adapter
snprintf(sname, 100, "%s-lsu_lmem_adapter", this->name().c_str());
auto lsu_lmem_adapter = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
// connect lmem switch
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
lmem_switch_.at(b)->ReqLmem.bind(&lmem_arb->ReqIn.at(b));
mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC);
lmem_switch_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
lsu_lmem_adapter_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspLmem);
lmem_arb->RspIn.at(b).bind(&lmem_switch_.at(b)->RspLmem);
}
// connect coalescer-adapter
// connect lmem arbiter
lmem_arb->ReqOut.at(0).bind(&lsu_lmem_adapter->ReqIn);
lsu_lmem_adapter->RspIn.bind(&lmem_arb->RspOut.at(0));
// connect lmem adapter
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
lsu_lmem_adapter->ReqOut.at(c).bind(&local_mem_->Inputs.at(c));
local_mem_->Outputs.at(c).bind(&lsu_lmem_adapter->RspOut.at(c));
}
// connect dcache coalescer
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter_.at(b)->ReqIn);
lsu_dcache_adapter_.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter.at(b)->ReqIn);
lsu_dcache_adapter.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
}
// connect adapter-dcache
// connect dcache adapter
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
uint32_t i = b * DCACHE_CHANNELS + c;
lsu_dcache_adapter_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter_.at(b)->RspOut.at(c));
}
}
// connect adapter-lmem
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
uint32_t i = b * LSU_CHANNELS + c;
lsu_lmem_adapter_.at(b)->ReqOut.at(c).bind(&local_mem_->Inputs.at(i));
local_mem_->Outputs.at(i).bind(&lsu_lmem_adapter_.at(b)->RspOut.at(c));
lsu_dcache_adapter.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter.at(b)->RspOut.at(c));
}
}

View file

@ -127,6 +127,10 @@ public:
return local_mem_;
}
const MemCoalescer::Ptr& mem_coalescer(uint32_t idx) const {
return mem_coalescers_.at(idx);
}
const PerfStats& perf_stats() const {
return perf_stats_;
}
@ -156,8 +160,6 @@ private:
LocalMem::Ptr local_mem_;
std::vector<LocalMemSwitch::Ptr> lmem_switch_;
std::vector<MemCoalescer::Ptr> mem_coalescers_;
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter_;
std::vector<LsuMemAdapter::Ptr> lsu_lmem_adapter_;
PipelineLatch fetch_latch_;
PipelineLatch decode_latch_;

View file

@ -360,7 +360,6 @@ void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) {
} else {
mmu_.read(data, addr, size, 0);
}
DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl);
}
#endif
@ -565,6 +564,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
auto cluster_perf = core_->socket()->cluster()->perf_stats();
auto socket_perf = core_->socket()->perf_stats();
auto lmem_perf = core_->local_mem()->perf_stats();
uint64_t coalescer_misses = 0;
for (uint i = 0; i < NUM_LSU_BLOCKS; ++i) {
coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses;
}
switch (addr) {
CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads);
CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses);

View file

@ -24,14 +24,12 @@ protected:
LocalMem* simobject_;
Config config_;
RAM ram_;
uint32_t line_bits_;
MemCrossBar::Ptr mem_xbar_;
mutable PerfStats perf_stats_;
uint64_t to_local_addr(uint64_t addr) {
uint32_t total_lines = config_.capacity / config_.line_size;
uint32_t line_bits = log2ceil(total_lines);
uint32_t offset = bit_getw(addr, 0, line_bits-1);
return offset;
return bit_getw(addr, 0, line_bits_-1);
}
public:
@@ -40,9 +38,13 @@ public:
, config_(config)
, ram_(config.capacity)
{
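// cache the line-index bit count so to_local_addr() does not recompute it on every access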
uint32_t total_lines = config.capacity / config.line_size;
line_bits_ = log2ceil(total_lines);
char sname[100];
snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_reqs, (1 << config.B));
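// bank selection starts just above the intra-line offset bits (wsel_bits), which is
// forwarded to the crossbar as its address start position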
uint32_t wsel_bits = log2ceil(config_.line_size);
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, (1 << config.B), wsel_bits);
for (uint32_t i = 0; i < config.num_reqs; ++i) {
simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i));
mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i));
@@ -56,15 +58,15 @@ public:
}
void read(void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl);
ram_.read(data, s_addr, size);
auto l_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
ram_.read(data, l_addr, size);
}
void write(const void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl);
ram_.write(data, s_addr, size);
auto l_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
ram_.write(data, l_addr, size);
}
void tick() {
@@ -94,7 +96,7 @@ public:
}
const PerfStats& perf_stats() const {
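// bank stalls are derived from request-side collisions observed by the crossbar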
perf_stats_.bank_stalls = mem_xbar_->collisions();
perf_stats_.bank_stalls = mem_xbar_->req_collisions();
return perf_stats_;
}
};

View file

@@ -147,6 +147,9 @@ void MemCoalescer::tick() {
ReqOut.push(out_req, delay_);
DT(4, this->name() << "-mem-req: coalesced=" << cur_mask.count() << ", " << out_req);
// count a coalescer miss when the coalesced request covers only part of the input mask
perf_stats_.misses += (cur_mask.count() != in_req.mask.count());
// update sent mask
sent_mask_ |= cur_mask;
if (sent_mask_ == in_req.mask) {
@@ -154,3 +157,7 @@ void MemCoalescer::tick() {
sent_mask_.reset();
}
}
const MemCoalescer::PerfStats& MemCoalescer::perf_stats() const {
return perf_stats_;
}

View file

@@ -23,6 +23,19 @@ public:
SimPort<LsuReq> ReqOut;
SimPort<LsuRsp> RspOut;
struct PerfStats {
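// misses: coalesced requests that covered only part of the input lane mask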
uint64_t misses;
PerfStats()
: misses(0)
{}
PerfStats& operator+=(const PerfStats& rhs) {
this->misses += rhs.misses;
return *this;
}
};
MemCoalescer(
const SimContext& ctx,
const char* name,
@@ -37,6 +50,8 @@ public:
void tick();
const PerfStats& perf_stats() const;
private:
struct pending_req_t {
@@ -52,6 +67,7 @@ private:
BitVector<> sent_mask_;
uint32_t line_size_;
uint32_t delay_;
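// accumulated coalescer performance counters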
PerfStats perf_stats_;
};
}

View file

@@ -527,6 +527,7 @@ public:
auto& req_in = Inputs.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
DT(4, this->name() << "-req" << o << ": " << req);
Outputs.at(o).push(req, delay_);
req_in.pop();
this->update_grant(o, g);
@@ -597,37 +598,36 @@ public:
// process incoming requests
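// for each output, scan the inputs round-robin starting at the last granted index;
// the first input addressing this output wins, and any other ready input targeting it
// in the same cycle is recorded as a collision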
for (uint32_t o = 0; o < O; ++o) {
int32_t input_idx = -1;
bool has_collision = false;
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (grants_.at(o) + r) & (R-1);
if (i >= I)
continue;
auto& req_in = Inputs.at(i);
if (!req_in.empty()) {
if (req_in.empty())
continue;
auto& req = req_in.front();
// skip if input is not going to current output
uint32_t output_idx = 0;
if (O != 1) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
}
if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
// skip if input is not going to current output
if (output_idx != o)
continue;
}
if (input_idx != -1) {
++collisions_;
has_collision = true;
continue;
}
input_idx = i;
}
}
if (input_idx != -1) {
auto& req_in = Inputs.at(input_idx);
auto& req = req_in.front();
if (lg2_inputs_ != 0) {
req.tag = (req.tag << lg2_inputs_) | input_idx;
}
DT(4, this->name() << "-req" << input_idx << ": " << req);
DT(4, this->name() << "-req" << o << ": " << req);
Outputs.at(o).push(req, delay_);
req_in.pop();
this->update_grant(o, input_idx);
collisions_ += has_collision;
}
}
}
@@ -721,8 +721,8 @@ public:
g = rsp.tag & (R-1);
rsp.tag >>= lg2_num_reqs_;
}
DT(4, this->name() << "-rsp" << o << ": " << rsp);
uint32_t j = o * R + g;
DT(4, this->name() << "-rsp" << j << ": " << rsp);
RspIn.at(j).push(rsp, 1);
rsp_out.pop();
}
@@ -742,7 +742,7 @@ public:
if (lg2_num_reqs_ != 0) {
req.tag = (req.tag << lg2_num_reqs_) | g;
}
DT(4, this->name() << "-req" << j << ": " << req);
DT(4, this->name() << "-req" << o << ": " << req);
ReqOut.at(o).push(req, delay_);
req_in.pop();
this->update_grant(o, g);
@@ -798,7 +798,8 @@ public:
, lg2_inputs_(log2ceil(num_inputs))
, lg2_outputs_(log2ceil(num_outputs))
, addr_start_(addr_start)
, collisions_(0) {
, req_collisions_(0)
, rsp_collisions_(0) {
assert(delay != 0);
assert(num_inputs <= 64);
assert(num_outputs <= 64);
@@ -824,27 +825,28 @@ public:
// process outgoing responses
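// for each input, scan the output response queues round-robin; a response is routed
// back using the input index carried in its low tag bits, and competing responses
// are recorded as a collision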
for (uint32_t i = 0; i < I; ++i) {
int32_t output_idx = -1;
bool has_collision = false;
for (uint32_t t = 0; t < T; ++t) {
uint32_t o = (rsp_grants_.at(i) + t) & (T-1);
if (o >= O)
continue;
auto& rsp_out = RspOut.at(o);
if (!rsp_out.empty()) {
if (rsp_out.empty())
continue;
auto& rsp = rsp_out.front();
// skip if response is not going to current input
uint32_t input_idx = 0;
if (lg2_inputs_ != 0) {
input_idx = rsp.tag & (R-1);
}
// skip if response is not going to current input
if (input_idx != i)
continue;
}
if (output_idx != -1) {
++collisions_;
has_collision = true;
continue;
}
output_idx = o;
}
}
if (output_idx != -1) {
auto& rsp_out = RspOut.at(output_idx);
auto& rsp = rsp_out.front();
@@ -853,53 +855,60 @@ public:
input_idx = rsp.tag & (R-1);
rsp.tag >>= lg2_inputs_;
}
DT(4, this->name() << "-rsp" << output_idx << ": " << rsp);
RspIn.at(input_idx).push(rsp, 1);
DT(4, this->name() << "-rsp" << i << ": " << rsp);
RspIn.at(i).push(rsp, 1);
rsp_out.pop();
this->update_rsp_grant(i, output_idx);
rsp_collisions_ += has_collision;
}
}
// process incoming requests
for (uint32_t o = 0; o < O; ++o) {
int32_t input_idx = -1;
bool has_collision = false;
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (req_grants_.at(o) + r) & (R-1);
if (i >= I)
continue;
auto& req_in = ReqIn.at(i);
if (!req_in.empty()) {
if (req_in.empty())
continue;
auto& req = req_in.front();
// skip if request is not going to current output
uint32_t output_idx = 0;
if (O != 1) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
}
if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
// skip if request is not going to current output
if (output_idx != o)
continue;
}
if (input_idx != -1) {
++collisions_;
has_collision = true;
continue;
}
input_idx = i;
}
}
if (input_idx != -1) {
auto& req_in = ReqIn.at(input_idx);
auto& req = req_in.front();
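// append the winning input index to the low tag bits so the response can later be
// routed back to its source input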
if (lg2_inputs_ != 0) {
req.tag = (req.tag << lg2_inputs_) | input_idx;
}
DT(4, this->name() << "-req" << input_idx << ": " << req);
DT(4, this->name() << "-req" << o << ": " << req);
ReqOut.at(o).push(req, delay_);
req_in.pop();
this->update_req_grant(o, input_idx);
req_collisions_ += has_collision;
}
}
}
uint64_t collisions() const {
return collisions_;
uint64_t req_collisions() const {
return req_collisions_;
}
uint64_t rsp_collisions() const {
return rsp_collisions_;
}
protected:
@@ -923,7 +932,8 @@ protected:
uint32_t lg2_inputs_;
uint32_t lg2_outputs_;
uint32_t addr_start_;
uint64_t collisions_;
uint64_t req_collisions_;
uint64_t rsp_collisions_;
};
///////////////////////////////////////////////////////////////////////////////
@@ -978,6 +988,7 @@ private:
uint32_t delay_;
};
using LsuArbiter = TxArbiter<LsuReq, LsuRsp>;
using MemArbiter = TxArbiter<MemReq, MemRsp>;
using MemCrossBar = TxCrossBar<MemReq, MemRsp>;