mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 22:07:41 -04:00
memory mem_coalescer miss perf counter
RTL perf counters refactoring
This commit is contained in:
parent
f478bdcf25
commit
704f525fd6
41 changed files with 581 additions and 521 deletions
|
@ -24,7 +24,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
`endif
|
||||
|
||||
// DCRs
|
||||
|
@ -43,12 +43,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
assign mem_perf_tmp_if.icache = 'x;
|
||||
assign mem_perf_tmp_if.dcache = 'x;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.lmem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
cache_perf_t l2_perf;
|
||||
sysmem_perf_t sysmem_perf_tmp;
|
||||
always @(*) begin
|
||||
sysmem_perf_tmp = sysmem_perf;
|
||||
sysmem_perf_tmp.l2cache = l2_perf;
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
|
@ -111,7 +111,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (l2_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.l2cache),
|
||||
.cache_perf (l2_perf),
|
||||
`endif
|
||||
.core_bus_if (per_socket_mem_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
|
@ -140,7 +140,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.reset (socket_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.sysmem_perf (sysmem_perf_tmp),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (socket_dcr_bus_if),
|
||||
|
|
|
@ -329,19 +329,19 @@
|
|||
VX_edge_trigger #( \
|
||||
.POS (0), \
|
||||
.INIT (0) \
|
||||
) __``dst``__ ( \
|
||||
) __neg_edge`__LINE__ ( \
|
||||
.clk (clk), \
|
||||
.reset (1'b0), \
|
||||
.data_in (src), \
|
||||
.data_out (dst) \
|
||||
)
|
||||
|
||||
`define BUFFER_EX(dst, src, ena, RSTW, latency) \
|
||||
`define BUFFER_EX(dst, src, ena, resetw, latency) \
|
||||
VX_pipe_register #( \
|
||||
.DATAW ($bits(dst)), \
|
||||
.RESETW (RSTW), \
|
||||
.RESETW (resetw), \
|
||||
.DEPTH (latency) \
|
||||
) __``dst``__ ( \
|
||||
) __buffer_ex`__LINE__ ( \
|
||||
.clk (clk), \
|
||||
.reset (reset), \
|
||||
.enable (ena), \
|
||||
|
@ -349,13 +349,13 @@
|
|||
.data_out (dst) \
|
||||
)
|
||||
|
||||
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 0, 1)
|
||||
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, $bits(dst), 1)
|
||||
|
||||
`define POP_COUNT_EX(out, in, model) \
|
||||
VX_popcount #( \
|
||||
.N ($bits(in)), \
|
||||
.MODEL (model) \
|
||||
) __``out``__ ( \
|
||||
) __pop_count_ex`__LINE__ ( \
|
||||
.data_in (in), \
|
||||
.data_out (out) \
|
||||
)
|
||||
|
@ -482,7 +482,7 @@
|
|||
for (genvar __i = 0; __i < count; ++__i) begin \
|
||||
assign __reduce_add_i_field[__i] = src[__i].``field; \
|
||||
end \
|
||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
|
||||
VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
|
||||
__reduce_add_i_field, \
|
||||
__reduce_add_o_field \
|
||||
); \
|
||||
|
|
|
@ -73,6 +73,17 @@ package VX_gpu_pkg;
|
|||
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
|
||||
} cache_perf_t;
|
||||
|
||||
// Local-memory performance counters.
// Produced by VX_mem_unit on its `lmem_perf` output and carried to the CSR
// block inside sysmem_perf_t (`lmem` field). Each field is PERF_CTR_BITS wide.
typedef struct packed {
|
||||
logic [`PERF_CTR_BITS-1:0] reads; // read via VX_CSR_MPM_LMEM_READS
|
||||
logic [`PERF_CTR_BITS-1:0] writes; // read via VX_CSR_MPM_LMEM_WRITES
|
||||
logic [`PERF_CTR_BITS-1:0] bank_stalls; // read via VX_CSR_MPM_LMEM_BANK_ST (bank conflicts)
|
||||
logic [`PERF_CTR_BITS-1:0] crsp_stalls; // NOTE(review): presumably core-response stalls; no CSR read of this field is visible here - confirm
|
||||
} lmem_perf_t;
|
||||
|
||||
// Memory-coalescer performance counters.
// Produced by VX_mem_unit on its `coalescer_perf` output and carried to the
// CSR block inside sysmem_perf_t (`coalescer` field).
typedef struct packed {
|
||||
logic [`PERF_CTR_BITS-1:0] misses; // coalescer misses, read via VX_CSR_MPM_COALESCE_MISS
|
||||
} coalescer_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`PERF_CTR_BITS-1:0] reads;
|
||||
logic [`PERF_CTR_BITS-1:0] writes;
|
||||
|
@ -92,6 +103,26 @@ package VX_gpu_pkg;
|
|||
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
|
||||
} issue_perf_t;
|
||||
|
||||
// Aggregate of all memory-system performance counters.
// The struct is passed down the hierarchy as a plain input
// (Vortex -> VX_cluster -> VX_socket -> VX_core -> CSR block); each level
// copies the incoming value and overwrites the fields it owns before
// forwarding it to the next level.
typedef struct packed {
|
||||
cache_perf_t icache; // filled in VX_socket
|
||||
cache_perf_t dcache; // filled in VX_socket
|
||||
cache_perf_t l2cache; // filled in VX_cluster
|
||||
cache_perf_t l3cache; // filled in Vortex (top level)
|
||||
lmem_perf_t lmem; // filled in VX_core from VX_mem_unit
|
||||
coalescer_perf_t coalescer; // filled in VX_core from VX_mem_unit
|
||||
mem_perf_t mem; // filled in Vortex (top level)
|
||||
} sysmem_perf_t;
|
||||
|
||||
// Per-core pipeline performance counters.
// Assembled in VX_core and passed as an input to the execute stage / CSR
// block, where the fields are read by the VX_DCR_MPM_CLASS_CORE CSR case.
typedef struct packed {
|
||||
sched_perf_t sched; // driven through the `sched_perf` port connection in VX_core
|
||||
issue_perf_t issue; // driven through the `issue_perf` port connection in VX_core
|
||||
logic [`PERF_CTR_BITS-1:0] ifetches; // assigned from perf_ifetches; read via VX_CSR_MPM_IFETCHES
|
||||
logic [`PERF_CTR_BITS-1:0] loads; // assigned from perf_loads; read via VX_CSR_MPM_LOADS
|
||||
logic [`PERF_CTR_BITS-1:0] stores; // assigned from perf_stores; read via VX_CSR_MPM_STORES
|
||||
logic [`PERF_CTR_BITS-1:0] ifetch_latency; // assigned from perf_icache_lat; read via VX_CSR_MPM_IFETCH_LT
|
||||
logic [`PERF_CTR_BITS-1:0] load_latency; // assigned from perf_dcache_lat; read via VX_CSR_MPM_LOAD_LT
|
||||
} pipeline_perf_t;
|
||||
|
||||
//////////////////////// instruction arguments ////////////////////////////
|
||||
|
||||
typedef struct packed {
|
||||
|
@ -145,6 +176,7 @@ package VX_gpu_pkg;
|
|||
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
|
||||
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
|
||||
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
|
||||
localparam LMEM_TAG_WIDTH = LSU_TAG_WIDTH + `CLOG2(`NUM_LSU_BLOCKS);
|
||||
|
||||
////////////////////////// Icache Parameters //////////////////////////////
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
`endif
|
||||
|
||||
// DCRs
|
||||
|
@ -63,11 +63,13 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.lmem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
cache_perf_t icache_perf, dcache_perf;
|
||||
sysmem_perf_t sysmem_perf_tmp;
|
||||
always @(*) begin
|
||||
sysmem_perf_tmp = sysmem_perf;
|
||||
sysmem_perf_tmp.icache = icache_perf;
|
||||
sysmem_perf_tmp.dcache = dcache_perf;
|
||||
end
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
@ -110,7 +112,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.MEM_OUT_BUF (2)
|
||||
) icache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.icache),
|
||||
.cache_perf (icache_perf),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (icache_reset),
|
||||
|
@ -160,7 +162,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.MEM_OUT_BUF (2)
|
||||
) dcache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.dcache),
|
||||
.cache_perf (dcache_perf),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (dcache_reset),
|
||||
|
@ -187,6 +189,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.NUM_OUTPUTS(1),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX(0),
|
||||
|
@ -234,7 +237,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.reset (core_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.sysmem_perf (sysmem_perf_tmp),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (core_dcr_bus_if),
|
||||
|
|
|
@ -177,6 +177,9 @@
|
|||
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
|
||||
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
|
||||
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
|
||||
// PERF: coalescer
|
||||
`define VX_CSR_MPM_COALESCE_MISS 12'hB1E // coalescer misses
|
||||
`define VX_CSR_MPM_COALESCE_MISS_H 12'hB9E
|
||||
|
||||
// Machine Performance-monitoring memory counters (class 3) ///////////////////
|
||||
// <Add your own counters: use addresses hB03..hB1F, hB83..hB9F>
|
||||
|
|
|
@ -50,11 +50,14 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = 'x;
|
||||
assign mem_perf_if.dcache = 'x;
|
||||
assign mem_perf_if.l2cache = 'x;
|
||||
assign mem_perf_if.lmem = 'x;
|
||||
cache_perf_t l3_perf;
|
||||
mem_perf_t mem_perf;
|
||||
sysmem_perf_t sysmem_perf;
|
||||
always @(*) begin
|
||||
sysmem_perf = '0;
|
||||
sysmem_perf.l3cache = l3_perf;
|
||||
sysmem_perf.mem = mem_perf;
|
||||
end
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
|
@ -98,7 +101,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.reset (l3_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_if.l3cache),
|
||||
.cache_perf (l3_perf),
|
||||
`endif
|
||||
|
||||
.core_bus_if (per_cluster_mem_bus_if),
|
||||
|
@ -146,7 +149,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.reset (cluster_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (cluster_dcr_bus_if),
|
||||
|
@ -182,7 +185,6 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -202,7 +204,6 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
|
||||
`endif
|
||||
|
||||
|
|
|
@ -620,6 +620,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.NUM_OUTPUTS (1),
|
||||
.DATA_SIZE (LMEM_DATA_SIZE),
|
||||
.ADDR_WIDTH (CCI_VX_ADDR_WIDTH),
|
||||
.TAG_WIDTH (CCI_VX_TAG_WIDTH),
|
||||
|
@ -1097,7 +1098,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
wire vx_mem_req_fire = vx_mem_req_valid[0] && vx_mem_req_ready[0];
|
||||
wire vx_mem_rsp_fire = vx_mem_rsp_valid[0] && vx_mem_rsp_ready[0];
|
||||
wire avs_req_fire = (avs_write[0] || avs_read[0]) && ~avs_waitrequest[0];
|
||||
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP (0, 0, {
|
||||
vx_reset,
|
||||
|
|
|
@ -328,9 +328,9 @@ module VX_afu_wrap #(
|
|||
`ifdef DBG_SCOPE_AFU
|
||||
wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0];
|
||||
wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0];
|
||||
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
|
||||
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
|
||||
wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0];
|
||||
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP (0, 0, {
|
||||
ap_reset,
|
||||
|
|
60
hw/rtl/cache/VX_cache_wrap.sv
vendored
60
hw/rtl/cache/VX_cache_wrap.sv
vendored
|
@ -210,7 +210,59 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign cache_perf = '0;
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_req;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_req;
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
|
||||
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
|
||||
assign perf_core_reads_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && ~core_bus_if[i].req_data.rw;
|
||||
assign perf_core_writes_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && core_bus_if[i].req_data.rw;
|
||||
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
|
||||
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
|
||||
end
|
||||
|
||||
// per cycle: core reads, core writes, core response stalls, memory request stalls
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_core_reads <= '0;
|
||||
perf_core_writes <= '0;
|
||||
perf_mem_stalls <= '0;
|
||||
perf_crsp_stalls <= '0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
assign cache_perf.reads = perf_core_reads;
|
||||
assign cache_perf.writes = perf_core_writes;
|
||||
assign cache_perf.read_misses = '0;
|
||||
assign cache_perf.write_misses = '0;
|
||||
assign cache_perf.bank_stalls = '0;
|
||||
assign cache_perf.mshr_stalls = '0;
|
||||
assign cache_perf.mem_stalls = perf_mem_stalls;
|
||||
assign cache_perf.crsp_stalls = perf_crsp_stalls;
|
||||
`endif
|
||||
|
||||
end
|
||||
|
@ -220,13 +272,13 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
always @(posedge clk) begin
|
||||
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
|
||||
if (core_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
|
||||
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.tag.uuid))
|
||||
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, i, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
|
||||
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -101,7 +101,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
.data_out ({commit_fire_any_r, commit_size_r})
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (COMMIT_SIZEW),
|
||||
.DATAW_OUT (COMMIT_ALL_SIZEW),
|
||||
.N (`ISSUE_WIDTH),
|
||||
|
|
|
@ -28,7 +28,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
`endif
|
||||
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
@ -65,14 +65,15 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
) lsu_mem_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
lmem_perf_t lmem_perf;
|
||||
coalescer_perf_t coalescer_perf;
|
||||
pipeline_perf_t pipeline_perf;
|
||||
sysmem_perf_t sysmem_perf_tmp;
|
||||
always @(*) begin
|
||||
sysmem_perf_tmp = sysmem_perf;
|
||||
sysmem_perf_tmp.lmem = lmem_perf;
|
||||
sysmem_perf_tmp.coalescer = coalescer_perf;
|
||||
end
|
||||
`endif
|
||||
|
||||
base_dcrs_t base_dcrs;
|
||||
|
@ -94,7 +95,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sched_perf (pipeline_perf_if.sched),
|
||||
.sched_perf (pipeline_perf.sched),
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
@ -144,7 +145,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.issue_perf (pipeline_perf_if.issue),
|
||||
.issue_perf (pipeline_perf.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
|
@ -162,8 +163,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sysmem_perf (sysmem_perf_tmp),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
@ -200,7 +201,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.lmem_perf (mem_perf_tmp_if.lmem),
|
||||
.lmem_perf (lmem_perf),
|
||||
.coalescer_perf(coalescer_perf),
|
||||
`endif
|
||||
.lsu_mem_if (lsu_mem_if),
|
||||
.dcache_bus_if (dcache_bus_if)
|
||||
|
@ -276,12 +278,11 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
assign pipeline_perf_if.ifetches = perf_ifetches;
|
||||
assign pipeline_perf_if.loads = perf_loads;
|
||||
assign pipeline_perf_if.stores = perf_stores;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
assign pipeline_perf.ifetches = perf_ifetches;
|
||||
assign pipeline_perf.loads = perf_loads;
|
||||
assign pipeline_perf.stores = perf_stores;
|
||||
assign pipeline_perf.ifetch_latency = perf_icache_lat;
|
||||
assign pipeline_perf.load_latency = perf_dcache_lat;
|
||||
|
||||
`endif
|
||||
|
||||
|
|
|
@ -127,13 +127,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
assign icache_rsp_ready = icache_bus_if.rsp_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = '0;
|
||||
assign mem_perf_if.dcache = '0;
|
||||
assign mem_perf_if.l2cache = '0;
|
||||
assign mem_perf_if.l3cache = '0;
|
||||
assign mem_perf_if.lmem = '0;
|
||||
assign mem_perf_if.mem = '0;
|
||||
// Zero-valued system-memory perf counters handed to the core instance.
// The signal must be named `sysmem_perf`: that is the identifier used by the
// `.sysmem_perf (sysmem_perf)` port connection on the core in this module.
// Every member of sysmem_perf_t is tied off so that no field is left undriven.
sysmem_perf_t sysmem_perf;
|
||||
assign sysmem_perf.icache = '0;
|
||||
assign sysmem_perf.dcache = '0;
|
||||
assign sysmem_perf.l2cache = '0;
|
||||
assign sysmem_perf.l3cache = '0;
|
||||
assign sysmem_perf.lmem = '0;
|
||||
assign sysmem_perf.coalescer = '0;
|
||||
assign sysmem_perf.mem = '0;
|
||||
`endif
|
||||
|
||||
`ifdef SCOPE
|
||||
|
@ -152,7 +152,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (dcr_bus_if),
|
||||
|
|
|
@ -41,8 +41,8 @@ import VX_fpu_pkg::*;
|
|||
input base_dcrs_t base_dcrs,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
`endif
|
||||
|
||||
VX_commit_csr_if.slave commit_csr_if,
|
||||
|
@ -212,65 +212,67 @@ import VX_fpu_pkg::*;
|
|||
`VX_DCR_MPM_CLASS_CORE: begin
|
||||
case (read_addr)
|
||||
// PERF: pipeline
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf_if.sched.idles);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf_if.sched.stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf_if.issue.ibf_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf_if.issue.scb_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf_if.issue.opd_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_ALU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf.sched.idles);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf.sched.stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_FPU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]);
|
||||
`else
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
|
||||
`endif
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]);
|
||||
// PERF: memory
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf_if.ifetches);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf_if.loads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf_if.stores);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf_if.ifetch_latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf_if.load_latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf.ifetch_latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf.load_latency);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`VX_DCR_MPM_CLASS_MEM: begin
|
||||
case (read_addr)
|
||||
// PERF: icache
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, mem_perf_if.icache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, mem_perf_if.icache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, mem_perf_if.icache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
|
||||
// PERF: dcache
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, mem_perf_if.dcache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, mem_perf_if.dcache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, mem_perf_if.dcache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, mem_perf_if.dcache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, mem_perf_if.dcache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, mem_perf_if.dcache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls);
|
||||
// PERF: lmem
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, mem_perf_if.lmem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, mem_perf_if.lmem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, mem_perf_if.lmem.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
|
||||
// PERF: l2cache
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, mem_perf_if.l2cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, mem_perf_if.l2cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, mem_perf_if.l2cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, mem_perf_if.l2cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l2cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l2cache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls);
|
||||
// PERF: l3cache
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, mem_perf_if.l3cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, mem_perf_if.l3cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, mem_perf_if.l3cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, mem_perf_if.l3cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l3cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l3cache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls);
|
||||
// PERF: memory
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, mem_perf_if.mem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, mem_perf_if.mem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, mem_perf_if.mem.latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency);
|
||||
// PERF: coalescer
|
||||
`CSR_READ_64(`VX_CSR_MPM_COALESCE_MISS, read_data_ro_w, sysmem_perf.coalescer.misses);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
@ -290,8 +292,8 @@ import VX_fpu_pkg::*;
|
|||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`UNUSED_VAR (mem_perf_if.icache);
|
||||
`UNUSED_VAR (mem_perf_if.lmem);
|
||||
`UNUSED_VAR (sysmem_perf.icache);
|
||||
`UNUSED_VAR (sysmem_perf.lmem);
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -24,8 +24,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
input base_dcrs_t base_dcrs,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -82,8 +82,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
.base_dcrs (base_dcrs),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
|
|
|
@ -23,8 +23,8 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
`endif
|
||||
|
||||
input base_dcrs_t base_dcrs,
|
||||
|
@ -93,8 +93,8 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if (pipeline_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
`endif
|
||||
.base_dcrs (base_dcrs),
|
||||
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
|
|
|
@ -137,6 +137,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
wire schedule_fire = schedule_if.valid && schedule_if.ready;
|
||||
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 1, 6, 3, (
|
||||
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
|
||||
|
|
|
@ -93,6 +93,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
|
|||
`SCOPE_IO_SWITCH (1);
|
||||
wire decode_fire = decode_if.valid && decode_if.ready;
|
||||
wire operands_fire = operands_if.valid && operands_if.ready;
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 2, 4, 3, (
|
||||
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 +
|
||||
|
|
|
@ -535,6 +535,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
`ifdef SCOPE
|
||||
`ifdef DBG_SCOPE_LSU
|
||||
`SCOPE_IO_SWITCH (1);
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 3, 4, 2, (
|
||||
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH
|
||||
|
|
|
@ -20,7 +20,8 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output cache_perf_t lmem_perf,
|
||||
output lmem_perf_t lmem_perf,
|
||||
output coalescer_perf_t coalescer_perf,
|
||||
`endif
|
||||
|
||||
VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS],
|
||||
|
@ -39,7 +40,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
|
||||
|
||||
VX_lsu_mem_if #(
|
||||
VX_lsu_mem_if #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
|
@ -60,46 +61,58 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
);
|
||||
end
|
||||
|
||||
VX_lsu_mem_if #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH)
|
||||
) lmem_arb_if[1]();
|
||||
|
||||
VX_lsu_mem_arb #(
|
||||
.NUM_INPUTS (`NUM_LSU_BLOCKS),
|
||||
.NUM_OUTPUTS(1),
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.TAG_SEL_IDX(0),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF(0),
|
||||
.RSP_OUT_BUF(2)
|
||||
) lmem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (lsu_lmem_if),
|
||||
.bus_out_if (lmem_arb_if)
|
||||
);
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lmem_bus_if[LSU_NUM_REQS]();
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH)
|
||||
) lmem_adapt_if[`NUM_LSU_LANES]();
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_adapters
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
|
||||
|
||||
VX_lsu_adapter #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (3),
|
||||
.RSP_OUT_BUF (2)
|
||||
) lmem_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.lsu_mem_if (lsu_lmem_if[i]),
|
||||
.mem_bus_if (lmem_bus_tmp_if)
|
||||
);
|
||||
|
||||
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_lmem_bus_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
|
||||
end
|
||||
end
|
||||
VX_lsu_adapter #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (3),
|
||||
.RSP_OUT_BUF (0)
|
||||
) lmem_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.lsu_mem_if (lmem_arb_if[0]),
|
||||
.mem_bus_if (lmem_adapt_if)
|
||||
);
|
||||
|
||||
VX_local_mem #(
|
||||
.INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))),
|
||||
.SIZE (1 << `LMEM_LOG_SIZE),
|
||||
.NUM_REQS (LSU_NUM_REQS),
|
||||
.NUM_REQS (`NUM_LSU_LANES),
|
||||
.NUM_BANKS (`LMEM_NUM_BANKS),
|
||||
.WORD_SIZE (LSU_WORD_SIZE),
|
||||
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH),
|
||||
.OUT_BUF (3)
|
||||
) local_mem (
|
||||
.clk (clk),
|
||||
|
@ -107,7 +120,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
.lmem_perf (lmem_perf),
|
||||
`endif
|
||||
.mem_bus_if (lmem_bus_if)
|
||||
.mem_bus_if (lmem_adapt_if)
|
||||
);
|
||||
|
||||
`else
|
||||
|
@ -115,6 +128,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
assign lmem_perf = '0;
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]);
|
||||
end
|
||||
|
@ -127,6 +141,21 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] coalescer_misses;
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (`PERF_CTR_BITS),
|
||||
.DATAW_OUT (`PERF_CTR_BITS),
|
||||
.N (`NUM_LSU_BLOCKS),
|
||||
.OP ("+")
|
||||
) coalescer_reduce (
|
||||
.data_in (per_block_coalescer_misses),
|
||||
.data_out (coalescer_misses)
|
||||
);
|
||||
`BUFFER(coalescer_perf.misses, coalescer_misses);
|
||||
`endif
|
||||
|
||||
if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers
|
||||
|
@ -139,11 +168,16 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
|
||||
.QUEUE_SIZE (`LSUQ_OUT_SIZE),
|
||||
.PERF_CTR_BITS (`PERF_CTR_BITS)
|
||||
) mem_coalescer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// The per-block miss counters are declared only under PERF_ENABLE, so the
// connection must be guarded by the same macro (not LMEM_ENABLE). The
// coalescer's `misses` output is unconditional, so it is explicitly left
// unused when perf counters are compiled out.
`ifdef PERF_ENABLE
    .misses (per_block_coalescer_misses[i]),
`else
    `UNUSED_PIN (misses),
`endif
|
||||
|
||||
// Input request
|
||||
.in_req_valid (lsu_dcache_if[i].req_valid),
|
||||
.in_req_mask (lsu_dcache_if[i].req_data.mask),
|
||||
|
@ -186,6 +220,9 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
// Per-LSU-block pass-through: each LSU dcache interface is handed to the
// matching dcache_coalesced_if slot via `ASSIGN_VX_MEM_BUS_IF.
// NOTE(review): presumably this is the branch taken when no coalescer is
// instantiated - confirm against the enclosing generate-if.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if
    `ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
    // The miss counters exist only under PERF_ENABLE (not LMEM_ENABLE); tie
    // them to zero here so the reduce tree summing them has driven inputs.
`ifdef PERF_ENABLE
    assign per_block_coalescer_misses[i] = '0;
`endif
end
|
||||
|
||||
end
|
||||
|
|
|
@ -106,7 +106,6 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (PER_BANK_ADDRW),
|
||||
.ARBITER ("P"), // use priority arbiter
|
||||
.PERF_CTR_BITS(`PERF_CTR_BITS),
|
||||
.OUT_BUF (0) // no output buffering
|
||||
) req_xbar (
|
||||
.clk (clk),
|
||||
|
|
|
@ -44,7 +44,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
|
||||
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
|
||||
VX_reduce #(
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
|
@ -53,7 +53,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
.data_out (perf_units_per_cycle)
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (`NUM_SFU_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
|
|
|
@ -21,8 +21,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
`endif
|
||||
|
||||
input base_dcrs_t base_dcrs,
|
||||
|
@ -121,8 +121,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
.execute_if (pe_execute_if[PE_IDX_CSRS]),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
|
|
@ -1,46 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Interface bundling two perf structs (sched, issue) and five
// `PERF_CTR_BITS-wide counters. The `master` modport exposes every member as
// an output and the `slave` modport exposes every member as an input.
interface VX_pipeline_perf_if import VX_gpu_pkg::*; ();
|
||||
// struct-typed members; the types come from VX_gpu_pkg
sched_perf_t sched;
|
||||
issue_perf_t issue;
|
||||
|
||||
// individual counters, each `PERF_CTR_BITS wide
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||
wire [`PERF_CTR_BITS-1:0] loads;
|
||||
wire [`PERF_CTR_BITS-1:0] stores;
|
||||
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||
|
||||
// producer view: all members are outputs
modport master (
|
||||
output sched,
|
||||
output issue,
|
||||
output ifetches,
|
||||
output loads,
|
||||
output stores,
|
||||
output ifetch_latency,
|
||||
output load_latency
|
||||
);
|
||||
|
||||
// consumer view: all members are inputs
modport slave (
|
||||
input sched,
|
||||
input issue,
|
||||
input ifetches,
|
||||
input loads,
|
||||
input stores,
|
||||
input ifetch_latency,
|
||||
input load_latency
|
||||
);
|
||||
|
||||
endinterface
|
|
@ -1,27 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Interface carrying a single `PERF_CTR_BITS-wide counter, wctl_stalls.
// `master` drives it as an output; `slave` reads it as an input.
interface VX_sfu_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] wctl_stalls;
|
||||
|
||||
// producer view
modport master (
|
||||
output wctl_stalls
|
||||
);
|
||||
|
||||
// consumer view
modport slave (
|
||||
input wctl_stalls
|
||||
);
|
||||
|
||||
endinterface
|
|
@ -24,6 +24,7 @@ module VX_mem_coalescer #(
|
|||
parameter TAG_WIDTH = 8,
|
||||
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
|
||||
parameter QUEUE_SIZE = 8,
|
||||
parameter PERF_CTR_BITS = `CLOG2(NUM_REQS+1),
|
||||
|
||||
parameter DATA_IN_WIDTH = DATA_IN_SIZE * 8,
|
||||
parameter DATA_OUT_WIDTH= DATA_OUT_SIZE * 8,
|
||||
|
@ -37,6 +38,8 @@ module VX_mem_coalescer #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
output wire [PERF_CTR_BITS-1:0] misses,
|
||||
|
||||
// Input request
|
||||
input wire in_req_valid,
|
||||
input wire in_req_rw,
|
||||
|
@ -323,6 +326,23 @@ module VX_mem_coalescer #(
|
|||
assign in_rsp_tag = {out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout_tag};
|
||||
assign out_rsp_ready = in_rsp_ready;
|
||||
|
||||
// compute coalescing misses
|
||||
// misses are partial transfers (not fully coalesced)
|
||||
|
||||
reg [PERF_CTR_BITS-1:0] misses_r;
|
||||
|
||||
wire partial_transfer = (out_req_fire && req_rem_mask_r != '1);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
misses_r <= '0;
|
||||
end else begin
|
||||
misses_r <= misses_r + PERF_CTR_BITS'(partial_transfer);
|
||||
end
|
||||
end
|
||||
|
||||
assign misses = misses_r;
|
||||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
wire [`UP(UUID_WIDTH)-1:0] out_req_uuid;
|
||||
wire [`UP(UUID_WIDTH)-1:0] out_rsp_uuid;
|
||||
|
|
|
@ -237,6 +237,8 @@ module VX_mem_scheduler #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
`UNUSED_PIN (misses),
|
||||
|
||||
// Input request
|
||||
.in_req_valid (reqq_valid),
|
||||
.in_req_mask (reqq_mask),
|
||||
|
|
|
@ -1,78 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
`TRACING_OFF
|
||||
// Combinational reduction of N input words into one output word.
//   DATAW_IN  : width of each input word
//   DATAW_OUT : width of the result (defaults to DATAW_IN)
//   N         : number of input words
//   OP        : reduction operator, one of "+", "^", "&", "|"
// The module instantiates itself recursively on the two halves of data_in
// and combines the two partial results with OP.
module VX_reduce #(
|
||||
parameter DATAW_IN = 1,
|
||||
parameter DATAW_OUT = DATAW_IN,
|
||||
parameter N = 1,
|
||||
parameter `STRING OP = "+"
|
||||
) (
|
||||
input wire [N-1:0][DATAW_IN-1:0] data_in,
|
||||
output wire [DATAW_OUT-1:0] data_out
|
||||
);
|
||||
// base case: a single word is cast to the output width and passed through
if (N == 1) begin : g_passthru
|
||||
assign data_out = DATAW_OUT'(data_in[0]);
|
||||
end else begin : g_reduce
|
||||
// split sizes: A gets floor(N/2) words, B gets the remainder
localparam int N_A = N / 2;
|
||||
localparam int N_B = N - N_A;
|
||||
|
||||
wire [N_A-1:0][DATAW_IN-1:0] in_A;
|
||||
wire [N_B-1:0][DATAW_IN-1:0] in_B;
|
||||
wire [DATAW_OUT-1:0] out_A, out_B;
|
||||
|
||||
// lower N_A words feed sub-reduction A
for (genvar i = 0; i < N_A; i++) begin : g_in_A
|
||||
assign in_A[i] = data_in[i];
|
||||
end
|
||||
|
||||
// remaining N_B words feed sub-reduction B
for (genvar i = 0; i < N_B; i++) begin : g_in_B
|
||||
assign in_B[i] = data_in[N_A + i];
|
||||
end
|
||||
|
||||
// recursive instance over the lower half
VX_reduce #(
|
||||
.DATAW_IN (DATAW_IN),
|
||||
.DATAW_OUT (DATAW_OUT),
|
||||
.N (N_A),
|
||||
.OP (OP)
|
||||
) reduce_A (
|
||||
.data_in (in_A),
|
||||
.data_out (out_A)
|
||||
);
|
||||
|
||||
// recursive instance over the upper half
VX_reduce #(
|
||||
.DATAW_IN (DATAW_IN),
|
||||
.DATAW_OUT (DATAW_OUT),
|
||||
.N (N_B),
|
||||
.OP (OP)
|
||||
) reduce_B (
|
||||
.data_in (in_B),
|
||||
.data_out (out_B)
|
||||
);
|
||||
|
||||
// combine the two partial results; any other OP string hits `ERROR
if (OP == "+") begin : g_plus
|
||||
assign data_out = out_A + out_B;
|
||||
end else if (OP == "^") begin : g_xor
|
||||
assign data_out = out_A ^ out_B;
|
||||
end else if (OP == "&") begin : g_and
|
||||
assign data_out = out_A & out_B;
|
||||
end else if (OP == "|") begin : g_or
|
||||
assign data_out = out_A | out_B;
|
||||
end else begin : g_error
|
||||
`ERROR(("invalid parameter"));
|
||||
end
|
||||
end
|
||||
|
||||
endmodule
|
||||
`TRACING_ON
|
|
@ -206,13 +206,13 @@ module VX_stream_xbar #(
|
|||
reg [PERF_CTR_BITS-1:0] collisions_r;
|
||||
|
||||
always @(*) begin
|
||||
per_cycle_collision = 0;
|
||||
per_cycle_collision = '0;
|
||||
for (integer i = 0; i < NUM_INPUTS; ++i) begin
|
||||
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin
|
||||
for (integer j = i + 1; j < NUM_INPUTS; ++j) begin
|
||||
per_cycle_collision[i] |= valid_in[i]
|
||||
&& valid_in[j+i]
|
||||
&& (sel_in[i] == sel_in[j+i])
|
||||
&& (ready_in[i] | ready_in[j+i]);
|
||||
&& valid_in[j]
|
||||
&& (sel_in[i] == sel_in[j])
|
||||
&& (ready_in[i] | ready_in[j]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -43,7 +43,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
output cache_perf_t lmem_perf,
|
||||
output lmem_perf_t lmem_perf,
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if.slave mem_bus_if [NUM_REQS]
|
||||
|
@ -286,14 +286,10 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
assign lmem_perf.reads = perf_reads;
|
||||
assign lmem_perf.writes = perf_writes;
|
||||
assign lmem_perf.read_misses = '0;
|
||||
assign lmem_perf.write_misses = '0;
|
||||
assign lmem_perf.bank_stalls = perf_collisions;
|
||||
assign lmem_perf.mshr_stalls = '0;
|
||||
assign lmem_perf.mem_stalls = '0;
|
||||
assign lmem_perf.crsp_stalls = perf_crsp_stalls;
|
||||
assign lmem_perf.reads = perf_reads;
|
||||
assign lmem_perf.writes = perf_writes;
|
||||
assign lmem_perf.bank_stalls = perf_collisions;
|
||||
assign lmem_perf.crsp_stalls = perf_crsp_stalls;
|
||||
|
||||
`endif
|
||||
|
||||
|
@ -321,15 +317,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
always @(posedge clk) begin
|
||||
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
|
||||
if (mem_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
|
@ -339,15 +335,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
always @(posedge clk) begin
|
||||
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
|
||||
if (per_bank_req_rw[i]) begin
|
||||
`TRACE(2, ("%t: %s-bank%0d wr-req: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s bank-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s bank-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
|
||||
end
|
||||
end
|
||||
if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin
|
||||
`TRACE(2, ("%t: %s-bank%0d rd-rsp: data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
`TRACE(2, ("%t: %s bank-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, per_bank_rsp_data[i], per_bank_rsp_tag_value[i], per_bank_rsp_uuid[i]))
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,43 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Interface bundling five cache_perf_t members (icache, dcache, l2cache,
// l3cache, lmem) and one mem_perf_t member (mem). The `master` modport exposes
// every member as an output and the `slave` modport as an input.
interface VX_mem_perf_if import VX_gpu_pkg::*; ();
|
||||
|
||||
// struct-typed members; the types come from VX_gpu_pkg
cache_perf_t icache;
|
||||
cache_perf_t dcache;
|
||||
cache_perf_t l2cache;
|
||||
cache_perf_t l3cache;
|
||||
cache_perf_t lmem;
|
||||
mem_perf_t mem;
|
||||
|
||||
// producer view: all members are outputs
modport master (
|
||||
output icache,
|
||||
output dcache,
|
||||
output l2cache,
|
||||
output l3cache,
|
||||
output lmem,
|
||||
output mem
|
||||
);
|
||||
|
||||
// consumer view: all members are inputs
modport slave (
|
||||
input icache,
|
||||
input dcache,
|
||||
input l2cache,
|
||||
input l3cache,
|
||||
input lmem,
|
||||
input mem
|
||||
);
|
||||
|
||||
endinterface
|
|
@ -223,7 +223,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
|
||||
return err;
|
||||
});
|
||||
|
||||
|
||||
uint64_t num_mem_bank_ports;
|
||||
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_MEM_BANKS, &num_mem_bank_ports), {
|
||||
return err;
|
||||
|
@ -437,6 +437,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
uint64_t dcache_requests_per_core = 0;
|
||||
|
||||
if (dcache_enable) {
|
||||
// PERF: Dcache
|
||||
uint64_t dcache_reads;
|
||||
|
@ -447,6 +449,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
|
||||
return err;
|
||||
});
|
||||
dcache_requests_per_core += dcache_reads + dcache_writes;
|
||||
uint64_t dcache_read_misses;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
|
||||
return err;
|
||||
|
@ -475,6 +478,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
// PERF: coalescer
|
||||
uint64_t coalescer_misses;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_COALESCER_ST, core_id, &coalescer_misses), {
|
||||
return err;
|
||||
});
|
||||
int coalescer_utilization = calcAvgPercent(dcache_requests_per_core - coalescer_misses, dcache_requests_per_core);
|
||||
fprintf(stream, "PERF: core%d: coalescer misses=%ld (hit ratio=%d%%)\n", core_id, coalescer_misses, coalescer_utilization);
|
||||
|
||||
if (l2cache_enable) {
|
||||
// PERF: L2cache
|
||||
uint64_t tmp;
|
||||
|
@ -612,7 +623,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
|
||||
int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
|
||||
int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
|
||||
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
|
||||
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
|
||||
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
|
||||
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
|
||||
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);
|
||||
|
|
|
@ -104,6 +104,27 @@ inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
|
|||
return (bits << shift) >> (shift + start);
|
||||
}
|
||||
|
||||
/// Reverses the order of all 64 bits of `bits` (bit 0 <-> bit 63, bit 1 <-> bit 62, ...).
/// Works by swapping progressively larger groups: single bits, pairs, nibbles,
/// bytes, 16-bit halves, and finally the two 32-bit halves.
inline uint64_t bit_reverse(uint64_t bits) {
  bits = ((bits & 0xAAAAAAAAAAAAAAAAULL) >> 1)  | ((bits & 0x5555555555555555ULL) << 1);
  bits = ((bits & 0xCCCCCCCCCCCCCCCCULL) >> 2)  | ((bits & 0x3333333333333333ULL) << 2);
  bits = ((bits & 0xF0F0F0F0F0F0F0F0ULL) >> 4)  | ((bits & 0x0F0F0F0F0F0F0F0FULL) << 4);
  bits = ((bits & 0xFF00FF00FF00FF00ULL) >> 8)  | ((bits & 0x00FF00FF00FF00FFULL) << 8);
  bits = ((bits & 0xFFFF0000FFFF0000ULL) >> 16) | ((bits & 0x0000FFFF0000FFFFULL) << 16);
  bits = (bits >> 32) | (bits << 32);
  return bits;
}

/// Reverses the low `width` bits of `bits` (bit i moves to bit width-1-i).
/// Input bits at or above `width` are discarded; the result is zero above `width`.
/// @param bits  value whose low bits are reversed
/// @param width number of low-order bits to reverse; must be <= 64
/// @return the reversed field, right-aligned
inline uint64_t bit_reverse(uint64_t bits, uint32_t width) {
  assert(width <= 64);
  if (width == 0)
    return 0; // nothing to reverse; also avoids an undefined 64-bit shift below
  // A full 64-bit reversal puts input bit i at 63-i; shifting right by
  // (64 - width) lands it at width-1-i and drops the bits that were >= width.
  return bit_reverse(bits) >> (64 - width);
}
|
||||
|
||||
template <typename T = uint32_t>
|
||||
T sext(const T& word, uint32_t width) {
|
||||
assert(width > 1);
|
||||
|
|
|
@ -21,32 +21,32 @@ template <typename T = uint32_t>
|
|||
class BitVector {
|
||||
private:
|
||||
static constexpr size_t BITS_PER_WORD = sizeof(T) * 8;
|
||||
std::vector<T> bits_;
|
||||
std::vector<T> words_;
|
||||
size_t size_;
|
||||
bool all_zero_;
|
||||
|
||||
size_t wordIndex(size_t pos) const {
|
||||
constexpr size_t wordIndex(size_t pos) const {
|
||||
return pos / BITS_PER_WORD;
|
||||
}
|
||||
|
||||
T bitMask(size_t pos) const {
|
||||
constexpr T bitMask(size_t pos) const {
|
||||
return T(1) << (pos % BITS_PER_WORD);
|
||||
}
|
||||
|
||||
void updateAllZero() {
|
||||
all_zero_ = std::all_of(bits_.begin(), bits_.end(), [](T word) { return word == 0; });
|
||||
all_zero_ = std::all_of(words_.begin(), words_.end(), [](T word) { return word == 0; });
|
||||
}
|
||||
|
||||
public:
|
||||
explicit BitVector(size_t size = 0)
|
||||
: bits_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD)
|
||||
: words_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD)
|
||||
, size_(size)
|
||||
, all_zero_(true)
|
||||
{}
|
||||
|
||||
void set(size_t pos) {
|
||||
if (pos >= size_) throw std::out_of_range("Index out of range");
|
||||
bits_[this->wordIndex(pos)] |= this->bitMask(pos);
|
||||
words_[this->wordIndex(pos)] |= this->bitMask(pos);
|
||||
all_zero_ = false;
|
||||
}
|
||||
|
||||
|
@ -59,19 +59,19 @@ public:
|
|||
}
|
||||
|
||||
void reset() {
|
||||
std::fill(bits_.begin(), bits_.end(), 0);
|
||||
std::fill(words_.begin(), words_.end(), 0);
|
||||
all_zero_ = true;
|
||||
}
|
||||
|
||||
void reset(size_t pos) {
|
||||
if (pos >= size_) throw std::out_of_range("Index out of range");
|
||||
bits_[this->wordIndex(pos)] &= ~this->bitMask(pos);
|
||||
words_[this->wordIndex(pos)] &= ~this->bitMask(pos);
|
||||
this->updateAllZero();
|
||||
}
|
||||
|
||||
bool test(size_t pos) const {
|
||||
if (pos >= size_) throw std::out_of_range("Index out of range");
|
||||
return bits_[this->wordIndex(pos)] & this->bitMask(pos);
|
||||
return words_[this->wordIndex(pos)] & this->bitMask(pos);
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
|
@ -80,12 +80,12 @@ public:
|
|||
|
||||
void resize(size_t new_size) {
|
||||
size_ = new_size;
|
||||
bits_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0);
|
||||
words_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0);
|
||||
this->updateAllZero();
|
||||
}
|
||||
|
||||
bool operator==(const BitVector& other) const {
|
||||
return (size_ == other.size_) && (bits_ == other.bits_);
|
||||
return (size_ == other.size_) && (words_ == other.words_);
|
||||
}
|
||||
|
||||
bool operator!=(const BitVector& other) const {
|
||||
|
@ -98,8 +98,8 @@ public:
|
|||
|
||||
BitVector& operator&=(const BitVector& other) {
|
||||
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
|
||||
for (size_t i = 0; i < bits_.size(); ++i) {
|
||||
bits_[i] &= other.bits_[i];
|
||||
for (size_t i = 0; i < words_.size(); ++i) {
|
||||
words_[i] &= other.words_[i];
|
||||
}
|
||||
this->updateAllZero();
|
||||
return *this;
|
||||
|
@ -107,8 +107,8 @@ public:
|
|||
|
||||
BitVector& operator|=(const BitVector& other) {
|
||||
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
|
||||
for (size_t i = 0; i < bits_.size(); ++i) {
|
||||
bits_[i] |= other.bits_[i];
|
||||
for (size_t i = 0; i < words_.size(); ++i) {
|
||||
words_[i] |= other.words_[i];
|
||||
}
|
||||
this->updateAllZero();
|
||||
return *this;
|
||||
|
@ -116,8 +116,8 @@ public:
|
|||
|
||||
BitVector& operator^=(const BitVector& other) {
|
||||
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
|
||||
for (size_t i = 0; i < bits_.size(); ++i) {
|
||||
bits_[i] ^= other.bits_[i];
|
||||
for (size_t i = 0; i < words_.size(); ++i) {
|
||||
words_[i] ^= other.words_[i];
|
||||
}
|
||||
this->updateAllZero();
|
||||
return *this;
|
||||
|
@ -125,23 +125,48 @@ public:
|
|||
|
||||
BitVector operator~() const {
|
||||
BitVector result(size_);
|
||||
for (size_t i = 0; i < bits_.size(); ++i) {
|
||||
result.bits_[i] = ~bits_[i];
|
||||
for (size_t i = 0; i < words_.size(); ++i) {
|
||||
result.words_[i] = ~words_[i];
|
||||
}
|
||||
result.updateAllZero();
|
||||
return result;
|
||||
}
|
||||
|
||||
void flip() {
|
||||
for (auto &word : bits_) {
|
||||
for (auto &word : words_) {
|
||||
word = ~word;
|
||||
}
|
||||
this->updateAllZero();
|
||||
}
|
||||
|
||||
void reverse() {
|
||||
if (size_ == 0)
|
||||
return;
|
||||
size_t remaining_bits = size_ % BITS_PER_WORD;
|
||||
if (remaining_bits != 0) {
|
||||
std::vector<T> reversed_words(words_.size(), 0);
|
||||
for (size_t i = 0; i < size_; ++i) {
|
||||
size_t reversed_pos = size_ - 1 - i;
|
||||
size_t src_word = i / BITS_PER_WORD;
|
||||
size_t src_offset = i % BITS_PER_WORD;
|
||||
size_t dst_word = reversed_pos / BITS_PER_WORD;
|
||||
size_t dst_offset = reversed_pos % BITS_PER_WORD;
|
||||
if (words_[src_word] & (T(1) << src_offset)) {
|
||||
reversed_words[dst_word] |= (T(1) << dst_offset);
|
||||
}
|
||||
}
|
||||
words_ = std::move(reversed_words);
|
||||
} else {
|
||||
std::reverse(words_.begin(), words_.end());
|
||||
for (auto &word : words_) {
|
||||
word = static_cast<T>(bit_reverse(static_cast<uint64_t>(word)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t count() const {
|
||||
size_t count = 0;
|
||||
for (const auto &word : bits_) {
|
||||
for (const auto &word : words_) {
|
||||
count += std::bitset<BITS_PER_WORD>(word).count();
|
||||
}
|
||||
return count;
|
||||
|
@ -160,12 +185,12 @@ public:
|
|||
size_t remaining_bits = size_ % BITS_PER_WORD;
|
||||
T full_mask = ~T(0);
|
||||
for (size_t i = 0; i < full_bits; ++i) {
|
||||
if (bits_[i] != full_mask)
|
||||
if (words_[i] != full_mask)
|
||||
return false;
|
||||
}
|
||||
if (remaining_bits > 0) {
|
||||
T partial_mask = (T(1) << remaining_bits) - 1;
|
||||
if ((bits_[full_bits] & partial_mask) != partial_mask)
|
||||
if ((words_[full_bits] & partial_mask) != partial_mask)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
|
@ -181,17 +206,17 @@ public:
|
|||
size_t bit_shift = pos % BITS_PER_WORD;
|
||||
|
||||
if (word_shift > 0) {
|
||||
for (size_t i = bits_.size() - 1; i >= word_shift; --i) {
|
||||
bits_[i] = bits_[i - word_shift];
|
||||
for (size_t i = words_.size() - 1; i >= word_shift; --i) {
|
||||
words_[i] = words_[i - word_shift];
|
||||
}
|
||||
std::fill(bits_.begin(), bits_.begin() + word_shift, 0);
|
||||
std::fill(words_.begin(), words_.begin() + word_shift, 0);
|
||||
}
|
||||
|
||||
if (bit_shift > 0) {
|
||||
for (size_t i = bits_.size() - 1; i > 0; --i) {
|
||||
bits_[i] = (bits_[i] << bit_shift) | (bits_[i - 1] >> (BITS_PER_WORD - bit_shift));
|
||||
for (size_t i = words_.size() - 1; i > 0; --i) {
|
||||
words_[i] = (words_[i] << bit_shift) | (words_[i - 1] >> (BITS_PER_WORD - bit_shift));
|
||||
}
|
||||
bits_[0] <<= bit_shift;
|
||||
words_[0] <<= bit_shift;
|
||||
}
|
||||
|
||||
this->updateAllZero();
|
||||
|
@ -208,17 +233,17 @@ public:
|
|||
size_t bit_shift = pos % BITS_PER_WORD;
|
||||
|
||||
if (word_shift > 0) {
|
||||
for (size_t i = 0; i < bits_.size() - word_shift; ++i) {
|
||||
bits_[i] = bits_[i + word_shift];
|
||||
for (size_t i = 0; i < words_.size() - word_shift; ++i) {
|
||||
words_[i] = words_[i + word_shift];
|
||||
}
|
||||
std::fill(bits_.end() - word_shift, bits_.end(), 0);
|
||||
std::fill(words_.end() - word_shift, words_.end(), 0);
|
||||
}
|
||||
|
||||
if (bit_shift > 0) {
|
||||
for (size_t i = 0; i < bits_.size() - 1; ++i) {
|
||||
bits_[i] = (bits_[i] >> bit_shift) | (bits_[i + 1] << (BITS_PER_WORD - bit_shift));
|
||||
for (size_t i = 0; i < words_.size() - 1; ++i) {
|
||||
words_[i] = (words_[i] >> bit_shift) | (words_[i + 1] << (BITS_PER_WORD - bit_shift));
|
||||
}
|
||||
bits_.back() >>= bit_shift;
|
||||
words_.back() >>= bit_shift;
|
||||
}
|
||||
|
||||
this->updateAllZero();
|
||||
|
|
|
@ -53,25 +53,25 @@ public:
|
|||
|
||||
SimPort(SimObjectBase* module)
|
||||
: SimPortBase(module)
|
||||
, peer_(nullptr)
|
||||
, sink_(nullptr)
|
||||
, tx_cb_(nullptr)
|
||||
{}
|
||||
|
||||
void bind(SimPort<Pkt>* peer) {
|
||||
assert(peer_ == nullptr);
|
||||
peer_ = peer;
|
||||
void bind(SimPort<Pkt>* sink) {
|
||||
assert(sink_ == nullptr);
|
||||
sink_ = sink;
|
||||
}
|
||||
|
||||
void unbind() {
|
||||
peer_ = nullptr;
|
||||
sink_ = nullptr;
|
||||
}
|
||||
|
||||
bool connected() const {
|
||||
return (peer_ != nullptr);
|
||||
return (sink_ != nullptr);
|
||||
}
|
||||
|
||||
SimPort* peer() const {
|
||||
return peer_;
|
||||
SimPort* sink() const {
|
||||
return sink_;
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
|
@ -111,15 +111,15 @@ protected:
|
|||
};
|
||||
|
||||
std::queue<timed_pkt_t> queue_;
|
||||
SimPort* peer_;
|
||||
SimPort* sink_;
|
||||
TxCallback tx_cb_;
|
||||
|
||||
void transfer(const Pkt& data, uint64_t cycles) {
|
||||
if (tx_cb_) {
|
||||
tx_cb_(data, cycles);
|
||||
}
|
||||
if (peer_) {
|
||||
peer_->transfer(data, cycles);
|
||||
if (sink_) {
|
||||
sink_->transfer(data, cycles);
|
||||
} else {
|
||||
queue_.push({data, cycles});
|
||||
}
|
||||
|
@ -402,8 +402,8 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
|
|||
|
||||
template <typename Pkt>
|
||||
void SimPort<Pkt>::push(const Pkt& pkt, uint64_t delay) const {
|
||||
if (peer_ && !tx_cb_) {
|
||||
reinterpret_cast<const SimPort<Pkt>*>(peer_)->push(pkt, delay);
|
||||
if (sink_ && !tx_cb_) {
|
||||
reinterpret_cast<const SimPort<Pkt>*>(sink_)->push(pkt, delay);
|
||||
} else {
|
||||
SimPlatform::instance().schedule(this, pkt, delay);
|
||||
}
|
||||
|
|
|
@ -46,8 +46,6 @@ Core::Core(const SimContext& ctx,
|
|||
, func_units_((uint32_t)FUType::Count)
|
||||
, lmem_switch_(NUM_LSU_BLOCKS)
|
||||
, mem_coalescers_(NUM_LSU_BLOCKS)
|
||||
, lsu_dcache_adapter_(NUM_LSU_BLOCKS)
|
||||
, lsu_lmem_adapter_(NUM_LSU_BLOCKS)
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, commit_arbs_(ISSUE_WIDTH)
|
||||
{
|
||||
|
@ -64,11 +62,11 @@ Core::Core(const SimContext& ctx,
|
|||
}
|
||||
|
||||
// create local memory
|
||||
snprintf(sname, 100, "%s-local_mem", this->name().c_str());
|
||||
snprintf(sname, 100, "%s-lmem", this->name().c_str());
|
||||
local_mem_ = LocalMem::Create(sname, LocalMem::Config{
|
||||
(1 << LMEM_LOG_SIZE),
|
||||
LSU_WORD_SIZE,
|
||||
LSU_NUM_REQS,
|
||||
LSU_CHANNELS,
|
||||
log2ceil(LMEM_NUM_BANKS),
|
||||
false
|
||||
});
|
||||
|
@ -79,48 +77,52 @@ Core::Core(const SimContext& ctx,
|
|||
lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1);
|
||||
}
|
||||
|
||||
// create lsu dcache adapter
|
||||
// create dcache adapter
|
||||
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter(NUM_LSU_BLOCKS);
|
||||
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
|
||||
snprintf(sname, 100, "%s-lsu_dcache_adapter%d", this->name().c_str(), i);
|
||||
lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
|
||||
lsu_dcache_adapter.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
|
||||
}
|
||||
|
||||
// create lsu lmem adapter
|
||||
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
|
||||
snprintf(sname, 100, "%s-lsu_lmem_adapter%d", this->name().c_str(), i);
|
||||
lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
|
||||
}
|
||||
// create lmem arbiter
|
||||
snprintf(sname, 100, "%s-lmem_arb", this->name().c_str());
|
||||
auto lmem_arb = LsuArbiter::Create(sname, ArbiterType::RoundRobin, NUM_LSU_BLOCKS, 1);
|
||||
|
||||
// connect lsu demux
|
||||
// create lmem adapter
|
||||
snprintf(sname, 100, "%s-lsu_lmem_adapter", this->name().c_str());
|
||||
auto lsu_lmem_adapter = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
|
||||
|
||||
// connect lmem switch
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
|
||||
lmem_switch_.at(b)->ReqLmem.bind(&lmem_arb->ReqIn.at(b));
|
||||
|
||||
mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC);
|
||||
|
||||
lmem_switch_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
|
||||
lsu_lmem_adapter_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspLmem);
|
||||
lmem_arb->RspIn.at(b).bind(&lmem_switch_.at(b)->RspLmem);
|
||||
}
|
||||
|
||||
// connect coalescer-adapter
|
||||
// connect lmem arbiter
|
||||
lmem_arb->ReqOut.at(0).bind(&lsu_lmem_adapter->ReqIn);
|
||||
lsu_lmem_adapter->RspIn.bind(&lmem_arb->RspOut.at(0));
|
||||
|
||||
// connect lmem adapter
|
||||
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
|
||||
lsu_lmem_adapter->ReqOut.at(c).bind(&local_mem_->Inputs.at(c));
|
||||
local_mem_->Outputs.at(c).bind(&lsu_lmem_adapter->RspOut.at(c));
|
||||
}
|
||||
|
||||
// connect dcache coalescer
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter_.at(b)->ReqIn);
|
||||
lsu_dcache_adapter_.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
|
||||
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter.at(b)->ReqIn);
|
||||
lsu_dcache_adapter.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
|
||||
}
|
||||
|
||||
// connect adapter-dcache
|
||||
// connect dcache adapter
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
|
||||
uint32_t i = b * DCACHE_CHANNELS + c;
|
||||
lsu_dcache_adapter_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
|
||||
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter_.at(b)->RspOut.at(c));
|
||||
}
|
||||
}
|
||||
|
||||
// connect adapter-lmem
|
||||
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
|
||||
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
|
||||
uint32_t i = b * LSU_CHANNELS + c;
|
||||
lsu_lmem_adapter_.at(b)->ReqOut.at(c).bind(&local_mem_->Inputs.at(i));
|
||||
local_mem_->Outputs.at(i).bind(&lsu_lmem_adapter_.at(b)->RspOut.at(c));
|
||||
lsu_dcache_adapter.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
|
||||
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter.at(b)->RspOut.at(c));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -127,6 +127,10 @@ public:
|
|||
return local_mem_;
|
||||
}
|
||||
|
||||
const MemCoalescer::Ptr& mem_coalescer(uint32_t idx) const {
|
||||
return mem_coalescers_.at(idx);
|
||||
}
|
||||
|
||||
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
@ -156,8 +160,6 @@ private:
|
|||
LocalMem::Ptr local_mem_;
|
||||
std::vector<LocalMemSwitch::Ptr> lmem_switch_;
|
||||
std::vector<MemCoalescer::Ptr> mem_coalescers_;
|
||||
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter_;
|
||||
std::vector<LsuMemAdapter::Ptr> lsu_lmem_adapter_;
|
||||
|
||||
PipelineLatch fetch_latch_;
|
||||
PipelineLatch decode_latch_;
|
||||
|
|
|
@ -360,7 +360,6 @@ void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) {
|
|||
} else {
|
||||
mmu_.read(data, addr, size, 0);
|
||||
}
|
||||
|
||||
DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl);
|
||||
}
|
||||
#endif
|
||||
|
@ -565,6 +564,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
auto cluster_perf = core_->socket()->cluster()->perf_stats();
|
||||
auto socket_perf = core_->socket()->perf_stats();
|
||||
auto lmem_perf = core_->local_mem()->perf_stats();
|
||||
|
||||
uint64_t coalescer_misses = 0;
|
||||
for (uint i = 0; i < NUM_LSU_BLOCKS; ++i) {
|
||||
coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses;
|
||||
}
|
||||
|
||||
switch (addr) {
|
||||
CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads);
|
||||
CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses);
|
||||
|
|
|
@ -24,14 +24,12 @@ protected:
|
|||
LocalMem* simobject_;
|
||||
Config config_;
|
||||
RAM ram_;
|
||||
uint32_t line_bits_;
|
||||
MemCrossBar::Ptr mem_xbar_;
|
||||
mutable PerfStats perf_stats_;
|
||||
|
||||
uint64_t to_local_addr(uint64_t addr) {
|
||||
uint32_t total_lines = config_.capacity / config_.line_size;
|
||||
uint32_t line_bits = log2ceil(total_lines);
|
||||
uint32_t offset = bit_getw(addr, 0, line_bits-1);
|
||||
return offset;
|
||||
return bit_getw(addr, 0, line_bits_-1);
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -40,9 +38,13 @@ public:
|
|||
, config_(config)
|
||||
, ram_(config.capacity)
|
||||
{
|
||||
uint32_t total_lines = config.capacity / config.line_size;
|
||||
line_bits_ = log2ceil(total_lines);
|
||||
|
||||
char sname[100];
|
||||
snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
|
||||
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_reqs, (1 << config.B));
|
||||
uint32_t wsel_bits = log2ceil(config_.line_size);
|
||||
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, (1 << config.B), wsel_bits);
|
||||
for (uint32_t i = 0; i < config.num_reqs; ++i) {
|
||||
simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i));
|
||||
mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i));
|
||||
|
@ -56,15 +58,15 @@ public:
|
|||
}
|
||||
|
||||
void read(void* data, uint64_t addr, uint32_t size) {
|
||||
auto s_addr = to_local_addr(addr);
|
||||
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl);
|
||||
ram_.read(data, s_addr, size);
|
||||
auto l_addr = to_local_addr(addr);
|
||||
DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
|
||||
ram_.read(data, l_addr, size);
|
||||
}
|
||||
|
||||
void write(const void* data, uint64_t addr, uint32_t size) {
|
||||
auto s_addr = to_local_addr(addr);
|
||||
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl);
|
||||
ram_.write(data, s_addr, size);
|
||||
auto l_addr = to_local_addr(addr);
|
||||
DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
|
||||
ram_.write(data, l_addr, size);
|
||||
}
|
||||
|
||||
void tick() {
|
||||
|
@ -94,7 +96,7 @@ public:
|
|||
}
|
||||
|
||||
const PerfStats& perf_stats() const {
|
||||
perf_stats_.bank_stalls = mem_xbar_->collisions();
|
||||
perf_stats_.bank_stalls = mem_xbar_->req_collisions();
|
||||
return perf_stats_;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -147,10 +147,17 @@ void MemCoalescer::tick() {
|
|||
ReqOut.push(out_req, delay_);
|
||||
DT(4, this->name() << "-mem-req: coalesced=" << cur_mask.count() << ", " << out_req);
|
||||
|
||||
// track partial responses
|
||||
perf_stats_.misses += (cur_mask.count() != in_req.mask.count());
|
||||
|
||||
// update sent mask
|
||||
sent_mask_ |= cur_mask;
|
||||
if (sent_mask_ == in_req.mask) {
|
||||
ReqIn.pop();
|
||||
sent_mask_.reset();
|
||||
}
|
||||
}
|
||||
|
||||
const MemCoalescer::PerfStats& MemCoalescer::perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
|
@ -23,6 +23,19 @@ public:
|
|||
SimPort<LsuReq> ReqOut;
|
||||
SimPort<LsuRsp> RspOut;
|
||||
|
||||
struct PerfStats {
|
||||
uint64_t misses;
|
||||
|
||||
PerfStats()
|
||||
: misses(0)
|
||||
{}
|
||||
|
||||
PerfStats& operator+=(const PerfStats& rhs) {
|
||||
this->misses += rhs.misses;
|
||||
return *this;
|
||||
}
|
||||
};
|
||||
|
||||
MemCoalescer(
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
|
@ -37,6 +50,8 @@ public:
|
|||
|
||||
void tick();
|
||||
|
||||
const PerfStats& perf_stats() const;
|
||||
|
||||
private:
|
||||
|
||||
struct pending_req_t {
|
||||
|
@ -52,6 +67,7 @@ private:
|
|||
BitVector<> sent_mask_;
|
||||
uint32_t line_size_;
|
||||
uint32_t delay_;
|
||||
PerfStats perf_stats_;
|
||||
};
|
||||
|
||||
}
|
105
sim/simx/types.h
105
sim/simx/types.h
|
@ -527,6 +527,7 @@ public:
|
|||
auto& req_in = Inputs.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
DT(4, this->name() << "-req" << o << ": " << req);
|
||||
Outputs.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_grant(o, g);
|
||||
|
@ -597,37 +598,36 @@ public:
|
|||
// process incoming requests
|
||||
for (uint32_t o = 0; o < O; ++o) {
|
||||
int32_t input_idx = -1;
|
||||
bool has_collision = false;
|
||||
for (uint32_t r = 0; r < R; ++r) {
|
||||
uint32_t i = (grants_.at(o) + r) & (R-1);
|
||||
if (i >= I)
|
||||
continue;
|
||||
auto& req_in = Inputs.at(i);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
if (req_in.empty())
|
||||
continue;
|
||||
auto& req = req_in.front();
|
||||
uint32_t output_idx = 0;
|
||||
if (lg2_outputs_ != 0) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
|
||||
// skip if input is not going to current output
|
||||
uint32_t output_idx = 0;
|
||||
if (O != 1) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
|
||||
}
|
||||
if (output_idx != o)
|
||||
continue;
|
||||
if (input_idx != -1) {
|
||||
++collisions_;
|
||||
continue;
|
||||
}
|
||||
input_idx = i;
|
||||
}
|
||||
if (input_idx != -1) {
|
||||
has_collision = true;
|
||||
continue;
|
||||
}
|
||||
input_idx = i;
|
||||
}
|
||||
if (input_idx != -1) {
|
||||
auto& req_in = Inputs.at(input_idx);
|
||||
auto& req = req_in.front();
|
||||
if (lg2_inputs_ != 0) {
|
||||
req.tag = (req.tag << lg2_inputs_) | input_idx;
|
||||
}
|
||||
DT(4, this->name() << "-req" << input_idx << ": " << req);
|
||||
DT(4, this->name() << "-req" << o << ": " << req);
|
||||
Outputs.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_grant(o, input_idx);
|
||||
collisions_ += has_collision;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -721,8 +721,8 @@ public:
|
|||
g = rsp.tag & (R-1);
|
||||
rsp.tag >>= lg2_num_reqs_;
|
||||
}
|
||||
DT(4, this->name() << "-rsp" << o << ": " << rsp);
|
||||
uint32_t j = o * R + g;
|
||||
DT(4, this->name() << "-rsp" << j << ": " << rsp);
|
||||
RspIn.at(j).push(rsp, 1);
|
||||
rsp_out.pop();
|
||||
}
|
||||
|
@ -742,7 +742,7 @@ public:
|
|||
if (lg2_num_reqs_ != 0) {
|
||||
req.tag = (req.tag << lg2_num_reqs_) | g;
|
||||
}
|
||||
DT(4, this->name() << "-req" << j << ": " << req);
|
||||
DT(4, this->name() << "-req" << o << ": " << req);
|
||||
ReqOut.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_grant(o, g);
|
||||
|
@ -798,7 +798,8 @@ public:
|
|||
, lg2_inputs_(log2ceil(num_inputs))
|
||||
, lg2_outputs_(log2ceil(num_outputs))
|
||||
, addr_start_(addr_start)
|
||||
, collisions_(0) {
|
||||
, req_collisions_(0)
|
||||
, rsp_collisions_(0) {
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 64);
|
||||
assert(num_outputs <= 64);
|
||||
|
@ -824,26 +825,27 @@ public:
|
|||
// process outgoing responses
|
||||
for (uint32_t i = 0; i < I; ++i) {
|
||||
int32_t output_idx = -1;
|
||||
bool has_collision = false;
|
||||
for (uint32_t t = 0; t < T; ++t) {
|
||||
uint32_t o = (rsp_grants_.at(i) + t) & (T-1);
|
||||
if (o >= O)
|
||||
continue;
|
||||
auto& rsp_out = RspOut.at(o);
|
||||
if (!rsp_out.empty()) {
|
||||
auto& rsp = rsp_out.front();
|
||||
if (rsp_out.empty())
|
||||
continue;
|
||||
auto& rsp = rsp_out.front();
|
||||
uint32_t input_idx = 0;
|
||||
if (lg2_inputs_ != 0) {
|
||||
input_idx = rsp.tag & (R-1);
|
||||
// skip if response is not going to current input
|
||||
uint32_t input_idx = 0;
|
||||
if (lg2_inputs_ != 0) {
|
||||
input_idx = rsp.tag & (R-1);
|
||||
}
|
||||
if (input_idx != i)
|
||||
continue;
|
||||
if (output_idx != -1) {
|
||||
++collisions_;
|
||||
continue;
|
||||
}
|
||||
output_idx = o;
|
||||
}
|
||||
if (output_idx != -1) {
|
||||
has_collision = true;
|
||||
continue;
|
||||
}
|
||||
output_idx = o;
|
||||
}
|
||||
if (output_idx != -1) {
|
||||
auto& rsp_out = RspOut.at(output_idx);
|
||||
|
@ -853,36 +855,38 @@ public:
|
|||
input_idx = rsp.tag & (R-1);
|
||||
rsp.tag >>= lg2_inputs_;
|
||||
}
|
||||
DT(4, this->name() << "-rsp" << output_idx << ": " << rsp);
|
||||
RspIn.at(input_idx).push(rsp, 1);
|
||||
DT(4, this->name() << "-rsp" << i << ": " << rsp);
|
||||
RspIn.at(i).push(rsp, 1);
|
||||
rsp_out.pop();
|
||||
this->update_rsp_grant(i, output_idx);
|
||||
rsp_collisions_ += has_collision;
|
||||
}
|
||||
}
|
||||
|
||||
// process incoming requests
|
||||
for (uint32_t o = 0; o < O; ++o) {
|
||||
int32_t input_idx = -1;
|
||||
bool has_collision = false;
|
||||
for (uint32_t r = 0; r < R; ++r) {
|
||||
uint32_t i = (req_grants_.at(o) + r) & (R-1);
|
||||
if (i >= I)
|
||||
continue;
|
||||
auto& req_in = ReqIn.at(i);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
if (req_in.empty())
|
||||
continue;
|
||||
auto& req = req_in.front();
|
||||
uint32_t output_idx = 0;
|
||||
if (lg2_outputs_ != 0) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
|
||||
// skip if request is not going to current output
|
||||
uint32_t output_idx = 0;
|
||||
if (O != 1) {
|
||||
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
|
||||
}
|
||||
if (output_idx != o)
|
||||
continue;
|
||||
if (input_idx != -1) {
|
||||
++collisions_;
|
||||
continue;
|
||||
}
|
||||
input_idx = i;
|
||||
}
|
||||
if (input_idx != -1) {
|
||||
has_collision = true;
|
||||
continue;
|
||||
}
|
||||
input_idx = i;
|
||||
}
|
||||
if (input_idx != -1) {
|
||||
auto& req_in = ReqIn.at(input_idx);
|
||||
|
@ -890,16 +894,21 @@ public:
|
|||
if (lg2_inputs_ != 0) {
|
||||
req.tag = (req.tag << lg2_inputs_) | input_idx;
|
||||
}
|
||||
DT(4, this->name() << "-req" << input_idx << ": " << req);
|
||||
DT(4, this->name() << "-req" << o << ": " << req);
|
||||
ReqOut.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_req_grant(o, input_idx);
|
||||
req_collisions_ += has_collision;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t collisions() const {
|
||||
return collisions_;
|
||||
uint64_t req_collisions() const {
|
||||
return req_collisions_;
|
||||
}
|
||||
|
||||
uint64_t rsp_collisions() const {
|
||||
return rsp_collisions_;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
@ -923,7 +932,8 @@ protected:
|
|||
uint32_t lg2_inputs_;
|
||||
uint32_t lg2_outputs_;
|
||||
uint32_t addr_start_;
|
||||
uint64_t collisions_;
|
||||
uint64_t req_collisions_;
|
||||
uint64_t rsp_collisions_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -978,7 +988,8 @@ private:
|
|||
uint32_t delay_;
|
||||
};
|
||||
|
||||
using MemArbiter = TxArbiter<MemReq, MemRsp>;
|
||||
using LsuArbiter = TxArbiter<LsuReq, LsuRsp>;
|
||||
using MemArbiter = TxArbiter<MemReq, MemRsp>;
|
||||
using MemCrossBar = TxCrossBar<MemReq, MemRsp>;
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue