memory mem_coalescer miss perf counter

RTL perf counters refactoring
This commit is contained in:
tinebp 2024-12-26 08:00:36 -08:00
parent f478bdcf25
commit 704f525fd6
41 changed files with 581 additions and 521 deletions

View file

@ -24,7 +24,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
input wire reset, input wire reset,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, input sysmem_perf_t sysmem_perf,
`endif `endif
// DCRs // DCRs
@ -43,12 +43,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
`endif `endif
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if(); cache_perf_t l2_perf;
assign mem_perf_tmp_if.icache = 'x; sysmem_perf_t sysmem_perf_tmp;
assign mem_perf_tmp_if.dcache = 'x; always @(*) begin
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; sysmem_perf_tmp = sysmem_perf;
assign mem_perf_tmp_if.lmem = 'x; sysmem_perf_tmp.l2cache = l2_perf;
assign mem_perf_tmp_if.mem = mem_perf_if.mem; end
`endif `endif
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
@ -111,7 +111,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (l2_reset), .reset (l2_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.l2cache), .cache_perf (l2_perf),
`endif `endif
.core_bus_if (per_socket_mem_bus_if), .core_bus_if (per_socket_mem_bus_if),
.mem_bus_if (mem_bus_if) .mem_bus_if (mem_bus_if)
@ -140,7 +140,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.reset (socket_reset), .reset (socket_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if), .sysmem_perf (sysmem_perf_tmp),
`endif `endif
.dcr_bus_if (socket_dcr_bus_if), .dcr_bus_if (socket_dcr_bus_if),

View file

@ -329,19 +329,19 @@
VX_edge_trigger #( \ VX_edge_trigger #( \
.POS (0), \ .POS (0), \
.INIT (0) \ .INIT (0) \
) __``dst``__ ( \ ) __neg_edge`__LINE__ ( \
.clk (clk), \ .clk (clk), \
.reset (1'b0), \ .reset (1'b0), \
.data_in (src), \ .data_in (src), \
.data_out (dst) \ .data_out (dst) \
) )
`define BUFFER_EX(dst, src, ena, RSTW, latency) \ `define BUFFER_EX(dst, src, ena, resetw, latency) \
VX_pipe_register #( \ VX_pipe_register #( \
.DATAW ($bits(dst)), \ .DATAW ($bits(dst)), \
.RESETW (RSTW), \ .RESETW (resetw), \
.DEPTH (latency) \ .DEPTH (latency) \
) __``dst``__ ( \ ) __buffer_ex`__LINE__ ( \
.clk (clk), \ .clk (clk), \
.reset (reset), \ .reset (reset), \
.enable (ena), \ .enable (ena), \
@ -349,13 +349,13 @@
.data_out (dst) \ .data_out (dst) \
) )
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 0, 1) `define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, $bits(dst), 1)
`define POP_COUNT_EX(out, in, model) \ `define POP_COUNT_EX(out, in, model) \
VX_popcount #( \ VX_popcount #( \
.N ($bits(in)), \ .N ($bits(in)), \
.MODEL (model) \ .MODEL (model) \
) __``out``__ ( \ ) __pop_count_ex`__LINE__ ( \
.data_in (in), \ .data_in (in), \
.data_out (out) \ .data_out (out) \
) )
@ -482,7 +482,7 @@
for (genvar __i = 0; __i < count; ++__i) begin \ for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_field[__i] = src[__i].``field; \ assign __reduce_add_i_field[__i] = src[__i].``field; \
end \ end \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \ VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
__reduce_add_i_field, \ __reduce_add_i_field, \
__reduce_add_o_field \ __reduce_add_o_field \
); \ ); \

View file

@ -73,6 +73,17 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] crsp_stalls; logic [`PERF_CTR_BITS-1:0] crsp_stalls;
} cache_perf_t; } cache_perf_t;
// Local-memory performance counters.
// Produced by VX_mem_unit on its lmem_perf output and carried as the
// `lmem` member of sysmem_perf_t. Each field is PERF_CTR_BITS wide.
// Being a packed struct, the field order defines the bit layout.
typedef struct packed {
// read counter (exposed through VX_CSR_MPM_LMEM_READS)
logic [`PERF_CTR_BITS-1:0] reads;
// write counter (exposed through VX_CSR_MPM_LMEM_WRITES)
logic [`PERF_CTR_BITS-1:0] writes;
// bank conflicts (exposed through VX_CSR_MPM_LMEM_BANK_ST)
logic [`PERF_CTR_BITS-1:0] bank_stalls;
// NOTE(review): presumably core-response stalls, mirroring cache_perf_t.crsp_stalls;
// no CSR read of this field is visible in this view - confirm against the producer.
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
} lmem_perf_t;
// Memory-coalescer performance counters.
// Produced by VX_mem_unit on its coalescer_perf output and carried as the
// `coalescer` member of sysmem_perf_t.
typedef struct packed {
// coalescer misses (exposed through VX_CSR_MPM_COALESCE_MISS)
logic [`PERF_CTR_BITS-1:0] misses;
} coalescer_perf_t;
typedef struct packed { typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads; logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes; logic [`PERF_CTR_BITS-1:0] writes;
@ -92,6 +103,26 @@ package VX_gpu_pkg;
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses; logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t; } issue_perf_t;
// Aggregate of all memory-system performance counters.
// Passed as a plain input down the hierarchy (Vortex -> VX_cluster ->
// VX_socket -> VX_core -> CSR logic). Each level copies the incoming value
// and overrides the members it owns before forwarding it:
//   Vortex     : l3cache, mem (all other members start as '0)
//   VX_cluster : l2cache
//   VX_socket  : icache, dcache
//   VX_core    : lmem, coalescer
// Being a packed struct, the member order defines the bit layout.
typedef struct packed {
cache_perf_t icache;
cache_perf_t dcache;
cache_perf_t l2cache;
cache_perf_t l3cache;
lmem_perf_t lmem;
coalescer_perf_t coalescer;
mem_perf_t mem;
} sysmem_perf_t;
// Core pipeline performance counters.
// In VX_core the `sched` and `issue` members are connected to the
// sched_perf / issue_perf ports of sub-modules, and the remaining members
// are assigned from the core's own perf_* registers. The CSR logic reads
// the struct under the VX_DCR_MPM_CLASS_CORE class.
typedef struct packed {
sched_perf_t sched;
issue_perf_t issue;
// exposed through VX_CSR_MPM_IFETCHES
logic [`PERF_CTR_BITS-1:0] ifetches;
// exposed through VX_CSR_MPM_LOADS
logic [`PERF_CTR_BITS-1:0] loads;
// exposed through VX_CSR_MPM_STORES
logic [`PERF_CTR_BITS-1:0] stores;
// assigned from perf_icache_lat in VX_core; exposed through VX_CSR_MPM_IFETCH_LT
logic [`PERF_CTR_BITS-1:0] ifetch_latency;
// assigned from perf_dcache_lat in VX_core; exposed through VX_CSR_MPM_LOAD_LT
logic [`PERF_CTR_BITS-1:0] load_latency;
} pipeline_perf_t;
//////////////////////// instruction arguments //////////////////////////// //////////////////////// instruction arguments ////////////////////////////
typedef struct packed { typedef struct packed {
@ -145,6 +176,7 @@ package VX_gpu_pkg;
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES)); localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS); localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES; localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
localparam LMEM_TAG_WIDTH = LSU_TAG_WIDTH + `CLOG2(`NUM_LSU_BLOCKS);
////////////////////////// Icache Parameters ////////////////////////////// ////////////////////////// Icache Parameters //////////////////////////////

View file

@ -24,7 +24,7 @@ module VX_socket import VX_gpu_pkg::*; #(
input wire reset, input wire reset,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, input sysmem_perf_t sysmem_perf,
`endif `endif
// DCRs // DCRs
@ -63,11 +63,13 @@ module VX_socket import VX_gpu_pkg::*; #(
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if(); cache_perf_t icache_perf, dcache_perf;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; sysmem_perf_t sysmem_perf_tmp;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; always @(*) begin
assign mem_perf_tmp_if.lmem = 'x; sysmem_perf_tmp = sysmem_perf;
assign mem_perf_tmp_if.mem = mem_perf_if.mem; sysmem_perf_tmp.icache = icache_perf;
sysmem_perf_tmp.dcache = dcache_perf;
end
`endif `endif
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
@ -110,7 +112,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_BUF (2) .MEM_OUT_BUF (2)
) icache ( ) icache (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.icache), .cache_perf (icache_perf),
`endif `endif
.clk (clk), .clk (clk),
.reset (icache_reset), .reset (icache_reset),
@ -160,7 +162,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MEM_OUT_BUF (2) .MEM_OUT_BUF (2)
) dcache ( ) dcache (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.dcache), .cache_perf (dcache_perf),
`endif `endif
.clk (clk), .clk (clk),
.reset (dcache_reset), .reset (dcache_reset),
@ -187,6 +189,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_arb #( VX_mem_arb #(
.NUM_INPUTS (2), .NUM_INPUTS (2),
.NUM_OUTPUTS(1),
.DATA_SIZE (`L1_LINE_SIZE), .DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH), .TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX(0), .TAG_SEL_IDX(0),
@ -234,7 +237,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.reset (core_reset), .reset (core_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if), .sysmem_perf (sysmem_perf_tmp),
`endif `endif
.dcr_bus_if (core_dcr_bus_if), .dcr_bus_if (core_dcr_bus_if),

View file

@ -177,6 +177,9 @@
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C `define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts `define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D `define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
// PERF: coalescer
`define VX_CSR_MPM_COALESCE_MISS 12'hB1E // coalescer misses
`define VX_CSR_MPM_COALESCE_MISS_H 12'hB9E
// Machine Performance-monitoring memory counters (class 3) /////////////////// // Machine Performance-monitoring memory counters (class 3) ///////////////////
// <Add your own counters: use addresses hB03..B1F, hB83..hB9F> // <Add your own counters: use addresses hB03..B1F, hB83..hB9F>

View file

@ -50,11 +50,14 @@ module Vortex import VX_gpu_pkg::*; (
`endif `endif
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if(); cache_perf_t l3_perf;
assign mem_perf_if.icache = 'x; mem_perf_t mem_perf;
assign mem_perf_if.dcache = 'x; sysmem_perf_t sysmem_perf;
assign mem_perf_if.l2cache = 'x; always @(*) begin
assign mem_perf_if.lmem = 'x; sysmem_perf = '0;
sysmem_perf.l3cache = l3_perf;
sysmem_perf.mem = mem_perf;
end
`endif `endif
VX_mem_bus_if #( VX_mem_bus_if #(
@ -98,7 +101,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (l3_reset), .reset (l3_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (mem_perf_if.l3cache), .cache_perf (l3_perf),
`endif `endif
.core_bus_if (per_cluster_mem_bus_if), .core_bus_if (per_cluster_mem_bus_if),
@ -146,7 +149,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (cluster_reset), .reset (cluster_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .sysmem_perf (sysmem_perf),
`endif `endif
.dcr_bus_if (cluster_dcr_bus_if), .dcr_bus_if (cluster_dcr_bus_if),
@ -182,7 +185,6 @@ module Vortex import VX_gpu_pkg::*; (
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire); `POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
@ -202,7 +204,6 @@ module Vortex import VX_gpu_pkg::*; (
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads; mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end end
end end
assign mem_perf_if.mem = mem_perf;
`endif `endif

View file

@ -620,6 +620,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
VX_mem_arb #( VX_mem_arb #(
.NUM_INPUTS (2), .NUM_INPUTS (2),
.NUM_OUTPUTS (1),
.DATA_SIZE (LMEM_DATA_SIZE), .DATA_SIZE (LMEM_DATA_SIZE),
.ADDR_WIDTH (CCI_VX_ADDR_WIDTH), .ADDR_WIDTH (CCI_VX_ADDR_WIDTH),
.TAG_WIDTH (CCI_VX_TAG_WIDTH), .TAG_WIDTH (CCI_VX_TAG_WIDTH),
@ -1097,7 +1098,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire vx_mem_req_fire = vx_mem_req_valid[0] && vx_mem_req_ready[0]; wire vx_mem_req_fire = vx_mem_req_valid[0] && vx_mem_req_ready[0];
wire vx_mem_rsp_fire = vx_mem_rsp_valid[0] && vx_mem_rsp_ready[0]; wire vx_mem_rsp_fire = vx_mem_rsp_valid[0] && vx_mem_rsp_ready[0];
wire avs_req_fire = (avs_write[0] || avs_read[0]) && ~avs_waitrequest[0]; wire avs_req_fire = (avs_write[0] || avs_read[0]) && ~avs_waitrequest[0];
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset); `NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, { `SCOPE_TAP (0, 0, {
vx_reset, vx_reset,

View file

@ -328,9 +328,9 @@ module VX_afu_wrap #(
`ifdef DBG_SCOPE_AFU `ifdef DBG_SCOPE_AFU
wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0]; wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0];
wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0]; wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0];
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0]; wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0]; wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0];
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset); `NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, { `SCOPE_TAP (0, 0, {
ap_reset, ap_reset,

View file

@ -210,7 +210,59 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
end end
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
assign cache_perf = '0; wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
assign perf_core_reads_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && ~core_bus_if[i].req_data.rw;
assign perf_core_writes_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && core_bus_if[i].req_data.rw;
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
end
// per cycle: core reads, core writes, core response stalls, memory request stalls
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin
perf_core_reads <= '0;
perf_core_writes <= '0;
perf_mem_stalls <= '0;
perf_crsp_stalls <= '0;
end else begin
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
end
end
assign cache_perf.reads = perf_core_reads;
assign cache_perf.writes = perf_core_writes;
assign cache_perf.read_misses = '0;
assign cache_perf.write_misses = '0;
assign cache_perf.bank_stalls = '0;
assign cache_perf.mshr_stalls = '0;
assign cache_perf.mem_stalls = perf_mem_stalls;
assign cache_perf.crsp_stalls = perf_crsp_stalls;
`endif `endif
end end
@ -220,13 +272,13 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin always @(posedge clk) begin
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
if (core_bus_if[i].req_data.rw) begin if (core_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid)) `TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
end else begin end else begin
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, i, core_bus_if[i].req_data.tag.uuid)) `TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.tag.uuid))
end end
end end
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, i, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid)) `TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
end end
end end
end end

View file

@ -101,7 +101,7 @@ module VX_commit import VX_gpu_pkg::*; #(
.data_out ({commit_fire_any_r, commit_size_r}) .data_out ({commit_fire_any_r, commit_size_r})
); );
VX_reduce #( VX_reduce_tree #(
.DATAW_IN (COMMIT_SIZEW), .DATAW_IN (COMMIT_SIZEW),
.DATAW_OUT (COMMIT_ALL_SIZEW), .DATAW_OUT (COMMIT_ALL_SIZEW),
.N (`ISSUE_WIDTH), .N (`ISSUE_WIDTH),

View file

@ -28,7 +28,7 @@ module VX_core import VX_gpu_pkg::*; #(
input wire reset, input wire reset,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, input sysmem_perf_t sysmem_perf,
`endif `endif
VX_dcr_bus_if.slave dcr_bus_if, VX_dcr_bus_if.slave dcr_bus_if,
@ -65,14 +65,15 @@ module VX_core import VX_gpu_pkg::*; #(
) lsu_mem_if[`NUM_LSU_BLOCKS](); ) lsu_mem_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if(); lmem_perf_t lmem_perf;
VX_pipeline_perf_if pipeline_perf_if(); coalescer_perf_t coalescer_perf;
pipeline_perf_t pipeline_perf;
assign mem_perf_tmp_if.icache = mem_perf_if.icache; sysmem_perf_t sysmem_perf_tmp;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache; always @(*) begin
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; sysmem_perf_tmp = sysmem_perf;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; sysmem_perf_tmp.lmem = lmem_perf;
assign mem_perf_tmp_if.mem = mem_perf_if.mem; sysmem_perf_tmp.coalescer = coalescer_perf;
end
`endif `endif
base_dcrs_t base_dcrs; base_dcrs_t base_dcrs;
@ -94,7 +95,7 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (reset), .reset (reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.sched_perf (pipeline_perf_if.sched), .sched_perf (pipeline_perf.sched),
`endif `endif
.base_dcrs (base_dcrs), .base_dcrs (base_dcrs),
@ -144,7 +145,7 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (reset), .reset (reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.issue_perf (pipeline_perf_if.issue), .issue_perf (pipeline_perf.issue),
`endif `endif
.decode_if (decode_if), .decode_if (decode_if),
@ -162,8 +163,8 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (reset), .reset (reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if), .sysmem_perf (sysmem_perf_tmp),
.pipeline_perf_if(pipeline_perf_if), .pipeline_perf (pipeline_perf),
`endif `endif
.base_dcrs (base_dcrs), .base_dcrs (base_dcrs),
@ -200,7 +201,8 @@ module VX_core import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.lmem_perf (mem_perf_tmp_if.lmem), .lmem_perf (lmem_perf),
.coalescer_perf(coalescer_perf),
`endif `endif
.lsu_mem_if (lsu_mem_if), .lsu_mem_if (lsu_mem_if),
.dcache_bus_if (dcache_bus_if) .dcache_bus_if (dcache_bus_if)
@ -276,12 +278,11 @@ module VX_core import VX_gpu_pkg::*; #(
end end
end end
assign pipeline_perf_if.ifetches = perf_ifetches; assign pipeline_perf.ifetches = perf_ifetches;
assign pipeline_perf_if.loads = perf_loads; assign pipeline_perf.loads = perf_loads;
assign pipeline_perf_if.stores = perf_stores; assign pipeline_perf.stores = perf_stores;
assign pipeline_perf_if.load_latency = perf_dcache_lat; assign pipeline_perf.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.ifetch_latency = perf_icache_lat; assign pipeline_perf.load_latency = perf_dcache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
`endif `endif

View file

@ -127,13 +127,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_rsp_ready = icache_bus_if.rsp_ready; assign icache_rsp_ready = icache_bus_if.rsp_ready;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if(); sysmem_perf_t mem_perf;
assign mem_perf_if.icache = '0; assign mem_perf.icache = '0;
assign mem_perf_if.dcache = '0; assign mem_perf.dcache = '0;
assign mem_perf_if.l2cache = '0; assign mem_perf.l2cache = '0;
assign mem_perf_if.l3cache = '0; assign mem_perf.l3cache = '0;
assign mem_perf_if.lmem = '0; assign mem_perf.lmem = '0;
assign mem_perf_if.mem = '0; assign mem_perf.mem = '0;
`endif `endif
`ifdef SCOPE `ifdef SCOPE
@ -152,7 +152,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
.reset (reset), .reset (reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .sysmem_perf (sysmem_perf),
`endif `endif
.dcr_bus_if (dcr_bus_if), .dcr_bus_if (dcr_bus_if),

View file

@ -41,8 +41,8 @@ import VX_fpu_pkg::*;
input base_dcrs_t base_dcrs, input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, input sysmem_perf_t sysmem_perf,
VX_pipeline_perf_if.slave pipeline_perf_if, input pipeline_perf_t pipeline_perf,
`endif `endif
VX_commit_csr_if.slave commit_csr_if, VX_commit_csr_if.slave commit_csr_if,
@ -212,65 +212,67 @@ import VX_fpu_pkg::*;
`VX_DCR_MPM_CLASS_CORE: begin `VX_DCR_MPM_CLASS_CORE: begin
case (read_addr) case (read_addr)
// PERF: pipeline // PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf_if.sched.idles); `CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf_if.sched.stalls); `CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf_if.issue.ibf_stalls); `CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf_if.issue.scb_stalls); `CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf_if.issue.opd_stalls); `CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_ALU]); `CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_FPU]); `CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]);
`else `else
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0)); `CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
`endif `endif
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_LSU]); `CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_SFU]); `CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]); `CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]); `CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]);
// PERF: memory // PERF: memory
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf_if.ifetches); `CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf_if.loads); `CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf_if.stores); `CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf_if.ifetch_latency); `CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf_if.load_latency); `CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf.load_latency);
default:; default:;
endcase endcase
end end
`VX_DCR_MPM_CLASS_MEM: begin `VX_DCR_MPM_CLASS_MEM: begin
case (read_addr) case (read_addr)
// PERF: icache // PERF: icache
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, mem_perf_if.icache.reads); `CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, mem_perf_if.icache.read_misses); `CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, mem_perf_if.icache.mshr_stalls); `CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
// PERF: dcache // PERF: dcache
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, mem_perf_if.dcache.reads); `CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, mem_perf_if.dcache.writes); `CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, mem_perf_if.dcache.read_misses); `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, mem_perf_if.dcache.write_misses); `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, mem_perf_if.dcache.bank_stalls); `CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, mem_perf_if.dcache.mshr_stalls); `CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls);
// PERF: lmem // PERF: lmem
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, mem_perf_if.lmem.reads); `CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, mem_perf_if.lmem.writes); `CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, mem_perf_if.lmem.bank_stalls); `CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
// PERF: l2cache // PERF: l2cache
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, mem_perf_if.l2cache.reads); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, mem_perf_if.l2cache.writes); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, mem_perf_if.l2cache.read_misses); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, mem_perf_if.l2cache.write_misses); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l2cache.bank_stalls); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l2cache.mshr_stalls); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls);
// PERF: l3cache // PERF: l3cache
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, mem_perf_if.l3cache.reads); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, mem_perf_if.l3cache.writes); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, mem_perf_if.l3cache.read_misses); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, mem_perf_if.l3cache.write_misses); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l3cache.bank_stalls); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l3cache.mshr_stalls); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls);
// PERF: memory // PERF: memory
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, mem_perf_if.mem.reads); `CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, mem_perf_if.mem.writes); `CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, mem_perf_if.mem.latency); `CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency);
// PERF: coalescer
`CSR_READ_64(`VX_CSR_MPM_COALESCE_MISS, read_data_ro_w, sysmem_perf.coalescer.misses);
default:; default:;
endcase endcase
end end
@ -290,8 +292,8 @@ import VX_fpu_pkg::*;
`RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid)) `RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
`UNUSED_VAR (mem_perf_if.icache); `UNUSED_VAR (sysmem_perf.icache);
`UNUSED_VAR (mem_perf_if.lmem); `UNUSED_VAR (sysmem_perf.lmem);
`endif `endif
endmodule endmodule

View file

@ -24,8 +24,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
input base_dcrs_t base_dcrs, input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, input sysmem_perf_t sysmem_perf,
VX_pipeline_perf_if.slave pipeline_perf_if, input pipeline_perf_t pipeline_perf,
`endif `endif
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
@ -82,8 +82,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.base_dcrs (base_dcrs), .base_dcrs (base_dcrs),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .sysmem_perf (sysmem_perf),
.pipeline_perf_if(pipeline_perf_if), .pipeline_perf (pipeline_perf),
`endif `endif
.commit_csr_if (commit_csr_if), .commit_csr_if (commit_csr_if),

View file

@ -23,8 +23,8 @@ module VX_execute import VX_gpu_pkg::*; #(
input wire reset, input wire reset,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, input sysmem_perf_t sysmem_perf,
VX_pipeline_perf_if.slave pipeline_perf_if, input pipeline_perf_t pipeline_perf,
`endif `endif
input base_dcrs_t base_dcrs, input base_dcrs_t base_dcrs,
@ -93,8 +93,8 @@ module VX_execute import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .sysmem_perf (sysmem_perf),
.pipeline_perf_if (pipeline_perf_if), .pipeline_perf (pipeline_perf),
`endif `endif
.base_dcrs (base_dcrs), .base_dcrs (base_dcrs),
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]), .dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),

View file

@ -137,6 +137,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire schedule_fire = schedule_if.valid && schedule_if.ready; wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready; wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset); `NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 1, 6, 3, ( `SCOPE_TAP_EX (0, 1, 6, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +

View file

@ -93,6 +93,7 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
`SCOPE_IO_SWITCH (1); `SCOPE_IO_SWITCH (1);
wire decode_fire = decode_if.valid && decode_if.ready; wire decode_fire = decode_if.valid && decode_if.ready;
wire operands_fire = operands_if.valid && operands_if.ready; wire operands_fire = operands_if.valid && operands_if.ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset); `NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 2, 4, 3, ( `SCOPE_TAP_EX (0, 2, 4, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 + `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 +

View file

@ -535,6 +535,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`ifdef SCOPE `ifdef SCOPE
`ifdef DBG_SCOPE_LSU `ifdef DBG_SCOPE_LSU
`SCOPE_IO_SWITCH (1); `SCOPE_IO_SWITCH (1);
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset); `NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 3, 4, 2, ( `SCOPE_TAP_EX (0, 3, 4, 2, (
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH 1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH

View file

@ -20,7 +20,8 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
input wire reset, input wire reset,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
output cache_perf_t lmem_perf, output lmem_perf_t lmem_perf,
output coalescer_perf_t coalescer_perf,
`endif `endif
VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS], VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS],
@ -39,7 +40,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE); localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
VX_lsu_mem_if #( VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES), .NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE), .DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH) .TAG_WIDTH (LSU_TAG_WIDTH)
@ -60,46 +61,58 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
); );
end end
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_arb_if[1]();
VX_lsu_mem_arb #(
.NUM_INPUTS (`NUM_LSU_BLOCKS),
.NUM_OUTPUTS(1),
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(2)
) lmem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (lsu_lmem_if),
.bus_out_if (lmem_arb_if)
);
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE), .DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH) .TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS](); ) lmem_adapt_if[`NUM_LSU_LANES]();
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_adapters VX_lsu_adapter #(
VX_mem_bus_if #( .NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE), .DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH) .TAG_WIDTH (LMEM_TAG_WIDTH),
) lmem_bus_tmp_if[`NUM_LSU_LANES](); .TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
VX_lsu_adapter #( .REQ_OUT_BUF (3),
.NUM_LANES (`NUM_LSU_LANES), .RSP_OUT_BUF (0)
.DATA_SIZE (LSU_WORD_SIZE), ) lmem_adapter (
.TAG_WIDTH (LSU_TAG_WIDTH), .clk (clk),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH), .reset (reset),
.ARBITER ("P"), .lsu_mem_if (lmem_arb_if[0]),
.REQ_OUT_BUF (3), .mem_bus_if (lmem_adapt_if)
.RSP_OUT_BUF (2) );
) lmem_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (lsu_lmem_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_lmem_bus_if
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
end
end
VX_local_mem #( VX_local_mem #(
.INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))), .INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))),
.SIZE (1 << `LMEM_LOG_SIZE), .SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (LSU_NUM_REQS), .NUM_REQS (`NUM_LSU_LANES),
.NUM_BANKS (`LMEM_NUM_BANKS), .NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE), .WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH), .ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH), .UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH), .TAG_WIDTH (LMEM_TAG_WIDTH),
.OUT_BUF (3) .OUT_BUF (3)
) local_mem ( ) local_mem (
.clk (clk), .clk (clk),
@ -107,7 +120,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.lmem_perf (lmem_perf), .lmem_perf (lmem_perf),
`endif `endif
.mem_bus_if (lmem_bus_if) .mem_bus_if (lmem_adapt_if)
); );
`else `else
@ -115,6 +128,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
assign lmem_perf = '0; assign lmem_perf = '0;
`endif `endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if
`ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]); `ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end end
@ -127,6 +141,21 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
.TAG_WIDTH (DCACHE_TAG_WIDTH) .TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if[`NUM_LSU_BLOCKS](); ) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses;
wire [`PERF_CTR_BITS-1:0] coalescer_misses;
VX_reduce_tree #(
.DATAW_IN (`PERF_CTR_BITS),
.DATAW_OUT (`PERF_CTR_BITS),
.N (`NUM_LSU_BLOCKS),
.OP ("+")
) coalescer_reduce (
.data_in (per_block_coalescer_misses),
.data_out (coalescer_misses)
);
`BUFFER(coalescer_perf.misses, coalescer_misses);
`endif
if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers
@ -139,11 +168,16 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH), .FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH), .TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH), .UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE) .QUEUE_SIZE (`LSUQ_OUT_SIZE),
.PERF_CTR_BITS (`PERF_CTR_BITS)
) mem_coalescer ( ) mem_coalescer (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
`ifdef LMEM_ENABLE
.misses (per_block_coalescer_misses[i]),
`endif
// Input request // Input request
.in_req_valid (lsu_dcache_if[i].req_valid), .in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask), .in_req_mask (lsu_dcache_if[i].req_data.mask),
@ -186,6 +220,9 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if
`ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]); `ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
`ifdef LMEM_ENABLE
assign per_block_coalescer_misses[i] = '0;
`endif
end end
end end

View file

@ -106,7 +106,6 @@ module VX_operands import VX_gpu_pkg::*; #(
.NUM_OUTPUTS (NUM_BANKS), .NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW), .DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter .ARBITER ("P"), // use priority arbiter
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (0) // no output buffering .OUT_BUF (0) // no output buffering
) req_xbar ( ) req_xbar (
.clk (clk), .clk (clk),

View file

@ -44,7 +44,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle; reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r; wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
VX_reduce #( VX_reduce_tree #(
.DATAW_IN (`NUM_EX_UNITS), .DATAW_IN (`NUM_EX_UNITS),
.N (PER_ISSUE_WARPS), .N (PER_ISSUE_WARPS),
.OP ("|") .OP ("|")
@ -53,7 +53,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
.data_out (perf_units_per_cycle) .data_out (perf_units_per_cycle)
); );
VX_reduce #( VX_reduce_tree #(
.DATAW_IN (`NUM_SFU_UNITS), .DATAW_IN (`NUM_SFU_UNITS),
.N (PER_ISSUE_WARPS), .N (PER_ISSUE_WARPS),
.OP ("|") .OP ("|")

View file

@ -21,8 +21,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
input wire reset, input wire reset,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, input sysmem_perf_t sysmem_perf,
VX_pipeline_perf_if.slave pipeline_perf_if, input pipeline_perf_t pipeline_perf,
`endif `endif
input base_dcrs_t base_dcrs, input base_dcrs_t base_dcrs,
@ -121,8 +121,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.execute_if (pe_execute_if[PE_IDX_CSRS]), .execute_if (pe_execute_if[PE_IDX_CSRS]),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .sysmem_perf (sysmem_perf),
.pipeline_perf_if(pipeline_perf_if), .pipeline_perf (pipeline_perf),
`endif `endif
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE

View file

@ -1,46 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Interface bundling pipeline performance data: two perf structs (sched, issue)
// and five counters, each `PERF_CTR_BITS wide.
// The `master` modport drives every member; the `slave` modport only reads them.
interface VX_pipeline_perf_if import VX_gpu_pkg::*; ();
// perf structs; their field layout comes from VX_gpu_pkg and is not visible here
sched_perf_t sched;
issue_perf_t issue;
// raw counters; names suggest request counts (ifetches/loads/stores) and
// accumulated latencies (ifetch_latency/load_latency) -- TODO confirm against the producer
wire [`PERF_CTR_BITS-1:0] ifetches;
wire [`PERF_CTR_BITS-1:0] loads;
wire [`PERF_CTR_BITS-1:0] stores;
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency;
// producer side: all members are outputs
modport master (
output sched,
output issue,
output ifetches,
output loads,
output stores,
output ifetch_latency,
output load_latency
);
// consumer side: all members are inputs
modport slave (
input sched,
input issue,
input ifetches,
input loads,
input stores,
input ifetch_latency,
input load_latency
);
endinterface

View file

@ -1,27 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Interface carrying a single `PERF_CTR_BITS-wide counter, wctl_stalls.
// The `master` modport drives it; the `slave` modport reads it.
interface VX_sfu_perf_if ();
// NOTE(review): name suggests warp-control stall count -- confirm against the driver
wire [`PERF_CTR_BITS-1:0] wctl_stalls;
// producer side
modport master (
output wctl_stalls
);
// consumer side
modport slave (
input wctl_stalls
);
endinterface

View file

@ -24,6 +24,7 @@ module VX_mem_coalescer #(
parameter TAG_WIDTH = 8, parameter TAG_WIDTH = 8,
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
parameter QUEUE_SIZE = 8, parameter QUEUE_SIZE = 8,
parameter PERF_CTR_BITS = `CLOG2(NUM_REQS+1),
parameter DATA_IN_WIDTH = DATA_IN_SIZE * 8, parameter DATA_IN_WIDTH = DATA_IN_SIZE * 8,
parameter DATA_OUT_WIDTH= DATA_OUT_SIZE * 8, parameter DATA_OUT_WIDTH= DATA_OUT_SIZE * 8,
@ -37,6 +38,8 @@ module VX_mem_coalescer #(
input wire clk, input wire clk,
input wire reset, input wire reset,
output wire [PERF_CTR_BITS-1:0] misses,
// Input request // Input request
input wire in_req_valid, input wire in_req_valid,
input wire in_req_rw, input wire in_req_rw,
@ -323,6 +326,23 @@ module VX_mem_coalescer #(
assign in_rsp_tag = {out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout_tag}; assign in_rsp_tag = {out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout_tag};
assign out_rsp_ready = in_rsp_ready; assign out_rsp_ready = in_rsp_ready;
// compute coalescing misses
// misses are partial transfers (not fully coalesced)
reg [PERF_CTR_BITS-1:0] misses_r;
wire partial_transfer = (out_req_fire && req_rem_mask_r != '1);
always @(posedge clk) begin
if (reset) begin
misses_r <= '0;
end else begin
misses_r <= misses_r + PERF_CTR_BITS'(partial_transfer);
end
end
assign misses = misses_r;
`ifdef DBG_TRACE_MEM `ifdef DBG_TRACE_MEM
wire [`UP(UUID_WIDTH)-1:0] out_req_uuid; wire [`UP(UUID_WIDTH)-1:0] out_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] out_rsp_uuid; wire [`UP(UUID_WIDTH)-1:0] out_rsp_uuid;

View file

@ -237,6 +237,8 @@ module VX_mem_scheduler #(
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
`UNUSED_PIN (misses),
// Input request // Input request
.in_req_valid (reqq_valid), .in_req_valid (reqq_valid),
.in_req_mask (reqq_mask), .in_req_mask (reqq_mask),

View file

@ -1,78 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`TRACING_OFF
// Combinational reduction of N input words into one output word.
//
// Parameters:
//   DATAW_IN  - bit width of each input word
//   DATAW_OUT - bit width of the result (defaults to DATAW_IN)
//   N         - number of input words
//   OP        - reduction operator: "+", "^", "&" or "|"
//
// The module instantiates itself recursively on the two halves of data_in and
// combines the two partial results with OP. Any other OP value hits `ERROR.
module VX_reduce #(
parameter DATAW_IN = 1,
parameter DATAW_OUT = DATAW_IN,
parameter N = 1,
parameter `STRING OP = "+"
) (
input wire [N-1:0][DATAW_IN-1:0] data_in,
output wire [DATAW_OUT-1:0] data_out
);
// base case: a single input is cast to the output width and passed through
if (N == 1) begin : g_passthru
assign data_out = DATAW_OUT'(data_in[0]);
end else begin : g_reduce
// split sizes: the lower half gets floor(N/2) words, the upper half gets the rest
localparam int N_A = N / 2;
localparam int N_B = N - N_A;
wire [N_A-1:0][DATAW_IN-1:0] in_A;
wire [N_B-1:0][DATAW_IN-1:0] in_B;
// partial results are already DATAW_OUT wide, so OP is applied at output width
wire [DATAW_OUT-1:0] out_A, out_B;
// lower half: data_in[0 .. N_A-1]
for (genvar i = 0; i < N_A; i++) begin : g_in_A
assign in_A[i] = data_in[i];
end
// upper half: data_in[N_A .. N-1]
for (genvar i = 0; i < N_B; i++) begin : g_in_B
assign in_B[i] = data_in[N_A + i];
end
// recursive reduction of the lower half
VX_reduce #(
.DATAW_IN (DATAW_IN),
.DATAW_OUT (DATAW_OUT),
.N (N_A),
.OP (OP)
) reduce_A (
.data_in (in_A),
.data_out (out_A)
);
// recursive reduction of the upper half
VX_reduce #(
.DATAW_IN (DATAW_IN),
.DATAW_OUT (DATAW_OUT),
.N (N_B),
.OP (OP)
) reduce_B (
.data_in (in_B),
.data_out (out_B)
);
// combine the two partial results with the selected operator
if (OP == "+") begin : g_plus
assign data_out = out_A + out_B;
end else if (OP == "^") begin : g_xor
assign data_out = out_A ^ out_B;
end else if (OP == "&") begin : g_and
assign data_out = out_A & out_B;
end else if (OP == "|") begin : g_or
assign data_out = out_A | out_B;
end else begin : g_error
// unsupported operator string
`ERROR(("invalid parameter"));
end
end
endmodule
`TRACING_ON

View file

@ -206,13 +206,13 @@ module VX_stream_xbar #(
reg [PERF_CTR_BITS-1:0] collisions_r; reg [PERF_CTR_BITS-1:0] collisions_r;
always @(*) begin always @(*) begin
per_cycle_collision = 0; per_cycle_collision = '0;
for (integer i = 0; i < NUM_INPUTS; ++i) begin for (integer i = 0; i < NUM_INPUTS; ++i) begin
for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin for (integer j = i + 1; j < NUM_INPUTS; ++j) begin
per_cycle_collision[i] |= valid_in[i] per_cycle_collision[i] |= valid_in[i]
&& valid_in[j+i] && valid_in[j]
&& (sel_in[i] == sel_in[j+i]) && (sel_in[i] == sel_in[j])
&& (ready_in[i] | ready_in[j+i]); && (ready_in[i] | ready_in[j]);
end end
end end
end end

View file

@ -43,7 +43,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
// PERF // PERF
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
output cache_perf_t lmem_perf, output lmem_perf_t lmem_perf,
`endif `endif
VX_mem_bus_if.slave mem_bus_if [NUM_REQS] VX_mem_bus_if.slave mem_bus_if [NUM_REQS]
@ -286,14 +286,10 @@ module VX_local_mem import VX_gpu_pkg::*; #(
end end
end end
assign lmem_perf.reads = perf_reads; assign lmem_perf.reads = perf_reads;
assign lmem_perf.writes = perf_writes; assign lmem_perf.writes = perf_writes;
assign lmem_perf.read_misses = '0; assign lmem_perf.bank_stalls = perf_collisions;
assign lmem_perf.write_misses = '0; assign lmem_perf.crsp_stalls = perf_crsp_stalls;
assign lmem_perf.bank_stalls = perf_collisions;
assign lmem_perf.mshr_stalls = '0;
assign lmem_perf.mem_stalls = '0;
assign lmem_perf.crsp_stalls = perf_crsp_stalls;
`endif `endif
@ -321,15 +317,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s wr-req: req_idx=%0d, addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n", `TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid)) $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end else begin end else begin
`TRACE(2, ("%t: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n", `TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid)) $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end end
end end
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s rd-rsp: req_idx=%0d, data=0x%h, tag=0x%0h (#%0d)\n", `TRACE(2, ("%t: %s core-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid)) $time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end end
end end
@ -339,15 +335,15 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin always @(posedge clk) begin
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
if (per_bank_req_rw[i]) begin if (per_bank_req_rw[i]) begin
`TRACE(2, ("%t: %s-bank%0d wr-req: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n", `TRACE(2, ("%t: %s bank-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_tag_value[i], per_bank_req_uuid[i])) $time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
end else begin end else begin
`TRACE(2, ("%t: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", `TRACE(2, ("%t: %s bank-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag_value[i], per_bank_req_uuid[i])) $time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag_value[i], per_bank_req_uuid[i]))
end end
end end
if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin
`TRACE(2, ("%t: %s-bank%0d rd-rsp: data=0x%h, tag=0x%0h (#%0d)\n", `TRACE(2, ("%t: %s bank-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_rsp_data[i], per_bank_rsp_tag_value[i], per_bank_rsp_uuid[i])) $time, INSTANCE_ID, i, per_bank_rsp_data[i], per_bank_rsp_tag_value[i], per_bank_rsp_uuid[i]))
end end
end end

View file

@ -1,43 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Interface bundling memory-system performance data: five cache_perf_t records
// (icache, dcache, l2cache, l3cache, lmem) and one mem_perf_t record (mem).
// The `master` modport drives every member; the `slave` modport only reads them.
interface VX_mem_perf_if import VX_gpu_pkg::*; ();
// one cache_perf_t record per member; struct layout comes from VX_gpu_pkg
cache_perf_t icache;
cache_perf_t dcache;
cache_perf_t l2cache;
cache_perf_t l3cache;
cache_perf_t lmem;
// mem_perf_t record; struct layout comes from VX_gpu_pkg
mem_perf_t mem;
// producer side: all members are outputs
modport master (
output icache,
output dcache,
output l2cache,
output l3cache,
output lmem,
output mem
);
// consumer side: all members are inputs
modport slave (
input icache,
input dcache,
input l2cache,
input l3cache,
input lmem,
input mem
);
endinterface

View file

@ -223,7 +223,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), { CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
return err; return err;
}); });
uint64_t num_mem_bank_ports; uint64_t num_mem_bank_ports;
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_MEM_BANKS, &num_mem_bank_ports), { CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_MEM_BANKS, &num_mem_bank_ports), {
return err; return err;
@ -437,6 +437,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization); fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
} }
uint64_t dcache_requests_per_core = 0;
if (dcache_enable) { if (dcache_enable) {
// PERF: Dcache // PERF: Dcache
uint64_t dcache_reads; uint64_t dcache_reads;
@ -447,6 +449,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), { CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
return err; return err;
}); });
dcache_requests_per_core += dcache_reads + dcache_writes;
uint64_t dcache_read_misses; uint64_t dcache_read_misses;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), { CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
return err; return err;
@ -475,6 +478,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization); fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
} }
// PERF: coalescer
uint64_t coalescer_misses;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_COALESCER_ST, core_id, &coalescer_misses), {
return err;
});
int coalescer_utilization = calcAvgPercent(dcache_requests_per_core - coalescer_misses, dcache_requests_per_core);
fprintf(stream, "PERF: core%d: coalescer misses=%ld (hit ratio=%d%%)\n", core_id, coalescer_misses, coalescer_utilization);
if (l2cache_enable) { if (l2cache_enable) {
// PERF: L2cache // PERF: L2cache
uint64_t tmp; uint64_t tmp;
@ -612,7 +623,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads); int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes); int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls); int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls); int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads); fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes); fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio); fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);

View file

@ -104,6 +104,27 @@ inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) {
return (bits << shift) >> (shift + start); return (bits << shift) >> (shift + start);
} }
// Mirrors all 64 bits of `bits`: bit 0 trades places with bit 63, bit 1 with
// bit 62, and so on.
//
// Neighbouring groups of 1, 2, 4, 8 and 16 bits are swapped in turn, and the
// two 32-bit halves are exchanged at the end.
inline uint64_t bit_reverse(uint64_t bits) {
  // For each stage, the mask selecting the lower group of every adjacent pair.
  static constexpr uint64_t kLowerGroupMasks[] = {
    0x5555555555555555ULL, // 1-bit groups
    0x3333333333333333ULL, // 2-bit groups
    0x0F0F0F0F0F0F0F0FULL, // 4-bit groups
    0x00FF00FF00FF00FFULL, // 8-bit groups
    0x0000FFFF0000FFFFULL, // 16-bit groups
  };
  uint32_t group_width = 1;
  for (const uint64_t lower : kLowerGroupMasks) {
    const uint64_t moved_down = (bits >> group_width) & lower;
    const uint64_t moved_up = (bits & lower) << group_width;
    bits = moved_down | moved_up;
    group_width *= 2;
  }
  // final stage: exchange the 32-bit halves
  return (bits << 32) | (bits >> 32);
}
// Mirrors the lowest `width` bits of `bits`: bit i moves to bit (width - 1 - i).
// Bits at position `width` and above are ignored, and the result has no bits
// set at or above `width`. `width` must not exceed 64 (checked with assert).
inline uint64_t bit_reverse(uint64_t bits, uint32_t width) {
  assert(width <= 64);
  uint64_t mirrored = 0;
  // Peel bits off the low end of the input and push them into the low end of
  // the result, so the first bit consumed ends up highest.
  for (uint32_t left = width; left != 0; --left) {
    mirrored = (mirrored << 1) | (bits & 1ULL);
    bits >>= 1;
  }
  return mirrored;
}
template <typename T = uint32_t> template <typename T = uint32_t>
T sext(const T& word, uint32_t width) { T sext(const T& word, uint32_t width) {
assert(width > 1); assert(width > 1);

View file

@ -21,32 +21,32 @@ template <typename T = uint32_t>
class BitVector { class BitVector {
private: private:
static constexpr size_t BITS_PER_WORD = sizeof(T) * 8; static constexpr size_t BITS_PER_WORD = sizeof(T) * 8;
std::vector<T> bits_; std::vector<T> words_;
size_t size_; size_t size_;
bool all_zero_; bool all_zero_;
size_t wordIndex(size_t pos) const { constexpr size_t wordIndex(size_t pos) const {
return pos / BITS_PER_WORD; return pos / BITS_PER_WORD;
} }
T bitMask(size_t pos) const { constexpr T bitMask(size_t pos) const {
return T(1) << (pos % BITS_PER_WORD); return T(1) << (pos % BITS_PER_WORD);
} }
void updateAllZero() { void updateAllZero() {
all_zero_ = std::all_of(bits_.begin(), bits_.end(), [](T word) { return word == 0; }); all_zero_ = std::all_of(words_.begin(), words_.end(), [](T word) { return word == 0; });
} }
public: public:
explicit BitVector(size_t size = 0) explicit BitVector(size_t size = 0)
: bits_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD) : words_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD)
, size_(size) , size_(size)
, all_zero_(true) , all_zero_(true)
{} {}
void set(size_t pos) { void set(size_t pos) {
if (pos >= size_) throw std::out_of_range("Index out of range"); if (pos >= size_) throw std::out_of_range("Index out of range");
bits_[this->wordIndex(pos)] |= this->bitMask(pos); words_[this->wordIndex(pos)] |= this->bitMask(pos);
all_zero_ = false; all_zero_ = false;
} }
@ -59,19 +59,19 @@ public:
} }
void reset() { void reset() {
std::fill(bits_.begin(), bits_.end(), 0); std::fill(words_.begin(), words_.end(), 0);
all_zero_ = true; all_zero_ = true;
} }
void reset(size_t pos) { void reset(size_t pos) {
if (pos >= size_) throw std::out_of_range("Index out of range"); if (pos >= size_) throw std::out_of_range("Index out of range");
bits_[this->wordIndex(pos)] &= ~this->bitMask(pos); words_[this->wordIndex(pos)] &= ~this->bitMask(pos);
this->updateAllZero(); this->updateAllZero();
} }
bool test(size_t pos) const { bool test(size_t pos) const {
if (pos >= size_) throw std::out_of_range("Index out of range"); if (pos >= size_) throw std::out_of_range("Index out of range");
return bits_[this->wordIndex(pos)] & this->bitMask(pos); return words_[this->wordIndex(pos)] & this->bitMask(pos);
} }
size_t size() const { size_t size() const {
@ -80,12 +80,12 @@ public:
void resize(size_t new_size) { void resize(size_t new_size) {
size_ = new_size; size_ = new_size;
bits_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0); words_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0);
this->updateAllZero(); this->updateAllZero();
} }
bool operator==(const BitVector& other) const { bool operator==(const BitVector& other) const {
return (size_ == other.size_) && (bits_ == other.bits_); return (size_ == other.size_) && (words_ == other.words_);
} }
bool operator!=(const BitVector& other) const { bool operator!=(const BitVector& other) const {
@ -98,8 +98,8 @@ public:
BitVector& operator&=(const BitVector& other) { BitVector& operator&=(const BitVector& other) {
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match"); if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
for (size_t i = 0; i < bits_.size(); ++i) { for (size_t i = 0; i < words_.size(); ++i) {
bits_[i] &= other.bits_[i]; words_[i] &= other.words_[i];
} }
this->updateAllZero(); this->updateAllZero();
return *this; return *this;
@ -107,8 +107,8 @@ public:
BitVector& operator|=(const BitVector& other) { BitVector& operator|=(const BitVector& other) {
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match"); if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
for (size_t i = 0; i < bits_.size(); ++i) { for (size_t i = 0; i < words_.size(); ++i) {
bits_[i] |= other.bits_[i]; words_[i] |= other.words_[i];
} }
this->updateAllZero(); this->updateAllZero();
return *this; return *this;
@ -116,8 +116,8 @@ public:
BitVector& operator^=(const BitVector& other) { BitVector& operator^=(const BitVector& other) {
if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match"); if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match");
for (size_t i = 0; i < bits_.size(); ++i) { for (size_t i = 0; i < words_.size(); ++i) {
bits_[i] ^= other.bits_[i]; words_[i] ^= other.words_[i];
} }
this->updateAllZero(); this->updateAllZero();
return *this; return *this;
@ -125,23 +125,48 @@ public:
BitVector operator~() const { BitVector operator~() const {
BitVector result(size_); BitVector result(size_);
for (size_t i = 0; i < bits_.size(); ++i) { for (size_t i = 0; i < words_.size(); ++i) {
result.bits_[i] = ~bits_[i]; result.words_[i] = ~words_[i];
} }
result.updateAllZero(); result.updateAllZero();
return result; return result;
} }
void flip() { void flip() {
for (auto &word : bits_) { for (auto &word : words_) {
word = ~word; word = ~word;
} }
this->updateAllZero(); this->updateAllZero();
} }
void reverse() {
if (size_ == 0)
return;
size_t remaining_bits = size_ % BITS_PER_WORD;
if (remaining_bits != 0) {
std::vector<T> reversed_words(words_.size(), 0);
for (size_t i = 0; i < size_; ++i) {
size_t reversed_pos = size_ - 1 - i;
size_t src_word = i / BITS_PER_WORD;
size_t src_offset = i % BITS_PER_WORD;
size_t dst_word = reversed_pos / BITS_PER_WORD;
size_t dst_offset = reversed_pos % BITS_PER_WORD;
if (words_[src_word] & (T(1) << src_offset)) {
reversed_words[dst_word] |= (T(1) << dst_offset);
}
}
words_ = std::move(reversed_words);
} else {
std::reverse(words_.begin(), words_.end());
for (auto &word : words_) {
word = static_cast<T>(bit_reverse(static_cast<uint64_t>(word)));
}
}
}
size_t count() const { size_t count() const {
size_t count = 0; size_t count = 0;
for (const auto &word : bits_) { for (const auto &word : words_) {
count += std::bitset<BITS_PER_WORD>(word).count(); count += std::bitset<BITS_PER_WORD>(word).count();
} }
return count; return count;
@ -160,12 +185,12 @@ public:
size_t remaining_bits = size_ % BITS_PER_WORD; size_t remaining_bits = size_ % BITS_PER_WORD;
T full_mask = ~T(0); T full_mask = ~T(0);
for (size_t i = 0; i < full_bits; ++i) { for (size_t i = 0; i < full_bits; ++i) {
if (bits_[i] != full_mask) if (words_[i] != full_mask)
return false; return false;
} }
if (remaining_bits > 0) { if (remaining_bits > 0) {
T partial_mask = (T(1) << remaining_bits) - 1; T partial_mask = (T(1) << remaining_bits) - 1;
if ((bits_[full_bits] & partial_mask) != partial_mask) if ((words_[full_bits] & partial_mask) != partial_mask)
return false; return false;
} }
return true; return true;
@ -181,17 +206,17 @@ public:
size_t bit_shift = pos % BITS_PER_WORD; size_t bit_shift = pos % BITS_PER_WORD;
if (word_shift > 0) { if (word_shift > 0) {
for (size_t i = bits_.size() - 1; i >= word_shift; --i) { for (size_t i = words_.size() - 1; i >= word_shift; --i) {
bits_[i] = bits_[i - word_shift]; words_[i] = words_[i - word_shift];
} }
std::fill(bits_.begin(), bits_.begin() + word_shift, 0); std::fill(words_.begin(), words_.begin() + word_shift, 0);
} }
if (bit_shift > 0) { if (bit_shift > 0) {
for (size_t i = bits_.size() - 1; i > 0; --i) { for (size_t i = words_.size() - 1; i > 0; --i) {
bits_[i] = (bits_[i] << bit_shift) | (bits_[i - 1] >> (BITS_PER_WORD - bit_shift)); words_[i] = (words_[i] << bit_shift) | (words_[i - 1] >> (BITS_PER_WORD - bit_shift));
} }
bits_[0] <<= bit_shift; words_[0] <<= bit_shift;
} }
this->updateAllZero(); this->updateAllZero();
@ -208,17 +233,17 @@ public:
size_t bit_shift = pos % BITS_PER_WORD; size_t bit_shift = pos % BITS_PER_WORD;
if (word_shift > 0) { if (word_shift > 0) {
for (size_t i = 0; i < bits_.size() - word_shift; ++i) { for (size_t i = 0; i < words_.size() - word_shift; ++i) {
bits_[i] = bits_[i + word_shift]; words_[i] = words_[i + word_shift];
} }
std::fill(bits_.end() - word_shift, bits_.end(), 0); std::fill(words_.end() - word_shift, words_.end(), 0);
} }
if (bit_shift > 0) { if (bit_shift > 0) {
for (size_t i = 0; i < bits_.size() - 1; ++i) { for (size_t i = 0; i < words_.size() - 1; ++i) {
bits_[i] = (bits_[i] >> bit_shift) | (bits_[i + 1] << (BITS_PER_WORD - bit_shift)); words_[i] = (words_[i] >> bit_shift) | (words_[i + 1] << (BITS_PER_WORD - bit_shift));
} }
bits_.back() >>= bit_shift; words_.back() >>= bit_shift;
} }
this->updateAllZero(); this->updateAllZero();

View file

@ -53,25 +53,25 @@ public:
SimPort(SimObjectBase* module) SimPort(SimObjectBase* module)
: SimPortBase(module) : SimPortBase(module)
, peer_(nullptr) , sink_(nullptr)
, tx_cb_(nullptr) , tx_cb_(nullptr)
{} {}
void bind(SimPort<Pkt>* peer) { void bind(SimPort<Pkt>* sink) {
assert(peer_ == nullptr); assert(sink_ == nullptr);
peer_ = peer; sink_ = sink;
} }
void unbind() { void unbind() {
peer_ = nullptr; sink_ = nullptr;
} }
bool connected() const { bool connected() const {
return (peer_ != nullptr); return (sink_ != nullptr);
} }
SimPort* peer() const { SimPort* sink() const {
return peer_; return sink_;
} }
bool empty() const { bool empty() const {
@ -111,15 +111,15 @@ protected:
}; };
std::queue<timed_pkt_t> queue_; std::queue<timed_pkt_t> queue_;
SimPort* peer_; SimPort* sink_;
TxCallback tx_cb_; TxCallback tx_cb_;
void transfer(const Pkt& data, uint64_t cycles) { void transfer(const Pkt& data, uint64_t cycles) {
if (tx_cb_) { if (tx_cb_) {
tx_cb_(data, cycles); tx_cb_(data, cycles);
} }
if (peer_) { if (sink_) {
peer_->transfer(data, cycles); sink_->transfer(data, cycles);
} else { } else {
queue_.push({data, cycles}); queue_.push({data, cycles});
} }
@ -402,8 +402,8 @@ typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
template <typename Pkt> template <typename Pkt>
void SimPort<Pkt>::push(const Pkt& pkt, uint64_t delay) const { void SimPort<Pkt>::push(const Pkt& pkt, uint64_t delay) const {
if (peer_ && !tx_cb_) { if (sink_ && !tx_cb_) {
reinterpret_cast<const SimPort<Pkt>*>(peer_)->push(pkt, delay); reinterpret_cast<const SimPort<Pkt>*>(sink_)->push(pkt, delay);
} else { } else {
SimPlatform::instance().schedule(this, pkt, delay); SimPlatform::instance().schedule(this, pkt, delay);
} }

View file

@ -46,8 +46,6 @@ Core::Core(const SimContext& ctx,
, func_units_((uint32_t)FUType::Count) , func_units_((uint32_t)FUType::Count)
, lmem_switch_(NUM_LSU_BLOCKS) , lmem_switch_(NUM_LSU_BLOCKS)
, mem_coalescers_(NUM_LSU_BLOCKS) , mem_coalescers_(NUM_LSU_BLOCKS)
, lsu_dcache_adapter_(NUM_LSU_BLOCKS)
, lsu_lmem_adapter_(NUM_LSU_BLOCKS)
, pending_icache_(arch_.num_warps()) , pending_icache_(arch_.num_warps())
, commit_arbs_(ISSUE_WIDTH) , commit_arbs_(ISSUE_WIDTH)
{ {
@ -64,11 +62,11 @@ Core::Core(const SimContext& ctx,
} }
// create local memory // create local memory
snprintf(sname, 100, "%s-local_mem", this->name().c_str()); snprintf(sname, 100, "%s-lmem", this->name().c_str());
local_mem_ = LocalMem::Create(sname, LocalMem::Config{ local_mem_ = LocalMem::Create(sname, LocalMem::Config{
(1 << LMEM_LOG_SIZE), (1 << LMEM_LOG_SIZE),
LSU_WORD_SIZE, LSU_WORD_SIZE,
LSU_NUM_REQS, LSU_CHANNELS,
log2ceil(LMEM_NUM_BANKS), log2ceil(LMEM_NUM_BANKS),
false false
}); });
@ -79,48 +77,52 @@ Core::Core(const SimContext& ctx,
lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1); lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1);
} }
// create lsu dcache adapter // create dcache adapter
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter(NUM_LSU_BLOCKS);
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) {
snprintf(sname, 100, "%s-lsu_dcache_adapter%d", this->name().c_str(), i); snprintf(sname, 100, "%s-lsu_dcache_adapter%d", this->name().c_str(), i);
lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1); lsu_dcache_adapter.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1);
} }
// create lsu lmem adapter // create lmem arbiter
for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { snprintf(sname, 100, "%s-lmem_arb", this->name().c_str());
snprintf(sname, 100, "%s-lsu_lmem_adapter%d", this->name().c_str(), i); auto lmem_arb = LsuArbiter::Create(sname, ArbiterType::RoundRobin, NUM_LSU_BLOCKS, 1);
lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
}
// connect lsu demux // create lmem adapter
snprintf(sname, 100, "%s-lsu_lmem_adapter", this->name().c_str());
auto lsu_lmem_adapter = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1);
// connect lmem switch
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn); lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn);
lmem_switch_.at(b)->ReqLmem.bind(&lmem_arb->ReqIn.at(b));
mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC); mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC);
lmem_arb->RspIn.at(b).bind(&lmem_switch_.at(b)->RspLmem);
lmem_switch_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn);
lsu_lmem_adapter_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspLmem);
} }
// connect coalescer-adapter // connect lmem arbiter
lmem_arb->ReqOut.at(0).bind(&lsu_lmem_adapter->ReqIn);
lsu_lmem_adapter->RspIn.bind(&lmem_arb->RspOut.at(0));
// connect lmem adapter
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
lsu_lmem_adapter->ReqOut.at(c).bind(&local_mem_->Inputs.at(c));
local_mem_->Outputs.at(c).bind(&lsu_lmem_adapter->RspOut.at(c));
}
// connect dcache coalescer
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter_.at(b)->ReqIn); mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter.at(b)->ReqIn);
lsu_dcache_adapter_.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut); lsu_dcache_adapter.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut);
} }
// connect adapter-dcache // connect dcache adapter
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) { for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) {
uint32_t i = b * DCACHE_CHANNELS + c; uint32_t i = b * DCACHE_CHANNELS + c;
lsu_dcache_adapter_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i)); lsu_dcache_adapter.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i));
dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter_.at(b)->RspOut.at(c)); dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter.at(b)->RspOut.at(c));
}
}
// connect adapter-lmem
for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) {
for (uint32_t c = 0; c < LSU_CHANNELS; ++c) {
uint32_t i = b * LSU_CHANNELS + c;
lsu_lmem_adapter_.at(b)->ReqOut.at(c).bind(&local_mem_->Inputs.at(i));
local_mem_->Outputs.at(i).bind(&lsu_lmem_adapter_.at(b)->RspOut.at(c));
} }
} }

View file

@ -127,6 +127,10 @@ public:
return local_mem_; return local_mem_;
} }
// Returns a const reference to the coalescer handle stored at position idx
// in mem_coalescers_. The lookup goes through at(), so idx is range-checked.
const MemCoalescer::Ptr& mem_coalescer(uint32_t idx) const {
  const auto& coalescer = mem_coalescers_.at(idx);
  return coalescer;
}
const PerfStats& perf_stats() const { const PerfStats& perf_stats() const {
return perf_stats_; return perf_stats_;
} }
@ -156,8 +160,6 @@ private:
LocalMem::Ptr local_mem_; LocalMem::Ptr local_mem_;
std::vector<LocalMemSwitch::Ptr> lmem_switch_; std::vector<LocalMemSwitch::Ptr> lmem_switch_;
std::vector<MemCoalescer::Ptr> mem_coalescers_; std::vector<MemCoalescer::Ptr> mem_coalescers_;
std::vector<LsuMemAdapter::Ptr> lsu_dcache_adapter_;
std::vector<LsuMemAdapter::Ptr> lsu_lmem_adapter_;
PipelineLatch fetch_latch_; PipelineLatch fetch_latch_;
PipelineLatch decode_latch_; PipelineLatch decode_latch_;

View file

@ -360,7 +360,6 @@ void Emulator::dcache_read(void *data, uint64_t addr, uint32_t size) {
} else { } else {
mmu_.read(data, addr, size, 0); mmu_.read(data, addr, size, 0);
} }
DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl); DPH(2, "Mem Read: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl);
} }
#endif #endif
@ -565,6 +564,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
auto cluster_perf = core_->socket()->cluster()->perf_stats(); auto cluster_perf = core_->socket()->cluster()->perf_stats();
auto socket_perf = core_->socket()->perf_stats(); auto socket_perf = core_->socket()->perf_stats();
auto lmem_perf = core_->local_mem()->perf_stats(); auto lmem_perf = core_->local_mem()->perf_stats();
uint64_t coalescer_misses = 0;
for (uint i = 0; i < NUM_LSU_BLOCKS; ++i) {
coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses;
}
switch (addr) { switch (addr) {
CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads); CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads);
CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses); CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses);

View file

@ -24,14 +24,12 @@ protected:
LocalMem* simobject_; LocalMem* simobject_;
Config config_; Config config_;
RAM ram_; RAM ram_;
uint32_t line_bits_;
MemCrossBar::Ptr mem_xbar_; MemCrossBar::Ptr mem_xbar_;
mutable PerfStats perf_stats_; mutable PerfStats perf_stats_;
uint64_t to_local_addr(uint64_t addr) { uint64_t to_local_addr(uint64_t addr) {
uint32_t total_lines = config_.capacity / config_.line_size; return bit_getw(addr, 0, line_bits_-1);
uint32_t line_bits = log2ceil(total_lines);
uint32_t offset = bit_getw(addr, 0, line_bits-1);
return offset;
} }
public: public:
@ -40,9 +38,13 @@ public:
, config_(config) , config_(config)
, ram_(config.capacity) , ram_(config.capacity)
{ {
uint32_t total_lines = config.capacity / config.line_size;
line_bits_ = log2ceil(total_lines);
char sname[100]; char sname[100];
snprintf(sname, 100, "%s-xbar", simobject->name().c_str()); snprintf(sname, 100, "%s-xbar", simobject->name().c_str());
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_reqs, (1 << config.B)); uint32_t wsel_bits = log2ceil(config_.line_size);
mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, (1 << config.B), wsel_bits);
for (uint32_t i = 0; i < config.num_reqs; ++i) { for (uint32_t i = 0; i < config.num_reqs; ++i) {
simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i)); simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i));
mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i)); mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i));
@ -56,15 +58,15 @@ public:
} }
void read(void* data, uint64_t addr, uint32_t size) { void read(void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr); auto l_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl); DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
ram_.read(data, s_addr, size); ram_.read(data, l_addr, size);
} }
void write(const void* data, uint64_t addr, uint32_t size) { void write(const void* data, uint64_t addr, uint32_t size) {
auto s_addr = to_local_addr(addr); auto l_addr = to_local_addr(addr);
DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl); DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl);
ram_.write(data, s_addr, size); ram_.write(data, l_addr, size);
} }
void tick() { void tick() {
@ -94,7 +96,7 @@ public:
} }
const PerfStats& perf_stats() const { const PerfStats& perf_stats() const {
perf_stats_.bank_stalls = mem_xbar_->collisions(); perf_stats_.bank_stalls = mem_xbar_->req_collisions();
return perf_stats_; return perf_stats_;
} }
}; };

View file

@ -147,10 +147,17 @@ void MemCoalescer::tick() {
ReqOut.push(out_req, delay_); ReqOut.push(out_req, delay_);
DT(4, this->name() << "-mem-req: coalesced=" << cur_mask.count() << ", " << out_req); DT(4, this->name() << "-mem-req: coalesced=" << cur_mask.count() << ", " << out_req);
// track partial responses
perf_stats_.misses += (cur_mask.count() != in_req.mask.count());
// update sent mask // update sent mask
sent_mask_ |= cur_mask; sent_mask_ |= cur_mask;
if (sent_mask_ == in_req.mask) { if (sent_mask_ == in_req.mask) {
ReqIn.pop(); ReqIn.pop();
sent_mask_.reset(); sent_mask_.reset();
} }
}
const MemCoalescer::PerfStats& MemCoalescer::perf_stats() const {
return perf_stats_;
} }

View file

@ -23,6 +23,19 @@ public:
SimPort<LsuReq> ReqOut; SimPort<LsuReq> ReqOut;
SimPort<LsuRsp> RspOut; SimPort<LsuRsp> RspOut;
// Performance counters exposed by the coalescer. Instances start at zero and
// can be accumulated with operator+=.
struct PerfStats {
  // Miss counter; zero on construction.
  uint64_t misses = 0;

  PerfStats() {}

  // Adds the counters of other into this object and returns this object.
  PerfStats& operator+=(const PerfStats& other) {
    misses += other.misses;
    return *this;
  }
};
MemCoalescer( MemCoalescer(
const SimContext& ctx, const SimContext& ctx,
const char* name, const char* name,
@ -37,6 +50,8 @@ public:
void tick(); void tick();
const PerfStats& perf_stats() const;
private: private:
struct pending_req_t { struct pending_req_t {
@ -52,6 +67,7 @@ private:
BitVector<> sent_mask_; BitVector<> sent_mask_;
uint32_t line_size_; uint32_t line_size_;
uint32_t delay_; uint32_t delay_;
PerfStats perf_stats_;
}; };
} }

View file

@ -527,6 +527,7 @@ public:
auto& req_in = Inputs.at(j); auto& req_in = Inputs.at(j);
if (!req_in.empty()) { if (!req_in.empty()) {
auto& req = req_in.front(); auto& req = req_in.front();
DT(4, this->name() << "-req" << o << ": " << req);
Outputs.at(o).push(req, delay_); Outputs.at(o).push(req, delay_);
req_in.pop(); req_in.pop();
this->update_grant(o, g); this->update_grant(o, g);
@ -597,37 +598,36 @@ public:
// process incoming requests // process incoming requests
for (uint32_t o = 0; o < O; ++o) { for (uint32_t o = 0; o < O; ++o) {
int32_t input_idx = -1; int32_t input_idx = -1;
bool has_collision = false;
for (uint32_t r = 0; r < R; ++r) { for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (grants_.at(o) + r) & (R-1); uint32_t i = (grants_.at(o) + r) & (R-1);
if (i >= I) if (i >= I)
continue; continue;
auto& req_in = Inputs.at(i); auto& req_in = Inputs.at(i);
if (!req_in.empty()) { if (req_in.empty())
auto& req = req_in.front(); continue;
auto& req = req_in.front();
uint32_t output_idx = 0;
if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
// skip if input is not going to current output // skip if input is not going to current output
uint32_t output_idx = 0;
if (O != 1) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
}
if (output_idx != o) if (output_idx != o)
continue; continue;
if (input_idx != -1) {
++collisions_;
continue;
}
input_idx = i;
} }
if (input_idx != -1) {
has_collision = true;
continue;
}
input_idx = i;
} }
if (input_idx != -1) { if (input_idx != -1) {
auto& req_in = Inputs.at(input_idx); auto& req_in = Inputs.at(input_idx);
auto& req = req_in.front(); auto& req = req_in.front();
if (lg2_inputs_ != 0) { DT(4, this->name() << "-req" << o << ": " << req);
req.tag = (req.tag << lg2_inputs_) | input_idx;
}
DT(4, this->name() << "-req" << input_idx << ": " << req);
Outputs.at(o).push(req, delay_); Outputs.at(o).push(req, delay_);
req_in.pop(); req_in.pop();
this->update_grant(o, input_idx); this->update_grant(o, input_idx);
collisions_ += has_collision;
} }
} }
} }
@ -721,8 +721,8 @@ public:
g = rsp.tag & (R-1); g = rsp.tag & (R-1);
rsp.tag >>= lg2_num_reqs_; rsp.tag >>= lg2_num_reqs_;
} }
DT(4, this->name() << "-rsp" << o << ": " << rsp);
uint32_t j = o * R + g; uint32_t j = o * R + g;
DT(4, this->name() << "-rsp" << j << ": " << rsp);
RspIn.at(j).push(rsp, 1); RspIn.at(j).push(rsp, 1);
rsp_out.pop(); rsp_out.pop();
} }
@ -742,7 +742,7 @@ public:
if (lg2_num_reqs_ != 0) { if (lg2_num_reqs_ != 0) {
req.tag = (req.tag << lg2_num_reqs_) | g; req.tag = (req.tag << lg2_num_reqs_) | g;
} }
DT(4, this->name() << "-req" << j << ": " << req); DT(4, this->name() << "-req" << o << ": " << req);
ReqOut.at(o).push(req, delay_); ReqOut.at(o).push(req, delay_);
req_in.pop(); req_in.pop();
this->update_grant(o, g); this->update_grant(o, g);
@ -798,7 +798,8 @@ public:
, lg2_inputs_(log2ceil(num_inputs)) , lg2_inputs_(log2ceil(num_inputs))
, lg2_outputs_(log2ceil(num_outputs)) , lg2_outputs_(log2ceil(num_outputs))
, addr_start_(addr_start) , addr_start_(addr_start)
, collisions_(0) { , req_collisions_(0)
, rsp_collisions_(0) {
assert(delay != 0); assert(delay != 0);
assert(num_inputs <= 64); assert(num_inputs <= 64);
assert(num_outputs <= 64); assert(num_outputs <= 64);
@ -824,26 +825,27 @@ public:
// process outgoing responses // process outgoing responses
for (uint32_t i = 0; i < I; ++i) { for (uint32_t i = 0; i < I; ++i) {
int32_t output_idx = -1; int32_t output_idx = -1;
bool has_collision = false;
for (uint32_t t = 0; t < T; ++t) { for (uint32_t t = 0; t < T; ++t) {
uint32_t o = (rsp_grants_.at(i) + t) & (T-1); uint32_t o = (rsp_grants_.at(i) + t) & (T-1);
if (o >= O) if (o >= O)
continue; continue;
auto& rsp_out = RspOut.at(o); auto& rsp_out = RspOut.at(o);
if (!rsp_out.empty()) { if (rsp_out.empty())
auto& rsp = rsp_out.front(); continue;
auto& rsp = rsp_out.front();
uint32_t input_idx = 0;
if (lg2_inputs_ != 0) {
input_idx = rsp.tag & (R-1);
// skip if response is not going to current input // skip if response is not going to current input
uint32_t input_idx = 0;
if (lg2_inputs_ != 0) {
input_idx = rsp.tag & (R-1);
}
if (input_idx != i) if (input_idx != i)
continue; continue;
if (output_idx != -1) {
++collisions_;
continue;
}
output_idx = o;
} }
if (output_idx != -1) {
has_collision = true;
continue;
}
output_idx = o;
} }
if (output_idx != -1) { if (output_idx != -1) {
auto& rsp_out = RspOut.at(output_idx); auto& rsp_out = RspOut.at(output_idx);
@ -853,36 +855,38 @@ public:
input_idx = rsp.tag & (R-1); input_idx = rsp.tag & (R-1);
rsp.tag >>= lg2_inputs_; rsp.tag >>= lg2_inputs_;
} }
DT(4, this->name() << "-rsp" << output_idx << ": " << rsp); DT(4, this->name() << "-rsp" << i << ": " << rsp);
RspIn.at(input_idx).push(rsp, 1); RspIn.at(i).push(rsp, 1);
rsp_out.pop(); rsp_out.pop();
this->update_rsp_grant(i, output_idx); this->update_rsp_grant(i, output_idx);
rsp_collisions_ += has_collision;
} }
} }
// process incoming requests // process incoming requests
for (uint32_t o = 0; o < O; ++o) { for (uint32_t o = 0; o < O; ++o) {
int32_t input_idx = -1; int32_t input_idx = -1;
bool has_collision = false;
for (uint32_t r = 0; r < R; ++r) { for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (req_grants_.at(o) + r) & (R-1); uint32_t i = (req_grants_.at(o) + r) & (R-1);
if (i >= I) if (i >= I)
continue; continue;
auto& req_in = ReqIn.at(i); auto& req_in = ReqIn.at(i);
if (!req_in.empty()) { if (req_in.empty())
auto& req = req_in.front(); continue;
auto& req = req_in.front();
uint32_t output_idx = 0;
if (lg2_outputs_ != 0) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, addr_start_ + (lg2_outputs_-1));
// skip if request is not going to current output // skip if request is not going to current output
uint32_t output_idx = 0;
if (O != 1) {
output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1);
}
if (output_idx != o) if (output_idx != o)
continue; continue;
if (input_idx != -1) {
++collisions_;
continue;
}
input_idx = i;
} }
if (input_idx != -1) {
has_collision = true;
continue;
}
input_idx = i;
} }
if (input_idx != -1) { if (input_idx != -1) {
auto& req_in = ReqIn.at(input_idx); auto& req_in = ReqIn.at(input_idx);
@ -890,16 +894,21 @@ public:
if (lg2_inputs_ != 0) { if (lg2_inputs_ != 0) {
req.tag = (req.tag << lg2_inputs_) | input_idx; req.tag = (req.tag << lg2_inputs_) | input_idx;
} }
DT(4, this->name() << "-req" << input_idx << ": " << req); DT(4, this->name() << "-req" << o << ": " << req);
ReqOut.at(o).push(req, delay_); ReqOut.at(o).push(req, delay_);
req_in.pop(); req_in.pop();
this->update_req_grant(o, input_idx); this->update_req_grant(o, input_idx);
req_collisions_ += has_collision;
} }
} }
} }
uint64_t collisions() const { uint64_t req_collisions() const {
return collisions_; return req_collisions_;
}
uint64_t rsp_collisions() const {
return rsp_collisions_;
} }
protected: protected:
@ -923,7 +932,8 @@ protected:
uint32_t lg2_inputs_; uint32_t lg2_inputs_;
uint32_t lg2_outputs_; uint32_t lg2_outputs_;
uint32_t addr_start_; uint32_t addr_start_;
uint64_t collisions_; uint64_t req_collisions_;
uint64_t rsp_collisions_;
}; };
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -978,7 +988,8 @@ private:
uint32_t delay_; uint32_t delay_;
}; };
using MemArbiter = TxArbiter<MemReq, MemRsp>; using LsuArbiter = TxArbiter<LsuReq, LsuRsp>;
using MemArbiter = TxArbiter<MemReq, MemRsp>;
using MemCrossBar = TxCrossBar<MemReq, MemRsp>; using MemCrossBar = TxCrossBar<MemReq, MemRsp>;
} }