profiling optimizations

This commit is contained in:
Blaise Tine 2023-12-05 05:12:13 -08:00
parent 65036e2d34
commit 4b73762aea
11 changed files with 165 additions and 182 deletions

View file

@ -692,9 +692,11 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.reset (reset),
.incr (cci_rd_req_fire),
.decr (cci_rdq_pop),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_reads_full),
.size (cci_pending_reads),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
.size (cci_pending_reads)
);
`UNUSED_VAR (cci_pending_reads)
@ -852,7 +854,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.incr (cci_mem_rd_rsp_fire),
.decr (cci_wr_rsp_fire),
.empty (cci_pending_writes_empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_writes_full),
`UNUSED_PIN (alm_full),
.size (cci_pending_writes)
);

View file

@ -364,9 +364,11 @@ module VX_cache_bank #(
.reset (reset),
.incr (core_req_fire),
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (mshr_alm_full),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
`RESET_RELAY (mshr_reset, reset);

View file

@ -251,7 +251,7 @@ module VX_cache_mshr #(
assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r;
assign lookup_matches = addr_matches & ~write_table;
assign lookup_matches = addr_matches;
`UNUSED_VAR (lookup_valid)

View file

@ -36,10 +36,10 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
VX_commit_if commit_arb_if[`ISSUE_WIDTH]();
wire [`ISSUE_WIDTH-1:0] commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
wire [`ISSUE_WIDTH-1:0] commit_eop;
wire [`ISSUE_WIDTH-1:0] per_issue_commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] per_issue_commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask;
wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
@ -72,10 +72,10 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
`UNUSED_PIN (sel_out)
);
assign commit_fire[i] = commit_arb_if[i].valid && commit_arb_if[i].ready;
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_arb_if[i].data.tmask;
assign commit_wid[i] = commit_arb_if[i].data.wid;
assign commit_eop[i] = commit_arb_if[i].data.eop;
assign per_issue_commit_fire[i] = commit_arb_if[i].valid && commit_arb_if[i].ready;
assign per_issue_commit_tmask[i]= {`NUM_THREADS{per_issue_commit_fire[i]}} & commit_arb_if[i].data.tmask;
assign per_issue_commit_wid[i] = commit_arb_if[i].data.wid;
assign per_issue_commit_eop[i] = commit_arb_if[i].data.eop;
end
// CSRs update
@ -84,11 +84,11 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire);
assign commit_fire_any = (| per_issue_commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [COMMIT_SIZEW-1:0] count;
`POP_COUNT(count, commit_tmask[i]);
`POP_COUNT(count, per_issue_commit_tmask[i]);
assign commit_size[i] = count;
end
@ -136,19 +136,28 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
end
assign commit_csr_if.instret = instret;
// Committed instructions
// Track committed instructions
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
reg [`NUM_WARPS-1:0] committed_warps;
always @(*) begin
committed_warps = 0;
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
if (per_issue_commit_fire[i] && per_issue_commit_eop[i]) begin
committed_warps[per_issue_commit_wid[i]] = 1;
end
end
end
VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
.RESETW (`ISSUE_WIDTH)
.DATAW (`NUM_WARPS),
.RESETW (`NUM_WARPS)
) committed_pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({committed, commit_wid}),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
.data_in (committed_warps),
.data_out ({commit_sched_if.committed_warps})
);
// Writeback

View file

@ -78,9 +78,11 @@ module VX_fetch import VX_gpu_pkg::*; #(
.reset (reset),
.incr (icache_req_fire && schedule_if.data.wid == i),
.decr (fetch_if.ibuf_pop[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (pending_ibuf_full[i]),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
wire ibuf_ready = ~pending_ibuf_full[schedule_if.data.wid];

View file

@ -1,79 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Per-warp pending-instruction tracker.
//
// One counter is kept for each of the `NUM_WARPS warps. A pulse on `incr`
// adds one to the counter selected by `incr_wid`; every set bit k of `decr`
// subtracts one from the counter selected by `decr_wid[k]`.
//
// The increment/decrement requests are decoded combinationally, registered,
// and only then applied to the counter, so a counter and its flags change on
// the second clock edge after the request was presented.
//
// Parameters:
//   CTR_WIDTH  - bit width of each per-warp counter.
//   ALM_EMPTY  - counter value at which a warp reports "almost empty".
//   DECR_COUNT - number of decrement ports.
//
// Outputs:
//   empty      - high when every warp's empty flag is set.
//   alm_empty  - almost-empty flag of the warp selected by `alm_empty_wid`.
module VX_pending_instr #(
    parameter CTR_WIDTH  = 12,
    parameter ALM_EMPTY  = 1,
    parameter DECR_COUNT = 1
) (
    input wire                                  clk,
    input wire                                  reset,
    input wire                                  incr,
    input wire [`NW_WIDTH-1:0]                  incr_wid,
    input wire [DECR_COUNT-1:0]                 decr,
    input wire [DECR_COUNT-1:0][`NW_WIDTH-1:0]  decr_wid,
    input wire [`NW_WIDTH-1:0]                  alm_empty_wid,
    output wire                                 empty,
    output wire                                 alm_empty
);
    // Bit width of a per-warp decrement amount (derived from DECR_COUNT+1).
    localparam COUNTW = `CLOG2(DECR_COUNT+1);

    // Registered state, one entry per warp.
    reg [`NUM_WARPS-1:0][CTR_WIDTH-1:0]  warp_count;     // outstanding count
    reg [`NUM_WARPS-1:0][COUNTW-1:0]     dec_amount_q;   // registered decrement amount
    reg [`NUM_WARPS-1:0]                 inc_q;          // registered increment request
    reg [`NUM_WARPS-1:0]                 near_empty_q;   // count == ALM_EMPTY
    reg [`NUM_WARPS-1:0]                 is_empty_q;     // count == 0

    // Combinational decode of the current-cycle requests.
    reg [`NUM_WARPS-1:0]                 inc_hit;        // one-hot by incr_wid
    reg [`NUM_WARPS-1:0][DECR_COUNT-1:0] dec_hits;       // [warp][decrement port]

    // Steer the single increment request to its warp.
    always @(*) begin
        inc_hit = '0;
        if (incr) begin
            inc_hit[incr_wid] = 1'b1;
        end
    end

    // Steer each active decrement port to the warp it names.
    always @(*) begin
        dec_hits = '0;
        for (integer port = 0; port < DECR_COUNT; ++port) begin
            if (decr[port]) begin
                dec_hits[decr_wid[port]][port] = 1'b1;
            end
        end
    end

    for (genvar w = 0; w < `NUM_WARPS; ++w) begin
        wire [COUNTW-1:0]    dec_amount;
        wire [CTR_WIDTH-1:0] count_next;

        // Number of decrement ports hitting this warp in the current cycle.
        `POP_COUNT(dec_amount, dec_hits[w]);

        // Next counter value uses the *registered* requests from the previous cycle.
        assign count_next = warp_count[w] + CTR_WIDTH'(inc_q[w]) - CTR_WIDTH'(dec_amount_q[w]);

        always @(posedge clk) begin
            if (reset) begin
                inc_q[w]        <= '0;
                dec_amount_q[w] <= '0;
                warp_count[w]   <= '0;
                near_empty_q[w] <= 1'b0;
                is_empty_q[w]   <= 1'b1;
            end else begin
                inc_q[w]        <= inc_hit[w];
                dec_amount_q[w] <= dec_amount;
                warp_count[w]   <= count_next;
                // Flags are derived from the value being written this edge.
                near_empty_q[w] <= (count_next == ALM_EMPTY);
                is_empty_q[w]   <= (count_next == 0);
            end
        end
    end

    assign alm_empty = near_empty_q[alm_empty_wid];
    assign empty     = (& is_empty_q);

endmodule

View file

@ -370,24 +370,42 @@ module VX_schedule import VX_gpu_pkg::*; #(
assign schedule_if.data.uuid = instr_uuid;
`RESET_RELAY (pending_instr_reset, reset);
// Track pending instructions per warp
wire no_pending_instr;
VX_pending_instr #(
.CTR_WIDTH (12),
.DECR_COUNT (`ISSUE_WIDTH),
.ALM_EMPTY (1)
) pending_instr(
.clk (clk),
.reset (pending_instr_reset),
.incr (schedule_if_fire),
.incr_wid (schedule_if.data.wid),
.decr (commit_sched_if.committed),
.decr_wid (commit_sched_if.committed_wid),
.alm_empty_wid (sched_csr_if.alm_empty_wid),
.alm_empty (sched_csr_if.alm_empty),
.empty (no_pending_instr)
);
reg [`NUM_WARPS-1:0] per_warp_incr;
always @(*) begin
per_warp_incr = 0;
if (schedule_if_fire) begin
per_warp_incr[schedule_if.data.wid] = 1;
end
end
wire [`NUM_WARPS-1:0] pending_warp_empty;
wire [`NUM_WARPS-1:0] pending_warp_alm_empty;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
`RESET_RELAY (pending_instr_reset, reset);
VX_pending_size #(
.SIZE (4096),
.ALM_EMPTY (1)
) counter (
.clk (clk),
.reset (pending_instr_reset),
.incr (per_warp_incr[i]),
.decr (commit_sched_if.committed_warps[i]),
.empty (pending_warp_empty[i]),
.alm_empty (pending_warp_alm_empty[i]),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
assign sched_csr_if.alm_empty = pending_warp_alm_empty[sched_csr_if.alm_empty_wid];
wire no_pending_instr = (& pending_warp_empty);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,17 +15,14 @@
// Signal bundle connecting the commit stage to the scheduler.
// NOTE(review): this listing carries both a per-issue-slot pair
// (committed / committed_wid) and a per-warp vector (committed_warps), and the
// modport lists have no separator between committed_wid and committed_warps.
// Confirm against the checked-in file which set of signals is intended.
interface VX_commit_sched_if ();
// One flag per issue slot.
wire [`ISSUE_WIDTH-1:0] committed;
// Warp id associated with each issue slot's flag.
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] committed_wid;
// One flag per warp.
wire [`NUM_WARPS-1:0] committed_warps;
// Driving side of the bundle (all signals are outputs).
modport master (
output committed,
output committed_wid
output committed_warps
);
// Receiving side of the bundle (all signals are inputs).
modport slave (
input committed,
input committed_wid
input committed_warps
);
endinterface

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,11 +14,11 @@
`include "VX_define.vh"
`TRACING_OFF
module VX_avs_adapter #(
parameter DATA_WIDTH = 1,
parameter ADDR_WIDTH = 1,
module VX_avs_adapter #(
parameter DATA_WIDTH = 1,
parameter ADDR_WIDTH = 1,
parameter BURST_WIDTH = 1,
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 1,
parameter TAG_WIDTH = 1,
parameter RD_QUEUE_SIZE = 1,
parameter REQ_OUT_BUF = 0,
@ -29,15 +29,15 @@ module VX_avs_adapter #(
// Memory request
input wire mem_req_valid,
input wire mem_req_rw,
input wire [DATA_WIDTH/8-1:0] mem_req_byteen,
input wire mem_req_rw,
input wire [DATA_WIDTH/8-1:0] mem_req_byteen,
input wire [ADDR_WIDTH-1:0] mem_req_addr,
input wire [DATA_WIDTH-1:0] mem_req_data,
input wire [TAG_WIDTH-1:0] mem_req_tag,
output wire mem_req_ready,
// Memory response
output wire mem_rsp_valid,
// Memory response
output wire mem_rsp_valid,
output wire [DATA_WIDTH-1:0] mem_rsp_data,
output wire [TAG_WIDTH-1:0] mem_rsp_tag,
input wire mem_rsp_ready,
@ -60,7 +60,7 @@ module VX_avs_adapter #(
localparam BANK_OFFSETW = ADDR_WIDTH - LOG2_NUM_BANKS;
// Requests handling //////////////////////////////////////////////////////
wire [NUM_BANKS-1:0] req_queue_push, req_queue_pop;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] req_queue_tag_out;
wire [NUM_BANKS-1:0] req_queue_going_full;
@ -70,38 +70,40 @@ module VX_avs_adapter #(
wire [NUM_BANKS-1:0] bank_req_ready;
if (NUM_BANKS > 1) begin
assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0];
assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0];
end else begin
assign req_bank_sel = '0;
end
assign req_bank_off = mem_req_addr[ADDR_WIDTH-1:LOG2_NUM_BANKS];
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign req_queue_push[i] = mem_req_valid && ~mem_req_rw && bank_req_ready[i] && (req_bank_sel == i);
end
for (genvar i = 0; i < NUM_BANKS; ++i) begin
VX_pending_size #(
VX_pending_size #(
.SIZE (RD_QUEUE_SIZE)
) pending_size (
.clk (clk),
.reset (reset),
.incr (req_queue_push[i]),
.decr (req_queue_pop[i]),
.decr (req_queue_pop[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (req_queue_going_full[i]),
.size (req_queue_size[i]),
`UNUSED_PIN (empty)
);
`UNUSED_PIN (alm_full),
.size (req_queue_size[i])
);
`UNUSED_VAR (req_queue_size)
VX_fifo_queue #(
.DATAW (TAG_WIDTH),
.DEPTH (RD_QUEUE_SIZE)
) rd_req_queue (
.clk (clk),
.reset (reset),
.push (req_queue_push[i]),
.push (req_queue_push[i]),
.pop (req_queue_pop[i]),
.data_in (mem_req_tag),
.data_out (req_queue_tag_out[i]),
@ -111,9 +113,9 @@ module VX_avs_adapter #(
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
end
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar i = 0; i < NUM_BANKS; ++i) begin
wire valid_out;
wire rw_out;
wire [DATA_SIZE-1:0] byteen_out;
@ -174,7 +176,7 @@ module VX_avs_adapter #(
.reset (reset),
.push (avs_readdatavalid[i]),
.pop (req_queue_pop[i]),
.data_in (avs_readdata[i]),
.data_in (avs_readdata[i]),
.data_out (rsp_queue_data_out[i]),
.empty (rsp_queue_empty[i]),
`UNUSED_PIN (full),
@ -183,7 +185,7 @@ module VX_avs_adapter #(
`UNUSED_PIN (size)
);
end
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign rsp_arb_valid_in[i] = !rsp_queue_empty[i];
assign rsp_arb_data_in[i] = {rsp_queue_data_out[i], req_queue_tag_out[i]};

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,42 +15,51 @@
`TRACING_OFF
module VX_pending_size #(
parameter SIZE = 1,
parameter INCRW = 1,
parameter DECRW = 1,
parameter SIZEW = `CLOG2(SIZE+1)
parameter SIZE = 1,
parameter INCRW = 1,
parameter DECRW = 1,
parameter ALM_FULL = (SIZE - 1),
parameter ALM_EMPTY = 1,
parameter SIZEW = `CLOG2(SIZE+1)
) (
input wire clk,
input wire reset,
input wire [INCRW-1:0] incr,
input wire [DECRW-1:0] decr,
output wire empty,
output wire alm_empty,
output wire full,
output wire alm_full,
output wire [SIZEW-1:0] size
);
`STATIC_ASSERT(INCRW <= SIZEW, ("invalid parameter"))
`STATIC_ASSERT(DECRW <= SIZEW, ("invalid parameter"))
localparam ADDRW = `LOG2UP(SIZE);
reg empty_r;
reg full_r;
reg empty_r, alm_empty_r;
reg full_r, alm_full_r;
if (INCRW != 1 || DECRW != 1) begin
reg [SIZEW-1:0] size_r;
wire [SIZEW-1:0] size_n;
assign size_n = size_r + SIZEW'(incr) - SIZEW'(decr);
wire [SIZEW-1:0] size_n = size_r + SIZEW'(incr) - SIZEW'(decr);
always @(posedge clk) begin
if (reset) begin
size_r <= '0;
empty_r <= 1;
full_r <= 0;
if (reset) begin
empty_r <= 1;
alm_empty_r <= 1;
alm_full_r <= 0;
full_r <= 0;
size_r <= '0;
end else begin
size_r <= size_n;
empty_r <= (size_n == SIZEW'(0));
full_r <= (size_n == SIZEW'(SIZE));
`ASSERT((incr >= decr) || (size_n >= size_r), ("runtime error: counter overflow"));
`ASSERT((incr <= decr) || (size_n <= size_r), ("runtime error: counter underflow"));
size_r <= size_n;
empty_r <= (size_n == SIZEW'(0));
alm_empty_r <= (size_n == SIZEW'(ALM_EMPTY));
full_r <= (size_n == SIZEW'(SIZE));
alm_full_r <= (size_n == SIZEW'(ALM_FULL));
end
end
@ -59,30 +68,47 @@ module VX_pending_size #(
end else begin
reg [ADDRW-1:0] used_r;
wire [ADDRW-1:0] used_n;
always @(posedge clk) begin
if (reset) begin
used_r <= '0;
empty_r <= 1;
full_r <= 0;
end else begin
`ASSERT(~(incr && ~decr) || ~full, ("runtime error: incrementing full counter"));
`ASSERT(~(decr && ~incr) || ~empty, ("runtime error: decrementing empty counter"));
if (reset) begin
empty_r <= 1;
alm_empty_r <= 1;
full_r <= 0;
alm_full_r <= 0;
used_r <= '0;
end else begin
`ASSERT(~(incr && ~decr) || ~full, ("runtime error: counter overflow"));
`ASSERT(~(decr && ~incr) || ~empty, ("runtime error: counter underflow"));
if (incr) begin
if (~decr) begin
empty_r <= 0;
if (used_r == ADDRW'(ALM_EMPTY))
alm_empty_r <= 0;
if (used_r == ADDRW'(SIZE-1))
full_r <= 1;
if (used_r == ADDRW'(ALM_FULL-1))
alm_full_r <= 1;
end
end else if (decr) begin
full_r <= 0;
if (used_r == ADDRW'(1))
empty_r <= 1;
empty_r <= 1;
if (used_r == ADDRW'(ALM_EMPTY+1))
alm_empty_r <= 1;
full_r <= 0;
if (used_r == ADDRW'(ALM_FULL))
alm_full_r <= 0;
end
used_r <= $signed(used_r) + ADDRW'($signed(2'(incr) - 2'(decr)));
used_r <= used_n;
end
end
if (SIZE == 2) begin
assign used_n = used_r ^ (incr ^ decr);
end else begin
assign used_n = $signed(used_r) + ADDRW'($signed(2'(incr) - 2'(decr)));
end
if (SIZE > 1) begin
if (SIZEW > ADDRW) begin
assign size = {full_r, used_r};
@ -95,8 +121,10 @@ module VX_pending_size #(
end
assign empty = empty_r;
assign full = full_r;
assign empty = empty_r;
assign alm_empty = alm_empty_r;
assign alm_full = alm_full_r;
assign full = full_r;
endmodule
`TRACING_ON

View file

@ -186,7 +186,7 @@ module VX_stream_arb #(
);
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign ready_in[i] = ready_in_r & arb_onehot[i];
assign ready_in[i] = ready_in_r && arb_onehot[i];
end
VX_elastic_buffer #(