issue stage partitioning into slices

This commit is contained in:
Blaise Tine 2024-06-28 08:22:18 -07:00
parent 56e9e19508
commit 58c3c63dae
12 changed files with 729 additions and 663 deletions

View file

@ -322,7 +322,7 @@
.DATAW ($bits(dst)), \
.RESETW ($bits(dst)), \
.DEPTH (latency) \
) __``dst ( \
) __``dst``__ ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
@ -336,13 +336,18 @@
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out ( \
) __``out``__ ( \
.data_in (in), \
.data_out (out) \
)
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
// Connects two interfaces that expose valid/data/ready members:
// valid and data are forwarded from src to dst, while ready flows back
// from dst to src. The expansion has no trailing ';' - the call site
// is expected to supply it.
`define ASSIGN_VX_IF(dst, src) \
assign dst.valid = src.valid; \
assign dst.data = src.data; \
assign src.ready = dst.ready
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
@ -377,42 +382,42 @@
assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
if (enable) begin \
reg [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __dst; \
always @(posedge clk) begin \
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
__dst <= {src.write_valid, src.write_addr, src.write_data}; \
end \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __dst; \
end else begin \
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
end \
VX_dcr_bus_if dst(); \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid, src.write_addr, src.write_data}; \
end
`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
for (genvar __d = 0; __d < dst_count; ++__d) begin \
localparam __count = ((src_count > dst_count) ? `CDIV(src_count, dst_count) : 1); \
wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
wire [width-1:0] __reduce_add_o_``dst``field; \
for (genvar __i = 0; __i < __count; ++__i) begin \
assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
if (count > 1) begin \
wire [count-1:0][width-1:0] __reduce_add_i_``field; \
wire [width-1:0] __reduce_add_o_``field; \
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_``field[__i] = src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
__reduce_add_i_``src``field, \
__reduce_add_o_``dst``field \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``field ( \
__reduce_add_i_``field, \
__reduce_add_o_``field \
); \
if (reg_enable) begin \
reg [width-1:0] __reduce_add_r_``dst``field; \
reg [width-1:0] __reduce_add_r_``field; \
always @(posedge clk) begin \
if (reset) begin \
__reduce_add_r_``dst``field <= '0; \
__reduce_add_r_``field <= '0; \
end else begin \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
__reduce_add_r_``field <= __reduce_add_o_``field; \
end \
end \
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
assign dst.``field = __reduce_add_r_``field; \
end else begin \
assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
assign dst.``field = __reduce_add_o_``field; \
end \
end else begin \
assign dst.``field = src[0].``field; \
end
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
@ -426,20 +431,4 @@
assign dst = src; \
end
// Builds a single concatenation from the fields of `data` (uuid, wis, tmask,
// PC, op_type, op_args, wb, rd, rs1/rs2/rs3 data) with the caller-supplied
// `tid` value inserted between rd and rs1_data. The field order here defines
// the bit layout of the resulting vector, so it must not be reordered
// independently of whatever unpacks it.
`define TO_DISPATCH_DATA(data, tid) { \
data.uuid, \
data.wis, \
data.tmask, \
data.PC, \
data.op_type, \
data.op_args, \
data.wb, \
data.rd, \
tid, \
data.rs1_data, \
data.rs2_data, \
data.rs3_data}
///////////////////////////////////////////////////////////////////////////////
`endif // VX_DEFINE_VH

View file

@ -60,6 +60,8 @@ package VX_gpu_pkg;
logic [7:0] mpm_class;
} base_dcrs_t;
//////////////////////////// Perf counter types ///////////////////////////
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
@ -77,48 +79,63 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] latency;
} mem_perf_t;
// Scheduler performance counters, each `PERF_CTR_BITS wide.
// NOTE(review): presumably these are the values read back through the
// VX_CSR_MPM_SCHED_ID / VX_CSR_MPM_SCHED_ST CSRs - confirm against VX_csr_data.
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] idles;   // idle count
logic [`PERF_CTR_BITS-1:0] stalls;  // stall count
} sched_perf_t;
// Issue-stage performance counters, each `PERF_CTR_BITS wide.
// One instance is produced per issue slice; VX_issue reduces them into a
// single record with PERF_COUNTER_ADD.
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] ibf_stalls;  // instruction-buffer stalls (driven by VX_ibuffer.perf_stalls)
logic [`PERF_CTR_BITS-1:0] scb_stalls;  // scoreboard stalls (driven by VX_scoreboard.perf_stalls)
logic [`PERF_CTR_BITS-1:0] opd_stalls;  // operand stalls - NOTE(review): producer not visible here, confirm
logic [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] units_uses;  // one counter per execution unit (indexed by `EX_ALU, `EX_LSU, ...)
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;   // one counter per SFU sub-unit (indexed by `SFU_CSRS, `SFU_WCTL, ...)
} issue_perf_t;
//////////////////////// instruction arguments ////////////////////////////
typedef struct packed {
logic use_PC;
logic use_imm;
logic is_w;
logic [`ALU_TYPE_BITS-1:0] xtype;
logic [`IMM_BITS-1:0] imm;
} alu_mod_t;
} alu_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-`INST_FRM_BITS-`INST_FMT_BITS)-1:0] __padding;
logic [($bits(alu_args_t)-`INST_FRM_BITS-`INST_FMT_BITS)-1:0] __padding;
logic [`INST_FRM_BITS-1:0] frm;
logic [`INST_FMT_BITS-1:0] fmt;
} fpu_mod_t;
} fpu_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-1-1-`OFFSET_BITS)-1:0] __padding;
logic [($bits(alu_args_t)-1-1-`OFFSET_BITS)-1:0] __padding;
logic is_store;
logic is_float;
logic [`OFFSET_BITS-1:0] offset;
} lsu_mod_t;
} lsu_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-1-`VX_CSR_ADDR_BITS-5)-1:0] __padding;
logic [($bits(alu_args_t)-1-`VX_CSR_ADDR_BITS-5)-1:0] __padding;
logic use_imm;
logic [`VX_CSR_ADDR_BITS-1:0] addr;
logic [4:0] imm;
} csr_mod_t;
} csr_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-1)-1:0] __padding;
logic [($bits(alu_args_t)-1)-1:0] __padding;
logic is_neg;
} wctl_mod_t;
} wctl_args_t;
typedef union packed {
alu_mod_t alu;
fpu_mod_t fpu;
lsu_mod_t lsu;
csr_mod_t csr;
wctl_mod_t wctl;
alu_args_t alu;
fpu_args_t fpu;
lsu_args_t lsu;
csr_args_t csr;
wctl_args_t wctl;
} op_args_t;
/* verilator lint_off UNUSED */
`IGNORE_UNUSED_BEGIN
///////////////////////// LSU memory Parameters ///////////////////////////
@ -154,11 +171,11 @@ package VX_gpu_pkg;
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef DCACHE_ENABLE
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`else
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
`endif
////////////////////////// Icache Parameters //////////////////////////////
@ -179,11 +196,11 @@ package VX_gpu_pkg;
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef ICACHE_ENABLE
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
@ -208,11 +225,11 @@ package VX_gpu_pkg;
localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8);
// Memory request tag bits
`ifdef L2_ENABLE
`ifdef L2_ENABLE
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`else
`else
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif
`endif
/////////////////////////////// L3 Parameters /////////////////////////////
@ -229,13 +246,11 @@ package VX_gpu_pkg;
localparam L3_MEM_DATA_WIDTH = (`L3_LINE_SIZE * 8);
// Memory request tag bits
`ifdef L3_ENABLE
`ifdef L3_ENABLE
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`else
`else
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif
/* verilator lint_on UNUSED */
`endif
/////////////////////////////// Issue parameters //////////////////////////
@ -245,7 +260,6 @@ package VX_gpu_pkg;
localparam ISSUE_WIS = `CLOG2(PER_ISSUE_WARPS);
localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
`IGNORE_UNUSED_BEGIN
function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_ISW_W-1:0] isw
@ -278,6 +292,7 @@ package VX_gpu_pkg;
wid_to_wis = 0;
end
endfunction
`IGNORE_UNUSED_END
endpackage

View file

@ -93,22 +93,23 @@
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
`define VX_CSR_MPM_SCRB_WCTL 12'hB0B
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8B
`define VX_CSR_MPM_SCRB_CSRS 12'hB0C
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
`define VX_CSR_MPM_OPDS_ST 12'hB0D
`define VX_CSR_MPM_OPDS_ST_H 12'hB8D
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0B
`define VX_CSR_MPM_IFETCHES_H 12'hB8B
`define VX_CSR_MPM_LOADS 12'hB0C
`define VX_CSR_MPM_LOADS_H 12'hB8C
`define VX_CSR_MPM_STORES 12'hB0D
`define VX_CSR_MPM_STORES_H 12'hB8D
`define VX_CSR_MPM_IFETCH_LT 12'hB0E
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
`define VX_CSR_MPM_LOAD_LT 12'hB0F
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
// SFU: scoreboard
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
`define VX_CSR_MPM_IFETCHES 12'hB0E
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
`define VX_CSR_MPM_LOADS 12'hB0F
`define VX_CSR_MPM_LOADS_H 12'hB8F
`define VX_CSR_MPM_STORES 12'hB10
`define VX_CSR_MPM_STORES_H 12'hB90
`define VX_CSR_MPM_IFETCH_LT 12'hB11
`define VX_CSR_MPM_IFETCH_LT_H 12'hB91
`define VX_CSR_MPM_LOAD_LT 12'hB12
`define VX_CSR_MPM_LOAD_LT_H 12'hB92
// Machine Performance-monitoring memory counters (class 2) ///////////////////

View file

@ -26,7 +26,6 @@
addr+12'h80 : dst = 32'(src[$bits(src)-1:32])
`endif
module VX_csr_data
import VX_gpu_pkg::*;
`ifdef EXT_F_ENABLE
@ -212,21 +211,21 @@ import VX_fpu_pkg::*;
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched_idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched_stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_ALU]);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_FPU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]);
`else
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, `PERF_CTR_BITS'(0));
`endif
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.sfu_uses[`SFU_WCTL]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);

View file

@ -13,8 +13,8 @@
`include "VX_define.vh"
module VX_dispatch import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter CORE_ID = 0
module VX_dispatch import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -23,12 +23,12 @@ module VX_dispatch import VX_gpu_pkg::*, VX_trace_pkg::*; #(
output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
`endif
// inputs
VX_operands_if.slave operands_if [`ISSUE_WIDTH],
VX_operands_if.slave operands_if,
// outputs
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
@ -37,104 +37,70 @@ module VX_dispatch import VX_gpu_pkg::*, VX_trace_pkg::*; #(
assign tids[i] = `NT_WIDTH'(i);
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [`NT_WIDTH-1:0] last_active_tid;
wire [`NT_WIDTH-1:0] last_active_tid;
VX_find_first #(
.N (`NUM_THREADS),
.DATAW (`NT_WIDTH),
.REVERSE (1)
) last_tid_select (
.valid_in (operands_if.data.tmask),
.data_in (tids),
.data_out (last_active_tid),
`UNUSED_PIN (valid_out)
);
VX_find_first #(
.N (`NUM_THREADS),
.DATAW (`NT_WIDTH),
.REVERSE (1)
) last_tid_select (
.valid_in (operands_if[i].data.tmask),
.data_in (tids),
.data_out (last_active_tid),
`UNUSED_PIN (valid_out)
wire [`NUM_EX_UNITS-1:0] operands_reset;
assign operands_if.ready = operands_reset[operands_if.data.ex_type];
`RESET_RELAY (buf_reset, reset);
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2), // using 2-cycle EB for area reduction
.LUTRAM (1)
) buffer (
.clk (clk),
.reset (buf_reset),
.valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))),
.ready_in (operands_reset[i]),
.data_in ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.wb,
operands_if.data.rd,
last_active_tid,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
}),
.data_out (dispatch_if[i].data),
.valid_out (dispatch_if[i].valid),
.ready_out (dispatch_if[i].ready)
);
wire [`NUM_EX_UNITS-1:0] operands_reset;
`RESET_RELAY (buf_reset, reset);
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) buffer (
.clk (clk),
.reset (buf_reset),
.valid_in (operands_if[i].valid && (operands_if[i].data.ex_type == j)),
.ready_in (operands_reset[j]),
.data_in (`TO_DISPATCH_DATA(operands_if[i].data, last_active_tid)),
.data_out (dispatch_if[j * `ISSUE_WIDTH + i].data),
.valid_out (dispatch_if[j * `ISSUE_WIDTH + i].valid),
.ready_out (dispatch_if[j * `ISSUE_WIDTH + i].ready)
);
end
assign operands_if[i].ready = operands_reset[operands_if[i].data.ex_type];
end
`ifdef PERF_ENABLE
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(*) begin
perf_issue_unit_stalls_per_cycle[i] = '0;
if (operands_if[i].valid && ~operands_if[i].ready) begin
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
end
end
end
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) reduce (
.data_in (perf_issue_unit_stalls_per_cycle),
.data_out (perf_unit_stalls_per_cycle)
);
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
wire operands_if_stall = operands_if.valid && ~operands_if.ready;
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_stalls_r[i] <= '0;
end else begin
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(operands_if_stall && operands_if.data.ex_type == `EX_BITS'(i));
end
end
end
for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin
assign perf_stalls[i] = perf_stalls_r[i];
end
`endif
`ifdef DBG_TRACE_PIPELINE
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (operands_if[i].valid && operands_if[i].ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), {operands_if[i].data.PC, 1'b0}));
trace_ex_type(1, operands_if[i].data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs3_data, `NUM_THREADS);
trace_op_args(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if[i].data.uuid));
end
end
end
`endif
endmodule

View file

@ -1,233 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Banked general-purpose register file slice.
//
// For the instruction presented on scoreboard_if, fetches up to three source
// operands (rs1, rs2, rs3) from NUM_BANKS dual-port RAMs and forwards the
// instruction together with the operand data on operands_if. Results arriving
// on writeback_if are written into the bank selected by the low bits of rd.
//
// Parameters:
//   CORE_ID   - unused in this module (kept for instantiation compatibility).
//   NUM_BANKS - number of register banks; the low CLOG2(NUM_BANKS) bits of a
//               register number select the bank.
//   OUT_REG   - 0: output stage is pass-through; non-zero: output is buffered
//               by the toggle buffer.
module VX_gpr_slice import VX_gpu_pkg::*; #(
    parameter CORE_ID   = 0,
    parameter NUM_BANKS = 4,
    parameter OUT_REG   = 0
) (
    input wire clk,
    input wire reset,

    VX_writeback_if.slave   writeback_if,
    VX_scoreboard_if.slave  scoreboard_if,
    VX_operands_if.master   operands_if
);
    `UNUSED_PARAM (CORE_ID)

    localparam NUM_SRC_REGS   = 3;
    localparam REQ_SEL_BITS   = `CLOG2(NUM_SRC_REGS);
    localparam REQ_SEL_WIDTH  = `UP(REQ_SEL_BITS);
    localparam BANK_SEL_BITS  = `CLOG2(NUM_BANKS);
    localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
    localparam PER_BANK_REGS  = `NUM_REGS / NUM_BANKS;
    localparam DATAW          = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + 3 * `NUM_THREADS * `XLEN;
    localparam RAM_ADDRW      = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
    localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
    localparam XLEN_SIZE      = `XLEN / 8;
    localparam BYTEENW        = `NUM_THREADS * XLEN_SIZE;

    `UNUSED_VAR (writeback_if.data.sop)

    // read requests, one per source register
    wire [NUM_SRC_REGS-1:0] req_valid_in;
    wire [NUM_SRC_REGS-1:0] req_ready_in;
    wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_data_in;
    wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;

    // per-bank read ports (after the crossbar)
    wire [NUM_BANKS-1:0] gpr_rd_valid;
    wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr;
    wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
    wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx;

    reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data, src_data_n;
    wire [NUM_SRC_REGS-1:0] src_valid;

    reg [NUM_SRC_REGS-1:0] data_fetched;
    reg data_ready;

    // A source needs fetching only if it is not register 0 and has not been
    // fetched yet; register 0 operands keep the zero held in src_data.
    assign src_valid[0] = (scoreboard_if.data.rs1 != 0) && ~data_fetched[0];
    assign src_valid[1] = (scoreboard_if.data.rs2 != 0) && ~data_fetched[1];
    assign src_valid[2] = (scoreboard_if.data.rs3 != 0) && ~data_fetched[2];

    assign req_valid_in[0] = scoreboard_if.valid && src_valid[0];
    assign req_valid_in[1] = scoreboard_if.valid && src_valid[1];
    assign req_valid_in[2] = scoreboard_if.valid && src_valid[2];

    // In-bank address: {warp-in-slice, upper register bits}
    if (ISSUE_WIS != 0) begin
        assign req_data_in[0] = {scoreboard_if.data.wis, scoreboard_if.data.rs1[`NR_BITS-1:BANK_SEL_BITS]};
        assign req_data_in[1] = {scoreboard_if.data.wis, scoreboard_if.data.rs2[`NR_BITS-1:BANK_SEL_BITS]};
        assign req_data_in[2] = {scoreboard_if.data.wis, scoreboard_if.data.rs3[`NR_BITS-1:BANK_SEL_BITS]};
    end else begin
        assign req_data_in[0] = {scoreboard_if.data.rs1[`NR_BITS-1:BANK_SEL_BITS]};
        assign req_data_in[1] = {scoreboard_if.data.rs2[`NR_BITS-1:BANK_SEL_BITS]};
        assign req_data_in[2] = {scoreboard_if.data.rs3[`NR_BITS-1:BANK_SEL_BITS]};
    end

    // Bank select: low bits of the register number
    if (NUM_BANKS > 1) begin
        assign req_bank_idx[0] = scoreboard_if.data.rs1[BANK_SEL_BITS-1:0];
        assign req_bank_idx[1] = scoreboard_if.data.rs2[BANK_SEL_BITS-1:0];
        assign req_bank_idx[2] = scoreboard_if.data.rs3[BANK_SEL_BITS-1:0];
    end else begin
        assign req_bank_idx = '0;
    end

    // Routes the source-register requests to their banks; requests that
    // target the same bank are serialized by the crossbar.
    VX_stream_xbar #(
        .NUM_INPUTS   (NUM_SRC_REGS),
        .NUM_OUTPUTS  (NUM_BANKS),
        .DATAW        (PER_BANK_ADDRW),
        .PERF_CTR_BITS(`PERF_CTR_BITS),
        .OUT_BUF      (1) // single-cycle EB since ready_out=1
    ) req_xbar (
        .clk       (clk),
        .reset     (reset),
        `UNUSED_PIN(collisions),
        .valid_in  (req_valid_in),
        .data_in   (req_data_in),
        .sel_in    (req_bank_idx),
        .ready_in  (req_ready_in),
        .valid_out (gpr_rd_valid),
        .data_out  (gpr_rd_addr),
        .sel_out   (gpr_rd_req_idx),
        .ready_out ({NUM_BANKS{1'b1}})
    );

    // Merge the data returned by the banks into the operand slot that
    // issued each request.
    always @(*) begin
        src_data_n = src_data;
        for (integer b = 0; b < NUM_BANKS; ++b) begin
            if (gpr_rd_valid[b]) begin
                src_data_n[gpr_rd_req_idx[b]] = gpr_rd_data[b];
            end
        end
    end

    // Fetch tracking: state is cleared on reset and whenever the current
    // instruction is handed off (scoreboard_if.ready); otherwise accepted
    // requests are accumulated until every needed source has been issued.
    always @(posedge clk) begin
        if (reset) begin
            data_fetched <= '0;
            src_data     <= '0;
            data_ready   <= '0;
        end else begin
            if (scoreboard_if.ready) begin
                data_fetched <= '0;
                src_data     <= '0;
                data_ready   <= '0;
            end else begin
                data_fetched <= data_fetched | req_ready_in;
                src_data     <= src_data_n;
                data_ready   <= scoreboard_if.valid
                             && (~src_valid[0] || req_ready_in[0])
                             && (~src_valid[1] || req_ready_in[1])
                             && (~src_valid[2] || req_ready_in[2]);
            end
        end
    end

    wire stg_valid_in, stg_ready_in;
    assign stg_valid_in = scoreboard_if.valid && data_ready;
    assign scoreboard_if.ready = stg_ready_in && data_ready;

    // We use a toggle buffer since the input signal also toggles
    VX_toggle_buffer #(
        .DATAW    (DATAW),
        // OUT_REG is an untyped (integer) parameter, so a bitwise '~' gives a
        // non-zero value for both 0 and 1. Use a logical test so PASSTHRU is
        // asserted only when no output register is requested.
        .PASSTHRU (OUT_REG == 0)
    ) rsp_buffer (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (stg_valid_in),
        .data_in   ({
            scoreboard_if.data.uuid,
            scoreboard_if.data.wis,
            scoreboard_if.data.tmask,
            scoreboard_if.data.PC,
            scoreboard_if.data.wb,
            scoreboard_if.data.ex_type,
            scoreboard_if.data.op_type,
            scoreboard_if.data.op_args,
            scoreboard_if.data.rd,
            src_data_n[0], // combinational value: includes data returned this cycle
            src_data_n[1],
            src_data_n[2]
        }),
        .ready_in  (stg_ready_in),
        .valid_out (operands_if.valid),
        .data_out  ({
            operands_if.data.uuid,
            operands_if.data.wis,
            operands_if.data.tmask,
            operands_if.data.PC,
            operands_if.data.wb,
            operands_if.data.ex_type,
            operands_if.data.op_type,
            operands_if.data.op_args,
            operands_if.data.rd,
            operands_if.data.rs1_data,
            operands_if.data.rs2_data,
            operands_if.data.rs3_data
        }),
        .ready_out (operands_if.ready)
    );

    // Write address uses the same {warp-in-slice, register} layout as reads
    wire [RAM_ADDRW-1:0] gpr_wr_addr;
    if (ISSUE_WIS != 0) begin
        assign gpr_wr_addr = {writeback_if.data.wis, writeback_if.data.rd};
    end else begin
        assign gpr_wr_addr = writeback_if.data.rd;
    end

`ifdef GPR_RESET
    // Writes stay disabled until a reset has been observed once
    reg wr_enabled = 0;
    always @(posedge clk) begin
        if (reset) begin
            wr_enabled <= 1;
        end
    end
`else
    wire wr_enabled = 1;
`endif

    for (genvar b = 0; b < NUM_BANKS; ++b) begin
        // A bank is written only when the low address bits select it
        wire gpr_wr_enabled;
        if (BANK_SEL_BITS != 0) begin
            assign gpr_wr_enabled = wr_enabled && writeback_if.valid
                                 && (gpr_wr_addr[BANK_SEL_BITS-1:0] == BANK_SEL_BITS'(b));
        end else begin
            assign gpr_wr_enabled = wr_enabled && writeback_if.valid;
        end

        // Expand the thread mask into per-byte write enables
        wire [BYTEENW-1:0] wren;
        for (genvar i = 0; i < `NUM_THREADS; ++i) begin
            assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
        end

        VX_dp_ram #(
            .DATAW (`XLEN * `NUM_THREADS),
            .SIZE  (PER_BANK_REGS * PER_ISSUE_WARPS),
            .WRENW (BYTEENW),
        `ifdef GPR_RESET
            .INIT_ENABLE (1),
            .INIT_VALUE  (0),
        `endif
            .NO_RWCHECK (1)
        ) gpr_ram (
            .clk   (clk),
            .read  (1'b1),
            .wren  (wren),
            .write (gpr_wr_enabled),
            .waddr (gpr_wr_addr[BANK_SEL_BITS +: PER_BANK_ADDRW]),
            .wdata (writeback_if.data.data),
            .raddr (gpr_rd_addr[b]),
            .rdata (gpr_rd_data[b])
        );
    end

endmodule

View file

@ -14,33 +14,36 @@
`include "VX_define.vh"
module VX_ibuffer import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
`endif
// inputs
VX_decode_if.slave decode_if,
// outputs
VX_ibuffer_if.master ibuffer_if [`NUM_WARPS]
VX_ibuffer_if.master ibuffer_if [PER_ISSUE_WARPS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4);
wire [`NUM_WARPS-1:0] ibuf_ready_in;
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUT_REG (2) // use a 2-cycle FIFO
.OUT_REG (2) // using 2-cycle EB for area reduction
) instr_buf (
.clk (clk),
.reset (reset),
.valid_in (decode_if.valid && decode_if.data.wid == i),
.valid_in (decode_if.valid && decode_if.data.wid == ISSUE_WIS_W'(i)),
.data_in ({
decode_if.data.uuid,
decode_if.data.tmask,
@ -52,7 +55,8 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
decode_if.data.rd,
decode_if.data.rs1,
decode_if.data.rs2,
decode_if.data.rs3}),
decode_if.data.rs3
}),
.ready_in (ibuf_ready_in[i]),
.valid_out(ibuffer_if[i].valid),
.data_out (ibuffer_if[i].data),
@ -63,4 +67,20 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
`endif
end
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_if_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_if_stall);
end
end
assign perf_stalls = perf_ibf_stalls;
`endif
endmodule

View file

@ -13,8 +13,8 @@
`include "VX_define.vh"
module VX_issue #(
parameter CORE_ID = 0
module VX_issue import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -22,137 +22,78 @@ module VX_issue #(
input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.issue perf_issue_if,
output issue_perf_t issue_perf,
`endif
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
);
VX_ibuffer_if ibuffer_if [`NUM_WARPS]();
VX_scoreboard_if scoreboard_if [`ISSUE_WIDTH]();
VX_operands_if operands_if [`ISSUE_WIDTH]();
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
VX_ibuffer #(
.CORE_ID (CORE_ID)
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
VX_scoreboard #(
.CORE_ID (CORE_ID)
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_units_uses(perf_issue_if.units_uses),
.perf_sfu_uses (perf_issue_if.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
VX_operands #(
.CORE_ID (CORE_ID)
) operands (
.clk (clk),
.reset (operands_reset),
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
VX_dispatch #(
.CORE_ID (CORE_ID)
) dispatch (
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.dispatch_if (dispatch_if)
);
`ifdef DBG_SCOPE_ISSUE
if (CORE_ID == 0) begin
`ifdef SCOPE
wire operands_if_fire = operands_if[0].valid && operands_if[0].ready;
wire operands_if_not_ready = ~operands_if[0].ready;
wire writeback_if_valid = writeback_if[0].valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes({
operands_if[0].data.uuid,
operands_if[0].data.tmask,
operands_if[0].data.ex_type,
operands_if[0].data.op_type,
operands_if[0].data.wb,
operands_if[0].data.rd,
operands_if[0].data.rs1_data,
operands_if[0].data.rs2_data,
operands_if[0].data.rs3_data,
writeback_if[0].data.uuid,
writeback_if[0].data.tmask,
writeback_if[0].data.rd,
writeback_if[0].data.data,
writeback_if[0].data.eop
}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
ila_issue ila_issue_inst (
.clk (clk),
.probe0 ({operands_if.uuid, ibuffer.rs3, ibuffer.rs2, ibuffer.rs1, operands_if.PC, operands_if.tmask, operands_if.wid, operands_if.ex_type, operands_if.op_type, operands_if.ready, operands_if.valid}),
.probe1 ({writeback_if.uuid, writeback_if.data[0], writeback_if.PC, writeback_if.tmask, writeback_if.wid, writeback_if.eop, writeback_if.valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
end
issue_perf_t per_issue_perf [`ISSUE_WIDTH];
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
`endif
wire [ISSUE_ISW_W-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
wire [`ISSUE_WIDTH-1:0] decode_ready_in;
assign decode_if.ready = decode_ready_in[decode_isw];
`SCOPE_IO_SWITCH (`ISSUE_WIDTH)
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin
VX_decode_if #(
.WID_WiDTH(ISSUE_WIS_W)
) per_issue_decode_if();
VX_dispatch_if per_issue_dispatch_if[`NUM_EX_UNITS]();
assign per_issue_decode_if.valid = decode_if.valid && (decode_isw == ISSUE_ISW_W'(issue_id));
assign per_issue_decode_if.data.uuid = decode_if.data.uuid;
assign per_issue_decode_if.data.wid = decode_wis;
assign per_issue_decode_if.data.tmask = decode_if.data.tmask;
assign per_issue_decode_if.data.PC = decode_if.data.PC;
assign per_issue_decode_if.data.ex_type = decode_if.data.ex_type;
assign per_issue_decode_if.data.op_type = decode_if.data.op_type;
assign per_issue_decode_if.data.op_args = decode_if.data.op_args;
assign per_issue_decode_if.data.wb = decode_if.data.wb;
assign per_issue_decode_if.data.rd = decode_if.data.rd;
assign per_issue_decode_if.data.rs1 = decode_if.data.rs1;
assign per_issue_decode_if.data.rs2 = decode_if.data.rs2;
assign per_issue_decode_if.data.rs3 = decode_if.data.rs3;
assign decode_ready_in[issue_id] = per_issue_decode_if.ready;
`RESET_RELAY (slice_reset, reset);
VX_issue_slice #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)),
.ISSUE_ID (issue_id)
) issue_slice (
`SCOPE_IO_BIND(issue_id)
.clk (clk),
.reset (slice_reset),
`ifdef PERF_ENABLE
.issue_perf (per_issue_perf[issue_id]),
`endif
.decode_if (per_issue_decode_if),
.writeback_if (writeback_if[issue_id]),
.dispatch_if (per_issue_dispatch_if)
);
// Assign transposed dispatch_if
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin
`ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]);
end
end
endmodule

View file

@ -0,0 +1,159 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// One slice of the issue stage. Chains four sub-modules through local
// interfaces:
//   decode_if -> ibuffer -> scoreboard -> operands -> dispatch -> dispatch_if[]
// writeback_if is fed to both the scoreboard and the operands stage.
module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
// Name prefix used to build the sub-module instance IDs and the trace output.
parameter `STRING INSTANCE_ID = "",
// Index of this slice; only read by the DBG_TRACE_PIPELINE block (wis_to_wid).
// No default value, so the instantiating module must supply it.
parameter ISSUE_ID
) (
// Debug-scope ports; presumably declares scope_reset / scope_bus_in /
// scope_bus_out, which are used by the scope tap below - confirm in the macro.
`SCOPE_IO_DECL
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
// Performance counters collected from the ibuffer, scoreboard and operands stages.
output issue_perf_t issue_perf,
`endif
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if,
// One dispatch port per execution unit.
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
);
// Marks ISSUE_ID as intentionally unused for builds where tracing is disabled.
`UNUSED_PARAM (ISSUE_ID)
// Inter-stage links: one ibuffer lane per warp of this slice, then a single
// stream from the scoreboard to operands and from operands to dispatch.
VX_ibuffer_if ibuffer_if [PER_ISSUE_WARPS]();
VX_scoreboard_if scoreboard_if();
VX_operands_if operands_if();
// One reset net per stage, each derived from the slice reset
// (NOTE(review): macro expansion is not visible here - confirm it only relays reset).
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
// Stage 1: instruction buffer, decode_if -> ibuffer_if[].
VX_ibuffer #(
.INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID))
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.ibf_stalls),
`endif
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
// Stage 2: scoreboard, ibuffer_if[] -> scoreboard_if; also observes writeback_if.
VX_scoreboard #(
.INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID))
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.scb_stalls),
.perf_units_uses(issue_perf.units_uses),
.perf_sfu_uses (issue_perf.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
// Stage 3: operand fetch, scoreboard_if -> operands_if; also receives writeback_if.
VX_operands #(
.INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID))
) operands (
.clk (clk),
.reset (operands_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.opd_stalls),
`endif
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
// Stage 4: dispatch, operands_if -> dispatch_if[]. Its perf_stalls output is
// deliberately left unconnected.
VX_dispatch #(
.INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID))
) dispatch (
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.dispatch_if (dispatch_if)
);
// Optional debug scope tap on the operands output and the writeback input.
`ifdef DBG_SCOPE_ISSUE
wire operands_if_fire = operands_if.valid && operands_if.ready;
wire operands_if_not_ready = ~operands_if.ready;
wire writeback_if_valid = writeback_if.valid;
// TRIGGERW matches the four trigger bits below; the PROBEW terms are listed in
// the same order as the fields of the probes concatenation.
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers ({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes ({
operands_if.data.uuid,
operands_if.data.tmask,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.wb,
operands_if.data.rd,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data,
writeback_if.data.uuid,
writeback_if.data.tmask,
writeback_if.data.rd,
writeback_if.data.data,
writeback_if.data.eop
}),
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
);
`else
`SCOPE_IO_UNUSED()
`endif
// Optional pipeline trace: prints one record each cycle an instruction is
// handed from the operands stage to dispatch (valid && ready).
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (operands_if.valid && operands_if.ready) begin
// The warp id is rebuilt from the slice-local index and this slice's ISSUE_ID.
`TRACE(1, ("%d: %s wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}));
trace_ex_type(1, operands_if.data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS);
trace_op_args(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid));
end
end
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,28 +14,227 @@
`include "VX_define.vh"
module VX_operands import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = "",
parameter NUM_BANKS = 4,
parameter OUT_REG = 0
) (
input wire clk,
input wire reset,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_scoreboard_if.slave scoreboard_if [`ISSUE_WIDTH],
VX_operands_if.master operands_if [`ISSUE_WIDTH]
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
`endif
VX_writeback_if.slave writeback_if,
VX_scoreboard_if.slave scoreboard_if,
VX_operands_if.master operands_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_REGS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS);
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + 3 * `NUM_THREADS * `XLEN;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
localparam BYTEENW = `NUM_THREADS * XLEN_SIZE;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
`RESET_RELAY (slice_reset, reset);
`UNUSED_VAR (writeback_if.data.sop)
VX_gpr_slice #(
.CORE_ID (CORE_ID)
) gpr_slice (
.clk (clk),
.reset (slice_reset),
.writeback_if (writeback_if[i]),
.scoreboard_if(scoreboard_if[i]),
.operands_if (operands_if[i])
wire [NUM_SRC_REGS-1:0] req_valid_in;
wire [NUM_SRC_REGS-1:0] req_ready_in;
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_data_in;
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data, src_data_n;
wire [NUM_SRC_REGS-1:0] src_valid;
reg [NUM_SRC_REGS-1:0] data_fetched;
reg data_ready;
assign src_valid[0] = (scoreboard_if.data.rs1 != 0) && ~data_fetched[0];
assign src_valid[1] = (scoreboard_if.data.rs2 != 0) && ~data_fetched[1];
assign src_valid[2] = (scoreboard_if.data.rs3 != 0) && ~data_fetched[2];
assign req_valid_in[0] = scoreboard_if.valid && src_valid[0];
assign req_valid_in[1] = scoreboard_if.valid && src_valid[1];
assign req_valid_in[2] = scoreboard_if.valid && src_valid[2];
if (ISSUE_WIS != 0) begin
assign req_data_in[0] = {scoreboard_if.data.wis, scoreboard_if.data.rs1[`NR_BITS-1:BANK_SEL_BITS]};
assign req_data_in[1] = {scoreboard_if.data.wis, scoreboard_if.data.rs2[`NR_BITS-1:BANK_SEL_BITS]};
assign req_data_in[2] = {scoreboard_if.data.wis, scoreboard_if.data.rs3[`NR_BITS-1:BANK_SEL_BITS]};
end else begin
assign req_data_in[0] = {scoreboard_if.data.rs1[`NR_BITS-1:BANK_SEL_BITS]};
assign req_data_in[1] = {scoreboard_if.data.rs2[`NR_BITS-1:BANK_SEL_BITS]};
assign req_data_in[2] = {scoreboard_if.data.rs3[`NR_BITS-1:BANK_SEL_BITS]};
end
if (NUM_BANKS > 1) begin
assign req_bank_idx[0] = scoreboard_if.data.rs1[BANK_SEL_BITS-1:0];
assign req_bank_idx[1] = scoreboard_if.data.rs2[BANK_SEL_BITS-1:0];
assign req_bank_idx[2] = scoreboard_if.data.rs3[BANK_SEL_BITS-1:0];
end else begin
assign req_bank_idx = '0;
end
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_REGS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (1) // single-cycle EB since ready_out=1
) req_xbar (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.collisions(perf_stalls),
`else
`UNUSED_PIN(collisions),
`endif
.valid_in (req_valid_in),
.data_in (req_data_in),
.sel_in (req_bank_idx),
.ready_in (req_ready_in),
.valid_out (gpr_rd_valid),
.data_out (gpr_rd_addr),
.sel_out (gpr_rd_req_idx),
.ready_out ({NUM_BANKS{1'b1}})
);
// Next-state operand data: start from the registered values and, for every
// bank whose crossbar output is valid, overwrite the source slot selected by
// that bank's request index with the bank's read data.
always @(*) begin
src_data_n = src_data;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid[b]) begin
src_data_n[gpr_rd_req_idx[b]] = gpr_rd_data[b];
end
end
end
// Operand-fetch state registers.
// All three are cleared on reset and again in the cycle the current request is
// accepted (scoreboard_if.ready), so the next request starts from scratch.
always @(posedge clk) begin
if (reset) begin
data_fetched <= '0;
src_data <= '0;
data_ready <= '0;
end else begin
if (scoreboard_if.ready) begin
data_fetched <= '0;
src_data <= '0;
data_ready <= '0;
end else begin
// Remember which source requests the crossbar has accepted so far.
data_fetched <= data_fetched | req_ready_in;
src_data <= src_data_n;
// Ready once a request is present and every source is either not
// pending (~src_valid: register index 0 or already fetched) or is
// being accepted by the crossbar in this cycle.
data_ready <= scoreboard_if.valid
&& (~src_valid[0] || req_ready_in[0])
&& (~src_valid[1] || req_ready_in[1])
&& (~src_valid[2] || req_ready_in[2]);
end
end
end
wire stg_valid_in, stg_ready_in;
assign stg_valid_in = scoreboard_if.valid && data_ready;
assign scoreboard_if.ready = stg_ready_in && data_ready;
// Output stage: forwards the scoreboard request fields together with the
// fetched operand data (src_data_n) to operands_if.
// We use a toggle buffer since the input signal also toggles
// PASSTHRU uses logical negation on purpose: OUT_REG is an untyped (integer)
// parameter, so bitwise ~OUT_REG is non-zero for both OUT_REG=0 and OUT_REG=1
// and OUT_REG could never turn pass-through off (assuming VX_toggle_buffer
// treats any non-zero PASSTHRU as enabled - confirm in that module).
VX_toggle_buffer #(
.DATAW (DATAW),
.PASSTHRU (!OUT_REG)
) rsp_buffer (
.clk (clk),
.reset (reset),
.valid_in (stg_valid_in),
.data_in ({
scoreboard_if.data.uuid,
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd,
src_data_n[0],
src_data_n[1],
src_data_n[2]
}),
.ready_in (stg_ready_in),
.valid_out (operands_if.valid),
// Field order must mirror data_in above.
.data_out ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.wb,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.rd,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
}),
.ready_out (operands_if.ready)
);
wire [RAM_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if.data.wis, writeback_if.data.rd};
end else begin
assign gpr_wr_addr = writeback_if.data.rd;
end
`ifdef GPR_RESET
reg wr_enabled = 0;
always @(posedge clk) begin
if (reset) begin
wr_enabled <= 1;
end
end
`else
wire wr_enabled = 1;
`endif
// Register-file storage: one dual-port RAM per bank.
for (genvar b = 0; b < NUM_BANKS; ++b) begin
// Write strobe for bank b: a valid writeback whose low BANK_SEL_BITS address
// bits select this bank (with a single bank every writeback targets it).
wire gpr_wr_enabled;
if (BANK_SEL_BITS != 0) begin
assign gpr_wr_enabled = wr_enabled && writeback_if.valid
&& (gpr_wr_addr[BANK_SEL_BITS-1:0] == BANK_SEL_BITS'(b));
end else begin
assign gpr_wr_enabled = wr_enabled && writeback_if.valid;
end
// Byte enables: each thread's tmask bit is replicated over the XLEN_SIZE
// bytes of that thread's word, so masked-off threads are not written.
wire [BYTEENW-1:0] wren;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
end
// Each entry holds one register for all threads. The read port is always
// enabled and addressed by the crossbar output for this bank; the write
// address is the part of gpr_wr_addr above the bank-select bits.
VX_dp_ram #(
.DATAW (`XLEN * `NUM_THREADS),
.SIZE (PER_BANK_REGS * PER_ISSUE_WARPS),
.WRENW (BYTEENW),
`ifdef GPR_RESET
.INIT_ENABLE (1),
.INIT_VALUE (0),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
.read (1'b1),
.wren (wren),
.write (gpr_wr_enabled),
.waddr (gpr_wr_addr[BANK_SEL_BITS +: PER_BANK_ADDRW]),
.wdata (writeback_if.data.data),
.raddr (gpr_rd_addr[b]),
.rdata (gpr_rd_data[b])
);
end

View file

@ -14,39 +14,39 @@
`include "VX_define.vh"
module VX_scoreboard import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_stalls,
output reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_uses,
output reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_uses,
`endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave ibuffer_if [`NUM_WARPS],
VX_scoreboard_if.master scoreboard_if [`ISSUE_WIDTH]
VX_writeback_if.slave writeback_if,
VX_ibuffer_if.slave ibuffer_if [PER_ISSUE_WARPS],
VX_scoreboard_if.master scoreboard_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
`ifdef PERF_ENABLE
reg [`NUM_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
reg [PER_ISSUE_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
reg [`NUM_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
wire [`NUM_WARPS-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`NUM_WARPS+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
wire [PER_ISSUE_WARPS-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(PER_ISSUE_WARPS+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`NUM_WARPS),
.N (PER_ISSUE_WARPS),
.OP ("|")
) perf_units_reduce (
.data_in (perf_inuse_units_per_cycle),
@ -55,7 +55,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (`NUM_WARPS),
.N (PER_ISSUE_WARPS),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_inuse_sfu_per_cycle),
@ -63,14 +63,14 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
);
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(`NUM_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(`NUM_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
perf_stalls <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
perf_stalls <= perf_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
end
end
@ -95,10 +95,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`endif
VX_ibuffer_if staging_if [`NUM_WARPS]();
wire [`NUM_WARPS-1:0][3:0] staging_opds_busy;
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
wire [PER_ISSUE_WARPS-1:0][3:0] staging_opds_busy;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (1)
@ -114,21 +114,18 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
);
end
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
reg [`NUM_REGS-1:0] inuse_regs;
reg [3:0] operands_busy_r, operands_busy_n;
localparam iw = i % `ISSUE_WIDTH;
localparam wis = i / `ISSUE_WIDTH;
reg [3:0] operands_busy, operands_busy_n;
wire ibuffer_fire = ibuffer_if[i].valid && ibuffer_if[i].ready;
wire staging_fire = staging_if[i].valid && staging_if[i].ready;
wire writeback_fire = writeback_if[iw].valid
&& (writeback_if[iw].data.wis == ISSUE_WIS_W'(wis))
&& writeback_if[iw].data.eop;
wire writeback_fire = writeback_if.valid
&& (writeback_if.data.wis == ISSUE_WIS_W'(i))
&& writeback_if.data.eop;
`ifdef PERF_ENABLE
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
@ -148,25 +145,25 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
perf_inuse_units_per_cycle[i] = '0;
perf_inuse_sfu_per_cycle[i] = '0;
if (staging_if[i].valid) begin
if (operands_busy_r[0]) begin
if (operands_busy[0]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rd]] = 1;
if (inuse_units[staging_if[i].data.rd] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rd]] = 1;
end
end
if (operands_busy_r[1]) begin
if (operands_busy[1]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs1]] = 1;
if (inuse_units[staging_if[i].data.rs1] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs1]] = 1;
end
end
if (operands_busy_r[2]) begin
if (operands_busy[2]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs2]] = 1;
if (inuse_units[staging_if[i].data.rs2] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs2]] = 1;
end
end
if (operands_busy_r[3]) begin
if (operands_busy[3]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs3]] = 1;
if (inuse_units[staging_if[i].data.rs3] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs3]] = 1;
@ -178,7 +175,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`endif
always @(*) begin
operands_busy_n = operands_busy_r;
operands_busy_n = operands_busy;
if (ibuffer_fire) begin
operands_busy_n = {
inuse_regs[ibuffer_if[i].data.rs3],
@ -189,29 +186,29 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
if (writeback_fire) begin
if (ibuffer_fire) begin
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rd) begin
if (writeback_if.data.rd == ibuffer_if[i].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rs1) begin
if (writeback_if.data.rd == ibuffer_if[i].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rs2) begin
if (writeback_if.data.rd == ibuffer_if[i].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rs3) begin
if (writeback_if.data.rd == ibuffer_if[i].data.rs3) begin
operands_busy_n[3] = 0;
end
end else begin
if (writeback_if[iw].data.rd == staging_if[i].data.rd) begin
if (writeback_if.data.rd == staging_if[i].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if[iw].data.rd == staging_if[i].data.rs1) begin
if (writeback_if.data.rd == staging_if[i].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if[iw].data.rd == staging_if[i].data.rs2) begin
if (writeback_if.data.rd == staging_if[i].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if[iw].data.rd == staging_if[i].data.rs3) begin
if (writeback_if.data.rd == staging_if[i].data.rs3) begin
operands_busy_n[3] = 0;
end
end
@ -237,13 +234,13 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inuse_regs <= '0;
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if[iw].data.rd] <= 0;
inuse_regs[writeback_if.data.rd] <= 0;
end
if (staging_fire && staging_if[i].data.wb) begin
inuse_regs[staging_if[i].data.rd] <= 1;
end
end
operands_busy_r <= operands_busy_n;
operands_busy <= operands_busy_n;
`ifdef PERF_ENABLE
if (staging_fire && staging_if[i].data.wb) begin
inuse_units[staging_if[i].data.rd] <= staging_if[i].data.ex_type;
@ -254,7 +251,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`endif
end
assign staging_opds_busy[i] = operands_busy_r;
assign staging_opds_busy[i] = operands_busy;
`ifdef SIMULATION
reg [31:0] timeout_ctr;
@ -265,9 +262,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end else begin
if (staging_if[i].valid && ~staging_if[i].ready) begin
`ifdef DBG_TRACE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy_r, staging_if[i].data.uuid));
`TRACE(3, ("%d: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, INSTANCE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy, staging_if[i].data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (ibuffer_fire) begin
@ -277,59 +274,57 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy_r, staging_if[i].data.uuid));
("%t: *** %s-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, INSTANCE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy, staging_if[i].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[iw].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, i, {writeback_if[iw].data.PC, 1'b0}, writeback_if[iw].data.tmask, writeback_if[iw].data.rd, writeback_if[iw].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0,
("%t: *** %s: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, INSTANCE_ID, i, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
`endif
end
`RESET_RELAY (arb_reset, reset);
wire [PER_ISSUE_WARPS-1:0] arb_valid_in;
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in;
wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [PER_ISSUE_WARPS-1:0] valid_in;
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] data_in;
wire [PER_ISSUE_WARPS-1:0] ready_in;
for (genvar j = 0; j < PER_ISSUE_WARPS; ++j) begin
wire operands_ready = ~(| staging_opds_busy[j * `ISSUE_WIDTH + i]);
assign valid_in[j] = staging_if[j * `ISSUE_WIDTH + i].valid && operands_ready;
assign data_in[j] = staging_if[j * `ISSUE_WIDTH + i].data;
assign staging_if[j * `ISSUE_WIDTH + i].ready = ready_in[j] && operands_ready;
end
VX_stream_arb #(
.NUM_INPUTS (PER_ISSUE_WARPS),
.DATAW (DATAW),
.ARBITER ("R"),
.OUT_BUF (2)
) out_arb (
.clk (clk),
.reset (arb_reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
.data_out ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_args,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.rd,
scoreboard_if[i].data.rs1,
scoreboard_if[i].data.rs2,
scoreboard_if[i].data.rs3
}),
.valid_out (scoreboard_if[i].valid),
.ready_out (scoreboard_if[i].ready),
.sel_out (scoreboard_if[i].data.wis)
);
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
wire operands_ready = ~(| staging_opds_busy[i]);
assign arb_valid_in[i] = staging_if[i].valid && operands_ready;
assign arb_data_in[i] = staging_if[i].data;
assign staging_if[i].ready = arb_ready_in[i] && operands_ready;
end
`RESET_RELAY (arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (PER_ISSUE_WARPS),
.DATAW (DATAW),
.ARBITER ("R"),
.OUT_BUF (2)
) out_arb (
.clk (clk),
.reset (arb_reset),
.valid_in (arb_valid_in),
.ready_in (arb_ready_in),
.data_in (arb_data_in),
.data_out ({
scoreboard_if.data.uuid,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.wb,
scoreboard_if.data.rd,
scoreboard_if.data.rs1,
scoreboard_if.data.rs2,
scoreboard_if.data.rs3
}),
.valid_out (scoreboard_if.valid),
.ready_out (scoreboard_if.ready),
.sel_out (scoreboard_if.data.wis)
);
endmodule

View file

@ -182,6 +182,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t sched_stalls = 0;
uint64_t ibuffer_stalls = 0;
uint64_t scrb_stalls = 0;
uint64_t opds_stalls = 0;
uint64_t scrb_alu = 0;
uint64_t scrb_fpu = 0;
uint64_t scrb_lsu = 0;
@ -268,7 +269,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
sched_stalls += sched_stalls_per_core;
}
// ibuffer_stalls
// ibuffer stalls
{
uint64_t ibuffer_stalls_per_core;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_IBUF_ST, core_id, &ibuffer_stalls_per_core), {
@ -280,7 +281,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
ibuffer_stalls += ibuffer_stalls_per_core;
}
// issue_stalls
// issue stalls
{
uint64_t scrb_stalls_per_core;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
@ -316,7 +317,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
scrb_stalls += scrb_stalls_per_core;
}
// sfu_stalls
// sfu stalls
{
uint64_t scrb_sfu_per_core;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
@ -342,6 +343,18 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
scrb_wctl += scrb_wctl_per_core;
scrb_csrs += scrb_csrs_per_core;
}
// operands stalls
{
uint64_t opds_stalls_per_core;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_OPDS_ST, core_id, &opds_stalls_per_core), {
return err;
});
if (num_cores > 1) {
int opds_percent_per_core = calcAvgPercent(opds_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: operands stalls=%ld (%d%%)\n", core_id, opds_stalls_per_core, opds_percent_per_core);
}
opds_stalls += opds_stalls_per_core;
}
// PERF: memory
// ifetches
{
@ -554,6 +567,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
int sched_stalls_percent = calcAvgPercent(sched_stalls, total_cycles);
int ibuffer_percent = calcAvgPercent(ibuffer_stalls, total_cycles);
int opds_percent = calcAvgPercent(opds_stalls, total_cycles);
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
int load_avg_lat = (int)(double(load_lat) / double(loads));
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
@ -571,6 +585,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
, calcAvgPercent(scrb_csrs, sfu_total)
, calcAvgPercent(scrb_wctl, sfu_total)
);
fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
fprintf(stream, "PERF: loads=%ld\n", loads);
fprintf(stream, "PERF: stores=%ld\n", stores);