cumulative fixes

This commit is contained in:
Blaise Tine 2024-07-15 10:13:57 -07:00
parent 0dbcddcb54
commit 578c3d33d2
26 changed files with 267 additions and 150 deletions

View file

@ -32,4 +32,6 @@ RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party

2
configure vendored
View file

@ -141,7 +141,7 @@ usage() {
echo "Usage: $0 [--xlen=<value>] [--tooldir=<path>] [--osversion=<version>]"
echo " --xlen=<value> Set the XLEN value (default: 32)"
echo " --tooldir=<path> Set the TOOLDIR path (default: /opt)"
echo " --osversion=<version> Set the OS Version (default: $(detect_os))"
echo " --osversion=<version> Set the OS Version (default: $(detect_osversion))"
echo " --prefix=<path> Set installation directory"
exit 1
}

View file

@ -16,7 +16,7 @@ VX_types.h: $(RTL_DIR)/VX_types.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_types.vh -o VX_types.h
clean:
$(MAKE) -C unitest clean
$(MAKE) -C unittest clean
rm -f VX_config.h VX_types.h
.PHONY: VX_config.h VX_types.h

View file

@ -408,22 +408,27 @@
`define LATENCY_FCVT 5
`endif
// FMA Bandwidth ratio
`ifndef FMA_PE_RATIO
`define FMA_PE_RATIO 1
`endif
// FDIV Bandwidth ratio
`ifndef FDIV_PE_RATIO
`define FDIV_PE_RATIO 8
`endif
// FSQRT Bandwidth ratio
`ifndef FSQRT_PE_RATIO
`define FSQRT_PE_RATIO 8
`endif
// FCVT Bandwidth ratio
`ifndef FCVT_PE_RATIO
`define FCVT_PE_RATIO 8
`endif
// FNCP Bandwidth ratio
`ifndef FNCP_PE_RATIO
`define FNCP_PE_RATIO 2
`endif
@ -530,7 +535,7 @@
`define DCACHE_NUM_WAYS 1
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
// LMEM Configurable Knobs ////////////////////////////////////////////////////
`ifndef LMEM_DISABLE
`define LMEM_ENABLE

View file

@ -89,6 +89,7 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] scb_stalls;
logic [`PERF_CTR_BITS-1:0] opd_stalls;
logic [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] units_uses;
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t;
//////////////////////// instruction arguments ////////////////////////////
@ -145,6 +146,31 @@ package VX_gpu_pkg;
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
////////////////////////// Icache Parameters //////////////////////////////
// Word size in bytes
localparam ICACHE_WORD_SIZE = 4;
localparam ICACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));
// Block size in bytes
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
// Core request tag Id bits
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
// Core request tag bits
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
// Memory request data bits
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
// Word size in bytes
@ -176,31 +202,6 @@ package VX_gpu_pkg;
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
////////////////////////// Icache Parameters //////////////////////////////
// Word size in bytes
localparam ICACHE_WORD_SIZE = 4;
localparam ICACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));
// Block size in bytes
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
// Core request tag Id bits
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
// Core request tag bits
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
// Memory request data bits
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
@ -292,6 +293,19 @@ package VX_gpu_pkg;
end
endfunction
///////////////////////// Miscellaneous functions ////////////////////////
// Classifies an instruction op_type into an SFU type code (SFU_WIDTH bits).
//   op_type : instruction opcode field (INST_OP_BITS wide)
//   returns : SFU_CSRS for the three CSR opcodes (CSRRW / CSRRS / CSRRC),
//             SFU_WCTL for every other op_type value.
// NOTE(review): callers presumably invoke this only for instructions already
// known to target the SFU, since non-CSR opcodes all map to SFU_WCTL - confirm.
function logic [`SFU_WIDTH-1:0] op_to_sfu_type(
input logic [`INST_OP_BITS-1:0] op_type
);
case (op_type)
// CSR read/write, read/set and read/clear share one SFU type
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: op_to_sfu_type = `SFU_CSRS;
// anything that is not a CSR opcode is attributed to SFU_WCTL
default: op_to_sfu_type = `SFU_WCTL;
endcase
endfunction
`IGNORE_UNUSED_END
endpackage

View file

@ -85,27 +85,31 @@
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
`define VX_CSR_MPM_SCRB_ST 12'hB06
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
`define VX_CSR_MPM_SCRB_ALU 12'hB07
`define VX_CSR_MPM_SCRB_ALU_H 12'hB87
`define VX_CSR_MPM_SCRB_FPU 12'hB08
`define VX_CSR_MPM_SCRB_FPU_H 12'hB88
`define VX_CSR_MPM_SCRB_LSU 12'hB09
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
`define VX_CSR_MPM_OPDS_ST 12'hB0B
`define VX_CSR_MPM_OPDS_ST_H 12'hB8B
`define VX_CSR_MPM_OPDS_ST 12'hB07
`define VX_CSR_MPM_OPDS_ST_H 12'hB87
`define VX_CSR_MPM_SCRB_ALU 12'hB08
`define VX_CSR_MPM_SCRB_ALU_H 12'hB88
`define VX_CSR_MPM_SCRB_FPU 12'hB09
`define VX_CSR_MPM_SCRB_FPU_H 12'hB89
`define VX_CSR_MPM_SCRB_LSU 12'hB0A
`define VX_CSR_MPM_SCRB_LSU_H 12'hB8A
`define VX_CSR_MPM_SCRB_SFU 12'hB0B
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8B
`define VX_CSR_MPM_SCRB_CSRS 12'hB0C
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
`define VX_CSR_MPM_SCRB_WCTL 12'hB0D
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0C
`define VX_CSR_MPM_IFETCHES_H 12'hB8C
`define VX_CSR_MPM_LOADS 12'hB0D
`define VX_CSR_MPM_LOADS_H 12'hB8D
`define VX_CSR_MPM_STORES 12'hB0E
`define VX_CSR_MPM_STORES_H 12'hB8E
`define VX_CSR_MPM_IFETCH_LT 12'hB1F
`define VX_CSR_MPM_IFETCH_LT_H 12'hB9F
`define VX_CSR_MPM_LOAD_LT 12'hB10
`define VX_CSR_MPM_LOAD_LT_H 12'hB90
`define VX_CSR_MPM_IFETCHES 12'hB0E
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
`define VX_CSR_MPM_LOADS 12'hB0F
`define VX_CSR_MPM_LOADS_H 12'hB8F
`define VX_CSR_MPM_STORES 12'hB10
`define VX_CSR_MPM_STORES_H 12'hB90
`define VX_CSR_MPM_IFETCH_LT 12'hB11
`define VX_CSR_MPM_IFETCH_LT_H 12'hB91
`define VX_CSR_MPM_LOAD_LT 12'hB12
`define VX_CSR_MPM_LOAD_LT_H 12'hB92
// Machine Performance-monitoring memory counters (class 2) ///////////////////

View file

@ -216,6 +216,7 @@ import VX_fpu_pkg::*;
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]);
@ -224,7 +225,8 @@ import VX_fpu_pkg::*;
`endif
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);

View file

@ -35,7 +35,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
@ -43,7 +43,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
) instr_buf (
.clk (clk),
.reset (reset),
.valid_in (decode_if.valid && decode_if.data.wid == ISSUE_WIS_W'(i)),
.valid_in (decode_if.valid && decode_if.data.wid == ISSUE_WIS_W'(w)),
.data_in ({
decode_if.data.uuid,
decode_if.data.tmask,
@ -57,13 +57,13 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
decode_if.data.rs2,
decode_if.data.rs3
}),
.ready_in (ibuf_ready_in[i]),
.valid_out(ibuffer_if[i].valid),
.data_out (ibuffer_if[i].data),
.ready_out(ibuffer_if[i].ready)
.ready_in (ibuf_ready_in[w]),
.valid_out(ibuffer_if[w].valid),
.data_out (ibuffer_if[w].data),
.ready_out(ibuffer_if[w].ready)
);
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
assign decode_if.ibuf_pop[w] = ibuffer_if[w].valid && ibuffer_if[w].ready;
`endif
end

View file

@ -38,6 +38,9 @@ module VX_issue import VX_gpu_pkg::*; #(
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
`endif
wire [ISSUE_ISW_W-1:0] decode_isw = wid_to_isw(decode_if.data.wid);

View file

@ -61,6 +61,7 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.scb_stalls),
.perf_units_uses(issue_perf.units_uses),
.perf_sfu_uses (issue_perf.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),

View file

@ -22,6 +22,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_stalls,
output reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_uses,
output reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_uses,
`endif
VX_writeback_if.slave writeback_if,
@ -38,6 +39,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
reg [PER_ISSUE_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (PER_ISSUE_WARPS),
@ -47,11 +51,21 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
.data_out (perf_units_per_cycle)
);
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (PER_ISSUE_WARPS),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_inuse_sfu_per_cycle),
.data_out (perf_sfu_per_cycle)
);
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
wire [PER_ISSUE_WARPS-1:0] stg_valid_in;
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
assign stg_valid_in[i] = staging_if[i].valid;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
assign stg_valid_in[w] = staging_if[w].valid;
end
wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready));
@ -73,54 +87,78 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_sfu_uses[i] <= '0;
end else begin
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
end
end
end
`endif
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (1)
) stanging_buf (
.clk (clk),
.reset (reset),
.valid_in (ibuffer_if[i].valid),
.data_in (ibuffer_if[i].data),
.ready_in (ibuffer_if[i].ready),
.valid_out(staging_if[i].valid),
.data_out (staging_if[i].data),
.ready_out(staging_if[i].ready)
.valid_in (ibuffer_if[w].valid),
.data_in (ibuffer_if[w].data),
.ready_in (ibuffer_if[w].ready),
.valid_out(staging_if[w].valid),
.data_out (staging_if[w].data),
.ready_out(staging_if[w].ready)
);
end
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
reg [`NUM_REGS-1:0] inuse_regs;
reg [3:0] operands_busy, operands_busy_n;
wire ibuffer_fire = ibuffer_if[i].valid && ibuffer_if[i].ready;
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
wire staging_fire = staging_if[i].valid && staging_if[i].ready;
wire staging_fire = staging_if[w].valid && staging_if[w].ready;
wire writeback_fire = writeback_if.valid
&& (writeback_if.data.wis == ISSUE_WIS_W'(i))
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
&& writeback_if.data.eop;
`ifdef PERF_ENABLE
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
always @(*) begin
perf_inuse_units_per_cycle[i] = '0;
if (staging_if[i].valid) begin
perf_inuse_units_per_cycle[w] = '0;
perf_inuse_sfu_per_cycle[w] = '0;
if (staging_if[w].valid) begin
if (operands_busy[0]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rd]] = 1;
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1;
if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1;
end
end
if (operands_busy[1]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs1]] = 1;
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1;
if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1;
end
end
if (operands_busy[2]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs2]] = 1;
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1;
if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1;
end
end
if (operands_busy[3]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs3]] = 1;
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1;
if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1;
end
end
end
end
@ -130,52 +168,52 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
operands_busy_n = operands_busy;
if (ibuffer_fire) begin
operands_busy_n = {
inuse_regs[ibuffer_if[i].data.rs3],
inuse_regs[ibuffer_if[i].data.rs2],
inuse_regs[ibuffer_if[i].data.rs1],
inuse_regs[ibuffer_if[i].data.rd]
inuse_regs[ibuffer_if[w].data.rs3],
inuse_regs[ibuffer_if[w].data.rs2],
inuse_regs[ibuffer_if[w].data.rs1],
inuse_regs[ibuffer_if[w].data.rd]
};
end
if (writeback_fire) begin
if (ibuffer_fire) begin
if (writeback_if.data.rd == ibuffer_if[i].data.rd) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == ibuffer_if[i].data.rs1) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == ibuffer_if[i].data.rs2) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == ibuffer_if[i].data.rs3) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end else begin
if (writeback_if.data.rd == staging_if[i].data.rd) begin
if (writeback_if.data.rd == staging_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == staging_if[i].data.rs1) begin
if (writeback_if.data.rd == staging_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == staging_if[i].data.rs2) begin
if (writeback_if.data.rd == staging_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == staging_if[i].data.rs3) begin
if (writeback_if.data.rd == staging_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end
end
if (staging_fire && staging_if[i].data.wb) begin
if (staging_if[i].data.rd == ibuffer_if[i].data.rd) begin
if (staging_fire && staging_if[w].data.wb) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 1;
end
if (staging_if[i].data.rd == ibuffer_if[i].data.rs1) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 1;
end
if (staging_if[i].data.rd == ibuffer_if[i].data.rs2) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 1;
end
if (staging_if[i].data.rd == ibuffer_if[i].data.rs3) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 1;
end
end
@ -188,15 +226,18 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
if (writeback_fire) begin
inuse_regs[writeback_if.data.rd] <= 0;
end
if (staging_fire && staging_if[i].data.wb) begin
inuse_regs[staging_if[i].data.rd] <= 1;
if (staging_fire && staging_if[w].data.wb) begin
inuse_regs[staging_if[w].data.rd] <= 1;
end
end
operands_busy <= operands_busy_n;
operands_ready[i] <= ~(| operands_busy_n);
operands_ready[w] <= ~(| operands_busy_n);
`ifdef PERF_ENABLE
if (staging_fire && staging_if[i].data.wb) begin
inuse_units[staging_if[i].data.rd] <= staging_if[i].data.ex_type;
if (staging_fire && staging_if[w].data.wb) begin
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
if (staging_if[w].data.ex_type == `EX_SFU) begin
inuse_sfu[staging_if[w].data.rd] <= op_to_sfu_type(staging_if[w].data.op_type);
end
end
`endif
end
@ -208,11 +249,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
if (reset) begin
timeout_ctr <= '0;
end else begin
if (staging_if[i].valid && ~staging_if[i].ready) begin
if (staging_if[w].valid && ~staging_if[w].ready) begin
`ifdef DBG_TRACE_PIPELINE
`TRACE(3, ("%d: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, INSTANCE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy, staging_if[i].data.uuid));
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (ibuffer_fire) begin
@ -223,12 +264,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, INSTANCE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy, staging_if[i].data.uuid));
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0,
("%t: *** %s invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, INSTANCE_ID, i, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
`endif
end
@ -237,10 +278,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in;
wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
assign arb_valid_in[i] = staging_if[i].valid && operands_ready[i];
assign arb_data_in[i] = staging_if[i].data;
assign staging_if[i].ready = arb_ready_in[i] && operands_ready[i];
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
assign arb_data_in[w] = staging_if[w].data;
assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
end
`RESET_RELAY (arb_reset, reset);

View file

@ -45,7 +45,7 @@ module VX_generic_arbiter #(
end else if (TYPE == "R") begin
VX_rr_arbiter #(
.NUM_REQS (NUM_REQS),
.NUM_REQS (NUM_REQS)
) rr_arbiter (
.clk (clk),
.reset (reset),
@ -59,7 +59,7 @@ module VX_generic_arbiter #(
end else if (TYPE == "F") begin
VX_fair_arbiter #(
.NUM_REQS (NUM_REQS),
.NUM_REQS (NUM_REQS)
) fair_arbiter (
.clk (clk),
.reset (reset),
@ -73,7 +73,7 @@ module VX_generic_arbiter #(
end else if (TYPE == "M") begin
VX_matrix_arbiter #(
.NUM_REQS (NUM_REQS),
.NUM_REQS (NUM_REQS)
) matrix_arbiter (
.clk (clk),
.reset (reset),
@ -87,7 +87,7 @@ module VX_generic_arbiter #(
end else if (TYPE == "C") begin
VX_cyclic_arbiter #(
.NUM_REQS (NUM_REQS),
.NUM_REQS (NUM_REQS)
) cyclic_arbiter (
.clk (clk),
.reset (reset),

View file

@ -105,21 +105,21 @@ module VX_pe_serializer #(
reg [TAG_WIDTH-1:0] tag_out_r;
wire valid_out_b = valid_out_s && batch_out_done;
wire enable_r = ready_out || ~valid_out;
wire ready_out_b = ready_out || ~valid_out;
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 1'b0;
end else if (enable_r) begin
end else if (ready_out_b) begin
valid_out_r <= valid_out_b;
end
if (enable_r) begin
if (ready_out_b) begin
data_out_r[batch_out_idx] <= pe_data_out;
tag_out_r <= tag_out_s;
end
end
assign enable = enable_r || ~valid_out_b;
assign enable = ready_out_b || ~valid_out_b;
assign ready_in = enable && batch_in_done;
assign pe_enable = enable;

View file

@ -12,7 +12,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/opae
RTL_DIR := $(VORTEX_HOME)/hw/rtl
DPI_DIR := $(VORTEX_HOME)/hw/dpi
AFU_DIR := $(RTL_DIR)/afu/opae
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
IP_CACHE_DIR := $(ROOT_DIR)/hw/syn/altera/ip_cache/$(DEVICE_FAMILY)

View file

@ -5,7 +5,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/quartus
RTL_DIR := $(VORTEX_HOME)/hw/rtl
AFU_DIR := $(RTL_DIR)/afu/opae
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
IP_CACHE_DIR := $(ROOT_DIR)/hw/syn/altera/ip_cache/$(DEVICE_FAMILY)
@ -81,7 +80,7 @@ smart.log: $(PROJECT_FILES)
# Project initialization
$(PROJECT_FILES): gen-sources
quartus_sh -t $(SRC_DIR)/project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc $(SRC_DIR)/project.sdc -inc "src"
syn.chg:
$(STAMP) syn.chg
@ -99,6 +98,6 @@ pow.chg:
program: $(PROJECT).sof
quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"
clean:
rm -rf src bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox

View file

@ -8,7 +8,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/xilinx/test
RTL_DIR := $(VORTEX_HOME)/hw/rtl
DPI_DIR := $(VORTEX_HOME)/hw/dpi
AFU_DIR := $(RTL_DIR)/afu/opae
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
# include paths

View file

@ -24,7 +24,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/xilinx/xrt
RTL_DIR := $(VORTEX_HOME)/hw/rtl
DPI_DIR := $(VORTEX_HOME)/hw/dpi
AFU_DIR := $(RTL_DIR)/afu/xrt
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
VIVADO := $(XILINX_VIVADO)/bin/vivado

View file

@ -9,7 +9,6 @@ NUM_CORES ?= 1
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
RTL_DIR := $(VORTEX_HOME)/hw/rtl
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
CP = cp -rf
RMDIR = rm -rf

View file

@ -186,7 +186,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t scrb_alu = 0;
uint64_t scrb_fpu = 0;
uint64_t scrb_lsu = 0;
uint64_t scrb_sfu = 0;
uint64_t scrb_csrs = 0;
uint64_t scrb_wctl = 0;
uint64_t ifetches = 0;
uint64_t loads = 0;
uint64_t stores = 0;
@ -297,22 +298,32 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
return err;
});
uint64_t scrb_sfu_per_core;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
uint64_t scrb_csrs_per_core;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
return err;
});
uint64_t scrb_wctl_per_core;
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
return err;
});
scrb_alu += scrb_alu_per_core;
scrb_fpu += scrb_fpu_per_core;
scrb_lsu += scrb_lsu_per_core;
scrb_sfu += scrb_sfu_per_core;
scrb_csrs += scrb_csrs_per_core;
scrb_wctl += scrb_wctl_per_core;
if (num_cores > 1) {
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core;
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, scrb_percent_per_core,
calcAvgPercent(scrb_alu_per_core, scrb_total),
calcAvgPercent(scrb_fpu_per_core, scrb_total),
calcAvgPercent(scrb_lsu_per_core, scrb_total),
calcAvgPercent(scrb_sfu_per_core, scrb_total));
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n"
, core_id
, scrb_stalls_per_core
, scrb_percent_per_core
, calcAvgPercent(scrb_alu_per_core, scrb_total)
, calcAvgPercent(scrb_fpu_per_core, scrb_total)
, calcAvgPercent(scrb_lsu_per_core, scrb_total)
, calcAvgPercent(scrb_csrs_per_core, scrb_total)
, calcAvgPercent(scrb_wctl_per_core, scrb_total)
);
}
scrb_stalls += scrb_stalls_per_core;
}
@ -544,15 +555,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int opds_percent = calcAvgPercent(opds_stalls, total_cycles);
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
int load_avg_lat = (int)(double(load_lat) / double(loads));
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_csrs + scrb_wctl;
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, scrb_percent,
calcAvgPercent(scrb_alu, scrb_total),
calcAvgPercent(scrb_fpu, scrb_total),
calcAvgPercent(scrb_lsu, scrb_total),
calcAvgPercent(scrb_sfu, scrb_total));
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n"
, scrb_stalls
, scrb_percent
, calcAvgPercent(scrb_alu, scrb_total)
, calcAvgPercent(scrb_fpu, scrb_total)
, calcAvgPercent(scrb_lsu, scrb_total)
, calcAvgPercent(scrb_csrs, scrb_total)
, calcAvgPercent(scrb_wctl, scrb_total)
);
fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
fprintf(stream, "PERF: loads=%ld\n", loads);

View file

@ -162,5 +162,22 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
}
// Queries the performance-monitor counter at CSR address `addr` and stores the
// result in `*value`.
//   hdevice : device handle passed through to the backend callbacks
//   addr    : counter CSR address
//   core_id : index of the core to query, or 0xffffffff to sum the counter
//             over every core reported by VX_CAPS_NUM_CORES
//   value   : output; on the all-cores path it is written only after every
//             per-core query has succeeded
// Returns 0 on the all-cores path when all queries succeed; otherwise returns
// whatever the backend mpm_query callback (or the CHECK_ERR handler) returns.
// NOTE(review): this span is a diff hunk (-162,5 +162,22). The unconditional
// forward on the first body line appears to be the pre-change implementation
// that the if/else below replaces - confirm against the committed file.
extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
return (g_callbacks.mpm_query)(hdevice, addr, core_id, value);
// 0xffffffff is the "all cores" selector: aggregate instead of forwarding
if (core_id == 0xffffffff) {
uint64_t num_cores;
// ask the backend how many cores to iterate over
// (CHECK_ERR presumably exposes the failing status as `err` - verify macro)
CHECK_ERR((g_callbacks.dev_caps)(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
return err;
});
uint64_t sum_value = 0;
uint64_t cur_value;
// accumulate the same counter from each core; bail out on the first failure
for (uint32_t i = 0; i < num_cores; ++i) {
CHECK_ERR((g_callbacks.mpm_query)(hdevice, addr, i, &cur_value), {
return err;
});
sum_value += cur_value;
}
*value = sum_value;
return 0;
} else {
// single-core query: forward unchanged to the backend
return (g_callbacks.mpm_query)(hdevice, addr, core_id, value);
}
}

View file

@ -5,7 +5,4 @@ HW_DIR := $(VORTEX_HOME)/hw
RTL_DIR := $(HW_DIR)/rtl
DPI_DIR := $(HW_DIR)/dpi
SCRIPT_DIR := $(HW_DIR)/scripts
COMMON_DIR := $(VORTEX_HOME)/sim/common
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
COMMON_DIR := $(VORTEX_HOME)/sim/common

View file

@ -266,8 +266,8 @@ void Core::issue() {
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
bool has_instrs = false;
bool found_match = false;
for (uint32_t k = 0; k < PER_ISSUE_WARPS; ++k) {
uint32_t kk = (ibuffer_idx_ + k) % PER_ISSUE_WARPS;
for (uint32_t w = 0; w < PER_ISSUE_WARPS; ++w) {
uint32_t kk = (ibuffer_idx_ + w) % PER_ISSUE_WARPS;
uint32_t ii = kk * ISSUE_WIDTH + i;
auto& ibuffer = ibuffers_.at(ii);
if (ibuffer.empty())
@ -293,7 +293,21 @@ void Core::issue() {
case FUType::ALU: ++perf_stats_.scrb_alu; break;
case FUType::FPU: ++perf_stats_.scrb_fpu; break;
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
case FUType::SFU: ++perf_stats_.scrb_sfu; break;
case FUType::SFU: {
++perf_stats_.scrb_sfu;
switch (use.sfu_type) {
case SfuType::TMC:
case SfuType::WSPAWN:
case SfuType::SPLIT:
case SfuType::JOIN:
case SfuType::BAR:
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
case SfuType::CSRRW:
case SfuType::CSRRS:
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
default: assert(false);
}
} break;
default: assert(false);
}
}

View file

@ -49,6 +49,8 @@ public:
uint64_t scrb_fpu;
uint64_t scrb_lsu;
uint64_t scrb_sfu;
uint64_t scrb_csrs;
uint64_t scrb_wctl;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
@ -67,6 +69,8 @@ public:
, scrb_fpu(0)
, scrb_lsu(0)
, scrb_sfu(0)
, scrb_csrs(0)
, scrb_wctl(0)
, ifetches(0)
, loads(0)
, stores(0)

View file

@ -623,7 +623,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
instr->setDestReg(rd, RegType::Integer);
auto imm = (code >> shift_func3) << shift_func3;
instr->setImm(imm);
} break;
} break;
case InstType::J: {
instr->setDestReg(rd, RegType::Integer);

View file

@ -397,6 +397,8 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
CSR_READ_64(VX_CSR_MPM_SCRB_FPU, core_perf.scrb_fpu);
CSR_READ_64(VX_CSR_MPM_SCRB_LSU, core_perf.scrb_lsu);
CSR_READ_64(VX_CSR_MPM_SCRB_SFU, core_perf.scrb_sfu);
CSR_READ_64(VX_CSR_MPM_SCRB_CSRS, core_perf.scrb_csrs);
CSR_READ_64(VX_CSR_MPM_SCRB_WCTL, core_perf.scrb_wctl);
CSR_READ_64(VX_CSR_MPM_IFETCHES, core_perf.ifetches);
CSR_READ_64(VX_CSR_MPM_LOADS, core_perf.loads);
CSR_READ_64(VX_CSR_MPM_STORES, core_perf.stores);

View file

@ -26,6 +26,7 @@ public:
RegType reg_type;
uint32_t reg_id;
FUType fu_type;
SfuType sfu_type;
uint64_t uuid;
};
@ -70,7 +71,7 @@ public:
if (in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).test(trace->dst_reg.idx)) {
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
auto owner = owners_.at(tag);
out.push_back({trace->dst_reg.type, trace->dst_reg.idx, owner->fu_type, owner->uuid});
out.push_back({trace->dst_reg.type, trace->dst_reg.idx, owner->fu_type, owner->sfu_type, owner->uuid});
}
}
for (uint32_t i = 0; i < trace->src_regs.size(); ++i) {
@ -78,7 +79,7 @@ public:
if (in_use_regs_.at(trace->wid).at((int)trace->src_regs[i].type).test(trace->src_regs[i].idx)) {
uint32_t tag = (trace->src_regs[i].idx << 16) | (trace->wid << 4) | (int)trace->src_regs[i].type;
auto owner = owners_.at(tag);
out.push_back({trace->src_regs[i].type, trace->src_regs[i].idx, owner->fu_type, owner->uuid});
out.push_back({trace->src_regs[i].type, trace->src_regs[i].idx, owner->fu_type, owner->sfu_type, owner->uuid});
}
}
}