mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
cumulative fixes
This commit is contained in:
parent
0dbcddcb54
commit
578c3d33d2
26 changed files with 267 additions and 150 deletions
|
@ -32,4 +32,6 @@ RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
|||
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
|
||||
|
||||
VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime
|
||||
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
|
||||
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
|
||||
|
||||
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party
|
2
configure
vendored
2
configure
vendored
|
@ -141,7 +141,7 @@ usage() {
|
|||
echo "Usage: $0 [--xlen=<value>] [--tooldir=<path>] [--osversion=<version>]"
|
||||
echo " --xlen=<value> Set the XLEN value (default: 32)"
|
||||
echo " --tooldir=<path> Set the TOOLDIR path (default: /opt)"
|
||||
echo " --osversion=<version> Set the OS Version (default: $(detect_os))"
|
||||
echo " --osversion=<version> Set the OS Version (default: $(detect_osversion))"
|
||||
echo " --prefix=<path> Set installation directory"
|
||||
exit 1
|
||||
}
|
||||
|
|
|
@ -16,7 +16,7 @@ VX_types.h: $(RTL_DIR)/VX_types.vh
|
|||
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_types.vh -o VX_types.h
|
||||
|
||||
clean:
|
||||
$(MAKE) -C unitest clean
|
||||
$(MAKE) -C unittest clean
|
||||
rm -f VX_config.h VX_types.h
|
||||
|
||||
.PHONY: VX_config.h VX_types.h
|
|
@ -408,22 +408,27 @@
|
|||
`define LATENCY_FCVT 5
|
||||
`endif
|
||||
|
||||
// FMA Bandwidth ratio
|
||||
`ifndef FMA_PE_RATIO
|
||||
`define FMA_PE_RATIO 1
|
||||
`endif
|
||||
|
||||
// FDIV Bandwidth ratio
|
||||
`ifndef FDIV_PE_RATIO
|
||||
`define FDIV_PE_RATIO 8
|
||||
`endif
|
||||
|
||||
// FSQRT Bandwidth ratio
|
||||
`ifndef FSQRT_PE_RATIO
|
||||
`define FSQRT_PE_RATIO 8
|
||||
`endif
|
||||
|
||||
// FCVT Bandwidth ratio
|
||||
`ifndef FCVT_PE_RATIO
|
||||
`define FCVT_PE_RATIO 8
|
||||
`endif
|
||||
|
||||
// FNCP Bandwidth ratio
|
||||
`ifndef FNCP_PE_RATIO
|
||||
`define FNCP_PE_RATIO 2
|
||||
`endif
|
||||
|
@ -530,7 +535,7 @@
|
|||
`define DCACHE_NUM_WAYS 1
|
||||
`endif
|
||||
|
||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||
// LMEM Configurable Knobs ////////////////////////////////////////////////////
|
||||
|
||||
`ifndef LMEM_DISABLE
|
||||
`define LMEM_ENABLE
|
||||
|
|
|
@ -89,6 +89,7 @@ package VX_gpu_pkg;
|
|||
logic [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
logic [`PERF_CTR_BITS-1:0] opd_stalls;
|
||||
logic [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] units_uses;
|
||||
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
|
||||
} issue_perf_t;
|
||||
|
||||
//////////////////////// instruction arguments ////////////////////////////
|
||||
|
@ -145,6 +146,31 @@ package VX_gpu_pkg;
|
|||
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
|
||||
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
|
||||
|
||||
////////////////////////// Icache Parameters //////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
localparam ICACHE_WORD_SIZE = 4;
|
||||
localparam ICACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));
|
||||
|
||||
// Block size in bytes
|
||||
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Core request tag Id bits
|
||||
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
|
||||
|
||||
// Core request tag bits
|
||||
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
|
||||
|
||||
// Memory request data bits
|
||||
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
|
||||
|
||||
// Memory request tag bits
|
||||
`ifdef ICACHE_ENABLE
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||
`else
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Dcache Parameters //////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
|
@ -176,31 +202,6 @@ package VX_gpu_pkg;
|
|||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Icache Parameters //////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
localparam ICACHE_WORD_SIZE = 4;
|
||||
localparam ICACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));
|
||||
|
||||
// Block size in bytes
|
||||
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Core request tag Id bits
|
||||
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
|
||||
|
||||
// Core request tag bits
|
||||
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
|
||||
|
||||
// Memory request data bits
|
||||
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
|
||||
|
||||
// Memory request tag bits
|
||||
`ifdef ICACHE_ENABLE
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||
`else
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||
|
||||
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
|
@ -292,6 +293,19 @@ package VX_gpu_pkg;
|
|||
end
|
||||
endfunction
|
||||
|
||||
///////////////////////// Miscellaneous functions /////////////////////////
|
||||
|
||||
// Maps an SFU instruction opcode to the SFU sub-unit class used for
// per-sub-unit accounting: the three CSR access opcodes map to `SFU_CSRS,
// every other opcode maps to `SFU_WCTL.
function logic [`SFU_WIDTH-1:0] op_to_sfu_type(
    input logic [`INST_OP_BITS-1:0] op_type
);
    if ((op_type == `INST_SFU_CSRRW)
     || (op_type == `INST_SFU_CSRRS)
     || (op_type == `INST_SFU_CSRRC)) begin
        op_to_sfu_type = `SFU_CSRS;
    end else begin
        // anything that is not a CSR access is attributed to `SFU_WCTL
        op_to_sfu_type = `SFU_WCTL;
    end
endfunction
|
||||
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
endpackage
|
||||
|
|
|
@ -85,27 +85,31 @@
|
|||
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
|
||||
`define VX_CSR_MPM_SCRB_ST 12'hB06
|
||||
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
|
||||
`define VX_CSR_MPM_SCRB_ALU 12'hB07
|
||||
`define VX_CSR_MPM_SCRB_ALU_H 12'hB87
|
||||
`define VX_CSR_MPM_SCRB_FPU 12'hB08
|
||||
`define VX_CSR_MPM_SCRB_FPU_H 12'hB88
|
||||
`define VX_CSR_MPM_SCRB_LSU 12'hB09
|
||||
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
|
||||
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
|
||||
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
|
||||
`define VX_CSR_MPM_OPDS_ST 12'hB0B
|
||||
`define VX_CSR_MPM_OPDS_ST_H 12'hB8B
|
||||
`define VX_CSR_MPM_OPDS_ST 12'hB07
|
||||
`define VX_CSR_MPM_OPDS_ST_H 12'hB87
|
||||
`define VX_CSR_MPM_SCRB_ALU 12'hB08
|
||||
`define VX_CSR_MPM_SCRB_ALU_H 12'hB88
|
||||
`define VX_CSR_MPM_SCRB_FPU 12'hB09
|
||||
`define VX_CSR_MPM_SCRB_FPU_H 12'hB89
|
||||
`define VX_CSR_MPM_SCRB_LSU 12'hB0A
|
||||
`define VX_CSR_MPM_SCRB_LSU_H 12'hB8A
|
||||
`define VX_CSR_MPM_SCRB_SFU 12'hB0B
|
||||
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8B
|
||||
`define VX_CSR_MPM_SCRB_CSRS 12'hB0C
|
||||
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
|
||||
`define VX_CSR_MPM_SCRB_WCTL 12'hB0D
|
||||
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D
|
||||
// PERF: memory
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0C
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8C
|
||||
`define VX_CSR_MPM_LOADS 12'hB0D
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB8D
|
||||
`define VX_CSR_MPM_STORES 12'hB0E
|
||||
`define VX_CSR_MPM_STORES_H 12'hB8E
|
||||
`define VX_CSR_MPM_IFETCH_LT 12'hB1F
|
||||
`define VX_CSR_MPM_IFETCH_LT_H 12'hB9F
|
||||
`define VX_CSR_MPM_LOAD_LT 12'hB10
|
||||
`define VX_CSR_MPM_LOAD_LT_H 12'hB90
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0E
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
|
||||
`define VX_CSR_MPM_LOADS 12'hB0F
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB8F
|
||||
`define VX_CSR_MPM_STORES 12'hB10
|
||||
`define VX_CSR_MPM_STORES_H 12'hB90
|
||||
`define VX_CSR_MPM_IFETCH_LT 12'hB11
|
||||
`define VX_CSR_MPM_IFETCH_LT_H 12'hB91
|
||||
`define VX_CSR_MPM_LOAD_LT 12'hB12
|
||||
`define VX_CSR_MPM_LOAD_LT_H 12'hB92
|
||||
|
||||
// Machine Performance-monitoring memory counters (class 2) ///////////////////
|
||||
|
||||
|
|
|
@ -216,6 +216,7 @@ import VX_fpu_pkg::*;
|
|||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]);
|
||||
|
@ -224,7 +225,8 @@ import VX_fpu_pkg::*;
|
|||
`endif
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
|
||||
// PERF: memory
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);
|
||||
|
|
|
@ -35,7 +35,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
|
||||
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
|
||||
|
||||
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`IBUF_SIZE),
|
||||
|
@ -43,7 +43,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
) instr_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (decode_if.valid && decode_if.data.wid == ISSUE_WIS_W'(i)),
|
||||
.valid_in (decode_if.valid && decode_if.data.wid == ISSUE_WIS_W'(w)),
|
||||
.data_in ({
|
||||
decode_if.data.uuid,
|
||||
decode_if.data.tmask,
|
||||
|
@ -57,13 +57,13 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
decode_if.data.rs2,
|
||||
decode_if.data.rs3
|
||||
}),
|
||||
.ready_in (ibuf_ready_in[i]),
|
||||
.valid_out(ibuffer_if[i].valid),
|
||||
.data_out (ibuffer_if[i].data),
|
||||
.ready_out(ibuffer_if[i].ready)
|
||||
.ready_in (ibuf_ready_in[w]),
|
||||
.valid_out(ibuffer_if[w].valid),
|
||||
.data_out (ibuffer_if[w].data),
|
||||
.ready_out(ibuffer_if[w].ready)
|
||||
);
|
||||
`ifndef L1_ENABLE
|
||||
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
|
||||
assign decode_if.ibuf_pop[w] = ibuffer_if[w].valid && ibuffer_if[w].ready;
|
||||
`endif
|
||||
end
|
||||
|
||||
|
|
|
@ -38,6 +38,9 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
end
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
end
|
||||
`endif
|
||||
|
||||
wire [ISSUE_ISW_W-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
|
||||
|
|
|
@ -61,6 +61,7 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
.perf_stalls (issue_perf.scb_stalls),
|
||||
.perf_units_uses(issue_perf.units_uses),
|
||||
.perf_sfu_uses (issue_perf.sfu_uses),
|
||||
`endif
|
||||
.writeback_if (writeback_if),
|
||||
.ibuffer_if (ibuffer_if),
|
||||
|
|
|
@ -22,6 +22,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_stalls,
|
||||
output reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_uses,
|
||||
output reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_uses,
|
||||
`endif
|
||||
|
||||
VX_writeback_if.slave writeback_if,
|
||||
|
@ -38,6 +39,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
reg [PER_ISSUE_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
|
||||
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
|
||||
|
||||
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
|
||||
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
|
@ -47,11 +51,21 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
.data_out (perf_units_per_cycle)
|
||||
);
|
||||
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_SFU_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
) perf_sfu_reduce (
|
||||
.data_in (perf_inuse_sfu_per_cycle),
|
||||
.data_out (perf_sfu_per_cycle)
|
||||
);
|
||||
|
||||
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
|
||||
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
|
||||
|
||||
wire [PER_ISSUE_WARPS-1:0] stg_valid_in;
|
||||
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
|
||||
assign stg_valid_in[i] = staging_if[i].valid;
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
assign stg_valid_in[w] = staging_if[w].valid;
|
||||
end
|
||||
|
||||
wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready));
|
||||
|
@ -73,54 +87,78 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
// One accumulating counter per SFU sub-unit: each cycle the matching bit of
// perf_sfu_per_cycle_r is widened to the counter width and added to
// perf_sfu_uses; the counters are cleared by the synchronous reset.
for (genvar s = 0; s < `NUM_SFU_UNITS; ++s) begin
    wire [`PERF_CTR_BITS-1:0] sfu_incr = `PERF_CTR_BITS'(perf_sfu_per_cycle_r[s]);
    always @(posedge clk) begin
        if (reset) begin
            perf_sfu_uses[s] <= '0;
        end else begin
            perf_sfu_uses[s] <= perf_sfu_uses[s] + sfu_incr;
        end
    end
end
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (1)
|
||||
) stanging_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (ibuffer_if[i].valid),
|
||||
.data_in (ibuffer_if[i].data),
|
||||
.ready_in (ibuffer_if[i].ready),
|
||||
.valid_out(staging_if[i].valid),
|
||||
.data_out (staging_if[i].data),
|
||||
.ready_out(staging_if[i].ready)
|
||||
.valid_in (ibuffer_if[w].valid),
|
||||
.data_in (ibuffer_if[w].data),
|
||||
.ready_in (ibuffer_if[w].ready),
|
||||
.valid_out(staging_if[w].valid),
|
||||
.data_out (staging_if[w].data),
|
||||
.ready_out(staging_if[w].ready)
|
||||
);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
reg [`NUM_REGS-1:0] inuse_regs;
|
||||
|
||||
reg [3:0] operands_busy, operands_busy_n;
|
||||
|
||||
wire ibuffer_fire = ibuffer_if[i].valid && ibuffer_if[i].ready;
|
||||
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
|
||||
|
||||
wire staging_fire = staging_if[i].valid && staging_if[i].ready;
|
||||
wire staging_fire = staging_if[w].valid && staging_if[w].ready;
|
||||
|
||||
wire writeback_fire = writeback_if.valid
|
||||
&& (writeback_if.data.wis == ISSUE_WIS_W'(i))
|
||||
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
|
||||
&& writeback_if.data.eop;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
|
||||
always @(*) begin
|
||||
perf_inuse_units_per_cycle[i] = '0;
|
||||
if (staging_if[i].valid) begin
|
||||
perf_inuse_units_per_cycle[w] = '0;
|
||||
perf_inuse_sfu_per_cycle[w] = '0;
|
||||
if (staging_if[w].valid) begin
|
||||
if (operands_busy[0]) begin
|
||||
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rd]] = 1;
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[1]) begin
|
||||
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs1]] = 1;
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[2]) begin
|
||||
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs2]] = 1;
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[3]) begin
|
||||
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs3]] = 1;
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -130,52 +168,52 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
operands_busy_n = operands_busy;
|
||||
if (ibuffer_fire) begin
|
||||
operands_busy_n = {
|
||||
inuse_regs[ibuffer_if[i].data.rs3],
|
||||
inuse_regs[ibuffer_if[i].data.rs2],
|
||||
inuse_regs[ibuffer_if[i].data.rs1],
|
||||
inuse_regs[ibuffer_if[i].data.rd]
|
||||
inuse_regs[ibuffer_if[w].data.rs3],
|
||||
inuse_regs[ibuffer_if[w].data.rs2],
|
||||
inuse_regs[ibuffer_if[w].data.rs1],
|
||||
inuse_regs[ibuffer_if[w].data.rd]
|
||||
};
|
||||
end
|
||||
if (writeback_fire) begin
|
||||
if (ibuffer_fire) begin
|
||||
if (writeback_if.data.rd == ibuffer_if[i].data.rd) begin
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin
|
||||
operands_busy_n[0] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == ibuffer_if[i].data.rs1) begin
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin
|
||||
operands_busy_n[1] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == ibuffer_if[i].data.rs2) begin
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin
|
||||
operands_busy_n[2] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == ibuffer_if[i].data.rs3) begin
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin
|
||||
operands_busy_n[3] = 0;
|
||||
end
|
||||
end else begin
|
||||
if (writeback_if.data.rd == staging_if[i].data.rd) begin
|
||||
if (writeback_if.data.rd == staging_if[w].data.rd) begin
|
||||
operands_busy_n[0] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == staging_if[i].data.rs1) begin
|
||||
if (writeback_if.data.rd == staging_if[w].data.rs1) begin
|
||||
operands_busy_n[1] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == staging_if[i].data.rs2) begin
|
||||
if (writeback_if.data.rd == staging_if[w].data.rs2) begin
|
||||
operands_busy_n[2] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == staging_if[i].data.rs3) begin
|
||||
if (writeback_if.data.rd == staging_if[w].data.rs3) begin
|
||||
operands_busy_n[3] = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
if (staging_fire && staging_if[i].data.wb) begin
|
||||
if (staging_if[i].data.rd == ibuffer_if[i].data.rd) begin
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin
|
||||
operands_busy_n[0] = 1;
|
||||
end
|
||||
if (staging_if[i].data.rd == ibuffer_if[i].data.rs1) begin
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin
|
||||
operands_busy_n[1] = 1;
|
||||
end
|
||||
if (staging_if[i].data.rd == ibuffer_if[i].data.rs2) begin
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin
|
||||
operands_busy_n[2] = 1;
|
||||
end
|
||||
if (staging_if[i].data.rd == ibuffer_if[i].data.rs3) begin
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin
|
||||
operands_busy_n[3] = 1;
|
||||
end
|
||||
end
|
||||
|
@ -188,15 +226,18 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
if (writeback_fire) begin
|
||||
inuse_regs[writeback_if.data.rd] <= 0;
|
||||
end
|
||||
if (staging_fire && staging_if[i].data.wb) begin
|
||||
inuse_regs[staging_if[i].data.rd] <= 1;
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
inuse_regs[staging_if[w].data.rd] <= 1;
|
||||
end
|
||||
end
|
||||
operands_busy <= operands_busy_n;
|
||||
operands_ready[i] <= ~(| operands_busy_n);
|
||||
operands_ready[w] <= ~(| operands_busy_n);
|
||||
`ifdef PERF_ENABLE
|
||||
if (staging_fire && staging_if[i].data.wb) begin
|
||||
inuse_units[staging_if[i].data.rd] <= staging_if[i].data.ex_type;
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
|
||||
if (staging_if[w].data.ex_type == `EX_SFU) begin
|
||||
inuse_sfu[staging_if[w].data.rd] <= op_to_sfu_type(staging_if[w].data.op_type);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
@ -208,11 +249,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
if (reset) begin
|
||||
timeout_ctr <= '0;
|
||||
end else begin
|
||||
if (staging_if[i].valid && ~staging_if[i].ready) begin
|
||||
if (staging_if[w].valid && ~staging_if[w].ready) begin
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
`TRACE(3, ("%d: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
|
||||
operands_busy, staging_if[i].data.uuid));
|
||||
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
|
||||
operands_busy, staging_if[w].data.uuid));
|
||||
`endif
|
||||
timeout_ctr <= timeout_ctr + 1;
|
||||
end else if (ibuffer_fire) begin
|
||||
|
@ -223,12 +264,12 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
|
||||
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
||||
("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
||||
$time, INSTANCE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
|
||||
operands_busy, staging_if[i].data.uuid));
|
||||
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
|
||||
operands_busy, staging_if[w].data.uuid));
|
||||
|
||||
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0,
|
||||
("%t: *** %s invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
||||
$time, INSTANCE_ID, i, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
|
||||
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
|
||||
`endif
|
||||
|
||||
end
|
||||
|
@ -237,10 +278,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in;
|
||||
wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
|
||||
|
||||
for (genvar i = 0; i < PER_ISSUE_WARPS; ++i) begin
|
||||
assign arb_valid_in[i] = staging_if[i].valid && operands_ready[i];
|
||||
assign arb_data_in[i] = staging_if[i].data;
|
||||
assign staging_if[i].ready = arb_ready_in[i] && operands_ready[i];
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
|
||||
assign arb_data_in[w] = staging_if[w].data;
|
||||
assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
|
||||
end
|
||||
|
||||
`RESET_RELAY (arb_reset, reset);
|
||||
|
|
|
@ -45,7 +45,7 @@ module VX_generic_arbiter #(
|
|||
end else if (TYPE == "R") begin
|
||||
|
||||
VX_rr_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.NUM_REQS (NUM_REQS)
|
||||
) rr_arbiter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -59,7 +59,7 @@ module VX_generic_arbiter #(
|
|||
end else if (TYPE == "F") begin
|
||||
|
||||
VX_fair_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.NUM_REQS (NUM_REQS)
|
||||
) fair_arbiter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -73,7 +73,7 @@ module VX_generic_arbiter #(
|
|||
end else if (TYPE == "M") begin
|
||||
|
||||
VX_matrix_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.NUM_REQS (NUM_REQS)
|
||||
) matrix_arbiter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -87,7 +87,7 @@ module VX_generic_arbiter #(
|
|||
end else if (TYPE == "C") begin
|
||||
|
||||
VX_cyclic_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.NUM_REQS (NUM_REQS)
|
||||
) cyclic_arbiter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -105,21 +105,21 @@ module VX_pe_serializer #(
|
|||
reg [TAG_WIDTH-1:0] tag_out_r;
|
||||
|
||||
wire valid_out_b = valid_out_s && batch_out_done;
|
||||
wire enable_r = ready_out || ~valid_out;
|
||||
wire ready_out_b = ready_out || ~valid_out;
|
||||
|
||||
// Output staging registers of the serializer.
// ready_out_b (= ready_out || ~valid_out) gates every update, so the held
// output is only overwritten when it is being accepted or is not valid.
always @(posedge clk) begin
    // valid flag: synchronous reset, otherwise follows valid_out_b when the
    // output stage is allowed to advance
    if (reset) begin
        valid_out_r <= 1'b0;
    end else if (ready_out_b) begin
        valid_out_r <= valid_out_b;
    end
    // payload registers carry no reset; the current PE result is written
    // into the slot selected by batch_out_idx together with the tag
    if (ready_out_b) begin
        data_out_r[batch_out_idx] <= pe_data_out;
        tag_out_r <= tag_out_s;
    end
end
|
||||
|
||||
assign enable = enable_r || ~valid_out_b;
|
||||
assign enable = ready_out_b || ~valid_out_b;
|
||||
assign ready_in = enable && batch_in_done;
|
||||
|
||||
assign pe_enable = enable;
|
||||
|
|
|
@ -12,7 +12,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/opae
|
|||
RTL_DIR := $(VORTEX_HOME)/hw/rtl
|
||||
DPI_DIR := $(VORTEX_HOME)/hw/dpi
|
||||
AFU_DIR := $(RTL_DIR)/afu/opae
|
||||
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
|
||||
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
|
||||
|
||||
IP_CACHE_DIR := $(ROOT_DIR)/hw/syn/altera/ip_cache/$(DEVICE_FAMILY)
|
||||
|
|
|
@ -5,7 +5,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/quartus
|
|||
|
||||
RTL_DIR := $(VORTEX_HOME)/hw/rtl
|
||||
AFU_DIR := $(RTL_DIR)/afu/opae
|
||||
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
|
||||
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
|
||||
|
||||
IP_CACHE_DIR := $(ROOT_DIR)/hw/syn/altera/ip_cache/$(DEVICE_FAMILY)
|
||||
|
@ -81,7 +80,7 @@ smart.log: $(PROJECT_FILES)
|
|||
# Project initialization
|
||||
$(PROJECT_FILES): gen-sources
|
||||
quartus_sh -t $(SRC_DIR)/project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc $(SRC_DIR)/project.sdc -inc "src"
|
||||
|
||||
|
||||
syn.chg:
|
||||
$(STAMP) syn.chg
|
||||
|
||||
|
@ -99,6 +98,6 @@ pow.chg:
|
|||
|
||||
program: $(PROJECT).sof
|
||||
quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"
|
||||
|
||||
|
||||
clean:
|
||||
rm -rf src bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox
|
||||
|
|
|
@ -8,7 +8,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/xilinx/test
|
|||
RTL_DIR := $(VORTEX_HOME)/hw/rtl
|
||||
DPI_DIR := $(VORTEX_HOME)/hw/dpi
|
||||
AFU_DIR := $(RTL_DIR)/afu/opae
|
||||
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
|
||||
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
|
||||
|
||||
# include paths
|
||||
|
|
|
@ -24,7 +24,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/xilinx/xrt
|
|||
RTL_DIR := $(VORTEX_HOME)/hw/rtl
|
||||
DPI_DIR := $(VORTEX_HOME)/hw/dpi
|
||||
AFU_DIR := $(RTL_DIR)/afu/xrt
|
||||
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
|
||||
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
|
||||
|
||||
VIVADO := $(XILINX_VIVADO)/bin/vivado
|
||||
|
|
|
@ -9,7 +9,6 @@ NUM_CORES ?= 1
|
|||
|
||||
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
|
||||
RTL_DIR := $(VORTEX_HOME)/hw/rtl
|
||||
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
|
||||
|
||||
CP = cp -rf
|
||||
RMDIR = rm -rf
|
||||
|
|
|
@ -186,7 +186,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t scrb_alu = 0;
|
||||
uint64_t scrb_fpu = 0;
|
||||
uint64_t scrb_lsu = 0;
|
||||
uint64_t scrb_sfu = 0;
|
||||
uint64_t scrb_csrs = 0;
|
||||
uint64_t scrb_wctl = 0;
|
||||
uint64_t ifetches = 0;
|
||||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
|
@ -297,22 +298,32 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_sfu_per_core;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
uint64_t scrb_csrs_per_core;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
|
||||
return err;
|
||||
});
|
||||
uint64_t scrb_wctl_per_core;
|
||||
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
|
||||
return err;
|
||||
});
|
||||
scrb_alu += scrb_alu_per_core;
|
||||
scrb_fpu += scrb_fpu_per_core;
|
||||
scrb_lsu += scrb_lsu_per_core;
|
||||
scrb_sfu += scrb_sfu_per_core;
|
||||
scrb_csrs += scrb_csrs_per_core;
|
||||
scrb_wctl += scrb_wctl_per_core;
|
||||
if (num_cores > 1) {
|
||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
|
||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core;
|
||||
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core, scrb_percent_per_core,
|
||||
calcAvgPercent(scrb_alu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_fpu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_lsu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_sfu_per_core, scrb_total));
|
||||
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n"
|
||||
, core_id
|
||||
, scrb_stalls_per_core
|
||||
, scrb_percent_per_core
|
||||
, calcAvgPercent(scrb_alu_per_core, scrb_total)
|
||||
, calcAvgPercent(scrb_fpu_per_core, scrb_total)
|
||||
, calcAvgPercent(scrb_lsu_per_core, scrb_total)
|
||||
, calcAvgPercent(scrb_csrs_per_core, scrb_total)
|
||||
, calcAvgPercent(scrb_wctl_per_core, scrb_total)
|
||||
);
|
||||
}
|
||||
scrb_stalls += scrb_stalls_per_core;
|
||||
}
|
||||
|
@ -544,15 +555,19 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
int opds_percent = calcAvgPercent(opds_stalls, total_cycles);
|
||||
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
|
||||
int load_avg_lat = (int)(double(load_lat) / double(loads));
|
||||
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
|
||||
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_csrs + scrb_wctl;
|
||||
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
|
||||
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls, scrb_percent,
|
||||
calcAvgPercent(scrb_alu, scrb_total),
|
||||
calcAvgPercent(scrb_fpu, scrb_total),
|
||||
calcAvgPercent(scrb_lsu, scrb_total),
|
||||
calcAvgPercent(scrb_sfu, scrb_total));
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n"
|
||||
, scrb_stalls
|
||||
, scrb_percent
|
||||
, calcAvgPercent(scrb_alu, scrb_total)
|
||||
, calcAvgPercent(scrb_fpu, scrb_total)
|
||||
, calcAvgPercent(scrb_lsu, scrb_total)
|
||||
, calcAvgPercent(scrb_csrs, scrb_total)
|
||||
, calcAvgPercent(scrb_wctl, scrb_total)
|
||||
);
|
||||
fprintf(stream, "PERF: operands stalls=%ld (%d%%)\n", opds_stalls, opds_percent);
|
||||
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||
|
|
|
@ -162,5 +162,22 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
|
|||
}
|
||||
|
||||
extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
|
||||
return (g_callbacks.mpm_query)(hdevice, addr, core_id, value);
|
||||
if (core_id == 0xffffffff) {
|
||||
uint64_t num_cores;
|
||||
CHECK_ERR((g_callbacks.dev_caps)(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
|
||||
return err;
|
||||
});
|
||||
uint64_t sum_value = 0;
|
||||
uint64_t cur_value;
|
||||
for (uint32_t i = 0; i < num_cores; ++i) {
|
||||
CHECK_ERR((g_callbacks.mpm_query)(hdevice, addr, i, &cur_value), {
|
||||
return err;
|
||||
});
|
||||
sum_value += cur_value;
|
||||
}
|
||||
*value = sum_value;
|
||||
return 0;
|
||||
} else {
|
||||
return (g_callbacks.mpm_query)(hdevice, addr, core_id, value);
|
||||
}
|
||||
}
|
|
@ -5,7 +5,4 @@ HW_DIR := $(VORTEX_HOME)/hw
|
|||
RTL_DIR := $(HW_DIR)/rtl
|
||||
DPI_DIR := $(HW_DIR)/dpi
|
||||
SCRIPT_DIR := $(HW_DIR)/scripts
|
||||
|
||||
COMMON_DIR := $(VORTEX_HOME)/sim/common
|
||||
|
||||
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
|
||||
COMMON_DIR := $(VORTEX_HOME)/sim/common
|
|
@ -266,8 +266,8 @@ void Core::issue() {
|
|||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
bool has_instrs = false;
|
||||
bool found_match = false;
|
||||
for (uint32_t k = 0; k < PER_ISSUE_WARPS; ++k) {
|
||||
uint32_t kk = (ibuffer_idx_ + k) % PER_ISSUE_WARPS;
|
||||
for (uint32_t w = 0; w < PER_ISSUE_WARPS; ++w) {
|
||||
uint32_t kk = (ibuffer_idx_ + w) % PER_ISSUE_WARPS;
|
||||
uint32_t ii = kk * ISSUE_WIDTH + i;
|
||||
auto& ibuffer = ibuffers_.at(ii);
|
||||
if (ibuffer.empty())
|
||||
|
@ -293,7 +293,21 @@ void Core::issue() {
|
|||
case FUType::ALU: ++perf_stats_.scrb_alu; break;
|
||||
case FUType::FPU: ++perf_stats_.scrb_fpu; break;
|
||||
case FUType::LSU: ++perf_stats_.scrb_lsu; break;
|
||||
case FUType::SFU: ++perf_stats_.scrb_sfu; break;
|
||||
case FUType::SFU: {
|
||||
++perf_stats_.scrb_sfu;
|
||||
switch (use.sfu_type) {
|
||||
case SfuType::TMC:
|
||||
case SfuType::WSPAWN:
|
||||
case SfuType::SPLIT:
|
||||
case SfuType::JOIN:
|
||||
case SfuType::BAR:
|
||||
case SfuType::PRED: ++perf_stats_.scrb_wctl; break;
|
||||
case SfuType::CSRRW:
|
||||
case SfuType::CSRRS:
|
||||
case SfuType::CSRRC: ++perf_stats_.scrb_csrs; break;
|
||||
default: assert(false);
|
||||
}
|
||||
} break;
|
||||
default: assert(false);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -49,6 +49,8 @@ public:
|
|||
uint64_t scrb_fpu;
|
||||
uint64_t scrb_lsu;
|
||||
uint64_t scrb_sfu;
|
||||
uint64_t scrb_csrs;
|
||||
uint64_t scrb_wctl;
|
||||
uint64_t ifetches;
|
||||
uint64_t loads;
|
||||
uint64_t stores;
|
||||
|
@ -67,6 +69,8 @@ public:
|
|||
, scrb_fpu(0)
|
||||
, scrb_lsu(0)
|
||||
, scrb_sfu(0)
|
||||
, scrb_csrs(0)
|
||||
, scrb_wctl(0)
|
||||
, ifetches(0)
|
||||
, loads(0)
|
||||
, stores(0)
|
||||
|
|
|
@ -623,7 +623,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
|
|||
instr->setDestReg(rd, RegType::Integer);
|
||||
auto imm = (code >> shift_func3) << shift_func3;
|
||||
instr->setImm(imm);
|
||||
} break;
|
||||
} break;
|
||||
|
||||
case InstType::J: {
|
||||
instr->setDestReg(rd, RegType::Integer);
|
||||
|
|
|
@ -397,6 +397,8 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
CSR_READ_64(VX_CSR_MPM_SCRB_FPU, core_perf.scrb_fpu);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_LSU, core_perf.scrb_lsu);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_SFU, core_perf.scrb_sfu);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_CSRS, core_perf.scrb_csrs);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_WCTL, core_perf.scrb_wctl);
|
||||
CSR_READ_64(VX_CSR_MPM_IFETCHES, core_perf.ifetches);
|
||||
CSR_READ_64(VX_CSR_MPM_LOADS, core_perf.loads);
|
||||
CSR_READ_64(VX_CSR_MPM_STORES, core_perf.stores);
|
||||
|
|
|
@ -26,6 +26,7 @@ public:
|
|||
RegType reg_type;
|
||||
uint32_t reg_id;
|
||||
FUType fu_type;
|
||||
SfuType sfu_type;
|
||||
uint64_t uuid;
|
||||
};
|
||||
|
||||
|
@ -70,7 +71,7 @@ public:
|
|||
if (in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).test(trace->dst_reg.idx)) {
|
||||
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({trace->dst_reg.type, trace->dst_reg.idx, owner->fu_type, owner->uuid});
|
||||
out.push_back({trace->dst_reg.type, trace->dst_reg.idx, owner->fu_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
for (uint32_t i = 0; i < trace->src_regs.size(); ++i) {
|
||||
|
@ -78,7 +79,7 @@ public:
|
|||
if (in_use_regs_.at(trace->wid).at((int)trace->src_regs[i].type).test(trace->src_regs[i].idx)) {
|
||||
uint32_t tag = (trace->src_regs[i].idx << 16) | (trace->wid << 4) | (int)trace->src_regs[i].type;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({trace->src_regs[i].type, trace->src_regs[i].idx, owner->fu_type, owner->uuid});
|
||||
out.push_back({trace->src_regs[i].type, trace->src_regs[i].idx, owner->fu_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue