Merge branch 'develop' of https://github.com/vortexgpgpu/vortex into develop

This commit is contained in:
Blaise Tine 2023-11-27 08:07:52 -08:00
commit 621d6de6ce
42 changed files with 1216 additions and 634 deletions

View file

@ -58,6 +58,8 @@
`define VX_CSR_MPM_BASE 12'hB00
`define VX_CSR_MPM_BASE_H 12'hB80
`define VX_CSR_MPM_USER 12'hB03
`define VX_CSR_MPM_USER_H 12'hB83
// Machine Performance-monitoring core counters
// PERF: Standard
@ -68,29 +70,41 @@
`define VX_CSR_MINSTRET 12'hB02
`define VX_CSR_MINSTRET_H 12'hB82
// PERF: pipeline
`define VX_CSR_MPM_IBUF_ST 12'hB03
`define VX_CSR_MPM_IBUF_ST_H 12'hB83
`define VX_CSR_MPM_SCRB_ST 12'hB04
`define VX_CSR_MPM_SCRB_ST_H 12'hB84
`define VX_CSR_MPM_ALU_ST 12'hB05
`define VX_CSR_MPM_ALU_ST_H 12'hB85
`define VX_CSR_MPM_LSU_ST 12'hB06
`define VX_CSR_MPM_LSU_ST_H 12'hB86
`define VX_CSR_MPM_FPU_ST 12'hB07
`define VX_CSR_MPM_FPU_ST_H 12'hB87
`define VX_CSR_MPM_SFU_ST 12'hB08
`define VX_CSR_MPM_SFU_ST_H 12'hB88
`define VX_CSR_MPM_SCHED_ST 12'hB03
`define VX_CSR_MPM_SCHED_ST_H 12'hB83
`define VX_CSR_MPM_FETCH_ST 12'hB04
`define VX_CSR_MPM_FETCH_ST_H 12'hB84
`define VX_CSR_MPM_IBUF_ST 12'hB05
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
`define VX_CSR_MPM_SCRB_ST 12'hB06
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
`define VX_CSR_MPM_ALU_ST 12'hB07
`define VX_CSR_MPM_ALU_ST_H 12'hB87
`define VX_CSR_MPM_LSU_ST 12'hB08
`define VX_CSR_MPM_LSU_ST_H 12'hB88
`define VX_CSR_MPM_FPU_ST 12'hB09
`define VX_CSR_MPM_FPU_ST_H 12'hB89
`define VX_CSR_MPM_SFU_ST 12'hB0A
`define VX_CSR_MPM_SFU_ST_H 12'hB8A
`define VX_CSR_MPM_SCRB_ALU 12'hB0B
`define VX_CSR_MPM_SCRB_ALU_H 12'hB8B
`define VX_CSR_MPM_SCRB_FPU 12'hB0C
`define VX_CSR_MPM_SCRB_FPU_H 12'hB8C
`define VX_CSR_MPM_SCRB_LSU 12'hB0D
`define VX_CSR_MPM_SCRB_LSU_H 12'hB8D
`define VX_CSR_MPM_SCRB_SFU 12'hB0E
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8E
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0A
`define VX_CSR_MPM_IFETCHES_H 12'hB8A
`define VX_CSR_MPM_LOADS 12'hB0B
`define VX_CSR_MPM_LOADS_H 12'hB8B
`define VX_CSR_MPM_STORES 12'hB0C
`define VX_CSR_MPM_STORES_H 12'hB8C
`define VX_CSR_MPM_IFETCH_LAT 12'hB0D
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D
`define VX_CSR_MPM_LOAD_LAT 12'hB0E
`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E
`define VX_CSR_MPM_IFETCHES 12'hB0F
`define VX_CSR_MPM_IFETCHES_H 12'hB8F
`define VX_CSR_MPM_LOADS 12'hB10
`define VX_CSR_MPM_LOADS_H 12'hB90
`define VX_CSR_MPM_STORES 12'hB11
`define VX_CSR_MPM_STORES_H 12'hB91
`define VX_CSR_MPM_IFETCH_LAT 12'hB12
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB92
`define VX_CSR_MPM_LOAD_LAT 12'hB13
`define VX_CSR_MPM_LOAD_LAT_H 12'hB93
// Machine Performance-monitoring memory counters
// PERF: icache
@ -98,59 +112,61 @@
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83
`define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
`define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84
`define VX_CSR_MPM_ICACHE_MSHR_ST 12'hB05 // MSHR stalls
`define VX_CSR_MPM_ICACHE_MSHR_ST_H 12'hB85
// PERF: dcache
`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads
`define VX_CSR_MPM_DCACHE_READS_H 12'hB85
`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A
// PERF: smem
`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads
`define VX_CSR_MPM_SMEM_READS_H 12'hB8B
`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D
`define VX_CSR_MPM_DCACHE_READS 12'hB06 // total reads
`define VX_CSR_MPM_DCACHE_READS_H 12'hB86
`define VX_CSR_MPM_DCACHE_WRITES 12'hB07 // total writes
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB87
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB08 // read misses
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB88
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB09 // write misses
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB89
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB0A // bank conflicts
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB8A
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0B // MSHR stalls
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8B
// PERF: l2cache
`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93
`define VX_CSR_MPM_L2CACHE_READS 12'hB0C // total reads
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8C
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0D // total writes
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8D
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB0E // read misses
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB8E
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB0F // write misses
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB8F
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB10 // bank conflicts
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB90
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB11 // MSHR stalls
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB91
// PERF: l3cache
`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99
`define VX_CSR_MPM_L3CACHE_READS 12'hB12 // total reads
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB92
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB13 // total writes
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB93
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB14 // read misses
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB94
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB15 // write misses
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB95
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB16 // bank conflicts
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB96
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB17 // MSHR stalls
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB97
// PERF: memory
`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads
`define VX_CSR_MPM_MEM_READS_H 12'hB9A
`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes
`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B
`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency
`define VX_CSR_MPM_MEM_LAT_H 12'hB9C
`define VX_CSR_MPM_MEM_READS 12'hB18 // total reads
`define VX_CSR_MPM_MEM_READS_H 12'hB98
`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LAT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LAT_H 12'hB9A
// PERF: smem
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D
// Machine Information Registers

View file

@ -44,7 +44,7 @@ module VX_commit import VX_gpu_pkg::*; #(
VX_commit_if commit_if[`ISSUE_WIDTH]();
wire [`ISSUE_WIDTH-1:0] commit_fire;
wire [`ISSUE_WIDTH-1:0] commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
wire [`ISSUE_WIDTH-1:0] commit_eop;
@ -91,24 +91,22 @@ module VX_commit import VX_gpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
assign commit_wid[i] = commit_if[i].data.wid;
assign commit_eop[i] = commit_if[i].data.eop;
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
assign commit_wid[i] = commit_if[i].data.wid;
assign commit_eop[i] = commit_if[i].data.eop;
end
// CSRs update
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r;
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [COMMIT_SIZEW-1:0] pop_count;
`POP_COUNT(pop_count, commit_tmask[i]);
assign commit_size[i] = pop_count;
`POP_COUNT(commit_size[i], commit_tmask[i]);
end
VX_pipe_register #(
@ -129,7 +127,7 @@ module VX_commit import VX_gpu_pkg::*; #(
.OP ("+")
) commit_size_reduce (
.data_in (commit_size_r),
.data_out (commit_size_all)
.data_out (commit_size_all_r)
);
VX_pipe_register #(
@ -139,26 +137,26 @@ module VX_commit import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({commit_fire_any_r, commit_size_all}),
.data_out ({commit_fire_any_rr, commit_size_all_r})
.data_in ({commit_fire_any_r, commit_size_all_r}),
.data_out ({commit_fire_any_rr, commit_size_all_rr})
);
reg [`PERF_CTR_BITS-1:0] instret;
always @(posedge clk) begin
if (reset) begin
instret <= '0;
end else begin
if (commit_fire_any_rr) begin
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r);
instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
end
end
end
assign commit_csr_if.instret = instret;
// Committed instructions
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
.RESETW (`ISSUE_WIDTH)
@ -166,23 +164,23 @@ module VX_commit import VX_gpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({(commit_fire & commit_eop), commit_wid}),
.data_in ({committed, commit_wid}),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
);
// Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
assign writeback_if[i].data.PC = commit_if[i].data.PC;
assign writeback_if[i].data.tmask = commit_if[i].data.tmask;
assign writeback_if[i].data.rd = commit_if[i].data.rd;
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
assign writeback_if[i].data.PC = commit_if[i].data.PC;
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
assign writeback_if[i].data.rd = commit_if[i].data.rd;
assign writeback_if[i].data.data = commit_if[i].data.data;
assign writeback_if[i].data.sop = commit_if[i].data.sop;
assign writeback_if[i].data.eop = commit_if[i].data.eop;
assign commit_if[i].ready = 1'b1;
assign writeback_if[i].data.sop = commit_if[i].data.sop;
assign writeback_if[i].data.eop = commit_if[i].data.eop;
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
end
// simulation helper signal to get RISC-V tests Pass/Fail status

View file

@ -116,7 +116,11 @@ module VX_core import VX_gpu_pkg::*; #(
.CORE_ID (CORE_ID)
) schedule (
.clk (clk),
.reset (schedule_reset),
.reset (schedule_reset),
`ifdef PERF_ENABLE
.perf_schedule_if (pipeline_perf_if.schedule),
`endif
.base_dcrs (base_dcrs),

View file

@ -179,14 +179,18 @@ import VX_fpu_pkg::*;
default: begin
read_addr_valid_r = 0;
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32))
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
read_addr_valid_r = 1;
`ifdef PERF_ENABLE
case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
// PERF: pipeline
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0];
`VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
@ -204,6 +208,19 @@ import VX_fpu_pkg::*;
`endif
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0];
`ifdef EXT_F_ENABLE
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0];
`else
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`endif
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
// PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
@ -214,7 +231,7 @@ import VX_fpu_pkg::*;
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
default:;
endcase
end
@ -225,6 +242,8 @@ import VX_fpu_pkg::*;
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
`VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);

View file

@ -14,10 +14,10 @@
`include "VX_platform.vh"
module VX_ipdom_stack #(
parameter WIDTH = 1,
parameter DEPTH = 1,
parameter WIDTH = 1,
parameter DEPTH = 1,
parameter OUT_REG = 0,
parameter ADDRW = `LOG2UP(DEPTH)
parameter ADDRW = `LOG2UP(DEPTH)
) (
input wire clk,
input wire reset,

View file

@ -59,6 +59,10 @@ module VX_issue #(
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_scb_uses (perf_issue_if.scb_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
@ -152,29 +156,17 @@ module VX_issue #(
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
reg [`ISSUE_WIDTH-1:0] scoreboard_stalls;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
end
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
perf_scb_stalls <= '0;
end else begin
if (decode_if.valid && ~decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
end
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
end
end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
assign perf_issue_if.scb_stalls = perf_scb_stalls;
`endif
endmodule

View file

@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.schedule perf_schedule_if,
`endif
// configuration
input base_dcrs_t base_dcrs,
@ -376,4 +380,21 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls;
always @(posedge clk) begin
if (reset) begin
perf_sched_stalls <= '0;
perf_fetch_stalls <= '0;
end else begin
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(!schedule_valid);
perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_if.valid && !schedule_if.ready);
end
end
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
assign perf_schedule_if.fetch_stalls = perf_fetch_stalls;
`endif
endmodule

View file

@ -19,6 +19,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS],
`endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
@ -26,81 +31,102 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
`ifdef PERF_ENABLE
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_alu_per_cycle;
`ifdef EXT_F_ENABLE
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_fpu_per_cycle;
`endif
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_lsu_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_sfu_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
reg [`EX_BITS-1:0][`ISSUE_WIDTH-1:0] scoreboard_uses;
wire [`ISSUE_WIDTH-1:0] scoreboard_stalls;
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
`POP_COUNT(scoreboard_alu_per_cycle, scoreboard_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`POP_COUNT(scoreboard_fpu_per_cycle, scoreboard_uses[`EX_FPU]);
`endif
`POP_COUNT(scoreboard_lsu_per_cycle, scoreboard_uses[`EX_LSU]);
`POP_COUNT(scoreboard_sfu_per_cycle, scoreboard_uses[`EX_SFU]);
`endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
reg [3:0] ready_masks, ready_masks_n;
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
VX_ibuffer_if staging_if();
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
`ifdef PERF_ENABLE
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units;
always @(*) begin
inuse_regs_n = inuse_regs;
ready_masks_n = ready_masks;
if (writeback_fire) begin
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0;
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}}
& {(writeback_if[i].data.rd == staging_if.data.rd),
(writeback_if[i].data.rd == staging_if.data.rs1),
(writeback_if[i].data.rd == staging_if.data.rs2),
(writeback_if[i].data.rd == staging_if.data.rs3)};
end
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1;
ready_masks_n = '0;
scoreboard_uses = '0;
if (ibuffer_if[i].valid) begin
if (inuse_rd) begin
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]][i] = 1;
end
if (inuse_rs1) begin
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]][i] = 1;
end
if (inuse_rs2) begin
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]][i] = 1;
end
if (inuse_rs3) begin
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]][i] = 1;
end
end
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd],
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1],
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2],
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]};
end
end
end
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
`endif
reg [DATAW-1:0] data_out_r;
reg valid_out_r;
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
wire deps_ready = (& ready_masks);
always @(posedge clk) begin
if (reset) begin
inuse_regs <= '0;
ready_masks <= '0;
end else begin
inuse_regs <= inuse_regs_n;
ready_masks <= ready_masks_n;
valid_out_r <= 0;
inuse_regs <= '0;
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
end
if (~valid_out_r) begin
valid_out_r <= ibuffer_if[i].valid && deps_ready;
end else if (staging_if.ready) begin
if (staging_if.data.wb) begin
inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1;
`ifdef PERF_ENABLE
inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type;
`endif
end
valid_out_r <= 0;
end
end
if (~valid_out_r) begin
data_out_r <= ibuffer_if[i].data;
end
end
// staging buffer
`RESET_RELAY (stg_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (ibuffer_if[i].valid),
.ready_in (ibuffer_if[i].ready),
.data_in (ibuffer_if[i].data),
.data_out (staging_if.data),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
// output buffer
wire valid_stg, ready_stg;
wire regs_ready = (& ready_masks);
assign valid_stg = staging_if.valid && regs_ready;
assign staging_if.ready = ready_stg && regs_ready;
`RESET_RELAY (out_buf_reset, reset);
assign ibuffer_if[i].ready = ~valid_out_r && deps_ready;
assign staging_if.valid = valid_out_r;
assign staging_if.data = data_out_r;
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.SIZE (0),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_stg),
.ready_in (ready_stg),
.reset (reset),
.valid_in (staging_if.valid),
.ready_in (staging_if.ready),
.data_in (staging_if.data),
.data_out (scoreboard_if[i].data),
.valid_out (scoreboard_if[i].valid),
@ -108,29 +134,29 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
);
`ifdef SIMULATION
reg [31:0] timeout_ctr;
reg [31:0] timeout_ctr;
always @(posedge clk) begin
if (reset) begin
timeout_ctr <= '0;
end else begin
if (staging_if.valid && ~regs_ready) begin
if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
`ifdef DBG_TRACE_CORE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid));
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, ibuffer_if[i].data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (staging_if.valid && staging_if.ready) begin
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
timeout_ctr <= '0;
end
end
end
end
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
~ready_masks, staging_if.data.uuid));
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, ibuffer_if[i].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
@ -139,4 +165,26 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`ifdef PERF_ENABLE
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
perf_scb_uses[`EX_ALU] <= '0;
`ifdef EXT_F_ENABLE
perf_scb_uses[`EX_FPU] <= '0;
`endif
perf_scb_uses[`EX_LSU] <= '0;
perf_scb_uses[`EX_SFU] <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
perf_scb_uses[`EX_ALU] <= perf_scb_uses[`EX_ALU] + `PERF_CTR_BITS'(scoreboard_alu_per_cycle);
`ifdef EXT_F_ENABLE
perf_scb_uses[`EX_FPU] <= perf_scb_uses[`EX_FPU] + `PERF_CTR_BITS'(scoreboard_fpu_per_cycle);
`endif
perf_scb_uses[`EX_LSU] <= perf_scb_uses[`EX_LSU] + `PERF_CTR_BITS'(scoreboard_lsu_per_cycle);
perf_scb_uses[`EX_SFU] <= perf_scb_uses[`EX_SFU] + `PERF_CTR_BITS'(scoreboard_sfu_per_cycle);
end
end
`endif
endmodule

View file

@ -14,8 +14,11 @@
`include "VX_define.vh"
interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] sched_stalls;
wire [`PERF_CTR_BITS-1:0] fetch_stalls;
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] ifetches;
@ -24,15 +27,24 @@ interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency;
modport schedule (
output sched_stalls,
output fetch_stalls
);
modport issue (
output ibf_stalls,
output scb_stalls,
output scb_uses,
output dsp_stalls
);
);
modport slave (
input sched_stalls,
input fetch_stalls,
input ibf_stalls,
input scb_stalls,
input scb_uses,
input dsp_stalls,
input ifetches,
input loads,

View file

@ -21,8 +21,8 @@ module VX_avs_adapter #(
parameter NUM_BANKS = 1,
parameter TAG_WIDTH = 1,
parameter RD_QUEUE_SIZE = 1,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0
) (
input wire clk,
input wire reset,

View file

@ -20,7 +20,7 @@ module VX_axi_adapter #(
parameter TAG_WIDTH = 8,
parameter NUM_BANKS = 1,
parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)),
parameter OUT_REG_RSP = 0
parameter OUT_REG_RSP = 0
) (
input wire clk,
input wire reset,

View file

@ -21,8 +21,8 @@ module VX_mem_adapter #(
parameter DST_ADDR_WIDTH = 1,
parameter SRC_TAG_WIDTH = 1,
parameter DST_TAG_WIDTH = 1,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0
) (
input wire clk,
input wire reset,

View file

@ -21,7 +21,7 @@ module VX_stream_arb #(
parameter `STRING ARBITER = "P",
parameter LOCK_ENABLE = 1,
parameter MAX_FANOUT = `MAX_FANOUT,
parameter OUT_REG = 0 ,
parameter OUT_REG = 0 ,
parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS,
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)

View file

@ -173,8 +173,8 @@ module VX_stream_xbar #(
end
// compute inputs collision
// we have a collision when there exists a valid transfer with mutiple input candicates
// we caount the unique duplicates each cycle.
// we have a collision when there exists a valid transfer with multiple input candicates
// we count the unique duplicates each cycle.
reg [PERF_CTR_BITS-1:0] collisions_r;
reg [NUM_INPUTS-1:0] per_cycle_collision;

View file

@ -15,7 +15,7 @@
module VX_gbar_arb #(
parameter NUM_REQS = 1,
parameter OUT_REG = 0,
parameter OUT_REG = 0,
parameter `STRING ARBITER = "R"
) (
input wire clk,

View file

@ -21,8 +21,8 @@ module VX_mem_arb #(
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
parameter TAG_WIDTH = 1,
parameter TAG_SEL_IDX = 0,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0,
parameter `STRING ARBITER = "R"
) (
input wire clk,

View file

@ -19,8 +19,8 @@ module VX_smem_switch #(
parameter TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter TAG_SEL_IDX = 0,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0,
parameter OUT_REG_REQ = 0,
parameter OUT_REG_RSP = 0,
parameter `STRING ARBITER = "R"
) (
input wire clk,

View file

@ -186,27 +186,31 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
return int((1.0 - (double(part) / double(total))) * 100);
};
auto caclAvgLatency = [&](uint64_t sum, uint64_t requests)->int {
if (requests == 0)
auto caclAverage = [&](uint64_t part, uint64_t total)->double {
if (total == 0)
return 0;
return int(double(sum) / double(requests));
return double(part) / double(total);
};
auto calcUtilization = [&](uint64_t count, uint64_t stalls)->int {
if (count == 0)
return 0;
return int((double(count) / double(count + stalls)) * 100);
auto calcAvgPercent = [&](uint64_t part, uint64_t total)->int {
return int(caclAverage(part, total) * 100);
};
auto perf_class = gAutoPerfDump.get_perf_class();
// PERF: pipeline stalls
uint64_t scheduler_stalls = 0;
uint64_t fetch_stalls = 0;
uint64_t ibuffer_stalls = 0;
uint64_t scoreboard_stalls = 0;
uint64_t scrb_stalls = 0;
uint64_t lsu_stalls = 0;
uint64_t fpu_stalls = 0;
uint64_t alu_stalls = 0;
uint64_t sfu_stalls = 0;
uint64_t sfu_stalls = 0;
uint64_t scrb_alu = 0;
uint64_t scrb_fpu = 0;
uint64_t scrb_lsu = 0;
uint64_t scrb_sfu = 0;
uint64_t ifetches = 0;
uint64_t loads = 0;
uint64_t stores = 0;
@ -251,76 +255,121 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
#endif
std::vector<uint8_t> staging_buf(64* sizeof(uint32_t));
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size();
ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
if (ret != 0)
return ret;
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
#ifdef PERF_ENABLE
switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: {
// PERF: pipeline
// ibuffer_stall
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core);
ibuffer_stalls += ibuffer_stalls_per_core;
// scoreboard_stall
uint64_t scoreboard_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core);
scoreboard_stalls += scoreboard_stalls_per_core;
// alu_stall
uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
alu_stalls += alu_stalls_per_core;
// lsu_stall
uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
lsu_stalls += lsu_stalls_per_core;
// fpu_stall
uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
fpu_stalls += fpu_stalls_per_core;
// sfu_stall
uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
sfu_stalls += sfu_stalls_per_core;
// schedule stalls
{
uint64_t scheduler_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
int scheduler_percent_per_core = calcAvgPercent(scheduler_stalls_per_core, cycles_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: schedule stalls=%ld (%d%%)\n", core_id, scheduler_stalls_per_core, scheduler_percent_per_core);
scheduler_stalls += scheduler_stalls_per_core;
}
// fetch stalls
{
uint64_t fetch_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FETCH_ST);
int fetch_percent_per_core = calcAvgPercent(fetch_stalls_per_core, cycles_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch stalls=%ld (%d%%)\n", core_id, fetch_stalls_per_core, fetch_percent_per_core);
fetch_stalls += fetch_stalls_per_core;
}
// ibuffer_stalls
{
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
ibuffer_stalls += ibuffer_stalls_per_core;
}
// scrb_stalls
{
uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU);
uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU);
uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU);
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
scrb_alu += scrb_alu_per_core;
scrb_fpu += scrb_fpu_per_core;
scrb_lsu += scrb_lsu_per_core;
scrb_sfu += scrb_sfu_per_core;
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
calcAvgPercent(scrb_alu_per_core, scrb_total),
calcAvgPercent(scrb_fpu_per_core, scrb_total),
calcAvgPercent(scrb_lsu_per_core, scrb_total),
calcAvgPercent(scrb_sfu_per_core, scrb_total));
scrb_stalls += scrb_stalls_per_core;
}
// alu_stalls
{
uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
alu_stalls += alu_stalls_per_core;
}
// lsu_stalls
{
uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
lsu_stalls += lsu_stalls_per_core;
}
// fpu_stalls
{
uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
fpu_stalls += fpu_stalls_per_core;
}
// sfu_stalls
{
uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
sfu_stalls += sfu_stalls_per_core;
}
// PERF: memory
// ifetches
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
ifetches += ifetches_per_core;
{
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
ifetches += ifetches_per_core;
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
ifetch_lat += ifetch_lat_per_core;
}
// loads
uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
loads += loads_per_core;
{
uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
loads += loads_per_core;
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
load_lat += load_lat_per_core;
}
// stores
uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
stores += stores_per_core;
// ifetch latency
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
if (num_cores > 1) {
int mem_avg_lat = caclAvgLatency(ifetch_lat_per_core, ifetches_per_core);
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
{
uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
stores += stores_per_core;
}
ifetch_lat += ifetch_lat_per_core;
// load latency
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
if (num_cores > 1) {
int mem_avg_lat = caclAvgLatency(load_lat_per_core, loads_per_core);
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
}
load_lat += load_lat_per_core;
} break;
case VX_DCR_MPM_CLASS_MEM: {
case VX_DCR_MPM_CLASS_MEM: {
if (smem_enable) {
// PERF: smem
uint64_t smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS);
uint64_t smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES);
uint64_t smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST);
int smem_bank_utilization = calcUtilization(smem_reads + smem_writes, smem_bank_stalls);
int smem_bank_utilization = calcAvgPercent(smem_reads + smem_writes, smem_reads + smem_writes + smem_bank_stalls);
fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads);
fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes);
fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_stalls, smem_bank_utilization);
@ -330,9 +379,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
// PERF: Icache
uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
uint64_t icache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MSHR_ST);
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls);
fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads);
fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio);
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
}
if (dcache_enable) {
@ -345,13 +397,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
int dcache_bank_utilization = calcUtilization(dcache_reads + dcache_writes, dcache_bank_stalls);
int dcache_bank_utilization = calcAvgPercent(dcache_reads + dcache_writes, dcache_reads + dcache_writes + dcache_bank_stalls);
int mshr_utilization = calcAvgPercent(dcache_read_misses + dcache_write_misses, dcache_read_misses + dcache_write_misses + dcache_mshr_stalls);
fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads);
fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes);
fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio);
fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio);
fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization);
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_stalls);
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
}
if (l2cache_enable) {
@ -386,8 +439,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
#endif
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
instrs += instrs_per_core;
@ -397,10 +448,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
#ifdef PERF_ENABLE
switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: {
int scheduler_percent = calcAvgPercent(scheduler_stalls, cycles);
int fetch_percent = calcAvgPercent(fetch_stalls, cycles);
int ibuffer_percent = calcAvgPercent(ibuffer_stalls, cycles);
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
int load_avg_lat = (int)(double(load_lat) / double(loads));
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", scheduler_stalls, scheduler_percent);
fprintf(stream, "PERF: fetch stalls=%ld (%d%%)\n", fetch_stalls, fetch_percent);
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
calcAvgPercent(scrb_alu, scrb_total),
calcAvgPercent(scrb_fpu, scrb_total),
calcAvgPercent(scrb_lsu, scrb_total),
calcAvgPercent(scrb_sfu, scrb_total));
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
@ -419,31 +480,32 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
l2cache_write_misses /= num_cores;
l2cache_bank_stalls /= num_cores;
l2cache_mshr_stalls /= num_cores;
int l2cache_read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads);
int l2cache_write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes);
int l2cache_bank_utilization = calcUtilization(l2cache_reads + l2cache_writes, l2cache_bank_stalls);
int read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads);
int write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes);
int bank_utilization = calcAvgPercent(l2cache_reads + l2cache_writes, l2cache_reads + l2cache_writes + l2cache_bank_stalls);
int mshr_utilization = calcAvgPercent(l2cache_read_misses + l2cache_write_misses, l2cache_read_misses + l2cache_write_misses + l2cache_mshr_stalls);
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, read_hit_ratio);
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, write_hit_ratio);
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, bank_utilization);
fprintf(stream, "PERF: l2cache mshr stalls=%ld (utilization=%d%%)\n", l2cache_mshr_stalls, mshr_utilization);
}
if (l3cache_enable) {
int l3cache_read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
int l3cache_write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
int l3cache_bank_utilization = calcUtilization(l3cache_reads + l3cache_writes, l3cache_bank_stalls);
int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio);
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, bank_utilization);
fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization);
}
int mem_avg_lat = caclAvgLatency(mem_lat, mem_reads);
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
} break;

View file

@ -45,20 +45,20 @@ public:
char sname[100];
std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
std::vector<MemSwitch::Ptr> unit_arbs(num_units);
for (uint32_t u = 0; u < num_units; ++u) {
snprintf(sname, 100, "%s-unit-arb-%d", name, u);
unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
for (uint32_t i = 0; i < num_requests; ++i) {
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
}
}
std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
std::vector<MemSwitch::Ptr> mem_arbs(config.num_inputs);
for (uint32_t i = 0; i < config.num_inputs; ++i) {
snprintf(sname, 100, "%s-mem-arb-%d", name, i);
mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
for (uint32_t u = 0; u < num_units; ++u) {
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
@ -66,7 +66,7 @@ public:
}
snprintf(sname, 100, "%s-cache-arb", name);
auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
for (uint32_t i = 0; i < num_caches; ++i) {
snprintf(sname, 100, "%s-cache%d", name, i);

View file

@ -41,19 +41,16 @@ struct params_t {
uint32_t tag_select_addr_end;
params_t(const CacheSim::Config& config) {
int32_t bank_bits = log2ceil(config.num_banks);
int32_t offset_bits = config.B - config.W;
int32_t log2_bank_size = config.C - bank_bits;
int32_t index_bits = log2_bank_size - (config.B + config.A);
assert(log2_bank_size > 0);
int32_t offset_bits = config.L - config.W;
int32_t index_bits = config.C - (config.L + config.A + config.B);
assert(offset_bits >= 0);
assert(index_bits >= 0);
this->log2_num_inputs = log2ceil(config.num_inputs);
this->words_per_line = 1 << offset_bits;
this->sets_per_bank = 1 << index_bits;
this->lines_per_set = 1 << config.A;
this->sets_per_bank = 1 << index_bits;
this->words_per_line = 1 << offset_bits;
assert(config.ports_per_bank <= this->words_per_line);
@ -63,7 +60,7 @@ struct params_t {
// Bank select
this->bank_select_addr_start = (1+this->word_select_addr_end);
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
this->bank_select_addr_end = (this->bank_select_addr_start+config.B-1);
// Set select
this->set_select_addr_start = (1+this->bank_select_addr_end);
@ -74,23 +71,23 @@ struct params_t {
this->tag_select_addr_end = (config.addr_width-1);
}
uint32_t addr_bank_id(uint64_t word_addr) const {
uint32_t addr_bank_id(uint64_t addr) const {
if (bank_select_addr_end >= bank_select_addr_start)
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
return (uint32_t)bit_getw(addr, bank_select_addr_start, bank_select_addr_end);
else
return 0;
}
uint32_t addr_set_id(uint64_t word_addr) const {
uint32_t addr_set_id(uint64_t addr) const {
if (set_select_addr_end >= set_select_addr_start)
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
return (uint32_t)bit_getw(addr, set_select_addr_start, set_select_addr_end);
else
return 0;
}
uint64_t addr_tag(uint64_t word_addr) const {
uint64_t addr_tag(uint64_t addr) const {
if (tag_select_addr_end >= tag_select_addr_start)
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
return bit_getw(addr, tag_select_addr_start, tag_select_addr_end);
else
return 0;
}
@ -288,8 +285,8 @@ private:
Config config_;
params_t params_;
std::vector<bank_t> banks_;
Switch<MemReq, MemRsp>::Ptr bank_switch_;
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
MemSwitch::Ptr bank_switch_;
MemSwitch::Ptr bypass_switch_;
std::vector<SimPort<MemReq>> mem_req_ports_;
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
std::vector<bank_req_t> pipeline_reqs_;
@ -304,16 +301,16 @@ public:
: simobject_(simobject)
, config_(config)
, params_(config)
, banks_(config.num_banks, {config, params_})
, mem_req_ports_(config.num_banks, simobject)
, mem_rsp_ports_(config.num_banks, simobject)
, pipeline_reqs_(config.num_banks, config.ports_per_bank)
, banks_((1 << config.B), {config, params_})
, mem_req_ports_((1 << config.B), simobject)
, mem_rsp_ports_((1 << config.B), simobject)
, pipeline_reqs_((1 << config.B), config.ports_per_bank)
{
char sname[100];
snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
if (config_.bypass) {
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
for (uint32_t i = 0; i < config_.num_inputs; ++i) {
simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
@ -323,14 +320,14 @@ public:
return;
}
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2);
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
if (config.num_banks > 1) {
if (config.B != 0) {
snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B));
for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) {
mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
}
@ -383,20 +380,22 @@ public:
pipeline_req.clear();
}
// schedule MSHR replay
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
// first: schedule MSHR replay (flush MSHR queue)
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
bank.mshr.pop(&pipeline_req);
}
// schedule memory fill
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
// second: schedule memory fill (flush memory queue)
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
if (mem_rsp_port.empty())
continue;
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// skip if bank already busy
if (pipeline_req.type != bank_req_t::None)
continue;
@ -407,7 +406,7 @@ public:
mem_rsp_port.pop();
}
// schedule core requests
// last: schedule core requests (flush core queue)
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
if (core_req_port.empty())
@ -425,18 +424,21 @@ public:
}
auto bank_id = params_.addr_bank_id(core_req.addr);
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
auto& bank = banks_.at(bank_id);
auto& pipeline_req = pipeline_reqs_.at(bank_id);
// skip if bank already busy
if (pipeline_req.type != bank_req_t::None)
continue;
auto set_id = params_.addr_set_id(core_req.addr);
auto tag = params_.addr_tag(core_req.addr);
auto port_id = req_id % config_.ports_per_bank;
// check MSHR capacity
if ((!core_req.write || !config_.write_through)
&& bank.mshr.full()) {
++perf_stats_.mshr_stalls;
++perf_stats_.bank_stalls;
continue;
}
@ -452,7 +454,7 @@ public:
}
// extend request ports
pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
} else if (pipeline_req.type == bank_req_t::None) {
} else {
// schedule new request
bank_req_t bank_req(config_.ports_per_bank);
bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
@ -463,10 +465,6 @@ public:
bank_req.type = bank_req_t::Core;
bank_req.write = core_req.write;
pipeline_req = bank_req;
} else {
// bank in use
++perf_stats_.bank_stalls;
continue;
}
if (core_req.write)
@ -516,7 +514,7 @@ private:
}
void processBankRequests() {
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
auto& bank = banks_.at(bank_id);
auto pipeline_req = pipeline_reqs_.at(bank_id);
@ -545,11 +543,10 @@ private:
}
}
} break;
case bank_req_t::Core: {
bool hit = false;
bool found_free_line = false;
uint32_t hit_line_id = 0;
uint32_t repl_line_id = 0;
case bank_req_t::Core: {
int32_t hit_line_id = -1;
int32_t free_line_id = -1;
int32_t repl_line_id = 0;
uint32_t max_cnt = 0;
auto& set = bank.sets.at(pipeline_req.set_id);
@ -557,38 +554,34 @@ private:
// tag lookup
for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
auto& line = set.lines.at(i);
if (max_cnt < line.lru_ctr) {
max_cnt = line.lru_ctr;
repl_line_id = i;
}
if (line.valid) {
if (line.tag == pipeline_req.tag) {
line.lru_ctr = 0;
if (line.tag == pipeline_req.tag) {
hit_line_id = i;
hit = true;
line.lru_ctr = 0;
} else {
++line.lru_ctr;
}
if (max_cnt < line.lru_ctr) {
max_cnt = line.lru_ctr;
repl_line_id = i;
}
} else {
found_free_line = true;
repl_line_id = i;
free_line_id = i;
}
}
if (hit) {
//
// Hit handling
//
if (hit_line_id != -1) {
// Hit handling
if (pipeline_req.write) {
// handle write hit
// handle write has_hit
auto& hit_line = set.lines.at(hit_line_id);
if (config_.write_through) {
// forward write request to memory
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
} else {
@ -606,23 +599,21 @@ private:
DT(3, simobject_->name() << "-core-" << core_rsp);
}
}
} else {
//
// Miss handling
//
} else {
// Miss handling
if (pipeline_req.write)
++perf_stats_.write_misses;
else
++perf_stats_.read_misses;
if (!found_free_line && !config_.write_through) {
if (free_line_id == -1 && !config_.write_through) {
// write back dirty line
auto& repl_line = set.lines.at(repl_line_id);
if (repl_line.dirty) {
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.cid = pipeline_req.cid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++perf_stats_.evictions;
@ -635,8 +626,8 @@ private:
MemReq mem_req;
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = true;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
}
@ -655,7 +646,7 @@ private:
auto mshr_pending = bank.mshr.lookup(pipeline_req);
// allocate MSHR
auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id);
// send fill request
if (!mshr_pending) {
@ -663,8 +654,8 @@ private:
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
mem_req.write = false;
mem_req.tag = mshr_id;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req.cid = pipeline_req.cid;
mem_req.uuid = pipeline_req.uuid;
mem_req_ports_.at(bank_id).send(mem_req, 1);
DT(3, simobject_->name() << "-dram-" << mem_req);
++pending_fill_reqs_;

View file

@ -23,16 +23,15 @@ public:
struct Config {
bool bypass; // cache bypass
uint8_t C; // log2 cache size
uint8_t B; // log2 block size
uint8_t L; // log2 line size
uint8_t W; // log2 word size
uint8_t A; // log2 associativity
uint8_t addr_width; // word address bits
uint8_t num_banks; // number of banks
uint8_t B; // log2 number of banks
uint8_t addr_width; // word address bits
uint8_t ports_per_bank; // number of ports per bank
uint8_t num_inputs; // number of inputs
bool write_through; // is write-through
bool write_reponse; // enable write response
uint16_t victim_size; // victim cache size
uint16_t mshr_size; // MSHR buffer size
uint8_t latency; // pipeline latency
};

View file

@ -36,16 +36,15 @@ Cluster::Cluster(const SimContext& ctx,
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
!L2_ENABLED,
log2ceil(L2_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(MEM_BLOCK_SIZE), // L
log2ceil(L2_NUM_WAYS), // W
0, // A
log2ceil(L2_NUM_BANKS), // B
XLEN, // address bits
L2_NUM_BANKS, // number of banks
1, // number of ports
5, // request size
true, // write-through
false, // write response
0, // victim size
L2_MSHR_SIZE, // mshr
2, // pipeline latency
});
@ -57,16 +56,15 @@ Cluster::Cluster(const SimContext& ctx,
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
!ICACHE_ENABLED,
log2ceil(ICACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(L1_LINE_SIZE), // L
log2ceil(sizeof(uint32_t)), // W
log2ceil(ICACHE_NUM_WAYS),// A
XLEN, // address bits
1, // number of banks
1, // B
XLEN, // address bits
1, // number of ports
1, // number of inputs
true, // write-through
false, // write response
0, // victim size
(uint8_t)arch.num_warps(), // mshr
2, // pipeline latency
});
@ -78,16 +76,15 @@ Cluster::Cluster(const SimContext& ctx,
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
!DCACHE_ENABLED,
log2ceil(DCACHE_SIZE), // C
log2ceil(L1_LINE_SIZE), // B
log2ceil(L1_LINE_SIZE), // L
log2ceil(sizeof(Word)), // W
log2ceil(DCACHE_NUM_WAYS),// A
XLEN, // address bits
DCACHE_NUM_BANKS, // number of banks
log2ceil(DCACHE_NUM_BANKS), // B
XLEN, // address bits
1, // number of ports
DCACHE_NUM_BANKS, // number of inputs
true, // write-through
false, // write response
0, // victim size
DCACHE_MSHR_SIZE, // mshr
4, // pipeline latency
});
@ -129,11 +126,11 @@ Cluster::Cluster(const SimContext& ctx,
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
smem_demux->ReqDC.bind(&dcaches_->CoreReqPorts.at(i).at(j));
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDC);
smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
smem_demux->ReqSM.bind(&sharedmems_.at(i)->Inputs.at(j));
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSM);
}
}
}

View file

@ -45,19 +45,21 @@ Core::Core(const SimContext& ctx,
, warps_(arch.num_warps())
, barriers_(arch.num_barriers(), 0)
, fcsrs_(arch.num_warps(), 0)
, ibuffers_(ISSUE_WIDTH, IBUF_SIZE)
, ibuffers_(arch.num_warps(), IBUF_SIZE)
, scoreboard_(arch_)
, operands_(ISSUE_WIDTH)
, dispatchers_((uint32_t)ExeType::MAX)
, exe_units_((uint32_t)ExeType::MAX)
, dispatchers_((uint32_t)ExeType::ExeTypeCount)
, exe_units_((uint32_t)ExeType::ExeTypeCount)
, sharedmem_(sharedmem)
, fetch_latch_("fetch")
, decode_latch_("decode")
, pending_icache_(arch_.num_warps())
, committed_traces_(ISSUE_WIDTH, nullptr)
, csrs_(arch.num_warps())
, cluster_(cluster)
{
, commit_arbs_(ISSUE_WIDTH)
{
char sname[100];
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
csrs_.at(i).resize(arch.num_threads());
}
@ -82,6 +84,16 @@ Core::Core(const SimContext& ctx,
exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
exe_units_.at((int)ExeType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
// bind commit arbiters
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
snprintf(sname, 100, "commit-arb%d", i);
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
}
commit_arbs_.at(i) = arbiter;
}
this->reset();
}
@ -99,8 +111,12 @@ void Core::reset() {
for (auto& exe_unit : exe_units_) {
exe_unit->reset();
}
for (auto& commit_arb : commit_arbs_) {
commit_arb->reset();
}
for ( auto& barrier : barriers_) {
for (auto& barrier : barriers_) {
barrier.reset();
}
@ -112,7 +128,7 @@ void Core::reset() {
ibuf.clear();
}
commit_exe_= 0;
ibuffer_idx_ = 0;
scoreboard_.clear();
fetch_latch_.clear();
@ -150,8 +166,10 @@ void Core::schedule() {
break;
}
}
if (scheduled_warp == -1)
if (scheduled_warp == -1) {
++perf_stats_.sched_stalls;
return;
}
// suspend warp until decode
stalled_warps_.set(scheduled_warp);
@ -192,11 +210,11 @@ void Core::fetch() {
mem_req.tag = pending_icache_.allocate(trace);
mem_req.cid = trace->cid;
mem_req.uuid = trace->uuid;
icache_req_ports.at(0).send(mem_req, 1);
icache_req_ports.at(0).send(mem_req, 2);
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
fetch_latch_.pop();
++pending_ifetches_;
fetch_latch_.pop();
++perf_stats_.ifetches;
++pending_ifetches_;
}
void Core::decode() {
@ -206,7 +224,7 @@ void Core::decode() {
auto trace = decode_latch_.front();
// check ibuffer capacity
auto& ibuffer = ibuffers_.at(trace->wid % ISSUE_WIDTH);
auto& ibuffer = ibuffers_.at(trace->wid);
if (ibuffer.full()) {
if (!trace->log_once(true)) {
DT(3, "*** ibuffer-stall: " << *trace);
@ -239,7 +257,7 @@ void Core::decode() {
}
void Core::issue() {
// operands to dispatch
// operands to dispatchers
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& operand = operands_.at(i);
if (operand->Output.empty())
@ -257,7 +275,8 @@ void Core::issue() {
// issue ibuffer instructions
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto& ibuffer = ibuffers_.at(i);
uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size();
auto& ibuffer = ibuffers_.at(ii);
if (ibuffer.empty())
continue;
@ -265,17 +284,27 @@ void Core::issue() {
// check scoreboard
if (scoreboard_.in_use(trace)) {
auto uses = scoreboard_.get_uses(trace);
if (!trace->log_once(true)) {
DTH(3, "*** scoreboard-stall: dependents={");
auto uses = scoreboard_.get_uses(trace);
DTH(3, "*** scoreboard-stall: dependents={");
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
__unused (use);
if (j) DTN(3, ", ");
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
DTN(3, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
}
DTN(3, "}, " << *trace << std::endl);
}
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
auto& use = uses.at(j);
switch (use.exe_type) {
case ExeType::ALU: ++perf_stats_.scrb_alu; break;
case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
case ExeType::SFU: ++perf_stats_.scrb_sfu; break;
default: assert(false);
}
}
++perf_stats_.scrb_stalls;
continue;
} else {
@ -294,10 +323,11 @@ void Core::issue() {
ibuffer.pop();
}
ibuffer_idx_ += ISSUE_WIDTH;
}
void Core::execute() {
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
for (uint32_t i = 0; i < (uint32_t)ExeType::ExeTypeCount; ++i) {
auto& dispatch = dispatchers_.at(i);
auto& exe_unit = exe_units_.at(i);
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
@ -313,10 +343,11 @@ void Core::execute() {
void Core::commit() {
// process completed instructions
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
auto trace = committed_traces_.at(i);
if (!trace)
auto& commit_arb = commit_arbs_.at(i);
if (commit_arb->Outputs.at(0).empty())
continue;
committed_traces_.at(i) = nullptr;
auto trace = commit_arb->Outputs.at(0).front();
// advance to commit stage
DT(3, "pipeline-commit: " << *trace);
@ -334,27 +365,11 @@ void Core::commit() {
perf_stats_.instrs += trace->tmask.count();
}
commit_arb->Outputs.at(0).pop();
// delete the trace
delete trace;
}
// select completed instructions
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
uint32_t ii = (commit_exe_ + i) % (uint32_t)ExeType::MAX;
auto& exe_unit = exe_units_.at(ii);
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
auto committed_trace = committed_traces_.at(j);
if (committed_trace)
continue;
auto& output = exe_unit->Outputs.at(j);
if (output.empty())
continue;
auto trace = output.front();
committed_traces_.at(j) = trace;
output.pop();
}
}
++commit_exe_;
}
void Core::wspawn(uint32_t num_warps, Word nextPC) {
@ -533,6 +548,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
break;
case VX_DCR_MPM_CLASS_CORE: {
switch (addr) {
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
case VX_CSR_MPM_FETCH_ST: return perf_stats_.fetch_stalls & 0xffffffff;
case VX_CSR_MPM_FETCH_ST_H:return perf_stats_.fetch_stalls >> 32;
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
@ -545,6 +564,14 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32;
case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff;
case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32;
case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32;
case VX_CSR_MPM_SCRB_LSU: return perf_stats_.scrb_lsu & 0xffffffff;
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
@ -561,30 +588,25 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_DCR_MPM_CLASS_MEM: {
auto proc_perf = cluster_->processor()->perf_stats();
switch (addr) {
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_MSHR_ST: return proc_perf.clusters.icache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return proc_perf.clusters.icache.mshr_stalls >> 32;
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H:return proc_perf.clusters.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H:return proc_perf.clusters.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff;
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
case VX_CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H: return proc_perf.clusters.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return proc_perf.clusters.dcache.mshr_stalls >> 32;
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
@ -612,12 +634,19 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H:return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff;
case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32;
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff;
case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32;
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff;
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
case VX_CSR_MPM_SMEM_BANK_ST_H: return proc_perf.clusters.sharedmem.bank_stalls >> 32;
}
} break;
}

View file

@ -22,11 +22,11 @@
#include <memory>
#include <set>
#include <simobject.h>
#include <mem.h>
#include "debug.h"
#include "types.h"
#include "arch.h"
#include "decode.h"
#include "mem.h"
#include "warp.h"
#include "pipeline.h"
#include "cache_sim.h"
@ -42,17 +42,25 @@ namespace vortex {
class Cluster;
using TraceSwitch = Mux<pipeline_trace_t*>;
class Core : public SimObject<Core> {
public:
struct PerfStats {
uint64_t cycles;
uint64_t instrs;
uint64_t sched_stalls;
uint64_t fetch_stalls;
uint64_t ibuf_stalls;
uint64_t scrb_stalls;
uint64_t alu_stalls;
uint64_t lsu_stalls;
uint64_t fpu_stalls;
uint64_t sfu_stalls;
uint64_t scrb_alu;
uint64_t scrb_fpu;
uint64_t scrb_lsu;
uint64_t scrb_sfu;
uint64_t ifetches;
uint64_t loads;
uint64_t stores;
@ -62,12 +70,18 @@ public:
PerfStats()
: cycles(0)
, instrs(0)
, sched_stalls(0)
, fetch_stalls(0)
, ibuf_stalls(0)
, scrb_stalls(0)
, alu_stalls(0)
, lsu_stalls(0)
, fpu_stalls(0)
, sfu_stalls(0)
, scrb_alu(0)
, scrb_fpu(0)
, scrb_lsu(0)
, scrb_sfu(0)
, ifetches(0)
, loads(0)
, stores(0)
@ -173,7 +187,6 @@ private:
PipelineLatch decode_latch_;
HashTable<pipeline_trace_t*> pending_icache_;
std::vector<pipeline_trace_t*> committed_traces_;
WarpMask active_warps_;
WarpMask stalled_warps_;
uint64_t issued_instrs_;
@ -190,7 +203,9 @@ private:
Cluster* cluster_;
uint32_t commit_exe_;
std::vector<TraceSwitch::Ptr> commit_arbs_;
uint32_t ibuffer_idx_;
friend class Warp;
friend class LsuUnit;

View file

@ -66,6 +66,7 @@ public:
}
auto& output = Outputs.at(i);
auto trace = input.front();
auto new_trace = trace;
if (pid_count_ != 1) {
auto start_p = start_p_.at(b);
if (start_p == -1) {
@ -81,33 +82,30 @@ public:
end = j;
}
start /= num_lanes_;
end /= num_lanes_;
auto new_trace = new pipeline_trace_t(*trace);
new_trace->tmask.reset();
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
new_trace->tmask[j] = trace->tmask[j];
}
new_trace->pid = start;
new_trace->sop = (start_p == 0);
if (start == end) {
new_trace->eop = 1;
end /= num_lanes_;
if (start != end) {
new_trace = new pipeline_trace_t(*trace);
new_trace->eop = false;
start_p_.at(b) = start + 1;
} else {
start_p_.at(b) = -1;
input.pop();
++block_sent;
delete trace;
} else {
new_trace->eop = 0;
start_p_.at(b) = start + 1;
}
output.send(new_trace, 1);
DT(3, "pipeline-dispatch: " << *new_trace);
}
new_trace->pid = start;
new_trace->sop = (0 == start_p);
ThreadMask tmask;
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
tmask[j] = trace->tmask[j];
}
new_trace->tmask = tmask;
} else {
trace->pid = 0;
new_trace->pid = 0;
input.pop();
output.send(trace, 1);
DT(3, "pipeline-dispatch: " << *trace);
++block_sent;
}
}
DT(3, "pipeline-dispatch: " << *new_trace);
output.send(new_trace, 1);
}
if (block_sent == block_size_) {
batch_idx_ = (batch_idx_ + 1) % batch_count_;
@ -138,4 +136,4 @@ private:
std::vector<int> start_p_;
};
}
}

View file

@ -32,18 +32,17 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
!L3_ENABLED,
log2ceil(L3_CACHE_SIZE), // C
log2ceil(MEM_BLOCK_SIZE), // B
log2ceil(L3_NUM_WAYS), // W
0, // A
XLEN, // address bits
L3_NUM_BANKS, // number of banks
1, // number of ports
log2ceil(MEM_BLOCK_SIZE), // L
log2ceil(L3_NUM_WAYS), // W
0, // A
log2ceil(L3_NUM_BANKS), // B
XLEN, // address bits
1, // number of ports
uint8_t(arch.num_clusters()), // request size
true, // write-through
false, // write response
0, // victim size
L3_MSHR_SIZE, // mshr
2, // pipeline latency
true, // write-through
false, // write response
L3_MSHR_SIZE, // mshr
2, // pipeline latency
}
);

View file

@ -22,9 +22,10 @@ class Scoreboard {
public:
struct reg_use_t {
RegType type;
uint32_t reg;
uint64_t owner;
RegType reg_type;
uint32_t reg_id;
ExeType exe_type;
uint64_t uuid;
};
Scoreboard(const Arch &arch)
@ -44,89 +45,81 @@ public:
owners_.clear();
}
bool in_use(pipeline_trace_t* state) const {
return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0
|| (state->used_fregs & in_use_fregs_.at(state->wid)) != 0
|| (state->used_vregs & in_use_vregs_.at(state->wid)) != 0;
bool in_use(pipeline_trace_t* trace) const {
return (trace->used_iregs & in_use_iregs_.at(trace->wid)) != 0
|| (trace->used_fregs & in_use_fregs_.at(trace->wid)) != 0
|| (trace->used_vregs & in_use_vregs_.at(trace->wid)) != 0;
}
std::vector<reg_use_t> get_uses(pipeline_trace_t* state) const {
std::vector<reg_use_t> out;
{
uint32_t r = 0;
auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid);
while (used_iregs.any()) {
if (used_iregs.test(0)) {
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer;
out.push_back({RegType::Integer, r, owners_.at(tag)});
}
used_iregs >>= 1;
++r;
std::vector<reg_use_t> get_uses(pipeline_trace_t* trace) const {
std::vector<reg_use_t> out;
auto used_iregs = trace->used_iregs & in_use_iregs_.at(trace->wid);
auto used_fregs = trace->used_fregs & in_use_fregs_.at(trace->wid);
auto used_vregs = trace->used_vregs & in_use_vregs_.at(trace->wid);
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
if (used_iregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
auto owner = owners_.at(tag);
out.push_back({RegType::Integer, r, owner->exe_type, owner->uuid});
}
}
{
uint32_t r = 0;
auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid);
while (used_fregs.any()) {
if (used_fregs.test(0)) {
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float;
out.push_back({RegType::Float, r, owners_.at(tag)});
}
used_fregs >>= 1;
++r;
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
if (used_fregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
auto owner = owners_.at(tag);
out.push_back({RegType::Float, r, owner->exe_type, owner->uuid});
}
}
{
uint32_t r = 0;
auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid);
while (used_vregs.any()) {
if (used_vregs.test(0)) {
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector;
out.push_back({RegType::Vector, r, owners_.at(tag)});
}
used_vregs >>= 1;
++r;
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
if (used_vregs.test(r)) {
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
auto owner = owners_.at(tag);
out.push_back({RegType::Vector, r, owner->exe_type, owner->uuid});
}
}
return out;
}
void reserve(pipeline_trace_t* state) {
assert(state->wb);
switch (state->rdest_type) {
void reserve(pipeline_trace_t* trace) {
assert(trace->wb);
switch (trace->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).set(state->rdest);
in_use_iregs_.at(trace->wid).set(trace->rdest);
break;
case RegType::Float:
in_use_fregs_.at(state->wid).set(state->rdest);
in_use_fregs_.at(trace->wid).set(trace->rdest);
break;
case RegType::Vector:
in_use_vregs_.at(state->wid).set(state->rdest);
break;
default:
in_use_vregs_.at(trace->wid).set(trace->rdest);
break;
default: assert(false);
}
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
assert(owners_.count(tag) == 0);
owners_[tag] = state->uuid;
owners_[tag] = trace;
assert((int)trace->exe_type < 5);
}
void release(pipeline_trace_t* state) {
assert(state->wb);
switch (state->rdest_type) {
void release(pipeline_trace_t* trace) {
assert(trace->wb);
switch (trace->rdest_type) {
case RegType::Integer:
in_use_iregs_.at(state->wid).reset(state->rdest);
in_use_iregs_.at(trace->wid).reset(trace->rdest);
break;
case RegType::Float:
in_use_fregs_.at(state->wid).reset(state->rdest);
in_use_fregs_.at(trace->wid).reset(trace->rdest);
break;
case RegType::Vector:
in_use_vregs_.at(state->wid).reset(state->rdest);
break;
default:
in_use_vregs_.at(trace->wid).reset(trace->rdest);
break;
default: assert(false);
}
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
owners_.erase(tag);
}
@ -135,7 +128,7 @@ private:
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::unordered_map<uint32_t, uint64_t> owners_;
std::unordered_map<uint32_t, pipeline_trace_t*> owners_;
};
}

View file

@ -81,7 +81,7 @@ enum class ExeType {
LSU,
FPU,
SFU,
MAX,
ExeTypeCount
};
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
@ -90,7 +90,7 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
case ExeType::LSU: os << "LSU"; break;
case ExeType::FPU: os << "FPU"; break;
case ExeType::SFU: os << "SFU"; break;
case ExeType::MAX: break;
default: assert(false);
}
return os;
}
@ -138,7 +138,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
enum class AddrType {
Global,
Shared,
IO,
IO
};
inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
@ -164,7 +164,7 @@ enum class FpuType {
FMA,
FDIV,
FSQRT,
FCVT,
FCVT
};
inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
@ -190,7 +190,7 @@ enum class SfuType {
CSRRW,
CSRRS,
CSRRC,
CMOV
CMOV
};
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
@ -351,6 +351,92 @@ private:
///////////////////////////////////////////////////////////////////////////////
template <typename Type>
// N:M multiplexer that arbitrates several input ports onto fewer output
// ports. Each output `o` serves the input group
// [o*num_reqs_, o*num_reqs_ + num_reqs_), granting at most one request per
// tick according to the arbitration policy.
class Mux : public SimObject<Mux<Type>> {
public:
    std::vector<SimPort<Type>> Inputs;
    std::vector<SimPort<Type>> Outputs;

    Mux(
        const SimContext& ctx,
        const char* name,
        ArbiterType type,
        uint32_t num_inputs,
        uint32_t num_outputs = 1,
        uint32_t delay = 1
    ) : SimObject<Mux<Type>>(ctx, name)
      , Inputs(num_inputs, this)
      , Outputs(num_outputs, this)
      , type_(type)
      , delay_(delay)
      , cursors_(num_outputs, 0)
      // fix: ceiling division so trailing inputs are still covered when
      // num_inputs is not a multiple of num_outputs; with floor division the
      // last (num_inputs % num_outputs) inputs were never scanned and the
      // `j >= I` guard in tick() was dead code.
      , num_reqs_((num_inputs + num_outputs - 1) / num_outputs)
    {
        assert(delay != 0);
        assert(num_inputs <= 32);
        assert(num_outputs <= 32);
        assert(num_inputs >= num_outputs);

        // bypass mode: a 1:1 mapping needs no arbitration
        if (num_inputs == num_outputs) {
            for (uint32_t i = 0; i < num_inputs; ++i) {
                Inputs.at(i).bind(&Outputs.at(i));
            }
        }
    }

    void reset() {
        for (auto& cursor : cursors_) {
            cursor = 0;
        }
    }

    void tick() {
        uint32_t I = Inputs.size();
        uint32_t O = Outputs.size();
        uint32_t R = num_reqs_;
        // skip bypass mode (ports are bound directly)
        if (I == O)
            return;
        // for each output, grant the first non-empty input in its group,
        // scanning from the cursor position
        for (uint32_t o = 0; o < O; ++o) {
            for (uint32_t r = 0; r < R; ++r) {
                // fix: modulo instead of `& (R-1)`, which only implements a
                // correct wrap-around when R is a power of two
                uint32_t i = (cursors_.at(o) + r) % R;
                uint32_t j = o * R + i;
                if (j >= I)
                    continue;
                auto& req_in = Inputs.at(j);
                if (!req_in.empty()) {
                    auto& req = req_in.front();
                    DT(4, this->name() << "-" << req);
                    Outputs.at(o).send(req, delay_);
                    req_in.pop();
                    this->update_cursor(o, i);
                    break;
                }
            }
        }
    }

private:
    // advance the round-robin cursor just past the granted slot
    void update_cursor(uint32_t index, uint32_t grant) {
        if (type_ == ArbiterType::RoundRobin) {
            cursors_.at(index) = grant + 1;
        }
    }

    ArbiterType type_;
    uint32_t delay_;
    std::vector<uint32_t> cursors_;
    uint32_t num_reqs_;
};
///////////////////////////////////////////////////////////////////////////////
template <typename Req, typename Rsp>
class Switch : public SimObject<Switch<Req, Rsp>> {
public:
@ -364,13 +450,13 @@ public:
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs = 1,
uint32_t num_inputs,
uint32_t num_outputs = 1,
uint32_t delay = 1
)
: SimObject<Switch<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqOut(num_outputs, this)
, RspOut(num_outputs, this)
, type_(type)
@ -383,8 +469,8 @@ public:
assert(num_outputs <= 32);
assert(num_inputs >= num_outputs);
// bypass mode
if (num_inputs == num_outputs) {
// bypass mode
for (uint32_t i = 0; i < num_inputs; ++i) {
ReqIn.at(i).bind(&ReqOut.at(i));
RspOut.at(i).bind(&RspIn.at(i));
@ -462,14 +548,14 @@ private:
class SMemDemux : public SimObject<SMemDemux> {
public:
SimPort<MemReq> ReqIn;
SimPort<MemRsp> RspIn;
SimPort<MemReq> ReqIn;
SimPort<MemRsp> RspIn;
SimPort<MemReq> ReqSm;
SimPort<MemRsp> RspSm;
SimPort<MemReq> ReqSM;
SimPort<MemRsp> RspSM;
SimPort<MemReq> ReqDc;
SimPort<MemRsp> RspDc;
SimPort<MemReq> ReqDC;
SimPort<MemRsp> RspDC;
SMemDemux(
const SimContext& ctx,
@ -478,45 +564,49 @@ public:
) : SimObject<SMemDemux>(ctx, name)
, ReqIn(this)
, RspIn(this)
, ReqSm(this)
, RspSm(this)
, ReqDc(this)
, RspDc(this)
, ReqSM(this)
, RspSM(this)
, ReqDC(this)
, RspDC(this)
, delay_(delay)
{}
void reset() {}
void tick() {
void tick() {
// process incoming reponses
if (!RspSM.empty()) {
auto& rsp = RspSM.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspSM.pop();
}
if (!RspDC.empty()) {
auto& rsp = RspDC.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspDC
.pop();
}
// process incomming requests
if (!ReqIn.empty()) {
auto& req = ReqIn.front();
DT(4, this->name() << "-" << req);
if (req.type == AddrType::Shared) {
ReqSm.send(req, delay_);
ReqSM.send(req, delay_);
} else {
ReqDc.send(req, delay_);
ReqDC.send(req, delay_);
}
ReqIn.pop();
}
// process incoming reponses
if (!RspSm.empty()) {
auto& rsp = RspSm.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspSm.pop();
}
if (!RspDc.empty()) {
auto& rsp = RspDc.front();
DT(4, this->name() << "-" << rsp);
RspIn.send(rsp, 1);
RspDc.pop();
}
}
private:
uint32_t delay_;
};
}
///////////////////////////////////////////////////////////////////////////////
using MemSwitch = Switch<MemReq, MemRsp>;
}

View file

@ -17,7 +17,7 @@ all:
$(MAKE) -C lbm
$(MAKE) -C oclprintf
$(MAKE) -C blackscholes
$(MAKE) -C matmul
$(MAKE) -C sgemm2
$(MAKE) -C convolution
run-simx:
@ -38,7 +38,7 @@ run-simx:
$(MAKE) -C transpose run-simx
$(MAKE) -C convolution run-simx
$(MAKE) -C cutcp run-simx
$(MAKE) -C matmul run-simx
$(MAKE) -C sgemm2 run-simx
$(MAKE) -C vectorhypot run-simx
$(MAKE) -C mri-q run-simx
@ -60,7 +60,7 @@ run-rtlsim:
$(MAKE) -C blackscholes run-rtlsim
$(MAKE) -C convolution run-rtlsim
# $(MAKE) -C cutcp run-rtlsim
# $(MAKE) -C matmul run-rtlsim
# $(MAKE) -C sgemm2 run-rtlsim
# $(MAKE) -C vectorhypot run-rtlsim
# $(MAKE) -C mri-q run-rtlsim
@ -82,7 +82,7 @@ run-opae:
$(MAKE) -C blackscholes run-opae
$(MAKE) -C convolution run-opae
# $(MAKE) -C cutcp run-opae
# $(MAKE) -C matmul run-opae
# $(MAKE) -C sgemm2 run-opae
# $(MAKE) -C vectorhypot run-opae
# $(MAKE) -C mri-q run-opae
@ -105,7 +105,7 @@ clean:
$(MAKE) -C lbm clean
$(MAKE) -C oclprintf clean
$(MAKE) -C blackscholes clean
$(MAKE) -C matmul clean
$(MAKE) -C sgemm2 clean
$(MAKE) -C convolution clean
clean-all:
@ -128,5 +128,5 @@ clean-all:
$(MAKE) -C lbm clean-all
$(MAKE) -C oclprintf clean-all
$(MAKE) -C blackscholes clean-all
$(MAKE) -C matmul clean-all
$(MAKE) -C sgemm2 clean-all
$(MAKE) -C convolution clean-all

View file

@ -73,7 +73,7 @@ OBJS := $(addsuffix .o, $(notdir $(SRCS)))
all: $(PROJECT) kernel.pocl
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
%.cc.o: %.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
@ -87,6 +87,9 @@ kernel.pocl: kernel.cl
$(PROJECT): $(OBJS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-hostgpu: $(PROJECT) kernel.pocl
./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

View file

@ -1,4 +1,4 @@
PROJECT = matmul
PROJECT = sgemm2
SRCS = main.cc

View file

@ -1,4 +1,4 @@
__kernel void matmul(__global float *A,
__kernel void sgemm2(__global float *A,
__global float *B,
__global float *C,
const unsigned int N,
@ -28,12 +28,15 @@ __kernel void matmul(__global float *A,
for (int j = 0; j < localSize; j++) {
sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
}
// Ensure computation is done before loading next block
barrier(CLK_LOCAL_MEM_FENCE);
}
C[globalRow * N + globalCol] = sum;
}
/*__kernel void matmul(__global float *A,
/*__kernel void sgemm2(__global float *A,
__global float *B,
__global float *C,
const unsigned int N)

View file

@ -12,7 +12,7 @@
#define FLOAT_ULP 6
#define KERNEL_NAME "matmul"
#define KERNEL_NAME "sgemm2"
#define CL_CHECK(_expr) \
do { \

View file

@ -10,7 +10,8 @@ all:
$(MAKE) -C fence
$(MAKE) -C no_mf_ext
$(MAKE) -C no_smem
$(MAKE) -C tensor
$(MAKE) -C vecaddx
$(MAKE) -C sgemmx
run-simx:
$(MAKE) -C basic run-simx
@ -24,7 +25,8 @@ run-simx:
$(MAKE) -C fence run-simx
$(MAKE) -C no_mf_ext run-simx
$(MAKE) -C no_smem run-simx
$(MAKE) -C tensor run-simx
$(MAKE) -C vecaddx run-simx
$(MAKE) -C sgemmx run-simx
run-rtlsim:
$(MAKE) -C basic run-rtlsim
@ -38,7 +40,8 @@ run-rtlsim:
$(MAKE) -C fence run-rtlsim
$(MAKE) -C no_mf_ext run-rtlsim
$(MAKE) -C no_smem run-rtlsim
$(MAKE) -C tensor run-rtlsim
$(MAKE) -C vecaddx run-rtlsim
$(MAKE) -C sgemmx run-rtlsim
run-opae:
$(MAKE) -C basic run-opae
@ -52,7 +55,8 @@ run-opae:
$(MAKE) -C fence run-opae
$(MAKE) -C no_mf_ext run-opae
$(MAKE) -C no_smem run-opae
$(MAKE) -C tensor run-opae
$(MAKE) -C vecaddx run-opae
$(MAKE) -C sgemmx run-opae
clean:
$(MAKE) -C basic clean
@ -66,7 +70,8 @@ clean:
$(MAKE) -C fence clean
$(MAKE) -C no_mf_ext clean
$(MAKE) -C no_smem clean
$(MAKE) -C tensor clean
$(MAKE) -C vecaddx clean
$(MAKE) -C sgemmx clean
clean-all:
$(MAKE) -C basic clean-all
@ -80,4 +85,5 @@ clean-all:
$(MAKE) -C fence clean-all
$(MAKE) -C no_mf_ext clean-all
$(MAKE) -C no_smem clean-all
$(MAKE) -C tensor clean-all
$(MAKE) -C vecaddx clean-all
$(MAKE) -C sgemmx clean-all

View file

@ -1,4 +1,4 @@
PROJECT = tensor
PROJECT = sgemmx
SRCS = main.cpp

View file

@ -2,6 +2,7 @@
#include <unistd.h>
#include <string.h>
#include <vector>
#include <chrono>
#include <vortex.h>
#include "common.h"
@ -122,43 +123,6 @@ void cleanup() {
}
}
// Launch the kernel, wait for completion, pull back the destination buffer
// and compare it element-wise against the host-computed reference values.
// Returns 0 on success, 1 when any element mismatches.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             const std::vector<TYPE>& refs) {
    // launch the kernel on the device
    std::cout << "start device" << std::endl;
    RT_CHECK(vx_start(device));

    // block until the device signals completion
    std::cout << "wait for completion" << std::endl;
    RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

    // copy the result buffer back into the host staging area
    std::cout << "download destination buffer" << std::endl;
    RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));

    // compare each downloaded element with its reference value
    std::cout << "verify result" << std::endl;
    int errors = 0;
    auto results = (TYPE*)staging_buf.data();
    for (uint32_t idx = 0; idx < refs.size(); ++idx) {
        if (!Comparator<TYPE>::compare(results[idx], refs[idx], idx, errors)) {
            ++errors;
        }
    }
    if (errors != 0) {
        std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
        std::cout << "FAILED!" << std::endl;
        return 1;
    }
    return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -239,10 +203,43 @@ int main(int argc, char *argv[]) {
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size));
auto time_start = std::chrono::high_resolution_clock::now();
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, refs));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < refs.size(); ++i) {
auto ref = refs[i];
auto cur = buf_ptr[i];
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
// cleanup
std::cout << "cleanup" << std::endl;

View file

@ -0,0 +1,9 @@
PROJECT = vecaddx
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n64
include ../common.mk

View file

@ -0,0 +1,17 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif
// Kernel launch arguments; the device kernel reads this struct from the
// fixed device address KERNEL_ARG_DEV_MEM_ADDR.
typedef struct {
  uint32_t num_points; // number of elements to process (one spawned task each)
  uint64_t src0_addr;  // device address of the first input buffer (TYPE[])
  uint64_t src1_addr;  // device address of the second input buffer (TYPE[])
  uint64_t dst_addr;   // device address of the output buffer (TYPE[])
} kernel_arg_t;
#endif

View file

@ -0,0 +1,18 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Per-task body: element-wise add of the two source arrays into the
// destination array, one element per task.
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
    auto lhs = reinterpret_cast<TYPE*>(arg->src0_addr);
    auto rhs = reinterpret_cast<TYPE*>(arg->src1_addr);
    auto out = reinterpret_cast<TYPE*>(arg->dst_addr);
    out[task_id] = lhs[task_id] + rhs[task_id];
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -0,0 +1,246 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
// Maximum tolerated difference between expected and actual float values,
// in units-in-the-last-place (ULP).
#define FLOAT_ULP 6

// Evaluate _expr (a Vortex runtime call returning 0 on success); on a
// non-zero result, report the failing call, release device resources via
// cleanup(), and abort the process.
#define RT_CHECK(_expr)                                       \
  do {                                                        \
    int _ret = _expr;                                         \
    if (0 == _ret)                                            \
      break;                                                  \
    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);  \
    cleanup();                                                \
    exit(-1);                                                 \
  } while (false)

///////////////////////////////////////////////////////////////////////////////

// Element-type traits used by the test driver: random data generation and
// result comparison. Specialized for each supported TYPE.
template <typename Type>
class Comparator {};

// Integer elements: exact equality.
template <>
class Comparator<int> {
public:
  // Human-readable type name for log output.
  static const char* type_str() {
    return "integer";
  }
  // Produce a random test value.
  static int generate() {
    return rand();
  }
  // Compare actual value 'a' with reference value 'b' at position 'index'.
  // Prints a diagnostic for the first 100 mismatches ('errors' is the
  // running mismatch count maintained by the caller); returns true on match.
  static bool compare(int a, int b, int index, int errors) {
    if (a != b) {
      if (errors < 100) {
        // fix: callers pass (actual, reference), so 'b' is the expected
        // value and 'a' the actual one -- labels were previously swapped
        printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
      }
      return false;
    }
    return true;
  }
};

// Float elements: equality within FLOAT_ULP units-in-the-last-place.
template <>
class Comparator<float> {
public:
  // Human-readable type name for log output.
  static const char* type_str() {
    return "float";
  }
  // Produce a random value in [0, 1].
  // fix: the return type was 'int', which truncated every generated value
  // to 0 and made the float test pass trivially on all-zero data.
  static float generate() {
    return static_cast<float>(rand()) / RAND_MAX;
  }
  // Compare actual 'a' with reference 'b' at position 'index', allowing up
  // to FLOAT_ULP ULPs of difference. Prints a diagnostic for the first 100
  // mismatches; returns true when within tolerance.
  static bool compare(float a, float b, int index, int errors) {
    // Reinterpret the float bits as integers to measure ULP distance;
    // memcpy avoids union-based type punning (UB in C++).
    int32_t ia, ib;
    memcpy(&ia, &a, sizeof(ia));
    memcpy(&ib, &b, sizeof(ib));
    auto d = std::abs(ia - ib);
    if (d > FLOAT_ULP) {
      if (errors < 100) {
        // fix: 'b' is the expected value, 'a' the actual one
        printf("*** error: [%d] expected=%f, actual=%f\n", index, b, a);
      }
      return false;
    }
    return true;
  }
};
// Path to the kernel binary to upload (overridable with -k).
const char* kernel_file = "kernel.bin";
// Number of vector elements to process (overridable with -n).
uint32_t size = 16;
// Handle to the opened Vortex device (nullptr until vx_dev_open succeeds).
vx_device_h device = nullptr;
// Host-side reference data: 2*num_points values; even indices feed src0,
// odd indices feed src1.
std::vector<TYPE> source_data;
// Reusable host buffer for staging transfers to/from the device.
std::vector<uint8_t> staging_buf;
// Kernel argument block mirrored into device memory.
kernel_arg_t kernel_arg = {};
// Print the command-line usage banner for this test program.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parse command-line options: -n <words> sets the element count,
// -k <kernel> overrides the kernel binary path, -h/-? prints usage.
static void parse_args(int argc, char **argv) {
  for (int opt; (opt = getopt(argc, argv, "n:k:h?")) != -1;) {
    switch (opt) {
    case 'n':
      size = atoi(optarg);
      break;
    case 'k':
      kernel_file = optarg;
      break;
    case 'h':
    case '?':
      show_usage();
      exit(0);
    default:
      show_usage();
      exit(-1);
    }
  }
}
// Release device resources: free the three device buffers recorded in
// kernel_arg and close the device handle. Safe to call when the device
// was never opened (device stays nullptr).
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
// Launch the uploaded kernel, wait for completion, read back the
// destination buffer, and check each element against the host-side
// reference sum. Returns 0 on success, 1 if any mismatch was found.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  // kick off execution
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  // block until the kernel finishes (or the timeout expires)
  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));

  // copy the results back into the staging buffer
  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));

  // compare every element against src0[i] + src1[i]
  std::cout << "verify result" << std::endl;
  int mismatches = 0;
  auto results = (TYPE*)staging_buf.data();
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    auto expected = source_data[2 * idx + 0] + source_data[2 * idx + 1];
    auto actual = results[idx];
    if (!Comparator<TYPE>::compare(actual, expected, idx, mismatches)) {
      ++mismatches;
    }
  }
  if (mismatches != 0) {
    std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
    std::cout << "FAILED!" << std::endl;
    return 1;
  }
  return 0;
}
// Host entry point: parse options, open the Vortex device, allocate and
// initialize the device buffers, upload the kernel and its argument
// block, run the vector-add test, then release all resources.
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
// fixed seed: the generated source data is reproducible across runs
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
std::cout << "number of cores: " << num_cores << std::endl;
std::cout << "number of warps: " << num_warps << std::endl;
std::cout << "number of threads: " << num_threads << std::endl;
uint32_t num_points = size;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory for the two sources and the destination;
// addresses are recorded in kernel_arg for the kernel to read
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer, sized to hold either a full data buffer or
// the kernel argument block, since it is reused for both transfers
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument block to its fixed device address
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// generate source data: 2*num_points values, even indices feed src0,
// odd indices feed src1
source_data.resize(2 * num_points);
for (uint32_t i = 0; i < source_data.size(); ++i) {
source_data[i] = Comparator<TYPE>::generate();
}
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 0];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 1];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
// clear destination buffer so verification cannot pass on stale device memory
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}