mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Merge branch 'develop' of https://github.com/vortexgpgpu/vortex into develop
This commit is contained in:
commit
621d6de6ce
42 changed files with 1216 additions and 634 deletions
|
@ -58,6 +58,8 @@
|
|||
|
||||
`define VX_CSR_MPM_BASE 12'hB00
|
||||
`define VX_CSR_MPM_BASE_H 12'hB80
|
||||
`define VX_CSR_MPM_USER 12'hB03
|
||||
`define VX_CSR_MPM_USER_H 12'hB83
|
||||
|
||||
// Machine Performance-monitoring core counters
|
||||
// PERF: Standard
|
||||
|
@ -68,29 +70,41 @@
|
|||
`define VX_CSR_MINSTRET 12'hB02
|
||||
`define VX_CSR_MINSTRET_H 12'hB82
|
||||
// PERF: pipeline
|
||||
`define VX_CSR_MPM_IBUF_ST 12'hB03
|
||||
`define VX_CSR_MPM_IBUF_ST_H 12'hB83
|
||||
`define VX_CSR_MPM_SCRB_ST 12'hB04
|
||||
`define VX_CSR_MPM_SCRB_ST_H 12'hB84
|
||||
`define VX_CSR_MPM_ALU_ST 12'hB05
|
||||
`define VX_CSR_MPM_ALU_ST_H 12'hB85
|
||||
`define VX_CSR_MPM_LSU_ST 12'hB06
|
||||
`define VX_CSR_MPM_LSU_ST_H 12'hB86
|
||||
`define VX_CSR_MPM_FPU_ST 12'hB07
|
||||
`define VX_CSR_MPM_FPU_ST_H 12'hB87
|
||||
`define VX_CSR_MPM_SFU_ST 12'hB08
|
||||
`define VX_CSR_MPM_SFU_ST_H 12'hB88
|
||||
`define VX_CSR_MPM_SCHED_ST 12'hB03
|
||||
`define VX_CSR_MPM_SCHED_ST_H 12'hB83
|
||||
`define VX_CSR_MPM_FETCH_ST 12'hB04
|
||||
`define VX_CSR_MPM_FETCH_ST_H 12'hB84
|
||||
`define VX_CSR_MPM_IBUF_ST 12'hB05
|
||||
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
|
||||
`define VX_CSR_MPM_SCRB_ST 12'hB06
|
||||
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
|
||||
`define VX_CSR_MPM_ALU_ST 12'hB07
|
||||
`define VX_CSR_MPM_ALU_ST_H 12'hB87
|
||||
`define VX_CSR_MPM_LSU_ST 12'hB08
|
||||
`define VX_CSR_MPM_LSU_ST_H 12'hB88
|
||||
`define VX_CSR_MPM_FPU_ST 12'hB09
|
||||
`define VX_CSR_MPM_FPU_ST_H 12'hB89
|
||||
`define VX_CSR_MPM_SFU_ST 12'hB0A
|
||||
`define VX_CSR_MPM_SFU_ST_H 12'hB8A
|
||||
`define VX_CSR_MPM_SCRB_ALU 12'hB0B
|
||||
`define VX_CSR_MPM_SCRB_ALU_H 12'hB8B
|
||||
`define VX_CSR_MPM_SCRB_FPU 12'hB0C
|
||||
`define VX_CSR_MPM_SCRB_FPU_H 12'hB8C
|
||||
`define VX_CSR_MPM_SCRB_LSU 12'hB0D
|
||||
`define VX_CSR_MPM_SCRB_LSU_H 12'hB8D
|
||||
`define VX_CSR_MPM_SCRB_SFU 12'hB0E
|
||||
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8E
|
||||
// PERF: memory
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0A
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8A
|
||||
`define VX_CSR_MPM_LOADS 12'hB0B
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB8B
|
||||
`define VX_CSR_MPM_STORES 12'hB0C
|
||||
`define VX_CSR_MPM_STORES_H 12'hB8C
|
||||
`define VX_CSR_MPM_IFETCH_LAT 12'hB0D
|
||||
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D
|
||||
`define VX_CSR_MPM_LOAD_LAT 12'hB0E
|
||||
`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0F
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8F
|
||||
`define VX_CSR_MPM_LOADS 12'hB10
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB90
|
||||
`define VX_CSR_MPM_STORES 12'hB11
|
||||
`define VX_CSR_MPM_STORES_H 12'hB91
|
||||
`define VX_CSR_MPM_IFETCH_LAT 12'hB12
|
||||
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB92
|
||||
`define VX_CSR_MPM_LOAD_LAT 12'hB13
|
||||
`define VX_CSR_MPM_LOAD_LAT_H 12'hB93
|
||||
|
||||
// Machine Performance-monitoring memory counters
|
||||
// PERF: icache
|
||||
|
@ -98,59 +112,61 @@
|
|||
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83
|
||||
`define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
|
||||
`define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84
|
||||
`define VX_CSR_MPM_ICACHE_MSHR_ST 12'hB05 // MSHR stalls
|
||||
`define VX_CSR_MPM_ICACHE_MSHR_ST_H 12'hB85
|
||||
// PERF: dcache
|
||||
`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads
|
||||
`define VX_CSR_MPM_DCACHE_READS_H 12'hB85
|
||||
`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes
|
||||
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A
|
||||
// PERF: smem
|
||||
`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads
|
||||
`define VX_CSR_MPM_SMEM_READS_H 12'hB8B
|
||||
`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
|
||||
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D
|
||||
`define VX_CSR_MPM_DCACHE_READS 12'hB06 // total reads
|
||||
`define VX_CSR_MPM_DCACHE_READS_H 12'hB86
|
||||
`define VX_CSR_MPM_DCACHE_WRITES 12'hB07 // total writes
|
||||
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB87
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB08 // read misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB88
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB09 // write misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB89
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB0A // bank conflicts
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB8A
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0B // MSHR stalls
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8B
|
||||
// PERF: l2cache
|
||||
`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads
|
||||
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93
|
||||
`define VX_CSR_MPM_L2CACHE_READS 12'hB0C // total reads
|
||||
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8C
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0D // total writes
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8D
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB0E // read misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB8E
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB0F // write misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB8F
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB10 // bank conflicts
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB90
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB11 // MSHR stalls
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB91
|
||||
// PERF: l3cache
|
||||
`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads
|
||||
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99
|
||||
`define VX_CSR_MPM_L3CACHE_READS 12'hB12 // total reads
|
||||
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB92
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB13 // total writes
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB93
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB14 // read misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB94
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB15 // write misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB95
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB16 // bank conflicts
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB96
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB17 // MSHR stalls
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB97
|
||||
// PERF: memory
|
||||
`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads
|
||||
`define VX_CSR_MPM_MEM_READS_H 12'hB9A
|
||||
`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes
|
||||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B
|
||||
`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency
|
||||
`define VX_CSR_MPM_MEM_LAT_H 12'hB9C
|
||||
`define VX_CSR_MPM_MEM_READS 12'hB18 // total reads
|
||||
`define VX_CSR_MPM_MEM_READS_H 12'hB98
|
||||
`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
|
||||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
|
||||
`define VX_CSR_MPM_MEM_LAT 12'hB1A // memory latency
|
||||
`define VX_CSR_MPM_MEM_LAT_H 12'hB9A
|
||||
// PERF: smem
|
||||
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
|
||||
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
|
||||
`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes
|
||||
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D
|
||||
|
||||
// Machine Information Registers
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_commit_if commit_if[`ISSUE_WIDTH]();
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
||||
wire [`ISSUE_WIDTH-1:0] commit_fire;
|
||||
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
|
||||
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
|
||||
wire [`ISSUE_WIDTH-1:0] commit_eop;
|
||||
|
@ -91,24 +91,22 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
||||
assign commit_tmask[i] = {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
||||
assign commit_wid[i] = commit_if[i].data.wid;
|
||||
assign commit_eop[i] = commit_if[i].data.eop;
|
||||
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
|
||||
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
|
||||
assign commit_wid[i] = commit_if[i].data.wid;
|
||||
assign commit_eop[i] = commit_if[i].data.eop;
|
||||
end
|
||||
|
||||
// CSRs update
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
|
||||
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all, commit_size_all_r;
|
||||
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
|
||||
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
|
||||
|
||||
assign commit_fire_any = (| commit_fire);
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
wire [COMMIT_SIZEW-1:0] pop_count;
|
||||
`POP_COUNT(pop_count, commit_tmask[i]);
|
||||
assign commit_size[i] = pop_count;
|
||||
`POP_COUNT(commit_size[i], commit_tmask[i]);
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
|
@ -129,7 +127,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
.OP ("+")
|
||||
) commit_size_reduce (
|
||||
.data_in (commit_size_r),
|
||||
.data_out (commit_size_all)
|
||||
.data_out (commit_size_all_r)
|
||||
);
|
||||
|
||||
VX_pipe_register #(
|
||||
|
@ -139,26 +137,26 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({commit_fire_any_r, commit_size_all}),
|
||||
.data_out ({commit_fire_any_rr, commit_size_all_r})
|
||||
.data_in ({commit_fire_any_r, commit_size_all_r}),
|
||||
.data_out ({commit_fire_any_rr, commit_size_all_rr})
|
||||
);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] instret;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
instret <= '0;
|
||||
end else begin
|
||||
if (commit_fire_any_rr) begin
|
||||
instret <= instret + `PERF_CTR_BITS'(commit_size_all_r);
|
||||
instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign commit_csr_if.instret = instret;
|
||||
|
||||
// Committed instructions
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
|
||||
.RESETW (`ISSUE_WIDTH)
|
||||
|
@ -166,23 +164,23 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({(commit_fire & commit_eop), commit_wid}),
|
||||
.data_in ({committed, commit_wid}),
|
||||
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
|
||||
);
|
||||
|
||||
// Writeback
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
|
||||
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
|
||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||
assign writeback_if[i].data.tmask = commit_if[i].data.tmask;
|
||||
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
||||
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
|
||||
assign writeback_if[i].data.PC = commit_if[i].data.PC;
|
||||
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
|
||||
assign writeback_if[i].data.rd = commit_if[i].data.rd;
|
||||
assign writeback_if[i].data.data = commit_if[i].data.data;
|
||||
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
||||
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
||||
assign commit_if[i].ready = 1'b1;
|
||||
assign writeback_if[i].data.sop = commit_if[i].data.sop;
|
||||
assign writeback_if[i].data.eop = commit_if[i].data.eop;
|
||||
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
|
||||
end
|
||||
|
||||
// simulation helper signal to get RISC-V tests Pass/Fail status
|
||||
|
|
|
@ -116,7 +116,11 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.CORE_ID (CORE_ID)
|
||||
) schedule (
|
||||
.clk (clk),
|
||||
.reset (schedule_reset),
|
||||
.reset (schedule_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_schedule_if (pipeline_perf_if.schedule),
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
||||
|
|
|
@ -179,14 +179,18 @@ import VX_fpu_pkg::*;
|
|||
|
||||
default: begin
|
||||
read_addr_valid_r = 0;
|
||||
if ((read_addr >= `VX_CSR_MPM_BASE && read_addr < (`VX_CSR_MPM_BASE + 32))
|
||||
|| (read_addr >= `VX_CSR_MPM_BASE_H && read_addr < (`VX_CSR_MPM_BASE_H + 32))) begin
|
||||
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|
||||
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
|
||||
read_addr_valid_r = 1;
|
||||
`ifdef PERF_ENABLE
|
||||
case (base_dcrs.mpm_class)
|
||||
`VX_DCR_MPM_CLASS_CORE: begin
|
||||
case (read_addr)
|
||||
// PERF: pipeline
|
||||
// PERF: pipeline
|
||||
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
|
||||
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0];
|
||||
`VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
|
||||
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
|
||||
|
@ -204,6 +208,19 @@ import VX_fpu_pkg::*;
|
|||
`endif
|
||||
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0];
|
||||
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0];
|
||||
`ifdef EXT_F_ENABLE
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0];
|
||||
`else
|
||||
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
|
||||
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
|
||||
`endif
|
||||
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
|
||||
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
|
||||
// PERF: memory
|
||||
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
|
||||
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
|
||||
|
@ -214,7 +231,7 @@ import VX_fpu_pkg::*;
|
|||
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
|
||||
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
|
||||
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
@ -225,6 +242,8 @@ import VX_fpu_pkg::*;
|
|||
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
|
||||
`VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: dcache
|
||||
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
|
||||
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
|
||||
|
|
|
@ -14,10 +14,10 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
module VX_ipdom_stack #(
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1,
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1,
|
||||
parameter OUT_REG = 0,
|
||||
parameter ADDRW = `LOG2UP(DEPTH)
|
||||
parameter ADDRW = `LOG2UP(DEPTH)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
|
|
@ -59,6 +59,10 @@ module VX_issue #(
|
|||
) scoreboard (
|
||||
.clk (clk),
|
||||
.reset (scoreboard_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_scb_stalls(perf_issue_if.scb_stalls),
|
||||
.perf_scb_uses (perf_issue_if.scb_uses),
|
||||
`endif
|
||||
.writeback_if (writeback_if),
|
||||
.ibuffer_if (ibuffer_if),
|
||||
.scoreboard_if (scoreboard_if)
|
||||
|
@ -152,29 +156,17 @@ module VX_issue #(
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
|
||||
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
|
||||
reg [`ISSUE_WIDTH-1:0] scoreboard_stalls;
|
||||
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
||||
end
|
||||
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_ibf_stalls <= '0;
|
||||
perf_scb_stalls <= '0;
|
||||
end else begin
|
||||
if (decode_if.valid && ~decode_if.ready) begin
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
||||
assign perf_issue_if.scb_stalls = perf_scb_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -19,6 +19,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_pipeline_perf_if.schedule perf_schedule_if,
|
||||
`endif
|
||||
|
||||
// configuration
|
||||
input base_dcrs_t base_dcrs,
|
||||
|
||||
|
@ -376,4 +380,21 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
end
|
||||
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_fetch_stalls;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sched_stalls <= '0;
|
||||
perf_fetch_stalls <= '0;
|
||||
end else begin
|
||||
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(!schedule_valid);
|
||||
perf_fetch_stalls <= perf_fetch_stalls + `PERF_CTR_BITS'(schedule_if.valid && !schedule_if.ready);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
|
||||
assign perf_schedule_if.fetch_stalls = perf_fetch_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -19,6 +19,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_scb_uses [`NUM_EX_UNITS],
|
||||
`endif
|
||||
|
||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
|
||||
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
|
||||
|
@ -26,81 +31,102 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_alu_per_cycle;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_fpu_per_cycle;
|
||||
`endif
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_lsu_per_cycle;
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_sfu_per_cycle;
|
||||
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] scoreboard_stalls_per_cycle;
|
||||
reg [`EX_BITS-1:0][`ISSUE_WIDTH-1:0] scoreboard_uses;
|
||||
wire [`ISSUE_WIDTH-1:0] scoreboard_stalls;
|
||||
`POP_COUNT(scoreboard_stalls_per_cycle, scoreboard_stalls);
|
||||
`POP_COUNT(scoreboard_alu_per_cycle, scoreboard_uses[`EX_ALU]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`POP_COUNT(scoreboard_fpu_per_cycle, scoreboard_uses[`EX_FPU]);
|
||||
`endif
|
||||
`POP_COUNT(scoreboard_lsu_per_cycle, scoreboard_uses[`EX_LSU]);
|
||||
`POP_COUNT(scoreboard_sfu_per_cycle, scoreboard_uses[`EX_SFU]);
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
||||
reg [3:0] ready_masks, ready_masks_n;
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
|
||||
VX_ibuffer_if staging_if();
|
||||
|
||||
|
||||
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
|
||||
|
||||
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
|
||||
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
|
||||
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
|
||||
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_BITS-1:0] inuse_units;
|
||||
always @(*) begin
|
||||
inuse_regs_n = inuse_regs;
|
||||
ready_masks_n = ready_masks;
|
||||
if (writeback_fire) begin
|
||||
inuse_regs_n[writeback_if[i].data.wis][writeback_if[i].data.rd] = 0;
|
||||
ready_masks_n |= {4{(ISSUE_RATIO == 0) || writeback_if[i].data.wis == staging_if.data.wis}}
|
||||
& {(writeback_if[i].data.rd == staging_if.data.rd),
|
||||
(writeback_if[i].data.rd == staging_if.data.rs1),
|
||||
(writeback_if[i].data.rd == staging_if.data.rs2),
|
||||
(writeback_if[i].data.rd == staging_if.data.rs3)};
|
||||
end
|
||||
if (staging_if.valid && staging_if.ready && staging_if.data.wb) begin
|
||||
inuse_regs_n[staging_if.data.wis][staging_if.data.rd] = 1;
|
||||
ready_masks_n = '0;
|
||||
scoreboard_uses = '0;
|
||||
if (ibuffer_if[i].valid) begin
|
||||
if (inuse_rd) begin
|
||||
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]][i] = 1;
|
||||
end
|
||||
if (inuse_rs1) begin
|
||||
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]][i] = 1;
|
||||
end
|
||||
if (inuse_rs2) begin
|
||||
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]][i] = 1;
|
||||
end
|
||||
if (inuse_rs3) begin
|
||||
scoreboard_uses[inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]][i] = 1;
|
||||
end
|
||||
end
|
||||
if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||
ready_masks_n = ~{inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd],
|
||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1],
|
||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2],
|
||||
inuse_regs_n[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]};
|
||||
end
|
||||
end
|
||||
end
|
||||
assign scoreboard_stalls[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
|
||||
`endif
|
||||
|
||||
reg [DATAW-1:0] data_out_r;
|
||||
reg valid_out_r;
|
||||
|
||||
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
|
||||
wire deps_ready = (& ready_masks);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
inuse_regs <= '0;
|
||||
ready_masks <= '0;
|
||||
end else begin
|
||||
inuse_regs <= inuse_regs_n;
|
||||
ready_masks <= ready_masks_n;
|
||||
valid_out_r <= 0;
|
||||
inuse_regs <= '0;
|
||||
end else begin
|
||||
if (writeback_fire) begin
|
||||
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
|
||||
end
|
||||
if (~valid_out_r) begin
|
||||
valid_out_r <= ibuffer_if[i].valid && deps_ready;
|
||||
end else if (staging_if.ready) begin
|
||||
if (staging_if.data.wb) begin
|
||||
inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1;
|
||||
`ifdef PERF_ENABLE
|
||||
inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type;
|
||||
`endif
|
||||
end
|
||||
valid_out_r <= 0;
|
||||
end
|
||||
end
|
||||
if (~valid_out_r) begin
|
||||
data_out_r <= ibuffer_if[i].data;
|
||||
end
|
||||
end
|
||||
|
||||
// staging buffer
|
||||
|
||||
`RESET_RELAY (stg_buf_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW)
|
||||
) stg_buf (
|
||||
.clk (clk),
|
||||
.reset (stg_buf_reset),
|
||||
.valid_in (ibuffer_if[i].valid),
|
||||
.ready_in (ibuffer_if[i].ready),
|
||||
.data_in (ibuffer_if[i].data),
|
||||
.data_out (staging_if.data),
|
||||
.valid_out (staging_if.valid),
|
||||
.ready_out (staging_if.ready)
|
||||
);
|
||||
|
||||
// output buffer
|
||||
|
||||
wire valid_stg, ready_stg;
|
||||
wire regs_ready = (& ready_masks);
|
||||
assign valid_stg = staging_if.valid && regs_ready;
|
||||
assign staging_if.ready = ready_stg && regs_ready;
|
||||
|
||||
`RESET_RELAY (out_buf_reset, reset);
|
||||
assign ibuffer_if[i].ready = ~valid_out_r && deps_ready;
|
||||
assign staging_if.valid = valid_out_r;
|
||||
assign staging_if.data = data_out_r;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (2),
|
||||
.SIZE (0),
|
||||
.OUT_REG (2)
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (out_buf_reset),
|
||||
.valid_in (valid_stg),
|
||||
.ready_in (ready_stg),
|
||||
.reset (reset),
|
||||
.valid_in (staging_if.valid),
|
||||
.ready_in (staging_if.ready),
|
||||
.data_in (staging_if.data),
|
||||
.data_out (scoreboard_if[i].data),
|
||||
.valid_out (scoreboard_if[i].valid),
|
||||
|
@ -108,29 +134,29 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
`ifdef SIMULATION
|
||||
reg [31:0] timeout_ctr;
|
||||
|
||||
reg [31:0] timeout_ctr;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
timeout_ctr <= '0;
|
||||
end else begin
|
||||
if (staging_if.valid && ~regs_ready) begin
|
||||
if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
||||
~ready_masks, staging_if.data.uuid));
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||
~ready_masks, ibuffer_if[i].data.uuid));
|
||||
`endif
|
||||
timeout_ctr <= timeout_ctr + 1;
|
||||
end else if (staging_if.valid && staging_if.ready) begin
|
||||
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
|
||||
timeout_ctr <= '0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
||||
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
||||
$time, CORE_ID, wis_to_wid(staging_if.data.wis, i), staging_if.data.PC, staging_if.data.tmask, timeout_ctr,
|
||||
~ready_masks, staging_if.data.uuid));
|
||||
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
|
||||
~ready_masks, ibuffer_if[i].data.uuid));
|
||||
|
||||
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
||||
|
@ -139,4 +165,26 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_scb_stalls <= '0;
|
||||
perf_scb_uses[`EX_ALU] <= '0;
|
||||
`ifdef EXT_F_ENABLE
|
||||
perf_scb_uses[`EX_FPU] <= '0;
|
||||
`endif
|
||||
perf_scb_uses[`EX_LSU] <= '0;
|
||||
perf_scb_uses[`EX_SFU] <= '0;
|
||||
end else begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(scoreboard_stalls_per_cycle);
|
||||
perf_scb_uses[`EX_ALU] <= perf_scb_uses[`EX_ALU] + `PERF_CTR_BITS'(scoreboard_alu_per_cycle);
|
||||
`ifdef EXT_F_ENABLE
|
||||
perf_scb_uses[`EX_FPU] <= perf_scb_uses[`EX_FPU] + `PERF_CTR_BITS'(scoreboard_fpu_per_cycle);
|
||||
`endif
|
||||
perf_scb_uses[`EX_LSU] <= perf_scb_uses[`EX_LSU] + `PERF_CTR_BITS'(scoreboard_lsu_per_cycle);
|
||||
perf_scb_uses[`EX_SFU] <= perf_scb_uses[`EX_SFU] + `PERF_CTR_BITS'(scoreboard_sfu_per_cycle);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -14,8 +14,11 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
interface VX_pipeline_perf_if ();
|
||||
wire [`PERF_CTR_BITS-1:0] sched_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] fetch_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_uses [`NUM_EX_UNITS];
|
||||
wire [`PERF_CTR_BITS-1:0] dsp_stalls [`NUM_EX_UNITS];
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] ifetches;
|
||||
|
@ -24,15 +27,24 @@ interface VX_pipeline_perf_if ();
|
|||
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] load_latency;
|
||||
|
||||
modport schedule (
|
||||
output sched_stalls,
|
||||
output fetch_stalls
|
||||
);
|
||||
|
||||
modport issue (
|
||||
output ibf_stalls,
|
||||
output scb_stalls,
|
||||
output scb_uses,
|
||||
output dsp_stalls
|
||||
);
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input sched_stalls,
|
||||
input fetch_stalls,
|
||||
input ibf_stalls,
|
||||
input scb_stalls,
|
||||
input scb_uses,
|
||||
input dsp_stalls,
|
||||
input ifetches,
|
||||
input loads,
|
||||
|
|
|
@ -21,8 +21,8 @@ module VX_avs_adapter #(
|
|||
parameter NUM_BANKS = 1,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter RD_QUEUE_SIZE = 1,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
|
|
@ -20,7 +20,7 @@ module VX_axi_adapter #(
|
|||
parameter TAG_WIDTH = 8,
|
||||
parameter NUM_BANKS = 1,
|
||||
parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)),
|
||||
parameter OUT_REG_RSP = 0
|
||||
parameter OUT_REG_RSP = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
|
|
@ -21,8 +21,8 @@ module VX_mem_adapter #(
|
|||
parameter DST_ADDR_WIDTH = 1,
|
||||
parameter SRC_TAG_WIDTH = 1,
|
||||
parameter DST_TAG_WIDTH = 1,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
|
|
@ -21,7 +21,7 @@ module VX_stream_arb #(
|
|||
parameter `STRING ARBITER = "P",
|
||||
parameter LOCK_ENABLE = 1,
|
||||
parameter MAX_FANOUT = `MAX_FANOUT,
|
||||
parameter OUT_REG = 0 ,
|
||||
parameter OUT_REG = 0 ,
|
||||
parameter NUM_REQS = (NUM_INPUTS + NUM_OUTPUTS - 1) / NUM_OUTPUTS,
|
||||
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
|
||||
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)
|
||||
|
|
|
@ -173,8 +173,8 @@ module VX_stream_xbar #(
|
|||
end
|
||||
|
||||
// compute inputs collision
|
||||
// we have a collision when there exists a valid transfer with mutiple input candicates
|
||||
// we caount the unique duplicates each cycle.
|
||||
// we have a collision when there exists a valid transfer with multiple input candicates
|
||||
// we count the unique duplicates each cycle.
|
||||
|
||||
reg [PERF_CTR_BITS-1:0] collisions_r;
|
||||
reg [NUM_INPUTS-1:0] per_cycle_collision;
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
module VX_gbar_arb #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter OUT_REG = 0,
|
||||
parameter OUT_REG = 0,
|
||||
parameter `STRING ARBITER = "R"
|
||||
) (
|
||||
input wire clk,
|
||||
|
|
|
@ -21,8 +21,8 @@ module VX_mem_arb #(
|
|||
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0,
|
||||
parameter `STRING ARBITER = "R"
|
||||
) (
|
||||
input wire clk,
|
||||
|
|
|
@ -19,8 +19,8 @@ module VX_smem_switch #(
|
|||
parameter TAG_WIDTH = 1,
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0,
|
||||
parameter OUT_REG_REQ = 0,
|
||||
parameter OUT_REG_RSP = 0,
|
||||
parameter `STRING ARBITER = "R"
|
||||
) (
|
||||
input wire clk,
|
||||
|
|
|
@ -186,27 +186,31 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
return int((1.0 - (double(part) / double(total))) * 100);
|
||||
};
|
||||
|
||||
auto caclAvgLatency = [&](uint64_t sum, uint64_t requests)->int {
|
||||
if (requests == 0)
|
||||
auto caclAverage = [&](uint64_t part, uint64_t total)->double {
|
||||
if (total == 0)
|
||||
return 0;
|
||||
return int(double(sum) / double(requests));
|
||||
return double(part) / double(total);
|
||||
};
|
||||
|
||||
auto calcUtilization = [&](uint64_t count, uint64_t stalls)->int {
|
||||
if (count == 0)
|
||||
return 0;
|
||||
return int((double(count) / double(count + stalls)) * 100);
|
||||
auto calcAvgPercent = [&](uint64_t part, uint64_t total)->int {
|
||||
return int(caclAverage(part, total) * 100);
|
||||
};
|
||||
|
||||
auto perf_class = gAutoPerfDump.get_perf_class();
|
||||
|
||||
// PERF: pipeline stalls
|
||||
uint64_t scheduler_stalls = 0;
|
||||
uint64_t fetch_stalls = 0;
|
||||
uint64_t ibuffer_stalls = 0;
|
||||
uint64_t scoreboard_stalls = 0;
|
||||
uint64_t scrb_stalls = 0;
|
||||
uint64_t lsu_stalls = 0;
|
||||
uint64_t fpu_stalls = 0;
|
||||
uint64_t alu_stalls = 0;
|
||||
uint64_t sfu_stalls = 0;
|
||||
uint64_t sfu_stalls = 0;
|
||||
uint64_t scrb_alu = 0;
|
||||
uint64_t scrb_fpu = 0;
|
||||
uint64_t scrb_lsu = 0;
|
||||
uint64_t scrb_sfu = 0;
|
||||
uint64_t ifetches = 0;
|
||||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
|
@ -251,76 +255,121 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
#endif
|
||||
|
||||
std::vector<uint8_t> staging_buf(64* sizeof(uint32_t));
|
||||
|
||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||
|
||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||
uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size();
|
||||
ret = vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size());
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
|
||||
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
// PERF: pipeline
|
||||
// ibuffer_stall
|
||||
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core);
|
||||
ibuffer_stalls += ibuffer_stalls_per_core;
|
||||
// scoreboard_stall
|
||||
uint64_t scoreboard_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core);
|
||||
scoreboard_stalls += scoreboard_stalls_per_core;
|
||||
// alu_stall
|
||||
uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
|
||||
alu_stalls += alu_stalls_per_core;
|
||||
// lsu_stall
|
||||
uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
|
||||
lsu_stalls += lsu_stalls_per_core;
|
||||
// fpu_stall
|
||||
uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
|
||||
fpu_stalls += fpu_stalls_per_core;
|
||||
// sfu_stall
|
||||
uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
|
||||
sfu_stalls += sfu_stalls_per_core;
|
||||
// schedule stalls
|
||||
{
|
||||
uint64_t scheduler_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCHED_ST);
|
||||
int scheduler_percent_per_core = calcAvgPercent(scheduler_stalls_per_core, cycles_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: schedule stalls=%ld (%d%%)\n", core_id, scheduler_stalls_per_core, scheduler_percent_per_core);
|
||||
scheduler_stalls += scheduler_stalls_per_core;
|
||||
}
|
||||
// fetch stalls
|
||||
{
|
||||
uint64_t fetch_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FETCH_ST);
|
||||
int fetch_percent_per_core = calcAvgPercent(fetch_stalls_per_core, cycles_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch stalls=%ld (%d%%)\n", core_id, fetch_stalls_per_core, fetch_percent_per_core);
|
||||
fetch_stalls += fetch_stalls_per_core;
|
||||
}
|
||||
// ibuffer_stalls
|
||||
{
|
||||
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IBUF_ST);
|
||||
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
|
||||
ibuffer_stalls += ibuffer_stalls_per_core;
|
||||
}
|
||||
// scrb_stalls
|
||||
{
|
||||
uint64_t scrb_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ST);
|
||||
uint64_t scrb_alu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_ALU);
|
||||
uint64_t scrb_fpu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_FPU);
|
||||
uint64_t scrb_lsu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_LSU);
|
||||
uint64_t scrb_sfu_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SCRB_SFU);
|
||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
|
||||
scrb_alu += scrb_alu_per_core;
|
||||
scrb_fpu += scrb_fpu_per_core;
|
||||
scrb_lsu += scrb_lsu_per_core;
|
||||
scrb_sfu += scrb_sfu_per_core;
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
|
||||
calcAvgPercent(scrb_alu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_fpu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_lsu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_sfu_per_core, scrb_total));
|
||||
scrb_stalls += scrb_stalls_per_core;
|
||||
}
|
||||
// alu_stalls
|
||||
{
|
||||
uint64_t alu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_ALU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
|
||||
alu_stalls += alu_stalls_per_core;
|
||||
}
|
||||
// lsu_stalls
|
||||
{
|
||||
uint64_t lsu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LSU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
|
||||
lsu_stalls += lsu_stalls_per_core;
|
||||
}
|
||||
// fpu_stalls
|
||||
{
|
||||
uint64_t fpu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_FPU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
|
||||
fpu_stalls += fpu_stalls_per_core;
|
||||
}
|
||||
// sfu_stalls
|
||||
{
|
||||
uint64_t sfu_stalls_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_SFU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: sfu unit stalls=%ld\n", core_id, sfu_stalls_per_core);
|
||||
sfu_stalls += sfu_stalls_per_core;
|
||||
}
|
||||
// PERF: memory
|
||||
// ifetches
|
||||
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
||||
ifetches += ifetches_per_core;
|
||||
{
|
||||
uint64_t ifetches_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
||||
ifetches += ifetches_per_core;
|
||||
|
||||
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
|
||||
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
ifetch_lat += ifetch_lat_per_core;
|
||||
}
|
||||
// loads
|
||||
uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||
loads += loads_per_core;
|
||||
{
|
||||
uint64_t loads_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOADS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||
loads += loads_per_core;
|
||||
|
||||
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
|
||||
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
load_lat += load_lat_per_core;
|
||||
}
|
||||
// stores
|
||||
uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
|
||||
stores += stores_per_core;
|
||||
// ifetch latency
|
||||
uint64_t ifetch_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_IFETCH_LAT);
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAvgLatency(ifetch_lat_per_core, ifetches_per_core);
|
||||
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
{
|
||||
uint64_t stores_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_STORES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
|
||||
stores += stores_per_core;
|
||||
}
|
||||
ifetch_lat += ifetch_lat_per_core;
|
||||
// load latency
|
||||
uint64_t load_lat_per_core = get_csr_64(staging_buf.data(), VX_CSR_MPM_LOAD_LAT);
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAvgLatency(load_lat_per_core, loads_per_core);
|
||||
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
}
|
||||
load_lat += load_lat_per_core;
|
||||
} break;
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
if (smem_enable) {
|
||||
// PERF: smem
|
||||
uint64_t smem_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_READS);
|
||||
uint64_t smem_writes = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_WRITES);
|
||||
uint64_t smem_bank_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_SMEM_BANK_ST);
|
||||
int smem_bank_utilization = calcUtilization(smem_reads + smem_writes, smem_bank_stalls);
|
||||
int smem_bank_utilization = calcAvgPercent(smem_reads + smem_writes, smem_reads + smem_writes + smem_bank_stalls);
|
||||
fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads);
|
||||
fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes);
|
||||
fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_stalls, smem_bank_utilization);
|
||||
|
@ -330,9 +379,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// PERF: Icache
|
||||
uint64_t icache_reads = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_READS);
|
||||
uint64_t icache_read_misses = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MISS_R);
|
||||
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
|
||||
uint64_t icache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_ICACHE_MSHR_ST);
|
||||
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
|
||||
int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls);
|
||||
fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads);
|
||||
fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
if (dcache_enable) {
|
||||
|
@ -345,13 +397,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t dcache_mshr_stalls = get_csr_64(staging_buf.data(), VX_CSR_MPM_DCACHE_MSHR_ST);
|
||||
int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
|
||||
int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
|
||||
int dcache_bank_utilization = calcUtilization(dcache_reads + dcache_writes, dcache_bank_stalls);
|
||||
int dcache_bank_utilization = calcAvgPercent(dcache_reads + dcache_writes, dcache_reads + dcache_writes + dcache_bank_stalls);
|
||||
int mshr_utilization = calcAvgPercent(dcache_read_misses + dcache_write_misses, dcache_read_misses + dcache_write_misses + dcache_mshr_stalls);
|
||||
fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads);
|
||||
fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes);
|
||||
fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization);
|
||||
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_stalls);
|
||||
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
if (l2cache_enable) {
|
||||
|
@ -386,8 +439,6 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
}
|
||||
#endif
|
||||
|
||||
uint64_t instrs_per_core = get_csr_64(staging_buf.data(), VX_CSR_MINSTRET);
|
||||
uint64_t cycles_per_core = get_csr_64(staging_buf.data(), VX_CSR_MCYCLE);
|
||||
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
||||
instrs += instrs_per_core;
|
||||
|
@ -397,10 +448,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
#ifdef PERF_ENABLE
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
int scheduler_percent = calcAvgPercent(scheduler_stalls, cycles);
|
||||
int fetch_percent = calcAvgPercent(fetch_stalls, cycles);
|
||||
int ibuffer_percent = calcAvgPercent(ibuffer_stalls, cycles);
|
||||
int ifetch_avg_lat = (int)(double(ifetch_lat) / double(ifetches));
|
||||
int load_avg_lat = (int)(double(load_lat) / double(loads));
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
|
||||
uint64_t scrb_total = scrb_alu + scrb_fpu + scrb_lsu + scrb_sfu;
|
||||
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", scheduler_stalls, scheduler_percent);
|
||||
fprintf(stream, "PERF: fetch stalls=%ld (%d%%)\n", fetch_stalls, fetch_percent);
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
|
||||
calcAvgPercent(scrb_alu, scrb_total),
|
||||
calcAvgPercent(scrb_fpu, scrb_total),
|
||||
calcAvgPercent(scrb_lsu, scrb_total),
|
||||
calcAvgPercent(scrb_sfu, scrb_total));
|
||||
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
|
||||
fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
|
||||
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
|
||||
|
@ -419,31 +480,32 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
l2cache_write_misses /= num_cores;
|
||||
l2cache_bank_stalls /= num_cores;
|
||||
l2cache_mshr_stalls /= num_cores;
|
||||
int l2cache_read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads);
|
||||
int l2cache_write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes);
|
||||
int l2cache_bank_utilization = calcUtilization(l2cache_reads + l2cache_writes, l2cache_bank_stalls);
|
||||
|
||||
int read_hit_ratio = calcRatio(l2cache_read_misses, l2cache_reads);
|
||||
int write_hit_ratio = calcRatio(l2cache_write_misses, l2cache_writes);
|
||||
int bank_utilization = calcAvgPercent(l2cache_reads + l2cache_writes, l2cache_reads + l2cache_writes + l2cache_bank_stalls);
|
||||
int mshr_utilization = calcAvgPercent(l2cache_read_misses + l2cache_write_misses, l2cache_read_misses + l2cache_write_misses + l2cache_mshr_stalls);
|
||||
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
|
||||
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
|
||||
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, l2cache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, l2cache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, l2cache_bank_utilization);
|
||||
fprintf(stream, "PERF: l2cache mshr stalls=%ld\n", l2cache_mshr_stalls);
|
||||
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, read_hit_ratio);
|
||||
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, write_hit_ratio);
|
||||
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, bank_utilization);
|
||||
fprintf(stream, "PERF: l2cache mshr stalls=%ld (utilization=%d%%)\n", l2cache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
if (l3cache_enable) {
|
||||
int l3cache_read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
|
||||
int l3cache_write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
|
||||
int l3cache_bank_utilization = calcUtilization(l3cache_reads + l3cache_writes, l3cache_bank_stalls);
|
||||
int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
|
||||
int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
|
||||
int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
|
||||
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
|
||||
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
|
||||
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
|
||||
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, l3cache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, l3cache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, l3cache_bank_utilization);
|
||||
fprintf(stream, "PERF: l3cache mshr stalls=%ld\n", l3cache_mshr_stalls);
|
||||
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);
|
||||
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio);
|
||||
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, bank_utilization);
|
||||
fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
int mem_avg_lat = caclAvgLatency(mem_lat, mem_reads);
|
||||
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
|
||||
} break;
|
||||
|
|
|
@ -45,20 +45,20 @@ public:
|
|||
|
||||
char sname[100];
|
||||
|
||||
std::vector<Switch<MemReq, MemRsp>::Ptr> unit_arbs(num_units);
|
||||
std::vector<MemSwitch::Ptr> unit_arbs(num_units);
|
||||
for (uint32_t u = 0; u < num_units; ++u) {
|
||||
snprintf(sname, 100, "%s-unit-arb-%d", name, u);
|
||||
unit_arbs.at(u) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
|
||||
unit_arbs.at(u) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, config.num_inputs);
|
||||
for (uint32_t i = 0; i < num_requests; ++i) {
|
||||
this->CoreReqPorts.at(u).at(i).bind(&unit_arbs.at(u)->ReqIn.at(i));
|
||||
unit_arbs.at(u)->RspIn.at(i).bind(&this->CoreRspPorts.at(u).at(i));
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<Switch<MemReq, MemRsp>::Ptr> mem_arbs(config.num_inputs);
|
||||
std::vector<MemSwitch::Ptr> mem_arbs(config.num_inputs);
|
||||
for (uint32_t i = 0; i < config.num_inputs; ++i) {
|
||||
snprintf(sname, 100, "%s-mem-arb-%d", name, i);
|
||||
mem_arbs.at(i) = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
|
||||
mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_units, num_caches);
|
||||
for (uint32_t u = 0; u < num_units; ++u) {
|
||||
unit_arbs.at(u)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(u));
|
||||
mem_arbs.at(i)->RspIn.at(u).bind(&unit_arbs.at(u)->RspOut.at(i));
|
||||
|
@ -66,7 +66,7 @@ public:
|
|||
}
|
||||
|
||||
snprintf(sname, 100, "%s-cache-arb", name);
|
||||
auto cache_arb = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
|
||||
auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1);
|
||||
|
||||
for (uint32_t i = 0; i < num_caches; ++i) {
|
||||
snprintf(sname, 100, "%s-cache%d", name, i);
|
||||
|
|
|
@ -41,19 +41,16 @@ struct params_t {
|
|||
uint32_t tag_select_addr_end;
|
||||
|
||||
params_t(const CacheSim::Config& config) {
|
||||
int32_t bank_bits = log2ceil(config.num_banks);
|
||||
int32_t offset_bits = config.B - config.W;
|
||||
int32_t log2_bank_size = config.C - bank_bits;
|
||||
int32_t index_bits = log2_bank_size - (config.B + config.A);
|
||||
assert(log2_bank_size > 0);
|
||||
int32_t offset_bits = config.L - config.W;
|
||||
int32_t index_bits = config.C - (config.L + config.A + config.B);
|
||||
assert(offset_bits >= 0);
|
||||
assert(index_bits >= 0);
|
||||
|
||||
this->log2_num_inputs = log2ceil(config.num_inputs);
|
||||
|
||||
this->words_per_line = 1 << offset_bits;
|
||||
this->sets_per_bank = 1 << index_bits;
|
||||
this->lines_per_set = 1 << config.A;
|
||||
this->sets_per_bank = 1 << index_bits;
|
||||
this->words_per_line = 1 << offset_bits;
|
||||
|
||||
assert(config.ports_per_bank <= this->words_per_line);
|
||||
|
||||
|
@ -63,7 +60,7 @@ struct params_t {
|
|||
|
||||
// Bank select
|
||||
this->bank_select_addr_start = (1+this->word_select_addr_end);
|
||||
this->bank_select_addr_end = (this->bank_select_addr_start+bank_bits-1);
|
||||
this->bank_select_addr_end = (this->bank_select_addr_start+config.B-1);
|
||||
|
||||
// Set select
|
||||
this->set_select_addr_start = (1+this->bank_select_addr_end);
|
||||
|
@ -74,23 +71,23 @@ struct params_t {
|
|||
this->tag_select_addr_end = (config.addr_width-1);
|
||||
}
|
||||
|
||||
uint32_t addr_bank_id(uint64_t word_addr) const {
|
||||
uint32_t addr_bank_id(uint64_t addr) const {
|
||||
if (bank_select_addr_end >= bank_select_addr_start)
|
||||
return (uint32_t)bit_getw(word_addr, bank_select_addr_start, bank_select_addr_end);
|
||||
return (uint32_t)bit_getw(addr, bank_select_addr_start, bank_select_addr_end);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t addr_set_id(uint64_t word_addr) const {
|
||||
uint32_t addr_set_id(uint64_t addr) const {
|
||||
if (set_select_addr_end >= set_select_addr_start)
|
||||
return (uint32_t)bit_getw(word_addr, set_select_addr_start, set_select_addr_end);
|
||||
return (uint32_t)bit_getw(addr, set_select_addr_start, set_select_addr_end);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t addr_tag(uint64_t word_addr) const {
|
||||
uint64_t addr_tag(uint64_t addr) const {
|
||||
if (tag_select_addr_end >= tag_select_addr_start)
|
||||
return bit_getw(word_addr, tag_select_addr_start, tag_select_addr_end);
|
||||
return bit_getw(addr, tag_select_addr_start, tag_select_addr_end);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
@ -288,8 +285,8 @@ private:
|
|||
Config config_;
|
||||
params_t params_;
|
||||
std::vector<bank_t> banks_;
|
||||
Switch<MemReq, MemRsp>::Ptr bank_switch_;
|
||||
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
|
||||
MemSwitch::Ptr bank_switch_;
|
||||
MemSwitch::Ptr bypass_switch_;
|
||||
std::vector<SimPort<MemReq>> mem_req_ports_;
|
||||
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
|
||||
std::vector<bank_req_t> pipeline_reqs_;
|
||||
|
@ -304,16 +301,16 @@ public:
|
|||
: simobject_(simobject)
|
||||
, config_(config)
|
||||
, params_(config)
|
||||
, banks_(config.num_banks, {config, params_})
|
||||
, mem_req_ports_(config.num_banks, simobject)
|
||||
, mem_rsp_ports_(config.num_banks, simobject)
|
||||
, pipeline_reqs_(config.num_banks, config.ports_per_bank)
|
||||
, banks_((1 << config.B), {config, params_})
|
||||
, mem_req_ports_((1 << config.B), simobject)
|
||||
, mem_rsp_ports_((1 << config.B), simobject)
|
||||
, pipeline_reqs_((1 << config.B), config.ports_per_bank)
|
||||
{
|
||||
char sname[100];
|
||||
snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str());
|
||||
|
||||
if (config_.bypass) {
|
||||
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
|
||||
bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs);
|
||||
for (uint32_t i = 0; i < config_.num_inputs; ++i) {
|
||||
simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i));
|
||||
bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i));
|
||||
|
@ -323,14 +320,14 @@ public:
|
|||
return;
|
||||
}
|
||||
|
||||
bypass_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::Priority, 2);
|
||||
bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2);
|
||||
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort);
|
||||
simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0));
|
||||
|
||||
if (config.num_banks > 1) {
|
||||
if (config.B != 0) {
|
||||
snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str());
|
||||
bank_switch_ = Switch<MemReq, MemRsp>::Create(sname, ArbiterType::RoundRobin, config.num_banks);
|
||||
for (uint32_t i = 0, n = config.num_banks; i < n; ++i) {
|
||||
bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B));
|
||||
for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) {
|
||||
mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i));
|
||||
bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i));
|
||||
}
|
||||
|
@ -383,20 +380,22 @@ public:
|
|||
pipeline_req.clear();
|
||||
}
|
||||
|
||||
// schedule MSHR replay
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
// first: schedule MSHR replay (flush MSHR queue)
|
||||
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& pipeline_req = pipeline_reqs_.at(bank_id);
|
||||
bank.mshr.pop(&pipeline_req);
|
||||
}
|
||||
|
||||
// schedule memory fill
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
// second: schedule memory fill (flush memory queue)
|
||||
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
|
||||
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
|
||||
if (mem_rsp_port.empty())
|
||||
continue;
|
||||
|
||||
auto& pipeline_req = pipeline_reqs_.at(bank_id);
|
||||
|
||||
// skip if bank already busy
|
||||
if (pipeline_req.type != bank_req_t::None)
|
||||
continue;
|
||||
|
||||
|
@ -407,7 +406,7 @@ public:
|
|||
mem_rsp_port.pop();
|
||||
}
|
||||
|
||||
// schedule core requests
|
||||
// last: schedule core requests (flush core queue)
|
||||
for (uint32_t req_id = 0, n = config_.num_inputs; req_id < n; ++req_id) {
|
||||
auto& core_req_port = simobject_->CoreReqPorts.at(req_id);
|
||||
if (core_req_port.empty())
|
||||
|
@ -425,18 +424,21 @@ public:
|
|||
}
|
||||
|
||||
auto bank_id = params_.addr_bank_id(core_req.addr);
|
||||
auto set_id = params_.addr_set_id(core_req.addr);
|
||||
auto tag = params_.addr_tag(core_req.addr);
|
||||
auto port_id = req_id % config_.ports_per_bank;
|
||||
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto& pipeline_req = pipeline_reqs_.at(bank_id);
|
||||
|
||||
// skip if bank already busy
|
||||
if (pipeline_req.type != bank_req_t::None)
|
||||
continue;
|
||||
|
||||
auto set_id = params_.addr_set_id(core_req.addr);
|
||||
auto tag = params_.addr_tag(core_req.addr);
|
||||
auto port_id = req_id % config_.ports_per_bank;
|
||||
|
||||
// check MSHR capacity
|
||||
if ((!core_req.write || !config_.write_through)
|
||||
&& bank.mshr.full()) {
|
||||
++perf_stats_.mshr_stalls;
|
||||
++perf_stats_.bank_stalls;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -452,7 +454,7 @@ public:
|
|||
}
|
||||
// extend request ports
|
||||
pipeline_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
|
||||
} else if (pipeline_req.type == bank_req_t::None) {
|
||||
} else {
|
||||
// schedule new request
|
||||
bank_req_t bank_req(config_.ports_per_bank);
|
||||
bank_req.ports.at(port_id) = bank_req_port_t{req_id, core_req.tag, true};
|
||||
|
@ -463,10 +465,6 @@ public:
|
|||
bank_req.type = bank_req_t::Core;
|
||||
bank_req.write = core_req.write;
|
||||
pipeline_req = bank_req;
|
||||
} else {
|
||||
// bank in use
|
||||
++perf_stats_.bank_stalls;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (core_req.write)
|
||||
|
@ -516,7 +514,7 @@ private:
|
|||
}
|
||||
|
||||
void processBankRequests() {
|
||||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
for (uint32_t bank_id = 0, n = (1 << config_.B); bank_id < n; ++bank_id) {
|
||||
auto& bank = banks_.at(bank_id);
|
||||
auto pipeline_req = pipeline_reqs_.at(bank_id);
|
||||
|
||||
|
@ -545,11 +543,10 @@ private:
|
|||
}
|
||||
}
|
||||
} break;
|
||||
case bank_req_t::Core: {
|
||||
bool hit = false;
|
||||
bool found_free_line = false;
|
||||
uint32_t hit_line_id = 0;
|
||||
uint32_t repl_line_id = 0;
|
||||
case bank_req_t::Core: {
|
||||
int32_t hit_line_id = -1;
|
||||
int32_t free_line_id = -1;
|
||||
int32_t repl_line_id = 0;
|
||||
uint32_t max_cnt = 0;
|
||||
|
||||
auto& set = bank.sets.at(pipeline_req.set_id);
|
||||
|
@ -557,38 +554,34 @@ private:
|
|||
// tag lookup
|
||||
for (uint32_t i = 0, n = set.lines.size(); i < n; ++i) {
|
||||
auto& line = set.lines.at(i);
|
||||
if (max_cnt < line.lru_ctr) {
|
||||
max_cnt = line.lru_ctr;
|
||||
repl_line_id = i;
|
||||
}
|
||||
if (line.valid) {
|
||||
if (line.tag == pipeline_req.tag) {
|
||||
line.lru_ctr = 0;
|
||||
if (line.tag == pipeline_req.tag) {
|
||||
hit_line_id = i;
|
||||
hit = true;
|
||||
line.lru_ctr = 0;
|
||||
} else {
|
||||
++line.lru_ctr;
|
||||
}
|
||||
if (max_cnt < line.lru_ctr) {
|
||||
max_cnt = line.lru_ctr;
|
||||
repl_line_id = i;
|
||||
}
|
||||
} else {
|
||||
found_free_line = true;
|
||||
repl_line_id = i;
|
||||
free_line_id = i;
|
||||
}
|
||||
}
|
||||
|
||||
if (hit) {
|
||||
//
|
||||
// Hit handling
|
||||
//
|
||||
if (hit_line_id != -1) {
|
||||
// Hit handling
|
||||
if (pipeline_req.write) {
|
||||
// handle write hit
|
||||
// handle write has_hit
|
||||
auto& hit_line = set.lines.at(hit_line_id);
|
||||
if (config_.write_through) {
|
||||
// forward write request to memory
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, hit_line.tag);
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-dram-" << mem_req);
|
||||
} else {
|
||||
|
@ -606,23 +599,21 @@ private:
|
|||
DT(3, simobject_->name() << "-core-" << core_rsp);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//
|
||||
// Miss handling
|
||||
//
|
||||
} else {
|
||||
// Miss handling
|
||||
if (pipeline_req.write)
|
||||
++perf_stats_.write_misses;
|
||||
else
|
||||
++perf_stats_.read_misses;
|
||||
|
||||
if (!found_free_line && !config_.write_through) {
|
||||
if (free_line_id == -1 && !config_.write_through) {
|
||||
// write back dirty line
|
||||
auto& repl_line = set.lines.at(repl_line_id);
|
||||
if (repl_line.dirty) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_line.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-dram-" << mem_req);
|
||||
++perf_stats_.evictions;
|
||||
|
@ -635,8 +626,8 @@ private:
|
|||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = true;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-dram-" << mem_req);
|
||||
}
|
||||
|
@ -655,7 +646,7 @@ private:
|
|||
auto mshr_pending = bank.mshr.lookup(pipeline_req);
|
||||
|
||||
// allocate MSHR
|
||||
auto mshr_id = bank.mshr.allocate(pipeline_req, repl_line_id);
|
||||
auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id);
|
||||
|
||||
// send fill request
|
||||
if (!mshr_pending) {
|
||||
|
@ -663,8 +654,8 @@ private:
|
|||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = false;
|
||||
mem_req.tag = mshr_id;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req.cid = pipeline_req.cid;
|
||||
mem_req.uuid = pipeline_req.uuid;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
DT(3, simobject_->name() << "-dram-" << mem_req);
|
||||
++pending_fill_reqs_;
|
||||
|
|
|
@ -23,16 +23,15 @@ public:
|
|||
struct Config {
|
||||
bool bypass; // cache bypass
|
||||
uint8_t C; // log2 cache size
|
||||
uint8_t B; // log2 block size
|
||||
uint8_t L; // log2 line size
|
||||
uint8_t W; // log2 word size
|
||||
uint8_t A; // log2 associativity
|
||||
uint8_t addr_width; // word address bits
|
||||
uint8_t num_banks; // number of banks
|
||||
uint8_t B; // log2 number of banks
|
||||
uint8_t addr_width; // word address bits
|
||||
uint8_t ports_per_bank; // number of ports per bank
|
||||
uint8_t num_inputs; // number of inputs
|
||||
bool write_through; // is write-through
|
||||
bool write_reponse; // enable write response
|
||||
uint16_t victim_size; // victim cache size
|
||||
uint16_t mshr_size; // MSHR buffer size
|
||||
uint8_t latency; // pipeline latency
|
||||
};
|
||||
|
|
|
@ -36,16 +36,15 @@ Cluster::Cluster(const SimContext& ctx,
|
|||
l2cache_ = CacheSim::Create(sname, CacheSim::Config{
|
||||
!L2_ENABLED,
|
||||
log2ceil(L2_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
log2ceil(MEM_BLOCK_SIZE), // L
|
||||
log2ceil(L2_NUM_WAYS), // W
|
||||
0, // A
|
||||
log2ceil(L2_NUM_BANKS), // B
|
||||
XLEN, // address bits
|
||||
L2_NUM_BANKS, // number of banks
|
||||
1, // number of ports
|
||||
5, // request size
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
L2_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
});
|
||||
|
@ -57,16 +56,15 @@ Cluster::Cluster(const SimContext& ctx,
|
|||
icaches_ = CacheCluster::Create(sname, num_cores, NUM_ICACHES, 1, CacheSim::Config{
|
||||
!ICACHE_ENABLED,
|
||||
log2ceil(ICACHE_SIZE), // C
|
||||
log2ceil(L1_LINE_SIZE), // B
|
||||
log2ceil(L1_LINE_SIZE), // L
|
||||
log2ceil(sizeof(uint32_t)), // W
|
||||
log2ceil(ICACHE_NUM_WAYS),// A
|
||||
XLEN, // address bits
|
||||
1, // number of banks
|
||||
1, // B
|
||||
XLEN, // address bits
|
||||
1, // number of ports
|
||||
1, // number of inputs
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
(uint8_t)arch.num_warps(), // mshr
|
||||
2, // pipeline latency
|
||||
});
|
||||
|
@ -78,16 +76,15 @@ Cluster::Cluster(const SimContext& ctx,
|
|||
dcaches_ = CacheCluster::Create(sname, num_cores, NUM_DCACHES, NUM_LSU_LANES, CacheSim::Config{
|
||||
!DCACHE_ENABLED,
|
||||
log2ceil(DCACHE_SIZE), // C
|
||||
log2ceil(L1_LINE_SIZE), // B
|
||||
log2ceil(L1_LINE_SIZE), // L
|
||||
log2ceil(sizeof(Word)), // W
|
||||
log2ceil(DCACHE_NUM_WAYS),// A
|
||||
XLEN, // address bits
|
||||
DCACHE_NUM_BANKS, // number of banks
|
||||
log2ceil(DCACHE_NUM_BANKS), // B
|
||||
XLEN, // address bits
|
||||
1, // number of ports
|
||||
DCACHE_NUM_BANKS, // number of inputs
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
DCACHE_MSHR_SIZE, // mshr
|
||||
4, // pipeline latency
|
||||
});
|
||||
|
@ -129,11 +126,11 @@ Cluster::Cluster(const SimContext& ctx,
|
|||
cores_.at(i)->dcache_req_ports.at(j).bind(&smem_demux->ReqIn);
|
||||
smem_demux->RspIn.bind(&cores_.at(i)->dcache_rsp_ports.at(j));
|
||||
|
||||
smem_demux->ReqDc.bind(&dcaches_->CoreReqPorts.at(i).at(j));
|
||||
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDc);
|
||||
smem_demux->ReqDC.bind(&dcaches_->CoreReqPorts.at(i).at(j));
|
||||
dcaches_->CoreRspPorts.at(i).at(j).bind(&smem_demux->RspDC);
|
||||
|
||||
smem_demux->ReqSm.bind(&sharedmems_.at(i)->Inputs.at(j));
|
||||
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSm);
|
||||
smem_demux->ReqSM.bind(&sharedmems_.at(i)->Inputs.at(j));
|
||||
sharedmems_.at(i)->Outputs.at(j).bind(&smem_demux->RspSM);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,19 +45,21 @@ Core::Core(const SimContext& ctx,
|
|||
, warps_(arch.num_warps())
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
, fcsrs_(arch.num_warps(), 0)
|
||||
, ibuffers_(ISSUE_WIDTH, IBUF_SIZE)
|
||||
, ibuffers_(arch.num_warps(), IBUF_SIZE)
|
||||
, scoreboard_(arch_)
|
||||
, operands_(ISSUE_WIDTH)
|
||||
, dispatchers_((uint32_t)ExeType::MAX)
|
||||
, exe_units_((uint32_t)ExeType::MAX)
|
||||
, dispatchers_((uint32_t)ExeType::ExeTypeCount)
|
||||
, exe_units_((uint32_t)ExeType::ExeTypeCount)
|
||||
, sharedmem_(sharedmem)
|
||||
, fetch_latch_("fetch")
|
||||
, decode_latch_("decode")
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, committed_traces_(ISSUE_WIDTH, nullptr)
|
||||
, csrs_(arch.num_warps())
|
||||
, cluster_(cluster)
|
||||
{
|
||||
, commit_arbs_(ISSUE_WIDTH)
|
||||
{
|
||||
char sname[100];
|
||||
|
||||
for (uint32_t i = 0; i < arch_.num_warps(); ++i) {
|
||||
csrs_.at(i).resize(arch.num_threads());
|
||||
}
|
||||
|
@ -82,6 +84,16 @@ Core::Core(const SimContext& ctx,
|
|||
exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().create_object<LsuUnit>(this);
|
||||
exe_units_.at((int)ExeType::SFU) = SimPlatform::instance().create_object<SfuUnit>(this);
|
||||
|
||||
// bind commit arbiters
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
snprintf(sname, 100, "commit-arb%d", i);
|
||||
auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)ExeType::ExeTypeCount, 1);
|
||||
for (uint32_t j = 0; j < (uint32_t)ExeType::ExeTypeCount; ++j) {
|
||||
exe_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j));
|
||||
}
|
||||
commit_arbs_.at(i) = arbiter;
|
||||
}
|
||||
|
||||
this->reset();
|
||||
}
|
||||
|
||||
|
@ -99,8 +111,12 @@ void Core::reset() {
|
|||
for (auto& exe_unit : exe_units_) {
|
||||
exe_unit->reset();
|
||||
}
|
||||
|
||||
for (auto& commit_arb : commit_arbs_) {
|
||||
commit_arb->reset();
|
||||
}
|
||||
|
||||
for ( auto& barrier : barriers_) {
|
||||
for (auto& barrier : barriers_) {
|
||||
barrier.reset();
|
||||
}
|
||||
|
||||
|
@ -112,7 +128,7 @@ void Core::reset() {
|
|||
ibuf.clear();
|
||||
}
|
||||
|
||||
commit_exe_= 0;
|
||||
ibuffer_idx_ = 0;
|
||||
|
||||
scoreboard_.clear();
|
||||
fetch_latch_.clear();
|
||||
|
@ -150,8 +166,10 @@ void Core::schedule() {
|
|||
break;
|
||||
}
|
||||
}
|
||||
if (scheduled_warp == -1)
|
||||
if (scheduled_warp == -1) {
|
||||
++perf_stats_.sched_stalls;
|
||||
return;
|
||||
}
|
||||
|
||||
// suspend warp until decode
|
||||
stalled_warps_.set(scheduled_warp);
|
||||
|
@ -192,11 +210,11 @@ void Core::fetch() {
|
|||
mem_req.tag = pending_icache_.allocate(trace);
|
||||
mem_req.cid = trace->cid;
|
||||
mem_req.uuid = trace->uuid;
|
||||
icache_req_ports.at(0).send(mem_req, 1);
|
||||
icache_req_ports.at(0).send(mem_req, 2);
|
||||
DT(3, "icache-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||
fetch_latch_.pop();
|
||||
++pending_ifetches_;
|
||||
fetch_latch_.pop();
|
||||
++perf_stats_.ifetches;
|
||||
++pending_ifetches_;
|
||||
}
|
||||
|
||||
void Core::decode() {
|
||||
|
@ -206,7 +224,7 @@ void Core::decode() {
|
|||
auto trace = decode_latch_.front();
|
||||
|
||||
// check ibuffer capacity
|
||||
auto& ibuffer = ibuffers_.at(trace->wid % ISSUE_WIDTH);
|
||||
auto& ibuffer = ibuffers_.at(trace->wid);
|
||||
if (ibuffer.full()) {
|
||||
if (!trace->log_once(true)) {
|
||||
DT(3, "*** ibuffer-stall: " << *trace);
|
||||
|
@ -239,7 +257,7 @@ void Core::decode() {
|
|||
}
|
||||
|
||||
void Core::issue() {
|
||||
// operands to dispatch
|
||||
// operands to dispatchers
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto& operand = operands_.at(i);
|
||||
if (operand->Output.empty())
|
||||
|
@ -257,7 +275,8 @@ void Core::issue() {
|
|||
|
||||
// issue ibuffer instructions
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto& ibuffer = ibuffers_.at(i);
|
||||
uint32_t ii = (ibuffer_idx_ + i) % ibuffers_.size();
|
||||
auto& ibuffer = ibuffers_.at(ii);
|
||||
if (ibuffer.empty())
|
||||
continue;
|
||||
|
||||
|
@ -265,17 +284,27 @@ void Core::issue() {
|
|||
|
||||
// check scoreboard
|
||||
if (scoreboard_.in_use(trace)) {
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
if (!trace->log_once(true)) {
|
||||
DTH(3, "*** scoreboard-stall: dependents={");
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
DTH(3, "*** scoreboard-stall: dependents={");
|
||||
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
||||
auto& use = uses.at(j);
|
||||
__unused (use);
|
||||
if (j) DTN(3, ", ");
|
||||
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
|
||||
DTN(3, use.reg_type << use.reg_id << "(#" << use.uuid << ")");
|
||||
}
|
||||
DTN(3, "}, " << *trace << std::endl);
|
||||
}
|
||||
for (uint32_t j = 0, n = uses.size(); j < n; ++j) {
|
||||
auto& use = uses.at(j);
|
||||
switch (use.exe_type) {
|
||||
case ExeType::ALU: ++perf_stats_.scrb_alu; break;
|
||||
case ExeType::FPU: ++perf_stats_.scrb_fpu; break;
|
||||
case ExeType::LSU: ++perf_stats_.scrb_lsu; break;
|
||||
case ExeType::SFU: ++perf_stats_.scrb_sfu; break;
|
||||
default: assert(false);
|
||||
}
|
||||
}
|
||||
++perf_stats_.scrb_stalls;
|
||||
continue;
|
||||
} else {
|
||||
|
@ -294,10 +323,11 @@ void Core::issue() {
|
|||
|
||||
ibuffer.pop();
|
||||
}
|
||||
ibuffer_idx_ += ISSUE_WIDTH;
|
||||
}
|
||||
|
||||
void Core::execute() {
|
||||
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
|
||||
for (uint32_t i = 0; i < (uint32_t)ExeType::ExeTypeCount; ++i) {
|
||||
auto& dispatch = dispatchers_.at(i);
|
||||
auto& exe_unit = exe_units_.at(i);
|
||||
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
|
||||
|
@ -313,10 +343,11 @@ void Core::execute() {
|
|||
void Core::commit() {
|
||||
// process completed instructions
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
auto trace = committed_traces_.at(i);
|
||||
if (!trace)
|
||||
auto& commit_arb = commit_arbs_.at(i);
|
||||
if (commit_arb->Outputs.at(0).empty())
|
||||
continue;
|
||||
committed_traces_.at(i) = nullptr;
|
||||
|
||||
auto trace = commit_arb->Outputs.at(0).front();
|
||||
|
||||
// advance to commit stage
|
||||
DT(3, "pipeline-commit: " << *trace);
|
||||
|
@ -334,27 +365,11 @@ void Core::commit() {
|
|||
perf_stats_.instrs += trace->tmask.count();
|
||||
}
|
||||
|
||||
commit_arb->Outputs.at(0).pop();
|
||||
|
||||
// delete the trace
|
||||
delete trace;
|
||||
}
|
||||
|
||||
// select completed instructions
|
||||
for (uint32_t i = 0; i < (uint32_t)ExeType::MAX; ++i) {
|
||||
uint32_t ii = (commit_exe_ + i) % (uint32_t)ExeType::MAX;
|
||||
auto& exe_unit = exe_units_.at(ii);
|
||||
for (uint32_t j = 0; j < ISSUE_WIDTH; ++j) {
|
||||
auto committed_trace = committed_traces_.at(j);
|
||||
if (committed_trace)
|
||||
continue;
|
||||
auto& output = exe_unit->Outputs.at(j);
|
||||
if (output.empty())
|
||||
continue;
|
||||
auto trace = output.front();
|
||||
committed_traces_.at(j) = trace;
|
||||
output.pop();
|
||||
}
|
||||
}
|
||||
++commit_exe_;
|
||||
}
|
||||
|
||||
void Core::wspawn(uint32_t num_warps, Word nextPC) {
|
||||
|
@ -533,6 +548,10 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
break;
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
switch (addr) {
|
||||
case VX_CSR_MPM_SCHED_ST: return perf_stats_.sched_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SCHED_ST_H:return perf_stats_.sched_stalls >> 32;
|
||||
case VX_CSR_MPM_FETCH_ST: return perf_stats_.fetch_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_FETCH_ST_H:return perf_stats_.fetch_stalls >> 32;
|
||||
case VX_CSR_MPM_IBUF_ST: return perf_stats_.ibuf_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_IBUF_ST_H: return perf_stats_.ibuf_stalls >> 32;
|
||||
case VX_CSR_MPM_SCRB_ST: return perf_stats_.scrb_stalls & 0xffffffff;
|
||||
|
@ -545,6 +564,14 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_MPM_FPU_ST_H: return perf_stats_.fpu_stalls >> 32;
|
||||
case VX_CSR_MPM_SFU_ST: return perf_stats_.sfu_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SFU_ST_H: return perf_stats_.sfu_stalls >> 32;
|
||||
case VX_CSR_MPM_SCRB_ALU: return perf_stats_.scrb_alu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_ALU_H:return perf_stats_.scrb_alu >> 32;
|
||||
case VX_CSR_MPM_SCRB_FPU: return perf_stats_.scrb_fpu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_FPU_H:return perf_stats_.scrb_fpu >> 32;
|
||||
case VX_CSR_MPM_SCRB_LSU: return perf_stats_.scrb_lsu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_LSU_H:return perf_stats_.scrb_lsu >> 32;
|
||||
case VX_CSR_MPM_SCRB_SFU: return perf_stats_.scrb_sfu & 0xffffffff;
|
||||
case VX_CSR_MPM_SCRB_SFU_H:return perf_stats_.scrb_sfu >> 32;
|
||||
|
||||
case VX_CSR_MPM_IFETCHES: return perf_stats_.ifetches & 0xffffffff;
|
||||
case VX_CSR_MPM_IFETCHES_H: return perf_stats_.ifetches >> 32;
|
||||
|
@ -561,30 +588,25 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
auto proc_perf = cluster_->processor()->perf_stats();
|
||||
switch (addr) {
|
||||
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
|
||||
case VX_CSR_MPM_ICACHE_READS: return proc_perf.clusters.icache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_READS_H: return proc_perf.clusters.icache.reads >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R: return proc_perf.clusters.icache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MISS_R_H: return proc_perf.clusters.icache.read_misses >> 32;
|
||||
case VX_CSR_MPM_ICACHE_MSHR_ST: return proc_perf.clusters.icache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return proc_perf.clusters.icache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32;
|
||||
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST_H:return proc_perf.clusters.dcache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST_H:return proc_perf.clusters.dcache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
|
||||
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
|
||||
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_BANK_ST_H:return proc_perf.clusters.sharedmem.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_DCACHE_READS: return proc_perf.clusters.dcache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_READS_H: return proc_perf.clusters.dcache.reads >> 32;
|
||||
case VX_CSR_MPM_DCACHE_WRITES: return proc_perf.clusters.dcache.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_WRITES_H: return proc_perf.clusters.dcache.writes >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R: return proc_perf.clusters.dcache.read_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_R_H: return proc_perf.clusters.dcache.read_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W: return proc_perf.clusters.dcache.write_misses & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MISS_W_H: return proc_perf.clusters.dcache.write_misses >> 32;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST: return proc_perf.clusters.dcache.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_BANK_ST_H: return proc_perf.clusters.dcache.bank_stalls >> 32;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST: return proc_perf.clusters.dcache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return proc_perf.clusters.dcache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_L2CACHE_READS: return proc_perf.clusters.l2cache.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_L2CACHE_READS_H: return proc_perf.clusters.l2cache.reads >> 32;
|
||||
|
@ -612,12 +634,19 @@ uint32_t Core::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_L3CACHE_MSHR_ST_H:return proc_perf.l3cache.mshr_stalls >> 32;
|
||||
|
||||
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
||||
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_WRITES_H:return proc_perf.mem_writes >> 32;
|
||||
case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32;
|
||||
case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
|
||||
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
|
||||
case VX_CSR_MPM_MEM_LAT: return proc_perf.mem_latency & 0xffffffff;
|
||||
case VX_CSR_MPM_MEM_LAT_H: return proc_perf.mem_latency >> 32;
|
||||
|
||||
case VX_CSR_MPM_SMEM_READS: return proc_perf.clusters.sharedmem.reads & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_READS_H: return proc_perf.clusters.sharedmem.reads >> 32;
|
||||
case VX_CSR_MPM_SMEM_WRITES: return proc_perf.clusters.sharedmem.writes & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_WRITES_H: return proc_perf.clusters.sharedmem.writes >> 32;
|
||||
case VX_CSR_MPM_SMEM_BANK_ST: return proc_perf.clusters.sharedmem.bank_stalls & 0xffffffff;
|
||||
case VX_CSR_MPM_SMEM_BANK_ST_H: return proc_perf.clusters.sharedmem.bank_stalls >> 32;
|
||||
}
|
||||
} break;
|
||||
}
|
||||
|
|
|
@ -22,11 +22,11 @@
|
|||
#include <memory>
|
||||
#include <set>
|
||||
#include <simobject.h>
|
||||
#include <mem.h>
|
||||
#include "debug.h"
|
||||
#include "types.h"
|
||||
#include "arch.h"
|
||||
#include "decode.h"
|
||||
#include "mem.h"
|
||||
#include "warp.h"
|
||||
#include "pipeline.h"
|
||||
#include "cache_sim.h"
|
||||
|
@ -42,17 +42,25 @@ namespace vortex {
|
|||
|
||||
class Cluster;
|
||||
|
||||
using TraceSwitch = Mux<pipeline_trace_t*>;
|
||||
|
||||
class Core : public SimObject<Core> {
|
||||
public:
|
||||
struct PerfStats {
|
||||
uint64_t cycles;
|
||||
uint64_t instrs;
|
||||
uint64_t sched_stalls;
|
||||
uint64_t fetch_stalls;
|
||||
uint64_t ibuf_stalls;
|
||||
uint64_t scrb_stalls;
|
||||
uint64_t alu_stalls;
|
||||
uint64_t lsu_stalls;
|
||||
uint64_t fpu_stalls;
|
||||
uint64_t sfu_stalls;
|
||||
uint64_t scrb_alu;
|
||||
uint64_t scrb_fpu;
|
||||
uint64_t scrb_lsu;
|
||||
uint64_t scrb_sfu;
|
||||
uint64_t ifetches;
|
||||
uint64_t loads;
|
||||
uint64_t stores;
|
||||
|
@ -62,12 +70,18 @@ public:
|
|||
PerfStats()
|
||||
: cycles(0)
|
||||
, instrs(0)
|
||||
, sched_stalls(0)
|
||||
, fetch_stalls(0)
|
||||
, ibuf_stalls(0)
|
||||
, scrb_stalls(0)
|
||||
, alu_stalls(0)
|
||||
, lsu_stalls(0)
|
||||
, fpu_stalls(0)
|
||||
, sfu_stalls(0)
|
||||
, scrb_alu(0)
|
||||
, scrb_fpu(0)
|
||||
, scrb_lsu(0)
|
||||
, scrb_sfu(0)
|
||||
, ifetches(0)
|
||||
, loads(0)
|
||||
, stores(0)
|
||||
|
@ -173,7 +187,6 @@ private:
|
|||
PipelineLatch decode_latch_;
|
||||
|
||||
HashTable<pipeline_trace_t*> pending_icache_;
|
||||
std::vector<pipeline_trace_t*> committed_traces_;
|
||||
WarpMask active_warps_;
|
||||
WarpMask stalled_warps_;
|
||||
uint64_t issued_instrs_;
|
||||
|
@ -190,7 +203,9 @@ private:
|
|||
|
||||
Cluster* cluster_;
|
||||
|
||||
uint32_t commit_exe_;
|
||||
std::vector<TraceSwitch::Ptr> commit_arbs_;
|
||||
|
||||
uint32_t ibuffer_idx_;
|
||||
|
||||
friend class Warp;
|
||||
friend class LsuUnit;
|
||||
|
|
|
@ -66,6 +66,7 @@ public:
|
|||
}
|
||||
auto& output = Outputs.at(i);
|
||||
auto trace = input.front();
|
||||
auto new_trace = trace;
|
||||
if (pid_count_ != 1) {
|
||||
auto start_p = start_p_.at(b);
|
||||
if (start_p == -1) {
|
||||
|
@ -81,33 +82,30 @@ public:
|
|||
end = j;
|
||||
}
|
||||
start /= num_lanes_;
|
||||
end /= num_lanes_;
|
||||
auto new_trace = new pipeline_trace_t(*trace);
|
||||
new_trace->tmask.reset();
|
||||
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
|
||||
new_trace->tmask[j] = trace->tmask[j];
|
||||
}
|
||||
new_trace->pid = start;
|
||||
new_trace->sop = (start_p == 0);
|
||||
if (start == end) {
|
||||
new_trace->eop = 1;
|
||||
end /= num_lanes_;
|
||||
if (start != end) {
|
||||
new_trace = new pipeline_trace_t(*trace);
|
||||
new_trace->eop = false;
|
||||
start_p_.at(b) = start + 1;
|
||||
} else {
|
||||
start_p_.at(b) = -1;
|
||||
input.pop();
|
||||
++block_sent;
|
||||
delete trace;
|
||||
} else {
|
||||
new_trace->eop = 0;
|
||||
start_p_.at(b) = start + 1;
|
||||
}
|
||||
output.send(new_trace, 1);
|
||||
DT(3, "pipeline-dispatch: " << *new_trace);
|
||||
}
|
||||
new_trace->pid = start;
|
||||
new_trace->sop = (0 == start_p);
|
||||
ThreadMask tmask;
|
||||
for (int j = start * num_lanes_, n = j + num_lanes_; j < n; ++j) {
|
||||
tmask[j] = trace->tmask[j];
|
||||
}
|
||||
new_trace->tmask = tmask;
|
||||
} else {
|
||||
trace->pid = 0;
|
||||
new_trace->pid = 0;
|
||||
input.pop();
|
||||
output.send(trace, 1);
|
||||
DT(3, "pipeline-dispatch: " << *trace);
|
||||
++block_sent;
|
||||
}
|
||||
}
|
||||
DT(3, "pipeline-dispatch: " << *new_trace);
|
||||
output.send(new_trace, 1);
|
||||
}
|
||||
if (block_sent == block_size_) {
|
||||
batch_idx_ = (batch_idx_ + 1) % batch_count_;
|
||||
|
@ -138,4 +136,4 @@ private:
|
|||
std::vector<int> start_p_;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,18 +32,17 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
|
|||
l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{
|
||||
!L3_ENABLED,
|
||||
log2ceil(L3_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
log2ceil(L3_NUM_WAYS), // W
|
||||
0, // A
|
||||
XLEN, // address bits
|
||||
L3_NUM_BANKS, // number of banks
|
||||
1, // number of ports
|
||||
log2ceil(MEM_BLOCK_SIZE), // L
|
||||
log2ceil(L3_NUM_WAYS), // W
|
||||
0, // A
|
||||
log2ceil(L3_NUM_BANKS), // B
|
||||
XLEN, // address bits
|
||||
1, // number of ports
|
||||
uint8_t(arch.num_clusters()), // request size
|
||||
true, // write-through
|
||||
false, // write response
|
||||
0, // victim size
|
||||
L3_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
true, // write-through
|
||||
false, // write response
|
||||
L3_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
}
|
||||
);
|
||||
|
||||
|
|
|
@ -22,9 +22,10 @@ class Scoreboard {
|
|||
public:
|
||||
|
||||
struct reg_use_t {
|
||||
RegType type;
|
||||
uint32_t reg;
|
||||
uint64_t owner;
|
||||
RegType reg_type;
|
||||
uint32_t reg_id;
|
||||
ExeType exe_type;
|
||||
uint64_t uuid;
|
||||
};
|
||||
|
||||
Scoreboard(const Arch &arch)
|
||||
|
@ -44,89 +45,81 @@ public:
|
|||
owners_.clear();
|
||||
}
|
||||
|
||||
bool in_use(pipeline_trace_t* state) const {
|
||||
return (state->used_iregs & in_use_iregs_.at(state->wid)) != 0
|
||||
|| (state->used_fregs & in_use_fregs_.at(state->wid)) != 0
|
||||
|| (state->used_vregs & in_use_vregs_.at(state->wid)) != 0;
|
||||
bool in_use(pipeline_trace_t* trace) const {
|
||||
return (trace->used_iregs & in_use_iregs_.at(trace->wid)) != 0
|
||||
|| (trace->used_fregs & in_use_fregs_.at(trace->wid)) != 0
|
||||
|| (trace->used_vregs & in_use_vregs_.at(trace->wid)) != 0;
|
||||
}
|
||||
|
||||
std::vector<reg_use_t> get_uses(pipeline_trace_t* state) const {
|
||||
std::vector<reg_use_t> out;
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_iregs = state->used_iregs & in_use_iregs_.at(state->wid);
|
||||
while (used_iregs.any()) {
|
||||
if (used_iregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Integer;
|
||||
out.push_back({RegType::Integer, r, owners_.at(tag)});
|
||||
}
|
||||
used_iregs >>= 1;
|
||||
++r;
|
||||
std::vector<reg_use_t> get_uses(pipeline_trace_t* trace) const {
|
||||
std::vector<reg_use_t> out;
|
||||
|
||||
auto used_iregs = trace->used_iregs & in_use_iregs_.at(trace->wid);
|
||||
auto used_fregs = trace->used_fregs & in_use_fregs_.at(trace->wid);
|
||||
auto used_vregs = trace->used_vregs & in_use_vregs_.at(trace->wid);
|
||||
|
||||
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
|
||||
if (used_iregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Integer, r, owner->exe_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_fregs = state->used_fregs & in_use_fregs_.at(state->wid);
|
||||
while (used_fregs.any()) {
|
||||
if (used_fregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Float;
|
||||
out.push_back({RegType::Float, r, owners_.at(tag)});
|
||||
}
|
||||
used_fregs >>= 1;
|
||||
++r;
|
||||
|
||||
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
|
||||
if (used_fregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Float, r, owner->exe_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
{
|
||||
uint32_t r = 0;
|
||||
auto used_vregs = state->used_vregs & in_use_vregs_.at(state->wid);
|
||||
while (used_vregs.any()) {
|
||||
if (used_vregs.test(0)) {
|
||||
uint32_t tag = (r << 16) | (state->wid << 4) | (int)RegType::Vector;
|
||||
out.push_back({RegType::Vector, r, owners_.at(tag)});
|
||||
}
|
||||
used_vregs >>= 1;
|
||||
++r;
|
||||
|
||||
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
|
||||
if (used_vregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Vector;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Vector, r, owner->exe_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
void reserve(pipeline_trace_t* state) {
|
||||
assert(state->wb);
|
||||
switch (state->rdest_type) {
|
||||
void reserve(pipeline_trace_t* trace) {
|
||||
assert(trace->wb);
|
||||
switch (trace->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state->wid).set(state->rdest);
|
||||
in_use_iregs_.at(trace->wid).set(trace->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state->wid).set(state->rdest);
|
||||
in_use_fregs_.at(trace->wid).set(trace->rdest);
|
||||
break;
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state->wid).set(state->rdest);
|
||||
break;
|
||||
default:
|
||||
in_use_vregs_.at(trace->wid).set(trace->rdest);
|
||||
break;
|
||||
default: assert(false);
|
||||
}
|
||||
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
|
||||
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
|
||||
assert(owners_.count(tag) == 0);
|
||||
owners_[tag] = state->uuid;
|
||||
owners_[tag] = trace;
|
||||
assert((int)trace->exe_type < 5);
|
||||
}
|
||||
|
||||
void release(pipeline_trace_t* state) {
|
||||
assert(state->wb);
|
||||
switch (state->rdest_type) {
|
||||
void release(pipeline_trace_t* trace) {
|
||||
assert(trace->wb);
|
||||
switch (trace->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(state->wid).reset(state->rdest);
|
||||
in_use_iregs_.at(trace->wid).reset(trace->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(state->wid).reset(state->rdest);
|
||||
in_use_fregs_.at(trace->wid).reset(trace->rdest);
|
||||
break;
|
||||
case RegType::Vector:
|
||||
in_use_vregs_.at(state->wid).reset(state->rdest);
|
||||
break;
|
||||
default:
|
||||
in_use_vregs_.at(trace->wid).reset(trace->rdest);
|
||||
break;
|
||||
default: assert(false);
|
||||
}
|
||||
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
|
||||
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
|
||||
owners_.erase(tag);
|
||||
}
|
||||
|
||||
|
@ -135,7 +128,7 @@ private:
|
|||
std::vector<RegMask> in_use_iregs_;
|
||||
std::vector<RegMask> in_use_fregs_;
|
||||
std::vector<RegMask> in_use_vregs_;
|
||||
std::unordered_map<uint32_t, uint64_t> owners_;
|
||||
std::unordered_map<uint32_t, pipeline_trace_t*> owners_;
|
||||
};
|
||||
|
||||
}
|
164
sim/simx/types.h
164
sim/simx/types.h
|
@ -81,7 +81,7 @@ enum class ExeType {
|
|||
LSU,
|
||||
FPU,
|
||||
SFU,
|
||||
MAX,
|
||||
ExeTypeCount
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
|
||||
|
@ -90,7 +90,7 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
|
|||
case ExeType::LSU: os << "LSU"; break;
|
||||
case ExeType::FPU: os << "FPU"; break;
|
||||
case ExeType::SFU: os << "SFU"; break;
|
||||
case ExeType::MAX: break;
|
||||
default: assert(false);
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
@ -138,7 +138,7 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
|
|||
enum class AddrType {
|
||||
Global,
|
||||
Shared,
|
||||
IO,
|
||||
IO
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
|
||||
|
@ -164,7 +164,7 @@ enum class FpuType {
|
|||
FMA,
|
||||
FDIV,
|
||||
FSQRT,
|
||||
FCVT,
|
||||
FCVT
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
|
||||
|
@ -190,7 +190,7 @@ enum class SfuType {
|
|||
CSRRW,
|
||||
CSRRS,
|
||||
CSRRC,
|
||||
CMOV
|
||||
CMOV
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const SfuType& type) {
|
||||
|
@ -351,6 +351,92 @@ private:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Type>
|
||||
class Mux : public SimObject<Mux<Type>> {
|
||||
public:
|
||||
std::vector<SimPort<Type>> Inputs;
|
||||
std::vector<SimPort<Type>> Outputs;
|
||||
|
||||
Mux(
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
ArbiterType type,
|
||||
uint32_t num_inputs,
|
||||
uint32_t num_outputs = 1,
|
||||
uint32_t delay = 1
|
||||
) : SimObject<Mux<Type>>(ctx, name)
|
||||
, Inputs(num_inputs, this)
|
||||
, Outputs(num_outputs, this)
|
||||
, type_(type)
|
||||
, delay_(delay)
|
||||
, cursors_(num_outputs, 0)
|
||||
, num_reqs_(num_inputs / num_outputs)
|
||||
{
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 32);
|
||||
assert(num_outputs <= 32);
|
||||
assert(num_inputs >= num_outputs);
|
||||
|
||||
// bypass mode
|
||||
if (num_inputs == num_outputs) {
|
||||
for (uint32_t i = 0; i < num_inputs; ++i) {
|
||||
Inputs.at(i).bind(&Outputs.at(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void reset() {
|
||||
for (auto& cursor : cursors_) {
|
||||
cursor = 0;
|
||||
}
|
||||
}
|
||||
|
||||
void tick() {
|
||||
uint32_t I = Inputs.size();
|
||||
uint32_t O = Outputs.size();
|
||||
uint32_t R = num_reqs_;
|
||||
|
||||
// skip bypass mode
|
||||
if (I == O)
|
||||
return;
|
||||
|
||||
// process inputs
|
||||
for (uint32_t o = 0; o < O; ++o) {
|
||||
for (uint32_t r = 0; r < R; ++r) {
|
||||
uint32_t i = (cursors_.at(o) + r) & (R-1);
|
||||
uint32_t j = o * R + i;
|
||||
if (j >= I)
|
||||
continue;
|
||||
|
||||
auto& req_in = Inputs.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
DT(4, this->name() << "-" << req);
|
||||
Outputs.at(o).send(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_cursor(o, i);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void update_cursor(uint32_t index, uint32_t grant) {
|
||||
if (type_ == ArbiterType::RoundRobin) {
|
||||
cursors_.at(index) = grant + 1;
|
||||
}
|
||||
}
|
||||
|
||||
ArbiterType type_;
|
||||
uint32_t delay_;
|
||||
std::vector<uint32_t> cursors_;
|
||||
uint32_t num_reqs_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Req, typename Rsp>
|
||||
class Switch : public SimObject<Switch<Req, Rsp>> {
|
||||
public:
|
||||
|
@ -364,13 +450,13 @@ public:
|
|||
const SimContext& ctx,
|
||||
const char* name,
|
||||
ArbiterType type,
|
||||
uint32_t num_inputs = 1,
|
||||
uint32_t num_inputs,
|
||||
uint32_t num_outputs = 1,
|
||||
uint32_t delay = 1
|
||||
)
|
||||
: SimObject<Switch<Req, Rsp>>(ctx, name)
|
||||
, ReqIn(num_inputs, this)
|
||||
, RspIn(num_inputs, this)
|
||||
, ReqIn(num_inputs, this)
|
||||
, RspIn(num_inputs, this)
|
||||
, ReqOut(num_outputs, this)
|
||||
, RspOut(num_outputs, this)
|
||||
, type_(type)
|
||||
|
@ -383,8 +469,8 @@ public:
|
|||
assert(num_outputs <= 32);
|
||||
assert(num_inputs >= num_outputs);
|
||||
|
||||
// bypass mode
|
||||
if (num_inputs == num_outputs) {
|
||||
// bypass mode
|
||||
for (uint32_t i = 0; i < num_inputs; ++i) {
|
||||
ReqIn.at(i).bind(&ReqOut.at(i));
|
||||
RspOut.at(i).bind(&RspIn.at(i));
|
||||
|
@ -462,14 +548,14 @@ private:
|
|||
|
||||
class SMemDemux : public SimObject<SMemDemux> {
|
||||
public:
|
||||
SimPort<MemReq> ReqIn;
|
||||
SimPort<MemRsp> RspIn;
|
||||
SimPort<MemReq> ReqIn;
|
||||
SimPort<MemRsp> RspIn;
|
||||
|
||||
SimPort<MemReq> ReqSm;
|
||||
SimPort<MemRsp> RspSm;
|
||||
SimPort<MemReq> ReqSM;
|
||||
SimPort<MemRsp> RspSM;
|
||||
|
||||
SimPort<MemReq> ReqDc;
|
||||
SimPort<MemRsp> RspDc;
|
||||
SimPort<MemReq> ReqDC;
|
||||
SimPort<MemRsp> RspDC;
|
||||
|
||||
SMemDemux(
|
||||
const SimContext& ctx,
|
||||
|
@ -478,45 +564,49 @@ public:
|
|||
) : SimObject<SMemDemux>(ctx, name)
|
||||
, ReqIn(this)
|
||||
, RspIn(this)
|
||||
, ReqSm(this)
|
||||
, RspSm(this)
|
||||
, ReqDc(this)
|
||||
, RspDc(this)
|
||||
, ReqSM(this)
|
||||
, RspSM(this)
|
||||
, ReqDC(this)
|
||||
, RspDC(this)
|
||||
, delay_(delay)
|
||||
{}
|
||||
|
||||
void reset() {}
|
||||
|
||||
void tick() {
|
||||
void tick() {
|
||||
// process incoming reponses
|
||||
if (!RspSM.empty()) {
|
||||
auto& rsp = RspSM.front();
|
||||
DT(4, this->name() << "-" << rsp);
|
||||
RspIn.send(rsp, 1);
|
||||
RspSM.pop();
|
||||
}
|
||||
if (!RspDC.empty()) {
|
||||
auto& rsp = RspDC.front();
|
||||
DT(4, this->name() << "-" << rsp);
|
||||
RspIn.send(rsp, 1);
|
||||
RspDC
|
||||
.pop();
|
||||
}
|
||||
// process incomming requests
|
||||
if (!ReqIn.empty()) {
|
||||
auto& req = ReqIn.front();
|
||||
DT(4, this->name() << "-" << req);
|
||||
if (req.type == AddrType::Shared) {
|
||||
ReqSm.send(req, delay_);
|
||||
ReqSM.send(req, delay_);
|
||||
} else {
|
||||
ReqDc.send(req, delay_);
|
||||
ReqDC.send(req, delay_);
|
||||
}
|
||||
ReqIn.pop();
|
||||
}
|
||||
|
||||
// process incoming reponses
|
||||
if (!RspSm.empty()) {
|
||||
auto& rsp = RspSm.front();
|
||||
DT(4, this->name() << "-" << rsp);
|
||||
RspIn.send(rsp, 1);
|
||||
RspSm.pop();
|
||||
}
|
||||
if (!RspDc.empty()) {
|
||||
auto& rsp = RspDc.front();
|
||||
DT(4, this->name() << "-" << rsp);
|
||||
RspIn.send(rsp, 1);
|
||||
RspDc.pop();
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t delay_;
|
||||
};
|
||||
|
||||
}
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
using MemSwitch = Switch<MemReq, MemRsp>;
|
||||
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@ all:
|
|||
$(MAKE) -C lbm
|
||||
$(MAKE) -C oclprintf
|
||||
$(MAKE) -C blackscholes
|
||||
$(MAKE) -C matmul
|
||||
$(MAKE) -C sgemm2
|
||||
$(MAKE) -C convolution
|
||||
|
||||
run-simx:
|
||||
|
@ -38,7 +38,7 @@ run-simx:
|
|||
$(MAKE) -C transpose run-simx
|
||||
$(MAKE) -C convolution run-simx
|
||||
$(MAKE) -C cutcp run-simx
|
||||
$(MAKE) -C matmul run-simx
|
||||
$(MAKE) -C sgemm2 run-simx
|
||||
$(MAKE) -C vectorhypot run-simx
|
||||
$(MAKE) -C mri-q run-simx
|
||||
|
||||
|
@ -60,7 +60,7 @@ run-rtlsim:
|
|||
$(MAKE) -C blackscholes run-rtlsim
|
||||
$(MAKE) -C convolution run-rtlsim
|
||||
# $(MAKE) -C cutcp run-rtlsim
|
||||
# $(MAKE) -C matmul run-rtlsim
|
||||
# $(MAKE) -C sgemm2 run-rtlsim
|
||||
# $(MAKE) -C vectorhypot run-rtlsim
|
||||
# $(MAKE) -C mri-q run-rtlsim
|
||||
|
||||
|
@ -82,7 +82,7 @@ run-opae:
|
|||
$(MAKE) -C blackscholes run-opae
|
||||
$(MAKE) -C convolution run-opae
|
||||
# $(MAKE) -C cutcp run-opae
|
||||
# $(MAKE) -C matmul run-opae
|
||||
# $(MAKE) -C sgemm2 run-opae
|
||||
# $(MAKE) -C vectorhypot run-opae
|
||||
# $(MAKE) -C mri-q run-opae
|
||||
|
||||
|
@ -105,7 +105,7 @@ clean:
|
|||
$(MAKE) -C lbm clean
|
||||
$(MAKE) -C oclprintf clean
|
||||
$(MAKE) -C blackscholes clean
|
||||
$(MAKE) -C matmul clean
|
||||
$(MAKE) -C sgemm2 clean
|
||||
$(MAKE) -C convolution clean
|
||||
|
||||
clean-all:
|
||||
|
@ -128,5 +128,5 @@ clean-all:
|
|||
$(MAKE) -C lbm clean-all
|
||||
$(MAKE) -C oclprintf clean-all
|
||||
$(MAKE) -C blackscholes clean-all
|
||||
$(MAKE) -C matmul clean-all
|
||||
$(MAKE) -C sgemm2 clean-all
|
||||
$(MAKE) -C convolution clean-all
|
||||
|
|
|
@ -73,7 +73,7 @@ OBJS := $(addsuffix .o, $(notdir $(SRCS)))
|
|||
all: $(PROJECT) kernel.pocl
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_VORTEX_CFLAGS="$(K_CFLAGS)" POCL_VORTEX_LDFLAGS="$(K_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||
|
||||
%.cc.o: %.cc
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
@ -87,6 +87,9 @@ kernel.pocl: kernel.cl
|
|||
$(PROJECT): $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-hostgpu: $(PROJECT) kernel.pocl
|
||||
./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
PROJECT = matmul
|
||||
PROJECT = sgemm2
|
||||
|
||||
SRCS = main.cc
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
__kernel void matmul(__global float *A,
|
||||
__kernel void sgemm2(__global float *A,
|
||||
__global float *B,
|
||||
__global float *C,
|
||||
const unsigned int N,
|
||||
|
@ -28,12 +28,15 @@ __kernel void matmul(__global float *A,
|
|||
for (int j = 0; j < localSize; j++) {
|
||||
sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
|
||||
}
|
||||
|
||||
// Ensure computation is done before loading next block
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
C[globalRow * N + globalCol] = sum;
|
||||
}
|
||||
|
||||
/*__kernel void matmul(__global float *A,
|
||||
/*__kernel void sgemm2(__global float *A,
|
||||
__global float *B,
|
||||
__global float *C,
|
||||
const unsigned int N)
|
|
@ -12,7 +12,7 @@
|
|||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define KERNEL_NAME "matmul"
|
||||
#define KERNEL_NAME "sgemm2"
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
|
@ -10,7 +10,8 @@ all:
|
|||
$(MAKE) -C fence
|
||||
$(MAKE) -C no_mf_ext
|
||||
$(MAKE) -C no_smem
|
||||
$(MAKE) -C tensor
|
||||
$(MAKE) -C vecaddx
|
||||
$(MAKE) -C sgemmx
|
||||
|
||||
run-simx:
|
||||
$(MAKE) -C basic run-simx
|
||||
|
@ -24,7 +25,8 @@ run-simx:
|
|||
$(MAKE) -C fence run-simx
|
||||
$(MAKE) -C no_mf_ext run-simx
|
||||
$(MAKE) -C no_smem run-simx
|
||||
$(MAKE) -C tensor run-simx
|
||||
$(MAKE) -C vecaddx run-simx
|
||||
$(MAKE) -C sgemmx run-simx
|
||||
|
||||
run-rtlsim:
|
||||
$(MAKE) -C basic run-rtlsim
|
||||
|
@ -38,7 +40,8 @@ run-rtlsim:
|
|||
$(MAKE) -C fence run-rtlsim
|
||||
$(MAKE) -C no_mf_ext run-rtlsim
|
||||
$(MAKE) -C no_smem run-rtlsim
|
||||
$(MAKE) -C tensor run-rtlsim
|
||||
$(MAKE) -C vecaddx run-rtlsim
|
||||
$(MAKE) -C sgemmx run-rtlsim
|
||||
|
||||
run-opae:
|
||||
$(MAKE) -C basic run-opae
|
||||
|
@ -52,7 +55,8 @@ run-opae:
|
|||
$(MAKE) -C fence run-opae
|
||||
$(MAKE) -C no_mf_ext run-opae
|
||||
$(MAKE) -C no_smem run-opae
|
||||
$(MAKE) -C tensor run-opae
|
||||
$(MAKE) -C vecaddx run-opae
|
||||
$(MAKE) -C sgemmx run-opae
|
||||
|
||||
clean:
|
||||
$(MAKE) -C basic clean
|
||||
|
@ -66,7 +70,8 @@ clean:
|
|||
$(MAKE) -C fence clean
|
||||
$(MAKE) -C no_mf_ext clean
|
||||
$(MAKE) -C no_smem clean
|
||||
$(MAKE) -C tensor clean
|
||||
$(MAKE) -C vecaddx clean
|
||||
$(MAKE) -C sgemmx clean
|
||||
|
||||
clean-all:
|
||||
$(MAKE) -C basic clean-all
|
||||
|
@ -80,4 +85,5 @@ clean-all:
|
|||
$(MAKE) -C fence clean-all
|
||||
$(MAKE) -C no_mf_ext clean-all
|
||||
$(MAKE) -C no_smem clean-all
|
||||
$(MAKE) -C tensor clean-all
|
||||
$(MAKE) -C vecaddx clean-all
|
||||
$(MAKE) -C sgemmx clean-all
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
PROJECT = tensor
|
||||
PROJECT = sgemmx
|
||||
|
||||
SRCS = main.cpp
|
||||
|
|
@ -2,6 +2,7 @@
|
|||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
#include <chrono>
|
||||
#include <vortex.h>
|
||||
#include "common.h"
|
||||
|
||||
|
@ -122,43 +123,6 @@ void cleanup() {
|
|||
}
|
||||
}
|
||||
|
||||
int run_test(const kernel_arg_t& kernel_arg,
|
||||
uint32_t buf_size,
|
||||
const std::vector<TYPE>& refs) {
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < refs.size(); ++i) {
|
||||
auto ref = refs[i];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
@ -239,10 +203,43 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "clear destination buffer" << std::endl;
|
||||
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size));
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// run tests
|
||||
std::cout << "run tests" << std::endl;
|
||||
RT_CHECK(run_test(kernel_arg, buf_size, refs));
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < refs.size(); ++i) {
|
||||
auto ref = refs[i];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
9
tests/regression/vecaddx/Makefile
Normal file
9
tests/regression/vecaddx/Makefile
Normal file
|
@ -0,0 +1,9 @@
|
|||
PROJECT = vecaddx
|
||||
|
||||
SRCS = main.cpp
|
||||
|
||||
VX_SRCS = kernel.cpp
|
||||
|
||||
OPTS ?= -n64
|
||||
|
||||
include ../common.mk
|
17
tests/regression/vecaddx/common.h
Normal file
17
tests/regression/vecaddx/common.h
Normal file
|
@ -0,0 +1,17 @@
|
|||
#ifndef _COMMON_H_
|
||||
#define _COMMON_H_
|
||||
|
||||
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
|
||||
|
||||
#ifndef TYPE
|
||||
#define TYPE float
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_points;
|
||||
uint64_t src0_addr;
|
||||
uint64_t src1_addr;
|
||||
uint64_t dst_addr;
|
||||
} kernel_arg_t;
|
||||
|
||||
#endif
|
18
tests/regression/vecaddx/kernel.cpp
Normal file
18
tests/regression/vecaddx/kernel.cpp
Normal file
|
@ -0,0 +1,18 @@
|
|||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
|
||||
auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
|
||||
auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
|
||||
|
||||
dst_ptr[task_id] = src0_ptr[task_id] + src1_ptr[task_id];
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
}
|
246
tests/regression/vecaddx/main.cpp
Normal file
246
tests/regression/vecaddx/main.cpp
Normal file
|
@ -0,0 +1,246 @@
|
|||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
#include <vortex.h>
|
||||
#include "common.h"
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
do { \
|
||||
int _ret = _expr; \
|
||||
if (0 == _ret) \
|
||||
break; \
|
||||
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Type>
|
||||
class Comparator {};
|
||||
|
||||
template <>
|
||||
class Comparator<int> {
|
||||
public:
|
||||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class Comparator<float> {
|
||||
private:
|
||||
union Float_t { float f; int i; };
|
||||
public:
|
||||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static int generate() {
|
||||
return static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
auto d = std::abs(fa.i - fb.i);
|
||||
if (d > FLOAT_ULP) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
const char* kernel_file = "kernel.bin";
|
||||
uint32_t size = 16;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<TYPE> source_data;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Vortex Test." << std::endl;
|
||||
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'k':
|
||||
kernel_file = optarg;
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src0_addr);
|
||||
vx_mem_free(device, kernel_arg.src1_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int run_test(const kernel_arg_t& kernel_arg,
|
||||
uint32_t buf_size,
|
||||
uint32_t num_points) {
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
std::cout << "number of cores: " << num_cores << std::endl;
|
||||
std::cout << "number of warps: " << num_warps << std::endl;
|
||||
std::cout << "number of threads: " << num_threads << std::endl;
|
||||
|
||||
uint32_t num_points = size;
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
staging_buf.resize(alloc_size);
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// generate source data
|
||||
source_data.resize(2 * num_points);
|
||||
for (uint32_t i = 0; i < source_data.size(); ++i) {
|
||||
source_data[i] = Comparator<TYPE>::generate();
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = source_data[2 * i + 0];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = source_data[2 * i + 1];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
|
||||
|
||||
// run tests
|
||||
std::cout << "run tests" << std::endl;
|
||||
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue