CSR 32-bit/64-bit refactoring
Mirror of https://github.com/vortexgpgpu/vortex.git
parent 135cc4f5a7, commit 7784dfe9b7
5 changed files with 229 additions and 284 deletions
@@ -17,6 +17,16 @@
`include "VX_fpu_define.vh"
`endif

`ifdef XLEN_64
`define CSR_READ_64(addr, dst, src) \
    addr : dst = `XLEN'(src)
`else
`define CSR_READ_64(addr, dst, src) \
    addr : dst = src[31:0]; \
    (addr + (`VX_CSR_MPM_BASE_H-`VX_CSR_MPM_BASE)) : dst = 32'(src[$bits(src)-1:32])
`endif


module VX_csr_data
import VX_gpu_pkg::*;
`ifdef EXT_F_ENABLE
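A note on the convention the new CSR_READ_64 macro encodes: under XLEN_64 a 64-bit performance counter is returned whole from its base CSR address, while on a 32-bit build the base address returns the low word and the same offset within the VX_CSR_MPM_BASE_H range returns the high word. Below is a minimal host-side C++ model of that address-splitting scheme, not the RTL itself; the two address constants are placeholders assumed for the sketch rather than values taken from the project headers.

    #include <cstdint>

    // Model of the !XLEN_64 expansion of CSR_READ_64: one 64-bit counter is
    // exposed through two 32-bit CSR reads at a fixed address offset.
    constexpr uint32_t MPM_BASE   = 0xB03; // assumed placeholder
    constexpr uint32_t MPM_BASE_H = 0xB83; // assumed placeholder

    uint32_t model_csr_read32(uint64_t counter, uint32_t counter_addr, uint32_t read_addr) {
        if (read_addr == counter_addr)                            // base address -> low 32 bits
            return static_cast<uint32_t>(counter);
        if (read_addr == counter_addr + (MPM_BASE_H - MPM_BASE))  // shadow address -> high 32 bits
            return static_cast<uint32_t>(counter >> 32);
        return 0;                                                 // not one of this counter's addresses
    }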
@@ -49,14 +59,14 @@ import VX_fpu_pkg::*;
input wire [`UUID_WIDTH-1:0] read_uuid,
input wire [`NW_WIDTH-1:0] read_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] read_addr,
output wire [31:0] read_data_ro,
output wire [31:0] read_data_rw,
output wire [`XLEN-1:0] read_data_ro,
output wire [`XLEN-1:0] read_data_rw,

input wire write_enable,
input wire [`UUID_WIDTH-1:0] write_uuid,
input wire [`NW_WIDTH-1:0] write_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] write_addr,
input wire [31:0] write_data
input wire [`XLEN-1:0] write_data
);

`UNUSED_VAR (reset)

@@ -65,16 +75,20 @@ import VX_fpu_pkg::*;

// CSRs Write /////////////////////////////////////////////////////////////

reg [`XLEN-1:0] mscratch;

`ifdef EXT_F_ENABLE
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FP_FLAGS_BITS-1:0] fcsr, fcsr_n;
wire [`NUM_FPU_BLOCKS-1:0] fpu_write_enable;
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;

for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_write_enable[i] = fpu_csr_if[i].write_enable;
assign fpu_write_wid[i] = fpu_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_csr_if[i].write_fflags;
end

always @(*) begin
fcsr_n = fcsr;
for (integer i = 0; i < `NUM_FPU_BLOCKS; ++i) begin

@@ -123,8 +137,13 @@ import VX_fpu_pkg::*;
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0: /* do nothing!*/;
default: begin
`VX_CSR_PMPADDR0: begin
// do nothing!
end
`VX_CSR_MSCRATCH: begin
mscratch <= write_data;
end
default: begin
`ASSERT(0, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
end
endcase

@@ -133,8 +152,8 @@ import VX_fpu_pkg::*;

// CSRs read //////////////////////////////////////////////////////////////

reg [31:0] read_data_ro_r;
reg [31:0] read_data_rw_r;
reg [`XLEN-1:0] read_data_ro_r;
reg [`XLEN-1:0] read_data_rw_r;
reg read_addr_valid_r;

always @(*) begin
@@ -142,28 +161,31 @@ import VX_fpu_pkg::*;
read_data_rw_r = '0;
read_addr_valid_r = 1;
case (read_addr)
`VX_CSR_MVENDORID : read_data_ro_r = 32'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = 32'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = 32'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_r = (((`CLOG2(`XLEN)-4) << (`XLEN-2)) | `MISA_STD);
`VX_CSR_MVENDORID : read_data_ro_r = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = `XLEN'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_r = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
`ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS : read_data_rw_r = 32'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = 32'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = 32'(fcsr[read_wid]);
`VX_CSR_FFLAGS : read_data_rw_r = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]);
`endif
`VX_CSR_WARP_ID : read_data_ro_r = 32'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = 32'(CORE_ID);
`VX_CSR_THREAD_MASK: read_data_ro_r = 32'(thread_masks[read_wid]);
`VX_CSR_WARP_MASK : read_data_ro_r = 32'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = 32'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = 32'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = 32'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_MCYCLE : read_data_ro_r = 32'(cycles[31:0]);
`VX_CSR_MCYCLE_H : read_data_ro_r = 32'(cycles[`PERF_CTR_BITS-1:32]);
`VX_CSR_MSCRATCH : read_data_rw_r = mscratch;

`VX_CSR_WARP_ID : read_data_ro_r = `XLEN'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = `XLEN'(CORE_ID);
`VX_CSR_THREAD_MASK: read_data_ro_r = `XLEN'(thread_masks[read_wid]);
`VX_CSR_WARP_MASK : read_data_ro_r = `XLEN'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);

`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles);

`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`VX_CSR_MINSTRET : read_data_ro_r = 32'(commit_csr_if.instret[31:0]);
`VX_CSR_MINSTRET_H : read_data_ro_r = 32'(commit_csr_if.instret[`PERF_CTR_BITS-1:32]);

`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret);

`VX_CSR_SATP,
`VX_CSR_MSTATUS,

@@ -174,7 +196,7 @@ import VX_fpu_pkg::*;
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0 : read_data_ro_r = 32'(0);
`VX_CSR_PMPADDR0 : read_data_ro_r = `XLEN'(0);

default: begin
read_addr_valid_r = 0;
@@ -185,108 +207,66 @@ import VX_fpu_pkg::*;
case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
// PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched_idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched_stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_FPU]);
`else
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`endif
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.sfu_uses[`SFU_WCTL]);
// PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_r, pipeline_perf_if.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_r, pipeline_perf_if.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_r, pipeline_perf_if.load_latency);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
`VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_r, mem_perf_if.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_r, mem_perf_if.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_r, mem_perf_if.icache.mshr_stalls);
// PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
`VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
`VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_r, mem_perf_if.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_r, mem_perf_if.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_r, mem_perf_if.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_r, mem_perf_if.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_r, mem_perf_if.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_r, mem_perf_if.dcache.mshr_stalls);
// PERF: lmem
`VX_CSR_MPM_LMEM_READS : read_data_ro_r = mem_perf_if.lmem.reads[31:0];
`VX_CSR_MPM_LMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.lmem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LMEM_WRITES : read_data_ro_r = mem_perf_if.lmem.writes[31:0];
`VX_CSR_MPM_LMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.lmem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LMEM_BANK_ST : read_data_ro_r = mem_perf_if.lmem.bank_stalls[31:0];
`VX_CSR_MPM_LMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.lmem.bank_stalls[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_r, mem_perf_if.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_r, mem_perf_if.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_r, mem_perf_if.lmem.bank_stalls);
// PERF: l2cache
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
`VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
`VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
`VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_r, mem_perf_if.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_r, mem_perf_if.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_r, mem_perf_if.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_r, mem_perf_if.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls);
// PERF: l3cache
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
`VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
`VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
`VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_r, mem_perf_if.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_r, mem_perf_if.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_r, mem_perf_if.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_r, mem_perf_if.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l3cache.mshr_stalls);
// PERF: memory
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_r, mem_perf_if.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_r, mem_perf_if.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_r, mem_perf_if.mem.latency);
default:;
endcase
end
@@ -39,17 +39,17 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
`UNUSED_PARAM (CORE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + PID_WIDTH + 1 + 1;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;

`UNUSED_VAR (execute_if.data.rs3_data)

reg [NUM_LANES-1:0][31:0] csr_read_data;
reg [31:0] csr_write_data;
wire [31:0] csr_read_data_ro, csr_read_data_rw;
wire [31:0] csr_req_data;
reg csr_rd_enable;
wire csr_wr_enable;
wire csr_req_ready;
reg [NUM_LANES-1:0][`XLEN-1:0] csr_read_data;
reg [`XLEN-1:0] csr_write_data;
wire [`XLEN-1:0] csr_read_data_ro, csr_read_data_rw;
wire [`XLEN-1:0] csr_req_data;
reg csr_rd_enable;
wire csr_wr_enable;
wire csr_req_ready;

// wait for all pending instructions to complete
assign sched_csr_if.alm_empty_wid = execute_if.data.wid;

@@ -61,10 +61,10 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if.data.imm[`VX_CSR_ADDR_BITS-1:0];
wire [`NRI_BITS-1:0] csr_imm = execute_if.data.imm[`VX_CSR_ADDR_BITS +: `NRI_BITS];

wire [NUM_LANES-1:0][31:0] rs1_data;
wire [NUM_LANES-1:0][`XLEN-1:0] rs1_data;
`UNUSED_VAR (rs1_data)
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign rs1_data[i] = execute_if.data.rs1_data[i][31:0];
assign rs1_data[i] = execute_if.data.rs1_data[i][`XLEN-1:0];
end

wire csr_write_enable = (execute_if.data.op_type == `INST_SFU_CSRRW);
@@ -107,15 +107,15 @@ module VX_csr_unit import VX_gpu_pkg::*; #(

// CSR read

wire [NUM_LANES-1:0][31:0] wtid, gtid;
wire [NUM_LANES-1:0][`XLEN-1:0] wtid, gtid;

for (genvar i = 0; i < NUM_LANES; ++i) begin
if (PID_BITS != 0) begin
assign wtid[i] = 32'(execute_if.data.pid * NUM_LANES + i);
assign wtid[i] = `XLEN'(execute_if.data.pid * NUM_LANES + i);
end else begin
assign wtid[i] = 32'(i);
assign wtid[i] = `XLEN'(i);
end
assign gtid[i] = (32'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (32'(execute_if.data.wid) << `NT_BITS) + wtid[i];
assign gtid[i] = (`XLEN'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (`XLEN'(execute_if.data.wid) << `NT_BITS) + wtid[i];
end

always @(*) begin

@@ -132,8 +132,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(

// CSR write

assign csr_req_data = execute_if.data.use_imm ? 32'(csr_imm) : rs1_data[0];

assign csr_req_data = execute_if.data.use_imm ? `XLEN'(csr_imm) : rs1_data[0];
assign csr_wr_enable = (csr_write_enable || (| csr_req_data));

always @(*) begin

@@ -155,9 +154,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
assign sched_csr_if.unlock_warp = csr_req_valid && csr_req_ready && execute_if.data.eop;
assign sched_csr_if.unlock_wid = execute_if.data.wid;

// send response
wire [NUM_LANES-1:0][31:0] csr_commit_data;

VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2)

@@ -166,14 +162,10 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.reset (reset),
.valid_in (csr_req_valid),
.ready_in (csr_req_ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);

for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = `XLEN'(csr_commit_data[i]);
end

endmodule
@@ -17,31 +17,54 @@
#include <vx_intrinsics.h>
#include <stdint.h>

#define DUMP_CSR_4(d, s) \
csr_mem[d + 0] = csr_read(s + 0); \
csr_mem[d + 1] = csr_read(s + 1); \
csr_mem[d + 2] = csr_read(s + 2); \
csr_mem[d + 3] = csr_read(s + 3);

#define DUMP_CSR_32(d, s) \
DUMP_CSR_4(d + 0, s + 0) \
DUMP_CSR_4(d + 4, s + 4) \
DUMP_CSR_4(d + 8, s + 8) \
DUMP_CSR_4(d + 12, s + 12) \
DUMP_CSR_4(d + 16, s + 16) \
DUMP_CSR_4(d + 20, s + 20) \
DUMP_CSR_4(d + 24, s + 24) \
DUMP_CSR_4(d + 28, s + 28)

#ifdef __cplusplus
extern "C" {
#endif

#ifdef XLEN_64
#define DUMP_CSRS(i) \
((int64_t*)csr_mem)[i] = csr_read(VX_CSR_MPM_BASE + i)
#else
#define DUMP_CSRS(i) \
csr_mem[(i*2)+0] = csr_read(VX_CSR_MPM_BASE + i); \
csr_mem[(i*2)+1] = csr_read(VX_CSR_MPM_BASE + i + (VX_CSR_MPM_BASE_H - VX_CSR_MPM_BASE))
#endif
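The two DUMP_CSRS variants fill the same per-core window, just at different word granularity. Written out by hand for i = 0, purely as an expansion of the macro above (it relies on the csr_mem pointer and csr_read intrinsic used in vx_perf_dump below):

    #ifdef XLEN_64
        // one 64-bit CSR read fills the first 8-byte slot of csr_mem
        ((int64_t*)csr_mem)[0] = csr_read(VX_CSR_MPM_BASE + 0);
    #else
        // low word, then high word, in two adjacent 32-bit slots
        csr_mem[0] = csr_read(VX_CSR_MPM_BASE + 0);
        csr_mem[1] = csr_read(VX_CSR_MPM_BASE + 0 + (VX_CSR_MPM_BASE_H - VX_CSR_MPM_BASE));
    #endif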

void vx_perf_dump() {
int core_id = vx_core_id();
uint32_t* const csr_mem = (uint32_t*)(IO_CSR_ADDR + 64 * sizeof(uint32_t) * core_id);
DUMP_CSR_32(0, VX_CSR_MPM_BASE)
DUMP_CSR_32(32, VX_CSR_MPM_BASE_H)
uint32_t * const csr_mem = (uint32_t*)(IO_CSR_ADDR + 64 * sizeof(uint32_t) * core_id);
DUMP_CSRS(0);
DUMP_CSRS(1);
DUMP_CSRS(2);
DUMP_CSRS(3);
DUMP_CSRS(4);
DUMP_CSRS(5);
DUMP_CSRS(6);
DUMP_CSRS(7);
DUMP_CSRS(8);
DUMP_CSRS(9);
DUMP_CSRS(10);
DUMP_CSRS(11);
DUMP_CSRS(12);
DUMP_CSRS(13);
DUMP_CSRS(14);
DUMP_CSRS(15);
DUMP_CSRS(16);
DUMP_CSRS(17);
DUMP_CSRS(18);
DUMP_CSRS(19);
DUMP_CSRS(20);
DUMP_CSRS(21);
DUMP_CSRS(22);
DUMP_CSRS(23);
DUMP_CSRS(24);
DUMP_CSRS(25);
DUMP_CSRS(26);
DUMP_CSRS(27);
DUMP_CSRS(28);
DUMP_CSRS(29);
DUMP_CSRS(30);
DUMP_CSRS(31);
}
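Either way the per-core dump window keeps the same size: 64 slots of 4 bytes, i.e. 32 counters of 8 bytes each, starting at IO_CSR_ADDR. A small stand-alone sketch of the base-address arithmetic; the IO_CSR_ADDR value and core count used here are assumed for illustration only:

    #include <cstdint>
    #include <cstdio>

    // Sketch of the per-core window used by vx_perf_dump():
    // 64 * sizeof(uint32_t) == 32 * sizeof(uint64_t) == 256 bytes per core.
    constexpr uint64_t kIoCsrAddr  = 0x40000000ull; // placeholder, not the real IO_CSR_ADDR
    constexpr uint64_t kWindowSize = 64 * sizeof(uint32_t);

    int main() {
        for (unsigned core_id = 0; core_id < 4; ++core_id) { // 4 cores, assumed
            uint64_t base = kIoCsrAddr + kWindowSize * core_id;
            printf("core %u window: 0x%llx .. 0x%llx\n", core_id,
                   (unsigned long long)base, (unsigned long long)(base + kWindowSize - 1));
        }
        return 0;
    }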

#ifdef __cplusplus
@@ -166,10 +166,8 @@ int dcr_initialize(vx_device_h hdevice) {
///////////////////////////////////////////////////////////////////////////////

static uint64_t get_csr_64(const void* ptr, int addr) {
auto w_ptr = reinterpret_cast<const uint32_t*>(ptr);
uint32_t value_lo = w_ptr[addr - VX_CSR_MPM_BASE];
uint32_t value_hi = w_ptr[addr - VX_CSR_MPM_BASE + 32];
return (uint64_t(value_hi) << 32) | value_lo;
int offset = addr - VX_CSR_MPM_BASE;
return ((const uint64_t*)ptr)[offset];
}
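The helper's rewrite mirrors the kernel-side layout change: the old staging buffer held 32 low words followed by 32 high words, while the new one holds 32 packed 64-bit words, so the read collapses to a single index. A side-by-side sketch of the two layouts (the function names are invented for the comparison; the index is addr - VX_CSR_MPM_BASE, as in the code above):

    #include <cstdint>

    // Old layout: uint32_t[64], low words at [idx], high words at [idx + 32].
    static uint64_t read_split_layout(const uint32_t* w_ptr, int idx) {
        return (uint64_t(w_ptr[idx + 32]) << 32) | w_ptr[idx];
    }

    // New layout: uint64_t[32], one 8-byte slot per counter.
    static uint64_t read_packed_layout(const uint64_t* q_ptr, int idx) {
        return q_ptr[idx];
    }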

extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {

@@ -253,7 +251,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
bool lmem_enable = isa_flags & VX_ISA_EXT_LMEM;
#endif

std::vector<uint8_t> staging_buf(64 * sizeof(uint32_t));
std::vector<uint8_t> staging_buf(32 * sizeof(uint64_t));

for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size();
@@ -306,7 +306,16 @@ void Emulator::cout_flush() {
}
}

uint32_t Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
#ifdef XLEN_64
#define CSR_READ_64(addr, value) \
case addr: return value
#else
#define CSR_READ_64(addr, value) \
case addr : return (uint32_t)value; \
case (addr + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((value >> 32) & 0xFFFFFFFF)
#endif
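Note that get_csr now returns Word, the emulator's XLEN-wide integer type, rather than uint32_t, and each low/high pair of cases below collapses into one CSR_READ_64 line. For reference, this is roughly what the preprocessor produces for CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles); in a 32-bit (!XLEN_64) build; it is shown only as an expansion of the macro above, not as stand-alone code:

    case VX_CSR_MCYCLE : return (uint32_t)core_perf.cycles;
    case (VX_CSR_MCYCLE + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((core_perf.cycles >> 32) & 0xFFFFFFFF);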

Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
auto core_perf = core_->perf_stats();
switch (addr) {
case VX_CSR_SATP:

@@ -322,38 +331,21 @@ uint32_t Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_MNSTATUS:
return 0;

case VX_CSR_FFLAGS:
return warps_.at(wid).fcsr & 0x1F;
case VX_CSR_FRM:
return (warps_.at(wid).fcsr >> 5);
case VX_CSR_FCSR:
return warps_.at(wid).fcsr;
case VX_CSR_MHARTID: // global thread ID
return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
case VX_CSR_THREAD_ID: // thread ID
return tid;
case VX_CSR_WARP_ID: // warp ID
return wid;
case VX_CSR_CORE_ID: // core ID
return core_->id();
case VX_CSR_THREAD_MASK: // thread mask
return warps_.at(wid).tmask.to_ulong();
case VX_CSR_WARP_MASK: // active warps
return active_warps_.to_ulong();
case VX_CSR_NUM_THREADS: // Number of threads per warp
return arch_.num_threads();
case VX_CSR_NUM_WARPS: // Number of warps per core
return arch_.num_warps();
case VX_CSR_NUM_CORES: // Number of cores per cluster
return uint32_t(arch_.num_cores()) * arch_.num_clusters();
case VX_CSR_MCYCLE: // NumCycles
return core_perf.cycles & 0xffffffff;
case VX_CSR_MCYCLE_H: // NumCycles
return (uint32_t)(core_perf.cycles >> 32);
case VX_CSR_MINSTRET: // NumInsts
return core_perf.instrs & 0xffffffff;
case VX_CSR_MINSTRET_H: // NumInsts
return (uint32_t)(core_perf.instrs >> 32);
case VX_CSR_FFLAGS: return warps_.at(wid).fcsr & 0x1F;
case VX_CSR_FRM: return (warps_.at(wid).fcsr >> 5);
case VX_CSR_FCSR: return warps_.at(wid).fcsr;
case VX_CSR_MHARTID: return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid;
case VX_CSR_THREAD_ID: return tid;
case VX_CSR_WARP_ID: return wid;
case VX_CSR_CORE_ID: return core_->id();
case VX_CSR_THREAD_MASK: return warps_.at(wid).tmask.to_ulong();
case VX_CSR_WARP_MASK: return active_warps_.to_ulong();
case VX_CSR_NUM_THREADS: return arch_.num_threads();
case VX_CSR_NUM_WARPS: return arch_.num_warps();
case VX_CSR_NUM_CORES: return uint32_t(arch_.num_cores()) * arch_.num_clusters();
case VX_CSR_MSCRATCH: return csr_mscratch_;
CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);
default:
if ((addr >= VX_CSR_MPM_BASE && addr < (VX_CSR_MPM_BASE + 32))
|| (addr >= VX_CSR_MPM_BASE_H && addr < (VX_CSR_MPM_BASE_H + 32))) {
@@ -364,37 +356,22 @@ uint32_t Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
break;
case VX_DCR_MPM_CLASS_CORE: {
switch (addr) {
case VX_CSR_MPM_SCHED_ID: return core_perf.sched_idle & 0xffffffff;
case VX_CSR_MPM_SCHED_ID_H: return core_perf.sched_idle >> 32;
case VX_CSR_MPM_SCHED_ST: return core_perf.sched_stalls & 0xffffffff;
case VX_CSR_MPM_SCHED_ST_H: return core_perf.sched_stalls >> 32;
case VX_CSR_MPM_IBUF_ST: return core_perf.ibuf_stalls & 0xffffffff;
case VX_CSR_MPM_IBUF_ST_H: return core_perf.ibuf_stalls >> 32;
case VX_CSR_MPM_SCRB_ST: return core_perf.scrb_stalls & 0xffffffff;
case VX_CSR_MPM_SCRB_ST_H: return core_perf.scrb_stalls >> 32;
case VX_CSR_MPM_SCRB_ALU: return core_perf.scrb_alu & 0xffffffff;
case VX_CSR_MPM_SCRB_ALU_H: return core_perf.scrb_alu >> 32;
case VX_CSR_MPM_SCRB_FPU: return core_perf.scrb_fpu & 0xffffffff;
case VX_CSR_MPM_SCRB_FPU_H: return core_perf.scrb_fpu >> 32;
case VX_CSR_MPM_SCRB_LSU: return core_perf.scrb_lsu & 0xffffffff;
case VX_CSR_MPM_SCRB_LSU_H: return core_perf.scrb_lsu >> 32;
case VX_CSR_MPM_SCRB_SFU: return core_perf.scrb_sfu & 0xffffffff;
case VX_CSR_MPM_SCRB_SFU_H: return core_perf.scrb_sfu >> 32;
case VX_CSR_MPM_SCRB_WCTL: return core_perf.scrb_wctl & 0xffffffff;
case VX_CSR_MPM_SCRB_WCTL_H: return core_perf.scrb_wctl >> 32;
case VX_CSR_MPM_SCRB_CSRS: return core_perf.scrb_csrs & 0xffffffff;
case VX_CSR_MPM_SCRB_CSRS_H: return core_perf.scrb_csrs >> 32;
case VX_CSR_MPM_IFETCHES: return core_perf.ifetches & 0xffffffff;
case VX_CSR_MPM_IFETCHES_H: return core_perf.ifetches >> 32;
case VX_CSR_MPM_LOADS: return core_perf.loads & 0xffffffff;
case VX_CSR_MPM_LOADS_H: return core_perf.loads >> 32;
case VX_CSR_MPM_STORES: return core_perf.stores & 0xffffffff;
case VX_CSR_MPM_STORES_H: return core_perf.stores >> 32;
case VX_CSR_MPM_IFETCH_LT: return core_perf.ifetch_latency & 0xffffffff;
case VX_CSR_MPM_IFETCH_LT_H: return core_perf.ifetch_latency >> 32;
case VX_CSR_MPM_LOAD_LT: return core_perf.load_latency & 0xffffffff;
case VX_CSR_MPM_LOAD_LT_H: return core_perf.load_latency >> 32;
}
CSR_READ_64(VX_CSR_MPM_SCHED_ID, core_perf.sched_idle);
CSR_READ_64(VX_CSR_MPM_SCHED_ST, core_perf.sched_stalls);
CSR_READ_64(VX_CSR_MPM_IBUF_ST, core_perf.ibuf_stalls);
CSR_READ_64(VX_CSR_MPM_SCRB_ST, core_perf.scrb_stalls);
CSR_READ_64(VX_CSR_MPM_SCRB_ALU, core_perf.scrb_alu);
CSR_READ_64(VX_CSR_MPM_SCRB_FPU, core_perf.scrb_fpu);
CSR_READ_64(VX_CSR_MPM_SCRB_LSU, core_perf.scrb_lsu);
CSR_READ_64(VX_CSR_MPM_SCRB_SFU, core_perf.scrb_sfu);
CSR_READ_64(VX_CSR_MPM_SCRB_WCTL, core_perf.scrb_wctl);
CSR_READ_64(VX_CSR_MPM_SCRB_CSRS, core_perf.scrb_csrs);
CSR_READ_64(VX_CSR_MPM_IFETCHES, core_perf.ifetches);
CSR_READ_64(VX_CSR_MPM_LOADS, core_perf.loads);
CSR_READ_64(VX_CSR_MPM_STORES, core_perf.stores);
CSR_READ_64(VX_CSR_MPM_IFETCH_LT, core_perf.ifetch_latency);
CSR_READ_64(VX_CSR_MPM_LOAD_LT, core_perf.load_latency);
}
} break;
case VX_DCR_MPM_CLASS_MEM: {
auto proc_perf = core_->socket()->cluster()->processor()->perf_stats();
@@ -402,65 +379,38 @@ uint32_t Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
auto socket_perf = core_->socket()->perf_stats();
auto lmem_perf = core_->local_mem()->perf_stats();
switch (addr) {
case VX_CSR_MPM_ICACHE_READS: return socket_perf.icache.reads & 0xffffffff;
case VX_CSR_MPM_ICACHE_READS_H: return socket_perf.icache.reads >> 32;
case VX_CSR_MPM_ICACHE_MISS_R: return socket_perf.icache.read_misses & 0xffffffff;
case VX_CSR_MPM_ICACHE_MISS_R_H: return socket_perf.icache.read_misses >> 32;
case VX_CSR_MPM_ICACHE_MSHR_ST: return socket_perf.icache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_ICACHE_MSHR_ST_H: return socket_perf.icache.mshr_stalls >> 32;
CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads);
CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses);
CSR_READ_64(VX_CSR_MPM_ICACHE_MSHR_ST, socket_perf.icache.mshr_stalls);

case VX_CSR_MPM_DCACHE_READS: return socket_perf.dcache.reads & 0xffffffff;
case VX_CSR_MPM_DCACHE_READS_H: return socket_perf.dcache.reads >> 32;
case VX_CSR_MPM_DCACHE_WRITES: return socket_perf.dcache.writes & 0xffffffff;
case VX_CSR_MPM_DCACHE_WRITES_H: return socket_perf.dcache.writes >> 32;
case VX_CSR_MPM_DCACHE_MISS_R: return socket_perf.dcache.read_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_R_H: return socket_perf.dcache.read_misses >> 32;
case VX_CSR_MPM_DCACHE_MISS_W: return socket_perf.dcache.write_misses & 0xffffffff;
case VX_CSR_MPM_DCACHE_MISS_W_H: return socket_perf.dcache.write_misses >> 32;
case VX_CSR_MPM_DCACHE_BANK_ST: return socket_perf.dcache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_BANK_ST_H: return socket_perf.dcache.bank_stalls >> 32;
case VX_CSR_MPM_DCACHE_MSHR_ST: return socket_perf.dcache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_DCACHE_MSHR_ST_H: return socket_perf.dcache.mshr_stalls >> 32;
CSR_READ_64(VX_CSR_MPM_DCACHE_READS, socket_perf.dcache.reads);
CSR_READ_64(VX_CSR_MPM_DCACHE_WRITES, socket_perf.dcache.writes);
CSR_READ_64(VX_CSR_MPM_DCACHE_MISS_R, socket_perf.dcache.read_misses);
CSR_READ_64(VX_CSR_MPM_DCACHE_MISS_W, socket_perf.dcache.write_misses);
CSR_READ_64(VX_CSR_MPM_DCACHE_BANK_ST, socket_perf.dcache.bank_stalls);
CSR_READ_64(VX_CSR_MPM_DCACHE_MSHR_ST, socket_perf.dcache.mshr_stalls);

case VX_CSR_MPM_L2CACHE_READS: return cluster_perf.l2cache.reads & 0xffffffff;
case VX_CSR_MPM_L2CACHE_READS_H: return cluster_perf.l2cache.reads >> 32;
case VX_CSR_MPM_L2CACHE_WRITES: return cluster_perf.l2cache.writes & 0xffffffff;
case VX_CSR_MPM_L2CACHE_WRITES_H: return cluster_perf.l2cache.writes >> 32;
case VX_CSR_MPM_L2CACHE_MISS_R: return cluster_perf.l2cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_R_H: return cluster_perf.l2cache.read_misses >> 32;
case VX_CSR_MPM_L2CACHE_MISS_W: return cluster_perf.l2cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MISS_W_H: return cluster_perf.l2cache.write_misses >> 32;
case VX_CSR_MPM_L2CACHE_BANK_ST: return cluster_perf.l2cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_BANK_ST_H: return cluster_perf.l2cache.bank_stalls >> 32;
case VX_CSR_MPM_L2CACHE_MSHR_ST: return cluster_perf.l2cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L2CACHE_MSHR_ST_H: return cluster_perf.l2cache.mshr_stalls >> 32;
CSR_READ_64(VX_CSR_MPM_L2CACHE_READS, cluster_perf.l2cache.reads);
CSR_READ_64(VX_CSR_MPM_L2CACHE_WRITES, cluster_perf.l2cache.writes);
CSR_READ_64(VX_CSR_MPM_L2CACHE_MISS_R, cluster_perf.l2cache.read_misses);
CSR_READ_64(VX_CSR_MPM_L2CACHE_MISS_W, cluster_perf.l2cache.write_misses);
CSR_READ_64(VX_CSR_MPM_L2CACHE_BANK_ST, cluster_perf.l2cache.bank_stalls);
CSR_READ_64(VX_CSR_MPM_L2CACHE_MSHR_ST, cluster_perf.l2cache.mshr_stalls);

case VX_CSR_MPM_L3CACHE_READS: return proc_perf.l3cache.reads & 0xffffffff;
case VX_CSR_MPM_L3CACHE_READS_H: return proc_perf.l3cache.reads >> 32;
case VX_CSR_MPM_L3CACHE_WRITES: return proc_perf.l3cache.writes & 0xffffffff;
case VX_CSR_MPM_L3CACHE_WRITES_H: return proc_perf.l3cache.writes >> 32;
case VX_CSR_MPM_L3CACHE_MISS_R: return proc_perf.l3cache.read_misses & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MISS_R_H: return proc_perf.l3cache.read_misses >> 32;
case VX_CSR_MPM_L3CACHE_MISS_W: return proc_perf.l3cache.write_misses & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MISS_W_H: return proc_perf.l3cache.write_misses >> 32;
case VX_CSR_MPM_L3CACHE_BANK_ST: return proc_perf.l3cache.bank_stalls & 0xffffffff;
case VX_CSR_MPM_L3CACHE_BANK_ST_H: return proc_perf.l3cache.bank_stalls >> 32;
case VX_CSR_MPM_L3CACHE_MSHR_ST: return proc_perf.l3cache.mshr_stalls & 0xffffffff;
case VX_CSR_MPM_L3CACHE_MSHR_ST_H: return proc_perf.l3cache.mshr_stalls >> 32;
CSR_READ_64(VX_CSR_MPM_L3CACHE_READS, proc_perf.l3cache.reads);
CSR_READ_64(VX_CSR_MPM_L3CACHE_WRITES, proc_perf.l3cache.writes);
CSR_READ_64(VX_CSR_MPM_L3CACHE_MISS_R, proc_perf.l3cache.read_misses);
CSR_READ_64(VX_CSR_MPM_L3CACHE_MISS_W, proc_perf.l3cache.write_misses);
CSR_READ_64(VX_CSR_MPM_L3CACHE_BANK_ST, proc_perf.l3cache.bank_stalls);
CSR_READ_64(VX_CSR_MPM_L3CACHE_MSHR_ST, proc_perf.l3cache.mshr_stalls);

case VX_CSR_MPM_MEM_READS: return proc_perf.mem_reads & 0xffffffff;
case VX_CSR_MPM_MEM_READS_H: return proc_perf.mem_reads >> 32;
case VX_CSR_MPM_MEM_WRITES: return proc_perf.mem_writes & 0xffffffff;
case VX_CSR_MPM_MEM_WRITES_H: return proc_perf.mem_writes >> 32;
case VX_CSR_MPM_MEM_LT: return proc_perf.mem_latency & 0xffffffff;
case VX_CSR_MPM_MEM_LT_H: return proc_perf.mem_latency >> 32;

case VX_CSR_MPM_LMEM_READS: return lmem_perf.reads & 0xffffffff;
case VX_CSR_MPM_LMEM_READS_H: return lmem_perf.reads >> 32;
case VX_CSR_MPM_LMEM_WRITES: return lmem_perf.writes & 0xffffffff;
case VX_CSR_MPM_LMEM_WRITES_H: return lmem_perf.writes >> 32;
case VX_CSR_MPM_LMEM_BANK_ST: return lmem_perf.bank_stalls & 0xffffffff;
case VX_CSR_MPM_LMEM_BANK_ST_H: return lmem_perf.bank_stalls >> 32;
CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads);
CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes);
CSR_READ_64(VX_CSR_MPM_MEM_LT, proc_perf.mem_latency);

CSR_READ_64(VX_CSR_MPM_LMEM_READS, lmem_perf.reads);
CSR_READ_64(VX_CSR_MPM_LMEM_WRITES, lmem_perf.writes);
CSR_READ_64(VX_CSR_MPM_LMEM_BANK_ST, lmem_perf.bank_stalls);
}
} break;
default: {
@@ -476,7 +426,7 @@ uint32_t Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
return 0;
}

void Emulator::set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid) {
void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) {
__unused (tid);
switch (addr) {
case VX_CSR_FFLAGS:

@@ -488,6 +438,9 @@ void Emulator::set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid
case VX_CSR_FCSR:
warps_.at(wid).fcsr = value & 0xff;
break;
case VX_CSR_MSCRATCH:
csr_mscratch_ = value;
break;
case VX_CSR_SATP:
case VX_CSR_MSTATUS:
case VX_CSR_MEDELEG:

@@ -499,8 +452,7 @@ void Emulator::set_csr(uint32_t addr, uint32_t value, uint32_t tid, uint32_t wid
case VX_CSR_PMPADDR0:
case VX_CSR_MNSTATUS:
break;
default:
{
default: {
std::cout << std::hex << "Error: invalid CSR write addr=0x" << addr << ", value=0x" << value << std::endl;
std::abort();
}