CSRs update

This commit is contained in:
Blaise Tine 2024-05-06 00:51:38 -07:00
parent c8bae13448
commit badfb24e01
6 changed files with 61 additions and 51 deletions

View file

@ -91,7 +91,7 @@
`endif
`ifndef NUM_BARRIERS
`define NUM_BARRIERS 4
`define NUM_BARRIERS (`NUM_WARPS/2)
`endif
`ifndef SOCKET_SIZE

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -32,7 +32,7 @@
// Machine Performance-monitoring counters classes ////////////////////////////
`define VX_DCR_MPM_CLASS_NONE 0
`define VX_DCR_MPM_CLASS_NONE 0
`define VX_DCR_MPM_CLASS_CORE 1
`define VX_DCR_MPM_CLASS_MEM 2
@ -41,7 +41,7 @@
`define VX_CSR_FFLAGS 12'h001
`define VX_CSR_FRM 12'h002
`define VX_CSR_FCSR 12'h003
`define VX_CSR_SATP 12'h180
`define VX_CSR_PMPCFG0 12'h3A0
@ -101,7 +101,7 @@
`define VX_CSR_MPM_STORES_H 12'hB8D
`define VX_CSR_MPM_IFETCH_LT 12'hB0E
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
`define VX_CSR_MPM_LOAD_LT 12'hB0F
`define VX_CSR_MPM_LOAD_LT 12'hB0F
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
// SFU: scoreboard
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
@ -187,11 +187,12 @@
`define VX_CSR_THREAD_ID 12'hCC0
`define VX_CSR_WARP_ID 12'hCC1
`define VX_CSR_CORE_ID 12'hCC2
`define VX_CSR_WARP_MASK 12'hCC3
`define VX_CSR_THREAD_MASK 12'hCC4 // warning! this value is also used in LLVM
`define VX_CSR_ACTIVE_WARPS 12'hCC3
`define VX_CSR_ACTIVE_THREADS 12'hCC4 // warning! this value is also used in LLVM
`define VX_CSR_NUM_THREADS 12'hFC0
`define VX_CSR_NUM_WARPS 12'hFC1
`define VX_CSR_NUM_CORES 12'hFC2
`define VX_CSR_NUM_BARRIERS 12'hFC3
`endif // VX_TYPES_VH

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -62,7 +62,7 @@ import VX_fpu_pkg::*;
output wire [`XLEN-1:0] read_data_ro,
output wire [`XLEN-1:0] read_data_rw,
input wire write_enable,
input wire write_enable,
input wire [`UUID_WIDTH-1:0] write_uuid,
input wire [`NW_WIDTH-1:0] write_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] write_addr,
@ -77,7 +77,7 @@ import VX_fpu_pkg::*;
reg [`XLEN-1:0] mscratch;
`ifdef EXT_F_ENABLE
`ifdef EXT_F_ENABLE
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FP_FLAGS_BITS-1:0] fcsr, fcsr_n;
wire [`NUM_FPU_BLOCKS-1:0] fpu_write_enable;
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
@ -88,7 +88,7 @@ import VX_fpu_pkg::*;
assign fpu_write_wid[i] = fpu_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_csr_if[i].write_fflags;
end
always @(*) begin
fcsr_n = fcsr;
for (integer i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
@ -96,7 +96,7 @@ import VX_fpu_pkg::*;
fcsr_n[fpu_write_wid[i]][`FP_FLAGS_BITS-1:0] = fcsr[fpu_write_wid[i]][`FP_FLAGS_BITS-1:0]
| fpu_write_fflags[i];
end
end
end
if (write_enable) begin
case (write_addr)
`VX_CSR_FFLAGS: fcsr_n[write_wid][`FP_FLAGS_BITS-1:0] = write_data[`FP_FLAGS_BITS-1:0];
@ -106,7 +106,7 @@ import VX_fpu_pkg::*;
endcase
end
end
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
end
@ -146,7 +146,7 @@ import VX_fpu_pkg::*;
`VX_CSR_MSCRATCH: begin
mscratch <= write_data;
end
default: begin
default: begin
`ASSERT(0, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
end
endcase
@ -163,7 +163,7 @@ import VX_fpu_pkg::*;
read_data_ro_r = '0;
read_data_rw_r = '0;
read_addr_valid_r = 1;
case (read_addr)
case (read_addr)
`VX_CSR_MVENDORID : read_data_ro_r = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = `XLEN'(`IMPLEMENTATION_ID);
@ -171,25 +171,26 @@ import VX_fpu_pkg::*;
`ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS : read_data_rw_r = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]);
`VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]);
`endif
`VX_CSR_MSCRATCH : read_data_rw_r = mscratch;
`VX_CSR_WARP_ID : read_data_ro_r = `XLEN'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = `XLEN'(CORE_ID);
`VX_CSR_THREAD_MASK: read_data_ro_r = `XLEN'(thread_masks[read_wid]);
`VX_CSR_WARP_MASK : read_data_ro_r = `XLEN'(active_warps);
`VX_CSR_ACTIVE_THREADS: read_data_ro_r = `XLEN'(thread_masks[read_wid]);
`VX_CSR_ACTIVE_WARPS: read_data_ro_r = `XLEN'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_NUM_BARRIERS: read_data_ro_r = `XLEN'(`NUM_BARRIERS);
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles);
`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret);
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret);
`VX_CSR_SATP,
`VX_CSR_MSTATUS,
`VX_CSR_MNSTATUS,
@ -210,7 +211,7 @@ import VX_fpu_pkg::*;
case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
// PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched_idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched_stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.ibf_stalls);
@ -231,7 +232,7 @@ import VX_fpu_pkg::*;
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_r, pipeline_perf_if.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_r, pipeline_perf_if.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_r, pipeline_perf_if.load_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_r, pipeline_perf_if.load_latency);
default:;
endcase
end
@ -248,17 +249,17 @@ import VX_fpu_pkg::*;
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_r, mem_perf_if.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_r, mem_perf_if.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_r, mem_perf_if.dcache.mshr_stalls);
// PERF: lmem
// PERF: lmem
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_r, mem_perf_if.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_r, mem_perf_if.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_r, mem_perf_if.lmem.bank_stalls);
// PERF: l2cache
// PERF: l2cache
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_r, mem_perf_if.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_r, mem_perf_if.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_r, mem_perf_if.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_r, mem_perf_if.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls);
// PERF: l3cache
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_r, mem_perf_if.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_r, mem_perf_if.l3cache.writes);

View file

@ -178,17 +178,17 @@ inline int vx_core_id() {
return ret;
}
// Return current thread mask
inline int vx_thread_mask() {
// Return active threads mask
inline int vx_active_threads() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_MASK));
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_ACTIVE_THREADS));
return ret;
}
// Return active warps mask
inline int vx_warp_mask() {
inline int vx_active_warps() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_MASK));
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_ACTIVE_WARPS));
return ret;
}
@ -213,6 +213,13 @@ inline int vx_num_cores() {
return ret;
}
// Return the number of barriers
inline int vx_num_barriers() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_BARRIERS));
return ret;
}
// Return the hart identifier (thread id accross the processor)
inline int vx_hart_id() {
int ret;

View file

@ -358,11 +358,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
case VX_CSR_THREAD_ID: return tid;
case VX_CSR_WARP_ID: return wid;
case VX_CSR_CORE_ID: return core_->id();
case VX_CSR_THREAD_MASK:return warps_.at(wid).tmask.to_ulong();
case VX_CSR_WARP_MASK: return active_warps_.to_ulong();
case VX_CSR_ACTIVE_THREADS:return warps_.at(wid).tmask.to_ulong();
case VX_CSR_ACTIVE_WARPS:return active_warps_.to_ulong();
case VX_CSR_NUM_THREADS:return arch_.num_threads();
case VX_CSR_NUM_WARPS: return arch_.num_warps();
case VX_CSR_NUM_CORES: return uint32_t(arch_.num_cores()) * arch_.num_clusters();
case VX_CSR_NUM_BARRIERS:return arch_.num_barriers();
case VX_CSR_MSCRATCH: return csr_mscratch_;
CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles);
CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs);

View file

@ -65,7 +65,7 @@ int test_local_memory() {
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
vx_tmc(tmask);
do_lmem_wr();
do_lmem_rd();
vx_tmc_one();
@ -87,7 +87,7 @@ int test_tmc() {
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
vx_tmc(tmask);
do_tmc();
vx_tmc_one();
@ -146,7 +146,7 @@ int dvg_buffer[4];
void __attribute__((noinline)) __attribute__((optimize("O1"))) do_divergence() {
int tid = vx_thread_id();
int cond1 = tid < 2;
int sp1 = vx_split(cond1);
int sp1 = vx_split(cond1);
if (cond1) {
{
int cond2 = tid < 1;
@ -244,9 +244,9 @@ void __attribute__((noinline)) do_serial() {
}
int test_serial() {
PRINTF("Serial Test\n");
PRINTF("Serial Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
do_serial();
vx_tmc_one();
@ -258,10 +258,10 @@ int test_serial() {
int tmask_buffer[8];
int __attribute__((noinline)) do_tmask() {
int __attribute__((noinline)) do_tmask() {
int tid = vx_thread_id();
int tmask = make_select_tmask(tid);
int cur_tmask = vx_thread_mask();
int cur_tmask = vx_active_threads();
tmask_buffer[tid] = (cur_tmask == tmask) ? (65 + tid) : 0;
return tid + 1;
}
@ -275,11 +275,11 @@ int test_tmask() {
int num_threads = std::min(vx_num_threads(), 8);
int tid = 0;
l_start:
l_start:
int tmask = make_select_tmask(tid);
vx_tmc(tmask);
tid = do_tmask();
if (tid < num_threads)
vx_tmc(tmask);
tid = do_tmask();
if (tid < num_threads)
goto l_start;
vx_tmc_one();
@ -296,7 +296,7 @@ void barrier_kernel() {
unsigned wid = vx_warp_id();
for (int i = 0; i <= (wid * 256); ++i) {
++barrier_stall;
}
}
barrier_buffer[wid] = 65 + wid;
vx_barrier(0, barrier_ctr);
vx_tmc(0 == wid);
@ -308,7 +308,7 @@ int test_barrier() {
barrier_ctr = num_warps;
barrier_stall = 0;
vx_wspawn(num_warps, barrier_kernel);
barrier_kernel();
barrier_kernel();
return check_error(barrier_buffer, 0, num_warps);
}