CSRs I/O refactoring

This commit is contained in:
Blaise Tine 2021-06-11 03:08:07 -07:00
parent a46d6cb606
commit 3cc1190cd7
33 changed files with 881 additions and 1385 deletions

View file

@ -76,21 +76,20 @@ extern int vx_upload_kernel_file(vx_device_h device, const char* filename) {
return err;
}
int vx_csr_get_l(vx_device_h device, int core_id, int addr, int addr_h, uint64_t* value) {
int ret = 0;
unsigned value_lo, value_hi;
ret |= vx_csr_get(device, core_id, addr, &value_lo);
ret |= vx_csr_get(device, core_id, addr_h, &value_hi);
*value = (uint64_t(value_hi) << 32) | value_lo;
return ret;
/*static uint32_t get_csr_32(const uint32_t* buffer, int addr) {
uint32_t value_lo = buffer[addr - CSR_MPM_BASE];
return value_lo;
}*/
// Assembles a 64-bit counter value from a buffer of 32-bit CSR words.
// The low word is read at index (addr - CSR_MPM_BASE); the matching high
// word is read 32 entries after it. No bounds checking is performed here.
static uint64_t get_csr_64(const uint32_t* buffer, int addr) {
  const int index = addr - CSR_MPM_BASE;
  const uint64_t lo = buffer[index];
  const uint64_t hi = buffer[index + 32];
  return (hi << 32) | lo;
}
extern int vx_dump_perf(vx_device_h device, FILE* stream) {
int ret = 0;
unsigned num_cores;
vx_csr_get(device, 0, CSR_NC, &num_cores);
uint64_t instrs = 0;
uint64_t cycles = 0;
@ -127,12 +126,23 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
uint64_t mem_stalls = 0;
uint64_t mem_lat = 0;
#endif
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t instrs_per_core, cycles_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_INSTRET, CSR_INSTRET_H, &instrs_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_CYCLE, CSR_CYCLE_H, &cycles_per_core);
unsigned num_cores;
ret = vx_dev_caps(device, VX_CAPS_MAX_CORES, &num_cores);
if (ret)
return ret;
vx_buffer_h staging_buf;
ret |= vx_alloc_shared_mem(device, 64 * sizeof(uint32_t), &staging_buf);
if (ret)
return ret;
auto staging_ptr = (uint32_t*)vx_host_ptr(staging_buf);
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
ret |= vx_copy_from_dev(staging_buf, IO_ADDR_CSR + 64 * sizeof(uint32_t) * core_id, 64 * sizeof(uint32_t), 0);
uint64_t instrs_per_core = get_csr_64(staging_ptr, CSR_MINSTRET);
uint64_t cycles_per_core = get_csr_64(staging_ptr, CSR_MCYCLE);
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
instrs += instrs_per_core;
@ -141,133 +151,110 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
#ifdef PERF_ENABLE
// PERF: pipeline
// ibuffer_stall
uint64_t ibuffer_stalls_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_IBUF_ST, CSR_MPM_IBUF_ST_H, &ibuffer_stalls_per_core);
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_IBUF_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core);
ibuffer_stalls += ibuffer_stalls_per_core;
// scoreboard_stall
uint64_t scoreboard_stalls_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_SCRB_ST, CSR_MPM_SCRB_ST_H, &scoreboard_stalls_per_core);
uint64_t scoreboard_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_SCRB_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core);
scoreboard_stalls += scoreboard_stalls_per_core;
// alu_stall
uint64_t alu_stalls_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ALU_ST, CSR_MPM_ALU_ST_H, &alu_stalls_per_core);
uint64_t alu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_ALU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
alu_stalls += alu_stalls_per_core;
// lsu_stall
uint64_t lsu_stalls_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_LSU_ST, CSR_MPM_LSU_ST_H, &lsu_stalls_per_core);
uint64_t lsu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_LSU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
lsu_stalls += lsu_stalls_per_core;
// csr_stall
uint64_t csr_stalls_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_CSR_ST, CSR_MPM_CSR_ST_H, &csr_stalls_per_core);
uint64_t csr_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_CSR_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: csr unit stalls=%ld\n", core_id, csr_stalls_per_core);
csr_stalls += csr_stalls_per_core;
// fpu_stall
uint64_t fpu_stalls_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_FPU_ST, CSR_MPM_FPU_ST_H, &fpu_stalls_per_core);
uint64_t fpu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_FPU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
fpu_stalls += fpu_stalls_per_core;
// gpu_stall
uint64_t gpu_stalls_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_GPU_ST, CSR_MPM_GPU_ST_H, &gpu_stalls_per_core);
uint64_t gpu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_GPU_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, gpu_stalls_per_core);
gpu_stalls += gpu_stalls_per_core;
// PERF: Icache
// total reads
uint64_t icache_reads_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_READS, CSR_MPM_ICACHE_READS_H, &icache_reads_per_core);
uint64_t icache_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_READS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads_per_core);
icache_reads += icache_reads_per_core;
// read misses
uint64_t icache_miss_r_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_MISS_R, CSR_MPM_ICACHE_MISS_R_H, &icache_miss_r_per_core);
uint64_t icache_miss_r_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_MISS_R);
int icache_read_hit_ratio = (int)((1.0 - (double(icache_miss_r_per_core) / double(icache_reads_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio);
icache_read_misses += icache_miss_r_per_core;
// pipeline stalls
uint64_t icache_pipe_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_PIPE_ST, CSR_MPM_ICACHE_PIPE_ST_H, &icache_pipe_st_per_core);
uint64_t icache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_PIPE_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache pipeline stalls=%ld\n", core_id, icache_pipe_st_per_core);
icache_pipe_stalls += icache_pipe_st_per_core;
// response stalls
uint64_t icache_crsp_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_ICACHE_CRSP_ST, CSR_MPM_ICACHE_CRSP_ST_H, &icache_crsp_st_per_core);
uint64_t icache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_CRSP_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reponse stalls=%ld\n", core_id, icache_crsp_st_per_core);
icache_rsp_stalls += icache_crsp_st_per_core;
// PERF: Dcache
// total reads
uint64_t dcache_reads_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_READS, CSR_MPM_DCACHE_READS_H, &dcache_reads_per_core);
uint64_t dcache_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_READS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads_per_core);
dcache_reads += dcache_reads_per_core;
// total write
uint64_t dcache_writes_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_WRITES, CSR_MPM_DCACHE_WRITES_H, &dcache_writes_per_core);
uint64_t dcache_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_WRITES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes_per_core);
dcache_writes += dcache_writes_per_core;
// read misses
uint64_t dcache_miss_r_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_R, CSR_MPM_DCACHE_MISS_R_H, &dcache_miss_r_per_core);
uint64_t dcache_miss_r_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MISS_R);
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_miss_r_per_core) / double(dcache_reads_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_r_per_core, dcache_read_hit_ratio);
dcache_read_misses += dcache_miss_r_per_core;
// write misses
uint64_t dcache_miss_w_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MISS_W, CSR_MPM_DCACHE_MISS_W_H, &dcache_miss_w_per_core);
uint64_t dcache_miss_w_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MISS_W);
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_miss_w_per_core) / double(dcache_writes_per_core))) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_w_per_core, dcache_write_hit_ratio);
dcache_write_misses += dcache_miss_w_per_core;
// bank_stalls
uint64_t dcache_bank_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_BANK_ST, CSR_MPM_DCACHE_BANK_ST_H, &dcache_bank_st_per_core);
uint64_t dcache_bank_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_BANK_ST);
int dcache_bank_utilization = (int)((double(dcache_reads_per_core + dcache_writes_per_core) / double(dcache_reads_per_core + dcache_writes_per_core + dcache_bank_st_per_core)) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_st_per_core, dcache_bank_utilization);
dcache_bank_stalls += dcache_bank_st_per_core;
// mshr_stalls
uint64_t dcache_mshr_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_MSHR_ST, CSR_MPM_DCACHE_MSHR_ST_H, &dcache_mshr_st_per_core);
uint64_t dcache_mshr_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MSHR_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core);
dcache_mshr_stalls += dcache_mshr_st_per_core;
// pipeline stalls
uint64_t dcache_pipe_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_PIPE_ST, CSR_MPM_DCACHE_PIPE_ST_H, &dcache_pipe_st_per_core);
uint64_t dcache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_PIPE_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core);
dcache_pipe_stalls += dcache_pipe_st_per_core;
// response stalls
uint64_t dcache_crsp_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DCACHE_CRSP_ST, CSR_MPM_DCACHE_CRSP_ST_H, &dcache_crsp_st_per_core);
uint64_t dcache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_CRSP_ST);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core);
dcache_rsp_stalls += dcache_crsp_st_per_core;
// PERF: SMEM
// total reads
uint64_t smem_reads_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_READS, CSR_MPM_SMEM_READS_H, &smem_reads_per_core);
uint64_t smem_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_SMEM_READS);
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads_per_core);
smem_reads += smem_reads_per_core;
// total write
uint64_t smem_writes_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_WRITES, CSR_MPM_SMEM_WRITES_H, &smem_writes_per_core);
uint64_t smem_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_SMEM_WRITES);
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes_per_core);
smem_writes += smem_writes_per_core;
// bank_stalls
uint64_t smem_bank_st_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_SMEM_BANK_ST, CSR_MPM_SMEM_BANK_ST_H, &smem_bank_st_per_core);
uint64_t smem_bank_st_per_core = get_csr_64(staging_ptr, CSR_MPM_SMEM_BANK_ST);
int smem_bank_utilization = (int)((double(smem_reads_per_core + smem_writes_per_core) / double(smem_reads_per_core + smem_writes_per_core + smem_bank_st_per_core)) * 100);
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization);
smem_bank_stalls += smem_bank_st_per_core;
// PERF: memory
uint64_t mem_reads_per_core, mem_writes_per_core, mem_stalls_per_core, mem_lat_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_READS, CSR_MPM_MEM_READS_H, &mem_reads_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_WRITES, CSR_MPM_MEM_WRITES_H, &mem_writes_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_ST, CSR_MPM_MEM_ST_H, &mem_stalls_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_LAT, CSR_MPM_MEM_LAT_H, &mem_lat_per_core);
uint64_t mem_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_READS);
uint64_t mem_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_WRITES);
uint64_t mem_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_ST);
uint64_t mem_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_LAT);
int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100);
int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core);

View file

@ -59,12 +59,6 @@ int vx_start(vx_device_h hdevice);
// Wait for device ready with milliseconds timeout
int vx_ready_wait(vx_device_h hdevice, long long timeout);
// set device constant registers
int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value);
// get device constant registers
int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value);
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
// upload kernel bytes to device

View file

@ -37,25 +37,20 @@
#define CMD_MEM_READ AFU_IMAGE_CMD_MEM_READ
#define CMD_MEM_WRITE AFU_IMAGE_CMD_MEM_WRITE
#define CMD_RUN AFU_IMAGE_CMD_RUN
#define CMD_CSR_READ AFU_IMAGE_CMD_CSR_READ
#define CMD_CSR_WRITE AFU_IMAGE_CMD_CSR_WRITE
#define MMIO_CMD_TYPE (AFU_IMAGE_MMIO_CMD_TYPE * 4)
#define MMIO_IO_ADDR (AFU_IMAGE_MMIO_IO_ADDR * 4)
#define MMIO_MEM_ADDR (AFU_IMAGE_MMIO_MEM_ADDR * 4)
#define MMIO_DATA_SIZE (AFU_IMAGE_MMIO_DATA_SIZE * 4)
#define MMIO_DEV_CAPS (AFU_IMAGE_MMIO_DEV_CAPS * 4)
#define MMIO_STATUS (AFU_IMAGE_MMIO_STATUS * 4)
#define MMIO_CSR_CORE (AFU_IMAGE_MMIO_CSR_CORE * 4)
#define MMIO_CSR_ADDR (AFU_IMAGE_MMIO_CSR_ADDR * 4)
#define MMIO_CSR_DATA (AFU_IMAGE_MMIO_CSR_DATA * 4)
#define MMIO_CSR_READ (AFU_IMAGE_MMIO_CSR_READ * 4)
///////////////////////////////////////////////////////////////////////////////
typedef struct vx_device_ {
fpga_handle fpga;
size_t mem_allocation;
unsigned implementation_id;
unsigned version;
unsigned num_cores;
unsigned num_warps;
unsigned num_threads;
@ -89,7 +84,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
switch (caps_id) {
case VX_CAPS_VERSION:
*value = device->implementation_id;
*value = device->version;
break;
case VX_CAPS_MAX_CORES:
*value = device->num_cores;
@ -195,21 +190,22 @@ extern int vx_dev_open(vx_device_h* hdevice) {
device->fpga = accel_handle;
device->mem_allocation = ALLOC_BASE_ADDR;
{
// Load device CAPS
int ret = 0;
ret |= vx_csr_get(device, 0, CSR_MIMPID, &device->implementation_id);
ret |= vx_csr_get(device, 0, CSR_NC, &device->num_cores);
ret |= vx_csr_get(device, 0, CSR_NW, &device->num_warps);
ret |= vx_csr_get(device, 0, CSR_NT, &device->num_threads);
uint64_t dev_caps;
int ret = fpgaReadMMIO64(device->fpga, 0, MMIO_DEV_CAPS, &dev_caps);
if (ret != FPGA_OK) {
fpgaClose(accel_handle);
return ret;
}
device->version = (dev_caps >> 0) & 0xffff;
device->num_cores = (dev_caps >> 16) & 0xffff;
device->num_warps = (dev_caps >> 32) & 0xffff;
device->num_threads = (dev_caps >> 48) & 0xffff;
#ifndef NDEBUG
fprintf(stdout, "[VXDRV] DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n",
device->implementation_id, device->num_cores, device->num_warps, device->num_threads);
device->version, device->num_cores, device->num_warps, device->num_threads);
#endif
}
@ -470,52 +466,5 @@ extern int vx_start(vx_device_h hdevice) {
// start execution
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN));
return 0;
}
// set device constant registers
//
// Issues a CSR write command to the device over MMIO.
//   hdevice : device handle; a null handle is rejected with -1.
//   core_id : value written to the MMIO_CSR_CORE register.
//   addr    : value written to the MMIO_CSR_ADDR register.
//   value   : value written to the MMIO_CSR_DATA register.
// Returns 0 once all four MMIO writes have been issued, -1 when the handle is
// null or vx_ready_wait() reports a non-zero status.
// NOTE(review): the error path taken by CHECK_RES on a failed MMIO write is
// defined by that macro, which is not visible here - confirm.
extern int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value) {
if (nullptr == hdevice)
return -1;
vx_device_t *device = ((vx_device_t*)hdevice);
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
// write CSR value
// The operands (core, address, data) are written first; the command type
// register is written last with CMD_CSR_WRITE.
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core_id));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, addr));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA, value));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_CSR_WRITE));
return 0;
}
// get device constant registers
//
// Issues a CSR read command to the device over MMIO and returns the result.
//   hdevice : device handle; a null handle is rejected with -1.
//   core_id : value written to the MMIO_CSR_CORE register.
//   addr    : value written to the MMIO_CSR_ADDR register.
//   value   : output; receives the 64-bit MMIO_CSR_READ register truncated to
//             unsigned. A null pointer is rejected with -1.
// Returns 0 on success, -1 when an argument is null or either
// vx_ready_wait() call reports a non-zero status.
// NOTE(review): the error path taken by CHECK_RES on a failed MMIO access is
// defined by that macro, which is not visible here - confirm.
extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value) {
if (nullptr == hdevice || nullptr == value)
return -1;
vx_device_t *device = ((vx_device_t*)hdevice);
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
// issue the CSR read request (core and address first, command type last)
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core_id));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, addr));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_CSR_READ));
// Wait for the device to report ready again before fetching the result
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
// The MMIO register is read as 64 bits; only the low bits that fit in
// unsigned are returned to the caller.
uint64_t value64;
CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_READ, &value64));
*value = (unsigned)value64;
return 0;
}

View file

@ -7,21 +7,16 @@
#define AFU_ACCEL_NAME "vortex_afu"
#define AFU_ACCEL_UUID "35F9452B-25C2-434C-93D5-6F8C60DB361C"
#define AFU_IMAGE_CMD_CSR_READ 4
#define AFU_IMAGE_CMD_CSR_WRITE 5
#define AFU_IMAGE_CMD_MEM_READ 1
#define AFU_IMAGE_CMD_MEM_WRITE 2
#define AFU_IMAGE_CMD_RUN 3
#define AFU_IMAGE_MMIO_CMD_TYPE 10
#define AFU_IMAGE_MMIO_CSR_ADDR 26
#define AFU_IMAGE_MMIO_CSR_CORE 24
#define AFU_IMAGE_MMIO_CSR_DATA 28
#define AFU_IMAGE_MMIO_CSR_READ 30
#define AFU_IMAGE_MMIO_DATA_SIZE 16
#define AFU_IMAGE_MMIO_IO_ADDR 12
#define AFU_IMAGE_MMIO_MEM_ADDR 14
#define AFU_IMAGE_MMIO_SCOPE_READ 20
#define AFU_IMAGE_MMIO_SCOPE_WRITE 22
#define AFU_IMAGE_MMIO_DEV_CAPS 24
#define AFU_IMAGE_MMIO_STATUS 18
#define AFU_IMAGE_POWER 0
#define AFU_TOP_IFC "ccip_std_afu_avalon_mm"

View file

@ -144,28 +144,6 @@ public:
return 0;
}
// Forwards a CSR set request (core_id, addr, value) to simulator_.
// Waits on future_ first when it is valid, then steps the simulator until
// csr_req_active() turns false. Always returns 0.
int set_csr(int core_id, int addr, unsigned value) {
  // ensure prior run completed
  if (future_.valid())
    future_.wait();

  simulator_.set_csr(core_id, addr, value);

  // keep stepping while the CSR request is still active
  while (simulator_.csr_req_active())
    simulator_.step();

  return 0;
}
// Forwards a CSR get request (core_id, addr, value) to simulator_.
// Waits on future_ first when it is valid, then steps the simulator until
// csr_req_active() turns false. Always returns 0.
int get_csr(int core_id, int addr, unsigned *value) {
  // ensure prior run completed
  if (future_.valid())
    future_.wait();

  simulator_.get_csr(core_id, addr, value);

  // keep stepping while the CSR request is still active
  while (simulator_.csr_req_active())
    simulator_.step();

  return 0;
}
private:
size_t mem_allocation_;
@ -330,22 +308,4 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
vx_device *device = ((vx_device*)hdevice);
return device->wait(timeout);
}
// C entry point for a CSR set request: casts the handle to vx_device and
// delegates to its set_csr(). A null handle yields -1; otherwise the result
// of set_csr() is returned unchanged.
extern int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value) {
  if (hdevice != nullptr) {
    vx_device* const target = (vx_device*)hdevice;
    return target->set_csr(core_id, addr, value);
  }
  return -1;
}
// C entry point for a CSR get request: casts the handle to vx_device and
// delegates to its get_csr().
//   hdevice : device handle; null is rejected with -1.
//   value   : output pointer handed to get_csr(); null is rejected with -1.
// Otherwise the result of get_csr() is returned unchanged.
extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value) {
  // Reject a null output pointer up front as well, so it is never passed
  // down to the device layer.
  if (nullptr == hdevice || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  return device->get_csr(core_id, addr, value);
}

View file

@ -376,22 +376,4 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
vx_device *device = ((vx_device*)hdevice);
return device->wait(timeout);
}
// C entry point for a CSR set request: casts the handle to vx_device and
// delegates to its set_csr(). A null handle yields -1; otherwise the result
// of set_csr() is returned unchanged.
extern int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value) {
  if (hdevice != nullptr) {
    vx_device* const target = (vx_device*)hdevice;
    return target->set_csr(core_id, addr, value);
  }
  return -1;
}
// C entry point for a CSR get request: casts the handle to vx_device and
// delegates to its get_csr().
//   hdevice : device handle; null is rejected with -1.
//   value   : output pointer handed to get_csr(); null is rejected with -1.
// Otherwise the result of get_csr() is returned unchanged.
extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned *value) {
  // Reject a null output pointer up front as well, so it is never passed
  // down to the device layer.
  if (nullptr == hdevice || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  return device->get_csr(core_id, addr, value);
}

View file

@ -42,12 +42,4 @@ extern int vx_start(vx_device_h /*hdevice*/) {
extern int vx_ready_wait(vx_device_h /*hdevice*/, long long /*timeout*/) {
return -1;
}
// Stub implementation: ignores every argument (parameter names are commented
// out) and always returns -1.
extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned /*value*/) {
return -1;
}
// Stub implementation: ignores every argument (parameter names are commented
// out), never writes through the output pointer, and always returns -1.
extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned* /*value*/) {
return -1;
}

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -24,19 +24,6 @@ module VX_cluster #(
input wire [`L2MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// CSR Request
input wire csr_req_valid,
input wire [`NC_BITS-1:0] csr_req_coreid,
input wire [11:0] csr_req_addr,
input wire csr_req_rw,
input wire [31:0] csr_req_data,
output wire csr_req_ready,
// CSR Response
output wire csr_rsp_valid,
output wire [31:0] csr_rsp_data,
input wire csr_rsp_ready,
// Status
output wire busy
);
@ -55,16 +42,6 @@ module VX_cluster #(
wire [`NUM_CORES-1:0][`XMEM_TAG_WIDTH-1:0] per_core_mem_rsp_tag;
wire [`NUM_CORES-1:0] per_core_mem_rsp_ready;
wire [`NUM_CORES-1:0] per_core_csr_req_valid;
wire [`NUM_CORES-1:0][11:0] per_core_csr_req_addr;
wire [`NUM_CORES-1:0] per_core_csr_req_rw;
wire [`NUM_CORES-1:0][31:0] per_core_csr_req_data;
wire [`NUM_CORES-1:0] per_core_csr_req_ready;
wire [`NUM_CORES-1:0] per_core_csr_rsp_valid;
wire [`NUM_CORES-1:0][31:0] per_core_csr_rsp_data;
wire [`NUM_CORES-1:0] per_core_csr_rsp_ready;
wire [`NUM_CORES-1:0] per_core_busy;
for (genvar i = 0; i < `NUM_CORES; i++) begin
@ -99,56 +76,9 @@ module VX_cluster #(
.mem_rsp_tag (per_core_mem_rsp_tag [i]),
.mem_rsp_ready (per_core_mem_rsp_ready[i]),
.csr_req_valid (per_core_csr_req_valid [i]),
.csr_req_rw (per_core_csr_req_rw [i]),
.csr_req_addr (per_core_csr_req_addr [i]),
.csr_req_data (per_core_csr_req_data [i]),
.csr_req_ready (per_core_csr_req_ready [i]),
.csr_rsp_valid (per_core_csr_rsp_valid [i]),
.csr_rsp_data (per_core_csr_rsp_data [i]),
.csr_rsp_ready (per_core_csr_rsp_ready [i]),
.busy (per_core_busy [i])
);
end
VX_csr_arb #(
.NUM_REQS (`NUM_CORES),
.DATA_WIDTH (32),
.ADDR_WIDTH (12),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) csr_arb (
.clk (clk),
.reset (reset),
.request_id (csr_req_coreid),
// input requests
.req_valid_in (csr_req_valid),
.req_addr_in (csr_req_addr),
.req_rw_in (csr_req_rw),
.req_data_in (csr_req_data),
.req_ready_in (csr_req_ready),
// output request
.req_valid_out (per_core_csr_req_valid),
.req_addr_out (per_core_csr_req_addr),
.req_rw_out (per_core_csr_req_rw),
.req_data_out (per_core_csr_req_data),
.req_ready_out (per_core_csr_req_ready),
// input responses
.rsp_valid_in (per_core_csr_rsp_valid),
.rsp_data_in (per_core_csr_rsp_data),
.rsp_ready_in (per_core_csr_rsp_ready),
// output response
.rsp_valid_out (csr_rsp_valid),
.rsp_data_out (csr_rsp_data),
.rsp_ready_out (csr_rsp_ready)
);
assign busy = (| per_core_busy);

View file

@ -57,6 +57,10 @@
`define IO_ADDR_COUT 32'hFFFFFFFC
`endif
`ifndef IO_ADDR_CSR
`define IO_ADDR_CSR `IO_BASE_ADDR
`endif
`ifndef SMEM_BASE_ADDR
`define SMEM_BASE_ADDR `IO_BASE_ADDR
`endif
@ -147,28 +151,30 @@
`define CSR_MEPC 12'h341
// Machine Counter/Timers
`define CSR_CYCLE 12'hC00
`define CSR_CYCLE_H 12'hC80
`define CSR_INSTRET 12'hC02
`define CSR_INSTRET_H 12'hC82
// Machine Performance-monitoring counters
`define CSR_MPM_BASE 12'hB00
`define CSR_MPM_BASE_H 12'hB80
// PERF: pipeline
`define CSR_MPM_IBUF_ST 12'hB03
`define CSR_MPM_IBUF_ST_H 12'hB83
`define CSR_MPM_SCRB_ST 12'hB04
`define CSR_MPM_SCRB_ST_H 12'hB84
`define CSR_MPM_ALU_ST 12'hB05
`define CSR_MPM_ALU_ST_H 12'hB85
`define CSR_MPM_LSU_ST 12'hB06
`define CSR_MPM_LSU_ST_H 12'hB86
`define CSR_MPM_CSR_ST 12'hB07
`define CSR_MPM_CSR_ST_H 12'hB87
`define CSR_MPM_FPU_ST 12'hB08
`define CSR_MPM_FPU_ST_H 12'hB88
`define CSR_MPM_GPU_ST 12'hB09
`define CSR_MPM_GPU_ST_H 12'hB89
`define CSR_MCYCLE 12'hB00
`define CSR_MCYCLE_H 12'hB80
`define CSR_MPM_RESERVED 12'hB01
`define CSR_MPM_RESERVED_H 12'hB81
`define CSR_MINSTRET 12'hB02
`define CSR_MINSTRET_H 12'hB82
`define CSR_MPM_IBUF_ST 12'hB03
`define CSR_MPM_IBUF_ST_H 12'hB83
`define CSR_MPM_SCRB_ST 12'hB04
`define CSR_MPM_SCRB_ST_H 12'hB84
`define CSR_MPM_ALU_ST 12'hB05
`define CSR_MPM_ALU_ST_H 12'hB85
`define CSR_MPM_LSU_ST 12'hB06
`define CSR_MPM_LSU_ST_H 12'hB86
`define CSR_MPM_CSR_ST 12'hB07
`define CSR_MPM_CSR_ST_H 12'hB87
`define CSR_MPM_FPU_ST 12'hB08
`define CSR_MPM_FPU_ST_H 12'hB88
`define CSR_MPM_GPU_ST 12'hB09
`define CSR_MPM_GPU_ST_H 12'hB89
// PERF: icache
`define CSR_MPM_ICACHE_READS 12'hB0A // total reads
`define CSR_MPM_ICACHE_READS_H 12'hB8A
@ -196,21 +202,21 @@
`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls
`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95
// PERF: smem
`define CSR_MPM_SMEM_READS 12'hB16 // total reads
`define CSR_MPM_SMEM_READS_H 12'hB96
`define CSR_MPM_SMEM_WRITES 12'hB17 // total writes
`define CSR_MPM_SMEM_WRITES_H 12'hB97
`define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls
`define CSR_MPM_SMEM_BANK_ST_H 12'hB98
`define CSR_MPM_SMEM_READS 12'hB16 // total reads
`define CSR_MPM_SMEM_READS_H 12'hB96
`define CSR_MPM_SMEM_WRITES 12'hB17 // total writes
`define CSR_MPM_SMEM_WRITES_H 12'hB97
`define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls
`define CSR_MPM_SMEM_BANK_ST_H 12'hB98
// PERF: memory
`define CSR_MPM_MEM_READS 12'hB19 // memory reads
`define CSR_MPM_MEM_READS_H 12'hB99
`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes
`define CSR_MPM_MEM_WRITES_H 12'hB9A
`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls
`define CSR_MPM_MEM_ST_H 12'hB9B
`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total)
`define CSR_MPM_MEM_LAT_H 12'hB9C
`define CSR_MPM_MEM_READS 12'hB19 // memory reads
`define CSR_MPM_MEM_READS_H 12'hB99
`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes
`define CSR_MPM_MEM_WRITES_H 12'hB9A
`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls
`define CSR_MPM_MEM_ST_H 12'hB9B
`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total)
`define CSR_MPM_MEM_LAT_H 12'hB9C
// Machine Information Registers
`define CSR_MVENDORID 12'hF11

View file

@ -24,18 +24,6 @@ module VX_core #(
input wire [`XMEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// CSR request
input wire csr_req_valid,
input wire [11:0] csr_req_addr,
input wire csr_req_rw,
input wire [31:0] csr_req_data,
output wire csr_req_ready,
// CSR response
output wire csr_rsp_valid,
output wire [31:0] csr_rsp_data,
input wire csr_rsp_ready,
// Status
output wire busy
);
@ -127,19 +115,7 @@ module VX_core #(
.icache_rsp_valid (icache_core_rsp_if.valid),
.icache_rsp_data (icache_core_rsp_if.data),
.icache_rsp_tag (icache_core_rsp_if.tag),
.icache_rsp_ready (icache_core_rsp_if.ready),
// CSR request
.csr_req_valid (csr_req_valid),
.csr_req_rw (csr_req_rw),
.csr_req_addr (csr_req_addr),
.csr_req_data (csr_req_data),
.csr_req_ready (csr_req_ready),
// CSR response
.csr_rsp_valid (csr_rsp_valid),
.csr_rsp_data (csr_rsp_data),
.csr_rsp_ready (csr_rsp_ready),
.icache_rsp_ready (icache_core_rsp_if.ready),
// Status
.busy(busy)

View file

@ -1,82 +0,0 @@
`include "VX_define.vh"
// CSR request/response arbiter.
// Takes a single CSR request stream plus a request_id selector and fans it
// out to NUM_REQS per-requester request ports through a VX_stream_demux, and
// merges the NUM_REQS per-requester response ports back into a single
// response stream through a VX_stream_arbiter.
module VX_csr_arb #(
// number of downstream request/response ports
parameter NUM_REQS = 1,
// width of the CSR data payload in bits
parameter DATA_WIDTH = 1,
// forwarded to the BUFFERED parameter of the request demux
parameter BUFFERED_REQ = 0,
// forwarded to the BUFFERED parameter of the response arbiter
parameter BUFFERED_RSP = 0,
// derived: data payload size in bytes
parameter DATA_SIZE = (DATA_WIDTH / 8),
// address width; defaults to 32 minus the byte-offset bits
parameter ADDR_WIDTH = 32 - `CLOG2(DATA_SIZE),
// derived: width of the request_id selector
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
input wire clk,
input wire reset,
// drives the `sel` input of the request demux
input wire [LOG_NUM_REQS-1:0] request_id,
// input requests
input wire req_valid_in,
input wire [ADDR_WIDTH-1:0] req_addr_in,
input wire req_rw_in,
input wire [DATA_WIDTH-1:0] req_data_in,
output wire req_ready_in,
// output request
output wire [NUM_REQS-1:0] req_valid_out,
output wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] req_addr_out,
output wire [NUM_REQS-1:0] req_rw_out,
output wire [NUM_REQS-1:0][DATA_WIDTH-1:0] req_data_out,
input wire [NUM_REQS-1:0] req_ready_out,
// input response
input wire [NUM_REQS-1:0] rsp_valid_in,
input wire [NUM_REQS-1:0][DATA_WIDTH-1:0] rsp_data_in,
output wire [NUM_REQS-1:0] rsp_ready_in,
// output response
output wire rsp_valid_out,
output wire [DATA_WIDTH-1:0] rsp_data_out,
input wire rsp_ready_out
);
// A request travels through the demux as one packed word: {addr, rw, data}
localparam REQ_DATAW = ADDR_WIDTH + 1 + DATA_WIDTH;
// A response carries the data payload only
localparam RSP_DATAW = DATA_WIDTH;
// per-port packed request words coming out of the demux
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_merged_data_out;
// unpack each demux output word back into its addr / rw / data fields
for (genvar i = 0; i < NUM_REQS; i++) begin
assign {req_addr_out[i], req_rw_out[i], req_data_out[i]} = req_merged_data_out[i];
end
// Request path: one input stream to NUM_REQS output streams, with
// request_id as the select.
// NOTE(review): presumably only the selected output sees valid asserted -
// confirm against VX_stream_demux.
VX_stream_demux #(
.NUM_REQS (NUM_REQS),
.DATAW (REQ_DATAW),
.BUFFERED (BUFFERED_REQ)
) req_demux (
.clk (clk),
.reset (reset),
.sel (request_id),
.valid_in (req_valid_in),
.data_in ({req_addr_in, req_rw_in, req_data_in}),
.ready_in (req_ready_in),
.valid_out (req_valid_out),
.data_out (req_merged_data_out),
.ready_out (req_ready_out)
);
// Response path: NUM_REQS input streams merged into one output stream
VX_stream_arbiter #(
.NUM_REQS (NUM_REQS),
.DATAW (RSP_DATAW),
.BUFFERED (BUFFERED_RSP),
.TYPE ("X") // fixed arbitration
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_valid_in),
.data_in (rsp_data_in),
.ready_in (rsp_ready_in),
.valid_out (rsp_valid_out),
.data_out (rsp_data_out),
.ready_out (rsp_ready_out)
);
endmodule

View file

@ -96,20 +96,25 @@ module VX_csr_data #(
always @(*) begin
read_data_r = 'x;
case (read_addr)
`CSR_FFLAGS : read_data_r = 32'(fcsr[read_wid][`FFG_BITS-1:0]);
`CSR_FRM : read_data_r = 32'(fcsr[read_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS]);
`CSR_FCSR : read_data_r = 32'(fcsr[read_wid]);
`CSR_FFLAGS : read_data_r = 32'(fcsr[read_wid][`FFG_BITS-1:0]);
`CSR_FRM : read_data_r = 32'(fcsr[read_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS]);
`CSR_FCSR : read_data_r = 32'(fcsr[read_wid]);
`CSR_WTID ,
`CSR_LTID ,
`CSR_LWID : read_data_r = 32'(read_wid);
`CSR_GTID ,
`CSR_WTID ,
`CSR_LTID ,
`CSR_LWID : read_data_r = 32'(read_wid);
`CSR_GTID ,
/*`CSR_MHARTID ,*/
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
`CSR_GCID : read_data_r = CORE_ID;
`CSR_NT : read_data_r = `NUM_THREADS;
`CSR_NW : read_data_r = `NUM_WARPS;
`CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS;
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
`CSR_GCID : read_data_r = CORE_ID;
`CSR_NT : read_data_r = `NUM_THREADS;
`CSR_NW : read_data_r = `NUM_WARPS;
`CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS;
`CSR_MCYCLE : read_data_r = csr_cycle[31:0];
`CSR_MCYCLE_H : read_data_r = 32'(csr_cycle[`PERF_CTR_BITS-1:32]);
`CSR_MINSTRET : read_data_r = csr_instret[31:0];
`CSR_MINSTRET_H : read_data_r = 32'(csr_instret[`PERF_CTR_BITS-1:32]);
`ifdef PERF_ENABLE
// PERF: pipeline
@ -154,12 +159,12 @@ module VX_csr_data #(
`CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0];
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem
`CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0];
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0];
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0];
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0];
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: MEM
`CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0];
`CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
@ -169,6 +174,9 @@ module VX_csr_data #(
`CSR_MPM_MEM_ST_H : read_data_r = 32'(perf_memsys_if.mem_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0];
`CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
// PERF: reserved
`CSR_MPM_RESERVED : read_data_r = '0;
`CSR_MPM_RESERVED_H : read_data_r = '0;
`endif
`CSR_SATP : read_data_r = 32'(csr_satp);
@ -185,17 +193,15 @@ module VX_csr_data #(
`CSR_PMPCFG0 : read_data_r = 32'(csr_pmpcfg[0]);
`CSR_PMPADDR0 : read_data_r = 32'(csr_pmpaddr[0]);
`CSR_CYCLE : read_data_r = csr_cycle[31:0];
`CSR_CYCLE_H : read_data_r = 32'(csr_cycle[`PERF_CTR_BITS-1:32]);
`CSR_INSTRET : read_data_r = csr_instret[31:0];
`CSR_INSTRET_H : read_data_r = 32'(csr_instret[`PERF_CTR_BITS-1:32]);
`CSR_MVENDORID : read_data_r = `VENDOR_ID;
`CSR_MARCHID : read_data_r = `ARCHITECTURE_ID;
`CSR_MIMPID : read_data_r = `IMPLEMENTATION_ID;
default: begin
assert(~read_enable) else $error("%t: invalid CSR read address: %0h", $time, read_addr);
default: begin
if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32))
| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)))) begin
assert(~read_enable) else $error("%t: invalid CSR read address: %0h", $time, read_addr);
end
end
endcase
end

View file

@ -7,49 +7,29 @@ module VX_csr_unit #(
input wire reset,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if,
`endif
VX_cmt_to_csr_if cmt_to_csr_if,
VX_fpu_to_csr_if fpu_to_csr_if,
VX_csr_io_req_if csr_io_req_if,
VX_csr_io_rsp_if csr_io_rsp_if,
VX_fpu_to_csr_if fpu_to_csr_if,
VX_csr_req_if csr_req_if,
VX_commit_if csr_commit_if,
input wire busy,
input wire[`NUM_WARPS-1:0] fpu_pending,
output wire[`NUM_WARPS-1:0] pending
);
VX_csr_pipe_req_if csr_pipe_req_if();
VX_commit_if csr_pipe_rsp_if();
wire select_io_rsp;
VX_csr_io_arb csr_io_arb (
.clk (clk),
.reset (reset),
.select_io_rsp (select_io_rsp),
.csr_core_req_if (csr_req_if),
.csr_io_req_if (csr_io_req_if),
.csr_pipe_req_if (csr_pipe_req_if),
.csr_pipe_rsp_if (csr_pipe_rsp_if),
.csr_io_rsp_if (csr_io_rsp_if),
.csr_commit_if (csr_commit_if)
);
);
wire csr_we_s1;
wire [`CSR_ADDR_BITS-1:0] csr_addr_s1;
wire [31:0] csr_read_data, csr_read_data_s1;
wire [31:0] csr_updated_data_s1;
wire write_enable = csr_pipe_rsp_if.valid && csr_we_s1;
wire write_enable = csr_commit_if.valid && csr_we_s1;
wire [31:0] csr_req_data = csr_req_if.use_imm ? 32'(csr_req_if.rs1) : csr_req_if.rs1_data;
VX_csr_data #(
.CORE_ID(CORE_ID)
@ -62,20 +42,20 @@ module VX_csr_unit #(
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fpu_to_csr_if (fpu_to_csr_if),
.read_enable (csr_pipe_req_if.valid),
.read_addr (csr_pipe_req_if.addr),
.read_wid (csr_pipe_req_if.wid),
.read_enable (csr_req_if.valid),
.read_addr (csr_req_if.addr),
.read_wid (csr_req_if.wid),
.read_data (csr_read_data),
.write_enable (write_enable),
.write_addr (csr_addr_s1),
.write_wid (csr_pipe_rsp_if.wid),
.write_wid (csr_commit_if.wid),
.write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]),
.busy (busy)
);
wire write_hazard = (csr_addr_s1 == csr_pipe_req_if.addr)
&& (csr_pipe_rsp_if.wid == csr_pipe_req_if.wid)
&& csr_pipe_rsp_if.valid;
wire write_hazard = (csr_addr_s1 == csr_req_if.addr)
&& (csr_commit_if.wid == csr_req_if.wid)
&& csr_commit_if.valid;
wire [31:0] csr_read_data_qual = write_hazard ? csr_updated_data_s1 : csr_read_data;
@ -83,53 +63,55 @@ module VX_csr_unit #(
reg csr_we_s0_unqual;
always @(*) begin
csr_we_s0_unqual = 0;
case (csr_pipe_req_if.op_type)
always @(*) begin
case (csr_req_if.op_type)
`CSR_RW: begin
csr_updated_data = csr_pipe_req_if.data;
csr_updated_data = csr_req_data;
csr_we_s0_unqual = 1;
end
`CSR_RS: begin
csr_updated_data = csr_read_data_qual | csr_pipe_req_if.data;
csr_we_s0_unqual = (csr_pipe_req_if.data != 0);
csr_updated_data = csr_read_data_qual | csr_req_data;
csr_we_s0_unqual = (csr_req_data != 0);
end
`CSR_RC: begin
csr_updated_data = csr_read_data_qual & ~csr_pipe_req_if.data;
csr_we_s0_unqual = (csr_pipe_req_if.data != 0);
csr_updated_data = csr_read_data_qual & ~csr_req_data;
csr_we_s0_unqual = (csr_req_data != 0);
end
default: begin
csr_updated_data = 'x;
csr_we_s0_unqual = 0;
end
default: csr_updated_data = 'x;
endcase
end
wire stall_in = !csr_pipe_req_if.is_io && fpu_pending[csr_pipe_req_if.wid];
wire stall_in = fpu_pending[csr_req_if.wid];
wire pipe_req_valid_qual = csr_pipe_req_if.valid && !stall_in;
wire csr_req_valid = csr_req_if.valid && !stall_in;
wire stall_out = ~csr_pipe_rsp_if.ready && csr_pipe_rsp_if.valid;
wire stall_out = ~csr_commit_if.ready && csr_commit_if.valid;
VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 1 + 32 + 32),
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_out),
.data_in ({pipe_req_valid_qual, csr_pipe_req_if.wid, csr_pipe_req_if.tmask, csr_pipe_req_if.PC, csr_pipe_req_if.rd, csr_pipe_req_if.wb, csr_we_s0_unqual, csr_pipe_req_if.addr, csr_pipe_req_if.is_io, csr_read_data_qual, csr_updated_data}),
.data_out ({csr_pipe_rsp_if.valid, csr_pipe_rsp_if.wid, csr_pipe_rsp_if.tmask, csr_pipe_rsp_if.PC, csr_pipe_rsp_if.rd, csr_pipe_rsp_if.wb, csr_we_s1, csr_addr_s1, select_io_rsp, csr_read_data_s1, csr_updated_data_s1})
.data_in ({csr_req_valid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}),
.data_out ({csr_commit_if.valid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1})
);
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_WTID) ? i :
assign csr_commit_if.data[i] = (csr_addr_s1 == `CSR_WTID) ? i :
(csr_addr_s1 == `CSR_LTID
|| csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
csr_read_data_s1;
end
assign csr_pipe_rsp_if.eop = 1'b1;
assign csr_commit_if.eop = 1'b1;
// can accept new request?
assign csr_pipe_req_if.ready = ~(stall_out || stall_in);
assign csr_req_if.ready = ~(stall_out || stall_in);
// pending request
reg [`NUM_WARPS-1:0] pending_r;
@ -137,11 +119,11 @@ module VX_csr_unit #(
if (reset) begin
pending_r <= 0;
end else begin
if (csr_pipe_rsp_if.valid && csr_pipe_rsp_if.ready) begin
pending_r[csr_pipe_rsp_if.wid] <= 0;
if (csr_commit_if.valid && csr_commit_if.ready) begin
pending_r[csr_commit_if.wid] <= 0;
end
if (csr_pipe_req_if.valid && csr_pipe_req_if.ready) begin
pending_r[csr_pipe_req_if.wid] <= 1;
if (csr_req_if.valid && csr_req_if.ready) begin
pending_r[csr_req_if.wid] <= 1;
end
end
end

View file

@ -6,11 +6,7 @@ module VX_execute #(
`SCOPE_IO_VX_execute
input wire clk,
input wire reset,
// CSR io interface
VX_csr_io_req_if csr_io_req_if,
VX_csr_io_rsp_if csr_io_rsp_if,
input wire reset,
// Dcache interface
VX_dcache_core_req_if dcache_req_if,
@ -81,8 +77,6 @@ module VX_execute #(
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fpu_to_csr_if (fpu_to_csr_if),
.csr_io_req_if (csr_io_req_if),
.csr_io_rsp_if (csr_io_rsp_if),
.csr_req_if (csr_req_if),
.csr_commit_if (csr_commit_if),
.fpu_pending (fpu_pending),

View file

@ -34,19 +34,7 @@ module VX_pipeline #(
input wire icache_rsp_valid,
input wire [31:0] icache_rsp_data,
input wire [`ICORE_TAG_WIDTH-1:0] icache_rsp_tag,
output wire icache_rsp_ready,
// CSR I/O Request
input wire csr_req_valid,
input wire[11:0] csr_req_addr,
input wire csr_req_rw,
input wire[31:0] csr_req_data,
output wire csr_req_ready,
// CSR I/O Response
output wire csr_rsp_valid,
output wire[31:0] csr_rsp_data,
input wire csr_rsp_ready,
output wire icache_rsp_ready,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
@ -116,26 +104,6 @@ module VX_pipeline #(
assign icache_core_rsp_if.tag = icache_rsp_tag;
assign icache_rsp_ready = icache_core_rsp_if.ready;
//
// CSR IO request
//
VX_csr_io_req_if csr_io_req_if();
assign csr_io_req_if.valid = csr_req_valid;
assign csr_io_req_if.rw = csr_req_rw;
assign csr_io_req_if.addr = csr_req_addr;
assign csr_io_req_if.data = csr_req_data;
assign csr_req_ready = csr_io_req_if.ready;
//
// CSR IO response
//
VX_csr_io_rsp_if csr_io_rsp_if();
assign csr_rsp_valid = csr_io_rsp_if.valid;
assign csr_rsp_data = csr_io_rsp_if.data;
assign csr_io_rsp_if.ready = csr_rsp_ready;
///////////////////////////////////////////////////////////////////////////
VX_cmt_to_csr_if cmt_to_csr_if();
@ -226,9 +194,6 @@ module VX_pipeline #(
.dcache_req_if (dcache_core_req_if),
.dcache_rsp_if (dcache_core_rsp_if),
.csr_io_req_if (csr_io_req_if),
.csr_io_rsp_if (csr_io_rsp_if),
.cmt_to_csr_if (cmt_to_csr_if),

View file

@ -22,19 +22,6 @@ module Vortex (
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// CSR Request
input wire csr_req_valid,
input wire [`VX_CSR_ID_WIDTH-1:0] csr_req_coreid,
input wire [11:0] csr_req_addr,
input wire csr_req_rw,
input wire [31:0] csr_req_data,
output wire csr_req_ready,
// CSR Response
output wire csr_rsp_valid,
output wire [31:0] csr_rsp_data,
input wire csr_rsp_ready,
// Status
output wire busy
);
@ -53,21 +40,8 @@ module Vortex (
wire [`NUM_CLUSTERS-1:0][`L2MEM_TAG_WIDTH-1:0] per_cluster_mem_rsp_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_csr_req_valid;
wire [`NUM_CLUSTERS-1:0][11:0] per_cluster_csr_req_addr;
wire [`NUM_CLUSTERS-1:0] per_cluster_csr_req_rw;
wire [`NUM_CLUSTERS-1:0][31:0] per_cluster_csr_req_data;
wire [`NUM_CLUSTERS-1:0] per_cluster_csr_req_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_csr_rsp_valid;
wire [`NUM_CLUSTERS-1:0][31:0] per_cluster_csr_rsp_data;
wire [`NUM_CLUSTERS-1:0] per_cluster_csr_rsp_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
wire [`LOG2UP(`NUM_CLUSTERS)-1:0] csr_cluster_id = `LOG2UP(`NUM_CLUSTERS)'(csr_req_coreid >> `CLOG2(`NUM_CORES));
wire [`NC_BITS-1:0] csr_core_id = `NC_BITS'(csr_req_coreid);
for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin
wire cluster_reset;
@ -100,58 +74,10 @@ module Vortex (
.mem_rsp_tag (per_cluster_mem_rsp_tag [i]),
.mem_rsp_ready (per_cluster_mem_rsp_ready [i]),
.csr_req_valid (per_cluster_csr_req_valid [i]),
.csr_req_coreid (csr_core_id),
.csr_req_rw (per_cluster_csr_req_rw [i]),
.csr_req_addr (per_cluster_csr_req_addr [i]),
.csr_req_data (per_cluster_csr_req_data [i]),
.csr_req_ready (per_cluster_csr_req_ready [i]),
.csr_rsp_valid (per_cluster_csr_rsp_valid [i]),
.csr_rsp_data (per_cluster_csr_rsp_data [i]),
.csr_rsp_ready (per_cluster_csr_rsp_ready [i]),
.busy (per_cluster_busy [i])
);
end
VX_csr_arb #(
.NUM_REQS (`NUM_CLUSTERS),
.DATA_WIDTH (32),
.ADDR_WIDTH (12),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) csr_arb (
.clk (clk),
.reset (reset),
.request_id (csr_cluster_id),
// input requests
.req_valid_in (csr_req_valid),
.req_addr_in (csr_req_addr),
.req_rw_in (csr_req_rw),
.req_data_in (csr_req_data),
.req_ready_in (csr_req_ready),
// output request
.req_valid_out (per_cluster_csr_req_valid),
.req_addr_out (per_cluster_csr_req_addr),
.req_rw_out (per_cluster_csr_req_rw),
.req_data_out (per_cluster_csr_req_data),
.req_ready_out (per_cluster_csr_req_ready),
// input responses
.rsp_valid_in (per_cluster_csr_rsp_valid),
.rsp_data_in (per_cluster_csr_rsp_data),
.rsp_ready_in (per_cluster_csr_rsp_ready),
// output response
.rsp_valid_out (csr_rsp_valid),
.rsp_data_out (csr_rsp_data),
.rsp_ready_out (csr_rsp_ready)
);
assign busy = (| per_cluster_busy);
if (`L3_ENABLE) begin

View file

@ -63,8 +63,6 @@ localparam AFU_ID_H = 16'h0004; // AFU ID Higher
localparam CMD_MEM_READ = `AFU_IMAGE_CMD_MEM_READ;
localparam CMD_MEM_WRITE = `AFU_IMAGE_CMD_MEM_WRITE;
localparam CMD_RUN = `AFU_IMAGE_CMD_RUN;
localparam CMD_CSR_READ = `AFU_IMAGE_CMD_CSR_READ;
localparam CMD_CSR_WRITE = `AFU_IMAGE_CMD_CSR_WRITE;
localparam MMIO_CMD_TYPE = `AFU_IMAGE_MMIO_CMD_TYPE;
localparam MMIO_IO_ADDR = `AFU_IMAGE_MMIO_IO_ADDR;
@ -75,10 +73,7 @@ localparam MMIO_STATUS = `AFU_IMAGE_MMIO_STATUS;
localparam MMIO_SCOPE_READ = `AFU_IMAGE_MMIO_SCOPE_READ;
localparam MMIO_SCOPE_WRITE = `AFU_IMAGE_MMIO_SCOPE_WRITE;
localparam MMIO_CSR_CORE = `AFU_IMAGE_MMIO_CSR_CORE;
localparam MMIO_CSR_ADDR = `AFU_IMAGE_MMIO_CSR_ADDR;
localparam MMIO_CSR_DATA = `AFU_IMAGE_MMIO_CSR_DATA;
localparam MMIO_CSR_READ = `AFU_IMAGE_MMIO_CSR_READ;
localparam MMIO_DEV_CAPS = `AFU_IMAGE_MMIO_DEV_CAPS;
localparam CCI_RD_RQ_TAGW = $clog2(CCI_RD_WINDOW_SIZE);
localparam CCI_RD_RQ_DATAW = CCI_LINE_WIDTH + CCI_RD_RQ_TAGW;
@ -88,9 +83,7 @@ localparam STATE_READ = 1;
localparam STATE_WRITE = 2;
localparam STATE_START = 3;
localparam STATE_RUN = 4;
localparam STATE_CSR_READ = 5;
localparam STATE_CSR_WRITE = 6;
localparam STATE_MAX_VALUE = 7;
localparam STATE_MAX_VALUE = 5;
localparam STATE_WIDTH = $clog2(STATE_MAX_VALUE);
`ifdef SCOPE
@ -99,6 +92,8 @@ localparam STATE_WIDTH = $clog2(STATE_MAX_VALUE);
wire [127:0] afu_id = `AFU_ACCEL_UUID;
wire [63:0] dev_caps = {16'(`NUM_THREADS), 16'(`NUM_WARPS), 16'(`NUM_CORES), 16'(`IMPLEMENTATION_ID)};
reg [STATE_WIDTH-1:0] state;
// Vortex ports ///////////////////////////////////////////////////////////////
@ -116,18 +111,7 @@ wire [`VX_MEM_LINE_WIDTH-1:0] vx_mem_rsp_data;
wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag;
wire vx_mem_rsp_ready;
wire vx_csr_io_req_valid;
wire [`VX_CSR_ID_WIDTH-1:0] vx_csr_io_req_coreid;
wire [11:0] vx_csr_io_req_addr;
wire vx_csr_io_req_rw;
wire [31:0] vx_csr_io_req_data;
wire vx_csr_io_req_ready;
wire vx_csr_io_rsp_valid;
wire [31:0] vx_csr_io_rsp_data;
wire vx_csr_io_rsp_ready;
wire vx_busy;
wire vx_busy;
reg vx_reset;
reg vx_mem_en;
@ -145,11 +129,6 @@ wire cmd_scope_read;
wire cmd_scope_write;
`endif
reg [`VX_CSR_ID_WIDTH-1:0] cmd_csr_core;
reg [11:0] cmd_csr_addr;
reg [31:0] cmd_csr_rdata;
reg [31:0] cmd_csr_wdata;
// MMIO controller ////////////////////////////////////////////////////////////
`IGNORE_WARNINGS_BEGIN
@ -246,27 +225,9 @@ always @(posedge clk) begin
`endif
end
`endif
MMIO_CSR_CORE: begin
cmd_csr_core <= $bits(cmd_csr_core)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_CSR_CORE: addr=%0h, %0h", $time, mmio_hdr.address, $bits(cmd_csr_core)'(cp2af_sRxPort.c0.data));
`endif
end
MMIO_CSR_ADDR: begin
cmd_csr_addr <= $bits(cmd_csr_addr)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_CSR_ADDR: addr=%0h, %0h", $time, mmio_hdr.address, $bits(cmd_csr_addr)'(cp2af_sRxPort.c0.data));
`endif
end
MMIO_CSR_DATA: begin
cmd_csr_wdata <= $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_CSR_DATA: addr=%0h, %0h", $time, mmio_hdr.address, $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data));
`endif
end
default: begin
`ifdef DBG_PRINT_OPAE
$display("%t: Unknown MMIO Wr: addr=%0h, data=%0h", $time, mmio_hdr.address, $bits(cmd_csr_wdata)'(cp2af_sRxPort.c0.data));
$display("%t: Unknown MMIO Wr: addr=%0h, data=%0h", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
`endif
end
endcase
@ -298,12 +259,6 @@ always @(posedge clk) begin
end
`endif
end
MMIO_CSR_READ: begin
mmio_tx.data <= 64'(cmd_csr_rdata);
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_CSR_READ: addr=%0h, data=%0h", $time, mmio_hdr.address, cmd_csr_rdata);
`endif
end
`ifdef SCOPE
MMIO_SCOPE_READ: begin
mmio_tx.data <= cmd_scope_rdata;
@ -312,6 +267,12 @@ always @(posedge clk) begin
`endif
end
`endif
MMIO_DEV_CAPS: begin
mmio_tx.data <= dev_caps;
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_DEV_CAPS: addr=%0h, data=%0h", $time, mmio_hdr.address, dev_caps);
`endif
end
default: begin
mmio_tx.data <= 64'h0;
`ifdef DBG_PRINT_OPAE
@ -326,7 +287,6 @@ end
wire cmd_read_done;
wire cmd_write_done;
wire cmd_csr_done;
wire cmd_run_done;
reg [$clog2(RESET_DELAY+1)-1:0] vx_reset_ctr;
@ -366,18 +326,6 @@ always @(posedge clk) begin
vx_reset <= 1;
state <= STATE_START;
end
CMD_CSR_READ: begin
`ifdef DBG_PRINT_OPAE
$display("%t: STATE CSR_READ: addr=%0h", $time, cmd_csr_addr);
`endif
state <= STATE_CSR_READ;
end
CMD_CSR_WRITE: begin
`ifdef DBG_PRINT_OPAE
$display("%t: STATE CSR_WRITE: addr=%0h data=%0d", $time, cmd_csr_addr, cmd_csr_wdata);
`endif
state <= STATE_CSR_WRITE;
end
default: begin
state <= state;
end
@ -421,24 +369,6 @@ always @(posedge clk) begin
end
end
STATE_CSR_READ: begin
if (cmd_csr_done) begin
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
$display("%t: STATE IDLE", $time);
`endif
end
end
STATE_CSR_WRITE: begin
if (cmd_csr_done) begin
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
$display("%t: STATE IDLE", $time);
`endif
end
end
default: begin
state <= state;
end
@ -926,40 +856,6 @@ assign cci_mem_req_valid = cci_mem_req_rw ? cci_mem_wr_req_valid : cci_mem_rd_re
assign cci_mem_req_addr = cci_mem_req_rw ? cci_mem_wr_req_addr : cci_mem_rd_req_addr;
assign cci_mem_req_tag = cci_mem_req_rw ? cci_mem_wr_req_ctr : cci_mem_rd_req_ctr;
// CSRs ///////////////////////////////////////////////////////////////////////
// Set once the current CSR request has been accepted by Vortex, so that the
// request is presented only once per STATE_CSR_READ / STATE_CSR_WRITE visit.
reg csr_io_req_sent;
// A request is offered while the FSM is in a CSR state and nothing was sent yet.
assign vx_csr_io_req_valid = !csr_io_req_sent
&& ((STATE_CSR_READ == state || STATE_CSR_WRITE == state));
// Request payload comes straight from the MMIO-written command registers.
assign vx_csr_io_req_coreid = cmd_csr_core;
assign vx_csr_io_req_rw = (STATE_CSR_WRITE == state);
assign vx_csr_io_req_addr = cmd_csr_addr;
assign vx_csr_io_req_data = cmd_csr_wdata;
// Responses are always accepted immediately.
assign vx_csr_io_rsp_ready = 1;
// A write completes when the request port is ready; a read completes when
// the response becomes valid.
assign cmd_csr_done = (STATE_CSR_WRITE == state) ? vx_csr_io_req_ready : vx_csr_io_rsp_valid;
always @(posedge clk) begin
if (reset) begin
csr_io_req_sent <= 0;
end else begin
if (vx_csr_io_req_valid && vx_csr_io_req_ready) begin
csr_io_req_sent <= 1;
end
// Listed last so that, when both conditions hold in the same cycle,
// the clear takes precedence over the set above.
if (cmd_csr_done) begin
csr_io_req_sent <= 0;
end
end
// Capture the read data for later retrieval through MMIO_CSR_READ.
// Note: this capture is not gated by reset.
if ((STATE_CSR_READ == state)
&& vx_csr_io_rsp_ready
&& vx_csr_io_rsp_valid) begin
cmd_csr_rdata <= vx_csr_io_rsp_data;
end
end
// Vortex /////////////////////////////////////////////////////////////////////
assign cmd_run_done = !vx_busy;
@ -984,19 +880,6 @@ Vortex #() vortex (
.mem_rsp_data (vx_mem_rsp_data),
.mem_rsp_tag (vx_mem_rsp_tag),
.mem_rsp_ready (vx_mem_rsp_ready),
// CSR Request
.csr_req_valid (vx_csr_io_req_valid),
.csr_req_coreid (vx_csr_io_req_coreid),
.csr_req_addr (vx_csr_io_req_addr),
.csr_req_rw (vx_csr_io_req_rw),
.csr_req_data (vx_csr_io_req_data),
.csr_req_ready (vx_csr_io_req_ready),
// CSR Response
.csr_rsp_valid (vx_csr_io_rsp_valid),
.csr_rsp_data (vx_csr_io_rsp_data),
.csr_rsp_ready (vx_csr_io_rsp_ready),
// status
.busy (vx_busy)

View file

@ -26,21 +26,16 @@
`define AFU_ACCEL_NAME "vortex_afu"
`define AFU_ACCEL_UUID 128'h35f9452b_25c2_434c_93d5_6f8c60db361c
`define AFU_IMAGE_CMD_CSR_READ 4
`define AFU_IMAGE_CMD_CSR_WRITE 5
`define AFU_IMAGE_CMD_MEM_READ 1
`define AFU_IMAGE_CMD_MEM_WRITE 2
`define AFU_IMAGE_CMD_RUN 3
`define AFU_IMAGE_MMIO_CMD_TYPE 10
`define AFU_IMAGE_MMIO_CSR_CORE 24
`define AFU_IMAGE_MMIO_CSR_ADDR 26
`define AFU_IMAGE_MMIO_CSR_DATA 28
`define AFU_IMAGE_MMIO_CSR_READ 30
`define AFU_IMAGE_MMIO_DATA_SIZE 16
`define AFU_IMAGE_MMIO_IO_ADDR 12
`define AFU_IMAGE_MMIO_MEM_ADDR 14
`define AFU_IMAGE_MMIO_SCOPE_READ 20
`define AFU_IMAGE_MMIO_SCOPE_WRITE 22
`define AFU_IMAGE_MMIO_DEV_CAPS 24
`define AFU_IMAGE_MMIO_STATUS 18
`define AFU_IMAGE_POWER 0

View file

@ -1,16 +0,0 @@
`ifndef VX_CSR_IO_REQ_IF
`define VX_CSR_IO_REQ_IF
`include "VX_define.vh"
// Valid/ready request channel for CSR accesses issued from outside the core.
interface VX_csr_io_req_if ();
// Request is being offered.
wire valid;
// Target CSR address.
wire [`CSR_ADDR_BITS-1:0] addr;
// Access direction: the AFU drives 1 for a CSR write and 0 for a CSR read.
wire rw;
// Write data (meaningful for write requests).
wire [31:0] data;
// Back-pressure from the receiver; the request transfers when valid && ready.
wire ready;
endinterface
`endif

View file

@ -1,14 +0,0 @@
`ifndef VX_CSR_IO_RSP_IF
`define VX_CSR_IO_RSP_IF
`include "VX_define.vh"
// Valid/ready response channel carrying the data of a CSR I/O read.
interface VX_csr_io_rsp_if ();
// Response data is available.
wire valid;
// 32-bit CSR value being returned.
wire [31:0] data;
// Driven by the consumer of the response; transfer happens when valid && ready.
wire ready;
endinterface
`endif

View file

@ -1,22 +0,0 @@
`ifndef VX_CSR_PIPE_REQ_IF
`define VX_CSR_PIPE_REQ_IF
`include "VX_define.vh"
// Request channel feeding the CSR unit pipeline. It is the output of
// VX_csr_io_arb, which takes both the core request interface and the CSR I/O
// request interface as inputs.
interface VX_csr_pipe_req_if ();
// Request is being offered.
wire valid;
// Issuing warp id.
wire [`NW_BITS-1:0] wid;
// Thread mask of the request.
wire [`NUM_THREADS-1:0] tmask;
// Program counter associated with the request.
wire [31:0] PC;
// CSR operation selector; the CSR unit decodes `CSR_RW, `CSR_RS and `CSR_RC.
wire [`CSR_BITS-1:0] op_type;
// Target CSR address.
wire [`CSR_ADDR_BITS-1:0] addr;
// Operand: the value written (RW) or the set/clear mask (RS/RC).
wire [31:0] data;
// Destination register index, forwarded unchanged to the response.
wire [`NR_BITS-1:0] rd;
// Write-back enable, forwarded unchanged to the response.
wire wb;
// When set, the request is not stalled on fpu_pending and its response is
// steered to the I/O response path (presumably marks a CSR I/O request -
// confirm against VX_csr_io_arb).
wire is_io;
// Back-pressure from the CSR unit; transfer happens when valid && ready.
wire ready;
endinterface
`endif

View file

@ -77,15 +77,9 @@ void Simulator::reset() {
mem_rsp_vec_.clear();
mem_rsp_active_ = false;
csr_req_active_ = false;
csr_rsp_value_ = nullptr;
vortex_->mem_rsp_valid = 0;
vortex_->mem_req_ready = 0;
//vortex_->io_req_ready = 0;
//vortex_->io_rsp_valid = 0;
vortex_->csr_req_valid = 0;
vortex_->csr_rsp_ready = 0;
vortex_->reset = 1;
@ -108,14 +102,11 @@ void Simulator::step() {
this->eval();
mem_rsp_ready_ = vortex_->mem_rsp_ready;
csr_req_ready_ = vortex_->csr_req_ready;
vortex_->clk = 1;
this->eval();
this->eval_mem_bus();
this->eval_io_bus();
this->eval_csr_bus();
#ifndef NDEBUG
fflush(stdout);
@ -209,53 +200,6 @@ void Simulator::eval_mem_bus() {
vortex_->mem_req_ready = !mem_stalled;
}
// Currently a no-op: the entire body below is commented out, so calling this
// from step() has no effect. The disabled code forwarded per-thread writes to
// IO_BUS_ADDR_COUT into line-buffered console output.
void Simulator::eval_io_bus() {
/*for (int i = 0; i < NUM_THREADS; ++i) {
if (((vortex_->io_req_valid >> i) & 0x1)
&& ((VL_WDATA_GETW(vortex_->io_req_addr, i, NUM_THREADS, 30) << 2) == IO_BUS_ADDR_COUT)) {
assert(vortex_->io_req_rw);
int data = vortex_->io_req_data[i];
int tid = data >> 16;
char c = data & 0xff;
auto& ss_buf = print_bufs_[tid];
ss_buf << c;
if (c == '\n') {
std::cout << std::dec << "#" << tid << ": " << ss_buf.str() << std::flush;
ss_buf.str("");
}
}
}
vortex_->io_req_ready = 1;
vortex_->io_rsp_valid = 0;*/
}
// Advances the host-side CSR handshake by one step.
//
// While a transaction started by set_csr()/get_csr() is in flight this
// retires the request once it has been accepted and, for reads, stores the
// response value through csr_rsp_value_. With no transaction in flight both
// handshake signals are held low.
void Simulator::eval_csr_bus() {
  // Idle: keep the request and response handshakes deasserted.
  if (!csr_req_active_) {
    vortex_->csr_req_valid = 0;
    vortex_->csr_rsp_ready = 0;
    return;
  }

  // Request phase: uses the ready value latched in csr_req_ready_.
  const bool request_accepted = vortex_->csr_req_valid && csr_req_ready_;
  if (request_accepted) {
  #ifndef NDEBUG
    if (vortex_->csr_req_rw) {
      std::cout << std::dec << timestamp << ": [sim] CSR Wr Req: core=" << (int)vortex_->csr_req_coreid << ", addr=" << std::hex << vortex_->csr_req_addr << ", value=" << vortex_->csr_req_data << std::endl;
    } else {
      std::cout << std::dec << timestamp << ": [sim] CSR Rd Req: core=" << (int)vortex_->csr_req_coreid << ", addr=" << std::hex << vortex_->csr_req_addr << std::endl;
    }
  #endif
    vortex_->csr_req_valid = 0;
    // A write is finished as soon as it is accepted; a read stays active
    // until its response is consumed below.
    if (vortex_->csr_req_rw) {
      csr_req_active_ = false;
    }
  }

  // Response phase: hand the value to the caller and close the transaction.
  const bool response_taken = vortex_->csr_rsp_valid && vortex_->csr_rsp_ready;
  if (response_taken) {
    *csr_rsp_value_ = vortex_->csr_rsp_data;
    vortex_->csr_rsp_ready = 0;
    csr_req_active_ = false;
  #ifndef NDEBUG
    std::cout << std::dec << timestamp << ": [sim] CSR Rsp: value=" << vortex_->csr_rsp_data << std::endl;
  #endif
  }
}
void Simulator::wait(uint32_t cycles) {
for (int i = 0; i < cycles; ++i) {
this->step();
@ -266,33 +210,6 @@ bool Simulator::is_busy() const {
return vortex_->busy;
}
// Reports whether a CSR transaction started by set_csr()/get_csr() is still
// in flight (the flag is cleared by eval_csr_bus()).
bool Simulator::csr_req_active() const {
return csr_req_active_;
}
// Starts a CSR write of `value` to CSR `addr` on core `core_id`.
// Only drives the request signals; eval_csr_bus() completes the transaction
// and clears csr_req_active_ once the request has been accepted.
void Simulator::set_csr(int core_id, int addr, unsigned value) {
  auto* const dut = vortex_;

  // Request payload.
  dut->csr_req_coreid = core_id;
  dut->csr_req_addr   = addr;
  dut->csr_req_data   = value;
  dut->csr_req_rw     = 1;

  // Handshake: offer the request, do not listen for a response.
  dut->csr_rsp_ready  = 0;
  dut->csr_req_valid  = 1;

  csr_req_active_ = true;
}
// Starts a CSR read of CSR `addr` on core `core_id`.
// Only drives the request signals; eval_csr_bus() later stores the returned
// data through `value`, so the pointer must stay valid until
// csr_req_active() reports false.
void Simulator::get_csr(int core_id, int addr, unsigned *value) {
  auto* const dut = vortex_;

  // Where eval_csr_bus() will deposit the response.
  csr_rsp_value_ = value;

  // Request payload.
  dut->csr_req_coreid = core_id;
  dut->csr_req_addr   = addr;
  dut->csr_req_rw     = 0;

  // Handshake: offer the request and accept the response.
  dut->csr_rsp_ready  = 1;
  dut->csr_req_valid  = 1;

  csr_req_active_ = true;
}
void Simulator::run() {
#ifndef NDEBUG
std::cout << std::dec << timestamp << ": [sim] run()" << std::endl;

View file

@ -30,14 +30,9 @@ public:
bool is_busy() const;
bool csr_req_active() const;
void reset();
void step();
void wait(uint32_t cycles);
void set_csr(int core_id, int addr, unsigned value);
void get_csr(int core_id, int addr, unsigned *value);
void run();
@ -61,16 +56,11 @@ private:
void eval();
void eval_mem_bus();
void eval_io_bus();
void eval_csr_bus();
std::list<mem_req_t> mem_rsp_vec_;
bool mem_rsp_active_;
bool mem_rsp_ready_;
bool csr_req_ready_;
bool csr_req_active_;
uint32_t* csr_rsp_value_;
bool mem_rsp_ready_;
RAM *ram_;
VVortex *vortex_;

View file

@ -8,8 +8,6 @@
"cmd-mem-read": 1,
"cmd-mem-write": 2,
"cmd-run": 3,
"cmd-csr-read": 4,
"cmd-csr-write": 5,
"mmio-cmd-type": 10,
"mmio-io-addr": 12,
@ -18,10 +16,7 @@
"mmio-status": 18,
"mmio-scope-read": 20,
"mmio-scope-write": 22,
"mmio-csr-core": 24,
"mmio-csr-addr": 26,
"mmio-csr-data": 28,
"mmio-csr-read": 30,
"mmio-dev-caps": 24,
"afu-top-interface":
{

View file

@ -41,21 +41,29 @@ set_global_assignment -name VERILOG_MACRO NDEBUG
set_global_assignment -name MESSAGE_DISABLE 16818
set_global_assignment -name TIMEQUEST_DO_REPORT_TIMING ON
#set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED
#set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE"
#set_global_assignment -name FINAL_PLACEMENT_OPTIMIZATION ALWAYS
#set_global_assignment -name PLACEMENT_EFFORT_MULTIPLIER 2.0
#set_global_assignment -name FITTER_EFFORT "STANDARD FIT"
#set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS"
#set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON
#set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM
#set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON
#set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON
#set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON
#set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON
#set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0
#set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100
#set_global_assignment -name SEED 1
set_global_assignment -name OPTIMIZATION_TECHNIQUE SPEED
set_global_assignment -name OPTIMIZATION_MODE "AGGRESSIVE PERFORMANCE"
set_global_assignment -name FINAL_PLACEMENT_OPTIMIZATION ALWAYS
set_global_assignment -name PLACEMENT_EFFORT_MULTIPLIER 2.0
set_global_assignment -name FITTER_EFFORT "STANDARD FIT"
set_global_assignment -name OPTIMIZE_HOLD_TIMING "ALL PATHS"
set_global_assignment -name OPTIMIZE_MULTI_CORNER_TIMING ON
set_global_assignment -name ROUTER_TIMING_OPTIMIZATION_LEVEL MAXIMUM
set_global_assignment -name ROUTER_CLOCKING_TOPOLOGY_ANALYSIS ON
set_global_assignment -name ROUTER_LCELL_INSERTION_AND_LOGIC_DUPLICATION ON
set_global_assignment -name SYNTH_TIMING_DRIVEN_SYNTHESIS ON
set_global_assignment -name TIMEQUEST_MULTICORNER_ANALYSIS ON
set_global_assignment -name USE_HIGH_SPEED_ADDER ON
set_global_assignment -name MUX_RESTRUCTURE ON
set_global_assignment -name ADV_NETLIST_OPT_SYNTH_WYSIWYG_REMAP ON
set_global_assignment -name PROGRAMMABLE_POWER_TECHNOLOGY_SETTING "FORCE ALL TILES WITH FAILING TIMING PATHS TO HIGH SPEED"
set_global_assignment -name PHYSICAL_SYNTHESIS_COMBO_LOGIC ON
set_global_assignment -name PHYSICAL_SYNTHESIS_REGISTER_RETIMING ON
set_global_assignment -name MIN_CORE_JUNCTION_TEMP 0
set_global_assignment -name MAX_CORE_JUNCTION_TEMP 100
set_global_assignment -name SEED 1
switch $opts(family) {
"Arria 10" {

View file

@ -10,7 +10,7 @@ CFLAGS += -I./include -I../hw
PROJECT = libvortexrt
SRCS = ./src/vx_start.S ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c
SRCS = ./src/vx_start.S ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c ./src/vx_perf.c
OBJS := $(addsuffix .o, $(notdir $(SRCS)))

View file

@ -7,6 +7,51 @@
extern "C" {
#endif
// __ASM_STR(x): turns x into a string literal when compiled as C so it can be
// concatenated into an inline-asm template; passes x through unchanged when
// __ASSEMBLY__ is defined.
#ifdef __ASSEMBLY__
#define __ASM_STR(x) x
#else
#define __ASM_STR(x) #x
#endif
// CSR access helpers. In every macro below `csr` is pasted textually into the
// instruction, so it must be a constant the assembler can evaluate. `val` is
// converted to unsigned and evaluated exactly once. The "rK" input constraint
// lets the compiler pass the operand in a register or, presumably, as a small
// immediate (see the GCC RISC-V machine constraints).

// Emits `csrrw`: writes val to the CSR; the macro evaluates to the value the
// instruction returned in its destination register.
#define vx_csr_swap(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
// Emits `csrr`: the macro evaluates to the value read from the CSR.
#define vx_csr_read(csr) ({ \
register unsigned __v; \
__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \
__v; \
})
// Emits `csrw`: writes val to the CSR; produces no value.
#define vx_csr_write(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// Emits `csrrs` with val as the bit mask; the macro evaluates to the value
// the instruction returned in its destination register.
#define vx_csr_read_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
// Emits `csrs` with val as the bit mask; produces no value.
#define vx_csr_set(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// Emits `csrrc` with val as the bit mask; the macro evaluates to the value
// the instruction returned in its destination register.
#define vx_csr_read_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
__v; \
})
// Emits `csrc` with val as the bit mask; produces no value.
#define vx_csr_clear(csr, val) ({ \
unsigned __v = (unsigned )(val); \
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
})
// Set thread mask
inline void vx_tmc(unsigned num_threads) {
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(num_threads));
@ -95,20 +140,6 @@ inline int vx_num_cores() {
return result;
}
// Return the value of CSR_CYCLE, read with a single `csrr`.
// Only this one 32-bit CSR is read (CSR_CYCLE_H is not), and the value is
// returned as a signed int.
inline int vx_num_cycles() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_CYCLE));
return result;
}
// Return the value of CSR_INSTRET, read with a single `csrr`.
// Only this one 32-bit CSR is read (CSR_INSTRET_H is not), and the value is
// returned as a signed int.
inline int vx_num_instrs() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(CSR_INSTRET));
return result;
}
#define __if(b) vx_split(b); \
if (b)

27
runtime/src/vx_perf.c Normal file
View file

@ -0,0 +1,27 @@
#include <VX_config.h>
#include <vx_intrinsics.h>
#include <stdint.h>
/*
 * Copy four consecutive CSRs, starting at CSR number `s`, into
 * csr_mem[d] .. csr_mem[d + 3]. Expects a `csr_mem` pointer in scope.
 *
 * `s` is intentionally NOT parenthesized: vx_csr_read() stringifies its
 * argument into the instruction text, so the expression is left in the exact
 * form the assembler already receives. Wrapped in do/while(0) so the macro
 * behaves as a single statement.
 */
#define DUMP_CSR_4(d, s) do { \
    csr_mem[(d) + 0] = vx_csr_read(s + 0); \
    csr_mem[(d) + 1] = vx_csr_read(s + 1); \
    csr_mem[(d) + 2] = vx_csr_read(s + 2); \
    csr_mem[(d) + 3] = vx_csr_read(s + 3); \
} while (0)

/*
 * Copy 32 consecutive CSRs, starting at CSR number `s`, into
 * csr_mem[d] .. csr_mem[d + 31], as eight groups of four.
 */
#define DUMP_CSR_32(d, s) do { \
    DUMP_CSR_4((d) + 0,  s + 0);  \
    DUMP_CSR_4((d) + 4,  s + 4);  \
    DUMP_CSR_4((d) + 8,  s + 8);  \
    DUMP_CSR_4((d) + 12, s + 12); \
    DUMP_CSR_4((d) + 16, s + 16); \
    DUMP_CSR_4((d) + 20, s + 20); \
    DUMP_CSR_4((d) + 24, s + 24); \
    DUMP_CSR_4((d) + 28, s + 28); \
} while (0)

/*
 * Dump this core's performance CSRs to its slot in the CSR I/O region.
 *
 * Each core owns 64 32-bit words starting at
 * IO_ADDR_CSR + 64 * sizeof(uint32_t) * core_id:
 *   words  0..31  <- CSRs CSR_MPM_BASE   .. CSR_MPM_BASE   + 31
 *   words 32..63  <- CSRs CSR_MPM_BASE_H .. CSR_MPM_BASE_H + 31
 */
void vx_perf_dump(void) {
    int core_id = vx_core_id();
    uint32_t* const csr_mem = (uint32_t*)(IO_ADDR_CSR + 64 * sizeof(uint32_t) * core_id);
    DUMP_CSR_32(0, CSR_MPM_BASE);
    DUMP_CSR_32(32, CSR_MPM_BASE_H);
}

View file

@ -42,6 +42,9 @@ _start:
.type _exit, @function
.global _exit
_exit:
# dump performance CSRs
call vx_perf_dump
# disable all threads in current warp
li a0, 0
.insn s 0x6b, 0, x0, 0(a0) # tmc a0

View file

@ -269,16 +269,16 @@ Word Core::get_csr(Addr addr, int tid, int wid) {
} else if (addr == CSR_NC) {
// Number of cores
return arch_.num_cores();
} else if (addr == CSR_INSTRET) {
} else if (addr == CSR_MINSTRET) {
// NumInsts
return insts_;
} else if (addr == CSR_INSTRET_H) {
} else if (addr == CSR_MINSTRET_H) {
// NumInsts
return (Word)(insts_ >> 32);
} else if (addr == CSR_CYCLE) {
} else if (addr == CSR_MCYCLE) {
// NumCycles
return (Word)steps_;
} else if (addr == CSR_CYCLE_H) {
} else if (addr == CSR_MCYCLE_H) {
// NumCycles
return (Word)(steps_ >> 32);
} else {