adding CSR support to rtlsim driver

This commit is contained in:
Blaise Tine 2020-09-04 06:51:31 -04:00
parent dccea80b68
commit 112d8ab815
10 changed files with 167 additions and 42 deletions

View file

@ -91,22 +91,22 @@ extern int vx_upload_kernel_file(vx_device_h device, const char* filename) {
return err;
}
extern int vx_get_perf(vx_device_h device, size_t* cycles, size_t* instrs) {
extern int vx_get_perf(vx_device_h device, int core_id, size_t* cycles, size_t* instrs) {
int ret = 0;
unsigned value;
if (cycles) {
ret |= vx_csr_get(device, 0, CSR_CYCLE_H, &value);
ret |= vx_csr_get(device, core_id, CSR_CYCLE_H, &value);
*cycles = value;
ret |= vx_csr_get(device, 0, CSR_CYCLE, &value);
ret |= vx_csr_get(device, core_id, CSR_CYCLE, &value);
*cycles = (*cycles << 32) | value;
}
if (instrs) {
ret |= vx_csr_get(device, 0, CSR_INSTRET_H, &value);
ret |= vx_csr_get(device, core_id, CSR_INSTRET_H, &value);
*instrs = value;
ret |= vx_csr_get(device, 0, CSR_INSTRET, &value);
ret |= vx_csr_get(device, core_id, CSR_INSTRET, &value);
*instrs = (*instrs << 32) | value;
}

View file

@ -58,10 +58,10 @@ int vx_start(vx_device_h hdevice);
int vx_ready_wait(vx_device_h hdevice, long long timeout);
// set device constant registers
int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value);
int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value);
// get device constant registers
int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value);
int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value);
////////////////////////////// UTILITY FUNCIONS ///////////////////////////////
@ -72,7 +72,7 @@ int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size)
int vx_upload_kernel_file(vx_device_h device, const char* filename);
// get performance counters
int vx_get_perf(vx_device_h device, size_t* cycles, size_t* instrs);
int vx_get_perf(vx_device_h device, int core_id, size_t* cycles, size_t* instrs);
#ifdef __cplusplus
}

View file

@ -17,6 +17,9 @@ CXXFLAGS +=-fstack-protector
# Position independent code
CXXFLAGS += -fPIC
# Dump perf stats
CXXFLAGS += -DDUMP_PERF_STATS
# Enable scope analyzer
#CXXFLAGS += -DSCOPE

View file

@ -211,14 +211,29 @@ extern int vx_dev_close(vx_device_h hdevice) {
vx_scope_stop(device->fpga, 0);
#endif
{
// Dump perf stats
#ifdef DUMP_PERF_STATS
// Dump perf stats
if (device->num_cores > 1) {
uint64_t total_instrs = 0, total_cycles = 0;
for (unsigned core_id = 0; core_id < device->num_cores; ++core_id) {
uint64_t instrs, cycles;
int ret = vx_get_perf(hdevice, core_id, &instrs, &cycles);
assert(ret == 0);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
total_instrs += instrs;
total_cycles = std::max<uint64_t>(total_cycles, cycles);
}
float IPC = (float)(double(total_instrs) / double(total_cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
} else {
uint64_t instrs, cycles;
int ret = vx_get_perf(hdevice, &instrs, &cycles);
int ret = vx_get_perf(hdevice, 0, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
assert(ret == 0);
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
}
#endif
fpgaClose(device->fpga);
@ -480,7 +495,7 @@ extern int vx_start(vx_device_h hdevice) {
}
// set device constant registers
extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value) {
extern int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value) {
if (nullptr == hdevice)
return -1;
@ -491,8 +506,8 @@ extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value
return -1;
// write CSR value
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, address));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core_id));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, addr));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_DATA, value));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_CSR_WRITE));
@ -500,7 +515,7 @@ extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value
}
// get device constant registers
extern int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value) {
extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value) {
if (nullptr == hdevice || nullptr == value)
return -1;
@ -512,8 +527,8 @@ extern int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* valu
// write CSR value
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, address));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_CORE, core_id));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CSR_ADDR, addr));
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_CSR_READ));
// Ensure ready for new command

View file

@ -28,6 +28,8 @@ CFLAGS += -fPIC
CFLAGS += -DUSE_RTLSIM $(CONFIGS)
CFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
# LDFLAGS += -dynamiclib -pthread

View file

@ -69,7 +69,28 @@ public:
}
~vx_device() {
simulator_.print_stats(std::cout);
#ifdef DUMP_PERF_STATS
unsigned num_cores;
this->get_csr(0, CSR_NC, &num_cores);
if (num_cores > 1) {
uint64_t total_instrs = 0, total_cycles = 0;
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t instrs, cycles;
vx_get_perf(this, core_id, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs, cycles, IPC);
total_instrs += instrs;
total_cycles = std::max<uint64_t>(total_cycles, cycles);
}
float IPC = (float)(double(total_instrs) / double(total_cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, total_cycles, IPC);
} else {
uint64_t instrs, cycles;
vx_get_perf(this, 0, &instrs, &cycles);
float IPC = (float)(double(instrs) / double(cycles));
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
}
#endif
if (future_.valid()) {
future_.wait();
}
@ -152,6 +173,28 @@ public:
return 0;
}
int set_csr(int core_id, int addr, unsigned value) {
if (future_.valid()) {
future_.wait(); // ensure prior run completed
}
simulator_.set_csr(core_id, addr, value);
while (simulator_.is_busy()) {
simulator_.step();
};
return 0;
}
int get_csr(int core_id, int addr, unsigned *value) {
if (future_.valid()) {
future_.wait(); // ensure prior run completed
}
simulator_.get_csr(core_id, addr, value);
while (simulator_.is_busy()) {
simulator_.step();
};
return 0;
}
private:
size_t mem_allocation_;
@ -324,10 +367,20 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
return device->wait(timeout);
}
extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned /*value*/) {
return -1;
extern int vx_csr_set(vx_device_h hdevice, int core_id, int addr, unsigned value) {
if (nullptr == hdevice)
return -1;
vx_device *device = ((vx_device*)hdevice);
return device->set_csr(core_id, addr, value);
}
extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned* /*value*/) {
return -1;
extern int vx_csr_get(vx_device_h hdevice, int core_id, int addr, unsigned* value) {
if (nullptr == hdevice)
return -1;
vx_device *device = ((vx_device*)hdevice);
return device->get_csr(core_id, addr, value);
}

View file

@ -358,10 +358,10 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
return device->wait(timeout);
}
extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned /*value*/) {
extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned /*value*/) {
return -1;
}
extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned* /*value*/) {
extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned* /*value*/) {
return -1;
}

View file

@ -48,10 +48,10 @@ extern int vx_ready_wait(vx_device_h /*hdevice*/, long long /*timeout*/) {
return -1;
}
extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned /*value*/) {
extern int vx_csr_set(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned /*value*/) {
return -1;
}
extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core*/, int /*address*/, unsigned* /*value*/) {
extern int vx_csr_get(vx_device_h /*hdevice*/, int /*core_id*/, int /*addr*/, unsigned* /*value*/) {
return -1;
}

View file

@ -22,6 +22,7 @@ Simulator::Simulator() {
dram_rsp_active_ = false;
snp_req_active_ = false;
csr_req_active_ = false;
#ifdef VCD_OUTPUT
Verilated::traceEverOn(true);
@ -163,15 +164,6 @@ void Simulator::eval_io_bus() {
vortex_->io_rsp_valid = 0;
}
void Simulator::eval_csr_bus() {
vortex_->csr_io_req_valid = 0;
vortex_->csr_io_req_coreid = 0;
vortex_->csr_io_req_addr = 0;
vortex_->csr_io_req_rw = 0;
vortex_->csr_io_req_data = 0;
vortex_->csr_io_rsp_ready = 1;
}
void Simulator::eval_snp_bus() {
if (snp_req_active_) {
if (vortex_->snp_rsp_valid) {
@ -204,6 +196,27 @@ void Simulator::eval_snp_bus() {
}
}
void Simulator::eval_csr_bus() {
if (csr_req_active_) {
if (vortex_->csr_io_req_rw) {
if (vortex_->csr_io_req_ready) {
vortex_->snp_req_valid = 0;
csr_req_active_ = false;
}
} else {
if (vortex_->csr_io_rsp_valid) {
*csr_rsp_value_ = vortex_->csr_io_rsp_data;
vortex_->snp_req_valid = 0;
vortex_->csr_io_rsp_ready = 0;
csr_req_active_ = false;
}
}
} else {
vortex_->csr_io_req_valid = 0;
vortex_->csr_io_rsp_ready = 0;
}
}
void Simulator::wait(uint32_t cycles) {
for (int i = 0; i < cycles; ++i) {
this->step();
@ -211,7 +224,9 @@ void Simulator::wait(uint32_t cycles) {
}
bool Simulator::is_busy() const {
return vortex_->busy || snp_req_active_;
return vortex_->busy
|| snp_req_active_
|| csr_req_active_;
}
void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) {
@ -221,22 +236,52 @@ void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) {
if (0 == size)
return;
snp_req_active_ = true;
snp_req_size_ = (size + GLOBAL_BLOCK_SIZE - 1) / GLOBAL_BLOCK_SIZE;
vortex_->snp_req_addr = mem_addr / GLOBAL_BLOCK_SIZE;
vortex_->snp_req_tag = 0;
vortex_->snp_req_valid = 1;
vortex_->snp_rsp_ready = 1;
snp_req_size_ = (size + GLOBAL_BLOCK_SIZE - 1) / GLOBAL_BLOCK_SIZE;
--snp_req_size_;
pending_snp_reqs_ = 1;
snp_req_active_ = true;
#ifdef DBG_PRINT_CACHE_SNP
std::cout << timestamp << ": [sim] snp req: addr=" << std::hex << vortex_->snp_req_addr << std::dec << " tag=" << vortex_->snp_req_tag << " remain=" << snp_req_size_ << std::endl;
#endif
}
void Simulator::set_csr(int core_id, int addr, unsigned value) {
#ifndef NDEBUG
std::cout << timestamp << ": [sim] set_csr()" << std::endl;
#endif
vortex_->csr_io_req_valid = 1;
vortex_->csr_io_req_coreid = core_id;
vortex_->csr_io_req_addr = addr;
vortex_->csr_io_req_rw = 1;
vortex_->csr_io_req_data = value;
vortex_->csr_io_rsp_ready = 0;
csr_req_active_ = true;
}
void Simulator::get_csr(int core_id, int addr, unsigned *value) {
#ifndef NDEBUG
std::cout << timestamp << ": [sim] get_csr()" << std::endl;
#endif
vortex_->csr_io_req_valid = 1;
vortex_->csr_io_req_coreid = core_id;
vortex_->csr_io_req_addr = addr;
vortex_->csr_io_req_rw = 0;
vortex_->csr_io_rsp_ready = 1;
csr_rsp_value_ = value;
csr_req_active_ = true;
}
void Simulator::run() {
#ifndef NDEBUG
std::cout << timestamp << ": [sim] run()" << std::endl;

View file

@ -31,6 +31,8 @@ public:
Simulator();
virtual ~Simulator();
void attach_ram(RAM* ram);
void load_bin(const char* program_file);
void load_ihex(const char* program_file);
@ -39,12 +41,14 @@ public:
void reset();
void step();
void wait(uint32_t cycles);
void flush_caches(uint32_t mem_addr, uint32_t size);
void attach_ram(RAM* ram);
void set_csr(int core_id, int addr, unsigned value);
void get_csr(int core_id, int addr, unsigned *value);
void run();
int get_last_wb_value(int reg) const;
void print_stats(std::ostream& out);
private:
@ -60,8 +64,11 @@ private:
int dram_rsp_active_;
bool snp_req_active_;
bool csr_req_active_;
uint32_t snp_req_size_;
uint32_t pending_snp_reqs_;
uint32_t* csr_rsp_value_;
RAM *ram_;
VVortex *vortex_;