Gather FPGA perf stats

This commit is contained in:
Blaise Tine 2020-07-01 09:30:12 -07:00
parent e92029c31a
commit 5d088d67c8
12 changed files with 55 additions and 51 deletions

View file

@ -30,28 +30,6 @@
_ret; \
})
/*#include <cstdint>
#ifdef __cplusplus
extern "C" {
#endif
int _pocl_register_kernel(const char* name, const void* pfn, uint32_t num_args, uint32_t num_locals, const uint8_t* arg_types, const uint32_t* local_sizes);
void _pocl_kernel_vecadd_workgroup(uint8_t* args, uint8_t*, uint32_t, uint32_t, uint32_t);
#ifdef __cplusplus
}
#endif
namespace {
class auto_register_kernel_t {
public:
auto_register_kernel_t() {
static uint8_t arg_types[] = {1, 1, 1};
static uint32_t local_sizes[] = {};
_pocl_register_kernel("vecadd", (void*)_pocl_kernel_vecadd_workgroup, 3, 0, arg_types, local_sizes);
}
};
static auto_register_kernel_t __x__;
}*/
int exitcode = 0;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;

View file

@ -58,10 +58,10 @@ int vx_start(vx_device_h hdevice);
int vx_ready_wait(vx_device_h hdevice, long long timeout);
// set device constant registers
int vx_csr_set(vx_device_h hdevice, int core, int address, int value);
int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value);
// get device constant registers
int vx_csr_get(vx_device_h hdevice, int core, int address, int* value);
int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value);
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////

View file

@ -53,10 +53,10 @@
typedef struct vx_device_ {
fpga_handle fpga;
size_t mem_allocation;
int implementation_id;
int num_cores;
int num_warps;
int num_threads;
unsigned implementation_id;
unsigned num_cores;
unsigned num_warps;
unsigned num_threads;
} vx_device_t;
typedef struct vx_buffer_ {
@ -181,6 +181,9 @@ extern int vx_dev_open(vx_device_h* hdevice) {
fpgaClose(accel_handle);
return ret;
}
// Report the capabilities read from the device. The vx_device_t fields are
// declared unsigned, so they are printed with %u rather than %d.
fprintf(stdout, "DEVCAPS: version=%u, num_cores=%u, num_warps=%u, num_threads=%u\n",
        device->implementation_id, device->num_cores, device->num_warps, device->num_threads);
}
#ifdef SCOPE
@ -208,6 +211,29 @@ extern int vx_dev_close(vx_device_h hdevice) {
vx_scope_stop(device->fpga, 0);
#endif
{
  // Dump performance stats: read the instruction and cycle counters of
  // core 0 through their high/low CSR halves, combine each pair into a
  // 64-bit count, and print them together with the instructions-per-cycle.
  // Halves are zero-initialized so a failed read never leaves them
  // indeterminate.
  unsigned instrs_h = 0, instrs_l = 0;
  unsigned cycles_h = 0, cycles_l = 0;
  int ret = 0;
  ret |= vx_csr_get(hdevice, 0, CSR_INSTR_H, &instrs_h);
  ret |= vx_csr_get(hdevice, 0, CSR_INSTR_L, &instrs_l);
  ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_H, &cycles_h);
  ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_L, &cycles_l);
  // Check the reads before the values are consumed; when asserts are
  // compiled out, the report is simply skipped on failure.
  assert(ret == 0);
  if (ret == 0) {
    uint64_t instrs = ((uint64_t)instrs_h << 32) | instrs_l;
    uint64_t cycles = ((uint64_t)cycles_h << 32) | cycles_l;
    // Avoid dividing by zero when no cycles have been counted.
    float IPC = (cycles != 0) ? (float)((double)instrs / (double)cycles) : 0.0f;
    // Cast to unsigned long long so the specifier matches on every platform
    // (uint64_t is not 'long' everywhere).
    fprintf(stdout, "PERF: instrs=%llu, cycles=%llu, IPC=%f\n",
            (unsigned long long)instrs, (unsigned long long)cycles, IPC);
  }
}
fpgaClose(device->fpga);
free(device);
@ -468,7 +494,7 @@ extern int vx_start(vx_device_h hdevice) {
}
// set device constant registers
extern int vx_csr_set(vx_device_h hdevice, int core, int address, int value) {
extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value) {
if (nullptr == hdevice)
return -1;
@ -488,7 +514,7 @@ extern int vx_csr_set(vx_device_h hdevice, int core, int address, int value) {
}
// get device constant registers
extern int vx_csr_get(vx_device_h hdevice, int core, int address, int* value) {
extern int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value) {
if (nullptr == hdevice || nullptr == value)
return -1;
@ -510,7 +536,7 @@ extern int vx_csr_get(vx_device_h hdevice, int core, int address, int* value) {
uint64_t value64;
CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_READ, &value64));
*value = (int)value64;
*value = (unsigned)value64;
return 0;
}

Binary file not shown.

View file

@ -881,7 +881,6 @@ assign vx_csr_io_req_rw = (STATE_CSR_WRITE == state);
assign vx_csr_io_req_addr = cmd_csr_addr;
assign vx_csr_io_req_data = cmd_csr_wdata;
assign cmd_csr_rdata = vx_csr_io_rsp_data;
assign vx_csr_io_rsp_ready = 1;
assign cmd_csr_done = (STATE_CSR_WRITE == state) ? vx_csr_io_req_ready : vx_csr_io_rsp_valid;
@ -890,6 +889,7 @@ always_ff @(posedge clk)
begin
if (SoftReset) begin
csr_io_req_sent <= 0;
cmd_csr_rdata <= 0;
end
else begin
if (vx_csr_io_req_valid && vx_csr_io_req_ready) begin
@ -898,6 +898,11 @@ begin
if (cmd_csr_done) begin
csr_io_req_sent <= 0;
end
if ((STATE_CSR_READ == state)
&& vx_csr_io_rsp_ready
&& vx_csr_io_rsp_valid) begin
cmd_csr_rdata <= vx_csr_io_rsp_data;
end
end
end

View file

@ -119,7 +119,7 @@ module VX_alu_unit (
VX_mult #(
.WIDTHA(33),
.WIDTHB(33),
.WIDTHP(64),
.WIDTHP(66),
.SIGNED(1),
.PIPELINE(`MUL_LATENCY)
) multiplier (

View file

@ -18,24 +18,21 @@ module VX_csr_arb (
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
wire pick_core = (| csr_core_req_if.valid);
// Which request to pick
assign issued_csr_req_if.is_io = !pick_core;
wire pick_core = (| csr_core_req_if.valid);
// Mux between core and io
assign issued_csr_req_if.valid = pick_core ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}};
assign issued_csr_req_if.is_csr = pick_core ? csr_core_req_if.is_csr : 1'b1;
assign issued_csr_req_if.alu_op = pick_core ? csr_core_req_if.alu_op : (csr_io_req_if.rw ? `ALU_CSR_RW : `ALU_CSR_RS);
assign issued_csr_req_if.csr_address = pick_core ? csr_core_req_if.csr_address : csr_io_req_if.addr;
assign issued_csr_req_if.csr_mask = pick_core ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
assign csr_io_req_if.ready = !(csr_pipe_stall || pick_core);
// Core arguments
assign issued_csr_req_if.warp_num = csr_core_req_if.warp_num;
assign issued_csr_req_if.csr_address = pick_core ? csr_core_req_if.csr_address : csr_io_req_if.addr;
assign issued_csr_req_if.csr_immed = pick_core ? csr_core_req_if.csr_immed : 0;
assign issued_csr_req_if.csr_mask = pick_core ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
assign issued_csr_req_if.is_io = !pick_core;
assign issued_csr_req_if.warp_num = csr_core_req_if.warp_num;
assign issued_csr_req_if.rd = csr_core_req_if.rd;
assign issued_csr_req_if.wb = csr_core_req_if.wb;
assign issued_csr_req_if.wb = csr_core_req_if.wb;
assign csr_io_req_if.ready = !(csr_pipe_stall || pick_core);
// Core Writeback
assign csr_wb_if.valid = csr_pipe_rsp_if.valid & {`NUM_THREADS{~csr_pipe_rsp_if.is_io}};

View file

@ -38,7 +38,6 @@ module VX_csr_pipe #(
.wb_valid (| writeback_if.valid)
);
// wire hazard = (csr_address_s2 == csr_req_if.csr_address) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2;
wire car_hazard = (csr_address_s2 == csr_req_if.csr_address) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2;
assign csr_read_data = car_hazard ? csr_updated_data_s2 : csr_read_data_unqual;

View file

@ -104,7 +104,7 @@ module VX_decode(
assign is_lui = (curr_opcode == `INST_LUI);
assign is_auipc = (curr_opcode == `INST_AUIPC);
assign is_csr = (curr_opcode == `INST_SYS) && (func3 != 0);
assign is_csr_immed = (is_csr) && (func3[2] == 1);
assign is_csr_immed = is_csr && (func3[2] == 1);
assign is_gpgpu = (curr_opcode == `INST_GPGPU);

View file

@ -30,7 +30,6 @@ module VX_gpr_stage (
wire is_jal = bckE_req_if.is_jal;
`DEBUG_END
assign csr_req_if.is_io = 1'b0; // GPR only issues csr requests coming from core
VX_gpr_read_if gpr_read_if();

View file

@ -35,7 +35,7 @@ module VX_divide #(
quartus_div.lpm_widthd = WIDTHD,
quartus_div.lpm_nrepresentation = NSIGNED ? "SIGNED" : "UNSIGNED",
quartus_div.lpm_drepresentation = DSIGNED ? "SIGNED" : "UNSIGNED",
quartus_div.lpm_hint = "LPM_REMAINDERPOSITIVE=FALSE,MAXIMIZE_SPEED=9",
quartus_div.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE",
quartus_div.lpm_pipeline = PIPELINE;
`else

View file

@ -23,9 +23,9 @@ module VX_mult #(
.dataa (dataa),
.datab (datab),
.result (result),
.sclr (reset),
.aclr (1'b0),
.clken (1'b1),
.sclr (1'b0),
.sum (1'b0)
);
@ -35,7 +35,7 @@ module VX_mult #(
quartus_mult.lpm_widthp = WIDTHP,
quartus_mult.lpm_representation = SIGNED ? "SIGNED" : "UNSIGNED",
quartus_mult.lpm_pipeline = PIPELINE,
quartus_mult.lpm_hint = "MAXIMIZE_SPEED=9";
quartus_mult.lpm_hint = "DEDICATED_MULTIPLIER_CIRCUITRY=YES,MAXIMIZE_SPEED=9";
`else
wire [WIDTHP-1:0] result_unqual;