mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
Gather FPGA perf stats
This commit is contained in:
parent
e92029c31a
commit
5d088d67c8
12 changed files with 55 additions and 51 deletions
|
@ -30,28 +30,6 @@
|
|||
_ret; \
|
||||
})
|
||||
|
||||
/*#include <cstdint>
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
int _pocl_register_kernel(const char* name, const void* pfn, uint32_t num_args, uint32_t num_locals, const uint8_t* arg_types, const uint32_t* local_sizes);
|
||||
void _pocl_kernel_vecadd_workgroup(uint8_t* args, uint8_t*, uint32_t, uint32_t, uint32_t);
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace {
|
||||
class auto_register_kernel_t {
|
||||
public:
|
||||
auto_register_kernel_t() {
|
||||
static uint8_t arg_types[] = {1, 1, 1};
|
||||
static uint32_t local_sizes[] = {};
|
||||
_pocl_register_kernel("vecadd", (void*)_pocl_kernel_vecadd_workgroup, 3, 0, arg_types, local_sizes);
|
||||
}
|
||||
};
|
||||
static auto_register_kernel_t __x__;
|
||||
}*/
|
||||
|
||||
int exitcode = 0;
|
||||
cl_context context = NULL;
|
||||
cl_command_queue commandQueue = NULL;
|
||||
|
|
|
@ -58,10 +58,10 @@ int vx_start(vx_device_h hdevice);
|
|||
int vx_ready_wait(vx_device_h hdevice, long long timeout);
|
||||
|
||||
// set device constant registers
|
||||
int vx_csr_set(vx_device_h hdevice, int core, int address, int value);
|
||||
int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value);
|
||||
|
||||
// get device constant registers
|
||||
int vx_csr_get(vx_device_h hdevice, int core, int address, int* value);
|
||||
int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value);
|
||||
|
||||
////////////////////////////// UTILITY FUNCTIONS ///////////////////////////////
|
||||
|
||||
|
|
|
@ -53,10 +53,10 @@
|
|||
typedef struct vx_device_ {
|
||||
fpga_handle fpga;
|
||||
size_t mem_allocation;
|
||||
int implementation_id;
|
||||
int num_cores;
|
||||
int num_warps;
|
||||
int num_threads;
|
||||
unsigned implementation_id;
|
||||
unsigned num_cores;
|
||||
unsigned num_warps;
|
||||
unsigned num_threads;
|
||||
} vx_device_t;
|
||||
|
||||
typedef struct vx_buffer_ {
|
||||
|
@ -181,6 +181,9 @@ extern int vx_dev_open(vx_device_h* hdevice) {
|
|||
fpgaClose(accel_handle);
|
||||
return ret;
|
||||
}
|
||||
|
||||
fprintf(stdout, "DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n",
|
||||
device->implementation_id, device->num_cores, device->num_warps, device->num_threads);
|
||||
}
|
||||
|
||||
#ifdef SCOPE
|
||||
|
@ -208,6 +211,29 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
vx_scope_stop(device->fpga, 0);
|
||||
#endif
|
||||
|
||||
{
|
||||
// Dump performance stats
|
||||
uint64_t instrs, cycles;
|
||||
unsigned value;
|
||||
|
||||
int ret = 0;
|
||||
ret |= vx_csr_get(hdevice, 0, CSR_INSTR_H, &value);
|
||||
instrs = value;
|
||||
ret |= vx_csr_get(hdevice, 0, CSR_INSTR_L, &value);
|
||||
instrs = (instrs << 32) | value;
|
||||
|
||||
ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_H, &value);
|
||||
cycles = value;
|
||||
ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_L, &value);
|
||||
cycles = (cycles << 32) | value;
|
||||
|
||||
float IPC = (float)(double(instrs) / double(cycles));
|
||||
|
||||
fprintf(stdout, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
|
||||
|
||||
assert(ret == 0);
|
||||
}
|
||||
|
||||
fpgaClose(device->fpga);
|
||||
|
||||
free(device);
|
||||
|
@ -468,7 +494,7 @@ extern int vx_start(vx_device_h hdevice) {
|
|||
}
|
||||
|
||||
// set device constant registers
|
||||
extern int vx_csr_set(vx_device_h hdevice, int core, int address, int value) {
|
||||
extern int vx_csr_set(vx_device_h hdevice, int core, int address, unsigned value) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
|
@ -488,7 +514,7 @@ extern int vx_csr_set(vx_device_h hdevice, int core, int address, int value) {
|
|||
}
|
||||
|
||||
// get device constant registers
|
||||
extern int vx_csr_get(vx_device_h hdevice, int core, int address, int* value) {
|
||||
extern int vx_csr_get(vx_device_h hdevice, int core, int address, unsigned* value) {
|
||||
if (nullptr == hdevice || nullptr == value)
|
||||
return -1;
|
||||
|
||||
|
@ -510,7 +536,7 @@ extern int vx_csr_get(vx_device_h hdevice, int core, int address, int* value) {
|
|||
|
||||
uint64_t value64;
|
||||
CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_CSR_READ, &value64));
|
||||
*value = (int)value64;
|
||||
*value = (unsigned)value64;
|
||||
|
||||
return 0;
|
||||
}
|
Binary file not shown.
|
@ -881,7 +881,6 @@ assign vx_csr_io_req_rw = (STATE_CSR_WRITE == state);
|
|||
assign vx_csr_io_req_addr = cmd_csr_addr;
|
||||
assign vx_csr_io_req_data = cmd_csr_wdata;
|
||||
|
||||
assign cmd_csr_rdata = vx_csr_io_rsp_data;
|
||||
assign vx_csr_io_rsp_ready = 1;
|
||||
|
||||
assign cmd_csr_done = (STATE_CSR_WRITE == state) ? vx_csr_io_req_ready : vx_csr_io_rsp_valid;
|
||||
|
@ -890,6 +889,7 @@ always_ff @(posedge clk)
|
|||
begin
|
||||
if (SoftReset) begin
|
||||
csr_io_req_sent <= 0;
|
||||
cmd_csr_rdata <= 0;
|
||||
end
|
||||
else begin
|
||||
if (vx_csr_io_req_valid && vx_csr_io_req_ready) begin
|
||||
|
@ -898,6 +898,11 @@ begin
|
|||
if (cmd_csr_done) begin
|
||||
csr_io_req_sent <= 0;
|
||||
end
|
||||
if ((STATE_CSR_READ == state)
|
||||
&& vx_csr_io_rsp_ready
|
||||
&& vx_csr_io_rsp_valid) begin
|
||||
cmd_csr_rdata <= vx_csr_io_rsp_data;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -119,7 +119,7 @@ module VX_alu_unit (
|
|||
VX_mult #(
|
||||
.WIDTHA(33),
|
||||
.WIDTHB(33),
|
||||
.WIDTHP(64),
|
||||
.WIDTHP(66),
|
||||
.SIGNED(1),
|
||||
.PIPELINE(`MUL_LATENCY)
|
||||
) multiplier (
|
||||
|
|
|
@ -18,24 +18,21 @@ module VX_csr_arb (
|
|||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
wire pick_core = (| csr_core_req_if.valid);
|
||||
|
||||
// Which request to pick
|
||||
assign issued_csr_req_if.is_io = !pick_core;
|
||||
wire pick_core = (| csr_core_req_if.valid);
|
||||
|
||||
// Mux between core and io
|
||||
assign issued_csr_req_if.valid = pick_core ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}};
|
||||
assign issued_csr_req_if.is_csr = pick_core ? csr_core_req_if.is_csr : 1'b1;
|
||||
assign issued_csr_req_if.alu_op = pick_core ? csr_core_req_if.alu_op : (csr_io_req_if.rw ? `ALU_CSR_RW : `ALU_CSR_RS);
|
||||
assign issued_csr_req_if.csr_address = pick_core ? csr_core_req_if.csr_address : csr_io_req_if.addr;
|
||||
assign issued_csr_req_if.csr_mask = pick_core ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
|
||||
|
||||
assign csr_io_req_if.ready = !(csr_pipe_stall || pick_core);
|
||||
|
||||
// Core arguments
|
||||
assign issued_csr_req_if.warp_num = csr_core_req_if.warp_num;
|
||||
assign issued_csr_req_if.csr_address = pick_core ? csr_core_req_if.csr_address : csr_io_req_if.addr;
|
||||
assign issued_csr_req_if.csr_immed = pick_core ? csr_core_req_if.csr_immed : 0;
|
||||
assign issued_csr_req_if.csr_mask = pick_core ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
|
||||
assign issued_csr_req_if.is_io = !pick_core;
|
||||
assign issued_csr_req_if.warp_num = csr_core_req_if.warp_num;
|
||||
assign issued_csr_req_if.rd = csr_core_req_if.rd;
|
||||
assign issued_csr_req_if.wb = csr_core_req_if.wb;
|
||||
assign issued_csr_req_if.wb = csr_core_req_if.wb;
|
||||
|
||||
assign csr_io_req_if.ready = !(csr_pipe_stall || pick_core);
|
||||
|
||||
// Core Writeback
|
||||
assign csr_wb_if.valid = csr_pipe_rsp_if.valid & {`NUM_THREADS{~csr_pipe_rsp_if.is_io}};
|
||||
|
|
|
@ -38,7 +38,6 @@ module VX_csr_pipe #(
|
|||
.wb_valid (| writeback_if.valid)
|
||||
);
|
||||
|
||||
// wire hazard = (csr_address_s2 == csr_req_if.csr_address) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2;
|
||||
wire car_hazard = (csr_address_s2 == csr_req_if.csr_address) & (warp_num_s2 == csr_req_if.warp_num) & |(valid_s2) & is_csr_s2;
|
||||
|
||||
assign csr_read_data = car_hazard ? csr_updated_data_s2 : csr_read_data_unqual;
|
||||
|
|
|
@ -104,7 +104,7 @@ module VX_decode(
|
|||
assign is_lui = (curr_opcode == `INST_LUI);
|
||||
assign is_auipc = (curr_opcode == `INST_AUIPC);
|
||||
assign is_csr = (curr_opcode == `INST_SYS) && (func3 != 0);
|
||||
assign is_csr_immed = (is_csr) && (func3[2] == 1);
|
||||
assign is_csr_immed = is_csr && (func3[2] == 1);
|
||||
|
||||
assign is_gpgpu = (curr_opcode == `INST_GPGPU);
|
||||
|
||||
|
|
|
@ -30,7 +30,6 @@ module VX_gpr_stage (
|
|||
wire is_jal = bckE_req_if.is_jal;
|
||||
`DEBUG_END
|
||||
|
||||
|
||||
assign csr_req_if.is_io = 1'b0; // GPR only issues csr requests coming from core
|
||||
|
||||
VX_gpr_read_if gpr_read_if();
|
||||
|
|
|
@ -35,7 +35,7 @@ module VX_divide #(
|
|||
quartus_div.lpm_widthd = WIDTHD,
|
||||
quartus_div.lpm_nrepresentation = NSIGNED ? "SIGNED" : "UNSIGNED",
|
||||
quartus_div.lpm_drepresentation = DSIGNED ? "SIGNED" : "UNSIGNED",
|
||||
quartus_div.lpm_hint = "LPM_REMAINDERPOSITIVE=FALSE,MAXIMIZE_SPEED=9",
|
||||
quartus_div.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE",
|
||||
quartus_div.lpm_pipeline = PIPELINE;
|
||||
|
||||
`else
|
||||
|
|
|
@ -23,9 +23,9 @@ module VX_mult #(
|
|||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.result (result),
|
||||
.sclr (reset),
|
||||
.aclr (1'b0),
|
||||
.clken (1'b1),
|
||||
.sclr (1'b0),
|
||||
.sum (1'b0)
|
||||
);
|
||||
|
||||
|
@ -35,7 +35,7 @@ module VX_mult #(
|
|||
quartus_mult.lpm_widthp = WIDTHP,
|
||||
quartus_mult.lpm_representation = SIGNED ? "SIGNED" : "UNSIGNED",
|
||||
quartus_mult.lpm_pipeline = PIPELINE,
|
||||
quartus_mult.lpm_hint = "MAXIMIZE_SPEED=9";
|
||||
quartus_mult.lpm_hint = "DEDICATED_MULTIPLIER_CIRCUITRY=YES,MAXIMIZE_SPEED=9";
|
||||
`else
|
||||
|
||||
wire [WIDTHP-1:0] result_unqual;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue