mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
SimX operands collector optimization
This commit is contained in:
parent
7b94c983c9
commit
42f3d55e15
9 changed files with 187 additions and 169 deletions
|
@ -14,15 +14,15 @@
|
|||
#pragma once
|
||||
|
||||
#ifndef RAM_PAGE_SIZE
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
#endif
|
||||
|
||||
#ifndef MEM_CYCLE_RATIO
|
||||
#define MEM_CYCLE_RATIO -1
|
||||
#define MEM_CYCLE_RATIO -1
|
||||
#endif
|
||||
|
||||
#ifndef MEMORY_BANKS
|
||||
#define MEMORY_BANKS 2
|
||||
#define MEMORY_BANKS 2
|
||||
#endif
|
||||
|
||||
#define LSU_WORD_SIZE (XLEN / 8)
|
||||
|
|
|
@ -366,6 +366,11 @@ void Core::commit() {
|
|||
perf_stats_.instrs += trace->tmask.count();
|
||||
}
|
||||
|
||||
perf_stats_.opds_stalls = 0;
|
||||
for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) {
|
||||
perf_stats_.opds_stalls += operands_.at(i)->total_stalls();
|
||||
}
|
||||
|
||||
commit_arb->Outputs.at(0).pop();
|
||||
|
||||
// delete the trace
|
||||
|
|
|
@ -44,6 +44,7 @@ public:
|
|||
uint64_t sched_stalls;
|
||||
uint64_t ibuf_stalls;
|
||||
uint64_t scrb_stalls;
|
||||
uint64_t opds_stalls;
|
||||
uint64_t scrb_alu;
|
||||
uint64_t scrb_fpu;
|
||||
uint64_t scrb_lsu;
|
||||
|
@ -63,6 +64,7 @@ public:
|
|||
, sched_stalls(0)
|
||||
, ibuf_stalls(0)
|
||||
, scrb_stalls(0)
|
||||
, opds_stalls(0)
|
||||
, scrb_alu(0)
|
||||
, scrb_fpu(0)
|
||||
, scrb_lsu(0)
|
||||
|
|
|
@ -392,6 +392,7 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) {
|
|||
CSR_READ_64(VX_CSR_MPM_SCHED_ST, core_perf.sched_stalls);
|
||||
CSR_READ_64(VX_CSR_MPM_IBUF_ST, core_perf.ibuf_stalls);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_ST, core_perf.scrb_stalls);
|
||||
CSR_READ_64(VX_CSR_MPM_OPDS_ST, core_perf.opds_stalls);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_ALU, core_perf.scrb_alu);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_FPU, core_perf.scrb_fpu);
|
||||
CSR_READ_64(VX_CSR_MPM_SCRB_LSU, core_perf.scrb_lsu);
|
||||
|
|
|
@ -63,8 +63,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
trace->wid = wid;
|
||||
trace->PC = warp.PC;
|
||||
trace->tmask = warp.tmask;
|
||||
trace->rdest = instr.getRDest();
|
||||
trace->rdest_type = instr.getRDType();
|
||||
trace->dst_reg = {instr.getRDType(), instr.getRDest()};
|
||||
|
||||
auto next_pc = warp.PC + 4;
|
||||
auto next_tmask = warp.tmask;
|
||||
|
@ -128,7 +127,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
break;
|
||||
case RegType::None:
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -164,8 +163,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::R: {
|
||||
trace->fu_type = FUType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Integer, rsrc1};
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
|
@ -341,7 +340,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::I: {
|
||||
trace->fu_type = FUType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
|
@ -401,8 +400,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::R_W: {
|
||||
trace->fu_type = FUType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Integer, rsrc1};
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
|
@ -528,7 +527,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::I_W: {
|
||||
trace->fu_type = FUType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
|
@ -571,8 +570,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::B: {
|
||||
trace->fu_type = FUType::ALU;
|
||||
trace->alu_type = AluType::BRANCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Integer, rsrc1};
|
||||
bool all_taken = false;
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!warp.tmask.test(t))
|
||||
|
@ -660,7 +659,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
// RV32I: JALR
|
||||
trace->fu_type = FUType::ALU;
|
||||
trace->alu_type = AluType::BRANCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
|
@ -675,7 +674,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::FL: {
|
||||
trace->fu_type = FUType::LSU;
|
||||
trace->lsu_type = LsuType::LOAD;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
|
||||
trace->data = trace_data;
|
||||
uint32_t data_bytes = 1 << (func3 & 0x3);
|
||||
|
@ -719,8 +718,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::FS: {
|
||||
trace->fu_type = FUType::LSU;
|
||||
trace->lsu_type = LsuType::STORE;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Integer, rsrc1};
|
||||
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
|
||||
trace->data = trace_data;
|
||||
uint32_t data_bytes = 1 << (func3 & 0x3);
|
||||
|
@ -746,8 +745,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::AMO: {
|
||||
trace->fu_type = FUType::LSU;
|
||||
trace->lsu_type = LsuType::LOAD;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Integer, rsrc1};
|
||||
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
|
||||
trace->data = trace_data;
|
||||
auto amo_type = func7 >> 2;
|
||||
|
@ -846,7 +845,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
// RV32I: CSRRW
|
||||
rddata[t].i = csr_value;
|
||||
this->set_csr(csr_addr, rsdata[t][0].i, t, wid);
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->sfu_type = SfuType::CSRRW;
|
||||
rd_write = true;
|
||||
break;
|
||||
|
@ -857,7 +856,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
if (rsdata[t][0].i != 0) {
|
||||
this->set_csr(csr_addr, csr_value | rsdata[t][0].i, t, wid);
|
||||
}
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->sfu_type = SfuType::CSRRS;
|
||||
rd_write = true;
|
||||
break;
|
||||
|
@ -868,7 +867,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
if (rsdata[t][0].i != 0) {
|
||||
this->set_csr(csr_addr, csr_value & ~rsdata[t][0].i, t, wid);
|
||||
}
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->sfu_type = SfuType::CSRRC;
|
||||
rd_write = true;
|
||||
break;
|
||||
|
@ -925,57 +924,57 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case 0x00: { // RV32F: FADD.S
|
||||
rddata[t].u64 = nan_box(rv_fadd_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
|
||||
trace->fpu_type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x01: { // RV32D: FADD.D
|
||||
rddata[t].u64 = rv_fadd_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
|
||||
trace->fpu_type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x04: { // RV32F: FSUB.S
|
||||
rddata[t].u64 = nan_box(rv_fsub_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
|
||||
trace->fpu_type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x05: { // RV32D: FSUB.D
|
||||
rddata[t].u64 = rv_fsub_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
|
||||
trace->fpu_type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x08: { // RV32F: FMUL.S
|
||||
rddata[t].u64 = nan_box(rv_fmul_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
|
||||
trace->fpu_type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x09: { // RV32D: FMUL.D
|
||||
rddata[t].u64 = rv_fmul_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
|
||||
trace->fpu_type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x0c: { // RV32F: FDIV.S
|
||||
rddata[t].u64 = nan_box(rv_fdiv_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
|
||||
trace->fpu_type = FpuType::FDIV;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x0d: { // RV32D: FDIV.D
|
||||
rddata[t].u64 = rv_fdiv_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
|
||||
trace->fpu_type = FpuType::FDIV;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x10: {
|
||||
|
@ -991,8 +990,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x11: {
|
||||
|
@ -1008,8 +1007,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x14: {
|
||||
|
@ -1021,8 +1020,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
rddata[t].u64 = nan_box(rv_fmin_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags));
|
||||
}
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x15: {
|
||||
|
@ -1034,34 +1033,34 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
rddata[t].u64 = rv_fmin_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
|
||||
}
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x20: {
|
||||
// RV32D: FCVT.S.D
|
||||
rddata[t].u64 = nan_box(rv_dtof(rsdata[t][0].u64));
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x21: {
|
||||
// RV32D: FCVT.D.S
|
||||
rddata[t].u64 = rv_ftod(check_boxing(rsdata[t][0].u64));
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x2c: { // RV32F: FSQRT.S
|
||||
rddata[t].u64 = nan_box(rv_fsqrt_s(check_boxing(rsdata[t][0].u64), frm, &fflags));
|
||||
trace->fpu_type = FpuType::FSQRT;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x2d: { // RV32D: FSQRT.D
|
||||
rddata[t].u64 = rv_fsqrt_d(rsdata[t][0].u64, frm, &fflags);
|
||||
trace->fpu_type = FpuType::FSQRT;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x50: {
|
||||
|
@ -1080,8 +1079,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x51: {
|
||||
|
@ -1100,8 +1099,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
break;
|
||||
}
|
||||
case 0x60: {
|
||||
|
@ -1124,7 +1123,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
trace->fpu_type = FpuType::FCVT;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x61: {
|
||||
|
@ -1147,7 +1146,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
trace->fpu_type = FpuType::FCVT;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x68: {
|
||||
|
@ -1170,7 +1169,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
trace->fpu_type = FpuType::FCVT;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x69: {
|
||||
|
@ -1193,7 +1192,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
trace->fpu_type = FpuType::FCVT;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x70: {
|
||||
|
@ -1206,7 +1205,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
rddata[t].i = sext((uint64_t)result, 32);
|
||||
}
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x71: {
|
||||
|
@ -1218,19 +1217,19 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
rddata[t].i = rsdata[t][0].u64;
|
||||
}
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x78: { // RV32F: FMV.S.X
|
||||
rddata[t].u64 = nan_box((uint32_t)rsdata[t][0].i);
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
break;
|
||||
}
|
||||
case 0x79: { // RV64D: FMV.D.X
|
||||
rddata[t].u64 = rsdata[t][0].i;
|
||||
trace->fpu_type = FpuType::FNCP;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -1244,9 +1243,9 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
case Opcode::FMNMADD:
|
||||
case Opcode::FMNMSUB: {
|
||||
trace->fpu_type = FpuType::FMA;
|
||||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->used_fregs.set(rsrc2);
|
||||
trace->src_regs[0] = {RegType::Float, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Float, rsrc1};
|
||||
trace->src_regs[2] = {RegType::Float, rsrc2};
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!warp.tmask.test(t))
|
||||
continue;
|
||||
|
@ -1301,7 +1300,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
// TMC
|
||||
trace->fu_type = FUType::SFU;
|
||||
trace->sfu_type = SfuType::TMC;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->fetch_stall = true;
|
||||
next_tmask.reset();
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
|
@ -1312,8 +1311,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
// WSPAWN
|
||||
trace->fu_type = FUType::SFU;
|
||||
trace->sfu_type = SfuType::WSPAWN;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Integer, rsrc1};
|
||||
trace->fetch_stall = true;
|
||||
trace->data = std::make_shared<SFUTraceData>(rsdata.at(thread_last)[0].i, rsdata.at(thread_last)[1].i);
|
||||
} break;
|
||||
|
@ -1321,7 +1320,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
// SPLIT
|
||||
trace->fu_type = FUType::SFU;
|
||||
trace->sfu_type = SfuType::SPLIT;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->fetch_stall = true;
|
||||
|
||||
auto stack_size = warp.ipdom_stack.size();
|
||||
|
@ -1362,7 +1361,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
// JOIN
|
||||
trace->fu_type = FUType::SFU;
|
||||
trace->sfu_type = SfuType::JOIN;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->fetch_stall = true;
|
||||
|
||||
auto stack_ptr = warp.ireg_file.at(thread_last).at(rsrc0);
|
||||
|
@ -1382,8 +1381,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
// BAR
|
||||
trace->fu_type = FUType::SFU;
|
||||
trace->sfu_type = SfuType::BAR;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Integer, rsrc1};
|
||||
trace->fetch_stall = true;
|
||||
trace->data = std::make_shared<SFUTraceData>(rsdata[thread_last][0].i, rsdata[thread_last][1].i);
|
||||
} break;
|
||||
|
@ -1391,8 +1390,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
// PRED
|
||||
trace->fu_type = FUType::SFU;
|
||||
trace->sfu_type = SfuType::PRED;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->src_regs[0] = {RegType::Integer, rsrc0};
|
||||
trace->src_regs[1] = {RegType::Integer, rsrc1};
|
||||
trace->fetch_stall = true;
|
||||
ThreadMask pred;
|
||||
auto not_pred = rdest & 0x1;
|
||||
|
@ -1435,7 +1434,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
DPN(2, "0x" << std::hex << rddata[t].i);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
trace->used_iregs[rdest] = 1;
|
||||
trace->dst_reg = {type, rdest};
|
||||
assert(rdest != 0);
|
||||
} else {
|
||||
// disable writes to x0
|
||||
|
@ -1454,7 +1453,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
DPN(2, "0x" << std::hex << rddata[t].f);
|
||||
}
|
||||
DPN(2, "}" << std::endl);
|
||||
trace->used_fregs[rdest] = 1;
|
||||
trace->dst_reg = {type, rdest};
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
|
|
|
@ -45,6 +45,11 @@ struct SFUTraceData : public ITraceData {
|
|||
|
||||
struct instr_trace_t {
|
||||
public:
|
||||
// Identifies a single register operand of a traced instruction.
// A type of RegType::None marks the slot as unused (no register).
struct reg_t {
|
||||
// register class this operand belongs to (None, Integer or Float)
RegType type;
|
||||
// register index within that class
uint32_t idx;
|
||||
};
|
||||
|
||||
//--
|
||||
const uint64_t uuid;
|
||||
const Arch& arch;
|
||||
|
@ -54,16 +59,13 @@ public:
|
|||
uint32_t wid;
|
||||
ThreadMask tmask;
|
||||
Word PC;
|
||||
|
||||
//--
|
||||
uint32_t rdest;
|
||||
RegType rdest_type;
|
||||
bool wb;
|
||||
|
||||
//--
|
||||
RegMask used_iregs;
|
||||
RegMask used_fregs;
|
||||
RegMask used_vregs;
|
||||
reg_t dst_reg;
|
||||
|
||||
//--
|
||||
std::vector<reg_t> src_regs;
|
||||
|
||||
//--
|
||||
FUType fu_type;
|
||||
|
@ -79,7 +81,7 @@ public:
|
|||
|
||||
ITraceData::Ptr data;
|
||||
|
||||
int pid;
|
||||
int pid;
|
||||
bool sop;
|
||||
bool eop;
|
||||
|
||||
|
@ -92,12 +94,9 @@ public:
|
|||
, wid(0)
|
||||
, tmask(0)
|
||||
, PC(0)
|
||||
, rdest(0)
|
||||
, rdest_type(RegType::None)
|
||||
, wb(false)
|
||||
, used_iregs(0)
|
||||
, used_fregs(0)
|
||||
, used_vregs(0)
|
||||
, dst_reg({RegType::None, 0})
|
||||
, src_regs(NUM_SRC_REGS, {RegType::None, 0})
|
||||
, fu_type(FUType::ALU)
|
||||
, unit_type(0)
|
||||
, data(nullptr)
|
||||
|
@ -115,12 +114,9 @@ public:
|
|||
, wid(rhs.wid)
|
||||
, tmask(rhs.tmask)
|
||||
, PC(rhs.PC)
|
||||
, rdest(rhs.rdest)
|
||||
, rdest_type(rhs.rdest_type)
|
||||
, wb(rhs.wb)
|
||||
, used_iregs(rhs.used_iregs)
|
||||
, used_fregs(rhs.used_fregs)
|
||||
, used_vregs(rhs.used_vregs)
|
||||
, dst_reg(rhs.dst_reg)
|
||||
, src_regs(rhs.src_regs)
|
||||
, fu_type(rhs.fu_type)
|
||||
, unit_type(rhs.unit_type)
|
||||
, data(rhs.data)
|
||||
|
@ -152,8 +148,13 @@ inline std::ostream &operator<<(std::ostream &os, const instr_trace_t& trace) {
|
|||
}
|
||||
os << ", PC=0x" << std::hex << trace.PC;
|
||||
os << ", wb=" << trace.wb;
|
||||
if (trace.wb) {
|
||||
os << ", rd=" << trace.rdest_type << std::dec << trace.rdest;
|
||||
if (trace.dst_reg.type != RegType::None) {
|
||||
os << ", rd=" << trace.dst_reg.type << std::dec << trace.dst_reg.idx;
|
||||
}
|
||||
for (uint32_t i = 0; i < trace.src_regs.size(); ++i) {
|
||||
if (trace.src_regs[i].type != RegType::None) {
|
||||
os << ", rs" << i << "=" << trace.src_regs[i].type << std::dec << trace.src_regs[i].idx;
|
||||
}
|
||||
}
|
||||
os << ", ex=" << trace.fu_type;
|
||||
if (trace.pid != -1) {
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -18,43 +18,61 @@
|
|||
namespace vortex {
|
||||
|
||||
class Operand : public SimObject<Operand> {
|
||||
private:
|
||||
static constexpr uint32_t NUM_BANKS = 4;
|
||||
uint32_t total_stalls_ = 0;
|
||||
|
||||
public:
|
||||
SimPort<instr_trace_t*> Input;
|
||||
SimPort<instr_trace_t*> Output;
|
||||
|
||||
Operand(const SimContext& ctx)
|
||||
: SimObject<Operand>(ctx, "Operand")
|
||||
Operand(const SimContext& ctx)
|
||||
: SimObject<Operand>(ctx, "Operand")
|
||||
, Input(this)
|
||||
, Output(this)
|
||||
{}
|
||||
{
|
||||
total_stalls_ = 0;
|
||||
}
|
||||
|
||||
virtual ~Operand() {}
|
||||
|
||||
virtual void reset() {}
|
||||
virtual void reset() {
|
||||
total_stalls_ = 0;
|
||||
}
|
||||
|
||||
virtual void tick() {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = Input.front();
|
||||
|
||||
int delay = 1;
|
||||
for (int i = 0; i < MAX_NUM_REGS; ++i) {
|
||||
bool is_iregs = trace->used_iregs.test(i);
|
||||
bool is_fregs = trace->used_fregs.test(i);
|
||||
bool is_vregs = trace->used_vregs.test(i);
|
||||
if (is_iregs || is_fregs || is_vregs) {
|
||||
if (is_iregs && i == 0)
|
||||
continue;
|
||||
++delay;
|
||||
uint32_t stalls = 0;
|
||||
|
||||
for (int i = 0; i < NUM_SRC_REGS; ++i) {
|
||||
for (int j = i + 1; j < NUM_SRC_REGS; ++j) {
|
||||
int bank_i = trace->src_regs[i].idx % NUM_BANKS;
|
||||
int bank_j = trace->src_regs[j].idx % NUM_BANKS;
|
||||
if ((trace->src_regs[i].type != RegType::None)
|
||||
&& (trace->src_regs[j].type != RegType::None)
|
||||
&& (trace->src_regs[i].idx != 0)
|
||||
&& (trace->src_regs[j].idx != 0)
|
||||
&& bank_i == bank_j) {
|
||||
++stalls;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Output.push(trace, delay);
|
||||
|
||||
total_stalls_ += stalls;
|
||||
|
||||
Output.push(trace, 1 + stalls);
|
||||
|
||||
DT(3, "pipeline-operands: " << *trace);
|
||||
|
||||
Input.pop();
|
||||
};
|
||||
|
||||
// Returns the running stall count that tick() adds to on every processed
// trace; the counter is zeroed on construction and by reset().
uint32_t total_stalls() const {
|
||||
return total_stalls_;
|
||||
}
|
||||
};
|
||||
|
||||
}
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -24,70 +24,70 @@ public:
|
|||
|
||||
struct reg_use_t {
|
||||
RegType reg_type;
|
||||
uint32_t reg_id;
|
||||
FUType fu_type;
|
||||
uint32_t reg_id;
|
||||
FUType fu_type;
|
||||
SfuType sfu_type;
|
||||
uint64_t uuid;
|
||||
};
|
||||
|
||||
Scoreboard(const Arch &arch)
|
||||
: in_use_iregs_(arch.num_warps())
|
||||
, in_use_fregs_(arch.num_warps())
|
||||
{
|
||||
|
||||
Scoreboard(const Arch &arch)
|
||||
: in_use_regs_(arch.num_warps()) {
|
||||
for (auto& in_use_reg : in_use_regs_) {
|
||||
in_use_reg.resize((int)RegType::Count);
|
||||
}
|
||||
this->clear();
|
||||
}
|
||||
|
||||
void clear() {
|
||||
for (uint32_t i = 0, n = in_use_iregs_.size(); i < n; ++i) {
|
||||
in_use_iregs_.at(i).reset();
|
||||
in_use_fregs_.at(i).reset();
|
||||
for (auto& in_use_reg : in_use_regs_) {
|
||||
for (auto& mask : in_use_reg) {
|
||||
mask.reset();
|
||||
}
|
||||
}
|
||||
owners_.clear();
|
||||
}
|
||||
|
||||
bool in_use(instr_trace_t* trace) const {
|
||||
return (trace->used_iregs & in_use_iregs_.at(trace->wid)) != 0
|
||||
|| (trace->used_fregs & in_use_fregs_.at(trace->wid)) != 0;
|
||||
if (trace->dst_reg.type != RegType::None) {
|
||||
if (in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).test(trace->dst_reg.idx)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
for (uint32_t i = 0; i < trace->src_regs.size(); ++i) {
|
||||
if (trace->src_regs[i].type != RegType::None) {
|
||||
if (in_use_regs_.at(trace->wid).at((int)trace->src_regs[i].type).test(trace->src_regs[i].idx)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
std::vector<reg_use_t> get_uses(instr_trace_t* trace) const {
|
||||
std::vector<reg_use_t> out;
|
||||
|
||||
auto used_iregs = trace->used_iregs & in_use_iregs_.at(trace->wid);
|
||||
auto used_fregs = trace->used_fregs & in_use_fregs_.at(trace->wid);
|
||||
|
||||
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
|
||||
if (used_iregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Integer;
|
||||
if (trace->dst_reg.type != RegType::None) {
|
||||
if (in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).test(trace->dst_reg.idx)) {
|
||||
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Integer, r, owner->fu_type, owner->sfu_type, owner->uuid});
|
||||
out.push_back({trace->dst_reg.type, trace->dst_reg.idx, owner->fu_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t r = 0; r < MAX_NUM_REGS; ++r) {
|
||||
if (used_fregs.test(r)) {
|
||||
uint32_t tag = (r << 16) | (trace->wid << 4) | (int)RegType::Float;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({RegType::Float, r, owner->fu_type, owner->sfu_type, owner->uuid});
|
||||
for (uint32_t i = 0; i < trace->src_regs.size(); ++i) {
|
||||
if (trace->src_regs[i].type != RegType::None) {
|
||||
if (in_use_regs_.at(trace->wid).at((int)trace->src_regs[i].type).test(trace->src_regs[i].idx)) {
|
||||
uint32_t tag = (trace->src_regs[i].idx << 16) | (trace->wid << 4) | (int)trace->src_regs[i].type;
|
||||
auto owner = owners_.at(tag);
|
||||
out.push_back({trace->src_regs[i].type, trace->src_regs[i].idx, owner->fu_type, owner->sfu_type, owner->uuid});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
|
||||
void reserve(instr_trace_t* trace) {
|
||||
assert(trace->wb);
|
||||
switch (trace->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(trace->wid).set(trace->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(trace->wid).set(trace->rdest);
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
|
||||
in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).set(trace->dst_reg.idx);
|
||||
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
|
||||
assert(owners_.count(tag) == 0);
|
||||
owners_[tag] = trace;
|
||||
assert((int)trace->fu_type < 5);
|
||||
|
@ -95,24 +95,14 @@ public:
|
|||
|
||||
void release(instr_trace_t* trace) {
|
||||
assert(trace->wb);
|
||||
switch (trace->rdest_type) {
|
||||
case RegType::Integer:
|
||||
in_use_iregs_.at(trace->wid).reset(trace->rdest);
|
||||
break;
|
||||
case RegType::Float:
|
||||
in_use_fregs_.at(trace->wid).reset(trace->rdest);
|
||||
break;
|
||||
default:
|
||||
assert(false);
|
||||
}
|
||||
uint32_t tag = (trace->rdest << 16) | (trace->wid << 4) | (int)trace->rdest_type;
|
||||
in_use_regs_.at(trace->wid).at((int)trace->dst_reg.type).reset(trace->dst_reg.idx);
|
||||
uint32_t tag = (trace->dst_reg.idx << 16) | (trace->wid << 4) | (int)trace->dst_reg.type;
|
||||
owners_.erase(tag);
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
std::vector<RegMask> in_use_iregs_;
|
||||
std::vector<RegMask> in_use_fregs_;
|
||||
std::vector<std::vector<RegMask>> in_use_regs_;
|
||||
std::unordered_map<uint32_t, instr_trace_t*> owners_;
|
||||
};
|
||||
|
||||
|
|
|
@ -47,6 +47,7 @@ typedef uint64_t WordF;
|
|||
#define MAX_NUM_THREADS 32
|
||||
#define MAX_NUM_WARPS 32
|
||||
#define MAX_NUM_REGS 32
|
||||
#define NUM_SRC_REGS 3
|
||||
|
||||
typedef std::bitset<MAX_NUM_CORES> CoreMask;
|
||||
typedef std::bitset<MAX_NUM_REGS> RegMask;
|
||||
|
@ -58,7 +59,8 @@ typedef std::bitset<MAX_NUM_WARPS> WarpMask;
|
|||
enum class RegType {
|
||||
None,
|
||||
Integer,
|
||||
Float
|
||||
Float,
|
||||
Count
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue