vortex/sim/simx/execute.cpp
2024-11-01 09:26:30 -04:00

2065 lines
59 KiB
C++

// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <bitset>
#include <climits>
#include <sys/types.h>
#include <sys/stat.h>
#include <assert.h>
#include <util.h>
#include <rvfloats.h>
#include "emulator.h"
#include "instr.h"
#include "core.h"
// #define DEFAULT
#define GROUPS
using namespace vortex;
// Per-thread register value viewed through several overlapping types.
// Emulator::execute keeps one of these per source operand (rsdata) and per
// destination (rddata) for every thread, and picks the member that matches
// the instruction being emulated.
// NOTE(review): Word / WordI / WordF are declared elsewhere in the project;
// presumably the XLEN-wide unsigned, signed and floating-point types — confirm.
union reg_data_t {
Word u;       // unsigned view; integer register reads are stored through this member
WordI i;      // signed view; used for signed arithmetic and most destination writes
WordF f;      // floating-point view of the same storage
float f32;    // storage reinterpreted as a float
double f64;   // storage reinterpreted as a double
uint32_t u32; // fixed 32-bit unsigned view
uint64_t u64; // fixed 64-bit unsigned view; float register reads/writes go through this member
int32_t i32;  // fixed 32-bit signed view
int64_t i64;  // fixed 64-bit signed view
};
// Widens a 32-bit pattern to 64 bits by filling the upper 32 bits with ones.
// The low 32 bits of the result are exactly `value`.
inline uint64_t nan_box(uint32_t value) {
  constexpr uint64_t kUpperOnes = ~uint64_t(0) << 32;
  return kUpperOnes | uint64_t(value);
}
// Returns true when every one of the upper 32 bits of `value` is set,
// regardless of what the lower 32 bits contain.
inline bool is_nan_boxed(uint64_t value) {
  constexpr uint64_t kUpperMask = 0xffffffff00000000ull;
  return (value & kUpperMask) == kUpperMask;
}
// Validates the boxing of a 64-bit register value before it is used as a
// 32-bit operand: a value whose upper 32 bits are all ones is passed through
// unchanged, anything else is replaced by the boxed pattern 0x7fc00000 (NaN).
inline int64_t check_boxing(int64_t a) {
  constexpr uint32_t kDefaultNaN = 0x7fc00000; // NaN
  if (!is_nan_boxed(a)) {
    return nan_box(kDefaultNaN);
  }
  return a;
}
void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
#ifdef DEFAULT
auto& warp = warps_.at(wid);
assert(warp.tmask.any());
#endif
#ifdef GROUPS
auto& warp = warps_;
assert(warp[wid].tmask.any());
#endif
// initialize instruction trace
trace->cid = core_->id();
#ifdef DEFAULT
trace->wid = wid;
trace->PC = warp.PC;
trace->tmask = warp.tmask;
#endif
#ifdef GROUPS
trace->wid = 0;
trace->PC = warp[wid].PC;
trace->tmask = warp[wid].tmask;
#endif
trace->rdest = instr.getRDest();
trace->rdest_type = instr.getRDType();
#ifdef DEFAULT
auto next_pc = warp.PC + 4;
auto next_tmask = warp.tmask;
#endif
#ifdef GROUPS
auto next_pc = warp[wid].PC + 4;
auto next_tmask = warp[wid].tmask;
#endif
auto opcode = instr.getOpcode();
auto func2 = instr.getFunc2();
auto func3 = instr.getFunc3();
auto func7 = instr.getFunc7();
auto rdest = instr.getRDest();
auto rsrc0 = instr.getRSrc(0);
auto rsrc1 = instr.getRSrc(1);
auto rsrc2 = instr.getRSrc(2);
auto immsrc = sext((Word)instr.getImm(), 32);
#ifdef DEFAULT
auto num_threads = arch_.num_threads();
uint32_t thread_start = 0;
#endif
#ifdef GROUPS
auto num_threads = warp[wid].num_tThreads;
uint32_t thread_start = 0;
#endif
for (; thread_start < num_threads; ++thread_start) {
#ifdef DEFAULT
if (warp.tmask.test(thread_start))
break;
#endif
#ifdef GROUPS
if (warp[wid].tmask.test(thread_start))
break;
#endif
}
int32_t thread_last = num_threads - 1;
for (; thread_last >= 0; --thread_last) {
#ifdef DEFAULT
if (warp.tmask.test(thread_last))
break;
#endif
#ifdef GROUPS
if (warp[wid].tmask.test(thread_last))
break;
#endif
}
std::vector<reg_data_t[3]> rsdata(num_threads);
std::vector<reg_data_t> rddata(num_threads);
auto num_rsrcs = instr.getNRSrc();
if (num_rsrcs) {
for (uint32_t i = 0; i < num_rsrcs; ++i) {
auto type = instr.getRSType(i);
auto reg = instr.getRSrc(i);
switch (type) {
case RegType::Integer:
DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
#ifdef DEFAULT
if (!warp.tmask.test(t)) {
DPN(2, "-");
continue;
}
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t)) {
DPN(2, "-");
continue;
}
#endif
#ifdef DEFAULT
rsdata[t][i].u = warp.ireg_file.at(t)[reg];
#endif
#ifdef GROUPS
rsdata[t][i].u = warp[wid + t/THREAD_PER_TILE].ireg_file.at(t%THREAD_PER_TILE)[reg];
#endif
DPN(2, "0x" << std::hex << rsdata[t][i].i);
}
DPN(2, "}" << std::endl);
break;
case RegType::Float:
DPH(2, "Src" << std::dec << i << " Reg: " << type << std::dec << reg << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
#ifdef DEFAULT
if (!warp.tmask.test(t)) {
DPN(2, "-");
continue;
}
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t)) {
DPN(2, "-");
continue;
}
#endif
#ifdef DEFAULT
rsdata[t][i].u64 = warp.freg_file.at(t)[reg];
#endif
#ifdef GROUPS
rsdata[t][i].u64 = warp[wid + t/THREAD_PER_TILE].freg_file.at(t%THREAD_PER_TILE)[reg];
#endif
DPN(2, "0x" << std::hex << rsdata[t][i].f);
}
DPN(2, "}" << std::endl);
break;
case RegType::None:
break;
}
}
}
bool rd_write = false;
switch (opcode) {
case Opcode::LUI: {
// RV32I: LUI
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
rddata[t].i = immsrc;
}
rd_write = true;
break;
}
case Opcode::AUIPC: {
// RV32I: AUIPC
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
#ifdef DEFAULT
rddata[t].i = immsrc + warp.PC;
#endif
#ifdef GROUPS
rddata[t].i = immsrc + warp[wid].PC;
#endif
}
rd_write = true;
break;
}
case Opcode::R: {
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
if (func7 == 0x7) {
auto value = rsdata[t][0].i;
auto cond = rsdata[t][1].i;
if (func3 == 0x5) {
// CZERO.EQZ
rddata[t].i = (cond == 0) ? 0 : value;
trace->alu_type = AluType::ARITH;
} else
if (func3 == 0x7) {
// CZERO.NEZ
rddata[t].i = (cond != 0) ? 0 : value;
trace->alu_type = AluType::ARITH;
} else {
std::abort();
}
} else
if (func7 & 0x1) {
switch (func3) {
case 0: {
// RV32M: MUL
rddata[t].i = rsdata[t][0].i * rsdata[t][1].i;
trace->alu_type = AluType::IMUL;
break;
}
case 1: {
// RV32M: MULH
auto first = static_cast<DWordI>(rsdata[t][0].i);
auto second = static_cast<DWordI>(rsdata[t][1].i);
rddata[t].i = (first * second) >> XLEN;
trace->alu_type = AluType::IMUL;
break;
}
case 2: {
// RV32M: MULHSU
auto first = static_cast<DWordI>(rsdata[t][0].i);
auto second = static_cast<DWord>(rsdata[t][1].u);
rddata[t].i = (first * second) >> XLEN;
trace->alu_type = AluType::IMUL;
break;
}
case 3: {
// RV32M: MULHU
auto first = static_cast<DWord>(rsdata[t][0].u);
auto second = static_cast<DWord>(rsdata[t][1].u);
rddata[t].i = (first * second) >> XLEN;
trace->alu_type = AluType::IMUL;
break;
}
case 4: {
// RV32M: DIV
auto dividen = rsdata[t][0].i;
auto divisor = rsdata[t][1].i;
auto largest_negative = WordI(1) << (XLEN-1);
if (divisor == 0) {
rddata[t].i = -1;
} else if (dividen == largest_negative && divisor == -1) {
rddata[t].i = dividen;
} else {
rddata[t].i = dividen / divisor;
}
trace->alu_type = AluType::IDIV;
break;
}
case 5: {
// RV32M: DIVU
auto dividen = rsdata[t][0].u;
auto divisor = rsdata[t][1].u;
if (divisor == 0) {
rddata[t].i = -1;
} else {
rddata[t].i = dividen / divisor;
}
trace->alu_type = AluType::IDIV;
break;
}
case 6: {
// RV32M: REM
auto dividen = rsdata[t][0].i;
auto divisor = rsdata[t][1].i;
auto largest_negative = WordI(1) << (XLEN-1);
if (rsdata[t][1].i == 0) {
rddata[t].i = dividen;
} else if (dividen == largest_negative && divisor == -1) {
rddata[t].i = 0;
} else {
rddata[t].i = dividen % divisor;
}
trace->alu_type = AluType::IDIV;
break;
}
case 7: {
// RV32M: REMU
auto dividen = rsdata[t][0].u;
auto divisor = rsdata[t][1].u;
if (rsdata[t][1].i == 0) {
rddata[t].i = dividen;
} else {
rddata[t].i = dividen % divisor;
}
trace->alu_type = AluType::IDIV;
break;
}
default:
std::abort();
}
} else {
switch (func3) {
case 0: {
if (func7 & 0x20) {
// RV32I: SUB
rddata[t].i = rsdata[t][0].i - rsdata[t][1].i;
} else {
// RV32I: ADD
rddata[t].i = rsdata[t][0].i + rsdata[t][1].i;
}
break;
}
case 1: {
// RV32I: SLL
Word shamt_mask = (Word(1) << log2up(XLEN)) - 1;
Word shamt = rsdata[t][1].i & shamt_mask;
rddata[t].i = rsdata[t][0].i << shamt;
break;
}
case 2: {
// RV32I: SLT
rddata[t].i = rsdata[t][0].i < rsdata[t][1].i;
break;
}
case 3: {
// RV32I: SLTU
rddata[t].i = rsdata[t][0].u < rsdata[t][1].u;
break;
}
case 4: {
// RV32I: XOR
rddata[t].i = rsdata[t][0].i ^ rsdata[t][1].i;
break;
}
case 5: {
Word shamt_mask = ((Word)1 << log2up(XLEN)) - 1;
Word shamt = rsdata[t][1].i & shamt_mask;
if (func7 & 0x20) {
// RV32I: SRA
rddata[t].i = rsdata[t][0].i >> shamt;
} else {
// RV32I: SRL
rddata[t].i = rsdata[t][0].u >> shamt;
}
break;
}
case 6: {
// RV32I: OR
rddata[t].i = rsdata[t][0].i | rsdata[t][1].i;
break;
}
case 7: {
// RV32I: AND
rddata[t].i = rsdata[t][0].i & rsdata[t][1].i;
break;
}
default:
std::abort();
}
}
}
rd_write = true;
break;
}
case Opcode::I: {
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
switch (func3) {
case 0: {
// RV32I: ADDI
rddata[t].i = rsdata[t][0].i + immsrc;
break;
}
case 1: {
// RV32I: SLLI
rddata[t].i = rsdata[t][0].i << immsrc;
break;
}
case 2: {
// RV32I: SLTI
rddata[t].i = rsdata[t][0].i < WordI(immsrc);
break;
}
case 3: {
// RV32I: SLTIU
rddata[t].i = rsdata[t][0].u < immsrc;
break;
}
case 4: {
// RV32I: XORI
rddata[t].i = rsdata[t][0].i ^ immsrc;
break;
}
case 5: {
if (func7 & 0x20) {
// RV32I: SRAI
Word result = rsdata[t][0].i >> immsrc;
rddata[t].i = result;
} else {
// RV32I: SRLI
Word result = rsdata[t][0].u >> immsrc;
rddata[t].i = result;
}
break;
}
case 6: {
// RV32I: ORI
rddata[t].i = rsdata[t][0].i | immsrc;
break;
}
case 7: {
// RV32I: ANDI
rddata[t].i = rsdata[t][0].i & immsrc;
break;
}
}
}
rd_write = true;
break;
}
case Opcode::R_W: {
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
if (func7 & 0x1) {
switch (func3) {
case 0: {
// RV64M: MULW
int32_t product = (int32_t)rsdata[t][0].i * (int32_t)rsdata[t][1].i;
rddata[t].i = sext((uint64_t)product, 32);
trace->alu_type = AluType::IMUL;
break;
}
case 4: {
// RV64M: DIVW
int32_t dividen = (int32_t)rsdata[t][0].i;
int32_t divisor = (int32_t)rsdata[t][1].i;
int32_t quotient;
int32_t largest_negative = 0x80000000;
if (divisor == 0){
quotient = -1;
} else if (dividen == largest_negative && divisor == -1) {
quotient = dividen;
} else {
quotient = dividen / divisor;
}
rddata[t].i = sext((uint64_t)quotient, 32);
trace->alu_type = AluType::IDIV;
break;
}
case 5: {
// RV64M: DIVUW
uint32_t dividen = (uint32_t)rsdata[t][0].i;
uint32_t divisor = (uint32_t)rsdata[t][1].i;
uint32_t quotient;
if (divisor == 0){
quotient = -1;
} else {
quotient = dividen / divisor;
}
rddata[t].i = sext((uint64_t)quotient, 32);
trace->alu_type = AluType::IDIV;
break;
}
case 6: {
// RV64M: REMW
int32_t dividen = (uint32_t)rsdata[t][0].i;
int32_t divisor = (uint32_t)rsdata[t][1].i;
int32_t remainder;
int32_t largest_negative = 0x80000000;
if (divisor == 0){
remainder = dividen;
} else if (dividen == largest_negative && divisor == -1) {
remainder = 0;
} else {
remainder = dividen % divisor;
}
rddata[t].i = sext((uint64_t)remainder, 32);
trace->alu_type = AluType::IDIV;
break;
}
case 7: {
// RV64M: REMUW
uint32_t dividen = (uint32_t)rsdata[t][0].i;
uint32_t divisor = (uint32_t)rsdata[t][1].i;
uint32_t remainder;
if (divisor == 0){
remainder = dividen;
} else {
remainder = dividen % divisor;
}
rddata[t].i = sext((uint64_t)remainder, 32);
trace->alu_type = AluType::IDIV;
break;
}
default:
std::abort();
}
} else {
switch (func3) {
case 0: {
if (func7 & 0x20){
// RV64I: SUBW
uint32_t result = (uint32_t)rsdata[t][0].i - (uint32_t)rsdata[t][1].i;
rddata[t].i = sext((uint64_t)result, 32);
}
else{
// RV64I: ADDW
uint32_t result = (uint32_t)rsdata[t][0].i + (uint32_t)rsdata[t][1].i;
rddata[t].i = sext((uint64_t)result, 32);
}
break;
}
case 1: {
// RV64I: SLLW
uint32_t shamt_mask = 0x1F;
uint32_t shamt = rsdata[t][1].i & shamt_mask;
uint32_t result = (uint32_t)rsdata[t][0].i << shamt;
rddata[t].i = sext((uint64_t)result, 32);
break;
}
case 5: {
uint32_t shamt_mask = 0x1F;
uint32_t shamt = rsdata[t][1].i & shamt_mask;
uint32_t result;
if (func7 & 0x20) {
// RV64I: SRAW
result = (int32_t)rsdata[t][0].i >> shamt;
} else {
// RV64I: SRLW
result = (uint32_t)rsdata[t][0].i >> shamt;
}
rddata[t].i = sext((uint64_t)result, 32);
break;
}
default:
std::abort();
}
}
}
rd_write = true;
break;
}
case Opcode::I_W: {
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
switch (func3) {
case 0: {
// RV64I: ADDIW
uint32_t result = (uint32_t)rsdata[t][0].i + (uint32_t)immsrc;
rddata[t].i = sext((uint64_t)result, 32);
break;
}
case 1: {
// RV64I: SLLIW
uint32_t shamt_mask = 0x1F;
uint32_t shamt = immsrc & shamt_mask;
uint32_t result = rsdata[t][0].i << shamt;
rddata[t].i = sext((uint64_t)result, 32);
break;
}
case 5: {
uint32_t shamt_mask = 0x1F;
uint32_t shamt = immsrc & shamt_mask;
uint32_t result;
if (func7 & 0x20) {
// RV64I: SRAIW
result = (int32_t)rsdata[t][0].i >> shamt;
} else {
// RV64I: SRLIW
result = (uint32_t)rsdata[t][0].i >> shamt;
}
rddata[t].i = sext((uint64_t)result, 32);
break;
}
default:
std::abort();
}
}
rd_write = true;
break;
}
case Opcode::B: {
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
bool all_taken = false;
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
bool curr_taken = false;
switch (func3) {
case 0: {
// RV32I: BEQ
if (rsdata[t][0].i == rsdata[t][1].i) {
#ifdef DEFAULT
next_pc = warp.PC + immsrc;
#endif
#ifdef GROUPS
next_pc = warp[wid].PC + immsrc;
#endif
curr_taken = true;
}
break;
}
case 1: {
// RV32I: BNE
if (rsdata[t][0].i != rsdata[t][1].i) {
#ifdef DEFAULT
next_pc = warp.PC + immsrc;
#endif
#ifdef GROUPS
next_pc = warp[wid].PC + immsrc;
#endif
curr_taken = true;
}
break;
}
case 4: {
// RV32I: BLT
if (rsdata[t][0].i < rsdata[t][1].i) {
#ifdef DEFAULT
next_pc = warp.PC + immsrc;
#endif
#ifdef GROUPS
next_pc = warp[wid].PC + immsrc;
#endif
curr_taken = true;
}
break;
}
case 5: {
// RV32I: BGE
if (rsdata[t][0].i >= rsdata[t][1].i) {
#ifdef DEFAULT
next_pc = warp.PC + immsrc;
#endif
#ifdef GROUPS
next_pc = warp[wid].PC + immsrc;
#endif
curr_taken = true;
}
break;
}
case 6: {
// RV32I: BLTU
if (rsdata[t][0].u < rsdata[t][1].u) {
#ifdef DEFAULT
next_pc = warp.PC + immsrc;
#endif
#ifdef GROUPS
next_pc = warp[wid].PC + immsrc;
#endif
curr_taken = true;
}
break;
}
case 7: {
// RV32I: BGEU
if (rsdata[t][0].u >= rsdata[t][1].u) {
#ifdef DEFAULT
next_pc = warp.PC + immsrc;
#endif
#ifdef GROUPS
next_pc = warp[wid].PC + immsrc;
#endif
curr_taken = true;
}
break;
}
default:
std::abort();
}
if (t == thread_start) {
all_taken = curr_taken;
} else {
if (all_taken != curr_taken) {
#ifdef DEFAULT
std::cout << "divergent branch! PC=0x" << std::hex << warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::flush;
#endif
#ifdef GROUPS
std::cout << "divergent branch! PC=0x" << std::hex << warp[wid].PC << " (#" << std::dec << trace->uuid << ")\n" << std::flush;
#endif
std::abort();
}
}
}
trace->fetch_stall = true;
break;
}
case Opcode::JAL: {
// RV32I: JAL
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::BRANCH;
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
rddata[t].i = next_pc;
}
#ifdef DEFAULT
next_pc = warp.PC + immsrc;
#endif
#ifdef GROUPS
next_pc = warp[wid].PC + immsrc;
#endif
trace->fetch_stall = true;
rd_write = true;
break;
}
case Opcode::JALR: {
// RV32I: JALR
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
rddata[t].i = next_pc;
}
next_pc = rsdata[thread_last][0].i + immsrc;
trace->fetch_stall = true;
rd_write = true;
break;
}
case Opcode::L:
case Opcode::FL: {
trace->fu_type = FUType::LSU;
trace->lsu_type = LsuType::LOAD;
trace->used_iregs.set(rsrc0);
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t read_data = 0;
this->dcache_read(&read_data, mem_addr, data_bytes);
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
switch (func3) {
case 0: // RV32I: LB
case 1: // RV32I: LH
rddata[t].i = sext((Word)read_data, data_width);
break;
case 2:
if (opcode == Opcode::L) {
// RV32I: LW
rddata[t].i = sext((Word)read_data, data_width);
} else {
// RV32F: FLW
rddata[t].u64 = nan_box((uint32_t)read_data);
}
break;
case 3: // RV64I: LD
// RV32D: FLD
case 4: // RV32I: LBU
case 5: // RV32I: LHU
case 6: // RV64I: LWU
rddata[t].u64 = read_data;
break;
default:
std::abort();
}
}
rd_write = true;
break;
}
case Opcode::S:
case Opcode::FS: {
trace->fu_type = FUType::LSU;
trace->lsu_type = LsuType::STORE;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
uint32_t data_bytes = 1 << (func3 & 0x3);
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
uint64_t mem_addr = rsdata[t][0].i + immsrc;
uint64_t write_data = rsdata[t][1].u64;
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
switch (func3) {
case 0:
case 1:
case 2:
case 3:
this->dcache_write(&write_data, mem_addr, data_bytes);
break;
default:
std::abort();
}
}
break;
}
case Opcode::AMO: {
trace->fu_type = FUType::LSU;
trace->lsu_type = LsuType::LOAD;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
auto trace_data = std::make_shared<LsuTraceData>(num_threads);
trace->data = trace_data;
auto amo_type = func7 >> 2;
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
uint64_t mem_addr = rsdata[t][0].u;
trace_data->mem_addrs.at(t) = {mem_addr, data_bytes};
if (amo_type == 0x02) { // LR
uint64_t read_data = 0;
this->dcache_read(&read_data, mem_addr, data_bytes);
this->dcache_amo_reserve(mem_addr);
rddata[t].i = sext((Word)read_data, data_width);
} else
if (amo_type == 0x03) { // SC
if (this->dcache_amo_check(mem_addr)) {
this->dcache_write(&rsdata[t][1].u64, mem_addr, data_bytes);
rddata[t].i = 0;
} else {
rddata[t].i = 1;
}
} else {
uint64_t read_data = 0;
this->dcache_read(&read_data, mem_addr, data_bytes);
auto read_data_i = sext((WordI)read_data, data_width);
auto rs1_data_i = sext((WordI)rsdata[t][1].u64, data_width);
auto read_data_u = zext((Word)read_data, data_width);
auto rs1_data_u = zext((Word)rsdata[t][1].u64, data_width);
uint64_t result;
switch (amo_type) {
case 0x00: // AMOADD
result = read_data_i + rs1_data_i;
break;
case 0x01: // AMOSWAP
result = rs1_data_u;
break;
case 0x04: // AMOXOR
result = read_data_u ^ rs1_data_u;
break;
case 0x08: // AMOOR
result = read_data_u | rs1_data_u;
break;
case 0x0c: // AMOAND
result = read_data_u & rs1_data_u;
break;
case 0x10: // AMOMIN
result = std::min(read_data_i, rs1_data_i);
break;
case 0x14: // AMOMAX
result = std::max(read_data_i, rs1_data_i);
break;
case 0x18: // AMOMINU
result = std::min(read_data_u, rs1_data_u);
break;
case 0x1c: // AMOMAXU
result = std::max(read_data_u, rs1_data_u);
break;
default:
std::abort();
}
this->dcache_write(&result, mem_addr, data_bytes);
rddata[t].i = read_data_i;
}
}
rd_write = true;
break;
}
case Opcode::SYS: {
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
uint32_t csr_addr = immsrc;
Word csr_value;
if (func3 == 0) {
trace->fu_type = FUType::ALU;
trace->alu_type = AluType::SYSCALL;
trace->fetch_stall = true;
switch (csr_addr) {
case 0x000: // RV32I: ECALL
case 0x001: // RV32I: EBREAK
case 0x002: // RV32I: URET
case 0x102: // RV32I: SRET
case 0x302: // RV32I: MRET
break;
default:
std::abort();
}
} else {
trace->fu_type = FUType::SFU;
trace->fetch_stall = true;
csr_value = this->get_csr(csr_addr, t,wid);
switch (func3) {
case 1: {
// RV32I: CSRRW
rddata[t].i = csr_value;
this->set_csr(csr_addr, rsdata[t][0].i, t, wid);
trace->used_iregs.set(rsrc0);
trace->sfu_type = SfuType::CSRRW;
rd_write = true;
break;
}
case 2: {
// RV32I: CSRRS
rddata[t].i = csr_value;
if (rsdata[t][0].i != 0) {
this->set_csr(csr_addr, csr_value | rsdata[t][0].i, t, wid);
}
trace->used_iregs.set(rsrc0);
trace->sfu_type = SfuType::CSRRS;
rd_write = true;
break;
}
case 3: {
// RV32I: CSRRC
rddata[t].i = csr_value;
if (rsdata[t][0].i != 0) {
this->set_csr(csr_addr, csr_value & ~rsdata[t][0].i, t, wid);
}
trace->used_iregs.set(rsrc0);
trace->sfu_type = SfuType::CSRRC;
rd_write = true;
break;
}
case 5: {
// RV32I: CSRRWI
rddata[t].i = csr_value;
this->set_csr(csr_addr, rsrc0, t, wid);
trace->sfu_type = SfuType::CSRRW;
rd_write = true;
break;
}
case 6: {
// RV32I: CSRRSI;
rddata[t].i = csr_value;
if (rsrc0 != 0) {
this->set_csr(csr_addr, csr_value | rsrc0, t, wid);
}
trace->sfu_type = SfuType::CSRRS;
rd_write = true;
break;
}
case 7: {
// RV32I: CSRRCI
rddata[t].i = csr_value;
if (rsrc0 != 0) {
this->set_csr(csr_addr, csr_value & ~rsrc0, t, wid);
}
trace->sfu_type = SfuType::CSRRC;
rd_write = true;
break;
}
default:
break;
}
}
}
break;
}
case Opcode::FENCE: {
// RV32I: FENCE
trace->fu_type = FUType::LSU;
trace->lsu_type = LsuType::FENCE;
break;
}
case Opcode::FCI: {
trace->fu_type = FUType::FPU;
for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
if (!warp.tmask.test(t))
continue;
#endif
#ifdef GROUPS
if (!warp[wid].tmask.test(t))
continue;
#endif
uint32_t frm = this->get_fpu_rm(func3, t, wid);
uint32_t fflags = 0;
switch (func7) {
case 0x00: { // RV32F: FADD.S
rddata[t].u64 = nan_box(rv_fadd_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x01: { // RV32D: FADD.D
rddata[t].u64 = rv_fadd_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x04: { // RV32F: FSUB.S
rddata[t].u64 = nan_box(rv_fsub_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x05: { // RV32D: FSUB.D
rddata[t].u64 = rv_fsub_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x08: { // RV32F: FMUL.S
rddata[t].u64 = nan_box(rv_fmul_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x09: { // RV32D: FMUL.D
rddata[t].u64 = rv_fmul_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
trace->fpu_type = FpuType::FMA;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x0c: { // RV32F: FDIV.S
rddata[t].u64 = nan_box(rv_fdiv_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), frm, &fflags));
trace->fpu_type = FpuType::FDIV;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x0d: { // RV32D: FDIV.D
rddata[t].u64 = rv_fdiv_d(rsdata[t][0].u64, rsdata[t][1].u64, frm, &fflags);
trace->fpu_type = FpuType::FDIV;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x10: {
switch (func3) {
case 0: // RV32F: FSGNJ.S
rddata[t].u64 = nan_box(rv_fsgnj_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64)));
break;
case 1: // RV32F: FSGNJN.S
rddata[t].u64 = nan_box(rv_fsgnjn_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64)));
break;
case 2: // RV32F: FSGNJX.S
rddata[t].u64 = nan_box(rv_fsgnjx_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64)));
break;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x11: {
switch (func3) {
case 0: // RV32D: FSGNJ.D
rddata[t].u64 = rv_fsgnj_d(rsdata[t][0].u64, rsdata[t][1].u64);
break;
case 1: // RV32D: FSGNJN.D
rddata[t].u64 = rv_fsgnjn_d(rsdata[t][0].u64, rsdata[t][1].u64);
break;
case 2: // RV32D: FSGNJX.D
rddata[t].u64 = rv_fsgnjx_d(rsdata[t][0].u64, rsdata[t][1].u64);
break;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x14: {
if (func3) {
// RV32F: FMAX.S
rddata[t].u64 = nan_box(rv_fmax_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags));
} else {
// RV32F: FMIN.S
rddata[t].u64 = nan_box(rv_fmin_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags));
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x15: {
if (func3) {
// RV32D: FMAX.D
rddata[t].u64 = rv_fmax_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
} else {
// RV32D: FMIN.D
rddata[t].u64 = rv_fmin_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x20: {
// RV32D: FCVT.S.D
rddata[t].u64 = nan_box(rv_dtof(rsdata[t][0].u64));
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
break;
}
case 0x21: {
// RV32D: FCVT.D.S
rddata[t].u64 = rv_ftod(check_boxing(rsdata[t][0].u64));
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
break;
}
case 0x2c: { // RV32F: FSQRT.S
rddata[t].u64 = nan_box(rv_fsqrt_s(check_boxing(rsdata[t][0].u64), frm, &fflags));
trace->fpu_type = FpuType::FSQRT;
trace->used_fregs.set(rsrc0);
break;
}
case 0x2d: { // RV32D: FSQRT.D
rddata[t].u64 = rv_fsqrt_d(rsdata[t][0].u64, frm, &fflags);
trace->fpu_type = FpuType::FSQRT;
trace->used_fregs.set(rsrc0);
break;
}
case 0x50: {
switch (func3) {
case 0:
// RV32F: FLE.S
rddata[t].i = rv_fle_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags);
break;
case 1:
// RV32F: FLT.S
rddata[t].i = rv_flt_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags);
break;
case 2:
// RV32F: FEQ.S
rddata[t].i = rv_feq_s(check_boxing(rsdata[t][0].u64), check_boxing(rsdata[t][1].u64), &fflags);
break;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x51: {
switch (func3) {
case 0:
// RV32D: FLE.D
rddata[t].i = rv_fle_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
break;
case 1:
// RV32D: FLT.D
rddata[t].i = rv_flt_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
break;
case 2:
// RV32D: FEQ.D
rddata[t].i = rv_feq_d(rsdata[t][0].u64, rsdata[t][1].u64, &fflags);
break;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
break;
}
case 0x60: {
switch (rsrc1) {
case 0:
// RV32F: FCVT.W.S
rddata[t].i = sext((uint64_t)rv_ftoi_s(check_boxing(rsdata[t][0].u64), frm, &fflags), 32);
break;
case 1:
// RV32F: FCVT.WU.S
rddata[t].i = sext((uint64_t)rv_ftou_s(check_boxing(rsdata[t][0].u64), frm, &fflags), 32);
break;
case 2:
// RV64F: FCVT.L.S
rddata[t].i = rv_ftol_s(check_boxing(rsdata[t][0].u64), frm, &fflags);
break;
case 3:
// RV64F: FCVT.LU.S
rddata[t].i = rv_ftolu_s(check_boxing(rsdata[t][0].u64), frm, &fflags);
break;
}
trace->fpu_type = FpuType::FCVT;
trace->used_fregs.set(rsrc0);
break;
}
case 0x61: {
switch (rsrc1) {
case 0:
// RV32D: FCVT.W.D
rddata[t].i = sext((uint64_t)rv_ftoi_d(rsdata[t][0].u64, frm, &fflags), 32);
break;
case 1:
// RV32D: FCVT.WU.D
rddata[t].i = sext((uint64_t)rv_ftou_d(rsdata[t][0].u64, frm, &fflags), 32);
break;
case 2:
// RV64D: FCVT.L.D
rddata[t].i = rv_ftol_d(rsdata[t][0].u64, frm, &fflags);
break;
case 3:
// RV64D: FCVT.LU.D
rddata[t].i = rv_ftolu_d(rsdata[t][0].u64, frm, &fflags);
break;
}
trace->fpu_type = FpuType::FCVT;
trace->used_fregs.set(rsrc0);
break;
}
case 0x68: {
switch (rsrc1) {
case 0:
// RV32F: FCVT.S.W
rddata[t].u64 = nan_box(rv_itof_s(rsdata[t][0].i, frm, &fflags));
break;
case 1:
// RV32F: FCVT.S.WU
rddata[t].u64 = nan_box(rv_utof_s(rsdata[t][0].i, frm, &fflags));
break;
case 2:
// RV64F: FCVT.S.L
rddata[t].u64 = nan_box(rv_ltof_s(rsdata[t][0].i, frm, &fflags));
break;
case 3:
// RV64F: FCVT.S.LU
rddata[t].u64 = nan_box(rv_lutof_s(rsdata[t][0].i, frm, &fflags));
break;
}
trace->fpu_type = FpuType::FCVT;
trace->used_iregs.set(rsrc0);
break;
}
case 0x69: {
switch (rsrc1) {
case 0:
// RV32D: FCVT.D.W
rddata[t].u64 = rv_itof_d(rsdata[t][0].i, frm, &fflags);
break;
case 1:
// RV32D: FCVT.D.WU
rddata[t].u64 = rv_utof_d(rsdata[t][0].i, frm, &fflags);
break;
case 2:
// RV64D: FCVT.D.L
rddata[t].u64 = rv_ltof_d(rsdata[t][0].i, frm, &fflags);
break;
case 3:
// RV64D: FCVT.D.LU
rddata[t].u64 = rv_lutof_d(rsdata[t][0].i, frm, &fflags);
break;
}
trace->fpu_type = FpuType::FCVT;
trace->used_iregs.set(rsrc0);
break;
}
case 0x70: {
if (func3) {
// RV32F: FCLASS.S
rddata[t].i = rv_fclss_s(check_boxing(rsdata[t][0].u64));
} else {
// RV32F: FMV.X.S
uint32_t result = (uint32_t)rsdata[t][0].u64;
rddata[t].i = sext((uint64_t)result, 32);
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
break;
}
case 0x71: {
if (func3) {
// RV32D: FCLASS.D
rddata[t].i = rv_fclss_d(rsdata[t][0].u64);
} else {
// RV64D: FMV.X.D
rddata[t].i = rsdata[t][0].u64;
}
trace->fpu_type = FpuType::FNCP;
trace->used_fregs.set(rsrc0);
break;
}
case 0x78: { // RV32F: FMV.S.X
rddata[t].u64 = nan_box((uint32_t)rsdata[t][0].i);
trace->fpu_type = FpuType::FNCP;
trace->used_iregs.set(rsrc0);
break;
}
case 0x79: { // RV64D: FMV.D.X
rddata[t].u64 = rsdata[t][0].i;
trace->fpu_type = FpuType::FNCP;
trace->used_iregs.set(rsrc0);
break;
}
}
this->update_fcrs(fflags, t, wid);
}
rd_write = true;
break;
}
case Opcode::FMADD:
case Opcode::FMSUB:
case Opcode::FMNMADD:
case Opcode::FMNMSUB: {
  // Fused multiply-add family: three FP sources (rsrc0..rsrc2), one FP
  // destination. A non-zero func2 selects the double-precision variant,
  // zero selects single precision.
  trace->fpu_type = FpuType::FMA;
  trace->used_fregs.set(rsrc0);
  trace->used_fregs.set(rsrc1);
  trace->used_fregs.set(rsrc2);
  for (uint32_t t = thread_start; t < num_threads; ++t) {
    // inactive threads are skipped entirely
#ifdef DEFAULT
    if (!warp.tmask.test(t))
      continue;
#endif
#ifdef GROUPS
    if (!warp[wid].tmask.test(t))
      continue;
#endif
    uint32_t frm = this->get_fpu_rm(func3, t, wid);
    uint32_t fflags = 0;
    auto &src = rsdata[t];
    if (func2) {
      // double precision: the raw 64-bit register contents are the operands
      const auto a = src[0].u64;
      const auto b = src[1].u64;
      const auto c = src[2].u64;
      switch (opcode) {
      case Opcode::FMADD:
        // RV32D: FMADD.D
        rddata[t].u64 = rv_fmadd_d(a, b, c, frm, &fflags);
        break;
      case Opcode::FMSUB:
        // RV32D: FMSUB.D
        rddata[t].u64 = rv_fmsub_d(a, b, c, frm, &fflags);
        break;
      case Opcode::FMNMADD:
        // RV32D: FNMADD.D
        rddata[t].u64 = rv_fnmadd_d(a, b, c, frm, &fflags);
        break;
      case Opcode::FMNMSUB:
        // RV32D: FNMSUB.D
        rddata[t].u64 = rv_fnmsub_d(a, b, c, frm, &fflags);
        break;
      default:
        break;
      }
    } else {
      // single precision: each operand goes through check_boxing() first
      // and the 32-bit result is NaN-boxed before it is stored
      const auto a = check_boxing(src[0].u64);
      const auto b = check_boxing(src[1].u64);
      const auto c = check_boxing(src[2].u64);
      switch (opcode) {
      case Opcode::FMADD:
        // RV32F: FMADD.S
        rddata[t].u64 = nan_box(rv_fmadd_s(a, b, c, frm, &fflags));
        break;
      case Opcode::FMSUB:
        // RV32F: FMSUB.S
        rddata[t].u64 = nan_box(rv_fmsub_s(a, b, c, frm, &fflags));
        break;
      case Opcode::FMNMADD:
        // RV32F: FNMADD.S
        rddata[t].u64 = nan_box(rv_fnmadd_s(a, b, c, frm, &fflags));
        break;
      case Opcode::FMNMSUB:
        // RV32F: FNMSUB.S
        rddata[t].u64 = nan_box(rv_fnmsub_s(a, b, c, frm, &fflags));
        break;
      default:
        break;
      }
    }
    // hand the flags raised by this thread's operation to update_fcrs()
    this->update_fcrs(fflags, t, wid);
  }
  rd_write = true;
  break;
}
case Opcode::EXT1: {
switch (func7) {
case 0: {
switch (func3) {
case 0: {
  // TMC: rebuild the next thread mask from the first source operand of
  // the thread indexed by thread_last, one bit per thread.
  trace->fu_type = FUType::SFU;
  trace->sfu_type = SfuType::TMC;
  trace->used_iregs.set(rsrc0);
  trace->fetch_stall = true;
  next_tmask.reset();
  uint32_t lane = 0;
  while (lane < num_threads) {
    const bool enable = (rsdata.at(thread_last)[0].i & (1 << lane)) != 0;
    next_tmask.set(lane, enable);
    ++lane;
  }
} break;
case 1: {
  // WSPAWN: only records the request; both source operands of the thread
  // indexed by thread_last are packed into the trace's SFU payload.
  trace->fu_type = FUType::SFU;
  trace->sfu_type = SfuType::WSPAWN;
  trace->used_iregs.set(rsrc0);
  trace->used_iregs.set(rsrc1);
  trace->fetch_stall = true;
  auto &spawn_args = rsdata.at(thread_last);
  trace->data = std::make_shared<SFUTraceData>(spawn_args[0].i, spawn_args[1].i);
} break;
case 2: {
  // SPLIT: partition the active threads by bit 0 of rsrc0 (optionally
  // inverted by bit 0 of rsrc2). On divergence the larger group keeps
  // running and two entries are pushed on the IPDOM stack. The stack depth
  // observed before any push is returned in rd.
  trace->fu_type = FUType::SFU;
  trace->sfu_type = SfuType::SPLIT;
  trace->used_iregs.set(rsrc0);
  trace->fetch_stall = true;
#ifdef DEFAULT
  auto &split_warp = warp;
#endif
#ifdef GROUPS
  auto &split_warp = warp[wid];
#endif
  auto stack_size = split_warp.ipdom_stack.size();
  ThreadMask then_tmask, else_tmask;
  auto not_pred = rsrc2 & 0x1;
  for (uint32_t t = 0; t < num_threads; ++t) {
    // GROUPS mode reads thread t's register from the warp that hosts it
#ifdef DEFAULT
    auto pred_bit = warp.ireg_file.at(t).at(rsrc0) & 0x1;
#endif
#ifdef GROUPS
    auto pred_bit = warp[wid + t/THREAD_PER_TILE].ireg_file.at(t%THREAD_PER_TILE).at(rsrc0) & 0x1;
#endif
    auto cond = pred_bit ^ not_pred;
    const bool is_active = split_warp.tmask.test(t);
    then_tmask[t] = is_active && cond;
    else_tmask[t] = is_active && !cond;
  }
  const bool is_divergent = then_tmask.any() && else_tmask.any();
  if (is_divergent) {
    if (stack_size == arch_.ipdom_size()) {
      std::cout << "IPDOM stack is full! size=" << std::dec << stack_size << ", PC=0x" << std::hex << split_warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::flush;
      std::abort();
    }
    // keep executing the larger of the two groups (ties go to "then")
    next_tmask = (then_tmask.count() >= else_tmask.count()) ? then_tmask : else_tmask;
    // first entry: the reconvergence thread mask
    split_warp.ipdom_stack.emplace(split_warp.tmask);
    // second entry: the not-taken threads together with the PC they resume at
    auto ntaken_tmask = ~next_tmask & split_warp.tmask;
    split_warp.ipdom_stack.emplace(ntaken_tmask, next_pc);
  }
  // every thread from thread_start on receives the pre-split stack depth
  for (uint32_t t = thread_start; t < num_threads; ++t) {
    rddata[t].i = stack_size;
  }
  rd_write = true;
} break;
case 3: {
  // JOIN: rsrc0 of the thread indexed by thread_last holds a stack depth.
  // When it differs from the current IPDOM depth, the top entry is popped:
  // its thread mask becomes the next mask and, unless the entry is marked
  // fallthrough, execution continues at the entry's PC.
  trace->fu_type = FUType::SFU;
  trace->sfu_type = SfuType::JOIN;
  trace->used_iregs.set(rsrc0);
  trace->fetch_stall = true;
#ifdef DEFAULT
  auto &join_warp = warp;
  auto stack_ptr = warp.ireg_file.at(thread_last).at(rsrc0);
#endif
#ifdef GROUPS
  auto &join_warp = warp[wid];
  auto stack_ptr = warp[wid + thread_last/THREAD_PER_TILE].ireg_file.at(thread_last%THREAD_PER_TILE).at(rsrc0);
#endif
  auto &ipdom = join_warp.ipdom_stack;
  if (stack_ptr != ipdom.size()) {
    if (ipdom.empty()) {
      std::cout << "IPDOM stack is empty!\n" << std::flush;
      std::abort();
    }
    const auto &entry = ipdom.top();
    next_tmask = entry.tmask;
    if (!entry.fallthrough) {
      next_pc = entry.PC;
      DP(3, " next PC: " << std::hex << next_pc);
    }
    // entry is not referenced past this point
    ipdom.pop();
  }
} break;
case 4: {
  // BAR: only records the request; both source operands of the thread
  // indexed by thread_last are packed into the trace's SFU payload.
  trace->fu_type = FUType::SFU;
  trace->sfu_type = SfuType::BAR;
  trace->used_iregs.set(rsrc0);
  trace->used_iregs.set(rsrc1);
  trace->fetch_stall = true;
  auto &bar_args = rsdata[thread_last];
  trace->data = std::make_shared<SFUTraceData>(bar_args[0].i, bar_args[1].i);
} break;
case 5: {
  // PRED: narrow the thread mask to the active threads whose rsrc0 bit 0
  // (optionally inverted by bit 0 of rdest) is set. If no thread
  // qualifies, the mask is loaded from rsrc1 of the thread indexed by
  // thread_last instead.
  trace->fu_type = FUType::SFU;
  trace->sfu_type = SfuType::PRED;
  trace->used_iregs.set(rsrc0);
  trace->used_iregs.set(rsrc1);
  trace->fetch_stall = true;
  ThreadMask pred;
  auto not_pred = rdest & 0x1;
  for (uint32_t t = 0; t < num_threads; ++t) {
#ifdef DEFAULT
    auto pred_bit = warp.ireg_file.at(t).at(rsrc0) & 0x1;
    const bool is_active = warp.tmask.test(t);
#endif
#ifdef GROUPS
    auto pred_bit = warp[wid + t/THREAD_PER_TILE].ireg_file.at(t%THREAD_PER_TILE).at(rsrc0) & 0x1;
    const bool is_active = warp[wid].tmask.test(t);
#endif
    auto cond = pred_bit ^ not_pred;
    pred[t] = is_active && cond;
  }
  if (!pred.any()) {
    // nobody passed the predicate: take the mask from rsrc1
#ifdef DEFAULT
    next_tmask = warp.ireg_file.at(thread_last).at(rsrc1);
#endif
#ifdef GROUPS
    next_tmask = warp[wid + thread_last/THREAD_PER_TILE].ireg_file.at(thread_last%THREAD_PER_TILE).at(rsrc1);
#endif
  } else {
    next_tmask &= pred;
  }
} break;
default:
std::abort();
}
} break;
default:
std::abort();
}
} break;
// case Opcode::EXT2: {
// switch (func3) {
// case 1:
// switch (func2) {
// case 0: { // CMOV
// trace->fu_type = FUType::SFU;
// trace->sfu_type = SfuType::CMOV;
// trace->used_iregs.set(rsrc0);
// trace->used_iregs.set(rsrc1);
// trace->used_iregs.set(rsrc2);
// for (uint32_t t = thread_start; t < num_threads; ++t) {
// #ifdef DEFAULT
// if (!warp.tmask.test(t))
// continue;
// #endif
// #ifdef GROUPS
// if (!warp[wid].tmask.test(t))
// continue;
// #endif
// rddata[t].i = rsdata[t][0].i ? rsdata[t][1].i : rsdata[t][2].i;
// }
// rd_write = true;
// } break;
// default:
// std::abort();
// }
// break;
// default:
// std::abort();
// }
// } break;
case Opcode::VOTE: {
  // Warp vote. func3 >= 4 requests the negated predicate; func3 % 4 picks
  // the mode: 0 = all, 1 = any, 2 = uni, 3 = ballot. The member mask is
  // read from thread 0's integer register selected by the low 12 bits of
  // immsrc. The result is written to every thread from thread_start on.
  // NOTE(review): `address` may be as large as 0xfff but is used as a
  // register index without a range check - confirm the encoding keeps it
  // within the register file.
  bool check;
  bool is_neg = (func3 >= 4);
  func3 = func3%4;
  trace->fu_type = FUType::ALU;
  trace->alu_type = AluType::ARITH;
  uint32_t address = immsrc & 0xfff;
#ifdef DEFAULT
  auto mask = warp.ireg_file.at(0)[address]; // Same mask stored in all threads
  const auto &vote_tmask = warp.tmask;
#endif
#ifdef GROUPS
  auto mask = warp[wid].ireg_file.at(0)[address];
  const auto &vote_tmask = warp[wid].tmask;
#endif
  trace->used_iregs.set(rsrc0);
  trace->used_iregs.set(address);
  // a thread takes part when it is in the member mask and currently active
  auto takes_part = [&](uint32_t t) -> bool {
    return (1 << t & mask) && vote_tmask.test(t);
  };
  switch (func3) {
  case 0:{ //all
    // plain: true iff every participant has bit 0 set;
    // negated: true iff no participant has bit 0 set
    check = true;
    for (uint32_t t = thread_start; t < num_threads; ++t) {
      if (!takes_part(t))
        continue;
      const bool pred_set = (rsdata[t][0].u & 1) != 0;
      if (pred_set == is_neg)
        check = false;
    }
    rd_write = true;
    for (uint32_t t = thread_start; t < num_threads; ++t) {
      rddata[t].i = check ? 1 : 0;
    }
  } break;
  case 1:{ //any
    // plain: true iff some participant has bit 0 set;
    // negated: true iff some participant has bit 0 clear
    check = false;
    for (uint32_t t = thread_start; t < num_threads; ++t) {
      if (!takes_part(t))
        continue;
      const bool pred_set = (rsdata[t][0].u & 1) != 0;
      if (pred_set != is_neg)
        check = true;
    }
    rd_write = true;
    for (uint32_t t = thread_start; t < num_threads; ++t) {
      rddata[t].i = check ? 1 : 0;
    }
  } break;
  case 2:{ //uni
    // true iff all participants agree on bit 0 (is_neg is not consulted)
    check = true;
    bool have_ref = false;
    auto ref_bit = rsdata[0][0].u%2; // placeholder until the first participant is seen
    for (uint32_t t = thread_start; t < num_threads; ++t) {
      if (!takes_part(t))
        continue;
      const auto bit = rsdata[t][0].u%2;
      if (!have_ref) {
        have_ref = true;
        ref_bit = bit;
      } else if (ref_bit != bit) {
        check = false;
      }
    }
    rd_write = true;
    for (uint32_t t = thread_start; t < num_threads; ++t) {
      rddata[t].i = check ? 1 : 0;
    }
  } break;
  case 3:{ //ballot
    // Shift in one bit per thread: the predicate bit for participants,
    // 0 for everyone else.
    // NOTE(review): because the accumulator is shifted left each step,
    // the lowest-numbered thread lands in the most significant produced
    // bit; is_neg is not consulted here - confirm both are intended.
    auto val = rsdata[0][0].u*0; // zero of the register's type
    for (uint32_t t = thread_start; t < num_threads; ++t) {
      val = (val << 1);
      if (takes_part(t))
        val += (1 << 0 & rsdata[t][0].u);
    }
    rd_write = true;
    for (uint32_t t = thread_start; t < num_threads; ++t) {
      rddata[t].i = val;
    }
  } break;
  default:{
    std::abort();
  } break;
  }
}break;
case Opcode::SHFL:{
  // Warp shuffle. immsrc packs three fields:
  //   bits [4:0]   register holding the member mask (read from thread 0)
  //   bits [9:5]   b, the lane delta / index
  //   bits [11:10] offset added to the mask register number to locate the
  //                per-thread control register (clamp value and segment mask)
  // func3 picks the mode: 0 = up, 1 = down, 2 = bfly, 3 = idx.
  // NOTE(review): c_add can exceed 31 when the offset is non-zero and the
  // mask register number is high; it is used as a register index without a
  // range check - confirm the encoding rules this out.
  trace->fu_type = FUType::ALU;
  trace->alu_type = AluType::ARITH;
  uint32_t address = immsrc & 0x01f;
#ifdef DEFAULT
  auto mask = warp.ireg_file.at(0)[address]; // Same mask stored in all threads
#endif
#ifdef GROUPS
  auto mask = warp[wid].ireg_file.at(0)[address];
#endif
  uint32_t b = (immsrc & 0x3e0) >> 5;
  uint32_t c_add = ((immsrc & 0xc00) >> 10) + address;
  uint32_t lane;
  bool p;
  for (uint32_t t = thread_start; t < num_threads; ++t) {
#ifdef DEFAULT
    auto val = warp.ireg_file.at(t)[c_add];
#endif
#ifdef GROUPS
    auto val = warp[wid + t/THREAD_PER_TILE].ireg_file.at(t%THREAD_PER_TILE)[c_add];
#endif
    auto c = val & 0x0000001f;
    auto segmask = ((val >> 5) & 0x0000001f);
    auto maxLane = (t & segmask) | (c & ~segmask);
    auto minLane = (t & segmask);
    switch (func3) {
    case 0:{ //up
      lane = t - b; // may wrap around when b > t (unsigned arithmetic)
      p = (lane >= maxLane);
    }break;
    case 1:{ //down
      lane = t + b;
      p = (lane <= maxLane);
    }break;
    case 2:{ //bfly
      lane = t ^ b;
      p = (lane <= maxLane);
    }break;
    case 3:{ //idx
      lane = minLane | (b & ~segmask);
      p = (lane <= maxLane);
    }break;
    default:{
      std::abort();
    } break;
    }
    // source lane rejected by the segment check: fall back to own lane
    if(!p)
      lane = t;
    rd_write = true;
    // The range check must come BEFORE `1 << lane` is evaluated: a wrapped
    // "up" lane (or any lane >= num_threads) would otherwise be used as a
    // shift count larger than the operand width, which is undefined
    // behaviour. Out-of-range lanes keep the thread's own value.
    if (lane >= num_threads) {
      rddata[t].i = rsdata[t][0].u;
      continue;
    }
#ifdef DEFAULT
    const bool can_read = (1 << t & mask) && warp.tmask.test(t) && (1 << lane & mask);
#endif
#ifdef GROUPS
    const bool can_read = (1 << t & mask) && warp[wid].tmask.test(t) && (1 << lane & mask);
#endif
    if (can_read) {
      // both this thread and the source lane are members, and this thread is active
      rddata[t].i = rsdata[lane][0].u;
    } else {
      rddata[t].i = 0;
    }
  }
  trace->used_iregs.set(rsrc0);
  trace->used_iregs.set(address);
  trace->used_iregs.set(c_add);
}break;
#ifdef GROUPS
case Opcode::TILE:{
  // TILE (GROUPS builds only): only records the request; the two source
  // operands of the thread indexed by thread_last (tile mask, thread
  // count) are packed into the trace's SFU payload.
  trace->fu_type = FUType::SFU;
  trace->sfu_type = SfuType::TILE;
  trace->fetch_stall = true;
  trace->used_iregs.set(rsrc0);
  trace->used_iregs.set(rsrc1);
  auto tile_mask = rsdata.at(thread_last)[0].i;
  auto thread_count = rsdata.at(thread_last)[1].i;
  trace->data = std::make_shared<SFUTraceData>(tile_mask, thread_count);
}
break;
#endif
default:
std::abort();
}
if (rd_write) {
  // Commit rddata[] to the destination register of every active thread
  // and log the written values. Integer writes to register 0 are dropped.
  trace->wb = true;
  const auto rd_type = instr.getRDType();
#ifdef DEFAULT
  const auto &wb_tmask = warp.tmask;
#endif
#ifdef GROUPS
  const auto &wb_tmask = warp[wid].tmask;
#endif
  switch (rd_type) {
  case RegType::Integer:
    if (!rdest) {
      // disable writes to x0
      trace->wb = false;
      break;
    }
    DPH(2, "Dest Reg: " << rd_type << std::dec << rdest << "={");
    for (uint32_t t = 0; t < num_threads; ++t) {
      if (t) DPN(2, ", ");
      if (!wb_tmask.test(t)) {
        // inactive thread: print a placeholder, leave its register alone
        DPN(2, "-");
        continue;
      }
      // GROUPS mode stores thread t's register in the warp that hosts it
#ifdef DEFAULT
      warp.ireg_file.at(t)[rdest] = rddata[t].i;
#endif
#ifdef GROUPS
      warp[wid + t/THREAD_PER_TILE].ireg_file.at(t%THREAD_PER_TILE)[rdest] = rddata[t].i;
#endif
      DPN(2, "0x" << std::hex << rddata[t].i);
    }
    DPN(2, "}" << std::endl);
    trace->used_iregs[rdest] = 1;
    assert(rdest != 0);
    break;
  case RegType::Float:
    DPH(2, "Dest Reg: " << rd_type << std::dec << rdest << "={");
    for (uint32_t t = 0; t < num_threads; ++t) {
      if (t) DPN(2, ", ");
      if (!wb_tmask.test(t)) {
        DPN(2, "-");
        continue;
      }
#ifdef DEFAULT
      warp.freg_file.at(t)[rdest] = rddata[t].u64;
#endif
#ifdef GROUPS
      warp[wid + t/THREAD_PER_TILE].freg_file.at(t%THREAD_PER_TILE)[rdest] = rddata[t].u64;
#endif
      DPN(2, "0x" << std::hex << rddata[t].f);
    }
    DPN(2, "}" << std::endl);
    trace->used_fregs[rdest] = 1;
    break;
  default:
    std::abort();
    break;
  }
}
#ifdef DEFAULT
warp.PC += 4;
#endif
#ifdef GROUPS
warp[wid].PC += 4;
#endif
#ifdef DEFAULT
if (warp.PC != next_pc) {
DP(3, "*** Next PC=0x" << std::hex << next_pc << std::dec);
warp.PC = next_pc;
}
#endif
#ifdef GROUPS
if (warp[wid].PC != next_pc) {
DP(3, "*** Next PC=0x" << std::hex << next_pc << std::dec);
warp[wid].PC = next_pc;
}
warp[0].PC = warp[wid].PC;
#endif
#ifdef DEFAULT
if (warp.tmask != next_tmask) {
DPH(3, "*** New Tmask=");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, next_tmask.test(i));
DPN(3, std::endl);
warp.tmask = next_tmask;
if (!next_tmask.any()) {
active_warps_.reset(wid);
}
}
#endif
#ifdef GROUPS
if (warp[wid].tmask != next_tmask) {
DPH(3, "*** New Tmask=");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, next_tmask.test(i));
DPN(3, std::endl);
warp[wid].tmask = next_tmask;
if (!next_tmask.any()) {
active_warps_.reset(wid);
}
}
#endif
}