Patched bugs

This commit is contained in:
Rishabh Ravi 2024-11-01 09:26:30 -04:00
parent 1c8ee23608
commit df04d642a4
8 changed files with 268 additions and 103 deletions

View file

@ -105,13 +105,14 @@
// Custom extension opcodes
`define INST_EXT1 7'b0001011 // 0x0B
`define INST_EXT2 7'b0101011 // 0x2B
`define INST_EXT3 7'b1011011 // 0x5B
`define INST_EXT4 7'b1111011 // 0x7B
// `define INST_EXT2 7'b0101011 // 0x2B
// `define INST_EXT3 7'b1011011 // 0x5B
// `define INST_EXT4 7'b1111011 // 0x7B
// CUDA Vote Extension
`define INST_VOTE 7'b1011010 // 0x5a
`define INST_SHFL 7'b1011100 // 0x5c
`define INST_VOTE 7'b0101011 // 0x2B
`define INST_SHFL 7'b1011011 // 0x5B
`define INST_TILE 7'b1111011 // 0x7B
`define VOTE_ALL 4'b0000
`define VOTE_ANY 4'b0001

View file

@ -30,8 +30,8 @@
using namespace vortex;
#define DEFAULT
// #define GROUPS
// #define DEFAULT
#define GROUPS
Emulator::ipdom_entry_t::ipdom_entry_t(const ThreadMask &tmask, Word PC)
: tmask(tmask)
@ -254,13 +254,11 @@ int Emulator::get_exitcode() const {
void Emulator::suspend(uint32_t wid) {
assert(!stalled_warps_.test(wid));
DT(3, "STALLING WARP"<<wid);
stalled_warps_.set(wid);
}
void Emulator::resume(uint32_t wid) {
if (wid != 0xffffffff) {
DT(3, "RESUMING WARP"<<wid);
assert(stalled_warps_.test(wid));
stalled_warps_.reset(wid);
} else {
@ -280,7 +278,7 @@ bool Emulator::wspawn(uint32_t num_warps, Word nextPC) {
bool Emulator::tileMask(uint32_t tile_mask, uint32_t thread_count){
int wid = 0;
bool reset = (tile_mask >> 31);
bool reset = ~(tile_mask >> 31);
for(int i = MAX_NUMBER_TILES - 1 ; i >= 0 ; i--){
auto mask = (tile_mask >> i) & 0x01;
if(reset){
@ -292,7 +290,10 @@ bool Emulator::tileMask(uint32_t tile_mask, uint32_t thread_count){
warps_[wid].isActive = mask;
}
warps_[wid].PC = warps_[0].PC;
warps_[wid].tmask.set();
warps_[wid].tmask.reset();
for (int j = 0; j < (int)thread_count; j++){
warps_[wid].tmask[j] = 1;
}
warps_[wid].num_tThreads = thread_count;
}
}
@ -339,17 +340,15 @@ bool Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
if (count < 2)
return true;
int num_groups = 0;
uint32_t bar_idx = bar_id & 0x7fffffff;
auto& barrier = barriers_.at(bar_idx);
if (warps_[wid].isActive) {
barrier.set(wid);
DP(3, "*** Suspend core #" << core_->id() << ", warp #" << wid << " at barrier #" << bar_idx);
}
DP(3, "*** Suspend core #" << core_->id() << ", warp #" << wid << " at barrier #" << bar_idx);
if (barrier.count() == (size_t)count) {
// resume suspended warps
for (uint32_t i = 0; i < MAX_NUMBER_TILES; ++i) {

View file

@ -85,12 +85,7 @@ private:
struct wspawn_t {
bool valid;
uint32_t num_warps;
uint32_t issuing_wid;
uint32_t final_wid;
uint32_t set_numTiles;
uint32_t prev_numTiles;
Word nextPC;
bool isTile;
};
std::shared_ptr<Instr> decode(uint32_t code) const;

View file

@ -27,8 +27,8 @@
#include "instr.h"
#include "core.h"
#define DEFAULT
// #define GROUPS
// #define DEFAULT
#define GROUPS
using namespace vortex;
@ -70,12 +70,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
// initialize instruction trace
trace->cid = core_->id();
trace->wid = 0;
#ifdef DEFAULT
trace->wid = wid;
trace->PC = warp.PC;
trace->tmask = warp.tmask;
#endif
#ifdef GROUPS
trace->wid = 0;
trace->PC = warp[wid].PC;
trace->tmask = warp[wid].tmask;
#endif
@ -157,7 +159,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
rsdata[t][i].u = warp.ireg_file.at(t)[reg];
#endif
#ifdef GROUPS
rsdata[t][i].u = warp[wid + (int)(t/THREAD_PER_TILE)].ireg_file.at(t%THREAD_PER_TILE)[reg];
rsdata[t][i].u = warp[wid + t/THREAD_PER_TILE].ireg_file.at(t%THREAD_PER_TILE)[reg];
#endif
DPN(2, "0x" << std::hex << rsdata[t][i].i);
}
@ -183,7 +185,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
rsdata[t][i].u64 = warp.freg_file.at(t)[reg];
#endif
#ifdef GROUPS
rsdata[t][i].u64 = warp[wid + (int)(t/THREAD_PER_TILE)].freg_file.at(t%THREAD_PER_TILE)[reg];
rsdata[t][i].u64 = warp[wid + t/THREAD_PER_TILE].freg_file.at(t%THREAD_PER_TILE)[reg];
#endif
DPN(2, "0x" << std::hex << rsdata[t][i].f);
}
@ -1534,8 +1536,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
#endif
#ifdef GROUPS
auto cond = (warp[wid + t/THREAD_PER_TILE].ireg_file.at(t%THREAD_PER_TILE).at(rsrc0) & 0x1) ^ not_pred;
then_tmask[t] = warp[wid + t/THREAD_PER_TILE].tmask.test(t%THREAD_PER_TILE) && cond;
else_tmask[t] = warp[wid + t/THREAD_PER_TILE].tmask.test(t%THREAD_PER_TILE) && !cond;
then_tmask[t] = warp[wid].tmask.test(t) && cond;
else_tmask[t] = warp[wid].tmask.test(t) && !cond;
#endif
}
@ -1601,7 +1603,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
}
#endif
#ifdef GROUPS
auto stack_ptr = warp[thread_last/THREAD_PER_TILE].ireg_file.at(thread_last%THREAD_PER_TILE).at(rsrc0);
auto stack_ptr = warp[wid + thread_last/THREAD_PER_TILE].ireg_file.at(thread_last%THREAD_PER_TILE).at(rsrc0);
if (stack_ptr != warp[wid].ipdom_stack.size()) {
if (warp[wid].ipdom_stack.empty()) {
std::cout << "IPDOM stack is empty!\n" << std::flush;
@ -1651,7 +1653,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
next_tmask = warp.ireg_file.at(thread_last).at(rsrc1);
#endif
#ifdef GROUPS
next_tmask = warp[thread_last/THREAD_PER_TILE].ireg_file.at(thread_last%THREAD_PER_TILE).at(rsrc1);
next_tmask = warp[wid + thread_last/THREAD_PER_TILE].ireg_file.at(thread_last%THREAD_PER_TILE).at(rsrc1);
#endif
}
} break;

View file

@ -22,8 +22,8 @@
#include "constants.h"
#include "cache_sim.h"
// #define GROUPS
#define DEFAULT
#define GROUPS
// #define DEFAULT
using namespace vortex;
@ -288,8 +288,12 @@ void SfuUnit::tick() {
#endif
#ifdef GROUPS
std::bitset<32> mask = trace_data->arg2;
int count = mask.count();
DT(1, "MASK, COUNT" << mask << ", " << count<< "," <<MAX_NUMBER_TILES - 1 - 4);
for (size_t warp_id = 0, nw = MAX_NUMBER_TILES; warp_id < nw; ++warp_id) {
release_warp |= core_->barrier(trace_data->arg1, trace_data->arg2, warp_id);
if(mask.test(MAX_NUMBER_TILES - 1 - warp_id))
release_warp &= core_->barrier(trace_data->arg1, count, warp_id);
}
#endif
}
@ -310,6 +314,7 @@ void SfuUnit::tick() {
DT(3, "pipeline-execute: op=" << trace->sfu_type << ", " << *trace);
if (trace->eop && release_warp) {
DT(3, "pipeline-ssfdsdssdsd: op=" << trace->sfu_type << ", " << *trace);
core_->resume(trace->wid);
}

View file

@ -2,69 +2,249 @@
#include "common.h"
#include <vx_intrinsics.h>
void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = blockIdx.x * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
}
//CASE 1
// vx_tile(2147483776,4);
// vx_tile(2147483656,8);
// vx_tmc(1);
// int val = 14;
// vx_store(val,3);
// vx_tmc(2);
// val = 5;
// vx_store(val,3);
// vx_tmc(4);
// val =45;
// vx_store(val,3);
// vx_tmc(8);
// val = 120;
// vx_store(val,3);
// vx_tmc(15);
// vx_vote();
// vx_tile(2147483776,32);
//CASE 0
// int i = vx_thread_id();
// vx_printf("Thread %d launched\n", i);
// vx_barrier(0,0b10000000);
// vx_printf("Thread %d synchronized\n", i);
// vx_barrier(0,0b10000000);
// vx_printf("Thread %d synchronized again\n", i);
//CASE 2
// vx_store(1,1);
// vx_vote();
// vx_shfl();
// //CASE 1
// int i = vx_thread_id();
// vx_printf("A (thread %d)\n", i);
// vx_tile(0b10001000,16);
// vx_printf("B (thread %d)\n", i);
// vx_barrier(0,0b10001000); //2 groups
// vx_printf("C (thread %d)\n", i);
// vx_barrier(0,0b10001000);
// vx_printf("D (thread %d)\n", i);
// vx_tile(136,16);
// vx_store(2,2);
// vx_vote();
// vx_shfl();
// vx_tile(0b10000000,32); // default config
// vx_tile(2147483776,16);
// vx_vote();
// vx_shfl();
// //CASE 2
// int i = vx_thread_id();
// vx_printf("A (thread %d)\n", i);
// vx_tile(136,16);
// vx_store(3,3);
// vx_vote();
// vx_shfl();
// vx_tile(0b10001000,16);
// vx_tile(10,8);
// vx_store(4,1);
// vx_vote();
// vx_shfl();
// vx_tile(0b00001000,16);
// {
// vx_printf("B (thread %d)\n", i);
// vx_barrier(0,0b00001000);
// vx_printf("C (thread %d)\n", i);
// }
// vx_tile(0b10000000,16);
// {
// vx_printf("D (thread %d)\n", i);
// }
// vx_tile(240,4);
// vx_store(5,2);
// vx_vote();
// vx_shfl();
// vx_tile(0b10001000,16);
// if(i%16 == 0)
// {
// vx_printf("E (thread %d)\n", i);
// }
// vx_tile(2147483776,32);
// vx_tile(0b10000000,32); //default config
//CASE 3
// vx_tile(2147483776,4);
vx_barrier(3,1);
// //CASE 3
// int i = vx_thread_id();
// // Initial thread output
// vx_printf("A (thread %d)\n", i);
// vx_tile(0b11111111,4);
// vx_tile(0b11110000,4);{
// vx_printf("B (thread %d)\n", i);
// vx_tile(0b11000000,4); {
// vx_barrier(0,0b11000000);
// vx_printf("C (thread %d)\n", i);
// vx_tile(0b10000000,4); {
// vx_barrier(0,0b10000000);
// vx_printf("D (thread %d)\n", i);
// }
// }
// vx_tile(0b00110000,4); {
// vx_printf("F (thread %d)\n", i);
// vx_barrier(0,0b00110000);
// }
// }
// vx_tile(0b10000000,32); //default config
// //CASE 4
// int i = vx_thread_id();
// int j = i + 1;
// vx_printf("A (thread %d)\n", i);
// vx_tile(0b10001000,16);
// if (i % 2 == 1)
// {
// vx_printf("B (thread %d)\n", i * 2);
// }
// else
// {
// vx_printf("B (thread %d)\n", i);
// }
// vx_barrier(0,0b10001000);
// vx_printf("C (thread %d)\n", i + j);
// vx_tile(0b10000000,32); //default config
// //CASE 5
// int i = vx_thread_id();
// int j = i + 1;
// vx_printf("A (thread %d)\n", i);
// vx_tile(0b10000000,32);
// vx_printf("B (thread %d, i = %d, j = %d)\n", vx_thread_id(), i, j);
// vx_barrier(0,0b10000000);
// vx_tile(0b10000000,32); //default config
// //CASE 6
// vx_printf("A (thread %d)\n", threadIdx.x);
// vx_tile(0b10001000,16);
// for (int i = 0; i < 5; i++) {
// vx_printf("C (thread %d): iteration %d\n", vx_thread_id(), i);
// }
// vx_tile(0b10000000,32); //default config
// //CASE 7
// int i = vx_thread_id();
// vx_printf("A (thread %d)\n", i);
// vx_tile(0b10001000,16);
// vx_tile(0b10000000,16); {
// vx_printf("B (thread %d)\n", i);
// vx_barrier(0,0b10000000);
// vx_printf("C (thread %d)\n", i);
// }
// vx_tile(0b00001000,16);
// {
// vx_printf("D (thread %d)\n", i);
// }
// vx_tile(0b10001000,16);
// if (vx_thread_id() < 4) {
// vx_printf("E (thread %d)\n", i);
// }
// vx_tile(0b10000000,32); //default config
// //CASE 8
// int tid = vx_thread_id();
// vx_printf("A (thread %d)\n", tid);
// vx_tile(0b10101010,8);
// vx_printf("B (thread %d)\n", vx_thread_id());
// vx_barrier(0, 0b10101010);
// vx_printf("C (thread %d)\n", tid);
// vx_tile(0b10000000,32); //default config
// //CASE 9
// vx_printf("A (thread %d)\n", vx_thread_id());
// vx_tile(0b10101010,8);
// vx_printf("B (thread %d)\n", vx_thread_id());
// vx_barrier(0,0b10101010);
// vx_printf("C (thread %d)\n", vx_thread_id());
// vx_tile(0b10000000,32); //default config
// //CASE 10
// int i = vx_thread_id();
// if (i % 16 == 0)
// {
// vx_printf("A (group %d)\n", i / 16);
// }
// vx_barrier(0,0b10000000);
// vx_tile(0b10001000,16);
// int group = i / 16;
// vx_printf("B %d\n", i);
// vx_barrier(0,0b10001000);
// vx_barrier(0,0b10001000);
// vx_printf("C %d\n", i);
// vx_barrier(0,0b10001000);
// vx_tile(0b10000000,32); //default config
// //CASE 11
// int k = vx_thread_id();
// vx_tile(0b11111111,4);
// for (int i = 0; i < 3; i++) {
// vx_printf("A(threadIdx %d, loopIdx %d)\n", k, i);
// vx_barrier(0, 0b11111111);
// vx_printf("B(threadIdx %d, loopIdx %d)\n", k, i);
// }
// vx_tile(0b10000000,32); //default config
// //CASE 12
// int sum = 0;
// vx_tile(0b10001000,16);
// for (int i = 0; i < 10; i++) {
// int k = i * vx_thread_id();
// int j = i * i;
// vx_barrier(0,0b10001000);
// sum += vx_thread_id() / 16 + k + j;
// }
// vx_printf("(group %d, id %d) sum = %d\n", vx_thread_id() / 16, threadIdx.x % 16, sum);
// vx_tile(0b10000000,32); //default config
//CASE 13
// int sum = 0;
// int j = vx_thread_id();
// vx_tile(0b10001000,16);
// for (int i = 0; i < 10; i++) {
// int k = vx_thread_id();
// sum += k;
// vx_barrier(0, 0b10001000);
// }
// vx_printf("(group %d, id %d) sum = %d\n", j / 16, j % 16, sum);
// vx_tile(0b10000000,32); //default config
}

View file

@ -147,28 +147,11 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
// verify result
std::cout << "verify result" << std::endl;
int errors = 0;
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int cur = h_dst[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
++errors;
}
}
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
}
std::cout << "PASSED!" << std::endl;
return 0;

View file

@ -16,7 +16,7 @@ void kernel_body(kernel_arg_t* __UNIFORM__ arg) {
int val = 14;
vx_store(val,3);
vx_tmc(2);
val = 5;
val = 5;
vx_store(val,3);
vx_tmc(4);
val =45;