Fixed Vortex custom extension opcodes to use the official unused opcode values

This commit is contained in:
Blaise Tine 2022-03-06 22:55:52 -05:00
parent d241fc9a4b
commit a767efe3c2
32 changed files with 284 additions and 281 deletions

View file

@ -350,47 +350,52 @@ module VX_decode #(
endcase
end
`endif
`INST_GPGPU: begin
ex_type = `EX_GPU;
case (func3)
3'h0: begin
op_type = rs2[0] ? `INST_OP_BITS'(`INST_GPU_PRED) : `INST_OP_BITS'(`INST_GPU_TMC);
is_wstall = 1;
`USED_IREG (rs1);
end
3'h1: begin
op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h2: begin
op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
is_wstall = 1;
`USED_IREG (rs1);
end
3'h3: begin
op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
is_join = 1;
end
3'h4: begin
op_type = `INST_OP_BITS'(`INST_GPU_BAR);
is_wstall = 1;
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h5: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'(`INST_LSU_LW);
op_mod = `INST_MOD_BITS'(2);
`USED_IREG (rs1);
`INST_EXT1: begin
case (func7)
7'h00: begin
ex_type = `EX_GPU;
case (func3)
3'h0: begin // TMC, PRED
op_type = rs2[0] ? `INST_OP_BITS'(`INST_GPU_PRED) : `INST_OP_BITS'(`INST_GPU_TMC);
is_wstall = 1;
`USED_IREG (rs1);
end
3'h1: begin // WSPAWN
op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN);
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h2: begin // SPLIT
op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
is_wstall = 1;
`USED_IREG (rs1);
end
3'h3: begin // JOIN
op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
is_join = 1;
end
3'h4: begin // BAR
op_type = `INST_OP_BITS'(`INST_GPU_BAR);
is_wstall = 1;
`USED_IREG (rs1);
`USED_IREG (rs2);
end
3'h5: begin // PREFETCH
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'(`INST_LSU_LW);
op_mod = `INST_MOD_BITS'(2);
`USED_IREG (rs1);
end
default:;
endcase
end
default:;
endcase
end
`INST_GPU: begin
`INST_EXT2: begin
case (func3)
`ifdef EXT_TEX_ENABLE
3'h0: begin
3'h0: begin // TEX
ex_type = `EX_GPU;
op_type = `INST_OP_BITS'(`INST_GPU_TEX);
op_mod = `INST_MOD_BITS'(func2);
@ -401,7 +406,7 @@ module VX_decode #(
`USED_IREG (rs3);
end
`endif
3'h1: begin
3'h1: begin // IMADD
ex_type = `EX_GPU;
op_type = `INST_OP_BITS'(`INST_GPU_IMADD);
use_rd = 1;

View file

@ -69,10 +69,11 @@
`define INST_FNMADD 7'b1001111
`define INST_FCI 7'b1010011 // float common instructions
`define INST_GPGPU 7'b1101011
`define INST_GPU 7'b1011011
`define INST_TEX 7'b0101011
// Custom extension opcodes
`define INST_EXT1 7'b0001011 // 0x0B
`define INST_EXT2 7'b0101011 // 0x2B
`define INST_EXT3 7'b1011011 // 0x5B
`define INST_EXT4 7'b1111011 // 0x7B
///////////////////////////////////////////////////////////////////////////////

View file

@ -77,76 +77,76 @@ extern "C" {
// Texture load.
// Emits a single R4-type instruction on custom opcode 0x2b with func3 = 0.
// `stage` is encoded into the func2 field and therefore must be a
// compile-time constant; u, v and lod are passed in rs1, rs2 and rs3.
// Evaluates to the unsigned value written to the destination register.
// Only the 0x2b encoding is emitted: the former 0x5b encoding is dropped so
// the instruction is no longer issued twice.
#define vx_tex(stage, u, v, lod) ({ \
    unsigned __r; \
    __asm__ __volatile__ (".insn r4 0x2b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(stage), "r"(u), "r"(v), "r"(lod)); \
    __r; \
})
// Conditional move.
// Emits a single R4-type instruction on custom opcode 0x2b with func3 = 1 and
// func2 = 0; c, t and f are passed in rs1, rs2 and rs3.
// Evaluates to the unsigned value written to the destination register.
// The output operand list is now properly closed ("=r"(__r)); previously the
// missing parenthesis made every expansion of this macro a syntax error.
#define vx_cmov(c, t, f) ({ \
    unsigned __r; \
    __asm__ __volatile__ (".insn r4 0x2b, 1, 0, %0, %1, %2, %3" : "=r"(__r) : "r"(c), "r"(t), "r"(f)); \
    __r; \
})
// IMADD.
// Emits a single R4-type instruction on custom opcode 0x2b with func3 = 1 and
// func2 = 2; x, y and acc are passed in rs1, rs2 and rs3.
// The asm statement is now properly closed; previously the missing
// parenthesis made every expansion of this macro a syntax error.
// NOTE(review): the destination register is hard-wired to x0 and the macro
// yields no value, so any result the instruction produces is discarded --
// confirm this is intended.
#define vx_imadd(x, y, acc) ({ \
    __asm__ __volatile__ (".insn r4 0x2b, 1, 2, x0, %0, %1, %2" :: "r"(x), "r"(y), "r"(acc)); \
})
// Raster load.
// Emits a single R-type instruction on custom opcode 0x0b with func3 = 0 and
// func7 = 1; both source registers are x0.
// Evaluates to the unsigned value written to the destination register.
// The former func7 = 0 encoding is dropped: on opcode 0x0b that slot now
// belongs to the thread-mask instruction and must not be issued from here.
#define vx_rast() ({ \
    unsigned __r; \
    __asm__ __volatile__ (".insn r 0x0b, 0, 1, %0, x0, x0" : "=r"(__r)); \
    __r; \
})
// Rop write.
// Emits a single R-type instruction on custom opcode 0x0b with func3 = 1 and
// func7 = 1; color and depth are passed in rs1 and rs2, rd is x0.
// The former func3 = 0 encoding is dropped: with func7 = 1 that slot is the
// raster instruction and must not be issued from here.
#define vx_rop(color, depth) ({ \
    __asm__ __volatile__ (".insn r 0x0b, 1, 1, x0, %0, %1" :: "r"(color), "r"(depth)); \
})
// Interpolate.
// Emits a single R4-type instruction on custom opcode 0x2b with func3 = 2.
// `f` is encoded into the func2 field and therefore must be a compile-time
// constant; a, b and c are passed in rs1, rs2 and rs3.
// Evaluates to the unsigned value written to the destination register.
// Only the 0x2b encoding is emitted: the former 0x5b encoding is dropped so
// the instruction is no longer issued twice.
#define vx_interp(f, a, b, c) ({ \
    unsigned __r; \
    __asm__ __volatile__ (".insn r4 0x2b, 2, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(f), "r"(a), "r"(b), "r"(c)); \
    __r; \
})
// Set thread mask.
// Emits a single R-type instruction on custom opcode 0x0b with func3 = 0 and
// func7 = 0; thread_mask is passed in rs1 and rs2 is x0, which selects the
// thread-mask form rather than the predicate form.
// The legacy S-type 0x6b encoding is no longer emitted.
inline void vx_tmc(unsigned thread_mask) {
    asm volatile (".insn r 0x0b, 0, 0, x0, %0, x0" :: "r"(thread_mask));
}
// Set thread predicate.
// Emits a single R-type instruction on custom opcode 0x0b with func3 = 0 and
// func7 = 0; condition is passed in rs1 and rs2 is x1, whose low index bit
// selects the predicate form of the thread-mask instruction.
// The legacy S-type 0x6b encoding is no longer emitted.
inline void vx_pred(unsigned condition) {
    asm volatile (".insn r 0x0b, 0, 0, x0, %0, x1" :: "r"(condition));
}
typedef void (*vx_wspawn_pfn)();
// Spawn warps.
// Emits a single R-type instruction on custom opcode 0x0b with func3 = 1 and
// func7 = 0; num_warps is passed in rs1 and func_ptr in rs2.
// The legacy S-type 0x6b encoding is no longer emitted.
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
    asm volatile (".insn r 0x0b, 1, 0, x0, %0, %1" :: "r"(num_warps), "r"(func_ptr));
}
// Split on a predicate.
// Emits a single R-type instruction on custom opcode 0x0b with func3 = 2 and
// func7 = 0; predicate is passed in rs1, rs2 is x0.
// The legacy S-type 0x6b encoding is no longer emitted, so a single call
// issues exactly one split.
inline void vx_split(int predicate) {
    asm volatile (".insn r 0x0b, 2, 0, x0, %0, x0" :: "r"(predicate));
}
// Join.
// Emits a single R-type instruction on custom opcode 0x0b with func3 = 3 and
// func7 = 0; all register fields are x0.
// The legacy S-type 0x6b encoding is no longer emitted, so a single call
// issues exactly one join.
inline void vx_join() {
    asm volatile (".insn r 0x0b, 3, 0, x0, x0, x0");
}
// Warp barrier.
// Emits a single R-type instruction on custom opcode 0x0b with func3 = 4 and
// func7 = 0; barried_id is passed in rs1 and num_warps in rs2.
// The legacy S-type 0x6b encoding is no longer emitted, so a single call
// issues exactly one barrier.
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
    asm volatile (".insn r 0x0b, 4, 0, x0, %0, %1" :: "r"(barried_id), "r"(num_warps));
}
// Prefetch.
// Emits a single R-type instruction on custom opcode 0x0b with func3 = 5 and
// func7 = 0; addr is passed in rs1, rs2 is x0.
// The legacy S-type 0x6b encoding is no longer emitted.
inline void vx_prefetch(unsigned addr) {
    asm volatile (".insn r 0x0b, 5, 0, x0, %0, x0" :: "r"(addr));
}
// Return active warp's thread id

View file

@ -18,12 +18,12 @@ vx_serial:
label_loop:
sub t0, s0, s1
seqz t1, t0 # (index != tid)
.insn s 0x6b, 2, x0, 0(t1) # split t0
.insn r 0x0b, 2, 0, x0, t1, x0 # split t0
bnez t0, label_join
mv a0, s3 # a0 <- arg
jalr s4 # callback(arg)
label_join:
.insn s 0x6b, 3, x0, 0(x0) # join
.insn r 0x0b, 3, 0, x0, x0, x0 # join
addi s0, s0, 1 # index++
blt s0, s2, label_loop # loop back
lw ra, 20(sp)

View file

@ -9,12 +9,12 @@ _start:
# execute stack initialization on all warps
la a1, vx_set_sp
csrr a0, CSR_NW # get num warps
.insn s 0x6b, 1, a1, 0(a0) # wspawn a0, a1
.insn r 0x0b, 1, 0, x0, a0, a1 # wspawn a0, a1
jal vx_set_sp
# return back to single thread execution
li a0, 1
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
.insn r 0x0b, 0, 0, x0, a0, x0 # tmc a0
# Clear the bss segment
la a0, _edata
@ -47,7 +47,7 @@ _exit:
call vx_perf_dump
mv gp, s0
li a0, 0
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
.insn r 0x0b, 0, 0, x0, a0, x0 # tmc a0
.section .text
.type vx_set_sp, @function
@ -55,7 +55,7 @@ _exit:
vx_set_sp:
# activate all threads
li a0, -1
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
.insn r 0x0b, 0, 0, x0, a0, x0 # tmc a0
# set per-thread stack register
li sp, STACK_BASE_ADDR # load stack base address
@ -68,7 +68,7 @@ vx_set_sp:
csrr a3, CSR_LWID # get local wid
beqz a3, RETURN
li a0, 0
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
.insn r 0x0b, 0, 0, x0, a0, x0 # tmc a0
RETURN:
ret

View file

@ -39,8 +39,7 @@ static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable =
{Opcode::FMSUB, {false, InstType::R4_TYPE}},
{Opcode::FMNMADD, {false, InstType::R4_TYPE}},
{Opcode::FMNMSUB, {false, InstType::R4_TYPE}},
{Opcode::VSET, {false, InstType::V_TYPE}},
{Opcode::GPGPU, {false, InstType::R_TYPE}},
{Opcode::VSET, {false, InstType::V_TYPE}},
{Opcode::EXT1, {false, InstType::R_TYPE}},
{Opcode::EXT2, {false, InstType::R4_TYPE}},
{Opcode::R_INST_W, {false, InstType::R_TYPE}},
@ -345,27 +344,26 @@ static const char* op_string(const Instr &instr) {
case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
case Opcode::VSET: return "VSET";
case Opcode::GPGPU:
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PREFETCH";
default:
std::abort();
}
case Opcode::EXT1:
switch (func7) {
case 0: {
case 0:
switch (func3) {
case 0: return rs2 ? "PRED" : "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PREFETCH";
default:
std::abort();
}
case 1:
switch (func3) {
case 0: return "RASTER";
case 1: return "ROP";
default:
std::abort();
}
}
default:
std::abort();
}

View file

@ -1285,141 +1285,141 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
rd_write = true;
break;
}
case GPGPU: {
uint32_t ts = 0;
for (uint32_t t = 0; t < num_threads; ++t) {
if (tmask_.test(t)) {
ts = t;
break;
}
}
switch (func3) {
case 0: {
// TMC
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::TMC;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (rsrc1) {
// predicate mode
ThreadMask pred;
for (uint32_t t = 0; t < num_threads; ++t) {
pred[t] = tmask_.test(t) ? (ireg_file_.at(t).at(rsrc0) != 0) : 0;
}
if (pred.any()) {
tmask_ &= pred;
}
} else {
tmask_.reset();
for (uint32_t t = 0; t < num_threads; ++t) {
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
}
}
DPH(3, "*** New TMC: ");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, tmask_.test(num_threads-i-1));
DPN(3, std::endl);
active_ = tmask_.any();
trace->data = new GPUTraceData(active_ << id_);
} break;
case 1: {
// WSPAWN
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::WSPAWN;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
trace->data = new GPUTraceData(core_->wspawn(rsdata.at(ts)[0].i, rsdata.at(ts)[1].i));
} break;
case 2: {
// SPLIT
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::SPLIT;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
ThreadMask tmask;
for (uint32_t t = 0; t < num_threads; ++t) {
tmask[t] = tmask_.test(t) && !ireg_file_.at(t).at(rsrc0);
}
DomStackEntry e(tmask, nextPC);
dom_stack_.push(tmask_);
dom_stack_.push(e);
for (uint32_t t = 0, n = e.tmask.size(); t < n; ++t) {
tmask_.set(t, !e.tmask.test(t) && tmask_.test(t));
}
active_ = tmask_.any();
DPH(3, "*** Split: New TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
DPN(3, ", Pushed TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, e.tmask.test(num_threads-t-1));
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
} else {
DP(3, "*** Unanimous pred");
DomStackEntry e(tmask_);
e.unanimous = true;
dom_stack_.push(e);
}
} break;
case 3: {
// JOIN
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::JOIN;
trace->fetch_stall = true;
if (!dom_stack_.empty() && dom_stack_.top().unanimous) {
DP(3, "*** Uninimous branch at join");
tmask_ = dom_stack_.top().tmask;
active_ = tmask_.any();
dom_stack_.pop();
} else {
if (!dom_stack_.top().fallThrough) {
nextPC = dom_stack_.top().PC;
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
}
tmask_ = dom_stack_.top().tmask;
active_ = tmask_.any();
DPH(3, "*** Join: New TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
DPN(3, "\n");
dom_stack_.pop();
}
} break;
case 4: {
// BAR
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::BAR;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
trace->data = new GPUTraceData(core_->barrier(rsdata[ts][0].i, rsdata[ts][1].i, id_));
} break;
case 5: {
// PREFETCH
trace->exe_type = ExeType::LSU;
trace->lsu_type = LsuType::PREFETCH;
trace->used_iregs.set(rsrc0);
auto trace_data = new LsuTraceData(num_threads);
trace->data = trace_data;
for (uint32_t t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto mem_addr = rsdata[t][0].i;
trace_data->mem_addrs.at(t) = {mem_addr, 4};
}
} break;
default:
std::abort();
}
} break;
case EXT1: {
switch (func7) {
case 0:
case 0: {
uint32_t ts = 0;
for (uint32_t t = 0; t < num_threads; ++t) {
if (tmask_.test(t)) {
ts = t;
break;
}
}
switch (func3) {
case 0: {
// TMC
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::TMC;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (rsrc1) {
// predicate mode
ThreadMask pred;
for (uint32_t t = 0; t < num_threads; ++t) {
pred[t] = tmask_.test(t) ? (ireg_file_.at(t).at(rsrc0) != 0) : 0;
}
if (pred.any()) {
tmask_ &= pred;
}
} else {
tmask_.reset();
for (uint32_t t = 0; t < num_threads; ++t) {
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
}
}
DPH(3, "*** New TMC: ");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, tmask_.test(num_threads-i-1));
DPN(3, std::endl);
active_ = tmask_.any();
trace->data = new GPUTraceData(active_ << id_);
} break;
case 1: {
// WSPAWN
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::WSPAWN;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
trace->data = new GPUTraceData(core_->wspawn(rsdata.at(ts)[0].i, rsdata.at(ts)[1].i));
} break;
case 2: {
// SPLIT
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::SPLIT;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
ThreadMask tmask;
for (uint32_t t = 0; t < num_threads; ++t) {
tmask[t] = tmask_.test(t) && !ireg_file_.at(t).at(rsrc0);
}
DomStackEntry e(tmask, nextPC);
dom_stack_.push(tmask_);
dom_stack_.push(e);
for (uint32_t t = 0, n = e.tmask.size(); t < n; ++t) {
tmask_.set(t, !e.tmask.test(t) && tmask_.test(t));
}
active_ = tmask_.any();
DPH(3, "*** Split: New TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
DPN(3, ", Pushed TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, e.tmask.test(num_threads-t-1));
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
} else {
DP(3, "*** Unanimous pred");
DomStackEntry e(tmask_);
e.unanimous = true;
dom_stack_.push(e);
}
} break;
case 3: {
// JOIN
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::JOIN;
trace->fetch_stall = true;
if (!dom_stack_.empty() && dom_stack_.top().unanimous) {
DP(3, "*** Uninimous branch at join");
tmask_ = dom_stack_.top().tmask;
active_ = tmask_.any();
dom_stack_.pop();
} else {
if (!dom_stack_.top().fallThrough) {
nextPC = dom_stack_.top().PC;
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
}
tmask_ = dom_stack_.top().tmask;
active_ = tmask_.any();
DPH(3, "*** Join: New TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
DPN(3, "\n");
dom_stack_.pop();
}
} break;
case 4: {
// BAR
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::BAR;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
trace->data = new GPUTraceData(core_->barrier(rsdata[ts][0].i, rsdata[ts][1].i, id_));
} break;
case 5: {
// PREFETCH
trace->exe_type = ExeType::LSU;
trace->lsu_type = LsuType::PREFETCH;
trace->used_iregs.set(rsrc0);
auto trace_data = new LsuTraceData(num_threads);
trace->data = trace_data;
for (uint32_t t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto mem_addr = rsdata[t][0].i;
trace_data->mem_addrs.at(t) = {mem_addr, 4};
}
} break;
default:
std::abort();
}
} break;
case 1:
switch (func3) {
case 0: { // RASTER
trace->exe_type = ExeType::GPU;

View file

@ -29,10 +29,11 @@ enum Opcode {
FMNMADD = 0x4f,
// Vector Extension
VSET = 0x57,
// Vortex Extensions
// Custom Extensions
EXT1 = 0x0b,
EXT2 = 0x5b,
GPGPU = 0x6b,
EXT2 = 0x2b,
EXT3 = 0x5b,
EXT4 = 0x7b,
// RV64 Standard Extensions
R_INST_W = 0x3b,
I_INST_W = 0x1b,

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -249,8 +249,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));

View file

@ -148,8 +148,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));

View file

@ -201,8 +201,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));

View file

@ -204,8 +204,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
RT_CHECK(vx_buf_alloc(device, sizeof(kernel_arg_t), &arg_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &src1_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &src2_buf));

View file

@ -26,7 +26,7 @@ using namespace cocogfx;
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
const char* input_file = "soccer.png";
const char* input_file = "fire.png";
const char* output_file = "output.png";
const char* reference_file = nullptr;
uint32_t clear_color = 0x00000000;
@ -34,8 +34,8 @@ int tex_format = TEX_FORMAT_A8R8G8B8;
ePixelFormat tex_eformat = FORMAT_A8R8G8B8;
int tex_wrap = TEX_WRAP_CLAMP;
int tex_filter = TEX_FILTER_POINT;
uint32_t dst_width = 64;
uint32_t dst_height = 64;
uint32_t dst_width = 256;
uint32_t dst_height = 256;
const model_t& model = model_quad;
vx_device_h device = nullptr;
@ -218,6 +218,7 @@ int main(int argc, char *argv[]) {
// Perform tile binning
auto num_tiles = Binning(tilebuf, primbuf, model, dst_width, dst_height, tile_size);
std::cout << "Binning allocated " << num_tiles << " tiles." << std::endl;
// upload program
std::cout << "upload program" << std::endl;
@ -237,8 +238,8 @@ int main(int argc, char *argv[]) {
std::cout << "zbuf_addr=0x" << std::hex << zbuf_addr << std::endl;
std::cout << "cbuf_addr=0x" << std::hex << cbuf_addr << std::endl;
// allocate staging shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>({
sizeof(kernel_arg_t), (uint32_t)tilebuf.size(), (uint32_t)primbuf.size(), zbuf_size, cbuf_size
});

View file

@ -16,9 +16,7 @@ using fixed16_t = TFixed<16>;
using vec2d_f_t = TVector2<float>;
using vec2d_fx_t = TVector2<fixed16_t>;
using vec4d_f_t = TVector4<float>;
using rect_f_t = TRect<float>;
static fixed16_t fxZero(0);
@ -30,13 +28,13 @@ static fixed16_t evalEdgeFunction(const rast_edge_t& e, uint32_t x, uint32_t y)
}
// Calculate the edge extent for the tile corner selected by the signs of the
// edge coefficients.
// A coefficient contributes to the result only when it is >= 0; a negative
// coefficient selects the corner whose matching component is zero. The value
// is returned unscaled: callers shift it by the tile log-size where they use
// it.
static fixed16_t calcEdgeExtents(const rast_edge_t& e) {
    // Candidate corners, indexed by (y-bit << 1) | x-bit.
    const vec2d_fx_t corners[4] = {
        {fxZero, fxZero}, // 00
        {e.x,    fxZero}, // 10
        {fxZero, e.y},    // 01
        {e.x,    e.y}     // 11
    };
    const int xBit = (e.x >= fxZero) ? 1 : 0;
    const int yBit = (e.y >= fxZero) ? 1 : 0;
    const auto& corner = corners[(yBit << 1) | xBit];
    return corner.x + corner.y;
}
static float EdgeEquation(rast_edge_t edges[3],
@ -92,7 +90,7 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
uint32_t height,
uint32_t tileSize) {
uint32_t logTileSize = log2ceil(tileSize);
uint32_t tileLogSize = log2ceil(tileSize);
std::unordered_map<uint32_t, std::vector<uint32_t>> tiles;
@ -151,7 +149,7 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
uint32_t p;
{
#define INTERPOLATE_DELTA(dx, x0, x1, x2) \
#define ATTRIBUTE_DELTA(dx, x0, x1, x2) \
dx.x = fixed23_t(x0 - x2); \
dx.y = fixed23_t(x1 - x2); \
dx.z = fixed23_t(x2)
@ -168,34 +166,34 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
ColorToFloat(colors[1], v1.c);
ColorToFloat(colors[2], v2.c);
INTERPOLATE_DELTA(rast_prim.attribs.z, v0.z, v1.z, v2.z);
INTERPOLATE_DELTA(rast_prim.attribs.r, colors[0][0], colors[1][0], colors[2][0]);
INTERPOLATE_DELTA(rast_prim.attribs.g, colors[0][1], colors[1][1], colors[2][1]);
INTERPOLATE_DELTA(rast_prim.attribs.b, colors[0][2], colors[1][2], colors[2][2]);
INTERPOLATE_DELTA(rast_prim.attribs.a, colors[0][3], colors[1][3], colors[2][3]);
INTERPOLATE_DELTA(rast_prim.attribs.u, v0.u, v1.u, v2.u);
INTERPOLATE_DELTA(rast_prim.attribs.v, v0.v, v1.v, v2.v);
ATTRIBUTE_DELTA(rast_prim.attribs.z, v0.z, v1.z, v2.z);
ATTRIBUTE_DELTA(rast_prim.attribs.r, colors[0][0], colors[1][0], colors[2][0]);
ATTRIBUTE_DELTA(rast_prim.attribs.g, colors[0][1], colors[1][1], colors[2][1]);
ATTRIBUTE_DELTA(rast_prim.attribs.b, colors[0][2], colors[1][2], colors[2][2]);
ATTRIBUTE_DELTA(rast_prim.attribs.a, colors[0][3], colors[1][3], colors[2][3]);
ATTRIBUTE_DELTA(rast_prim.attribs.u, v0.u, v1.u, v2.u);
ATTRIBUTE_DELTA(rast_prim.attribs.v, v0.v, v1.v, v2.v);
p = rast_prims.size();
rast_prims.push_back(rast_prim);
}
// Calculate min/max tile positions
auto tileSize = 1 << logTileSize;
auto minTileX = bbox.left >> logTileSize;
auto minTileY = bbox.top >> logTileSize;
auto maxTileX = (bbox.right + tileSize - 1) >> logTileSize;
auto maxTileY = (bbox.bottom + tileSize - 1) >> logTileSize;
auto tileSize = 1 << tileLogSize;
auto minTileX = bbox.left >> tileLogSize;
auto minTileY = bbox.top >> tileLogSize;
auto maxTileX = (bbox.right + tileSize - 1) >> tileLogSize;
auto maxTileY = (bbox.bottom + tileSize - 1) >> tileLogSize;
// Starting tile coordinates
auto X = minTileX << logTileSize;
auto Y = minTileY << logTileSize;
auto X = minTileX << tileLogSize;
auto Y = minTileY << tileLogSize;
// Add tile corner edge offsets
fixed16_t extents[3];
extents[0] = calcEdgeExtents(edges[0], logTileSize);
extents[1] = calcEdgeExtents(edges[1], logTileSize);
extents[2] = calcEdgeExtents(edges[2], logTileSize);
extents[0] = calcEdgeExtents(edges[0]);
extents[1] = calcEdgeExtents(edges[1]);
extents[2] = calcEdgeExtents(edges[2]);
// Evaluate edge equation for the starting tile
auto e0 = evalEdgeFunction(edges[0], X, Y);
@ -209,34 +207,33 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
auto ee2 = e2;
for (uint32_t tx = minTileX; tx < maxTileX; ++tx) {
// check if tile overlap triangle
if (((ee0 + extents[0]).data()
| (ee1 + extents[1]).data()
| (ee2 + extents[2]).data()) >= 0) {
if (((ee0 + (extents[0] << tileLogSize)).data()
| (ee1 + (extents[1] << tileLogSize)).data()
| (ee2 + (extents[2] << tileLogSize)).data()) >= 0) {
// assign primitive to tile
uint32_t tile_id = (ty << 16) | tx;
tiles[tile_id].push_back(p);
++num_prims;
}
// update edge equation x components
ee0 += edges[0].x << logTileSize;
ee1 += edges[1].x << logTileSize;
ee2 += edges[2].x << logTileSize;
ee0 += edges[0].x << tileLogSize;
ee1 += edges[1].x << tileLogSize;
ee2 += edges[2].x << tileLogSize;
}
// update edge equation y components
e0 += edges[0].y << logTileSize;
e1 += edges[1].y << logTileSize;
e2 += edges[2].y << logTileSize;
e0 += edges[0].y << tileLogSize;
e1 += edges[1].y << tileLogSize;
e2 += edges[2].y << tileLogSize;
}
}
{
primbuf.reserve(rast_prims.size() * sizeof(rast_prim_t));
primbuf.resize(rast_prims.size() * sizeof(rast_prim_t));
memcpy(primbuf.data(), rast_prims.data(), primbuf.size());
}
{
tilebuf.reserve(tiles.size() * sizeof(rast_tile_header_t) + num_prims * sizeof(uint32_t));
tilebuf.resize(tiles.size() * sizeof(rast_tile_header_t) + num_prims * sizeof(uint32_t));
auto tile_data = tilebuf.data();
for (auto it : tiles) {
rast_tile_header_t header{it.first, (uint32_t)it.second.size()};

View file

@ -148,8 +148,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));

View file

@ -183,8 +183,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint32_t),
std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,

View file

@ -236,8 +236,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(addr_buf_size,
std::max<uint32_t>(dst_buf_size,

View file

@ -138,8 +138,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));

View file

@ -138,8 +138,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));

View file

@ -148,8 +148,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));

View file

@ -111,8 +111,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));

View file

@ -178,8 +178,8 @@ int main(int argc, char *argv[]) {
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));

View file

@ -213,8 +213,8 @@ int main(int argc, char *argv[]) {
std::cout << "src_addr=0x" << std::hex << src_addr << std::endl;
std::cout << "dst_addr=0x" << std::hex << dst_addr << std::endl;
// allocate staging shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t),
std::max<uint32_t>(src_bufsize, dst_bufsize));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));

2
third_party/cocogfx vendored

@ -1 +1 @@
Subproject commit 8f78db5e1845b2a9cd337ac154ee276250d91ad3
Subproject commit 6ff9739cee9a0528142123985e4d8e59f7d0a4e8