mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
Fixed the Vortex custom-extension opcodes to use the official unused (custom-0..custom-3) RISC-V opcode values: 0x0B, 0x2B, 0x5B, 0x7B
This commit is contained in:
parent
d241fc9a4b
commit
a767efe3c2
32 changed files with 284 additions and 281 deletions
|
@ -350,47 +350,52 @@ module VX_decode #(
|
|||
endcase
|
||||
end
|
||||
`endif
|
||||
`INST_GPGPU: begin
|
||||
ex_type = `EX_GPU;
|
||||
case (func3)
|
||||
3'h0: begin
|
||||
op_type = rs2[0] ? `INST_OP_BITS'(`INST_GPU_PRED) : `INST_OP_BITS'(`INST_GPU_TMC);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
3'h1: begin
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h2: begin
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
3'h3: begin
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
|
||||
is_join = 1;
|
||||
end
|
||||
3'h4: begin
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_BAR);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h5: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'(`INST_LSU_LW);
|
||||
op_mod = `INST_MOD_BITS'(2);
|
||||
`USED_IREG (rs1);
|
||||
`INST_EXT1: begin
|
||||
case (func7)
|
||||
7'h00: begin
|
||||
ex_type = `EX_GPU;
|
||||
case (func3)
|
||||
3'h0: begin // TMC, PRED
|
||||
op_type = rs2[0] ? `INST_OP_BITS'(`INST_GPU_PRED) : `INST_OP_BITS'(`INST_GPU_TMC);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
3'h1: begin // WSPAWN
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h2: begin // SPLIT
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
3'h3: begin // JOIN
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
|
||||
is_join = 1;
|
||||
end
|
||||
3'h4: begin // BAR
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_BAR);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h5: begin // PREFETCH
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'(`INST_LSU_LW);
|
||||
op_mod = `INST_MOD_BITS'(2);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`INST_GPU: begin
|
||||
`INST_EXT2: begin
|
||||
case (func3)
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
3'h0: begin
|
||||
3'h0: begin // TEX
|
||||
ex_type = `EX_GPU;
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_TEX);
|
||||
op_mod = `INST_MOD_BITS'(func2);
|
||||
|
@ -401,7 +406,7 @@ module VX_decode #(
|
|||
`USED_IREG (rs3);
|
||||
end
|
||||
`endif
|
||||
3'h1: begin
|
||||
3'h1: begin // IMADD
|
||||
ex_type = `EX_GPU;
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_IMADD);
|
||||
use_rd = 1;
|
||||
|
|
|
@ -69,10 +69,11 @@
|
|||
`define INST_FNMADD 7'b1001111
|
||||
`define INST_FCI 7'b1010011 // float common instructions
|
||||
|
||||
`define INST_GPGPU 7'b1101011
|
||||
`define INST_GPU 7'b1011011
|
||||
|
||||
`define INST_TEX 7'b0101011
|
||||
// Custom extension opcodes.
// These four 7-bit values are the major opcodes that the RISC-V base opcode
// map sets aside for custom extensions (custom-0 .. custom-3).
|
||||
`define INST_EXT1 7'b0001011 // 0x0B (custom-0)
|
||||
`define INST_EXT2 7'b0101011 // 0x2B (custom-1)
|
||||
`define INST_EXT3 7'b1011011 // 0x5B (custom-2)
|
||||
`define INST_EXT4 7'b1111011 // 0x7B (custom-3)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
|
|
@ -77,76 +77,76 @@ extern "C" {
|
|||
// Texture load.
// Statement-expression macro that emits one R4-format custom instruction
// (.insn r4 opcode, func3, func2, rd, rs1, rs2, rs3) and evaluates to the
// value written to rd:
//   func3 = 0, func2 = stage (an "i" constraint, so it must be a compile-time
//   constant), rd = result, rs1 = u, rs2 = v, rs3 = lod.
// NOTE(review): two asm lines are listed (opcode 0x5b and opcode 0x2b); they
// appear to be the removed and added sides of a diff, so presumably only the
// 0x2b form belongs in the real header - confirm.
|
||||
#define vx_tex(stage, u, v, lod) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(stage), "r"(u), "r"(v), "r"(lod)); \
|
||||
__asm__ __volatile__ (".insn r4 0x2b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(stage), "r"(u), "r"(v), "r"(lod)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
// Conditional move.
// Statement-expression macro that emits one R4-format custom instruction
// (.insn r4 opcode, func3, func2, rd, rs1, rs2, rs3) and evaluates to rd:
//   func3 = 1, func2 = 0, rd = result, rs1 = c, rs2 = t, rs3 = f.
// NOTE(review): the output operand is written as `"=r"(__r :` on both asm
// lines - the closing ')' after __r is missing, so the macro cannot compile
// as written; it should read `"=r"(__r) :`.
// NOTE(review): two asm lines are listed (opcode 0x5b and opcode 0x2b); they
// appear to be the removed and added sides of a diff - confirm which one the
// real header keeps.
|
||||
#define vx_cmov(c, t, f) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r : "r"(c), "r"(t), "r"(f)); \
|
||||
__asm__ __volatile__ (".insn r4 0x2b, 1, 0, %0, %1, %2, %3" : "=r"(__r : "r"(c), "r"(t), "r"(f)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
// IMADD.
// Emits one R4-format custom instruction
// (.insn r4 opcode, func3, func2, rd, rs1, rs2, rs3) with
//   func3 = 1, func2 = 2, rd = x0, rs1 = x, rs2 = y, rs3 = acc.
// NOTE(review): the operand list ends with `"r"(acc);` on both asm lines -
// the ')' that closes the asm statement is missing before ';', so the macro
// cannot compile as written.
// NOTE(review): rd is hard-wired to x0 and the macro has no output operand,
// so no result is returned to the caller - confirm this is intended.
// NOTE(review): two asm lines are listed (opcode 0x5b and opcode 0x2b); they
// appear to be the removed and added sides of a diff - confirm which one the
// real header keeps.
|
||||
#define vx_imadd(x, y, acc) ({ \
|
||||
__asm__ __volatile__ (".insn r4 0x5b, 1, 2, x0, %0, %1, %2" :: "r"(x), "r"(y), "r"(acc); \
|
||||
__asm__ __volatile__ (".insn r4 0x2b, 1, 2, x0, %0, %1, %2" :: "r"(x), "r"(y), "r"(acc); \
|
||||
})
|
||||
|
||||
// Raster load.
// Statement-expression macro that emits one R-format custom instruction
// (.insn r opcode, func3, func7, rd, rs1, rs2) on opcode 0x0b with
// rd = result and rs1 = rs2 = x0, and evaluates to the value written to rd.
// NOTE(review): two asm lines are listed, differing only in func7 (0 vs 1);
// they appear to be the removed and added sides of a diff. With func7 = 0 and
// func3 = 0 the encoding would be the same as the new vx_tmc encoding, so
// presumably only the func7 = 1 form is kept - confirm.
|
||||
#define vx_rast() ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ (".insn r 0x0b, 0, 0, %0, x0, x0" : "=r"(__r)); \
|
||||
__asm__ __volatile__ (".insn r 0x0b, 0, 1, %0, x0, x0" : "=r"(__r)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
// Rop write.
// Emits one R-format custom instruction
// (.insn r opcode, func3, func7, rd, rs1, rs2) on opcode 0x0b with
// rd = x0, rs1 = color, rs2 = depth. The macro produces no value.
// NOTE(review): two asm lines are listed - (func3 = 0, func7 = 1) and
// (func3 = 1, func7 = 1); they appear to be the removed and added sides of a
// diff, so presumably only the second is kept - confirm.
|
||||
#define vx_rop(color, depth) ({ \
|
||||
__asm__ __volatile__ (".insn r 0x0b, 0, 1, x0, %0, %1" :: "r"(color), "r"(depth)); \
|
||||
__asm__ __volatile__ (".insn r 0x0b, 1, 1, x0, %0, %1" :: "r"(color), "r"(depth)); \
|
||||
})
|
||||
|
||||
// Interpolate.
// Statement-expression macro that emits one R4-format custom instruction
// (.insn r4 opcode, func3, func2, rd, rs1, rs2, rs3) and evaluates to rd:
//   func3 = 2, func2 = f (an "i" constraint, so it must be a compile-time
//   constant), rd = result, rs1 = a, rs2 = b, rs3 = c.
// NOTE(review): two asm lines are listed (opcode 0x5b and opcode 0x2b); they
// appear to be the removed and added sides of a diff - confirm which one the
// real header keeps.
|
||||
#define vx_interp(f, a, b, c) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ (".insn r4 0x5b, 2, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(f), "r"(a), "r"(b), "r"(c)); \
|
||||
__asm__ __volatile__ (".insn r4 0x2b, 2, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(f), "r"(a), "r"(b), "r"(c)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
// Set thread mask.
// Passes thread_mask in rs1 of a custom instruction with func3 = 0.
//   S-format line: .insn s opcode, func3, rs2, simm12(rs1)
//                  -> opcode 0x6b, rs2 = x0, offset 0.
//   R-format line: .insn r opcode, func3, func7, rd, rs1, rs2
//                  -> opcode 0x0b, func7 = 0, rd = x0, rs2 = x0.
// rs2 = x0 is what separates this from vx_pred, which uses rs2 = x1.
// NOTE(review): both asm lines are listed; they appear to be the removed
// (0x6b) and added (0x0b) sides of a diff, so presumably only the 0x0b form
// is kept - confirm.
|
||||
inline void vx_tmc(unsigned thread_mask) {
|
||||
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
|
||||
asm volatile (".insn r 0x0b, 0, 0, x0, %0, x0" :: "r"(thread_mask));
|
||||
}
|
||||
|
||||
// Set thread predicate.
// Passes condition in rs1 of a custom instruction with func3 = 0 and the
// rs2 field set to register x1 (field value 1); vx_tmc uses the same
// encoding with rs2 = x0.
//   S-format line: .insn s opcode, func3, rs2, simm12(rs1) -> opcode 0x6b.
//   R-format line: .insn r opcode, func3, func7, rd, rs1, rs2
//                  -> opcode 0x0b, func7 = 0, rd = x0.
// NOTE(review): both asm lines are listed; they appear to be the removed
// (0x6b) and added (0x0b) sides of a diff, so presumably only the 0x0b form
// is kept - confirm.
|
||||
inline void vx_pred(unsigned condition) {
|
||||
asm volatile (".insn s 0x6b, 0, x1, 0(%0)" :: "r"(condition));
|
||||
asm volatile (".insn r 0x0b, 0, 0, x0, %0, x1" :: "r"(condition));
|
||||
}
|
||||
|
||||
typedef void (*vx_wspawn_pfn)();
|
||||
|
||||
// Spawn warps.
// Emits a custom instruction with func3 = 1, rs1 = num_warps and
// rs2 = func_ptr.
//   S-format line: .insn s opcode, func3, rs2, simm12(rs1) -> opcode 0x6b.
//   R-format line: .insn r opcode, func3, func7, rd, rs1, rs2
//                  -> opcode 0x0b, func7 = 0, rd = x0.
// NOTE(review): both asm lines are listed; they appear to be the removed
// (0x6b) and added (0x0b) sides of a diff, so presumably only the 0x0b form
// is kept - confirm.
|
||||
inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
|
||||
asm volatile (".insn s 0x6b, 1, %1, 0(%0)" :: "r"(num_warps), "r"(func_ptr));
|
||||
asm volatile (".insn r 0x0b, 1, 0, x0, %0, %1" :: "r"(num_warps), "r"(func_ptr));
|
||||
}
|
||||
|
||||
// Split on a predicate.
// Emits a custom instruction with func3 = 2 and rs1 = predicate; rs2 and the
// destination are x0.
//   S-format line: .insn s opcode, func3, rs2, simm12(rs1) -> opcode 0x6b.
//   R-format line: .insn r opcode, func3, func7, rd, rs1, rs2
//                  -> opcode 0x0b, func7 = 0.
// NOTE(review): both asm lines are listed; they appear to be the removed
// (0x6b) and added (0x0b) sides of a diff, so presumably only the 0x0b form
// is kept - confirm.
|
||||
inline void vx_split(int predicate) {
|
||||
asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
|
||||
asm volatile (".insn r 0x0b, 2, 0, x0, %0, x0" :: "r"(predicate));
|
||||
}
|
||||
|
||||
// Join.
// Emits a custom instruction with func3 = 3 and every register field set to
// x0; it takes no operands and has no C-level inputs or outputs.
//   S-format line: .insn s opcode, func3, rs2, simm12(rs1) -> opcode 0x6b.
//   R-format line: .insn r opcode, func3, func7, rd, rs1, rs2
//                  -> opcode 0x0b, func7 = 0.
// NOTE(review): both asm lines are listed; they appear to be the removed
// (0x6b) and added (0x0b) sides of a diff, so presumably only the 0x0b form
// is kept - confirm.
|
||||
inline void vx_join() {
|
||||
asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
|
||||
asm volatile (".insn r 0x0b, 3, 0, x0, x0, x0");
|
||||
}
|
||||
|
||||
// Warp Barrier.
// Emits a custom instruction with func3 = 4, rs1 = barried_id and
// rs2 = num_warps.
//   S-format line: .insn s opcode, func3, rs2, simm12(rs1) -> opcode 0x6b.
//   R-format line: .insn r opcode, func3, func7, rd, rs1, rs2
//                  -> opcode 0x0b, func7 = 0, rd = x0.
// NOTE(review): the parameter name "barried_id" looks like a typo for
// "barrier_id"; it is left unchanged here.
// NOTE(review): both asm lines are listed; they appear to be the removed
// (0x6b) and added (0x0b) sides of a diff, so presumably only the 0x0b form
// is kept - confirm.
|
||||
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
||||
asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
|
||||
asm volatile (".insn r 0x0b, 4, 0, x0, %0, %1" :: "r"(barried_id), "r"(num_warps));
|
||||
}
|
||||
|
||||
// Prefetch.
// Emits a custom instruction with func3 = 5 and rs1 = addr; rs2 and the
// destination are x0, and nothing is returned to the caller.
//   S-format line: .insn s opcode, func3, rs2, simm12(rs1) -> opcode 0x6b.
//   R-format line: .insn r opcode, func3, func7, rd, rs1, rs2
//                  -> opcode 0x0b, func7 = 0.
// NOTE(review): both asm lines are listed; they appear to be the removed
// (0x6b) and added (0x0b) sides of a diff, so presumably only the 0x0b form
// is kept - confirm.
|
||||
inline void vx_prefetch(unsigned addr) {
|
||||
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
|
||||
asm volatile (".insn r 0x0b, 5, 0, x0, %0, x0" :: "r"(addr) );
|
||||
}
|
||||
|
||||
// Return active warp's thread id
|
||||
|
|
|
@ -18,12 +18,12 @@ vx_serial:
|
|||
label_loop:
|
||||
sub t0, s0, s1
|
||||
seqz t1, t0 # (index != tid)
|
||||
.insn s 0x6b, 2, x0, 0(t1) # split t0
|
||||
.insn r 0x0b, 2, 0, x0, t1, x0 # split t0
|
||||
bnez t0, label_join
|
||||
mv a0, s3 # a0 <- arg
|
||||
jalr s4 # callback(arg)
|
||||
label_join:
|
||||
.insn s 0x6b, 3, x0, 0(x0) # join
|
||||
.insn r 0x0b, 3, 0, x0, x0, x0 # join
|
||||
addi s0, s0, 1 # index++
|
||||
blt s0, s2, label_loop # loop back
|
||||
lw ra, 20(sp)
|
||||
|
|
|
@ -9,12 +9,12 @@ _start:
|
|||
# execute stack initialization on all warps
|
||||
la a1, vx_set_sp
|
||||
csrr a0, CSR_NW # get num warps
|
||||
.insn s 0x6b, 1, a1, 0(a0) # wspawn a0, a1
|
||||
.insn r 0x0b, 1, 0, x0, a0, a1 # wspawn a0, a1
|
||||
jal vx_set_sp
|
||||
|
||||
# return back to single thread execution
|
||||
li a0, 1
|
||||
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
|
||||
.insn r 0x0b, 0, 0, x0, a0, x0 # tmc a0
|
||||
|
||||
# Clear the bss segment
|
||||
la a0, _edata
|
||||
|
@ -47,7 +47,7 @@ _exit:
|
|||
call vx_perf_dump
|
||||
mv gp, s0
|
||||
li a0, 0
|
||||
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
|
||||
.insn r 0x0b, 0, 0, x0, a0, x0 # tmc a0
|
||||
|
||||
.section .text
|
||||
.type vx_set_sp, @function
|
||||
|
@ -55,7 +55,7 @@ _exit:
|
|||
vx_set_sp:
|
||||
# activate all threads
|
||||
li a0, -1
|
||||
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
|
||||
.insn r 0x0b, 0, 0, x0, a0, x0 # tmc a0
|
||||
|
||||
# set per-thread stack register
|
||||
li sp, STACK_BASE_ADDR # load stack base address
|
||||
|
@ -68,7 +68,7 @@ vx_set_sp:
|
|||
csrr a3, CSR_LWID # get local wid
|
||||
beqz a3, RETURN
|
||||
li a0, 0
|
||||
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
|
||||
.insn r 0x0b, 0, 0, x0, a0, x0 # tmc a0
|
||||
RETURN:
|
||||
ret
|
||||
|
||||
|
|
|
@ -39,8 +39,7 @@ static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable =
|
|||
{Opcode::FMSUB, {false, InstType::R4_TYPE}},
|
||||
{Opcode::FMNMADD, {false, InstType::R4_TYPE}},
|
||||
{Opcode::FMNMSUB, {false, InstType::R4_TYPE}},
|
||||
{Opcode::VSET, {false, InstType::V_TYPE}},
|
||||
{Opcode::GPGPU, {false, InstType::R_TYPE}},
|
||||
{Opcode::VSET, {false, InstType::V_TYPE}},
|
||||
{Opcode::EXT1, {false, InstType::R_TYPE}},
|
||||
{Opcode::EXT2, {false, InstType::R4_TYPE}},
|
||||
{Opcode::R_INST_W, {false, InstType::R_TYPE}},
|
||||
|
@ -345,27 +344,26 @@ static const char* op_string(const Instr &instr) {
|
|||
case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
|
||||
case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
|
||||
case Opcode::VSET: return "VSET";
|
||||
case Opcode::GPGPU:
|
||||
switch (func3) {
|
||||
case 0: return "TMC";
|
||||
case 1: return "WSPAWN";
|
||||
case 2: return "SPLIT";
|
||||
case 3: return "JOIN";
|
||||
case 4: return "BAR";
|
||||
case 5: return "PREFETCH";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case Opcode::EXT1:
|
||||
switch (func7) {
|
||||
case 0: {
|
||||
case 0:
|
||||
switch (func3) {
|
||||
case 0: return rs2 ? "PRED" : "TMC";
|
||||
case 1: return "WSPAWN";
|
||||
case 2: return "SPLIT";
|
||||
case 3: return "JOIN";
|
||||
case 4: return "BAR";
|
||||
case 5: return "PREFETCH";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
case 1:
|
||||
switch (func3) {
|
||||
case 0: return "RASTER";
|
||||
case 1: return "ROP";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
|
|
@ -1285,141 +1285,141 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
rd_write = true;
|
||||
break;
|
||||
}
|
||||
case GPGPU: {
|
||||
uint32_t ts = 0;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
if (tmask_.test(t)) {
|
||||
ts = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
// TMC
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::TMC;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
if (rsrc1) {
|
||||
// predicate mode
|
||||
ThreadMask pred;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
pred[t] = tmask_.test(t) ? (ireg_file_.at(t).at(rsrc0) != 0) : 0;
|
||||
}
|
||||
if (pred.any()) {
|
||||
tmask_ &= pred;
|
||||
}
|
||||
} else {
|
||||
tmask_.reset();
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
|
||||
}
|
||||
}
|
||||
DPH(3, "*** New TMC: ");
|
||||
for (uint32_t i = 0; i < num_threads; ++i)
|
||||
DPN(3, tmask_.test(num_threads-i-1));
|
||||
DPN(3, std::endl);
|
||||
|
||||
active_ = tmask_.any();
|
||||
trace->data = new GPUTraceData(active_ << id_);
|
||||
} break;
|
||||
case 1: {
|
||||
// WSPAWN
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::WSPAWN;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
trace->data = new GPUTraceData(core_->wspawn(rsdata.at(ts)[0].i, rsdata.at(ts)[1].i));
|
||||
} break;
|
||||
case 2: {
|
||||
// SPLIT
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::SPLIT;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
|
||||
ThreadMask tmask;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
tmask[t] = tmask_.test(t) && !ireg_file_.at(t).at(rsrc0);
|
||||
}
|
||||
|
||||
DomStackEntry e(tmask, nextPC);
|
||||
dom_stack_.push(tmask_);
|
||||
dom_stack_.push(e);
|
||||
for (uint32_t t = 0, n = e.tmask.size(); t < n; ++t) {
|
||||
tmask_.set(t, !e.tmask.test(t) && tmask_.test(t));
|
||||
}
|
||||
active_ = tmask_.any();
|
||||
|
||||
DPH(3, "*** Split: New TM=");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
|
||||
DPN(3, ", Pushed TM=");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, e.tmask.test(num_threads-t-1));
|
||||
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
|
||||
} else {
|
||||
DP(3, "*** Unanimous pred");
|
||||
DomStackEntry e(tmask_);
|
||||
e.unanimous = true;
|
||||
dom_stack_.push(e);
|
||||
}
|
||||
} break;
|
||||
case 3: {
|
||||
// JOIN
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::JOIN;
|
||||
trace->fetch_stall = true;
|
||||
if (!dom_stack_.empty() && dom_stack_.top().unanimous) {
|
||||
DP(3, "*** Uninimous branch at join");
|
||||
tmask_ = dom_stack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
dom_stack_.pop();
|
||||
} else {
|
||||
if (!dom_stack_.top().fallThrough) {
|
||||
nextPC = dom_stack_.top().PC;
|
||||
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
|
||||
}
|
||||
|
||||
tmask_ = dom_stack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
|
||||
DPH(3, "*** Join: New TM=");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
|
||||
DPN(3, "\n");
|
||||
|
||||
dom_stack_.pop();
|
||||
}
|
||||
} break;
|
||||
case 4: {
|
||||
// BAR
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::BAR;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
trace->data = new GPUTraceData(core_->barrier(rsdata[ts][0].i, rsdata[ts][1].i, id_));
|
||||
} break;
|
||||
case 5: {
|
||||
// PREFETCH
|
||||
trace->exe_type = ExeType::LSU;
|
||||
trace->lsu_type = LsuType::PREFETCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
auto trace_data = new LsuTraceData(num_threads);
|
||||
trace->data = trace_data;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
auto mem_addr = rsdata[t][0].i;
|
||||
trace_data->mem_addrs.at(t) = {mem_addr, 4};
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
} break;
|
||||
case EXT1: {
|
||||
switch (func7) {
|
||||
case 0:
|
||||
case 0: {
|
||||
uint32_t ts = 0;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
if (tmask_.test(t)) {
|
||||
ts = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
// TMC
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::TMC;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
if (rsrc1) {
|
||||
// predicate mode
|
||||
ThreadMask pred;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
pred[t] = tmask_.test(t) ? (ireg_file_.at(t).at(rsrc0) != 0) : 0;
|
||||
}
|
||||
if (pred.any()) {
|
||||
tmask_ &= pred;
|
||||
}
|
||||
} else {
|
||||
tmask_.reset();
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
|
||||
}
|
||||
}
|
||||
DPH(3, "*** New TMC: ");
|
||||
for (uint32_t i = 0; i < num_threads; ++i)
|
||||
DPN(3, tmask_.test(num_threads-i-1));
|
||||
DPN(3, std::endl);
|
||||
|
||||
active_ = tmask_.any();
|
||||
trace->data = new GPUTraceData(active_ << id_);
|
||||
} break;
|
||||
case 1: {
|
||||
// WSPAWN
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::WSPAWN;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
trace->data = new GPUTraceData(core_->wspawn(rsdata.at(ts)[0].i, rsdata.at(ts)[1].i));
|
||||
} break;
|
||||
case 2: {
|
||||
// SPLIT
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::SPLIT;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
|
||||
ThreadMask tmask;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
tmask[t] = tmask_.test(t) && !ireg_file_.at(t).at(rsrc0);
|
||||
}
|
||||
|
||||
DomStackEntry e(tmask, nextPC);
|
||||
dom_stack_.push(tmask_);
|
||||
dom_stack_.push(e);
|
||||
for (uint32_t t = 0, n = e.tmask.size(); t < n; ++t) {
|
||||
tmask_.set(t, !e.tmask.test(t) && tmask_.test(t));
|
||||
}
|
||||
active_ = tmask_.any();
|
||||
|
||||
DPH(3, "*** Split: New TM=");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
|
||||
DPN(3, ", Pushed TM=");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, e.tmask.test(num_threads-t-1));
|
||||
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
|
||||
} else {
|
||||
DP(3, "*** Unanimous pred");
|
||||
DomStackEntry e(tmask_);
|
||||
e.unanimous = true;
|
||||
dom_stack_.push(e);
|
||||
}
|
||||
} break;
|
||||
case 3: {
|
||||
// JOIN
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::JOIN;
|
||||
trace->fetch_stall = true;
|
||||
if (!dom_stack_.empty() && dom_stack_.top().unanimous) {
|
||||
DP(3, "*** Uninimous branch at join");
|
||||
tmask_ = dom_stack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
dom_stack_.pop();
|
||||
} else {
|
||||
if (!dom_stack_.top().fallThrough) {
|
||||
nextPC = dom_stack_.top().PC;
|
||||
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
|
||||
}
|
||||
|
||||
tmask_ = dom_stack_.top().tmask;
|
||||
active_ = tmask_.any();
|
||||
|
||||
DPH(3, "*** Join: New TM=");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
|
||||
DPN(3, "\n");
|
||||
|
||||
dom_stack_.pop();
|
||||
}
|
||||
} break;
|
||||
case 4: {
|
||||
// BAR
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::BAR;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
trace->data = new GPUTraceData(core_->barrier(rsdata[ts][0].i, rsdata[ts][1].i, id_));
|
||||
} break;
|
||||
case 5: {
|
||||
// PREFETCH
|
||||
trace->exe_type = ExeType::LSU;
|
||||
trace->lsu_type = LsuType::PREFETCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
auto trace_data = new LsuTraceData(num_threads);
|
||||
trace->data = trace_data;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
auto mem_addr = rsdata[t][0].i;
|
||||
trace_data->mem_addrs.at(t) = {mem_addr, 4};
|
||||
}
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
} break;
|
||||
case 1:
|
||||
switch (func3) {
|
||||
case 0: { // RASTER
|
||||
trace->exe_type = ExeType::GPU;
|
||||
|
|
|
@ -29,10 +29,11 @@ enum Opcode {
|
|||
FMNMADD = 0x4f,
|
||||
// Vector Extension
|
||||
VSET = 0x57,
|
||||
// Vortex Extensions
|
||||
// Custom Extensions
|
||||
EXT1 = 0x0b,
|
||||
EXT2 = 0x5b,
|
||||
GPGPU = 0x6b,
|
||||
EXT2 = 0x2b,
|
||||
EXT3 = 0x5b,
|
||||
EXT4 = 0x7b,
|
||||
// RV64 Standard Extensions
|
||||
R_INST_W = 0x3b,
|
||||
I_INST_W = 0x1b,
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -249,8 +249,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
|
||||
|
||||
|
|
|
@ -148,8 +148,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
|
||||
|
||||
|
|
|
@ -201,8 +201,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
|
||||
std::max<uint32_t>(dst_buf_size,
|
||||
sizeof(kernel_arg_t)));
|
||||
|
|
|
@ -204,8 +204,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
RT_CHECK(vx_buf_alloc(device, sizeof(kernel_arg_t), &arg_buf));
|
||||
RT_CHECK(vx_buf_alloc(device, buf_size, &src1_buf));
|
||||
RT_CHECK(vx_buf_alloc(device, buf_size, &src2_buf));
|
||||
|
|
|
@ -26,7 +26,7 @@ using namespace cocogfx;
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
const char* kernel_file = "kernel.bin";
|
||||
const char* input_file = "soccer.png";
|
||||
const char* input_file = "fire.png";
|
||||
const char* output_file = "output.png";
|
||||
const char* reference_file = nullptr;
|
||||
uint32_t clear_color = 0x00000000;
|
||||
|
@ -34,8 +34,8 @@ int tex_format = TEX_FORMAT_A8R8G8B8;
|
|||
ePixelFormat tex_eformat = FORMAT_A8R8G8B8;
|
||||
int tex_wrap = TEX_WRAP_CLAMP;
|
||||
int tex_filter = TEX_FILTER_POINT;
|
||||
uint32_t dst_width = 64;
|
||||
uint32_t dst_height = 64;
|
||||
uint32_t dst_width = 256;
|
||||
uint32_t dst_height = 256;
|
||||
const model_t& model = model_quad;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
|
@ -218,6 +218,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// Perform tile binning
|
||||
auto num_tiles = Binning(tilebuf, primbuf, model, dst_width, dst_height, tile_size);
|
||||
std::cout << "Binning allocated " << num_tiles << " tiles." << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
|
@ -237,8 +238,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "zbuf_addr=0x" << std::hex << zbuf_addr << std::endl;
|
||||
std::cout << "cbuf_addr=0x" << std::hex << cbuf_addr << std::endl;
|
||||
|
||||
// allocate staging shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>({
|
||||
sizeof(kernel_arg_t), (uint32_t)tilebuf.size(), (uint32_t)primbuf.size(), zbuf_size, cbuf_size
|
||||
});
|
||||
|
|
|
@ -16,9 +16,7 @@ using fixed16_t = TFixed<16>;
|
|||
|
||||
using vec2d_f_t = TVector2<float>;
|
||||
using vec2d_fx_t = TVector2<fixed16_t>;
|
||||
|
||||
using vec4d_f_t = TVector4<float>;
|
||||
|
||||
using rect_f_t = TRect<float>;
|
||||
|
||||
static fixed16_t fxZero(0);
|
||||
|
@ -30,13 +28,13 @@ static fixed16_t evalEdgeFunction(const rast_edge_t& e, uint32_t x, uint32_t y)
|
|||
}
|
||||
|
||||
// Calculate the edge extents for tile corners.
// Builds the four corner candidates (0,0), (e.x,0), (0,e.y), (e.x,e.y) and
// picks one from the signs of e.x and e.y, so that only the non-negative
// components of the edge contribute; the result is the sum of the chosen
// corner's two components.
// NOTE(review): two signatures, a repeated initializer tail and two return
// statements are listed; they appear to be the removed and added sides of a
// diff. The variant taking logTileSize shifts the sum left by logTileSize;
// the one-argument variant returns the unshifted sum - confirm which one the
// real file keeps.
|
||||
static fixed16_t calcEdgeExtents(const rast_edge_t& e, uint32_t logTileSize) {
|
||||
static fixed16_t calcEdgeExtents(const rast_edge_t& e) {
|
||||
vec2d_fx_t corners[4] = {{fxZero, fxZero}, // 00
|
||||
{e.x, fxZero}, // 10
|
||||
{fxZero, e.y}, // 01
|
||||
{e.x, e.y}}; // 11
|
||||
{e.x, fxZero}, // 10
|
||||
{fxZero, e.y}, // 01
|
||||
{e.x, e.y}}; // 11
|
||||
auto i = (e.y >= fxZero) ? ((e.x >= fxZero) ? 3 : 2) : (e.x >= fxZero) ? 1 : 0;
|
||||
return (corners[i].x + corners[i].y) << logTileSize;
|
||||
return corners[i].x + corners[i].y;
|
||||
}
|
||||
|
||||
static float EdgeEquation(rast_edge_t edges[3],
|
||||
|
@ -92,7 +90,7 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
|
|||
uint32_t height,
|
||||
uint32_t tileSize) {
|
||||
|
||||
uint32_t logTileSize = log2ceil(tileSize);
|
||||
uint32_t tileLogSize = log2ceil(tileSize);
|
||||
|
||||
std::unordered_map<uint32_t, std::vector<uint32_t>> tiles;
|
||||
|
||||
|
@ -151,7 +149,7 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
|
|||
uint32_t p;
|
||||
|
||||
{
|
||||
#define INTERPOLATE_DELTA(dx, x0, x1, x2) \
|
||||
#define ATTRIBUTE_DELTA(dx, x0, x1, x2) \
|
||||
dx.x = fixed23_t(x0 - x2); \
|
||||
dx.y = fixed23_t(x1 - x2); \
|
||||
dx.z = fixed23_t(x2)
|
||||
|
@ -168,34 +166,34 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
|
|||
ColorToFloat(colors[1], v1.c);
|
||||
ColorToFloat(colors[2], v2.c);
|
||||
|
||||
INTERPOLATE_DELTA(rast_prim.attribs.z, v0.z, v1.z, v2.z);
|
||||
INTERPOLATE_DELTA(rast_prim.attribs.r, colors[0][0], colors[1][0], colors[2][0]);
|
||||
INTERPOLATE_DELTA(rast_prim.attribs.g, colors[0][1], colors[1][1], colors[2][1]);
|
||||
INTERPOLATE_DELTA(rast_prim.attribs.b, colors[0][2], colors[1][2], colors[2][2]);
|
||||
INTERPOLATE_DELTA(rast_prim.attribs.a, colors[0][3], colors[1][3], colors[2][3]);
|
||||
INTERPOLATE_DELTA(rast_prim.attribs.u, v0.u, v1.u, v2.u);
|
||||
INTERPOLATE_DELTA(rast_prim.attribs.v, v0.v, v1.v, v2.v);
|
||||
ATTRIBUTE_DELTA(rast_prim.attribs.z, v0.z, v1.z, v2.z);
|
||||
ATTRIBUTE_DELTA(rast_prim.attribs.r, colors[0][0], colors[1][0], colors[2][0]);
|
||||
ATTRIBUTE_DELTA(rast_prim.attribs.g, colors[0][1], colors[1][1], colors[2][1]);
|
||||
ATTRIBUTE_DELTA(rast_prim.attribs.b, colors[0][2], colors[1][2], colors[2][2]);
|
||||
ATTRIBUTE_DELTA(rast_prim.attribs.a, colors[0][3], colors[1][3], colors[2][3]);
|
||||
ATTRIBUTE_DELTA(rast_prim.attribs.u, v0.u, v1.u, v2.u);
|
||||
ATTRIBUTE_DELTA(rast_prim.attribs.v, v0.v, v1.v, v2.v);
|
||||
|
||||
p = rast_prims.size();
|
||||
rast_prims.push_back(rast_prim);
|
||||
}
|
||||
|
||||
// Calculate min/max tile positions
|
||||
auto tileSize = 1 << logTileSize;
|
||||
auto minTileX = bbox.left >> logTileSize;
|
||||
auto minTileY = bbox.top >> logTileSize;
|
||||
auto maxTileX = (bbox.right + tileSize - 1) >> logTileSize;
|
||||
auto maxTileY = (bbox.bottom + tileSize - 1) >> logTileSize;
|
||||
auto tileSize = 1 << tileLogSize;
|
||||
auto minTileX = bbox.left >> tileLogSize;
|
||||
auto minTileY = bbox.top >> tileLogSize;
|
||||
auto maxTileX = (bbox.right + tileSize - 1) >> tileLogSize;
|
||||
auto maxTileY = (bbox.bottom + tileSize - 1) >> tileLogSize;
|
||||
|
||||
// Starting tile coordinates
|
||||
auto X = minTileX << logTileSize;
|
||||
auto Y = minTileY << logTileSize;
|
||||
auto X = minTileX << tileLogSize;
|
||||
auto Y = minTileY << tileLogSize;
|
||||
|
||||
// Add tile corner edge offsets
|
||||
fixed16_t extents[3];
|
||||
extents[0] = calcEdgeExtents(edges[0], logTileSize);
|
||||
extents[1] = calcEdgeExtents(edges[1], logTileSize);
|
||||
extents[2] = calcEdgeExtents(edges[2], logTileSize);
|
||||
extents[0] = calcEdgeExtents(edges[0]);
|
||||
extents[1] = calcEdgeExtents(edges[1]);
|
||||
extents[2] = calcEdgeExtents(edges[2]);
|
||||
|
||||
// Evaluate edge equation for the starting tile
|
||||
auto e0 = evalEdgeFunction(edges[0], X, Y);
|
||||
|
@ -209,34 +207,33 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
|
|||
auto ee2 = e2;
|
||||
for (uint32_t tx = minTileX; tx < maxTileX; ++tx) {
|
||||
// check if tile overlap triangle
|
||||
if (((ee0 + extents[0]).data()
|
||||
| (ee1 + extents[1]).data()
|
||||
| (ee2 + extents[2]).data()) >= 0) {
|
||||
if (((ee0 + (extents[0] << tileLogSize)).data()
|
||||
| (ee1 + (extents[1] << tileLogSize)).data()
|
||||
| (ee2 + (extents[2] << tileLogSize)).data()) >= 0) {
|
||||
// assign primitive to tile
|
||||
uint32_t tile_id = (ty << 16) | tx;
|
||||
tiles[tile_id].push_back(p);
|
||||
++num_prims;
|
||||
}
|
||||
|
||||
// update edge equation x components
|
||||
ee0 += edges[0].x << logTileSize;
|
||||
ee1 += edges[1].x << logTileSize;
|
||||
ee2 += edges[2].x << logTileSize;
|
||||
ee0 += edges[0].x << tileLogSize;
|
||||
ee1 += edges[1].x << tileLogSize;
|
||||
ee2 += edges[2].x << tileLogSize;
|
||||
}
|
||||
// update edge equation y components
|
||||
e0 += edges[0].y << logTileSize;
|
||||
e1 += edges[1].y << logTileSize;
|
||||
e2 += edges[2].y << logTileSize;
|
||||
e0 += edges[0].y << tileLogSize;
|
||||
e1 += edges[1].y << tileLogSize;
|
||||
e2 += edges[2].y << tileLogSize;
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
primbuf.reserve(rast_prims.size() * sizeof(rast_prim_t));
|
||||
primbuf.resize(rast_prims.size() * sizeof(rast_prim_t));
|
||||
memcpy(primbuf.data(), rast_prims.data(), primbuf.size());
|
||||
}
|
||||
|
||||
{
|
||||
tilebuf.reserve(tiles.size() * sizeof(rast_tile_header_t) + num_prims * sizeof(uint32_t));
|
||||
tilebuf.resize(tiles.size() * sizeof(rast_tile_header_t) + num_prims * sizeof(uint32_t));
|
||||
auto tile_data = tilebuf.data();
|
||||
for (auto it : tiles) {
|
||||
rast_tile_header_t header{it.first, (uint32_t)it.second.size()};
|
||||
|
|
|
@ -148,8 +148,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
|
||||
|
||||
|
|
|
@ -183,8 +183,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint32_t),
|
||||
std::max<uint32_t>(src_buf_size,
|
||||
std::max<uint32_t>(dst_buf_size,
|
||||
|
|
|
@ -236,8 +236,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src=" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
|
||||
std::max<uint32_t>(addr_buf_size,
|
||||
std::max<uint32_t>(dst_buf_size,
|
||||
|
|
|
@ -138,8 +138,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
|
||||
|
||||
|
|
|
@ -138,8 +138,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
|
||||
|
||||
|
|
|
@ -148,8 +148,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
|
||||
|
||||
|
|
|
@ -111,8 +111,8 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
|
||||
|
||||
|
|
|
@ -178,8 +178,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
|
||||
std::max<uint32_t>(dst_buf_size,
|
||||
sizeof(kernel_arg_t)));
|
||||
|
|
|
@ -213,8 +213,8 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "src_addr=0x" << std::hex << src_addr << std::endl;
|
||||
std::cout << "dst_addr=0x" << std::hex << dst_addr << std::endl;
|
||||
|
||||
// allocate staging shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t),
|
||||
std::max<uint32_t>(src_bufsize, dst_bufsize));
|
||||
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
|
||||
|
|
2
third_party/cocogfx
vendored
2
third_party/cocogfx
vendored
|
@ -1 +1 @@
|
|||
Subproject commit 8f78db5e1845b2a9cd337ac154ee276250d91ad3
|
||||
Subproject commit 6ff9739cee9a0528142123985e4d8e59f7d0a4e8
|
Loading…
Add table
Add a link
Reference in a new issue